diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..f46f5c9 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,19 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + target-branch: "dev" + + - package-ecosystem: "npm" + directory: "/frontend" + schedule: + interval: "weekly" + target-branch: "dev" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + target-branch: "dev" diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index 8b48ddf..f329b68 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -6,6 +6,10 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - - uses: psf/black@stable + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install ruff + - run: ruff check . + - run: ruff format --check . diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml index 474e2ba..32c3840 100644 --- a/.github/workflows/claude-code-review.yml +++ b/.github/workflows/claude-code-review.yml @@ -1,30 +1,22 @@ name: Claude Code Review on: - pull_request: - types: [opened, ready_for_review] - # Optional: Only run on specific file changes - # paths: - # - "src/**/*.ts" - # - "src/**/*.tsx" - # - "src/**/*.js" - # - "src/**/*.jsx" + workflow_dispatch: + inputs: + pr_number: + description: 'PR number to review' + required: true + type: number jobs: claude-review: - # Optional: Filter by PR author - # if: | - # github.event.pull_request.user.login == 'external-contributor' || - # github.event.pull_request.user.login == 'new-developer' || - # github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR' - runs-on: ubuntu-latest permissions: contents: read pull-requests: read issues: read id-token: write - + steps: - name: Checkout repository uses: actions/checkout@v4 @@ -36,43 +28,12 @@ jobs: uses: anthropics/claude-code-action@beta with: claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} - - # Optional: Specify model (defaults to Claude Sonnet 4, uncomment for Claude Opus 4) - # model: "claude-opus-4-20250514" - - # Direct prompt for automated review (no @claude mention needed) direct_prompt: | - Please review this pull request and provide feedback on: + Please review pull request #${{ inputs.pr_number }} and provide feedback on: - Code quality and best practices - Potential bugs or issues - Performance considerations - Security concerns - Test coverage - - Be constructive and helpful in your feedback. - - # Optional: Use sticky comments to make Claude reuse the same comment on subsequent pushes to the same PR - # use_sticky_comment: true - - # Optional: Customize review based on file types - # direct_prompt: | - # Review this PR focusing on: - # - For TypeScript files: Type safety and proper interface usage - # - For API endpoints: Security, input validation, and error handling - # - For React components: Performance, accessibility, and best practices - # - For tests: Coverage, edge cases, and test quality - - # Optional: Different prompts for different authors - # direct_prompt: | - # ${{ github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR' && - # 'Welcome! Please review this PR from a first-time contributor. Be encouraging and provide detailed explanations for any suggestions.' 
|| - # 'Please provide a thorough code review focusing on our coding standards and best practices.' }} - - # Optional: Add specific tools for running tests or linting - # allowed_tools: "Bash(npm run test),Bash(npm run lint),Bash(npm run typecheck)" - - # Optional: Skip review for certain conditions - # if: | - # !contains(github.event.pull_request.title, '[skip-review]') && - # !contains(github.event.pull_request.title, '[WIP]') + Be constructive and helpful in your feedback. diff --git a/.github/workflows/deploy_store.yml b/.github/workflows/deploy_store.yml index 7bf010e..9972735 100644 --- a/.github/workflows/deploy_store.yml +++ b/.github/workflows/deploy_store.yml @@ -15,11 +15,9 @@ jobs: steps: - name: Checkout uses: actions/checkout@v5 - with: - ref: dev - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v1 + uses: aws-actions/configure-aws-credentials@v4 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -27,32 +25,31 @@ jobs: - name: Login to Amazon ECR id: login-ecr - uses: aws-actions/amazon-ecr-login@v1 + uses: aws-actions/amazon-ecr-login@v2 - name: Build, tag, and push image to Amazon ECR id: build-image env: ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} - ECR_REPOSITORY: seqcolapi-store + ECR_REPOSITORY: seqcolapi IMAGE_TAG: ${{ github.sha }} run: | - cd deployment/seqcolapi-store/ - docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -f Dockerfile . + docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -f deployment/seqcolapi-store/Dockerfile . docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG - echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" + echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT - name: Fill in the new image ID in the Amazon ECS task definition id: task-def - uses: aws-actions/amazon-ecs-render-task-definition@v1 + uses: aws-actions/amazon-ecs-render-task-definition@v1.6.2 with: task-definition: deployment/seqcolapi-store/task_def.json - container-name: seqcolapi-store + container-name: seqcolapi image: ${{ steps.build-image.outputs.image }} - name: Deploy Amazon ECS task definition - uses: aws-actions/amazon-ecs-deploy-task-definition@v1 + uses: aws-actions/amazon-ecs-deploy-task-definition@v2 with: task-definition: ${{ steps.task-def.outputs.task-definition }} - service: seqcolapi-store-service + service: seqcolapi-service cluster: yeti wait-for-service-stability: true diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index e54ad87..b8cb119 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -1,4 +1,4 @@ -# This workflows will upload a Python Package using Twine when a release is created +# This workflow uploads a Python Package using trusted publishing when a release is created # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries name: Upload Python Package @@ -23,10 +23,9 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel twine - - name: Build and publish + pip install build + - name: Build package run: | - python setup.py sdist bdist_wheel + python -m build - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 - diff --git a/.github/workflows/run-codecov.yml b/.github/workflows/run-codecov.yml deleted file 
mode 100644 index de9e8f6..0000000 --- a/.github/workflows/run-codecov.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: Run codecov - -on: - pull_request: - branches: [master] - -jobs: - pytest: - runs-on: ${{ matrix.os }} - strategy: - matrix: - python-version: ["3.13"] - os: [ubuntu-latest] - - steps: - - uses: actions/checkout@v2 - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v2 - with: - file: ./coverage.xml - name: py-${{ matrix.python-version }}-${{ matrix.os }} diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 446f4dc..637e616 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -9,7 +9,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ["3.10", "3.13"] + python-version: ["3.10", "3.14"] os: [ubuntu-latest] steps: @@ -20,13 +20,10 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install test dependencies - run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi - - - name: Install package + - name: Install package with test extras env: PYO3_USE_ABI3_FORWARD_COMPATIBILITY: 1 - run: python -m pip install . + run: python -m pip install ".[test]" - name: Run pytest tests - run: pytest -x -vv --cov=./ --cov-report=xml \ No newline at end of file + run: pytest -x -vv --cov=./ --cov-report=xml diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 9c9f250..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include requirements/* -include README.md -include refget/schemas/* \ No newline at end of file diff --git a/README.md b/README.md index dec9086..c595362 100644 --- a/README.md +++ b/README.md @@ -43,9 +43,48 @@ This starts the test database, runs tests, and cleans up automatically. ## Development and deployment: Backend -### Easy-peasy way +### Store-backed (no database) -In a moment I'll show you how to do these steps individually, but if you're in a hurry, the easy way get a development API running for testing is to just use my very simple shell script like this (no data persistence, just loads demo data): +The store-backed seqcolapi uses a RefgetStore (local files) instead of PostgreSQL. This is the simplest way to run the API: + +#### Quick start + +```console +bash deployment/store_demo_up.sh +``` + +This will: +- Build a local RefgetStore from test FASTA files +- Run the store-backed seqcolapi with uvicorn +- Block the terminal until you press Ctrl+C, which cleans up + +No Docker or database required. + +#### Step-by-step + +1. Build a store from FASTA files: + +```console +python data_loaders/demo_build_store.py test_fasta /tmp/refget_demo_store +``` + +2. Start the store-backed API: + +```console +REFGET_STORE_PATH=/tmp/refget_demo_store uvicorn seqcolapi.main:store_app --reload --port 8100 +``` + +#### Remote store + +To run against a remote (S3) store: + +```console +REFGET_STORE_URL=https://example.com/store uvicorn seqcolapi.main:store_app --port 8100 +``` + +### DB-backed (PostgreSQL) + +If you need a database-backed instance (e.g., for mutable data, advanced queries), use the DB-backed workflow. 
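+
+Whichever backend you run, it serves the same sequence collection API, so you
+can smoke-test a running instance the same way. A minimal check, assuming the
+port from the examples above (routes follow the GA4GH seqcol spec; if the app
+is FastAPI-based, as the uvicorn commands suggest, the interactive docs at
+`/docs` list the exact endpoints if yours differ):
+
+```console
+curl http://localhost:8100/service-info
+curl "http://localhost:8100/collection/<collection-digest>?level=2"
+```
+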
In a moment I'll show you how to do these steps individually, but if you're in a hurry, the easy way to get a development API running for testing is to just use my very simple shell script like this (no data persistence, just loads demo data): ```console bash deployment/demo_up.sh @@ -58,7 +97,7 @@ This will: - load up the demo data - block the terminal until you press Ctrl+C, which will shut down all services. -### Step-by-step process +### Step-by-step process (DB-backed) Alternatively, if you want to run each step separately to see what's really going on, start here. diff --git a/data_loaders/demo_build_store.py b/data_loaders/demo_build_store.py index 39ae6c0..4fa1669 100644 --- a/data_loaders/demo_build_store.py +++ b/data_loaders/demo_build_store.py @@ -38,7 +38,14 @@ def main(): store = RefgetStore.on_disk(store_path) for fasta in fasta_files: - store.add_sequence_collection_from_fasta(fasta) + result = store.add_sequence_collection_from_fasta(fasta) + # Register the filename (without extension) as a collection alias + basename = os.path.basename(fasta) + name = basename.split(".")[0] # strip .fa, .fasta, .fa.gz, etc. + meta = result[0] if isinstance(result, tuple) else result + if meta: + store.add_collection_alias("fasta_filename", name, meta.digest) + print(f" {name} → {meta.digest}") print(f"Done. Store at: {store_path}") print(f"Stats: {store.stats()}") diff --git a/data_loaders/demo_remote_store.py b/data_loaders/demo_remote_store.py index 137fe52..af575e5 100644 --- a/data_loaders/demo_remote_store.py +++ b/data_loaders/demo_remote_store.py @@ -39,7 +39,7 @@ def main(): print(f"\n1. Loading remote store from:\n {REMOTE_URL}") print(f" Cache directory: {CACHE_DIR}\n") - store = RefgetStore.load_remote(cache_path=str(CACHE_DIR), remote_url=REMOTE_URL) + store = RefgetStore.open_remote(cache_path=str(CACHE_DIR), remote_url=REMOTE_URL) print(f" Loaded! {len(store)} sequences available (metadata only)") @@ -51,9 +51,8 @@ def main(): # 3. List sequences (first 5) print(f"\n3. Listing sequences (first 5 of {len(store)}):") - records = store.sequence_records() - for i, rec in enumerate(records[:5]): - m = rec.metadata + records = store.list_sequences() + for i, m in enumerate(records[:5]): print(f" {i+1}. {m.name[:50]}...") print(f" sha512t24u: {m.sha512t24u}") print(f" length: {m.length:,} bp") @@ -61,7 +60,7 @@ def main(): # 4. Fetch a sequence by ID (downloads sequence data on first access) seq_digest = "du4GiRD_OcmdmCn_RmImyb71YZ4XoCdk" print(f"\n4. Get sequence record by ID (fetches from remote):") - record = store.get_sequence_by_id(seq_digest) + record = store.get_sequence(seq_digest) if record: print(f" Name: {record.metadata.name}") print(f" Length: {record.metadata.length:,} bp") @@ -107,7 +106,7 @@ def main(): print(f" Collection: {EXAMPLE_COLLECTION}") print(f" Sequence: {EXAMPLE_SEQ_NAME[:50]}...") - record = store.get_sequence_by_collection_and_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) + record = store.get_sequence_by_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) if record: print(f" Found! 
Length: {record.metadata.length:,} bp") print(f" Digest: {record.metadata.sha512t24u}") @@ -149,9 +148,9 @@ def main(): print(f"\nCache directory: {CACHE_DIR}") print(f"Temp files: {temp_dir}") print("\nKey features demonstrated:") - print(" - load_remote(): Load store from URL, fetch sequences on-demand") - print(" - get_sequence_by_id(): Lookup by SHA-512/24u or MD5 digest") - print(" - get_sequence_by_collection_and_name(): Lookup by sequence name") + print(" - open_remote(): Load store from URL, fetch sequences on-demand") + print(" - get_sequence(): Lookup by SHA-512/24u or MD5 digest") + print(" - get_sequence_by_name(): Lookup by collection digest + sequence name") print(" - substrings_from_regions(): Batch retrieval from BED file") print(" - export_fasta_by_digests(): Export sequences by digest") print(" - export_fasta_from_regions(): Export BED regions to FASTA") diff --git a/data_loaders/load_demo_seqcols.py b/data_loaders/load_demo_seqcols.py index 21c9499..cb49246 100644 --- a/data_loaders/load_demo_seqcols.py +++ b/data_loaders/load_demo_seqcols.py @@ -19,7 +19,7 @@ DEMO_FASTA = json.load(open("test_fasta/test_fasta_digests.json")) # Storage locations from environment (if set, will upload; otherwise use demo defaults with skip_upload) -ENV_STORAGE = json.loads(os.environ.get("FASTA_STORAGE_LOCATIONS", "[]")) +ENV_STORAGE = json.loads(os.environ.get("FASTA_STORAGE_LOCATIONS") or "[]") if ENV_STORAGE: DEMO_STORAGE = ENV_STORAGE SKIP_UPLOAD = False diff --git a/data_loaders/ref-genome-analysis/.gitignore b/data_loaders/ref-genome-analysis/.gitignore new file mode 100644 index 0000000..ff7f203 --- /dev/null +++ b/data_loaders/ref-genome-analysis/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +*.pyc +*.log diff --git a/data_loaders/ref-genome-analysis/CLAUDE.md b/data_loaders/ref-genome-analysis/CLAUDE.md new file mode 100644 index 0000000..cbe896b --- /dev/null +++ b/data_loaders/ref-genome-analysis/CLAUDE.md @@ -0,0 +1,49 @@ +# ref-genome-analysis + +Pipeline for building a RefgetStore from reference genome FASTA files. Inventories genomes from the brickyard, loads them into a refget store, registers NCBI aliases, generates FAIR Header Representation (FHR) metadata, and verifies the result. + +## Setup + +Source the environment for your compute target: +- HPC (from laptop): `source env/remote-hpc.env` +- HPC (direct): `source env/on-cluster.env` + +To start mutagen sync: `./env/mutagen-setup.sh` + +## Pipeline Phases + +Execute in order: + +1. **01_inventory** -- Scan brickyard, generate CSV inventory of all FASTA files + - `python src/01_inventory/inventory_genomes.py` + +2. **02_aliases** -- Download NCBI assembly reports, build alias table, register in store + - Phase A: `python src/02_aliases/build_ncbi_alias_table.py` (downloads from NCBI, slow) + - Phase B: `sbatch src/02_aliases/register_aliases.sbatch` + +3. **03_fhr** -- Generate FAIR Header Representation metadata, load into store + - `python src/03_fhr/batch_generate_fhr.py --inventory $INVENTORY_CSV --output-dir $STAGING/fhr_metadata` + - `python src/03_fhr/load_fhr_metadata.py --store-path $STORE_PATH --fhr-dir $STAGING/fhr_metadata` + +4. 
**04_verify** -- Validate store integrity + - `python src/04_verify/verify_refgetstore.py` + +## Key Environment Variables + +- `BRICK_ROOT` -- Root of the refgenomes_fasta brick +- `STORE_PATH` -- Path to the RefgetStore database +- `STAGING` -- Staging area for intermediates (assembly reports, alias tables, FHR JSON) +- `INVENTORY_CSV` -- Path to the genome inventory CSV + +## Dependencies + +- Python 3.11+ (via `module load miniforge/24.3.0-py3.11` on Rivanna) +- `refget` or `gtars` Python package (for RefgetStore) +- Internet access for NCBI API calls (phases 2 and 3) + +## Notes + +- All phases are resumable -- cached downloads, idempotent store operations +- Phase 2A rate-limits NCBI requests (0.3s between calls) +- `src/05_profiling/` contains memory/timing benchmarks (not part of the main pipeline) +- `src/examples/` contains a 20-genome integration test diff --git a/data_loaders/ref-genome-analysis/README.md b/data_loaders/ref-genome-analysis/README.md new file mode 100644 index 0000000..a93b618 --- /dev/null +++ b/data_loaders/ref-genome-analysis/README.md @@ -0,0 +1,101 @@ +# ref-genome-analysis + +Pipeline for loading reference genome FASTA files into a RefgetStore and enriching them with NCBI aliases and FHR provenance metadata. + +## Setup + +```bash +source env/on-cluster.env # on Rivanna directly +source env/remote-hpc.env # from laptop, targeting Rivanna +./env/mutagen-setup.sh # start file sync (laptop only) +``` + +## Pipeline stages + +Execute in order: + +``` +inventory --> build --> aliases --> fhr --> verify +``` + +| Stage | Location | Purpose | +|---|---|---| +| **inventory** | `src/01_inventory/` | Scan brickyard FASTA files, produce `refgenomes_inventory.csv` | +| **build** | `src/02_build/` | Compute seqcol digests for all FASTAs, produce `digest_map.csv` | +| **aliases** | `src/02_aliases/` | Download NCBI assembly reports, build alias table, register sequence/collection aliases | +| **fhr** | `src/03_fhr/` | Generate and attach FHR provenance metadata (species, taxon, accession, submitter, etc.) | +| **verify** | `src/04_verify/` | Automated pass/fail checks against the store | +| **profiling** | `src/05_profiling/` | Memory and timing benchmarks | +| **split** | `src/90_split_store.py` | Split combined store into VGP and reference genome stores | +| **backfill** | `src/backfill_sequence_aliases.py` | Re-register aliases into split stores from NCBI alias table | +| **validate** | `src/validate_split_stores.py` | Validate split stores (counts, aliases, FHR, sequences, cross-store) | +| **push** | `src/push_to_s3.sh` | Push split stores to S3 (`s3://refgenie/`) | +| **examples** | `src/examples/` | End-to-end test scripts (e.g., load 20 genomes with FHR) | + +## Environment variables + +All paths come from environment variables set by sourcing an env file. No hardcoded paths in scripts. + +| Variable | Purpose | +|---|---| +| `BRICKYARD` | Lab-wide brickyard root | +| `BRICK_ROOT` | This project's brick (`$BRICKYARD/datasets_downloaded/refgenomes_fasta`) | +| `STORE_PATH` | The RefgetStore database | +| `STAGING` | Pipeline intermediates (assembly reports, alias tables, FHR JSON) | +| `INVENTORY_CSV` | Inventory of all FASTAs | + +## Quick start (Rivanna) + +```bash +source env/on-cluster.env +module load miniforge/24.3.0-py3.11 + +# 1. Inventory +python src/01_inventory/inventory_genomes.py + +# 2. Register NCBI aliases +sbatch src/02_aliases/register_aliases.sbatch + +# 3. 
Attach FHR metadata +python src/03_fhr/batch_generate_fhr.py --inventory $INVENTORY_CSV --output-dir $STAGING/fhr_metadata +python src/03_fhr/load_fhr_metadata.py --store-path $STORE_PATH --fhr-dir $STAGING/fhr_metadata + +# 4. Verify +python src/04_verify/verify_refgetstore.py + +# 5. Split into VGP + ref stores +sbatch src/90_split_store.sbatch + +# 6. Backfill aliases into split stores +python src/backfill_sequence_aliases.py --target $BRICK_ROOT/vgp_reference_store +python src/backfill_sequence_aliases.py --target $BRICK_ROOT/refgenome_jungle_store + +# 7. Validate split stores +sbatch src/validate_split_stores.sbatch + +# 8. Push to S3 (requires GPG agent forwarding: ssh riva1_gpg) +bash src/push_to_s3.sh both +``` + +## S3 deployment + +Requires GPG agent forwarding for `pass` credentials (see `ssh riva1_gpg` in SSH config). + +```bash +ssh riva1_gpg +cd code/ref-genome-analysis +source env/on-cluster.env +bash src/push_to_s3.sh vgp # or: ref, both, "vgp --dry-run" +``` + +Stores are pushed to `s3://refgenie/refget-store/vgp` and `s3://refgenie/refget-store/jungle`. + +To load from S3: + +```python +from refget.store import RefgetStore +store = RefgetStore.open_remote( + "~/.refget/vgp_cache", + "https://refgenie.s3.us-east-1.amazonaws.com/refget-store/vgp" +) +``` diff --git a/data_loaders/ref-genome-analysis/docs/missing_seqcolapi_collections.md b/data_loaders/ref-genome-analysis/docs/missing_seqcolapi_collections.md new file mode 100644 index 0000000..3ba8d34 --- /dev/null +++ b/data_loaders/ref-genome-analysis/docs/missing_seqcolapi_collections.md @@ -0,0 +1,22 @@ +# Missing seqcolapi collections + +8 collections hosted on seqcolapi.databio.org are not in any RefgetStore. +These were loaded into the PostgreSQL-backed seqcolapi directly from +`fasta/pangenome_reference/` FASTAs that weren't included in the combined store build. + +## TODO + +Load these into the jungle store. 7 of 8 are confirmed in `$BRICK_ROOT/fasta/pangenome_reference/`: + +| Digest | Seqs | FASTA | +|---|---|---| +| `2WhejNO718T5jvB4DVTAz-A_JF03iIkz` | 25 | `GCA_009914755.4_CHM13_T2T_v2.0_genomic.fna.gz` | +| `6DfkalgYxFZiYAKpJf19dbpnS-dGzi4m` | 24 | `chm13.draft_v1.1.fasta.gz` | +| `Hve5dblWYLxu1p9Cp930NB8twHGCsf6X` | 640 | `GCA_000001405.28_GRCh38.p13_genomic.fa.gz` | +| `VDUOdAUYpXHUhvU-MNmOTgYQAl67yRMs` | 445 | `Homo_sapiens.GRCh38.dna.alt.fa.gz` | +| `WwIG41XDzO0BTmEpzT7nPXv6Dfx7h4ju` | 1 | `CM000663.2.fasta.gz` | +| `awlJ5Q7EPDVlwXWH8LPN93oJ5jY2uajW` | 24 | `T2T-CHM13v2.0.unmasked.fa.gz` | +| `qJ79liNTAD-LShR3j_2xntOEt-eC3vhM` | 639 | `Homo_sapiens.GRCh38.dna.toplevel.fa.gz` | +| `gHcfbUVnFzHv3QSqz2sSqVHdUQbDO8N5` | 3366 | Not in pangenome_reference. Likely `GRCh38_full_analysis_set_plus_decoy_hla.fa.gz` from `fasta/jungle/homo_sapiens/` | + +These are needed for seqcol compliance testing since they're currently served by the API. diff --git a/data_loaders/ref-genome-analysis/docs/pephubclient-issues.md b/data_loaders/ref-genome-analysis/docs/pephubclient-issues.md new file mode 100644 index 0000000..bdeb978 --- /dev/null +++ b/data_loaders/ref-genome-analysis/docs/pephubclient-issues.md @@ -0,0 +1,47 @@ +# PEPhub Client: Issues Encountered + +## 1. `--force` doesn't update samples on existing projects + +**Problem:** `phc push --force` and `phc.upload(force=True)` return success (202) but silently fail to update the sample table when the project already exists. The config/metadata may update, but samples remain unchanged. 
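+
+For reference, this is the pattern that fails silently (NAMESPACE/PROJECT/TAG
+are placeholders; the exact `load_project`/`upload` signatures are
+illustrative and may differ across pephubclient versions):
+
+```python
+from pephubclient import PEPHubClient
+
+phc = PEPHubClient()
+project = phc.load_project("NAMESPACE/PROJECT:TAG")  # project already exists on PEPhub
+# ...modify project.sample_table in place...
+phc.upload(project, namespace="NAMESPACE", name="PROJECT", tag="TAG", force=True)
+# Reports success (202), but the server-side sample table is unchanged.
+```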
+ +**Workaround:** Delete the project first, then push fresh: + +```python +import requests +from pephubclient import PEPHubClient + +phc = PEPHubClient() +jwt = phc._PEPHubClient__jwt_data +headers = {"Authorization": f"Bearer {jwt}"} + +requests.delete( + "https://pephub-api.databio.org/api/v1/projects/NAMESPACE/PROJECT", + params={"tag": "TAG"}, + headers=headers, +) +``` + +Then push normally with `phc push`. + +## 2. Bare CSV push fails with 400 + +**Problem:** The CLI help says `CFG` accepts "Project config file (YAML) or sample table (CSV/TSV)", but pushing a bare CSV fails with `Unexpected Response Error. 400`. + +**Workaround:** Always push a YAML config that references the CSV: + +```yaml +# project_config.yaml +pep_version: "2.1.0" +sample_table: samples.csv +name: my_project +``` + +```bash +phc push --namespace NS --name NAME --tag TAG project_config.yaml +``` + +## 3. `phc.upload()` with peppy Project reports success but uploads empty samples + +**Problem:** Loading a project with `phc.load_project()`, modifying `sample_table` in-place, then calling `phc.upload()` reports success but the server receives no samples. The `project.to_dict()` output is correct (verified locally), so the issue is server-side. + +**Workaround:** Write the modified sample table to a CSV, create a YAML config referencing it, and use `phc push` with the YAML. diff --git a/data_loaders/ref-genome-analysis/env/deploy-deps.sh b/data_loaders/ref-genome-analysis/env/deploy-deps.sh new file mode 100644 index 0000000..5052be2 --- /dev/null +++ b/data_loaders/ref-genome-analysis/env/deploy-deps.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# env/deploy-deps.sh — Build and install dependencies on Rivanna from mutagen-synced source +# +# Requires: DEPLOY_HOST and DEPLOY_DIR set in env file +# Requires: mutagen syncs running (via mutagen-setup.sh) + +if [ -z "$DEPLOY_HOST" ] || [ -z "$DEPLOY_DIR" ]; then + echo "DEPLOY_HOST and DEPLOY_DIR must be set. Source your env file first." + exit 1 +fi + +ssh "$DEPLOY_HOST" 'bash --login -s' << EOF +set -e +source /etc/profile.d/modules.sh +module load miniforge/24.3.0-py3.11 + +# Build gtars from synced source +cd ${DEPLOY_DIR}/gtars/gtars-python +rm -f ../target/wheels/gtars-*.whl +echo "Building gtars..." +maturin build --release --no-default-features --features refget +pip install ../target/wheels/gtars-*.whl --force-reinstall --no-deps +echo "gtars installed." + +# Install refget from synced source +cd ${DEPLOY_DIR}/refget +echo "Installing refget..." +python -m pip install -e . +echo "refget installed." + +echo "Done!" +EOF diff --git a/data_loaders/ref-genome-analysis/env/mutagen-setup.sh b/data_loaders/ref-genome-analysis/env/mutagen-setup.sh new file mode 100755 index 0000000..8a147eb --- /dev/null +++ b/data_loaders/ref-genome-analysis/env/mutagen-setup.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# env/mutagen-setup.sh — Start mutagen sync for this project and its dependencies + +if [ -z "$SYNC_REMOTE" ]; then + echo "SYNC_REMOTE is not set. Set it in your env file to enable sync." + echo "Example: export SYNC_REMOTE=user@host:/path/to/project" + exit 0 +fi + +if [ -z "$PROJECT_NAME" ]; then + PROJECT_NAME=$(basename "$PWD") +fi + +# Sync the project itself +mutagen sync create \ + --name="${PROJECT_NAME}-pipeline" \ + --ignore=__pycache__ \ + --ignore="*.pyc" \ + --ignore="*.log" \ + --ignore=.git \ + . 
"$SYNC_REMOTE" + +echo "Sync started: ${PROJECT_NAME}-pipeline → $SYNC_REMOTE" + +# Sync dependencies for deployment +if [ -n "$DEPLOY_HOST" ] && [ -n "$DEPLOY_DIR" ]; then + # gtars — local source synced to remote deploy dir + GTARS_LOCAL="$HOME/Dropbox/workspaces/intervals/repos/gtars" + if [ -d "$GTARS_LOCAL" ]; then + mutagen sync create \ + --name="deploy-gtars" \ + --ignore=target \ + --ignore=__pycache__ \ + --ignore="*.pyc" \ + --ignore=.git \ + "$GTARS_LOCAL" "${DEPLOY_HOST}:${DEPLOY_DIR}/gtars" + echo "Sync started: deploy-gtars → ${DEPLOY_HOST}:${DEPLOY_DIR}/gtars" + else + echo "Warning: $GTARS_LOCAL not found, skipping gtars sync" + fi + + # refget — local source synced to remote deploy dir + REFGET_LOCAL="$HOME/Dropbox/workspaces/refgenie/repos/refget" + if [ -d "$REFGET_LOCAL" ]; then + mutagen sync create \ + --name="deploy-refget" \ + --ignore=__pycache__ \ + --ignore="*.pyc" \ + --ignore=.git \ + "$REFGET_LOCAL" "${DEPLOY_HOST}:${DEPLOY_DIR}/refget" + echo "Sync started: deploy-refget → ${DEPLOY_HOST}:${DEPLOY_DIR}/refget" + else + echo "Warning: $REFGET_LOCAL not found, skipping refget sync" + fi +fi diff --git a/data_loaders/ref-genome-analysis/env/on-cluster.env b/data_loaders/ref-genome-analysis/env/on-cluster.env new file mode 100644 index 0000000..f00236d --- /dev/null +++ b/data_loaders/ref-genome-analysis/env/on-cluster.env @@ -0,0 +1,19 @@ +export PROJECT_NAME="ref-genome-analysis" +export BRICKYARD=/project/shefflab/brickyard +export BRICK_ROOT=$BRICKYARD/datasets_downloaded/refgenomes_fasta +export STORE_PATH=$BRICK_ROOT/refget_store +export STAGING=$BRICK_ROOT/refget_staging +export INVENTORY_CSV=$BRICK_ROOT/refgenomes_inventory.csv +export S3_BUCKET=s3://refgenie + +# vgp store +export VGP_STORE_PATH=$BRICK_ROOT/refget-store/vgp +export VGP_S3_PATH=$S3_BUCKET/refget-store/vgp + +# jungle store +export REF_STORE_PATH=$BRICK_ROOT/refget-store/jungle +export REF_S3_PATH=$S3_BUCKET/refget-store/jungle + +# pangenome store +export PANGENOME_STORE_PATH=$BRICK_ROOT/refget-store/pangenome +export PANGENOME_S3_PATH=$S3_BUCKET/refget-store/pangenome diff --git a/data_loaders/ref-genome-analysis/env/remote-hpc.env b/data_loaders/ref-genome-analysis/env/remote-hpc.env new file mode 100644 index 0000000..3ec463f --- /dev/null +++ b/data_loaders/ref-genome-analysis/env/remote-hpc.env @@ -0,0 +1,24 @@ +export PROJECT_NAME="ref-genome-analysis" +export BRICKYARD=/project/shefflab/brickyard +export BRICK_ROOT=$BRICKYARD/datasets_downloaded/refgenomes_fasta +export STORE_PATH=$BRICK_ROOT/refget_store +export STAGING=$BRICK_ROOT/refget_staging +export INVENTORY_CSV=$BRICK_ROOT/refgenomes_inventory.csv +export S3_BUCKET=s3://refgenie + +# vgp store +export VGP_STORE_PATH=$BRICK_ROOT/refget-store/vgp +export VGP_S3_PATH=$S3_BUCKET/refget-store/vgp + +# jungle store +export REF_STORE_PATH=$BRICK_ROOT/refget-store/jungle +export REF_S3_PATH=$S3_BUCKET/refget-store/jungle + +# pangenome store +export PANGENOME_STORE_PATH=$BRICK_ROOT/refget-store/pangenome +export PANGENOME_S3_PATH=$S3_BUCKET/refget-store/pangenome + +# remote deployment +export SYNC_REMOTE=riva:~/code/ref-genome-analysis +export DEPLOY_HOST=riva +export DEPLOY_DIR=~/deploy diff --git a/data_loaders/ref-genome-analysis/src/01_inventory/inventory_genomes.py b/data_loaders/ref-genome-analysis/src/01_inventory/inventory_genomes.py new file mode 100644 index 0000000..b7881b8 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/01_inventory/inventory_genomes.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +""" +Inventory 
all FASTA files in the brickyard refgenomes directory. + +Walks the brickyard directory tree, extracts structured metadata from paths +and filenames, cross-references against the PEP project, and produces a +master CSV inventory. + +Zero non-stdlib dependencies. + +Usage: + python inventory_genomes.py + python inventory_genomes.py --dry-run --no-pep + python inventory_genomes.py --root /tmp/mock_brickyard --dry-run --no-pep +""" + +import argparse +import csv +import json +import os +import os.path +import pathlib +import re +import sys +import urllib.error +import urllib.request + +BRICKYARD_ROOT = os.environ["BRICK_ROOT"] +PEP_URL = "https://pephub-api.databio.org/api/v1/projects/donaldcampbelljr/human_mouse_fasta_brickyard/samples?tag=default" +OUTPUT_FILE = os.environ.get("INVENTORY_CSV", os.path.join(BRICKYARD_ROOT, "refgenomes_inventory.csv")) +FASTA_EXTENSIONS = {".fa", ".fa.gz", ".fna", ".fna.gz", ".fasta", ".fasta.gz"} +ACCESSION_PATTERN = re.compile(r"(GC[AF]_\d+\.\d+)") + + +def fetch_pep_samples(): + """Fetch PEP samples from the PEPHub API. + + Returns a dict mapping absolute fasta path to sample_name. + Falls back to an empty dict if the API is unreachable. + """ + try: + with urllib.request.urlopen(PEP_URL) as response: + data = json.loads(response.read().decode("utf-8")) + lookup = {} + for item in data.get("items", []): + fasta_path = item.get("fasta", "") + sample_name = item.get("sample_name", "") + if fasta_path: + lookup[fasta_path] = sample_name + print(f"Fetched {len(lookup)} PEP samples.", file=sys.stderr) + return lookup + except urllib.error.URLError as e: + print(f"Warning: Could not fetch PEP samples: {e}", file=sys.stderr) + return {} + + +def walk_fasta_files(root): + """Walk the directory tree and yield absolute paths of FASTA files.""" + for dirpath, _dirnames, filenames in os.walk(root): + for name in filenames: + if any(name.endswith(ext) for ext in FASTA_EXTENSIONS): + yield os.path.join(dirpath, name) + + +def extract_metadata(filepath, root): + """Extract structured metadata from a FASTA file path. + + Returns a dict with: path, filename, accession, group, source, build. + """ + filename = os.path.basename(filepath) + match = ACCESSION_PATTERN.search(filename) + accession = match.group(1) if match else "" + + rel = os.path.relpath(filepath, root) + parts = pathlib.PurePosixPath(rel).parts + # parts[0] = group, parts[1] = source, parts[2] = build (or subdir), parts[-1] = filename + group = parts[0] if len(parts) > 1 else "" + source = parts[1] if len(parts) > 2 else "" + build = parts[2] if len(parts) > 3 else "" + + return { + "path": filepath, + "filename": filename, + "accession": accession, + "group": group, + "source": source, + "build": build, + } + + +def add_pep_info(record, pep_lookup): + """Add PEP sample name to a record if it exists in the lookup.""" + record["pep_sample_name"] = pep_lookup.get(record["path"], "") + + +def write_inventory(records, output_path): + """Write the inventory records to a CSV file.""" + fieldnames = ["path", "filename", "accession", "group", "source", "build", "pep_sample_name"] + with open(output_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(records) + print(f"Wrote {len(records)} records to {output_path}", file=sys.stderr) + + +def main(): + parser = argparse.ArgumentParser( + description="Inventory FASTA files in the brickyard refgenomes directory." 
+    )
+    parser.add_argument(
+        "--root",
+        default=BRICKYARD_ROOT,
+        help=f"Root directory to scan (default: {BRICKYARD_ROOT})",
+    )
+    parser.add_argument(
+        "--output",
+        default=None,
+        help="Output CSV path (default: <root>/refgenomes_inventory.csv)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print the first 10 rows to stdout instead of writing CSV.",
+    )
+    parser.add_argument(
+        "--no-pep",
+        action="store_true",
+        help="Skip PEP fetching (useful for offline HPC nodes).",
+    )
+    args = parser.parse_args()
+
+    root = args.root
+    output_path = args.output if args.output else os.path.join(root, "refgenomes_inventory.csv")
+
+    # Step 1: Fetch PEP samples
+    if args.no_pep:
+        pep_lookup = {}
+        print("Skipping PEP fetch (--no-pep).", file=sys.stderr)
+    else:
+        pep_lookup = fetch_pep_samples()
+
+    # Step 2: Walk and collect FASTA files
+    print(f"Scanning {root} ...", file=sys.stderr)
+    records = []
+    for filepath in walk_fasta_files(root):
+        record = extract_metadata(filepath, root)
+        add_pep_info(record, pep_lookup)
+        records.append(record)
+
+    # Step 3: Sort for deterministic output
+    records.sort(key=lambda r: r["path"])
+
+    # Step 4: Output
+    if args.dry_run:
+        fieldnames = ["path", "filename", "accession", "group", "source", "build", "pep_sample_name"]
+        writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
+        writer.writeheader()
+        for row in records[:10]:
+            writer.writerow(row)
+    else:
+        write_inventory(records, output_path)
+
+    # Step 5: Summary stats
+    total = len(records)
+    with_accession = sum(1 for r in records if r["accession"])
+    in_pep = sum(1 for r in records if r["pep_sample_name"])
+    unique_groups = len({r["group"] for r in records if r["group"]})
+    unique_sources = len({r["source"] for r in records if r["source"]})
+
+    print("\nSummary:", file=sys.stderr)
+    print(f"  Total FASTA files: {total}", file=sys.stderr)
+    print(f"  Files with accessions: {with_accession}", file=sys.stderr)
+    print(f"  Files in PEP: {in_pep}", file=sys.stderr)
+    print(f"  Unique groups: {unique_groups}", file=sys.stderr)
+    print(f"  Unique sources: {unique_sources}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/data_loaders/ref-genome-analysis/src/02_aliases/build_ncbi_alias_table.py b/data_loaders/ref-genome-analysis/src/02_aliases/build_ncbi_alias_table.py
new file mode 100644
index 0000000..4ceca3f
--- /dev/null
+++ b/data_loaders/ref-genome-analysis/src/02_aliases/build_ncbi_alias_table.py
@@ -0,0 +1,381 @@
+#!/usr/bin/env python3
+"""
+Build NCBI alias mapping table from assembly reports.
+
+Downloads NCBI assembly_report.txt files for each accession in the inventory
+CSV and parses them into a flat CSV mapping sequence names to accessions.
+
+This is Phase A of the alias registration pipeline -- it produces a standalone
+CSV with no store dependency. Needs only the inventory CSV and internet access.
+ +Usage: + python build_ncbi_alias_table.py --inventory refgenomes_inventory.csv + python build_ncbi_alias_table.py --inventory refgenomes_inventory.csv --limit 3 + python build_ncbi_alias_table.py --inventory refgenomes_inventory.csv --download-only +""" + +import argparse +import csv +import os +import re +import sys +import time +import urllib.error +import urllib.request + +BRICK_ROOT = os.environ["BRICK_ROOT"] +INVENTORY_CSV = os.environ.get("INVENTORY_CSV", f"{BRICK_ROOT}/refgenomes_inventory.csv") +STAGING_DIR = os.environ.get("STAGING", f"{BRICK_ROOT}/refget_staging") +ACCESSION_PATTERN = re.compile(r"(GC[AF]_\d+\.\d+)") +NCBI_FTP_BASE = "https://ftp.ncbi.nlm.nih.gov/genomes/all" + +OUTPUT_COLUMNS = [ + "accession", + "sequence_name", + "sequence_length", + "refseq_accn", + "genbank_accn", + "ucsc_name", + "genbank_assembly_accn", + "refseq_assembly_accn", +] + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Download NCBI assembly reports and build alias mapping table." + ) + parser.add_argument( + "--inventory", default=INVENTORY_CSV, help="Path to refgenomes_inventory.csv" + ) + parser.add_argument( + "--report-cache", + default=f"{STAGING_DIR}/assembly_reports", + help="Directory to cache downloaded assembly_report.txt files", + ) + parser.add_argument( + "--output", + default=f"{STAGING_DIR}/ncbi_alias_table.csv", + help="Output CSV path", + ) + parser.add_argument( + "--limit", type=int, default=None, help="Process only first N accessions" + ) + parser.add_argument( + "--offset", type=int, default=0, help="Skip first N accessions" + ) + parser.add_argument( + "--download-only", + action="store_true", + help="Download reports but don't parse into table", + ) + return parser.parse_args() + + +# --------------------------------------------------------------------------- +# Step A2: Read inventory and extract accessions +# --------------------------------------------------------------------------- + +def read_accessions_from_inventory(csv_path): + """Read inventory CSV and return list of (accession, filename) pairs. + + Filters to rows with a non-empty accession matching the GCF_/GCA_ pattern. + """ + pairs = [] + seen_accessions = set() + with open(csv_path, newline="") as f: + reader = csv.DictReader(f) + if reader.fieldnames is None: + print(f"ERROR: {csv_path} appears to be empty", file=sys.stderr) + sys.exit(1) + for row in reader: + accession = row.get("accession", "").strip() + filename = row.get("filename", "").strip() + if not accession or not ACCESSION_PATTERN.match(accession): + continue + if accession in seen_accessions: + continue + seen_accessions.add(accession) + pairs.append((accession, filename)) + return pairs + + +# --------------------------------------------------------------------------- +# Step A3: Construct NCBI FTP URLs from filename +# --------------------------------------------------------------------------- + +def derive_assembly_name(accession, filename): + """Derive the assembly name from the FASTA filename. + + Example: + accession = "GCF_000001405.40" + filename = "GCF_000001405.40_GRCh38.p14_genomic.fna.gz" + returns "GRCh38.p14" + + The filename pattern is: {accession}_{assembly_name}_genomic.fna[.gz] + """ + # Strip the accession prefix and _genomic.fna[.gz] suffix + prefix = accession + "_" + if not filename.startswith(prefix): + return None + rest = filename[len(prefix):] + # Remove _genomic.fna, _genomic.fna.gz, _genomic.fa.gz, etc. 
+ rest = re.sub(r"_genomic\.(fna|fa|fasta)(\.gz)?$", "", rest) + if not rest: + return None + return rest + + +def accession_to_ftp_dir(accession): + """Convert an accession to its NCBI FTP parent directory URL. + + GCF_963692335.1 -> https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/963/692/335/ + """ + match = re.match(r"(GC[AF])_(\d+)\.\d+", accession) + if not match: + return None + prefix = match.group(1) + numeric = match.group(2).zfill(9) + d1, d2, d3 = numeric[0:3], numeric[3:6], numeric[6:9] + return f"{NCBI_FTP_BASE}/{prefix}/{d1}/{d2}/{d3}/" + + +def lookup_assembly_name_from_ftp(accession): + """Scrape the NCBI FTP directory listing to find the assembly name. + + The directory contains a single subdirectory like GCF_963692335.1_fOsmEpe2.1/. + We extract the assembly name from that. + """ + dir_url = accession_to_ftp_dir(accession) + if not dir_url: + return None + try: + req = urllib.request.Request(dir_url, headers={"User-Agent": "refget-alias-builder/1.0"}) + with urllib.request.urlopen(req, timeout=15) as response: + html = response.read().decode("utf-8", errors="replace") + # Look for a link like GCF_963692335.1_fOsmEpe2.1/ + pattern = re.escape(accession) + r"_([^/\"]+)/" + m = re.search(pattern, html) + if m: + return m.group(1) + except (urllib.error.URLError, urllib.error.HTTPError, OSError): + pass + return None + + +def construct_report_url(accession, assembly_name): + """Construct the NCBI FTP URL for an assembly_report.txt. + + URL pattern: + https://ftp.ncbi.nlm.nih.gov/genomes/all/{GCF|GCA}/{d1}/{d2}/{d3}/ + {accession}_{assembly_name}/{accession}_{assembly_name}_assembly_report.txt + + Where d1/d2/d3 are 3-char chunks of the numeric part of the accession + (the digits between the underscore and the dot). + """ + dir_url = accession_to_ftp_dir(accession) + if not dir_url: + return None + stem = f"{accession}_{assembly_name}" + return f"{dir_url}{stem}/{stem}_assembly_report.txt" + + +# --------------------------------------------------------------------------- +# Step A4: Download with caching and rate limiting +# --------------------------------------------------------------------------- + +def download_report(accession, filename, cache_dir, sleep_sec=0.3): + """Download assembly_report.txt for a given accession. 
+ + Returns (cache_path, status) where status is one of: + "cached" - already existed in cache + "downloaded" - freshly downloaded + "failed" - download failed (logged to stderr) + "skipped" - could not derive assembly name from filename + """ + cache_path = os.path.join(cache_dir, f"{accession}_assembly_report.txt") + + # Check cache first + if os.path.exists(cache_path) and os.path.getsize(cache_path) > 0: + return cache_path, "cached" + + # Derive assembly name from filename, fall back to FTP directory lookup + assembly_name = derive_assembly_name(accession, filename) + if not assembly_name: + assembly_name = lookup_assembly_name_from_ftp(accession) + if assembly_name: + time.sleep(sleep_sec) # Rate limit the directory lookup too + else: + return cache_path, "skipped" + + url = construct_report_url(accession, assembly_name) + if not url: + print(f" WARNING: Cannot construct URL for {accession}", file=sys.stderr) + return cache_path, "skipped" + + # Download + try: + req = urllib.request.Request(url, headers={"User-Agent": "refget-alias-builder/1.0"}) + with urllib.request.urlopen(req, timeout=30) as response: + data = response.read() + with open(cache_path, "wb") as f: + f.write(data) + time.sleep(sleep_sec) + return cache_path, "downloaded" + except (urllib.error.URLError, urllib.error.HTTPError, OSError) as e: + print(f" FAILED: {accession} ({url}): {e}", file=sys.stderr) + return cache_path, "failed" + + +# --------------------------------------------------------------------------- +# Step A5: Parse reports into flat CSV +# --------------------------------------------------------------------------- + +def parse_assembly_report(filepath, accession): + """Parse an assembly_report.txt file into a list of row dicts. + + Returns (rows, genbank_assembly_accn, refseq_assembly_accn). 
+ """ + genbank_assembly_accn = "" + refseq_assembly_accn = "" + rows = [] + + with open(filepath, "r", errors="replace") as f: + for line in f: + line = line.rstrip("\n") + # Parse header metadata + if line.startswith("#"): + if "GenBank assembly accession:" in line: + m = ACCESSION_PATTERN.search(line) + if m: + genbank_assembly_accn = m.group(1) + elif "RefSeq assembly accession:" in line: + m = ACCESSION_PATTERN.search(line) + if m: + refseq_assembly_accn = m.group(1) + continue + + # Data rows: tab-separated, 10 columns + fields = line.split("\t") + if len(fields) < 9: + continue + + sequence_name = fields[0].strip() + genbank_accn = fields[4].strip() if len(fields) > 4 else "na" + refseq_accn = fields[6].strip() if len(fields) > 6 else "na" + sequence_length = fields[8].strip() if len(fields) > 8 else "na" + ucsc_name = fields[9].strip() if len(fields) > 9 else "na" + + # Normalize "na" to empty string + if genbank_accn == "na": + genbank_accn = "" + if refseq_accn == "na": + refseq_accn = "" + if ucsc_name == "na": + ucsc_name = "" + if sequence_length == "na": + sequence_length = "" + + rows.append({ + "accession": accession, + "sequence_name": sequence_name, + "sequence_length": sequence_length, + "refseq_accn": refseq_accn, + "genbank_accn": genbank_accn, + "ucsc_name": ucsc_name, + "genbank_assembly_accn": genbank_assembly_accn, + "refseq_assembly_accn": refseq_assembly_accn, + }) + + return rows + + +def write_alias_table(output_path, all_rows): + """Write the alias table CSV.""" + with open(output_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=OUTPUT_COLUMNS) + writer.writeheader() + writer.writerows(all_rows) + + +def main(): + args = parse_args() + + # Step A2: Read inventory and extract accessions + print(f"Reading inventory from {args.inventory}", file=sys.stderr) + pairs = read_accessions_from_inventory(args.inventory) + print(f"Found {len(pairs)} unique accessions", file=sys.stderr) + + # Apply offset and limit + if args.offset: + pairs = pairs[args.offset:] + print(f"Skipped first {args.offset} accessions", file=sys.stderr) + if args.limit: + pairs = pairs[: args.limit] + print(f"Limited to {args.limit} accessions", file=sys.stderr) + + # Create cache directory + os.makedirs(args.report_cache, exist_ok=True) + + # Step A4: Download reports + n_cached = 0 + n_downloaded = 0 + n_failed = 0 + n_skipped = 0 + downloaded_reports = [] # (accession, cache_path) + + print(f"\nDownloading assembly reports...", file=sys.stderr) + for i, (accession, filename) in enumerate(pairs, 1): + print( + f"[{i}/{len(pairs)}] {accession}...", + end=" ", + flush=True, + file=sys.stderr, + ) + cache_path, status = download_report(accession, filename, args.report_cache) + print(status, file=sys.stderr) + + if status == "cached": + n_cached += 1 + downloaded_reports.append((accession, cache_path)) + elif status == "downloaded": + n_downloaded += 1 + downloaded_reports.append((accession, cache_path)) + elif status == "failed": + n_failed += 1 + elif status == "skipped": + n_skipped += 1 + + print( + f"\nDownload summary: {n_downloaded} downloaded, {n_cached} cached, " + f"{n_failed} failed, {n_skipped} skipped", + file=sys.stderr, + ) + + if args.download_only: + print("--download-only specified, stopping before parsing.", file=sys.stderr) + return + + # Step A5: Parse reports into flat CSV + print(f"\nParsing assembly reports...", file=sys.stderr) + all_rows = [] + n_parsed = 0 + for accession, cache_path in downloaded_reports: + if not os.path.exists(cache_path) or 
os.path.getsize(cache_path) == 0: + continue + rows = parse_assembly_report(cache_path, accession) + all_rows.extend(rows) + n_parsed += 1 + + write_alias_table(args.output, all_rows) + + # Summary + print(f"\nResults:", file=sys.stderr) + print(f" Accessions processed: {len(pairs)}", file=sys.stderr) + print(f" Reports parsed: {n_parsed}", file=sys.stderr) + print(f" Total sequence rows: {len(all_rows)}", file=sys.stderr) + print(f" Output written to: {args.output}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/02_aliases/register_aliases.sbatch b/data_loaders/ref-genome-analysis/src/02_aliases/register_aliases.sbatch new file mode 100644 index 0000000..6b831e2 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/02_aliases/register_aliases.sbatch @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=ncbi_aliases +#SBATCH --output=ncbi_aliases_%j.log +#SBATCH --error=ncbi_aliases_%j.log +#SBATCH --partition=standard +#SBATCH --time=4:00:00 +#SBATCH --mem=8G +#SBATCH --cpus-per-task=1 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd $HOME/code/ref-genome-analysis +source env/on-cluster.env + +python src/02_aliases/register_ncbi_aliases.py diff --git a/data_loaders/ref-genome-analysis/src/02_aliases/register_ncbi_aliases.py b/data_loaders/ref-genome-analysis/src/02_aliases/register_ncbi_aliases.py new file mode 100644 index 0000000..f3f545b --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/02_aliases/register_ncbi_aliases.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 +""" +Register NCBI sequence and collection aliases in a RefgetStore. + +Phase B of the alias registration pipeline. Reads the ncbi_alias_table.csv +(from Phase A), matches sequences to store digests, and bulk-loads aliases +via temporary TSV files. 
+ +Usage: + python register_ncbi_aliases.py --store-path /path/to/store + python register_ncbi_aliases.py --store-path /path/to/store --dry-run + python register_ncbi_aliases.py --store-path /path/to/store --limit 5 +""" + +import argparse +import csv +import os +import sys +import tempfile +import time +from collections import defaultdict + +from refget.store import RefgetStore + +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") +INVENTORY_CSV = os.environ.get("INVENTORY_CSV", f"{BRICK_ROOT}/refgenomes_inventory.csv") +STAGING = os.environ.get("STAGING", f"{BRICK_ROOT}/refget_staging") +ALIAS_TABLE_CSV = f"{STAGING}/ncbi_alias_table.csv" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Register NCBI aliases in RefgetStore") + parser.add_argument("--store-path", default=STORE_PATH, help="Path to RefgetStore") + parser.add_argument("--alias-table", default=ALIAS_TABLE_CSV, help="Path to ncbi_alias_table.csv") + parser.add_argument("--inventory", default=INVENTORY_CSV, help="Path to refgenomes_inventory.csv") + parser.add_argument("--dry-run", action="store_true", help="Parse and match but don't register") + parser.add_argument("--limit", type=int, default=None, help="Process only first N accessions") + parser.add_argument("--offset", type=int, default=0, help="Skip first N accessions") + return parser.parse_args() + + +def read_inventory(csv_path): + """Read inventory CSV, return accession -> path mapping.""" + acc_to_path = {} + with open(csv_path, newline="") as f: + for row in csv.DictReader(f): + acc = row.get("accession", "").strip() + path = row.get("path", "").strip() + if acc and path: + acc_to_path[acc] = path + return acc_to_path + + +def read_alias_table(csv_path): + """Read alias table CSV, return accession -> list of row dicts.""" + acc_to_rows = defaultdict(list) + with open(csv_path, newline="") as f: + for row in csv.DictReader(f): + acc = row.get("accession", "").strip() + if acc: + acc_to_rows[acc].append(row) + return acc_to_rows + + +def write_tsv(path, pairs): + """Write alias\tdigest pairs to a TSV file.""" + with open(path, "w") as f: + for alias, digest in pairs: + f.write(f"{alias}\t{digest}\n") + + +def main(): + args = parse_args() + + # Read inputs + print(f"Reading inventory from {args.inventory}") + acc_to_path = read_inventory(args.inventory) + print(f" {len(acc_to_path)} accessions with paths") + + print(f"Reading alias table from {args.alias_table}") + acc_to_rows = read_alias_table(args.alias_table) + print(f" {len(acc_to_rows)} accessions, {sum(len(v) for v in acc_to_rows.values())} sequence rows") + + # Filter to accessions present in both + common_accessions = sorted(set(acc_to_path) & set(acc_to_rows)) + print(f" {len(common_accessions)} accessions in both inventory and alias table") + + if args.offset: + common_accessions = common_accessions[args.offset:] + print(f" Skipped first {args.offset}") + if args.limit: + common_accessions = common_accessions[:args.limit] + print(f" Limited to {args.limit}") + + # Open store + store = RefgetStore.on_disk(args.store_path) + store.set_quiet(True) + print(f"Store opened: {store.stats()}") + + # Accumulate all aliases in memory, then bulk-load at the end + seq_aliases = {"refseq": [], "insdc": [], "ucsc": []} + coll_aliases = {"refseq": [], "insdc": []} + + n_collections = 0 + n_matched = 0 + n_unmatched = 0 + n_skipped_files = 0 + t_start = time.time() + + for i, accession in enumerate(common_accessions, 1): + fasta_path = 
acc_to_path[accession] + alias_rows = acc_to_rows[accession] + + print(f"[{i}/{len(common_accessions)}] {accession} ({len(alias_rows)} seqs)...", end=" ", flush=True) + + # Get collection digest by loading (returns immediately if exists) + if not os.path.exists(fasta_path): + print("SKIP (file missing)") + n_skipped_files += 1 + continue + + try: + meta, was_new = store.add_sequence_collection_from_fasta(fasta_path) + except Exception as e: + print(f"SKIP ({e})") + n_skipped_files += 1 + continue + + coll_digest = meta.digest + n_collections += 1 + + # Collection-level aliases from report header + first_row = alias_rows[0] + genbank_acc = first_row.get("genbank_assembly_accn", "").strip() + refseq_acc = first_row.get("refseq_assembly_accn", "").strip() + + if refseq_acc: + coll_aliases["refseq"].append((refseq_acc, coll_digest)) + if genbank_acc: + coll_aliases["insdc"].append((genbank_acc, coll_digest)) + + # Get collection's sequences to match against alias table + level2 = store.get_collection_level2(coll_digest) + names = level2.get("names", []) + lengths = level2.get("lengths", []) + sequences = level2.get("sequences", []) + + # Build name -> (seq_digest, length) lookup + name_to_info = {} + for name, length, seq_digest in zip(names, lengths, sequences): + name_to_info[name] = (seq_digest, int(length)) + + # Match alias table rows to store sequences + matched_this = 0 + unmatched_this = 0 + for row in alias_rows: + seq_name = row.get("sequence_name", "").strip() + seq_length_str = row.get("sequence_length", "").strip() + refseq_accn = row.get("refseq_accn", "").strip() + genbank_accn = row.get("genbank_accn", "").strip() + ucsc_name = row.get("ucsc_name", "").strip() + + seq_length = int(seq_length_str) if seq_length_str else None + + # Try matching by sequence_name, then refseq_accn, then genbank_accn, then ucsc_name + seq_digest = None + for candidate in [seq_name, refseq_accn, genbank_accn, ucsc_name]: + if candidate and candidate in name_to_info: + store_digest, store_length = name_to_info[candidate] + if seq_length is None or store_length == seq_length: + seq_digest = store_digest + break + + if seq_digest is None: + unmatched_this += 1 + continue + + matched_this += 1 + + if refseq_accn: + seq_aliases["refseq"].append((refseq_accn, seq_digest)) + if genbank_accn: + seq_aliases["insdc"].append((genbank_accn, seq_digest)) + if ucsc_name: + seq_aliases["ucsc"].append((ucsc_name, seq_digest)) + + n_matched += matched_this + n_unmatched += unmatched_this + print(f"{coll_digest[:12]}... 
{matched_this}/{len(alias_rows)} matched")
+
+    match_elapsed = time.time() - t_start
+
+    # Summary of what was collected
+    n_seq_aliases = sum(len(v) for v in seq_aliases.values())
+    n_coll_aliases = sum(len(v) for v in coll_aliases.values())
+    print(f"\nMatching done in {match_elapsed:.1f}s")
+    print(f"  Collections: {n_collections}, skipped: {n_skipped_files}")
+    print(f"  Sequences matched: {n_matched}, unmatched: {n_unmatched}")
+    print(f"  Sequence aliases to register: {n_seq_aliases}")
+    print(f"  Collection aliases to register: {n_coll_aliases}")
+
+    if args.dry_run:
+        print("\n[DRY RUN] Skipping alias registration.")
+        return
+
+    # Bulk-load aliases via temp TSV files
+    print("\nRegistering aliases...")
+    with tempfile.TemporaryDirectory() as tmpdir:
+        for namespace, pairs in seq_aliases.items():
+            if not pairs:
+                continue
+            tsv_path = os.path.join(tmpdir, f"seq_{namespace}.tsv")
+            write_tsv(tsv_path, pairs)
+            n = store.load_sequence_aliases(namespace, tsv_path)
+            print(f"  sequences/{namespace}: {n} aliases loaded")
+
+        for namespace, pairs in coll_aliases.items():
+            if not pairs:
+                continue
+            tsv_path = os.path.join(tmpdir, f"coll_{namespace}.tsv")
+            write_tsv(tsv_path, pairs)
+            n = store.load_collection_aliases(namespace, tsv_path)
+            print(f"  collections/{namespace}: {n} aliases loaded")
+
+    total_elapsed = time.time() - t_start
+    print(f"\nDone in {total_elapsed:.1f}s")
+    print(f"  Store stats: {store.stats()}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.py b/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.py
new file mode 100644
index 0000000..f4c2487
--- /dev/null
+++ b/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""
+Build a complete digest_map.csv from the inventory CSV.
+
+For each FASTA in the inventory, reads the seqcol digest from the .rgsi cache
+file next to it (fast -- just scans the small .rgsi index, no FASTA parsing).
+FASTAs with no .rgsi cache are skipped, since they were never successfully
+loaded into the store.
+
+Outputs: $STAGING/digest_map.csv with columns:
+    path, filename, digest, n_sequences, group
+
+Usage:
+    python src/02_build/build_digest_map.py
+    python src/02_build/build_digest_map.py --dry-run
+"""
+
+import argparse
+import csv
+import os
+import re
+import sys
+import time
+
+BRICK_ROOT = os.environ["BRICK_ROOT"]
+STAGING = os.environ.get("STAGING", os.path.join(BRICK_ROOT, "refget_staging"))
+INVENTORY_CSV = os.environ.get("INVENTORY_CSV", os.path.join(BRICK_ROOT, "refgenomes_inventory.csv"))
+OUTPUT_CSV = os.path.join(STAGING, "digest_map.csv")
+
+# Pattern to strip FASTA extensions and get the RGSI path
+FASTA_EXTS = re.compile(r'\.(fa|fasta|fna)(\.gz)?$')
+
+
+def rgsi_path_for(fasta_path: str) -> str:
+    """Get the .rgsi cache path for a FASTA file."""
+    return FASTA_EXTS.sub('.rgsi', fasta_path)
+
+
+def read_rgsi_digest(rgsi_path: str) -> tuple[str, int] | None:
+    """Read seqcol digest and sequence count from an .rgsi file.
+
+    Returns (digest, n_sequences) or None if file doesn't exist or is malformed.
+ """ + if not os.path.exists(rgsi_path): + return None + digest = None + n_sequences = 0 + with open(rgsi_path) as f: + for line in f: + if line.startswith("##seqcol_digest="): + digest = line.strip().split("=", 1)[1] + elif not line.startswith("#"): + n_sequences += 1 + if digest: + return digest, n_sequences + return None + + +def build_digest_map(inventory_path: str, output_path: str, dry_run: bool = False): + with open(inventory_path) as f: + rows = list(csv.DictReader(f)) + + total = len(rows) + print(f"Inventory: {total} FASTAs from {inventory_path}") + + if dry_run: + # Just count how many have .rgsi files + have_rgsi = sum(1 for r in rows if os.path.exists(rgsi_path_for(r["path"]))) + print(f"FASTAs with .rgsi cache: {have_rgsi}/{total}") + print("--dry-run: stopping here.") + return + + results = [] + from_cache = 0 + skipped = 0 + t0 = time.time() + + for i, row in enumerate(rows, 1): + fasta_path = row["path"] + group = row.get("group", "") + filename = row.get("filename", os.path.basename(fasta_path)) + + # Try .rgsi cache first + rgsi = rgsi_path_for(fasta_path) + cached = read_rgsi_digest(rgsi) + if cached: + digest, n_sequences = cached + from_cache += 1 + results.append({ + "path": fasta_path, + "filename": filename, + "digest": digest, + "n_sequences": n_sequences, + "group": group, + }) + print(f" [{i}/{total}] (cache) {group}/{filename} -> {digest}") + continue + + # No cache — skip (these FASTAs were never successfully loaded) + print(f" [{i}/{total}] NO CACHE: {group}/{filename}", file=sys.stderr) + skipped += 1 + + elapsed = time.time() - t0 + + # Write output + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=["path", "filename", "digest", "n_sequences", "group"]) + writer.writeheader() + writer.writerows(results) + + print(f"\nDone in {elapsed:.1f}s") + print(f" Written: {len(results)} entries to {output_path}") + print(f" From cache: {from_cache}") + print(f" No cache: {skipped}") + + # Summary by group + from collections import Counter + group_counts = Counter(r["group"] for r in results) + print(f"\nBy group:") + for group, count in sorted(group_counts.items(), key=lambda x: -x[1]): + print(f" {group}: {count}") + + +def main(): + parser = argparse.ArgumentParser(description="Build complete digest_map.csv from inventory.") + parser.add_argument("--inventory", default=INVENTORY_CSV) + parser.add_argument("--output", default=OUTPUT_CSV) + parser.add_argument("--dry-run", action="store_true") + args = parser.parse_args() + + build_digest_map(args.inventory, args.output, args.dry_run) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.sbatch b/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.sbatch new file mode 100644 index 0000000..523f1f7 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.sbatch @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=build_digest_map +#SBATCH --output=build_digest_map_%j.log +#SBATCH --error=build_digest_map_%j.log +#SBATCH --partition=standard +#SBATCH --time=4:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=1 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd $HOME/code/ref-genome-analysis +source env/on-cluster.env + +python src/02_build/build_digest_map.py diff --git a/data_loaders/ref-genome-analysis/src/03_fhr/batch_generate_fhr.py b/data_loaders/ref-genome-analysis/src/03_fhr/batch_generate_fhr.py new 
file mode 100755 index 0000000..63e3d81 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/03_fhr/batch_generate_fhr.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +"""Batch-generate FHR metadata for all VGP vertebrate genomes. + +Reads the inventory CSV, extracts unique GCA accessions for vertebrate genomes, +and fetches FHR metadata from NCBI for each. Skips accessions that already have +an FHR file in the output directory, so it's safe to re-run. + +Usage: + python batch_generate_fhr.py --inventory /path/to/inventory.csv --output-dir /path/to/fhr_metadata/ + python batch_generate_fhr.py --inventory /path/to/inventory.csv --output-dir /path/to/fhr_metadata/ --group vertebrates +""" + +import argparse +import csv +import re +import sys +import os +import time + +from genomeark_to_fhr import process_accession + + +def main(): + parser = argparse.ArgumentParser(description="Batch-generate FHR metadata from inventory CSV") + parser.add_argument("--inventory", required=True, help="Path to refgenomes_inventory.csv") + parser.add_argument("--output-dir", required=True, help="Output directory for .fhr.json files") + parser.add_argument("--group", default="vertebrates", help="Filter by group column (default: vertebrates)") + parser.add_argument("--limit", type=int, default=None, help="Process only first N accessions") + parser.add_argument("--skip-genomeark", action="store_true", help="Skip GenomeArk YAML fetch (faster)") + args = parser.parse_args() + + # Read inventory and extract unique accessions for the target group + with open(args.inventory, newline="") as f: + rows = list(csv.DictReader(f)) + + accessions = set() + for row in rows: + if row.get("group", "").strip() != args.group: + continue + acc = row.get("accession", "").strip() + if not acc: + m = re.search(r'(GCA_\d+(?:\.\d+)?)', row.get("filename", "")) + if m: + acc = m.group(1) + if acc: + accessions.add(acc) + + accessions = sorted(accessions) + if args.limit: + accessions = accessions[:args.limit] + + # Check which ones already exist + os.makedirs(args.output_dir, exist_ok=True) + existing = {f.replace(".fhr.json", "") for f in os.listdir(args.output_dir) if f.endswith(".fhr.json")} + todo = [a for a in accessions if a not in existing] + + print(f"Group: {args.group}", file=sys.stderr) + print(f"Total accessions: {len(accessions)}", file=sys.stderr) + print(f"Already done: {len(accessions) - len(todo)}", file=sys.stderr) + print(f"To process: {len(todo)}", file=sys.stderr) + + if not todo: + print("Nothing to do!", file=sys.stderr) + return + + n_ok = 0 + n_fail = 0 + t_start = time.time() + + for i, acc in enumerate(todo, 1): + output_path = os.path.join(args.output_dir, f"{acc}.fhr.json") + ok = False + for attempt in range(3): + try: + print(f"[{i}/{len(todo)}] ", end="", file=sys.stderr) + process_accession(acc, output_path) + n_ok += 1 + ok = True + break + except Exception as e: + if "429" in str(e) and attempt < 2: + wait = 5 * (attempt + 1) + print(f"[{i}/{len(todo)}] {acc}: rate limited, waiting {wait}s...", file=sys.stderr) + time.sleep(wait) + else: + print(f"[{i}/{len(todo)}] {acc}: FAILED ({e})", file=sys.stderr) + n_fail += 1 + break + + # Throttle to ~3 requests/sec (NCBI + GenomeArk = 2 requests per accession) + time.sleep(0.3) + + elapsed = time.time() - t_start + print(f"\nDone in {elapsed:.0f}s: {n_ok} OK, {n_fail} failed out of {len(todo)}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/03_fhr/genomeark_to_fhr.py 
b/data_loaders/ref-genome-analysis/src/03_fhr/genomeark_to_fhr.py new file mode 100755 index 0000000..e8a1e72 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/03_fhr/genomeark_to_fhr.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +"""Generate FHR metadata JSON files from GenomeArk + NCBI Datasets API. + +Given a GCA accession, fetches: + 1. Assembly metadata from NCBI Datasets API (taxonomy, stats, sequencing tech) + 2. Species metadata from GenomeArk GitHub repo (common name, genome size, project) + +Outputs an FHR-compatible JSON file that can be loaded into a RefgetStore via +store.load_fhr_metadata(digest, path). + +Usage: + python genomeark_to_fhr.py GCA_964261635.1 [output.fhr.json] + python genomeark_to_fhr.py GCA_964261635.1 GCA_964263255.1 # multiple accessions +""" + +import json +import sys +import urllib.request +from pathlib import Path + + +def fetch_ncbi_report(accession: str) -> dict: + """Fetch assembly report from NCBI Datasets API.""" + url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{accession}/dataset_report" + req = urllib.request.Request(url, headers={"Accept": "application/json"}) + with urllib.request.urlopen(req) as resp: + data = json.loads(resp.read()) + reports = data.get("reports", []) + if not reports: + raise ValueError(f"No assembly report found for {accession}") + return reports[0] + + +def fetch_genomeark_yaml(species_name: str) -> dict | None: + """Fetch species YAML from genomeark-metadata GitHub repo.""" + filename = species_name.replace(" ", "_") + url = f"https://raw.githubusercontent.com/genomeark/genomeark-metadata/main/species/{filename}.yaml" + try: + import yaml + except ImportError: + # Fall back to basic parsing if PyYAML not available + try: + with urllib.request.urlopen(url) as resp: + text = resp.read().decode() + # Basic extraction without full YAML parsing + result = {"_raw": text} + for line in text.split("\n"): + line = line.strip() + if line.startswith("common_name:"): + result["common_name"] = line.split(":", 1)[1].strip().strip("'\"") + elif line.startswith("genome_size:"): + try: + result["genome_size"] = int(line.split(":", 1)[1].strip()) + except ValueError: + pass + elif line.startswith("project:"): + result["project"] = line.split(":", 1)[1].strip() + return result + except Exception: + return None + + try: + with urllib.request.urlopen(url) as resp: + return yaml.safe_load(resp.read()) + except Exception: + return None + + +def ncbi_to_fhr(report: dict, genomeark: dict | None = None) -> dict: + """Convert NCBI assembly report + GenomeArk data to FHR metadata.""" + organism = report.get("organism", {}) + assembly = report.get("assembly_info", {}) + stats = report.get("assembly_stats", {}) + + species_name = organism.get("organism_name", "") + tax_id = organism.get("tax_id") + common_name = organism.get("common_name", "") + + # GenomeArk may have a better common name + if genomeark: + species = genomeark.get("species", genomeark) + common_name = common_name or species.get("common_name", "") + + fhr = { + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1, + "genome": species_name, + "version": assembly.get("assembly_name", ""), + "dateCreated": assembly.get("release_date", ""), + } + + # Taxonomy + if tax_id: + fhr["taxon"] = { + "name": species_name, + "uri": f"https://identifiers.org/taxonomy:{tax_id}", + } + + # Common name as synonym + if common_name: + fhr["genomeSynonym"] = [common_name] + + # Accession + accession = 
report.get("accession", "") + if accession: + fhr["accessionID"] = { + "name": accession, + "url": f"https://www.ncbi.nlm.nih.gov/datasets/genome/{accession}/", + } + + # Submitter as assembly author + submitter = assembly.get("submitter", "") + if submitter: + fhr["assemblyAuthor"] = [{"name": submitter}] + + # Sequencing technology + seq_tech = assembly.get("sequencing_tech", "") + if seq_tech: + fhr["instrument"] = [t.strip() for t in seq_tech.split(",")] + + # Assembly method + method = assembly.get("assembly_method", "") + if method and method != "various": + fhr["assemblySoftware"] = method + + # Vital statistics + vital = {} + if stats.get("contig_n50"): + vital["N50"] = stats["contig_n50"] + if stats.get("contig_l50"): + vital["L50"] = stats["contig_l50"] + if stats.get("total_sequence_length"): + vital["totalBasePairs"] = int(stats["total_sequence_length"]) + if stats.get("number_of_contigs"): + vital["numberContigs"] = stats["number_of_contigs"] + if stats.get("number_of_scaffolds"): + vital["numberScaffolds"] = stats["number_of_scaffolds"] + if stats.get("scaffold_n50"): + vital["scaffoldN50"] = stats["scaffold_n50"] + if vital: + fhr["vitalStats"] = vital + + # Related links + links = [] + links.append(f"https://www.genomeark.org/genomeark-all/{species_name.replace(' ', '_')}.html") + if accession: + links.append(f"https://www.ncbi.nlm.nih.gov/datasets/genome/{accession}/") + fhr["relatedLink"] = links + + # BioProject lineage — note VGP/DToL/EBP affiliations + projects = [] + for lineage in assembly.get("bioproject_lineage", []): + for bp in lineage.get("bioprojects", []): + title = bp.get("title", "") + if any(kw in title.lower() for kw in ["vertebrate genomes", "darwin tree", "earth biogenome"]): + projects.append(title) + if projects: + fhr["documentation"] = "Projects: " + "; ".join(projects) + + # License + fhr["license"] = "https://www.genomeark.org/documentation/data-use-policy.html" + + return fhr + + +def process_accession(accession: str, output_path: str | None = None) -> str: + """Process a single accession and write FHR JSON.""" + print(f"Fetching NCBI report for {accession}...", file=sys.stderr) + report = fetch_ncbi_report(accession) + + species_name = report.get("organism", {}).get("organism_name", "") + print(f" Species: {species_name}", file=sys.stderr) + + print(f" Fetching GenomeArk metadata...", file=sys.stderr) + genomeark = fetch_genomeark_yaml(species_name) if species_name else None + + fhr = ncbi_to_fhr(report, genomeark) + + if output_path is None: + output_path = f"{accession}.fhr.json" + + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w") as f: + json.dump(fhr, f, indent=2) + + print(f" Wrote: {output_path}", file=sys.stderr) + return output_path + + +def main(): + if len(sys.argv) < 2: + print("Usage: genomeark_to_fhr.py [accession2 ...] 
[--output-dir DIR]") + print(" genomeark_to_fhr.py GCA_964261635.1") + print(" genomeark_to_fhr.py GCA_964261635.1 GCA_964263255.1 --output-dir fhr/") + sys.exit(1) + + args = sys.argv[1:] + output_dir = None + + if "--output-dir" in args: + idx = args.index("--output-dir") + output_dir = args[idx + 1] + args = args[:idx] + args[idx + 2:] + + for accession in args: + if output_dir: + output_path = f"{output_dir}/{accession}.fhr.json" + else: + output_path = f"{accession}.fhr.json" + process_accession(accession, output_path) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/03_fhr/load_fhr_metadata.py b/data_loaders/ref-genome-analysis/src/03_fhr/load_fhr_metadata.py new file mode 100644 index 0000000..78a9bfd --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/03_fhr/load_fhr_metadata.py @@ -0,0 +1,97 @@ +""" +Load FHR metadata JSON files into an existing RefgetStore. + +Resolves accessions to collection digests via the store's 'insdc' alias +namespace. Strips vitalStats before loading, since those describe the source +assembly, not the specific sequence collection. + +Usage: + python load_fhr_metadata.py --store-path /path/to/store --fhr-dir fhr_metadata/ + python load_fhr_metadata.py --store-path /path/to/store --fhr file.fhr.json --digest abc123 +""" + +import argparse +import glob +import json +import os +import sys +import tempfile + +from gtars.refget import RefgetStore + + +def strip_vital_stats(fhr_path): + """Write a temp FHR file with vitalStats removed. Returns temp path.""" + with open(fhr_path) as f: + fhr_data = json.load(f) + provenance = {k: v for k, v in fhr_data.items() if k != "vitalStats"} + tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".fhr.json", delete=False) + json.dump(provenance, tmp, indent=2) + tmp.close() + return tmp.name + + +def load_fhr_dir(store, fhr_dir, namespaces=("insdc", "refseq")): + """Load all .fhr.json files, resolving accession -> digest via alias namespaces.""" + fhr_files = sorted(glob.glob(os.path.join(fhr_dir, "*.fhr.json"))) + if not fhr_files: + print(f"No .fhr.json files found in {fhr_dir}", file=sys.stderr) + return + + print(f"Loading {len(fhr_files)} FHR files, resolving via {namespaces} aliases...", file=sys.stderr) + + n_loaded = 0 + n_skipped = 0 + for fhr_path in fhr_files: + basename = os.path.basename(fhr_path) + accession = basename.replace(".fhr.json", "") + + meta = None + for ns in namespaces: + meta = store.get_collection_metadata_by_alias(ns, accession) + if meta is not None: + break + + if meta is None: + n_skipped += 1 + continue + + tmp_path = strip_vital_stats(fhr_path) + try: + store.load_fhr_metadata(meta.digest, tmp_path) + finally: + os.unlink(tmp_path) + n_loaded += 1 + + if n_loaded % 100 == 0: + print(f" ... 
{n_loaded} loaded", file=sys.stderr) + + print(f"\nLoaded {n_loaded}, skipped {n_skipped} (no alias match)", file=sys.stderr) + + +def main(): + parser = argparse.ArgumentParser(description="Load FHR metadata into RefgetStore") + parser.add_argument("--store-path", required=True, help="Path to RefgetStore") + parser.add_argument("--fhr-dir", help="Directory of .fhr.json files") + parser.add_argument("--fhr", help="Single .fhr.json file") + parser.add_argument("--digest", help="Collection digest (required with --fhr)") + args = parser.parse_args() + + store = RefgetStore.on_disk(args.store_path) + + if args.fhr_dir: + load_fhr_dir(store, args.fhr_dir) + elif args.fhr and args.digest: + tmp_path = strip_vital_stats(args.fhr) + try: + store.load_fhr_metadata(args.digest, tmp_path) + finally: + os.unlink(tmp_path) + print(f"Loaded {args.fhr} -> {args.digest}", file=sys.stderr) + else: + print("Provide --fhr-dir or --fhr + --digest", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_000001405.29.fhr.json b/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_000001405.29.fhr.json new file mode 100755 index 0000000..4c3b322 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_000001405.29.fhr.json @@ -0,0 +1,36 @@ +{ + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1, + "genome": "Homo sapiens", + "version": "GRCh38.p14", + "dateCreated": "2022-02-03", + "taxon": { + "name": "Homo sapiens", + "uri": "https://identifiers.org/taxonomy:9606" + }, + "genomeSynonym": [ + "human" + ], + "accessionID": { + "name": "GCA_000001405.29", + "url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_000001405.29/" + }, + "assemblyAuthor": [ + { + "name": "Genome Reference Consortium" + } + ], + "vitalStats": { + "N50": 57879411, + "L50": 18, + "totalBasePairs": 3099734149, + "numberContigs": 999, + "numberScaffolds": 473, + "scaffoldN50": 67794873 + }, + "relatedLink": [ + "https://www.genomeark.org/genomeark-all/Homo_sapiens.html", + "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_000001405.29/" + ], + "license": "https://www.genomeark.org/documentation/data-use-policy.html" +} \ No newline at end of file diff --git a/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_000001405.fhr.json b/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_000001405.fhr.json new file mode 100755 index 0000000..4c3b322 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_000001405.fhr.json @@ -0,0 +1,36 @@ +{ + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1, + "genome": "Homo sapiens", + "version": "GRCh38.p14", + "dateCreated": "2022-02-03", + "taxon": { + "name": "Homo sapiens", + "uri": "https://identifiers.org/taxonomy:9606" + }, + "genomeSynonym": [ + "human" + ], + "accessionID": { + "name": "GCA_000001405.29", + "url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_000001405.29/" + }, + "assemblyAuthor": [ + { + "name": "Genome Reference Consortium" + } + ], + "vitalStats": { + "N50": 57879411, + "L50": 18, + "totalBasePairs": 3099734149, + "numberContigs": 999, + "numberScaffolds": 473, + "scaffoldN50": 67794873 + }, + "relatedLink": [ + "https://www.genomeark.org/genomeark-all/Homo_sapiens.html", + "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_000001405.29/" + ], + "license": 
"https://www.genomeark.org/documentation/data-use-policy.html" +} \ No newline at end of file diff --git a/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_964261635.1.fhr.json b/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_964261635.1.fhr.json new file mode 100644 index 0000000..37fdd6f --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_964261635.1.fhr.json @@ -0,0 +1,41 @@ +{ + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1, + "genome": "Lissotriton helveticus", + "version": "aLisHel1.1", + "dateCreated": "2024-10-17", + "taxon": { + "name": "Lissotriton helveticus", + "uri": "https://identifiers.org/taxonomy:256425" + }, + "genomeSynonym": [ + "palmate newt" + ], + "accessionID": { + "name": "GCA_964261635.1", + "url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_964261635.1/" + }, + "assemblyAuthor": [ + { + "name": "WELLCOME SANGER INSTITUTE" + } + ], + "instrument": [ + "PacBio", + "Arima2" + ], + "vitalStats": { + "N50": 7795245, + "L50": 941, + "totalBasePairs": 23170028842, + "numberContigs": 5693, + "numberScaffolds": 448, + "scaffoldN50": 2132484007 + }, + "relatedLink": [ + "https://www.genomeark.org/genomeark-all/Lissotriton_helveticus.html", + "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_964261635.1/" + ], + "documentation": "Projects: Vertebrate Genomes Project; Darwin Tree of Life Project: Genome Data and Assemblies; Earth BioGenome Project (EBP)", + "license": "https://www.genomeark.org/documentation/data-use-policy.html" +} \ No newline at end of file diff --git a/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_964263255.1.fhr.json b/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_964263255.1.fhr.json new file mode 100644 index 0000000..e8a6bda --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_964263255.1.fhr.json @@ -0,0 +1,41 @@ +{ + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1, + "genome": "Lissotriton vulgaris", + "version": "aLisVul1.1", + "dateCreated": "2024-10-17", + "taxon": { + "name": "Lissotriton vulgaris", + "uri": "https://identifiers.org/taxonomy:8324" + }, + "genomeSynonym": [ + "common newt" + ], + "accessionID": { + "name": "GCA_964263255.1", + "url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_964263255.1/" + }, + "assemblyAuthor": [ + { + "name": "WELLCOME SANGER INSTITUTE" + } + ], + "instrument": [ + "PacBio", + "Arima2" + ], + "vitalStats": { + "N50": 6568731, + "L50": 1102, + "totalBasePairs": 24226223864, + "numberContigs": 19295, + "numberScaffolds": 15265, + "scaffoldN50": 1925992481 + }, + "relatedLink": [ + "https://www.genomeark.org/genomeark-all/Lissotriton_vulgaris.html", + "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_964263255.1/" + ], + "documentation": "Projects: Vertebrate Genomes Project; Darwin Tree of Life Project: Genome Data and Assemblies; Earth BioGenome Project (EBP)", + "license": "https://www.genomeark.org/documentation/data-use-policy.html" +} \ No newline at end of file diff --git a/data_loaders/ref-genome-analysis/src/04_verify/verify_refgetstore.py b/data_loaders/ref-genome-analysis/src/04_verify/verify_refgetstore.py new file mode 100644 index 0000000..b7b4686 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/04_verify/verify_refgetstore.py @@ -0,0 +1,447 @@ +#!/usr/bin/env python3 +""" +Verification script for the brickyard RefgetStore. 
+ +Runs automated checks against the store at STORE_PATH and produces a +structured pass/fail report. Designed to work with a partial store +(not all files loaded yet) and without aliases (alias registration +has not been done yet). + +Usage: + python verify_refgetstore.py + python verify_refgetstore.py --store-path /alt/path --limit 5 + +Expected results (update after first successful run): +- collections: ~XXX unique (out of ~1,147 input FASTAs processed so far) +- sequences: ~XXX unique +- roundtrip digest match: PASS for at least one collection +""" + +import argparse +import csv +import json +import os +import subprocess +import sys +import tempfile +import time + +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") +INVENTORY_CSV = os.environ.get("INVENTORY_CSV", f"{BRICK_ROOT}/refgenomes_inventory.csv") +STAGING = os.environ.get("STAGING", f"{BRICK_ROOT}/refget_staging") +DIGEST_MAP_CSV = f"{STAGING}/digest_map.csv" + +results = [] + + +def check(name, passed, detail=""): + """Record and print a check result.""" + status = "PASS" if passed else "FAIL" + results.append({"name": name, "status": status, "detail": detail}) + print(f"[{status}] {name}" + (f" -- {detail}" if detail else "")) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Verify brickyard RefgetStore") + parser.add_argument("--store-path", default=STORE_PATH, help="RefgetStore path") + parser.add_argument("--inventory", default=INVENTORY_CSV, help="Inventory CSV path") + parser.add_argument("--digest-map", default=DIGEST_MAP_CSV, help="Digest map CSV path") + parser.add_argument( + "--limit", + type=int, + default=3, + help="Number of collections to test for round-trip export (default: 3)", + ) + parser.add_argument( + "--skip-roundtrip", + action="store_true", + help="Skip round-trip FASTA export checks (slow for large genomes)", + ) + return parser.parse_args() + + +# ── Check 1: Store opens and stats are valid ─────────────────────────── + + +def check_store_opens(store_path): + """Open the store and verify basic stats.""" + try: + from refget.store import RefgetStore + + store = RefgetStore.open_local(store_path) + check("store_opens", True, f"path={store_path}") + except Exception as e: + check("store_opens", False, f"path={store_path}, error={e}") + return None + + # Count collections and sequences + try: + collections = list(store.list_collections()["results"]) + n_collections = len(collections) + except Exception as e: + check("list_collections", False, f"error={e}") + n_collections = 0 + + try: + sequences = list(store.list_sequences()) + n_sequences = len(sequences) + except Exception as e: + check("list_sequences", False, f"error={e}") + n_sequences = 0 + + check("collections_nonzero", n_collections > 0, f"collections={n_collections}") + check("sequences_nonzero", n_sequences > 0, f"sequences={n_sequences}") + + # Stats object + try: + stats = store.stats() + check("stats_callable", True, f"stats={stats}") + except Exception as e: + check("stats_callable", False, f"error={e}") + + return store + + +# ── Check 2: Digest map coverage ────────────────────────────────────── + + +def check_digest_map(store, digest_map_path): + """Verify that digests in the digest map are present in the store.""" + if not os.path.exists(digest_map_path): + check("digest_map_exists", False, f"not found: {digest_map_path}") + return + + check("digest_map_exists", True, f"path={digest_map_path}") + + # Read digest map + rows = [] + with 
open(digest_map_path, newline="") as f: + reader = csv.DictReader(f) + for row in reader: + rows.append(row) + + total = len(rows) + with_digest = [r for r in rows if r.get("digest")] + with_error = [r for r in rows if r.get("error")] + + check( + "digest_map_stats", + len(with_digest) > 0, + f"total_rows={total}, with_digest={len(with_digest)}, with_error={len(with_error)}", + ) + + # Get store collection digests for comparison + store_digests = {meta.digest for meta in store.list_collections()["results"]} + + # Check how many digest_map digests are in the store + matched = 0 + missing = [] + for row in with_digest: + d = row["digest"] + if d in store_digests: + matched += 1 + else: + missing.append(d[:16] + "...") + + check( + "digest_map_coverage", + matched == len(with_digest), + f"in_store={matched}/{len(with_digest)}" + + (f", missing_sample={missing[:5]}" if missing else ""), + ) + + +# ── Check 3: Collection level2 data integrity ───────────────────────── + + +def check_level2_integrity(store, n_to_check=3): + """Verify level2 data for a sample of collections.""" + collections = list(store.list_collections()["results"]) + if not collections: + check("level2_integrity", False, "no collections to check") + return + + sample = collections[:n_to_check] + all_ok = True + details = [] + + for meta in sample: + digest = meta.digest + try: + level2 = store.get_collection_level2(digest) + names = level2.get("names", []) + lengths = level2.get("lengths", []) + sequences = level2.get("sequences", []) + + arrays_ok = ( + len(names) == len(lengths) == len(sequences) and len(names) > 0 + ) + lengths_ok = all(l > 0 for l in lengths) if lengths else False + + if not arrays_ok or not lengths_ok: + all_ok = False + details.append( + f"{digest[:16]}: names={len(names)} lengths={len(lengths)} " + f"sequences={len(sequences)} lengths_positive={lengths_ok}" + ) + else: + details.append( + f"{digest[:16]}: {len(names)} seqs, OK" + ) + except Exception as e: + all_ok = False + details.append(f"{digest[:16]}: ERROR {e}") + + check( + "level2_arrays_valid", + all_ok, + f"checked={len(sample)}, results=[{'; '.join(details)}]", + ) + + +# ── Check 4: Round-trip FASTA export and digest comparison ───────────── + + +def check_roundtrip_export(store, store_path, digest_map_path, inventory_path, limit=3): + """Export FASTAs from the store and compare digests to originals.""" + try: + from gtars.refget import digest_fasta + except ImportError: + check("roundtrip_export", False, "gtars.refget.digest_fasta not available") + return + + # Build a mapping from digest -> original path using digest_map + inventory + digest_to_original = {} + + if os.path.exists(digest_map_path) and os.path.exists(inventory_path): + # Read inventory to get path -> accession mapping (for reference) + inv_lookup = {} + with open(inventory_path, newline="") as f: + for row in csv.DictReader(f): + inv_lookup[row["path"]] = row + + # Read digest_map to get digest -> path mapping + with open(digest_map_path, newline="") as f: + for row in csv.DictReader(f): + if row.get("digest") and row.get("path"): + # Only keep the first mapping per digest (avoid duplicates) + if row["digest"] not in digest_to_original: + digest_to_original[row["digest"]] = row["path"] + + if not digest_to_original: + check("roundtrip_export", False, "no digest-to-path mappings found") + return + + # Pick a sample of collections that have original files + collections = list(store.list_collections()["results"]) + test_pairs = [] + for meta in collections: + if meta.digest in 
digest_to_original: + original_path = digest_to_original[meta.digest] + if os.path.exists(original_path): + test_pairs.append((meta.digest, original_path)) + if len(test_pairs) >= limit: + break + + if not test_pairs: + check("roundtrip_export", False, "no original FASTA files accessible for comparison") + return + + all_match = True + details = [] + + for digest, original_path in test_pairs: + fd, tmp_path = tempfile.mkstemp(suffix=".fa") + os.close(fd) + try: + store.export_fasta(digest, tmp_path, None, 80) + + exported_sc = digest_fasta(tmp_path) + original_sc = digest_fasta(original_path) + + match = exported_sc.digest == original_sc.digest + if not match: + all_match = False + basename = os.path.basename(original_path) + details.append( + f"{basename}: {'MATCH' if match else 'MISMATCH'} " + f"(exported={exported_sc.digest[:16]}... " + f"original={original_sc.digest[:16]}...)" + ) + except Exception as e: + all_match = False + basename = os.path.basename(original_path) + details.append(f"{basename}: ERROR {e}") + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + check( + "roundtrip_digest_match", + all_match, + f"tested={len(test_pairs)}, results=[{'; '.join(details)}]", + ) + + +# ── Check 5: CLI stats command works ────────────────────────────────── + + +def check_cli_stats(store_path): + """Verify the CLI stats command runs against the store.""" + try: + result = subprocess.run( + ["refget", "store", "stats", "--path", store_path], + capture_output=True, + text=True, + timeout=60, + ) + if result.returncode == 0: + check("cli_stats_runs", True, f"stdout={result.stdout.strip()[:200]}") + else: + check( + "cli_stats_runs", + False, + f"returncode={result.returncode}, stderr={result.stderr.strip()[:200]}", + ) + except FileNotFoundError: + check("cli_stats_runs", False, "refget CLI not found in PATH") + except subprocess.TimeoutExpired: + check("cli_stats_runs", False, "timed out after 60s") + except Exception as e: + check("cli_stats_runs", False, f"error={e}") + + +# ── Check 6: Inventory cross-reference ──────────────────────────────── + + +def check_inventory_crossref(store, inventory_path, digest_map_path): + """Cross-check inventory against digest_map to verify completeness.""" + if not os.path.exists(inventory_path): + check("inventory_exists", False, f"not found: {inventory_path}") + return + if not os.path.exists(digest_map_path): + check("inventory_crossref", False, f"digest_map not found: {digest_map_path}") + return + + # Count inventory rows + with open(inventory_path, newline="") as f: + inv_rows = list(csv.DictReader(f)) + + # Count digest_map rows + with open(digest_map_path, newline="") as f: + dm_rows = list(csv.DictReader(f)) + + inv_paths = {r["path"] for r in inv_rows} + dm_paths = {r["path"] for r in dm_rows} + + # How many inventory files have been processed? 
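+ # Exact string match on the 'path' column: a file renamed or re-rooted in either CSV is counted as unprocessed here.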
+ processed = inv_paths & dm_paths + unprocessed = inv_paths - dm_paths + + check( + "inventory_processing_coverage", + True, # Always pass -- partial is expected + f"inventory={len(inv_rows)}, digest_map={len(dm_rows)}, " + f"processed={len(processed)}, unprocessed={len(unprocessed)}", + ) + + # Check error rate in digest_map + errors = [r for r in dm_rows if r.get("error")] + check( + "digest_map_error_rate", + len(errors) == 0, + f"errors={len(errors)}/{len(dm_rows)}" + + (f", samples={[r['filename'] + ': ' + r['error'] for r in errors[:3]]}" if errors else ""), + ) + + +# ── Summary and report ──────────────────────────────────────────────── + + +def print_summary(store_path): + """Print summary and write JSON report.""" + print("\n" + "=" * 60) + print("VERIFICATION SUMMARY") + print("=" * 60) + passed = sum(1 for r in results if r["status"] == "PASS") + failed = sum(1 for r in results if r["status"] == "FAIL") + print(f"Passed: {passed}") + print(f"Failed: {failed}") + print(f"Total: {passed + failed}") + + if failed > 0: + print("\nFailed checks:") + for r in results: + if r["status"] == "FAIL": + print(f" - {r['name']}: {r['detail']}") + + # Write JSON report to staging area + report_dir = STAGING + os.makedirs(report_dir, exist_ok=True) + report_path = os.path.join(report_dir, "verification_report.json") + with open(report_path, "w") as f: + json.dump( + {"results": results, "passed": passed, "failed": failed}, + f, + indent=2, + ) + print(f"\nJSON report: {report_path}") + + return failed + + +def main(): + args = parse_args() + store_path = args.store_path + + print(f"Verifying RefgetStore at: {store_path}") + print(f"Inventory CSV: {args.inventory}") + print(f"Digest map CSV: {args.digest_map}") + print("=" * 60) + + t_start = time.time() + + # Check 1: Store opens and stats + print("\n── Check 1: Store opens and stats ──") + store = check_store_opens(store_path) + if store is None: + print("\nStore failed to open. Cannot continue.") + print_summary(store_path) + sys.exit(1) + + # Check 2: Digest map coverage + print("\n── Check 2: Digest map coverage ──") + check_digest_map(store, args.digest_map) + + # Check 3: Level2 data integrity + print("\n── Check 3: Collection level2 data integrity ──") + check_level2_integrity(store, n_to_check=min(args.limit, 5)) + + # Check 4: Round-trip FASTA export + if args.skip_roundtrip: + print("\n── Check 4: Round-trip export (SKIPPED) ──") + check("roundtrip_digest_match", True, "skipped via --skip-roundtrip") + else: + print("\n── Check 4: Round-trip FASTA export ──") + check_roundtrip_export( + store, store_path, args.digest_map, args.inventory, limit=args.limit + ) + + # Check 5: CLI stats command + print("\n── Check 5: CLI stats command ──") + check_cli_stats(store_path) + + # Check 6: Inventory cross-reference + print("\n── Check 6: Inventory cross-reference ──") + check_inventory_crossref(store, args.inventory, args.digest_map) + + elapsed = time.time() - t_start + print(f"\nVerification completed in {elapsed:.1f}s") + + failed = print_summary(store_path) + sys.exit(1 if failed > 0 else 0) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.py b/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.py new file mode 100644 index 0000000..3fac641 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.py @@ -0,0 +1,49 @@ +"""Profile RefgetStore on 5 genomes: newt + 4 normal. 
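Old-code timings are hard-coded per genome below; the Ratio column is new time / old time.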
Compare timing and memory.""" +import time +import resource + +def peak_mb(): + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +def rss_mb(): + try: + with open("/proc/self/status") as f: + for line in f: + if line.startswith("VmRSS:"): + return int(line.split()[1]) / 1024 + except: + pass + return peak_mb() + +from gtars.refget import RefgetStore + +import os +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") +GENOMES = [ + # (path, old_total_time, n_seqs, label) + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964261635.1.fa.gz", 183.7, 448, "newt (2GB chr)"), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964263255.1.fa.gz", 213.2, 15265, "15K seqs"), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964263955.1.fa.gz", 42.7, 11150, "11K seqs"), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964264875.2.fa.gz", 27.4, 585, "585 seqs"), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964266715.1.fa.gz", 17.0, 1581, "1.6K seqs"), +] + +store = RefgetStore.on_disk(STORE_PATH) +print(f"Store opened. Stats: {store.stats()}") +print(f"RSS after open: {rss_mb():.0f} MB\n") + +print(f"{'Genome':<30} {'Seqs':>6} {'New(s)':>8} {'Old(s)':>8} {'Ratio':>7} {'Peak MB':>8}") +print("-" * 75) + +for fasta, old_total, old_nseqs, label in GENOMES: + name = fasta.split("/")[-1] + t0 = time.time() + meta, was_new = store.add_sequence_collection_from_fasta(fasta) + elapsed = time.time() - t0 + ratio = elapsed / old_total + status = "NEW" if was_new else "SKIP" + print(f"{label:<30} {meta.n_sequences:>6} {elapsed:>7.1f}s {old_total:>7.1f}s {ratio:>6.2f}x {peak_mb():>7.0f}", flush=True) + +print(f"\nFinal RSS: {rss_mb():.0f} MB, Peak: {peak_mb():.0f} MB") +print(f"Store stats: {store.stats()}") diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.sbatch b/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.sbatch new file mode 100644 index 0000000..5d9e57e --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.sbatch @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=profile_all +#SBATCH --output=profile_all_%j.log +#SBATCH --error=profile_all_%j.log +#SBATCH --partition=standard +#SBATCH --time=2:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=4 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd $HOME/code/ref-genome-analysis +source env/on-cluster.env + +python src/05_profiling/profile_all.py diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_batch.py b/data_loaders/ref-genome-analysis/src/05_profiling/profile_batch.py new file mode 100644 index 0000000..760d64c --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_batch.py @@ -0,0 +1,29 @@ +"""Profile RefgetStore on several normal genomes for timing comparison.""" +import time +import resource + +def peak_mb(): + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +from gtars.refget import RefgetStore + +import os +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") +GENOMES = [ + # (path, old_pipeline_time, old_total_time, n_seqs) + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964263255.1.fa.gz", 203.1, 213.2, 15265), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964263955.1.fa.gz", 32.6, 42.7, 11150), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964266715.1.fa.gz", 7.2, 17.0, 1581), +] + +store = RefgetStore.on_disk(STORE_PATH) +print(f"Store opened. 
Stats: {store.stats()}\n") + +for fasta, old_pipe, old_total, old_nseqs in GENOMES: + name = fasta.split("/")[-1] + t0 = time.time() + meta, was_new = store.add_sequence_collection_from_fasta(fasta) + elapsed = time.time() - t0 + status = "NEW" if was_new else "SKIP" + print(f"{status} {name}: {meta.n_sequences} seqs, {elapsed:.1f}s (old: {old_pipe:.1f}s pipe / {old_total:.1f}s total), Peak={peak_mb():.0f} MB") diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.py b/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.py new file mode 100644 index 0000000..07a3ad7 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.py @@ -0,0 +1,87 @@ +"""Profile RefgetStore memory usage on Rivanna.""" +import os +import sys +import time +import resource +import csv + +def rss_mb(): + """Current RSS in MB from /proc/self/status (more accurate than ru_maxrss).""" + try: + with open("/proc/self/status") as f: + for line in f: + if line.startswith("VmRSS:"): + return int(line.split()[1]) / 1024 # KB to MB + except: + pass + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +def peak_mb(): + """Peak RSS (high-water mark).""" + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +def print_mem(label): + print(f"[MEM] {label}: RSS={rss_mb():.1f} MB, Peak={peak_mb():.1f} MB", flush=True) + +print_mem("startup") + +from gtars.refget import RefgetStore +print_mem("after import") + +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") +INVENTORY_CSV = os.environ.get("INVENTORY_CSV", f"{BRICK_ROOT}/refgenomes_inventory.csv") + +# Open the store +t0 = time.time() +store = RefgetStore.on_disk(STORE_PATH) +t1 = time.time() +print_mem(f"after open_local ({t1-t0:.1f}s)") +print(f"Store stats: {store.stats()}") + +# Read inventory +rows = [] +with open(INVENTORY_CSV) as f: + reader = csv.DictReader(f) + for row in reader: + rows.append(row) + +# Use offset to skip to unprocessed files +OFFSET = int(sys.argv[1]) if len(sys.argv) > 1 else 0 +TARGET_NEW = int(sys.argv[2]) if len(sys.argv) > 2 else 5 + +if OFFSET: + rows = rows[OFFSET:] + print(f"Skipped to offset {OFFSET}, {len(rows)} remaining") + +print(f"Total rows to process: {len(rows)}, targeting {TARGET_NEW} new files") +print_mem("before processing loop") + +n_new = 0 +n_skipped = 0 + +for i, row in enumerate(rows): + fasta_path = row["path"] + filename = row.get("filename", "") + + t0 = time.time() + try: + meta, was_new = store.add_sequence_collection_from_fasta(fasta_path, threads=4) + elapsed = time.time() - t0 + + if was_new: + n_new += 1 + print(f"\n[{OFFSET+i+1}] NEW: {filename} -> {meta.digest} ({meta.n_sequences} seqs, {elapsed:.1f}s)") + print_mem(f"after NEW #{n_new}") + print(f"Store stats: {store.stats()}") + if n_new >= TARGET_NEW: + break + else: + n_skipped += 1 + if n_skipped % 50 == 0: + print_mem(f"skipping... 
({n_skipped} skipped, row {OFFSET+i+1})") + except Exception as e: + print(f"[{OFFSET+i+1}] FAILED {filename}: {e}") + +print(f"\nDone: {n_new} new, {n_skipped} skipped") +print_mem("final") diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.sbatch b/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.sbatch new file mode 100644 index 0000000..2eee915 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.sbatch @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=profile_mem +#SBATCH --output=profile_mem_%j.log +#SBATCH --error=profile_mem_%j.log +#SBATCH --partition=standard +#SBATCH --time=2:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=4 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd $HOME/code/ref-genome-analysis +source env/on-cluster.env + +python src/05_profiling/profile_memory.py 850 5 diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.py b/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.py new file mode 100644 index 0000000..7d2285a --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.py @@ -0,0 +1,57 @@ +"""Profile RefgetStore memory on the palmate newt genome (GCA_964261635.1). + +This genome has a single 2 GB chromosome — the worst case for pipeline memory. +Run via sbatch after removing the genome from the store to force re-processing. +""" +import os +import sys +import time +import resource + +def rss_mb(): + """Current RSS in MB from /proc/self/status.""" + try: + with open("/proc/self/status") as f: + for line in f: + if line.startswith("VmRSS:"): + return int(line.split()[1]) / 1024 + except: + pass + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +def peak_mb(): + """Peak RSS (high-water mark).""" + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +def print_mem(label): + print(f"[MEM] {label}: RSS={rss_mb():.1f} MB, Peak={peak_mb():.1f} MB", flush=True) + +print_mem("startup") + +from gtars.refget import RefgetStore +print_mem("after import") + +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") +NEWT_FASTA = f"{BRICK_ROOT}/vertebrates/fasta/GCA_964261635.1.fa.gz" + +# Open the store +t0 = time.time() +store = RefgetStore.on_disk(STORE_PATH) +t1 = time.time() +print_mem(f"after open_local ({t1-t0:.1f}s)") +print(f"Store stats: {store.stats()}") + +# Process the newt genome +print(f"\nProcessing newt genome: {NEWT_FASTA}") +t0 = time.time() +meta, was_new = store.add_sequence_collection_from_fasta(NEWT_FASTA) +elapsed = time.time() - t0 + +status = "NEW" if was_new else "SKIPPED (already exists)" +print(f"\nResult: {status}") +print(f"Digest: {meta.digest}") +print(f"Sequences: {meta.n_sequences}") +print(f"Time: {elapsed:.1f}s") +print_mem("after processing") +print(f"Store stats: {store.stats()}") diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.sbatch b/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.sbatch new file mode 100644 index 0000000..3db0284 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.sbatch @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=profile_newt +#SBATCH --output=profile_newt_%j.log +#SBATCH --error=profile_newt_%j.log +#SBATCH --partition=standard +#SBATCH --time=2:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=4 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd 
$HOME/code/ref-genome-analysis +source env/on-cluster.env + +python src/05_profiling/profile_newt.py diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.py b/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.py new file mode 100644 index 0000000..6c414ca --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.py @@ -0,0 +1,35 @@ +"""Profile RefgetStore on a normal-sized genome (GCA_964264875.2, 585 seqs). +Compare timing with old code (17.7s pipeline time).""" +import time +import resource + +def peak_mb(): + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +def print_mem(label): + print(f"[MEM] {label}: Peak={peak_mb():.1f} MB", flush=True) + +print_mem("startup") + +from gtars.refget import RefgetStore +print_mem("after import") + +import os +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") +FASTA = f"{BRICK_ROOT}/vertebrates/fasta/GCA_964264875.2.fa.gz" + +t0 = time.time() +store = RefgetStore.on_disk(STORE_PATH) +print_mem(f"after open_local ({time.time()-t0:.1f}s)") + +print(f"\nProcessing: {FASTA}") +t0 = time.time() +meta, was_new = store.add_sequence_collection_from_fasta(FASTA) +elapsed = time.time() - t0 + +print(f"Result: {'NEW' if was_new else 'SKIPPED'}") +print(f"Digest: {meta.digest}") +print(f"Sequences: {meta.n_sequences}") +print(f"Time: {elapsed:.1f}s (old code: 17.7s pipeline / 27.4s total)") +print_mem("after processing") diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.sbatch b/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.sbatch new file mode 100644 index 0000000..c5ba2ca --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.sbatch @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=profile_normal +#SBATCH --output=profile_normal_%j.log +#SBATCH --error=profile_normal_%j.log +#SBATCH --partition=standard +#SBATCH --time=0:30:00 +#SBATCH --mem=8G +#SBATCH --cpus-per-task=4 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd $HOME/code/ref-genome-analysis +source env/on-cluster.env + +python src/05_profiling/profile_normal.py diff --git a/data_loaders/ref-genome-analysis/src/90_split_store.py b/data_loaders/ref-genome-analysis/src/90_split_store.py new file mode 100644 index 0000000..93ff4b5 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/90_split_store.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +""" +Split the combined refget store into two stores: VGP vertebrates and reference genomes. + +Reads digest_map.csv (produced by 02_build/build_digest_map.py) which has a 'group' +column for every FASTA. Collections with group='vertebrates' go to the VGP store, +everything else goes to the ref store. 
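+ + Expected digest_map.csv columns (written by 02_build/build_digest_map.py; the row below is illustrative): + path,filename,digest,n_sequences,group + /brick/vertebrates/fasta/aLisHel1.fa.gz,aLisHel1.fa.gz,f0a3b1...,448,vertebrates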
+ +Usage: + python src/90_split_store.py --dry-run + python src/90_split_store.py +""" + +import argparse +import csv +import os +import sys +import time + +from refget.store import RefgetStore + +BRICK_ROOT = os.environ["BRICK_ROOT"] +DEFAULT_SOURCE = os.environ.get("STORE_PATH", os.path.join(BRICK_ROOT, "refget_store")) +STAGING = os.environ.get("STAGING", os.path.join(BRICK_ROOT, "refget_staging")) +DEFAULT_DIGEST_MAP = os.path.join(STAGING, "digest_map.csv") +DEFAULT_VGP_OUTPUT = os.environ.get("VGP_STORE_PATH", os.path.join(BRICK_ROOT, "refget-store", "vgp")) +DEFAULT_REF_OUTPUT = os.environ.get("REF_STORE_PATH", os.path.join(BRICK_ROOT, "refget-store", "jungle")) + +VGP_GROUPS = {"vertebrates"} + + +def _paginate(store): + """Yield pages of collection results from a store.""" + page = 0 + while True: + result = store.list_collections(page, 1000) + yield result["results"] + if len(result["results"]) < 1000: + break + page += 1 + + +def load_digest_map(digest_map_path: str) -> dict[str, set[str]]: + """Read digest_map.csv and return group -> set of digests.""" + groups: dict[str, set[str]] = {} + with open(digest_map_path) as f: + for row in csv.DictReader(f): + digest = row.get("digest", "").strip() + group = row.get("group", "unknown").strip() + if digest: + groups.setdefault(group, set()).add(digest) + return groups + + +def split_store( + source_path: str, + digest_map_path: str, + vgp_output: str, + ref_output: str, + dry_run: bool = False, +): + # Load group -> digest mapping + group_digests = load_digest_map(digest_map_path) + + vgp_digests = set() + ref_digests = set() + for group, digests in group_digests.items(): + label = "VGP" if group in VGP_GROUPS else "ref" + print(f" {group}: {len(digests)} collections ({label})") + if group in VGP_GROUPS: + vgp_digests |= digests + else: + ref_digests |= digests + + # Open source store and load all collections (metadata only) + print(f"\nOpening source store: {source_path}") + source = RefgetStore.on_disk(source_path) + source.load_all_collections() + + # Get all store digests + all_store_digests = set() + page = 0 + while True: + result = source.list_collections(page, 1000) + for c in result["results"]: + all_store_digests.add(c.digest) + if len(result["results"]) < 1000: + break + page += 1 + + vgp_in_store = vgp_digests & all_store_digests + ref_in_store = ref_digests & all_store_digests + unaccounted = all_store_digests - vgp_digests - ref_digests + + print(f"\nTotal in store: {len(all_store_digests)}") + print(f"VGP to import: {len(vgp_in_store)}") + print(f"Ref to import: {len(ref_in_store)}") + if unaccounted: + print(f"Unaccounted: {len(unaccounted)} (in store but not in digest_map)") + + if vgp_digests - all_store_digests: + print(f"Warning: {len(vgp_digests - all_store_digests)} VGP digests not in store", file=sys.stderr) + if ref_digests - all_store_digests: + print(f"Warning: {len(ref_digests - all_store_digests)} ref digests not in store", file=sys.stderr) + + if dry_run: + print("\n--dry-run: stopping here.") + return + + # Import VGP collections + print(f"\nCreating VGP store: {vgp_output}") + vgp_store = RefgetStore.on_disk(vgp_output) + existing_vgp = {c.digest for p in _paginate(vgp_store) for c in p} + to_import_vgp = sorted(vgp_in_store - existing_vgp) + print(f"VGP: {len(vgp_in_store)} total, {len(existing_vgp)} already imported, {len(to_import_vgp)} remaining") + t0 = time.time() + for i, digest in enumerate(to_import_vgp, 1): + print(f" [{i}/{len(to_import_vgp)}] {digest}") + vgp_store.import_collection(source, 
digest) + print(f"VGP import done in {time.time() - t0:.1f}s") + + # Import ref collections + print(f"\nCreating ref store: {ref_output}") + ref_store = RefgetStore.on_disk(ref_output) + existing_ref = {c.digest for p in _paginate(ref_store) for c in p} + to_import_ref = sorted(ref_in_store - existing_ref) + print(f"Ref: {len(ref_in_store)} total, {len(existing_ref)} already imported, {len(to_import_ref)} remaining") + t0 = time.time() + for i, digest in enumerate(to_import_ref, 1): + print(f" [{i}/{len(to_import_ref)}] {digest}") + ref_store.import_collection(source, digest) + print(f"Ref import done in {time.time() - t0:.1f}s") + + print("\nDone!") + print(f" VGP store: {vgp_output}") + print(f" Ref store: {ref_output}") + + +def main(): + parser = argparse.ArgumentParser( + description="Split combined refget store into VGP and ref genome stores." + ) + parser.add_argument("--source", default=DEFAULT_SOURCE) + parser.add_argument("--digest-map", default=DEFAULT_DIGEST_MAP) + parser.add_argument("--vgp-output", default=DEFAULT_VGP_OUTPUT) + parser.add_argument("--ref-output", default=DEFAULT_REF_OUTPUT) + parser.add_argument("--dry-run", action="store_true") + args = parser.parse_args() + + split_store( + source_path=args.source, + digest_map_path=args.digest_map, + vgp_output=args.vgp_output, + ref_output=args.ref_output, + dry_run=args.dry_run, + ) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/90_split_store.sbatch b/data_loaders/ref-genome-analysis/src/90_split_store.sbatch new file mode 100644 index 0000000..eb83e26 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/90_split_store.sbatch @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=split_store +#SBATCH --output=split_store_%j.log +#SBATCH --error=split_store_%j.log +#SBATCH --partition=standard +#SBATCH --time=4:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=1 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd $HOME/code/ref-genome-analysis +source env/on-cluster.env + +python src/90_split_store.py diff --git a/data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.py b/data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.py new file mode 100644 index 0000000..0b95b75 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +""" +Backfill sequence and collection aliases into a split store. + +Matches accessions to target store collections via digest_map (path join), +then registers aliases from the NCBI alias table by matching sequence names +in level2 data. Does NOT load any FASTAs — read-only against the target store +except for alias registration. 
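+ + Join chain: inventory.path -> inventory.accession and digest_map.path -> digest_map.digest; joining on path gives digest -> accession, restricted to digests actually present in the target store.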
+ +Usage: + source env/on-cluster.env + python src/backfill_sequence_aliases.py --target $VGP_STORE_PATH + python src/backfill_sequence_aliases.py --target $REF_STORE_PATH + python src/backfill_sequence_aliases.py --target $VGP_STORE_PATH --dry-run +""" + +import argparse +import csv +import os +import tempfile +import time +from collections import defaultdict + +BRICK_ROOT = os.environ["BRICK_ROOT"] +STAGING = os.environ.get("STAGING", os.path.join(BRICK_ROOT, "refget_staging")) +INVENTORY_CSV = os.environ.get("INVENTORY_CSV", os.path.join(BRICK_ROOT, "refgenomes_inventory.csv")) +ALIAS_TABLE_CSV = os.path.join(STAGING, "ncbi_alias_table.csv") +DIGEST_MAP_CSV = os.path.join(STAGING, "digest_map.csv") + + +def get_all_collection_digests(store): + digests = set() + page = 0 + while True: + result = store.list_collections(page, 1000) + for c in result["results"]: + digests.add(c.digest) + if len(result["results"]) < 1000: + break + page += 1 + return digests + + +def main(): + parser = argparse.ArgumentParser( + description="Backfill aliases into a split store from NCBI alias table." + ) + parser.add_argument("--target", required=True, help="Target RefgetStore path") + parser.add_argument("--alias-table", default=ALIAS_TABLE_CSV) + parser.add_argument("--inventory", default=INVENTORY_CSV) + parser.add_argument("--digest-map", default=DIGEST_MAP_CSV) + parser.add_argument("--dry-run", action="store_true") + args = parser.parse_args() + + from refget.store import RefgetStore + + print(f"Target store: {args.target}") + print(f"Alias table: {args.alias_table}") + print(f"Inventory: {args.inventory}") + print(f"Digest map: {args.digest_map}") + print(f"Dry run: {args.dry_run}") + print() + + # Open target store (read-only for collection lookup, then alias writes) + store = RefgetStore.open_local(args.target) + target_digests = get_all_collection_digests(store) + print(f"Target has {len(target_digests)} collections") + + # Build path -> accession from inventory + path_to_accession = {} + with open(args.inventory, newline="") as f: + for row in csv.DictReader(f): + acc = row.get("accession", "").strip() + path = row.get("path", "").strip() + if acc and path: + path_to_accession[path] = acc + + # Build digest -> accession via digest_map (join on path) + digest_to_accession = {} + with open(args.digest_map, newline="") as f: + for row in csv.DictReader(f): + digest = row.get("digest", "").strip() + path = row.get("path", "").strip() + if digest and path and path in path_to_accession: + digest_to_accession[digest] = path_to_accession[path] + + # Filter to accessions whose digest is in the target store + target_acc_to_digest = {} + for digest in target_digests: + acc = digest_to_accession.get(digest) + if acc: + target_acc_to_digest[acc] = digest + + print(f"Accessions in target with alias data: {len(target_acc_to_digest)}") + + # Read alias table, filtered to target accessions + acc_to_rows = defaultdict(list) + with open(args.alias_table, newline="") as f: + for row in csv.DictReader(f): + acc = row.get("accession", "").strip() + if acc and acc in target_acc_to_digest: + acc_to_rows[acc].append(row) + + common = sorted(target_acc_to_digest.keys() & acc_to_rows.keys()) + print(f"Accessions with alias table entries: {len(common)}") + + # Re-open as on_disk for writing aliases + store = RefgetStore.on_disk(args.target) + store.set_quiet(True) + + seq_aliases = {"refseq": [], "insdc": [], "ucsc": []} + coll_aliases = {"refseq": [], "insdc": []} + n_matched = 0 + n_unmatched = 0 + t_start = time.time() + 
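+ # Per accession below: assembly-level accessions from the first alias row become collection aliases; each row is then matched to a store sequence by trying sequence_name, refseq_accn, genbank_accn, ucsc_name in order, keeping the first candidate whose length agrees with the report (when a length is given).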
+ for i, accession in enumerate(common, 1): + coll_digest = target_acc_to_digest[accession] + alias_rows = acc_to_rows[accession] + + print(f"[{i}/{len(common)}] {accession} ({len(alias_rows)} seqs)...", end=" ", flush=True) + + # Collection-level aliases + first_row = alias_rows[0] + genbank_acc = first_row.get("genbank_assembly_accn", "").strip() + refseq_acc = first_row.get("refseq_assembly_accn", "").strip() + if refseq_acc: + coll_aliases["refseq"].append((refseq_acc, coll_digest)) + if genbank_acc: + coll_aliases["insdc"].append((genbank_acc, coll_digest)) + + # Sequence-level aliases via name matching in level2 + level2 = store.get_collection_level2(coll_digest) + names = level2.get("names", []) + lengths = level2.get("lengths", []) + sequences = level2.get("sequences", []) + name_to_info = {n: (s, int(l)) for n, l, s in zip(names, lengths, sequences)} + + matched_this = 0 + for row in alias_rows: + seq_name = row.get("sequence_name", "").strip() + seq_length_str = row.get("sequence_length", "").strip() + refseq_accn = row.get("refseq_accn", "").strip() + genbank_accn = row.get("genbank_accn", "").strip() + ucsc_name = row.get("ucsc_name", "").strip() + seq_length = int(seq_length_str) if seq_length_str else None + + seq_digest = None + for candidate in [seq_name, refseq_accn, genbank_accn, ucsc_name]: + if candidate and candidate in name_to_info: + sd, sl = name_to_info[candidate] + if seq_length is None or sl == seq_length: + seq_digest = sd + break + + if seq_digest is None: + n_unmatched += 1 + continue + + matched_this += 1 + if refseq_accn: + seq_aliases["refseq"].append((refseq_accn, seq_digest)) + if genbank_accn: + seq_aliases["insdc"].append((genbank_accn, seq_digest)) + if ucsc_name: + seq_aliases["ucsc"].append((ucsc_name, seq_digest)) + + n_matched += matched_this + print(f"{matched_this}/{len(alias_rows)} matched") + + elapsed = time.time() - t_start + n_seq = sum(len(v) for v in seq_aliases.values()) + n_coll = sum(len(v) for v in coll_aliases.values()) + print(f"\nMatching done in {elapsed:.1f}s") + print(f" Matched: {n_matched}, unmatched: {n_unmatched}") + print(f" Seq aliases: {n_seq}, coll aliases: {n_coll}") + + if args.dry_run: + print("\n[DRY RUN] Skipping registration.") + return + + print("\nRegistering aliases...") + with tempfile.TemporaryDirectory() as tmpdir: + for ns, pairs in seq_aliases.items(): + if not pairs: + continue + tsv = os.path.join(tmpdir, f"seq_{ns}.tsv") + with open(tsv, "w") as f: + for alias, digest in pairs: + f.write(f"{alias}\t{digest}\n") + n = store.load_sequence_aliases(ns, tsv) + print(f" sequences/{ns}: {n} aliases loaded") + + for ns, pairs in coll_aliases.items(): + if not pairs: + continue + tsv = os.path.join(tmpdir, f"coll_{ns}.tsv") + with open(tsv, "w") as f: + for alias, digest in pairs: + f.write(f"{alias}\t{digest}\n") + n = store.load_collection_aliases(ns, tsv) + print(f" collections/{ns}: {n} aliases loaded") + + print(f"\nDone! 
Store stats: {store.stats()}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.sbatch b/data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.sbatch
new file mode 100644
index 0000000..c6c976d
--- /dev/null
+++ b/data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.sbatch
@@ -0,0 +1,22 @@
+#!/bin/bash
+#SBATCH --job-name=backfill_aliases
+#SBATCH --output=backfill_aliases_%j.log
+#SBATCH --error=backfill_aliases_%j.log
+#SBATCH --partition=standard
+#SBATCH --time=2:00:00
+#SBATCH --mem=16G
+#SBATCH --cpus-per-task=1
+#SBATCH --account=shefflab
+
+module load miniforge/24.3.0-py3.11
+
+cd $HOME/code/ref-genome-analysis
+source env/on-cluster.env
+
+# Backfill VGP store
+python src/backfill_sequence_aliases.py --target $BRICK_ROOT/vgp_reference_store
+
+# Backfill ref store (if it exists)
+if [ -d "$BRICK_ROOT/refgenome_jungle_store" ]; then
+    python src/backfill_sequence_aliases.py --target $BRICK_ROOT/refgenome_jungle_store
+fi
diff --git a/data_loaders/ref-genome-analysis/src/examples/test_20_genomes.py b/data_loaders/ref-genome-analysis/src/examples/test_20_genomes.py
new file mode 100644
index 0000000..3937965
--- /dev/null
+++ b/data_loaders/ref-genome-analysis/src/examples/test_20_genomes.py
@@ -0,0 +1,166 @@
+"""
+Quick test: load 20 genomes into a RefgetStore and attach FHR metadata.
+
+Usage:
+    python test_20_genomes.py [--inventory PATH] [--limit N]
+"""
+
+import argparse
+import csv
+import json
+import os
+import re
+import sys
+import tempfile
+import time
+
+from gtars.refget import RefgetStore
+
+BRICK_ROOT = os.environ["BRICK_ROOT"]
+INVENTORY_CSV = os.environ.get("INVENTORY_CSV", f"{BRICK_ROOT}/refgenomes_inventory.csv")
+STAGING = os.environ.get("STAGING", f"{BRICK_ROOT}/refget_staging")
+FHR_DIR = f"{STAGING}/fhr_metadata"
+STORE_PATH = os.environ.get("STORE_PATH", os.path.expandvars("/scratch/$USER/test_refget_store_20"))
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--inventory", default=INVENTORY_CSV)
+    parser.add_argument("--store-path", default=STORE_PATH)
+    parser.add_argument("--limit", type=int, default=20)
+    args = parser.parse_args()
+
+    # Read inventory
+    with open(args.inventory, newline="") as f:
+        rows = list(csv.DictReader(f))
+    rows = rows[:args.limit]
+
+    print(f"Loading {len(rows)} genomes into {args.store_path}")
+    os.makedirs(args.store_path, exist_ok=True)
+    store = RefgetStore.on_disk(args.store_path)
+
+    # Phase 1: Load FASTAs
+    print("\n=== Phase 1: Load FASTAs ===")
+    digest_map = {}  # filename -> digest
+    t_start = time.time()
+
+    for i, row in enumerate(rows, 1):
+        fasta_path = row["path"]
+        filename = row.get("filename", os.path.basename(fasta_path))
+        t0 = time.time()
+        print(f"[{i}/{len(rows)}] {filename}...", end=" ", flush=True)
+        try:
+            meta, was_new = store.add_sequence_collection_from_fasta(fasta_path)
+            elapsed = time.time() - t0
+            status = "NEW" if was_new else "exists"
+            print(f"{meta.digest} ({meta.n_sequences} seqs, {status}, {elapsed:.1f}s)")
+            digest_map[filename] = meta.digest
+        except Exception as e:
+            print(f"FAILED: {e}")
+
+    t_fasta = time.time() - t_start
+    print(f"\nPhase 1 done: {len(digest_map)} loaded in {t_fasta:.1f}s")
+
+    # Phase 2: Load FHR metadata (provenance only, no vitalStats) for all collections
+    # Map build names to GCA accessions for known species
+    BUILD_TO_ACCESSION = {
+        ("homo_sapiens", "hg19"): "GCA_000001405",
+        ("homo_sapiens", "hg38"): "GCA_000001405",
+        ("mus_musculus", "mm9"): "GCA_000001635",
+        
("mus_musculus", "mm10"): "GCA_000001635", + ("mus_musculus", "mm39"): "GCA_000001635", + } + + print("\n=== Phase 2: Load FHR metadata ===") + fhr_loaded = 0 + + # Build accession -> set of digests from inventory metadata + accession_digests = {} # accession -> set of digests + for row in rows: + filename = row.get("filename", "") + if filename not in digest_map: + continue + digest = digest_map[filename] + + # Try explicit accession column first + accession = row.get("accession", "").strip() + + # Try extracting from filename + if not accession: + m = re.search(r'(GCA_\d+(?:\.\d+)?)', filename) + if m: + accession = m.group(1) + + # Fall back to group+build mapping + if not accession: + group = row.get("group", "").strip() + build = row.get("build", "").strip() + accession = BUILD_TO_ACCESSION.get((group, build), "") + + if accession: + accession_digests.setdefault(accession, set()).add(digest) + + print(f" Found {len(accession_digests)} accessions across {sum(len(v) for v in accession_digests.values())} collections") + + def load_fhr_for_accession(store, accession, fhr_data, digests): + """Strip vitalStats and attach provenance FHR to all matching collections.""" + provenance = {k: v for k, v in fhr_data.items() if k != "vitalStats"} + loaded = 0 + with tempfile.NamedTemporaryFile(mode="w", suffix=".fhr.json", delete=False) as tmp: + json.dump(provenance, tmp, indent=2) + tmp_path = tmp.name + try: + for digest in digests: + store.load_fhr_metadata(digest, tmp_path) + print(f" {accession} -> {digest}") + loaded += 1 + finally: + os.unlink(tmp_path) + return loaded + + for accession, digests in sorted(accession_digests.items()): + # Check for pre-generated FHR file + fhr_path = os.path.join(FHR_DIR, f"{accession}.fhr.json") + if os.path.exists(fhr_path): + with open(fhr_path) as f: + fhr_data = json.load(f) + print(f" {accession}: loading from file ({len(digests)} collections)") + fhr_loaded += load_fhr_for_accession(store, accession, fhr_data, digests) + continue + + # Try NCBI API + try: + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "fhr")) + from genomeark_to_fhr import fetch_ncbi_report, ncbi_to_fhr + print(f" {accession}: fetching from NCBI...", end=" ", flush=True) + report = fetch_ncbi_report(accession) + fhr_data = ncbi_to_fhr(report) + # Save full FHR (with vitalStats) for reference + os.makedirs(FHR_DIR, exist_ok=True) + with open(fhr_path, "w") as f: + json.dump(fhr_data, f, indent=2) + print(f"OK ({len(digests)} collections)") + fhr_loaded += load_fhr_for_accession(store, accession, fhr_data, digests) + except Exception as e: + print(f" {accession}: SKIP ({e})") + + print(f"\nPhase 2 done: {fhr_loaded} FHR entries loaded") + + # Summary + print("\n=== Summary ===") + store_stats = store.stats() + print(f"Store stats: {store_stats}") + fhr_digests = store.list_fhr_metadata() + print(f"FHR entries: {len(fhr_digests)}") + + # Verify FHR data is readable + for digest in fhr_digests: + fhr = store.get_fhr_metadata(digest) + print(f" {digest}: genome={fhr.genome}, version={fhr.version}") + + print(f"\nStore path: {args.store_path}") + print("Done!") + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/load_compliance_fastas.py b/data_loaders/ref-genome-analysis/src/load_compliance_fastas.py new file mode 100644 index 0000000..35e3b15 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/load_compliance_fastas.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +""" +Load GA4GH seqcol compliance test FASTAs into all RefgetStores. 
+ +These small synthetic FASTAs are required for the compliance suite to pass. +They live in the refget repo at test_fasta/*.fa. + +Usage: + source env/on-cluster.env + python src/load_compliance_fastas.py + python src/load_compliance_fastas.py --store $VGP_STORE_PATH # single store +""" + +import argparse +import os +from pathlib import Path + +# Compliance FASTAs are in the refget repo +REFGET_REPO = os.environ.get("REFGET_REPO", os.path.join(os.environ.get("DEPLOY_DIR", ""), "refget")) +TEST_FASTA_DIR = os.path.join(REFGET_REPO, "test_fasta") + +COMPLIANCE_FASTAS = [ + "base.fa", + "different_names.fa", + "different_order.fa", + "pair_swap.fa", + "subset.fa", + "swap_wo_coords.fa", +] + + +def load_compliance_fastas(store_path: str, fasta_dir: str): + from refget.store import RefgetStore + + store = RefgetStore.on_disk(store_path) + store.set_quiet(True) + + print(f"Store: {store_path}") + loaded = 0 + for fa in COMPLIANCE_FASTAS: + path = os.path.join(fasta_dir, fa) + if not os.path.exists(path): + print(f" {fa}: NOT FOUND at {path}") + continue + meta, was_new = store.add_sequence_collection_from_fasta(path) + status = "added" if was_new else "exists" + print(f" {fa}: {meta.digest} ({status})") + loaded += 1 + + print(f" {loaded}/{len(COMPLIANCE_FASTAS)} loaded\n") + + +def main(): + parser = argparse.ArgumentParser(description="Load compliance FASTAs into RefgetStores") + parser.add_argument("--store", help="Load into a single store (path)") + parser.add_argument("--fasta-dir", default=TEST_FASTA_DIR, help="Directory containing test FASTAs") + args = parser.parse_args() + + if not os.path.isdir(args.fasta_dir): + print(f"Error: FASTA directory not found: {args.fasta_dir}") + print("Set REFGET_REPO or DEPLOY_DIR, or pass --fasta-dir") + return + + print(f"Compliance FASTAs from: {args.fasta_dir}\n") + + if args.store: + load_compliance_fastas(args.store, args.fasta_dir) + else: + # Load into all stores from env vars + for var in ["VGP_STORE_PATH", "REF_STORE_PATH", "PANGENOME_STORE_PATH"]: + path = os.environ.get(var) + if path and os.path.isdir(path): + load_compliance_fastas(path, args.fasta_dir) + elif path: + print(f" {var}={path} (not found, skipping)") + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/push_to_s3.sh b/data_loaders/ref-genome-analysis/src/push_to_s3.sh new file mode 100644 index 0000000..920ccff --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/push_to_s3.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Push refget stores to S3 via Rivanna. +# +# Clears stale GPG socket, connects with agent forwarding, decrypts +# credentials, and runs aws s3 sync. 
+# +# Usage (from laptop): +# source env/remote-hpc.env +# bash src/push_to_s3.sh vgp +# bash src/push_to_s3.sh ref +# bash src/push_to_s3.sh pangenome +# bash src/push_to_s3.sh all +# bash src/push_to_s3.sh vgp --dry-run + +set -euo pipefail + +STORE=${1:-all} +DRYRUN_FLAG="${2:-}" + +: "${VGP_STORE_PATH:?Set VGP_STORE_PATH in env}" +: "${REF_STORE_PATH:?Set REF_STORE_PATH in env}" +: "${PANGENOME_STORE_PATH:?Set PANGENOME_STORE_PATH in env}" +: "${VGP_S3_PATH:?Set VGP_S3_PATH in env}" +: "${REF_S3_PATH:?Set REF_S3_PATH in env}" +: "${PANGENOME_S3_PATH:?Set PANGENOME_S3_PATH in env}" + +# Clear stale GPG socket, then connect with forwarding +ssh riva1 "rm -f /run/user/\$(id -u)/gnupg/S.gpg-agent" + +ssh riva1_gpg " + source /etc/profile.d/modules.sh + module load awscli + + export AWS_ACCESS_KEY_ID=\$(pass databio/refgenie/s3_access_key_id) + export AWS_SECRET_ACCESS_KEY=\$(pass databio/refgenie/s3_secret_access_key) + + if [ \"$STORE\" = \"vgp\" ] || [ \"$STORE\" = \"all\" ]; then + echo 'Pushing VGP store to $VGP_S3_PATH ...' + aws s3 sync '$VGP_STORE_PATH' '$VGP_S3_PATH' $DRYRUN_FLAG + echo 'VGP push complete.' + fi + + if [ \"$STORE\" = \"ref\" ] || [ \"$STORE\" = \"all\" ]; then + echo 'Pushing ref store to $REF_S3_PATH ...' + aws s3 sync '$REF_STORE_PATH' '$REF_S3_PATH' $DRYRUN_FLAG + echo 'Ref push complete.' + fi + + if [ \"$STORE\" = \"pangenome\" ] || [ \"$STORE\" = \"all\" ]; then + echo 'Pushing pangenome store to $PANGENOME_S3_PATH ...' + aws s3 sync '$PANGENOME_STORE_PATH' '$PANGENOME_S3_PATH' $DRYRUN_FLAG + echo 'Pangenome push complete.' + fi + + echo 'Done!' +" diff --git a/data_loaders/ref-genome-analysis/src/validate_split_stores.py b/data_loaders/ref-genome-analysis/src/validate_split_stores.py new file mode 100644 index 0000000..d1ba420 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/validate_split_stores.py @@ -0,0 +1,538 @@ +#!/usr/bin/env python3 +""" +Validate the VGP and ref genome stores produced by 90_split_store.py. + +Checks that the split stores are complete, internally consistent, and +that every collection from the source store ended up in exactly one +output store. 
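+
+Checks: each store opens and reports stats; collection sets match the
+digest map; level2 arrays are aligned; aliases and FHR metadata are
+present and readable; sequences are retrievable; the two stores do not
+overlap and together cover the source; and (with --thorough) FASTA
+export round-trips to the same digest.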
+ +Usage: + source env/on-cluster.env + python src/validate_split_stores.py # validate both + python src/validate_split_stores.py --store vgp # VGP only + python src/validate_split_stores.py --store ref # ref only + python src/validate_split_stores.py --thorough # deep checks (slow) +""" + +import argparse +import csv +import json +import os +import sys +import tempfile +import time + +BRICK_ROOT = os.environ["BRICK_ROOT"] +STAGING = os.environ.get("STAGING", os.path.join(BRICK_ROOT, "refget_staging")) +SOURCE_PATH = os.environ.get("STORE_PATH", os.path.join(BRICK_ROOT, "refget_store")) +VGP_PATH = os.path.join(BRICK_ROOT, "refget-store/vgp") +REF_PATH = os.path.join(BRICK_ROOT, "refget-store/jungle") +DIGEST_MAP = os.path.join(STAGING, "digest_map.csv") + +VGP_GROUPS = {"vertebrates"} + +results = [] + + +def check(name, passed, detail=""): + status = "PASS" if passed else "FAIL" + results.append({"name": name, "status": status, "detail": detail}) + print(f" [{'PASS' if passed else 'FAIL'}] {name}" + (f" -- {detail}" if detail else "")) + + +def load_digest_map(path): + """Return (group->set_of_digests, all_rows).""" + groups = {} + rows = [] + with open(path) as f: + for row in csv.DictReader(f): + rows.append(row) + digest = row.get("digest", "").strip() + group = row.get("group", "unknown").strip() + if digest: + groups.setdefault(group, set()).add(digest) + return groups, rows + + +def get_all_collection_digests(store): + """Paginate through list_collections to get all digests.""" + digests = set() + page = 0 + while True: + result = store.list_collections(page, 1000) + for c in result["results"]: + digests.add(c.digest) + if len(result["results"]) < 1000: + break + page += 1 + return digests + + +# ── Test 1: Store opens and basic stats ──────────────────────────────── + + +def test_store_opens(store_path, label): + """Verify store opens and has non-zero collections/sequences.""" + from refget.store import RefgetStore + + print(f"\n── {label}: Store opens and stats ──") + + try: + store = RefgetStore.open_local(store_path) + check(f"{label}_opens", True, f"path={store_path}") + except Exception as e: + check(f"{label}_opens", False, f"error={e}") + return None + + try: + stats = store.stats() + check(f"{label}_stats", True, f"stats={stats}") + except Exception as e: + check(f"{label}_stats", False, f"error={e}") + + digests = get_all_collection_digests(store) + check(f"{label}_has_collections", len(digests) > 0, f"n={len(digests)}") + + try: + seqs = store.list_sequences() + n_seqs = len(seqs) + check(f"{label}_has_sequences", n_seqs > 0, f"n={n_seqs}") + except Exception as e: + check(f"{label}_has_sequences", False, f"error={e}") + + return store + + +# ── Test 2: Collection counts match digest map ──────────────────────── + + +def test_collection_counts(store, label, expected_digests): + """Verify the store has exactly the expected collections.""" + print(f"\n── {label}: Collection count vs digest map ──") + + store_digests = get_all_collection_digests(store) + + check( + f"{label}_count_match", + len(store_digests) == len(expected_digests), + f"store={len(store_digests)}, expected={len(expected_digests)}", + ) + + missing = expected_digests - store_digests + extra = store_digests - expected_digests + + check( + f"{label}_no_missing", + len(missing) == 0, + f"missing={len(missing)}" + (f", sample={list(missing)[:3]}" if missing else ""), + ) + check( + f"{label}_no_extra", + len(extra) == 0, + f"extra={len(extra)}" + (f", sample={list(extra)[:3]}" if extra else ""), + ) + + return 
store_digests + + +# ── Test 3: Level2 integrity for all collections ────────────────────── + + +def test_level2_integrity(store, label, digests, limit=None): + """Verify level2 arrays are aligned and valid for every collection.""" + print(f"\n── {label}: Level2 data integrity ──") + + to_check = sorted(digests) + if limit: + to_check = to_check[:limit] + + ok_count = 0 + fail_count = 0 + fail_details = [] + + for digest in to_check: + try: + level2 = store.get_collection_level2(digest) + names = level2.get("names", []) + lengths = level2.get("lengths", []) + sequences = level2.get("sequences", []) + + arrays_aligned = len(names) == len(lengths) == len(sequences) and len(names) > 0 + lengths_positive = all(l > 0 for l in lengths) if lengths else False + seqs_nonempty = all(s and len(s) > 0 for s in sequences) if sequences else False + + if arrays_aligned and lengths_positive and seqs_nonempty: + ok_count += 1 + else: + fail_count += 1 + fail_details.append( + f"{digest[:16]}: names={len(names)} lengths={len(lengths)} " + f"seqs={len(sequences)} aligned={arrays_aligned} " + f"lengths_ok={lengths_positive} seqs_ok={seqs_nonempty}" + ) + except Exception as e: + fail_count += 1 + fail_details.append(f"{digest[:16]}: ERROR {e}") + + total = ok_count + fail_count + check( + f"{label}_level2_all_valid", + fail_count == 0, + f"ok={ok_count}/{total}" + (f", failures=[{'; '.join(fail_details[:5])}]" if fail_details else ""), + ) + + +# ── Test 4: Aliases were imported ───────────────────────────────────── + + +def test_aliases(store, label, digests): + """Check that alias namespaces exist and at least some collections have aliases.""" + print(f"\n── {label}: Alias integrity ──") + + # Check namespaces exist + try: + coll_ns = store.list_collection_alias_namespaces() + check(f"{label}_collection_alias_namespaces", len(coll_ns) > 0, f"namespaces={coll_ns}") + except Exception as e: + check(f"{label}_collection_alias_namespaces", False, f"error={e}") + coll_ns = [] + + try: + seq_ns = store.list_sequence_alias_namespaces() + check(f"{label}_sequence_alias_namespaces", len(seq_ns) > 0, f"namespaces={seq_ns}") + except Exception as e: + check(f"{label}_sequence_alias_namespaces", False, f"error={e}") + seq_ns = [] + + # Sample: check that some collections have aliases + sample = sorted(digests)[:20] + with_aliases = 0 + for digest in sample: + try: + aliases = store.get_aliases_for_collection(digest) + if aliases and len(aliases) > 0: + with_aliases += 1 + except Exception: + pass + + check( + f"{label}_collections_have_aliases", + with_aliases > 0, + f"with_aliases={with_aliases}/{len(sample)} (sampled)", + ) + + # For each namespace, count total aliases + for ns in coll_ns: + try: + aliases = store.list_collection_aliases(ns) + check(f"{label}_coll_alias_count_{ns}", len(aliases) > 0, f"n={len(aliases)}") + except Exception as e: + check(f"{label}_coll_alias_count_{ns}", False, f"error={e}") + + # Forward lookup: pick an alias and verify it resolves + for ns in coll_ns[:1]: # test first namespace + try: + aliases = store.list_collection_aliases(ns) + if aliases: + alias = aliases[0] + resolved = store.get_collection_by_alias(ns, alias) + check( + f"{label}_coll_alias_forward_lookup_{ns}", + resolved is not None, + f"alias={alias}, resolved={resolved.digest[:16] if resolved else None}", + ) + except Exception as e: + check(f"{label}_coll_alias_forward_lookup_{ns}", False, f"error={e}") + + # Sequence alias count proportionality check + for ns in seq_ns: + try: + aliases = 
store.list_sequence_aliases(ns) + n_aliases = len(aliases) if aliases else 0 + check(f"{label}_seq_alias_count_{ns}", n_aliases > 0, f"n={n_aliases}") + except Exception as e: + check(f"{label}_seq_alias_count_{ns}", False, f"error={e}") + + +# ── Test 5: FHR metadata was imported ───────────────────────────────── + + +def test_fhr_metadata(store, label, digests): + """Check that FHR metadata exists for collections.""" + print(f"\n── {label}: FHR metadata ──") + + try: + fhr_digests = store.list_fhr_metadata() + n_fhr = len(fhr_digests) + check(f"{label}_fhr_exists", n_fhr > 0, f"n_with_fhr={n_fhr}") + except Exception as e: + check(f"{label}_fhr_exists", False, f"error={e}") + return + + # Verify FHR digests are in this store + fhr_set = set(fhr_digests) + orphan_fhr = fhr_set - digests + check( + f"{label}_fhr_no_orphans", + len(orphan_fhr) == 0, + f"orphaned_fhr={len(orphan_fhr)}" + (f", sample={list(orphan_fhr)[:3]}" if orphan_fhr else ""), + ) + + # Sample: read a few FHR records + sample = list(fhr_set & digests)[:5] + readable = 0 + for digest in sample: + try: + fhr = store.get_fhr_metadata(digest) + if fhr is not None: + readable += 1 + except Exception: + pass + + check( + f"{label}_fhr_readable", + readable == len(sample), + f"readable={readable}/{len(sample)}", + ) + + +# ── Test 6: Sequence retrieval works ────────────────────────────────── + + +def test_sequence_retrieval(store, label, digests): + """Verify sequences can be retrieved for sampled collections.""" + print(f"\n── {label}: Sequence retrieval ──") + + sample = sorted(digests)[:5] + ok_count = 0 + fail_details = [] + + for coll_digest in sample: + try: + level2 = store.get_collection_level2(coll_digest) + seq_digests = level2.get("sequences", []) + lengths = level2.get("lengths", []) + if not seq_digests: + fail_details.append(f"{coll_digest[:16]}: no sequences") + continue + + # Test first sequence in collection + seq = store.get_sequence(seq_digests[0]) + if seq is not None: + ok_count += 1 + else: + fail_details.append(f"{coll_digest[:16]}: get_sequence returned None") + except Exception as e: + fail_details.append(f"{coll_digest[:16]}: {e}") + + check( + f"{label}_sequence_retrieval", + ok_count == len(sample), + f"ok={ok_count}/{len(sample)}" + (f", failures=[{'; '.join(fail_details[:3])}]" if fail_details else ""), + ) + + +# ── Test 7: No overlap between VGP and ref stores ──────────────────── + + +def test_no_overlap(vgp_store, ref_store): + """Verify no collection appears in both stores.""" + print("\n── Cross-store: No overlap ──") + + vgp_digests = get_all_collection_digests(vgp_store) + ref_digests = get_all_collection_digests(ref_store) + + overlap = vgp_digests & ref_digests + check( + "no_collection_overlap", + len(overlap) == 0, + f"overlap={len(overlap)}" + (f", sample={list(overlap)[:3]}" if overlap else ""), + ) + + +# ── Test 8: Full coverage — VGP + ref = source ─────────────────────── + + +def test_full_coverage(vgp_store, ref_store, source_store): + """Verify VGP + ref stores together contain all source collections.""" + print("\n── Cross-store: Full coverage ──") + + vgp_digests = get_all_collection_digests(vgp_store) + ref_digests = get_all_collection_digests(ref_store) + source_digests = get_all_collection_digests(source_store) + + combined = vgp_digests | ref_digests + missing = source_digests - combined + extra = combined - source_digests + + check( + "combined_equals_source", + len(missing) == 0 and len(extra) == 0, + f"source={len(source_digests)}, vgp={len(vgp_digests)}, 
ref={len(ref_digests)}, " + f"combined={len(combined)}, missing={len(missing)}, extra={len(extra)}", + ) + + +# ── Test 9: Roundtrip FASTA export ─────────────────────────────────── + + +def test_roundtrip_fasta(store, label, digests, limit=3): + """Export a few collections to FASTA and verify digest matches.""" + print(f"\n── {label}: Roundtrip FASTA export ──") + + try: + from gtars.refget import digest_fasta + except ImportError: + check(f"{label}_roundtrip", False, "gtars.refget.digest_fasta not available") + return + + sample = sorted(digests)[:limit] + ok_count = 0 + fail_details = [] + + for digest in sample: + fd, tmp_path = tempfile.mkstemp(suffix=".fa") + os.close(fd) + try: + store.export_fasta(digest, tmp_path, None, 80) + exported_sc = digest_fasta(tmp_path) + match = exported_sc.digest == digest + if match: + ok_count += 1 + else: + fail_details.append( + f"{digest[:16]}: exported={exported_sc.digest[:16]} != original" + ) + except Exception as e: + fail_details.append(f"{digest[:16]}: {e}") + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + check( + f"{label}_roundtrip_fasta", + ok_count == len(sample), + f"ok={ok_count}/{len(sample)}" + (f", failures=[{'; '.join(fail_details)}]" if fail_details else ""), + ) + + +# ── Main ────────────────────────────────────────────────────────────── + + +def validate_store(store_path, label, expected_digests, thorough=False): + """Run all single-store validations.""" + from refget.store import RefgetStore + + store = test_store_opens(store_path, label) + if store is None: + return None + + store_digests = test_collection_counts(store, label, expected_digests) + + # Level2: check all in thorough mode, sample otherwise + limit = None if thorough else 20 + test_level2_integrity(store, label, store_digests, limit=limit) + + test_aliases(store, label, store_digests) + test_fhr_metadata(store, label, store_digests) + test_sequence_retrieval(store, label, store_digests) + + if thorough: + test_roundtrip_fasta(store, label, store_digests, limit=5) + + return store + + +def main(): + parser = argparse.ArgumentParser(description="Validate split RefgetStores") + parser.add_argument( + "--store", + choices=["vgp", "ref", "both"], + default="both", + help="Which store to validate (default: both)", + ) + parser.add_argument( + "--thorough", + action="store_true", + help="Run deep checks: all level2, roundtrip FASTA (slow)", + ) + parser.add_argument("--vgp-path", default=VGP_PATH) + parser.add_argument("--ref-path", default=REF_PATH) + parser.add_argument("--source-path", default=SOURCE_PATH) + parser.add_argument("--digest-map", default=DIGEST_MAP) + args = parser.parse_args() + + print(f"Validating split stores") + print(f" Source: {args.source_path}") + print(f" VGP: {args.vgp_path}") + print(f" Ref: {args.ref_path}") + print(f" Digest map: {args.digest_map}") + print(f" Thorough: {args.thorough}") + print("=" * 60) + + t_start = time.time() + + # Load digest map to compute expected sets + group_digests, dm_rows = load_digest_map(args.digest_map) + vgp_expected = set() + ref_expected = set() + for group, digests in group_digests.items(): + if group in VGP_GROUPS: + vgp_expected |= digests + else: + ref_expected |= digests + + print(f"\nDigest map: {len(dm_rows)} rows, " + f"VGP expected={len(vgp_expected)}, ref expected={len(ref_expected)}") + + vgp_store = None + ref_store = None + + if args.store in ("vgp", "both"): + vgp_store = validate_store(args.vgp_path, "vgp", vgp_expected, args.thorough) + + if args.store in ("ref", 
"both"): + ref_store = validate_store(args.ref_path, "ref", ref_expected, args.thorough) + + # Cross-store checks (only if both stores validated) + if vgp_store and ref_store: + test_no_overlap(vgp_store, ref_store) + + # Full coverage against source + from refget.store import RefgetStore + if RefgetStore.store_exists(args.source_path): + source_store = RefgetStore.open_local(args.source_path) + test_full_coverage(vgp_store, ref_store, source_store) + else: + check("full_coverage", False, f"source store not found: {args.source_path}") + + # Summary + elapsed = time.time() - t_start + print(f"\n{'=' * 60}") + print("VALIDATION SUMMARY") + print("=" * 60) + passed = sum(1 for r in results if r["status"] == "PASS") + failed = sum(1 for r in results if r["status"] == "FAIL") + print(f"Passed: {passed}") + print(f"Failed: {failed}") + print(f"Total: {passed + failed}") + print(f"Time: {elapsed:.1f}s") + + if failed > 0: + print("\nFailed checks:") + for r in results: + if r["status"] == "FAIL": + print(f" - {r['name']}: {r['detail']}") + + # Write JSON report + report_path = os.path.join(STAGING, "split_validation_report.json") + os.makedirs(STAGING, exist_ok=True) + with open(report_path, "w") as f: + json.dump({"results": results, "passed": passed, "failed": failed}, f, indent=2) + print(f"\nJSON report: {report_path}") + + sys.exit(1 if failed > 0 else 0) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/validate_split_stores.sbatch b/data_loaders/ref-genome-analysis/src/validate_split_stores.sbatch new file mode 100644 index 0000000..d9c3b4c --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/validate_split_stores.sbatch @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=validate_split +#SBATCH --output=validate_split_%j.log +#SBATCH --error=validate_split_%j.log +#SBATCH --partition=standard +#SBATCH --time=2:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=1 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd $HOME/code/ref-genome-analysis +source env/on-cluster.env + +python src/validate_split_stores.py "$@" diff --git a/data_loaders/riva_pangenome_analysis/README.md b/data_loaders/riva_pangenome_analysis/README.md index 9ecce89..50e9078 100644 --- a/data_loaders/riva_pangenome_analysis/README.md +++ b/data_loaders/riva_pangenome_analysis/README.md @@ -1,5 +1,23 @@ # RIVA Pangenome RefgetStore +## Prep + + +```sh +# Build gtars +cd ~/code/gtars +git checkout refgetstore +git pull +cd gtars-python +python -m pip install -e . + +# Next, install local refget: +cd ~/code/refget +git checkout dev +git pull +python -m pip install -e . 
+``` + ## Build the store ```python @@ -27,7 +45,7 @@ import os from pathlib import Path from refget.store import RefgetStore -store_dir = Path(os.path.expandvars("$BRICKYARD/datasets_downloaded/pangenome_fasta/refget_store2")) +store_dir = Path(os.path.expandvars("$BRICKYARD/datasets_downloaded/pangenome_fasta/refget_store")) store = RefgetStore.on_disk(str(store_dir)) @@ -35,10 +53,11 @@ store.list_collections() cm = store.get_collection_metadata("s0nMiOFHPsIBrm2bd3PkzWXKLKWQZq70") -EXAMPLE_COLLECTION = "0aHV7I-94paL9Z1H4LNlqsW3WxJhlou5" -EXAMPLE_SEQ_NAME = "JAGYVX010000006.1 unmasked:primary_assembly HG03540.pri.mat.f1_v2:JAGYVX010000006.1:1:96320881:1" +EXAMPLE_COLLECTION = "L5fggdWYz5tCr4v8XbPYoOwv79Sqmf1W" +EXAMPLE_SEQ_NAME = "JAGYVI010000261.1" + -record = store.get_sequence_by_collection_and_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) +record = store.get_sequence_by_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) ## Upload to S3 @@ -74,7 +93,7 @@ seq s1 seq.decode() store.get_collection_metadata(col1.digest) -col1_loaded.is_loaded() +store.is_collection_loaded(col1.digest) ``` diff --git a/data_loaders/riva_pangenome_analysis/update-gtars.sh b/data_loaders/riva_pangenome_analysis/update-gtars.sh new file mode 100644 index 0000000..2664b00 --- /dev/null +++ b/data_loaders/riva_pangenome_analysis/update-gtars.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Reinstall gtars and refget on rivanna + +ssh riva 'bash --login -s' << 'EOF' +set -e +source /etc/profile.d/modules.sh +module load miniforge/24.3.0-py3.11 + +# Build gtars (refget module only) +cd ~/code/gtars +git checkout dev +git pull +cd gtars-python +rm -f ../target/wheels/gtars-*.whl +maturin build --release --no-default-features --features refget +pip install ../target/wheels/gtars-*.whl --force-reinstall --no-deps + +# Install local refget +cd ~/code/refget +git checkout dev +git pull +python -m pip install -e . + +echo "Done!" +EOF diff --git a/deployment/demo_up.sh b/deployment/demo_up.sh index 2a1018e..96595c8 100644 --- a/deployment/demo_up.sh +++ b/deployment/demo_up.sh @@ -39,7 +39,8 @@ uvicorn seqcolapi.main:app --reload --port 8100 & PID=$! echo "Loading demo sequence collections..." 
-python data_loaders/load_demo_seqcols.py +# Unset storage locations so demo loader creates fake URLs instead of uploading +FASTA_STORAGE_LOCATIONS="" python data_loaders/load_demo_seqcols.py # Set up cleanup on Ctrl+C trap cleanup SIGINT EXIT diff --git a/deployment/seqcolapi-store/scom_config.json b/deployment/seqcolapi-store/scom_config.json new file mode 100644 index 0000000..df19a7d --- /dev/null +++ b/deployment/seqcolapi-store/scom_config.json @@ -0,0 +1,77 @@ +{ + "human": [ + "FnbS0xDAGOePD7lp6Xt0vnbcASzkk3gk", + "MkXYxV2-83BEcPmzskEVGJsJ7Qkb--gX", + "gHcfbUVnFzHv3QSqz2sSqVHdUQbDO8N5", + "q5Gn6dl5HbkZe6sRs9CZVGwGd1-XwyAq", + "lXa-sGAmSafYXHN4iEwup0EsQh5F1krA", + "7hP7E8o-q6H8qcqMHBEIYdbAK49PoEUZ", + "h_kOcvPobU9it_QR1LjPqeNpM56xjEJQ", + "GOIHeGSorDrbznRxihs5rIb6vTiTKaw7", + "kgTizBhIBf5BeEMTqBuyzRBi2AlwTRj6", + "Dx_M8skbJqROkfXhhQRtWejcCyewRbdL", + "xEg2q8K9gV4027DMTCiaUGLCcrqySglR", + "k4mLJvbFzZiw3o6SL8hh63V2u7AjDMrE", + "tmnbiAyj2fke68d_TYjq2g487US8C15r", + "ThZcNYiLuWWL86NdJ8dvvJG15K9mW3Fo", + "ieWVCws5MC2QFRKgH9QcN3u5_Y_3hPG6", + "DvAlkUMPq7CRnTYAfXGkQmAOfFqVMZHE", + "6chutju9QVJW0rdA-wgubHbtoTQ42o-6", + "YfZ0rklv8KY9DCtqG0iIX16zsgbuBgmM", + "6pxZqxG0TYtyVb8yp14ONpaxZ8msQqKr", + "5ryrHdbJHIgyZuE29h5uzITRL4kinZWG", + "vRjC5qM1Tc-fFjJo0TGRw4CVjFhmLG0f", + "RGSkyOkQ4qnSLhjrR_e3AI51Ac-RohAL", + "EiFob05aCWgVU_B_Ae0cypnQut3cxUP1", + "Ba88PY52_qeifhJrgUXyin6UITdXNsg3", + "-qU7PUmse_-pilikFDTJKyrt2_QJvUFy", + "0OgP0NkIM22lVYT5AMmkbb9knKDhk4I6", + "a_WL8OC7sFJfjux5m11M2bKl0dYepA1x", + "eN7J_gZz_meakMCeXXBEvY_njignMPxl", + "svwHqvgassl0loigdqVIQJdjo7NWDIx3", + "wdpZbFN0pd92H2VeKZBAp3riQN4nJXkK", + "NTeQ1GQMt2ocCFkS8Z3_qkvetZjabWSt", + "XJWKh8nsSqBFfcU0DIHMZohYyCWF-vcA", + "oLfPx0NOBKKXMIngGeQ4YewtU4Ge_wKz", + "tkRdBlSp5hewK4OpEJC87J9pw-ac0vOa", + "5SdkZCnuZL2YIptqjSBZfupo_O7HpD_B", + "H3RA3Jez6oqMOW87LGuwtSgKQqTgWVxx", + "Q3xii3AkJDCTXSO6Vg13kjbOutQu0KP9", + "H9er8ocYfIN2TOfyf6zMyeXXm7trXGP7", + "jFm0Uca8a7vK2cbuIQgBopjBilgCFheD", + "F9zeFn6M4EN4KGAJush7rZEU3GROoeNP", + "lWRRNMNypacEjnJCy-AYiDNUPy1brQGC", + "EGlYk1stOsAjmTALWkqKDqtZOSkyA_YH", + "u1HyLgIlq8M_XvEwy0oGqAvKGHJMGtxH" + ], + "mouse": [ + "6-UTIAyR94-nanfrhd_sAF6oHLyMd0zH", + "D-6wf8dsOttiVNnLSImSglRJvw_8Zr_j", + "bLbjXXCz_5qAonaDXcVuadx65QZkC7mb", + "TQORtixTJqM3Su9dmtACKc7hNHAceE4I", + "-e70JAQq4NJDg8-1Ab2XhHu6yYjeW-zu", + "3MS1-4k87pZ0-C80QDoUvhFmC0usPH28", + "PM8ODmBlTISp4Onv0aSBFaAfi3QVCGzx", + "hW3Ba5zoufl3-MGXQESlWXjsW56R5vPG", + "dMjpOU7EvpeZVb0gpPoZ7prNaxOu88Ta", + "qtPKGcXii2OuiyIoDA9K0jSKR62qCyzd", + "fXjBOJjw-DYsSnnfDBl5vtZu1N7lbnUl", + "XVhfRj6PCzLoGiEeXjpFv7vriVN02aPc", + "jpOqOhddb15iOm2SIdJSjsf-U5Uu7Def", + "99TjKCwZJJLjpBqkLpTgC2E_Y3OgKtMz", + "JL56x8L1q1Fs_-jHvZxBG01Vitac-CmO", + "9k1WrFA4Ys2fPifOOVswhOEdurvsaLfI", + "4mvptys3ckGgiUCcly4HOHB40IhpwwVT", + "ABEupc6KHmxtHGarfWFXTmu9mUcNnfoM", + "M7ZWnvUTT06JREJnMb_7UGwgGaG0-13s", + "FTBYBUoMhkOJ_-8lWpERVTxe62kstAol", + "KPagVaXI4XwQ1D0L0EW4eEuVl-otaAtX", + "2Ls1P5eUdKbvtOhjJx3s2R5r0_I-IB5Z", + "qcT5VVX5G3mN2O9OqeFR-F0POVuY2oGw", + "3rgz8-_XPSiTUYPamUTRF3DArhAhTint", + "wsDErYxgCXiPnb2FWZ4sXtx3B0YyruRu", + "WsOG-InFnIta0rqSy1KUBjrFrukbnE5j", + "JPyo8AqZzCyVaUx1lAkk6LbpyQPX4VUB", + "vygX07e7feibvucSnWj6hRScGMfc7B6P" + ] +} \ No newline at end of file diff --git a/deployment/seqcolapi-store/task_def.json b/deployment/seqcolapi-store/task_def.json index 644ba2c..70529b9 100644 --- a/deployment/seqcolapi-store/task_def.json +++ b/deployment/seqcolapi-store/task_def.json @@ -9,7 +9,7 @@ "entryPoint": null, "portMappings": [ { - "hostPort": 8105, + "hostPort": 8106, 
"protocol": "tcp", "containerPort": 80 } @@ -55,7 +55,7 @@ "dockerLabels": null, "systemControls": null, "privileged": null, - "name": "seqcolapi" + "name": "seqcolapi-store" } ], "placementConstraints": [], diff --git a/deployment/store_demo/store_demo.env b/deployment/store_demo/store_demo.env new file mode 100644 index 0000000..05ac069 --- /dev/null +++ b/deployment/store_demo/store_demo.env @@ -0,0 +1,3 @@ +export REFGET_STORE_PATH="/tmp/refget_demo_store" +export SEQCOLAPI_PORT="8100" +export SERVER_ENV="dev" diff --git a/deployment/store_demo_up.sh b/deployment/store_demo_up.sh new file mode 100755 index 0000000..7e3bb24 --- /dev/null +++ b/deployment/store_demo_up.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# This script starts a local store-backed demo of the SeqCol API service + +# Use local source instead of installed package +export PYTHONPATH="$(pwd):$PYTHONPATH" + +# Function to handle cleanup on Ctrl+C +cleanup() { + echo "Stopping uvicorn (PID: $PID)..." + kill -15 $PID 2>/dev/null + wait $PID 2>/dev/null + echo "Uvicorn stopped." + if [ -n "$STORE_HTTP_PID" ]; then + echo "Stopping store HTTP server (PID: $STORE_HTTP_PID)..." + kill -15 $STORE_HTTP_PID 2>/dev/null + wait $STORE_HTTP_PID 2>/dev/null + fi + echo "Cleaning up demo store at $REFGET_STORE_PATH..." + rm -rf "$REFGET_STORE_PATH" + exit 0 +} + +# Load environment variables +source deployment/store_demo/store_demo.env + +echo "Building demo store from test FASTA files..." +python data_loaders/demo_build_store.py test_fasta "$REFGET_STORE_PATH" + +STORE_HTTP_PORT=8200 +echo "Starting HTTP file server for store on port $STORE_HTTP_PORT..." +STORE_DIR="$REFGET_STORE_PATH" STORE_PORT="$STORE_HTTP_PORT" python -c ' +import http.server, socketserver, os + +class CORSHandler(http.server.SimpleHTTPRequestHandler): + def end_headers(self): + self.send_header("Access-Control-Allow-Origin", "*") + super().end_headers() + def __init__(self, *args, **kwargs): + super().__init__(*args, directory=os.environ["STORE_DIR"], **kwargs) + +socketserver.TCPServer(("", int(os.environ["STORE_PORT"])), CORSHandler).serve_forever() +' & +STORE_HTTP_PID=$! +export REFGET_STORE_HTTP_URL="http://localhost:$STORE_HTTP_PORT" + +echo "Running store-backed uvicorn API service..." +uvicorn seqcolapi.main:store_app --reload --port ${SEQCOLAPI_PORT:-8100} & +PID=$! + +echo "" +echo "Store-backed seqcolapi is running at http://localhost:${SEQCOLAPI_PORT:-8100}" +echo " API docs: http://localhost:${SEQCOLAPI_PORT:-8100}/docs" +echo " Service info: http://localhost:${SEQCOLAPI_PORT:-8100}/service-info" +echo " Store files: $REFGET_STORE_HTTP_URL" +echo "" + +# Set up cleanup on Ctrl+C +trap cleanup SIGINT EXIT + +# Wait indefinitely until Ctrl+C is pressed +wait $PID diff --git a/examples/remote_store.py b/examples/remote_store.py index 6db0447..09c8f79 100644 --- a/examples/remote_store.py +++ b/examples/remote_store.py @@ -29,7 +29,7 @@ # The store metadata (~1.5 MB) is fetched; sequences are loaded on-demand. # %% -store = RefgetStore.load_remote(cache_path=str(CACHE_DIR), remote_url=REMOTE_URL) +store = RefgetStore.open_remote(cache_path=str(CACHE_DIR), remote_url=REMOTE_URL) print(f"Loaded {len(store)} sequences from {REMOTE_URL}") @@ -45,10 +45,9 @@ # ## 3. List Sequences # %% -records = store.sequence_records() -for i, rec in enumerate(records[:5]): - m = rec.metadata - print(f"{i+1}. {m.name[:60]}...") +records = store.list_sequences() +for i, m in enumerate(records[:5]): + print(f"{i + 1}. 
{m.name[:60]}...") print(f" sha512t24u: {m.sha512t24u}, length: {m.length:,} bp") # %% [markdown] @@ -58,7 +57,7 @@ # %% seq_digest = "du4GiRD_OcmdmCn_RmImyb71YZ4XoCdk" -record = store.get_sequence_by_id(seq_digest) +record = store.get_sequence(seq_digest) if record: print(f"Name: {record.metadata.name}") print(f"Length: {record.metadata.length:,} bp") @@ -99,7 +98,7 @@ # Look up sequences by collection digest + sequence name. # %% -record = store.get_sequence_by_collection_and_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) +record = store.get_sequence_by_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) if record: print(f"Collection: {EXAMPLE_COLLECTION}") print(f"Sequence: {EXAMPLE_SEQ_NAME[:50]}...") diff --git a/frontend/.env.production b/frontend/.env.production new file mode 100644 index 0000000..4c7c46f --- /dev/null +++ b/frontend/.env.production @@ -0,0 +1 @@ +VITE_API_BASE=https://seqcolapi.databio.org diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 7a78b74..a5180d9 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -981,9 +981,9 @@ } }, "node_modules/@rollup/rollup-android-arm-eabi": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.35.0.tgz", - "integrity": "sha512-uYQ2WfPaqz5QtVgMxfN6NpLD+no0MYHDBywl7itPYd3K5TjjSghNKmX8ic9S8NU8w81NVhJv/XojcHptRly7qQ==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.59.0.tgz", + "integrity": "sha512-upnNBkA6ZH2VKGcBj9Fyl9IGNPULcjXRlg0LLeaioQWueH30p6IXtJEbKAgvyv+mJaMxSm1l6xwDXYjpEMiLMg==", "cpu": [ "arm" ], @@ -995,9 +995,9 @@ ] }, "node_modules/@rollup/rollup-android-arm64": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.35.0.tgz", - "integrity": "sha512-FtKddj9XZudurLhdJnBl9fl6BwCJ3ky8riCXjEw3/UIbjmIY58ppWwPEvU3fNu+W7FUsAsB1CdH+7EQE6CXAPA==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.59.0.tgz", + "integrity": "sha512-hZ+Zxj3SySm4A/DylsDKZAeVg0mvi++0PYVceVyX7hemkw7OreKdCvW2oQ3T1FMZvCaQXqOTHb8qmBShoqk69Q==", "cpu": [ "arm64" ], @@ -1009,9 +1009,9 @@ ] }, "node_modules/@rollup/rollup-darwin-arm64": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.35.0.tgz", - "integrity": "sha512-Uk+GjOJR6CY844/q6r5DR/6lkPFOw0hjfOIzVx22THJXMxktXG6CbejseJFznU8vHcEBLpiXKY3/6xc+cBm65Q==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.59.0.tgz", + "integrity": "sha512-W2Psnbh1J8ZJw0xKAd8zdNgF9HRLkdWwwdWqubSVk0pUuQkoHnv7rx4GiF9rT4t5DIZGAsConRE3AxCdJ4m8rg==", "cpu": [ "arm64" ], @@ -1023,9 +1023,9 @@ ] }, "node_modules/@rollup/rollup-darwin-x64": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.35.0.tgz", - "integrity": "sha512-3IrHjfAS6Vkp+5bISNQnPogRAW5GAV1n+bNCrDwXmfMHbPl5EhTmWtfmwlJxFRUCBZ+tZ/OxDyU08aF6NI/N5Q==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.59.0.tgz", + "integrity": "sha512-ZW2KkwlS4lwTv7ZVsYDiARfFCnSGhzYPdiOU4IM2fDbL+QGlyAbjgSFuqNRbSthybLbIJ915UtZBtmuLrQAT/w==", "cpu": [ "x64" ], @@ -1037,9 +1037,9 @@ ] }, "node_modules/@rollup/rollup-freebsd-arm64": { - "version": "4.35.0", - "resolved": 
"https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.35.0.tgz", - "integrity": "sha512-sxjoD/6F9cDLSELuLNnY0fOrM9WA0KrM0vWm57XhrIMf5FGiN8D0l7fn+bpUeBSU7dCgPV2oX4zHAsAXyHFGcQ==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.59.0.tgz", + "integrity": "sha512-EsKaJ5ytAu9jI3lonzn3BgG8iRBjV4LxZexygcQbpiU0wU0ATxhNVEpXKfUa0pS05gTcSDMKpn3Sx+QB9RlTTA==", "cpu": [ "arm64" ], @@ -1051,9 +1051,9 @@ ] }, "node_modules/@rollup/rollup-freebsd-x64": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.35.0.tgz", - "integrity": "sha512-2mpHCeRuD1u/2kruUiHSsnjWtHjqVbzhBkNVQ1aVD63CcexKVcQGwJ2g5VphOd84GvxfSvnnlEyBtQCE5hxVVw==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.59.0.tgz", + "integrity": "sha512-d3DuZi2KzTMjImrxoHIAODUZYoUUMsuUiY4SRRcJy6NJoZ6iIqWnJu9IScV9jXysyGMVuW+KNzZvBLOcpdl3Vg==", "cpu": [ "x64" ], @@ -1065,9 +1065,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm-gnueabihf": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.35.0.tgz", - "integrity": "sha512-mrA0v3QMy6ZSvEuLs0dMxcO2LnaCONs1Z73GUDBHWbY8tFFocM6yl7YyMu7rz4zS81NDSqhrUuolyZXGi8TEqg==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.59.0.tgz", + "integrity": "sha512-t4ONHboXi/3E0rT6OZl1pKbl2Vgxf9vJfWgmUoCEVQVxhW6Cw/c8I6hbbu7DAvgp82RKiH7TpLwxnJeKv2pbsw==", "cpu": [ "arm" ], @@ -1079,9 +1079,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm-musleabihf": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.35.0.tgz", - "integrity": "sha512-DnYhhzcvTAKNexIql8pFajr0PiDGrIsBYPRvCKlA5ixSS3uwo/CWNZxB09jhIapEIg945KOzcYEAGGSmTSpk7A==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.59.0.tgz", + "integrity": "sha512-CikFT7aYPA2ufMD086cVORBYGHffBo4K8MQ4uPS/ZnY54GKj36i196u8U+aDVT2LX4eSMbyHtyOh7D7Zvk2VvA==", "cpu": [ "arm" ], @@ -1093,9 +1093,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm64-gnu": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.35.0.tgz", - "integrity": "sha512-uagpnH2M2g2b5iLsCTZ35CL1FgyuzzJQ8L9VtlJ+FckBXroTwNOaD0z0/UF+k5K3aNQjbm8LIVpxykUOQt1m/A==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.59.0.tgz", + "integrity": "sha512-jYgUGk5aLd1nUb1CtQ8E+t5JhLc9x5WdBKew9ZgAXg7DBk0ZHErLHdXM24rfX+bKrFe+Xp5YuJo54I5HFjGDAA==", "cpu": [ "arm64" ], @@ -1107,9 +1107,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm64-musl": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.35.0.tgz", - "integrity": "sha512-XQxVOCd6VJeHQA/7YcqyV0/88N6ysSVzRjJ9I9UA/xXpEsjvAgDTgH3wQYz5bmr7SPtVK2TsP2fQ2N9L4ukoUg==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.59.0.tgz", + "integrity": "sha512-peZRVEdnFWZ5Bh2KeumKG9ty7aCXzzEsHShOZEFiCQlDEepP1dpUl/SrUNXNg13UmZl+gzVDPsiCwnV1uI0RUA==", "cpu": [ "arm64" ], @@ -1120,10 +1120,10 @@ "linux" ] }, - "node_modules/@rollup/rollup-linux-loongarch64-gnu": { - 
"version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.35.0.tgz", - "integrity": "sha512-5pMT5PzfgwcXEwOaSrqVsz/LvjDZt+vQ8RT/70yhPU06PTuq8WaHhfT1LW+cdD7mW6i/J5/XIkX/1tCAkh1W6g==", + "node_modules/@rollup/rollup-linux-loong64-gnu": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.59.0.tgz", + "integrity": "sha512-gbUSW/97f7+r4gHy3Jlup8zDG190AuodsWnNiXErp9mT90iCy9NKKU0Xwx5k8VlRAIV2uU9CsMnEFg/xXaOfXg==", "cpu": [ "loong64" ], @@ -1134,10 +1134,38 @@ "linux" ] }, - "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.35.0.tgz", - "integrity": "sha512-c+zkcvbhbXF98f4CtEIP1EBA/lCic5xB0lToneZYvMeKu5Kamq3O8gqrxiYYLzlZH6E3Aq+TSW86E4ay8iD8EA==", + "node_modules/@rollup/rollup-linux-loong64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.59.0.tgz", + "integrity": "sha512-yTRONe79E+o0FWFijasoTjtzG9EBedFXJMl888NBEDCDV9I2wGbFFfJQQe63OijbFCUZqxpHz1GzpbtSFikJ4Q==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-gnu": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.59.0.tgz", + "integrity": "sha512-sw1o3tfyk12k3OEpRddF68a1unZ5VCN7zoTNtSn2KndUE+ea3m3ROOKRCZxEpmT9nsGnogpFP9x6mnLTCaoLkA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.59.0.tgz", + "integrity": "sha512-+2kLtQ4xT3AiIxkzFVFXfsmlZiG5FXYW7ZyIIvGA7Bdeuh9Z0aN4hVyXS/G1E9bTP/vqszNIN/pUKCk/BTHsKA==", "cpu": [ "ppc64" ], @@ -1149,9 +1177,23 @@ ] }, "node_modules/@rollup/rollup-linux-riscv64-gnu": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.35.0.tgz", - "integrity": "sha512-s91fuAHdOwH/Tad2tzTtPX7UZyytHIRR6V4+2IGlV0Cej5rkG0R61SX4l4y9sh0JBibMiploZx3oHKPnQBKe4g==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.59.0.tgz", + "integrity": "sha512-NDYMpsXYJJaj+I7UdwIuHHNxXZ/b/N2hR15NyH3m2qAtb/hHPA4g4SuuvrdxetTdndfj9b1WOmy73kcPRoERUg==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.59.0.tgz", + "integrity": "sha512-nLckB8WOqHIf1bhymk+oHxvM9D3tyPndZH8i8+35p/1YiVoVswPid2yLzgX7ZJP0KQvnkhM4H6QZ5m0LzbyIAg==", "cpu": [ "riscv64" ], @@ -1163,9 +1205,9 @@ ] }, "node_modules/@rollup/rollup-linux-s390x-gnu": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.35.0.tgz", - "integrity": "sha512-hQRkPQPLYJZYGP+Hj4fR9dDBMIM7zrzJDWFEMPdTnTy95Ljnv0/4w/ixFw3pTBMEuuEuoqtBINYND4M7ujcuQw==", + "version": "4.59.0", + "resolved": 
"https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.59.0.tgz", + "integrity": "sha512-oF87Ie3uAIvORFBpwnCvUzdeYUqi2wY6jRFWJAy1qus/udHFYIkplYRW+wo+GRUP4sKzYdmE1Y3+rY5Gc4ZO+w==", "cpu": [ "s390x" ], @@ -1177,9 +1219,9 @@ ] }, "node_modules/@rollup/rollup-linux-x64-gnu": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.35.0.tgz", - "integrity": "sha512-Pim1T8rXOri+0HmV4CdKSGrqcBWX0d1HoPnQ0uw0bdp1aP5SdQVNBy8LjYncvnLgu3fnnCt17xjWGd4cqh8/hA==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.59.0.tgz", + "integrity": "sha512-3AHmtQq/ppNuUspKAlvA8HtLybkDflkMuLK4DPo77DfthRb71V84/c4MlWJXixZz4uruIH4uaa07IqoAkG64fg==", "cpu": [ "x64" ], @@ -1191,9 +1233,9 @@ ] }, "node_modules/@rollup/rollup-linux-x64-musl": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.35.0.tgz", - "integrity": "sha512-QysqXzYiDvQWfUiTm8XmJNO2zm9yC9P/2Gkrwg2dH9cxotQzunBHYr6jk4SujCTqnfGxduOmQcI7c2ryuW8XVg==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.59.0.tgz", + "integrity": "sha512-2UdiwS/9cTAx7qIUZB/fWtToJwvt0Vbo0zmnYt7ED35KPg13Q0ym1g442THLC7VyI6JfYTP4PiSOWyoMdV2/xg==", "cpu": [ "x64" ], @@ -1204,10 +1246,38 @@ "linux" ] }, + "node_modules/@rollup/rollup-openbsd-x64": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.59.0.tgz", + "integrity": "sha512-M3bLRAVk6GOwFlPTIxVBSYKUaqfLrn8l0psKinkCFxl4lQvOSz8ZrKDz2gxcBwHFpci0B6rttydI4IpS4IS/jQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ] + }, + "node_modules/@rollup/rollup-openharmony-arm64": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.59.0.tgz", + "integrity": "sha512-tt9KBJqaqp5i5HUZzoafHZX8b5Q2Fe7UjYERADll83O4fGqJ49O1FsL6LpdzVFQcpwvnyd0i+K/VSwu/o/nWlA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ] + }, "node_modules/@rollup/rollup-win32-arm64-msvc": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.35.0.tgz", - "integrity": "sha512-OUOlGqPkVJCdJETKOCEf1mw848ZyJ5w50/rZ/3IBQVdLfR5jk/6Sr5m3iO2tdPgwo0x7VcncYuOvMhBWZq8ayg==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.59.0.tgz", + "integrity": "sha512-V5B6mG7OrGTwnxaNUzZTDTjDS7F75PO1ae6MJYdiMu60sq0CqN5CVeVsbhPxalupvTX8gXVSU9gq+Rx1/hvu6A==", "cpu": [ "arm64" ], @@ -1219,9 +1289,9 @@ ] }, "node_modules/@rollup/rollup-win32-ia32-msvc": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.35.0.tgz", - "integrity": "sha512-2/lsgejMrtwQe44glq7AFFHLfJBPafpsTa6JvP2NGef/ifOa4KBoglVf7AKN7EV9o32evBPRqfg96fEHzWo5kw==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.59.0.tgz", + "integrity": "sha512-UKFMHPuM9R0iBegwzKF4y0C4J9u8C6MEJgFuXTBerMk7EJ92GFVFYBfOZaSGLu6COf7FxpQNqhNS4c4icUPqxA==", "cpu": [ "ia32" ], @@ -1232,10 +1302,24 @@ "win32" ] }, + "node_modules/@rollup/rollup-win32-x64-gnu": { + "version": "4.59.0", + "resolved": 
"https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.59.0.tgz", + "integrity": "sha512-laBkYlSS1n2L8fSo1thDNGrCTQMmxjYY5G0WFWjFFYZkKPjsMBsgJfGf4TLxXrF6RyhI60L8TMOjBMvXiTcxeA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, "node_modules/@rollup/rollup-win32-x64-msvc": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.35.0.tgz", - "integrity": "sha512-PIQeY5XDkrOysbQblSW7v3l1MDZzkTEzAfTPkj5VAu3FW8fS4ynyLg2sINp0fp3SjZ8xkRYpLqoKcYqAkhU1dw==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.59.0.tgz", + "integrity": "sha512-2HRCml6OztYXyJXAvdDXPKcawukWY2GpR5/nxKp4iBgiO3wcoEGkAaqctIbZcNB6KlUQBIqt8VYkNSj2397EfA==", "cpu": [ "x64" ], @@ -1288,10 +1372,9 @@ } }, "node_modules/@types/estree": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.6.tgz", - "integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==", - "dev": true, + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", "license": "MIT" }, "node_modules/@types/geojson": { @@ -3868,10 +3951,11 @@ } }, "node_modules/minimatch": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", - "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", + "version": "3.1.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.5.tgz", + "integrity": "sha512-VgjWUsnnT6n+NUk6eZq77zeFdpW2LWDzP6zFGrCbHXiYNul5Dzqk2HHQ5uFH2DNW5Xbp8+jVzaeNt94ssEEl4w==", "dev": true, + "license": "ISC", "dependencies": { "brace-expansion": "^1.1.7" }, @@ -4424,13 +4508,13 @@ "license": "Unlicense" }, "node_modules/rollup": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.35.0.tgz", - "integrity": "sha512-kg6oI4g+vc41vePJyO6dHt/yl0Rz3Thv0kJeVQ3D1kS3E5XSuKbPc29G4IpT/Kv1KQwgHVcN+HtyS+HYLNSvQg==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.59.0.tgz", + "integrity": "sha512-2oMpl67a3zCH9H79LeMcbDhXW/UmWG/y2zuqnF2jQq5uq9TbM9TVyXvA4+t+ne2IIkBdrLpAaRQAvo7YI/Yyeg==", "dev": true, "license": "MIT", "dependencies": { - "@types/estree": "1.0.6" + "@types/estree": "1.0.8" }, "bin": { "rollup": "dist/bin/rollup" @@ -4440,25 +4524,31 @@ "npm": ">=8.0.0" }, "optionalDependencies": { - "@rollup/rollup-android-arm-eabi": "4.35.0", - "@rollup/rollup-android-arm64": "4.35.0", - "@rollup/rollup-darwin-arm64": "4.35.0", - "@rollup/rollup-darwin-x64": "4.35.0", - "@rollup/rollup-freebsd-arm64": "4.35.0", - "@rollup/rollup-freebsd-x64": "4.35.0", - "@rollup/rollup-linux-arm-gnueabihf": "4.35.0", - "@rollup/rollup-linux-arm-musleabihf": "4.35.0", - "@rollup/rollup-linux-arm64-gnu": "4.35.0", - "@rollup/rollup-linux-arm64-musl": "4.35.0", - "@rollup/rollup-linux-loongarch64-gnu": "4.35.0", - "@rollup/rollup-linux-powerpc64le-gnu": "4.35.0", - "@rollup/rollup-linux-riscv64-gnu": "4.35.0", - "@rollup/rollup-linux-s390x-gnu": "4.35.0", - "@rollup/rollup-linux-x64-gnu": "4.35.0", - "@rollup/rollup-linux-x64-musl": "4.35.0", - "@rollup/rollup-win32-arm64-msvc": "4.35.0", - "@rollup/rollup-win32-ia32-msvc": "4.35.0", - 
"@rollup/rollup-win32-x64-msvc": "4.35.0", + "@rollup/rollup-android-arm-eabi": "4.59.0", + "@rollup/rollup-android-arm64": "4.59.0", + "@rollup/rollup-darwin-arm64": "4.59.0", + "@rollup/rollup-darwin-x64": "4.59.0", + "@rollup/rollup-freebsd-arm64": "4.59.0", + "@rollup/rollup-freebsd-x64": "4.59.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.59.0", + "@rollup/rollup-linux-arm-musleabihf": "4.59.0", + "@rollup/rollup-linux-arm64-gnu": "4.59.0", + "@rollup/rollup-linux-arm64-musl": "4.59.0", + "@rollup/rollup-linux-loong64-gnu": "4.59.0", + "@rollup/rollup-linux-loong64-musl": "4.59.0", + "@rollup/rollup-linux-ppc64-gnu": "4.59.0", + "@rollup/rollup-linux-ppc64-musl": "4.59.0", + "@rollup/rollup-linux-riscv64-gnu": "4.59.0", + "@rollup/rollup-linux-riscv64-musl": "4.59.0", + "@rollup/rollup-linux-s390x-gnu": "4.59.0", + "@rollup/rollup-linux-x64-gnu": "4.59.0", + "@rollup/rollup-linux-x64-musl": "4.59.0", + "@rollup/rollup-openbsd-x64": "4.59.0", + "@rollup/rollup-openharmony-arm64": "4.59.0", + "@rollup/rollup-win32-arm64-msvc": "4.59.0", + "@rollup/rollup-win32-ia32-msvc": "4.59.0", + "@rollup/rollup-win32-x64-gnu": "4.59.0", + "@rollup/rollup-win32-x64-msvc": "4.59.0", "fsevents": "~2.3.2" } }, @@ -5128,13 +5218,6 @@ "vega-util": "^2.1.0" } }, - "node_modules/vega-expression/node_modules/@types/estree": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", - "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", - "license": "MIT", - "peer": true - }, "node_modules/vega-force": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/vega-force/-/vega-force-5.1.0.tgz", @@ -5219,12 +5302,6 @@ "vega-util": "^2.1.0" } }, - "node_modules/vega-interpreter/node_modules/vega-util": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/vega-util/-/vega-util-2.1.0.tgz", - "integrity": "sha512-PGfp0m0QCufDmcxKJCWQy4Ov23FoF8DSXmoJwSezi3itQaa2hbxK0+xwsTMP2vy4PR16Pu25HMzgMwXVW1+33w==", - "license": "BSD-3-Clause" - }, "node_modules/vega-label": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/vega-label/-/vega-label-2.1.0.tgz", diff --git a/frontend/src/components/APINav.jsx b/frontend/src/components/APINav.jsx new file mode 100644 index 0000000..5ca752f --- /dev/null +++ b/frontend/src/components/APINav.jsx @@ -0,0 +1,50 @@ +import { Link } from 'react-router-dom'; +import { useApiExplorerStore } from '../stores/apiExplorerStore.js'; + +const APINav = ({ active }) => { + const { apiUrl } = useApiExplorerStore(); + const urlParam = apiUrl ? `?url=${encodeURIComponent(apiUrl)}` : ''; + + const items = [ + { key: 'collections', label: 'Collections', path: '/explore-api/collections', icon: 'bi-collection' }, + { key: 'compare', label: 'Compare (SCIM)', path: '/explore-api/compare', icon: 'bi-arrows-angle-contract' }, + ]; + + return ( +
+
+

+ + API Explorer +

+ + + Change API + +
+ + {apiUrl && ( +
+ + {apiUrl} +
+ )} + +
    + {items.map((item) => ( +
  • + + + {item.label} + +
  • + ))} +
+
+ ); +}; + +export { APINav }; diff --git a/frontend/src/components/CliSnippet.jsx b/frontend/src/components/CliSnippet.jsx new file mode 100644 index 0000000..61e5ec5 --- /dev/null +++ b/frontend/src/components/CliSnippet.jsx @@ -0,0 +1,136 @@ +import { useState } from 'react'; + +/** + * A copyable CLI command snippet. + * Shows a monospace command with a copy button. + */ +const CliCommand = ({ command }) => { + const [copied, setCopied] = useState(false); + + const handleCopy = () => { + navigator.clipboard.writeText(command).then(() => { + setCopied(true); + setTimeout(() => setCopied(false), 1500); + }); + }; + + return ( +
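// Sketch of the copy-handler pattern CliCommand uses above. navigator.clipboard
// is only available in secure contexts (https or localhost) and writeText can
// reject, so a hardened variant would guard and catch; the helper name is
// hypothetical, the 1.5 s reset matches the component above:
const copyText = (text, setCopied) => {
  if (!navigator.clipboard) return; // e.g. plain-http deployment (assumption)
  navigator.clipboard
    .writeText(text)
    .then(() => {
      setCopied(true);
      setTimeout(() => setCopied(false), 1500);
    })
    .catch(() => setCopied(false)); // permission denied or window not focused
};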
+
{command}
+ +
+ ); +}; + +/** + * A collapsible panel of CLI commands for a given context. + * Props: + * commands: [{label, command}] + */ +const CliSnippet = ({ commands }) => { + const [open, setOpen] = useState(false); + + if (!commands || commands.length === 0) return null; + + return ( +
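// Example usage of CliSnippet, matching the documented commands shape
// ({label, command}); the remote URL is a placeholder and the command string
// mirrors the refget CLI calls used elsewhere in this patch:
const exampleCommands = [
  { label: 'List collections', command: 'refget store list --remote https://example.org/store' },
];
const DemoSnippet = () => <CliSnippet commands={exampleCommands} />;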
+ + {open && ( +
+ {commands.map(({ label, command }, i) => ( +
+ {label && {label}} + +
+ ))} + + Install: pip install refget + +
+ )} +
+ ); +}; + +/** + * A small icon button for table rows that opens a modal with CLI/Python snippets. + * Props: + * snippets: [{ label, cli, python }] + * title: modal title + */ +const RowCodeButton = ({ snippets, title = 'Code' }) => { + const [show, setShow] = useState(false); + const [tab, setTab] = useState('cli'); + + return ( + <> + + {show && ( + <> +
setShow(false)} /> +
setShow(false)}> +
e.stopPropagation()}> +
+
+
+ + {title} +
+
+
+
    +
  • + +
  • +
  • + +
  • +
+ {snippets.map((snippet, i) => ( +
+ {snippet.label && {snippet.label}} + +
+ ))} +
+
+
+
+ + )} + + ); +}; + +export { CliSnippet, CliCommand, RowCodeButton }; diff --git a/frontend/src/components/CompareTable.jsx b/frontend/src/components/CompareTable.jsx index 4b349b3..0ad9ca4 100644 --- a/frontend/src/components/CompareTable.jsx +++ b/frontend/src/components/CompareTable.jsx @@ -36,7 +36,7 @@ const CompareTable = ({ seqColDict }) => { '=' ) : ( { + const [copied, setCopied] = useState(false); + const handleCopy = (e) => { + e.stopPropagation(); + navigator.clipboard.writeText(value).then(() => { + setCopied(true); + setTimeout(() => setCopied(false), 1500); + }); + }; + return ( + + {value} + + + ); +}; + +export { CopyableDigest }; diff --git a/frontend/src/components/ExplorerNav.jsx b/frontend/src/components/ExplorerNav.jsx new file mode 100644 index 0000000..6562691 --- /dev/null +++ b/frontend/src/components/ExplorerNav.jsx @@ -0,0 +1,37 @@ +import { Link } from 'react-router-dom'; +import { useUnifiedStore } from '../stores/unifiedStore.js'; + +const ExplorerNav = ({ active }) => { + const { hasStore, hasAPI } = useUnifiedStore(); + + const items = [ + { key: 'collections', label: 'Collections', path: '/collections', icon: 'bi-collection' }, + { key: 'sequences', label: 'Sequences', path: '/sequences', icon: 'bi-list-ol', requireStore: true }, + { key: 'aliases', label: 'Aliases', path: '/aliases', icon: 'bi-tag', requireStore: true }, + { key: 'compare', label: 'Compare', path: '/compare', icon: 'bi-arrows-angle-contract', requireAPI: true }, + ]; + + const visibleItems = items.filter((item) => { + if (item.requireStore && !hasStore) return false; + if (item.requireAPI && !hasAPI) return false; + return true; + }); + + return ( +
    + {visibleItems.map((item) => ( +
  • + + + {item.label} + +
  • + ))} +
+ ); +}; + +export { ExplorerNav }; diff --git a/frontend/src/components/ObjectLists.jsx b/frontend/src/components/ObjectLists.jsx index b6c99c8..ecf39de 100644 --- a/frontend/src/components/ObjectLists.jsx +++ b/frontend/src/components/ObjectLists.jsx @@ -3,7 +3,8 @@ import { useLoaderData } from 'react-router-dom'; // Basic list of Sequence Collections const CollectionList = ({ collections }) => { - const seqColList = collections || useLoaderData()[0]; + const loaderData = useLoaderData(); + const seqColList = collections || loaderData[0]; return ( <> @@ -24,7 +25,8 @@ const CollectionList = ({ collections }) => { }; const AttributeList = ({ attributeName, attributeDigests }) => { - const attrList = attributeDigests || useLoaderData()[0]; + const loaderData = useLoaderData(); + const attrList = attributeDigests || loaderData[0]; return ( <> @@ -44,7 +46,8 @@ const AttributeList = ({ attributeName, attributeDigests }) => { // Basic list of Pangenomes const PangenomeList = ({ pangenomes }) => { - const pangenomeList = pangenomes || useLoaderData()[1]; + const loaderData = useLoaderData(); + const pangenomeList = pangenomes || loaderData[1]; return ( <> diff --git a/frontend/src/components/ReportCard.jsx b/frontend/src/components/ReportCard.jsx new file mode 100644 index 0000000..7a80584 --- /dev/null +++ b/frontend/src/components/ReportCard.jsx @@ -0,0 +1,77 @@ +import { useState } from 'react'; + +const ReportCard = ({ + title, + tooltipText, + messageArray, + colorScheme = 'info' +}) => { + const [showTooltip, setShowTooltip] = useState(false); + + // Map color scheme to Bootstrap CSS classes + const headerClass = `bg-${colorScheme} bg-opacity-25`; + const titleClass = `fw-medium text-${colorScheme}-emphasis`; + const iconClass = `ms-2 text-${colorScheme}-emphasis`; + const bodyClass = `bg-${colorScheme} bg-opacity-10 rounded-bottom-1`; + + return ( +
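// Example usage of ReportCard with the props defined above. colorScheme plugs
// into Bootstrap utility classes (bg-*, text-*-emphasis), so any Bootstrap
// theme color name should work since the full Bootstrap CSS is imported in
// main.jsx. The message shown is one coordinateSystemInterpretation can emit;
// the tooltip text is illustrative:
const DemoReport = () => (
  <ReportCard
    title="Coordinate System"
    tooltipText="Compatibility of sequence names and lengths, ignoring sequence content."
    messageArray={["Collection B's coordinate system is a subset of A's."]}
    colorScheme="success"
  />
);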
+
+
+ + {title} + +
+ setShowTooltip(true)} + onMouseLeave={() => setShowTooltip(false)} + > + + + {showTooltip && ( +
+ {tooltipText} +
+ )} +
+
+
+ +
+
    + {messageArray.map((msg, index) => ( +
  • {msg}
  • + ))} +
+
+
+ ); +}; + +export { ReportCard }; diff --git a/frontend/src/components/SequenceTable.jsx b/frontend/src/components/SequenceTable.jsx new file mode 100644 index 0000000..1df6700 --- /dev/null +++ b/frontend/src/components/SequenceTable.jsx @@ -0,0 +1,188 @@ +import { useState, useMemo } from 'react'; +import { CopyableDigest } from './CopyableDigest.jsx'; +import { CliCommand } from './CliSnippet.jsx'; + +const PAGE_SIZE = 50; + +/** + * Paginated sequence table with detail modal. + * + * Props: + * sequences: array of {name, length, sha512t24u, md5, alphabet, description} + * storeUrl: optional store URL for code snippets in modal + * sortable: if true, column headers are clickable to sort + */ +const SequenceTable = ({ sequences, storeUrl, sortable = false }) => { + const [page, setPage] = useState(0); + const [selectedSeq, setSelectedSeq] = useState(null); + const [codeTab, setCodeTab] = useState('cli'); + const [sortCol, setSortCol] = useState(null); + const [sortAsc, setSortAsc] = useState(true); + + const handleSort = (col) => { + if (!sortable) return; + if (sortCol === col) setSortAsc(!sortAsc); + else { setSortCol(col); setSortAsc(true); } + setPage(0); + }; + + const sorted = useMemo(() => { + if (!sortable || !sortCol) return sequences; + return [...sequences].sort((a, b) => { + const va = a[sortCol]; + const vb = b[sortCol]; + if (typeof va === 'number' && typeof vb === 'number') + return sortAsc ? va - vb : vb - va; + return sortAsc + ? String(va).localeCompare(String(vb)) + : String(vb).localeCompare(String(va)); + }); + }, [sequences, sortCol, sortAsc, sortable]); + + const totalPages = Math.ceil(sorted.length / PAGE_SIZE); + const paged = sorted.slice(page * PAGE_SIZE, (page + 1) * PAGE_SIZE); + + const SortIcon = ({ col }) => { + if (!sortable || sortCol !== col) return null; + return ; + }; + + const thStyle = sortable ? { cursor: 'pointer' } : {}; + + return ( + <> +
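// Example usage of SequenceTable, following the sequence shape documented in
// the JSDoc above. The digest values are placeholders, and the alphabet value
// and store URL are illustrative assumptions:
const demoSequences = [
  { name: 'chr1', length: 248956422, sha512t24u: 'placeholder', md5: 'placeholder', alphabet: 'dna' },
];
const DemoSequenceTable = () => (
  <SequenceTable sequences={demoSequences} storeUrl="https://example.org/store" sortable />
);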
+ + + + + + + + + + + {paged.map((seq, i) => ( + + + + + + + ))} + +
handleSort('name')}> + Name + handleSort('length')}> + Length + handleSort('sha512t24u')}> + SHA-512/24u +
{seq.name}{seq.length.toLocaleString()} + +
+
+ + {totalPages > 1 && ( +
+ +
+ )} + + {/* Sequence detail modal */} + {selectedSeq && ( + <> +
setSelectedSeq(null)} /> +
setSelectedSeq(null)}> +
e.stopPropagation()}> +
+
+
{selectedSeq.name}
+
+
+ + + + + + + + + + + + + + + + + + + {selectedSeq.description && ( + + + + + )} + +
Length{selectedSeq.length.toLocaleString()}
Alphabet{selectedSeq.alphabet}
SHA-512/24u
MD5
Description{selectedSeq.description}
+ {storeUrl && ( + <> +
Code
+
    +
  • + +
  • +
  • + +
  • +
+ Get sequence + + + )} +
+
+
+
+ + )} + + ); +}; + +export { SequenceTable }; diff --git a/frontend/src/components/StoreNav.jsx b/frontend/src/components/StoreNav.jsx new file mode 100644 index 0000000..1ba2cd5 --- /dev/null +++ b/frontend/src/components/StoreNav.jsx @@ -0,0 +1,222 @@ +import { useState } from 'react'; +import { Link } from 'react-router-dom'; +import { useExplorerStore } from '../stores/explorerStore.js'; +import { CliCommand } from './CliSnippet.jsx'; + +const StoreNav = ({ active, storeUrlParam, collectionDigest }) => { + const [showCode, setShowCode] = useState(false); + const [codeTab, setCodeTab] = useState('cli'); + const { storeUrl } = useExplorerStore(); + + const remote = storeUrl || new URLSearchParams(storeUrlParam).get('url') || ''; + + const items = [ + { key: 'overview', label: 'Overview', path: '/explore-store/overview', icon: 'bi-house' }, + { key: 'sequences', label: 'Sequences', path: '/explore-store/sequences', icon: 'bi-list-ol' }, + { key: 'aliases', label: 'Aliases', path: '/explore-store/aliases', icon: 'bi-tag' }, + ]; + + const snippetGroups = [ + { + heading: 'Setup', + snippets: [ + { + label: 'Subscribe to this remote store', + cli: `refget config add store \\ + ${remote}`, + python: `import refget + +refget.config.add("store", "${remote}")`, + }, + ], + }, + { + heading: 'Browse', + snippets: [ + { + label: 'List collections', + cli: `refget store list \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.list()`, + }, + { + label: 'List sequences', + cli: `refget store list --sequences \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.list(sequences=True)`, + }, + { + label: 'Store statistics', + cli: `refget store stats \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +print(store)`, + }, + ], + }, + ]; + + if (collectionDigest) { + snippetGroups.push({ + heading: 'Collection', + snippets: [ + { + label: 'Get collection metadata', + cli: `refget store get \\ + ${collectionDigest} \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.get("${collectionDigest}")`, + }, + { + label: 'Pull collection to local cache', + cli: `refget store pull \\ + ${collectionDigest} \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.pull("${collectionDigest}")`, + }, + { + label: 'Export as FASTA', + cli: `refget store export \\ + ${collectionDigest} \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.export("${collectionDigest}")`, + }, + { + label: 'Generate .fai index', + cli: `refget store fai \\ + ${collectionDigest} \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.fai("${collectionDigest}")`, + }, + { + label: 'Generate chrom.sizes', + cli: `refget store chrom-sizes \\ + ${collectionDigest} \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.chrom_sizes("${collectionDigest}")`, + }, + ], + }); + } + + return ( +
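// StoreNav renders its modal from snippetGroups entries shaped as
// { heading, snippets: [{ label, cli, python }] }. A page could extend the
// list the same way the collectionDigest branch does above; <digest> and
// <storeUrl> are placeholders, the label and commands are taken verbatim
// from the snippets defined above:
const extraGroup = {
  heading: 'Collection',
  snippets: [
    {
      label: 'Export as FASTA',
      cli: 'refget store export <digest> --remote <storeUrl>',
      python: 'store = refget.RefgetStore("<storeUrl>")\nstore.export("<digest>")',
    },
  ],
};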
+
+

+ + RefgetStore Explorer +

+
+ + + + Change Store + +
+
+ + {/* Code Snippets Modal */} + {showCode && ( + <> +
setShowCode(false)} /> +
setShowCode(false)}> +
e.stopPropagation()}> +
+
+
+ + Code Snippets +
+
+
+
    +
  • + +
  • +
  • + +
  • +
+ + {snippetGroups.map((group, gi) => ( +
+
{group.heading}
+ {group.snippets.map((snippet, i) => ( +
+ {snippet.label} + +
+ ))} +
+ ))} +
+ + Install: pip install refget + +
+
+
+
+ + )} + +
    + {items.map((item) => ( +
  • + + + {item.label} + +
  • + ))} +
+
+ ); +}; + +export { StoreNav }; diff --git a/frontend/src/features/digest/FastaDropzone.jsx b/frontend/src/components/digest/FastaDropzone.jsx similarity index 100% rename from frontend/src/features/digest/FastaDropzone.jsx rename to frontend/src/components/digest/FastaDropzone.jsx diff --git a/frontend/src/features/digest/SeqColResult.jsx b/frontend/src/components/digest/SeqColResult.jsx similarity index 100% rename from frontend/src/features/digest/SeqColResult.jsx rename to frontend/src/components/digest/SeqColResult.jsx diff --git a/frontend/src/features/digest/digest.css b/frontend/src/components/digest/digest.css similarity index 100% rename from frontend/src/features/digest/digest.css rename to frontend/src/components/digest/digest.css diff --git a/frontend/src/components/digest/fastaDigestWorker.js b/frontend/src/components/digest/fastaDigestWorker.js new file mode 100644 index 0000000..86e9178 --- /dev/null +++ b/frontend/src/components/digest/fastaDigestWorker.js @@ -0,0 +1,118 @@ +// Web Worker for streaming FASTA digest computation. +// Runs in background thread to avoid freezing UI. +// Uses streaming API for files of any size. + +const PROGRESS_INTERVAL_MS = 200; // Max 5 updates/sec +let lastProgressTime = 0; +let wasmModule = null; +let cancelled = false; + +async function initWasm() { + if (wasmModule) return wasmModule; + + const gtars = await import('@databio/gtars'); + await gtars.default(); + wasmModule = gtars; + return wasmModule; +} + +self.onmessage = async (e) => { + const { type } = e.data; + + if (type === 'cancel') { + cancelled = true; + return; + } + + const { file } = e.data; + cancelled = false; + + const stats = { chunks: 0, totalBytes: 0, startTime: Date.now() }; + + try { + self.postMessage({ type: 'status', message: 'Loading WASM module...' }); + const gtars = await initWasm(); + + // Create streaming hasher + const hasher = gtars.fastaHasherNew(); + + try { + self.postMessage({ type: 'status', message: 'Processing file...' }); + + // Stream file chunks to WASM + const stream = file.stream(); + const reader = stream.getReader(); + + let bytesProcessed = 0; + const totalSize = file.size; + + while (true) { + if (cancelled) { + reader.cancel(); + gtars.fastaHasherFree(hasher); + self.postMessage({ type: 'cancelled' }); + return; + } + + const { done, value } = await reader.read(); + if (done) break; + + try { + gtars.fastaHasherUpdate(hasher, value); + } catch (err) { + gtars.fastaHasherFree(hasher); + const msg = err.message || ''; + if (msg.toLowerCase().includes('fasta') || msg.toLowerCase().includes('parse')) { + self.postMessage({ type: 'error', message: `Invalid FASTA format: ${msg}`, category: 'parse' }); + } else { + self.postMessage({ type: 'error', message: `WASM processing error: ${msg}`, category: 'wasm' }); + } + return; + } + + stats.chunks++; + bytesProcessed += value.length; + stats.totalBytes = bytesProcessed; + + const now = Date.now(); + if (now - lastProgressTime >= PROGRESS_INTERVAL_MS) { + lastProgressTime = now; + self.postMessage({ + type: 'progress', + bytesProcessed, + totalSize, + percent: Math.round(100 * bytesProcessed / totalSize) + }); + } + } + + // Send final progress to ensure 100% + self.postMessage({ type: 'progress', bytesProcessed: totalSize, totalSize, percent: 100 }); + + // Finalize and get result + self.postMessage({ type: 'status', message: 'Computing final digests...' 
}); + const result = gtars.fastaHasherFinish(hasher); + + stats.elapsedMs = Date.now() - stats.startTime; + stats.avgChunkSize = stats.chunks > 0 ? Math.round(stats.totalBytes / stats.chunks) : 0; + self.postMessage({ type: 'result', result, stats }); + + } catch (err) { + gtars.fastaHasherFree(hasher); // Cleanup on error + throw err; + } + + } catch (error) { + const msg = error.message || 'Processing failed'; + let category = 'unknown'; + if (msg.toLowerCase().includes('gzip') || msg.toLowerCase().includes('decompress') || msg.toLowerCase().includes('corrupt')) { + category = 'gzip'; + self.postMessage({ type: 'error', message: `File appears corrupted or is not valid gzip: ${msg}`, category }); + } else if (msg.toLowerCase().includes('stream') || msg.toLowerCase().includes('read')) { + category = 'stream'; + self.postMessage({ type: 'error', message: `Error reading file: ${msg}`, category }); + } else { + self.postMessage({ type: 'error', message: msg, category }); + } + } +}; diff --git a/frontend/src/features/digest/fastaDigestWorker.js b/frontend/src/features/digest/fastaDigestWorker.js deleted file mode 100644 index 616aed6..0000000 --- a/frontend/src/features/digest/fastaDigestWorker.js +++ /dev/null @@ -1,65 +0,0 @@ -// Web Worker for streaming FASTA digest computation. -// Runs in background thread to avoid freezing UI. -// Uses streaming API for files of any size. - -let wasmModule = null; - -async function initWasm() { - if (wasmModule) return wasmModule; - - const gtars = await import('@databio/gtars'); - await gtars.default(); - wasmModule = gtars; - return wasmModule; -} - -self.onmessage = async (e) => { - const { file } = e.data; - - try { - self.postMessage({ type: 'status', message: 'Loading WASM module...' }); - const gtars = await initWasm(); - - // Create streaming hasher - const hasher = gtars.fastaHasherNew(); - - try { - self.postMessage({ type: 'status', message: 'Processing file...' }); - - // Stream file chunks to WASM - const stream = file.stream(); - const reader = stream.getReader(); - - let bytesProcessed = 0; - const totalSize = file.size; - - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - // Pass chunk directly to Rust - no parsing in JS - gtars.fastaHasherUpdate(hasher, value); - - bytesProcessed += value.length; - self.postMessage({ - type: 'progress', - bytesProcessed, - totalSize, - percent: Math.round(100 * bytesProcessed / totalSize) - }); - } - - // Finalize and get result - self.postMessage({ type: 'status', message: 'Computing final digests...' 
}); - const result = gtars.fastaHasherFinish(hasher); - self.postMessage({ type: 'result', result }); - - } catch (err) { - gtars.fastaHasherFree(hasher); // Cleanup on error - throw err; - } - - } catch (error) { - self.postMessage({ type: 'error', message: error.message || 'Processing failed' }); - } -}; diff --git a/frontend/src/features/digest/index.js b/frontend/src/features/digest/index.js deleted file mode 100644 index 279fe78..0000000 --- a/frontend/src/features/digest/index.js +++ /dev/null @@ -1 +0,0 @@ -export { default as DigestPage } from './DigestPage'; diff --git a/frontend/src/main.jsx b/frontend/src/main.jsx index 2268aef..6d31897 100644 --- a/frontend/src/main.jsx +++ b/frontend/src/main.jsx @@ -11,41 +11,56 @@ import 'bootstrap/dist/css/bootstrap.css'; import 'bootstrap/dist/js/bootstrap.bundle.js'; import 'bootstrap-icons/font/bootstrap-icons.css'; -import { CollectionView } from './pages/CollectionView.jsx'; +import { useUnifiedStore } from './stores/unifiedStore.js'; + +// Unified Explorer pages +import { LandingPage } from './pages/LandingPage.jsx'; +import { Explorer } from './pages/Explorer.jsx'; +import { ExplorerCollection } from './pages/ExplorerCollection.jsx'; +import { ExplorerSequences } from './pages/ExplorerSequences.jsx'; +import { ExplorerAliases } from './pages/ExplorerAliases.jsx'; + +// API Explorer pages +import { APIExplorer } from './pages/APIExplorer.jsx'; +import { APICollections } from './pages/APICollections.jsx'; +import { APICollectionView } from './pages/APICollectionView.jsx'; +import { APICompare } from './pages/APICompare.jsx'; +import { APICompliance } from './pages/APICompliance.jsx'; + +// Store Explorer pages +import { StoreExplorer } from './pages/StoreExplorer.jsx'; +import { StoreOverview } from './pages/StoreOverview.jsx'; +import { StoreSequences } from './pages/StoreSequences.jsx'; +import { StoreCollection } from './pages/StoreCollection.jsx'; +import { StoreAliases } from './pages/StoreAliases.jsx'; + +// Site-specific pages import { PangenomeView } from './pages/PangenomeView.jsx'; import { AttributeView } from './pages/AttributeView.jsx'; import { DemoPage } from './pages/DemoPage.jsx'; import { SCIM } from './pages/SCIM.jsx'; import { SCOM } from './pages/SCOM.jsx'; -import { HomePage } from './pages/HomePage.jsx'; import { HPRCGenomes } from './pages/HPRCGenomes.jsx'; import { HumanReferencesView } from './pages/HumanReferences.jsx'; -import { DigestPage } from './features/digest'; +import { DigestPage } from './pages/DigestPage.jsx'; +import { CompliancePage } from './pages/CompliancePage.jsx'; import { fetchServiceInfo, fetchPangenomeLevels, - fetchSeqColList, fetchAllSeqCols, fetchCollectionLevels, fetchComparison, fetchAttribute, } from './services/fetchData.jsx'; -import { - AttributeValue, - LinkedAttributeDigest, -} from './components/ValuesAndDigests.jsx'; -import { CollectionList, PangenomeList } from './components/ObjectLists.jsx'; -import { copyToClipboardIcon, copyToClipboard } from './utilities'; +import { copyToClipboard } from './utilities'; import { Outlet, - Link, createBrowserRouter, RouterProvider, useLoaderData, - useParams, useRouteError, useNavigate, useLocation, @@ -53,44 +68,38 @@ import { import { API_BASE } from './utilities.jsx'; -const Level1Collection = ({ collection }) => { - return ( -
- Names:{' '} - - {collection.names} - -
- Lengths:{' '} - - {collection.lengths} - -
- Sequences:{' '} - - {collection.sequences} - -
-
- ); -}; +const NavItem = ({ path, label, location, navigate, isDropdown }) => { + const active = path === '/' + ? location === '' + : location.startsWith(path.substring(1)); -const Level2Collection = ({ collection }) => { return ( -
-

Names

-
{JSON.stringify(collection.names, null, 2)}
-

Lengths

-
{JSON.stringify(collection.lengths, null, 2)}
-

Sequences

-
{JSON.stringify(collection.sequences, null, 2)}
-
+
  • + navigate(path)} + className={`nav-link cursor-pointer ${active ? 'fw-medium text-black' : 'fw-light'}`} + > + {label} + +
  • ); }; const Nav = () => { const navigate = useNavigate(); const location = useLocation().pathname.substring(1) || ''; + const { serviceInfo } = useUnifiedStore(); + const scomEnabled = serviceInfo?.seqcol?.scom?.enabled; + + const navTo = (path) => { + navigate(path); + // Close any open Bootstrap dropdown + document.querySelectorAll('.dropdown-menu.show').forEach((el) => { + el.classList.remove('show'); + el.previousElementSibling?.classList.remove('show'); + el.previousElementSibling?.setAttribute('aria-expanded', 'false'); + }); + }; return (
    @@ -197,36 +212,82 @@ const Nav = () => { ); }; +class ReactErrorBoundary extends React.Component { + constructor(props) { + super(props); + this.state = { hasError: false, error: null }; + } + + static getDerivedStateFromError(error) { + return { hasError: true, error }; + } + + componentDidCatch(error, errorInfo) { + console.error('ReactErrorBoundary caught an error:', error, errorInfo); + } + + render() { + if (this.state.hasError) { + return ( +
    + Something went wrong. +

    {this.state.error?.message || 'An unexpected error occurred.'}

    + +
    + ); + } + return this.props.children; + } +} + const App = () => { const loaderData = useLoaderData(); - const refgetVersion = loaderData['version']['refget_pkg_version']; + const apiAvailable = loaderData != null; + const version = loaderData?.version; + return ( <>
    + + ); +}; + +export { APICollectionView }; diff --git a/frontend/src/pages/APICollections.jsx b/frontend/src/pages/APICollections.jsx new file mode 100644 index 0000000..c8eb919 --- /dev/null +++ b/frontend/src/pages/APICollections.jsx @@ -0,0 +1,99 @@ +import { useState, useEffect } from 'react'; +import { Link, useSearchParams } from 'react-router-dom'; +import { useApiExplorerStore } from '../stores/apiExplorerStore.js'; +import { APINav } from '../components/APINav.jsx'; +import { fetchSeqColList } from '../services/fetchData.jsx'; + +const APICollections = () => { + const [searchParams] = useSearchParams(); + const { apiUrl, probeApi, loading: probing } = useApiExplorerStore(); + const [data, setData] = useState(null); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + + const urlParam = searchParams.get('url'); + const effectiveUrl = apiUrl || urlParam; + + useEffect(() => { + const init = async () => { + try { + if (urlParam && !apiUrl) { + await probeApi(urlParam); + } + const result = await fetchSeqColList(effectiveUrl); + setData(result); + } catch (err) { + setError(err.message); + } finally { + setLoading(false); + } + }; + init(); + }, [urlParam]); // eslint-disable-line react-hooks/exhaustive-deps + + if (loading || probing) { + return ( +
    + +
    +
    +

    Loading collections...

    +
    +
    + ); + } + + if (error) { + return ( +
    + +
    {error}
    +
    + ); + } + + if (!data || !Array.isArray(data) || data.length < 1) { + return ( +
    + +
    No data available.
    +
    + ); + } + + const collections = data[0]; + const urlSuffix = effectiveUrl ? `?url=${encodeURIComponent(effectiveUrl)}` : ''; + + return ( +
    + + +
    +
    +
    + {collections?.pagination?.total ?? 0} collections +
    +
    +
    + + {collections?.results?.length > 0 ? ( +
      + {collections.results.map((digest) => ( +
    • + + {digest} + +
    • + ))} +
    + ) : ( +

    No collections found.

    + )} +
    + ); +}; + +export { APICollections }; diff --git a/frontend/src/pages/APICompare.jsx b/frontend/src/pages/APICompare.jsx new file mode 100644 index 0000000..a485f19 --- /dev/null +++ b/frontend/src/pages/APICompare.jsx @@ -0,0 +1,26 @@ +import { useEffect } from 'react'; +import { useSearchParams } from 'react-router-dom'; +import { useApiExplorerStore } from '../stores/apiExplorerStore.js'; +import { APINav } from '../components/APINav.jsx'; +import { SCIM } from './SCIM.jsx'; + +const APICompare = () => { + const [searchParams] = useSearchParams(); + const { apiUrl, probeApi } = useApiExplorerStore(); + const urlParam = searchParams.get('url'); + + useEffect(() => { + if (urlParam && !apiUrl) { + probeApi(urlParam).catch(() => {}); + } + }, [urlParam]); // eslint-disable-line react-hooks/exhaustive-deps + + return ( +
    + + +
    + ); +}; + +export { APICompare }; diff --git a/frontend/src/pages/APICompliance.jsx b/frontend/src/pages/APICompliance.jsx new file mode 100644 index 0000000..b221c99 --- /dev/null +++ b/frontend/src/pages/APICompliance.jsx @@ -0,0 +1,26 @@ +import { useEffect } from 'react'; +import { useSearchParams } from 'react-router-dom'; +import { useApiExplorerStore } from '../stores/apiExplorerStore.js'; +import { APINav } from '../components/APINav.jsx'; +import { CompliancePage } from './CompliancePage.jsx'; + +const APICompliance = () => { + const [searchParams] = useSearchParams(); + const { apiUrl, probeApi } = useApiExplorerStore(); + const urlParam = searchParams.get('url'); + + useEffect(() => { + if (urlParam && !apiUrl) { + probeApi(urlParam).catch(() => {}); + } + }, [urlParam]); // eslint-disable-line react-hooks/exhaustive-deps + + return ( +
    + + +
    + ); +}; + +export { APICompliance }; diff --git a/frontend/src/pages/APIExplorer.jsx b/frontend/src/pages/APIExplorer.jsx new file mode 100644 index 0000000..ec0aada --- /dev/null +++ b/frontend/src/pages/APIExplorer.jsx @@ -0,0 +1,115 @@ +import { useState } from 'react'; +import { useNavigate } from 'react-router-dom'; +import { useApiExplorerStore } from '../stores/apiExplorerStore.js'; + +const RECENT_APIS_KEY = 'refget-explorer-recent-apis'; +const MAX_RECENT = 5; + +const getRecentApis = () => { + try { + return JSON.parse(localStorage.getItem(RECENT_APIS_KEY)) || []; + } catch { + return []; + } +}; + +const APIExplorer = () => { + const navigate = useNavigate(); + const { probeApi, loading, error } = useApiExplorerStore(); + const [url, setUrl] = useState(''); + const [localError, setLocalError] = useState(null); + const recentApis = getRecentApis(); + + const handleExplore = async (targetUrl) => { + const trimmed = (targetUrl || url).trim(); + if (!trimmed) return; + setLocalError(null); + try { + await probeApi(trimmed); + navigate(`/explore-api/collections?url=${encodeURIComponent(trimmed)}`); + } catch (err) { + setLocalError(err.message); + } + }; + + const handleSubmit = (e) => { + e.preventDefault(); + handleExplore(); + }; + + return ( +
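// getRecentApis above reads the most-recently-used list; the writing side is
// not shown in this hunk. A plausible counterpart that keeps the newest entry
// first and caps the list at MAX_RECENT (the helper name is hypothetical):
const saveRecentApi = (url) => {
  try {
    const next = [url, ...getRecentApis().filter((u) => u !== url)].slice(0, MAX_RECENT);
    localStorage.setItem(RECENT_APIS_KEY, JSON.stringify(next));
  } catch {
    // Ignore quota or disabled-storage errors, mirroring getRecentApis.
  }
};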
    +

    + + API Explorer +

    +

+ Browse any SeqCol API server. Enter the base URL to explore its collections, + run comparisons, and test compliance. +

    + +
    +
    + setUrl(e.target.value)} + required + /> + +
    + + + {(localError || error) && ( +
    + Failed to connect: {localError || error} +

    + Make sure the URL points to a SeqCol API server with a{' '} + /service-info endpoint. The server must allow CORS. +

    +
    + )} + + {recentApis.length > 0 && ( +
    +
    Recent APIs
    +
    + {recentApis.map((recentUrl) => ( + + ))} +
    +
    + )} +
    + ); +}; + +export { APIExplorer }; diff --git a/frontend/src/pages/AttributeView.jsx b/frontend/src/pages/AttributeView.jsx index cabeec3..33e3a23 100644 --- a/frontend/src/pages/AttributeView.jsx +++ b/frontend/src/pages/AttributeView.jsx @@ -9,6 +9,11 @@ import { CollectionList } from '../components/ObjectLists.jsx'; const AttributeView = () => { const content = useLoaderData(); const { attribute, digest } = useParams(); + + if (!Array.isArray(content) || content.length < 2) { + return
    Failed to load attribute data.
    ; + } + const api_url = `${API_BASE}/attribute/collection/${attribute}/${digest}`; const api_url_list = `${API_BASE}/list/collection?${attribute}=${digest}`; let results = content[0]; diff --git a/frontend/src/pages/CollectionView.jsx b/frontend/src/pages/CollectionView.jsx index 7469ab1..86cd378 100644 --- a/frontend/src/pages/CollectionView.jsx +++ b/frontend/src/pages/CollectionView.jsx @@ -1,45 +1,22 @@ import { Link, useLoaderData, useParams } from 'react-router-dom'; -import { useState } from 'react'; import { API_BASE } from '../utilities.jsx'; import { AttributeValue, LinkedAttributeDigest, } from '../components/ValuesAndDigests.jsx'; -const CollectionView = (params) => { +const CollectionView = () => { const collection = useLoaderData(); - const [collectionRepresentation, setCollectionRepresentation] = - useState(null); const { digest } = useParams(); + if (!Array.isArray(collection) || collection.length < 3) { + return
    Failed to load collection data.
    ; + } + let level1 = collection[0]; let level2 = collection[1]; let uncollated = collection[2]; - // const col_str = (2 == 1 ? "asdf" :
    {JSON.stringify(collectionRepresentation, null, 2)}
    ) - const showLevel = (level, collated = true) => { - fetchSeqColDetails(digest, level, collated).then((data) => { - if (level == 1) { - data = Level1Collection(data); - } else if (level == 2) { - data = Level2Collection(data); - } - setCollectionRepresentation(data); - }); - - const showUncollated = () => { - fetchSeqColDetails(digest, 'uncollated').then((data) => { - setCollectionRepresentation(data); - }); - }; - }; - - const urls = { - level1: `/collection/${digest}?level=1`, - level2: `/collection/${digest}?level=2`, - uncollated: `/collection/${digest}?collated=false`, - }; - let attribute_list_views = []; for (let attribute in level2) { attribute_list_views.push( @@ -101,11 +78,11 @@ const CollectionView = (params) => { aria-expanded='true' aria-controls='collapseLevel1' > -
    Level 1: {urls['level1']}
    +
    Level 1: {`/collection/${digest}?level=1`}
    { aria-expanded='false' aria-controls='collapseLevel2' > -
    Level 2: {urls['level2']}
    +
    Level 2: {`/collection/${digest}?level=2`}
    { aria-expanded='false' aria-controls='collapseUncollated' > -
    Uncollated: {urls['uncollated']}
    +
    Uncollated: {`/collection/${digest}?collated=false`}
    { - const [showTooltip, setShowTooltip] = useState(false); - - return ( -
    -
    -
    - - Coordinate System - -
    - setShowTooltip(true)} - onMouseLeave={() => setShowTooltip(false)} - > - - - {showTooltip && ( -
    - This assessment reports on the compatibility of the names and - lengths of the sequences, without regard to sequence content. -
    - )} -
    -
    -
    - -
    -
      - {messageArray.map((msg, index) => ( -
    • {msg}
    • - ))} -
    -
    -
    - ); -}; - -const SequencesReport = ({ messageArray }) => { - const [showTooltip, setShowTooltip] = useState(false); - - return ( -
    -
    -
    - Sequences -
    - setShowTooltip(true)} - onMouseLeave={() => setShowTooltip(false)} - > - - - {showTooltip && ( -
    - This assessment reports on the sequences only, without regard to - their names. -
    - )} -
    -
    -
    - -
    -
      - {messageArray.map((msg, index) => ( -
    • {msg}
    • - ))} -
    -
    -
    - ); -}; +import { API_BASE, encodeToBase64 } from '../utilities.jsx'; // Component to display the comparison between two collections // ✅❔❌❔ const coordinateSystemInterpretation = (comparison) => { + if (!comparison?.array_elements?.a_count || !comparison?.array_elements?.b_count || !comparison?.array_elements?.a_and_b_count) { + return ['Unable to interpret: incomplete comparison data']; + } + const lengthsANotB = - comparison.array_elements.a.lengths - - comparison.array_elements.a_and_b.lengths; + comparison.array_elements.a_count.lengths - + comparison.array_elements.a_and_b_count.lengths; const lengthsBNotA = - comparison.array_elements.b.lengths - - comparison.array_elements.a_and_b.lengths; + comparison.array_elements.b_count.lengths - + comparison.array_elements.a_and_b_count.lengths; const namesANotB = - comparison.array_elements.a.names - comparison.array_elements.a_and_b.names; + comparison.array_elements.a_count.names - comparison.array_elements.a_and_b_count.names; const namesBNotA = - comparison.array_elements.b.names - comparison.array_elements.a_and_b.names; + comparison.array_elements.b_count.names - comparison.array_elements.a_and_b_count.names; const nlpANotB = - comparison.array_elements.a.name_length_pairs - - comparison.array_elements.a_and_b.name_length_pairs; + comparison.array_elements.a_count.name_length_pairs - + comparison.array_elements.a_and_b_count.name_length_pairs; const nlpBNotA = - comparison.array_elements.b.name_length_pairs - - comparison.array_elements.a_and_b.name_length_pairs; + comparison.array_elements.b_count.name_length_pairs - + comparison.array_elements.a_and_b_count.name_length_pairs; const msgArray = []; // If the name_length_pairs match, then the coordinate systems are identical if (nlpANotB === 0 && nlpBNotA === 0) { @@ -162,7 +40,7 @@ const coordinateSystemInterpretation = (comparison) => { } else if (nlpANotB > 0 && nlpBNotA === 0) { // If B nlp is a subset of A msgArray.push("Collection B's coordinate system is a subset of A's."); - } else if (comparison.array_elements.a_and_b.name_length_pairs !== 0) { + } else if (comparison.array_elements.a_and_b_count.name_length_pairs !== 0) { // If there is some overlap msgArray.push('The coordinate systems are partially overlapping.'); } else { @@ -190,7 +68,7 @@ const coordinateSystemInterpretation = (comparison) => { const LinkToLocalComparison = ({ comparison }) => { const [copied, setCopied] = useState(false); - const base64encodedComparison = btoa(JSON.stringify(comparison)); + const base64encodedComparison = encodeToBase64(JSON.stringify(comparison)); return ( + ) : ( + + )} +
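// Why the btoa swap above matters: btoa() throws on code points outside
// Latin-1, so encoding the comparison JSON needs a UTF-8-safe step. The body
// of encodeToBase64 (imported from utilities) is outside this hunk; a common
// pattern it may follow is sketched here, purely as an assumption:
const encodeToBase64Sketch = (str) =>
  // Encode to UTF-8 bytes first, then base64 the resulting binary string.
  // The spread is fine for small payloads like a comparison object.
  btoa(String.fromCharCode(...new TextEncoder().encode(str)));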
    + + + + + {error && ( +
    + Error: {error} +
    + )} + + {(results.length > 0 || loading) && ( +
    +
    +
    +
    +
    +
    {total}
    +
    Total
    +
    +
    +
    {passed}
    +
    Passed
    +
    +
    +
    {failed}
    +
    Failed
    +
    +
    +
    + {serverUrl} +
    + {summary && ( +
    + {new Date().toLocaleString()} +
    + )} +
    +
    +
    +
    +
    0 ? (passed / total) * 100 : 0}%`, + transition: 'width 0.3s ease', + }} + /> +
    0 ? (failed / total) * 100 : 0}%`, + transition: 'width 0.3s ease', + }} + /> +
    +
    +
    +
    + +
    + {results.map((result, idx) => ( +
    +
    +
    + + {result.passed ? 'PASS' : 'FAIL'} + + {result.name} +
    + {result.description && ( +
    + {result.description} +
    + )} + {result.error && ( +
    + {result.error} +
    + )} +
    + + {result.duration_ms.toFixed(0)}ms + +
    + ))} + {loading && completed < total && ( +
    + + Running check {completed + 1} of {total}... +
    + )} +
    +
    + )} +
    + ); +}; diff --git a/frontend/src/features/digest/DigestPage.jsx b/frontend/src/pages/DigestPage.jsx similarity index 76% rename from frontend/src/features/digest/DigestPage.jsx rename to frontend/src/pages/DigestPage.jsx index 4eb2369..001a578 100644 --- a/frontend/src/features/digest/DigestPage.jsx +++ b/frontend/src/pages/DigestPage.jsx @@ -1,9 +1,9 @@ -import { useState, useRef, useEffect } from 'react'; +import { useState, useRef, useEffect, useCallback } from 'react'; import { useSearchParams, useNavigate } from 'react-router-dom'; import toast from 'react-hot-toast'; -import FastaDropzone from './FastaDropzone'; -import SeqColResult from './SeqColResult'; -import './digest.css'; +import FastaDropzone from '../components/digest/FastaDropzone'; +import SeqColResult from '../components/digest/SeqColResult'; +import '../components/digest/digest.css'; const HISTORY_KEY = 'digest-history'; const MAX_HISTORY = 20; @@ -54,7 +54,14 @@ function loadFromHistory(digest) { } } -export default function DigestPage() { +function createWorker() { + return new Worker( + new URL('../components/digest/fastaDigestWorker.js', import.meta.url), + { type: 'module' } + ); +} + +export function DigestPage() { const [searchParams] = useSearchParams(); const navigate = useNavigate(); const [result, setResult] = useState(null); @@ -63,6 +70,7 @@ export default function DigestPage() { const [progress, setProgress] = useState(null); const [error, setError] = useState(null); const [history, setHistory] = useState([]); + const [stats, setStats] = useState(null); const workerRef = useRef(null); // Load history on mount @@ -85,15 +93,16 @@ export default function DigestPage() { } }, [searchParams]); - // Initialize worker - useEffect(() => { - workerRef.current = new Worker( - new URL('./fastaDigestWorker.js', import.meta.url), - { type: 'module' } - ); + const setupWorker = useCallback(() => { + // Terminate existing worker if any + if (workerRef.current) { + workerRef.current.terminate(); + } + + const worker = createWorker(); - workerRef.current.onmessage = (e) => { - const { type, result, message, bytesProcessed, totalSize, percent } = e.data; + worker.onmessage = (e) => { + const { type, result, message, bytesProcessed, totalSize, percent, stats: workerStats } = e.data; if (type === 'status') { setStatus(message); @@ -103,8 +112,19 @@ export default function DigestPage() { setResult(result); setStatus(null); setProgress(null); + if (workerStats) { + setStats(workerStats); + if (import.meta.env.DEV) { + console.log('[FASTA Digest]', { + chunks: workerStats.chunks, + avgChunkSize: `${(workerStats.avgChunkSize / 1024).toFixed(1)} KB`, + elapsed: `${(workerStats.elapsedMs / 1000).toFixed(1)}s`, + throughput: `${(workerStats.totalBytes / workerStats.elapsedMs / 1024).toFixed(1)} MB/s` + }); + } + } // Save to localStorage - const name = workerRef.current._fileName; + const name = worker._fileName; saveToHistory(result, name); setHistory(getHistory()); // Update URL @@ -115,20 +135,58 @@ export default function DigestPage() { setStatus(null); setProgress(null); toast.error(message); + } else if (type === 'cancelled') { + setStatus(null); + setProgress(null); + setError('Processing cancelled.'); } }; - return () => workerRef.current?.terminate(); + worker.onerror = (event) => { + event.preventDefault(); + setError(event.message || 'Worker crashed unexpectedly'); + setStatus(null); + setProgress(null); + }; + + workerRef.current = worker; + return worker; }, []); + // Initialize worker on mount + useEffect(() => { 
+ setupWorker(); + return () => workerRef.current?.terminate(); + }, [setupWorker]); + const handleFileSelected = (file) => { + // Cancel and replace any running worker to prevent double-processing + const worker = setupWorker(); setFileName(file.name); setResult(null); setError(null); setProgress(null); + setStats(null); setStatus('Starting...'); - workerRef.current._fileName = file.name; - workerRef.current.postMessage({ file }); + worker._fileName = file.name; + worker.postMessage({ file }); + }; + + const handleCancel = () => { + if (workerRef.current) { + workerRef.current.terminate(); + workerRef.current = null; + } + setStatus(null); + setProgress(null); + setError('Processing cancelled.'); + }; + + const handleClear = () => { + setError(null); + setStatus(null); + setProgress(null); + setStats(null); }; const handleHistoryClick = (digest) => { @@ -304,6 +362,12 @@ export default function DigestPage() {
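// For reference, the worker protocol consumed above: the page posts { file }
// (or { type: 'cancel' }) and receives status / progress / result / error /
// cancelled messages, all defined in fastaDigestWorker.js earlier in this
// patch. A minimal standalone consumer, mirroring setupWorker:
const demoWorker = new Worker(
  new URL('../components/digest/fastaDigestWorker.js', import.meta.url),
  { type: 'module' },
);
demoWorker.onmessage = (e) => {
  const { type } = e.data;
  if (type === 'progress') console.log(`${e.data.percent}%`);
  if (type === 'result') console.log(e.data.result, e.data.stats);
  if (type === 'error') console.error(e.data.category, e.data.message);
};
// demoWorker.postMessage({ file });            // a File from the dropzone
// demoWorker.postMessage({ type: 'cancel' });  // cooperative cancellation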
    {status} +
    {progress && ( @@ -328,11 +392,19 @@ export default function DigestPage() {
    )} - {/* Error */} + {/* Error or Cancelled */} {error && ( -
    - - {error} +
    +
    + + {error} +
    +
    )} @@ -346,6 +418,21 @@ export default function DigestPage() { onDownloadRgsi={handleDownloadRgsi} /> + {/* Processing Stats (collapsed) */} + {stats && result && ( +
    + + Processing details + +
    +
    Chunks processed: {stats.chunks.toLocaleString()}
    +
    Average chunk size: {(stats.avgChunkSize / 1024).toFixed(1)} KB
    +
    Elapsed time: {(stats.elapsedMs / 1000).toFixed(1)}s
    +
    Throughput: {(stats.totalBytes / stats.elapsedMs / 1024).toFixed(1)} MB/s
    +
    +
    + )} + {/* History */} {history.length > 0 && (
    diff --git a/frontend/src/pages/Explorer.jsx b/frontend/src/pages/Explorer.jsx new file mode 100644 index 0000000..f483795 --- /dev/null +++ b/frontend/src/pages/Explorer.jsx @@ -0,0 +1,228 @@ +import { useState, useEffect, useMemo } from 'react'; +import { Link } from 'react-router-dom'; +import { useUnifiedStore } from '../stores/unifiedStore.js'; +import { useExplorerStore } from '../stores/explorerStore.js'; +import { ExplorerNav } from '../components/ExplorerNav.jsx'; +import { fetchSeqColList } from '../services/fetchData.jsx'; + +const Explorer = () => { + const { hasStore, hasAPI, storeUrl, apiUrl, storeCollections, probe, probed, loading: probing } = + useUnifiedStore(); + const { loadStore, metadata, loadAliases } = useExplorerStore(); + const [apiCollections, setApiCollections] = useState(null); + const [aliasMap, setAliasMap] = useState({}); + const [filter, setFilter] = useState(''); + const [sortCol, setSortCol] = useState(null); + const [sortAsc, setSortAsc] = useState(true); + const [loading, setLoading] = useState(true); + + useEffect(() => { + const init = async () => { + await probe(); + }; + init(); + }, []); // eslint-disable-line react-hooks/exhaustive-deps + + useEffect(() => { + if (!probed) return; + + const load = async () => { + // Load store data if available + if (hasStore && storeUrl) { + try { + await loadStore(storeUrl); + } catch {} + // Try to load collection aliases + try { + const storeData = useExplorerStore.getState(); + const namespaces = storeData.metadata?.collection_alias_namespaces || []; + const map = {}; + for (const ns of namespaces) { + const aliases = await loadAliases('collections', ns).catch(() => null); + if (aliases) { + aliases.forEach((a) => { + if (!map[a.digest]) map[a.digest] = []; + map[a.digest].push(a.alias); + }); + } + } + setAliasMap(map); + } catch {} + } + + // Load API collection list if available + if (hasAPI) { + try { + const result = await fetchSeqColList(apiUrl); + setApiCollections(result[0]); + } catch {} + } + + setLoading(false); + }; + load(); + }, [probed, hasStore, hasAPI]); // eslint-disable-line react-hooks/exhaustive-deps + + // Merge store collections with API collection list + // NOTE: useMemo hooks must be called before any early returns to avoid + // "Rendered more hooks than during the previous render" errors + const collections = useMemo(() => { + const byDigest = new Map(); + + // Store collections have richer data (n_sequences, attribute digests) + if (storeCollections) { + storeCollections.forEach((col) => { + byDigest.set(col.digest, { + digest: col.digest, + n_sequences: col.n_sequences, + names: aliasMap[col.digest] || [], + source: 'store', + }); + }); + } + + // API collections add any that store doesn't have + if (apiCollections?.results) { + apiCollections.results.forEach((digest) => { + if (!byDigest.has(digest)) { + byDigest.set(digest, { + digest, + n_sequences: null, + names: aliasMap[digest] || [], + source: 'api', + }); + } + }); + } + + return Array.from(byDigest.values()); + }, [storeCollections, apiCollections, aliasMap]); + + const filtered = useMemo(() => { + if (!filter) return collections; + const term = filter.toLowerCase(); + return collections.filter( + (c) => + c.digest.toLowerCase().includes(term) || + c.names.some((n) => n.toLowerCase().includes(term)), + ); + }, [collections, filter]); + + const sorted = useMemo(() => { + if (!sortCol) return filtered; + return [...filtered].sort((a, b) => { + let va, vb; + if (sortCol === 'name') { + va = (a.names[0] || 
'').toLowerCase(); + vb = (b.names[0] || '').toLowerCase(); + } else if (sortCol === 'n_sequences') { + va = a.n_sequences ?? -1; + vb = b.n_sequences ?? -1; + return sortAsc ? va - vb : vb - va; + } else { + va = (a[sortCol] || '').toLowerCase(); + vb = (b[sortCol] || '').toLowerCase(); + } + return sortAsc ? va.localeCompare(vb) : vb.localeCompare(va); + }); + }, [filtered, sortCol, sortAsc]); + + const handleSort = (col) => { + if (sortCol === col) setSortAsc(!sortAsc); + else { setSortCol(col); setSortAsc(true); } + }; + + if (probing || loading) { + return ( +
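// The NOTE above is load-bearing: React requires the same hooks in the same
// order on every render, so all useMemo calls must sit before the early
// loading return. A reduced illustration of the rule (names illustrative):
import { useMemo } from 'react';

const BadList = ({ loading, items }) => {
  if (loading) return null; // early return first...
  const sorted = useMemo(() => [...items].sort(), [items]); // ...so the hook count changes between renders
  return <span>{sorted.length}</span>;
};

const GoodList = ({ loading, items }) => {
  const sorted = useMemo(() => [...items].sort(), [items]); // hooks first, unconditionally
  if (loading) return null; // early return only after all hooks ran
  return <span>{sorted.length}</span>;
};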
    +
    +

    Loading collections...

    +
    + ); + } + + const totalFromApi = apiCollections?.pagination?.total; + + return ( +
    + + +
    + + {filtered.length} collection{filtered.length !== 1 ? 's' : ''} + {totalFromApi != null && ` (${totalFromApi} total on server)`} + {filter && ` matching "${filter}"`} + + setFilter(e.target.value)} + /> +
    + + {!hasStore && !hasAPI && ( +
+ Neither a RefgetStore nor an API was detected on this server. + Try the Store Explorer or{' '} + API Explorer to connect to a specific URL. +
    + )} + + {sorted.length > 0 ? ( +
    + + + + + + {hasStore && ( + + )} + + + + {sorted.map((col) => ( + + + + {hasStore && ( + + )} + + ))} + +
    handleSort('name')}> + Name + {sortCol === 'name' && } + handleSort('digest')}> + Digest + {sortCol === 'digest' && } + handleSort('n_sequences')}> + Sequences + {sortCol === 'n_sequences' && } +
    + {col.names.length > 0 + ? [...new Set(col.names)].join(', ') + : -} + + + {col.digest} + + + {col.n_sequences != null ? col.n_sequences : '-'} +
    +
    + ) : ( +

    No collections found.

    + )} +
    + ); +}; + +export { Explorer }; diff --git a/frontend/src/pages/ExplorerAliases.jsx b/frontend/src/pages/ExplorerAliases.jsx new file mode 100644 index 0000000..cb51543 --- /dev/null +++ b/frontend/src/pages/ExplorerAliases.jsx @@ -0,0 +1,172 @@ +import { useEffect } from 'react'; +import { Link } from 'react-router-dom'; +import { useUnifiedStore } from '../stores/unifiedStore.js'; +import { useExplorerStore } from '../stores/explorerStore.js'; +import { ExplorerNav } from '../components/ExplorerNav.jsx'; +import { useState } from 'react'; + +const AliasPanel = ({ type, availableNamespaces }) => { + const { loadAliases } = useExplorerStore(); + const [namespace, setNamespace] = useState(''); + const [aliases, setAliases] = useState(null); + const [loading, setLoading] = useState(false); + const [error, setError] = useState(null); + const [filter, setFilter] = useState(''); + + const handleNamespaceClick = (ns) => { + setNamespace(ns); + setFilter(''); + setError(null); + setLoading(true); + loadAliases(type, ns) + .then((data) => { + if (!data) { + setError(`Namespace "${ns}" not found.`); + setAliases(null); + } else { + setAliases(data); + } + }) + .catch((err) => setError(err.message)) + .finally(() => setLoading(false)); + }; + + const filtered = aliases + ? aliases.filter( + (a) => + !filter || + a.alias.toLowerCase().includes(filter.toLowerCase()) || + a.digest.toLowerCase().includes(filter.toLowerCase()), + ) + : null; + + const linkPrefix = type === 'collections' ? '/collection/' : null; + + return ( +
    +
    +
    + + {type} aliases +
    +
    +
    + {availableNamespaces && availableNamespaces.length > 0 ? ( +
    + Namespaces: + {availableNamespaces.map((ns) => ( + + ))} + {loading && } +
    + ) : ( +

    + No {type} alias namespaces found. +

    + )} + + {error &&
    {error}
    } + + {filtered && ( + <> +
    + {filtered.length} aliases in "{namespace}" + setFilter(e.target.value)} + /> +
    +
    + + + + + + + + + {filtered.map((a, i) => ( + + + + + ))} + +
    AliasDigest
    {a.alias} + {linkPrefix ? ( + {a.digest} + ) : ( + a.digest + )} +
    +
    + + )} +
    +
    + ); +}; + +const ExplorerAliases = () => { + const { hasStore, storeUrl, probe, probed } = useUnifiedStore(); + const { metadata, loading, loadStore } = useExplorerStore(); + + useEffect(() => { + const init = async () => { + if (!probed) await probe(); + }; + init(); + }, []); // eslint-disable-line react-hooks/exhaustive-deps + + useEffect(() => { + if (probed && hasStore && storeUrl && !metadata && !loading) { + loadStore(storeUrl).catch(() => {}); + } + }, [probed, hasStore, storeUrl]); // eslint-disable-line react-hooks/exhaustive-deps + + if (!probed || loading) { + return ( +
    + +
    +
    +
    +
    + ); + } + + if (!hasStore) { + return ( +
    + +
    + Alias browsing requires a RefgetStore. No store was detected. +
    +
    + ); + } + + return ( +
    + +

    + Aliases map human-readable names to digests. Select a namespace to browse. +

    + + +
    + ); +}; + +export { ExplorerAliases }; diff --git a/frontend/src/pages/ExplorerCollection.jsx b/frontend/src/pages/ExplorerCollection.jsx new file mode 100644 index 0000000..f46c1d5 --- /dev/null +++ b/frontend/src/pages/ExplorerCollection.jsx @@ -0,0 +1,334 @@ +import { useState, useEffect } from 'react'; +import { Link, useParams } from 'react-router-dom'; +import { useUnifiedStore } from '../stores/unifiedStore.js'; +import { useExplorerStore } from '../stores/explorerStore.js'; +import { ExplorerNav } from '../components/ExplorerNav.jsx'; +import { CliCommand } from '../components/CliSnippet.jsx'; +import { SequenceTable } from '../components/SequenceTable.jsx'; +import { fetchCollectionLevels, fetchAttribute } from '../services/fetchData.jsx'; +import { CopyableDigest } from '../components/CopyableDigest.jsx'; +import { + AttributeValue, + LinkedAttributeDigest, +} from '../components/ValuesAndDigests.jsx'; + +const ExplorerCollection = () => { + const { digest } = useParams(); + const { hasStore, hasAPI, storeUrl, apiUrl } = useUnifiedStore(); + const { loadCollection, loadFhrMetadata, loadStore, metadata } = useExplorerStore(); + + const [storeData, setStoreData] = useState(null); + const [apiData, setApiData] = useState(null); + const [fhr, setFhr] = useState(undefined); + const [relatedCollections, setRelatedCollections] = useState(null); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + const [showRaw, setShowRaw] = useState(false); + const [codeTab, setCodeTab] = useState('cli'); + + useEffect(() => { + const load = async () => { + setLoading(true); + setError(null); + try { + // Load store data + if (hasStore && storeUrl) { + if (!metadata) { + await loadStore(storeUrl).catch(() => {}); + } + const col = await loadCollection(digest).catch(() => null); + setStoreData(col); + const fhrData = await loadFhrMetadata(digest).catch(() => null); + setFhr(fhrData); + } + + // Load API data + if (hasAPI && apiUrl) { + const levels = await fetchCollectionLevels(digest, apiUrl).catch(() => null); + setApiData(levels); + + // Fetch related collections via sorted_name_length_pairs + if (levels && levels[0]?.sorted_name_length_pairs) { + const snlp = levels[0].sorted_name_length_pairs; + try { + const related = await fetchAttribute('sorted_name_length_pairs', snlp, apiUrl); + setRelatedCollections(related[0]?.results?.filter((d) => d !== digest) || []); + } catch {} + } + } + } catch (err) { + setError(err.message); + } finally { + setLoading(false); + } + }; + load(); + }, [digest]); // eslint-disable-line react-hooks/exhaustive-deps + + if (loading) { + return ( +
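// The summary stats above are derived per render from the store payload.
// A worked example of the two accumulations; sequence values and the
// 'protein' alphabet name are illustrative:
const exampleSeqs = [
  { length: 100, alphabet: 'dna' },
  { length: 250, alphabet: 'dna' },
  { length: 40, alphabet: 'protein' },
];
const exampleTotalBases = exampleSeqs.reduce((sum, s) => sum + s.length, 0); // 390
const exampleAlphabetCounts = {};
exampleSeqs.forEach((s) => {
  exampleAlphabetCounts[s.alphabet] = (exampleAlphabetCounts[s.alphabet] || 0) + 1;
});
// => { dna: 2, protein: 1 }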
    + +
    +
    +

    Loading collection...

    +
    +
    + ); + } + + if (error) { + return ( +
    + +
    {error}
    +
    + ); + } + + if (!storeData && !apiData) { + return ( +
    + +
    + Collection {digest} not found. +
    +
    + ); + } + + const sequences = storeData?.sequences || []; + const totalBases = sequences.reduce((sum, s) => sum + s.length, 0); + const alphabetCounts = {}; + sequences.forEach((s) => { + alphabetCounts[s.alphabet] = (alphabetCounts[s.alphabet] || 0) + 1; + }); + + const level1 = apiData?.[0]; + const level2 = apiData?.[1]; + const uncollated = apiData?.[2]; + + return ( +
    + + + + + {/* Summary stats (from store) */} + {storeData && ( +
    +
    +
    +
    + Sequences + {sequences.length.toLocaleString()} +
    +
    +
    +
    +
    +
    + Total bases + {totalBases.toLocaleString()} +
    +
    +
    + {Object.keys(alphabetCounts).length > 0 && ( +
    +
    +
    + Alphabets + {Object.entries(alphabetCounts).map(([alph, count]) => ( + + {alph}: {count} + + ))} +
    +
    +
    + )} +
    + )} + + {/* Related collections (from API) */} + {relatedCollections && relatedCollections.length > 0 && ( +
    +
    +
    + + Related collections (same coordinate system) +
    +
    +
    +

    + Collections sharing the same sorted_name_length_pairs digest: +

    +
      + {relatedCollections.slice(0, 10).map((d) => ( +
    • + + {d} + +
    • + ))} + {relatedCollections.length > 10 && ( +
    • + ...and {relatedCollections.length - 10} more +
    • + )} +
    +
    +
    + )} + + {/* Compare button */} + {hasAPI && ( +
    + + + Compare this collection + +
    + )} + + {/* FHR metadata */} + {fhr && ( +
    +
    +
    + + FHR Metadata +
    +
    +
    +
    +              {JSON.stringify(fhr, null, 2)}
    +            
    +
    +
    + )} + + {/* Sequence table (from store) */} + {sequences.length > 0 && ( +
    +
    +
    Sequences ({sequences.length.toLocaleString()})
    +
    +
    + +
    +
    + )} + + {/* Attribute digests (from API) */} + {level2 && ( +
    +
    +
    Attribute Digests
    +
    +
    + {Object.keys(level2).map((attribute) => ( +
    +
    {attribute}
    +
    +
    Digest:
    +
    + +
    +
    +
    + ))} +
    +
    + )} + + {/* Collapsible Technical Details */} + {apiData && ( +
    + )} + +
+  );
+};
+
+export { ExplorerCollection };
diff --git a/frontend/src/pages/ExplorerSequences.jsx b/frontend/src/pages/ExplorerSequences.jsx
new file mode 100644
index 0000000..2d4159c
--- /dev/null
+++ b/frontend/src/pages/ExplorerSequences.jsx
@@ -0,0 +1,152 @@
+import { useState, useMemo, useEffect } from 'react';
+import { useUnifiedStore } from '../stores/unifiedStore.js';
+import { useExplorerStore } from '../stores/explorerStore.js';
+import { ExplorerNav } from '../components/ExplorerNav.jsx';
+import { SequenceTable } from '../components/SequenceTable.jsx';
+
+const PARTIAL_LOAD_SIZE = 2 * 1024 * 1024;
+
+const formatBytes = (bytes) => {
+  if (bytes < 1024) return `${bytes} B`;
+  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`;
+  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
+};
+
+const ExplorerSequences = () => {
+  const { hasStore, storeUrl, probe, probed } = useUnifiedStore();
+  const {
+    metadata, sequenceIndex, sequenceIndexPartial, sequenceIndexTotalSize,
+    loading, loadStore, loadSequenceIndex,
+  } = useExplorerStore();
+  const [filter, setFilter] = useState('');
+  const [seqLoading, setSeqLoading] = useState(false);
+  const [seqError, setSeqError] = useState(null);
+
+  useEffect(() => {
+    const init = async () => {
+      if (!probed) await probe();
+    };
+    init();
+  }, []); // eslint-disable-line react-hooks/exhaustive-deps
+
+  useEffect(() => {
+    if (probed && hasStore && storeUrl && !metadata && !loading) {
+      loadStore(storeUrl).catch(() => {});
+    }
+  }, [probed, hasStore, storeUrl]); // eslint-disable-line react-hooks/exhaustive-deps
+
+  useEffect(() => {
+    if (metadata && !sequenceIndex && !seqLoading) {
+      setSeqLoading(true);
+      loadSequenceIndex()
+        .catch((err) => setSeqError(err.message))
+        .finally(() => setSeqLoading(false));
+    }
+  }, [metadata]); // eslint-disable-line react-hooks/exhaustive-deps
+
+  if (!probed || loading || seqLoading) {
+    return (
+
    + +
    +
    +

    Loading sequences...

    +
    +
    + ); + } + + if (!hasStore) { + return ( +
    + +
    + Sequence browsing requires a RefgetStore. No store was detected. +
    +
    + ); + } + + if (seqError) { + return ( +
    + +
    {seqError}
    +
    + ); + } + + if (!sequenceIndex) { + return ( +
    + +
    No sequence index found.
    +
+    );
+  }
+
+  const filtered = sequenceIndex.filter((s) => {
+    if (!filter) return true;
+    const term = filter.toLowerCase();
+    return (
+      s.name?.toLowerCase().includes(term) ||
+      s.sha512t24u?.toLowerCase().includes(term) ||
+      s.md5?.toLowerCase().includes(term) ||
+      s.description?.toLowerCase().includes(term)
+    );
+  });
+
+  const handleLoadMore = async (maxBytes) => {
+    setSeqLoading(true);
+    setSeqError(null);
+    try {
+      await loadSequenceIndex(maxBytes ? { maxBytes } : {});
+    } catch (err) {
+      setSeqError(err.message);
+    } finally {
+      setSeqLoading(false);
+    }
+  };
+
+  return (
+
+
+
+      {sequenceIndexPartial && (
+
+
+          Showing first {sequenceIndex.length.toLocaleString()} of
+          ~{Math.round(sequenceIndex.length * sequenceIndexTotalSize / (PARTIAL_LOAD_SIZE) / 1000).toLocaleString()}k sequences
+          (loaded {formatBytes(PARTIAL_LOAD_SIZE)} of {formatBytes(sequenceIndexTotalSize)}).
+
+
+      )}
+
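The banner's sequence-count estimate scales the loaded row count by total file size over loaded bytes. With hypothetical numbers (40,000 rows parsed from the first 2 MiB of a 50 MiB index):

    const loadedRows = 40000;
    const totalSize = 50 * 1024 * 1024;        // from the Content-Length header
    const PARTIAL_LOAD_SIZE = 2 * 1024 * 1024; // bytes actually fetched
    const estimateK = Math.round((loadedRows * totalSize) / PARTIAL_LOAD_SIZE / 1000);
    console.log(`~${estimateK}k sequences`);   // ~1000k sequences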
    + + {filtered.length.toLocaleString()} sequences + {filter && ` (filtered from ${sequenceIndex.length.toLocaleString()})`} + + setFilter(e.target.value)} + /> +
    + + +
+  );
+};
+
+export { ExplorerSequences };
diff --git a/frontend/src/pages/HomePage.jsx b/frontend/src/pages/HomePage.jsx
index 7a957ff..85d53ad 100644
--- a/frontend/src/pages/HomePage.jsx
+++ b/frontend/src/pages/HomePage.jsx
@@ -6,6 +6,11 @@ import { AttributeList } from '../components/ObjectLists';
 
 const HomePage = () => {
   const loaderData = useLoaderData();
+
+  if (!Array.isArray(loaderData) || loaderData.length < 3) {
+    return
    Failed to load homepage data.
;
+  }
+
   const collections = loaderData[0];
   const pangenomes = loaderData[1];
   const name_length_pairs = loaderData[2];
@@ -86,7 +91,7 @@
-
    4. List of name_length_pairs on this server:
    +
    5. List of name_length_pairs on this server:

The{' '}
/list/attributes{' '}
diff --git a/frontend/src/pages/LandingPage.jsx b/frontend/src/pages/LandingPage.jsx
new file mode 100644
index 0000000..7db6903
--- /dev/null
+++ b/frontend/src/pages/LandingPage.jsx
@@ -0,0 +1,231 @@
+import { Link, useOutletContext } from 'react-router-dom';
+import { useUnifiedStore } from '../stores/unifiedStore.js';
+import { CopyableDigest } from '../components/CopyableDigest.jsx';
+import { useEffect } from 'react';
+
+const LandingPage = () => {
+  const { apiAvailable } = useOutletContext();
+  const { hasStore, hasAPI, storeUrl, storeMetadata, storeCollections, serviceInfo, probe, probed } =
+    useUnifiedStore();
+
+  useEffect(() => {
+    if (!probed) probe();
+  }, []); // eslint-disable-line react-hooks/exhaustive-deps
+
+  const nCollections = storeCollections?.length || serviceInfo?.seqcol?.refget_store?.n_collections;
+  const nSequences = serviceInfo?.seqcol?.refget_store?.n_sequences || storeMetadata?.n_sequences;
+  const aliasNamespaces = storeMetadata?.collection_alias_namespaces || serviceInfo?.seqcol?.refget_store?.collection_alias_namespaces || [];
+  const scomEnabled = serviceInfo?.seqcol?.scom?.enabled;
+
+  return (
+
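One caveat worth noting about the fallback chains above: `||` treats a legitimate count of 0 as missing and falls through to the next source, so a store with zero collections would show the API's figure instead. If that ever matters, nullish coalescing preserves the zero. This is a sketch of the alternative, not what the diff does:

    const nCollections =
      storeCollections?.length ?? serviceInfo?.seqcol?.refget_store?.n_collections;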

    +

    Refget Sequence Collections

    + +

    + Welcome to the Refget Sequence Collections service. Browse, compare, and + explore reference genome sequence collections following the{' '} + + GA4GH refget specification + . +

    + +
    + {/* Browse section */} +
    +
    +
    +
    + + Browse +
    +
    +
    +

    + Explore sequence collections on this server. Each collection represents a reference genome assembly with its sequences, names, and lengths. +

    + {storeUrl && ( +
    + Store: + + + +
    + )} +
      +
    • + + + Collections + {nCollections && (n = {Number(nCollections).toLocaleString()})} + +
    • +
    • + + + Sequences + {nSequences && (n = {Number(nSequences).toLocaleString()})} + +
    • +
    • + + + Aliases + {aliasNamespaces.length > 0 && ( + + ({aliasNamespaces.length} namespace{aliasNamespaces.length !== 1 ? 's' : ''}: {aliasNamespaces.join(', ')}) + + )} + +
    • +
    +
    +
    +
    + + {/* Tools section */} +
    +
    +
    +
    + + Tools +
    +
    +
    +

    + Standalone tools for working with sequence collections. Compute digests, compare assemblies, or connect to external servers. +

    +
      +
    • + + + FASTA Digester + + + Compute digests from FASTA files in-browser + +
    • +
    • + + + Compare (SCIM) + + + Interpret sequence collection comparisons + +
    • +
    • + + + Explore a Store + + + Browse any RefgetStore by URL + +
    • +
    • + + + Explore an API + + + Connect to any SeqCol API server + +
    • +
    +
    +
    +
    +
    + +
    + {/* Curated section */} +
    +
    +
    +
    + + Curated +
    +
    +
    +

    + Pre-built views for specific genome sets. Precomputed similarity matrices and curated reference genome pages. +

    +
      + {scomEnabled && ( +
    • + + + SCOM — Similarity Matrix + +
    • + )} +
    • + + + Human Reference Genomes + +
    • +
    • + + + HPRC Genomes + +
    • +
    +
    +
    +
    +
    +
    +
    +
    + + Developer +
    +
    +
    +

    + Test API compliance and explore the raw API endpoints. For developers building on the seqcol specification. +

    +
      +
    • + + + Compliance Testing + + + Run GA4GH spec compliance checks + +
    • +
    • + + + Demo + + + Collection comparison demo + +
    • +
    +
    +
    +
    +
    + + {!apiAvailable && ( +
    + + The API is currently unavailable. Some features may be limited. +
    + )} +
+  );
+};
+
+export { LandingPage };
diff --git a/frontend/src/pages/PangenomeView.jsx b/frontend/src/pages/PangenomeView.jsx
index 2d55013..146382f 100644
--- a/frontend/src/pages/PangenomeView.jsx
+++ b/frontend/src/pages/PangenomeView.jsx
@@ -11,6 +11,10 @@ const PangenomeView = ({ params }) => {
   const pangenome = useLoaderData();
   const { digest } = useParams();
 
+  if (!Array.isArray(pangenome) || pangenome.length < 3) {
+    return
    Failed to load pangenome data.
;
+  }
+
   let level1 = pangenome[0];
   let level2 = pangenome[1];
   let itemwise = pangenome[2];
diff --git a/frontend/src/pages/SCIM.jsx b/frontend/src/pages/SCIM.jsx
index 7e8a40a..41e9ee1 100644
--- a/frontend/src/pages/SCIM.jsx
+++ b/frontend/src/pages/SCIM.jsx
@@ -2,7 +2,7 @@ import { useEffect, useState } from 'react';
 import { useSearchParams, useLoaderData } from 'react-router-dom';
 import toast from 'react-hot-toast';
 
-import { API_BASE } from '../utilities.jsx';
+import { API_BASE, encodeToBase64, decodeFromBase64 } from '../utilities.jsx';
 import { ComparisonView } from './ComparisonView.jsx';
 
 // Seqcol Comparison Interpretation Module (SCIM)
@@ -30,18 +30,24 @@ const SCIM = () => {
   useEffect(() => {
     const comparisonFromQuery = searchParams.get('val');
     if (comparisonFromQuery) {
-      // decode base64encoded string
-      const decodedComparisonFromQuery = atob(comparisonFromQuery);
-      // prettify the comparison string
-      const prettyComparison = JSON.stringify(
-        JSON.parse(decodedComparisonFromQuery),
-        null,
-        2,
-      );
-      setComparisonStr(prettyComparison);
-
-      const parsedComparison = JSON.parse(decodedComparisonFromQuery);
-      setComparison(parsedComparison);
+      try {
+        // decode base64encoded string
+        const decodedComparisonFromQuery = decodeFromBase64(comparisonFromQuery);
+        // prettify the comparison string
+        const prettyComparison = JSON.stringify(
+          JSON.parse(decodedComparisonFromQuery),
+          null,
+          2,
+        );
+        setComparisonStr(prettyComparison);
+
+        const parsedComparison = JSON.parse(decodedComparisonFromQuery);
+        setComparison(parsedComparison);
+      } catch {
+        toast.error('Invalid comparison URL. The data may be corrupted.');
+        setComparison(null);
+        setComparisonStr('');
+      }
     }
   }, [searchParams]);
 
@@ -79,7 +85,7 @@ const SCIM = () => {
     setComparison(parsedComparison);
 
     // update the query param to base64 encoded string
-    const base64encodedComparison = btoa(comparisonStr);
+    const base64encodedComparison = encodeToBase64(comparisonStr);
     window.history.pushState(
       {},
       '',
@@ -95,9 +101,9 @@ const SCIM = () => {
 
   const loadExample = () => {
     const exampleData =
-      'eyJkaWdlc3RzIjp7ImEiOiJYWmxyY0VHaTZtbG9wWjJ1RDhPYkhrUUIxZDBvRHdLayIsImIiOiJRdlQ1dEFRMEI4Vmt4ZC1xRmZ0bHpFazJReWZQdGdPdiJ9LCJhdHRyaWJ1dGVzIjp7ImFfb25seSI6W10sImJfb25seSI6W10sImFfYW5kX2IiOlsibGVuZ3RocyIsIm5hbWVfbGVuZ3RoX3BhaXJzIiwibmFtZXMiLCJzZXF1ZW5jZXMiLCJzb3J0ZWRfc2VxdWVuY2VzIl19LCJhcnJheV9lbGVtZW50cyI6eyJhIjp7Imxlbmd0aHMiOjMsIm5hbWVfbGVuZ3RoX3BhaXJzIjozLCJuYW1lcyI6Mywic2VxdWVuY2VzIjozLCJzb3J0ZWRfc2VxdWVuY2VzIjozfSwiYiI6eyJsZW5ndGhzIjozLCJuYW1lX2xlbmd0aF9wYWlycyI6MywibmFtZXMiOjMsInNlcXVlbmNlcyI6Mywic29ydGVkX3NlcXVlbmNlcyI6M30sImFfYW5kX2IiOnsibGVuZ3RocyI6MywibmFtZV9sZW5ndGhfcGFpcnMiOjAsIm5hbWVzIjowLCJzZXF1ZW5jZXMiOjMsInNvcnRlZF9zZXF1ZW5jZXMiOjN9LCJhX2FuZF9iX3NhbWVfb3JkZXIiOnsibGVuZ3RocyI6dHJ1ZSwibmFtZV9sZW5ndGhfcGFpcnMiOm51bGwsIm5hbWVzIjpudWxsLCJzZXF1ZW5jZXMiOnRydWUsInNvcnRlZF9zZXF1ZW5jZXMiOnRydWV9fX0=';
+
'eyJkaWdlc3RzIjp7ImEiOiJYWmxyY0VHaTZtbG9wWjJ1RDhPYkhrUUIxZDBvRHdLayIsImIiOiJRdlQ1dEFRMEI4Vmt4ZC1xRmZ0bHpFazJReWZQdGdPdiJ9LCJhdHRyaWJ1dGVzIjp7ImFfb25seSI6W10sImJfb25seSI6W10sImFfYW5kX2IiOlsibGVuZ3RocyIsIm5hbWVfbGVuZ3RoX3BhaXJzIiwibmFtZXMiLCJzZXF1ZW5jZXMiLCJzb3J0ZWRfc2VxdWVuY2VzIl19LCJhcnJheV9lbGVtZW50cyI6eyJhX2NvdW50Ijp7Imxlbmd0aHMiOjMsIm5hbWVfbGVuZ3RoX3BhaXJzIjozLCJuYW1lcyI6Mywic2VxdWVuY2VzIjozLCJzb3J0ZWRfc2VxdWVuY2VzIjozfSwiYl9jb3VudCI6eyJsZW5ndGhzIjozLCJuYW1lX2xlbmd0aF9wYWlycyI6MywibmFtZXMiOjMsInNlcXVlbmNlcyI6Mywic29ydGVkX3NlcXVlbmNlcyI6M30sImFfYW5kX2JfY291bnQiOnsibGVuZ3RocyI6MywibmFtZV9sZW5ndGhfcGFpcnMiOjAsIm5hbWVzIjowLCJzZXF1ZW5jZXMiOjMsInNvcnRlZF9zZXF1ZW5jZXMiOjN9LCJhX2FuZF9iX3NhbWVfb3JkZXIiOnsibGVuZ3RocyI6dHJ1ZSwibmFtZV9sZW5ndGhfcGFpcnMiOm51bGwsIm5hbWVzIjpudWxsLCJzZXF1ZW5jZXMiOnRydWUsInNvcnRlZF9zZXF1ZW5jZXMiOnRydWV9fX0='; - const decodedComparison = atob(exampleData); + const decodedComparison = decodeFromBase64(exampleData); const prettyComparison = JSON.stringify( JSON.parse(decodedComparison), null, diff --git a/frontend/src/pages/SCOM.jsx b/frontend/src/pages/SCOM.jsx index e859f96..ff3dd87 100644 --- a/frontend/src/pages/SCOM.jsx +++ b/frontend/src/pages/SCOM.jsx @@ -1,4 +1,4 @@ -import { useEffect, useState } from 'react'; +import { useCallback, useEffect, useState } from 'react'; import { encodeComparison } from '../utilities.jsx'; import { useLoaderData, useNavigate, useSearchParams } from 'react-router-dom'; import toast from 'react-hot-toast'; @@ -11,7 +11,6 @@ import { } from '../services/fetchData.jsx'; import { MultiMetricHeatmapPlot } from '../components/MultiMetricHeatmapPlot.jsx'; import { StripPlot } from '../components/StripPlot.jsx'; -// import { NetworkGraph } from '../components/NetworkGraph.jsx'; import { useSimilaritiesStore } from '../stores/similarities'; @@ -20,7 +19,7 @@ const SCOM = () => { const navigate = useNavigate(); const [searchParams] = useSearchParams(); const loaderData = useLoaderData(); - const collections = loaderData[0]; + const collections = Array.isArray(loaderData) && loaderData.length >= 1 ? 
loaderData[0] : null; const { selectedCollectionsIndex, @@ -38,19 +37,17 @@ const SCOM = () => { getAllCollections, initializeSelectedCollections, sortBy, - setSortBy, sortAscending, - setSortAscending, - sortSimilarities, + sortByColumn, + resetSort, species, - setSpecies + setSpecies, + error: storeError, + setError: setStoreError, } = useSimilaritiesStore(); const [stripJitter, setStripJitter] = useState('none'); const [stripOrientation, setStripOrientation] = useState('horizontal'); - const [heatmapMetric, setHeatmapMetric] = useState('sequences'); - // const [networkMetric, setNetworkMetric] = useState('sequences'); - // const [networkThreshold, setNetworkThreshold] = useState(0.8); const [relationship, setRelationship] = useState('oneToMany'); const [isLoading, setIsLoading] = useState(false); const [pendingPrefill, setPendingPrefill] = useState(null); @@ -270,15 +267,8 @@ const SCOM = () => { ] } - // const handleSelectCollection = (index) => { - // setSelectedCollectionsIndex((prev) => { - // const newArray = [...prev]; - // newArray[index] = !newArray[index]; - // return newArray; - // }); - // }; - const handleNavigateSCIM = async (similarityRow) => { + setStoreError(null); try { let comparison; if (similarityRow.custom) { @@ -294,8 +284,8 @@ const SCOM = () => { } const encodedComparison = encodeComparison(comparison); navigate(`/scim?val=${encodedComparison}`); - // window.scrollTo(0, 0); } catch (error) { + setStoreError('Comparison could not be made.'); toast.error( Error: Comparison could not be made. @@ -304,29 +294,12 @@ const SCOM = () => { } }; - // const handleRelationshipChange = (newRelationship) => { - // if ( - // newRelationship === 'oneToMany' && - // relationship === 'manyToMany' && - // selectedCollections.length > 1 - // ) { - // setCustomCollections([]); - // setSelectedCollectionsIndex(collections.results.map(() => false)); - // setCustomCount(1); - // } - // setSelectedCollectionsIndex((prev) => - // prev.map((item, index) => - // index < collections.results.length ? false : item, - // ), - // ); - // setStripJitter('none'); - // setRelationship(newRelationship); - // }; - - const handleAddCustomCollection = async (data, name) => { + const handleAddCustomCollection = useCallback(async (data, name) => { + setStoreError(null); try { data = JSON.parse(data); } catch (e) { + setStoreError('Invalid JSON format. Please check your input.'); toast.error( Error: Invalid JSON format. Please check your input. @@ -335,25 +308,16 @@ const SCOM = () => { return; } - // if (relationship === 'manyToMany' && allCollections.includes(name)) { - // toast.error( - // - // Error: Collection with name already exists. Please - // try another name. - // , - // ); - // return; - // } - try { setIsLoading(true); const result = await fetchSimilaritiesJSON(data, species); if (result?.similarities) { - // const customDigest = 'query_seqcol' + (customCount > 1 ? customCount : ''); const customDigest = 'Input Seqcol'; - // console.log(result.similarities) - const flattenedSimilarities = result.similarities.flatMap((s) => - s.human_readable_names.map((humanReadableName) => ({ + const flattenedSimilarities = result.similarities.flatMap((s) => { + const names = s.human_readable_names.length > 0 + ? s.human_readable_names + : [s.digest]; + return names.map((humanReadableName) => ({ selectedDigest: name !== '' ? 
name : customDigest, comparedDigest: s.digest, comparedAlias: humanReadableName || s.digest, @@ -364,8 +328,8 @@ const SCOM = () => { sorted_sequences: s.similarities.sorted_sequences, custom: true, raw: data, - })) - ); + })); + }); if (relationship === 'oneToMany') { setCustomCollections([ @@ -396,6 +360,7 @@ const SCOM = () => { } catch (e) { console.error('SCOM submission error:', e); console.log('Data that was submitted:', data); + setStoreError('Collection is invalid. Please check your input.'); toast.error( Error: Collection is invalid. Please check your @@ -404,10 +369,10 @@ const SCOM = () => { ); return; } finally { - setSortBy(null); + resetSort(); setIsLoading(false); } - }; + }, [species, relationship, collections, customCollections, customCount, setCustomCollections, setSelectedCollectionsIndex, setCustomCount, resetSort, setIsLoading, setStoreError]); // Auto-submit prefilled data (wait for collections to be ready) useEffect(() => { @@ -415,7 +380,7 @@ const SCOM = () => { handleAddCustomCollection(JSON.stringify(pendingPrefill.json), pendingPrefill.name || ''); setPendingPrefill(null); } - }, [pendingPrefill, isLoading, collections]); + }, [pendingPrefill, isLoading, collections, handleAddCustomCollection]); useEffect(() => { const fetchAllSimilarities = async () => { @@ -424,41 +389,10 @@ const SCOM = () => { for (let i = 0; i < selectedCollectionsIndex.length; i++) { if (!selectedCollectionsIndex[i]) continue; - // const collection = allCollections[i]; - - - if (i < collections.results.length && relationship === 'manyToMany') { - // // server collection - // try { - // const result = await fetchSimilarities(collection); - // if (result?.similarities) { - // const flattenedSimilarities = result.similarities.map((s) => ({ - // selectedDigest: collection, - // comparedDigest: s.digest, - // comparedAlias: s.human_readable_name, - // lengths: s.similarities.lengths, - // name_length_pairs: s.similarities.name_length_pairs, - // names: s.similarities.names, - // sequences: s.similarities.sequences, - // sorted_sequences: s.similarities.sorted_sequences, - // custom: false, - // raw: null, - // })); - // allSimilarities.push(...flattenedSimilarities); - // } - // } catch (error) { - // console.error( - // `Error fetching similarities for ${collection}:`, - // error, - // ); - // } - } else { - // custom collection - const customIndex = i - collections.results.length; - const customCollection = customCollections[customIndex]; - if (customCollection) { - allSimilarities.push(...customCollection.similarities); - } + const customIndex = i - collections.results.length; + const customCollection = customCollections[customIndex]; + if (customCollection) { + allSimilarities.push(...customCollection.similarities); } } @@ -469,40 +403,19 @@ const SCOM = () => { }, [selectedCollectionsIndex, customCollections]); const handleSortTable = (column) => { - if (sortBy === column) { - setSortAscending(!sortAscending) - sortSimilarities() - } else { - setSortBy(column) - setSortAscending(false) - sortSimilarities() - } + sortByColumn(column); }; + if (!collections) { + return
    Failed to load collection data.
;
+  }
+
   return (
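An aside on the SCIM change above: the `encodeToBase64`/`decodeFromBase64` helpers it now imports are not shown in this diff. A plausible Unicode-safe pair looks like the sketch below; bare `btoa`/`atob` throw on characters outside Latin-1, which is presumably why shared helpers exist. This is an assumption, not the actual utilities.jsx code:

    // Encode a JS string as base64 via its UTF-8 bytes
    const encodeToBase64 = (str) =>
      btoa(String.fromCharCode(...new TextEncoder().encode(str)));

    // Decode base64 back to a JS string, interpreting the bytes as UTF-8
    const decodeFromBase64 = (b64) =>
      new TextDecoder().decode(Uint8Array.from(atob(b64), (c) => c.charCodeAt(0)));

For very large payloads the spread call can hit argument-count limits, in which case a chunked loop is the usual workaround.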

    Seqcol Comparison Overview Module (SCOM)

    - {/*
      -
    • - handleRelationshipChange('oneToMany')} - > - One-to-Many - -
    • -
    • - handleRelationshipChange('manyToMany')} - > - Many-to-Many - -
    • -
    */}
    @@ -525,11 +438,6 @@ const SCOM = () => {
    - {/*

    - If you would like to view metrics for multiple sequence collections - at once, use the "Many-to-Many" tab. -

    */} -
    { value={customCollectionJSON} placeholder='Paste output from `refget fasta seqcol yourfasta.fa` here.' className='form-control tiny border-0 rounded-0 rounded-bottom z-active' - // style={{ maxHeight: 'calc(200px - 32.333333px)' }} rows='12' />
    - {/* {relationship === 'manyToMany' && ( -
    -
    -
    - - Selected Sequence Collections - - - - -
    -
      - {allCollections && - allCollections.map((collection, index) => ( -
    • -
      -
      - handleSelectCollection(index)} - checked={selectedCollectionsIndex[index]} - /> - -
      - {index >= collections.results.length ? ( - { - const customIndex = - index - collections.results.length; - setCustomCollections((prev) => - prev.filter((_, i) => i !== customIndex), - ); - setSelectedCollectionsIndex((prev) => - prev.filter((_, i) => i !== index), - ); - toast.success('Custom collection removed.'); - }} - /> - ) : ( - - )} -
      -
    • - ))} -
    -
    -
    - )} */}
    + {storeError && ( +
    +
    + + Error: {storeError} +
    + +
    + )} + {(similarities && !isLoading) ? (
    @@ -716,7 +553,6 @@ const SCOM = () => { {relationship === 'manyToMany' && ( )} - {/* {relationship === 'manyToMany' && } */} {relationship === 'oneToMany' && ( )} @@ -735,74 +571,9 @@ const SCOM = () => {
    Heatmap
    - {/* */}
    rest)} /> - {/* {relationship === 'manyToMany' && ( - <> -
    -
    Network Graph
    -
    - Threshold - - setNetworkThreshold(Number(e.target.value)) - } - className='form-control form-range' - style={{ height: 'inherit' }} - /> - - setNetworkThreshold(Number(e.target.value)) - } - className='form-control' - style={{ maxWidth: '70px' }} - /> -
    - -
    - rest)} - metric={networkMetric} - threshold={networkThreshold} - /> - - )} */} -
    Seqcol Comparison Summary Table

    @@ -813,7 +584,6 @@ const SCOM = () => { - {/* */} @@ -830,14 +600,13 @@ const SCOM = () => { className='cursor-pointer' onClick={() => handleNavigateSCIM(row)} > - {/* */} - - - - - + + + + + ))} diff --git a/frontend/src/pages/StoreAliases.jsx b/frontend/src/pages/StoreAliases.jsx new file mode 100644 index 0000000..8ff33dc --- /dev/null +++ b/frontend/src/pages/StoreAliases.jsx @@ -0,0 +1,198 @@ +import { useState, useEffect } from 'react'; +import { Link, useSearchParams } from 'react-router-dom'; +import { useExplorerStore } from '../stores/explorerStore.js'; +import { StoreNav } from '../components/StoreNav.jsx'; + +const AliasNamespacePanel = ({ type, storeUrlParam, availableNamespaces }) => { + const { loadAliases } = useExplorerStore(); + const [namespace, setNamespace] = useState(''); + const [aliases, setAliases] = useState(null); + const [loading, setLoading] = useState(false); + const [error, setError] = useState(null); + const [filter, setFilter] = useState(''); + + const handleLoad = async (e) => { + e?.preventDefault(); + if (!namespace.trim()) return; + setLoading(true); + setError(null); + try { + const data = await loadAliases(type, namespace.trim()); + if (!data) { + setError(`Namespace "${namespace}" not found.`); + setAliases(null); + } else { + setAliases(data); + } + } catch (err) { + setError(err.message); + } finally { + setLoading(false); + } + }; + + const handleNamespaceClick = (ns) => { + setNamespace(ns); + setFilter(''); + setError(null); + setLoading(true); + loadAliases(type, ns) + .then((data) => { + if (!data) { + setError(`Namespace "${ns}" not found.`); + setAliases(null); + } else { + setAliases(data); + } + }) + .catch((err) => setError(err.message)) + .finally(() => setLoading(false)); + }; + + const filtered = aliases + ? aliases.filter( + (a) => + !filter || + a.alias.toLowerCase().includes(filter.toLowerCase()) || + a.digest.toLowerCase().includes(filter.toLowerCase()), + ) + : null; + + const linkPrefix = + type === 'sequences' + ? null // sequences don't have a detail page in the explorer + : `/explore-store/collection/`; + + return ( +
    +
    +
    + + {type} aliases +
    +
    +
    + {availableNamespaces && availableNamespaces.length > 0 ? ( +
    + Namespaces: + {availableNamespaces.map((ns) => ( + + ))} + {loading && } +
    + ) : ( +

    + + No {type} alias namespaces found in this store. +

    + )} + + {error && ( +
    {error}
    + )} + + {filtered && ( + <> +
    + + {filtered.length} aliases in "{namespace}" + + setFilter(e.target.value)} + /> +
    +
    +
    handleSortTable('selectedDigest')}>Seqcol A handleSortTable('comparedAlias')}>Compared Seqcol handleSortTable('comparedDigest')}>Compared Seqcol Digest handleSortTable('lengths')}>Lengths {row.selectedDigest}{row.comparedAlias ? row.comparedAlias : row.comparedDigest} {row.comparedDigest}{Number.isInteger(row.lengths) ? row.lengths : row.lengths.toFixed(3)}{Number.isInteger(row.name_length_pairs) ? row.name_length_pairs : row.name_length_pairs.toFixed(3)}{Number.isInteger(row.names) ? row.names : row.names.toFixed(3)}{Number.isInteger(row.sequences) ? row.sequences : row.sequences.toFixed(3)}{Number.isInteger(row.sorted_sequences) ? row.sorted_sequences : row.sorted_sequences.toFixed(3)}{row.lengths != null ? (Number.isInteger(row.lengths) ? row.lengths : row.lengths.toFixed(3)) : '-'}{row.name_length_pairs != null ? (Number.isInteger(row.name_length_pairs) ? row.name_length_pairs : row.name_length_pairs.toFixed(3)) : '-'}{row.names != null ? (Number.isInteger(row.names) ? row.names : row.names.toFixed(3)) : '-'}{row.sequences != null ? (Number.isInteger(row.sequences) ? row.sequences : row.sequences.toFixed(3)) : '-'}{row.sorted_sequences != null ? (Number.isInteger(row.sorted_sequences) ? row.sorted_sequences : row.sorted_sequences.toFixed(3)) : '-'}
    + + + + + + + + {filtered.map((a, i) => ( + + + + + ))} + +
    AliasDigest
    {a.alias} + {linkPrefix ? ( + + {a.digest} + + ) : ( + a.digest + )} +
    +

    + + )} +
    +
    + ); +}; + +const StoreAliases = () => { + const [searchParams] = useSearchParams(); + const { storeUrl, metadata, loading, loadStore } = useExplorerStore(); + + const urlParam = searchParams.get('url'); + const storeUrlParam = `?url=${encodeURIComponent(storeUrl || urlParam)}`; + + useEffect(() => { + if (urlParam && !metadata && !loading) { + loadStore(urlParam).catch(() => {}); + } + }, [urlParam]); // eslint-disable-line react-hooks/exhaustive-deps + + if (!metadata && !loading) { + return ( +
    + No store loaded.{' '} + Go back to enter a store URL. +
    + ); + } + + if (loading) { + return ( +
    +
    +
    + ); + } + + return ( +
    + + +

    + Aliases map human-readable names to digests. Select a namespace to + browse its alias mappings. +

    + + + +
+  );
+};
+
+export { StoreAliases };
diff --git a/frontend/src/pages/StoreCollection.jsx b/frontend/src/pages/StoreCollection.jsx
new file mode 100644
index 0000000..2602197
--- /dev/null
+++ b/frontend/src/pages/StoreCollection.jsx
@@ -0,0 +1,279 @@
+import { useState, useEffect } from 'react';
+import { Link, useParams, useSearchParams } from 'react-router-dom';
+import { useExplorerStore } from '../stores/explorerStore.js';
+import { StoreNav } from '../components/StoreNav.jsx';
+import { CliCommand } from '../components/CliSnippet.jsx';
+
+const StoreCollection = () => {
+  const { digest } = useParams();
+  const [searchParams] = useSearchParams();
+  const { storeUrl, metadata, loadStore, loadCollection, loadFhrMetadata, loading } =
+    useExplorerStore();
+  const [collection, setCollection] = useState(null);
+  const [fhr, setFhr] = useState(undefined);
+  const [error, setError] = useState(null);
+  const [loadingCol, setLoadingCol] = useState(true);
+  const [selectedSeq, setSelectedSeq] = useState(null);
+  const [seqCodeTab, setSeqCodeTab] = useState('cli');
+
+  const urlParam = searchParams.get('url');
+  const storeUrlParam = `?url=${encodeURIComponent(storeUrl || urlParam)}`;
+
+  useEffect(() => {
+    const load = async () => {
+      try {
+        // Ensure store is loaded
+        if (!metadata && urlParam) {
+          await loadStore(urlParam);
+        }
+        const col = await loadCollection(digest);
+        setCollection(col);
+        const fhrData = await loadFhrMetadata(digest);
+        setFhr(fhrData);
+      } catch (err) {
+        setError(err.message);
+      } finally {
+        setLoadingCol(false);
+      }
+    };
+    load();
+  }, [digest, urlParam]); // eslint-disable-line react-hooks/exhaustive-deps
+
+  if (!metadata && !loading && !loadingCol) {
+    return (
+
    + No store loaded.{' '} + Go back to enter a store URL. +
    + ); + } + + if (loading || loadingCol) { + return ( +
    +
    +

    Loading collection...

    +
    + ); + } + + if (error) { + return ( +
    + +
    {error}
    +
+    );
+  }
+
+  const { metadata: colMeta, sequences } = collection;
+  const totalBases = sequences.reduce((sum, s) => sum + s.length, 0);
+  const alphabetCounts = {};
+  sequences.forEach((s) => {
+    alphabetCounts[s.alphabet] = (alphabetCounts[s.alphabet] || 0) + 1;
+  });
+
+  return (
+
    + + +
    {digest}
    + + {/* Summary stats */} +
    +
    +
    +
    + Sequences + {sequences.length.toLocaleString()} +
    +
    +
    +
    +
    +
    + Total bases + {totalBases.toLocaleString()} +
    +
    +
    + {Object.keys(alphabetCounts).length > 0 && ( +
    +
    +
    + Alphabets + + + {Object.entries(alphabetCounts).map(([alph, count]) => ( + + + + + ))} + +
    {alph}{count.toLocaleString()}
    +
    +
    +
    + )} +
    + + {/* Collection metadata from ## headers */} + {Object.keys(colMeta).length > 0 && ( +
    +
    +
    Collection Metadata
    +
    +
    + + + {Object.entries(colMeta).map(([key, value]) => ( + + + + + ))} + +
    {key}{value}
    +
    +
    + )} + + {/* FHR metadata */} + {fhr ? ( +
    +
    +
    + + FHR Metadata +
    +
    +
    +
    +              {JSON.stringify(fhr, null, 2)}
    +            
    +
    +
    + ) : fhr === null ? ( +

    + + No FHR metadata sidecar found for this collection. +

    + ) : null} + + {/* Sequence table */} +
    +
    +
    Sequences in this collection
    +
    +
    +
    + + + + + + + + + + {sequences.map((seq, i) => ( + setSelectedSeq(seq)} + > + + + + + ))} + +
    NameLengthSHA-512/24u
    {seq.name} + {seq.length.toLocaleString()} + {seq.sha512t24u}
    +
    +
    +
    + + {/* Sequence detail modal */} + {selectedSeq && ( + <> +
    setSelectedSeq(null)} /> +
    setSelectedSeq(null)}> +
    e.stopPropagation()}> +
    +
    +
    {selectedSeq.name}
    +
    +
    + + + + + + + + + + + + + + + + + + + {selectedSeq.description && ( + + + + + )} + +
    Length{selectedSeq.length.toLocaleString()}
    Alphabet{selectedSeq.alphabet}
    SHA-512/24u{selectedSeq.sha512t24u}
    MD5{selectedSeq.md5}
    Description{selectedSeq.description}
    + +
    Code
    +
      +
    • + +
    • +
    • + +
    • +
    + Get sequence + +
    +
    +
    +
    + + )} +
+  );
+};
+
+export { StoreCollection };
diff --git a/frontend/src/pages/StoreExplorer.jsx b/frontend/src/pages/StoreExplorer.jsx
new file mode 100644
index 0000000..c7a2d17
--- /dev/null
+++ b/frontend/src/pages/StoreExplorer.jsx
@@ -0,0 +1,149 @@
+import { useState, useEffect } from 'react';
+import { useNavigate, useSearchParams } from 'react-router-dom';
+import { useExplorerStore } from '../stores/explorerStore.js';
+
+const RECENT_STORES_KEY = 'refget-explorer-recent-stores';
+const MAX_RECENT = 5;
+
+const getRecentStores = () => {
+  try {
+    return JSON.parse(localStorage.getItem(RECENT_STORES_KEY)) || [];
+  } catch {
+    return [];
+  }
+};
+
+const saveRecentStore = (url) => {
+  const recent = getRecentStores().filter((u) => u !== url);
+  recent.unshift(url);
+  localStorage.setItem(
+    RECENT_STORES_KEY,
+    JSON.stringify(recent.slice(0, MAX_RECENT)),
+  );
+};
+
+const StoreExplorer = () => {
+  const [searchParams] = useSearchParams();
+  const navigate = useNavigate();
+  const { loadStore, loading, error, storeUrl } = useExplorerStore();
+  const [url, setUrl] = useState(searchParams.get('url') || '');
+  const [localError, setLocalError] = useState(null);
+  const recentStores = getRecentStores();
+
+  // Auto-load if URL param provided
+  useEffect(() => {
+    const paramUrl = searchParams.get('url');
+    if (paramUrl && paramUrl !== storeUrl) {
+      handleExplore(paramUrl);
+    }
+  }, []); // eslint-disable-line react-hooks/exhaustive-deps
+
+  const handleExplore = async (targetUrl) => {
+    const trimmed = (targetUrl || url).trim();
+    if (!trimmed) return;
+    setLocalError(null);
+    try {
+      await loadStore(trimmed);
+      saveRecentStore(trimmed);
+      navigate(`/explore-store/overview?url=${encodeURIComponent(trimmed)}`);
+    } catch (err) {
+      setLocalError(err.message);
+    }
+  };
+
+  const handleSubmit = (e) => {
+    e.preventDefault();
+    handleExplore();
+  };
+
+  return (
+
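The recent-stores helpers above deliberately swallow parse errors, so a corrupted localStorage entry degrades to an empty list rather than crashing the page. A hypothetical session illustrating the behavior:

    localStorage.setItem('refget-explorer-recent-stores', '{not json');
    console.log(getRecentStores()); // [] -- JSON.parse threw, the catch returned []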
    +

    + + RefgetStore Explorer +

    +

    + Browse the contents of any RefgetStore — sequences, collections, aliases, + and metadata. Enter the URL of a store hosted on any HTTP server. +

    + +
    +
    + setUrl(e.target.value)} + required + /> + +
    + + + {(localError || error) && ( +
    + Failed to load store: {localError || error} +

    + Make sure the URL points to a valid RefgetStore directory with an{' '} + rgstore.json file. The server must allow cross-origin + requests (CORS). +

    +
    + )} + + {recentStores.length > 0 && ( +
    +
    Recent stores
    +
    + {recentStores.map((recentUrl) => ( +
    + {recentUrl} + + + + +
    + ))} +
    +
    + )} +
    + ); +}; + +export { StoreExplorer }; diff --git a/frontend/src/pages/StoreOverview.jsx b/frontend/src/pages/StoreOverview.jsx new file mode 100644 index 0000000..3505300 --- /dev/null +++ b/frontend/src/pages/StoreOverview.jsx @@ -0,0 +1,309 @@ +import { useState, useEffect } from 'react'; +import { Link, useNavigate, useSearchParams } from 'react-router-dom'; +import { useExplorerStore } from '../stores/explorerStore.js'; +import { StoreNav } from '../components/StoreNav.jsx'; +import { RowCodeButton } from '../components/CliSnippet.jsx'; + +const StoreOverview = () => { + const [searchParams] = useSearchParams(); + const navigate = useNavigate(); + const { storeUrl, metadata, sequenceIndex, collections, loading, loadStore, loadSequenceIndex } = + useExplorerStore(); + const [seqLoading, setSeqLoading] = useState(false); + + const urlParam = searchParams.get('url'); + + // If we have a URL param but no loaded store, load it + useEffect(() => { + const init = async () => { + if (urlParam && !metadata && !loading) { + await loadStore(urlParam).catch(() => {}); + } + }; + init(); + }, [urlParam]); // eslint-disable-line react-hooks/exhaustive-deps + + // Auto-load sequence index (fetchSequenceIndex handles size check internally) + useEffect(() => { + if (metadata && !sequenceIndex && !seqLoading) { + setSeqLoading(true); + loadSequenceIndex() + .catch(() => {}) + .finally(() => setSeqLoading(false)); + } + }, [metadata]); // eslint-disable-line react-hooks/exhaustive-deps + + if (!metadata && !loading) { + return ( +
    + No store loaded.{' '} + Go back to enter a store URL. +
    + ); + } + + if (loading) { + return ( +
    +
    +

    Loading store...

    +
    + ); + } + + const totalBases = sequenceIndex + ? sequenceIndex.reduce((sum, s) => sum + s.length, 0) + : 0; + + const alphabetCounts = {}; + if (sequenceIndex) { + sequenceIndex.forEach((s) => { + alphabetCounts[s.alphabet] = (alphabetCounts[s.alphabet] || 0) + 1; + }); + } + + const storeUrlParam = `?url=${encodeURIComponent(storeUrl || urlParam)}`; + + return ( +
    + + +
    + {/* Store info card */} +
    +
    +
    +
    + + Store Info +
    +
    +
    + + + + + + + + + + + + + + + {metadata.created_at && ( + + + + + )} + +
    URL + {storeUrl || urlParam} +
    Version{metadata.version}
    Storage Mode + + {metadata.mode} + +
    Created{new Date(metadata.created_at).toLocaleString()}
    +
    +
    +
    + + {/* Sequences summary card */} +
    +
    +
    +
    + + Sequences +
    + + Browse all + +
    +
    + {sequenceIndex ? ( + + + + + + + + + + + + + + + +
    Total sequences{sequenceIndex.length.toLocaleString()}
    Total bases{totalBases.toLocaleString()}
    Alphabets + {Object.entries(alphabetCounts).map(([alph, count]) => ( + + {alph}: {count} + + ))} +
    + ) : seqLoading ? ( +
    + + Loading sequence index... +
    + ) : ( +

    + Sequence index not available. +

    + )} +
    +
    +
    +
    + + {/* Collections */} +
    +
    +
    + + Collections{collections?.length > 0 && ` (${collections.length.toLocaleString()})`} +
    +
    +
    + {collections && collections.length > 0 ? ( +
    + + + + + + + + + + {collections.map((col) => ( + + + + + + ))} + +
    DigestSequences
    + + {col.digest} + + {col.n_sequences} + +
    +
    + ) : ( +

    + No collection index (collections.rgci) found. Individual + collections can still be viewed if you know the digest. +

    + )} +
    +
    + + {/* Aliases section */} +
    +
    +
    + + Aliases +
    + + Browse aliases + +
    +
    + {(metadata.sequence_alias_namespaces?.length > 0 || metadata.collection_alias_namespaces?.length > 0) ? ( + + + {metadata.sequence_alias_namespaces?.length > 0 && ( + + + + + )} + {metadata.collection_alias_namespaces?.length > 0 && ( + + + + + )} + +
    Sequence namespaces + {metadata.sequence_alias_namespaces.map((ns) => ( + + {ns} + + ))} +
    Collection namespaces + {metadata.collection_alias_namespaces.map((ns) => ( + + {ns} + + ))} +
    + ) : ( +

    + No alias namespace information available. +

    + )} +
    +
    + +
    + ); +}; + +export { StoreOverview }; diff --git a/frontend/src/pages/StoreSequences.jsx b/frontend/src/pages/StoreSequences.jsx new file mode 100644 index 0000000..ae09afe --- /dev/null +++ b/frontend/src/pages/StoreSequences.jsx @@ -0,0 +1,337 @@ +import { useState, useMemo, useEffect } from 'react'; +import { Link, useSearchParams } from 'react-router-dom'; +import { useExplorerStore } from '../stores/explorerStore.js'; +import { StoreNav } from '../components/StoreNav.jsx'; +import { CliCommand } from '../components/CliSnippet.jsx'; + +const PAGE_SIZE = 50; + +const formatBytes = (bytes) => { + if (bytes < 1024) return `${bytes} B`; + if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`; + return `${(bytes / (1024 * 1024)).toFixed(1)} MB`; +}; + +const StoreSequences = () => { + const [searchParams] = useSearchParams(); + const { + storeUrl, sequenceIndex, sequenceIndexPartial, sequenceIndexTotalSize, + metadata, loading, loadStore, loadSequenceIndex, + } = useExplorerStore(); + const [filter, setFilter] = useState(''); + const [sortCol, setSortCol] = useState(null); + const [sortAsc, setSortAsc] = useState(true); + const [page, setPage] = useState(0); + const [seqLoading, setSeqLoading] = useState(false); + const [seqError, setSeqError] = useState(null); + const [selectedSeq, setSelectedSeq] = useState(null); + const [seqCodeTab, setSeqCodeTab] = useState('cli'); + + const urlParam = searchParams.get('url'); + const storeUrlParam = `?url=${encodeURIComponent(storeUrl || urlParam)}`; + + // Auto-load on mount — fetchSequenceIndex handles the size check internally + useEffect(() => { + const init = async () => { + if (urlParam && !metadata && !loading) { + await loadStore(urlParam).catch(() => {}); + } + if (!sequenceIndex && !seqLoading) { + setSeqLoading(true); + try { + await loadSequenceIndex(); + } catch (err) { + setSeqError(err.message); + } finally { + setSeqLoading(false); + } + } + }; + init(); + }, [urlParam, metadata]); // eslint-disable-line react-hooks/exhaustive-deps + + const handleLoadMore = async (maxBytes) => { + setSeqLoading(true); + setSeqError(null); + try { + await loadSequenceIndex(maxBytes ? { maxBytes } : {}); + } catch (err) { + setSeqError(err.message); + } finally { + setSeqLoading(false); + } + }; + + const filtered = useMemo(() => { + if (!sequenceIndex) return []; + const term = filter.toLowerCase(); + return sequenceIndex.filter( + (s) => + !term || + s.name?.toLowerCase().includes(term) || + s.sha512t24u?.toLowerCase().includes(term) || + s.md5?.toLowerCase().includes(term) || + s.description?.toLowerCase().includes(term), + ); + }, [sequenceIndex, filter]); + + const sorted = useMemo(() => { + if (!sortCol) return filtered; + return [...filtered].sort((a, b) => { + const va = a[sortCol]; + const vb = b[sortCol]; + if (typeof va === 'number' && typeof vb === 'number') { + return sortAsc ? va - vb : vb - va; + } + return sortAsc + ? String(va).localeCompare(String(vb)) + : String(vb).localeCompare(String(va)); + }); + }, [filtered, sortCol, sortAsc]); + + const totalPages = Math.ceil(sorted.length / PAGE_SIZE); + const paged = sorted.slice(page * PAGE_SIZE, (page + 1) * PAGE_SIZE); + + const handleSort = (col) => { + if (sortCol === col) { + setSortAsc(!sortAsc); + } else { + setSortCol(col); + setSortAsc(true); + } + setPage(0); + }; + + const SortIcon = ({ col }) => { + if (sortCol !== col) return null; + return ; + }; + + if (!metadata && !loading) { + return ( +
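The sort comparator in the `sorted` memo above branches on type: numeric columns subtract, everything else falls back to `localeCompare`. The distinction matters because string comparison orders numbers lexicographically, as this small check shows:

    const rows = [{ length: 9 }, { length: 100 }];
    [...rows].sort((a, b) => a.length - b.length);
    // numeric order: 9, 100
    [...rows].sort((a, b) => String(a.length).localeCompare(String(b.length)));
    // lexicographic order: 100, 9 -- '1' sorts before '9'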
    + No store loaded.{' '} + Go back to enter a store URL. +
    + ); + } + + if (loading || seqLoading) { + return ( +
    +
    +

    + {seqLoading ? 'Loading sequence index...' : 'Loading store...'} +

    +
    + ); + } + + if (seqError) { + return ( +
    + +
    {seqError}
    +
    + ); + } + + if (!sequenceIndex) { + return ( +
    + +
    + No sequence index (sequences.rgsi) found in this store. +
    +
+    );
+  }
+
+  const columns = [
+    { key: 'name', label: 'Name' },
+    { key: 'length', label: 'Length' },
+    { key: 'sha512t24u', label: 'SHA-512/24u' },
+  ];
+
+  return (
+
+
+
+      {/* Partial load banner */}
+      {sequenceIndexPartial && (
+
+
+          Sequence index is {formatBytes(sequenceIndexTotalSize)} — showing first{' '}
+          {sequenceIndex.length.toLocaleString()} sequences.
+          Sorting and filtering apply only to loaded data.
+
+
+      )}
+
    + + {filtered.length.toLocaleString()} sequences + {filter && ` (filtered from ${sequenceIndex.length.toLocaleString()})`} + {sequenceIndexPartial && ' (partial)'} + + { + setFilter(e.target.value); + setPage(0); + }} + /> +
    + +
    + + + + {columns.map((col) => ( + + ))} + + + + {paged.map((seq, i) => ( + setSelectedSeq(seq)} + > + + + + + ))} + +
    handleSort(col.key)} + style={{ cursor: 'pointer' }} + className={col.key === 'length' ? 'text-end' : ''} + > + {col.label} + +
    {seq.name} + {seq.length.toLocaleString()} + {seq.sha512t24u}
    +
    + + {/* Sequence detail modal */} + {selectedSeq && ( + <> +
    setSelectedSeq(null)} /> +
    setSelectedSeq(null)}> +
    e.stopPropagation()}> +
    +
    +
    {selectedSeq.name}
    +
    +
    + + + + + + + + + + + + + + + + + + + {selectedSeq.description && ( + + + + + )} + +
    Length{selectedSeq.length.toLocaleString()}
    Alphabet{selectedSeq.alphabet}
    SHA-512/24u{selectedSeq.sha512t24u}
    MD5{selectedSeq.md5}
    Description{selectedSeq.description}
    + +
    Code
    +
      +
    • + +
    • +
    • + +
    • +
    + Get sequence + +
    +
    +
    +
    + + )} + + {totalPages > 1 && ( + + )} +
    + ); +}; + +export { StoreSequences }; diff --git a/frontend/src/services/fetchData.jsx b/frontend/src/services/fetchData.jsx index 6ee01f9..9d8173a 100644 --- a/frontend/src/services/fetchData.jsx +++ b/frontend/src/services/fetchData.jsx @@ -1,120 +1,197 @@ import { API_BASE } from '../utilities.jsx'; +export class AppError extends Error { + constructor(message, { status, isNotFound, digest1, digest2 } = {}) { + super(message); + this.name = 'AppError'; + this.status = status ?? null; + this.isNotFound = isNotFound ?? false; + this.digest1 = digest1 ?? null; + this.digest2 = digest2 ?? null; + } +} + +const checkResponse = async (response, url) => { + if (!response.ok) { + let errorDetail = response.statusText; + try { + const errorData = await response.json(); + errorDetail = errorData.detail || errorData.message || errorData.error || errorDetail; + } catch { + try { + errorDetail = await response.text(); + if (errorDetail.length > 200) { + errorDetail = errorDetail.substring(0, 200) + '...'; + } + } catch { + // Fallback to status text if body cannot be read + } + } + throw new Error(`HTTP ${response.status} from ${url}: ${errorDetail}`); + } + return response; +}; + export const fetchServiceInfo = async () => { - const response = await fetch(`${API_BASE}/service-info`); + try { + const url = `${API_BASE}/service-info`; + const response = await fetch(url); + await checkResponse(response, url); + return response.json(); + } catch { + return null; + } +}; + +export const fetchServiceInfoFromUrl = async (baseUrl) => { + const url = `${baseUrl.replace(/\/+$/, '')}/service-info`; + const response = await fetch(url); + await checkResponse(response, url); return response.json(); }; -export const fetchPangenomeLevels = async ( - digest, - level = '2', - collated = true, -) => { - const url = `${API_BASE}/pangenome/${digest}?level=1`; - const url2 = `${API_BASE}/pangenome/${digest}?level=2`; - const urlItemwise = `${API_BASE}/pangenome/${digest}?collated=false`; - let resps = [ - fetch(url).then((response) => response.json()), - fetch(url2).then((response) => response.json()), - fetch(urlItemwise).then((response) => response.json()), +export const fetchPangenomeLevels = async (digest) => { + const urls = [ + `${API_BASE}/pangenome/${digest}?level=1`, + `${API_BASE}/pangenome/${digest}?level=2`, + `${API_BASE}/pangenome/${digest}?collated=false`, ]; - return Promise.all(resps); + return Promise.all( + urls.map(async (url) => { + const response = await fetch(url); + await checkResponse(response, url); + return response.json(); + }), + ); }; -export const fetchSeqColList = async () => { - const url = `${API_BASE}/list/collection?page_size=10&page=0`; - const url2 = `${API_BASE}/list/pangenome?page_size=5`; - const url3 = `${API_BASE}/list/attributes/name_length_pairs?page_size=5`; - let resps = [ - fetch(url).then((response) => response.json()), - fetch(url2).then((response) => response.json()), - fetch(url3).then((response) => response.json()), - ]; - return Promise.all(resps); -}; +export const fetchSeqColList = async (baseUrl) => { + const base = (baseUrl || API_BASE).replace(/\/+$/, ''); + const fetchRequired = async (url) => { + const response = await fetch(url); + await checkResponse(response, url); + return response.json(); + }; -export const fetchAllSeqCols = async () => { - const url = `${API_BASE}/list/collection?page_size=1000&page=0`; - let resps = [fetch(url).then((response) => response.json())]; - return Promise.all(resps); -}; + const fetchOptional = async (url) => { + try { + 
const response = await fetch(url); + if (!response.ok) return null; + return response.json(); + } catch { + return null; + } + }; -export const fetchSeqColDetails = async ( - digest, - level = '2', - collated = true, -) => { - const url = `${API_BASE}/collection/${digest}?level=${level}&collated=${collated}`; - return fetch(url).then((response) => response.json()); + return Promise.all([ + fetchRequired(`${base}/list/collection?page_size=10&page=0`), + fetchOptional(`${base}/list/pangenome?page_size=5`), + fetchRequired(`${base}/list/attributes/name_length_pairs?page_size=5`), + ]); }; -export const fetchCollectionLevels = async (digest) => { +export const fetchAllSeqCols = async (baseUrl) => { + const base = (baseUrl || API_BASE).replace(/\/+$/, ''); const urls = [ - `${API_BASE}/collection/${digest}?level=1`, - `${API_BASE}/collection/${digest}?level=2`, - `${API_BASE}/collection/${digest}?collated=false`, + `${base}/list/collection?page_size=1000&page=0`, ]; - const responses = await Promise.all( - urls.map((url) => - fetch(url).then((response) => { - if (!response.ok) { - throw new Error( - `Error fetching data from ${url}: ${response.statusText}`, - ); - } - return response.json(); - }), - ), + return Promise.all( + urls.map(async (url) => { + const response = await fetch(url); + await checkResponse(response, url); + return response.json(); + }), ); +}; + +export const fetchCollectionLevels = async (digest, baseUrl) => { + const base = (baseUrl || API_BASE).replace(/\/+$/, ''); + const urls = [ + `${base}/collection/${digest}?level=1`, + `${base}/collection/${digest}?level=2`, + `${base}/collection/${digest}?collated=false`, + ]; - return responses; + return Promise.all( + urls.map(async (url) => { + const response = await fetch(url); + await checkResponse(response, url); + return response.json(); + }), + ); }; -export const fetchComparison = async (digest1, digest2) => { - const url = `${API_BASE}/comparison/${digest1}/${digest2}`; - return fetch(url).then((response) => response.json()); +export const fetchComparison = async (digest1, digest2, baseUrl) => { + const base = (baseUrl || API_BASE).replace(/\/+$/, ''); + const url = `${base}/comparison/${digest1}/${digest2}`; + const response = await fetch(url); + if (!response.ok) { + if (response.status === 404) { + throw new AppError('Collection not found', { + status: 404, + isNotFound: true, + digest1, + digest2, + }); + } + await checkResponse(response, url); + } + return response.json(); }; -export const fetchComparisonJSON = async (data, digest) => { - const url = `${API_BASE}/comparison/${digest}`; - return fetch(url, { +export const fetchComparisonJSON = async (data, digest, baseUrl) => { + const base = (baseUrl || API_BASE).replace(/\/+$/, ''); + const url = `${base}/comparison/${digest}`; + const response = await fetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json', }, body: JSON.stringify(data), - }).then((response) => response.json()); + }); + await checkResponse(response, url); + return response.json(); }; -export const fetchAttribute = async (attribute, digest) => { - const url = `${API_BASE}/list/collection?${attribute}=${digest}`; - const url2 = `${API_BASE}/attribute/collection/${attribute}/${digest}`; - let resps = [ - fetch(url).then((response) => response.json()), - fetch(url2).then((response) => response.json()), +export const fetchAttribute = async (attribute, digest, baseUrl) => { + const base = (baseUrl || API_BASE).replace(/\/+$/, ''); + const urls = [ + 
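A sketch of how a caller might use the new `AppError` from this service layer to tell a missing collection apart from other failures. The wrapper function is hypothetical; `fetchComparison` and `AppError` are the exports shown in this diff:

    import { fetchComparison, AppError } from '../services/fetchData.jsx';

    const compareOrNull = async (digestA, digestB) => {
      try {
        return await fetchComparison(digestA, digestB);
      } catch (err) {
        // 404 from the comparison endpoint: one of the digests is unknown
        if (err instanceof AppError && err.isNotFound) return null;
        throw err; // network errors, 5xx, etc. still propagate
      }
    };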
`${base}/list/collection?${attribute}=${digest}`, + `${base}/attribute/collection/${attribute}/${digest}`, ]; - return Promise.all(resps); + + return Promise.all( + urls.map(async (url) => { + const response = await fetch(url); + await checkResponse(response, url); + return response.json(); + }), + ); }; export const fetchSimilarities = async (digest) => { const url = `${API_BASE}/similarities/${digest}?page_size=60`; - return fetch(url, { + const response = await fetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json', }, - }).then((response) => response.json()); + }); + await checkResponse(response, url); + return response.json(); }; export const fetchSimilaritiesJSON = async (data, species) => { const url = `${API_BASE}/similarities/?species=${species}&page_size=60`; - return fetch(url, { + const response = await fetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json', }, body: JSON.stringify(data), - }).then((response) => response.json()); + }); + await checkResponse(response, url); + return response.json(); }; diff --git a/frontend/src/services/storeService.js b/frontend/src/services/storeService.js new file mode 100644 index 0000000..4ac46b2 --- /dev/null +++ b/frontend/src/services/storeService.js @@ -0,0 +1,217 @@ +/** + * Service for fetching and parsing RefgetStore static files. + * A RefgetStore is a directory of static TSV/JSON files — no backend needed. + */ + +// Ensure URL ends without trailing slash +const normalizeUrl = (url) => url.replace(/\/+$/, ''); + +/** + * Parse TSV text into array of objects. + * Handles # comment header lines and ## metadata headers. + * Returns { metadata: {key: value}, rows: [{col: val}] } + */ +const parseTsv = (text) => { + const lines = text.split('\n').filter((l) => l.length > 0); + const metadata = {}; + let headerCols = null; + const rows = []; + + for (const line of lines) { + if (line.startsWith('##')) { + // Metadata header: ##key=value + const eq = line.indexOf('='); + if (eq > 2) { + metadata[line.substring(2, eq)] = line.substring(eq + 1); + } + } else if (line.startsWith('#')) { + // Column header + headerCols = line.substring(1).split('\t'); + } else if (headerCols) { + const fields = line.split('\t'); + const row = {}; + headerCols.forEach((col, i) => { + row[col] = fields[i] ?? ''; + }); + rows.push(row); + } + } + + return { metadata, rows }; +}; + +/** + * Parse a two-column TSV (alias files have no header). + * Returns [{alias, digest}] + */ +const parseAliasTsv = (text) => { + return text + .split('\n') + .filter((l) => l.length > 0 && !l.startsWith('#')) + .map((line) => { + const [alias, digest] = line.split('\t'); + return { alias, digest }; + }); +}; + +/** + * Parse collections.rgci — a TSV with #header row. + * Columns: digest, n_sequences, names_digest, sequences_digest, lengths_digest, + * name_length_pairs_digest, sorted_name_length_pairs_digest, sorted_sequences_digest + */ +const parseRgci = (text) => { + const { rows } = parseTsv(text); + return rows.map((r) => ({ + ...r, + n_sequences: r.n_sequences ? 
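To make the TSV conventions concrete, this is what the `parseTsv` helper shown above returns for a minimal hypothetical index excerpt: `##` lines become metadata, the `#` line names the columns, and values stay strings until a caller converts them (as `parseRgci` does with `parseInt`):

    const text = '##version=0.1\n#name\tlength\nchr1\t248956422\n';
    const { metadata, rows } = parseTsv(text);
    console.log(metadata); // { version: '0.1' }
    console.log(rows);     // [{ name: 'chr1', length: '248956422' }]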
parseInt(r.n_sequences, 10) : 0, + })); +}; + +/** Fetch with error handling */ +const fetchFile = async (url) => { + const response = await fetch(url); + if (!response.ok) { + if (response.status === 404 || response.status === 403) return null; + throw new Error(`HTTP ${response.status} fetching ${url}`); + } + return response; +}; + +/** GET rgstore.json → parsed JSON */ +export const fetchStoreMetadata = async (baseUrl) => { + const url = `${normalizeUrl(baseUrl)}/rgstore.json`; + const response = await fetchFile(url); + if (!response) throw new Error('rgstore.json not found at this URL'); + return response.json(); +}; + +/** Size threshold for auto-loading sequence index (10 MB) */ +const AUTO_LOAD_THRESHOLD = 10 * 1024 * 1024; +/** Default partial load size (2 MB) */ +const PARTIAL_LOAD_SIZE = 2 * 1024 * 1024; + +const parseSequenceRows = (text) => { + const { rows } = parseTsv(text); + return rows.map((r) => ({ + ...r, + length: r.length ? parseInt(r.length, 10) : 0, + })); +}; + +/** + * Check the size of sequences.rgsi via HEAD request. + * Returns { url, size } or null if not found. + */ +export const checkSequenceIndexSize = async (baseUrl) => { + const url = `${normalizeUrl(baseUrl)}/sequences.rgsi`; + try { + const response = await fetch(url, { method: 'HEAD' }); + if (!response.ok) return null; + const size = parseInt(response.headers.get('content-length') || '0', 10); + return { url, size }; + } catch { + return null; + } +}; + +/** + * Fetch sequences.rgsi — auto-loads if small, otherwise requires explicit call. + * Returns { rows, partial, totalSize } + * partial: true if only a prefix was loaded + * totalSize: file size in bytes + */ +export const fetchSequenceIndex = async (baseUrl, { maxBytes } = {}) => { + const url = `${normalizeUrl(baseUrl)}/sequences.rgsi`; + + // Check file size first + let totalSize = 0; + try { + const head = await fetch(url, { method: 'HEAD' }); + if (!head.ok) { + if (head.status === 404 || head.status === 403) throw new Error('sequences.rgsi not found'); + throw new Error(`HTTP ${head.status} fetching ${url}`); + } + totalSize = parseInt(head.headers.get('content-length') || '0', 10); + } catch (err) { + if (err.message.includes('not found')) throw err; + // HEAD failed (CORS?), fall back to full fetch + const response = await fetchFile(url); + if (!response) throw new Error('sequences.rgsi not found'); + const text = await response.text(); + return { rows: parseSequenceRows(text), partial: false, totalSize: text.length }; + } + + const limit = maxBytes || (totalSize <= AUTO_LOAD_THRESHOLD ? totalSize : PARTIAL_LOAD_SIZE); + const loadFull = limit >= totalSize; + + if (loadFull) { + const response = await fetchFile(url); + if (!response) throw new Error('sequences.rgsi not found'); + const text = await response.text(); + return { rows: parseSequenceRows(text), partial: false, totalSize }; + } + + // Partial load via Range header + const response = await fetch(url, { + headers: { Range: `bytes=0-${limit - 1}` }, + }); + if (!response.ok && response.status !== 206) { + // Server doesn't support Range — fall back to full fetch + const fullResponse = await fetchFile(url); + if (!fullResponse) throw new Error('sequences.rgsi not found'); + const text = await fullResponse.text(); + return { rows: parseSequenceRows(text), partial: false, totalSize }; + } + const text = await response.text(); + // Discard last partial line + const lastNewline = text.lastIndexOf('\n'); + const cleanText = lastNewline > 0 ? 
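One edge worth flagging in the partial-load path above: a server that ignores the `Range` header replies 200 with the full body, which still passes the `response.ok` check, so the result is labeled partial even though the whole file arrived. Checking the status code would distinguish the two cases; a sketch, not what the code currently does:

    const response = await fetch(url, { headers: { Range: `bytes=0-${limit - 1}` } });
    const gotPartial = response.status === 206; // 200 means Range was ignored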
text.substring(0, lastNewline) : text; + return { rows: parseSequenceRows(cleanText), partial: true, totalSize }; +}; + +/** GET collections.rgci → array of collection summaries */ +export const fetchCollectionIndex = async (baseUrl) => { + const url = `${normalizeUrl(baseUrl)}/collections.rgci`; + const response = await fetchFile(url); + if (!response) return null; // No collection index available + const text = await response.text(); + return parseRgci(text); +}; + +/** GET collections/{digest}.rgsi → {metadata, sequences} */ +export const fetchCollection = async (baseUrl, digest) => { + const base = normalizeUrl(baseUrl); + // Try .rgsi first (format spec default), then .rgci + let response = await fetchFile(`${base}/collections/${digest}.rgsi`); + if (!response) { + response = await fetchFile(`${base}/collections/${digest}.rgci`); + } + if (!response) + throw new Error(`Collection ${digest} not found`); + const text = await response.text(); + const { metadata, rows } = parseTsv(text); + return { + metadata, + sequences: rows.map((r) => ({ + ...r, + length: r.length ? parseInt(r.length, 10) : 0, + })), + }; +}; + +/** GET aliases/{type}/{namespace}.tsv → [{alias, digest}] */ +export const fetchAliases = async (baseUrl, type, namespace) => { + const url = `${normalizeUrl(baseUrl)}/aliases/${type}/${namespace}.tsv`; + const response = await fetchFile(url); + if (!response) return null; + const text = await response.text(); + return parseAliasTsv(text); +}; + +/** GET collections/{digest}.fhr.json → parsed JSON or null */ +export const fetchFhrMetadata = async (baseUrl, digest) => { + const url = `${normalizeUrl(baseUrl)}/collections/${digest}.fhr.json`; + const response = await fetchFile(url); + if (!response) return null; + return response.json(); +}; diff --git a/frontend/src/stores/apiExplorerStore.js b/frontend/src/stores/apiExplorerStore.js new file mode 100644 index 0000000..e75df30 --- /dev/null +++ b/frontend/src/stores/apiExplorerStore.js @@ -0,0 +1,55 @@ +import { create } from 'zustand'; +import { fetchServiceInfoFromUrl } from '../services/fetchData.jsx'; + +const RECENT_APIS_KEY = 'refget-explorer-recent-apis'; +const MAX_RECENT = 5; + +const getRecentApis = () => { + try { + return JSON.parse(localStorage.getItem(RECENT_APIS_KEY)) || []; + } catch { + return []; + } +}; + +const saveRecentApi = (url) => { + const recent = getRecentApis().filter((u) => u !== url); + recent.unshift(url); + localStorage.setItem( + RECENT_APIS_KEY, + JSON.stringify(recent.slice(0, MAX_RECENT)), + ); +}; + +export const useApiExplorerStore = create((set, get) => ({ + apiUrl: null, + serviceInfo: null, + apiAvailable: false, + loading: false, + error: null, + + probeApi: async (url) => { + const trimmed = url.replace(/\/+$/, ''); + set({ loading: true, error: null, apiUrl: trimmed }); + try { + const info = await fetchServiceInfoFromUrl(trimmed); + saveRecentApi(trimmed); + set({ serviceInfo: info, apiAvailable: true, loading: false }); + return info; + } catch (err) { + set({ serviceInfo: null, apiAvailable: false, loading: false, error: err.message }); + throw err; + } + }, + + reset: () => + set({ + apiUrl: null, + serviceInfo: null, + apiAvailable: false, + loading: false, + error: null, + }), + + getRecentApis, +})); diff --git a/frontend/src/stores/explorerStore.js b/frontend/src/stores/explorerStore.js new file mode 100644 index 0000000..9edd6c1 --- /dev/null +++ b/frontend/src/stores/explorerStore.js @@ -0,0 +1,113 @@ +import { create } from 'zustand'; +import { + fetchStoreMetadata, + 
fetchSequenceIndex, + fetchCollectionIndex, + fetchCollection, + fetchAliases, + fetchFhrMetadata, +} from '../services/storeService.js'; + +export const useExplorerStore = create((set, get) => ({ + storeUrl: null, + metadata: null, + sequenceIndex: null, // array of sequence rows (or null if not loaded) + sequenceIndexPartial: false, // true if only a prefix was loaded + sequenceIndexTotalSize: 0, // total file size in bytes + collections: null, + loadedCollections: {}, + aliases: {}, + fhrMetadata: {}, + loading: false, + error: null, + + setStoreUrl: (url) => set({ storeUrl: url }), + + /** Fetch store metadata + collection index (sequence index is lazy-loaded) */ + loadStore: async (url) => { + set({ loading: true, error: null, storeUrl: url }); + try { + const metadata = await fetchStoreMetadata(url); + set({ metadata }); + + const collections = await fetchCollectionIndex(url).catch(() => null); + + set({ + sequenceIndex: null, + sequenceIndexPartial: false, + sequenceIndexTotalSize: 0, + collections, + loading: false, + loadedCollections: {}, + aliases: {}, + fhrMetadata: {}, + }); + } catch (err) { + set({ loading: false, error: err.message }); + throw err; + } + }, + + /** Fetch and cache the sequence index (lazy — only when needed). + * Options: { maxBytes } to limit partial load size. */ + loadSequenceIndex: async (options) => { + const { storeUrl, sequenceIndex } = get(); + // If already fully loaded, return cached + if (sequenceIndex && !get().sequenceIndexPartial) return sequenceIndex; + + const { rows, partial, totalSize } = await fetchSequenceIndex(storeUrl, options); + set({ + sequenceIndex: rows, + sequenceIndexPartial: partial, + sequenceIndexTotalSize: totalSize, + }); + return rows; + }, + + /** Fetch and cache a single collection */ + loadCollection: async (digest) => { + const { storeUrl, loadedCollections } = get(); + if (loadedCollections[digest]) return loadedCollections[digest]; + + const data = await fetchCollection(storeUrl, digest); + set({ loadedCollections: { ...get().loadedCollections, [digest]: data } }); + return data; + }, + + /** Fetch and cache aliases for a type/namespace */ + loadAliases: async (type, namespace) => { + const { storeUrl, aliases } = get(); + const key = `${type}/${namespace}`; + if (aliases[key]) return aliases[key]; + + const data = await fetchAliases(storeUrl, type, namespace); + set({ aliases: { ...get().aliases, [key]: data } }); + return data; + }, + + /** Fetch and cache FHR metadata for a collection */ + loadFhrMetadata: async (digest) => { + const { storeUrl, fhrMetadata } = get(); + if (fhrMetadata[digest] !== undefined) return fhrMetadata[digest]; + + const data = await fetchFhrMetadata(storeUrl, digest); + set({ fhrMetadata: { ...get().fhrMetadata, [digest]: data } }); + return data; + }, + + /** Reset store state */ + reset: () => + set({ + storeUrl: null, + metadata: null, + sequenceIndex: null, + sequenceIndexPartial: false, + sequenceIndexTotalSize: 0, + collections: null, + loadedCollections: {}, + aliases: {}, + fhrMetadata: {}, + loading: false, + error: null, + }), +})); diff --git a/frontend/src/stores/similarities.js b/frontend/src/stores/similarities.js index 5c854c2..6c70663 100644 --- a/frontend/src/stores/similarities.js +++ b/frontend/src/stores/similarities.js @@ -7,33 +7,41 @@ export const useSimilaritiesStore = create((set, get) => ({ customCollectionJSON: '', customCount: 1, similarities: null, + error: null, sortBy: null, sortAscending: false, species: 'human', - setSortBy: (value) => set({ sortBy: value 
}), - setSortAscending: (value) => set({ sortAscending: value }), setSpecies: (value) => set({ species: value }), + setError: (value) => set({ error: value }), - sortSimilarities: () => { + resetSort: () => set({ sortBy: null, sortAscending: false }), + + sortByColumn: (column) => { const { similarities, sortBy, sortAscending } = get(); - - if (!similarities || !sortBy) return; - - const sampleValue = similarities.find(item => item[sortBy] != null)?.[sortBy]; - + + const newSortBy = column; + const newSortAscending = sortBy === column ? !sortAscending : false; + + if (!similarities) { + set({ sortBy: newSortBy, sortAscending: newSortAscending }); + return; + } + + const sampleValue = similarities.find(item => item[newSortBy] != null)?.[newSortBy]; + const sorted = [...similarities]; - + if (typeof sampleValue === 'number') { - sorted.sort((a, b) => sortAscending ? a[sortBy] - b[sortBy] : b[sortBy] - a[sortBy]); + sorted.sort((a, b) => newSortAscending ? a[newSortBy] - b[newSortBy] : b[newSortBy] - a[newSortBy]); } else { - sorted.sort((a, b) => sortAscending - ? String(a[sortBy]).localeCompare(String(b[sortBy])) - : String(b[sortBy]).localeCompare(String(a[sortBy])) + sorted.sort((a, b) => newSortAscending + ? String(a[newSortBy]).localeCompare(String(b[newSortBy])) + : String(b[newSortBy]).localeCompare(String(a[newSortBy])) ); } - - set({ similarities: sorted }); + + set({ sortBy: newSortBy, sortAscending: newSortAscending, similarities: sorted }); }, setSelectedCollectionsIndex: (value) => { @@ -68,25 +76,26 @@ export const useSimilaritiesStore = create((set, get) => ({ setSimilarities: (value) => { const { sortBy, sortAscending } = get(); - - if (!sortBy) { + + if (!sortBy || !value) { set({ similarities: value }); return; } const sampleValue = value.find(item => item[sortBy] != null)?.[sortBy]; + const sorted = [...value]; + if (typeof sampleValue === 'number') { - set({ similarities: sortAscending - ? value.sort((a, b) => a[sortBy] - b[sortBy]) - : value.sort((a, b) => b[sortBy] - a[sortBy]) - }); + sorted.sort((a, b) => sortAscending ? a[sortBy] - b[sortBy] : b[sortBy] - a[sortBy]); } else { - set({ similarities: sortAscending - ? value.sort((a, b) => a[sortBy].localeCompare(b[sortBy])) - : value.sort((a, b) => b[sortBy].localeCompare(a[sortBy])) - }); + sorted.sort((a, b) => sortAscending + ? 
String(a[sortBy]).localeCompare(String(b[sortBy])) + : String(b[sortBy]).localeCompare(String(a[sortBy])) + ); } + + set({ similarities: sorted }); }, getAllCollections: (collections) => { diff --git a/frontend/src/stores/unifiedStore.js b/frontend/src/stores/unifiedStore.js new file mode 100644 index 0000000..f964fc2 --- /dev/null +++ b/frontend/src/stores/unifiedStore.js @@ -0,0 +1,67 @@ +import { create } from 'zustand'; +import { API_BASE } from '../utilities.jsx'; +import { fetchStoreMetadata, fetchCollectionIndex } from '../services/storeService.js'; + +export const useUnifiedStore = create((set, get) => ({ + hasStore: false, + hasAPI: false, + storeUrl: null, + apiUrl: API_BASE, + storeMetadata: null, + storeCollections: null, + serviceInfo: null, + probed: false, + loading: false, + + probe: async () => { + if (get().probed) return; + set({ loading: true }); + + let hasAPI = false; + let hasStore = false; + let storeUrl = null; + let storeMetadata = null; + let storeCollections = null; + let serviceInfo = null; + + // First, fetch /service-info to discover the API and store URL + try { + const resp = await fetch(`${API_BASE}/service-info`); + if (resp.ok) { + hasAPI = true; + serviceInfo = await resp.json(); + + // Extract store URL from service-info + const storeConfig = serviceInfo?.seqcol?.refget_store; + if (storeConfig?.enabled && storeConfig?.url) { + const candidateUrl = storeConfig.url; + + // Only probe if it's an HTTP(S) URL (browser can't fetch local paths) + if (/^https?:\/\//i.test(candidateUrl)) { + try { + storeMetadata = await fetchStoreMetadata(candidateUrl); + hasStore = true; + storeUrl = candidateUrl; + storeCollections = await fetchCollectionIndex(candidateUrl).catch(() => null); + } catch { + hasStore = false; + } + } + } + } + } catch { + hasAPI = false; + } + + set({ + hasStore, + hasAPI, + storeUrl, + storeMetadata, + storeCollections, + serviceInfo, + probed: true, + loading: false, + }); + }, +})); diff --git a/frontend/src/utilities.jsx b/frontend/src/utilities.jsx index c5f2ecd..d137471 100644 --- a/frontend/src/utilities.jsx +++ b/frontend/src/utilities.jsx @@ -5,13 +5,29 @@ import copyToClipboardIcon from './assets/copy_to_clipboard.svg'; import barcodeIcon from './assets/barcode.svg'; const copyToClipboard = async (text) => { - toast.success('Digest copied!'); - return await navigator.clipboard.writeText(text); + try { + await navigator.clipboard.writeText(text); + toast.success('Digest copied!'); + } catch (error) { + toast.error('Failed to copy to clipboard'); + } }; const snakeToTitle = (str) => str.replace(/_/g, ' ').replace(/\b\w/g, (char) => char.toUpperCase()); +// Unicode-safe base64 encoding +// Handles all Unicode characters including non-ASCII sequences +const encodeToBase64 = (str) => { + return btoa(unescape(encodeURIComponent(str))); +}; + +// Unicode-safe base64 decoding +// Handles all Unicode characters including non-ASCII sequences +const decodeFromBase64 = (encoded) => { + return decodeURIComponent(escape(atob(encoded))); +}; + const encodeComparison = (input) => { let jsonString; @@ -28,7 +44,7 @@ const encodeComparison = (input) => { throw new Error('Input must be an object or valid JSON string'); } - return btoa(jsonString); + return encodeToBase64(jsonString); }; export { @@ -38,4 +54,6 @@ export { copyToClipboardIcon, snakeToTitle, encodeComparison, + encodeToBase64, + decodeFromBase64, }; diff --git a/pyproject.toml b/pyproject.toml index 458f68e..0ddd2d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,72 @@ 
-[tool.black] +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "refget" +dynamic = ["version"] +description = "GA4GH refget - reference sequence and sequence collection tools" +readme = "README.md" +license = "BSD-2-Clause" +requires-python = ">=3.10" +authors = [ + { name = "Nathan Sheffield", email = "nathan@code.databio.org" }, + { name = "Michal Stolarczyk" }, +] +keywords = ["genome", "assembly", "bioinformatics", "reference", "sequence"] +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] +dependencies = [ + "gtars>=0.8.0", + "jsonschema", + "pyyaml", + "requests", + "sqlmodel", + "tomli_w", + "typer>=0.9.0", +] + +[project.scripts] +refget = "refget.cli:main" + +[project.optional-dependencies] +test = ["pytest", "pytest-cov>=6.0.0", "fastapi", "httpx"] +seqcolapi = [ + "fastapi", + "psycopg2-binary", + "sqlmodel", + "uvicorn>=0.30.0", + "ubiquerg>=0.6.1", +] + +[project.urls] +Homepage = "https://github.com/refgenie/refget" + +[tool.hatch.version] +path = "refget/_version.py" + +[tool.ruff] line-length = 99 -target-version = ['py38', 'py311'] -include = '\.pyi?$' +exclude = [ + "array_overlap.py", + "create_compliance_answers.py", + "data_loaders", + "interactive_tests.py", +] + +[tool.ruff.lint] +select = ["E", "F", "I"] +ignore = ["F403", "F405", "E501"] + +[tool.ruff.lint.isort] +known-first-party = ["refget"] [tool.pytest.ini_options] testpaths = ["tests/local"] diff --git a/refget-r/.Rbuildignore b/refget-r/.Rbuildignore new file mode 100644 index 0000000..3c3871e --- /dev/null +++ b/refget-r/.Rbuildignore @@ -0,0 +1,6 @@ +^\.Rproj\.user$ +^.*\.Rproj$ +^\.git$ +^\.gitignore$ +^README\.Rmd$ +^LICENSE\.md$ diff --git a/refget-r/DESCRIPTION b/refget-r/DESCRIPTION new file mode 100644 index 0000000..f0aed17 --- /dev/null +++ b/refget-r/DESCRIPTION @@ -0,0 +1,35 @@ +Package: BiocRefgetStore +Title: BSgenome-Compatible Interface to RefgetStore +Version: 0.1.0 +Authors@R: + c(person("Nathan", "Sheffield", , "nathan@databio.org", role = c("aut", "cre"), + comment = c(ORCID = "0000-0001-5643-4068")), + person("Sheffield Lab", role = "fnd")) +Description: Provides a BSgenome-compatible interface for accessing genomic + sequences stored in RefgetStore format. Enables chromosome-name-based + sequence access (e.g., getSeq(genome, "chr1", 1000, 2000)) using + RefgetStore as the backend, with optional conversion to Biostrings + DNAString/DNAStringSet objects. 
+License: MIT + file LICENSE +Encoding: UTF-8 +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.3.2 +Depends: + R (>= 4.0), + methods +Imports: + gtars, + GenomeInfoDb +Suggests: + Biostrings, + GenomicRanges, + testthat (>= 3.0.0) +biocViews: Infrastructure, DataRepresentation, Sequencing +Config/testthat/edition: 3 +Collate: + 'utils.R' + 'RefgetGenome-class.R' + 'getSeq-methods.R' + 'conversion.R' + 'bulk-extraction.R' + 'zzz.R' diff --git a/refget-r/LICENSE b/refget-r/LICENSE new file mode 100644 index 0000000..5c6e794 --- /dev/null +++ b/refget-r/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Sheffield Lab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/refget-r/NAMESPACE b/refget-r/NAMESPACE new file mode 100644 index 0000000..0c172ce --- /dev/null +++ b/refget-r/NAMESPACE @@ -0,0 +1,42 @@ +# Generated by roxygen2: do not edit by hand + +# Exports - Constructors +export(RefgetGenome) +export(RefgetGenome.from_directory) +export(RefgetGenome.from_fasta) +export(RefgetGenome.from_remote) + +# Exports - Accessors +export(collection_digest) +export(coordinate_system) +export(sequence_digests) +export(store) + +# Exports - Bulk extraction +export(extractRegions) +export(extractToFasta) +export(exportChromosomes) + +# Exports - Conversion +export(as_DNAString) +export(as_DNAStringSet) + +# S4 methods +exportMethods(getSeq) +exportMethods(seqinfo) +exportMethods(seqnames) +exportMethods(seqlengths) +exportMethods(length) +exportMethods(names) +exportMethods("[[") +exportMethods(show) + +# S4 classes +exportClasses(RefgetGenome) + +# Imports +import(methods) +importFrom(GenomeInfoDb, Seqinfo) +importFrom(GenomeInfoDb, seqinfo) +importFrom(GenomeInfoDb, seqnames) +importFrom(GenomeInfoDb, seqlengths) diff --git a/refget-r/R/RefgetGenome-class.R b/refget-r/R/RefgetGenome-class.R new file mode 100644 index 0000000..2a45560 --- /dev/null +++ b/refget-r/R/RefgetGenome-class.R @@ -0,0 +1,270 @@ +#' RefgetGenome Class +#' +#' A BSgenome-compatible wrapper around a RefgetStore collection that provides +#' chromosome-name-based sequence access. 
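+#'
+#' Objects are normally created with the `RefgetGenome()` constructor or one
+#' of the `RefgetGenome.from_*()` helpers rather than with `new()` directly.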
+#' +#' @slot store A gtars RefgetStore object +#' @slot collection_digest Character string containing the seqcol digest +#' @slot seqinfo A Seqinfo object with sequence metadata +#' +#' @exportClass RefgetGenome +setClass( + "RefgetGenome", + slots = list( + store = "ANY", # gtars RefgetStore + collection_digest = "character", + seqinfo = "ANY" # GenomeInfoDb::Seqinfo + ) +) + +#' Create a RefgetGenome object +#' +#' Creates a BSgenome-compatible genome object backed by a RefgetStore. +#' +#' @param store A gtars RefgetStore object +#' @param digest Optional. The collection digest to use. +#' @param namespace Optional. Alias namespace (e.g., "refseq", "genbank") +#' @param alias Optional. Alias name (e.g., "hg38", "GRCh38") +#' +#' @return A RefgetGenome object +#' +#' @details +#' You must provide either `digest` or both `namespace` and `alias`. +#' If using aliases, the function will resolve the alias to a digest using +#' the store's alias system. +#' +#' @examples +#' \dontrun{ +#' # Load by digest +#' store <- gtars::refget_store_open_local("/path/to/store") +#' genome <- RefgetGenome(store, digest = "abc123...") +#' +#' # Load by alias +#' genome <- RefgetGenome(store, namespace = "refseq", alias = "GRCh38") +#' } +#' +#' @export +RefgetGenome <- function(store, digest = NULL, namespace = NULL, alias = NULL) { + # Validate inputs + if (is.null(digest) && (is.null(namespace) || is.null(alias))) { + stop("Must provide either 'digest' or both 'namespace' and 'alias'") + } + + # Resolve alias to digest if needed + if (is.null(digest)) { + collection_meta <- gtars::get_collection_by_alias(store, namespace, alias) + if (is.null(collection_meta)) { + stop(sprintf("No collection found for alias '%s/%s'", namespace, alias)) + } + digest <- collection_meta@digest + } + + # Get level2 data for building Seqinfo + level2 <- gtars::get_level2(store, digest) + if (is.null(level2)) { + stop(sprintf("Collection '%s' not found in store", digest)) + } + + # Build Seqinfo from level2 data + seqinfo <- GenomeInfoDb::Seqinfo( + seqnames = level2$names, + seqlengths = as.integer(level2$lengths) + ) + + new("RefgetGenome", + store = store, + collection_digest = digest, + seqinfo = seqinfo) +} + +#' Create RefgetGenome from a directory +#' +#' Convenience constructor that loads a RefgetStore from a directory. +#' +#' @param path Path to the RefgetStore directory +#' @param digest The collection digest to use +#' @param namespace Optional alias namespace +#' @param alias Optional alias name +#' +#' @return A RefgetGenome object +#' +#' @examples +#' \dontrun{ +#' genome <- RefgetGenome.from_directory("/path/to/store", digest = "abc123...") +#' } +#' +#' @export +RefgetGenome.from_directory <- function(path, digest = NULL, namespace = NULL, alias = NULL) { + store <- gtars::refget_store_open_local(path) + RefgetGenome(store, digest = digest, namespace = namespace, alias = alias) +} + +#' Create RefgetGenome from a FASTA file +#' +#' Creates an in-memory RefgetStore from a FASTA file and returns a RefgetGenome. +#' +#' @param fasta_path Path to a FASTA file +#' +#' @return A RefgetGenome object +#' +#' @examples +#' \dontrun{ +#' genome <- RefgetGenome.from_fasta("/path/to/genome.fa") +#' } +#' +#' @export +RefgetGenome.from_fasta <- function(fasta_path) { + store <- gtars::refget_store() + result <- gtars::add_fasta(store, fasta_path) + RefgetGenome(store, digest = result$digest) +} + +#' Create RefgetGenome from a remote store +#' +#' Creates a RefgetGenome backed by a remote RefgetStore with local caching. 
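+#'
+#' Downloaded data is cached under `cache_path`, so repeated access to the
+#' same sequences can be served from the local cache rather than the network.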
+#' +#' @param cache_path Local directory for caching downloaded data +#' @param remote_url URL of the remote RefgetStore +#' @param digest The collection digest to use +#' @param namespace Optional alias namespace +#' @param alias Optional alias name +#' +#' @return A RefgetGenome object +#' +#' @examples +#' \dontrun{ +#' genome <- RefgetGenome.from_remote( +#' cache_path = "~/.cache/refget", +#' remote_url = "https://refget.databio.org/store", +#' namespace = "refseq", +#' alias = "GRCh38" +#' ) +#' } +#' +#' @export +RefgetGenome.from_remote <- function(cache_path, remote_url, digest = NULL, namespace = NULL, alias = NULL) { + store <- gtars::refget_store_open_remote(cache_path, remote_url) + RefgetGenome(store, digest = digest, namespace = namespace, alias = alias) +} + +# ============================================================================= +# Accessor Methods +# ============================================================================= + +#' Get the collection digest +#' +#' @param genome A RefgetGenome object +#' @return The seqcol digest string +#' @export +collection_digest <- function(genome) { + genome@collection_digest +} + +#' Get the coordinate system digest +#' +#' Returns the sorted_name_length_pairs digest which identifies the coordinate +#' system. Two genomes with the same coordinate_system() are compatible for +#' coordinate-based operations. +#' +#' @param genome A RefgetGenome object +#' @return The sorted_name_length_pairs digest string +#' @export +coordinate_system <- function(genome) { + meta <- gtars::get_collection_metadata(genome@store, genome@collection_digest) + meta@sorted_name_length_pairs_digest +} + +#' Get the underlying RefgetStore +#' +#' @param genome A RefgetGenome object +#' @return The gtars RefgetStore object +#' @export +store <- function(genome) { + genome@store +} + +#' Get per-sequence SHA512t24u digests +#' +#' @param genome A RefgetGenome object +#' @return Named character vector of sequence digests +#' @export +sequence_digests <- function(genome) { + level2 <- gtars::get_level2(genome@store, genome@collection_digest) + digests <- level2$sequences + names(digests) <- level2$names + digests +} + +# Standard BSgenome-like accessors via S4 methods + +#' @rdname RefgetGenome-class +#' @export +setMethod("seqinfo", "RefgetGenome", function(x) { + x@seqinfo +}) + +#' @rdname RefgetGenome-class +#' @export +setMethod("seqnames", "RefgetGenome", function(x) { + GenomeInfoDb::seqnames(x@seqinfo) +}) + +#' @rdname RefgetGenome-class +#' @export +setMethod("seqlengths", "RefgetGenome", function(x) { + GenomeInfoDb::seqlengths(x@seqinfo) +}) + +#' @rdname RefgetGenome-class +#' @export +setMethod("length", "RefgetGenome", function(x) { + length(GenomeInfoDb::seqnames(x@seqinfo)) +}) + +#' @rdname RefgetGenome-class +#' @export +setMethod("names", "RefgetGenome", function(x) { + as.character(GenomeInfoDb::seqnames(x@seqinfo)) +}) + +#' Extract a full sequence by name +#' +#' @param x A RefgetGenome object +#' @param i Sequence name (e.g., "chr1") +#' @return Sequence string or DNAString if Biostrings is available +#' @rdname RefgetGenome-class +#' @export +setMethod("[[", c("RefgetGenome", "character"), function(x, i) { + record <- gtars::get_sequence_by_name(x@store, x@collection_digest, i) + if (is.null(record)) { + stop(sprintf("Sequence '%s' not found in collection", i)) + } + seq_string <- record@data + + # Convert to DNAString if Biostrings is available + if (requireNamespace("Biostrings", quietly = TRUE)) { + 
return(Biostrings::DNAString(seq_string)) + } + seq_string +}) + +#' Show method for RefgetGenome +#' +#' @param object A RefgetGenome object +#' @rdname RefgetGenome-class +#' @export +setMethod("show", "RefgetGenome", function(object) { + n_seqs <- length(object) + cat(sprintf("RefgetGenome with %d sequences\n", n_seqs)) + cat(sprintf(" collection_digest: %s\n", object@collection_digest)) + + # Show first few sequence names + seq_names <- names(object) + if (length(seq_names) > 5) { + cat(sprintf(" seqnames: %s ... (%d more)\n", + paste(seq_names[1:5], collapse = ", "), + length(seq_names) - 5)) + } else { + cat(sprintf(" seqnames: %s\n", paste(seq_names, collapse = ", "))) + } +}) diff --git a/refget-r/R/bulk-extraction.R b/refget-r/R/bulk-extraction.R new file mode 100644 index 0000000..7186414 --- /dev/null +++ b/refget-r/R/bulk-extraction.R @@ -0,0 +1,181 @@ +#' Extract regions from a RefgetGenome +#' +#' Efficiently extract multiple genomic regions using BED-based extraction. +#' +#' @param genome A RefgetGenome object +#' @param regions A GRanges object or data.frame with columns: chrom, start, end +#' @param as.character If TRUE, return character vector instead of DNAStringSet +#' +#' @return DNAStringSet or character vector of extracted sequences +#' +#' @examples +#' \dontrun{ +#' genome <- RefgetGenome.from_fasta("genome.fa") +#' +#' # From GRanges +#' library(GenomicRanges) +#' regions <- GRanges(c("chr1:1000-2000", "chr2:5000-6000")) +#' seqs <- extractRegions(genome, regions) +#' +#' # From data.frame +#' regions_df <- data.frame( +#' chrom = c("chr1", "chr2"), +#' start = c(1000, 5000), +#' end = c(2000, 6000) +#' ) +#' seqs <- extractRegions(genome, regions_df) +#' } +#' +#' @export +extractRegions <- function(genome, regions, as.character = FALSE) { + # Convert GRanges to data.frame + if (inherits(regions, "GRanges")) { + if (!requireNamespace("GenomicRanges", quietly = TRUE)) { + stop("GenomicRanges package required for GRanges input") + } + regions <- data.frame( + chrom = as.character(GenomicRanges::seqnames(regions)), + start = GenomicRanges::start(regions), + end = GenomicRanges::end(regions), + stringsAsFactors = FALSE + ) + } + + # Validate columns + required_cols <- c("chrom", "start", "end") + if (!all(required_cols %in% names(regions))) { + stop("regions must have columns: chrom, start, end") + } + + # Write temp BED file (0-based coordinates for BED) + bed_file <- tempfile(fileext = ".bed") + on.exit(unlink(bed_file), add = TRUE) + + bed_df <- data.frame( + chrom = regions$chrom, + start = as.integer(regions$start - 1), # Convert to 0-based + end = as.integer(regions$end) + ) + write.table(bed_df, bed_file, sep = "\t", row.names = FALSE, + col.names = FALSE, quote = FALSE) + + # Use gtars BED extraction + retrieved <- gtars::get_seqs_bed_file_to_vec( + genome@store, + genome@collection_digest, + bed_file + ) + + # Extract sequence strings + seqs <- vapply(retrieved, function(r) r@sequence, character(1)) + + # Name by region + result_names <- sprintf("%s:%d-%d", regions$chrom, regions$start, regions$end) + names(seqs) <- result_names + + # Convert to DNAStringSet if requested + if (!as.character && requireNamespace("Biostrings", quietly = TRUE)) { + return(Biostrings::DNAStringSet(seqs)) + } + + seqs +} + +#' Extract regions to a FASTA file +#' +#' Write extracted sequences directly to a FASTA file. 
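+#'
+#' Unlike [extractRegions()], the sequences are written straight to
+#' `output_path` by gtars and are not returned to R, so large region sets can
+#' be extracted without holding the sequences in memory.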
+#' +#' @param genome A RefgetGenome object +#' @param regions A GRanges object or data.frame with columns: chrom, start, end +#' @param output_path Path for output FASTA file +#' +#' @return Invisibly returns the output path +#' +#' @examples +#' \dontrun{ +#' genome <- RefgetGenome.from_fasta("genome.fa") +#' regions <- data.frame( +#' chrom = c("chr1", "chr2"), +#' start = c(1000, 5000), +#' end = c(2000, 6000) +#' ) +#' extractToFasta(genome, regions, "extracted.fa") +#' } +#' +#' @export +extractToFasta <- function(genome, regions, output_path) { + # Convert GRanges to data.frame + if (inherits(regions, "GRanges")) { + if (!requireNamespace("GenomicRanges", quietly = TRUE)) { + stop("GenomicRanges package required for GRanges input") + } + regions <- data.frame( + chrom = as.character(GenomicRanges::seqnames(regions)), + start = GenomicRanges::start(regions), + end = GenomicRanges::end(regions), + stringsAsFactors = FALSE + ) + } + + # Validate columns + required_cols <- c("chrom", "start", "end") + if (!all(required_cols %in% names(regions))) { + stop("regions must have columns: chrom, start, end") + } + + # Write temp BED file (0-based coordinates for BED) + bed_file <- tempfile(fileext = ".bed") + on.exit(unlink(bed_file), add = TRUE) + + bed_df <- data.frame( + chrom = regions$chrom, + start = as.integer(regions$start - 1), # Convert to 0-based + end = as.integer(regions$end) + ) + write.table(bed_df, bed_file, sep = "\t", row.names = FALSE, + col.names = FALSE, quote = FALSE) + + # Use gtars BED extraction to FASTA + gtars::get_seqs_bed_file( + genome@store, + genome@collection_digest, + bed_file, + output_path + ) + + invisible(output_path) +} + +#' Export specific chromosomes to FASTA +#' +#' Export one or more complete chromosomes to a FASTA file. +#' +#' @param genome A RefgetGenome object +#' @param names Character vector of chromosome names to export (NULL = all) +#' @param output_path Path for output FASTA file +#' @param line_width Number of bases per line in output (default: 80) +#' +#' @return Invisibly returns the output path +#' +#' @examples +#' \dontrun{ +#' genome <- RefgetGenome.from_fasta("genome.fa") +#' +#' # Export specific chromosomes +#' exportChromosomes(genome, c("chr1", "chr2"), "subset.fa") +#' +#' # Export all chromosomes +#' exportChromosomes(genome, NULL, "all.fa") +#' } +#' +#' @export +exportChromosomes <- function(genome, names = NULL, output_path, line_width = 80L) { + gtars::export_fasta( + genome@store, + genome@collection_digest, + output_path, + sequence_names = names, + line_width = as.integer(line_width) + ) + invisible(output_path) +} diff --git a/refget-r/R/conversion.R b/refget-r/R/conversion.R new file mode 100644 index 0000000..7a279ed --- /dev/null +++ b/refget-r/R/conversion.R @@ -0,0 +1,42 @@ +#' Convert sequence to DNAString +#' +#' @param seq_string Character string containing a DNA sequence +#' @return A Biostrings DNAString object +#' +#' @examples +#' \dontrun{ +#' dna <- as_DNAString("ACGTACGT") +#' } +#' +#' @export +as_DNAString <- function(seq_string) { + if (!requireNamespace("Biostrings", quietly = TRUE)) { + stop("Biostrings package required for DNAString conversion. 
", + "Install with: BiocManager::install('Biostrings')") + } + Biostrings::DNAString(seq_string) +} + +#' Convert sequences to DNAStringSet +#' +#' @param seq_strings Character vector of DNA sequences +#' @param names Optional names for the sequences +#' @return A Biostrings DNAStringSet object +#' +#' @examples +#' \dontrun{ +#' seqs <- as_DNAStringSet(c("ACGT", "GGCC"), names = c("seq1", "seq2")) +#' } +#' +#' @export +as_DNAStringSet <- function(seq_strings, names = NULL) { + if (!requireNamespace("Biostrings", quietly = TRUE)) { + stop("Biostrings package required for DNAStringSet conversion. ", + "Install with: BiocManager::install('Biostrings')") + } + result <- Biostrings::DNAStringSet(seq_strings) + if (!is.null(names)) { + names(result) <- names + } + result +} diff --git a/refget-r/R/getSeq-methods.R b/refget-r/R/getSeq-methods.R new file mode 100644 index 0000000..0cab648 --- /dev/null +++ b/refget-r/R/getSeq-methods.R @@ -0,0 +1,178 @@ +# Define getSeq generic if not already available from Biostrings/BSgenome +# This allows the package to work without Biostrings installed +if (!isGeneric("getSeq")) { + setGeneric("getSeq", function(x, ...) standardGeneric("getSeq")) +} + +#' getSeq method for RefgetGenome +#' +#' Extract sequences from a RefgetGenome object using BSgenome-compatible syntax. +#' +#' @param x A RefgetGenome object +#' @param names Sequence names (character vector) or a GRanges object +#' @param start Start positions (integer vector, 1-based) +#' @param end End positions (integer vector, 1-based) +#' @param strand Strand ("+" or "-"). Default is "+". +#' @param as.character If TRUE, return character strings instead of DNAString/DNAStringSet +#' @param ... Additional arguments (ignored) +#' +#' @return +#' - Single sequence: DNAString (or character if as.character=TRUE or Biostrings unavailable) +#' - Multiple sequences: DNAStringSet (or character vector) +#' +#' @examples +#' \dontrun{ +#' genome <- RefgetGenome.from_fasta("genome.fa") +#' +#' # Full chromosome +#' seq <- getSeq(genome, "chr1") +#' +#' # Region by coordinates +#' seq <- getSeq(genome, "chr1", 1000, 2000) +#' +#' # Multiple regions +#' seqs <- getSeq(genome, c("chr1", "chr2"), c(1000, 5000), c(2000, 6000)) +#' +#' # From GRanges (requires GenomicRanges) +#' library(GenomicRanges) +#' gr <- GRanges(c("chr1:1000-2000", "chr2:5000-6000:-")) +#' seqs <- getSeq(genome, gr) +#' } +#' +#' @rdname getSeq-methods +#' @export +setMethod("getSeq", "RefgetGenome", + function(x, names, start = NA, end = NA, strand = "+", as.character = FALSE, ...) 
{ + # Handle GRanges input + if (inherits(names, "GRanges")) { + return(.getSeq_GRanges(x, names, as.character = as.character)) + } + + # Ensure names is character + names <- as.character(names) + + # Handle single sequence (full or region) + if (length(names) == 1 && length(start) <= 1 && length(end) <= 1) { + return(.getSeq_single(x, names, start, end, strand, as.character)) + } + + # Vectorized extraction + .getSeq_vectorized(x, names, start, end, strand, as.character) + } +) + +# ----------------------------------------------------------------------------- +# Internal helpers +# ----------------------------------------------------------------------------- + +#' Extract a single sequence or region +#' @keywords internal +.getSeq_single <- function(genome, name, start, end, strand, as.character) { + # Get full sequence via get_sequence_by_name + record <- gtars::get_sequence_by_name(genome@store, genome@collection_digest, name) + if (is.null(record)) { + stop(sprintf("Sequence '%s' not found in collection", name)) + } + + seq_string <- record@data + + # Extract substring if coordinates provided + if (!is.na(start) && !is.na(end)) { + # R uses 1-based indexing + if (start < 1 || end > nchar(seq_string)) { + stop(sprintf("Coordinates [%d, %d] out of range for sequence '%s' (length %d)", + start, end, name, nchar(seq_string))) + } + seq_string <- substr(seq_string, start, end) + } + + # Handle negative strand + if (identical(strand, "-")) { + seq_string <- .reverse_complement(seq_string) + } + + # Convert to DNAString if requested and Biostrings available + if (!as.character && requireNamespace("Biostrings", quietly = TRUE)) { + return(Biostrings::DNAString(seq_string)) + } + + seq_string +} + +#' Vectorized sequence extraction +#' @keywords internal +.getSeq_vectorized <- function(genome, names, start, end, strand, as.character) { + n <- length(names) + + # Recycle start/end/strand to match names length + if (length(start) == 1) start <- rep(start, n) + if (length(end) == 1) end <- rep(end, n) + if (length(strand) == 1) strand <- rep(strand, n) + + if (length(start) != n || length(end) != n || length(strand) != n) { + stop("Length mismatch: names, start, end, and strand must have compatible lengths") + } + + # Use bulk BED extraction if all regions have coordinates + if (!any(is.na(start)) && !any(is.na(end))) { + # Write temp BED file (convert 1-based closed to 0-based half-open) + bed_file <- tempfile(fileext = ".bed") + on.exit(unlink(bed_file), add = TRUE) + + bed_df <- data.frame( + chrom = names, + start = as.integer(start - 1L), + end = as.integer(end) + ) + write.table(bed_df, bed_file, sep = "\t", row.names = FALSE, + col.names = FALSE, quote = FALSE) + + # Single Rust FFI call for all regions + retrieved <- gtars::get_seqs_bed_file_to_vec( + genome@store, genome@collection_digest, bed_file + ) + + seqs <- vapply(retrieved, function(r) r@sequence, character(1)) + + # Handle negative strand + minus_idx <- which(strand == "-") + if (length(minus_idx) > 0) { + seqs[minus_idx] <- vapply(seqs[minus_idx], .reverse_complement, character(1)) + } + + result_names <- sprintf("%s:%d-%d", names, start, end) + } else { + # Fallback: full chromosome extraction (no coordinates) + seqs <- vapply(seq_len(n), function(i) { + .getSeq_single(genome, names[i], start[i], end[i], strand[i], as.character = TRUE) + }, character(1)) + + result_names <- names + } + + names(seqs) <- result_names + + # Convert to DNAStringSet if requested + if (!as.character && requireNamespace("Biostrings", quietly = TRUE)) 
{
+    return(Biostrings::DNAStringSet(seqs))
+  }
+
+  seqs
+}
+
+#' Extract sequences from GRanges
+#' @keywords internal
+.getSeq_GRanges <- function(genome, gr, as.character) {
+  if (!requireNamespace("GenomicRanges", quietly = TRUE)) {
+    stop("GenomicRanges package required for GRanges input")
+  }
+
+  # Extract components from GRanges
+  names <- as.character(GenomicRanges::seqnames(gr))
+  start <- GenomicRanges::start(gr)
+  end <- GenomicRanges::end(gr)
+  strand <- as.character(GenomicRanges::strand(gr))
+  strand[strand == "*"] <- "+"  # Treat unstranded as +
+
+  .getSeq_vectorized(genome, names, start, end, strand, as.character)
+}
diff --git a/refget-r/R/utils.R b/refget-r/R/utils.R
new file mode 100644
index 0000000..bd4de76
--- /dev/null
+++ b/refget-r/R/utils.R
@@ -0,0 +1,29 @@
+#' Reverse complement a DNA sequence
+#'
+#' @param seq DNA sequence string
+#' @return Reverse complement string
+#' @keywords internal
+.reverse_complement <- function(seq) {
+  # Use Biostrings if available (faster and handles IUPAC codes)
+  if (requireNamespace("Biostrings", quietly = TRUE)) {
+    rc <- Biostrings::reverseComplement(Biostrings::DNAString(seq))
+    return(as.character(rc))
+  }
+
+  # Pure R fallback for basic ACGT
+  complement_map <- c(
+    A = "T", T = "A", G = "C", C = "G",
+    a = "t", t = "a", g = "c", c = "g",
+    N = "N", n = "n"
+  )
+
+  chars <- strsplit(seq, "")[[1]]
+  complemented <- complement_map[chars]
+  # Handle unknown characters by keeping them
+  complemented[is.na(complemented)] <- chars[is.na(complemented)]
+  paste(rev(complemented), collapse = "")
+}
+
+#' Null coalescing operator
+#' @keywords internal
+`%||%` <- function(x, y) if (is.null(x)) y else x
diff --git a/refget-r/R/zzz.R b/refget-r/R/zzz.R
new file mode 100644
index 0000000..c322b4c
--- /dev/null
+++ b/refget-r/R/zzz.R
@@ -0,0 +1,13 @@
+#' @import methods
+#' @importFrom GenomeInfoDb Seqinfo seqnames seqlengths
+NULL
+
+.onAttach <- function(libname, pkgname) {
+  # Check that gtars is available; startup messages belong in .onAttach
+  if (!requireNamespace("gtars", quietly = TRUE)) {
+    packageStartupMessage(
+      "Note: gtars package is required but not available. ",
+      "Install from: https://github.com/databio/gtars"
+    )
+  }
+}
diff --git a/refget-r/README.md b/refget-r/README.md
new file mode 100644
index 0000000..64ce684
--- /dev/null
+++ b/refget-r/README.md
@@ -0,0 +1,120 @@
+# BiocRefgetStore
+
+BSgenome-compatible interface to RefgetStore for R/Bioconductor.
+
+## Overview
+
+BiocRefgetStore provides a bridge between the gtars RefgetStore format and Bioconductor's BSgenome API. This allows you to use RefgetStore-backed genomes with the familiar `getSeq()` interface that Bioconductor users expect.
+
+## Installation
+
+```r
+# Install gtars first (required)
+# See: https://github.com/databio/gtars
+
+# Install BiocRefgetStore
+devtools::install_local("path/to/refget-r")
+```
+
+## Quick Start
+
+```r
+library(BiocRefgetStore)
+
+# Create a genome from a FASTA file
+genome <- RefgetGenome.from_fasta("genome.fa")
+
+# BSgenome-compatible access
+seq <- getSeq(genome, "chr1", 1000, 2000)  # Returns DNAString
+seqs <- getSeq(genome, c("chr1", "chr2"))  # Returns DNAStringSet
+genome[["chr1"]]                           # Full chromosome
+
+# Standard accessors
+seqnames(genome)   # c("chr1", "chr2", ...)
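+names(genome)      # Same names, as a plain character vector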
+seqlengths(genome) # Named integer vector +seqinfo(genome) # Seqinfo object +``` + +## Loading Genomes + +```r +# From FASTA file (creates in-memory store) +genome <- RefgetGenome.from_fasta("/path/to/genome.fa") + +# From persisted RefgetStore directory +genome <- RefgetGenome.from_directory("/path/to/store", digest = "abc123...") + +# From RefgetStore with alias +store <- gtars::refget_store_open_local("/path/to/store") +genome <- RefgetGenome(store, namespace = "refseq", alias = "GRCh38") + +# From remote store (cloud-backed with local caching) +genome <- RefgetGenome.from_remote( + cache_path = "~/.cache/refget", + remote_url = "https://refget.databio.org/store", + namespace = "refseq", + alias = "GRCh38" +) +``` + +## Sequence Extraction + +```r +# Single region +seq <- getSeq(genome, "chr1", 1000, 2000) + +# Multiple regions +seqs <- getSeq(genome, + names = c("chr1", "chr2"), + start = c(1000, 5000), + end = c(2000, 6000)) + +# From GRanges (requires GenomicRanges) +library(GenomicRanges) +gr <- GRanges(c("chr1:1000-2000", "chr2:5000-6000:-")) +seqs <- getSeq(genome, gr) + +# Strand-aware extraction +seq <- getSeq(genome, "chr1", 1000, 2000, strand = "-") # Reverse complement +``` + +## Bulk Extraction + +```r +# Extract multiple regions efficiently +regions <- data.frame( + chrom = c("chr1", "chr2", "chr3"), + start = c(1000, 5000, 10000), + end = c(2000, 6000, 11000) +) +seqs <- extractRegions(genome, regions) + +# Write regions to FASTA +extractToFasta(genome, regions, "extracted.fa") + +# Export specific chromosomes +exportChromosomes(genome, c("chr1", "chr2"), "subset.fa") +``` + +## RefgetStore-Specific Features + +```r +# Get the seqcol digest +collection_digest(genome) + +# Get coordinate system (for compatibility checking) +coordinate_system(genome) + +# Get per-sequence digests +sequence_digests(genome) + +# Access underlying RefgetStore +store <- store(genome) +``` + +## Dependencies + +- **Required**: gtars (for RefgetStore), GenomeInfoDb (for Seqinfo) +- **Optional**: Biostrings (for DNAString/DNAStringSet), GenomicRanges (for GRanges support) + +Without Biostrings, sequences are returned as character strings. 
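+
+## Checking Coordinate Compatibility
+
+`coordinate_system()` returns the sorted_name_length_pairs digest, so two
+genomes share a coordinate system exactly when the digests match. A minimal
+sketch (the store path, URL, and digest below are placeholders):
+
+```r
+local_genome <- RefgetGenome.from_directory("/path/to/store", digest = "abc123...")
+remote_genome <- RefgetGenome.from_remote(
+  cache_path = "~/.cache/refget",
+  remote_url = "https://refget.databio.org/store",
+  digest = "abc123..."
+)
+
+# TRUE when ranges from one genome are valid in the other
+identical(coordinate_system(local_genome), coordinate_system(remote_genome))
+```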
diff --git a/refget-r/install_and_test.sh b/refget-r/install_and_test.sh new file mode 100755 index 0000000..a3e3179 --- /dev/null +++ b/refget-r/install_and_test.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Install and test BiocRefgetStore +# Usage: bash install_and_test.sh [install|test|both] +# Default: both + +set -e + +PKG_DIR="$(cd "$(dirname "$0")" && pwd)" +ACTION="${1:-both}" + +R_CMD="bulker exec databio/nsheff -- R" +RSCRIPT_CMD="bulker exec databio/nsheff -- Rscript" + +install_pkg() { + echo "=== Installing BiocRefgetStore ===" + $R_CMD CMD INSTALL --no-multiarch "$PKG_DIR" + echo "=== Installation complete ===" +} + +run_tests() { + echo "=== Running tests ===" + $RSCRIPT_CMD -e "testthat::test_local('$PKG_DIR')" + echo "=== Tests complete ===" +} + +case "$ACTION" in + install) + install_pkg + ;; + test) + run_tests + ;; + both) + install_pkg + run_tests + ;; + *) + echo "Usage: bash install_and_test.sh [install|test|both]" + exit 1 + ;; +esac diff --git a/refget-r/tests/testthat.R b/refget-r/tests/testthat.R new file mode 100644 index 0000000..3259445 --- /dev/null +++ b/refget-r/tests/testthat.R @@ -0,0 +1,5 @@ +# This file is part of the standard testthat pattern +library(testthat) +library(BiocRefgetStore) + +test_check("BiocRefgetStore") diff --git a/refget-r/tests/testthat/test-RefgetGenome.R b/refget-r/tests/testthat/test-RefgetGenome.R new file mode 100644 index 0000000..aceeb6c --- /dev/null +++ b/refget-r/tests/testthat/test-RefgetGenome.R @@ -0,0 +1,124 @@ +# Test RefgetGenome class construction and accessors + +test_that("RefgetGenome can be created from FASTA", { + # Skip if gtars not available + skip_if_not_installed("gtars") + + # Create test FASTA + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT", + ">chr2", + "GGCCGGCCGGCCGGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + # Create genome + genome <- RefgetGenome.from_fasta(fasta_file) + + # Test basic properties + expect_s4_class(genome, "RefgetGenome") + expect_equal(length(genome), 2) + expect_equal(sort(names(genome)), c("chr1", "chr2")) +}) + +test_that("RefgetGenome accessors work", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT", + ">chr2", + "GGCCGGCCGGCCGGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # Test seqlengths + lens <- seqlengths(genome) + expect_equal(lens[["chr1"]], 20) + expect_equal(lens[["chr2"]], 16) + + # Test collection_digest + digest <- collection_digest(genome) + expect_type(digest, "character") + expect_true(nchar(digest) > 0) + + # Test sequence_digests + seq_digests <- sequence_digests(genome) + expect_named(seq_digests) + expect_true(all(c("chr1", "chr2") %in% names(seq_digests))) +}) + +test_that("RefgetGenome [[ extraction works", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # Extract full sequence + seq <- genome[["chr1"]] + + # Should be character or DNAString depending on Biostrings availability + if (requireNamespace("Biostrings", quietly = TRUE)) { + expect_s4_class(seq, "DNAString") + expect_equal(as.character(seq), "ACGTACGTACGTACGTACGT") + } else { + expect_type(seq, "character") + expect_equal(seq, "ACGTACGTACGTACGTACGT") + } +}) + +test_that("RefgetGenome errors on missing sequence", { + 
skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGT" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + expect_error(genome[["chrX"]], "not found") +}) + +test_that("RefgetGenome from_directory works", { + skip_if_not_installed("gtars") + + # Create a store on disk + store_dir <- tempfile() + dir.create(store_dir) + on.exit(unlink(store_dir, recursive = TRUE)) + + store <- gtars::refget_store_on_disk(store_dir) + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">seq1", "ACGT"), fasta_file) + on.exit(unlink(fasta_file), add = TRUE) + + result <- gtars::add_fasta(store, fasta_file) + + # Load from directory + genome <- RefgetGenome.from_directory(store_dir, digest = result$digest) + expect_s4_class(genome, "RefgetGenome") + expect_equal(names(genome), "seq1") +}) + +test_that("RefgetGenome constructor requires digest or alias", { + skip_if_not_installed("gtars") + + store <- gtars::refget_store() + + expect_error(RefgetGenome(store), "Must provide either") +}) diff --git a/refget-r/tests/testthat/test-bulk.R b/refget-r/tests/testthat/test-bulk.R new file mode 100644 index 0000000..056d70a --- /dev/null +++ b/refget-r/tests/testthat/test-bulk.R @@ -0,0 +1,134 @@ +# Test bulk extraction functions + +test_that("extractRegions works with data.frame", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT", + ">chr2", + "GGCCGGCCGGCCGGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + regions <- data.frame( + chrom = c("chr1", "chr2"), + start = c(1, 1), + end = c(4, 4) + ) + + seqs <- extractRegions(genome, regions, as.character = TRUE) + + expect_length(seqs, 2) + expect_equal(seqs[[1]], "ACGT") + expect_equal(seqs[[2]], "GGCC") +}) + +test_that("extractRegions works with GRanges", { + skip_if_not_installed("gtars") + skip_if_not_installed("GenomicRanges") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + gr <- GenomicRanges::GRanges(c("chr1:1-4", "chr1:5-8")) + + seqs <- extractRegions(genome, gr, as.character = TRUE) + + expect_length(seqs, 2) + expect_equal(seqs[[1]], "ACGT") + expect_equal(seqs[[2]], "ACGT") +}) + +test_that("extractToFasta writes correct output", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + regions <- data.frame( + chrom = c("chr1"), + start = c(1), + end = c(8) + ) + + output_file <- tempfile(fileext = ".fa") + on.exit(unlink(output_file), add = TRUE) + + result <- extractToFasta(genome, regions, output_file) + + expect_equal(result, output_file) + expect_true(file.exists(output_file)) + + # Check content + content <- readLines(output_file) + expect_true(length(content) > 0) +}) + +test_that("exportChromosomes works", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT", + ">chr2", + "GGCCGGCCGGCCGGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + output_file <- tempfile(fileext = ".fa") + on.exit(unlink(output_file), add = TRUE) + + # Export just chr1 + result <- 
exportChromosomes(genome, "chr1", output_file) + + expect_equal(result, output_file) + expect_true(file.exists(output_file)) + + # Read and verify + content <- readLines(output_file) + expect_true(any(grepl(">chr1", content))) + expect_false(any(grepl(">chr2", content))) +}) + +test_that("exportChromosomes exports all when names is NULL", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGT", + ">chr2", + "GGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + output_file <- tempfile(fileext = ".fa") + on.exit(unlink(output_file), add = TRUE) + + exportChromosomes(genome, NULL, output_file) + + content <- readLines(output_file) + expect_true(any(grepl(">chr1", content))) + expect_true(any(grepl(">chr2", content))) +}) diff --git a/refget-r/tests/testthat/test-constructors.R b/refget-r/tests/testthat/test-constructors.R new file mode 100644 index 0000000..227f02b --- /dev/null +++ b/refget-r/tests/testthat/test-constructors.R @@ -0,0 +1,100 @@ +# Test constructor edge cases + +test_that("RefgetGenome with invalid digest errors", { + skip_if_not_installed("gtars") + + store <- gtars::refget_store() + + # A digest that doesn't exist in the store + expect_error( + RefgetGenome(store, digest = "nonexistent_digest_abc123"), + "not found" + ) +}) + +test_that("RefgetGenome.from_fasta with nonexistent file errors", { + skip_if_not_installed("gtars") + + expect_error( + RefgetGenome.from_fasta("/tmp/does_not_exist_xyz.fa") + ) +}) + +test_that("RefgetGenome.from_directory with nonexistent path errors", { + skip_if_not_installed("gtars") + + expect_error( + RefgetGenome.from_directory("/tmp/no_such_store_dir_xyz", digest = "abc") + ) +}) + +test_that("RefgetGenome with only namespace (missing alias) errors", { + skip_if_not_installed("gtars") + + store <- gtars::refget_store() + + expect_error( + RefgetGenome(store, namespace = "refseq"), + "Must provide either" + ) +}) + +test_that("RefgetGenome with only alias (missing namespace) errors", { + skip_if_not_installed("gtars") + + store <- gtars::refget_store() + + expect_error( + RefgetGenome(store, alias = "hg38"), + "Must provide either" + ) +}) + +test_that("RefgetGenome with neither digest nor alias errors", { + skip_if_not_installed("gtars") + + store <- gtars::refget_store() + + expect_error( + RefgetGenome(store), + "Must provide either" + ) +}) + +test_that("RefgetGenome.from_fasta returns correct class", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">seq1", "ACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + expect_s4_class(genome, "RefgetGenome") + + # Verify the digest was set + d <- collection_digest(genome) + expect_type(d, "character") + expect_true(nchar(d) > 0) +}) + +test_that("RefgetGenome.from_directory roundtrip works", { + skip_if_not_installed("gtars") + + # Create on-disk store and add FASTA + store_dir <- tempfile() + dir.create(store_dir) + on.exit(unlink(store_dir, recursive = TRUE)) + + store <- gtars::refget_store_on_disk(store_dir) + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "AAAA", ">chr2", "CCCC"), fasta_file) + on.exit(unlink(fasta_file), add = TRUE) + + result <- gtars::add_fasta(store, fasta_file) + + # Reload from directory + genome <- RefgetGenome.from_directory(store_dir, digest = result$digest) + expect_s4_class(genome, "RefgetGenome") + expect_equal(length(genome), 2) + 
expect_equal(sort(names(genome)), c("chr1", "chr2")) +}) diff --git a/refget-r/tests/testthat/test-conversion.R b/refget-r/tests/testthat/test-conversion.R new file mode 100644 index 0000000..49df15a --- /dev/null +++ b/refget-r/tests/testthat/test-conversion.R @@ -0,0 +1,30 @@ +# Test Biostrings conversion functions + +test_that("as_DNAString works", { + skip_if_not_installed("Biostrings") + + dna <- as_DNAString("ACGT") + expect_s4_class(dna, "DNAString") + expect_equal(as.character(dna), "ACGT") +}) + +test_that("as_DNAStringSet works", { + skip_if_not_installed("Biostrings") + + seqs <- as_DNAStringSet(c("ACGT", "GGCC")) + expect_s4_class(seqs, "DNAStringSet") + expect_length(seqs, 2) +}) + +test_that("as_DNAStringSet accepts names", { + skip_if_not_installed("Biostrings") + + seqs <- as_DNAStringSet(c("ACGT", "GGCC"), names = c("seq1", "seq2")) + expect_equal(names(seqs), c("seq1", "seq2")) +}) + +test_that("as_DNAString errors without Biostrings", { + # This test verifies the error message is clear + # In practice, this won't run if Biostrings is installed + skip("Cannot test Biostrings absence when it's installed") +}) diff --git a/refget-r/tests/testthat/test-edge-cases.R b/refget-r/tests/testthat/test-edge-cases.R new file mode 100644 index 0000000..31a3f37 --- /dev/null +++ b/refget-r/tests/testthat/test-edge-cases.R @@ -0,0 +1,291 @@ +# Test edge cases and gaps in current coverage + +# -- Single-sequence FASTA ------------------------------------------------ + +test_that("RefgetGenome works with single-sequence FASTA", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">only_seq", "ACGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + expect_equal(length(genome), 1) + expect_equal(names(genome), "only_seq") + expect_equal(seqlengths(genome)[["only_seq"]], 8) +}) + +# -- Sequences with N/ambiguous bases ------------------------------------- + +test_that("getSeq handles sequences with N bases", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chrN", "ACNNNGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + seq <- getSeq(genome, "chrN", as.character = TRUE) + expect_equal(seq, "ACNNNGTACGT") + + # Substring containing Ns + sub <- getSeq(genome, "chrN", start = 2, end = 6, as.character = TRUE) + expect_equal(sub, "CNNNG") +}) + +# -- Partial coordinates (only start or only end) ------------------------- + +test_that("getSeq with only start=NA returns full sequence", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # Both NA -> full sequence + seq <- getSeq(genome, "chr1", start = NA, end = NA, as.character = TRUE) + expect_equal(seq, "ACGTACGT") +}) + +# -- Coordinate boundary conditions --------------------------------------- + +test_that("getSeq works at sequence boundaries", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # start=1, end=seqlength (full range) + full <- getSeq(genome, "chr1", start = 1, end = 8, as.character = TRUE) + expect_equal(full, "ACGTACGT") + + # First base only + first <- getSeq(genome, "chr1", start = 1, end = 1, as.character = TRUE) + 
expect_equal(first, "A") + + # Last base only + last <- getSeq(genome, "chr1", start = 8, end = 8, as.character = TRUE) + expect_equal(last, "T") +}) + +# -- extractRegions with single region ------------------------------------ + +test_that("extractRegions works with a single-row data.frame", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGTACGTACGTACGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + regions <- data.frame( + chrom = "chr1", + start = 1, + end = 4, + stringsAsFactors = FALSE + ) + seqs <- extractRegions(genome, regions, as.character = TRUE) + + expect_length(seqs, 1) + expect_equal(names(seqs), "chr1:1-4") +}) + +# -- extractRegions error on missing columns ------------------------------ + +test_that("extractRegions errors on missing required columns", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # Missing 'end' column + bad_df <- data.frame(chrom = "chr1", start = 1) + expect_error(extractRegions(genome, bad_df), "must have columns") + + # Wrong column names + bad_df2 <- data.frame(chromosome = "chr1", begin = 1, finish = 4) + expect_error(extractRegions(genome, bad_df2), "must have columns") +}) + +# -- exportChromosomes with nonexistent name ------------------------------ + +test_that("exportChromosomes with nonexistent chromosome", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + output <- tempfile(fileext = ".fa") + on.exit(unlink(output), add = TRUE) + + # Requesting a nonexistent chromosome should error or produce empty output + expect_error(exportChromosomes(genome, names = "chrX", output_path = output)) +}) + +# -- show() method -------------------------------------------------------- + +test_that("show() produces expected output", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", "ACGTACGT", + ">chr2", "GGCCGGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + out <- capture.output(show(genome)) + expect_true(any(grepl("RefgetGenome with 2 sequences", out))) + expect_true(any(grepl("collection_digest:", out))) + expect_true(any(grepl("seqnames:", out))) +}) + +test_that("show() truncates when >5 sequences", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + lines <- unlist(lapply(paste0("seq", 1:7), function(nm) { + c(paste0(">", nm), "ACGT") + })) + writeLines(lines, fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + out <- capture.output(show(genome)) + expect_true(any(grepl("more\\)", out))) +}) + +# -- length(), names(), seqnames() on multi-sequence genome --------------- + +test_that("length, names, seqnames work on multi-sequence genome", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", "AAAA", + ">chr2", "CCCC", + ">chr3", "GGGG" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + expect_equal(length(genome), 3) + expect_type(names(genome), "character") + expect_length(names(genome), 3) + + sn <- seqnames(genome) + expect_length(sn, 3) + 
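+  # Coerce with as.character() so the membership check holds regardless of
+  # the vector class seqnames() returns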
expect_true(all(c("chr1", "chr2", "chr3") %in% as.character(sn))) +}) + +# -- coordinate_system() returns a string --------------------------------- + +test_that("coordinate_system returns a character string", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + cs <- coordinate_system(genome) + expect_type(cs, "character") + expect_length(cs, 1) + expect_true(nchar(cs) > 0) +}) + +# -- store() returns the underlying RefgetStore --------------------------- + +test_that("store() returns the underlying RefgetStore", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + s <- store(genome) + expect_false(is.null(s)) + # The store should be usable with gtars functions + expect_true(inherits(s, "RefgetStore") || is(s, "RefgetStore")) +}) + +# -- getSeq as.character flag with Biostrings available ------------------- + +test_that("getSeq as.character=TRUE returns character even with Biostrings", { + skip_if_not_installed("gtars") + skip_if_not_installed("Biostrings") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # as.character=TRUE should force character output + seq <- getSeq(genome, "chr1", as.character = TRUE) + expect_type(seq, "character") + expect_equal(seq, "ACGTACGT") + + # as.character=FALSE should return DNAString + seq2 <- getSeq(genome, "chr1", as.character = FALSE) + expect_s4_class(seq2, "DNAString") +}) + +test_that("getSeq vectorized as.character=TRUE returns character vector", { + skip_if_not_installed("gtars") + skip_if_not_installed("Biostrings") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", "ACGTACGT", + ">chr2", "GGCCGGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + seqs <- getSeq(genome, c("chr1", "chr2"), as.character = TRUE) + expect_type(seqs, "character") + expect_length(seqs, 2) + + seqs2 <- getSeq(genome, c("chr1", "chr2"), as.character = FALSE) + expect_s4_class(seqs2, "DNAStringSet") +}) + +# -- seqinfo returns Seqinfo object --------------------------------------- + +test_that("seqinfo returns a Seqinfo object", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", "ACGTACGT", + ">chr2", "GGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + si <- seqinfo(genome) + expect_s4_class(si, "Seqinfo") + expect_true("chr1" %in% GenomeInfoDb::seqnames(si)) +}) diff --git a/refget-r/tests/testthat/test-getSeq.R b/refget-r/tests/testthat/test-getSeq.R new file mode 100644 index 0000000..ffbd4a0 --- /dev/null +++ b/refget-r/tests/testthat/test-getSeq.R @@ -0,0 +1,156 @@ +# Test getSeq methods + +test_that("getSeq extracts full sequence", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # Full sequence + seq <- getSeq(genome, "chr1", as.character = TRUE) + expect_equal(seq, "ACGTACGTACGTACGTACGT") +}) + +test_that("getSeq extracts regions by coordinates", { + 
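+  # getSeq coordinates follow the BSgenome convention: 1-based, inclusive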
skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # Region extraction (1-based, inclusive) + seq <- getSeq(genome, "chr1", start = 1, end = 4, as.character = TRUE) + expect_equal(seq, "ACGT") + + seq <- getSeq(genome, "chr1", start = 5, end = 8, as.character = TRUE) + expect_equal(seq, "ACGT") +}) + +test_that("getSeq handles negative strand", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # Forward strand + seq_plus <- getSeq(genome, "chr1", 1, 4, strand = "+", as.character = TRUE) + expect_equal(seq_plus, "ACGT") + + # Reverse complement + seq_minus <- getSeq(genome, "chr1", 1, 4, strand = "-", as.character = TRUE) + expect_equal(seq_minus, "ACGT") # ACGT reverse complement is ACGT +}) + +test_that("getSeq vectorized extraction works", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT", + ">chr2", + "GGCCGGCCGGCCGGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # Multiple regions + seqs <- getSeq(genome, + names = c("chr1", "chr2"), + start = c(1, 1), + end = c(4, 4), + as.character = TRUE) + + expect_length(seqs, 2) + expect_equal(seqs[[1]], "ACGT") + expect_equal(seqs[[2]], "GGCC") +}) + +test_that("getSeq errors on out-of-range coordinates", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + expect_error(getSeq(genome, "chr1", 1, 100), "out of range") +}) + +test_that("getSeq with GRanges input works", { + skip_if_not_installed("gtars") + skip_if_not_installed("GenomicRanges") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT", + ">chr2", + "GGCCGGCCGGCCGGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # Create GRanges + gr <- GenomicRanges::GRanges(c("chr1:1-4", "chr2:1-4")) + + seqs <- getSeq(genome, gr, as.character = TRUE) + + expect_length(seqs, 2) + expect_equal(seqs[[1]], "ACGT") + expect_equal(seqs[[2]], "GGCC") +}) + +test_that("getSeq returns DNAString when Biostrings available", { + skip_if_not_installed("gtars") + skip_if_not_installed("Biostrings") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + seq <- getSeq(genome, "chr1") + expect_s4_class(seq, "DNAString") +}) + +test_that("getSeq returns DNAStringSet for multiple sequences", { + skip_if_not_installed("gtars") + skip_if_not_installed("Biostrings") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGT", + ">chr2", + "GGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + seqs <- getSeq(genome, c("chr1", "chr2")) + expect_s4_class(seqs, "DNAStringSet") + expect_length(seqs, 2) +}) diff --git a/refget-r/tests/testthat/test-remote.R b/refget-r/tests/testthat/test-remote.R new file mode 100644 index 0000000..5fc03fb --- /dev/null +++ b/refget-r/tests/testthat/test-remote.R @@ -0,0 
+1,39 @@ +# Test remote store access + +test_that("RefgetGenome.from_remote constructor works", { + skip_if_not_installed("gtars") + skip("Requires a live remote server") + + # This test requires a live remote server + cache_dir <- tempfile() + dir.create(cache_dir) + on.exit(unlink(cache_dir, recursive = TRUE)) + + # Example (would need real server and digest) + # genome <- RefgetGenome.from_remote( + # cache_path = cache_dir, + # remote_url = "https://refget.databio.org/store", + # digest = "known_digest_here" + # ) + # expect_s4_class(genome, "RefgetGenome") +}) + +test_that("coordinate_system accessor works", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", + "ACGTACGTACGTACGTACGT", + ">chr2", + "GGCCGGCCGGCCGGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # coordinate_system should return sorted_name_length_pairs_digest + coord_sys <- coordinate_system(genome) + expect_type(coord_sys, "character") + expect_true(nchar(coord_sys) > 0) +}) diff --git a/refget-r/vignettes/getting-started.Rmd b/refget-r/vignettes/getting-started.Rmd new file mode 100644 index 0000000..bdea166 --- /dev/null +++ b/refget-r/vignettes/getting-started.Rmd @@ -0,0 +1,185 @@ +--- +title: "Getting Started with BiocRefgetStore" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Getting Started with BiocRefgetStore} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +BiocRefgetStore provides a BSgenome-compatible interface to reference genomes +backed by GA4GH refget stores. Instead of managing FASTA files, you connect +to a refget store (local or remote) and access sequences by digest. + +This tutorial uses the **2023 Human Pangenome Reference** -- a remote refget +store containing 47 haplotype-resolved assemblies hosted on S3. Sequences are +downloaded on-demand and cached locally, so you don't need to download the +entire dataset upfront. + +## Installation + +BiocRefgetStore depends on `gtars`, a Rust-backed R package. Because `gtars` +lives in a Rust monorepo with sibling crates, it must be installed from a local +clone of the full repository (not directly from GitHub): + +```{r install, eval=FALSE} +# 1. Clone the gtars monorepo (if you haven't already) +# git clone https://github.com/databio/gtars.git + +# 2. Install gtars from the local clone (requires Rust toolchain) +install.packages("path/to/gtars/gtars-r", repos = NULL, type = "source") + +# 3. Install BiocRefgetStore +remotes::install_github("refgenie/refget", subdir = "refget-r", ref = "r") +# Or from local source: +# install.packages("path/to/refget/refget-r", repos = NULL, type = "source") +``` + +`gtars` is not on CRAN and `remotes::install_github()` won't work for it +because the R package depends on sibling Rust crates via relative paths. +You need the full monorepo checkout so those paths resolve correctly. + +## Connect to a remote pangenome store + +Load a genome from the Human Pangenome Reference store. 
The store metadata +(~1.5 MB) is fetched on first use; individual sequences are downloaded +on-demand and cached locally: + +```{r remote-store, eval=FALSE} +library(BiocRefgetStore) + +# 2023 Human Pangenome Reference (47 haplotype-resolved assemblies) +pangenome_url <- "https://refgenie.s3.us-east-1.amazonaws.com/pangenome_refget_store" + +# One assembly from the pangenome (HG03540.pri.mat.f1_v2) +genome <- RefgetGenome.from_remote( + cache_path = "~/.cache/refget/pangenome", + remote_url = pangenome_url, + digest = "0aHV7I-94paL9Z1H4LNlqsW3WxJhlou5" +) +genome +#> RefgetGenome with 750 sequences +#> collection_digest: 0aHV7I-94paL9Z1H4LNlqsW3WxJhlou5 +#> seqnames: JAGYVX010000001.1, JAGYVX010000002.1, ... (745 more) +``` + +Subsequent calls reuse the local cache -- no re-downloading. + +## Basic sequence access + +Extract a full sequence or a region by coordinates: + +```{r getseq, eval=FALSE} +# Full sequence (returns DNAString if Biostrings is installed) +seq <- genome[["JAGYVX010000001.1"]] + +# Region by coordinates (1-based, inclusive) +region <- getSeq(genome, "JAGYVX010000001.1", start = 1000, end = 2000) + +# Force character output +region_chr <- getSeq(genome, "JAGYVX010000001.1", start = 1000, end = 2000, + as.character = TRUE) +``` + +Negative-strand extraction applies the reverse complement: + +```{r strand, eval=FALSE} +rc <- getSeq(genome, "JAGYVX010000001.1", start = 1000, end = 2000, strand = "-") +``` + +## Multiple regions at once + +Pass vectors of names, starts, and ends: + +```{r vectorized, eval=FALSE} +seqs <- getSeq( + genome, + names = c("JAGYVX010000001.1", "JAGYVX010000002.1", "JAGYVX010000003.1"), + start = c(100, 200, 300), + end = c(199, 299, 399) +) +``` + +If you have a GRanges object, pass it directly: + +```{r granges, eval=FALSE} +library(GenomicRanges) +gr <- GRanges(c("JAGYVX010000001.1:100-199:+", "JAGYVX010000002.1:200-299:-")) +seqs <- getSeq(genome, gr) +``` + +## Bulk extraction from a data.frame + +`extractRegions` accepts a data.frame with `chrom`, `start`, `end` columns: + +```{r extract-regions, eval=FALSE} +regions <- data.frame( + chrom = c("JAGYVX010000001.1", "JAGYVX010000001.1", "JAGYVX010000002.1"), + start = c(100, 5000, 200), + end = c(199, 5099, 299) +) +seqs <- extractRegions(genome, regions, as.character = TRUE) +``` + +## Export sequences to FASTA + +Write extracted regions or full sequences to a FASTA file: + +```{r export, eval=FALSE} +# Regions to FASTA +extractToFasta(genome, regions, "extracted_regions.fa") + +# Specific sequences +exportChromosomes(genome, c("JAGYVX010000001.1", "JAGYVX010000002.1"), "subset.fa") + +# All sequences +exportChromosomes(genome, output_path = "full_assembly.fa") +``` + +## Genome metadata + +Inspect sequences and their properties: + +```{r metadata, eval=FALSE} +seqnames(genome) # sequence names +seqlengths(genome) # named integer vector of lengths +seqinfo(genome) # full Seqinfo object +length(genome) # number of sequences +collection_digest(genome) # seqcol digest +coordinate_system(genome) # sorted_name_length_pairs digest +sequence_digests(genome) # per-sequence SHA512t24u digests +``` + +## Working with local FASTA files + +You can also create a genome directly from a local FASTA file. 
This builds +an in-memory refget store, computes sequence digests, and creates a `Seqinfo` +object automatically: + +```{r from-fasta, eval=FALSE} +genome <- RefgetGenome.from_fasta("genome.fa") +genome +``` + +## Persistent on-disk store + +For large genomes you access repeatedly, use an on-disk store so sequences +are indexed once and reused across sessions: + +```{r on-disk, eval=FALSE} +# First time: create the store from FASTA +store <- gtars::refget_store_on_disk("~/.local/share/refget/hg38") +result <- gtars::add_fasta(store, "hg38.fa") +# Save the digest: result$digest + +# Later: reload without re-parsing +genome <- RefgetGenome.from_directory( + "~/.local/share/refget/hg38", + digest = "saved_digest_string" +) +``` + +## Next steps + +See the [Reference](reference.html) vignette for complete documentation of +every function and method in the package. diff --git a/refget-r/vignettes/reference.Rmd b/refget-r/vignettes/reference.Rmd new file mode 100644 index 0000000..c6238c4 --- /dev/null +++ b/refget-r/vignettes/reference.Rmd @@ -0,0 +1,393 @@ +--- +title: "BiocRefgetStore Reference" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{BiocRefgetStore Reference} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +Complete reference for every exported function, method, and class in +BiocRefgetStore. + +## Constructors + +### RefgetGenome.from_fasta + +```r +RefgetGenome.from_fasta(fasta_path) +``` + +Create a `RefgetGenome` from a FASTA file. Builds an in-memory refget store, +computes digests, and indexes all sequences. + +- **fasta_path** — Path to a FASTA file (`.fa`, `.fasta`, `.fa.gz`). +- **Returns** — A `RefgetGenome` object. + +```{r from-fasta, eval=FALSE} +genome <- RefgetGenome.from_fasta("hg38.fa") +``` + +### RefgetGenome.from_directory + +```r +RefgetGenome.from_directory(path, digest = NULL, namespace = NULL, alias = NULL) +``` + +Load a `RefgetGenome` from a persisted on-disk refget store directory. + +- **path** — Path to a directory created by `gtars::refget_store_on_disk()`. +- **digest** — Collection digest string (provide this OR namespace + alias). +- **namespace** — Alias namespace (e.g., `"refseq"`). +- **alias** — Alias name (e.g., `"GRCh38"`). +- **Returns** — A `RefgetGenome` object. + +```{r from-directory, eval=FALSE} +genome <- RefgetGenome.from_directory("~/.refget/hg38", digest = "abc123...") +``` + +### RefgetGenome.from_remote + +```r +RefgetGenome.from_remote(cache_path, remote_url, digest = NULL, namespace = NULL, alias = NULL) +``` + +Create a `RefgetGenome` backed by a remote refget store with local caching. + +- **cache_path** — Local directory for caching downloaded data. +- **remote_url** — URL of the remote refget store. +- **digest** / **namespace** / **alias** — Same as `RefgetGenome.from_directory`. +- **Returns** — A `RefgetGenome` object. + +```{r from-remote, eval=FALSE} +genome <- RefgetGenome.from_remote( + cache_path = "~/.cache/refget", + remote_url = "https://refget.databio.org/store", + namespace = "refseq", alias = "GRCh38" +) +``` + +### RefgetGenome (low-level) + +```r +RefgetGenome(store, digest = NULL, namespace = NULL, alias = NULL) +``` + +Construct a `RefgetGenome` from an existing `gtars::RefgetStore` object. +Requires either `digest` or both `namespace` and `alias`. + +- **store** — A gtars `RefgetStore` object. +- **digest** / **namespace** / **alias** — Collection identifier. +- **Returns** — A `RefgetGenome` object. 
+ +```{r constructor, eval=FALSE} +store <- gtars::refget_store_open_local("/path/to/store") +genome <- RefgetGenome(store, namespace = "refseq", alias = "GRCh38") +``` + +--- + +## Sequence Access + +### getSeq + +```r +getSeq(x, names, start = NA, end = NA, strand = "+", as.character = FALSE, ...) +``` + +Extract sequences from a `RefgetGenome`. BSgenome-compatible interface. + +- **x** — A `RefgetGenome` object. +- **names** — Character vector of sequence names, or a `GRanges` object. +- **start** — Integer start position(s), 1-based inclusive. `NA` for full sequence. +- **end** — Integer end position(s), 1-based inclusive. `NA` for full sequence. +- **strand** — `"+"` (default) or `"-"` for reverse complement. +- **as.character** — If `TRUE`, return character instead of DNAString/DNAStringSet. +- **Returns** — Single sequence: `DNAString` (or character). Multiple: `DNAStringSet` (or character vector). Named as `"seqname:start-end"` for regions. + +```{r getseq, eval=FALSE} +# Full chromosome +getSeq(genome, "chr1") + +# Region +getSeq(genome, "chr1", start = 100, end = 200) + +# Reverse complement +getSeq(genome, "chr1", start = 100, end = 200, strand = "-") + +# Multiple regions +getSeq(genome, c("chr1", "chr2"), c(100, 500), c(200, 600)) + +# From GRanges +getSeq(genome, GRanges("chr1:100-200:-")) +``` + +### `[[` (bracket extraction) + +```r +genome[["chr1"]] +``` + +Extract a full sequence by name. Returns `DNAString` if Biostrings is +installed, otherwise a character string. + +- **i** — Sequence name (character). +- **Returns** — `DNAString` or character string. +- **Errors** — If the sequence name is not found in the collection. + +--- + +## Metadata Accessors + +### seqinfo + +```r +seqinfo(x) +``` + +Returns the `Seqinfo` object containing sequence names and lengths. + +- **Returns** — A `GenomeInfoDb::Seqinfo` object. + +### seqnames + +```r +seqnames(x) +``` + +Returns the sequence names. + +- **Returns** — Character vector (via `Seqinfo`). + +### seqlengths + +```r +seqlengths(x) +``` + +Returns named integer vector of sequence lengths. + +- **Returns** — Named integer vector. + +```{r seqlengths, eval=FALSE} +seqlengths(genome) +#> chr1 chr2 chr3 +#> 248956 242193 198295 +``` + +### length + +```r +length(x) +``` + +Returns the number of sequences in the genome. + +- **Returns** — Integer scalar. + +### names + +```r +names(x) +``` + +Returns the sequence names as a character vector. + +- **Returns** — Character vector. + +### collection_digest + +```r +collection_digest(genome) +``` + +Returns the GA4GH seqcol digest identifying this sequence collection. + +- **genome** — A `RefgetGenome` object. +- **Returns** — Character string. + +### coordinate_system + +```r +coordinate_system(genome) +``` + +Returns the `sorted_name_length_pairs` digest. Two genomes with the same +`coordinate_system()` share the same coordinate system and are compatible for +coordinate-based operations (e.g., lifting over annotations). + +- **genome** — A `RefgetGenome` object. +- **Returns** — Character string. + +### sequence_digests + +```r +sequence_digests(genome) +``` + +Returns a named character vector of per-sequence SHA512t24u digests. + +- **genome** — A `RefgetGenome` object. +- **Returns** — Named character vector (names are sequence names, values are digests). + +```{r seq-digests, eval=FALSE} +sequence_digests(genome) +#> chr1 chr2 +#> "SQ.2648ae1bacce4ec4b6cf337..." "SQ.f932a39b4c70..." 
+``` + +### store + +```r +store(genome) +``` + +Returns the underlying `gtars::RefgetStore` object. Useful for calling gtars +functions directly. + +- **genome** — A `RefgetGenome` object. +- **Returns** — A gtars `RefgetStore` object. + +--- + +## Bulk Extraction + +### extractRegions + +```r +extractRegions(genome, regions, as.character = FALSE) +``` + +Extract multiple genomic regions efficiently using BED-based extraction. + +- **genome** — A `RefgetGenome` object. +- **regions** — A `GRanges` object or a `data.frame` with columns `chrom`, `start`, `end` (1-based inclusive coordinates). +- **as.character** — If `TRUE`, return character vector instead of `DNAStringSet`. +- **Returns** — `DNAStringSet` or named character vector. Named as `"chrom:start-end"`. + +```{r extract-regions, eval=FALSE} +regions <- data.frame( + chrom = c("chr1", "chr1", "chr2"), + start = c(100, 5000, 200), + end = c(199, 5099, 299) +) +seqs <- extractRegions(genome, regions) +``` + +### extractToFasta + +```r +extractToFasta(genome, regions, output_path) +``` + +Write extracted regions directly to a FASTA file. + +- **genome** — A `RefgetGenome` object. +- **regions** — A `GRanges` object or data.frame (same as `extractRegions`). +- **output_path** — Path for the output FASTA file. +- **Returns** — Invisibly returns `output_path`. + +```{r extract-to-fasta, eval=FALSE} +extractToFasta(genome, regions, "output.fa") +``` + +### exportChromosomes + +```r +exportChromosomes(genome, names = NULL, output_path, line_width = 80L) +``` + +Export complete chromosomes to a FASTA file. + +- **genome** — A `RefgetGenome` object. +- **names** — Character vector of chromosome names to export, or `NULL` for all. +- **output_path** — Path for the output FASTA file. +- **line_width** — Bases per line in output (default: 80). +- **Returns** — Invisibly returns `output_path`. + +```{r export-chroms, eval=FALSE} +# Specific chromosomes +exportChromosomes(genome, c("chr1", "chr22"), "subset.fa") + +# All chromosomes +exportChromosomes(genome, output_path = "full.fa") +``` + +--- + +## Conversion Utilities + +### as_DNAString + +```r +as_DNAString(seq_string) +``` + +Convert a character string to a Biostrings `DNAString` object. + +- **seq_string** — Character string containing a DNA sequence. +- **Returns** — A `DNAString` object. +- **Errors** — If Biostrings is not installed. + +```{r as-dnastring, eval=FALSE} +dna <- as_DNAString("ACGTACGT") +``` + +### as_DNAStringSet + +```r +as_DNAStringSet(seq_strings, names = NULL) +``` + +Convert a character vector to a Biostrings `DNAStringSet` object. + +- **seq_strings** — Character vector of DNA sequences. +- **names** — Optional names for the sequences. +- **Returns** — A `DNAStringSet` object. +- **Errors** — If Biostrings is not installed. + +```{r as-dnastringset, eval=FALSE} +seqs <- as_DNAStringSet(c("ACGT", "GGCC"), names = c("seq1", "seq2")) +``` + +--- + +## Working with the Underlying Store + +The `store()` accessor gives you access to the full `gtars::RefgetStore` API +for operations not directly exposed by BiocRefgetStore. 
+ +```{r store-advanced, eval=FALSE} +s <- store(genome) + +# List all aliases in the store +gtars::get_aliases(s) + +# Compare two sequence collections +gtars::compare_seqcols(s, digest_a, digest_b) + +# Get FHR (FASTA Header Record) metadata +gtars::get_fhr(s, collection_digest(genome)) + +# Access level 2 data (raw attribute arrays) +level2 <- gtars::get_level2(s, collection_digest(genome)) +level2$names # sequence names +level2$lengths # sequence lengths +level2$sequences # sequence digests +``` + +### show + +```r +show(object) +``` + +Display method for `RefgetGenome`. Prints the number of sequences, collection +digest, and first few sequence names. + +```{r show, eval=FALSE} +genome +#> RefgetGenome with 24 sequences +#> collection_digest: abc123... +#> seqnames: chr1, chr2, chr3, chr4, chr5 ... (19 more) +``` diff --git a/refget/__init__.py b/refget/__init__.py index 9b53cc3..8fae16e 100644 --- a/refget/__init__.py +++ b/refget/__init__.py @@ -2,18 +2,27 @@ refget - GA4GH reference sequence and sequence collection tools. Import from submodules: - from refget.store import RefgetStore, digest_fasta, StorageMode + from refget.store import RefgetStore, digest_fasta, StorageMode, compute_fai, digest_sequence, SequenceCollection from refget.digests import sha512t24u_digest, md5_digest, ga4gh_digest from refget.utils import compare_seqcols, validate_seqcol, seqcol_digest from refget.clients import SequenceCollectionClient, FastaDrsClient - from refget.models import SequenceCollection from refget.router import create_refget_router from refget.agents import RefgetDBAgent """ from ._version import __version__ -from .exceptions import InvalidSeqColError +from .backend import RefgetStoreBackend, SeqColBackend +from .clients import SequenceCollectionClient from .const import GTARS_INSTALLED +from .exceptions import InvalidSeqColError +from .store import ( + RefgetStore, + SequenceCollection, + StorageMode, + compute_fai, + digest_fasta, + digest_sequence, +) from .utils import canonical_str __all__ = [ @@ -21,4 +30,13 @@ "InvalidSeqColError", "GTARS_INSTALLED", "canonical_str", + "RefgetStore", + "StorageMode", + "digest_fasta", + "compute_fai", + "digest_sequence", + "SequenceCollection", + "SeqColBackend", + "RefgetStoreBackend", + "SequenceCollectionClient", ] diff --git a/refget/_version.py b/refget/_version.py index 1f4c4d4..ae6db5f 100644 --- a/refget/_version.py +++ b/refget/_version.py @@ -1 +1 @@ -__version__ = "0.10.1" +__version__ = "0.11.0" diff --git a/refget/agents.py b/refget/agents.py index e98c837..9bdce15 100644 --- a/refget/agents.py +++ b/refget/agents.py @@ -2,42 +2,43 @@ import json import os -import requests - from typing import TYPE_CHECKING -from sqlmodel import create_engine, select, Session, delete, func, SQLModel + +import requests +from sqlmodel import Session, SQLModel, create_engine, delete, func, select if TYPE_CHECKING: import peppy -from sqlalchemy.orm import selectinload +from typing import List, Optional + from sqlalchemy import URL from sqlalchemy.engine import Engine as SqlalchemyDatabaseEngine -from typing import Optional, List +from sqlalchemy.orm import selectinload +from .const import _LOGGER, DEFAULT_INHERENT_ATTRS, SEQCOL_SCHEMA_PATH from .models import ( - Sequence, - SequenceCollection, - Pangenome, - NamesAttr, - LengthsAttr, - SequencesAttr, - SortedSequencesAttr, - NameLengthPairsAttr, + AccessMethod, + AccessURL, CollectionNamesAttr, + FastaDrsObject, HumanReadableNames, + LengthsAttr, + NameLengthPairsAttr, + NamesAttr, PaginationResult, + 
Pangenome, ResultsSequenceCollections, - FastaDrsObject, - AccessMethod, - AccessURL, + Sequence, + SequenceCollection, + SequencesAttr, + SortedSequencesAttr, ) from .utils import ( - compare_seqcols, build_pangenome_model, calc_jaccard_similarities, + compare_seqcols, fasta_to_seqcol_dict, ) -from .const import _LOGGER, DEFAULT_INHERENT_ATTRS, SEQCOL_SCHEMA_PATH ATTR_TYPE_MAP = { "sequences": SequencesAttr, @@ -304,7 +305,6 @@ def add(self, seqcol: SequenceCollection, update: bool = False) -> SequenceColle for name_model in seqcol.human_readable_names: if name_model.human_readable_name not in existing_names: - new_name = HumanReadableNames( human_readable_name=name_model.human_readable_name, digest=existing.digest, @@ -659,7 +659,6 @@ def list(self, attribute_type: str, offset: int = 0, limit: int = 50) -> dict: } def search(self, attribute_type: str, digest: str, offset: int = 0, limit: int = 50) -> dict: - Attribute = ATTR_TYPE_MAP[attribute_type] with Session(self.engine) as session: list_stmt = ( select(SequenceCollection) @@ -825,11 +824,58 @@ def __init__( self.__attribute = AttributeAgent(self.engine) self.__fasta_drs = FastaDrsAgent(self.engine, fasta_drs_url_prefix) + # ========================================================================= + # SeqColBackend protocol methods + # ========================================================================= + + def get_collection(self, digest: str, level: int = 2) -> dict: + format_map = {1: "level1", 2: "level2"} + return self.seqcol.get(digest, return_format=format_map.get(level, "level2")) + + def get_collection_attribute(self, digest: str, attribute: str) -> list: + return self.seqcol.get(digest, attribute=attribute) + + def get_collection_itemwise(self, digest: str, limit: int | None = None) -> list[dict]: + return self.seqcol.get(digest, return_format="itemwise", itemwise_limit=limit) + + def get_attribute(self, attribute_name: str, attribute_digest: str) -> list: + return self.attribute.get(attribute_name, attribute_digest) + def compare_digests(self, digestA: str, digestB: str) -> dict: A = self.seqcol.get(digestA, return_format="level2") B = self.seqcol.get(digestB, return_format="level2") return compare_seqcols(A, B) + def compare_digest_with_level2(self, digest: str, level2_b: dict) -> dict: + return self.compare_1_digest(digest, level2_b) + + def list_collections( + self, page: int = 0, page_size: int = 100, filters: dict | None = None + ) -> dict: + if filters: + return self.seqcol.search_by_attributes( + filters, limit=page_size, offset=page * page_size + ) + return self.seqcol.list_by_offset(limit=page_size, offset=page * page_size) + + def collection_count(self) -> int: + result = self.seqcol.list_by_offset(limit=1, offset=0) + return result["pagination"]["total"] + + def list_attributes(self, attribute: str, page: int = 0, page_size: int = 100) -> dict: + res = self.attribute.list(attribute, limit=page_size, offset=page * page_size) + res["results"] = [x.digest for x in res["results"]] + return res + + def capabilities(self) -> dict: + return { + "backend_type": "database", + "n_collections": self.collection_count(), + "has_sequence_data": True, # database always has sequences + "collection_alias_namespaces": [], + "sequence_alias_namespaces": [], + } + def calc_similarities(self, digestA: str, digestB: str) -> dict: """ Calculates the Jaccard similarity between two sequence collections. 
@@ -910,20 +956,12 @@ def truncate(self) -> int: with Session(self.engine) as session: statement = delete(SequenceCollection) result1 = session.exec(statement) - statement = delete(Pangenome) - result = session.exec(statement) - statement = delete(NamesAttr) - result = session.exec(statement) - statement = delete(LengthsAttr) - result = session.exec(statement) - statement = delete(SequencesAttr) - result = session.exec(statement) - # statement = delete(SortedNameLengthPairsAttr) - # result = session.exec(statement) - statement = delete(NameLengthPairsAttr) - result = session.exec(statement) - statement = delete(SortedSequencesAttr) - result = session.exec(statement) + session.exec(delete(Pangenome)) + session.exec(delete(NamesAttr)) + session.exec(delete(LengthsAttr)) + session.exec(delete(SequencesAttr)) + session.exec(delete(NameLengthPairsAttr)) + session.exec(delete(SortedSequencesAttr)) session.commit() return result1.rowcount diff --git a/refget/backend.py b/refget/backend.py new file mode 100644 index 0000000..148c53f --- /dev/null +++ b/refget/backend.py @@ -0,0 +1,259 @@ +""" +SeqColBackend protocol and RefgetStoreBackend implementation. + +The SeqColBackend protocol defines the interface for serving seqcol API endpoints. +Two implementations: +- RefgetDBAgent (PostgreSQL) — full features including similarities, pangenomes, DRS +- RefgetStoreBackend (RefgetStore) — core seqcol operations, no database required +""" + +from __future__ import annotations + +from typing import Protocol, runtime_checkable + +from .const import DEFAULT_TRANSIENT_ATTRS +from .utils import calc_jaccard_similarities, compare_seqcols + + +@runtime_checkable +class SeqColBackend(Protocol): + """Backend protocol for serving seqcol API endpoints.""" + + def get_collection(self, digest: str, level: int = 2) -> dict: + """Get a collection at level 1 or 2. Raises ValueError if not found.""" + ... + + def get_collection_attribute(self, digest: str, attribute: str) -> list: + """Get a single attribute array from a collection. Raises ValueError if not found.""" + ... + + def get_collection_itemwise(self, digest: str, limit: int | None = None) -> list[dict]: + """Get collection in itemwise format. Raises ValueError if not found.""" + ... + + def get_attribute(self, attribute_name: str, attribute_digest: str) -> list: + """Get an attribute by its own digest. Raises KeyError if not found.""" + ... + + def compare_digests(self, digest_a: str, digest_b: str) -> dict: + """Compare two collections by digest. Raises ValueError if not found.""" + ... + + def compare_digest_with_level2(self, digest: str, level2_b: dict) -> dict: + """Compare a stored collection with a POSTed level2 dict. Raises ValueError if not found.""" + ... + + def list_collections( + self, page: int = 0, page_size: int = 100, filters: dict | None = None + ) -> dict: + """List collections with pagination and optional attribute filters. + Returns {"results": [...], "pagination": {...}}""" + ... + + def list_attributes(self, attribute: str, page: int = 0, page_size: int = 100) -> dict: + """List unique attribute digests. Returns {"results": [...], "pagination": {...}}""" + ... + + def collection_count(self) -> int: + """Total number of collections.""" + ... + + def capabilities(self) -> dict: + """Return backend capabilities for service-info.""" + ... + + +class RefgetStoreBackend: + """SeqColBackend backed by a RefgetStore (no database).""" + + def __init__(self, store): + """ + Args: + store: A RefgetStore instance from gtars. 
Do NOT pass a + ReadonlyRefgetStore — it cannot lazy-load collections. + """ + self._store = store + + def get_collection(self, digest: str, level: int = 2) -> dict: + try: + if level == 1: + result = self._store.get_collection_level1(digest) + else: + result = self._get_enriched_level2(digest) + except (OSError, IOError): + raise ValueError(f"Collection '{digest}' not found") + if result is None: + raise ValueError(f"Collection '{digest}' not found") + return result + + def get_collection_attribute(self, digest: str, attribute: str) -> list: + level2 = self.get_collection(digest, level=2) + if attribute not in level2: + raise ValueError(f"Attribute '{attribute}' not found") + return level2[attribute] + + def get_collection_itemwise(self, digest: str, limit: int | None = None) -> list[dict]: + level2 = self.get_collection(digest, level=2) + # Transpose: {"names": [a,b], "lengths": [1,2]} -> [{"names": a, "lengths": 1}, ...] + keys = list(level2.keys()) + n = len(level2[keys[0]]) + if limit: + n = min(n, limit) + return [{k: level2[k][i] for k in keys} for i in range(n)] + + def get_attribute(self, attribute_name: str, attribute_digest: str) -> list: + if attribute_name in DEFAULT_TRANSIENT_ATTRS: + raise KeyError( + f"Transient attribute '{attribute_name}' is not served via /attribute endpoint" + ) + result = self._store.get_attribute(attribute_name, attribute_digest) + if result is None: + raise KeyError(f"Attribute {attribute_name}/{attribute_digest} not found") + return result + + def _get_enriched_level2(self, digest: str) -> dict: + """Get level 2 enriched with derived attributes (name_length_pairs, sorted_sequences). + + The store's get_collection_level2 only returns core attributes (names, lengths, + sequences). For comparison, we need the derived attributes too. We get them + from level 1 digests and resolve each via get_attribute. + """ + try: + level2 = self._store.get_collection_level2(digest) + except (OSError, IOError): + raise ValueError(f"Collection '{digest}' not found") + if level2 is None: + raise ValueError(f"Collection '{digest}' not found") + try: + level1 = self._store.get_collection_level1(digest) + except (OSError, IOError): + return level2 + # Add derived attributes that exist in level 1 but not level 2 + for attr in ["name_length_pairs", "sorted_sequences"]: + if attr in level1 and attr not in level2: + try: + resolved = self._store.get_attribute(attr, level1[attr]) + if resolved is not None: + level2[attr] = resolved + except Exception: + pass + return level2 + + def compare_digests(self, digest_a: str, digest_b: str) -> dict: + level2_a = self._get_enriched_level2(digest_a) + level2_b = self._get_enriched_level2(digest_b) + return compare_seqcols(level2_a, level2_b) + + def compare_digest_with_level2(self, digest: str, level2_b: dict) -> dict: + """Compare a stored collection with a POSTed level2 dict. + + The store does not have a native compare_with_level2, so we retrieve + enriched level2 for the stored collection and use the Python compare utility. 
+ """ + level2_a = self._get_enriched_level2(digest) + return compare_seqcols(level2_a, level2_b) + + def list_collections( + self, page: int = 0, page_size: int = 100, filters: dict | None = None + ) -> dict: + result = self._store.list_collections(page=page, page_size=page_size, filters=filters) + # Extract digest strings from SequenceCollectionMetadata objects + result["results"] = [r.digest if hasattr(r, "digest") else r for r in result["results"]] + return result + + def list_attributes(self, attribute: str, page: int = 0, page_size: int = 100) -> dict: + all_cols = self._store.list_collections(page=0, page_size=10000) + unique_digests = set() + for col in all_cols["results"]: + digest = col.digest if hasattr(col, "digest") else col + level1 = self._store.get_collection_level1(digest) + if level1 and attribute in level1: + unique_digests.add(level1[attribute]) + sorted_digests = sorted(unique_digests) + start = page * page_size + end = start + page_size + return { + "results": sorted_digests[start:end], + "pagination": {"page": page, "page_size": page_size, "total": len(sorted_digests)}, + } + + def compute_similarities( + self, + seqcol: dict, + page: int = 0, + page_size: int = 50, + target_digests: list[str] | None = None, + ) -> dict: + """Compute Jaccard similarities between a seqcol and collections in the store. + + Args: + target_digests: If provided, only compare against these digests. + If None, compares against all collections. + """ + if target_digests: + all_digests = list(dict.fromkeys(target_digests)) # deduplicate, preserve order + else: + all_cols = self._store.list_collections(page=0, page_size=10000) + all_digests = [c.digest if hasattr(c, "digest") else c for c in all_cols["results"]] + + # Get aliases for human-readable names + alias_map = {} + for ns in self._store.list_collection_alias_namespaces(): + try: + aliases = self._store.list_collection_aliases(ns) + for a in aliases: + digest = a["digest"] if isinstance(a, dict) else a.digest + alias = a["alias"] if isinstance(a, dict) else a.alias + alias_map.setdefault(digest, []).append(alias) + except Exception: + pass + + similarities = [] + for digest in all_digests: + try: + level2 = self._store.get_collection_level2(digest) + if level2 is None: + continue + jaccard = calc_jaccard_similarities(seqcol, level2) + similarities.append( + { + "digest": digest, + "human_readable_names": alias_map.get(digest, []), + "similarities": jaccard, + } + ) + except Exception: + continue + + # Sort by max similarity descending + similarities.sort( + key=lambda s: max(s["similarities"].values()) if s["similarities"] else 0, + reverse=True, + ) + + total = len(similarities) + start = page * page_size + paged = similarities[start : start + page_size] + + return { + "similarities": paged, + "pagination": {"page": page, "page_size": page_size, "total": total}, + "reference_digest": None, + } + + def collection_count(self) -> int: + result = self._store.list_collections(page=0, page_size=1) + return result["pagination"]["total"] + + def capabilities(self) -> dict: + stats = self._store.stats() + n_collections = int(stats.get("n_collections", 0)) + n_sequences = int(stats.get("n_sequences", 0)) + return { + "backend_type": "refget_store", + "n_collections": n_collections, + "n_sequences": n_sequences, + "has_sequence_data": n_sequences > 0, + "collection_alias_namespaces": self._store.list_collection_alias_namespaces(), + "sequence_alias_namespaces": self._store.list_sequence_alias_namespaces(), + } diff --git a/refget/cli/admin.py 
b/refget/cli/admin.py index 1787c82..9e6b067 100644 --- a/refget/cli/admin.py +++ b/refget/cli/admin.py @@ -15,20 +15,18 @@ import json import os from pathlib import Path -from typing import Optional, List, Dict, Any +from typing import Any, Dict, List, Optional import typer from refget.cli.output import ( EXIT_CONFIG_ERROR, - EXIT_FILE_NOT_FOUND, EXIT_FAILURE, - EXIT_SUCCESS, + EXIT_FILE_NOT_FOUND, print_error, print_info, print_json, print_success, - print_warning, ) # Heavy imports (sqlmodel) are done lazily inside functions that need them diff --git a/refget/cli/config.py b/refget/cli/config.py index 6f5756f..7614ad5 100644 --- a/refget/cli/config.py +++ b/refget/cli/config.py @@ -14,14 +14,6 @@ import typer -from refget.cli.output import ( - EXIT_CONFIG_ERROR, - EXIT_FAILURE, - EXIT_SUCCESS, - print_error, - print_json, - print_success, -) from refget.cli.config_manager import ( DEFAULTS, get_config_path, @@ -30,6 +22,14 @@ save_config, set_value, ) +from refget.cli.output import ( + EXIT_CONFIG_ERROR, + EXIT_FAILURE, + EXIT_SUCCESS, + print_error, + print_json, + print_success, +) app = typer.Typer( name="config", @@ -220,7 +220,7 @@ def add( if resource_type not in RESOURCE_TYPE_MAP: valid_types = ", ".join(RESOURCE_TYPE_MAP.keys()) print_error( - f"Invalid resource type '{resource_type}'.\n" f"Valid types: {valid_types}", + f"Invalid resource type '{resource_type}'.\nValid types: {valid_types}", EXIT_CONFIG_ERROR, ) return # Unreachable, but clarifies control flow @@ -314,7 +314,7 @@ def remove( if resource_type not in RESOURCE_TYPE_MAP: valid_types = ", ".join(RESOURCE_TYPE_MAP.keys()) print_error( - f"Invalid resource type '{resource_type}'.\n" f"Valid types: {valid_types}", + f"Invalid resource type '{resource_type}'.\nValid types: {valid_types}", EXIT_CONFIG_ERROR, ) return # Unreachable, but clarifies control flow diff --git a/refget/cli/config_manager.py b/refget/cli/config_manager.py index aff7c8b..1eb968f 100644 --- a/refget/cli/config_manager.py +++ b/refget/cli/config_manager.py @@ -105,7 +105,7 @@ def save_config(config: Dict[str, Any]) -> None: """ if tomli_w is None: raise ImportError( - "tomli_w is required to save configuration.\n" "Install with: pip install tomli-w" + "tomli_w is required to save configuration.\nInstall with: pip install tomli-w" ) config_path = get_config_path() diff --git a/refget/cli/fasta.py b/refget/cli/fasta.py index 364c0cc..d5f3313 100644 --- a/refget/cli/fasta.py +++ b/refget/cli/fasta.py @@ -22,10 +22,9 @@ import typer from refget.cli.output import ( - EXIT_FILE_NOT_FOUND, EXIT_FAILURE, + EXIT_FILE_NOT_FOUND, EXIT_SUCCESS, - not_implemented, print_error, print_json, print_success, @@ -67,6 +66,8 @@ def index( - genome.fa.fai (FASTA index, samtools-compatible) - genome.seqcol.json (Sequence collection JSON) - genome.chrom.sizes (Chromosome sizes) + - genome.rgsi (RefgetStore sequence index) + - genome.rgci (RefgetStore collection index) Prints the seqcol digest to stdout. 
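+
+    Example:
+
+        refget fasta index genome.fa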
""" @@ -137,7 +138,39 @@ def index( with open(chrom_sizes_path, "w") as f: f.write(chrom_sizes_content) - files_created = [str(fai_path), str(seqcol_path), str(chrom_sizes_path)] + # Write RGSI file + stem = base_name + for ext in [".fa.gz", ".fasta.gz", ".fa", ".fasta"]: + if stem.endswith(ext): + stem = stem[: -len(ext)] + break + rgsi_path = out_dir / f"{stem}.rgsi" + sc.write_rgsi(str(rgsi_path)) + + # Write RGCI file + rgci_path = out_dir / f"{stem}.rgci" + with open(rgci_path, "w") as f: + meta = sc.metadata + f.write( + "#digest\tn_sequences\tnames_digest\tsequences_digest" + "\tlengths_digest\tname_length_pairs_digest" + "\tsorted_name_length_pairs_digest\tsorted_sequences_digest\n" + ) + f.write( + f"{meta.digest}\t{meta.n_sequences}\t{meta.names_digest}" + f"\t{meta.sequences_digest}\t{meta.lengths_digest}" + f"\t{meta.name_length_pairs_digest or ''}" + f"\t{meta.sorted_name_length_pairs_digest or ''}" + f"\t{meta.sorted_sequences_digest or ''}\n" + ) + + files_created = [ + str(fai_path), + str(seqcol_path), + str(chrom_sizes_path), + str(rgsi_path), + str(rgci_path), + ] if json_output: print_json( @@ -364,11 +397,34 @@ def rgsi( """ Compute .rgsi (RefgetStore sequence index) from a FASTA file. - The .rgsi is a binary index file used by RefgetStore for efficient - on-disk sequence storage and retrieval. It maps sequence digests to - byte offsets. + The .rgsi is a TSV index file containing collection-level digest headers + and per-sequence metadata (name, length, alphabet, digests). Used by + RefgetStore for efficient collection storage and as a FASTA digest cache. """ - not_implemented("fasta rgsi") + from gtars.refget import digest_fasta + + try: + # Determine output path + if output is None: + # Replace FASTA extensions with .rgsi + stem = file.name + for ext in [".fa.gz", ".fasta.gz", ".fa", ".fasta"]: + if stem.endswith(ext): + stem = stem[: -len(ext)] + break + output = file.parent / f"{stem}.rgsi" + + # Digest the FASTA file + with suppress_stdout(): + sc = digest_fasta(str(file)) + + # Write RGSI file using gtars binding + sc.write_rgsi(str(output)) + + print_success(f"Wrote RGSI index to {output}") + raise typer.Exit(EXIT_SUCCESS) + except OSError as e: + print_error(f"Error processing FASTA file: {e}", EXIT_FAILURE) @app.command() @@ -389,10 +445,49 @@ def rgci( """ Compute .rgci (RefgetStore collection index) from a FASTA file. - The .rgci is a binary index file used by RefgetStore to store - collection metadata. + The .rgci is a TSV index file listing collection metadata (digest, + sequence count, and level 1 digests). Used by RefgetStore as a + master index of all collections. 
""" - not_implemented("fasta rgci") + from gtars.refget import digest_fasta + + try: + # Determine output path + if output is None: + stem = file.name + for ext in [".fa.gz", ".fasta.gz", ".fa", ".fasta"]: + if stem.endswith(ext): + stem = stem[: -len(ext)] + break + output = file.parent / f"{stem}.rgci" + + # Digest the FASTA file + with suppress_stdout(): + sc = digest_fasta(str(file)) + + meta = sc.metadata + + # Write RGCI file (matches store.rs write_collections_rgci format) + with open(output, "w") as f: + # Header + f.write( + "#digest\tn_sequences\tnames_digest\tsequences_digest" + "\tlengths_digest\tname_length_pairs_digest" + "\tsorted_name_length_pairs_digest\tsorted_sequences_digest\n" + ) + # Single collection row + f.write( + f"{meta.digest}\t{meta.n_sequences}\t{meta.names_digest}" + f"\t{meta.sequences_digest}\t{meta.lengths_digest}" + f"\t{meta.name_length_pairs_digest or ''}" + f"\t{meta.sorted_name_length_pairs_digest or ''}" + f"\t{meta.sorted_sequences_digest or ''}\n" + ) + + print_success(f"Wrote RGCI index to {output}") + raise typer.Exit(EXIT_SUCCESS) + except OSError as e: + print_error(f"Error processing FASTA file: {e}", EXIT_FAILURE) @app.command() diff --git a/refget/cli/main.py b/refget/cli/main.py index 61bb2a2..39129e8 100644 --- a/refget/cli/main.py +++ b/refget/cli/main.py @@ -4,16 +4,16 @@ This module defines the main CLI app and registers all command groups. """ -import typer from typing import Optional -from refget._version import __version__ +import typer +from refget._version import __version__ +from refget.cli.admin import app as admin_app from refget.cli.config import app as config_app from refget.cli.fasta import app as fasta_app -from refget.cli.store import app as store_app from refget.cli.seqcol import app as seqcol_app -from refget.cli.admin import app as admin_app +from refget.cli.store import app as store_app app = typer.Typer( name="refget", diff --git a/refget/cli/seqcol.py b/refget/cli/seqcol.py index 2e83ab1..14cc4c0 100644 --- a/refget/cli/seqcol.py +++ b/refget/cli/seqcol.py @@ -19,7 +19,7 @@ import typer -from refget.cli.config_manager import get_seqcol_servers +from refget.cli.config_manager import get_seqcol_servers, get_store_path from refget.cli.output import ( EXIT_FAILURE, EXIT_NETWORK_ERROR, @@ -32,6 +32,7 @@ # Heavy imports moved inside functions to speed up CLI startup: # - refget.clients (requests ~51ms) # - refget.utils (jsonschema ~60ms) +# - refget.store (gtars ~100ms) def _get_client(server_override: Optional[str] = None): @@ -54,6 +55,59 @@ def _get_client(server_override: Optional[str] = None): return SequenceCollectionClient(urls=urls, raise_errors=False) +def _collection_to_seqcol_dict(store, digest: str, level: int = 2) -> Optional[dict]: + """ + Convert a RefgetStore collection to seqcol API dict format. + + Args: + store: RefgetStore instance with the collection loaded + digest: Collection digest + level: 1 for attribute digests only, 2 for full arrays + + Returns: + Seqcol dict in API format, or None if collection not found. + """ + try: + if level == 1: + return store.get_collection_level1(digest) + else: + return store.get_collection_level2(digest) + except Exception: + return None + + +def _get_local_seqcol(digest: str, level: int = 2) -> Optional[dict]: + """ + Try to get a seqcol from the local RefgetStore. + + Args: + digest: Collection digest to look up + level: 1 for attribute digests only, 2 for full arrays + + Returns: + Seqcol dict if found locally, None otherwise. 
+ """ + try: + from refget.store import RefgetStore + except ImportError: + # gtars not installed - can't use local store + return None + + store_path = get_store_path() + + # Check if store exists + if not RefgetStore.store_exists(str(store_path)): + return None + + try: + store = RefgetStore.open_local(str(store_path)) + store.set_quiet(True) + return _collection_to_seqcol_dict(store, digest, level) + except Exception: + # Any error (store corruption, etc.) - fall back to remote + return None + + def _compute_snlp_digest(seqcol_dict: dict) -> str: """ Compute the sorted_name_length_pairs digest from a seqcol dict. @@ -64,8 +118,8 @@ def _compute_snlp_digest(seqcol_dict: dict) -> str: Returns: The snlp digest (coordinate system identifier) """ - from refget.utils import build_sorted_name_length_pairs, canonical_str from refget.digests import sha512t24u_digest + from refget.utils import build_sorted_name_length_pairs, canonical_str snlp_digests = build_sorted_name_length_pairs(seqcol_dict) return sha512t24u_digest(canonical_str(snlp_digests)) @@ -134,6 +188,12 @@ def _load_seqcol(input_str: str, client, level: int = 2) -> Optional[dict]: return None else: # digest + # Try local store first + result = _get_local_seqcol(input_str, level=level) + if result is not None: + return result + + # Fall back to remote result = client.get_collection(input_str, level=level) if result is None: print_error(f"Could not fetch seqcol for digest: {input_str}", EXIT_FAILURE) @@ -176,6 +236,13 @@ def show( Level 1 returns attribute digests only. Level 2 (default) returns full arrays. """ + # Try local store first + result = _get_local_seqcol(digest, level=level) + if result is not None: + print_json(result) + raise typer.Exit(EXIT_SUCCESS) + + # Fall back to remote servers client = _get_client(server) try: @@ -310,6 +377,44 @@ def list_collections( raise typer.Exit(EXIT_SUCCESS) +def _search_local_store(filters: dict) -> Optional[list]: + """Search the local RefgetStore for collections matching attribute filters.""" + try: + from refget.store import RefgetStore + except ImportError: + return None + + store_path = get_store_path() + + if not RefgetStore.store_exists(str(store_path)): + return None + + try: + store = RefgetStore.open_local(str(store_path)) + store.set_quiet(True) + + # Search each filter; results must match ALL filters (intersection) + result_sets = [] + for attr_name, attr_digest in filters.items(): + matches = store.find_collections_by_attribute(attr_name, attr_digest) + result_sets.append(set(matches)) + + if not result_sets: + return None + + # Intersection of all filter results + matching = result_sets[0] + for s in result_sets[1:]: + matching &= s + + if not matching: + return None + + return [{"digest": d} for d in sorted(matching)] + except Exception: + return None + + @app.command() def search( names: Optional[str] = typer.Option( @@ -333,6 +438,16 @@ def search( "-s", help="Server URL override", ), + local: bool = typer.Option( + False, + "--local", + help="Search only the local store (skip remote)", + ), + no_local: bool = typer.Option( + False, + "--no-local", + help="Skip local store and search remote only", + ), ) -> None: """ Find collections that share an attribute. @@ -340,6 +455,9 @@ def search( The attribute digest is the digest of an attribute array (e.g., the names array digest from level 1 output). + By default, searches the local store first, then falls back to remote. + Use --local to search only locally, or --no-local to skip local search. 
+ Example workflow: # Get names digest from level 1 names_digest=$(refget fasta seqcol genome.fa --level 1 | jq -r '.names') @@ -362,6 +480,19 @@ def search( ) return + # Try local store first (unless --no-local) + if not no_local: + local_results = _search_local_store(filters) + if local_results is not None: + print_json(local_results) + raise typer.Exit(EXIT_SUCCESS) + + if local: + # --local flag set but no results found locally + print_error("No matching collections found in local store", EXIT_FAILURE) + return + + # Fall back to remote server client = _get_client(server) try: diff --git a/refget/cli/store.py b/refget/cli/store.py index a8bc790..d385819 100644 --- a/refget/cli/store.py +++ b/refget/cli/store.py @@ -7,10 +7,10 @@ Commands: init - Initialize local store add - Import FASTA to local store - list - List collections in store + list - List collections or sequences in store + get - Get collection or sequence by digest pull - Pull collection from remote export - Export collection as FASTA - seq - Get sequence/subsequence fai - Generate .fai from digest chrom-sizes - Generate chrom.sizes from digest stats - Store statistics @@ -27,8 +27,8 @@ from refget.cli.config_manager import get_remote_stores, get_seqcol_servers, get_store_path from refget.cli.output import ( - EXIT_FILE_NOT_FOUND, EXIT_FAILURE, + EXIT_FILE_NOT_FOUND, EXIT_SUCCESS, check_dependency, print_error, @@ -71,17 +71,17 @@ def _get_store_path(path: Optional[Path]) -> Path: def _get_collection_digests(store) -> set: """Get the set of collection digest strings from a store.""" - return {meta.digest for meta in store.list_collections()} + return {meta.digest for meta in store.list_collections()["results"]} -def _load_store(path: Optional[Path], must_exist: bool = True, server: Optional[str] = None): +def _load_store(path: Optional[Path], must_exist: bool = True, remote: Optional[str] = None): """ Load a RefgetStore from local path or remote server. 
Args: path: Optional path override (uses config if None) must_exist: If True, error if store doesn't exist - server: Optional remote server URL (overrides path) + remote: Optional remote store URL (overrides path) Returns: RefgetStore instance @@ -90,21 +90,18 @@ def _load_store(path: Optional[Path], must_exist: bool = True, server: Optional[ from refget.store import RefgetStore # Remote store takes precedence - if server: + if remote: cache_path = _get_store_path(path) / ".remote_cache" cache_path.mkdir(parents=True, exist_ok=True) - return RefgetStore.open_remote(str(cache_path), server) + return RefgetStore.open_remote(str(cache_path), remote) store_path = _get_store_path(path) if must_exist: if not store_path.exists(): print_error(f"Store not found at {store_path}", EXIT_FILE_NOT_FOUND) - # Check if rgstore.json exists - if not, it's an empty store that needs on_disk - # The store uses rgstore.json as its manifest file - rgstore_path = store_path / "rgstore.json" - if not rgstore_path.exists(): - # Empty store - use on_disk which handles initialization + if not RefgetStore.store_exists(str(store_path)): + # Empty directory - use on_disk which handles initialization return RefgetStore.on_disk(str(store_path)) return RefgetStore.open_local(str(store_path)) else: @@ -153,7 +150,7 @@ def init( store_path.parent.mkdir(parents=True, exist_ok=True) # Initialize the store (creates index files) - store = RefgetStore.on_disk(str(store_path)) + RefgetStore.on_disk(str(store_path)) print_json( { @@ -234,40 +231,58 @@ def add( @app.command("list") -def list_collections( +def list_items( + sequences: bool = typer.Option( + False, + "--sequences", + "-s", + help="List sequences instead of collections", + ), path: Optional[Path] = typer.Option( None, "--path", "-p", help="Store path (default: from config)", ), - server: Optional[str] = typer.Option( + remote: Optional[str] = typer.Option( None, - "--server", - "-s", + "--remote", + "-r", help="Remote store URL (overrides --path)", ), ) -> None: """ - List collections in the store. + List collections or sequences in the store. + + By default, lists collections. Use --sequences to list individual sequences. 
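Programmatically, the two listing modes map onto two store methods, as in this sketch (the store path is a placeholder):

from refget.store import RefgetStore

store = RefgetStore.open_local("/path/to/store")

# Collections: list_collections() returns a paginated dict with a "results" list.
for meta in store.list_collections()["results"]:
    print(meta.digest)

# Sequences: one metadata record per stored sequence.
for meta in store.list_sequences():
    print(meta.sha512t24u, meta.name, meta.length)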
-    Outputs JSON: {"collections": [{"digest": "...", "sequences": N}, ...]}
+    Outputs JSON:
+      Collections: {"collections": [{"digest": "..."}, ...]}
+      Sequences: {"sequences": [{"digest": "...", "name": "...", "length": N}, ...]}
     """
-    store = _load_store(path, server=server)
+    store = _load_store(path, remote=remote)
 
-    collections = []
-    for meta in store.list_collections():
-        collections.append(
-            {
-                "digest": meta.digest,
-            }
-        )
+    if sequences:
+        items = []
+        for meta in store.list_sequences():
+            items.append(
+                {
+                    "digest": meta.sha512t24u,
+                    "name": meta.name,
+                    "length": meta.length,
+                }
+            )
+        print_json({"sequences": items})
+    else:
+        collections = []
+        for meta in store.list_collections()["results"]:
+            collections.append(
+                {
+                    "digest": meta.digest,
+                }
+            )
+        print_json({"collections": collections})
 
-    print_json(
-        {
-            "collections": collections,
-        }
-    )
     raise typer.Exit(EXIT_SUCCESS)
 
 
@@ -275,7 +290,29 @@ def list_collections(
 def get(
     digest: str = typer.Argument(
         ...,
-        help="Collection digest to retrieve",
+        help="Collection or sequence digest",
+    ),
+    sequence: bool = typer.Option(
+        False,
+        "--sequence",
+        "-s",
+        help="Get sequence instead of collection",
+    ),
+    name: Optional[str] = typer.Option(
+        None,
+        "--name",
+        "-n",
+        help="Sequence name (when getting sequence from collection)",
+    ),
+    start: Optional[int] = typer.Option(
+        None,
+        "--start",
+        help="Start position for subsequence (0-based, inclusive)",
+    ),
+    end: Optional[int] = typer.Option(
+        None,
+        "--end",
+        help="End position for subsequence (0-based, exclusive)",
     ),
     path: Optional[Path] = typer.Option(
         None,
@@ -283,72 +320,104 @@ def get(
         "-p",
         help="Store path (default: from config)",
     ),
-    server: Optional[str] = typer.Option(
+    remote: Optional[str] = typer.Option(
         None,
-        "--server",
-        "-s",
+        "--remote",
+        "-r",
         help="Remote store URL (overrides --path)",
     ),
 ) -> None:
     """
-    Get a collection by digest.
+    Get a collection or sequence by digest.
 
-    Returns the full sequence collection with names, lengths, and sequences.
+    By default, returns the full sequence collection with names, lengths, and sequences.
+    Use --sequence to get a sequence instead.
 
-    Outputs JSON: {"names": [...], "lengths": [...], "sequences": [...]}
+    Examples:
+        refget store get <digest>                         # Get collection
+        refget store get <digest> -s                      # Get sequence
+        refget store get <digest> -s -n chr1              # Sequence by name
+        refget store get <digest> -s --start 0 --end 100  # Subsequence
+
+    Outputs JSON for collections: {"names": [...], "lengths": [...], "sequences": [...]}
+    Outputs raw sequence text for sequences.
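The retrieval paths implemented below boil down to the following store calls. A sketch with placeholder digests; get_sequence_by_name and get_substring raise KeyError on unknown inputs:

from refget.store import RefgetStore

store = RefgetStore.open_local("/path/to/store")
store.load_all_collections()
store.load_all_sequences()

# Collection: level-2 dict with names/lengths/sequences arrays.
coll = store.get_collection_level2("COLLECTION_DIGEST_PLACEHOLDER")

# Named sequence within a collection, then a 0-based half-open slice of it.
record = store.get_sequence_by_name("COLLECTION_DIGEST_PLACEHOLDER", "chr1")
subseq = store.get_substring(record.metadata.sha512t24u, 0, 100)

# Full sequence text via decode() (handles encoded storage mode).
full = store.get_sequence(record.metadata.sha512t24u).decode()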
""" - store = _load_store(path, server=server) + store = _load_store(path, remote=remote) - # Check if collection exists - if digest not in _get_collection_digests(store): - print_error(f"Collection not found: {digest}", EXIT_FAILURE) - return # Unreachable, but clarifies control flow + if sequence: + # Sequence retrieval mode — load sequence data + store.load_all_collections() + store.load_all_sequences() + seq_data = None - # Ensure collection is loaded - _ensure_collection_loaded(store, digest) + if name is not None: + # Get sequence by collection + name + try: + record = store.get_sequence_by_name(digest, name) + except KeyError as e: + print_error(str(e), EXIT_FAILURE) + return + + if start is not None and end is not None: + # Get substring using the sequence digest + try: + seq_data = store.get_substring(record.metadata.sha512t24u, start, end) + except KeyError as e: + print_error(str(e), EXIT_FAILURE) + return + elif start is not None or end is not None: + print_error("Both --start and --end must be provided for substring", EXIT_FAILURE) + return + else: + seq_data = record.decode() + else: + # Direct sequence lookup by digest + if start is not None and end is not None: + try: + seq_data = store.get_substring(digest, start, end) + except KeyError as e: + print_error(str(e), EXIT_FAILURE) + return + elif start is not None or end is not None: + print_error("Both --start and --end must be provided for substring", EXIT_FAILURE) + return + else: + try: + record = store.get_sequence(digest) + seq_data = record.decode() + except KeyError as e: + print_error(str(e), EXIT_FAILURE) + return + + # Output raw sequence to stdout + print(seq_data) + else: + # Collection retrieval mode (default) + try: + result = store.get_collection_level2(digest) + except Exception: + print_error(f"Collection not found: {digest}", EXIT_FAILURE) + return - # Get collection data - names = [] - lengths = [] - sequences = [] - - for coll in store.iter_collections(): - if coll.digest == digest: - for seq in coll.sequences: - m = seq.metadata - names.append(m.name) - lengths.append(m.length) - sequences.append("SQ." + m.sha512t24u) - break - - if not names: - print_error(f"Collection not found: {digest}", EXIT_FAILURE) - return # Unreachable, but clarifies control flow + print_json(result) - print_json( - { - "names": names, - "lengths": lengths, - "sequences": sequences, - } - ) raise typer.Exit(EXIT_SUCCESS) -def _find_remote_urls(server_override: Optional[str] = None) -> List[str]: +def _find_remote_urls(remote_override: Optional[str] = None) -> List[str]: """ Find remote RefgetStore URLs to try. Resolution order: - 1. --server flag (direct RefgetStore URL) + 1. --remote flag (direct RefgetStore URL) 2. Configured remote_stores 3. Configured seqcol_servers (discover RefgetStore via service-info) Returns: List of remote store URLs to try, in priority order. """ - if server_override: - return [server_override] + if remote_override: + return [remote_override] urls: List[str] = [] @@ -390,10 +459,11 @@ def pull( "-p", help="Store path (default: from config)", ), - server: Optional[str] = typer.Option( + remote: Optional[str] = typer.Option( None, "--server", - "-s", + "--remote", + "-r", help="Remote store URL (default: try configured remote_stores)", ), eager: bool = typer.Option( @@ -416,7 +486,7 @@ def pull( downloaded on-demand when accessed (lazy loading). Use --eager to pre-fetch all sequences. - Resolution order (if --server not specified): + Resolution order (if --remote not specified): 1. 
Check local store (already cached?) 2. Try configured remote_stores in priority order 3. Try seqcol_servers (discover RefgetStore via service-info) @@ -424,9 +494,9 @@ def pull( Use --file for batch operations with multiple digests. Examples: - refget store pull ABC123 --server https://example.com/store + refget store pull ABC123 --remote https://example.com/store refget store pull ABC123 --eager # Pre-fetch all sequences - refget store pull --file digests.txt --server https://example.com/store + refget store pull --file digests.txt --remote https://example.com/store """ check_dependency("gtars", "store", "store") from refget.store import RefgetStore @@ -450,11 +520,11 @@ def pull( print_error("No digests to pull", EXIT_FAILURE) # Determine remote URLs to try - remote_urls = _find_remote_urls(server) + remote_urls = _find_remote_urls(remote) if not remote_urls: print_error( - "No remote store found. Use --server or configure remote_stores:\n" + "No remote store found. Use --remote or configure remote_stores:\n" " refget config add remote_store https://example.com/store", EXIT_FAILURE, ) @@ -466,7 +536,7 @@ def pull( # Check local store first local_collections: set = set() - if store_path.exists() and (store_path / "rgstore.json").exists(): + if RefgetStore.store_exists(str(store_path)): try: local_store = RefgetStore.open_local(str(store_path)) local_collections = _get_collection_digests(local_store) @@ -575,10 +645,10 @@ def export( "-p", help="Store path (default: from config)", ), - server: Optional[str] = typer.Option( + remote: Optional[str] = typer.Option( None, - "--server", - "-s", + "--remote", + "-r", help="Remote store URL (overrides --path)", ), line_width: int = typer.Option( @@ -598,10 +668,11 @@ def export( If no output file is specified, exports to stdout. """ - store = _load_store(path, server=server) + store = _load_store(path, remote=remote) - # Ensure collection is loaded (required for export) + # Ensure collection and sequence data are loaded (required for export) _ensure_collection_loaded(store, digest) + store.load_all_sequences() def _do_export(output_path: str) -> None: """Perform the actual export to a file path.""" @@ -634,86 +705,6 @@ def _do_export(output_path: str) -> None: raise typer.Exit(EXIT_SUCCESS) -@app.command() -def seq( - digest: str = typer.Argument( - ..., - help="Sequence digest or collection digest", - ), - name: Optional[str] = typer.Option( - None, - "--name", - "-n", - help="Sequence name (when using collection digest)", - ), - start: Optional[int] = typer.Option( - None, - "--start", - "-s", - help="Start position (0-based, inclusive)", - ), - end: Optional[int] = typer.Option( - None, - "--end", - "-e", - help="End position (0-based, exclusive)", - ), - path: Optional[Path] = typer.Option( - None, - "--path", - "-p", - help="Store path (default: from config)", - ), - server: Optional[str] = typer.Option( - None, - "--server", - help="Remote store URL (overrides --path)", - ), -) -> None: - """ - Get a sequence or subsequence. 
- - Examples: - refget store seq # Full sequence - refget store seq --start 100 --end 200 # Subsequence - refget store seq --name chr1 # By name - refget store seq --name chr1 -s 100 -e 200 - """ - store = _load_store(path, server=server) - - sequence = None - - if name is not None: - # Get sequence by collection + name - record = store.get_sequence_by_name(digest, name) - if record is None: - print_error(f"Sequence '{name}' not found in collection {digest}", EXIT_FAILURE) - if start is not None and end is not None: - # Get substring using the sequence digest - sequence = store.get_substring(record.metadata.sha512t24u, start, end) - elif start is not None or end is not None: - print_error("Both --start and --end must be provided for substring", EXIT_FAILURE) - else: - # Use decode() to get the sequence string (handles encoded mode) - sequence = record.decode() - else: - # Direct sequence lookup by digest - if start is not None and end is not None: - sequence = store.get_substring(digest, start, end) - elif start is not None or end is not None: - print_error("Both --start and --end must be provided for substring", EXIT_FAILURE) - else: - record = store.get_sequence(digest) - if record is None: - print_error(f"Sequence not found: {digest}", EXIT_FAILURE) - # Use decode() to get the sequence string (handles encoded mode) - sequence = record.decode() - - # Output raw sequence to stdout - print(sequence) - raise typer.Exit(EXIT_SUCCESS) - - @app.command() def fai( digest: str = typer.Argument( @@ -732,10 +723,10 @@ def fai( "-p", help="Store path (default: from config)", ), - server: Optional[str] = typer.Option( + remote: Optional[str] = typer.Option( None, - "--server", - "-s", + "--remote", + "-r", help="Remote store URL (overrides --path)", ), ) -> None: @@ -747,26 +738,17 @@ def fai( Note: Byte offset columns will be placeholder values since the collection may not correspond to any specific FASTA file layout. """ - store = _load_store(path, server=server) + store = _load_store(path, remote=remote) - # Ensure collection is loaded - _ensure_collection_loaded(store, digest) + try: + lvl2 = store.get_collection_level2(digest) + except Exception: + print_error(f"Collection not found: {digest}", EXIT_FAILURE) + return lines = [] - - # Find the collection and get its sequences - for coll in store.iter_collections(): - if coll.digest == digest: - for seq in coll.sequences: - m = seq.metadata - # FAI format: name, length, offset, linebases, linewidth - # Since we don't have a specific FASTA file, offset is 0 - # Using default line width of 80 - lines.append(f"{m.name}\t{m.length}\t0\t80\t81") - break - - if not lines: - print_error(f"Collection not found: {digest}", EXIT_FAILURE) + for name, length in zip(lvl2["names"], lvl2["lengths"]): + lines.append(f"{name}\t{length}\t0\t80\t81") fai_content = "\n".join(lines) if lines: @@ -798,10 +780,10 @@ def chrom_sizes( "-p", help="Store path (default: from config)", ), - server: Optional[str] = typer.Option( + remote: Optional[str] = typer.Option( None, - "--server", - "-s", + "--remote", + "-r", help="Remote store URL (overrides --path)", ), ) -> None: @@ -810,23 +792,17 @@ def chrom_sizes( Outputs UCSC-compatible chrom.sizes format (tab-separated name/length). 
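Both index formats derive from the same level-2 lookup, roughly as in this sketch (the digest is a placeholder). The .fai offset/linebases/linewidth columns are fixed placeholders because the store has no single FASTA layout to report:

from refget.store import RefgetStore

store = RefgetStore.open_local("/path/to/store")
lvl2 = store.get_collection_level2("COLLECTION_DIGEST_PLACEHOLDER")

# chrom.sizes: name<TAB>length, one line per sequence.
chrom_sizes = "\n".join(
    f"{name}\t{length}" for name, length in zip(lvl2["names"], lvl2["lengths"])
)

# .fai: name, length, then placeholder offset/linebases/linewidth columns.
fai = "\n".join(
    f"{name}\t{length}\t0\t80\t81" for name, length in zip(lvl2["names"], lvl2["lengths"])
)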
""" - store = _load_store(path, server=server) + store = _load_store(path, remote=remote) - # Ensure collection is loaded - _ensure_collection_loaded(store, digest) + try: + lvl2 = store.get_collection_level2(digest) + except Exception: + print_error(f"Collection not found: {digest}", EXIT_FAILURE) + return lines = [] - - # Find the collection and get its sequences - for coll in store.iter_collections(): - if coll.digest == digest: - for seq in coll.sequences: - m = seq.metadata - lines.append(f"{m.name}\t{m.length}") - break - - if not lines: - print_error(f"Collection not found: {digest}", EXIT_FAILURE) + for name, length in zip(lvl2["names"], lvl2["lengths"]): + lines.append(f"{name}\t{length}") sizes_content = "\n".join(lines) if lines: @@ -848,10 +824,10 @@ def stats( "-p", help="Store path (default: from config)", ), - server: Optional[str] = typer.Option( + remote: Optional[str] = typer.Option( None, - "--server", - "-s", + "--remote", + "-r", help="Remote store URL (overrides --path)", ), ) -> None: @@ -863,7 +839,7 @@ def stats( Example output: {"collections": 3, "sequences": 75, "storage_mode": "Encoded"} """ - store = _load_store(path, server=server) + store = _load_store(path, remote=remote) stats_obj = store.stats() @@ -889,54 +865,12 @@ def stats( stats_dict["collections"] = int(stats_dict["collections"]) else: # Fallback: count collections ourselves - stats_dict["collections"] = len(store.list_collections()) + stats_dict["collections"] = store.list_collections()["pagination"]["total"] print_json(stats_dict) raise typer.Exit(EXIT_SUCCESS) -def _remove_collection_from_store(store_path: Path, digest: str) -> bool: - """ - Remove a collection from the store by manipulating store files. - - gtars RefgetStore doesn't provide a remove_collection method, so we - implement it by modifying the collections index file directly. - - Args: - store_path: Path to the store directory - digest: Collection digest to remove - - Returns: - True if removed, False if not found - """ - # Validate digest to prevent path traversal - if "/" in digest or "\\" in digest or ".." in digest: - return False - - # Remove from collections index (TSV file) - collections_idx = store_path / "collections.rgci" - if collections_idx.exists(): - lines = collections_idx.read_text().splitlines() - new_lines = [] - found = False - for line in lines: - if line.startswith("#") or not line.strip(): - new_lines.append(line) - elif line.startswith(digest + "\t"): - found = True # Skip this line (remove it) - else: - new_lines.append(line) - if found: - collections_idx.write_text("\n".join(new_lines) + "\n" if new_lines else "") - - # Remove the collection's .rgsi file - collection_file = store_path / "collections" / f"{digest}.rgsi" - if collection_file.exists(): - collection_file.unlink() - - return True - - @app.command() def remove( digest: str = typer.Argument( @@ -958,15 +892,11 @@ def remove( with other collections. 
""" store = _load_store(path) - store_path = _get_store_path(path) - # Check if collection exists - if digest not in _get_collection_digests(store): + removed = store.remove_collection(digest) + if not removed: print_error(f"Collection not found: {digest}", EXIT_FAILURE) - # Remove the collection by manipulating store files - _remove_collection_from_store(store_path, digest) - print_json( { "digest": digest, @@ -974,3 +904,379 @@ def remove( } ) raise typer.Exit(EXIT_SUCCESS) + + +@app.command() +def metadata( + digest: str = typer.Argument(help="Collection digest"), + path: Optional[Path] = typer.Option(None, "--path", "-p", help="Store path"), +): + """Show FHR metadata for a collection.""" + store = _load_store(path) + fhr = store.get_fhr_metadata(digest) + if fhr is None: + print_error(f"No FHR metadata for collection {digest}", EXIT_FAILURE) + import json + + print(json.dumps(fhr.to_dict(), indent=2)) + raise typer.Exit(EXIT_SUCCESS) + + +@app.command("metadata-set") +def metadata_set( + digest: str = typer.Argument(help="Collection digest"), + file: Path = typer.Argument(help="Path to FHR JSON file"), + path: Optional[Path] = typer.Option(None, "--path", "-p", help="Store path"), +): + """Set FHR metadata for a collection from a JSON file.""" + store = _load_store(path) + store.load_fhr_metadata(digest, str(file)) + print(f"Set FHR metadata for collection {digest}") + raise typer.Exit(EXIT_SUCCESS) + + +@app.command("crate") +def crate( + path: Optional[Path] = typer.Option( + None, + "--path", + "-p", + help="Store path (default: from config)", + ), + name: str = typer.Option( + ..., + "--name", + "-n", + help="Name for the RO-Crate root dataset", + ), + description: Optional[str] = typer.Option( + None, + "--description", + "-d", + help="Description of the store", + ), + author: Optional[str] = typer.Option( + None, + "--author", + "-a", + help='Author in "Name " format, e.g. "Jane Doe "', + ), + license: Optional[str] = typer.Option( + None, + "--license", + "-l", + help="License URL, e.g. https://creativecommons.org/publicdomain/zero/1.0/", + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output path (default: /ro-crate-metadata.json)", + ), +) -> None: + """Generate an RO-Crate metadata file for a RefgetStore. + + Creates a ro-crate-metadata.json describing the store as a FAIR + research object, including structure, provenance, and statistics. 
+ + Examples: + refget store crate --path /store --name "My genomes" --author "J Doe " + refget store crate -p /store -n "Store" -l https://creativecommons.org/publicdomain/zero/1.0/ + """ + import json + import re + from datetime import datetime, timezone + + from refget._version import __version__ + + store = _load_store(path) + store_path = _get_store_path(path) + + # Gather stats + stats_obj = store.stats() + stats_dict = {} + if hasattr(stats_obj, "__iter__"): + for key, value in stats_obj.items(): + stats_dict[key] = value + elif hasattr(stats_obj, "__dict__"): + stats_dict = vars(stats_obj) + + storage_mode = stats_dict.get("storage_mode", "Unknown") + seq_count = int(stats_dict.get("n_sequences", stats_dict.get("sequences", 0))) + + # Count collections + try: + coll_count = store.list_collections()["pagination"]["total"] + except Exception: + coll_count = 0 + + # Build the @graph + now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + today = datetime.now(timezone.utc).strftime("%Y-%m-%d") + + graph = [ + # Metadata descriptor + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "conformsTo": [ + {"@id": "https://w3id.org/ro/crate/1.2"}, + {"@id": "https://w3id.org/ga4gh/refget/refgetstore-crate/0.1"}, + ], + "about": {"@id": "./"}, + }, + ] + + # Root dataset + root = { + "@id": "./", + "@type": "Dataset", + "name": name, + "datePublished": today, + "conformsTo": {"@id": "https://w3id.org/ga4gh/refget/refgetstore-crate/0.1"}, + "hasPart": [ + {"@id": "rgstore.json"}, + {"@id": "sequences.rgsi"}, + {"@id": "sequences/"}, + {"@id": "collections/"}, + ], + "additionalProperty": [ + {"@id": "#prop-storageMode"}, + {"@id": "#prop-sequenceCount"}, + {"@id": "#prop-collectionCount"}, + {"@id": "#prop-refgetDigestAlgorithm"}, + ], + } + if description: + root["description"] = description + if license: + root["license"] = {"@id": license} + if author: + # Parse "Name " format + match = re.match(r"^(.+?)\s*<(.+?)>\s*$", author) + if match: + author_name = match.group(1).strip() + author_url = match.group(2).strip() + root["author"] = {"@id": author_url} + else: + author_name = author.strip() + author_url = None + root["author"] = {"@id": f"#author-{author_name.replace(' ', '-').lower()}"} + + # Add aliases/ if it exists + aliases_path = store_path / "aliases" + if aliases_path.exists() and aliases_path.is_dir(): + root["hasPart"].append({"@id": "aliases/"}) + + graph.append(root) + + # Data entities + graph.extend( + [ + { + "@id": "rgstore.json", + "@type": "File", + "name": "Store configuration", + "description": "Operational configuration for RefgetStore: path templates, storage mode, format version.", + "encodingFormat": "application/json", + }, + { + "@id": "sequences.rgsi", + "@type": "File", + "name": "Master sequence index", + "description": "Tab-separated index of all sequences in the store with names, lengths, alphabets, and GA4GH digests.", + "encodingFormat": "text/tab-separated-values", + }, + { + "@id": "sequences/", + "@type": "Dataset", + "name": "Sequence data", + "description": "Content-addressable sequence files organized by digest prefix.", + }, + { + "@id": "collections/", + "@type": "Dataset", + "name": "Sequence collections", + "description": "GA4GH sequence collection metadata. 
Each .rgsi file defines a collection with its member sequences and digests.", + }, + ] + ) + + if aliases_path.exists() and aliases_path.is_dir(): + graph.append( + { + "@id": "aliases/", + "@type": "Dataset", + "name": "Alias namespaces", + "description": "Human-readable name mappings for sequences and collections.", + } + ) + + # PropertyValue entities + graph.extend( + [ + { + "@id": "#prop-storageMode", + "@type": "PropertyValue", + "propertyID": "storageMode", + "name": "Storage Mode", + "value": storage_mode, + }, + { + "@id": "#prop-sequenceCount", + "@type": "PropertyValue", + "propertyID": "sequenceCount", + "name": "Sequence Count", + "value": seq_count, + }, + { + "@id": "#prop-collectionCount", + "@type": "PropertyValue", + "propertyID": "collectionCount", + "name": "Collection Count", + "value": coll_count, + }, + { + "@id": "#prop-refgetDigestAlgorithm", + "@type": "PropertyValue", + "propertyID": "refgetDigestAlgorithm", + "name": "Refget Digest Algorithm", + "value": "sha512t24u", + }, + ] + ) + + # CreateAction provenance + graph.extend( + [ + { + "@id": "#crate-creation", + "@type": "CreateAction", + "name": "Generate RO-Crate metadata for RefgetStore", + "endTime": now, + "instrument": {"@id": "#refget-software"}, + "result": {"@id": "./"}, + }, + { + "@id": "#refget-software", + "@type": "SoftwareApplication", + "name": "refget", + "version": __version__, + "url": "https://github.com/refgenie/refget", + "description": "Python package implementing GA4GH refget standards for sequences and sequence collections.", + }, + ] + ) + + # Add agent to CreateAction if author provided + if author: + graph[-2]["agent"] = root["author"] + + # Profile entity + graph.append( + { + "@id": "https://w3id.org/ga4gh/refget/refgetstore-crate/0.1", + "@type": ["CreativeWork", "Profile"], + "name": "RefgetStore RO-Crate Profile", + "version": "0.1", + "description": "Profile for RO-Crates containing GA4GH RefgetStore sequence databases.", + } + ) + + # Author entity + if author: + match = re.match(r"^(.+?)\s*<(.+?)>\s*$", author) + if match: + graph.append( + { + "@id": author_url, + "@type": "Person", + "name": author_name, + } + ) + else: + graph.append( + { + "@id": root["author"]["@id"], + "@type": "Person", + "name": author_name, + } + ) + + # License entity + if license: + graph.append( + { + "@id": license, + "@type": "CreativeWork", + "name": license.rstrip("/").split("/")[-1] or "License", + } + ) + + crate = { + "@context": "https://w3id.org/ro/crate/1.2/context", + "@graph": graph, + } + + # Write output + output_path = output or (store_path / "ro-crate-metadata.json") + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(crate, indent=2) + "\n") + + print_json( + { + "output": str(output_path), + "status": "created", + "entities": len(graph), + } + ) + raise typer.Exit(EXIT_SUCCESS) + + +@app.command("serve") +def serve( + path: Optional[Path] = typer.Option(None, "--path", "-p", help="Local store path"), + remote: Optional[str] = typer.Option( + None, "--remote", "-r", help="Remote store URL (e.g. s3://bucket/store/)" + ), + port: int = typer.Option(8000, "--port", help="Port to serve on"), + host: str = typer.Option("0.0.0.0", "--host", help="Host to bind to"), +): + """Serve a seqcol API backed by a RefgetStore (no database required). 
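Programmatically, the command below amounts to wiring a RefgetStoreBackend into a FastAPI app, as in this sketch (store path, host, and port are placeholders):

import uvicorn
from fastapi import FastAPI

from refget.backend import RefgetStoreBackend
from refget.router import create_refget_router
from refget.store import RefgetStore

store = RefgetStore.open_local("/path/to/store")
app = FastAPI(title="Sequence Collections API (Store-backed)")
app.state.backend = RefgetStoreBackend(store)
app.include_router(create_refget_router(sequences=False, pangenomes=False))

uvicorn.run(app, host="0.0.0.0", port=8000)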
+ + Examples: + refget store serve --path /path/to/store --port 8000 + refget store serve --remote s3://bucket/store/ --port 8000 + """ + try: + import uvicorn + except ImportError: + print_error("uvicorn is required: pip install uvicorn", EXIT_FAILURE) + + from refget.backend import RefgetStoreBackend + + if remote: + store = _load_store(path=None, remote=remote) + elif path: + store = _load_store(path) + else: + store = _load_store(None) + + backend = RefgetStoreBackend(store) + + from fastapi import FastAPI + + from refget.router import create_refget_router + + app = FastAPI(title="Sequence Collections API (Store-backed)") + app.state.backend = backend + router = create_refget_router( + sequences=False, + pangenomes=False, + refget_store_url=remote, + ) + app.include_router(router) + + typer.echo(f"Serving store-backed seqcol API on {host}:{port}") + uvicorn.run(app, host=host, port=port) + raise typer.Exit(EXIT_SUCCESS) diff --git a/refget/clients.py b/refget/clients.py index 0e0030b..6ad9196 100644 --- a/refget/clients.py +++ b/refget/clients.py @@ -1,8 +1,13 @@ +from __future__ import annotations + import logging import re +from typing import TYPE_CHECKING, Optional + import requests -from typing import Optional +if TYPE_CHECKING: + from .store import RefgetStore _LOGGER = logging.getLogger(__name__) @@ -199,9 +204,9 @@ def download_fasta_to_store( ImportError: If gtars/RefgetStore is not available Example: - >>> from refget.store import RefgetStore, StorageMode + >>> from refget.store import RefgetStore >>> from refget.clients import SequenceCollectionClient - >>> store = RefgetStore(StorageMode.Encoded) + >>> store = RefgetStore.in_memory() >>> client = SequenceCollectionClient() >>> collection_digest = client.download_fasta_to_store("abc123", store) >>> # Now you can retrieve sequences by digest from the local store @@ -440,7 +445,7 @@ def get_refget_store(self, cache_dir: str) -> "RefgetStore": except ImportError: raise ImportError("gtars is required: pip install gtars") - return RefgetStore.load_remote(cache_dir, url) + return RefgetStore.open_remote(cache_dir, url) class PangenomeClient(RefgetClient): @@ -597,17 +602,17 @@ def download_to_store( ImportError: If gtars/RefgetStore is not available Example: - >>> from refget.store import RefgetStore, StorageMode - >>> store = RefgetStore(StorageMode.Encoded) + >>> from refget.store import RefgetStore + >>> store = RefgetStore.in_memory() >>> client = FastaDrsClient() >>> collection_digest = client.download_to_store("abc123", store) """ - import tempfile import os + import tempfile # Verify store is available try: - from .store import RefgetStore as RefgetStoreClass + from .store import RefgetStore as RefgetStoreClass # noqa: F401 except ImportError: raise ImportError("gtars is required for download_to_store functionality") @@ -627,7 +632,7 @@ def download_to_store( _LOGGER.info(f"Downloaded FASTA to {downloaded_path}") # Import into store - store.import_fasta(downloaded_path) + store.add_sequence_collection_from_fasta(downloaded_path) _LOGGER.info(f"Imported FASTA into RefgetStore: {digest}") return digest diff --git a/refget/compliance.py b/refget/compliance.py new file mode 100644 index 0000000..24d2d84 --- /dev/null +++ b/refget/compliance.py @@ -0,0 +1,547 @@ +""" +GA4GH SeqCol API Compliance Suite. + +This is THE canonical compliance suite. It can be run two ways: +1. Via pytest: tests/api/test_compliance.py wraps these checks +2. 
Via web UI: /compliance/stream endpoint streams results in real-time + +All check functions take an api_root URL and raise AssertionError on failure. +The runner functions execute checks and return structured results. + +Test data is loaded from test_fasta/test_fasta_digests.json and +tests/api/comparison/ fixture files relative to the repository root. +""" + +import json +import logging +import time +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from pathlib import Path + +import requests + +_LOGGER = logging.getLogger(__name__) + +COMPLIANCE_TIMEOUT = 3 # seconds per request + +# ============================================================ +# Test data -- loaded from repository fixtures +# ============================================================ + +REPO_ROOT = Path(__file__).parent.parent +_DIGESTS_FILE = REPO_ROOT / "test_fasta" / "test_fasta_digests.json" +_COMPARISON_DIR = REPO_ROOT / "tests" / "api" / "comparison" + +# Lazy-loaded — these are only needed when running compliance checks +DIGEST_DATA = None +DIGEST_TESTS = None +COMPARISON_FILES = None +COMPARISON_FIXTURES = None + + +def _load_test_data(): + global DIGEST_DATA, DIGEST_TESTS, COMPARISON_FILES, COMPARISON_FIXTURES + if DIGEST_DATA is not None: + return + if not _DIGESTS_FILE.exists(): + raise FileNotFoundError( + f"Compliance test data not found at {_DIGESTS_FILE}. " + "This is expected when refget is pip-installed. " + "Clone the repo to run compliance tests." + ) + with open(_DIGESTS_FILE) as f: + DIGEST_DATA = json.load(f) + DIGEST_TESTS = [(name, bundle) for name, bundle in DIGEST_DATA.items()] + COMPARISON_FILES = [ + _COMPARISON_DIR / "compare_base.fa_subset.fa.json", + _COMPARISON_DIR / "compare_base.fa_different_names.fa.json", + _COMPARISON_DIR / "compare_base.fa_different_order.fa.json", + _COMPARISON_DIR / "compare_base.fa_pair_swap.fa.json", + _COMPARISON_DIR / "compare_base.fa_swap_wo_coords.fa.json", + ] + COMPARISON_FIXTURES = {} + for fp in COMPARISON_FILES: + with open(fp) as fh: + COMPARISON_FIXTURES[fp.name] = json.load(fh) + + +# ============================================================ +# Result types +# ============================================================ + + +@dataclass +class CheckResult: + """Result of a single compliance check.""" + + name: str + passed: bool + duration_ms: float + description: str | None = None + message: str | None = None + error: str | None = None + + +@dataclass +class ComplianceReport: + """Full compliance report for a server.""" + + server_url: str + timestamp: str + total: int = 0 + passed: int = 0 + failed: int = 0 + errors: int = 0 + results: list[dict] = field(default_factory=list) + + def to_dict(self) -> dict: + return asdict(self) + + +def _timed_check(name: str, func, *args, **kwargs) -> CheckResult: + """Run a check function and capture timing and errors.""" + description = (func.__doc__ or "").strip().split("\n")[0] or None + start = time.monotonic() + try: + func(*args, **kwargs) + elapsed = (time.monotonic() - start) * 1000 + return CheckResult( + name=name, passed=True, duration_ms=round(elapsed, 2), description=description + ) + except AssertionError as e: + elapsed = (time.monotonic() - start) * 1000 + return CheckResult( + name=name, + passed=False, + duration_ms=round(elapsed, 2), + description=description, + error=str(e), + ) + except requests.exceptions.RequestException as e: + elapsed = (time.monotonic() - start) * 1000 + return CheckResult( + name=name, + passed=False, + 
duration_ms=round(elapsed, 2), + description=description, + error=f"Connection error: {e}", + ) + except Exception as e: + elapsed = (time.monotonic() - start) * 1000 + return CheckResult( + name=name, + passed=False, + duration_ms=round(elapsed, 2), + description=description, + error=f"Unexpected error: {e}", + ) + + +# ============================================================ +# Structure checks -- validate response format +# ============================================================ + + +def check_service_info(api_root): + """Service-info returns required GA4GH fields and seqcol schema.""" + res = requests.get(f"{api_root}/service-info", timeout=COMPLIANCE_TIMEOUT) + data = res.json() + assert "id" in data, "service-info missing 'id' field" + assert "type" in data, "service-info missing 'type' field" + assert "group" in data["type"], "service-info type missing 'group'" + assert "artifact" in data["type"], "service-info type missing 'artifact'" + assert "version" in data["type"], "service-info type missing 'version'" + assert "seqcol" in data, "service-info must have 'seqcol' section" + assert "schema" in data["seqcol"], "seqcol section must include 'schema'" + schema = data["seqcol"]["schema"] + assert "properties" in schema, "schema must have 'properties'" + assert "lengths" in schema["properties"], "schema must define 'lengths'" + assert "names" in schema["properties"], "schema must define 'names'" + assert "sequences" in schema["properties"], "schema must define 'sequences'" + + +def check_list_collections(api_root): + """List collections returns paginated results with total count.""" + res = requests.get(f"{api_root}/list/collection", timeout=COMPLIANCE_TIMEOUT) + data = res.json() + assert "results" in data, "list/collection missing 'results' field" + assert isinstance(data["results"], list), "list/collection 'results' should be a list" + assert "pagination" in data, "list/collection missing 'pagination' field" + assert "page" in data["pagination"], "pagination missing 'page'" + assert "page_size" in data["pagination"], "pagination missing 'page_size'" + assert "total" in data["pagination"], "pagination must include 'total' per GA4GH spec" + assert isinstance(data["pagination"]["total"], int), "pagination 'total' must be an integer" + + +def check_list_attributes(api_root, attribute_name): + """List attributes endpoint returns paginated results.""" + res = requests.get(f"{api_root}/list/attributes/{attribute_name}", timeout=COMPLIANCE_TIMEOUT) + data = res.json() + assert "results" in data, f"list/attributes/{attribute_name} missing 'results' field" + assert isinstance(data["results"], list), ( + f"list/attributes/{attribute_name} 'results' should be a list" + ) + + +def check_openapi_available(api_root): + """OpenAPI endpoint is available (RECOMMENDED by spec Section 3.6).""" + res = requests.get(f"{api_root}/openapi.json", timeout=COMPLIANCE_TIMEOUT) + assert res.status_code == 200, f"OpenAPI endpoint returned status {res.status_code}" + data = res.json() + assert "openapi" in data, "OpenAPI response missing 'openapi' field" + + +# ============================================================ +# Collection checks -- verify content against known test data +# ============================================================ + + +def check_collection_level1(api_root, fa_name, bundle): + """Level 1 response returns digest strings for all attributes.""" + digest = bundle["top_level_digest"] + res = requests.get(f"{api_root}/collection/{digest}?level=1", timeout=COMPLIANCE_TIMEOUT) + 
assert res.status_code == 200, ( + f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + ) + data = res.json() + for attr in ["names", "lengths", "sequences"]: + assert isinstance(data[attr], str), ( + f"Level 1 {attr} should be digest string, got {type(data[attr]).__name__}: {data[attr]}" + ) + assert data[attr] == bundle["level1"][attr], ( + f"Level 1 {attr} for {fa_name}: expected {bundle['level1'][attr]}, got {data[attr]}" + ) + assert "sorted_name_length_pairs" in data, "Level 1 missing sorted_name_length_pairs" + + +def check_collection_level2(api_root, fa_name, bundle): + """Level 2 response returns arrays matching expected content.""" + digest = bundle["top_level_digest"] + res = requests.get(f"{api_root}/collection/{digest}?level=2", timeout=COMPLIANCE_TIMEOUT) + assert res.status_code == 200, ( + f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + ) + data = res.json() + for attr in ["names", "lengths", "sequences"]: + assert isinstance(data[attr], list), ( + f"Level 2 {attr} should be array, got {type(data[attr]).__name__}" + ) + assert data[attr] == bundle["level2"][attr], ( + f"Level 2 {attr} for {fa_name}: expected {bundle['level2'][attr]}, got {data[attr]}" + ) + assert "sorted_name_length_pairs" not in data, ( + "Level 2 should not have sorted_name_length_pairs" + ) + + +def check_default_level_returns_level2(api_root, fa_name, bundle): + """Collection without ?level= param returns level 2 arrays (spec default).""" + digest = bundle["top_level_digest"] + res = requests.get(f"{api_root}/collection/{digest}", timeout=COMPLIANCE_TIMEOUT) + assert res.status_code == 200, ( + f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + ) + data = res.json() + for attr in ["names", "lengths", "sequences"]: + assert isinstance(data[attr], list), ( + f"Default level for {fa_name} {attr} should be array, got {type(data[attr]).__name__}" + ) + + +def check_sorted_name_length_pairs(api_root, fa_name, bundle): + """Level 1 sorted_name_length_pairs digest matches expected value.""" + digest = bundle["top_level_digest"] + res = requests.get(f"{api_root}/collection/{digest}?level=1", timeout=COMPLIANCE_TIMEOUT) + assert res.status_code == 200, ( + f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + ) + data = res.json() + expected = bundle["sorted_name_length_pairs_digest"] + actual = data.get("sorted_name_length_pairs") + assert actual == expected, f"SNLP for {fa_name}: expected {expected}, got {actual}" + + +# ============================================================ +# Attribute checks -- verify attribute retrieval +# ============================================================ + + +def check_attribute_retrieval(api_root, fa_name, bundle, attr_name): + """Attribute endpoint returns correct array for a known digest.""" + attr_digest = bundle["level1"][attr_name] + expected = bundle["level2"][attr_name] + res = requests.get( + f"{api_root}/attribute/collection/{attr_name}/{attr_digest}", timeout=COMPLIANCE_TIMEOUT + ) + assert res.status_code == 200, ( + f"Attribute {attr_name}/{attr_digest} returned HTTP {res.status_code} (expected 200)" + ) + actual = res.json() + assert actual == expected, ( + f"Attribute {attr_name} for {fa_name}: expected {expected}, got {actual}" + ) + + +def check_transient_attribute_not_served(api_root): + """Transient attributes (sorted_name_length_pairs) return 404 from /attribute.""" + bundle = DIGEST_TESTS[0][1] + digest = bundle["top_level_digest"] + level1 = requests.get( + 
f"{api_root}/collection/{digest}?level=1", timeout=COMPLIANCE_TIMEOUT + ).json() + snlp_digest = level1["sorted_name_length_pairs"] + res = requests.get( + f"{api_root}/attribute/collection/sorted_name_length_pairs/{snlp_digest}", + timeout=COMPLIANCE_TIMEOUT, + ) + assert res.status_code == 404, ( + "Transient attributes should not be served by /attribute endpoint" + ) + + +# ============================================================ +# List/filter checks -- verify filtering and pagination +# ============================================================ + + +def check_list_filter_by_attribute(api_root, fa_name, bundle, attr_name): + """List collections filtered by attribute digest returns the expected collection.""" + attr_digest = bundle["level1"][attr_name] + top_digest = bundle["top_level_digest"] + res = requests.get( + f"{api_root}/list/collection?{attr_name}={attr_digest}", timeout=COMPLIANCE_TIMEOUT + ) + assert res.status_code == 200, f"List filter returned HTTP {res.status_code}" + data = res.json() + assert "results" in data, "Filtered list missing 'results'" + assert top_digest in data["results"], ( + f"Collection {top_digest} not in results when filtering by {attr_name}={attr_digest} for {fa_name}. " + f"Got {len(data['results'])} results: {data['results'][:5]}" + ) + + +def check_list_multi_attribute_filter_and(api_root): + """Multiple filter attributes use AND logic (spec Section 3.4).""" + bundle = DIGEST_TESTS[0][1] + names_digest = bundle["level1"]["names"] + lengths_digest = bundle["level1"]["lengths"] + data = requests.get( + f"{api_root}/list/collection?names={names_digest}&lengths={lengths_digest}", + timeout=COMPLIANCE_TIMEOUT, + ).json() + assert bundle["top_level_digest"] in data["results"], ( + "AND filter should return base.fa collection" + ) + + +# ============================================================ +# Comparison checks -- verify comparison endpoint +# ============================================================ + + +def check_comparison(api_root, fixture_name, expected): + """GET comparison returns correct diff structure matching fixture data.""" + url = f"{api_root}/comparison/{expected['digests']['a']}/{expected['digests']['b']}" + res = requests.get(url, timeout=COMPLIANCE_TIMEOUT) + assert res.status_code == 200, f"Comparison returned HTTP {res.status_code} for {fixture_name}" + import refget + + actual = res.json() + assert refget.canonical_str(actual) == refget.canonical_str(expected), ( + f"Comparison mismatch for {fixture_name}.\n" + f" Expected attributes: {expected.get('attributes')}\n" + f" Got attributes: {actual.get('attributes')}" + ) + + +def check_comparison_structure(api_root): + """Comparison response has all required fields (digests, attributes, array_elements).""" + digest_a = DIGEST_TESTS[0][1]["top_level_digest"] + digest_b = DIGEST_TESTS[1][1]["top_level_digest"] + data = requests.get( + f"{api_root}/comparison/{digest_a}/{digest_b}", timeout=COMPLIANCE_TIMEOUT + ).json() + assert "digests" in data and "a" in data["digests"] and "b" in data["digests"] + assert "attributes" in data + assert "a_only" in data["attributes"] + assert "b_only" in data["attributes"] + assert "a_and_b" in data["attributes"] + assert "array_elements" in data + assert "a_count" in data["array_elements"] + assert "b_count" in data["array_elements"] + assert "a_and_b_count" in data["array_elements"] + assert "a_and_b_same_order" in data["array_elements"] + + +def check_comparison_same_order_values(api_root): + """Identical comparison: a_and_b_same_order 
values are all true.""" + digest = DIGEST_TESTS[0][1]["top_level_digest"] + data = requests.get( + f"{api_root}/comparison/{digest}/{digest}", timeout=COMPLIANCE_TIMEOUT + ).json() + same_order = data["array_elements"]["a_and_b_same_order"] + for attr, val in same_order.items(): + assert val is True or val is False or val is None, ( + f"a_and_b_same_order[{attr}] must be bool or null, got {type(val)}" + ) + assert val is True, f"Identical comparison: a_and_b_same_order[{attr}] should be true" + + +def check_comparison_post(api_root, fixture_name, expected): + """POST comparison with local seqcol body returns correct diff.""" + import refget + + digest_b = expected["digests"]["b"] + client = refget.SequenceCollectionClient(urls=[api_root]) + local_collection = client.get_collection(digest_b) + + digest_a = expected["digests"]["a"] + res = requests.post( + f"{api_root}/comparison/{digest_a}", + json=local_collection, + timeout=COMPLIANCE_TIMEOUT, + ) + assert res.status_code == 200, ( + f"Comparison POST returned HTTP {res.status_code} for {fixture_name}" + ) + data = res.json() + assert data["digests"]["a"] == expected["digests"]["a"], ( + f"POST digest a: expected {expected['digests']['a']}, got {data['digests']['a']}" + ) + assert data["attributes"] == expected["attributes"], ( + f"POST attributes for {fixture_name}: expected {expected['attributes']}, got {data['attributes']}" + ) + assert data["array_elements"] == expected["array_elements"], ( + f"POST array_elements for {fixture_name}: expected {expected['array_elements']}, got {data['array_elements']}" + ) + + +# ============================================================ +# Check registry -- builds the full compliance suite +# ============================================================ + + +def build_checks(api_root: str) -> list[tuple[str, callable, list]]: + """Build the complete list of compliance checks. + + Returns list of (name, function, args) tuples. 
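Typical batch usage of the runners defined below, as a sketch (the server URL is a placeholder):

from refget.compliance import run_compliance

report = run_compliance("https://example.com/seqcol")
print(f"{report['passed']}/{report['total']} checks passed")
for result in report["results"]:
    if not result["passed"]:
        print(result["name"], "->", result["error"])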
+ """ + _load_test_data() + checks = [] + + # Structure checks + checks.append(("service_info", check_service_info, [api_root])) + checks.append(("list_collections", check_list_collections, [api_root])) + for attr in ["lengths", "names", "sequences"]: + checks.append((f"list_attributes_{attr}", check_list_attributes, [api_root, attr])) + checks.append(("openapi_available", check_openapi_available, [api_root])) + + # Collection content checks (per FASTA file) + for fa_name, bundle in DIGEST_TESTS: + tag = fa_name.replace(".fa", "") + checks.append( + (f"collection_level1_{tag}", check_collection_level1, [api_root, fa_name, bundle]) + ) + checks.append( + (f"collection_level2_{tag}", check_collection_level2, [api_root, fa_name, bundle]) + ) + checks.append( + ( + f"default_level2_{tag}", + check_default_level_returns_level2, + [api_root, fa_name, bundle], + ) + ) + checks.append( + (f"snlp_digest_{tag}", check_sorted_name_length_pairs, [api_root, fa_name, bundle]) + ) + + # Attribute retrieval checks (per FASTA, per attribute) + for fa_name, bundle in DIGEST_TESTS: + tag = fa_name.replace(".fa", "") + for attr in ["lengths", "names", "sequences"]: + checks.append( + ( + f"attribute_{attr}_{tag}", + check_attribute_retrieval, + [api_root, fa_name, bundle, attr], + ) + ) + + # Attribute filtering checks + checks.append( + ("transient_attribute_not_served", check_transient_attribute_not_served, [api_root]) + ) + checks.append( + ("multi_attribute_filter_and", check_list_multi_attribute_filter_and, [api_root]) + ) + + # List filter checks (base.fa, filter by each attribute) + base_name, base_bundle = DIGEST_TESTS[0] + for attr in ["lengths", "names", "sequences"]: + checks.append( + ( + f"list_filter_{attr}", + check_list_filter_by_attribute, + [api_root, base_name, base_bundle, attr], + ) + ) + + # Comparison checks + checks.append(("comparison_structure", check_comparison_structure, [api_root])) + checks.append(("comparison_same_order", check_comparison_same_order_values, [api_root])) + + for fixture_name, expected in COMPARISON_FIXTURES.items(): + tag = fixture_name.replace("compare_", "").replace(".json", "") + checks.append((f"comparison_{tag}", check_comparison, [api_root, fixture_name, expected])) + checks.append( + (f"comparison_post_{tag}", check_comparison_post, [api_root, fixture_name, expected]) + ) + + return checks + + +# ============================================================ +# Runners -- batch and streaming +# ============================================================ + + +def run_compliance(api_root: str) -> dict: + """Run all compliance checks and return a report dict.""" + api_root = api_root.rstrip("/") + report = ComplianceReport( + server_url=api_root, + timestamp=datetime.now(timezone.utc).isoformat(), + ) + + for name, func, args in build_checks(api_root): + result = _timed_check(name, func, *args) + report.results.append(asdict(result)) + report.total += 1 + if result.passed: + report.passed += 1 + else: + report.failed += 1 + + return report.to_dict() + + +def run_compliance_stream(api_root: str): + """Generator that yields each check result as a JSON string for SSE streaming.""" + api_root = api_root.rstrip("/") + checks = build_checks(api_root) + + yield json.dumps({"type": "start", "total": len(checks), "server_url": api_root}) + + passed = 0 + failed = 0 + for name, func, args in checks: + result = _timed_check(name, func, *args) + if result.passed: + passed += 1 + else: + failed += 1 + yield json.dumps({"type": "result", **asdict(result)}) + + yield 
json.dumps({"type": "done", "passed": passed, "failed": failed, "total": len(checks)}) diff --git a/refget/const.py b/refget/const.py index 68104fe..66f3175 100644 --- a/refget/const.py +++ b/refget/const.py @@ -1,5 +1,5 @@ -import os import logging +import os _LOGGER = logging.getLogger(__name__) diff --git a/refget/digests.py b/refget/digests.py index b72ba0c..6ffa265 100644 --- a/refget/digests.py +++ b/refget/digests.py @@ -4,9 +4,8 @@ When gtars is not available, falls back to pure Python implementations (slower). """ -import hashlib import base64 - +import hashlib from typing import Callable, Union from .const import GTARS_INSTALLED @@ -34,7 +33,7 @@ def py_md5_digest(seq) -> str: # Default exports - use gtars if available, else Python fallback if GTARS_INSTALLED: - from gtars.refget import sha512t24u_digest, md5_digest + from gtars.refget import md5_digest, sha512t24u_digest else: sha512t24u_digest = py_sha512t24u_digest md5_digest = py_md5_digest diff --git a/refget/examples.py b/refget/examples.py index 94ac812..064c30b 100644 --- a/refget/examples.py +++ b/refget/examples.py @@ -1,7 +1,7 @@ # Models # Used for documentation examples in OpenAPI -from fastapi import Path, Body +from fastapi import Body, Path example_digest = Path( ..., diff --git a/refget/middleware.py b/refget/middleware.py new file mode 100644 index 0000000..f9e0c87 --- /dev/null +++ b/refget/middleware.py @@ -0,0 +1,59 @@ +""" +Middleware for store-backed seqcolapi deployments. + +StoreFreshnessMiddleware periodically checks if the remote store has changed +(via rgstore.json digest) and reloads the backend when new data is available. +""" + +import json +import logging +import time +import urllib.request + +from starlette.middleware.base import BaseHTTPMiddleware + +_LOGGER = logging.getLogger(__name__) + + +class StoreFreshnessMiddleware(BaseHTTPMiddleware): + """On each request, if >N seconds since last check, fetch rgstore.json + and compare collections_digest. If changed, re-open the store and + swap the backend. 
Lazy, request-triggered, no background threads.""" + + def __init__(self, app, store_url: str, cache_dir: str, check_interval: int = 300): + super().__init__(app) + self.store_url = store_url + self.cache_dir = cache_dir + self.check_interval = check_interval + self.last_check = time.time() + self.last_digest = None + + async def dispatch(self, request, call_next): + now = time.time() + if now - self.last_check > self.check_interval: + self.last_check = now + self._check_and_reload(request.app) + return await call_next(request) + + def _check_and_reload(self, app): + try: + metadata = self._fetch_metadata() + digest = metadata.get("collections_digest") + if digest and digest != self.last_digest: + self.last_digest = digest + self._reload_backend(app) + except Exception as e: + _LOGGER.warning(f"Store freshness check failed: {e}") + + def _fetch_metadata(self) -> dict: + url = self.store_url.rstrip("/") + "/rgstore.json" + with urllib.request.urlopen(url) as resp: + return json.loads(resp.read()) + + def _reload_backend(self, app): + from refget.backend import RefgetStoreBackend + from refget.store import RefgetStore + + _LOGGER.info(f"Store changed, reloading from {self.store_url}") + store = RefgetStore.open_remote(self.cache_dir, self.store_url) + app.state.backend = RefgetStoreBackend(store) diff --git a/refget/models.py b/refget/models.py index 5476e86..b6e4e87 100644 --- a/refget/models.py +++ b/refget/models.py @@ -2,12 +2,11 @@ import logging from copy import copy from datetime import datetime, timezone -from sqlalchemy.types import TypeDecorator -from sqlmodel import Field, SQLModel, Column, Relationship -from sqlmodel import JSON -from typing import List, Optional, Dict, Any, Literal, TYPE_CHECKING -from pydantic import BaseModel, field_validator, field_serializer +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional +from pydantic import BaseModel, field_serializer, field_validator +from sqlalchemy.types import TypeDecorator +from sqlmodel import JSON, Column, Field, Relationship, SQLModel from .digests import sha512t24u_digest @@ -38,19 +37,18 @@ def _serialize_item(self, item): return item -from .const import ( +from .const import ( # noqa: E402 DEFAULT_INHERENT_ATTRS, - DEFAULT_PASSTHRU_ATTRS, - SEQCOL_SCHEMA_PATH, GTARS_INSTALLED, + SEQCOL_SCHEMA_PATH, ) -from .exceptions import InvalidSeqColError -from .utils import ( - canonical_str, +from .exceptions import InvalidSeqColError # noqa: E402 +from .utils import ( # noqa: E402 build_name_length_pairs, - seqcol_dict_to_level1_dict, - level1_dict_to_seqcol_digest, + canonical_str, fasta_to_seqcol_dict, + level1_dict_to_seqcol_digest, + seqcol_dict_to_level1_dict, ) _LOGGER = logging.getLogger(__name__) @@ -71,9 +69,9 @@ def create_fasta_drs_object(fasta_file: str, digest: str = None) -> "FastaDrsObj Raises: ImportError: If gtars is not installed (required for FASTA processing) """ - import os import hashlib - from datetime import datetime, timezone + import os + from datetime import datetime if not GTARS_INSTALLED: raise ImportError( diff --git a/refget/router.py b/refget/router.py index 96dbd9b..486c692 100644 --- a/refget/router.py +++ b/refget/router.py @@ -6,25 +6,24 @@ This router does not supply the /service-info endpoint, which should be created by the main app. 
-To use, first import it, then attach it to the app, -then create a dbagent object to connect to the database, -and attach it to the app state like this: +To use, import the router and setup_backend, then wire them up: -from refget.router import create_refget_router -from refget.agents import RefgetDBAgent +from refget.router import create_refget_router, setup_backend router = create_refget_router(sequences=False, collections=True, pangenomes=False) app.include_router(router, prefix="/seqcol") -app.state.dbagent = RefgetDBAgent() +setup_backend(app, store=my_store) # RefgetStore backend (no database) +# OR: setup_backend(app, engine=engine) # PostgreSQL via RefgetDBAgent """ import logging -from fastapi import APIRouter, Response, HTTPException, Request, Depends -from .models import Similarities, PaginationResult, PaginatedDigestList -from .agents import RefgetDBAgent +from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response +from fastapi.responses import StreamingResponse +from .backend import SeqColBackend from .examples import * +from .models import PaginatedDigestList, Similarities _LOGGER = logging.getLogger(__name__) @@ -35,9 +34,38 @@ _ROUTER_CONFIG: dict = {} -# dbagent is a RefgetDBAgent, which handles connection to the POSTGRES database -async def get_dbagent(request: Request) -> RefgetDBAgent: - return request.app.state.dbagent +def setup_backend(app, store=None, engine=None): + """Configure the seqcol backend on a FastAPI app. + + Pass a RefgetStore to serve from the store (no database needed). + The store is used directly (not converted to readonly) so it can lazy-load collections. + Pass a SQLAlchemy engine to serve from PostgreSQL via RefgetDBAgent. + """ + if store is not None: + from .backend import RefgetStoreBackend + + app.state.backend = RefgetStoreBackend(store) + elif engine is not None: + from .agents import RefgetDBAgent + + dbagent = RefgetDBAgent(engine=engine) + app.state.dbagent = dbagent + app.state.backend = dbagent + else: + raise ValueError("setup_backend requires either store or engine") + + +async def get_backend(request: Request) -> SeqColBackend: + """Get the SeqColBackend from the app state.""" + return request.app.state.backend + + +async def get_dbagent(request: Request): + """Get the RefgetDBAgent for DB-only endpoints. 
Raises HTTP 501 if no database backend is configured."""
+    dbagent = getattr(request.app.state, "dbagent", None)
+    if dbagent is None:
+        raise HTTPException(status_code=501, detail="This endpoint requires a database backend")
+    return dbagent
 
 
 def create_refget_router(
@@ -45,6 +73,7 @@
     collections: bool = True,
     pangenomes: bool = False,
     fasta_drs: bool = False,
+    compliance: bool = True,
     refget_store_url: str = None,
 ) -> APIRouter:
     """
@@ -85,6 +114,9 @@ def create_refget_router(
     if fasta_drs:
         _LOGGER.info("Adding FASTA DRS endpoints...")
         refget_router.include_router(fasta_drs_router, prefix="/fasta")
+    if compliance:
+        _LOGGER.info("Adding compliance endpoints...")
+        refget_router.include_router(compliance_router)
 
     return refget_router
 
@@ -98,10 +130,10 @@
     tags=["Retrieving data"],
 )
 async def sequence(
-    dbagent=Depends(get_dbagent),
     sequence_digest: str = example_sequence,
-    start: int = None,
-    end: int = None,
+    start: int | None = Query(None, description="Start position (0-based, inclusive)"),
+    end: int | None = Query(None, description="End position (0-based, exclusive)"),
+    dbagent=Depends(get_dbagent),
 ):
     return Response(content=dbagent.seq.get(sequence_digest, start, end), media_type="text/plain")
@@ -111,7 +143,7 @@ async def sequence(
     summary="Retrieve metadata for a sequence",
     tags=["Retrieving data"],
 )
-async def seq_metadata(dbagent=Depends(get_dbagent), sequence_digest: str = example_sequence):
+async def seq_metadata(sequence_digest: str = example_sequence, dbagent=Depends(get_dbagent)):
     raise HTTPException(status_code=501, detail="Metadata retrieval not yet implemented.")
@@ -124,13 +156,15 @@ async def seq_metadata(dbagent=Depends(get_dbagent), sequence_digest: str = exam
     tags=["Retrieving data"],
 )
 async def collection(
-    dbagent=Depends(get_dbagent),
     collection_digest: str = example_collection_digest,
-    level: int | None = None,
-    collated: bool = True,
-    attribute: str = None,
+    level: int | None = Query(None, description="Recursion depth (1 or 2)", ge=1, le=2),
+    collated: bool = Query(True, description="Return collated format (arrays) vs itemwise"),
+    attribute: str | None = Query(
+        None, description="Return only this attribute (e.g., 'names', 'lengths')"
+    ),
+    backend=Depends(get_backend),
 ):
-    if level == None:
+    if level is None:
         level = 2
     if level > 2:
         raise HTTPException(
@@ -139,16 +173,10 @@
         )
     try:
         if not collated:
-            return dbagent.seqcol.get(
-                collection_digest, return_format="itemwise", itemwise_limit=10000
-            )
+            return backend.get_collection_itemwise(collection_digest, limit=10000)
         if attribute:
-            return dbagent.seqcol.get(collection_digest, attribute=attribute)
-        if level == 1:
-            return dbagent.seqcol.get(collection_digest, return_format="level1")
-        if level == 2:
-            return dbagent.seqcol.get(collection_digest, return_format="level2")
-        return {"error": "Invalid level specified."}
+            return backend.get_collection_attribute(collection_digest, attribute)
+        return backend.get_collection(collection_digest, level=level)
     except ValueError as e:
         raise HTTPException(
             status_code=404,
@@ -162,18 +190,18 @@
     tags=["Retrieving data"],
 )
 async def attribute(
-    dbagent=Depends(get_dbagent),
     attribute_name: str = "names",
     attribute_digest: str = example_attribute_digest,
+    backend=Depends(get_backend),
 ):
     try:
-        return dbagent.attribute.get(attribute_name, attribute_digest)
-    except KeyError as e:
+        return backend.get_attribute(attribute_name, attribute_digest)
+    except KeyError:
         raise HTTPException(
             status_code=404,
             detail="Error: attribute not found. Check the attribute and try again.",
         )
-    except AttributeError as e:
+    except AttributeError:
         raise HTTPException(
             status_code=404,
             detail="Digest not found. Check the digest and try again.",
@@ -186,15 +214,15 @@ async def attribute(
     tags=["Comparing sequence collections"],
 )
 async def compare_2_digests(
-    dbagent=Depends(get_dbagent),
     collection_digest1: str = example_digest_hg38,
     collection_digest2: str = example_digest_hg38_primary,
+    backend=Depends(get_backend),
 ):
     _LOGGER.info("Comparing two digests...")
     result = {}
     result["digests"] = {"a": collection_digest1, "b": collection_digest2}
     try:
-        result.update(dbagent.compare_digests(collection_digest1, collection_digest2))
+        result.update(backend.compare_digests(collection_digest1, collection_digest2))
     except ValueError as e:
         _LOGGER.debug(e)
         raise HTTPException(
@@ -206,107 +234,75 @@
 
 @seqcol_router.post(
     "/similarities/{collection_digest}",
-    summary="Calculate Jaccard similarities between a single sequence collection in the database and all other collections in the database (by species)",
+    summary="Calculate Jaccard similarities between a sequence collection and all others",
    tags=["Comparing sequence collections"],
     response_model=Similarities,
 )
 async def calc_similarities(
     collection_digest: str,
-    species: str = "human",
-    page_size: int = 50,
-    page: int = 0,
-    dbagent=Depends(get_dbagent),
+    species: str = Query("human", description="Species/group to filter by"),
+    page_size: int = Query(50, description="Number of results per page"),
+    page: int = Query(0, description="Page number (0-indexed)"),
+    backend=Depends(get_backend),
 ) -> Similarities:
     _LOGGER.info("Calculating Jaccard similarities...")
     try:
-        seqcolA = dbagent.seqcol.get(digest=collection_digest)
-    except Exception as e:
-        _LOGGER.debug(f"Error fetching collection: {e}")
+        seqcolA = backend.get_collection(collection_digest, level=2)
+    except (ValueError, KeyError):
         raise HTTPException(status_code=404, detail="Collection not found")
-    return await calc_similarities_from_json(seqcolA, species, page_size, page, dbagent)
+    return await _compute_similarities(seqcolA, species, page_size, page, backend)
 
 
 @seqcol_router.post(
     "/similarities/",
-    summary="Calculate Jaccard similarities between input sequence collection and all collections in database",
+    summary="Calculate Jaccard similarities between input sequence collection and all collections",
     tags=["Comparing sequence collections"],
     response_model=Similarities,
 )
 async def calc_similarities_from_json(
     seqcolA: dict,
-    species: str = "human",
-    page_size: int = 50,
-    page: int = 0,
-    dbagent=Depends(get_dbagent),
+    species: str = Query("human", description="Species/group to filter by"),
+    page_size: int = Query(50, description="Number of results per page"),
+    page: int = Query(0, description="Page number (0-indexed)"),
+    backend=Depends(get_backend),
 ) -> Similarities:
-    """
-    Calculate Jaccard similarities between input sequence collection and all collections in DB.
-    Takes a JSON sequence collection directly instead of a digest.
-    Take output from: refget digest-fasta "yourfasta.fa" -l 2 > myoutput.json
+    return await _compute_similarities(seqcolA, species, page_size, page, backend)
 
-    Args:
-        seqcolA: Input sequence collection dictionary
-        species: Species to filter by ("human" or "mouse"), defaults to "human"
-        page_size: Number of results per page
-        page: Page number
-        dbagent: Database agent dependency
-    """
-    _LOGGER.info(
-        f"Calculating Jaccard similarities from input sequence collection for {species}..."
-    )
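Before the shared helper below, a sketch of calling these similarity endpoints from a client; the server URL and input file are hypothetical, and the level-2 JSON can be produced with refget digest-fasta as noted above:

```python
# Sketch: POST a level-2 seqcol document to /similarities/ (hypothetical server).
# Input can come from: refget digest-fasta "yourfasta.fa" -l 2 > myoutput.json
import json

import requests

api_root = "http://localhost:8100"  # hypothetical server URL
with open("myoutput.json") as f:
    seqcol_level2 = json.load(f)

resp = requests.post(
    f"{api_root}/similarities/",
    params={"species": "human", "page_size": 50, "page": 0},
    json=seqcol_level2,
)
resp.raise_for_status()
report = resp.json()  # a Similarities object: similarities + pagination
for entry in report["similarities"]:
    # field names follow the response model used here
    print(entry["digest"], entry["similarities"])
```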
+async def _compute_similarities(
+    seqcolA: dict,
+    species: str,
+    page_size: int,
+    page: int,
+    backend: SeqColBackend,
+) -> Similarities:
+    """Shared implementation for both similarity endpoints."""
     try:
-        # Validate species parameter
-        if species.lower() not in _SAMPLE_DIGESTS:
+        # Get target digests for species if configured
+        target_digests = _SAMPLE_DIGESTS.get(species.lower()) if _SAMPLE_DIGESTS else None
+
+        if not _SAMPLE_DIGESTS:
             raise HTTPException(
-                status_code=400,
-                detail=f"Invalid species '{species}'. Choose from: {list(_SAMPLE_DIGESTS.keys())}",
+                status_code=501,
+                detail="Similarities not configured. No scom_config.json found.",
             )
-
-        # Get pre-loaded digests for the species
-        target_digests = _SAMPLE_DIGESTS[species.lower()]
-        if not target_digests:
-            _LOGGER.warning(f"No pre-loaded digests found for {species}")
-            return Similarities(
-                similarities=[],
-                pagination=PaginationResult(page=page, page_size=page_size, total=0),
-                reference_digest=None,
-            )
-
-        _LOGGER.info(f"Using {len(target_digests)} pre-loaded digests for {species}")
-
-        # Use the modified get_many_level2_offset function with target_digests filter
-        results = dbagent.seqcol.get_many_level2_offset(
-            limit=page_size, offset=page * page_size, target_digests=target_digests
-        )
-
-        similarities = []
-        for key in results.results.keys():
-            human_readable_names = results.results[key]["human_readable_names"]
-            jaccard_sims = dbagent.calc_similarities_seqcol_dicts(seqcolA, results.results[key])
-            similarities.append(
-                {
-                    "digest": key,
-                    "human_readable_names": human_readable_names,
-                    "similarities": jaccard_sims,
-                }
+        if target_digests is None:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Invalid species '{species}'. 
Choose from: {list(_SAMPLE_DIGESTS.keys())}", ) - result = Similarities( - similarities=similarities, pagination=results.pagination, reference_digest=None + result = backend.compute_similarities( + seqcolA, page=page, page_size=page_size, target_digests=target_digests ) - + return Similarities(**result) except HTTPException: - # Re-raise HTTP exceptions raise except Exception as e: - _LOGGER.debug(f"Error in calc_similarities_from_json: {e}") + _LOGGER.debug(f"Error computing similarities: {e}") raise HTTPException(status_code=500, detail="Error calculating similarities") - return result - @seqcol_router.post( "/comparison/{collection_digest1}", @@ -314,9 +310,9 @@ async def calc_similarities_from_json( tags=["Comparing sequence collections"], ) async def compare_1_digest( - dbagent=Depends(get_dbagent), collection_digest1: str = example_digest_hg38, seqcolB: dict = example_hg38_sc, + backend=Depends(get_backend), ): _LOGGER.info("Comparing one digests and one POSTed seqcol...") _LOGGER.info(f"digest1: {collection_digest1}") @@ -324,7 +320,7 @@ async def compare_1_digest( result = {} result["digests"] = {"a": collection_digest1, "b": "POSTed seqcol"} try: - result.update(dbagent.compare_1_digest(collection_digest1, seqcolB)) + result.update(backend.compare_digest_with_level2(collection_digest1, seqcolB)) except ValueError as e: _LOGGER.debug(e) raise HTTPException( @@ -341,28 +337,35 @@ async def compare_1_digest( response_model=PaginatedDigestList, ) async def list_collections_by_offset( - request: Request, - dbagent=Depends(get_dbagent), - page_size: int = 100, - page: int = 0, + page_size: int = Query(100, description="Number of results per page"), + page: int = Query(0, description="Page number (0-indexed)"), + names: str | None = Query(None, description="Filter by names attribute digest"), + lengths: str | None = Query(None, description="Filter by lengths attribute digest"), + sequences: str | None = Query(None, description="Filter by sequences attribute digest"), + name_length_pairs: str | None = Query(None, description="Filter by name_length_pairs digest"), + sorted_sequences: str | None = Query(None, description="Filter by sorted_sequences digest"), + backend=Depends(get_backend), ): - # Extract all query params except pagination params - filters = {k: v for k, v in request.query_params.items() if k not in ["page", "page_size"]} - - if filters: - try: - # Multi-attribute filtering with AND logic - res = dbagent.seqcol.search_by_attributes( - filters, limit=page_size, offset=page * page_size - ) - except ValueError as e: - # Invalid attribute name - raise HTTPException(status_code=400, detail=str(e)) - else: - # No filters, return all collections - res = dbagent.seqcol.list_by_offset(limit=page_size, offset=page * page_size) + # Build filters from explicit parameters + filters = { + k: v + for k, v in { + "names": names, + "lengths": lengths, + "sequences": sequences, + "name_length_pairs": name_length_pairs, + "sorted_sequences": sorted_sequences, + }.items() + if v is not None + } - res["results"] = [x.digest for x in res["results"]] + try: + res = backend.list_collections(page=page, page_size=page_size, filters=filters or None) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + + # Normalize results to digest strings (DB backend returns model objects) + res["results"] = [x.digest if hasattr(x, "digest") else x for x in res["results"]] return res @@ -373,13 +376,14 @@ async def list_collections_by_offset( response_model=PaginatedDigestList, ) async 
def list_attributes( - dbagent=Depends(get_dbagent), attribute: str = "names", page_size: int = 100, page: int = 0 + backend=Depends(get_backend), + attribute: str = "names", + page_size: int = Query(100, description="Number of results per page"), + page: int = Query(0, description="Page number (0-indexed)"), ): try: - res = dbagent.attribute.list(attribute, limit=page_size, offset=page * page_size) - res["results"] = [x.digest for x in res["results"]] - return res - except KeyError as e: + return backend.list_attributes(attribute, page=page, page_size=page_size) + except KeyError: raise HTTPException( status_code=404, detail="Error: attribute not found. Check the attribute and try again.", @@ -397,7 +401,9 @@ async def list_attributes( response_model=PaginatedDigestList, ) async def list_cpangenomes_by_offset( - dbagent=Depends(get_dbagent), page_size: int = 100, page: int = 0 + dbagent=Depends(get_dbagent), + page_size: int = Query(100, description="Number of results per page"), + page: int = Query(0, description="Page number (0-indexed)"), ): res = dbagent.pangenome.list_by_offset(limit=page_size, offset=page * page_size) res["results"] = [x.digest for x in res["results"]] @@ -413,10 +419,10 @@ async def list_cpangenomes_by_offset( async def pangenome( dbagent=Depends(get_dbagent), pangenome_digest: str = example_pangenome_digest, - level: int | None = None, - collated: bool = True, + level: int | None = Query(None, description="Recursion depth (1-4)", ge=1, le=4), + collated: bool = Query(True, description="Return collated format (arrays) vs itemwise"), ): - if level == None: + if level is None: level = 2 try: if not collated: @@ -545,3 +551,69 @@ async def get_fasta_index( } except ValueError: raise HTTPException(status_code=404, detail="Object not found") + + +compliance_router = APIRouter() + + +@compliance_router.get( + "/compliance/run", + summary="Run compliance checks against a seqcol server", + tags=["Compliance"], +) +def run_compliance_endpoint( + request: Request, + target_url: str | None = Query( + None, description="Target server URL to test (defaults to self)" + ), +): + """ + Run GA4GH SeqCol compliance structure tests against a server. + + Only runs structure tests (service-info, list, pagination, collection structure). + Content tests that require specific test data are not included. + + If no target_url is provided, tests run against this server. + """ + from .compliance import run_compliance + + if target_url is None: + scheme = request.headers.get("x-forwarded-proto", request.url.scheme) + host = request.headers.get("host", request.url.netloc) + target_url = f"{scheme}://{host}" + + return run_compliance(target_url) + + +@compliance_router.get( + "/compliance/stream", + summary="Stream compliance checks via Server-Sent Events", + tags=["Compliance"], +) +def stream_compliance_endpoint( + request: Request, + target_url: str | None = Query( + None, description="Target server URL to test (defaults to self)" + ), +): + """ + Stream compliance check results in real-time via Server-Sent Events. + + Each event contains a JSON object with type "start", "result", or "done". 
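A sketch of a client for this stream, assuming the requests library and a hypothetical server URL; only the event framing (a `data: <json>` line per event, blank-line separated) and the type field values (start, result, done) come from the endpoint below:

```python
# Sketch: consume the compliance SSE stream and print results as they arrive.
import json

import requests

url = "http://localhost:8100/compliance/stream"  # hypothetical server URL
with requests.get(url, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line.startswith("data: "):
            continue  # blank lines separate events
        event = json.loads(line[len("data: "):])
        if event["type"] == "done":
            break
        print(event)
```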
+ """ + from .compliance import run_compliance_stream + + if target_url is None: + scheme = request.headers.get("x-forwarded-proto", request.url.scheme) + host = request.headers.get("host", request.url.netloc) + target_url = f"{scheme}://{host}" + + def event_stream(): + for data in run_compliance_stream(target_url): + yield f"data: {data}\n\n" + + return StreamingResponse( + event_stream(), + media_type="text/event-stream", + headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"}, + ) diff --git a/refget/store.py b/refget/store.py index 504c27b..41f79ee 100644 --- a/refget/store.py +++ b/refget/store.py @@ -3,20 +3,41 @@ This module re-exports the Rust-based gtars.refget components for local sequence collection storage and FASTA processing. + +RefgetStore also provides namespace-based alias management: + Sequence aliases: add_sequence_alias, get_sequence_by_alias, + get_aliases_for_sequence, list_sequence_alias_namespaces, + list_sequence_aliases, remove_sequence_alias, load_sequence_aliases + Collection aliases: add_collection_alias, get_collection_by_alias, + get_aliases_for_collection, list_collection_alias_namespaces, + list_collection_aliases, remove_collection_alias, load_collection_aliases """ from .const import GTARS_INSTALLED if GTARS_INSTALLED: - from gtars.refget import RefgetStore, digest_fasta, StorageMode + from gtars.refget import ( + RefgetStore, + SequenceCollection, + StorageMode, + compute_fai, + digest_fasta, + digest_sequence, + ) else: RefgetStore = None - digest_fasta = None StorageMode = None + digest_fasta = None + compute_fai = None + digest_sequence = None + SequenceCollection = None __all__ = [ "RefgetStore", "digest_fasta", "StorageMode", + "compute_fai", + "digest_sequence", + "SequenceCollection", "GTARS_INSTALLED", ] diff --git a/refget/utils.py b/refget/utils.py index 7c73799..c3b3e16 100644 --- a/refget/utils.py +++ b/refget/utils.py @@ -1,19 +1,19 @@ import json import logging - -from jsonschema import Draft7Validator from pathlib import Path from typing import Optional, Union +from jsonschema import Draft7Validator + from .const import ( - SeqColDict, DEFAULT_INHERENT_ATTRS, DEFAULT_PASSTHRU_ATTRS, - SEQCOL_SCHEMA_PATH, GTARS_INSTALLED, + SEQCOL_SCHEMA_PATH, + SeqColDict, ) +from .digests import DigestFunction, sha512t24u_digest from .exceptions import InvalidSeqColError -from .digests import sha512t24u_digest, DigestFunction _LOGGER = logging.getLogger(__name__) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt deleted file mode 100644 index 146a887..0000000 --- a/requirements/requirements-all.txt +++ /dev/null @@ -1,7 +0,0 @@ -jsonschema -gtars>=0.6.0 -pyyaml -requests -sqlmodel -tomli_w -typer>=0.9.0 diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt deleted file mode 100644 index 60c9958..0000000 --- a/requirements/requirements-dev.txt +++ /dev/null @@ -1 +0,0 @@ --e git+git://github.com/databio/henge@master#egg=henge \ No newline at end of file diff --git a/requirements/requirements-docs.txt b/requirements/requirements-docs.txt deleted file mode 100644 index b2a9546..0000000 --- a/requirements/requirements-docs.txt +++ /dev/null @@ -1,2 +0,0 @@ -https://github.com/refgenie/refget/archive/master.zip -https://github.com/databio/mkdocs-databio/archive/master.zip diff --git a/requirements/requirements-seqcolapi.txt b/requirements/requirements-seqcolapi.txt deleted file mode 100644 index bbd3811..0000000 --- a/requirements/requirements-seqcolapi.txt +++ /dev/null @@ -1,6 +0,0 @@ 
-fastapi -psycopg2-binary -refget -sqlmodel -uvicorn>=0.30.0 -ubiquerg>=0.6.1 diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt deleted file mode 100644 index aadcdae..0000000 --- a/requirements/requirements-test.txt +++ /dev/null @@ -1,3 +0,0 @@ --r requirements-all.txt -coveralls>=1.1 -pytest-cov>=6.0.0 \ No newline at end of file diff --git a/scripts/test-store-docker.sh b/scripts/test-store-docker.sh new file mode 100755 index 0000000..2dcd790 --- /dev/null +++ b/scripts/test-store-docker.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# Test the store-backed Docker image builds and starts correctly. +# +# Usage: ./scripts/test-store-docker.sh +# +# Builds a Docker image using LOCAL source code, compiles gtars from source +# if needed (cached as a wheel for subsequent runs), and verifies endpoints. + +set -e + +IMAGE_NAME="seqcolapi-store-test" +CONTAINER_NAME="seqcolapi-store-test" +PORT=8199 +STORE_URL="https://refgenie.s3.us-east-1.amazonaws.com/refget-store/jungle/" +GTARS_REPO="${GTARS_REPO:-$(cd "$(dirname "$0")/../../gtars" 2>/dev/null && pwd)}" +WHEEL_CACHE_DIR="${HOME}/.cache/seqcolapi-test-wheels" + +cleanup() { + echo "Cleaning up..." + docker stop "$CONTAINER_NAME" 2>/dev/null || true + docker rm "$CONTAINER_NAME" 2>/dev/null || true +} +trap cleanup EXIT + +# Build or find gtars wheel for cp311 +mkdir -p "$WHEEL_CACHE_DIR" +GTARS_WHEEL=$(find "$WHEEL_CACHE_DIR" -name "gtars*cp311*linux*.whl" 2>/dev/null | head -1) + +if [ -z "$GTARS_WHEEL" ] && [ -d "$GTARS_REPO" ]; then + echo "Building gtars wheel for Python 3.11 (one-time, cached)..." + docker run --rm -v "$GTARS_REPO:/src" -v "$WHEEL_CACHE_DIR:/wheels" \ + python:3.11-slim bash -c " + apt-get update -qq && apt-get install -y -qq curl gcc > /dev/null 2>&1 + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y > /dev/null 2>&1 + export PATH=\$HOME/.cargo/bin:\$PATH + pip install maturin > /dev/null 2>&1 + cd /src/gtars-python && maturin build --release -o /wheels 2>&1 | tail -3 + " + GTARS_WHEEL=$(find "$WHEEL_CACHE_DIR" -name "gtars*cp311*linux*.whl" 2>/dev/null | head -1) +fi + +if [ -n "$GTARS_WHEEL" ]; then + echo "Using cached gtars wheel: $(basename "$GTARS_WHEEL")" + GTARS_INSTALL="COPY gtars.whl /tmp/gtars.whl +RUN pip install --no-cache-dir /tmp/gtars.whl" + EXTRA_COPY="-v $GTARS_WHEEL:/tmp/build-context/gtars.whl:ro" + # We'll copy the wheel into context below +else + echo "No local gtars repo found, using PyPI version" + GTARS_INSTALL="RUN pip install --no-cache-dir gtars" + EXTRA_COPY="" +fi + +# Build context — just refget source + wheel +CONTEXT_DIR="/tmp/seqcolapi-store-docker-context" +rm -rf "$CONTEXT_DIR" +mkdir -p "$CONTEXT_DIR" + +# Copy only essential files, not .git or node_modules +rsync -a --exclude='.git' --exclude='node_modules' --exclude='__pycache__' \ + --exclude='*.pyc' --exclude='.pytest_cache' --exclude='frontend/node_modules' \ + . "$CONTEXT_DIR/refget/" + +[ -n "$GTARS_WHEEL" ] && cp "$GTARS_WHEEL" "$CONTEXT_DIR/gtars.whl" + +echo "Building Docker image from local source..." +docker build -t "$IMAGE_NAME" -f - "$CONTEXT_DIR" < /dev/null 2>&1; then + echo "Server is up after ${i}s" + break + fi + if [ "$i" -eq 60 ]; then + echo "FAILED: Server did not start within 60s" + echo "Container logs:" + docker logs "$CONTAINER_NAME" + exit 1 + fi + sleep 1 +done + +echo "Checking /service-info..." 
+curl -s "http://localhost:$PORT/service-info" | python3 -c " +import sys, json +info = json.load(sys.stdin) +assert 'seqcol' in info, 'Missing seqcol in service-info' +print(f' Name: {info[\"name\"]}') +print(f' Store: {info[\"seqcol\"].get(\"refget_store\", {}).get(\"enabled\", False)}') +print(f' SCOM: {info[\"seqcol\"].get(\"scom\", {}).get(\"enabled\", False)}') +" + +echo "Checking /list/collection..." +curl -s "http://localhost:$PORT/list/collection?page_size=1" | python3 -c " +import sys, json +data = json.load(sys.stdin) +assert 'results' in data, 'Missing results in list response' +assert 'pagination' in data, 'Missing pagination' +print(f' Total collections: {data[\"pagination\"][\"total\"]}') +" + +echo "" +echo "PASSED: Store Docker image builds and runs correctly." diff --git a/scripts/test-store-integration.sh b/scripts/test-store-integration.sh new file mode 100755 index 0000000..2b50ca6 --- /dev/null +++ b/scripts/test-store-integration.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Run store-backed integration tests (no database needed) +# +# Usage: ./scripts/test-store-integration.sh [pytest args...] +# +# Examples: +# ./scripts/test-store-integration.sh # run store compliance tests +# ./scripts/test-store-integration.sh -v # verbose output +set -e +cd "$(dirname "${BASH_SOURCE[0]}")/.." +pytest tests/integration/test_store_compliance.py "$@" diff --git a/seqcolapi/__main__.py b/seqcolapi/__main__.py index 2e4396a..3ea99ba 100644 --- a/seqcolapi/__main__.py +++ b/seqcolapi/__main__.py @@ -1,4 +1,5 @@ import sys + from .main import main if __name__ == "__main__": diff --git a/seqcolapi/_version.py b/seqcolapi/_version.py deleted file mode 100644 index 3e2f46a..0000000 --- a/seqcolapi/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.9.0" diff --git a/seqcolapi/const.py b/seqcolapi/const.py index 07fbe54..34e6aac 100644 --- a/seqcolapi/const.py +++ b/seqcolapi/const.py @@ -1,13 +1,13 @@ import os - -from refget._version import __version__ as refget_pkg_version from platform import python_version -from ._version import __version__ as seqcolapi_version +from gtars import __version__ as gtars_version + +from refget._version import __version__ as refget_version ALL_VERSIONS = { - "seqcolapi_version": seqcolapi_version, - "refget_pkg_version": refget_pkg_version, + "refget_version": refget_version, + "gtars_version": gtars_version, "python_version": python_version(), "seqcol_spec_version": "1.0.0", } diff --git a/seqcolapi/examples.py b/seqcolapi/examples.py index 2704252..032b863 100644 --- a/seqcolapi/examples.py +++ b/seqcolapi/examples.py @@ -1,7 +1,7 @@ # Models # Used for documentation examples in OpenAPI -from fastapi import Path, Body +from fastapi import Body, Path example_digest = Path( ..., diff --git a/seqcolapi/main.py b/seqcolapi/main.py index 70d6fd6..85fe6b8 100644 --- a/seqcolapi/main.py +++ b/seqcolapi/main.py @@ -1,26 +1,71 @@ import logging +import os +from contextlib import asynccontextmanager -from fastapi import FastAPI, Depends -from fastapi import HTTPException +from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, FileResponse, HTMLResponse -from refget.router import create_refget_router, get_dbagent +from fastapi.responses import FileResponse, HTMLResponse, JSONResponse +from sqlmodel import Session, select from starlette.requests import Request from starlette.staticfiles import StaticFiles -from sqlmodel import Session, select -from contextlib import asynccontextmanager 
-from .const import ALL_VERSIONS, STATIC_PATH, STATIC_DIRNAME +from refget.agents import RefgetDBAgent from refget.const import HUMANS_SAMPLE_LIST, MOUSE_SAMPLES_LIST from refget.models import HumanReadableNames -from .examples import * +from refget.router import _ROUTER_CONFIG, _SAMPLE_DIGESTS, create_refget_router, setup_backend -from refget.router import _SAMPLE_DIGESTS, _ROUTER_CONFIG -from refget.agents import RefgetDBAgent +from .const import ALL_VERSIONS, STATIC_DIRNAME, STATIC_PATH +from .examples import * global _LOGGER _LOGGER = logging.getLogger(__name__) + +def _load_scom_config(store_path: str, remote: bool): + """Load SCOM target digests from a JSON config. + + Checks (in order): + 1. SCOM_CONFIG_URL environment variable (any HTTP URL) + 2. scom_config.json next to the store (convention) + + Format: {"human": ["digest1", "digest2", ...], "mouse": [...]} + """ + import json + import os + import urllib.request + + # Try env var first + config_url = os.environ.get("SCOM_CONFIG_URL") + + # Fall back to store convention + if not config_url: + if remote: + config_url = store_path.rstrip("/") + "/scom_config.json" + else: + config_path = os.path.join(store_path, "scom_config.json") + if os.path.exists(config_path): + with open(config_path) as f: + config = json.load(f) + for species, digests in config.items(): + _SAMPLE_DIGESTS[species] = digests + _LOGGER.info(f"SCOM: loaded {len(digests)} target digests for '{species}'") + return + else: + _LOGGER.info( + "No SCOM_CONFIG_URL set and no scom_config.json found. SCOM disabled." + ) + return + + try: + with urllib.request.urlopen(config_url, timeout=10) as resp: + config = json.loads(resp.read()) + for species, digests in config.items(): + _SAMPLE_DIGESTS[species] = digests + _LOGGER.info(f"SCOM: loaded {len(digests)} target digests for '{species}'") + except Exception as e: + _LOGGER.info(f"Could not load SCOM config from {config_url} ({e}). 
SCOM disabled.") + + for key, value in ALL_VERSIONS.items(): _LOGGER.info(f"{key}: {value}") @@ -32,9 +77,8 @@ async def lifespan_loader(app): """ _LOGGER.info("Starting lifespan: Loading sample data...") - # Initialize database agent and store in app state - dbagent = RefgetDBAgent() - app.state.dbagent = dbagent + # Initialize backend via setup_backend + setup_backend(app, engine=RefgetDBAgent().engine) species_samples = {"human": HUMANS_SAMPLE_LIST, "mouse": MOUSE_SAMPLES_LIST} @@ -42,7 +86,7 @@ async def lifespan_loader(app): try: _LOGGER.info(f"Loading {len(sample_names)} sample names for {species}") - with Session(dbagent.engine) as session: + with Session(app.state.dbagent.engine) as session: statement = select(HumanReadableNames).where( HumanReadableNames.human_readable_name.in_(sample_names) ) @@ -69,7 +113,7 @@ async def lifespan_loader(app): app = FastAPI( title="Sequence Collections API", description="An API providing metadata such as names, lengths, and other values for collections of reference sequences", - version=ALL_VERSIONS["seqcolapi_version"], + version=ALL_VERSIONS["refget_version"], lifespan=lifespan_loader, ) @@ -121,13 +165,13 @@ async def http_exception_handler(request: Request, exc: HTTPException): @app.exception_handler(ValueError) -async def generic_exception_handler(request: Request, exc: Exception): +async def value_error_handler(request: Request, exc: Exception): raise HTTPException(status_code=404, detail=str(exc)) @app.get("favicon.ico", include_in_schema=False) async def favicon(): - return FileResponse(f"/static/favicon.ico") + return FileResponse("/static/favicon.ico") @app.get("/", summary="Home page", tags=["General endpoints"], response_class=HTMLResponse) @@ -144,15 +188,21 @@ async def index(request: Request): async def service_info(): # Build seqcol capabilities object seqcol_info = { - "schema": dbagent.schema_dict, + "schema": getattr(app.state.dbagent, "schema_dict", None) + if hasattr(app.state, "dbagent") + else None, "sorted_name_length_pairs": True, "fasta_drs": {"enabled": _ROUTER_CONFIG.get("fasta_drs", False)}, } + # Get backend capabilities + backend = getattr(app.state, "backend", None) + caps = backend.capabilities() if backend and hasattr(backend, "capabilities") else {} + # Add refget_store info store_url = _ROUTER_CONFIG.get("refget_store_url") if store_url: - seqcol_info["refget_store"] = {"enabled": True, "url": store_url} + seqcol_info["refget_store"] = {"enabled": True, "url": store_url, **caps} else: seqcol_info["refget_store"] = {"enabled": False} @@ -176,17 +226,110 @@ async def service_info(): # Mount statics after other routes for lower precedence -app.mount(f"/", StaticFiles(directory=STATIC_PATH), name=STATIC_DIRNAME) +app.mount("/", StaticFiles(directory=STATIC_PATH), name=STATIC_DIRNAME) -def create_global_dbagent(): - """ - Create a global database agent for use in the app. +def create_store_app(store_path: str, remote: bool = False, cache_dir: str = "/tmp/seqcol_cache"): + """Create a seqcolapi FastAPI app backed by a RefgetStore (no database). + + Args: + store_path: Path to store on disk, or S3 URL for remote stores. + remote: If True, open as a remote (S3) store. + cache_dir: Local cache directory for remote stores. + + Returns: + FastAPI app with store-backed seqcol endpoints. 
""" - global dbagent - dbagent = RefgetDBAgent() # Configured via env vars - return dbagent + from refget.store import RefgetStore + if remote: + store = RefgetStore.open_remote(cache_dir, store_path) + else: + store = RefgetStore.on_disk(store_path) + + store_app = FastAPI( + title="Sequence Collections API (Store-backed)", + version=ALL_VERSIONS["refget_version"], + ) + + store_app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) -if __name__ != "__main__": - app.state.dbagent = create_global_dbagent() + setup_backend(store_app, store=store) + router = create_refget_router( + sequences=False, pangenomes=False, refget_store_url=store_path if remote else None + ) + store_app.include_router(router) + + # Load SCOM config: check SCOM_CONFIG_URL env var, then fall back to store convention + _load_scom_config(store_path, remote) + + if remote: + from refget.middleware import StoreFreshnessMiddleware + + store_app.add_middleware( + StoreFreshnessMiddleware, + store_url=store_path, + cache_dir=cache_dir, + ) + + @store_app.get("/service-info", summary="GA4GH service info", tags=["General endpoints"]) + async def store_service_info(): + import json as _json + from pathlib import Path as _Path + + backend = getattr(store_app.state, "backend", None) + caps = backend.capabilities() if backend and hasattr(backend, "capabilities") else {} + + # Load the seqcol schema (same schema used by the DB-backed app) + _schema_path = _Path(__file__).parent.parent / "refget" / "schemas" / "seqcol.json" + try: + with open(_schema_path) as _f: + schema = _json.load(_f) + except Exception: + schema = None + + return { + "id": "org.databio.seqcolapi.store", + "name": "Sequence collections (store-backed)", + "type": { + "group": "org.ga4gh", + "artifact": "refget-seqcol", + "version": ALL_VERSIONS["seqcol_spec_version"], + }, + "description": "Store-backed API providing metadata for collections of reference sequences", + "organization": {"name": "Databio Lab", "url": "https://databio.org"}, + "contactUrl": "https://github.com/refgenie/refget/issues", + "version": ALL_VERSIONS, + "seqcol": { + "schema": schema, + "refget_store": { + "enabled": True, + "url": os.environ.get("REFGET_STORE_HTTP_URL", store_path), + **caps, + }, + "scom": { + "enabled": bool(_SAMPLE_DIGESTS), + "species": list(_SAMPLE_DIGESTS.keys()), + }, + }, + } + + return store_app + + +_STORE_URL_ENV = os.environ.get("REFGET_STORE_URL") +_STORE_PATH_ENV = os.environ.get("REFGET_STORE_PATH") + +if _STORE_URL_ENV: + store_app = create_store_app(_STORE_URL_ENV, remote=True) +elif _STORE_PATH_ENV: + store_app = create_store_app(_STORE_PATH_ENV, remote=False) + +if __name__ != "__main__" and not _STORE_URL_ENV and not _STORE_PATH_ENV: + setup_backend(app, engine=RefgetDBAgent().engine) diff --git a/setup.py b/setup.py deleted file mode 100644 index 4e22f29..0000000 --- a/setup.py +++ /dev/null @@ -1,56 +0,0 @@ -#! /usr/bin/env python - -import os -from setuptools import setup, find_packages -import sys - -PACKAGE = "refget" - -# Additional keyword arguments for setup(). 
-extra = {} - -# Ordinary dependencies -DEPENDENCIES = [] -with open("requirements/requirements-all.txt", "r") as reqs_file: - for line in reqs_file: - if not line.strip(): - continue - DEPENDENCIES.append(line) - -extra["install_requires"] = DEPENDENCIES - -with open("{}/_version.py".format(PACKAGE), "r") as versionfile: - version = versionfile.readline().split()[-1].strip("\"'\n") - -long_description = open("README.md").read() - -setup( - name=PACKAGE, - packages=find_packages(include=[PACKAGE, f"{PACKAGE}.*"]), - version=version, - description="Python client for refget", - long_description=long_description, - long_description_content_type="text/markdown", - classifiers=[ - "Development Status :: 4 - Beta", - "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - ], - keywords="genome, assembly, bioinformatics, reference, sequence", - url="https://github.com/refgenie/refget", - author="Nathan Sheffield, Michal Stolarczyk", - author_email="nathan@code.databio.org", - license="BSD2", - entry_points={ - "console_scripts": ["refget = refget.cli:main"], - }, - # package_data={"refget": [os.path.join("refget", "*")]}, - include_package_data=True, - test_suite="tests", - tests_require=(["mock", "pytest"]), - setup_requires=(["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []), - **extra, -) diff --git a/test_fasta/base.farg b/test_fasta/base.farg new file mode 100644 index 0000000..5a3c2fe --- /dev/null +++ b/test_fasta/base.farg @@ -0,0 +1,8 @@ +##seqcol_digest=XZlrcEGi6mlopZ2uD8ObHkQB1d0oDwKk +##names_digest=Fw1r9eRxfOZD98KKrhlYQNEdSRHoVxAG +##sequences_digest=0uDQVLuHaOZi1u76LjV__yrVUIz9Bwhr +##lengths_digest=cGRMZIb3AVgkcAfNv39RN7hnT5Chk7RX +#name length alphabet sha512t24u md5 +chrX 8 dna2bit iYtREV555dUFKg2_agSJW6suquUyPpMw 5f63cfaa3ef61f88c9635fb9d18ec945 +chr1 4 dna2bit YBbVX0dLKG1ieEDCiMmkrTZFt_Z5Vdaj 31fc6ca291a32fb9df82b85e5f077e31 +chr2 4 dna2bit AcLxtBuKEPk_7PGE_H4dGElwZHCujwH6 92c6a56c9e9459d8a42b96f7884710bc diff --git a/test_fasta/different_order.rgsi b/test_fasta/different_order.rgsi new file mode 100644 index 0000000..3cf702e --- /dev/null +++ b/test_fasta/different_order.rgsi @@ -0,0 +1,8 @@ +##seqcol_digest=Tpdsg75D4GKCGEHtIiDSL9Zx-DSuX5V8 +##names_digest=dOAOfPGkf3wAf3CUsbjVTKhY9Wq2DL6f +##sequences_digest=7t6Ulz6OeUWu6FBxntbvFKOl8w3icl2h +##lengths_digest=x5qpE4FtMkvlwpKIzvHs3a02Nex5tthp +#name length alphabet sha512t24u md5 description +chr1 4 dna2bit YBbVX0dLKG1ieEDCiMmkrTZFt_Z5Vdaj 31fc6ca291a32fb9df82b85e5f077e31 +chr2 4 dna2bit AcLxtBuKEPk_7PGE_H4dGElwZHCujwH6 92c6a56c9e9459d8a42b96f7884710bc +chrX 8 dna2bit iYtREV555dUFKg2_agSJW6suquUyPpMw 5f63cfaa3ef61f88c9635fb9d18ec945 diff --git a/test_fasta/pair_swap.rgsi b/test_fasta/pair_swap.rgsi new file mode 100644 index 0000000..f9cafb5 --- /dev/null +++ b/test_fasta/pair_swap.rgsi @@ -0,0 +1,11 @@ +##seqcol_digest=UNGAdNDmBbQbHihecPPFxwTydTcdFKxL +##names_digest=gSWbV6khfIsnlQTyw1PmlQ8G7VRfIWbU +##sequences_digest=0uDQVLuHaOZi1u76LjV__yrVUIz9Bwhr +##lengths_digest=cGRMZIb3AVgkcAfNv39RN7hnT5Chk7RX +##name_length_pairs_digest=yjUFKuKCURANxHar4JDF5ABOn6FJ-T8m +##sorted_name_length_pairs_digest=rL5OQOnFba8yyz7lS-0-hgZvwcQsiajN +##sorted_sequences_digest=KgWo6TT1Lqw6vgkXU9sYtCU9xwXoDt6M +#name length alphabet sha512t24u md5 description +chr2 8 dna2bit iYtREV555dUFKg2_agSJW6suquUyPpMw 5f63cfaa3ef61f88c9635fb9d18ec945 +chr1 4 dna2bit 
YBbVX0dLKG1ieEDCiMmkrTZFt_Z5Vdaj 31fc6ca291a32fb9df82b85e5f077e31 +chrX 4 dna2bit AcLxtBuKEPk_7PGE_H4dGElwZHCujwH6 92c6a56c9e9459d8a42b96f7884710bc diff --git a/test_fasta/sample_fhr.json b/test_fasta/sample_fhr.json new file mode 100644 index 0000000..098bb0e --- /dev/null +++ b/test_fasta/sample_fhr.json @@ -0,0 +1,14 @@ +{ + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1.0, + "genome": "Test organism", + "version": "v1.0", + "taxon": { + "name": "Test organism", + "uri": "https://identifiers.org/taxonomy:12345" + }, + "masking": "soft-masked", + "genomeSynonym": ["test_v1"], + "dateCreated": "2025-01-01", + "license": "CC0-1.0" +} diff --git a/test_fasta/subset.rgsi b/test_fasta/subset.rgsi new file mode 100644 index 0000000..e767fc7 --- /dev/null +++ b/test_fasta/subset.rgsi @@ -0,0 +1,7 @@ +##seqcol_digest=sv7GIP1K0qcskIKF3iaBmQpaum21vH74 +##names_digest=iyNUhtfR0TALytlmxK1Zx1_q3frkZyAd +##sequences_digest=3ZP38SZcoc9wN7jsRyNSP9mQ1a3TUoUF +##lengths_digest=7-_HdxYiRf-AJLBKOTaJUdxXrUkIXs6T +#name length alphabet sha512t24u md5 description +chrX 8 dna2bit iYtREV555dUFKg2_agSJW6suquUyPpMw 5f63cfaa3ef61f88c9635fb9d18ec945 +chr1 4 dna2bit YBbVX0dLKG1ieEDCiMmkrTZFt_Z5Vdaj 31fc6ca291a32fb9df82b85e5f077e31 diff --git a/test_fasta/swap_wo_coords.rgsi b/test_fasta/swap_wo_coords.rgsi new file mode 100644 index 0000000..20d6e1c --- /dev/null +++ b/test_fasta/swap_wo_coords.rgsi @@ -0,0 +1,11 @@ +##seqcol_digest=aVzHaGFlUDUNF2IEmNdzS_A8lCY0stQH +##names_digest=QX5ur-faw5nXis8HXUK2kMxgY5MTGVRn +##sequences_digest=0uDQVLuHaOZi1u76LjV__yrVUIz9Bwhr +##lengths_digest=cGRMZIb3AVgkcAfNv39RN7hnT5Chk7RX +##name_length_pairs_digest=suXpFjcxpyUDOkBgNEakNEXtLlyxtjJr +##sorted_name_length_pairs_digest=zjM1Ie9m0zFbqsAnZ6jAJSXuFpKTr40J +##sorted_sequences_digest=KgWo6TT1Lqw6vgkXU9sYtCU9xwXoDt6M +#name length alphabet sha512t24u md5 description +chrX 8 dna2bit iYtREV555dUFKg2_agSJW6suquUyPpMw 5f63cfaa3ef61f88c9635fb9d18ec945 +chr2 4 dna2bit YBbVX0dLKG1ieEDCiMmkrTZFt_Z5Vdaj 31fc6ca291a32fb9df82b85e5f077e31 +chr1 4 dna2bit AcLxtBuKEPk_7PGE_H4dGElwZHCujwH6 92c6a56c9e9459d8a42b96f7884710bc diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 4bf9fe5..49ea738 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -1,41 +1,6 @@ -import pytest from pathlib import Path -from tests.conftest import DEMO_FILES - -# from tests.conftest import pytest_addoption, api_root, pytest_configure, pytest_collection_modifyitems, check_server_is_running -from tests.conftest import API_TEST_DIR - -COLLECTION_TESTS = [ - (DEMO_FILES[0], f"{API_TEST_DIR}/collection/base_collection.json"), - (DEMO_FILES[1], f"{API_TEST_DIR}/collection/different_names_collection.json"), - (DEMO_FILES[2], f"{API_TEST_DIR}/collection/different_order_collection.json"), - (DEMO_FILES[3], f"{API_TEST_DIR}/collection/pair_swap_collection.json"), - (DEMO_FILES[4], f"{API_TEST_DIR}/collection/subset_collection.json"), - (DEMO_FILES[5], f"{API_TEST_DIR}/collection/swap_wo_coords_collection.json"), -] - -COMPARISON_TESTS = [ - f"{API_TEST_DIR}/comparison/compare_base.fa_subset.fa.json", # subset - f"{API_TEST_DIR}/comparison/compare_base.fa_different_names.fa.json", # same sequences, different names - f"{API_TEST_DIR}/comparison/compare_base.fa_different_order.fa.json", # same sequences, name order switch, but equivalent coordinate system - f"{API_TEST_DIR}/comparison/compare_base.fa_pair_swap.fa.json", # swapped name-length-pairs - 
f"{API_TEST_DIR}/comparison/compare_base.fa_swap_wo_coords.fa.json", # swapped name-length-pairs, but no coord system change -] - - -ATTRIBUTE_TESTS = [ - ("lengths", "7-_HdxYiRf-AJLBKOTaJUdxXrUkIXs6T", [8, 4]), - ("names", "Fw1r9eRxfOZD98KKrhlYQNEdSRHoVxAG", ["chrX", "chr1", "chr2"]), -] - -ATTRIBUTE_LIST_TESTS = [ - ( - "lengths", - "cGRMZIb3AVgkcAfNv39RN7hnT5Chk7RX", - f"{API_TEST_DIR}/attribute/cGRM.json", - ) -] +import pytest @pytest.fixture(scope="session") diff --git a/tests/api/test_compliance.py b/tests/api/test_compliance.py index 442d6f4..62676b9 100644 --- a/tests/api/test_compliance.py +++ b/tests/api/test_compliance.py @@ -1,299 +1,120 @@ -# Compliance suite for the GA4GH SeqCol API v1.0.0 +# Pytest wrapper for the GA4GH SeqCol compliance suite. # -# Endpoints tested: -# - GET /service-info -# - GET /collection/:digest (level 1 and level 2) -# - GET /comparison/:digest1/:digest2 -# - POST /comparison/:digest -# - GET /attribute/collection/:attr/:digest -# - GET /list/collection (with pagination and filtering) -# - GET /list/attributes/:attr +# The canonical compliance checks live in refget/compliance.py. +# This file parametrizes them for pytest execution. # -# Also validates: -# - Level 1 returns digest strings, level 2 returns arrays -# - Transient attributes (sorted_name_length_pairs) in level 1 only -# - Pagination structure (results + pagination fields) +# Run against an external server: +# pytest tests/api --api-root https://seqcolapi.databio.org # -# Tests fall into two categories: -# 1. Content tests (collection, comparison, attribute): compare full responses to known fixtures -# 2. Structure tests (service-info, list endpoints): validate response structure only, since values vary by server +# Run via integration test server: +# ./scripts/test-integration.sh -import json import pytest -import requests -import refget -# Collection endpoints -from tests.api.conftest import ( - COLLECTION_TESTS, - COMPARISON_TESTS, - ATTRIBUTE_TESTS, - ATTRIBUTE_LIST_TESTS, +import refget.compliance as compliance +from refget.compliance import ( + check_attribute_retrieval, + check_collection_level1, + check_collection_level2, + check_comparison, + check_comparison_post, + check_comparison_same_order_values, + check_comparison_structure, + check_default_level_returns_level2, + check_list_attributes, + check_list_collections, + check_list_filter_by_attribute, + check_list_multi_attribute_filter_and, + check_openapi_available, + check_service_info, + check_sorted_name_length_pairs, + check_transient_attribute_not_served, ) -from tests.conftest import DIGEST_TESTS -demo_file = "demo0.fa" -response_file = "tests/demo0_collection.json" +# Load test data at import time — tests always run from the repo +compliance._load_test_data() +DIGEST_TESTS = compliance.DIGEST_TESTS +COMPARISON_FIXTURES = compliance.COMPARISON_FIXTURES -print("Testing Compliance") - - -def read_url(url): - import requests - import yaml - - try: - response = requests.get(url, timeout=1) - except requests.exceptions.ConnectionError: - print(f"Connection error: {url}") - raise e - data = response.content - return yaml.safe_load(data) - - -def check_collection(api_root, demo_file, response_file, data_root): - - # Need schema to make sure we eliminate inherent attributes correctly - # schema_path = "https://schema.databio.org/refget/SeqColArraySetInherent.yaml" - # schema = read_url(schema_path) - # inherent_attrs = schema["inherent"] - - inherent_attrs = ["names", "sequences"] - print(f"Loading fasta file at 
'{data_root}/{demo_file}'") - digest = refget.fasta_to_digest(f"{data_root}/{demo_file}", inherent_attrs=inherent_attrs) - print(f"Checking digest: {digest}") - res = requests.get(f"{api_root}/collection/{digest}") - - client = refget.SequenceCollectionClient(urls=[api_root]) - - srv_response = client.get_collection(digest, level=1) - print("Server response:", srv_response) - try: - server_answer = json.loads(res.content) - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - - with open(response_file) as fp: - correct_answer = json.load(fp) - - assert ( - server_answer["sequences"] == correct_answer["sequences"] - ), f"Collection endpoint failed: sequence mismatch for {demo_file}" - assert ( - server_answer["names"] == correct_answer["names"] - ), f"Collection endpoint failed: names mismatch for {demo_file}" - assert ( - server_answer["lengths"] == correct_answer["lengths"] - ), f"Collection endpoint failed: lengths mismatch for {demo_file}" - - -def check_comparison(api_root, response_file): - with open(response_file) as fp: - correct_answer = json.load(fp) - - url = ( - f"{api_root}/comparison/{correct_answer['digests']['a']}/{correct_answer['digests']['b']}" - ) - res = requests.get(url) - try: - server_answer = json.loads(res.content) - print("Server answer:", refget.canonical_str(server_answer)) - print("Correct answer:", refget.canonical_str(correct_answer)) - assert refget.canonical_str(server_answer) == refget.canonical_str( - correct_answer - ), f"Comparison endpoint failed: {url}. File: {response_file}" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"Comparison endpoint failed: {url}" - - -def check_attribute(api_root, attribute_type, attribute, correct_value): - url = f"{api_root}/attribute/collection/{attribute_type}/{attribute}" - res = requests.get(url) - try: - server_answer = json.loads(res.content) - assert ( - server_answer == correct_value - ), f"Attribute endpoint failed: {url}. Answer: {correct_value}" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"Attribute endpoint failed: {url}" - - -def check_list_collections_by_attribute(api_root, attribute_type, attribute, response_file): - with open(response_file) as fp: - correct_answer = json.load(fp) - - url = f"{api_root}/list/collection?{attribute_type}={attribute}" - res = requests.get(url) - try: - server_answer = json.loads(res.content) - print("Server answer:", server_answer) - for digest in correct_answer["results"]: - print("Checking digest:", digest) - assert ( - digest in server_answer["results"] - ), f"Attribute endpoint failed: {url}. 
Missing: {digest}" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"Attribute endpoint failed: {url}" - - -def check_service_info(api_root): - url = f"{api_root}/service-info" - res = requests.get(url) - try: - server_answer = json.loads(res.content) - # Check required GA4GH service-info fields exist - assert "id" in server_answer, "service-info missing 'id' field" - assert "type" in server_answer, "service-info missing 'type' field" - assert "group" in server_answer["type"], "service-info type missing 'group'" - assert "artifact" in server_answer["type"], "service-info type missing 'artifact'" - assert "version" in server_answer["type"], "service-info type missing 'version'" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"Service-info endpoint failed: {url}" - - -def check_list_collections(api_root): - url = f"{api_root}/list/collection" - res = requests.get(url) - try: - server_answer = json.loads(res.content) - assert "results" in server_answer, "list/collection missing 'results' field" - assert isinstance( - server_answer["results"], list - ), "list/collection 'results' should be a list" - assert "pagination" in server_answer, "list/collection missing 'pagination' field" - assert "page" in server_answer["pagination"], "pagination missing 'page'" - assert "page_size" in server_answer["pagination"], "pagination missing 'page_size'" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"List collections endpoint failed: {url}" - - -def check_list_attributes(api_root, attribute_name): - url = f"{api_root}/list/attributes/{attribute_name}" - res = requests.get(url) - try: - server_answer = json.loads(res.content) - assert ( - "results" in server_answer - ), f"list/attributes/{attribute_name} missing 'results' field" - assert isinstance( - server_answer["results"], list - ), f"list/attributes/{attribute_name} 'results' should be a list" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"List attributes endpoint failed: {url}" +@pytest.mark.require_service +class TestAPI: + """GA4GH SeqCol compliance tests. 
Expects demo data loaded on the server.""" -def check_collection_structure(api_root, digest): - # Level 1: inherent attributes should be digest strings - level1 = requests.get(f"{api_root}/collection/{digest}?level=1").json() - for attr in ["names", "lengths", "sequences"]: - assert isinstance(level1[attr], str), f"Level 1 {attr} should be digest string" + # ---- Structure checks ---- - # Level 1 should include transient attribute - assert "sorted_name_length_pairs" in level1, "Level 1 missing sorted_name_length_pairs" + def test_service_info(self, api_root): + check_service_info(api_root) - # Level 2: inherent attributes should be arrays - level2 = requests.get(f"{api_root}/collection/{digest}?level=2").json() - for attr in ["names", "lengths", "sequences"]: - assert isinstance(level2[attr], list), f"Level 2 {attr} should be array" + def test_list_collections(self, api_root): + check_list_collections(api_root) - # Level 2 should NOT include transient attribute - assert ( - "sorted_name_length_pairs" not in level2 - ), "Level 2 should not have sorted_name_length_pairs" + @pytest.mark.parametrize("attribute_name", ["lengths", "names", "sequences"]) + def test_list_attributes(self, api_root, attribute_name): + check_list_attributes(api_root, attribute_name) + @pytest.mark.recommended + def test_openapi_available(self, api_root): + check_openapi_available(api_root) -def check_comparison_post(api_root, response_file, test_data_root): - with open(response_file) as fp: - correct_answer = json.load(fp) + # ---- Collection content checks ---- - # Get the local collection to POST - digest_b = correct_answer["digests"]["b"] - client = refget.SequenceCollectionClient(urls=[api_root]) - local_collection = client.get_collection(digest_b) + @pytest.mark.parametrize("fa_name, bundle", DIGEST_TESTS) + def test_collection_level1(self, api_root, fa_name, bundle): + check_collection_level1(api_root, fa_name, bundle) - # POST to compare with collection A on server - digest_a = correct_answer["digests"]["a"] - url = f"{api_root}/comparison/{digest_a}" - res = requests.post(url, json=local_collection) - try: - server_answer = json.loads(res.content) - # POST endpoint returns "POSTed seqcol" for digest b since it doesn't know the digest - # So we compare everything except the digests.b field - assert ( - server_answer["digests"]["a"] == correct_answer["digests"]["a"] - ), f"Comparison POST: digest a mismatch" - assert ( - server_answer["attributes"] == correct_answer["attributes"] - ), f"Comparison POST: attributes mismatch" - assert ( - server_answer["array_elements"] == correct_answer["array_elements"] - ), f"Comparison POST: array_elements mismatch" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"Comparison POST endpoint failed: {url}" + @pytest.mark.parametrize("fa_name, bundle", DIGEST_TESTS) + def test_collection_level2(self, api_root, fa_name, bundle): + check_collection_level2(api_root, fa_name, bundle) + @pytest.mark.parametrize("fa_name, bundle", DIGEST_TESTS) + def test_default_level_returns_level2(self, api_root, fa_name, bundle): + check_default_level_returns_level2(api_root, fa_name, bundle) -@pytest.mark.require_service -class TestAPI: - print("Testing Compliance") + @pytest.mark.parametrize("fa_name, bundle", DIGEST_TESTS) + def test_sorted_name_length_pairs(self, api_root, fa_name, bundle): + check_sorted_name_length_pairs(api_root, fa_name, bundle) - @pytest.mark.parametrize("test_values", COLLECTION_TESTS) - def test_collection_endpoint(self, api_root, 
test_values, test_data_root): - print("Testing collection endpoint") - check_collection(api_root, *test_values, test_data_root) + # ---- Attribute checks ---- - @pytest.mark.parametrize("response_file", COMPARISON_TESTS) - def test_comparison_endpoint(self, api_root, response_file): - print("Testing comparison endpoint") - check_comparison(api_root, response_file) + @pytest.mark.parametrize("fa_name, bundle", DIGEST_TESTS) + @pytest.mark.parametrize("attr_name", ["lengths", "names", "sequences"]) + def test_attribute_retrieval(self, api_root, fa_name, bundle, attr_name): + check_attribute_retrieval(api_root, fa_name, bundle, attr_name) - @pytest.mark.parametrize("test_values", ATTRIBUTE_TESTS) - def test_attribute_endpoint(self, api_root, test_values): - check_attribute(api_root, *test_values) + def test_transient_attribute_not_served(self, api_root): + check_transient_attribute_not_served(api_root) - @pytest.mark.parametrize("test_values", ATTRIBUTE_LIST_TESTS) - def test_attribute_list_endpoint(self, api_root, test_values): - check_list_collections_by_attribute(api_root, *test_values) + # ---- List/filter checks ---- - def test_service_info_endpoint(self, api_root): - check_service_info(api_root) + @pytest.mark.parametrize("attr_name", ["lengths", "names", "sequences"]) + def test_list_filter_by_attribute(self, api_root, attr_name): + fa_name, bundle = DIGEST_TESTS[0] + check_list_filter_by_attribute(api_root, fa_name, bundle, attr_name) - def test_list_collections_endpoint(self, api_root): - check_list_collections(api_root) + def test_multi_attribute_filter_and(self, api_root): + check_list_multi_attribute_filter_and(api_root) - @pytest.mark.parametrize("attribute_name", ["lengths", "names", "sequences"]) - def test_list_attributes_endpoint(self, api_root, attribute_name): - check_list_attributes(api_root, attribute_name) + # ---- Comparison checks ---- - @pytest.mark.parametrize("response_file", COMPARISON_TESTS) - def test_comparison_post_endpoint(self, api_root, response_file, test_data_root): - check_comparison_post(api_root, response_file, test_data_root) + def test_comparison_structure(self, api_root): + check_comparison_structure(api_root) - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_collection_structure(self, api_root, fa_file, fa_digest_bundle): - digest = fa_digest_bundle["top_level_digest"] - check_collection_structure(api_root, digest) + def test_comparison_same_order_values(self, api_root): + check_comparison_same_order_values(api_root) - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_collections(self, api_root, fa_file, fa_digest_bundle): - client = refget.SequenceCollectionClient(urls=[api_root]) - digest = fa_digest_bundle["top_level_digest"] - srv_response = client.get_collection(digest, level=1) - print("Server response:", srv_response) + @pytest.mark.parametrize( + "fixture_name, expected", + list(COMPARISON_FIXTURES.items()), + ids=list(COMPARISON_FIXTURES.keys()), + ) + def test_comparison(self, api_root, fixture_name, expected): + check_comparison(api_root, fixture_name, expected) - @pytest.mark.snlp - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_sorted_name_length_pairs(self, api_root, fa_file, fa_digest_bundle): - client = refget.SequenceCollectionClient(urls=[api_root]) - digest = fa_digest_bundle["top_level_digest"] - srv_response = client.get_collection(digest, level=1) - assert ( - srv_response["sorted_name_length_pairs"] - == 
fa_digest_bundle["sorted_name_length_pairs_digest"] - ), f"Collection endpoint failed: sorted_name_length_pairs mismatch for {demo_file}" + @pytest.mark.parametrize( + "fixture_name, expected", + list(COMPARISON_FIXTURES.items()), + ids=list(COMPARISON_FIXTURES.keys()), + ) + def test_comparison_post(self, api_root, fixture_name, expected): + check_comparison_post(api_root, fixture_name, expected) diff --git a/tests/conftest.py b/tests/conftest.py index 093814d..b504297 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,7 @@ import json import os from pathlib import Path + import pytest from typer.testing import CliRunner @@ -64,11 +65,20 @@ def invoke(*args): TEST_DATA_DIR = Path(__file__).parent.parent / "test_fasta" BASE_FASTA = TEST_DATA_DIR / "base.fa" + + +@pytest.fixture(scope="session") +def test_data_root(): + """Provides the absolute path to the test_fasta directory.""" + return TEST_DATA_DIR + + DIFFERENT_NAMES_FASTA = TEST_DATA_DIR / "different_names.fa" DIFFERENT_ORDER_FASTA = TEST_DATA_DIR / "different_order.fa" PAIR_SWAP_FASTA = TEST_DATA_DIR / "pair_swap.fa" SUBSET_FASTA = TEST_DATA_DIR / "subset.fa" SWAP_WO_COORDS_FASTA = TEST_DATA_DIR / "swap_wo_coords.fa" +SAMPLE_FHR_JSON = TEST_DATA_DIR / "sample_fhr.json" # ============================================================ @@ -246,6 +256,12 @@ def pytest_configure(config): config.addinivalue_line("markers", "requires_network: mark test as requiring network access") config.addinivalue_line("markers", "requires_db: mark test as requiring database access") config.addinivalue_line("markers", "slow: mark test as slow running") + config.addinivalue_line( + "markers", "recommended: mark test as RECOMMENDED (not REQUIRED) by GA4GH spec" + ) + config.addinivalue_line( + "markers", "require_service: mark test as requiring a running seqcol service" + ) def pytest_collection_modifyitems(config, items): @@ -273,3 +289,19 @@ def pytest_collection_modifyitems(config, items): for item in items: if "requires_db" in item.keywords: item.add_marker(skip_db) + + # Skip require_service tests if no api_root or test_server available + api_root = config.getoption("api_root") + if api_root is None: + skip_service = pytest.mark.skip( + reason="No --api-root provided and not running via integration test_server" + ) + for item in items: + if "require_service" in item.keywords: + # Only skip if this is the base TestAPI class, not a subclass with test_server + if ( + "TestAPI" in item.nodeid + and "TestComplianceViaIntegration" not in item.nodeid + and "TestStoreCompliance" not in item.nodeid + ): + item.add_marker(skip_service) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 7f7cdff..f67f829 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -7,12 +7,13 @@ """ import os -import pytest import socket import threading import time from pathlib import Path +import pytest + # Set environment variables BEFORE any app imports # Must match test-db.sh settings os.environ["POSTGRES_HOST"] = "localhost" @@ -78,8 +79,8 @@ def loaded_dbagent(test_dbagent, test_fasta_path): @pytest.fixture(scope="session") def client(loaded_dbagent): """Create TestClient with test database""" - from seqcolapi.main import app from refget.router import get_dbagent + from seqcolapi.main import app def override_get_dbagent(): return loaded_dbagent @@ -131,8 +132,9 @@ def test_server(request): loaded_dbagent = request.getfixturevalue("loaded_dbagent") import uvicorn - from seqcolapi.main import app + from 
refget.router import get_dbagent
+    from seqcolapi.main import app

     def override_get_dbagent():
         return loaded_dbagent
@@ -169,10 +171,123 @@ def override_get_dbagent():
     app.dependency_overrides.clear()
+
+@pytest.fixture(scope="session")
+def store_test_server(tmp_path_factory):
+    """
+    Provide a store-backed seqcol server URL for integration tests.
+
+    Creates a temporary RefgetStore, loads all 6 test FASTA files,
+    and runs a store-backed uvicorn server in a background thread.
+    No database required.
+
+    Note: We build the app manually (instead of create_store_app) so we can
+    reuse the same store instance that loaded the FASTAs, preserving
+    correct array ordering. Opening a new store from the same path
+    would lose the FASTA insertion order due to a gtars hash-map ordering issue.
+    """
+    import json
+
+    import uvicorn
+    from fastapi import FastAPI
+    from fastapi.middleware.cors import CORSMiddleware
+
+    from refget.router import create_refget_router, setup_backend
+    from refget.store import RefgetStore
+    from seqcolapi.const import ALL_VERSIONS
+
+    # Create store and load test FASTAs
+    store_dir = tmp_path_factory.mktemp("store")
+    store = RefgetStore.on_disk(str(store_dir))
+
+    test_fasta_dir = Path(__file__).parent.parent.parent / "test_fasta"
+    for fa_file in [
+        "base.fa",
+        "different_names.fa",
+        "different_order.fa",
+        "pair_swap.fa",
+        "subset.fa",
+        "swap_wo_coords.fa",
+    ]:
+        fa_path = test_fasta_dir / fa_file
+        store.add_sequence_collection_from_fasta(str(fa_path))
+
+    # Build the app directly using the same store instance
+    store_app = FastAPI(
+        title="Sequence Collections API (Store-backed test)",
+        version=ALL_VERSIONS["refget_version"],
+    )
+    store_app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],
+        allow_credentials=True,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+    setup_backend(store_app, store=store)
+    router = create_refget_router(sequences=False, pangenomes=False)
+    store_app.include_router(router)
+
+    # Load seqcol schema for service-info
+    schema_path = Path(__file__).parent.parent.parent / "refget" / "schemas" / "seqcol.json"
+    try:
+        with open(schema_path) as f:
+            schema = json.load(f)
+    except Exception:
+        schema = None
+
+    @store_app.get("/service-info", summary="GA4GH service info", tags=["General endpoints"])
+    async def store_service_info():
+        backend = getattr(store_app.state, "backend", None)
+        caps = backend.capabilities() if backend and hasattr(backend, "capabilities") else {}
+        return {
+            "id": "org.databio.seqcolapi.store",
+            "name": "Sequence collections (store-backed)",
+            "type": {
+                "group": "org.ga4gh",
+                "artifact": "refget-seqcol",
+                "version": ALL_VERSIONS["seqcol_spec_version"],
+            },
+            "description": "Store-backed API providing metadata for collections of reference sequences",
+            "organization": {"name": "Databio Lab", "url": "https://databio.org"},
+            "contactUrl": "https://github.com/refgenie/refget/issues",
+            "version": ALL_VERSIONS,
+            "seqcol": {
+                "schema": schema,
+                "refget_store": {"enabled": True, **caps},
+            },
+        }
+
+    port = find_free_port()
+    server_url = f"http://localhost:{port}"
+
+    config = uvicorn.Config(store_app, host="127.0.0.1", port=port, log_level="error", ws="none")
+    server = uvicorn.Server(config)
+
+    thread = threading.Thread(target=server.run, daemon=True)
+    thread.start()
+
+    # Wait for server to start
+    max_wait = 5.0
+    start_time = time.time()
+    while time.time() - start_time < max_wait:
+        try:
+            with socket.create_connection(("127.0.0.1", port), timeout=0.1):
+                break
+        except (ConnectionRefusedError, OSError):
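+                # (editor's note) not accepting connections yet; back off briefly and retry
+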
time.sleep(0.1) + else: + raise RuntimeError(f"Store test server failed to start on port {port}") + + yield server_url + + server.should_exit = True + + @pytest.fixture def cli_runner(): """CLI runner for integration tests.""" from typer.testing import CliRunner + from refget.cli.main import app runner = CliRunner() diff --git a/tests/integration/test_cli_admin_integration.py b/tests/integration/test_cli_admin_integration.py index f3dc832..9642803 100644 --- a/tests/integration/test_cli_admin_integration.py +++ b/tests/integration/test_cli_admin_integration.py @@ -7,8 +7,6 @@ """ import pytest -import json -from pathlib import Path from typer.testing import CliRunner from refget.cli.main import app diff --git a/tests/integration/test_cli_seqcol_integration.py b/tests/integration/test_cli_seqcol_integration.py index 9f13a1a..da6ab12 100644 --- a/tests/integration/test_cli_seqcol_integration.py +++ b/tests/integration/test_cli_seqcol_integration.py @@ -6,7 +6,6 @@ Run with: ./scripts/test-integration.sh """ -import pytest import json diff --git a/tests/integration/test_compliance_integration.py b/tests/integration/test_compliance_integration.py deleted file mode 100644 index 76773d1..0000000 --- a/tests/integration/test_compliance_integration.py +++ /dev/null @@ -1,260 +0,0 @@ -""" -Compliance tests running against the integration test server. - -These tests verify the API responses match expected fixtures, -using the ephemeral Docker PostgreSQL + test server infrastructure. -""" - -import json -import pytest -import requests -from pathlib import Path - -from tests.conftest import DIGEST_TESTS - - -class TestComplianceStructure: - """Test response structure matches GA4GH spec.""" - - def test_service_info_structure(self, test_server): - """Service-info has required GA4GH fields""" - res = requests.get(f"{test_server}/service-info") - assert res.status_code == 200 - data = res.json() - # GA4GH service-info required fields - assert "id" in data - assert "type" in data - assert "group" in data["type"] - assert "artifact" in data["type"] - assert "version" in data["type"] - - def test_service_info_seqcol_schema(self, test_server): - """Service-info MUST include seqcol.schema (GA4GH spec requirement)""" - res = requests.get(f"{test_server}/service-info") - assert res.status_code == 200 - data = res.json() - # Spec: service-info MUST return the JSON Schema implemented by the server - assert "seqcol" in data, "service-info must have 'seqcol' section" - assert "schema" in data["seqcol"], "seqcol section must include 'schema'" - schema = data["seqcol"]["schema"] - # Schema should define the required attributes - assert "properties" in schema, "schema must have 'properties'" - assert "lengths" in schema["properties"], "schema must define 'lengths'" - assert "names" in schema["properties"], "schema must define 'names'" - assert "sequences" in schema["properties"], "schema must define 'sequences'" - - def test_list_collections_structure(self, test_server): - """List collections has pagination structure per GA4GH paging guide""" - res = requests.get(f"{test_server}/list/collection") - assert res.status_code == 200 - data = res.json() - assert "results" in data - assert isinstance(data["results"], list) - assert "pagination" in data - assert "page" in data["pagination"] - assert "page_size" in data["pagination"] - assert "total" in data["pagination"], "pagination must include 'total' per GA4GH spec" - - def test_list_collections_filter_by_attribute(self, test_server): - """List collections filtered by 
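# ---- Editor's note ----------------------------------------------------------
# The pagination assertions above imply a /list/collection response shaped like
# the following (values illustrative, per the GA4GH paging guide):
#
#     {
#       "results": ["<collection digest>", "..."],
#       "pagination": {"page": 0, "page_size": 100, "total": 6}
#     }
# -----------------------------------------------------------------------------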
attribute digest (REQUIRED by spec)""" - # Use base.fa's names digest to filter - names_digest = DIGEST_TESTS[0][1]["level1"]["names"] - res = requests.get(f"{test_server}/list/collection?names={names_digest}") - assert res.status_code == 200 - data = res.json() - assert "results" in data - # Should return only collections with this exact names digest - # base.fa has this names digest - assert DIGEST_TESTS[0][1]["top_level_digest"] in data["results"] - - -class TestAttributeEndpoint: - """Test /attribute/collection/:attr/:digest endpoint (REQUIRED by spec).""" - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_attribute_lengths(self, test_server, fa_file, fa_digest_bundle): - """Retrieve lengths attribute by its digest""" - lengths_digest = fa_digest_bundle["level1"]["lengths"] - expected_lengths = fa_digest_bundle["level2"]["lengths"] - res = requests.get(f"{test_server}/attribute/collection/lengths/{lengths_digest}") - assert res.status_code == 200 - data = res.json() - assert data == expected_lengths - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_attribute_names(self, test_server, fa_file, fa_digest_bundle): - """Retrieve names attribute by its digest""" - names_digest = fa_digest_bundle["level1"]["names"] - expected_names = fa_digest_bundle["level2"]["names"] - res = requests.get(f"{test_server}/attribute/collection/names/{names_digest}") - assert res.status_code == 200 - data = res.json() - assert data == expected_names - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_attribute_sequences(self, test_server, fa_file, fa_digest_bundle): - """Retrieve sequences attribute by its digest""" - sequences_digest = fa_digest_bundle["level1"]["sequences"] - expected_sequences = fa_digest_bundle["level2"]["sequences"] - res = requests.get(f"{test_server}/attribute/collection/sequences/{sequences_digest}") - assert res.status_code == 200 - data = res.json() - assert data == expected_sequences - - def test_attribute_not_found(self, test_server): - """Non-existent attribute digest returns 404""" - res = requests.get(f"{test_server}/attribute/collection/names/nonexistent_digest_12345") - assert res.status_code == 404 - - -class TestCollectionLevels: - """Test collection level 1 vs level 2 response formats.""" - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_default_level_returns_level2(self, test_server, fa_file, fa_digest_bundle): - """Collection without ?level= param returns level 2 (spec default)""" - digest = fa_digest_bundle["top_level_digest"] - res = requests.get(f"{test_server}/collection/{digest}") - assert res.status_code == 200 - data = res.json() - # Level 2 returns arrays, not digest strings - for attr in ["names", "lengths", "sequences"]: - assert isinstance(data[attr], list), f"Default should return level 2 (arrays)" - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_level1_returns_digests(self, test_server, fa_file, fa_digest_bundle): - """Level 1 returns digest strings for attributes""" - digest = fa_digest_bundle["top_level_digest"] - res = requests.get(f"{test_server}/collection/{digest}?level=1") - assert res.status_code == 200 - data = res.json() - for attr in ["names", "lengths", "sequences"]: - assert isinstance(data[attr], str), f"Level 1 {attr} should be digest string" - # Transient attribute present in level 1 - assert "sorted_name_length_pairs" in data - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", 
DIGEST_TESTS) - def test_level2_returns_arrays(self, test_server, fa_file, fa_digest_bundle): - """Level 2 returns arrays for attributes""" - digest = fa_digest_bundle["top_level_digest"] - res = requests.get(f"{test_server}/collection/{digest}?level=2") - assert res.status_code == 200 - data = res.json() - for attr in ["names", "lengths", "sequences"]: - assert isinstance(data[attr], list), f"Level 2 {attr} should be array" - # Transient attribute NOT in level 2 - assert "sorted_name_length_pairs" not in data - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_sorted_name_length_pairs_digest(self, test_server, fa_file, fa_digest_bundle): - """Level 1 sorted_name_length_pairs matches expected digest""" - digest = fa_digest_bundle["top_level_digest"] - res = requests.get(f"{test_server}/collection/{digest}?level=1") - assert res.status_code == 200 - data = res.json() - assert ( - data["sorted_name_length_pairs"] == fa_digest_bundle["sorted_name_length_pairs_digest"] - ) - - -class TestComparison: - """Test comparison endpoint responses.""" - - def test_compare_identical(self, test_server): - """Comparing collection to itself returns expected structure""" - # Use base.fa digest - digest = DIGEST_TESTS[0][1]["top_level_digest"] - res = requests.get(f"{test_server}/comparison/{digest}/{digest}") - assert res.status_code == 200 - data = res.json() - assert "digests" in data - assert data["digests"]["a"] == digest - assert data["digests"]["b"] == digest - assert "attributes" in data - assert "array_elements" in data - - def test_compare_different(self, test_server): - """Comparing different collections returns diff structure""" - digest_a = DIGEST_TESTS[0][1]["top_level_digest"] # base.fa - digest_b = DIGEST_TESTS[1][1]["top_level_digest"] # different_names.fa - res = requests.get(f"{test_server}/comparison/{digest_a}/{digest_b}") - assert res.status_code == 200 - data = res.json() - assert data["digests"]["a"] == digest_a - assert data["digests"]["b"] == digest_b - assert "a_and_b" in data["attributes"] - - def test_compare_full_structure(self, test_server): - """Comparison returns complete structure per spec""" - digest_a = DIGEST_TESTS[0][1]["top_level_digest"] # base.fa - digest_b = DIGEST_TESTS[1][1]["top_level_digest"] # different_names.fa - res = requests.get(f"{test_server}/comparison/{digest_a}/{digest_b}") - assert res.status_code == 200 - data = res.json() - # Verify digests structure - assert "digests" in data - assert "a" in data["digests"] - assert "b" in data["digests"] - # Verify attributes structure - assert "attributes" in data - assert "a_only" in data["attributes"] - assert "b_only" in data["attributes"] - assert "a_and_b" in data["attributes"] - # Verify array_elements structure - assert "array_elements" in data - assert "a_count" in data["array_elements"] - assert "b_count" in data["array_elements"] - assert "a_and_b_count" in data["array_elements"] - assert "a_and_b_same_order" in data["array_elements"] - - def test_compare_post_with_seqcol_body(self, test_server): - """POST comparison with local seqcol in body (RECOMMENDED by spec)""" - digest_a = DIGEST_TESTS[0][1]["top_level_digest"] # base.fa on server - # POST the level 2 representation of different_names.fa - seqcol_b = DIGEST_TESTS[1][1]["level2"] - res = requests.post( - f"{test_server}/comparison/{digest_a}", - json=seqcol_b, - ) - assert res.status_code == 200 - data = res.json() - assert "digests" in data - assert data["digests"]["a"] == digest_a - # b digest may be computed or 
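# ---- Editor's note ----------------------------------------------------------
# The assertions above pin down the full comparison payload; schematically
# (values illustrative):
#
#     {
#       "digests": {"a": "<digest>", "b": "<digest or null>"},
#       "attributes": {"a_only": [...], "b_only": [...], "a_and_b": [...]},
#       "array_elements": {
#         "a_count": {...}, "b_count": {...},
#         "a_and_b_count": {...}, "a_and_b_same_order": {...}
#       }
#     }
# -----------------------------------------------------------------------------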
null per spec - assert "attributes" in data - assert "array_elements" in data - - def test_compare_with_fixtures(self, test_server): - """Comparison results match fixture files""" - # Test base.fa vs different_names.fa comparison - with open("tests/api/comparison/compare_base.fa_different_names.fa.json") as f: - expected = json.load(f) - - res = requests.get( - f"{test_server}/comparison/{expected['digests']['a']}/{expected['digests']['b']}" - ) - assert res.status_code == 200 - data = res.json() - assert data["digests"] == expected["digests"] - assert data["attributes"] == expected["attributes"] - assert data["array_elements"] == expected["array_elements"] - - -class TestCollectionContent: - """Test collection content matches fixtures.""" - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_collection_content(self, test_server, fa_file, fa_digest_bundle): - """Collection arrays match expected values from digests file""" - digest = fa_digest_bundle["top_level_digest"] - expected = fa_digest_bundle["level2"] - res = requests.get(f"{test_server}/collection/{digest}?level=2") - assert res.status_code == 200 - data = res.json() - - # Verify lengths match - assert data["lengths"] == expected["lengths"] - # Verify names match - assert data["names"] == expected["names"] - # Verify sequence digests match - assert data["sequences"] == expected["sequences"] diff --git a/tests/integration/test_run_compliance.py b/tests/integration/test_run_compliance.py new file mode 100644 index 0000000..6cb7df7 --- /dev/null +++ b/tests/integration/test_run_compliance.py @@ -0,0 +1,19 @@ +"""Run the standalone compliance suite against the integration test server.""" + +import pytest + +from tests.api.test_compliance import TestAPI + + +@pytest.mark.require_service +class TestComplianceViaIntegration(TestAPI): + """Run compliance tests against integration test server. + + Inherits all tests from TestAPI but provides api_root from + the integration test_server fixture instead of --api-root CLI option. + """ + + @pytest.fixture + def api_root(self, test_server): + """Map test_server fixture to api_root for compliance tests.""" + return test_server diff --git a/tests/integration/test_store_compliance.py b/tests/integration/test_store_compliance.py new file mode 100644 index 0000000..6785bdd --- /dev/null +++ b/tests/integration/test_store_compliance.py @@ -0,0 +1,102 @@ +"""Run the compliance suite against a store-backed seqcolapi server. + +Tests that depend on array element ordering are marked xfail because the +gtars RefgetStore does not preserve FASTA insertion order when returning +level 2 arrays. The digests (level 1) are always correct. +""" + +import pytest +import requests + +import refget.compliance as compliance +from refget.compliance import COMPLIANCE_TIMEOUT, check_transient_attribute_not_served +from tests.api.test_compliance import TestAPI + +# Load test data at import time — tests always run from the repo +compliance._load_test_data() +DIGEST_TESTS = compliance.DIGEST_TESTS +COMPARISON_FIXTURES = compliance.COMPARISON_FIXTURES + +# Reason used for all ordering-dependent xfails +_ORDER_REASON = "gtars RefgetStore does not preserve array element ordering (level 2)" + + +@pytest.mark.require_service +class TestStoreCompliance(TestAPI): + """Run compliance tests against store-backed seqcolapi server. + + Inherits all tests from TestAPI but provides api_root from + the store_test_server fixture. DB-only endpoints are overridden + to assert the expected non-200 behavior. 
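# ---- Editor's note ----------------------------------------------------------
# The same inheritance trick works against any live deployment: the base suite
# reads its target from the api_root option consumed via
# config.getoption("api_root") in tests/conftest.py, so (assuming a server is
# reachable; the URL below is illustrative) a standalone run looks like:
#
#     pytest tests/api/test_compliance.py --api-root http://localhost:8100
#
# Without --api-root, the base TestAPI tests are skipped; the subclasses here
# supply api_root from their own server fixtures instead.
# -----------------------------------------------------------------------------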
Tests that depend on
+    exact array ordering are marked xfail due to a known gtars limitation.
+    """
+
+    @pytest.fixture
+    def api_root(self, store_test_server):
+        return store_test_server
+
+    # --- Override DB-only tests ---
+
+    @pytest.mark.parametrize("attribute_name", ["lengths", "names", "sequences"])
+    def test_list_attributes(self, api_root, attribute_name):
+        """Store backend: /list/attributes returns 501 (DB-only endpoint)."""
+        res = requests.get(
+            f"{api_root}/list/attributes/{attribute_name}",
+            timeout=COMPLIANCE_TIMEOUT,
+        )
+        assert res.status_code == 501
+
+    @pytest.mark.parametrize("attr_name", ["lengths", "names", "sequences"])
+    def test_list_filter_by_attribute(self, api_root, attr_name):
+        """Store backend: attribute filtering returns 400 (not supported)."""
+        fa_name, bundle = DIGEST_TESTS[0]
+        attr_digest = bundle["level1"][attr_name]
+        res = requests.get(
+            f"{api_root}/list/collection?{attr_name}={attr_digest}",
+            timeout=COMPLIANCE_TIMEOUT,
+        )
+        assert res.status_code == 400
+
+    def test_multi_attribute_filter_and(self, api_root):
+        """Store backend: multi-attribute filtering returns 400 (not supported)."""
+        bundle = DIGEST_TESTS[0][1]
+        res = requests.get(
+            f"{api_root}/list/collection?names={bundle['level1']['names']}&lengths={bundle['level1']['lengths']}",
+            timeout=COMPLIANCE_TIMEOUT,
+        )
+        assert res.status_code == 400
+
+    # --- Override ordering-dependent tests with xfail ---
+
+    @pytest.mark.xfail(reason=_ORDER_REASON, strict=False)
+    @pytest.mark.parametrize("fa_name, bundle", DIGEST_TESTS)
+    def test_collection_level2(self, api_root, fa_name, bundle):
+        super().test_collection_level2(api_root, fa_name, bundle)
+
+    @pytest.mark.xfail(reason=_ORDER_REASON, strict=False)
+    @pytest.mark.parametrize("fa_name, bundle", DIGEST_TESTS)
+    @pytest.mark.parametrize("attr_name", ["lengths", "names", "sequences"])
+    def test_attribute_retrieval(self, api_root, fa_name, bundle, attr_name):
+        super().test_attribute_retrieval(api_root, fa_name, bundle, attr_name)
+
+    def test_transient_attribute_not_served(self, api_root):
+        """Transient attributes should return 404 from /attribute endpoint."""
+        check_transient_attribute_not_served(api_root)
+
+    @pytest.mark.xfail(reason=_ORDER_REASON, strict=False)
+    @pytest.mark.parametrize(
+        "fixture_name, expected",
+        list(COMPARISON_FIXTURES.items()),
+        ids=list(COMPARISON_FIXTURES.keys()),
+    )
+    def test_comparison(self, api_root, fixture_name, expected):
+        super().test_comparison(api_root, fixture_name, expected)
+
+    @pytest.mark.xfail(reason=_ORDER_REASON, strict=False)
+    @pytest.mark.parametrize(
+        "fixture_name, expected",
+        list(COMPARISON_FIXTURES.items()),
+        ids=list(COMPARISON_FIXTURES.keys()),
+    )
+    def test_comparison_post(self, api_root, fixture_name, expected):
+        super().test_comparison_post(api_root, fixture_name, expected)
diff --git a/tests/local/test_aliases.py b/tests/local/test_aliases.py
new file mode 100644
index 0000000..58aeacc
--- /dev/null
+++ b/tests/local/test_aliases.py
@@ -0,0 +1,99 @@
+"""Smoke tests for RefgetStore alias functionality via Python bindings."""
+
+import os
+import tempfile
+
+import pytest
+
+from refget.store import RefgetStore
+
+try:
+    from gtars.refget import RefgetStore as _check  # noqa: F401
+
+    _RUST_BINDINGS_AVAILABLE = True
+except ImportError:
+    _RUST_BINDINGS_AVAILABLE = False
+
+FASTA_PATH = "test_fasta/base.fa"
+
+
+@pytest.fixture
+def store():
+    """Create an in-memory RefgetStore with base.fa loaded."""
+    s = RefgetStore.in_memory()
+    s.disable_encoding()
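+    # (editor's note) base.fa is loaded on the next line so the alias
+    # round-trip tests below have a real collection and sequences to target
+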
s.add_sequence_collection_from_fasta(FASTA_PATH) + return s + + +@pytest.fixture +def seq_digest(store): + return store.list_sequences()[0].sha512t24u + + +@pytest.fixture +def col_digest(store): + return store.list_collections()["results"][0].digest + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_sequence_alias_round_trip(store, seq_digest): + """Add, retrieve, and remove a sequence alias; verify None for missing aliases.""" + # Not found returns None + assert store.get_sequence_by_alias("ucsc", "chr1") is None + + # Add and retrieve + store.add_sequence_alias("ucsc", "chr1", seq_digest) + result = store.get_sequence_by_alias("ucsc", "chr1") + assert result is not None + assert result.metadata.sha512t24u == seq_digest + + # Remove + assert store.remove_sequence_alias("ucsc", "chr1") is True + assert store.get_sequence_by_alias("ucsc", "chr1") is None + assert store.remove_sequence_alias("ucsc", "chr1") is False + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_collection_alias_round_trip(store, col_digest): + """Add, retrieve, and remove a collection alias; verify None for missing aliases.""" + assert store.get_collection_by_alias("genomes", "hg38") is None + + store.add_collection_alias("genomes", "hg38", col_digest) + result = store.get_collection_by_alias("genomes", "hg38") + assert result is not None + assert result.digest == col_digest + + assert store.remove_collection_alias("genomes", "hg38") is True + assert store.get_collection_by_alias("genomes", "hg38") is None + assert store.remove_collection_alias("genomes", "hg38") is False + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_load_sequence_aliases_from_tsv(store, seq_digest): + """Load aliases from TSV; verify count return and post-load lookup.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as f: + f.write(f"chr1\t{seq_digest}\n") + f.write(f"chr2\t{seq_digest}\n") + tsv_path = f.name + try: + count = store.load_sequence_aliases("from_file", tsv_path) + assert count == 2 + assert store.get_sequence_by_alias("from_file", "chr1") is not None + finally: + os.unlink(tsv_path) + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_load_collection_aliases_from_tsv(store, col_digest): + """Load aliases from TSV; verify count return and post-load lookup.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as f: + f.write(f"hg38\t{col_digest}\n") + f.write(f"GRCh38\t{col_digest}\n") + tsv_path = f.name + try: + count = store.load_collection_aliases("from_file", tsv_path) + assert count == 2 + assert store.get_collection_by_alias("from_file", "hg38") is not None + finally: + os.unlink(tsv_path) diff --git a/tests/local/test_backend.py b/tests/local/test_backend.py new file mode 100644 index 0000000..5f224ec --- /dev/null +++ b/tests/local/test_backend.py @@ -0,0 +1,218 @@ +""" +Tests for SeqColBackend protocol and RefgetStoreBackend implementation. 
+ +Verifies that: +- RefgetStoreBackend wraps RefgetStore correctly +- All SeqColBackend protocol methods work +- Error handling (ValueError, KeyError) works properly +""" + +import json +from pathlib import Path + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +try: + from refget.backend import RefgetStoreBackend, SeqColBackend + from refget.store import RefgetStore + + _RUST_BINDINGS_AVAILABLE = True +except ImportError: + _RUST_BINDINGS_AVAILABLE = False + +from refget.router import create_refget_router + +TEST_FASTA_DIR = Path("test_fasta") +BASE_FASTA = TEST_FASTA_DIR / "base.fa" +DIFFERENT_NAMES_FASTA = TEST_FASTA_DIR / "different_names.fa" + +with open(TEST_FASTA_DIR / "test_fasta_digests.json") as fp: + TEST_DIGESTS = json.load(fp) + +BASE_DIGEST = TEST_DIGESTS["base.fa"]["top_level_digest"] +BASE_LEVEL1 = TEST_DIGESTS["base.fa"]["level1"] +BASE_LEVEL2 = TEST_DIGESTS["base.fa"]["level2"] +DIFFERENT_NAMES_DIGEST = TEST_DIGESTS["different_names.fa"]["top_level_digest"] + + +@pytest.fixture +def backend(): + """Create a RefgetStoreBackend with base.fa and different_names.fa loaded.""" + store = RefgetStore.in_memory() + store.add_sequence_collection_from_fasta(str(BASE_FASTA)) + store.add_sequence_collection_from_fasta(str(DIFFERENT_NAMES_FASTA)) + return RefgetStoreBackend(store) + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +class TestRefgetStoreBackend: + """Tests for RefgetStoreBackend.""" + + def test_satisfies_protocol(self, backend): + """RefgetStoreBackend satisfies the SeqColBackend protocol.""" + assert isinstance(backend, SeqColBackend) + + def test_get_collection_level2(self, backend): + """get_collection returns level2 by default.""" + result = backend.get_collection(BASE_DIGEST) + assert "names" in result + assert "lengths" in result + assert "sequences" in result + assert isinstance(result["names"], list) + + def test_get_collection_level1(self, backend): + """get_collection with level=1 returns digest strings.""" + result = backend.get_collection(BASE_DIGEST, level=1) + assert "names" in result + assert isinstance(result["names"], str) + + def test_get_collection_not_found(self, backend): + """get_collection raises ValueError for missing digest.""" + with pytest.raises(ValueError, match="not found"): + backend.get_collection("nonexistent_digest") + + def test_get_collection_attribute(self, backend): + """get_collection_attribute returns a single attribute array matching level2.""" + names = backend.get_collection_attribute(BASE_DIGEST, "names") + assert isinstance(names, list) + # Should match what get_collection returns + level2 = backend.get_collection(BASE_DIGEST, level=2) + assert names == level2["names"] + + def test_get_collection_attribute_not_found(self, backend): + """get_collection_attribute raises ValueError for missing attribute.""" + with pytest.raises(ValueError, match="not found"): + backend.get_collection_attribute(BASE_DIGEST, "nonexistent_attr") + + def test_get_collection_itemwise(self, backend): + """get_collection_itemwise returns transposed list of dicts.""" + items = backend.get_collection_itemwise(BASE_DIGEST) + assert isinstance(items, list) + assert len(items) > 0 + for item in items: + assert "names" in item + assert "lengths" in item + + def test_get_collection_itemwise_with_limit(self, backend): + """get_collection_itemwise respects limit parameter.""" + items = backend.get_collection_itemwise(BASE_DIGEST, limit=1) + assert len(items) == 1 + + def 
test_get_attribute(self, backend): + """get_attribute returns attribute by its own digest.""" + names_digest = BASE_LEVEL1["names"] + result = backend.get_attribute("names", names_digest) + assert isinstance(result, list) + + def test_get_attribute_not_found(self, backend): + """get_attribute raises KeyError for missing attribute.""" + with pytest.raises(KeyError): + backend.get_attribute("names", "nonexistent_digest") + + def test_compare_digests(self, backend): + """compare_digests returns comparison dict.""" + result = backend.compare_digests(BASE_DIGEST, DIFFERENT_NAMES_DIGEST) + assert "attributes" in result + assert "array_elements" in result + + def test_compare_digests_not_found(self, backend): + """compare_digests raises ValueError for missing digest.""" + with pytest.raises(ValueError): + backend.compare_digests("nonexistent", DIFFERENT_NAMES_DIGEST) + + def test_compare_digest_with_level2(self, backend): + """compare_digest_with_level2 compares stored vs POSTed collection.""" + level2_b = backend.get_collection(DIFFERENT_NAMES_DIGEST, level=2) + result = backend.compare_digest_with_level2(BASE_DIGEST, level2_b) + assert "attributes" in result + assert "array_elements" in result + + def test_list_collections(self, backend): + """list_collections returns paginated results.""" + result = backend.list_collections() + assert "results" in result + assert "pagination" in result + assert result["pagination"]["total"] >= 2 + + def test_list_collections_pagination(self, backend): + """list_collections respects page_size.""" + result = backend.list_collections(page=0, page_size=1) + assert len(result["results"]) <= 1 + + def test_collection_count(self, backend): + """collection_count returns total number of collections.""" + count = backend.collection_count() + assert count >= 2 + + def test_capabilities(self, backend): + """capabilities returns expected keys for RefgetStoreBackend.""" + caps = backend.capabilities() + assert caps["backend_type"] == "refget_store" + assert "n_collections" in caps + assert "n_sequences" in caps + assert "has_sequence_data" in caps + assert isinstance(caps["collection_alias_namespaces"], list) + assert isinstance(caps["sequence_alias_namespaces"], list) + assert caps["n_collections"] >= 2 + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +class TestStoreBackend501: + """Verify DB-only endpoints return 501 when only RefgetStoreBackend is configured.""" + + @pytest.fixture + def store_client(self): + """Create a TestClient with RefgetStoreBackend but no dbagent.""" + app = FastAPI() + router = create_refget_router(sequences=False, collections=True, pangenomes=False) + app.include_router(router, prefix="/seqcol") + + store = RefgetStore.in_memory() + store.add_sequence_collection_from_fasta(str(BASE_FASTA)) + backend = RefgetStoreBackend(store) + app.state.backend = backend + # Deliberately do NOT set app.state.dbagent + return TestClient(app) + + def test_list_attributes_works_without_dbagent(self, store_client): + """GET /list/attributes/names works via backend without dbagent.""" + response = store_client.get("/seqcol/list/attributes/names") + assert response.status_code == 200 + data = response.json() + assert "results" in data + assert "pagination" in data + + def test_similarities_post_returns_501(self, store_client): + """POST /similarities/{digest} returns 501 without dbagent.""" + response = store_client.post( + f"/seqcol/similarities/{BASE_DIGEST}", + params={"species": "human"}, + ) + assert response.status_code 
== 501 + + def test_similarities_json_post_returns_501(self, store_client): + """POST /similarities/ returns 501 without dbagent.""" + response = store_client.post( + "/seqcol/similarities/", + json={"names": ["chr1"], "lengths": [100], "sequences": ["abc"]}, + ) + assert response.status_code == 501 + + def test_backend_endpoints_still_work(self, store_client): + """Backend-powered endpoints work fine without dbagent.""" + # GET /collection/{digest} uses get_backend, should work + response = store_client.get(f"/seqcol/collection/{BASE_DIGEST}") + assert response.status_code == 200 + data = response.json() + assert "names" in data + assert "lengths" in data + + def test_list_collections_still_works(self, store_client): + """GET /list/collection uses get_backend, should work.""" + response = store_client.get("/seqcol/list/collection") + assert response.status_code == 200 + data = response.json() + assert "results" in data + assert "pagination" in data diff --git a/tests/local/test_digest_functions.py b/tests/local/test_digest_functions.py index da6fed8..d3b4b34 100644 --- a/tests/local/test_digest_functions.py +++ b/tests/local/test_digest_functions.py @@ -1,15 +1,20 @@ +from pathlib import Path + import pytest from refget import GTARS_INSTALLED -from refget.digests import ga4gh_digest, py_sha512t24u_digest, py_md5_digest -from pathlib import Path +from refget.digests import ga4gh_digest, py_md5_digest, py_sha512t24u_digest if GTARS_INSTALLED: from gtars.refget import ( - sha512t24u_digest as gtars_sha512t24u_digest, - md5_digest as gtars_md5_digest, digest_fasta, ) + from gtars.refget import ( + md5_digest as gtars_md5_digest, + ) + from gtars.refget import ( + sha512t24u_digest as gtars_sha512t24u_digest, + ) @pytest.mark.skipif(not GTARS_INSTALLED, reason="gtars is not installed") diff --git a/tests/local/test_local_models.py b/tests/local/test_local_models.py index cc84e4f..75157f1 100644 --- a/tests/local/test_local_models.py +++ b/tests/local/test_local_models.py @@ -1,11 +1,12 @@ import json import os + import pytest + from refget import InvalidSeqColError from refget.models import SequenceCollection from refget.utils import compare_seqcols, validate_seqcol - -from tests.conftest import DEMO_FILES, DIGEST_TESTS, API_TEST_DIR +from tests.conftest import API_TEST_DIR, DEMO_FILES, DIGEST_TESTS # Pairs of files to compare, with the "correct" compare response COMPARE_TESTS = [ diff --git a/tests/local/test_local_models_gtars.py b/tests/local/test_local_models_gtars.py index 8ac2445..e0b2122 100644 --- a/tests/local/test_local_models_gtars.py +++ b/tests/local/test_local_models_gtars.py @@ -1,24 +1,26 @@ -import pytest import logging - -_LOGGER = logging.getLogger(__name__) from pathlib import Path +import pytest + from refget.models import SequenceCollection as pythonSequenceCollection +from refget.store import RefgetStore -from refget.store import RefgetStore, StorageMode +_LOGGER = logging.getLogger(__name__) try: - from gtars.refget import ( + from gtars.refget import ( # noqa: F401 SequenceCollection as gtarsSequenceCollection, + ) + from gtars.refget import ( digest_fasta, ) _RUST_BINDINGS_AVAILABLE = True -except ImportError as e: +except ImportError: _LOGGER.warning( - f"Could not import gtars python bindings. `from_PySequenceCollection` will not be available." + "Could not import gtars python bindings. `from_PySequenceCollection` will not be available." 
) _RUST_BINDINGS_AVAILABLE = False @@ -35,9 +37,9 @@ def test_pysequencecollection(self): bridged_seq_col = pythonSequenceCollection.from_PySequenceCollection( gtars_seq_col=gtars_digested_seq_col ) - assert ( - bridged_seq_col.digest == python_seq_col.digest == gtars_digested_seq_col.digest - ), "Top-level digest mismatch!" + assert bridged_seq_col.digest == python_seq_col.digest == gtars_digested_seq_col.digest, ( + "Top-level digest mismatch!" + ) assert bridged_seq_col.sequences.digest == python_seq_col.sequences.digest assert bridged_seq_col.sequences.value == python_seq_col.sequences.value diff --git a/tests/local/test_refget_clients.py b/tests/local/test_refget_clients.py index 13b81e4..77941df 100644 --- a/tests/local/test_refget_clients.py +++ b/tests/local/test_refget_clients.py @@ -8,7 +8,7 @@ see tests/integration/test_seqcolapi_client.py """ -from refget.clients import SequenceCollectionClient, FastaDrsClient +from refget.clients import FastaDrsClient, SequenceCollectionClient class TestClientConstruction: diff --git a/tests/local/test_remove_collection.py b/tests/local/test_remove_collection.py new file mode 100644 index 0000000..b998770 --- /dev/null +++ b/tests/local/test_remove_collection.py @@ -0,0 +1,35 @@ +"""Smoke test for RefgetStore.remove_collection() Python binding.""" + +import pytest + +from refget.store import RefgetStore + +try: + from gtars.refget import RefgetStore as _check # noqa: F401 + + _RUST_BINDINGS_AVAILABLE = True +except ImportError: + _RUST_BINDINGS_AVAILABLE = False + +FASTA_PATH = "test_fasta/base.fa" + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_remove_collection_round_trip(): + """Add a collection, remove it with orphan cleanup, verify store is empty.""" + store = RefgetStore.in_memory() + store.set_quiet(True) + store.add_sequence_collection_from_fasta(FASTA_PATH) + + assert len(store.list_collections()["results"]) == 1 + assert len(store.list_sequences()) > 0 + + digest = store.list_collections()["results"][0].digest + + # Nonexistent returns False + assert store.remove_collection("nonexistent") is False + + # Real removal with orphan cleanup + assert store.remove_collection(digest, remove_orphan_sequences=True) is True + assert len(store.list_collections()["results"]) == 0 + assert len(store.list_sequences()) == 0 diff --git a/tests/local/test_store_seqcol_features.py b/tests/local/test_store_seqcol_features.py new file mode 100644 index 0000000..8a9bae2 --- /dev/null +++ b/tests/local/test_store_seqcol_features.py @@ -0,0 +1,102 @@ +""" +Tests for RefgetStore seqcol features: level1/level2, compare, find_collections_by_attribute. 
+ +Only tests that verify Python-specific behavior beyond what Rust tests cover: +- Rust/Python parity for compare() +- Multi-collection attribute search +- Basic level1/level2 smoke test +""" + +import json +from pathlib import Path + +import pytest + +try: + from refget.store import RefgetStore + + _RUST_BINDINGS_AVAILABLE = True +except ImportError: + _RUST_BINDINGS_AVAILABLE = False + +TEST_FASTA_DIR = Path("test_fasta") +BASE_FASTA = TEST_FASTA_DIR / "base.fa" +DIFFERENT_NAMES_FASTA = TEST_FASTA_DIR / "different_names.fa" + +with open(TEST_FASTA_DIR / "test_fasta_digests.json") as fp: + TEST_DIGESTS = json.load(fp) + +BASE_DIGEST = TEST_DIGESTS["base.fa"]["top_level_digest"] +BASE_LEVEL1 = TEST_DIGESTS["base.fa"]["level1"] +BASE_LEVEL2 = TEST_DIGESTS["base.fa"]["level2"] +DIFFERENT_NAMES_DIGEST = TEST_DIGESTS["different_names.fa"]["top_level_digest"] + + +@pytest.fixture +def store_with_base(): + """Create an in-memory store with base.fa loaded.""" + store = RefgetStore.in_memory() + store.add_sequence_collection_from_fasta(str(BASE_FASTA)) + return store + + +@pytest.fixture +def store_with_two(): + """Create an in-memory store with base.fa and different_names.fa loaded.""" + store = RefgetStore.in_memory() + store.add_sequence_collection_from_fasta(str(BASE_FASTA)) + store.add_sequence_collection_from_fasta(str(DIFFERENT_NAMES_FASTA)) + return store + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_level1_and_level2_smoke(store_with_base): + """Level1 returns digests, level2 returns arrays, both have required keys.""" + lvl1 = store_with_base.get_collection_level1(BASE_DIGEST) + lvl2 = store_with_base.get_collection_level2(BASE_DIGEST) + + for key in ("names", "lengths", "sequences"): + assert key in lvl1 + assert key in lvl2 + # Level1 values are digest strings, level2 values are lists + assert isinstance(lvl1[key], str) + assert isinstance(lvl2[key], list) + + # Verify level2 matches expected values + assert sorted(lvl2["names"]) == sorted(BASE_LEVEL2["names"]) + assert sorted(lvl2["lengths"]) == sorted(BASE_LEVEL2["lengths"]) + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_compare_matches_python_implementation(store_with_two): + """Verify store.compare() (Rust) agrees with compare_seqcols() (Python) on core attributes.""" + from refget.utils import compare_seqcols + + lvl2_a = store_with_two.get_collection_level2(BASE_DIGEST) + lvl2_b = store_with_two.get_collection_level2(DIFFERENT_NAMES_DIGEST) + + python_result = compare_seqcols(lvl2_a, lvl2_b) + rust_result = store_with_two.compare(BASE_DIGEST, DIFFERENT_NAMES_DIGEST) + + core_attrs = {"names", "lengths", "sequences"} + assert core_attrs <= set(python_result["attributes"]["a_and_b"]) + assert core_attrs <= set(rust_result["attributes"]["a_and_b"]) + + for attr in core_attrs: + assert ( + rust_result["array_elements"]["a_and_b_count"][attr] + == python_result["array_elements"]["a_and_b_count"][attr] + ) + assert ( + rust_result["array_elements"]["a_and_b_same_order"][attr] + == python_result["array_elements"]["a_and_b_same_order"][attr] + ) + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_shared_attribute_returns_multiple(store_with_two): + """base.fa and different_names.fa share lengths; searching by lengths returns both.""" + lengths_digest = BASE_LEVEL1["lengths"] + results = store_with_two.find_collections_by_attribute("lengths", lengths_digest) + assert BASE_DIGEST in results + 
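+    # (editor's note) different_names.fa shares base.fa's lengths array, so its
+    # digest must be returned as well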
assert DIFFERENT_NAMES_DIGEST in results diff --git a/tests/test_cli/test_admin_commands.py b/tests/test_cli/test_admin_commands.py index f9d80de..3761d8f 100644 --- a/tests/test_cli/test_admin_commands.py +++ b/tests/test_cli/test_admin_commands.py @@ -7,9 +7,6 @@ Database-dependent admin tests are in tests/integration/test_cli_admin_integration.py """ -import pytest -import json - class TestAdminStatus: """Tests for: refget admin status diff --git a/tests/test_cli/test_config_commands.py b/tests/test_cli/test_config_commands.py index 8e9f78a..666864e 100644 --- a/tests/test_cli/test_config_commands.py +++ b/tests/test_cli/test_config_commands.py @@ -2,7 +2,6 @@ """Tests for refget config CLI commands.""" -import pytest import json @@ -101,11 +100,14 @@ def test_creates_config_file(self, cli, tmp_path, monkeypatch): # Provide minimal input for interactive prompts from typer.testing import CliRunner + from refget.cli import app runner = CliRunner() result = runner.invoke( - app, ["config", "init"], input=f"{tmp_path}/store\n\n\n" # Store path + defaults + app, + ["config", "init"], + input=f"{tmp_path}/store\n\n\n", # Store path + defaults ) # Config init should succeed or prompt for input @@ -116,6 +118,7 @@ def test_init_no_overwrite(self, cli, temp_config, monkeypatch): monkeypatch.setenv("REFGET_CONFIG", str(temp_config)) from typer.testing import CliRunner + from refget.cli import app runner = CliRunner() diff --git a/tests/test_cli/test_fasta_commands.py b/tests/test_cli/test_fasta_commands.py index df5c698..99060c3 100644 --- a/tests/test_cli/test_fasta_commands.py +++ b/tests/test_cli/test_fasta_commands.py @@ -3,43 +3,39 @@ """ Tests for refget fasta CLI commands. -These test the CLI wrapper behavior: output formatting, exit codes, argument parsing. +These test CLI-specific behavior: output formatting, exit codes, argument parsing. 
""" -import pytest +import importlib.util import json -from pathlib import Path - -import sys import os +from pathlib import Path -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from conftest import ( - BASE_FASTA, - DIFFERENT_NAMES_FASTA, - TEST_FASTA_DIGESTS, - assert_json_output, - assert_valid_digest, +_conftest_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "conftest.py" ) +_spec = importlib.util.spec_from_file_location("tests_conftest", _conftest_path) +_conftest = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_conftest) + +BASE_FASTA = _conftest.BASE_FASTA +DIFFERENT_NAMES_FASTA = _conftest.DIFFERENT_NAMES_FASTA +TEST_FASTA_DIGESTS = _conftest.TEST_FASTA_DIGESTS +assert_json_output = _conftest.assert_json_output +assert_valid_digest = _conftest.assert_valid_digest class TestFastaDigest: """Tests for: refget fasta digest """ - def test_outputs_json(self, cli, sample_fasta): - """Output is valid JSON with digest.""" - result = cli("fasta", "digest", str(sample_fasta)) - - data = assert_json_output(result, ["digest"]) - assert_valid_digest(data["digest"]) - - def test_digest_with_file_key(self, cli, sample_fasta): - """Output may include file path.""" - result = cli("fasta", "digest", str(sample_fasta)) + def test_known_digest(self, cli): + """Verify digest matches expected value for known file.""" + result = cli("fasta", "digest", str(BASE_FASTA)) assert result.exit_code == 0 data = json.loads(result.stdout) - assert "digest" in data + expected_digest = TEST_FASTA_DIGESTS["base.fa"]["top_level_digest"] + assert data["digest"] == expected_digest def test_gzipped_file(self, cli, sample_fasta_gz): """Handles gzipped files seamlessly.""" @@ -52,34 +48,13 @@ def test_gzipped_file(self, cli, sample_fasta_gz): def test_file_not_found_exit_code(self, cli): """Returns non-zero exit code for missing file.""" result = cli("fasta", "digest", "/nonexistent/file.fa") - - assert result.exit_code != 0 - # Error message goes to stderr (correct Unix behavior) - assert "not found" in result.stderr.lower() or "error" in result.stderr.lower() - - def test_missing_argument(self, cli): - """Returns non-zero exit for missing argument.""" - result = cli("fasta", "digest") - assert result.exit_code != 0 - def test_known_digest(self, cli): - """Verify digest matches expected value for known file.""" - result = cli("fasta", "digest", str(BASE_FASTA)) - - assert result.exit_code == 0 - data = json.loads(result.stdout) - expected_digest = TEST_FASTA_DIGESTS["base.fa"]["top_level_digest"] - assert data["digest"] == expected_digest - def test_different_files_different_digests(self, cli): """Different files produce different digests.""" result1 = cli("fasta", "digest", str(BASE_FASTA)) result2 = cli("fasta", "digest", str(DIFFERENT_NAMES_FASTA)) - assert result1.exit_code == 0 - assert result2.exit_code == 0 - digest1 = json.loads(result1.stdout)["digest"] digest2 = json.loads(result2.stdout)["digest"] assert digest1 != digest2 @@ -88,38 +63,6 @@ def test_different_files_different_digests(self, cli): class TestFastaSeqcol: """Tests for: refget fasta seqcol """ - def test_outputs_seqcol_json(self, cli, sample_fasta): - """Output is valid seqcol JSON.""" - result = cli("fasta", "seqcol", str(sample_fasta)) - - data = assert_json_output(result, ["names", "lengths", "sequences"]) - assert isinstance(data["names"], list) - assert isinstance(data["lengths"], list) - assert isinstance(data["sequences"], list) - - def 
test_seqcol_array_lengths_match(self, cli, sample_fasta): - """All seqcol arrays have same length.""" - result = cli("fasta", "seqcol", str(sample_fasta)) - - assert result.exit_code == 0 - data = json.loads(result.stdout) - n_seqs = len(data["names"]) - assert len(data["lengths"]) == n_seqs - assert len(data["sequences"]) == n_seqs - - def test_output_to_file(self, cli, sample_fasta, tmp_path): - """Writes to file with -o option.""" - output = tmp_path / "out.seqcol.json" - result = cli("fasta", "seqcol", str(sample_fasta), "-o", str(output)) - - assert result.exit_code == 0 - assert output.exists() - - data = json.loads(output.read_text()) - assert "names" in data - assert "lengths" in data - assert "sequences" in data - def test_known_seqcol(self, cli): """Verify seqcol matches expected values for known file.""" result = cli("fasta", "seqcol", str(BASE_FASTA)) @@ -131,41 +74,20 @@ def test_known_seqcol(self, cli): assert data["names"] == expected["names"] assert data["lengths"] == expected["lengths"] - def test_gzipped_file(self, cli, sample_fasta_gz): - """Handles gzipped FASTA files.""" - result = cli("fasta", "seqcol", str(sample_fasta_gz)) + def test_output_to_file(self, cli, sample_fasta, tmp_path): + """Writes to file with -o option.""" + output = tmp_path / "out.seqcol.json" + result = cli("fasta", "seqcol", str(sample_fasta), "-o", str(output)) assert result.exit_code == 0 - data = json.loads(result.stdout) + assert output.exists() + data = json.loads(output.read_text()) assert "names" in data class TestFastaFai: """Tests for: refget fasta fai """ - def test_outputs_fai_format(self, cli, sample_fasta, tmp_path): - """Outputs valid FAI format.""" - output = tmp_path / "test.fa.fai" - result = cli("fasta", "fai", str(sample_fasta), "-o", str(output)) - - assert result.exit_code == 0 - assert output.exists() - - # FAI format: name\tlength\toffset\tline_bases\tline_width - lines = output.read_text().strip().split("\n") - assert len(lines) > 0 - for line in lines: - parts = line.split("\t") - assert len(parts) >= 2 # At least name and length - - def test_fai_to_stdout(self, cli, sample_fasta): - """Outputs FAI to stdout when no -o specified.""" - result = cli("fasta", "fai", str(sample_fasta)) - - assert result.exit_code == 0 - lines = result.stdout.strip().split("\n") - assert len(lines) > 0 - def test_fai_sequence_count(self, cli, multi_seq_fasta, tmp_path): """FAI has one line per sequence.""" output = tmp_path / "test.fa.fai" @@ -179,42 +101,13 @@ def test_fai_sequence_count(self, cli, multi_seq_fasta, tmp_path): class TestFastaChromSizes: """Tests for: refget fasta chrom-sizes """ - def test_outputs_chrom_sizes(self, cli, sample_fasta, tmp_path): - """Outputs valid chrom.sizes format.""" - output = tmp_path / "test.chrom.sizes" - result = cli("fasta", "chrom-sizes", str(sample_fasta), "-o", str(output)) - - assert result.exit_code == 0 - assert output.exists() - - # Format: name\tlength - lines = output.read_text().strip().split("\n") - for line in lines: - parts = line.split("\t") - assert len(parts) == 2 - assert parts[1].isdigit() - - def test_chrom_sizes_to_stdout(self, cli, sample_fasta): - """Outputs chrom.sizes to stdout when no -o specified.""" - result = cli("fasta", "chrom-sizes", str(sample_fasta)) - - assert result.exit_code == 0 - lines = result.stdout.strip().split("\n") - assert len(lines) > 0 - for line in lines: - parts = line.split("\t") - assert len(parts) == 2 - def test_chrom_sizes_values(self, cli): """Verify chrom.sizes values for known file.""" result = 
cli("fasta", "chrom-sizes", str(BASE_FASTA)) assert result.exit_code == 0 - lines = result.stdout.strip().split("\n") - - # base.fa has chrX(8), chr1(4), chr2(4) sizes = {} - for line in lines: + for line in result.stdout.strip().split("\n"): name, length = line.split("\t") sizes[name] = int(length) @@ -226,62 +119,32 @@ def test_chrom_sizes_values(self, cli): class TestFastaIndex: """Tests for: refget fasta index """ - def test_creates_fai_file(self, cli, sample_fasta): - """Creates .fai file.""" - result = cli("fasta", "index", str(sample_fasta)) - - assert result.exit_code == 0 - fai_path = Path(str(sample_fasta) + ".fai") - assert fai_path.exists() - - def test_creates_seqcol_file(self, cli, sample_fasta): - """Creates .seqcol.json file.""" - result = cli("fasta", "index", str(sample_fasta)) - - assert result.exit_code == 0 - seqcol_path = sample_fasta.parent / f"{sample_fasta.stem}.seqcol.json" - assert seqcol_path.exists() - - data = json.loads(seqcol_path.read_text()) - assert "names" in data - - def test_creates_chrom_sizes_file(self, cli, sample_fasta): - """Creates .chrom.sizes file.""" - result = cli("fasta", "index", str(sample_fasta)) + def test_index_creates_all_files(self, cli, sample_fasta): + """Index with --json lists all 5 created files.""" + result = cli("fasta", "index", str(sample_fasta), "--json") assert result.exit_code == 0 - sizes_path = sample_fasta.parent / f"{sample_fasta.stem}.chrom.sizes" - assert sizes_path.exists() - - def test_index_summary_output(self, cli, sample_fasta): - """Index command provides summary output.""" - result = cli("fasta", "index", str(sample_fasta)) - - assert result.exit_code == 0 - # Should indicate files created - assert len(result.stdout) > 0 + data = json.loads(result.stdout) + assert len(data["files_created"]) == 5 + extensions = [Path(f).suffix for f in data["files_created"]] + assert ".fai" in extensions + assert ".json" in extensions + assert ".rgsi" in extensions + assert ".rgci" in extensions class TestFastaStats: """Tests for: refget fasta stats """ - def test_outputs_stats_json(self, cli, sample_fasta): - """Outputs statistics in JSON format.""" - result = cli("fasta", "stats", str(sample_fasta), "--json") - - data = assert_json_output(result, ["sequences", "total_length"]) - assert isinstance(data["sequences"], int) - assert data["sequences"] > 0 - - def test_stats_values(self, cli, sample_fasta): - """Stats values are correct.""" - result = cli("fasta", "stats", str(sample_fasta), "--json") + def test_stats_known_file(self, cli): + """Stats for known test file.""" + result = cli("fasta", "stats", str(BASE_FASTA), "--json") assert result.exit_code == 0 data = json.loads(result.stdout) - # sample_fasta has 2 sequences, each 8 bases - assert data["sequences"] == 2 + # base.fa: chrX(8), chr1(4), chr2(4) = 16 total + assert data["sequences"] == 3 assert data["total_length"] == 16 def test_stats_plain_output(self, cli, sample_fasta): @@ -289,20 +152,8 @@ def test_stats_plain_output(self, cli, sample_fasta): result = cli("fasta", "stats", str(sample_fasta)) assert result.exit_code == 0 - # Should have some output assert len(result.stdout.strip()) > 0 - def test_stats_known_file(self, cli): - """Stats for known test file.""" - result = cli("fasta", "stats", str(BASE_FASTA), "--json") - - assert result.exit_code == 0 - data = json.loads(result.stdout) - - # base.fa: chrX(8), chr1(4), chr2(4) = 16 total - assert data["sequences"] == 3 - assert data["total_length"] == 16 - class TestFastaValidate: """Tests for: refget fasta 
validate """ @@ -310,57 +161,73 @@ class TestFastaValidate: def test_valid_fasta(self, cli, sample_fasta): """Valid FASTA passes validation.""" result = cli("fasta", "validate", str(sample_fasta)) - assert result.exit_code == 0 def test_invalid_fasta_exits_nonzero(self, cli, tmp_path): """Invalid FASTA fails validation.""" invalid = tmp_path / "invalid.fa" invalid.write_text("This is not a valid FASTA file\nNo headers here\n") - result = cli("fasta", "validate", str(invalid)) - - # Should fail with non-zero exit code assert result.exit_code != 0 -class TestFastaErrorHandling: - """Test error handling for fasta commands.""" +class TestFastaRgsi: + """Tests for: refget fasta rgsi """ - def test_nonexistent_file(self, cli): - """Graceful error for nonexistent file.""" - result = cli("fasta", "digest", "/path/to/nonexistent.fa") + def test_rgsi_format_and_content(self, cli, sample_fasta): + """Creates .rgsi with correct headers, columns, and sequence data.""" + result = cli("fasta", "rgsi", str(sample_fasta)) - assert result.exit_code != 0 - # Should have informative error message - assert ( - len(result.stdout) > 0 or len(result.stderr if hasattr(result, "stderr") else "") > 0 - ) - - def test_empty_fasta(self, cli, tmp_path): - """Handle empty FASTA file.""" - empty = tmp_path / "empty.fa" - empty.write_text("") - - result = cli("fasta", "stats", str(empty), "--json") - - # May succeed with 0 sequences or fail gracefully - if result.exit_code == 0: - data = json.loads(result.stdout) - assert data["sequences"] == 0 - - def test_permission_denied(self, cli, tmp_path): - """Handle permission denied.""" - # This test may be skipped on systems where we can't change permissions - protected = tmp_path / "protected.fa" - protected.write_text(">chr1\nACGT\n") - - import os - import stat - - try: - os.chmod(protected, 0o000) - result = cli("fasta", "digest", str(protected)) - assert result.exit_code != 0 - finally: - os.chmod(protected, stat.S_IRUSR | stat.S_IWUSR) + assert result.exit_code == 0 + rgsi_path = sample_fasta.parent / f"{sample_fasta.stem}.rgsi" + assert rgsi_path.exists() + + content = rgsi_path.read_text() + assert "##seqcol_digest=" in content + assert "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription" in content + + data_lines = [line for line in content.strip().split("\n") if not line.startswith("#")] + assert len(data_lines) == 2 # sample_fasta has 2 sequences + + # Verify first sequence + cols = data_lines[0].split("\t") + assert len(cols) == 6 + assert cols[0] == "chr1" + assert cols[1] == "8" + + def test_rgsi_custom_output(self, cli, sample_fasta, tmp_path): + """Writes to a custom output path with -o.""" + custom_output = tmp_path / "custom.rgsi" + result = cli("fasta", "rgsi", str(sample_fasta), "-o", str(custom_output)) + + assert result.exit_code == 0 + assert custom_output.exists() + + +class TestFastaRgci: + """Tests for: refget fasta rgci """ + + def test_rgci_format_and_digest(self, cli, sample_fasta): + """Creates .rgci with correct columns, and digest matches fasta digest.""" + # Get expected digest + digest_result = cli("fasta", "digest", str(sample_fasta)) + expected_digest = json.loads(digest_result.stdout)["digest"] + + # Generate RGCI + result = cli("fasta", "rgci", str(sample_fasta)) + assert result.exit_code == 0 + + rgci_path = sample_fasta.parent / f"{sample_fasta.stem}.rgci" + content = rgci_path.read_text() + lines = content.strip().split("\n") + + # Header has 8 columns + header_cols = lines[0].lstrip("#").split("\t") + assert len(header_cols) == 8 + 
+        assert header_cols[0] == "digest"
+
+        # Data row: correct column count, digest matches, n_sequences correct
+        data_cols = lines[1].split("\t")
+        assert len(data_cols) == 8
+        assert data_cols[0] == expected_digest
+        assert data_cols[1] == "2"  # sample_fasta has 2 sequences
diff --git a/tests/test_cli/test_help.py b/tests/test_cli/test_help.py
index b1e599b..f80ff4d 100644
--- a/tests/test_cli/test_help.py
+++ b/tests/test_cli/test_help.py
@@ -2,8 +2,6 @@

 """Tests for CLI help output."""

-import pytest
-

 class TestHelpOutput:
     """Verify help text displays correctly."""
diff --git a/tests/test_cli/test_seqcol_commands.py b/tests/test_cli/test_seqcol_commands.py
index de8324e..398fc47 100644
--- a/tests/test_cli/test_seqcol_commands.py
+++ b/tests/test_cli/test_seqcol_commands.py
@@ -7,21 +7,23 @@
 Network-dependent tests are in tests/integration/test_cli_seqcol_integration.py
 """

-import pytest
+import importlib.util
 import json
-import sys
 import os
-from pathlib import Path
-
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from conftest import (
-    BASE_FASTA,
-    DIFFERENT_NAMES_FASTA,
-    DIFFERENT_ORDER_FASTA,
-    SUBSET_FASTA,
-    TEST_FASTA_DIGESTS,
-    assert_json_output,
+
+_conftest_path = os.path.join(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "conftest.py"
 )
+_spec = importlib.util.spec_from_file_location("tests_conftest", _conftest_path)
+_conftest = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_conftest)
+
+BASE_FASTA = _conftest.BASE_FASTA
+DIFFERENT_NAMES_FASTA = _conftest.DIFFERENT_NAMES_FASTA
+DIFFERENT_ORDER_FASTA = _conftest.DIFFERENT_ORDER_FASTA
+SUBSET_FASTA = _conftest.SUBSET_FASTA
+TEST_FASTA_DIGESTS = _conftest.TEST_FASTA_DIGESTS
+assert_json_output = _conftest.assert_json_output


 class TestSeqcolCompare:
@@ -214,3 +216,122 @@ def test_invalid_file_format(self, cli, tmp_path):

         result = cli("seqcol", "digest", str(invalid))
         assert result.exit_code != 0
+
+
+class TestSeqcolLocalStoreLookup:
+    """Tests for local store lookup in seqcol show and compare commands."""
+
+    def test_show_from_local_store(self, cli, populated_store):
+        """Show command retrieves collection from local store."""
+        digest = populated_store["digest"]
+        store_path = populated_store["path"]
+
+        # Use REFGET_STORE env var to point to our test store
+        import os
+
+        old_env = os.environ.get("REFGET_STORE")
+        os.environ["REFGET_STORE"] = str(store_path)
+
+        try:
+            result = cli("seqcol", "show", digest)
+
+            assert result.exit_code == 0
+            data = json.loads(result.stdout)
+            # Level 2 (default) should have arrays
+            assert "names" in data
+            assert "lengths" in data
+            assert "sequences" in data
+            assert isinstance(data["names"], list)
+        finally:
+            if old_env is not None:
+                os.environ["REFGET_STORE"] = old_env
+            elif "REFGET_STORE" in os.environ:
+                del os.environ["REFGET_STORE"]
+
+    def test_show_from_local_store_level1(self, cli, populated_store):
+        """Show command with level=1 returns digests from local store."""
+        digest = populated_store["digest"]
+        store_path = populated_store["path"]
+
+        import os
+
+        old_env = os.environ.get("REFGET_STORE")
+        os.environ["REFGET_STORE"] = str(store_path)
+
+        try:
+            result = cli("seqcol", "show", digest, "--level", "1")
+
+            assert result.exit_code == 0
+            data = json.loads(result.stdout)
+            # Level 1 should have string digests, not arrays
+            assert "names" in data
+            assert "lengths" in data
+            assert "sequences" in data
+            assert isinstance(data["names"], str)
+            assert isinstance(data["lengths"], str)
+            assert isinstance(data["sequences"], str)
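+            # At level 1, each attribute collapses to its digest string
+            # rather than the full array returned at level 2.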
isinstance(data["sequences"], str) + finally: + if old_env: + os.environ["REFGET_STORE"] = old_env + elif "REFGET_STORE" in os.environ: + del os.environ["REFGET_STORE"] + + def test_compare_uses_local_store_for_digest(self, cli, populated_store): + """Compare command resolves digest inputs from local store first.""" + digest = populated_store["digest"] + store_path = populated_store["path"] + + import os + + old_env = os.environ.get("REFGET_STORE") + os.environ["REFGET_STORE"] = str(store_path) + + try: + # Compare local store collection with itself + result = cli("seqcol", "compare", digest, digest) + + # Should succeed (both resolved from local store) + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data.get("compatible", False) is True + finally: + if old_env: + os.environ["REFGET_STORE"] = old_env + elif "REFGET_STORE" in os.environ: + del os.environ["REFGET_STORE"] + + def test_compare_local_digest_with_fasta(self, cli, populated_store): + """Compare local store digest with FASTA file.""" + digest = populated_store["digest"] + store_path = populated_store["path"] + + import os + + old_env = os.environ.get("REFGET_STORE") + os.environ["REFGET_STORE"] = str(store_path) + + try: + # Compare local store collection with original FASTA + result = cli("seqcol", "compare", digest, str(BASE_FASTA)) + + # Should succeed and show they are compatible (same content) + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data.get("compatible") is True + finally: + if old_env: + os.environ["REFGET_STORE"] = old_env + elif "REFGET_STORE" in os.environ: + del os.environ["REFGET_STORE"] + + def test_show_nonexistent_digest_not_in_local_store(self, cli, temp_store, monkeypatch): + """Show command falls back to remote for digest not in local store.""" + # Use a digest that doesn't exist anywhere + fake_digest = "NONEXISTENT123456789012345678901234567890" + + monkeypatch.setenv("REFGET_STORE", str(temp_store)) + + result = cli("seqcol", "show", fake_digest) + + # Should fail (not in local store, not on remote servers) + assert result.exit_code != 0 diff --git a/tests/test_cli/test_store_commands.py b/tests/test_cli/test_store_commands.py index b77a9da..dd3c60a 100644 --- a/tests/test_cli/test_store_commands.py +++ b/tests/test_cli/test_store_commands.py @@ -2,20 +2,23 @@ """Tests for refget store CLI commands.""" -import pytest +import importlib.util import json -import sys import os -from pathlib import Path - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from conftest import ( - BASE_FASTA, - DIFFERENT_NAMES_FASTA, - DIFFERENT_ORDER_FASTA, - TEST_FASTA_DIGESTS, - assert_json_output, + +_conftest_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "conftest.py" ) +_spec = importlib.util.spec_from_file_location("tests_conftest", _conftest_path) +_conftest = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_conftest) + +BASE_FASTA = _conftest.BASE_FASTA +DIFFERENT_NAMES_FASTA = _conftest.DIFFERENT_NAMES_FASTA +DIFFERENT_ORDER_FASTA = _conftest.DIFFERENT_ORDER_FASTA +SAMPLE_FHR_JSON = _conftest.SAMPLE_FHR_JSON +TEST_FASTA_DIGESTS = _conftest.TEST_FASTA_DIGESTS +assert_json_output = _conftest.assert_json_output class TestStoreInit: @@ -214,7 +217,7 @@ def test_get_collection(self, cli, tmp_path): result = cli("store", "get", digest, "--path", str(store_path)) - data = assert_json_output(result, ["names", "lengths", "sequences"]) + assert_json_output(result, ["names", "lengths", 
"sequences"]) def test_get_nonexistent_digest(self, cli, tmp_path): """Returns error for nonexistent digest.""" @@ -278,25 +281,25 @@ def test_export_nonexistent_digest(self, cli, tmp_path): assert result.exit_code != 0 -class TestStoreSeq: - """Tests for: refget store seq """ +class TestStoreGetSequence: + """Tests for: refget store get --sequence""" def test_gets_sequence_by_name(self, cli, tmp_path): - """Gets sequence by name.""" + """Gets sequence by name using get -s.""" store_path = tmp_path / "store" cli("store", "init", "--path", str(store_path)) add_result = cli("store", "add", str(BASE_FASTA), "--path", str(store_path)) digest = json.loads(add_result.stdout)["digest"] - result = cli("store", "seq", digest, "--name", "chr1", "--path", str(store_path)) + result = cli("store", "get", digest, "-s", "--name", "chr1", "--path", str(store_path)) assert result.exit_code == 0 # Output should be sequence (GGAA for chr1 in base.fa) assert len(result.stdout.strip()) > 0 def test_substring(self, cli, tmp_path): - """Gets subsequence with range.""" + """Gets subsequence with range using get -s.""" store_path = tmp_path / "store" cli("store", "init", "--path", str(store_path)) @@ -305,8 +308,9 @@ def test_substring(self, cli, tmp_path): result = cli( "store", - "seq", + "get", digest, + "-s", "--name", "chrX", "--start", @@ -330,12 +334,49 @@ def test_seq_nonexistent_name(self, cli, tmp_path): digest = json.loads(add_result.stdout)["digest"] result = cli( - "store", "seq", digest, "--name", "nonexistent_chr", "--path", str(store_path) + "store", + "get", + digest, + "-s", + "--name", + "nonexistent_chr", + "--path", + str(store_path), ) assert result.exit_code != 0 +class TestStoreListSequences: + """Tests for: refget store list --sequences""" + + def test_list_sequences(self, cli, tmp_path): + """Lists sequences with -s flag.""" + store_path = tmp_path / "store" + cli("store", "init", "--path", str(store_path)) + cli("store", "add", str(BASE_FASTA), "--path", str(store_path)) + + result = cli("store", "list", "-s", "--path", str(store_path)) + + data = assert_json_output(result, ["sequences"]) + assert len(data["sequences"]) >= 1 + # Each sequence should have digest, name, length + for seq in data["sequences"]: + assert "digest" in seq + assert "name" in seq + assert "length" in seq + + def test_list_sequences_empty_store(self, cli, tmp_path): + """Lists sequences in empty store.""" + store_path = tmp_path / "store" + cli("store", "init", "--path", str(store_path)) + + result = cli("store", "list", "-s", "--path", str(store_path)) + + data = assert_json_output(result, ["sequences"]) + assert data["sequences"] == [] + + class TestStoreStats: """Tests for: refget store stats""" @@ -440,3 +481,187 @@ def test_add_to_nonexistent_store(self, cli, tmp_path): result = cli("store", "add", str(BASE_FASTA), "--path", str(nonexistent)) assert result.exit_code != 0 + + +def _setup_store_with_fasta(cli, tmp_path): + """Initialize a store, add BASE_FASTA, and return (store_path, digest).""" + store_path = tmp_path / "store" + cli("store", "init", "--path", str(store_path)) + add_result = cli("store", "add", str(BASE_FASTA), "--path", str(store_path)) + digest = json.loads(add_result.stdout)["digest"] + return store_path, digest + + +class TestStoreMetadata: + """Tests for: refget store metadata / metadata-set""" + + def test_metadata_no_fhr_set(self, cli, tmp_path): + """Error when no FHR metadata exists for a collection.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + result = 
cli("store", "metadata", digest, "--path", str(store_path)) + + assert result.exit_code != 0 + assert "No FHR metadata" in result.stderr + + def test_metadata_set_from_json_file(self, cli, tmp_path): + """Happy path: set FHR metadata from a JSON file.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + result = cli( + "store", + "metadata-set", + digest, + str(SAMPLE_FHR_JSON), + "--path", + str(store_path), + ) + + assert result.exit_code == 0 + assert "Set FHR metadata for collection" in result.stdout + + def test_metadata_read_after_set(self, cli, tmp_path): + """Round-trip: set metadata then read it back.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + cli( + "store", + "metadata-set", + digest, + str(SAMPLE_FHR_JSON), + "--path", + str(store_path), + ) + + result = cli("store", "metadata", digest, "--path", str(store_path)) + + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data["genome"] == "Test organism" + assert data["version"] == "v1.0" + assert data["masking"] == "soft-masked" + assert "test_v1" in data["genomeSynonym"] + + def test_metadata_output_is_valid_json(self, cli, tmp_path): + """Output is valid JSON with camelCase keys per FHR spec.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + cli( + "store", + "metadata-set", + digest, + str(SAMPLE_FHR_JSON), + "--path", + str(store_path), + ) + + result = cli("store", "metadata", digest, "--path", str(store_path)) + + assert result.exit_code == 0 + data = json.loads(result.stdout) + + # Verify camelCase keys from the FHR spec + assert "schemaVersion" in data + assert "genomeSynonym" in data + assert "dateCreated" in data + + # Verify no snake_case keys leaked through + raw = result.stdout + assert "schema_version" not in raw + assert "genome_synonym" not in raw + assert "date_created" not in raw + + def test_metadata_set_nonexistent_file(self, cli, tmp_path): + """Error when JSON file does not exist.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + result = cli( + "store", + "metadata-set", + digest, + "/nonexistent/fhr.json", + "--path", + str(store_path), + ) + + assert result.exit_code != 0 + + def test_metadata_nonexistent_digest(self, cli, tmp_path): + """Error when reading metadata for a nonexistent digest.""" + store_path = tmp_path / "store" + cli("store", "init", "--path", str(store_path)) + + result = cli( + "store", + "metadata", + "nonexistent_digest_123", + "--path", + str(store_path), + ) + + assert result.exit_code != 0 + + def test_metadata_set_then_overwrite(self, cli, tmp_path): + """Overwriting metadata replaces the previous values.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + # Set original metadata + cli( + "store", + "metadata-set", + digest, + str(SAMPLE_FHR_JSON), + "--path", + str(store_path), + ) + + # Create updated FHR JSON + updated_fhr = tmp_path / "updated_fhr.json" + updated_fhr.write_text( + json.dumps( + { + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1.0, + "genome": "Updated organism", + "version": "v2.0", + } + ) + ) + + # Overwrite + cli( + "store", + "metadata-set", + digest, + str(updated_fhr), + "--path", + str(store_path), + ) + + result = cli("store", "metadata", digest, "--path", str(store_path)) + + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data["genome"] == "Updated organism" + + def test_metadata_removed_with_collection(self, cli, tmp_path): + """Metadata 
+        store_path, digest = _setup_store_with_fasta(cli, tmp_path)
+
+        # Set metadata
+        cli(
+            "store",
+            "metadata-set",
+            digest,
+            str(SAMPLE_FHR_JSON),
+            "--path",
+            str(store_path),
+        )
+
+        # Remove the collection
+        cli("store", "remove", digest, "--path", str(store_path))
+
+        # Metadata should be gone
+        result = cli("store", "metadata", digest, "--path", str(store_path))
+
+        assert result.exit_code != 0
diff --git a/tests/test_cli/test_store_crate.py b/tests/test_cli/test_store_crate.py
new file mode 100644
index 0000000..c5fdf51
--- /dev/null
+++ b/tests/test_cli/test_store_crate.py
@@ -0,0 +1,356 @@
+# tests/test_cli/test_store_crate.py
+
+"""Tests for refget store crate CLI command."""
+
+import importlib.util
+import json
+import os
+
+_conftest_path = os.path.join(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "conftest.py"
+)
+_spec = importlib.util.spec_from_file_location("tests_conftest", _conftest_path)
+_conftest = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_conftest)
+
+BASE_FASTA = _conftest.BASE_FASTA
+assert_json_output = _conftest.assert_json_output
+
+
+def _init_and_add(cli, tmp_path):
+    """Initialize a store and add a FASTA, return store_path."""
+    store_path = tmp_path / "store"
+    cli("store", "init", "--path", str(store_path))
+    cli("store", "add", str(BASE_FASTA), "--path", str(store_path))
+    return store_path
+
+
+class TestStoreCrate:
+    """Tests for: refget store crate"""
+
+    def test_produces_valid_json(self, cli, tmp_path):
+        """Crate command produces valid JSON output file."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        result = cli(
+            "store",
+            "crate",
+            "--path",
+            str(store_path),
+            "--name",
+            "Test Store",
+        )
+
+        assert result.exit_code == 0
+        crate_path = store_path / "ro-crate-metadata.json"
+        assert crate_path.exists()
+
+        crate = json.loads(crate_path.read_text())
+        assert "@context" in crate
+        assert "@graph" in crate
+        assert isinstance(crate["@graph"], list)
+
+    def test_has_must_entities(self, cli, tmp_path):
+        """Crate contains all MUST entities per the profile."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        cli(
+            "store",
+            "crate",
+            "--path",
+            str(store_path),
+            "--name",
+            "Test Store",
+        )
+
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+        ids = {e["@id"] for e in crate["@graph"]}
+
+        # MUST entities
+        assert "ro-crate-metadata.json" in ids
+        assert "./" in ids
+        assert "rgstore.json" in ids
+        assert "sequences.rgsi" in ids
+        assert "sequences/" in ids
+        assert "collections/" in ids
+
+    def test_metadata_descriptor_conformsto(self, cli, tmp_path):
+        """Metadata descriptor has correct conformsTo."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        cli(
+            "store",
+            "crate",
+            "--path",
+            str(store_path),
+            "--name",
+            "Test Store",
+        )
+
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+        descriptor = next(e for e in crate["@graph"] if e["@id"] == "ro-crate-metadata.json")
+
+        conforms = [c["@id"] for c in descriptor["conformsTo"]]
+        assert "https://w3id.org/ro/crate/1.2" in conforms
+        assert "https://w3id.org/ga4gh/refget/refgetstore-crate/0.1" in conforms
+
+    def test_root_dataset_name(self, cli, tmp_path):
+        """Root dataset has the specified name."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        cli(
+            "store",
+            "crate",
+            "--path",
+            str(store_path),
+            "--name",
+            "My Genome Store",
+        )
+
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+        root = next(e for e in crate["@graph"] if e["@id"] == "./")
in crate["@graph"] if e["@id"] == "./") + assert root["name"] == "My Genome Store" + + def test_property_values(self, cli, tmp_path): + """Crate contains PropertyValue entities with correct stats.""" + store_path = _init_and_add(cli, tmp_path) + + cli( + "store", + "crate", + "--path", + str(store_path), + "--name", + "Test Store", + ) + + crate = json.loads((store_path / "ro-crate-metadata.json").read_text()) + props = { + e["propertyID"]: e["value"] + for e in crate["@graph"] + if e.get("@type") == "PropertyValue" + } + + assert "storageMode" in props + assert "sequenceCount" in props + assert props["sequenceCount"] > 0 + assert "collectionCount" in props + assert props["collectionCount"] >= 1 + assert props["refgetDigestAlgorithm"] == "sha512t24u" + + def test_author_parsing_orcid(self, cli, tmp_path): + """Parses 'Name ' format into Person entity.""" + store_path = _init_and_add(cli, tmp_path) + + cli( + "store", + "crate", + "--path", + str(store_path), + "--name", + "Test Store", + "--author", + "Jane Doe ", + ) + + crate = json.loads((store_path / "ro-crate-metadata.json").read_text()) + + # Find Person entity + person = next( + (e for e in crate["@graph"] if e.get("@type") == "Person"), + None, + ) + assert person is not None + assert person["@id"] == "https://orcid.org/0000-0001-1234-5678" + assert person["name"] == "Jane Doe" + + # Root dataset references author + root = next(e for e in crate["@graph"] if e["@id"] == "./") + assert root["author"]["@id"] == "https://orcid.org/0000-0001-1234-5678" + + def test_author_plain_name(self, cli, tmp_path): + """Handles plain name without URL.""" + store_path = _init_and_add(cli, tmp_path) + + cli( + "store", + "crate", + "--path", + str(store_path), + "--name", + "Test Store", + "--author", + "John Smith", + ) + + crate = json.loads((store_path / "ro-crate-metadata.json").read_text()) + person = next( + (e for e in crate["@graph"] if e.get("@type") == "Person"), + None, + ) + assert person is not None + assert person["name"] == "John Smith" + + def test_license(self, cli, tmp_path): + """License creates a CreativeWork entity.""" + store_path = _init_and_add(cli, tmp_path) + + cli( + "store", + "crate", + "--path", + str(store_path), + "--name", + "Test Store", + "--license", + "https://creativecommons.org/publicdomain/zero/1.0/", + ) + + crate = json.loads((store_path / "ro-crate-metadata.json").read_text()) + + root = next(e for e in crate["@graph"] if e["@id"] == "./") + assert root["license"]["@id"] == "https://creativecommons.org/publicdomain/zero/1.0/" + + license_entity = next( + ( + e + for e in crate["@graph"] + if e["@id"] == "https://creativecommons.org/publicdomain/zero/1.0/" + ), + None, + ) + assert license_entity is not None + assert license_entity["@type"] == "CreativeWork" + + def test_custom_output_path(self, cli, tmp_path): + """Writes to custom output path.""" + store_path = _init_and_add(cli, tmp_path) + output_path = tmp_path / "custom" / "crate.json" + + result = cli( + "store", + "crate", + "--path", + str(store_path), + "--name", + "Test Store", + "--output", + str(output_path), + ) + + assert result.exit_code == 0 + assert output_path.exists() + + crate = json.loads(output_path.read_text()) + assert "@graph" in crate + + def test_no_aliases_when_absent(self, cli, tmp_path): + """Does not include aliases/ when directory doesn't exist.""" + store_path = _init_and_add(cli, tmp_path) + + # Remove aliases dir if it exists + aliases = store_path / "aliases" + if aliases.exists(): + import shutil + + 
+            shutil.rmtree(aliases)
+
+        cli(
+            "store",
+            "crate",
+            "--path",
+            str(store_path),
+            "--name",
+            "Test Store",
+        )
+
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+        ids = {e["@id"] for e in crate["@graph"]}
+        assert "aliases/" not in ids
+
+    def test_create_action_provenance(self, cli, tmp_path):
+        """Crate includes CreateAction with refget version."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        cli(
+            "store",
+            "crate",
+            "--path",
+            str(store_path),
+            "--name",
+            "Test Store",
+        )
+
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+
+        action = next(
+            (e for e in crate["@graph"] if e.get("@type") == "CreateAction"),
+            None,
+        )
+        assert action is not None
+        assert "endTime" in action
+        assert action["instrument"]["@id"] == "#refget-software"
+
+        sw = next(
+            (e for e in crate["@graph"] if e["@id"] == "#refget-software"),
+            None,
+        )
+        assert sw is not None
+        assert sw["@type"] == "SoftwareApplication"
+        assert "version" in sw
+
+    def test_description_optional(self, cli, tmp_path):
+        """Description is included when provided, absent when not."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        # Without description
+        cli(
+            "store",
+            "crate",
+            "--path",
+            str(store_path),
+            "--name",
+            "Test Store",
+        )
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+        root = next(e for e in crate["@graph"] if e["@id"] == "./")
+        assert "description" not in root
+
+        # With description
+        cli(
+            "store",
+            "crate",
+            "--path",
+            str(store_path),
+            "--name",
+            "Test Store",
+            "--description",
+            "A test store for genomes",
+        )
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+        root = next(e for e in crate["@graph"] if e["@id"] == "./")
+        assert root["description"] == "A test store for genomes"
+
+    def test_empty_store(self, cli, tmp_path):
+        """Crate works for empty store with zero counts."""
+        store_path = tmp_path / "store"
+        cli("store", "init", "--path", str(store_path))
+
+        result = cli(
+            "store",
+            "crate",
+            "--path",
+            str(store_path),
+            "--name",
+            "Empty Store",
+        )
+
+        assert result.exit_code == 0
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+        props = {
+            e["propertyID"]: e["value"]
+            for e in crate["@graph"]
+            if e.get("@type") == "PropertyValue"
+        }
+        assert props["sequenceCount"] == 0
+        assert props["collectionCount"] == 0
diff --git a/tests/test_cli/test_store_pull.py b/tests/test_cli/test_store_pull.py
new file mode 100644
index 0000000..0b737a4
--- /dev/null
+++ b/tests/test_cli/test_store_pull.py
@@ -0,0 +1,433 @@
+# tests/test_cli/test_store_pull.py
+
+"""Tests for refget store pull CLI command.
+
+Note: The HTTP server fixtures use subprocess instead of threading because
+gtars' open_remote (Rust/PyO3) holds the GIL during HTTP requests, which
+would deadlock a Python-thread-based HTTP server.
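+
+The fixtures below serve a source store directory with "python -m http.server"
+run in a child process, and point "refget store pull" at it over plain HTTP.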
+""" + +import importlib.util +import json +import os +import socket +import subprocess +import sys +import time + +import pytest + +_conftest_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "conftest.py" +) +_spec = importlib.util.spec_from_file_location("tests_conftest", _conftest_path) +_conftest = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_conftest) + +BASE_FASTA = _conftest.BASE_FASTA +DIFFERENT_NAMES_FASTA = _conftest.DIFFERENT_NAMES_FASTA + +# Skip entire module if gtars is not installed +pytest.importorskip("gtars") + + +def _find_free_port() -> int: + """Find a free port on localhost.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +def _start_http_server(directory: str, port: int) -> subprocess.Popen: + """Start an HTTP server as a subprocess serving the given directory.""" + proc = subprocess.Popen( + [sys.executable, "-m", "http.server", str(port), "--directory", directory], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + # Wait for server to be ready + max_wait = 5.0 + start_time = time.time() + while time.time() - start_time < max_wait: + try: + with socket.create_connection(("127.0.0.1", port), timeout=0.1): + break + except (ConnectionRefusedError, OSError): + time.sleep(0.1) + else: + proc.terminate() + raise RuntimeError(f"HTTP server failed to start on port {port}") + return proc + + +def _stop_http_server(proc: subprocess.Popen) -> None: + """Stop an HTTP server subprocess.""" + proc.terminate() + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + + +@pytest.fixture +def remote_store_server(cli, tmp_path): + """Set up a local store, serve it over HTTP, yield (url, digest, source_store_path).""" + source_store = tmp_path / "source_store" + cli("store", "init", "--path", str(source_store)) + add_result = cli("store", "add", str(BASE_FASTA), "--path", str(source_store)) + assert add_result.exit_code == 0, f"Failed to add FASTA: {add_result.stdout}" + digest = json.loads(add_result.stdout)["digest"] + + port = _find_free_port() + proc = _start_http_server(str(source_store), port) + + yield f"http://127.0.0.1:{port}", digest, source_store + + _stop_http_server(proc) + + +@pytest.fixture +def multi_remote_store_server(cli, tmp_path): + """Set up a local store with multiple FASTAs, serve over HTTP.""" + source_store = tmp_path / "multi_source_store" + cli("store", "init", "--path", str(source_store)) + + add_result1 = cli("store", "add", str(BASE_FASTA), "--path", str(source_store)) + assert add_result1.exit_code == 0 + digest1 = json.loads(add_result1.stdout)["digest"] + + add_result2 = cli("store", "add", str(DIFFERENT_NAMES_FASTA), "--path", str(source_store)) + assert add_result2.exit_code == 0 + digest2 = json.loads(add_result2.stdout)["digest"] + + port = _find_free_port() + proc = _start_http_server(str(source_store), port) + + yield f"http://127.0.0.1:{port}", digest1, digest2, source_store + + _stop_http_server(proc) + + +@pytest.fixture +def local_store(cli, tmp_path): + """Initialize an empty local store for pulling into.""" + store_path = tmp_path / "local_store" + result = cli("store", "init", "--path", str(store_path)) + assert result.exit_code == 0 + return store_path + + +class TestStorePullBasic: + """Core pull functionality tests.""" + + def test_pull_single_digest(self, cli, tmp_path, remote_store_server): + """Pull a known digest from the remote store server.""" + server_url, 
+        local_store = tmp_path / "pull_store"
+        cli("store", "init", "--path", str(local_store))
+
+        result = cli("store", "pull", digest, "--server", server_url, "--path", str(local_store))
+
+        assert result.exit_code == 0, f"Pull failed: {result.stdout}"
+        data = json.loads(result.stdout)
+        assert data["status"] == "pulled"
+        assert data["digest"] == digest
+
+    def test_pull_creates_local_cache(self, cli, tmp_path, remote_store_server):
+        """After pulling, the .remote_cache directory is created."""
+        server_url, digest, _ = remote_store_server
+        local_store = tmp_path / "cache_store"
+        cli("store", "init", "--path", str(local_store))
+
+        result = cli("store", "pull", digest, "--server", server_url, "--path", str(local_store))
+
+        assert result.exit_code == 0
+        cache_dir = local_store / ".remote_cache"
+        assert cache_dir.exists()
+
+    def test_pull_quiet_flag(self, cli, tmp_path, remote_store_server):
+        """Pull with --quiet suppresses progress output."""
+        server_url, digest, _ = remote_store_server
+        local_store = tmp_path / "quiet_store"
+        cli("store", "init", "--path", str(local_store))
+
+        result = cli(
+            "store", "pull", digest, "--server", server_url, "--path", str(local_store), "--quiet"
+        )
+
+        assert result.exit_code == 0
+        data = json.loads(result.stdout)
+        assert data["status"] == "pulled"
+
+
+class TestStorePullEager:
+    """Eager sequence fetching tests."""
+
+    def test_pull_eager_fetches_sequences(self, cli, tmp_path, remote_store_server):
+        """Pull with --eager pre-fetches all sequences."""
+        server_url, digest, _ = remote_store_server
+        local_store = tmp_path / "eager_store"
+        cli("store", "init", "--path", str(local_store))
+
+        result = cli(
+            "store", "pull", digest, "--server", server_url, "--path", str(local_store), "--eager"
+        )
+
+        assert result.exit_code == 0, f"Eager pull failed: {result.stdout}"
+        data = json.loads(result.stdout)
+        assert data["eager"] is True
+        assert data["sequences_fetched"] > 0
+
+    def test_pull_default_is_lazy(self, cli, tmp_path, remote_store_server):
+        """Pull without --eager uses lazy mode."""
+        server_url, digest, _ = remote_store_server
+        local_store = tmp_path / "lazy_store"
+        cli("store", "init", "--path", str(local_store))
+
+        result = cli("store", "pull", digest, "--server", server_url, "--path", str(local_store))
+
+        assert result.exit_code == 0
+        data = json.loads(result.stdout)
+        assert data["eager"] is False
+        assert "sequences_fetched" not in data
+
+
+class TestStorePullBatch:
+    """Batch pull via --file tests."""
+
+    def test_pull_from_file(self, cli, tmp_path, multi_remote_store_server):
+        """Pull multiple digests from a file."""
+        server_url, digest1, digest2, _ = multi_remote_store_server
+        local_store = tmp_path / "batch_store"
+        cli("store", "init", "--path", str(local_store))
+
+        digest_file = tmp_path / "digests.txt"
+        digest_file.write_text(f"{digest1}\n{digest2}\n")
+
+        result = cli(
+            "store",
+            "pull",
+            "--file",
+            str(digest_file),
+            "--server",
+            server_url,
+            "--path",
+            str(local_store),
+        )
+
+        assert result.exit_code == 0, f"Batch pull failed: {result.stdout}"
+        data = json.loads(result.stdout)
+        assert "results" in data
+        assert len(data["results"]) == 2
+
+    def test_pull_file_with_blank_lines(self, cli, tmp_path, remote_store_server):
+        """File with blank lines and whitespace is handled gracefully."""
+        server_url, digest, _ = remote_store_server
+        local_store = tmp_path / "blank_store"
+        cli("store", "init", "--path", str(local_store))
+
+        digest_file = tmp_path / "digests_blanks.txt"
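+        # Surround the single real digest with blank and whitespace-only lines.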
"digests_blanks.txt" + digest_file.write_text(f"\n \n{digest}\n\n \n") + + result = cli( + "store", + "pull", + "--file", + str(digest_file), + "--server", + server_url, + "--path", + str(local_store), + ) + + assert result.exit_code == 0 + data = json.loads(result.stdout) + # Single digest after stripping blanks, so no "results" wrapper + assert data["digest"] == digest + assert data["status"] == "pulled" + + def test_pull_file_not_found(self, cli, tmp_path): + """Passing a nonexistent file to --file returns error.""" + local_store = tmp_path / "nofile_store" + cli("store", "init", "--path", str(local_store)) + + result = cli( + "store", + "pull", + "--file", + "/nonexistent/digests.txt", + "--server", + "http://127.0.0.1:1", + "--path", + str(local_store), + ) + + assert result.exit_code != 0 + + def test_pull_empty_file(self, cli, tmp_path, remote_store_server): + """Empty file returns error about no digests.""" + server_url, _, _ = remote_store_server + local_store = tmp_path / "empty_file_store" + cli("store", "init", "--path", str(local_store)) + + digest_file = tmp_path / "empty.txt" + digest_file.write_text("") + + result = cli( + "store", + "pull", + "--file", + str(digest_file), + "--server", + server_url, + "--path", + str(local_store), + ) + + assert result.exit_code != 0 + + +class TestStorePullAlreadyLocal: + """Skip already-cached collections.""" + + def test_pull_already_local(self, cli, tmp_path, remote_store_server): + """Pulling a digest that exists locally returns already_local status.""" + server_url, digest, _ = remote_store_server + local_store = tmp_path / "already_store" + cli("store", "init", "--path", str(local_store)) + + # Add the same FASTA to local store + cli("store", "add", str(BASE_FASTA), "--path", str(local_store)) + + # Try to pull -- should detect it is already local + result = cli("store", "pull", digest, "--server", server_url, "--path", str(local_store)) + + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data["status"] == "already_local" + + +class TestStorePullErrors: + """Error case tests.""" + + def test_pull_nonexistent_digest(self, cli, tmp_path, remote_store_server): + """Pull a digest that does not exist on the remote.""" + server_url, _, _ = remote_store_server + local_store = tmp_path / "nonexist_store" + cli("store", "init", "--path", str(local_store)) + + result = cli( + "store", + "pull", + "NONEXISTENT_DIGEST_12345678901234", + "--server", + server_url, + "--path", + str(local_store), + ) + + assert result.exit_code != 0 + data = json.loads(result.stdout) + assert data["status"] == "not_found" + + def test_pull_unreachable_server(self, cli, tmp_path): + """Pull from an unreachable URL returns error.""" + local_store = tmp_path / "unreach_store" + cli("store", "init", "--path", str(local_store)) + + result = cli( + "store", + "pull", + "some_digest_abc123", + "--server", + "http://127.0.0.1:1", + "--path", + str(local_store), + ) + + assert result.exit_code != 0 + + def test_pull_no_digest_or_file(self, cli, tmp_path): + """Pull with neither digest nor --file returns error.""" + local_store = tmp_path / "noarg_store" + cli("store", "init", "--path", str(local_store)) + + result = cli("store", "pull", "--server", "http://127.0.0.1:1", "--path", str(local_store)) + + assert result.exit_code != 0 + + def test_pull_both_digest_and_file(self, cli, tmp_path): + """Pull with both digest and --file returns error.""" + local_store = tmp_path / "both_store" + cli("store", "init", "--path", str(local_store)) + + 
+        digest_file = tmp_path / "digests.txt"
+        digest_file.write_text("some_digest\n")
+
+        result = cli(
+            "store",
+            "pull",
+            "some_digest",
+            "--file",
+            str(digest_file),
+            "--server",
+            "http://127.0.0.1:1",
+            "--path",
+            str(local_store),
+        )
+
+        assert result.exit_code != 0
+
+    def test_pull_no_server_configured(self, cli, tmp_path, monkeypatch):
+        """Pull without --server and no configured remotes returns error."""
+        local_store = tmp_path / "noserver_store"
+        cli("store", "init", "--path", str(local_store))
+
+        # Patch _find_remote_urls to return empty list
+        monkeypatch.setattr("refget.cli.store._find_remote_urls", lambda server_override=None: [])
+
+        result = cli("store", "pull", "some_digest", "--path", str(local_store))
+
+        assert result.exit_code != 0
+
+
+class TestStorePullMultipleRemotes:
+    """Fallback across multiple remotes."""
+
+    def test_pull_tries_next_remote_on_failure(
+        self, cli, tmp_path, remote_store_server, monkeypatch
+    ):
+        """When first remote lacks the digest, tries the next one."""
+        server_url, digest, _ = remote_store_server
+
+        # Set up an empty store served over HTTP (first remote)
+        empty_store = tmp_path / "empty_remote"
+        cli("store", "init", "--path", str(empty_store))
+
+        port = _find_free_port()
+        empty_proc = _start_http_server(str(empty_store), port)
+        empty_url = f"http://127.0.0.1:{port}"
+
+        try:
+            local_store = tmp_path / "multi_remote_store"
+            cli("store", "init", "--path", str(local_store))
+
+            # Patch to return empty server first, then the populated one
+            monkeypatch.setattr(
+                "refget.cli.store._find_remote_urls",
+                lambda server_override=None: [empty_url, server_url],
+            )
+
+            result = cli("store", "pull", digest, "--path", str(local_store), "--quiet")
+
+            assert result.exit_code == 0, f"Multi-remote pull failed: {result.stdout}"
+            # Extract JSON from output (error messages from failed remotes may precede it)
+            stdout = result.stdout
+            json_start = stdout.rfind("{")
+            assert json_start >= 0, f"No JSON found in output: {stdout}"
+            data = json.loads(stdout[json_start:])
+            assert data["status"] == "pulled"
+            assert data["source"] == server_url
+        finally:
+            _stop_http_server(empty_proc)
diff --git a/tests/test_cli_integration/test_workflows.py b/tests/test_cli_integration/test_workflows.py
index 2fce20c..8872f6a 100644
--- a/tests/test_cli_integration/test_workflows.py
+++ b/tests/test_cli_integration/test_workflows.py
@@ -6,20 +6,22 @@
 These tests verify that commands work together correctly
 in typical usage patterns.
 """

-import pytest
+import importlib.util
 import json
-import sys
 import os
-from pathlib import Path
-
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from conftest import (
-    BASE_FASTA,
-    DIFFERENT_NAMES_FASTA,
-    DIFFERENT_ORDER_FASTA,
-    SUBSET_FASTA,
-    TEST_FASTA_DIGESTS,
+
+_conftest_path = os.path.join(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "conftest.py"
 )
+_spec = importlib.util.spec_from_file_location("tests_conftest", _conftest_path)
+_conftest = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_conftest)
+
+BASE_FASTA = _conftest.BASE_FASTA
+DIFFERENT_NAMES_FASTA = _conftest.DIFFERENT_NAMES_FASTA
+DIFFERENT_ORDER_FASTA = _conftest.DIFFERENT_ORDER_FASTA
+SUBSET_FASTA = _conftest.SUBSET_FASTA
+TEST_FASTA_DIGESTS = _conftest.TEST_FASTA_DIGESTS


 class TestDigestAndCompare: