From 8c635b3b732751cf35454d3eb483466c7e5100b4 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 18 Feb 2026 09:03:45 -0500 Subject: [PATCH 01/31] Add Swagger documentation for API query parameters; improve fhr metadata and other gtars access --- refget/__init__.py | 10 ++++++-- refget/cli/store.py | 37 ++++++++++++++++++++++++++++ refget/router.py | 59 +++++++++++++++++++++++++++++---------------- refget/store.py | 17 +++++++++++-- 4 files changed, 98 insertions(+), 25 deletions(-) diff --git a/refget/__init__.py b/refget/__init__.py index 9b53cc3..8129a43 100644 --- a/refget/__init__.py +++ b/refget/__init__.py @@ -2,11 +2,10 @@ refget - GA4GH reference sequence and sequence collection tools. Import from submodules: - from refget.store import RefgetStore, digest_fasta, StorageMode + from refget.store import RefgetStore, digest_fasta, StorageMode, compute_fai, digest_sequence, SequenceCollection from refget.digests import sha512t24u_digest, md5_digest, ga4gh_digest from refget.utils import compare_seqcols, validate_seqcol, seqcol_digest from refget.clients import SequenceCollectionClient, FastaDrsClient - from refget.models import SequenceCollection from refget.router import create_refget_router from refget.agents import RefgetDBAgent """ @@ -15,10 +14,17 @@ from .exceptions import InvalidSeqColError from .const import GTARS_INSTALLED from .utils import canonical_str +from .store import RefgetStore, StorageMode, digest_fasta, compute_fai, digest_sequence, SequenceCollection __all__ = [ "__version__", "InvalidSeqColError", "GTARS_INSTALLED", "canonical_str", + "RefgetStore", + "StorageMode", + "digest_fasta", + "compute_fai", + "digest_sequence", + "SequenceCollection", ] diff --git a/refget/cli/store.py b/refget/cli/store.py index 7df195b..99370d7 100644 --- a/refget/cli/store.py +++ b/refget/cli/store.py @@ -906,6 +906,10 @@ def _remove_collection_from_store(store_path: Path, digest: str) -> bool: if collection_file.exists(): collection_file.unlink() + # Remove the FHR metadata sidecar file (if it exists) + fhr_file = store_path / "collections" / f"{digest}.fhr.json" + fhr_file.unlink(missing_ok=True) + return True @@ -946,3 +950,36 @@ def remove( } ) raise typer.Exit(EXIT_SUCCESS) + + +@app.command() +def metadata( + digest: str = typer.Argument(help="Collection digest"), + path: Optional[Path] = typer.Option( + None, "--path", "-p", help="Store path" + ), +): + """Show FHR metadata for a collection.""" + store = _load_store(path) + fhr = store.get_fhr_metadata(digest) + if fhr is None: + print_error(f"No FHR metadata for collection {digest}", EXIT_FAILURE) + import json + + print(json.dumps(fhr.to_dict(), indent=2)) + raise typer.Exit(EXIT_SUCCESS) + + +@app.command("metadata-set") +def metadata_set( + digest: str = typer.Argument(help="Collection digest"), + file: Path = typer.Argument(help="Path to FHR JSON file"), + path: Optional[Path] = typer.Option( + None, "--path", "-p", help="Store path" + ), +): + """Set FHR metadata for a collection from a JSON file.""" + store = _load_store(path) + store.load_fhr_metadata(digest, str(file)) + print(f"Set FHR metadata for collection {digest}") + raise typer.Exit(EXIT_SUCCESS) diff --git a/refget/router.py b/refget/router.py index 96dbd9b..82e768d 100644 --- a/refget/router.py +++ b/refget/router.py @@ -20,7 +20,7 @@ import logging -from fastapi import APIRouter, Response, HTTPException, Request, Depends +from fastapi import APIRouter, Response, HTTPException, Request, Depends, Query from .models import Similarities, PaginationResult, 
PaginatedDigestList from .agents import RefgetDBAgent @@ -100,8 +100,8 @@ def create_refget_router( async def sequence( dbagent=Depends(get_dbagent), sequence_digest: str = example_sequence, - start: int = None, - end: int = None, + start: int | None = Query(None, description="Start position (0-based, inclusive)"), + end: int | None = Query(None, description="End position (0-based, exclusive)"), ): return Response(content=dbagent.seq.get(sequence_digest, start, end), media_type="text/plain") @@ -126,9 +126,9 @@ async def seq_metadata(dbagent=Depends(get_dbagent), sequence_digest: str = exam async def collection( dbagent=Depends(get_dbagent), collection_digest: str = example_collection_digest, - level: int | None = None, - collated: bool = True, - attribute: str = None, + level: int | None = Query(None, description="Recursion depth (1 or 2)", ge=1, le=2), + collated: bool = Query(True, description="Return collated format (arrays) vs itemwise"), + attribute: str | None = Query(None, description="Return only this attribute (e.g., 'names', 'lengths')"), ): if level == None: level = 2 @@ -212,9 +212,9 @@ async def compare_2_digests( ) async def calc_similarities( collection_digest: str, - species: str = "human", - page_size: int = 50, - page: int = 0, + species: str = Query("human", description="Species to filter by ('human' or 'mouse')"), + page_size: int = Query(50, description="Number of results per page"), + page: int = Query(0, description="Page number (0-indexed)"), dbagent=Depends(get_dbagent), ) -> Similarities: _LOGGER.info("Calculating Jaccard similarities...") @@ -235,9 +235,9 @@ async def calc_similarities( ) async def calc_similarities_from_json( seqcolA: dict, - species: str = "human", - page_size: int = 50, - page: int = 0, + species: str = Query("human", description="Species to filter by ('human' or 'mouse')"), + page_size: int = Query(50, description="Number of results per page"), + page: int = Query(0, description="Page number (0-indexed)"), dbagent=Depends(get_dbagent), ) -> Similarities: """ @@ -341,13 +341,25 @@ async def compare_1_digest( response_model=PaginatedDigestList, ) async def list_collections_by_offset( - request: Request, dbagent=Depends(get_dbagent), - page_size: int = 100, - page: int = 0, + page_size: int = Query(100, description="Number of results per page"), + page: int = Query(0, description="Page number (0-indexed)"), + names: str | None = Query(None, description="Filter by names attribute digest"), + lengths: str | None = Query(None, description="Filter by lengths attribute digest"), + sequences: str | None = Query(None, description="Filter by sequences attribute digest"), + name_length_pairs: str | None = Query(None, description="Filter by name_length_pairs digest"), + sorted_sequences: str | None = Query(None, description="Filter by sorted_sequences digest"), ): - # Extract all query params except pagination params - filters = {k: v for k, v in request.query_params.items() if k not in ["page", "page_size"]} + # Build filters from explicit parameters + filters = { + k: v for k, v in { + "names": names, + "lengths": lengths, + "sequences": sequences, + "name_length_pairs": name_length_pairs, + "sorted_sequences": sorted_sequences, + }.items() if v is not None + } if filters: try: @@ -373,7 +385,10 @@ async def list_collections_by_offset( response_model=PaginatedDigestList, ) async def list_attributes( - dbagent=Depends(get_dbagent), attribute: str = "names", page_size: int = 100, page: int = 0 + dbagent=Depends(get_dbagent), + attribute: str = "names", + 
page_size: int = Query(100, description="Number of results per page"), + page: int = Query(0, description="Page number (0-indexed)"), ): try: res = dbagent.attribute.list(attribute, limit=page_size, offset=page * page_size) @@ -397,7 +412,9 @@ async def list_attributes( response_model=PaginatedDigestList, ) async def list_cpangenomes_by_offset( - dbagent=Depends(get_dbagent), page_size: int = 100, page: int = 0 + dbagent=Depends(get_dbagent), + page_size: int = Query(100, description="Number of results per page"), + page: int = Query(0, description="Page number (0-indexed)"), ): res = dbagent.pangenome.list_by_offset(limit=page_size, offset=page * page_size) res["results"] = [x.digest for x in res["results"]] @@ -413,8 +430,8 @@ async def list_cpangenomes_by_offset( async def pangenome( dbagent=Depends(get_dbagent), pangenome_digest: str = example_pangenome_digest, - level: int | None = None, - collated: bool = True, + level: int | None = Query(None, description="Recursion depth (1-4)", ge=1, le=4), + collated: bool = Query(True, description="Return collated format (arrays) vs itemwise"), ): if level == None: level = 2 diff --git a/refget/store.py b/refget/store.py index 504c27b..042b90c 100644 --- a/refget/store.py +++ b/refget/store.py @@ -8,15 +8,28 @@ from .const import GTARS_INSTALLED if GTARS_INSTALLED: - from gtars.refget import RefgetStore, digest_fasta, StorageMode + from gtars.refget import ( + RefgetStore, + StorageMode, + digest_fasta, + compute_fai, + digest_sequence, + SequenceCollection, + ) else: RefgetStore = None - digest_fasta = None StorageMode = None + digest_fasta = None + compute_fai = None + digest_sequence = None + SequenceCollection = None __all__ = [ "RefgetStore", "digest_fasta", "StorageMode", + "compute_fai", + "digest_sequence", + "SequenceCollection", "GTARS_INSTALLED", ] From afc7a2dc028d5e8aab410d4d0c2292ee7aa8167c Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 18 Feb 2026 10:09:08 -0500 Subject: [PATCH 02/31] add local store lookup capability to the seqcol CLI commands --- refget/cli/seqcol.py | 111 ++++++++++++++++++++++- test_fasta/base.farg | 8 ++ tests/test_cli/test_seqcol_commands.py | 119 +++++++++++++++++++++++++ 3 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 test_fasta/base.farg diff --git a/refget/cli/seqcol.py b/refget/cli/seqcol.py index 2e83ab1..0bc3881 100644 --- a/refget/cli/seqcol.py +++ b/refget/cli/seqcol.py @@ -19,7 +19,7 @@ import typer -from refget.cli.config_manager import get_seqcol_servers +from refget.cli.config_manager import get_seqcol_servers, get_store_path from refget.cli.output import ( EXIT_FAILURE, EXIT_NETWORK_ERROR, @@ -32,6 +32,7 @@ # Heavy imports moved inside functions to speed up CLI startup: # - refget.clients (requests ~51ms) # - refget.utils (jsonschema ~60ms) +# - refget.store (gtars ~100ms) def _get_client(server_override: Optional[str] = None): @@ -54,6 +55,101 @@ def _get_client(server_override: Optional[str] = None): return SequenceCollectionClient(urls=urls, raise_errors=False) +def _collection_to_seqcol_dict(store, digest: str, level: int = 2) -> Optional[dict]: + """ + Convert a RefgetStore collection to seqcol API dict format. + + Args: + store: RefgetStore instance with the collection loaded + digest: Collection digest + level: 1 for attribute digests only, 2 for full arrays + + Returns: + Seqcol dict in API format, or None if collection not found. 
+ """ + from refget.utils import canonical_str + from refget.digests import sha512t24u_digest + + names = [] + lengths = [] + sequences = [] + + for coll in store.iter_collections(): + if coll.digest == digest: + for seq in coll.sequences: + m = seq.metadata + names.append(m.name) + lengths.append(m.length) + sequences.append("SQ." + m.sha512t24u) + break + else: + # Collection not found in iteration + return None + + if not names: + return None + + if level == 1: + # Return digests of arrays instead of arrays themselves + return { + "names": sha512t24u_digest(canonical_str(names)), + "lengths": sha512t24u_digest(canonical_str(lengths)), + "sequences": sha512t24u_digest(canonical_str(sequences)), + } + else: + # Level 2: return full arrays + return { + "names": names, + "lengths": lengths, + "sequences": sequences, + } + + +def _get_local_seqcol(digest: str, level: int = 2) -> Optional[dict]: + """ + Try to get a seqcol from the local RefgetStore. + + Args: + digest: Collection digest to look up + level: 1 for attribute digests only, 2 for full arrays + + Returns: + Seqcol dict if found locally, None otherwise. + """ + try: + from refget.store import RefgetStore + except ImportError: + # gtars not installed - can't use local store + return None + + store_path = get_store_path() + rgstore_path = store_path / "rgstore.json" + + # Check if store exists + if not store_path.exists() or not rgstore_path.exists(): + return None + + try: + store = RefgetStore.open_local(str(store_path)) + store.set_quiet(True) + + # Check if collection exists + collection_digests = {meta.digest for meta in store.list_collections()} + if digest not in collection_digests: + return None + + # Load the collection (triggers lazy loading if needed) + if not store.is_collection_loaded(digest): + store.get_collection(digest) + + # Convert to seqcol dict format + return _collection_to_seqcol_dict(store, digest, level) + + except Exception: + # Any error (store corruption, etc.) - fall back to remote + return None + + def _compute_snlp_digest(seqcol_dict: dict) -> str: """ Compute the sorted_name_length_pairs digest from a seqcol dict. @@ -134,6 +230,12 @@ def _load_seqcol(input_str: str, client, level: int = 2) -> Optional[dict]: return None else: # digest + # Try local store first + result = _get_local_seqcol(input_str, level=level) + if result is not None: + return result + + # Fall back to remote result = client.get_collection(input_str, level=level) if result is None: print_error(f"Could not fetch seqcol for digest: {input_str}", EXIT_FAILURE) @@ -176,6 +278,13 @@ def show( Level 1 returns attribute digests only. Level 2 (default) returns full arrays. 
""" + # Try local store first + result = _get_local_seqcol(digest, level=level) + if result is not None: + print_json(result) + raise typer.Exit(EXIT_SUCCESS) + + # Fall back to remote servers client = _get_client(server) try: diff --git a/test_fasta/base.farg b/test_fasta/base.farg new file mode 100644 index 0000000..5a3c2fe --- /dev/null +++ b/test_fasta/base.farg @@ -0,0 +1,8 @@ +##seqcol_digest=XZlrcEGi6mlopZ2uD8ObHkQB1d0oDwKk +##names_digest=Fw1r9eRxfOZD98KKrhlYQNEdSRHoVxAG +##sequences_digest=0uDQVLuHaOZi1u76LjV__yrVUIz9Bwhr +##lengths_digest=cGRMZIb3AVgkcAfNv39RN7hnT5Chk7RX +#name length alphabet sha512t24u md5 +chrX 8 dna2bit iYtREV555dUFKg2_agSJW6suquUyPpMw 5f63cfaa3ef61f88c9635fb9d18ec945 +chr1 4 dna2bit YBbVX0dLKG1ieEDCiMmkrTZFt_Z5Vdaj 31fc6ca291a32fb9df82b85e5f077e31 +chr2 4 dna2bit AcLxtBuKEPk_7PGE_H4dGElwZHCujwH6 92c6a56c9e9459d8a42b96f7884710bc diff --git a/tests/test_cli/test_seqcol_commands.py b/tests/test_cli/test_seqcol_commands.py index de8324e..88887d6 100644 --- a/tests/test_cli/test_seqcol_commands.py +++ b/tests/test_cli/test_seqcol_commands.py @@ -214,3 +214,122 @@ def test_invalid_file_format(self, cli, tmp_path): result = cli("seqcol", "digest", str(invalid)) assert result.exit_code != 0 + + +class TestSeqcolLocalStoreLookup: + """Tests for local store lookup in seqcol show and compare commands.""" + + def test_show_from_local_store(self, cli, populated_store): + """Show command retrieves collection from local store.""" + digest = populated_store["digest"] + store_path = populated_store["path"] + + # Use REFGET_STORE env var to point to our test store + import os + + old_env = os.environ.get("REFGET_STORE") + os.environ["REFGET_STORE"] = str(store_path) + + try: + result = cli("seqcol", "show", digest) + + assert result.exit_code == 0 + data = json.loads(result.stdout) + # Level 2 (default) should have arrays + assert "names" in data + assert "lengths" in data + assert "sequences" in data + assert isinstance(data["names"], list) + finally: + if old_env: + os.environ["REFGET_STORE"] = old_env + elif "REFGET_STORE" in os.environ: + del os.environ["REFGET_STORE"] + + def test_show_from_local_store_level1(self, cli, populated_store): + """Show command with level=1 returns digests from local store.""" + digest = populated_store["digest"] + store_path = populated_store["path"] + + import os + + old_env = os.environ.get("REFGET_STORE") + os.environ["REFGET_STORE"] = str(store_path) + + try: + result = cli("seqcol", "show", digest, "--level", "1") + + assert result.exit_code == 0 + data = json.loads(result.stdout) + # Level 1 should have string digests, not arrays + assert "names" in data + assert "lengths" in data + assert "sequences" in data + assert isinstance(data["names"], str) + assert isinstance(data["lengths"], str) + assert isinstance(data["sequences"], str) + finally: + if old_env: + os.environ["REFGET_STORE"] = old_env + elif "REFGET_STORE" in os.environ: + del os.environ["REFGET_STORE"] + + def test_compare_uses_local_store_for_digest(self, cli, populated_store): + """Compare command resolves digest inputs from local store first.""" + digest = populated_store["digest"] + store_path = populated_store["path"] + + import os + + old_env = os.environ.get("REFGET_STORE") + os.environ["REFGET_STORE"] = str(store_path) + + try: + # Compare local store collection with itself + result = cli("seqcol", "compare", digest, digest) + + # Should succeed (both resolved from local store) + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert 
data.get("compatible", False) is True + finally: + if old_env: + os.environ["REFGET_STORE"] = old_env + elif "REFGET_STORE" in os.environ: + del os.environ["REFGET_STORE"] + + def test_compare_local_digest_with_fasta(self, cli, populated_store): + """Compare local store digest with FASTA file.""" + digest = populated_store["digest"] + store_path = populated_store["path"] + + import os + + old_env = os.environ.get("REFGET_STORE") + os.environ["REFGET_STORE"] = str(store_path) + + try: + # Compare local store collection with original FASTA + result = cli("seqcol", "compare", digest, str(BASE_FASTA)) + + # Should succeed and show they are compatible (same content) + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data.get("compatible") is True + finally: + if old_env: + os.environ["REFGET_STORE"] = old_env + elif "REFGET_STORE" in os.environ: + del os.environ["REFGET_STORE"] + + def test_show_nonexistent_digest_not_in_local_store(self, cli, temp_store, monkeypatch): + """Show command falls back to remote for digest not in local store.""" + # Use a digest that doesn't exist anywhere + fake_digest = "NONEXISTENT123456789012345678901234567890" + + monkeypatch.setenv("REFGET_STORE", str(temp_store)) + + result = cli("seqcol", "show", fake_digest) + + # Should fail (not in local store, not on remote servers) + assert result.exit_code != 0 From 09646a241abbf195b8412fd1249c9b96395a7f61 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 18 Feb 2026 12:36:20 -0500 Subject: [PATCH 03/31] improve fasta digest ui --- frontend/src/features/digest/DigestPage.jsx | 110 +++++++++++++++--- .../src/features/digest/fastaDigestWorker.js | 73 ++++++++++-- 2 files changed, 156 insertions(+), 27 deletions(-) diff --git a/frontend/src/features/digest/DigestPage.jsx b/frontend/src/features/digest/DigestPage.jsx index 4eb2369..88ca6f8 100644 --- a/frontend/src/features/digest/DigestPage.jsx +++ b/frontend/src/features/digest/DigestPage.jsx @@ -1,4 +1,4 @@ -import { useState, useRef, useEffect } from 'react'; +import { useState, useRef, useEffect, useCallback } from 'react'; import { useSearchParams, useNavigate } from 'react-router-dom'; import toast from 'react-hot-toast'; import FastaDropzone from './FastaDropzone'; @@ -54,6 +54,13 @@ function loadFromHistory(digest) { } } +function createWorker() { + return new Worker( + new URL('./fastaDigestWorker.js', import.meta.url), + { type: 'module' } + ); +} + export default function DigestPage() { const [searchParams] = useSearchParams(); const navigate = useNavigate(); @@ -63,6 +70,7 @@ export default function DigestPage() { const [progress, setProgress] = useState(null); const [error, setError] = useState(null); const [history, setHistory] = useState([]); + const [stats, setStats] = useState(null); const workerRef = useRef(null); // Load history on mount @@ -85,15 +93,16 @@ export default function DigestPage() { } }, [searchParams]); - // Initialize worker - useEffect(() => { - workerRef.current = new Worker( - new URL('./fastaDigestWorker.js', import.meta.url), - { type: 'module' } - ); + const setupWorker = useCallback(() => { + // Terminate existing worker if any + if (workerRef.current) { + workerRef.current.terminate(); + } + + const worker = createWorker(); - workerRef.current.onmessage = (e) => { - const { type, result, message, bytesProcessed, totalSize, percent } = e.data; + worker.onmessage = (e) => { + const { type, result, message, bytesProcessed, totalSize, percent, stats: workerStats } = e.data; if (type === 'status') { 
setStatus(message); @@ -103,8 +112,19 @@ export default function DigestPage() { setResult(result); setStatus(null); setProgress(null); + if (workerStats) { + setStats(workerStats); + if (import.meta.env.DEV) { + console.log('[FASTA Digest]', { + chunks: workerStats.chunks, + avgChunkSize: `${(workerStats.avgChunkSize / 1024).toFixed(1)} KB`, + elapsed: `${(workerStats.elapsedMs / 1000).toFixed(1)}s`, + throughput: `${(workerStats.totalBytes / workerStats.elapsedMs / 1024).toFixed(1)} MB/s` + }); + } + } // Save to localStorage - const name = workerRef.current._fileName; + const name = worker._fileName; saveToHistory(result, name); setHistory(getHistory()); // Update URL @@ -115,20 +135,47 @@ export default function DigestPage() { setStatus(null); setProgress(null); toast.error(message); + } else if (type === 'cancelled') { + setStatus(null); + setProgress(null); + setError('Processing cancelled.'); } }; - return () => workerRef.current?.terminate(); + workerRef.current = worker; + return worker; }, []); + // Initialize worker on mount + useEffect(() => { + setupWorker(); + return () => workerRef.current?.terminate(); + }, [setupWorker]); + const handleFileSelected = (file) => { + // Cancel and replace any running worker to prevent double-processing + const worker = setupWorker(); setFileName(file.name); setResult(null); setError(null); setProgress(null); + setStats(null); setStatus('Starting...'); - workerRef.current._fileName = file.name; - workerRef.current.postMessage({ file }); + worker._fileName = file.name; + worker.postMessage({ file }); + }; + + const handleCancel = () => { + if (workerRef.current) { + workerRef.current.postMessage({ type: 'cancel' }); + } + }; + + const handleClear = () => { + setError(null); + setStatus(null); + setProgress(null); + setStats(null); }; const handleHistoryClick = (digest) => { @@ -304,6 +351,12 @@ export default function DigestPage() {
{status} +
{progress && ( @@ -328,11 +381,19 @@ export default function DigestPage() { )} - {/* Error */} + {/* Error or Cancelled */} {error && ( -
- - {error} +
+
+ + {error} +
+
)} @@ -346,6 +407,21 @@ export default function DigestPage() { onDownloadRgsi={handleDownloadRgsi} /> + {/* Processing Stats (collapsed) */} + {stats && result && ( +
+ + Processing details + +
+
Chunks processed: {stats.chunks.toLocaleString()}
+
Average chunk size: {(stats.avgChunkSize / 1024).toFixed(1)} KB
+
Elapsed time: {(stats.elapsedMs / 1000).toFixed(1)}s
+
Throughput: {(stats.totalBytes / stats.elapsedMs / 1024).toFixed(1)} MB/s
+
+
+ )} + {/* History */} {history.length > 0 && (
diff --git a/frontend/src/features/digest/fastaDigestWorker.js b/frontend/src/features/digest/fastaDigestWorker.js index 616aed6..86e9178 100644 --- a/frontend/src/features/digest/fastaDigestWorker.js +++ b/frontend/src/features/digest/fastaDigestWorker.js @@ -2,7 +2,10 @@ // Runs in background thread to avoid freezing UI. // Uses streaming API for files of any size. +const PROGRESS_INTERVAL_MS = 200; // Max 5 updates/sec +let lastProgressTime = 0; let wasmModule = null; +let cancelled = false; async function initWasm() { if (wasmModule) return wasmModule; @@ -14,7 +17,17 @@ async function initWasm() { } self.onmessage = async (e) => { + const { type } = e.data; + + if (type === 'cancel') { + cancelled = true; + return; + } + const { file } = e.data; + cancelled = false; + + const stats = { chunks: 0, totalBytes: 0, startTime: Date.now() }; try { self.postMessage({ type: 'status', message: 'Loading WASM module...' }); @@ -34,25 +47,55 @@ self.onmessage = async (e) => { const totalSize = file.size; while (true) { + if (cancelled) { + reader.cancel(); + gtars.fastaHasherFree(hasher); + self.postMessage({ type: 'cancelled' }); + return; + } + const { done, value } = await reader.read(); if (done) break; - // Pass chunk directly to Rust - no parsing in JS - gtars.fastaHasherUpdate(hasher, value); + try { + gtars.fastaHasherUpdate(hasher, value); + } catch (err) { + gtars.fastaHasherFree(hasher); + const msg = err.message || ''; + if (msg.toLowerCase().includes('fasta') || msg.toLowerCase().includes('parse')) { + self.postMessage({ type: 'error', message: `Invalid FASTA format: ${msg}`, category: 'parse' }); + } else { + self.postMessage({ type: 'error', message: `WASM processing error: ${msg}`, category: 'wasm' }); + } + return; + } + stats.chunks++; bytesProcessed += value.length; - self.postMessage({ - type: 'progress', - bytesProcessed, - totalSize, - percent: Math.round(100 * bytesProcessed / totalSize) - }); + stats.totalBytes = bytesProcessed; + + const now = Date.now(); + if (now - lastProgressTime >= PROGRESS_INTERVAL_MS) { + lastProgressTime = now; + self.postMessage({ + type: 'progress', + bytesProcessed, + totalSize, + percent: Math.round(100 * bytesProcessed / totalSize) + }); + } } + // Send final progress to ensure 100% + self.postMessage({ type: 'progress', bytesProcessed: totalSize, totalSize, percent: 100 }); + // Finalize and get result self.postMessage({ type: 'status', message: 'Computing final digests...' }); const result = gtars.fastaHasherFinish(hasher); - self.postMessage({ type: 'result', result }); + + stats.elapsedMs = Date.now() - stats.startTime; + stats.avgChunkSize = stats.chunks > 0 ? 
Math.round(stats.totalBytes / stats.chunks) : 0; + self.postMessage({ type: 'result', result, stats }); } catch (err) { gtars.fastaHasherFree(hasher); // Cleanup on error @@ -60,6 +103,16 @@ self.onmessage = async (e) => { } } catch (error) { - self.postMessage({ type: 'error', message: error.message || 'Processing failed' }); + const msg = error.message || 'Processing failed'; + let category = 'unknown'; + if (msg.toLowerCase().includes('gzip') || msg.toLowerCase().includes('decompress') || msg.toLowerCase().includes('corrupt')) { + category = 'gzip'; + self.postMessage({ type: 'error', message: `File appears corrupted or is not valid gzip: ${msg}`, category }); + } else if (msg.toLowerCase().includes('stream') || msg.toLowerCase().includes('read')) { + category = 'stream'; + self.postMessage({ type: 'error', message: `Error reading file: ${msg}`, category }); + } else { + self.postMessage({ type: 'error', message: msg, category }); + } } }; From 58f57990b006d75ed4e5877c3c7f8202e11cb360 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 18 Feb 2026 12:47:23 -0500 Subject: [PATCH 04/31] fix cancel --- frontend/src/features/digest/DigestPage.jsx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/frontend/src/features/digest/DigestPage.jsx b/frontend/src/features/digest/DigestPage.jsx index 88ca6f8..0ea1f59 100644 --- a/frontend/src/features/digest/DigestPage.jsx +++ b/frontend/src/features/digest/DigestPage.jsx @@ -167,8 +167,12 @@ export default function DigestPage() { const handleCancel = () => { if (workerRef.current) { - workerRef.current.postMessage({ type: 'cancel' }); + workerRef.current.terminate(); + workerRef.current = null; } + setStatus(null); + setProgress(null); + setError('Processing cancelled.'); }; const handleClear = () => { From eb2ce703daee6541a2810f323870aa0c68a11290 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 18 Feb 2026 13:15:31 -0500 Subject: [PATCH 05/31] fix comparison links --- frontend/.env.production | 1 + frontend/src/main.jsx | 1 + frontend/src/pages/ComparisonView.jsx | 54 +++++++++++++-------------- 3 files changed, 29 insertions(+), 27 deletions(-) create mode 100644 frontend/.env.production diff --git a/frontend/.env.production b/frontend/.env.production new file mode 100644 index 0000000..4c7c46f --- /dev/null +++ b/frontend/.env.production @@ -0,0 +1 @@ +VITE_API_BASE=https://seqcolapi.databio.org diff --git a/frontend/src/main.jsx b/frontend/src/main.jsx index 2268aef..1ad13fe 100644 --- a/frontend/src/main.jsx +++ b/frontend/src/main.jsx @@ -351,6 +351,7 @@ const router = createBrowserRouter([ { path: '/scim/:digest1/:digest2', element: , + errorElement: , loader: (request) => { return fetchComparison( request.params.digest1, diff --git a/frontend/src/pages/ComparisonView.jsx b/frontend/src/pages/ComparisonView.jsx index e2419b4..ddb1257 100644 --- a/frontend/src/pages/ComparisonView.jsx +++ b/frontend/src/pages/ComparisonView.jsx @@ -136,21 +136,21 @@ const SequencesReport = ({ messageArray }) => { // ✅❔❌❔ const coordinateSystemInterpretation = (comparison) => { const lengthsANotB = - comparison.array_elements.a.lengths - - comparison.array_elements.a_and_b.lengths; + comparison.array_elements.a_count.lengths - + comparison.array_elements.a_and_b_count.lengths; const lengthsBNotA = - comparison.array_elements.b.lengths - - comparison.array_elements.a_and_b.lengths; + comparison.array_elements.b_count.lengths - + comparison.array_elements.a_and_b_count.lengths; const namesANotB = - comparison.array_elements.a.names - 
comparison.array_elements.a_and_b.names; + comparison.array_elements.a_count.names - comparison.array_elements.a_and_b_count.names; const namesBNotA = - comparison.array_elements.b.names - comparison.array_elements.a_and_b.names; + comparison.array_elements.b_count.names - comparison.array_elements.a_and_b_count.names; const nlpANotB = - comparison.array_elements.a.name_length_pairs - - comparison.array_elements.a_and_b.name_length_pairs; + comparison.array_elements.a_count.name_length_pairs - + comparison.array_elements.a_and_b_count.name_length_pairs; const nlpBNotA = - comparison.array_elements.b.name_length_pairs - - comparison.array_elements.a_and_b.name_length_pairs; + comparison.array_elements.b_count.name_length_pairs - + comparison.array_elements.a_and_b_count.name_length_pairs; const msgArray = []; // If the name_length_pairs match, then the coordinate systems are identical if (nlpANotB === 0 && nlpBNotA === 0) { @@ -162,7 +162,7 @@ const coordinateSystemInterpretation = (comparison) => { } else if (nlpANotB > 0 && nlpBNotA === 0) { // If B nlp is a subset of A msgArray.push("Collection B's coordinate system is a subset of A's."); - } else if (comparison.array_elements.a_and_b.name_length_pairs !== 0) { + } else if (comparison.array_elements.a_and_b_count.name_length_pairs !== 0) { // If there is some overlap msgArray.push('The coordinate systems are partially overlapping.'); } else { @@ -230,22 +230,22 @@ const ComparisonView = ({ paramComparison }) => { // ✅❔❌ const getInterpretation = (comparison, attribute) => { - const nSequencesA = comparison.array_elements.a[attribute]; - const nSequencesB = comparison.array_elements.b[attribute]; + const nSequencesA = comparison.array_elements.a_count[attribute]; + const nSequencesB = comparison.array_elements.b_count[attribute]; const aNotB = - comparison.array_elements.a[attribute] - - comparison.array_elements.a_and_b[attribute]; + comparison.array_elements.a_count[attribute] - + comparison.array_elements.a_and_b_count[attribute]; const bNotA = - comparison.array_elements.b[attribute] - - comparison.array_elements.a_and_b[attribute]; + comparison.array_elements.b_count[attribute] - + comparison.array_elements.a_and_b_count[attribute]; const orderCheck = comparison.array_elements.a_and_b_same_order[attribute]; let interpTerm = ''; const msgArray = []; if ( - comparison.array_elements.a_and_b[attribute] == nSequencesA && - comparison.array_elements.a_and_b[attribute] == nSequencesB + comparison.array_elements.a_and_b_count[attribute] == nSequencesA && + comparison.array_elements.a_and_b_count[attribute] == nSequencesB ) { msgArray.push(`🟰 The ${attribute} contents are identical.`); if (orderCheck === true) { @@ -256,8 +256,8 @@ const ComparisonView = ({ paramComparison }) => { interpTerm = 'identical_content'; } if ( - comparison.array_elements.a_and_b[attribute] == nSequencesA && - comparison.array_elements.a_and_b[attribute] < nSequencesB + comparison.array_elements.a_and_b_count[attribute] == nSequencesA && + comparison.array_elements.a_and_b_count[attribute] < nSequencesB ) { msgArray.push( `Collection B contains all ${nSequencesA} ${attribute} from collection A, and ${bNotA} additional.`, @@ -265,20 +265,20 @@ const ComparisonView = ({ paramComparison }) => { interpTerm = 'subset'; } if ( - comparison.array_elements.a_and_b[attribute] == nSequencesB && - comparison.array_elements.a_and_b[attribute] < nSequencesA + comparison.array_elements.a_and_b_count[attribute] == nSequencesB && + 
comparison.array_elements.a_and_b_count[attribute] < nSequencesA ) { msgArray.push( `Collection A contains all ${nSequencesB} ${attribute} from collection B, and ${aNotB} additional.`, ); interpTerm = 'subset'; } - if (comparison.array_elements.a_and_b[attribute] === 0) { + if (comparison.array_elements.a_and_b_count[attribute] === 0) { msgArray.push(`The collections' ${attribute} contents are disjoint.`); interpTerm = 'disjoint'; } else if ( - comparison.array_elements.a_and_b[attribute] < nSequencesA && - comparison.array_elements.a_and_b[attribute] < nSequencesB + comparison.array_elements.a_and_b_count[attribute] < nSequencesA && + comparison.array_elements.a_and_b_count[attribute] < nSequencesB ) { msgArray.push( `The collections' ${attribute} contents are partially overlapping; some are shared, and some are unique to each collection.`, @@ -375,7 +375,7 @@ const ComparisonView = ({ paramComparison }) => { (number of elements found in both):
- {Object.entries(comparison.array_elements.a_and_b).map( + {Object.entries(comparison.array_elements.a_and_b_count).map( ([key, value]) => (
+ + {error && ( +
+ Error: {error} +
+ )} + + {(results.length > 0 || loading) && ( +
+
+
+
+
+
{total}
+
Total
+
+
+
{passed}
+
Passed
+
+
+
{failed}
+
Failed
+
+
+
+ {serverUrl} +
+ {summary && ( +
+ {new Date().toLocaleString()} +
+ )} +
+
+
+
+
0 ? (passed / total) * 100 : 0}%`, + transition: 'width 0.3s ease', + }} + /> +
0 ? (failed / total) * 100 : 0}%`, + transition: 'width 0.3s ease', + }} + /> +
+
+
+
+ +
+ {results.map((result, idx) => ( +
+
+
+ + {result.passed ? 'PASS' : 'FAIL'} + + {result.name} +
+ {result.description && ( +
+ {result.description} +
+ )} + {result.error && ( +
+ {result.error} +
+ )} +
+ + {result.duration_ms.toFixed(0)}ms + +
+ ))} + {loading && completed < total && ( +
+ + Running check {completed + 1} of {total}... +
+ )} +
+
+ )} +
+ ); +}; diff --git a/frontend/src/features/digest/DigestPage.jsx b/frontend/src/pages/DigestPage.jsx similarity index 96% rename from frontend/src/features/digest/DigestPage.jsx rename to frontend/src/pages/DigestPage.jsx index 0ea1f59..001a578 100644 --- a/frontend/src/features/digest/DigestPage.jsx +++ b/frontend/src/pages/DigestPage.jsx @@ -1,9 +1,9 @@ import { useState, useRef, useEffect, useCallback } from 'react'; import { useSearchParams, useNavigate } from 'react-router-dom'; import toast from 'react-hot-toast'; -import FastaDropzone from './FastaDropzone'; -import SeqColResult from './SeqColResult'; -import './digest.css'; +import FastaDropzone from '../components/digest/FastaDropzone'; +import SeqColResult from '../components/digest/SeqColResult'; +import '../components/digest/digest.css'; const HISTORY_KEY = 'digest-history'; const MAX_HISTORY = 20; @@ -56,12 +56,12 @@ function loadFromHistory(digest) { function createWorker() { return new Worker( - new URL('./fastaDigestWorker.js', import.meta.url), + new URL('../components/digest/fastaDigestWorker.js', import.meta.url), { type: 'module' } ); } -export default function DigestPage() { +export function DigestPage() { const [searchParams] = useSearchParams(); const navigate = useNavigate(); const [result, setResult] = useState(null); @@ -142,6 +142,13 @@ export default function DigestPage() { } }; + worker.onerror = (event) => { + event.preventDefault(); + setError(event.message || 'Worker crashed unexpectedly'); + setStatus(null); + setProgress(null); + }; + workerRef.current = worker; return worker; }, []); diff --git a/frontend/src/pages/HomePage.jsx b/frontend/src/pages/HomePage.jsx index 7a957ff..85d53ad 100644 --- a/frontend/src/pages/HomePage.jsx +++ b/frontend/src/pages/HomePage.jsx @@ -6,6 +6,11 @@ import { AttributeList } from '../components/ObjectLists'; const HomePage = () => { const loaderData = useLoaderData(); + + if (!Array.isArray(loaderData) || loaderData.length < 3) { + return
Failed to load homepage data.
; + } + const collections = loaderData[0]; const pangenomes = loaderData[1]; const name_length_pairs = loaderData[2]; @@ -86,7 +91,7 @@ const HomePage = () => { -
4. List of name_length_pairs on this server:
+
5. List of name_length_pairs on this server:

The{' '} /list/attributes{' '} diff --git a/frontend/src/pages/PangenomeView.jsx b/frontend/src/pages/PangenomeView.jsx index 2d55013..146382f 100644 --- a/frontend/src/pages/PangenomeView.jsx +++ b/frontend/src/pages/PangenomeView.jsx @@ -11,6 +11,10 @@ const PangenomeView = ({ params }) => { const pangenome = useLoaderData(); const { digest } = useParams(); + if (!Array.isArray(pangenome) || pangenome.length < 3) { + return

Failed to load pangenome data.
; + } + let level1 = pangenome[0]; let level2 = pangenome[1]; let itemwise = pangenome[2]; diff --git a/frontend/src/pages/SCIM.jsx b/frontend/src/pages/SCIM.jsx index 7e8a40a..41e9ee1 100644 --- a/frontend/src/pages/SCIM.jsx +++ b/frontend/src/pages/SCIM.jsx @@ -2,7 +2,7 @@ import { useEffect, useState } from 'react'; import { useSearchParams, useLoaderData } from 'react-router-dom'; import toast from 'react-hot-toast'; -import { API_BASE } from '../utilities.jsx'; +import { API_BASE, encodeToBase64, decodeFromBase64 } from '../utilities.jsx'; import { ComparisonView } from './ComparisonView.jsx'; // Seqcol Comparison Interpretation Module (SCIM) @@ -30,18 +30,24 @@ const SCIM = () => { useEffect(() => { const comparisonFromQuery = searchParams.get('val'); if (comparisonFromQuery) { - // decode base64encoded string - const decodedComparisonFromQuery = atob(comparisonFromQuery); - // prettify the comparison string - const prettyComparison = JSON.stringify( - JSON.parse(decodedComparisonFromQuery), - null, - 2, - ); - setComparisonStr(prettyComparison); - - const parsedComparison = JSON.parse(decodedComparisonFromQuery); - setComparison(parsedComparison); + try { + // decode base64encoded string + const decodedComparisonFromQuery = decodeFromBase64(comparisonFromQuery); + // prettify the comparison string + const prettyComparison = JSON.stringify( + JSON.parse(decodedComparisonFromQuery), + null, + 2, + ); + setComparisonStr(prettyComparison); + + const parsedComparison = JSON.parse(decodedComparisonFromQuery); + setComparison(parsedComparison); + } catch { + toast.error('Invalid comparison URL. The data may be corrupted.'); + setComparison(null); + setComparisonStr(''); + } } }, [searchParams]); @@ -79,7 +85,7 @@ const SCIM = () => { setComparison(parsedComparison); // update the query param to base64 encoded string - const base64encodedComparison = btoa(comparisonStr); + const base64encodedComparison = encodeToBase64(comparisonStr); window.history.pushState( {}, '', @@ -95,9 +101,9 @@ const SCIM = () => { const loadExample = () => { const exampleData = - 'eyJkaWdlc3RzIjp7ImEiOiJYWmxyY0VHaTZtbG9wWjJ1RDhPYkhrUUIxZDBvRHdLayIsImIiOiJRdlQ1dEFRMEI4Vmt4ZC1xRmZ0bHpFazJReWZQdGdPdiJ9LCJhdHRyaWJ1dGVzIjp7ImFfb25seSI6W10sImJfb25seSI6W10sImFfYW5kX2IiOlsibGVuZ3RocyIsIm5hbWVfbGVuZ3RoX3BhaXJzIiwibmFtZXMiLCJzZXF1ZW5jZXMiLCJzb3J0ZWRfc2VxdWVuY2VzIl19LCJhcnJheV9lbGVtZW50cyI6eyJhIjp7Imxlbmd0aHMiOjMsIm5hbWVfbGVuZ3RoX3BhaXJzIjozLCJuYW1lcyI6Mywic2VxdWVuY2VzIjozLCJzb3J0ZWRfc2VxdWVuY2VzIjozfSwiYiI6eyJsZW5ndGhzIjozLCJuYW1lX2xlbmd0aF9wYWlycyI6MywibmFtZXMiOjMsInNlcXVlbmNlcyI6Mywic29ydGVkX3NlcXVlbmNlcyI6M30sImFfYW5kX2IiOnsibGVuZ3RocyI6MywibmFtZV9sZW5ndGhfcGFpcnMiOjAsIm5hbWVzIjowLCJzZXF1ZW5jZXMiOjMsInNvcnRlZF9zZXF1ZW5jZXMiOjN9LCJhX2FuZF9iX3NhbWVfb3JkZXIiOnsibGVuZ3RocyI6dHJ1ZSwibmFtZV9sZW5ndGhfcGFpcnMiOm51bGwsIm5hbWVzIjpudWxsLCJzZXF1ZW5jZXMiOnRydWUsInNvcnRlZF9zZXF1ZW5jZXMiOnRydWV9fX0='; + 
'eyJkaWdlc3RzIjp7ImEiOiJYWmxyY0VHaTZtbG9wWjJ1RDhPYkhrUUIxZDBvRHdLayIsImIiOiJRdlQ1dEFRMEI4Vmt4ZC1xRmZ0bHpFazJReWZQdGdPdiJ9LCJhdHRyaWJ1dGVzIjp7ImFfb25seSI6W10sImJfb25seSI6W10sImFfYW5kX2IiOlsibGVuZ3RocyIsIm5hbWVfbGVuZ3RoX3BhaXJzIiwibmFtZXMiLCJzZXF1ZW5jZXMiLCJzb3J0ZWRfc2VxdWVuY2VzIl19LCJhcnJheV9lbGVtZW50cyI6eyJhX2NvdW50Ijp7Imxlbmd0aHMiOjMsIm5hbWVfbGVuZ3RoX3BhaXJzIjozLCJuYW1lcyI6Mywic2VxdWVuY2VzIjozLCJzb3J0ZWRfc2VxdWVuY2VzIjozfSwiYl9jb3VudCI6eyJsZW5ndGhzIjozLCJuYW1lX2xlbmd0aF9wYWlycyI6MywibmFtZXMiOjMsInNlcXVlbmNlcyI6Mywic29ydGVkX3NlcXVlbmNlcyI6M30sImFfYW5kX2JfY291bnQiOnsibGVuZ3RocyI6MywibmFtZV9sZW5ndGhfcGFpcnMiOjAsIm5hbWVzIjowLCJzZXF1ZW5jZXMiOjMsInNvcnRlZF9zZXF1ZW5jZXMiOjN9LCJhX2FuZF9iX3NhbWVfb3JkZXIiOnsibGVuZ3RocyI6dHJ1ZSwibmFtZV9sZW5ndGhfcGFpcnMiOm51bGwsIm5hbWVzIjpudWxsLCJzZXF1ZW5jZXMiOnRydWUsInNvcnRlZF9zZXF1ZW5jZXMiOnRydWV9fX0='; - const decodedComparison = atob(exampleData); + const decodedComparison = decodeFromBase64(exampleData); const prettyComparison = JSON.stringify( JSON.parse(decodedComparison), null, diff --git a/frontend/src/pages/SCOM.jsx b/frontend/src/pages/SCOM.jsx index e859f96..b27d747 100644 --- a/frontend/src/pages/SCOM.jsx +++ b/frontend/src/pages/SCOM.jsx @@ -1,4 +1,4 @@ -import { useEffect, useState } from 'react'; +import { useCallback, useEffect, useState } from 'react'; import { encodeComparison } from '../utilities.jsx'; import { useLoaderData, useNavigate, useSearchParams } from 'react-router-dom'; import toast from 'react-hot-toast'; @@ -11,7 +11,6 @@ import { } from '../services/fetchData.jsx'; import { MultiMetricHeatmapPlot } from '../components/MultiMetricHeatmapPlot.jsx'; import { StripPlot } from '../components/StripPlot.jsx'; -// import { NetworkGraph } from '../components/NetworkGraph.jsx'; import { useSimilaritiesStore } from '../stores/similarities'; @@ -20,7 +19,7 @@ const SCOM = () => { const navigate = useNavigate(); const [searchParams] = useSearchParams(); const loaderData = useLoaderData(); - const collections = loaderData[0]; + const collections = Array.isArray(loaderData) && loaderData.length >= 1 ? 
loaderData[0] : null; const { selectedCollectionsIndex, @@ -38,19 +37,17 @@ const SCOM = () => { getAllCollections, initializeSelectedCollections, sortBy, - setSortBy, sortAscending, - setSortAscending, - sortSimilarities, + sortByColumn, + resetSort, species, - setSpecies + setSpecies, + error: storeError, + setError: setStoreError, } = useSimilaritiesStore(); const [stripJitter, setStripJitter] = useState('none'); const [stripOrientation, setStripOrientation] = useState('horizontal'); - const [heatmapMetric, setHeatmapMetric] = useState('sequences'); - // const [networkMetric, setNetworkMetric] = useState('sequences'); - // const [networkThreshold, setNetworkThreshold] = useState(0.8); const [relationship, setRelationship] = useState('oneToMany'); const [isLoading, setIsLoading] = useState(false); const [pendingPrefill, setPendingPrefill] = useState(null); @@ -270,15 +267,8 @@ const SCOM = () => { ] } - // const handleSelectCollection = (index) => { - // setSelectedCollectionsIndex((prev) => { - // const newArray = [...prev]; - // newArray[index] = !newArray[index]; - // return newArray; - // }); - // }; - const handleNavigateSCIM = async (similarityRow) => { + setStoreError(null); try { let comparison; if (similarityRow.custom) { @@ -294,8 +284,8 @@ const SCOM = () => { } const encodedComparison = encodeComparison(comparison); navigate(`/scim?val=${encodedComparison}`); - // window.scrollTo(0, 0); } catch (error) { + setStoreError('Comparison could not be made.'); toast.error( Error: Comparison could not be made. @@ -304,29 +294,12 @@ const SCOM = () => { } }; - // const handleRelationshipChange = (newRelationship) => { - // if ( - // newRelationship === 'oneToMany' && - // relationship === 'manyToMany' && - // selectedCollections.length > 1 - // ) { - // setCustomCollections([]); - // setSelectedCollectionsIndex(collections.results.map(() => false)); - // setCustomCount(1); - // } - // setSelectedCollectionsIndex((prev) => - // prev.map((item, index) => - // index < collections.results.length ? false : item, - // ), - // ); - // setStripJitter('none'); - // setRelationship(newRelationship); - // }; - - const handleAddCustomCollection = async (data, name) => { + const handleAddCustomCollection = useCallback(async (data, name) => { + setStoreError(null); try { data = JSON.parse(data); } catch (e) { + setStoreError('Invalid JSON format. Please check your input.'); toast.error( Error: Invalid JSON format. Please check your input. @@ -335,23 +308,11 @@ const SCOM = () => { return; } - // if (relationship === 'manyToMany' && allCollections.includes(name)) { - // toast.error( - // - // Error: Collection with name already exists. Please - // try another name. - // , - // ); - // return; - // } - try { setIsLoading(true); const result = await fetchSimilaritiesJSON(data, species); if (result?.similarities) { - // const customDigest = 'query_seqcol' + (customCount > 1 ? customCount : ''); const customDigest = 'Input Seqcol'; - // console.log(result.similarities) const flattenedSimilarities = result.similarities.flatMap((s) => s.human_readable_names.map((humanReadableName) => ({ selectedDigest: name !== '' ? name : customDigest, @@ -396,6 +357,7 @@ const SCOM = () => { } catch (e) { console.error('SCOM submission error:', e); console.log('Data that was submitted:', data); + setStoreError('Collection is invalid. Please check your input.'); toast.error( Error: Collection is invalid. 
Please check your @@ -404,10 +366,10 @@ const SCOM = () => { ); return; } finally { - setSortBy(null); + resetSort(); setIsLoading(false); } - }; + }, [species, relationship, collections, customCollections, customCount, setCustomCollections, setSelectedCollectionsIndex, setCustomCount, resetSort, setIsLoading, setStoreError]); // Auto-submit prefilled data (wait for collections to be ready) useEffect(() => { @@ -415,7 +377,7 @@ const SCOM = () => { handleAddCustomCollection(JSON.stringify(pendingPrefill.json), pendingPrefill.name || ''); setPendingPrefill(null); } - }, [pendingPrefill, isLoading, collections]); + }, [pendingPrefill, isLoading, collections, handleAddCustomCollection]); useEffect(() => { const fetchAllSimilarities = async () => { @@ -424,41 +386,10 @@ const SCOM = () => { for (let i = 0; i < selectedCollectionsIndex.length; i++) { if (!selectedCollectionsIndex[i]) continue; - // const collection = allCollections[i]; - - - if (i < collections.results.length && relationship === 'manyToMany') { - // // server collection - // try { - // const result = await fetchSimilarities(collection); - // if (result?.similarities) { - // const flattenedSimilarities = result.similarities.map((s) => ({ - // selectedDigest: collection, - // comparedDigest: s.digest, - // comparedAlias: s.human_readable_name, - // lengths: s.similarities.lengths, - // name_length_pairs: s.similarities.name_length_pairs, - // names: s.similarities.names, - // sequences: s.similarities.sequences, - // sorted_sequences: s.similarities.sorted_sequences, - // custom: false, - // raw: null, - // })); - // allSimilarities.push(...flattenedSimilarities); - // } - // } catch (error) { - // console.error( - // `Error fetching similarities for ${collection}:`, - // error, - // ); - // } - } else { - // custom collection - const customIndex = i - collections.results.length; - const customCollection = customCollections[customIndex]; - if (customCollection) { - allSimilarities.push(...customCollection.similarities); - } + const customIndex = i - collections.results.length; + const customCollection = customCollections[customIndex]; + if (customCollection) { + allSimilarities.push(...customCollection.similarities); } } @@ -469,40 +400,19 @@ const SCOM = () => { }, [selectedCollectionsIndex, customCollections]); const handleSortTable = (column) => { - if (sortBy === column) { - setSortAscending(!sortAscending) - sortSimilarities() - } else { - setSortBy(column) - setSortAscending(false) - sortSimilarities() - } + sortByColumn(column); }; + if (!collections) { + return
Failed to load collection data.
; + } + return (

Seqcol Comparison Overview Module (SCOM)

- {/*
    -
  • - handleRelationshipChange('oneToMany')} - > - One-to-Many - -
  • -
  • - handleRelationshipChange('manyToMany')} - > - Many-to-Many - -
  • -
*/}
@@ -525,11 +435,6 @@ const SCOM = () => {
- {/*

- If you would like to view metrics for multiple sequence collections - at once, use the "Many-to-Many" tab. -

*/} -
{ value={customCollectionJSON} placeholder='Paste output from `refget fasta seqcol yourfasta.fa` here.' className='form-control tiny border-0 rounded-0 rounded-bottom z-active' - // style={{ maxHeight: 'calc(200px - 32.333333px)' }} rows='12' />
- {/* {relationship === 'manyToMany' && ( -
-
-
- - Selected Sequence Collections - - - - -
-
    - {allCollections && - allCollections.map((collection, index) => ( -
  • -
    -
    - handleSelectCollection(index)} - checked={selectedCollectionsIndex[index]} - /> - -
    - {index >= collections.results.length ? ( - { - const customIndex = - index - collections.results.length; - setCustomCollections((prev) => - prev.filter((_, i) => i !== customIndex), - ); - setSelectedCollectionsIndex((prev) => - prev.filter((_, i) => i !== index), - ); - toast.success('Custom collection removed.'); - }} - /> - ) : ( - - )} -
    -
  • - ))} -
-
-
- )} */}
+ {storeError && ( +
+
+ + Error: {storeError} +
+ +
+ )} + {(similarities && !isLoading) ? (
@@ -716,7 +550,6 @@ const SCOM = () => { {relationship === 'manyToMany' && ( )} - {/* {relationship === 'manyToMany' && } */} {relationship === 'oneToMany' && ( )} @@ -735,74 +568,9 @@ const SCOM = () => {
Heatmap
- {/* */}
rest)} /> - {/* {relationship === 'manyToMany' && ( - <> -
-
Network Graph
-
- Threshold - - setNetworkThreshold(Number(e.target.value)) - } - className='form-control form-range' - style={{ height: 'inherit' }} - /> - - setNetworkThreshold(Number(e.target.value)) - } - className='form-control' - style={{ maxWidth: '70px' }} - /> -
- -
- rest)} - metric={networkMetric} - threshold={networkThreshold} - /> - - )} */} -
Seqcol Comparison Summary Table

@@ -813,7 +581,6 @@ const SCOM = () => { - {/* */} @@ -830,7 +597,6 @@ const SCOM = () => { className='cursor-pointer' onClick={() => handleNavigateSCIM(row)} > - {/* */} diff --git a/frontend/src/services/fetchData.jsx b/frontend/src/services/fetchData.jsx index 9eac0b5..ac6fbab 100644 --- a/frontend/src/services/fetchData.jsx +++ b/frontend/src/services/fetchData.jsx @@ -1,52 +1,88 @@ import { API_BASE } from '../utilities.jsx'; +export class AppError extends Error { + constructor(message, { status, isNotFound, digest1, digest2 } = {}) { + super(message); + this.name = 'AppError'; + this.status = status ?? null; + this.isNotFound = isNotFound ?? false; + this.digest1 = digest1 ?? null; + this.digest2 = digest2 ?? null; + } +} + +const checkResponse = async (response, url) => { + if (!response.ok) { + let errorDetail = response.statusText; + try { + const errorData = await response.json(); + errorDetail = errorData.detail || errorData.message || errorData.error || errorDetail; + } catch { + try { + errorDetail = await response.text(); + if (errorDetail.length > 200) { + errorDetail = errorDetail.substring(0, 200) + '...'; + } + } catch { + // Fallback to status text if body cannot be read + } + } + throw new Error(`HTTP ${response.status} from ${url}: ${errorDetail}`); + } + return response; +}; + export const fetchServiceInfo = async () => { - const response = await fetch(`${API_BASE}/service-info`); + const url = `${API_BASE}/service-info`; + const response = await fetch(url); + await checkResponse(response, url); return response.json(); }; -export const fetchPangenomeLevels = async ( - digest, - level = '2', - collated = true, -) => { - const url = `${API_BASE}/pangenome/${digest}?level=1`; - const url2 = `${API_BASE}/pangenome/${digest}?level=2`; - const urlItemwise = `${API_BASE}/pangenome/${digest}?collated=false`; - let resps = [ - fetch(url).then((response) => response.json()), - fetch(url2).then((response) => response.json()), - fetch(urlItemwise).then((response) => response.json()), +export const fetchPangenomeLevels = async (digest) => { + const urls = [ + `${API_BASE}/pangenome/${digest}?level=1`, + `${API_BASE}/pangenome/${digest}?level=2`, + `${API_BASE}/pangenome/${digest}?collated=false`, ]; - return Promise.all(resps); + return Promise.all( + urls.map(async (url) => { + const response = await fetch(url); + await checkResponse(response, url); + return response.json(); + }), + ); }; export const fetchSeqColList = async () => { - const url = `${API_BASE}/list/collection?page_size=10&page=0`; - const url2 = `${API_BASE}/list/pangenome?page_size=5`; - const url3 = `${API_BASE}/list/attributes/name_length_pairs?page_size=5`; - let resps = [ - fetch(url).then((response) => response.json()), - fetch(url2).then((response) => response.json()), - fetch(url3).then((response) => response.json()), + const urls = [ + `${API_BASE}/list/collection?page_size=10&page=0`, + `${API_BASE}/list/pangenome?page_size=5`, + `${API_BASE}/list/attributes/name_length_pairs?page_size=5`, ]; - return Promise.all(resps); + + return Promise.all( + urls.map(async (url) => { + const response = await fetch(url); + await checkResponse(response, url); + return response.json(); + }), + ); }; export const fetchAllSeqCols = async () => { - const url = `${API_BASE}/list/collection?page_size=1000&page=0`; - let resps = [fetch(url).then((response) => response.json())]; - return Promise.all(resps); -}; + const urls = [ + `${API_BASE}/list/collection?page_size=1000&page=0`, + ]; -export const fetchSeqColDetails = 
async ( - digest, - level = '2', - collated = true, -) => { - const url = `${API_BASE}/collection/${digest}?level=${level}&collated=${collated}`; - return fetch(url).then((response) => response.json()); + return Promise.all( + urls.map(async (url) => { + const response = await fetch(url); + await checkResponse(response, url); + return response.json(); + }), + ); }; export const fetchCollectionLevels = async (digest) => { @@ -56,20 +92,13 @@ export const fetchCollectionLevels = async (digest) => { `${API_BASE}/collection/${digest}?collated=false`, ]; - const responses = await Promise.all( - urls.map((url) => - fetch(url).then((response) => { - if (!response.ok) { - throw new Error( - `Error fetching data from ${url}: ${response.statusText}`, - ); - } - return response.json(); - }), - ), + return Promise.all( + urls.map(async (url) => { + const response = await fetch(url); + await checkResponse(response, url); + return response.json(); + }), ); - - return responses; }; export const fetchComparison = async (digest1, digest2) => { @@ -77,55 +106,67 @@ export const fetchComparison = async (digest1, digest2) => { const response = await fetch(url); if (!response.ok) { if (response.status === 404) { - const err = new Error('Collection not found'); - err.digest1 = digest1; - err.digest2 = digest2; - err.isNotFound = true; - throw err; + throw new AppError('Collection not found', { + status: 404, + isNotFound: true, + digest1, + digest2, + }); } - throw new Error(`Comparison failed: ${response.status} ${response.statusText}`); + await checkResponse(response, url); } return response.json(); }; export const fetchComparisonJSON = async (data, digest) => { const url = `${API_BASE}/comparison/${digest}`; - return fetch(url, { + const response = await fetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json', }, body: JSON.stringify(data), - }).then((response) => response.json()); + }); + await checkResponse(response, url); + return response.json(); }; export const fetchAttribute = async (attribute, digest) => { - const url = `${API_BASE}/list/collection?${attribute}=${digest}`; - const url2 = `${API_BASE}/attribute/collection/${attribute}/${digest}`; - let resps = [ - fetch(url).then((response) => response.json()), - fetch(url2).then((response) => response.json()), + const urls = [ + `${API_BASE}/list/collection?${attribute}=${digest}`, + `${API_BASE}/attribute/collection/${attribute}/${digest}`, ]; - return Promise.all(resps); + + return Promise.all( + urls.map(async (url) => { + const response = await fetch(url); + await checkResponse(response, url); + return response.json(); + }), + ); }; export const fetchSimilarities = async (digest) => { const url = `${API_BASE}/similarities/${digest}?page_size=60`; - return fetch(url, { + const response = await fetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json', }, - }).then((response) => response.json()); + }); + await checkResponse(response, url); + return response.json(); }; export const fetchSimilaritiesJSON = async (data, species) => { const url = `${API_BASE}/similarities/?species=${species}&page_size=60`; - return fetch(url, { + const response = await fetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json', }, body: JSON.stringify(data), - }).then((response) => response.json()); + }); + await checkResponse(response, url); + return response.json(); }; diff --git a/frontend/src/stores/similarities.js b/frontend/src/stores/similarities.js index 5c854c2..6c70663 100644 --- 
a/frontend/src/stores/similarities.js +++ b/frontend/src/stores/similarities.js @@ -7,33 +7,41 @@ export const useSimilaritiesStore = create((set, get) => ({ customCollectionJSON: '', customCount: 1, similarities: null, + error: null, sortBy: null, sortAscending: false, species: 'human', - setSortBy: (value) => set({ sortBy: value }), - setSortAscending: (value) => set({ sortAscending: value }), setSpecies: (value) => set({ species: value }), + setError: (value) => set({ error: value }), - sortSimilarities: () => { + resetSort: () => set({ sortBy: null, sortAscending: false }), + + sortByColumn: (column) => { const { similarities, sortBy, sortAscending } = get(); - - if (!similarities || !sortBy) return; - - const sampleValue = similarities.find(item => item[sortBy] != null)?.[sortBy]; - + + const newSortBy = column; + const newSortAscending = sortBy === column ? !sortAscending : false; + + if (!similarities) { + set({ sortBy: newSortBy, sortAscending: newSortAscending }); + return; + } + + const sampleValue = similarities.find(item => item[newSortBy] != null)?.[newSortBy]; + const sorted = [...similarities]; - + if (typeof sampleValue === 'number') { - sorted.sort((a, b) => sortAscending ? a[sortBy] - b[sortBy] : b[sortBy] - a[sortBy]); + sorted.sort((a, b) => newSortAscending ? a[newSortBy] - b[newSortBy] : b[newSortBy] - a[newSortBy]); } else { - sorted.sort((a, b) => sortAscending - ? String(a[sortBy]).localeCompare(String(b[sortBy])) - : String(b[sortBy]).localeCompare(String(a[sortBy])) + sorted.sort((a, b) => newSortAscending + ? String(a[newSortBy]).localeCompare(String(b[newSortBy])) + : String(b[newSortBy]).localeCompare(String(a[newSortBy])) ); } - - set({ similarities: sorted }); + + set({ sortBy: newSortBy, sortAscending: newSortAscending, similarities: sorted }); }, setSelectedCollectionsIndex: (value) => { @@ -68,25 +76,26 @@ export const useSimilaritiesStore = create((set, get) => ({ setSimilarities: (value) => { const { sortBy, sortAscending } = get(); - - if (!sortBy) { + + if (!sortBy || !value) { set({ similarities: value }); return; } const sampleValue = value.find(item => item[sortBy] != null)?.[sortBy]; + const sorted = [...value]; + if (typeof sampleValue === 'number') { - set({ similarities: sortAscending - ? value.sort((a, b) => a[sortBy] - b[sortBy]) - : value.sort((a, b) => b[sortBy] - a[sortBy]) - }); + sorted.sort((a, b) => sortAscending ? a[sortBy] - b[sortBy] : b[sortBy] - a[sortBy]); } else { - set({ similarities: sortAscending - ? value.sort((a, b) => a[sortBy].localeCompare(b[sortBy])) - : value.sort((a, b) => b[sortBy].localeCompare(a[sortBy])) - }); + sorted.sort((a, b) => sortAscending + ? 
String(a[sortBy]).localeCompare(String(b[sortBy])) + : String(b[sortBy]).localeCompare(String(a[sortBy])) + ); } + + set({ similarities: sorted }); }, getAllCollections: (collections) => { diff --git a/frontend/src/utilities.jsx b/frontend/src/utilities.jsx index c5f2ecd..d137471 100644 --- a/frontend/src/utilities.jsx +++ b/frontend/src/utilities.jsx @@ -5,13 +5,29 @@ import copyToClipboardIcon from './assets/copy_to_clipboard.svg'; import barcodeIcon from './assets/barcode.svg'; const copyToClipboard = async (text) => { - toast.success('Digest copied!'); - return await navigator.clipboard.writeText(text); + try { + await navigator.clipboard.writeText(text); + toast.success('Digest copied!'); + } catch (error) { + toast.error('Failed to copy to clipboard'); + } }; const snakeToTitle = (str) => str.replace(/_/g, ' ').replace(/\b\w/g, (char) => char.toUpperCase()); +// Unicode-safe base64 encoding +// Handles all Unicode characters including non-ASCII sequences +const encodeToBase64 = (str) => { + return btoa(unescape(encodeURIComponent(str))); +}; + +// Unicode-safe base64 decoding +// Handles all Unicode characters including non-ASCII sequences +const decodeFromBase64 = (encoded) => { + return decodeURIComponent(escape(atob(encoded))); +}; + const encodeComparison = (input) => { let jsonString; @@ -28,7 +44,7 @@ const encodeComparison = (input) => { throw new Error('Input must be an object or valid JSON string'); } - return btoa(jsonString); + return encodeToBase64(jsonString); }; export { @@ -38,4 +54,6 @@ export { copyToClipboardIcon, snakeToTitle, encodeComparison, + encodeToBase64, + decodeFromBase64, }; diff --git a/refget/__init__.py b/refget/__init__.py index 8129a43..e739c06 100644 --- a/refget/__init__.py +++ b/refget/__init__.py @@ -15,6 +15,8 @@ from .const import GTARS_INSTALLED from .utils import canonical_str from .store import RefgetStore, StorageMode, digest_fasta, compute_fai, digest_sequence, SequenceCollection +from .compliance import run_compliance +from .clients import SequenceCollectionClient __all__ = [ "__version__", @@ -27,4 +29,6 @@ "compute_fai", "digest_sequence", "SequenceCollection", + "run_compliance", + "SequenceCollectionClient", ] diff --git a/refget/_version.py b/refget/_version.py index 1f4c4d4..ae6db5f 100644 --- a/refget/_version.py +++ b/refget/_version.py @@ -1 +1 @@ -__version__ = "0.10.1" +__version__ = "0.11.0" diff --git a/refget/compliance.py b/refget/compliance.py new file mode 100644 index 0000000..7020d9e --- /dev/null +++ b/refget/compliance.py @@ -0,0 +1,496 @@ +""" +GA4GH SeqCol API Compliance Suite. + +This is THE canonical compliance suite. It can be run two ways: +1. Via pytest: tests/api/test_compliance.py wraps these checks +2. Via web UI: /compliance/stream endpoint streams results in real-time + +All check functions take an api_root URL and raise AssertionError on failure. +The runner functions execute checks and return structured results. + +Test data is loaded from test_fasta/test_fasta_digests.json and +tests/api/comparison/ fixture files relative to the repository root. 
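+
+Example (a minimal sketch; the server URL is an assumption -- any running
+seqcol server with the demo FASTA data loaded will do):
+
+    from refget.compliance import run_compliance
+    report = run_compliance("http://127.0.0.1:8000")
+    print(f"{report['passed']}/{report['total']} checks passed")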
+""" + +import json +import logging +import time +from dataclasses import dataclass, field, asdict +from datetime import datetime, timezone +from pathlib import Path + +import requests + +_LOGGER = logging.getLogger(__name__) + +COMPLIANCE_TIMEOUT = 3 # seconds per request + +# ============================================================ +# Test data -- loaded from repository fixtures +# ============================================================ + +REPO_ROOT = Path(__file__).parent.parent +_DIGESTS_FILE = REPO_ROOT / "test_fasta" / "test_fasta_digests.json" +_COMPARISON_DIR = REPO_ROOT / "tests" / "api" / "comparison" + +# Load digest test data +with open(_DIGESTS_FILE) as _f: + DIGEST_DATA = json.load(_f) + +# Convert to list of (name, bundle) tuples for iteration +DIGEST_TESTS = [(name, bundle) for name, bundle in DIGEST_DATA.items()] + +# Comparison fixture files (base.fa vs each other file) +COMPARISON_FILES = [ + _COMPARISON_DIR / "compare_base.fa_subset.fa.json", + _COMPARISON_DIR / "compare_base.fa_different_names.fa.json", + _COMPARISON_DIR / "compare_base.fa_different_order.fa.json", + _COMPARISON_DIR / "compare_base.fa_pair_swap.fa.json", + _COMPARISON_DIR / "compare_base.fa_swap_wo_coords.fa.json", +] + +# Load comparison fixtures +COMPARISON_FIXTURES = {} +for _f in COMPARISON_FILES: + with open(_f) as _fp: + COMPARISON_FIXTURES[_f.name] = json.load(_fp) + + +# ============================================================ +# Result types +# ============================================================ + + +@dataclass +class CheckResult: + """Result of a single compliance check.""" + + name: str + passed: bool + duration_ms: float + description: str | None = None + message: str | None = None + error: str | None = None + + +@dataclass +class ComplianceReport: + """Full compliance report for a server.""" + + server_url: str + timestamp: str + total: int = 0 + passed: int = 0 + failed: int = 0 + errors: int = 0 + results: list[dict] = field(default_factory=list) + + def to_dict(self) -> dict: + return asdict(self) + + +def _timed_check(name: str, func, *args, **kwargs) -> CheckResult: + """Run a check function and capture timing and errors.""" + description = (func.__doc__ or "").strip().split("\n")[0] or None + start = time.monotonic() + try: + func(*args, **kwargs) + elapsed = (time.monotonic() - start) * 1000 + return CheckResult(name=name, passed=True, duration_ms=round(elapsed, 2), description=description) + except AssertionError as e: + elapsed = (time.monotonic() - start) * 1000 + return CheckResult( + name=name, passed=False, duration_ms=round(elapsed, 2), description=description, error=str(e) + ) + except requests.exceptions.RequestException as e: + elapsed = (time.monotonic() - start) * 1000 + return CheckResult( + name=name, + passed=False, + duration_ms=round(elapsed, 2), + description=description, + error=f"Connection error: {e}", + ) + except Exception as e: + elapsed = (time.monotonic() - start) * 1000 + return CheckResult( + name=name, + passed=False, + duration_ms=round(elapsed, 2), + description=description, + error=f"Unexpected error: {e}", + ) + + +# ============================================================ +# Structure checks -- validate response format +# ============================================================ + + +def check_service_info(api_root): + """Service-info returns required GA4GH fields and seqcol schema.""" + res = requests.get(f"{api_root}/service-info", timeout=COMPLIANCE_TIMEOUT) + data = res.json() + assert "id" in data, "service-info 
missing 'id' field" + assert "type" in data, "service-info missing 'type' field" + assert "group" in data["type"], "service-info type missing 'group'" + assert "artifact" in data["type"], "service-info type missing 'artifact'" + assert "version" in data["type"], "service-info type missing 'version'" + assert "seqcol" in data, "service-info must have 'seqcol' section" + assert "schema" in data["seqcol"], "seqcol section must include 'schema'" + schema = data["seqcol"]["schema"] + assert "properties" in schema, "schema must have 'properties'" + assert "lengths" in schema["properties"], "schema must define 'lengths'" + assert "names" in schema["properties"], "schema must define 'names'" + assert "sequences" in schema["properties"], "schema must define 'sequences'" + + +def check_list_collections(api_root): + """List collections returns paginated results with total count.""" + res = requests.get(f"{api_root}/list/collection", timeout=COMPLIANCE_TIMEOUT) + data = res.json() + assert "results" in data, "list/collection missing 'results' field" + assert isinstance(data["results"], list), "list/collection 'results' should be a list" + assert "pagination" in data, "list/collection missing 'pagination' field" + assert "page" in data["pagination"], "pagination missing 'page'" + assert "page_size" in data["pagination"], "pagination missing 'page_size'" + assert "total" in data["pagination"], "pagination must include 'total' per GA4GH spec" + assert isinstance(data["pagination"]["total"], int), "pagination 'total' must be an integer" + + +def check_list_attributes(api_root, attribute_name): + """List attributes endpoint returns paginated results.""" + res = requests.get(f"{api_root}/list/attributes/{attribute_name}", timeout=COMPLIANCE_TIMEOUT) + data = res.json() + assert "results" in data, f"list/attributes/{attribute_name} missing 'results' field" + assert isinstance( + data["results"], list + ), f"list/attributes/{attribute_name} 'results' should be a list" + + +def check_openapi_available(api_root): + """OpenAPI endpoint is available (RECOMMENDED by spec Section 3.6).""" + res = requests.get(f"{api_root}/openapi.json", timeout=COMPLIANCE_TIMEOUT) + assert res.status_code == 200, f"OpenAPI endpoint returned status {res.status_code}" + data = res.json() + assert "openapi" in data, "OpenAPI response missing 'openapi' field" + + +# ============================================================ +# Collection checks -- verify content against known test data +# ============================================================ + + +def check_collection_level1(api_root, fa_name, bundle): + """Level 1 response returns digest strings for all attributes.""" + digest = bundle["top_level_digest"] + res = requests.get(f"{api_root}/collection/{digest}?level=1", timeout=COMPLIANCE_TIMEOUT) + assert res.status_code == 200, f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + data = res.json() + for attr in ["names", "lengths", "sequences"]: + assert isinstance(data[attr], str), ( + f"Level 1 {attr} should be digest string, got {type(data[attr]).__name__}: {data[attr]}" + ) + assert data[attr] == bundle["level1"][attr], ( + f"Level 1 {attr} for {fa_name}: expected {bundle['level1'][attr]}, got {data[attr]}" + ) + assert "sorted_name_length_pairs" in data, "Level 1 missing sorted_name_length_pairs" + + +def check_collection_level2(api_root, fa_name, bundle): + """Level 2 response returns arrays matching expected content.""" + digest = bundle["top_level_digest"] + res = 
requests.get(f"{api_root}/collection/{digest}?level=2", timeout=COMPLIANCE_TIMEOUT) + assert res.status_code == 200, f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + data = res.json() + for attr in ["names", "lengths", "sequences"]: + assert isinstance(data[attr], list), ( + f"Level 2 {attr} should be array, got {type(data[attr]).__name__}" + ) + assert data[attr] == bundle["level2"][attr], ( + f"Level 2 {attr} for {fa_name}: expected {bundle['level2'][attr]}, got {data[attr]}" + ) + assert "sorted_name_length_pairs" not in data, "Level 2 should not have sorted_name_length_pairs" + + +def check_default_level_returns_level2(api_root, fa_name, bundle): + """Collection without ?level= param returns level 2 arrays (spec default).""" + digest = bundle["top_level_digest"] + res = requests.get(f"{api_root}/collection/{digest}", timeout=COMPLIANCE_TIMEOUT) + assert res.status_code == 200, f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + data = res.json() + for attr in ["names", "lengths", "sequences"]: + assert isinstance(data[attr], list), ( + f"Default level for {fa_name} {attr} should be array, got {type(data[attr]).__name__}" + ) + + +def check_sorted_name_length_pairs(api_root, fa_name, bundle): + """Level 1 sorted_name_length_pairs digest matches expected value.""" + digest = bundle["top_level_digest"] + res = requests.get(f"{api_root}/collection/{digest}?level=1", timeout=COMPLIANCE_TIMEOUT) + assert res.status_code == 200, f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + data = res.json() + expected = bundle["sorted_name_length_pairs_digest"] + actual = data.get("sorted_name_length_pairs") + assert actual == expected, ( + f"SNLP for {fa_name}: expected {expected}, got {actual}" + ) + + +# ============================================================ +# Attribute checks -- verify attribute retrieval +# ============================================================ + + +def check_attribute_retrieval(api_root, fa_name, bundle, attr_name): + """Attribute endpoint returns correct array for a known digest.""" + attr_digest = bundle["level1"][attr_name] + expected = bundle["level2"][attr_name] + res = requests.get( + f"{api_root}/attribute/collection/{attr_name}/{attr_digest}", timeout=COMPLIANCE_TIMEOUT + ) + assert res.status_code == 200, ( + f"Attribute {attr_name}/{attr_digest} returned HTTP {res.status_code} (expected 200)" + ) + actual = res.json() + assert actual == expected, ( + f"Attribute {attr_name} for {fa_name}: expected {expected}, got {actual}" + ) + + +def check_transient_attribute_not_served(api_root): + """Transient attributes (sorted_name_length_pairs) return 404 from /attribute.""" + bundle = DIGEST_TESTS[0][1] + digest = bundle["top_level_digest"] + level1 = requests.get(f"{api_root}/collection/{digest}?level=1", timeout=COMPLIANCE_TIMEOUT).json() + snlp_digest = level1["sorted_name_length_pairs"] + res = requests.get( + f"{api_root}/attribute/collection/sorted_name_length_pairs/{snlp_digest}", + timeout=COMPLIANCE_TIMEOUT, + ) + assert res.status_code == 404, "Transient attributes should not be served by /attribute endpoint" + + +# ============================================================ +# List/filter checks -- verify filtering and pagination +# ============================================================ + + +def check_list_filter_by_attribute(api_root, fa_name, bundle, attr_name): + """List collections filtered by attribute digest returns the expected collection.""" + attr_digest = 
bundle["level1"][attr_name] + top_digest = bundle["top_level_digest"] + res = requests.get( + f"{api_root}/list/collection?{attr_name}={attr_digest}", timeout=COMPLIANCE_TIMEOUT + ) + assert res.status_code == 200, f"List filter returned HTTP {res.status_code}" + data = res.json() + assert "results" in data, "Filtered list missing 'results'" + assert top_digest in data["results"], ( + f"Collection {top_digest} not in results when filtering by {attr_name}={attr_digest} for {fa_name}. " + f"Got {len(data['results'])} results: {data['results'][:5]}" + ) + + +def check_list_multi_attribute_filter_and(api_root): + """Multiple filter attributes use AND logic (spec Section 3.4).""" + bundle = DIGEST_TESTS[0][1] + names_digest = bundle["level1"]["names"] + lengths_digest = bundle["level1"]["lengths"] + data = requests.get( + f"{api_root}/list/collection?names={names_digest}&lengths={lengths_digest}", + timeout=COMPLIANCE_TIMEOUT, + ).json() + assert bundle["top_level_digest"] in data["results"], ( + "AND filter should return base.fa collection" + ) + + +# ============================================================ +# Comparison checks -- verify comparison endpoint +# ============================================================ + + +def check_comparison(api_root, fixture_name, expected): + """GET comparison returns correct diff structure matching fixture data.""" + url = f"{api_root}/comparison/{expected['digests']['a']}/{expected['digests']['b']}" + res = requests.get(url, timeout=COMPLIANCE_TIMEOUT) + assert res.status_code == 200, f"Comparison returned HTTP {res.status_code} for {fixture_name}" + import refget + + actual = res.json() + assert refget.canonical_str(actual) == refget.canonical_str(expected), ( + f"Comparison mismatch for {fixture_name}.\n" + f" Expected attributes: {expected.get('attributes')}\n" + f" Got attributes: {actual.get('attributes')}" + ) + + +def check_comparison_structure(api_root): + """Comparison response has all required fields (digests, attributes, array_elements).""" + digest_a = DIGEST_TESTS[0][1]["top_level_digest"] + digest_b = DIGEST_TESTS[1][1]["top_level_digest"] + data = requests.get( + f"{api_root}/comparison/{digest_a}/{digest_b}", timeout=COMPLIANCE_TIMEOUT + ).json() + assert "digests" in data and "a" in data["digests"] and "b" in data["digests"] + assert "attributes" in data + assert "a_only" in data["attributes"] + assert "b_only" in data["attributes"] + assert "a_and_b" in data["attributes"] + assert "array_elements" in data + assert "a_count" in data["array_elements"] + assert "b_count" in data["array_elements"] + assert "a_and_b_count" in data["array_elements"] + assert "a_and_b_same_order" in data["array_elements"] + + +def check_comparison_same_order_values(api_root): + """Identical comparison: a_and_b_same_order values are all true.""" + digest = DIGEST_TESTS[0][1]["top_level_digest"] + data = requests.get( + f"{api_root}/comparison/{digest}/{digest}", timeout=COMPLIANCE_TIMEOUT + ).json() + same_order = data["array_elements"]["a_and_b_same_order"] + for attr, val in same_order.items(): + assert val is True or val is False or val is None, ( + f"a_and_b_same_order[{attr}] must be bool or null, got {type(val)}" + ) + assert val is True, f"Identical comparison: a_and_b_same_order[{attr}] should be true" + + +def check_comparison_post(api_root, fixture_name, expected): + """POST comparison with local seqcol body returns correct diff.""" + import refget + + digest_b = expected["digests"]["b"] + client = 
refget.SequenceCollectionClient(urls=[api_root]) + local_collection = client.get_collection(digest_b) + + digest_a = expected["digests"]["a"] + res = requests.post( + f"{api_root}/comparison/{digest_a}", + json=local_collection, + timeout=COMPLIANCE_TIMEOUT, + ) + assert res.status_code == 200, ( + f"Comparison POST returned HTTP {res.status_code} for {fixture_name}" + ) + data = res.json() + assert data["digests"]["a"] == expected["digests"]["a"], ( + f"POST digest a: expected {expected['digests']['a']}, got {data['digests']['a']}" + ) + assert data["attributes"] == expected["attributes"], ( + f"POST attributes for {fixture_name}: expected {expected['attributes']}, got {data['attributes']}" + ) + assert data["array_elements"] == expected["array_elements"], ( + f"POST array_elements for {fixture_name}: expected {expected['array_elements']}, got {data['array_elements']}" + ) + + +# ============================================================ +# Check registry -- builds the full compliance suite +# ============================================================ + + +def build_checks(api_root: str) -> list[tuple[str, callable, list]]: + """Build the complete list of compliance checks. + + Returns list of (name, function, args) tuples. + """ + checks = [] + + # Structure checks + checks.append(("service_info", check_service_info, [api_root])) + checks.append(("list_collections", check_list_collections, [api_root])) + for attr in ["lengths", "names", "sequences"]: + checks.append((f"list_attributes_{attr}", check_list_attributes, [api_root, attr])) + checks.append(("openapi_available", check_openapi_available, [api_root])) + + # Collection content checks (per FASTA file) + for fa_name, bundle in DIGEST_TESTS: + tag = fa_name.replace(".fa", "") + checks.append((f"collection_level1_{tag}", check_collection_level1, [api_root, fa_name, bundle])) + checks.append((f"collection_level2_{tag}", check_collection_level2, [api_root, fa_name, bundle])) + checks.append((f"default_level2_{tag}", check_default_level_returns_level2, [api_root, fa_name, bundle])) + checks.append((f"snlp_digest_{tag}", check_sorted_name_length_pairs, [api_root, fa_name, bundle])) + + # Attribute retrieval checks (per FASTA, per attribute) + for fa_name, bundle in DIGEST_TESTS: + tag = fa_name.replace(".fa", "") + for attr in ["lengths", "names", "sequences"]: + checks.append(( + f"attribute_{attr}_{tag}", + check_attribute_retrieval, + [api_root, fa_name, bundle, attr], + )) + + # Attribute filtering checks + checks.append(("transient_attribute_not_served", check_transient_attribute_not_served, [api_root])) + checks.append(("multi_attribute_filter_and", check_list_multi_attribute_filter_and, [api_root])) + + # List filter checks (base.fa, filter by each attribute) + base_name, base_bundle = DIGEST_TESTS[0] + for attr in ["lengths", "names", "sequences"]: + checks.append(( + f"list_filter_{attr}", + check_list_filter_by_attribute, + [api_root, base_name, base_bundle, attr], + )) + + # Comparison checks + checks.append(("comparison_structure", check_comparison_structure, [api_root])) + checks.append(("comparison_same_order", check_comparison_same_order_values, [api_root])) + + for fixture_name, expected in COMPARISON_FIXTURES.items(): + tag = fixture_name.replace("compare_", "").replace(".json", "") + checks.append((f"comparison_{tag}", check_comparison, [api_root, fixture_name, expected])) + checks.append((f"comparison_post_{tag}", check_comparison_post, [api_root, fixture_name, expected])) + + return checks + + +# 
============================================================
+# Runners -- batch and streaming
+# ============================================================
+
+
+def run_compliance(api_root: str) -> dict:
+    """Run all compliance checks and return a report dict."""
+    api_root = api_root.rstrip("/")
+    report = ComplianceReport(
+        server_url=api_root,
+        timestamp=datetime.now(timezone.utc).isoformat(),
+    )
+
+    for name, func, args in build_checks(api_root):
+        result = _timed_check(name, func, *args)
+        report.results.append(asdict(result))
+        report.total += 1
+        if result.passed:
+            report.passed += 1
+        else:
+            report.failed += 1
+
+    return report.to_dict()
+
+
+def run_compliance_stream(api_root: str):
+    """Generator that yields each check result as a JSON string for SSE streaming."""
+    api_root = api_root.rstrip("/")
+    checks = build_checks(api_root)
+
+    yield json.dumps({"type": "start", "total": len(checks), "server_url": api_root})
+
+    passed = 0
+    failed = 0
+    for name, func, args in checks:
+        result = _timed_check(name, func, *args)
+        if result.passed:
+            passed += 1
+        else:
+            failed += 1
+        yield json.dumps({"type": "result", **asdict(result)})
+
+    yield json.dumps({"type": "done", "passed": passed, "failed": failed, "total": len(checks)})
diff --git a/refget/router.py b/refget/router.py
index 82e768d..eeb76ef 100644
--- a/refget/router.py
+++ b/refget/router.py
@@ -21,6 +21,7 @@
 import logging
 
 from fastapi import APIRouter, Response, HTTPException, Request, Depends, Query
+from fastapi.responses import StreamingResponse
 
 from .models import Similarities, PaginationResult, PaginatedDigestList
 from .agents import RefgetDBAgent
@@ -45,6 +46,7 @@ def create_refget_router(
     collections: bool = True,
     pangenomes: bool = False,
     fasta_drs: bool = False,
+    compliance: bool = True,
     refget_store_url: str = None,
 ) -> APIRouter:
     """
@@ -85,6 +87,9 @@ def create_refget_router(
     if fasta_drs:
         _LOGGER.info("Adding FASTA DRS endpoints...")
         refget_router.include_router(fasta_drs_router, prefix="/fasta")
+    if compliance:
+        _LOGGER.info("Adding compliance endpoints...")
+        refget_router.include_router(compliance_router)
 
     return refget_router
 
@@ -562,3 +567,65 @@ async def get_fasta_index(
         }
     except ValueError:
         raise HTTPException(status_code=404, detail="Object not found")
+
+
+compliance_router = APIRouter()
+
+
+@compliance_router.get(
+    "/compliance/run",
+    summary="Run compliance checks against a seqcol server",
+    tags=["Compliance"],
+)
+def run_compliance_endpoint(
+    request: Request,
+    target_url: str | None = Query(None, description="Target server URL to test (defaults to self)"),
+):
+    """
+    Run the GA4GH SeqCol compliance suite against a server.
+
+    Runs both structure checks (service-info, list, pagination, collection
+    structure) and content checks; the content checks expect the demo test
+    data to be loaded on the target server.
+
+    If no target_url is provided, tests run against this server.
+    """
+    from .compliance import run_compliance
+
+    if target_url is None:
+        scheme = request.headers.get("x-forwarded-proto", request.url.scheme)
+        host = request.headers.get("host", request.url.netloc)
+        target_url = f"{scheme}://{host}"
+
+    return run_compliance(target_url)
+
+
+@compliance_router.get(
+    "/compliance/stream",
+    summary="Stream compliance checks via Server-Sent Events",
+    tags=["Compliance"],
+)
+def stream_compliance_endpoint(
+    request: Request,
+    target_url: str | None = Query(None, description="Target server URL to test (defaults to self)"),
+):
+    """
+    Stream compliance check results in real-time via Server-Sent Events.
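+
+    A minimal client sketch (the base URL is an assumption):
+
+        import requests
+        with requests.get("http://127.0.0.1:8000/compliance/stream", stream=True) as r:
+            for line in r.iter_lines():
+                if line.startswith(b"data: "):
+                    print(line[6:].decode())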
+ + Each event contains a JSON object with type "start", "result", or "done". + """ + from .compliance import run_compliance_stream + + if target_url is None: + scheme = request.headers.get("x-forwarded-proto", request.url.scheme) + host = request.headers.get("host", request.url.netloc) + target_url = f"{scheme}://{host}" + + def event_stream(): + for data in run_compliance_stream(target_url): + yield f"data: {data}\n\n" + + return StreamingResponse( + event_stream(), + media_type="text/event-stream", + headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"}, + ) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 146a887..1d1c6e3 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,5 +1,5 @@ jsonschema -gtars>=0.6.0 +gtars>=0.7.0 pyyaml requests sqlmodel diff --git a/seqcolapi/_version.py b/seqcolapi/_version.py deleted file mode 100644 index 3e2f46a..0000000 --- a/seqcolapi/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.9.0" diff --git a/seqcolapi/const.py b/seqcolapi/const.py index 07fbe54..ccf3895 100644 --- a/seqcolapi/const.py +++ b/seqcolapi/const.py @@ -1,13 +1,12 @@ import os -from refget._version import __version__ as refget_pkg_version +from refget._version import __version__ as refget_version +from gtars import __version__ as gtars_version from platform import python_version -from ._version import __version__ as seqcolapi_version - ALL_VERSIONS = { - "seqcolapi_version": seqcolapi_version, - "refget_pkg_version": refget_pkg_version, + "refget_version": refget_version, + "gtars_version": gtars_version, "python_version": python_version(), "seqcol_spec_version": "1.0.0", } diff --git a/seqcolapi/main.py b/seqcolapi/main.py index 70d6fd6..b13cc72 100644 --- a/seqcolapi/main.py +++ b/seqcolapi/main.py @@ -69,7 +69,7 @@ async def lifespan_loader(app): app = FastAPI( title="Sequence Collections API", description="An API providing metadata such as names, lengths, and other values for collections of reference sequences", - version=ALL_VERSIONS["seqcolapi_version"], + version=ALL_VERSIONS["refget_version"], lifespan=lifespan_loader, ) diff --git a/test_fasta/different_order.rgsi b/test_fasta/different_order.rgsi new file mode 100644 index 0000000..3cf702e --- /dev/null +++ b/test_fasta/different_order.rgsi @@ -0,0 +1,8 @@ +##seqcol_digest=Tpdsg75D4GKCGEHtIiDSL9Zx-DSuX5V8 +##names_digest=dOAOfPGkf3wAf3CUsbjVTKhY9Wq2DL6f +##sequences_digest=7t6Ulz6OeUWu6FBxntbvFKOl8w3icl2h +##lengths_digest=x5qpE4FtMkvlwpKIzvHs3a02Nex5tthp +#name length alphabet sha512t24u md5 description +chr1 4 dna2bit YBbVX0dLKG1ieEDCiMmkrTZFt_Z5Vdaj 31fc6ca291a32fb9df82b85e5f077e31 +chr2 4 dna2bit AcLxtBuKEPk_7PGE_H4dGElwZHCujwH6 92c6a56c9e9459d8a42b96f7884710bc +chrX 8 dna2bit iYtREV555dUFKg2_agSJW6suquUyPpMw 5f63cfaa3ef61f88c9635fb9d18ec945 diff --git a/test_fasta/sample_fhr.json b/test_fasta/sample_fhr.json new file mode 100644 index 0000000..098bb0e --- /dev/null +++ b/test_fasta/sample_fhr.json @@ -0,0 +1,14 @@ +{ + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1.0, + "genome": "Test organism", + "version": "v1.0", + "taxon": { + "name": "Test organism", + "uri": "https://identifiers.org/taxonomy:12345" + }, + "masking": "soft-masked", + "genomeSynonym": ["test_v1"], + "dateCreated": "2025-01-01", + "license": "CC0-1.0" +} diff --git a/test_fasta/subset.rgsi b/test_fasta/subset.rgsi new file mode 100644 index 0000000..e767fc7 --- /dev/null +++ 
b/test_fasta/subset.rgsi @@ -0,0 +1,7 @@ +##seqcol_digest=sv7GIP1K0qcskIKF3iaBmQpaum21vH74 +##names_digest=iyNUhtfR0TALytlmxK1Zx1_q3frkZyAd +##sequences_digest=3ZP38SZcoc9wN7jsRyNSP9mQ1a3TUoUF +##lengths_digest=7-_HdxYiRf-AJLBKOTaJUdxXrUkIXs6T +#name length alphabet sha512t24u md5 description +chrX 8 dna2bit iYtREV555dUFKg2_agSJW6suquUyPpMw 5f63cfaa3ef61f88c9635fb9d18ec945 +chr1 4 dna2bit YBbVX0dLKG1ieEDCiMmkrTZFt_Z5Vdaj 31fc6ca291a32fb9df82b85e5f077e31 diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 4bf9fe5..9ebaaca 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -1,42 +1,6 @@ import pytest from pathlib import Path -from tests.conftest import DEMO_FILES - -# from tests.conftest import pytest_addoption, api_root, pytest_configure, pytest_collection_modifyitems, check_server_is_running -from tests.conftest import API_TEST_DIR - -COLLECTION_TESTS = [ - (DEMO_FILES[0], f"{API_TEST_DIR}/collection/base_collection.json"), - (DEMO_FILES[1], f"{API_TEST_DIR}/collection/different_names_collection.json"), - (DEMO_FILES[2], f"{API_TEST_DIR}/collection/different_order_collection.json"), - (DEMO_FILES[3], f"{API_TEST_DIR}/collection/pair_swap_collection.json"), - (DEMO_FILES[4], f"{API_TEST_DIR}/collection/subset_collection.json"), - (DEMO_FILES[5], f"{API_TEST_DIR}/collection/swap_wo_coords_collection.json"), -] - -COMPARISON_TESTS = [ - f"{API_TEST_DIR}/comparison/compare_base.fa_subset.fa.json", # subset - f"{API_TEST_DIR}/comparison/compare_base.fa_different_names.fa.json", # same sequences, different names - f"{API_TEST_DIR}/comparison/compare_base.fa_different_order.fa.json", # same sequences, name order switch, but equivalent coordinate system - f"{API_TEST_DIR}/comparison/compare_base.fa_pair_swap.fa.json", # swapped name-length-pairs - f"{API_TEST_DIR}/comparison/compare_base.fa_swap_wo_coords.fa.json", # swapped name-length-pairs, but no coord system change -] - - -ATTRIBUTE_TESTS = [ - ("lengths", "7-_HdxYiRf-AJLBKOTaJUdxXrUkIXs6T", [8, 4]), - ("names", "Fw1r9eRxfOZD98KKrhlYQNEdSRHoVxAG", ["chrX", "chr1", "chr2"]), -] - -ATTRIBUTE_LIST_TESTS = [ - ( - "lengths", - "cGRMZIb3AVgkcAfNv39RN7hnT5Chk7RX", - f"{API_TEST_DIR}/attribute/cGRM.json", - ) -] - @pytest.fixture(scope="session") def test_data_root(): diff --git a/tests/api/test_compliance.py b/tests/api/test_compliance.py index 442d6f4..0e8652d 100644 --- a/tests/api/test_compliance.py +++ b/tests/api/test_compliance.py @@ -1,299 +1,115 @@ -# Compliance suite for the GA4GH SeqCol API v1.0.0 +# Pytest wrapper for the GA4GH SeqCol compliance suite. # -# Endpoints tested: -# - GET /service-info -# - GET /collection/:digest (level 1 and level 2) -# - GET /comparison/:digest1/:digest2 -# - POST /comparison/:digest -# - GET /attribute/collection/:attr/:digest -# - GET /list/collection (with pagination and filtering) -# - GET /list/attributes/:attr +# The canonical compliance checks live in refget/compliance.py. +# This file parametrizes them for pytest execution. # -# Also validates: -# - Level 1 returns digest strings, level 2 returns arrays -# - Transient attributes (sorted_name_length_pairs) in level 1 only -# - Pagination structure (results + pagination fields) +# Run against an external server: +# pytest tests/api --api-root https://seqcolapi.databio.org # -# Tests fall into two categories: -# 1. Content tests (collection, comparison, attribute): compare full responses to known fixtures -# 2. 
Structure tests (service-info, list endpoints): validate response structure only, since values vary by server +# Run via integration test server: +# ./scripts/test-integration.sh -import json import pytest -import requests -import refget - -# Collection endpoints -from tests.api.conftest import ( - COLLECTION_TESTS, - COMPARISON_TESTS, - ATTRIBUTE_TESTS, - ATTRIBUTE_LIST_TESTS, +from refget.compliance import ( + DIGEST_TESTS, + COMPARISON_FIXTURES, + check_service_info, + check_list_collections, + check_list_attributes, + check_openapi_available, + check_collection_level1, + check_collection_level2, + check_default_level_returns_level2, + check_sorted_name_length_pairs, + check_attribute_retrieval, + check_transient_attribute_not_served, + check_list_filter_by_attribute, + check_list_multi_attribute_filter_and, + check_comparison, + check_comparison_structure, + check_comparison_same_order_values, + check_comparison_post, ) -from tests.conftest import DIGEST_TESTS - -demo_file = "demo0.fa" -response_file = "tests/demo0_collection.json" - -print("Testing Compliance") - - -def read_url(url): - import requests - import yaml - - try: - response = requests.get(url, timeout=1) - except requests.exceptions.ConnectionError: - print(f"Connection error: {url}") - raise e - data = response.content - return yaml.safe_load(data) - - -def check_collection(api_root, demo_file, response_file, data_root): - - # Need schema to make sure we eliminate inherent attributes correctly - # schema_path = "https://schema.databio.org/refget/SeqColArraySetInherent.yaml" - # schema = read_url(schema_path) - # inherent_attrs = schema["inherent"] - - inherent_attrs = ["names", "sequences"] - print(f"Loading fasta file at '{data_root}/{demo_file}'") - digest = refget.fasta_to_digest(f"{data_root}/{demo_file}", inherent_attrs=inherent_attrs) - print(f"Checking digest: {digest}") - res = requests.get(f"{api_root}/collection/{digest}") - - client = refget.SequenceCollectionClient(urls=[api_root]) - - srv_response = client.get_collection(digest, level=1) - print("Server response:", srv_response) - try: - server_answer = json.loads(res.content) - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - - with open(response_file) as fp: - correct_answer = json.load(fp) - - assert ( - server_answer["sequences"] == correct_answer["sequences"] - ), f"Collection endpoint failed: sequence mismatch for {demo_file}" - assert ( - server_answer["names"] == correct_answer["names"] - ), f"Collection endpoint failed: names mismatch for {demo_file}" - assert ( - server_answer["lengths"] == correct_answer["lengths"] - ), f"Collection endpoint failed: lengths mismatch for {demo_file}" - - -def check_comparison(api_root, response_file): - with open(response_file) as fp: - correct_answer = json.load(fp) - - url = ( - f"{api_root}/comparison/{correct_answer['digests']['a']}/{correct_answer['digests']['b']}" - ) - res = requests.get(url) - try: - server_answer = json.loads(res.content) - print("Server answer:", refget.canonical_str(server_answer)) - print("Correct answer:", refget.canonical_str(correct_answer)) - assert refget.canonical_str(server_answer) == refget.canonical_str( - correct_answer - ), f"Comparison endpoint failed: {url}. 
File: {response_file}" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"Comparison endpoint failed: {url}" - - -def check_attribute(api_root, attribute_type, attribute, correct_value): - url = f"{api_root}/attribute/collection/{attribute_type}/{attribute}" - res = requests.get(url) - try: - server_answer = json.loads(res.content) - assert ( - server_answer == correct_value - ), f"Attribute endpoint failed: {url}. Answer: {correct_value}" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"Attribute endpoint failed: {url}" - -def check_list_collections_by_attribute(api_root, attribute_type, attribute, response_file): - with open(response_file) as fp: - correct_answer = json.load(fp) - - url = f"{api_root}/list/collection?{attribute_type}={attribute}" - res = requests.get(url) - try: - server_answer = json.loads(res.content) - print("Server answer:", server_answer) - for digest in correct_answer["results"]: - print("Checking digest:", digest) - assert ( - digest in server_answer["results"] - ), f"Attribute endpoint failed: {url}. Missing: {digest}" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"Attribute endpoint failed: {url}" - - -def check_service_info(api_root): - url = f"{api_root}/service-info" - res = requests.get(url) - try: - server_answer = json.loads(res.content) - # Check required GA4GH service-info fields exist - assert "id" in server_answer, "service-info missing 'id' field" - assert "type" in server_answer, "service-info missing 'type' field" - assert "group" in server_answer["type"], "service-info type missing 'group'" - assert "artifact" in server_answer["type"], "service-info type missing 'artifact'" - assert "version" in server_answer["type"], "service-info type missing 'version'" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"Service-info endpoint failed: {url}" - - -def check_list_collections(api_root): - url = f"{api_root}/list/collection" - res = requests.get(url) - try: - server_answer = json.loads(res.content) - assert "results" in server_answer, "list/collection missing 'results' field" - assert isinstance( - server_answer["results"], list - ), "list/collection 'results' should be a list" - assert "pagination" in server_answer, "list/collection missing 'pagination' field" - assert "page" in server_answer["pagination"], "pagination missing 'page'" - assert "page_size" in server_answer["pagination"], "pagination missing 'page_size'" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"List collections endpoint failed: {url}" - - -def check_list_attributes(api_root, attribute_name): - url = f"{api_root}/list/attributes/{attribute_name}" - res = requests.get(url) - try: - server_answer = json.loads(res.content) - assert ( - "results" in server_answer - ), f"list/attributes/{attribute_name} missing 'results' field" - assert isinstance( - server_answer["results"], list - ), f"list/attributes/{attribute_name} 'results' should be a list" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"List attributes endpoint failed: {url}" +@pytest.mark.require_service +class TestAPI: + """GA4GH SeqCol compliance tests. 
Expects demo data loaded on the server.""" -def check_collection_structure(api_root, digest): - # Level 1: inherent attributes should be digest strings - level1 = requests.get(f"{api_root}/collection/{digest}?level=1").json() - for attr in ["names", "lengths", "sequences"]: - assert isinstance(level1[attr], str), f"Level 1 {attr} should be digest string" + # ---- Structure checks ---- - # Level 1 should include transient attribute - assert "sorted_name_length_pairs" in level1, "Level 1 missing sorted_name_length_pairs" + def test_service_info(self, api_root): + check_service_info(api_root) - # Level 2: inherent attributes should be arrays - level2 = requests.get(f"{api_root}/collection/{digest}?level=2").json() - for attr in ["names", "lengths", "sequences"]: - assert isinstance(level2[attr], list), f"Level 2 {attr} should be array" + def test_list_collections(self, api_root): + check_list_collections(api_root) - # Level 2 should NOT include transient attribute - assert ( - "sorted_name_length_pairs" not in level2 - ), "Level 2 should not have sorted_name_length_pairs" + @pytest.mark.parametrize("attribute_name", ["lengths", "names", "sequences"]) + def test_list_attributes(self, api_root, attribute_name): + check_list_attributes(api_root, attribute_name) + @pytest.mark.recommended + def test_openapi_available(self, api_root): + check_openapi_available(api_root) -def check_comparison_post(api_root, response_file, test_data_root): - with open(response_file) as fp: - correct_answer = json.load(fp) + # ---- Collection content checks ---- - # Get the local collection to POST - digest_b = correct_answer["digests"]["b"] - client = refget.SequenceCollectionClient(urls=[api_root]) - local_collection = client.get_collection(digest_b) + @pytest.mark.parametrize("fa_name, bundle", DIGEST_TESTS) + def test_collection_level1(self, api_root, fa_name, bundle): + check_collection_level1(api_root, fa_name, bundle) - # POST to compare with collection A on server - digest_a = correct_answer["digests"]["a"] - url = f"{api_root}/comparison/{digest_a}" - res = requests.post(url, json=local_collection) - try: - server_answer = json.loads(res.content) - # POST endpoint returns "POSTed seqcol" for digest b since it doesn't know the digest - # So we compare everything except the digests.b field - assert ( - server_answer["digests"]["a"] == correct_answer["digests"]["a"] - ), f"Comparison POST: digest a mismatch" - assert ( - server_answer["attributes"] == correct_answer["attributes"] - ), f"Comparison POST: attributes mismatch" - assert ( - server_answer["array_elements"] == correct_answer["array_elements"] - ), f"Comparison POST: array_elements mismatch" - except json.decoder.JSONDecodeError: - print(f"Url: {url}") - assert False, f"Comparison POST endpoint failed: {url}" + @pytest.mark.parametrize("fa_name, bundle", DIGEST_TESTS) + def test_collection_level2(self, api_root, fa_name, bundle): + check_collection_level2(api_root, fa_name, bundle) + @pytest.mark.parametrize("fa_name, bundle", DIGEST_TESTS) + def test_default_level_returns_level2(self, api_root, fa_name, bundle): + check_default_level_returns_level2(api_root, fa_name, bundle) -@pytest.mark.require_service -class TestAPI: - print("Testing Compliance") + @pytest.mark.parametrize("fa_name, bundle", DIGEST_TESTS) + def test_sorted_name_length_pairs(self, api_root, fa_name, bundle): + check_sorted_name_length_pairs(api_root, fa_name, bundle) - @pytest.mark.parametrize("test_values", COLLECTION_TESTS) - def test_collection_endpoint(self, api_root, 
test_values, test_data_root): - print("Testing collection endpoint") - check_collection(api_root, *test_values, test_data_root) + # ---- Attribute checks ---- - @pytest.mark.parametrize("response_file", COMPARISON_TESTS) - def test_comparison_endpoint(self, api_root, response_file): - print("Testing comparison endpoint") - check_comparison(api_root, response_file) + @pytest.mark.parametrize("fa_name, bundle", DIGEST_TESTS) + @pytest.mark.parametrize("attr_name", ["lengths", "names", "sequences"]) + def test_attribute_retrieval(self, api_root, fa_name, bundle, attr_name): + check_attribute_retrieval(api_root, fa_name, bundle, attr_name) - @pytest.mark.parametrize("test_values", ATTRIBUTE_TESTS) - def test_attribute_endpoint(self, api_root, test_values): - check_attribute(api_root, *test_values) + def test_transient_attribute_not_served(self, api_root): + check_transient_attribute_not_served(api_root) - @pytest.mark.parametrize("test_values", ATTRIBUTE_LIST_TESTS) - def test_attribute_list_endpoint(self, api_root, test_values): - check_list_collections_by_attribute(api_root, *test_values) + # ---- List/filter checks ---- - def test_service_info_endpoint(self, api_root): - check_service_info(api_root) + @pytest.mark.parametrize("attr_name", ["lengths", "names", "sequences"]) + def test_list_filter_by_attribute(self, api_root, attr_name): + fa_name, bundle = DIGEST_TESTS[0] + check_list_filter_by_attribute(api_root, fa_name, bundle, attr_name) - def test_list_collections_endpoint(self, api_root): - check_list_collections(api_root) + def test_multi_attribute_filter_and(self, api_root): + check_list_multi_attribute_filter_and(api_root) - @pytest.mark.parametrize("attribute_name", ["lengths", "names", "sequences"]) - def test_list_attributes_endpoint(self, api_root, attribute_name): - check_list_attributes(api_root, attribute_name) + # ---- Comparison checks ---- - @pytest.mark.parametrize("response_file", COMPARISON_TESTS) - def test_comparison_post_endpoint(self, api_root, response_file, test_data_root): - check_comparison_post(api_root, response_file, test_data_root) + def test_comparison_structure(self, api_root): + check_comparison_structure(api_root) - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_collection_structure(self, api_root, fa_file, fa_digest_bundle): - digest = fa_digest_bundle["top_level_digest"] - check_collection_structure(api_root, digest) + def test_comparison_same_order_values(self, api_root): + check_comparison_same_order_values(api_root) - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_collections(self, api_root, fa_file, fa_digest_bundle): - client = refget.SequenceCollectionClient(urls=[api_root]) - digest = fa_digest_bundle["top_level_digest"] - srv_response = client.get_collection(digest, level=1) - print("Server response:", srv_response) + @pytest.mark.parametrize( + "fixture_name, expected", + list(COMPARISON_FIXTURES.items()), + ids=list(COMPARISON_FIXTURES.keys()), + ) + def test_comparison(self, api_root, fixture_name, expected): + check_comparison(api_root, fixture_name, expected) - @pytest.mark.snlp - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_sorted_name_length_pairs(self, api_root, fa_file, fa_digest_bundle): - client = refget.SequenceCollectionClient(urls=[api_root]) - digest = fa_digest_bundle["top_level_digest"] - srv_response = client.get_collection(digest, level=1) - assert ( - srv_response["sorted_name_length_pairs"] - == 
fa_digest_bundle["sorted_name_length_pairs_digest"] - ), f"Collection endpoint failed: sorted_name_length_pairs mismatch for {demo_file}" + @pytest.mark.parametrize( + "fixture_name, expected", + list(COMPARISON_FIXTURES.items()), + ids=list(COMPARISON_FIXTURES.keys()), + ) + def test_comparison_post(self, api_root, fixture_name, expected): + check_comparison_post(api_root, fixture_name, expected) diff --git a/tests/conftest.py b/tests/conftest.py index 093814d..536bc8c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -64,11 +64,18 @@ def invoke(*args): TEST_DATA_DIR = Path(__file__).parent.parent / "test_fasta" BASE_FASTA = TEST_DATA_DIR / "base.fa" + + +@pytest.fixture(scope="session") +def test_data_root(): + """Provides the absolute path to the test_fasta directory.""" + return TEST_DATA_DIR DIFFERENT_NAMES_FASTA = TEST_DATA_DIR / "different_names.fa" DIFFERENT_ORDER_FASTA = TEST_DATA_DIR / "different_order.fa" PAIR_SWAP_FASTA = TEST_DATA_DIR / "pair_swap.fa" SUBSET_FASTA = TEST_DATA_DIR / "subset.fa" SWAP_WO_COORDS_FASTA = TEST_DATA_DIR / "swap_wo_coords.fa" +SAMPLE_FHR_JSON = TEST_DATA_DIR / "sample_fhr.json" # ============================================================ @@ -246,6 +253,8 @@ def pytest_configure(config): config.addinivalue_line("markers", "requires_network: mark test as requiring network access") config.addinivalue_line("markers", "requires_db: mark test as requiring database access") config.addinivalue_line("markers", "slow: mark test as slow running") + config.addinivalue_line("markers", "recommended: mark test as RECOMMENDED (not REQUIRED) by GA4GH spec") + config.addinivalue_line("markers", "require_service: mark test as requiring a running seqcol service") def pytest_collection_modifyitems(config, items): @@ -273,3 +282,13 @@ def pytest_collection_modifyitems(config, items): for item in items: if "requires_db" in item.keywords: item.add_marker(skip_db) + + # Skip require_service tests if no api_root or test_server available + api_root = config.getoption("api_root") + if api_root is None: + skip_service = pytest.mark.skip(reason="No --api-root provided and not running via integration test_server") + for item in items: + if "require_service" in item.keywords: + # Only skip if this is the base TestAPI class, not a subclass with test_server + if "TestAPI" in item.nodeid and "TestComplianceViaIntegration" not in item.nodeid: + item.add_marker(skip_service) diff --git a/tests/integration/test_compliance_integration.py b/tests/integration/test_compliance_integration.py deleted file mode 100644 index 76773d1..0000000 --- a/tests/integration/test_compliance_integration.py +++ /dev/null @@ -1,260 +0,0 @@ -""" -Compliance tests running against the integration test server. - -These tests verify the API responses match expected fixtures, -using the ephemeral Docker PostgreSQL + test server infrastructure. 
-""" - -import json -import pytest -import requests -from pathlib import Path - -from tests.conftest import DIGEST_TESTS - - -class TestComplianceStructure: - """Test response structure matches GA4GH spec.""" - - def test_service_info_structure(self, test_server): - """Service-info has required GA4GH fields""" - res = requests.get(f"{test_server}/service-info") - assert res.status_code == 200 - data = res.json() - # GA4GH service-info required fields - assert "id" in data - assert "type" in data - assert "group" in data["type"] - assert "artifact" in data["type"] - assert "version" in data["type"] - - def test_service_info_seqcol_schema(self, test_server): - """Service-info MUST include seqcol.schema (GA4GH spec requirement)""" - res = requests.get(f"{test_server}/service-info") - assert res.status_code == 200 - data = res.json() - # Spec: service-info MUST return the JSON Schema implemented by the server - assert "seqcol" in data, "service-info must have 'seqcol' section" - assert "schema" in data["seqcol"], "seqcol section must include 'schema'" - schema = data["seqcol"]["schema"] - # Schema should define the required attributes - assert "properties" in schema, "schema must have 'properties'" - assert "lengths" in schema["properties"], "schema must define 'lengths'" - assert "names" in schema["properties"], "schema must define 'names'" - assert "sequences" in schema["properties"], "schema must define 'sequences'" - - def test_list_collections_structure(self, test_server): - """List collections has pagination structure per GA4GH paging guide""" - res = requests.get(f"{test_server}/list/collection") - assert res.status_code == 200 - data = res.json() - assert "results" in data - assert isinstance(data["results"], list) - assert "pagination" in data - assert "page" in data["pagination"] - assert "page_size" in data["pagination"] - assert "total" in data["pagination"], "pagination must include 'total' per GA4GH spec" - - def test_list_collections_filter_by_attribute(self, test_server): - """List collections filtered by attribute digest (REQUIRED by spec)""" - # Use base.fa's names digest to filter - names_digest = DIGEST_TESTS[0][1]["level1"]["names"] - res = requests.get(f"{test_server}/list/collection?names={names_digest}") - assert res.status_code == 200 - data = res.json() - assert "results" in data - # Should return only collections with this exact names digest - # base.fa has this names digest - assert DIGEST_TESTS[0][1]["top_level_digest"] in data["results"] - - -class TestAttributeEndpoint: - """Test /attribute/collection/:attr/:digest endpoint (REQUIRED by spec).""" - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_attribute_lengths(self, test_server, fa_file, fa_digest_bundle): - """Retrieve lengths attribute by its digest""" - lengths_digest = fa_digest_bundle["level1"]["lengths"] - expected_lengths = fa_digest_bundle["level2"]["lengths"] - res = requests.get(f"{test_server}/attribute/collection/lengths/{lengths_digest}") - assert res.status_code == 200 - data = res.json() - assert data == expected_lengths - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_attribute_names(self, test_server, fa_file, fa_digest_bundle): - """Retrieve names attribute by its digest""" - names_digest = fa_digest_bundle["level1"]["names"] - expected_names = fa_digest_bundle["level2"]["names"] - res = requests.get(f"{test_server}/attribute/collection/names/{names_digest}") - assert res.status_code == 200 - data = res.json() - assert data == 
expected_names - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_attribute_sequences(self, test_server, fa_file, fa_digest_bundle): - """Retrieve sequences attribute by its digest""" - sequences_digest = fa_digest_bundle["level1"]["sequences"] - expected_sequences = fa_digest_bundle["level2"]["sequences"] - res = requests.get(f"{test_server}/attribute/collection/sequences/{sequences_digest}") - assert res.status_code == 200 - data = res.json() - assert data == expected_sequences - - def test_attribute_not_found(self, test_server): - """Non-existent attribute digest returns 404""" - res = requests.get(f"{test_server}/attribute/collection/names/nonexistent_digest_12345") - assert res.status_code == 404 - - -class TestCollectionLevels: - """Test collection level 1 vs level 2 response formats.""" - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_default_level_returns_level2(self, test_server, fa_file, fa_digest_bundle): - """Collection without ?level= param returns level 2 (spec default)""" - digest = fa_digest_bundle["top_level_digest"] - res = requests.get(f"{test_server}/collection/{digest}") - assert res.status_code == 200 - data = res.json() - # Level 2 returns arrays, not digest strings - for attr in ["names", "lengths", "sequences"]: - assert isinstance(data[attr], list), f"Default should return level 2 (arrays)" - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_level1_returns_digests(self, test_server, fa_file, fa_digest_bundle): - """Level 1 returns digest strings for attributes""" - digest = fa_digest_bundle["top_level_digest"] - res = requests.get(f"{test_server}/collection/{digest}?level=1") - assert res.status_code == 200 - data = res.json() - for attr in ["names", "lengths", "sequences"]: - assert isinstance(data[attr], str), f"Level 1 {attr} should be digest string" - # Transient attribute present in level 1 - assert "sorted_name_length_pairs" in data - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_level2_returns_arrays(self, test_server, fa_file, fa_digest_bundle): - """Level 2 returns arrays for attributes""" - digest = fa_digest_bundle["top_level_digest"] - res = requests.get(f"{test_server}/collection/{digest}?level=2") - assert res.status_code == 200 - data = res.json() - for attr in ["names", "lengths", "sequences"]: - assert isinstance(data[attr], list), f"Level 2 {attr} should be array" - # Transient attribute NOT in level 2 - assert "sorted_name_length_pairs" not in data - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_sorted_name_length_pairs_digest(self, test_server, fa_file, fa_digest_bundle): - """Level 1 sorted_name_length_pairs matches expected digest""" - digest = fa_digest_bundle["top_level_digest"] - res = requests.get(f"{test_server}/collection/{digest}?level=1") - assert res.status_code == 200 - data = res.json() - assert ( - data["sorted_name_length_pairs"] == fa_digest_bundle["sorted_name_length_pairs_digest"] - ) - - -class TestComparison: - """Test comparison endpoint responses.""" - - def test_compare_identical(self, test_server): - """Comparing collection to itself returns expected structure""" - # Use base.fa digest - digest = DIGEST_TESTS[0][1]["top_level_digest"] - res = requests.get(f"{test_server}/comparison/{digest}/{digest}") - assert res.status_code == 200 - data = res.json() - assert "digests" in data - assert data["digests"]["a"] == digest - assert data["digests"]["b"] == digest 
- assert "attributes" in data - assert "array_elements" in data - - def test_compare_different(self, test_server): - """Comparing different collections returns diff structure""" - digest_a = DIGEST_TESTS[0][1]["top_level_digest"] # base.fa - digest_b = DIGEST_TESTS[1][1]["top_level_digest"] # different_names.fa - res = requests.get(f"{test_server}/comparison/{digest_a}/{digest_b}") - assert res.status_code == 200 - data = res.json() - assert data["digests"]["a"] == digest_a - assert data["digests"]["b"] == digest_b - assert "a_and_b" in data["attributes"] - - def test_compare_full_structure(self, test_server): - """Comparison returns complete structure per spec""" - digest_a = DIGEST_TESTS[0][1]["top_level_digest"] # base.fa - digest_b = DIGEST_TESTS[1][1]["top_level_digest"] # different_names.fa - res = requests.get(f"{test_server}/comparison/{digest_a}/{digest_b}") - assert res.status_code == 200 - data = res.json() - # Verify digests structure - assert "digests" in data - assert "a" in data["digests"] - assert "b" in data["digests"] - # Verify attributes structure - assert "attributes" in data - assert "a_only" in data["attributes"] - assert "b_only" in data["attributes"] - assert "a_and_b" in data["attributes"] - # Verify array_elements structure - assert "array_elements" in data - assert "a_count" in data["array_elements"] - assert "b_count" in data["array_elements"] - assert "a_and_b_count" in data["array_elements"] - assert "a_and_b_same_order" in data["array_elements"] - - def test_compare_post_with_seqcol_body(self, test_server): - """POST comparison with local seqcol in body (RECOMMENDED by spec)""" - digest_a = DIGEST_TESTS[0][1]["top_level_digest"] # base.fa on server - # POST the level 2 representation of different_names.fa - seqcol_b = DIGEST_TESTS[1][1]["level2"] - res = requests.post( - f"{test_server}/comparison/{digest_a}", - json=seqcol_b, - ) - assert res.status_code == 200 - data = res.json() - assert "digests" in data - assert data["digests"]["a"] == digest_a - # b digest may be computed or null per spec - assert "attributes" in data - assert "array_elements" in data - - def test_compare_with_fixtures(self, test_server): - """Comparison results match fixture files""" - # Test base.fa vs different_names.fa comparison - with open("tests/api/comparison/compare_base.fa_different_names.fa.json") as f: - expected = json.load(f) - - res = requests.get( - f"{test_server}/comparison/{expected['digests']['a']}/{expected['digests']['b']}" - ) - assert res.status_code == 200 - data = res.json() - assert data["digests"] == expected["digests"] - assert data["attributes"] == expected["attributes"] - assert data["array_elements"] == expected["array_elements"] - - -class TestCollectionContent: - """Test collection content matches fixtures.""" - - @pytest.mark.parametrize("fa_file, fa_digest_bundle", DIGEST_TESTS) - def test_collection_content(self, test_server, fa_file, fa_digest_bundle): - """Collection arrays match expected values from digests file""" - digest = fa_digest_bundle["top_level_digest"] - expected = fa_digest_bundle["level2"] - res = requests.get(f"{test_server}/collection/{digest}?level=2") - assert res.status_code == 200 - data = res.json() - - # Verify lengths match - assert data["lengths"] == expected["lengths"] - # Verify names match - assert data["names"] == expected["names"] - # Verify sequence digests match - assert data["sequences"] == expected["sequences"] diff --git a/tests/integration/test_run_compliance.py b/tests/integration/test_run_compliance.py new file 
mode 100644 index 0000000..d240024 --- /dev/null +++ b/tests/integration/test_run_compliance.py @@ -0,0 +1,18 @@ +"""Run the standalone compliance suite against the integration test server.""" + +import pytest +from tests.api.test_compliance import TestAPI + + +@pytest.mark.require_service +class TestComplianceViaIntegration(TestAPI): + """Run compliance tests against integration test server. + + Inherits all tests from TestAPI but provides api_root from + the integration test_server fixture instead of --api-root CLI option. + """ + + @pytest.fixture + def api_root(self, test_server): + """Map test_server fixture to api_root for compliance tests.""" + return test_server diff --git a/tests/test_cli/test_store_commands.py b/tests/test_cli/test_store_commands.py index a1a4522..120a53d 100644 --- a/tests/test_cli/test_store_commands.py +++ b/tests/test_cli/test_store_commands.py @@ -13,6 +13,7 @@ BASE_FASTA, DIFFERENT_NAMES_FASTA, DIFFERENT_ORDER_FASTA, + SAMPLE_FHR_JSON, TEST_FASTA_DIGESTS, assert_json_output, ) @@ -480,3 +481,152 @@ def test_add_to_nonexistent_store(self, cli, tmp_path): result = cli("store", "add", str(BASE_FASTA), "--path", str(nonexistent)) assert result.exit_code != 0 + + +def _setup_store_with_fasta(cli, tmp_path): + """Initialize a store, add BASE_FASTA, and return (store_path, digest).""" + store_path = tmp_path / "store" + cli("store", "init", "--path", str(store_path)) + add_result = cli("store", "add", str(BASE_FASTA), "--path", str(store_path)) + digest = json.loads(add_result.stdout)["digest"] + return store_path, digest + + +class TestStoreMetadata: + """Tests for: refget store metadata / metadata-set""" + + def test_metadata_no_fhr_set(self, cli, tmp_path): + """Error when no FHR metadata exists for a collection.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + result = cli("store", "metadata", digest, "--path", str(store_path)) + + assert result.exit_code != 0 + assert "No FHR metadata" in result.stdout + + def test_metadata_set_from_json_file(self, cli, tmp_path): + """Happy path: set FHR metadata from a JSON file.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + result = cli( + "store", "metadata-set", digest, str(SAMPLE_FHR_JSON), + "--path", str(store_path), + ) + + assert result.exit_code == 0 + assert "Set FHR metadata for collection" in result.stdout + + def test_metadata_read_after_set(self, cli, tmp_path): + """Round-trip: set metadata then read it back.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + cli( + "store", "metadata-set", digest, str(SAMPLE_FHR_JSON), + "--path", str(store_path), + ) + + result = cli("store", "metadata", digest, "--path", str(store_path)) + + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data["genome"] == "Test organism" + assert data["version"] == "v1.0" + assert data["masking"] == "soft-masked" + assert "test_v1" in data["genomeSynonym"] + + def test_metadata_output_is_valid_json(self, cli, tmp_path): + """Output is valid JSON with camelCase keys per FHR spec.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + cli( + "store", "metadata-set", digest, str(SAMPLE_FHR_JSON), + "--path", str(store_path), + ) + + result = cli("store", "metadata", digest, "--path", str(store_path)) + + assert result.exit_code == 0 + data = json.loads(result.stdout) + + # Verify camelCase keys from the FHR spec + assert "schemaVersion" in data + assert "genomeSynonym" in data + assert "dateCreated" in data + + # Verify no snake_case keys 
leaked through + raw = result.stdout + assert "schema_version" not in raw + assert "genome_synonym" not in raw + assert "date_created" not in raw + + def test_metadata_set_nonexistent_file(self, cli, tmp_path): + """Error when JSON file does not exist.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + result = cli( + "store", "metadata-set", digest, "/nonexistent/fhr.json", + "--path", str(store_path), + ) + + assert result.exit_code != 0 + + def test_metadata_nonexistent_digest(self, cli, tmp_path): + """Error when reading metadata for a nonexistent digest.""" + store_path = tmp_path / "store" + cli("store", "init", "--path", str(store_path)) + + result = cli( + "store", "metadata", "nonexistent_digest_123", + "--path", str(store_path), + ) + + assert result.exit_code != 0 + + def test_metadata_set_then_overwrite(self, cli, tmp_path): + """Overwriting metadata replaces the previous values.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + # Set original metadata + cli( + "store", "metadata-set", digest, str(SAMPLE_FHR_JSON), + "--path", str(store_path), + ) + + # Create updated FHR JSON + updated_fhr = tmp_path / "updated_fhr.json" + updated_fhr.write_text(json.dumps({ + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1.0, + "genome": "Updated organism", + "version": "v2.0", + })) + + # Overwrite + cli( + "store", "metadata-set", digest, str(updated_fhr), + "--path", str(store_path), + ) + + result = cli("store", "metadata", digest, "--path", str(store_path)) + + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data["genome"] == "Updated organism" + + def test_metadata_removed_with_collection(self, cli, tmp_path): + """Metadata sidecar is cleaned up when the collection is removed.""" + store_path, digest = _setup_store_with_fasta(cli, tmp_path) + + # Set metadata + cli( + "store", "metadata-set", digest, str(SAMPLE_FHR_JSON), + "--path", str(store_path), + ) + + # Remove the collection + cli("store", "remove", digest, "--path", str(store_path)) + + # Metadata should be gone + result = cli("store", "metadata", digest, "--path", str(store_path)) + + assert result.exit_code != 0 diff --git a/tests/test_cli/test_store_pull.py b/tests/test_cli/test_store_pull.py new file mode 100644 index 0000000..c90cd72 --- /dev/null +++ b/tests/test_cli/test_store_pull.py @@ -0,0 +1,400 @@ +# tests/test_cli/test_store_pull.py + +"""Tests for refget store pull CLI command. + +Note: The HTTP server fixtures use subprocess instead of threading because +gtars' open_remote (Rust/PyO3) holds the GIL during HTTP requests, which +would deadlock a Python-thread-based HTTP server. 
+""" + +import json +import os +import signal +import socket +import subprocess +import sys +import time +from pathlib import Path + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from conftest import ( + BASE_FASTA, + DIFFERENT_NAMES_FASTA, +) + +# Skip entire module if gtars is not installed +pytest.importorskip("gtars") + + +def _find_free_port() -> int: + """Find a free port on localhost.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +def _start_http_server(directory: str, port: int) -> subprocess.Popen: + """Start an HTTP server as a subprocess serving the given directory.""" + proc = subprocess.Popen( + [sys.executable, "-m", "http.server", str(port), "--directory", directory], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + # Wait for server to be ready + max_wait = 5.0 + start_time = time.time() + while time.time() - start_time < max_wait: + try: + with socket.create_connection(("127.0.0.1", port), timeout=0.1): + break + except (ConnectionRefusedError, OSError): + time.sleep(0.1) + else: + proc.terminate() + raise RuntimeError(f"HTTP server failed to start on port {port}") + return proc + + +def _stop_http_server(proc: subprocess.Popen) -> None: + """Stop an HTTP server subprocess.""" + proc.terminate() + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + + +@pytest.fixture +def remote_store_server(cli, tmp_path): + """Set up a local store, serve it over HTTP, yield (url, digest, source_store_path).""" + source_store = tmp_path / "source_store" + cli("store", "init", "--path", str(source_store)) + add_result = cli("store", "add", str(BASE_FASTA), "--path", str(source_store)) + assert add_result.exit_code == 0, f"Failed to add FASTA: {add_result.stdout}" + digest = json.loads(add_result.stdout)["digest"] + + port = _find_free_port() + proc = _start_http_server(str(source_store), port) + + yield f"http://127.0.0.1:{port}", digest, source_store + + _stop_http_server(proc) + + +@pytest.fixture +def multi_remote_store_server(cli, tmp_path): + """Set up a local store with multiple FASTAs, serve over HTTP.""" + source_store = tmp_path / "multi_source_store" + cli("store", "init", "--path", str(source_store)) + + add_result1 = cli("store", "add", str(BASE_FASTA), "--path", str(source_store)) + assert add_result1.exit_code == 0 + digest1 = json.loads(add_result1.stdout)["digest"] + + add_result2 = cli("store", "add", str(DIFFERENT_NAMES_FASTA), "--path", str(source_store)) + assert add_result2.exit_code == 0 + digest2 = json.loads(add_result2.stdout)["digest"] + + port = _find_free_port() + proc = _start_http_server(str(source_store), port) + + yield f"http://127.0.0.1:{port}", digest1, digest2, source_store + + _stop_http_server(proc) + + +@pytest.fixture +def local_store(cli, tmp_path): + """Initialize an empty local store for pulling into.""" + store_path = tmp_path / "local_store" + result = cli("store", "init", "--path", str(store_path)) + assert result.exit_code == 0 + return store_path + + +class TestStorePullBasic: + """Core pull functionality tests.""" + + def test_pull_single_digest(self, cli, tmp_path, remote_store_server): + """Pull a known digest from the remote store server.""" + server_url, digest, _ = remote_store_server + local_store = tmp_path / "pull_store" + cli("store", "init", "--path", str(local_store)) + + result = cli("store", "pull", digest, "--server", server_url, "--path", str(local_store)) 
+ + assert result.exit_code == 0, f"Pull failed: {result.stdout}" + data = json.loads(result.stdout) + assert data["status"] == "pulled" + assert data["digest"] == digest + + def test_pull_creates_local_cache(self, cli, tmp_path, remote_store_server): + """After pulling, the .remote_cache directory is created.""" + server_url, digest, _ = remote_store_server + local_store = tmp_path / "cache_store" + cli("store", "init", "--path", str(local_store)) + + result = cli("store", "pull", digest, "--server", server_url, "--path", str(local_store)) + + assert result.exit_code == 0 + cache_dir = local_store / ".remote_cache" + assert cache_dir.exists() + + def test_pull_quiet_flag(self, cli, tmp_path, remote_store_server): + """Pull with --quiet suppresses progress output.""" + server_url, digest, _ = remote_store_server + local_store = tmp_path / "quiet_store" + cli("store", "init", "--path", str(local_store)) + + result = cli( + "store", "pull", digest, "--server", server_url, "--path", str(local_store), "--quiet" + ) + + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data["status"] == "pulled" + + +class TestStorePullEager: + """Eager sequence fetching tests.""" + + def test_pull_eager_fetches_sequences(self, cli, tmp_path, remote_store_server): + """Pull with --eager pre-fetches all sequences.""" + server_url, digest, _ = remote_store_server + local_store = tmp_path / "eager_store" + cli("store", "init", "--path", str(local_store)) + + result = cli( + "store", "pull", digest, "--server", server_url, + "--path", str(local_store), "--eager" + ) + + assert result.exit_code == 0, f"Eager pull failed: {result.stdout}" + data = json.loads(result.stdout) + assert data["eager"] is True + assert data["sequences_fetched"] > 0 + + def test_pull_default_is_lazy(self, cli, tmp_path, remote_store_server): + """Pull without --eager uses lazy mode.""" + server_url, digest, _ = remote_store_server + local_store = tmp_path / "lazy_store" + cli("store", "init", "--path", str(local_store)) + + result = cli("store", "pull", digest, "--server", server_url, "--path", str(local_store)) + + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data["eager"] is False + assert "sequences_fetched" not in data + + +class TestStorePullBatch: + """Batch pull via --file tests.""" + + def test_pull_from_file(self, cli, tmp_path, multi_remote_store_server): + """Pull multiple digests from a file.""" + server_url, digest1, digest2, _ = multi_remote_store_server + local_store = tmp_path / "batch_store" + cli("store", "init", "--path", str(local_store)) + + digest_file = tmp_path / "digests.txt" + digest_file.write_text(f"{digest1}\n{digest2}\n") + + result = cli( + "store", "pull", "--file", str(digest_file), + "--server", server_url, "--path", str(local_store) + ) + + assert result.exit_code == 0, f"Batch pull failed: {result.stdout}" + data = json.loads(result.stdout) + assert "results" in data + assert len(data["results"]) == 2 + + def test_pull_file_with_blank_lines(self, cli, tmp_path, remote_store_server): + """File with blank lines and whitespace is handled gracefully.""" + server_url, digest, _ = remote_store_server + local_store = tmp_path / "blank_store" + cli("store", "init", "--path", str(local_store)) + + digest_file = tmp_path / "digests_blanks.txt" + digest_file.write_text(f"\n \n{digest}\n\n \n") + + result = cli( + "store", "pull", "--file", str(digest_file), + "--server", server_url, "--path", str(local_store) + ) + + assert result.exit_code == 0 + data = 
json.loads(result.stdout) + # Single digest after stripping blanks, so no "results" wrapper + assert data["digest"] == digest + assert data["status"] == "pulled" + + def test_pull_file_not_found(self, cli, tmp_path): + """Passing a nonexistent file to --file returns error.""" + local_store = tmp_path / "nofile_store" + cli("store", "init", "--path", str(local_store)) + + result = cli( + "store", "pull", "--file", "/nonexistent/digests.txt", + "--server", "http://127.0.0.1:1", "--path", str(local_store) + ) + + assert result.exit_code != 0 + + def test_pull_empty_file(self, cli, tmp_path, remote_store_server): + """Empty file returns error about no digests.""" + server_url, _, _ = remote_store_server + local_store = tmp_path / "empty_file_store" + cli("store", "init", "--path", str(local_store)) + + digest_file = tmp_path / "empty.txt" + digest_file.write_text("") + + result = cli( + "store", "pull", "--file", str(digest_file), + "--server", server_url, "--path", str(local_store) + ) + + assert result.exit_code != 0 + + +class TestStorePullAlreadyLocal: + """Skip already-cached collections.""" + + def test_pull_already_local(self, cli, tmp_path, remote_store_server): + """Pulling a digest that exists locally returns already_local status.""" + server_url, digest, _ = remote_store_server + local_store = tmp_path / "already_store" + cli("store", "init", "--path", str(local_store)) + + # Add the same FASTA to local store + cli("store", "add", str(BASE_FASTA), "--path", str(local_store)) + + # Try to pull -- should detect it is already local + result = cli("store", "pull", digest, "--server", server_url, "--path", str(local_store)) + + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data["status"] == "already_local" + + +class TestStorePullErrors: + """Error case tests.""" + + def test_pull_nonexistent_digest(self, cli, tmp_path, remote_store_server): + """Pull a digest that does not exist on the remote.""" + server_url, _, _ = remote_store_server + local_store = tmp_path / "nonexist_store" + cli("store", "init", "--path", str(local_store)) + + result = cli( + "store", "pull", "NONEXISTENT_DIGEST_12345678901234", + "--server", server_url, "--path", str(local_store) + ) + + assert result.exit_code != 0 + data = json.loads(result.stdout) + assert data["status"] == "not_found" + + def test_pull_unreachable_server(self, cli, tmp_path): + """Pull from an unreachable URL returns error.""" + local_store = tmp_path / "unreach_store" + cli("store", "init", "--path", str(local_store)) + + result = cli( + "store", "pull", "some_digest_abc123", + "--server", "http://127.0.0.1:1", "--path", str(local_store) + ) + + assert result.exit_code != 0 + + def test_pull_no_digest_or_file(self, cli, tmp_path): + """Pull with neither digest nor --file returns error.""" + local_store = tmp_path / "noarg_store" + cli("store", "init", "--path", str(local_store)) + + result = cli( + "store", "pull", + "--server", "http://127.0.0.1:1", "--path", str(local_store) + ) + + assert result.exit_code != 0 + + def test_pull_both_digest_and_file(self, cli, tmp_path): + """Pull with both digest and --file returns error.""" + local_store = tmp_path / "both_store" + cli("store", "init", "--path", str(local_store)) + + digest_file = tmp_path / "digests.txt" + digest_file.write_text("some_digest\n") + + result = cli( + "store", "pull", "some_digest", + "--file", str(digest_file), + "--server", "http://127.0.0.1:1", "--path", str(local_store) + ) + + assert result.exit_code != 0 + + def 
test_pull_no_server_configured(self, cli, tmp_path, monkeypatch): + """Pull without --server and no configured remotes returns error.""" + local_store = tmp_path / "noserver_store" + cli("store", "init", "--path", str(local_store)) + + # Patch _find_remote_urls to return empty list + monkeypatch.setattr( + "refget.cli.store._find_remote_urls", + lambda server_override=None: [] + ) + + result = cli( + "store", "pull", "some_digest", + "--path", str(local_store) + ) + + assert result.exit_code != 0 + + +class TestStorePullMultipleRemotes: + """Fallback across multiple remotes.""" + + def test_pull_tries_next_remote_on_failure( + self, cli, tmp_path, remote_store_server, monkeypatch + ): + """When first remote lacks the digest, tries the next one.""" + server_url, digest, _ = remote_store_server + + # Set up an empty store served over HTTP (first remote) + empty_store = tmp_path / "empty_remote" + cli("store", "init", "--path", str(empty_store)) + + port = _find_free_port() + empty_proc = _start_http_server(str(empty_store), port) + empty_url = f"http://127.0.0.1:{port}" + + try: + local_store = tmp_path / "multi_remote_store" + cli("store", "init", "--path", str(local_store)) + + # Patch to return empty server first, then the populated one + monkeypatch.setattr( + "refget.cli.store._find_remote_urls", + lambda server_override=None: [empty_url, server_url] + ) + + result = cli("store", "pull", digest, "--path", str(local_store), "--quiet") + + assert result.exit_code == 0, f"Multi-remote pull failed: {result.stdout}" + # Extract JSON from output (error messages from failed remotes may precede it) + stdout = result.stdout + json_start = stdout.rfind("{") + assert json_start >= 0, f"No JSON found in output: {stdout}" + data = json.loads(stdout[json_start:]) + assert data["status"] == "pulled" + assert data["source"] == server_url + finally: + _stop_http_server(empty_proc) From 286fbabb388ac4a7ce4441443e00cf01cb64bc64 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 27 Feb 2026 05:24:38 +0000 Subject: [PATCH 10/31] Bump rollup from 4.35.0 to 4.59.0 in /frontend Bumps [rollup](https://github.com/rollup/rollup) from 4.35.0 to 4.59.0. - [Release notes](https://github.com/rollup/rollup/releases) - [Changelog](https://github.com/rollup/rollup/blob/master/CHANGELOG.md) - [Commits](https://github.com/rollup/rollup/compare/v4.35.0...v4.59.0) --- updated-dependencies: - dependency-name: rollup dependency-version: 4.59.0 dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- frontend/package-lock.json | 274 +++++++++++++++++++++++-------------- 1 file changed, 175 insertions(+), 99 deletions(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 7a78b74..ddfcfe8 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -981,9 +981,9 @@ } }, "node_modules/@rollup/rollup-android-arm-eabi": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.35.0.tgz", - "integrity": "sha512-uYQ2WfPaqz5QtVgMxfN6NpLD+no0MYHDBywl7itPYd3K5TjjSghNKmX8ic9S8NU8w81NVhJv/XojcHptRly7qQ==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.59.0.tgz", + "integrity": "sha512-upnNBkA6ZH2VKGcBj9Fyl9IGNPULcjXRlg0LLeaioQWueH30p6IXtJEbKAgvyv+mJaMxSm1l6xwDXYjpEMiLMg==", "cpu": [ "arm" ], @@ -995,9 +995,9 @@ ] }, "node_modules/@rollup/rollup-android-arm64": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.35.0.tgz", - "integrity": "sha512-FtKddj9XZudurLhdJnBl9fl6BwCJ3ky8riCXjEw3/UIbjmIY58ppWwPEvU3fNu+W7FUsAsB1CdH+7EQE6CXAPA==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.59.0.tgz", + "integrity": "sha512-hZ+Zxj3SySm4A/DylsDKZAeVg0mvi++0PYVceVyX7hemkw7OreKdCvW2oQ3T1FMZvCaQXqOTHb8qmBShoqk69Q==", "cpu": [ "arm64" ], @@ -1009,9 +1009,9 @@ ] }, "node_modules/@rollup/rollup-darwin-arm64": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.35.0.tgz", - "integrity": "sha512-Uk+GjOJR6CY844/q6r5DR/6lkPFOw0hjfOIzVx22THJXMxktXG6CbejseJFznU8vHcEBLpiXKY3/6xc+cBm65Q==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.59.0.tgz", + "integrity": "sha512-W2Psnbh1J8ZJw0xKAd8zdNgF9HRLkdWwwdWqubSVk0pUuQkoHnv7rx4GiF9rT4t5DIZGAsConRE3AxCdJ4m8rg==", "cpu": [ "arm64" ], @@ -1023,9 +1023,9 @@ ] }, "node_modules/@rollup/rollup-darwin-x64": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.35.0.tgz", - "integrity": "sha512-3IrHjfAS6Vkp+5bISNQnPogRAW5GAV1n+bNCrDwXmfMHbPl5EhTmWtfmwlJxFRUCBZ+tZ/OxDyU08aF6NI/N5Q==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.59.0.tgz", + "integrity": "sha512-ZW2KkwlS4lwTv7ZVsYDiARfFCnSGhzYPdiOU4IM2fDbL+QGlyAbjgSFuqNRbSthybLbIJ915UtZBtmuLrQAT/w==", "cpu": [ "x64" ], @@ -1037,9 +1037,9 @@ ] }, "node_modules/@rollup/rollup-freebsd-arm64": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.35.0.tgz", - "integrity": "sha512-sxjoD/6F9cDLSELuLNnY0fOrM9WA0KrM0vWm57XhrIMf5FGiN8D0l7fn+bpUeBSU7dCgPV2oX4zHAsAXyHFGcQ==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.59.0.tgz", + "integrity": "sha512-EsKaJ5ytAu9jI3lonzn3BgG8iRBjV4LxZexygcQbpiU0wU0ATxhNVEpXKfUa0pS05gTcSDMKpn3Sx+QB9RlTTA==", "cpu": [ "arm64" ], @@ -1051,9 +1051,9 @@ ] }, "node_modules/@rollup/rollup-freebsd-x64": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.35.0.tgz", - "integrity": "sha512-2mpHCeRuD1u/2kruUiHSsnjWtHjqVbzhBkNVQ1aVD63CcexKVcQGwJ2g5VphOd84GvxfSvnnlEyBtQCE5hxVVw==", + "version": "4.59.0", 
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.59.0.tgz", + "integrity": "sha512-d3DuZi2KzTMjImrxoHIAODUZYoUUMsuUiY4SRRcJy6NJoZ6iIqWnJu9IScV9jXysyGMVuW+KNzZvBLOcpdl3Vg==", "cpu": [ "x64" ], @@ -1065,9 +1065,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm-gnueabihf": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.35.0.tgz", - "integrity": "sha512-mrA0v3QMy6ZSvEuLs0dMxcO2LnaCONs1Z73GUDBHWbY8tFFocM6yl7YyMu7rz4zS81NDSqhrUuolyZXGi8TEqg==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.59.0.tgz", + "integrity": "sha512-t4ONHboXi/3E0rT6OZl1pKbl2Vgxf9vJfWgmUoCEVQVxhW6Cw/c8I6hbbu7DAvgp82RKiH7TpLwxnJeKv2pbsw==", "cpu": [ "arm" ], @@ -1079,9 +1079,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm-musleabihf": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.35.0.tgz", - "integrity": "sha512-DnYhhzcvTAKNexIql8pFajr0PiDGrIsBYPRvCKlA5ixSS3uwo/CWNZxB09jhIapEIg945KOzcYEAGGSmTSpk7A==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.59.0.tgz", + "integrity": "sha512-CikFT7aYPA2ufMD086cVORBYGHffBo4K8MQ4uPS/ZnY54GKj36i196u8U+aDVT2LX4eSMbyHtyOh7D7Zvk2VvA==", "cpu": [ "arm" ], @@ -1093,9 +1093,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm64-gnu": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.35.0.tgz", - "integrity": "sha512-uagpnH2M2g2b5iLsCTZ35CL1FgyuzzJQ8L9VtlJ+FckBXroTwNOaD0z0/UF+k5K3aNQjbm8LIVpxykUOQt1m/A==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.59.0.tgz", + "integrity": "sha512-jYgUGk5aLd1nUb1CtQ8E+t5JhLc9x5WdBKew9ZgAXg7DBk0ZHErLHdXM24rfX+bKrFe+Xp5YuJo54I5HFjGDAA==", "cpu": [ "arm64" ], @@ -1107,9 +1107,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm64-musl": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.35.0.tgz", - "integrity": "sha512-XQxVOCd6VJeHQA/7YcqyV0/88N6ysSVzRjJ9I9UA/xXpEsjvAgDTgH3wQYz5bmr7SPtVK2TsP2fQ2N9L4ukoUg==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.59.0.tgz", + "integrity": "sha512-peZRVEdnFWZ5Bh2KeumKG9ty7aCXzzEsHShOZEFiCQlDEepP1dpUl/SrUNXNg13UmZl+gzVDPsiCwnV1uI0RUA==", "cpu": [ "arm64" ], @@ -1120,10 +1120,10 @@ "linux" ] }, - "node_modules/@rollup/rollup-linux-loongarch64-gnu": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.35.0.tgz", - "integrity": "sha512-5pMT5PzfgwcXEwOaSrqVsz/LvjDZt+vQ8RT/70yhPU06PTuq8WaHhfT1LW+cdD7mW6i/J5/XIkX/1tCAkh1W6g==", + "node_modules/@rollup/rollup-linux-loong64-gnu": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.59.0.tgz", + "integrity": "sha512-gbUSW/97f7+r4gHy3Jlup8zDG190AuodsWnNiXErp9mT90iCy9NKKU0Xwx5k8VlRAIV2uU9CsMnEFg/xXaOfXg==", "cpu": [ "loong64" ], @@ -1134,10 +1134,38 @@ "linux" ] }, - "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { - "version": "4.35.0", - "resolved": 
"https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.35.0.tgz", - "integrity": "sha512-c+zkcvbhbXF98f4CtEIP1EBA/lCic5xB0lToneZYvMeKu5Kamq3O8gqrxiYYLzlZH6E3Aq+TSW86E4ay8iD8EA==", + "node_modules/@rollup/rollup-linux-loong64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.59.0.tgz", + "integrity": "sha512-yTRONe79E+o0FWFijasoTjtzG9EBedFXJMl888NBEDCDV9I2wGbFFfJQQe63OijbFCUZqxpHz1GzpbtSFikJ4Q==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-gnu": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.59.0.tgz", + "integrity": "sha512-sw1o3tfyk12k3OEpRddF68a1unZ5VCN7zoTNtSn2KndUE+ea3m3ROOKRCZxEpmT9nsGnogpFP9x6mnLTCaoLkA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.59.0.tgz", + "integrity": "sha512-+2kLtQ4xT3AiIxkzFVFXfsmlZiG5FXYW7ZyIIvGA7Bdeuh9Z0aN4hVyXS/G1E9bTP/vqszNIN/pUKCk/BTHsKA==", "cpu": [ "ppc64" ], @@ -1149,9 +1177,23 @@ ] }, "node_modules/@rollup/rollup-linux-riscv64-gnu": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.35.0.tgz", - "integrity": "sha512-s91fuAHdOwH/Tad2tzTtPX7UZyytHIRR6V4+2IGlV0Cej5rkG0R61SX4l4y9sh0JBibMiploZx3oHKPnQBKe4g==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.59.0.tgz", + "integrity": "sha512-NDYMpsXYJJaj+I7UdwIuHHNxXZ/b/N2hR15NyH3m2qAtb/hHPA4g4SuuvrdxetTdndfj9b1WOmy73kcPRoERUg==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.59.0.tgz", + "integrity": "sha512-nLckB8WOqHIf1bhymk+oHxvM9D3tyPndZH8i8+35p/1YiVoVswPid2yLzgX7ZJP0KQvnkhM4H6QZ5m0LzbyIAg==", "cpu": [ "riscv64" ], @@ -1163,9 +1205,9 @@ ] }, "node_modules/@rollup/rollup-linux-s390x-gnu": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.35.0.tgz", - "integrity": "sha512-hQRkPQPLYJZYGP+Hj4fR9dDBMIM7zrzJDWFEMPdTnTy95Ljnv0/4w/ixFw3pTBMEuuEuoqtBINYND4M7ujcuQw==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.59.0.tgz", + "integrity": "sha512-oF87Ie3uAIvORFBpwnCvUzdeYUqi2wY6jRFWJAy1qus/udHFYIkplYRW+wo+GRUP4sKzYdmE1Y3+rY5Gc4ZO+w==", "cpu": [ "s390x" ], @@ -1177,9 +1219,9 @@ ] }, "node_modules/@rollup/rollup-linux-x64-gnu": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.35.0.tgz", - "integrity": "sha512-Pim1T8rXOri+0HmV4CdKSGrqcBWX0d1HoPnQ0uw0bdp1aP5SdQVNBy8LjYncvnLgu3fnnCt17xjWGd4cqh8/hA==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.59.0.tgz", + "integrity": "sha512-3AHmtQq/ppNuUspKAlvA8HtLybkDflkMuLK4DPo77DfthRb71V84/c4MlWJXixZz4uruIH4uaa07IqoAkG64fg==", 
"cpu": [ "x64" ], @@ -1191,9 +1233,9 @@ ] }, "node_modules/@rollup/rollup-linux-x64-musl": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.35.0.tgz", - "integrity": "sha512-QysqXzYiDvQWfUiTm8XmJNO2zm9yC9P/2Gkrwg2dH9cxotQzunBHYr6jk4SujCTqnfGxduOmQcI7c2ryuW8XVg==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.59.0.tgz", + "integrity": "sha512-2UdiwS/9cTAx7qIUZB/fWtToJwvt0Vbo0zmnYt7ED35KPg13Q0ym1g442THLC7VyI6JfYTP4PiSOWyoMdV2/xg==", "cpu": [ "x64" ], @@ -1204,10 +1246,38 @@ "linux" ] }, + "node_modules/@rollup/rollup-openbsd-x64": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.59.0.tgz", + "integrity": "sha512-M3bLRAVk6GOwFlPTIxVBSYKUaqfLrn8l0psKinkCFxl4lQvOSz8ZrKDz2gxcBwHFpci0B6rttydI4IpS4IS/jQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ] + }, + "node_modules/@rollup/rollup-openharmony-arm64": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.59.0.tgz", + "integrity": "sha512-tt9KBJqaqp5i5HUZzoafHZX8b5Q2Fe7UjYERADll83O4fGqJ49O1FsL6LpdzVFQcpwvnyd0i+K/VSwu/o/nWlA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ] + }, "node_modules/@rollup/rollup-win32-arm64-msvc": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.35.0.tgz", - "integrity": "sha512-OUOlGqPkVJCdJETKOCEf1mw848ZyJ5w50/rZ/3IBQVdLfR5jk/6Sr5m3iO2tdPgwo0x7VcncYuOvMhBWZq8ayg==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.59.0.tgz", + "integrity": "sha512-V5B6mG7OrGTwnxaNUzZTDTjDS7F75PO1ae6MJYdiMu60sq0CqN5CVeVsbhPxalupvTX8gXVSU9gq+Rx1/hvu6A==", "cpu": [ "arm64" ], @@ -1219,9 +1289,9 @@ ] }, "node_modules/@rollup/rollup-win32-ia32-msvc": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.35.0.tgz", - "integrity": "sha512-2/lsgejMrtwQe44glq7AFFHLfJBPafpsTa6JvP2NGef/ifOa4KBoglVf7AKN7EV9o32evBPRqfg96fEHzWo5kw==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.59.0.tgz", + "integrity": "sha512-UKFMHPuM9R0iBegwzKF4y0C4J9u8C6MEJgFuXTBerMk7EJ92GFVFYBfOZaSGLu6COf7FxpQNqhNS4c4icUPqxA==", "cpu": [ "ia32" ], @@ -1232,10 +1302,24 @@ "win32" ] }, + "node_modules/@rollup/rollup-win32-x64-gnu": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.59.0.tgz", + "integrity": "sha512-laBkYlSS1n2L8fSo1thDNGrCTQMmxjYY5G0WFWjFFYZkKPjsMBsgJfGf4TLxXrF6RyhI60L8TMOjBMvXiTcxeA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, "node_modules/@rollup/rollup-win32-x64-msvc": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.35.0.tgz", - "integrity": "sha512-PIQeY5XDkrOysbQblSW7v3l1MDZzkTEzAfTPkj5VAu3FW8fS4ynyLg2sINp0fp3SjZ8xkRYpLqoKcYqAkhU1dw==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.59.0.tgz", + "integrity": 
"sha512-2HRCml6OztYXyJXAvdDXPKcawukWY2GpR5/nxKp4iBgiO3wcoEGkAaqctIbZcNB6KlUQBIqt8VYkNSj2397EfA==", "cpu": [ "x64" ], @@ -1288,10 +1372,9 @@ } }, "node_modules/@types/estree": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.6.tgz", - "integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==", - "dev": true, + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", "license": "MIT" }, "node_modules/@types/geojson": { @@ -4424,13 +4507,13 @@ "license": "Unlicense" }, "node_modules/rollup": { - "version": "4.35.0", - "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.35.0.tgz", - "integrity": "sha512-kg6oI4g+vc41vePJyO6dHt/yl0Rz3Thv0kJeVQ3D1kS3E5XSuKbPc29G4IpT/Kv1KQwgHVcN+HtyS+HYLNSvQg==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.59.0.tgz", + "integrity": "sha512-2oMpl67a3zCH9H79LeMcbDhXW/UmWG/y2zuqnF2jQq5uq9TbM9TVyXvA4+t+ne2IIkBdrLpAaRQAvo7YI/Yyeg==", "dev": true, "license": "MIT", "dependencies": { - "@types/estree": "1.0.6" + "@types/estree": "1.0.8" }, "bin": { "rollup": "dist/bin/rollup" @@ -4440,25 +4523,31 @@ "npm": ">=8.0.0" }, "optionalDependencies": { - "@rollup/rollup-android-arm-eabi": "4.35.0", - "@rollup/rollup-android-arm64": "4.35.0", - "@rollup/rollup-darwin-arm64": "4.35.0", - "@rollup/rollup-darwin-x64": "4.35.0", - "@rollup/rollup-freebsd-arm64": "4.35.0", - "@rollup/rollup-freebsd-x64": "4.35.0", - "@rollup/rollup-linux-arm-gnueabihf": "4.35.0", - "@rollup/rollup-linux-arm-musleabihf": "4.35.0", - "@rollup/rollup-linux-arm64-gnu": "4.35.0", - "@rollup/rollup-linux-arm64-musl": "4.35.0", - "@rollup/rollup-linux-loongarch64-gnu": "4.35.0", - "@rollup/rollup-linux-powerpc64le-gnu": "4.35.0", - "@rollup/rollup-linux-riscv64-gnu": "4.35.0", - "@rollup/rollup-linux-s390x-gnu": "4.35.0", - "@rollup/rollup-linux-x64-gnu": "4.35.0", - "@rollup/rollup-linux-x64-musl": "4.35.0", - "@rollup/rollup-win32-arm64-msvc": "4.35.0", - "@rollup/rollup-win32-ia32-msvc": "4.35.0", - "@rollup/rollup-win32-x64-msvc": "4.35.0", + "@rollup/rollup-android-arm-eabi": "4.59.0", + "@rollup/rollup-android-arm64": "4.59.0", + "@rollup/rollup-darwin-arm64": "4.59.0", + "@rollup/rollup-darwin-x64": "4.59.0", + "@rollup/rollup-freebsd-arm64": "4.59.0", + "@rollup/rollup-freebsd-x64": "4.59.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.59.0", + "@rollup/rollup-linux-arm-musleabihf": "4.59.0", + "@rollup/rollup-linux-arm64-gnu": "4.59.0", + "@rollup/rollup-linux-arm64-musl": "4.59.0", + "@rollup/rollup-linux-loong64-gnu": "4.59.0", + "@rollup/rollup-linux-loong64-musl": "4.59.0", + "@rollup/rollup-linux-ppc64-gnu": "4.59.0", + "@rollup/rollup-linux-ppc64-musl": "4.59.0", + "@rollup/rollup-linux-riscv64-gnu": "4.59.0", + "@rollup/rollup-linux-riscv64-musl": "4.59.0", + "@rollup/rollup-linux-s390x-gnu": "4.59.0", + "@rollup/rollup-linux-x64-gnu": "4.59.0", + "@rollup/rollup-linux-x64-musl": "4.59.0", + "@rollup/rollup-openbsd-x64": "4.59.0", + "@rollup/rollup-openharmony-arm64": "4.59.0", + "@rollup/rollup-win32-arm64-msvc": "4.59.0", + "@rollup/rollup-win32-ia32-msvc": "4.59.0", + "@rollup/rollup-win32-x64-gnu": "4.59.0", + "@rollup/rollup-win32-x64-msvc": "4.59.0", "fsevents": "~2.3.2" } }, @@ -5128,13 +5217,6 @@ "vega-util": "^2.1.0" } }, - "node_modules/vega-expression/node_modules/@types/estree": { - 
"version": "1.0.8", - "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", - "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", - "license": "MIT", - "peer": true - }, "node_modules/vega-force": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/vega-force/-/vega-force-5.1.0.tgz", @@ -5219,12 +5301,6 @@ "vega-util": "^2.1.0" } }, - "node_modules/vega-interpreter/node_modules/vega-util": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/vega-util/-/vega-util-2.1.0.tgz", - "integrity": "sha512-PGfp0m0QCufDmcxKJCWQy4Ov23FoF8DSXmoJwSezi3itQaa2hbxK0+xwsTMP2vy4PR16Pu25HMzgMwXVW1+33w==", - "license": "BSD-3-Clause" - }, "node_modules/vega-label": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/vega-label/-/vega-label-2.1.0.tgz", From 599cc5b62f3e9fb40c81d946909c2f96e19ef9fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 28 Feb 2026 07:51:10 +0000 Subject: [PATCH 11/31] Bump minimatch from 3.1.2 to 3.1.5 in /frontend Bumps [minimatch](https://github.com/isaacs/minimatch) from 3.1.2 to 3.1.5. - [Changelog](https://github.com/isaacs/minimatch/blob/main/changelog.md) - [Commits](https://github.com/isaacs/minimatch/compare/v3.1.2...v3.1.5) --- updated-dependencies: - dependency-name: minimatch dependency-version: 3.1.5 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- frontend/package-lock.json | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 7a78b74..e89c7a5 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -3868,10 +3868,11 @@ } }, "node_modules/minimatch": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", - "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", + "version": "3.1.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.5.tgz", + "integrity": "sha512-VgjWUsnnT6n+NUk6eZq77zeFdpW2LWDzP6zFGrCbHXiYNul5Dzqk2HHQ5uFH2DNW5Xbp8+jVzaeNt94ssEEl4w==", "dev": true, + "license": "ISC", "dependencies": { "brace-expansion": "^1.1.7" }, @@ -5219,12 +5220,6 @@ "vega-util": "^2.1.0" } }, - "node_modules/vega-interpreter/node_modules/vega-util": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/vega-util/-/vega-util-2.1.0.tgz", - "integrity": "sha512-PGfp0m0QCufDmcxKJCWQy4Ov23FoF8DSXmoJwSezi3itQaa2hbxK0+xwsTMP2vy4PR16Pu25HMzgMwXVW1+33w==", - "license": "BSD-3-Clause" - }, "node_modules/vega-label": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/vega-label/-/vega-label-2.1.0.tgz", From 3e83b3a35d295256852ea789f5e38c75c3f7b966 Mon Sep 17 00:00:00 2001 From: nsheff Date: Sat, 28 Feb 2026 08:48:02 -0500 Subject: [PATCH 12/31] clean up actions --- .github/dependabot.yml | 19 ++++++++ .github/workflows/claude-code-review.yml | 57 ++++-------------------- 2 files changed, 28 insertions(+), 48 deletions(-) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..f46f5c9 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,19 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + target-branch: "dev" + + - package-ecosystem: "npm" + directory: "/frontend" + schedule: + interval: "weekly" + target-branch: "dev" + + 
- package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + target-branch: "dev" diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml index 474e2ba..32c3840 100644 --- a/.github/workflows/claude-code-review.yml +++ b/.github/workflows/claude-code-review.yml @@ -1,30 +1,22 @@ name: Claude Code Review on: - pull_request: - types: [opened, ready_for_review] - # Optional: Only run on specific file changes - # paths: - # - "src/**/*.ts" - # - "src/**/*.tsx" - # - "src/**/*.js" - # - "src/**/*.jsx" + workflow_dispatch: + inputs: + pr_number: + description: 'PR number to review' + required: true + type: number jobs: claude-review: - # Optional: Filter by PR author - # if: | - # github.event.pull_request.user.login == 'external-contributor' || - # github.event.pull_request.user.login == 'new-developer' || - # github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR' - runs-on: ubuntu-latest permissions: contents: read pull-requests: read issues: read id-token: write - + steps: - name: Checkout repository uses: actions/checkout@v4 @@ -36,43 +28,12 @@ jobs: uses: anthropics/claude-code-action@beta with: claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} - - # Optional: Specify model (defaults to Claude Sonnet 4, uncomment for Claude Opus 4) - # model: "claude-opus-4-20250514" - - # Direct prompt for automated review (no @claude mention needed) direct_prompt: | - Please review this pull request and provide feedback on: + Please review pull request #${{ inputs.pr_number }} and provide feedback on: - Code quality and best practices - Potential bugs or issues - Performance considerations - Security concerns - Test coverage - - Be constructive and helpful in your feedback. - - # Optional: Use sticky comments to make Claude reuse the same comment on subsequent pushes to the same PR - # use_sticky_comment: true - - # Optional: Customize review based on file types - # direct_prompt: | - # Review this PR focusing on: - # - For TypeScript files: Type safety and proper interface usage - # - For API endpoints: Security, input validation, and error handling - # - For React components: Performance, accessibility, and best practices - # - For tests: Coverage, edge cases, and test quality - - # Optional: Different prompts for different authors - # direct_prompt: | - # ${{ github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR' && - # 'Welcome! Please review this PR from a first-time contributor. Be encouraging and provide detailed explanations for any suggestions.' || - # 'Please provide a thorough code review focusing on our coding standards and best practices.' }} - - # Optional: Add specific tools for running tests or linting - # allowed_tools: "Bash(npm run test),Bash(npm run lint),Bash(npm run typecheck)" - - # Optional: Skip review for certain conditions - # if: | - # !contains(github.event.pull_request.title, '[skip-review]') && - # !contains(github.event.pull_request.title, '[WIP]') + Be constructive and helpful in your feedback. 
From c15333511401b095968d97d07ed135e2a7b10390 Mon Sep 17 00:00:00 2001 From: nsheff Date: Sat, 28 Feb 2026 08:49:12 -0500 Subject: [PATCH 13/31] first pass at an r pkg --- refget-r/NAMESPACE | 1 + refget-r/R/RefgetGenome-class.R | 4 +- refget-r/install_and_test.sh | 41 ++ refget-r/tests/testthat/test-RefgetGenome.R | 4 +- refget-r/tests/testthat/test-constructors.R | 100 +++++ refget-r/tests/testthat/test-edge-cases.R | 291 +++++++++++++++ refget-r/vignettes/getting-started.Rmd | 185 +++++++++ refget-r/vignettes/reference.Rmd | 393 ++++++++++++++++++++ 8 files changed, 1015 insertions(+), 4 deletions(-) create mode 100755 refget-r/install_and_test.sh create mode 100644 refget-r/tests/testthat/test-constructors.R create mode 100644 refget-r/tests/testthat/test-edge-cases.R create mode 100644 refget-r/vignettes/getting-started.Rmd create mode 100644 refget-r/vignettes/reference.Rmd diff --git a/refget-r/NAMESPACE b/refget-r/NAMESPACE index 349f1c9..0c172ce 100644 --- a/refget-r/NAMESPACE +++ b/refget-r/NAMESPACE @@ -37,5 +37,6 @@ exportClasses(RefgetGenome) # Imports import(methods) importFrom(GenomeInfoDb, Seqinfo) +importFrom(GenomeInfoDb, seqinfo) importFrom(GenomeInfoDb, seqnames) importFrom(GenomeInfoDb, seqlengths) diff --git a/refget-r/R/RefgetGenome-class.R b/refget-r/R/RefgetGenome-class.R index 3118f24..2a45560 100644 --- a/refget-r/R/RefgetGenome-class.R +++ b/refget-r/R/RefgetGenome-class.R @@ -115,8 +115,8 @@ RefgetGenome.from_directory <- function(path, digest = NULL, namespace = NULL, a #' @export RefgetGenome.from_fasta <- function(fasta_path) { store <- gtars::refget_store() - digest <- gtars::add_fasta(store, fasta_path) - RefgetGenome(store, digest = digest) + result <- gtars::add_fasta(store, fasta_path) + RefgetGenome(store, digest = result$digest) } #' Create RefgetGenome from a remote store diff --git a/refget-r/install_and_test.sh b/refget-r/install_and_test.sh new file mode 100755 index 0000000..a3e3179 --- /dev/null +++ b/refget-r/install_and_test.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Install and test BiocRefgetStore +# Usage: bash install_and_test.sh [install|test|both] +# Default: both + +set -e + +PKG_DIR="$(cd "$(dirname "$0")" && pwd)" +ACTION="${1:-both}" + +R_CMD="bulker exec databio/nsheff -- R" +RSCRIPT_CMD="bulker exec databio/nsheff -- Rscript" + +install_pkg() { + echo "=== Installing BiocRefgetStore ===" + $R_CMD CMD INSTALL --no-multiarch "$PKG_DIR" + echo "=== Installation complete ===" +} + +run_tests() { + echo "=== Running tests ===" + $RSCRIPT_CMD -e "testthat::test_local('$PKG_DIR')" + echo "=== Tests complete ===" +} + +case "$ACTION" in + install) + install_pkg + ;; + test) + run_tests + ;; + both) + install_pkg + run_tests + ;; + *) + echo "Usage: bash install_and_test.sh [install|test|both]" + exit 1 + ;; +esac diff --git a/refget-r/tests/testthat/test-RefgetGenome.R b/refget-r/tests/testthat/test-RefgetGenome.R index 1dc7b83..aceeb6c 100644 --- a/refget-r/tests/testthat/test-RefgetGenome.R +++ b/refget-r/tests/testthat/test-RefgetGenome.R @@ -107,10 +107,10 @@ test_that("RefgetGenome from_directory works", { writeLines(c(">seq1", "ACGT"), fasta_file) on.exit(unlink(fasta_file), add = TRUE) - digest <- gtars::add_fasta(store, fasta_file) + result <- gtars::add_fasta(store, fasta_file) # Load from directory - genome <- RefgetGenome.from_directory(store_dir, digest = digest) + genome <- RefgetGenome.from_directory(store_dir, digest = result$digest) expect_s4_class(genome, "RefgetGenome") expect_equal(names(genome), "seq1") }) diff --git 
a/refget-r/tests/testthat/test-constructors.R b/refget-r/tests/testthat/test-constructors.R new file mode 100644 index 0000000..227f02b --- /dev/null +++ b/refget-r/tests/testthat/test-constructors.R @@ -0,0 +1,100 @@ +# Test constructor edge cases + +test_that("RefgetGenome with invalid digest errors", { + skip_if_not_installed("gtars") + + store <- gtars::refget_store() + + # A digest that doesn't exist in the store + expect_error( + RefgetGenome(store, digest = "nonexistent_digest_abc123"), + "not found" + ) +}) + +test_that("RefgetGenome.from_fasta with nonexistent file errors", { + skip_if_not_installed("gtars") + + expect_error( + RefgetGenome.from_fasta("/tmp/does_not_exist_xyz.fa") + ) +}) + +test_that("RefgetGenome.from_directory with nonexistent path errors", { + skip_if_not_installed("gtars") + + expect_error( + RefgetGenome.from_directory("/tmp/no_such_store_dir_xyz", digest = "abc") + ) +}) + +test_that("RefgetGenome with only namespace (missing alias) errors", { + skip_if_not_installed("gtars") + + store <- gtars::refget_store() + + expect_error( + RefgetGenome(store, namespace = "refseq"), + "Must provide either" + ) +}) + +test_that("RefgetGenome with only alias (missing namespace) errors", { + skip_if_not_installed("gtars") + + store <- gtars::refget_store() + + expect_error( + RefgetGenome(store, alias = "hg38"), + "Must provide either" + ) +}) + +test_that("RefgetGenome with neither digest nor alias errors", { + skip_if_not_installed("gtars") + + store <- gtars::refget_store() + + expect_error( + RefgetGenome(store), + "Must provide either" + ) +}) + +test_that("RefgetGenome.from_fasta returns correct class", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">seq1", "ACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + expect_s4_class(genome, "RefgetGenome") + + # Verify the digest was set + d <- collection_digest(genome) + expect_type(d, "character") + expect_true(nchar(d) > 0) +}) + +test_that("RefgetGenome.from_directory roundtrip works", { + skip_if_not_installed("gtars") + + # Create on-disk store and add FASTA + store_dir <- tempfile() + dir.create(store_dir) + on.exit(unlink(store_dir, recursive = TRUE)) + + store <- gtars::refget_store_on_disk(store_dir) + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "AAAA", ">chr2", "CCCC"), fasta_file) + on.exit(unlink(fasta_file), add = TRUE) + + result <- gtars::add_fasta(store, fasta_file) + + # Reload from directory + genome <- RefgetGenome.from_directory(store_dir, digest = result$digest) + expect_s4_class(genome, "RefgetGenome") + expect_equal(length(genome), 2) + expect_equal(sort(names(genome)), c("chr1", "chr2")) +}) diff --git a/refget-r/tests/testthat/test-edge-cases.R b/refget-r/tests/testthat/test-edge-cases.R new file mode 100644 index 0000000..31a3f37 --- /dev/null +++ b/refget-r/tests/testthat/test-edge-cases.R @@ -0,0 +1,291 @@ +# Test edge cases and gaps in current coverage + +# -- Single-sequence FASTA ------------------------------------------------ + +test_that("RefgetGenome works with single-sequence FASTA", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">only_seq", "ACGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + expect_equal(length(genome), 1) + expect_equal(names(genome), "only_seq") + expect_equal(seqlengths(genome)[["only_seq"]], 8) +}) + +# -- Sequences with N/ambiguous bases 
------------------------------------- + +test_that("getSeq handles sequences with N bases", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chrN", "ACNNNGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + seq <- getSeq(genome, "chrN", as.character = TRUE) + expect_equal(seq, "ACNNNGTACGT") + + # Substring containing Ns + sub <- getSeq(genome, "chrN", start = 2, end = 6, as.character = TRUE) + expect_equal(sub, "CNNNG") +}) + +# -- Partial coordinates (only start or only end) ------------------------- + +test_that("getSeq with only start=NA returns full sequence", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # Both NA -> full sequence + seq <- getSeq(genome, "chr1", start = NA, end = NA, as.character = TRUE) + expect_equal(seq, "ACGTACGT") +}) + +# -- Coordinate boundary conditions --------------------------------------- + +test_that("getSeq works at sequence boundaries", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # start=1, end=seqlength (full range) + full <- getSeq(genome, "chr1", start = 1, end = 8, as.character = TRUE) + expect_equal(full, "ACGTACGT") + + # First base only + first <- getSeq(genome, "chr1", start = 1, end = 1, as.character = TRUE) + expect_equal(first, "A") + + # Last base only + last <- getSeq(genome, "chr1", start = 8, end = 8, as.character = TRUE) + expect_equal(last, "T") +}) + +# -- extractRegions with single region ------------------------------------ + +test_that("extractRegions works with a single-row data.frame", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGTACGTACGTACGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + regions <- data.frame( + chrom = "chr1", + start = 1, + end = 4, + stringsAsFactors = FALSE + ) + seqs <- extractRegions(genome, regions, as.character = TRUE) + + expect_length(seqs, 1) + expect_equal(names(seqs), "chr1:1-4") +}) + +# -- extractRegions error on missing columns ------------------------------ + +test_that("extractRegions errors on missing required columns", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # Missing 'end' column + bad_df <- data.frame(chrom = "chr1", start = 1) + expect_error(extractRegions(genome, bad_df), "must have columns") + + # Wrong column names + bad_df2 <- data.frame(chromosome = "chr1", begin = 1, finish = 4) + expect_error(extractRegions(genome, bad_df2), "must have columns") +}) + +# -- exportChromosomes with nonexistent name ------------------------------ + +test_that("exportChromosomes with nonexistent chromosome", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + output <- tempfile(fileext = ".fa") + on.exit(unlink(output), add = TRUE) + + # Requesting a nonexistent chromosome should error or produce empty output + 
expect_error(exportChromosomes(genome, names = "chrX", output_path = output)) +}) + +# -- show() method -------------------------------------------------------- + +test_that("show() produces expected output", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", "ACGTACGT", + ">chr2", "GGCCGGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + out <- capture.output(show(genome)) + expect_true(any(grepl("RefgetGenome with 2 sequences", out))) + expect_true(any(grepl("collection_digest:", out))) + expect_true(any(grepl("seqnames:", out))) +}) + +test_that("show() truncates when >5 sequences", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + lines <- unlist(lapply(paste0("seq", 1:7), function(nm) { + c(paste0(">", nm), "ACGT") + })) + writeLines(lines, fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + out <- capture.output(show(genome)) + expect_true(any(grepl("more\\)", out))) +}) + +# -- length(), names(), seqnames() on multi-sequence genome --------------- + +test_that("length, names, seqnames work on multi-sequence genome", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", "AAAA", + ">chr2", "CCCC", + ">chr3", "GGGG" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + expect_equal(length(genome), 3) + expect_type(names(genome), "character") + expect_length(names(genome), 3) + + sn <- seqnames(genome) + expect_length(sn, 3) + expect_true(all(c("chr1", "chr2", "chr3") %in% as.character(sn))) +}) + +# -- coordinate_system() returns a string --------------------------------- + +test_that("coordinate_system returns a character string", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + cs <- coordinate_system(genome) + expect_type(cs, "character") + expect_length(cs, 1) + expect_true(nchar(cs) > 0) +}) + +# -- store() returns the underlying RefgetStore --------------------------- + +test_that("store() returns the underlying RefgetStore", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + s <- store(genome) + expect_false(is.null(s)) + # The store should be usable with gtars functions + expect_true(inherits(s, "RefgetStore") || is(s, "RefgetStore")) +}) + +# -- getSeq as.character flag with Biostrings available ------------------- + +test_that("getSeq as.character=TRUE returns character even with Biostrings", { + skip_if_not_installed("gtars") + skip_if_not_installed("Biostrings") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c(">chr1", "ACGTACGT"), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + # as.character=TRUE should force character output + seq <- getSeq(genome, "chr1", as.character = TRUE) + expect_type(seq, "character") + expect_equal(seq, "ACGTACGT") + + # as.character=FALSE should return DNAString + seq2 <- getSeq(genome, "chr1", as.character = FALSE) + expect_s4_class(seq2, "DNAString") +}) + +test_that("getSeq vectorized as.character=TRUE returns character vector", { + skip_if_not_installed("gtars") + 
skip_if_not_installed("Biostrings") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", "ACGTACGT", + ">chr2", "GGCCGGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + seqs <- getSeq(genome, c("chr1", "chr2"), as.character = TRUE) + expect_type(seqs, "character") + expect_length(seqs, 2) + + seqs2 <- getSeq(genome, c("chr1", "chr2"), as.character = FALSE) + expect_s4_class(seqs2, "DNAStringSet") +}) + +# -- seqinfo returns Seqinfo object --------------------------------------- + +test_that("seqinfo returns a Seqinfo object", { + skip_if_not_installed("gtars") + + fasta_file <- tempfile(fileext = ".fa") + writeLines(c( + ">chr1", "ACGTACGT", + ">chr2", "GGCC" + ), fasta_file) + on.exit(unlink(fasta_file)) + + genome <- RefgetGenome.from_fasta(fasta_file) + + si <- seqinfo(genome) + expect_s4_class(si, "Seqinfo") + expect_true("chr1" %in% GenomeInfoDb::seqnames(si)) +}) diff --git a/refget-r/vignettes/getting-started.Rmd b/refget-r/vignettes/getting-started.Rmd new file mode 100644 index 0000000..bdea166 --- /dev/null +++ b/refget-r/vignettes/getting-started.Rmd @@ -0,0 +1,185 @@ +--- +title: "Getting Started with BiocRefgetStore" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Getting Started with BiocRefgetStore} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +BiocRefgetStore provides a BSgenome-compatible interface to reference genomes +backed by GA4GH refget stores. Instead of managing FASTA files, you connect +to a refget store (local or remote) and access sequences by digest. + +This tutorial uses the **2023 Human Pangenome Reference** -- a remote refget +store containing 47 haplotype-resolved assemblies hosted on S3. Sequences are +downloaded on-demand and cached locally, so you don't need to download the +entire dataset upfront. + +## Installation + +BiocRefgetStore depends on `gtars`, a Rust-backed R package. Because `gtars` +lives in a Rust monorepo with sibling crates, it must be installed from a local +clone of the full repository (not directly from GitHub): + +```{r install, eval=FALSE} +# 1. Clone the gtars monorepo (if you haven't already) +# git clone https://github.com/databio/gtars.git + +# 2. Install gtars from the local clone (requires Rust toolchain) +install.packages("path/to/gtars/gtars-r", repos = NULL, type = "source") + +# 3. Install BiocRefgetStore +remotes::install_github("refgenie/refget", subdir = "refget-r", ref = "r") +# Or from local source: +# install.packages("path/to/refget/refget-r", repos = NULL, type = "source") +``` + +`gtars` is not on CRAN and `remotes::install_github()` won't work for it +because the R package depends on sibling Rust crates via relative paths. +You need the full monorepo checkout so those paths resolve correctly. + +## Connect to a remote pangenome store + +Load a genome from the Human Pangenome Reference store. 
The store metadata +(~1.5 MB) is fetched on first use; individual sequences are downloaded +on-demand and cached locally: + +```{r remote-store, eval=FALSE} +library(BiocRefgetStore) + +# 2023 Human Pangenome Reference (47 haplotype-resolved assemblies) +pangenome_url <- "https://refgenie.s3.us-east-1.amazonaws.com/pangenome_refget_store" + +# One assembly from the pangenome (HG03540.pri.mat.f1_v2) +genome <- RefgetGenome.from_remote( + cache_path = "~/.cache/refget/pangenome", + remote_url = pangenome_url, + digest = "0aHV7I-94paL9Z1H4LNlqsW3WxJhlou5" +) +genome +#> RefgetGenome with 750 sequences +#> collection_digest: 0aHV7I-94paL9Z1H4LNlqsW3WxJhlou5 +#> seqnames: JAGYVX010000001.1, JAGYVX010000002.1, ... (745 more) +``` + +Subsequent calls reuse the local cache -- no re-downloading. + +## Basic sequence access + +Extract a full sequence or a region by coordinates: + +```{r getseq, eval=FALSE} +# Full sequence (returns DNAString if Biostrings is installed) +seq <- genome[["JAGYVX010000001.1"]] + +# Region by coordinates (1-based, inclusive) +region <- getSeq(genome, "JAGYVX010000001.1", start = 1000, end = 2000) + +# Force character output +region_chr <- getSeq(genome, "JAGYVX010000001.1", start = 1000, end = 2000, + as.character = TRUE) +``` + +Negative-strand extraction applies the reverse complement: + +```{r strand, eval=FALSE} +rc <- getSeq(genome, "JAGYVX010000001.1", start = 1000, end = 2000, strand = "-") +``` + +## Multiple regions at once + +Pass vectors of names, starts, and ends: + +```{r vectorized, eval=FALSE} +seqs <- getSeq( + genome, + names = c("JAGYVX010000001.1", "JAGYVX010000002.1", "JAGYVX010000003.1"), + start = c(100, 200, 300), + end = c(199, 299, 399) +) +``` + +If you have a GRanges object, pass it directly: + +```{r granges, eval=FALSE} +library(GenomicRanges) +gr <- GRanges(c("JAGYVX010000001.1:100-199:+", "JAGYVX010000002.1:200-299:-")) +seqs <- getSeq(genome, gr) +``` + +## Bulk extraction from a data.frame + +`extractRegions` accepts a data.frame with `chrom`, `start`, `end` columns: + +```{r extract-regions, eval=FALSE} +regions <- data.frame( + chrom = c("JAGYVX010000001.1", "JAGYVX010000001.1", "JAGYVX010000002.1"), + start = c(100, 5000, 200), + end = c(199, 5099, 299) +) +seqs <- extractRegions(genome, regions, as.character = TRUE) +``` + +## Export sequences to FASTA + +Write extracted regions or full sequences to a FASTA file: + +```{r export, eval=FALSE} +# Regions to FASTA +extractToFasta(genome, regions, "extracted_regions.fa") + +# Specific sequences +exportChromosomes(genome, c("JAGYVX010000001.1", "JAGYVX010000002.1"), "subset.fa") + +# All sequences +exportChromosomes(genome, output_path = "full_assembly.fa") +``` + +## Genome metadata + +Inspect sequences and their properties: + +```{r metadata, eval=FALSE} +seqnames(genome) # sequence names +seqlengths(genome) # named integer vector of lengths +seqinfo(genome) # full Seqinfo object +length(genome) # number of sequences +collection_digest(genome) # seqcol digest +coordinate_system(genome) # sorted_name_length_pairs digest +sequence_digests(genome) # per-sequence SHA512t24u digests +``` + +## Working with local FASTA files + +You can also create a genome directly from a local FASTA file. 
This builds +an in-memory refget store, computes sequence digests, and creates a `Seqinfo` +object automatically: + +```{r from-fasta, eval=FALSE} +genome <- RefgetGenome.from_fasta("genome.fa") +genome +``` + +## Persistent on-disk store + +For large genomes you access repeatedly, use an on-disk store so sequences +are indexed once and reused across sessions: + +```{r on-disk, eval=FALSE} +# First time: create the store from FASTA +store <- gtars::refget_store_on_disk("~/.local/share/refget/hg38") +result <- gtars::add_fasta(store, "hg38.fa") +# Save the digest: result$digest + +# Later: reload without re-parsing +genome <- RefgetGenome.from_directory( + "~/.local/share/refget/hg38", + digest = "saved_digest_string" +) +``` + +## Next steps + +See the [Reference](reference.html) vignette for complete documentation of +every function and method in the package. diff --git a/refget-r/vignettes/reference.Rmd b/refget-r/vignettes/reference.Rmd new file mode 100644 index 0000000..c6238c4 --- /dev/null +++ b/refget-r/vignettes/reference.Rmd @@ -0,0 +1,393 @@ +--- +title: "BiocRefgetStore Reference" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{BiocRefgetStore Reference} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +Complete reference for every exported function, method, and class in +BiocRefgetStore. + +## Constructors + +### RefgetGenome.from_fasta + +```r +RefgetGenome.from_fasta(fasta_path) +``` + +Create a `RefgetGenome` from a FASTA file. Builds an in-memory refget store, +computes digests, and indexes all sequences. + +- **fasta_path** — Path to a FASTA file (`.fa`, `.fasta`, `.fa.gz`). +- **Returns** — A `RefgetGenome` object. + +```{r from-fasta, eval=FALSE} +genome <- RefgetGenome.from_fasta("hg38.fa") +``` + +### RefgetGenome.from_directory + +```r +RefgetGenome.from_directory(path, digest = NULL, namespace = NULL, alias = NULL) +``` + +Load a `RefgetGenome` from a persisted on-disk refget store directory. + +- **path** — Path to a directory created by `gtars::refget_store_on_disk()`. +- **digest** — Collection digest string (provide this OR namespace + alias). +- **namespace** — Alias namespace (e.g., `"refseq"`). +- **alias** — Alias name (e.g., `"GRCh38"`). +- **Returns** — A `RefgetGenome` object. + +```{r from-directory, eval=FALSE} +genome <- RefgetGenome.from_directory("~/.refget/hg38", digest = "abc123...") +``` + +### RefgetGenome.from_remote + +```r +RefgetGenome.from_remote(cache_path, remote_url, digest = NULL, namespace = NULL, alias = NULL) +``` + +Create a `RefgetGenome` backed by a remote refget store with local caching. + +- **cache_path** — Local directory for caching downloaded data. +- **remote_url** — URL of the remote refget store. +- **digest** / **namespace** / **alias** — Same as `RefgetGenome.from_directory`. +- **Returns** — A `RefgetGenome` object. + +```{r from-remote, eval=FALSE} +genome <- RefgetGenome.from_remote( + cache_path = "~/.cache/refget", + remote_url = "https://refget.databio.org/store", + namespace = "refseq", alias = "GRCh38" +) +``` + +### RefgetGenome (low-level) + +```r +RefgetGenome(store, digest = NULL, namespace = NULL, alias = NULL) +``` + +Construct a `RefgetGenome` from an existing `gtars::RefgetStore` object. +Requires either `digest` or both `namespace` and `alias`. + +- **store** — A gtars `RefgetStore` object. +- **digest** / **namespace** / **alias** — Collection identifier. +- **Returns** — A `RefgetGenome` object. 
+ +```{r constructor, eval=FALSE} +store <- gtars::refget_store_open_local("/path/to/store") +genome <- RefgetGenome(store, namespace = "refseq", alias = "GRCh38") +``` + +--- + +## Sequence Access + +### getSeq + +```r +getSeq(x, names, start = NA, end = NA, strand = "+", as.character = FALSE, ...) +``` + +Extract sequences from a `RefgetGenome`. BSgenome-compatible interface. + +- **x** — A `RefgetGenome` object. +- **names** — Character vector of sequence names, or a `GRanges` object. +- **start** — Integer start position(s), 1-based inclusive. `NA` for full sequence. +- **end** — Integer end position(s), 1-based inclusive. `NA` for full sequence. +- **strand** — `"+"` (default) or `"-"` for reverse complement. +- **as.character** — If `TRUE`, return character instead of DNAString/DNAStringSet. +- **Returns** — Single sequence: `DNAString` (or character). Multiple: `DNAStringSet` (or character vector). Named as `"seqname:start-end"` for regions. + +```{r getseq, eval=FALSE} +# Full chromosome +getSeq(genome, "chr1") + +# Region +getSeq(genome, "chr1", start = 100, end = 200) + +# Reverse complement +getSeq(genome, "chr1", start = 100, end = 200, strand = "-") + +# Multiple regions +getSeq(genome, c("chr1", "chr2"), c(100, 500), c(200, 600)) + +# From GRanges +getSeq(genome, GRanges("chr1:100-200:-")) +``` + +### `[[` (bracket extraction) + +```r +genome[["chr1"]] +``` + +Extract a full sequence by name. Returns `DNAString` if Biostrings is +installed, otherwise a character string. + +- **i** — Sequence name (character). +- **Returns** — `DNAString` or character string. +- **Errors** — If the sequence name is not found in the collection. + +--- + +## Metadata Accessors + +### seqinfo + +```r +seqinfo(x) +``` + +Returns the `Seqinfo` object containing sequence names and lengths. + +- **Returns** — A `GenomeInfoDb::Seqinfo` object. + +### seqnames + +```r +seqnames(x) +``` + +Returns the sequence names. + +- **Returns** — Character vector (via `Seqinfo`). + +### seqlengths + +```r +seqlengths(x) +``` + +Returns named integer vector of sequence lengths. + +- **Returns** — Named integer vector. + +```{r seqlengths, eval=FALSE} +seqlengths(genome) +#> chr1 chr2 chr3 +#> 248956 242193 198295 +``` + +### length + +```r +length(x) +``` + +Returns the number of sequences in the genome. + +- **Returns** — Integer scalar. + +### names + +```r +names(x) +``` + +Returns the sequence names as a character vector. + +- **Returns** — Character vector. + +### collection_digest + +```r +collection_digest(genome) +``` + +Returns the GA4GH seqcol digest identifying this sequence collection. + +- **genome** — A `RefgetGenome` object. +- **Returns** — Character string. + +### coordinate_system + +```r +coordinate_system(genome) +``` + +Returns the `sorted_name_length_pairs` digest. Two genomes with the same +`coordinate_system()` share the same coordinate system and are compatible for +coordinate-based operations (e.g., lifting over annotations). + +- **genome** — A `RefgetGenome` object. +- **Returns** — Character string. + +### sequence_digests + +```r +sequence_digests(genome) +``` + +Returns a named character vector of per-sequence SHA512t24u digests. + +- **genome** — A `RefgetGenome` object. +- **Returns** — Named character vector (names are sequence names, values are digests). + +```{r seq-digests, eval=FALSE} +sequence_digests(genome) +#> chr1 chr2 +#> "SQ.2648ae1bacce4ec4b6cf337..." "SQ.f932a39b4c70..." 
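+
+# The return value is a plain named character vector, so a single digest
+# can be pulled out with standard subsetting:
+sequence_digests(genome)[["chr1"]]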
+``` + +### store + +```r +store(genome) +``` + +Returns the underlying `gtars::RefgetStore` object. Useful for calling gtars +functions directly. + +- **genome** — A `RefgetGenome` object. +- **Returns** — A gtars `RefgetStore` object. + +--- + +## Bulk Extraction + +### extractRegions + +```r +extractRegions(genome, regions, as.character = FALSE) +``` + +Extract multiple genomic regions efficiently using BED-based extraction. + +- **genome** — A `RefgetGenome` object. +- **regions** — A `GRanges` object or a `data.frame` with columns `chrom`, `start`, `end` (1-based inclusive coordinates). +- **as.character** — If `TRUE`, return character vector instead of `DNAStringSet`. +- **Returns** — `DNAStringSet` or named character vector. Named as `"chrom:start-end"`. + +```{r extract-regions, eval=FALSE} +regions <- data.frame( + chrom = c("chr1", "chr1", "chr2"), + start = c(100, 5000, 200), + end = c(199, 5099, 299) +) +seqs <- extractRegions(genome, regions) +``` + +### extractToFasta + +```r +extractToFasta(genome, regions, output_path) +``` + +Write extracted regions directly to a FASTA file. + +- **genome** — A `RefgetGenome` object. +- **regions** — A `GRanges` object or data.frame (same as `extractRegions`). +- **output_path** — Path for the output FASTA file. +- **Returns** — Invisibly returns `output_path`. + +```{r extract-to-fasta, eval=FALSE} +extractToFasta(genome, regions, "output.fa") +``` + +### exportChromosomes + +```r +exportChromosomes(genome, names = NULL, output_path, line_width = 80L) +``` + +Export complete chromosomes to a FASTA file. + +- **genome** — A `RefgetGenome` object. +- **names** — Character vector of chromosome names to export, or `NULL` for all. +- **output_path** — Path for the output FASTA file. +- **line_width** — Bases per line in output (default: 80). +- **Returns** — Invisibly returns `output_path`. + +```{r export-chroms, eval=FALSE} +# Specific chromosomes +exportChromosomes(genome, c("chr1", "chr22"), "subset.fa") + +# All chromosomes +exportChromosomes(genome, output_path = "full.fa") +``` + +--- + +## Conversion Utilities + +### as_DNAString + +```r +as_DNAString(seq_string) +``` + +Convert a character string to a Biostrings `DNAString` object. + +- **seq_string** — Character string containing a DNA sequence. +- **Returns** — A `DNAString` object. +- **Errors** — If Biostrings is not installed. + +```{r as-dnastring, eval=FALSE} +dna <- as_DNAString("ACGTACGT") +``` + +### as_DNAStringSet + +```r +as_DNAStringSet(seq_strings, names = NULL) +``` + +Convert a character vector to a Biostrings `DNAStringSet` object. + +- **seq_strings** — Character vector of DNA sequences. +- **names** — Optional names for the sequences. +- **Returns** — A `DNAStringSet` object. +- **Errors** — If Biostrings is not installed. + +```{r as-dnastringset, eval=FALSE} +seqs <- as_DNAStringSet(c("ACGT", "GGCC"), names = c("seq1", "seq2")) +``` + +--- + +## Working with the Underlying Store + +The `store()` accessor gives you access to the full `gtars::RefgetStore` API +for operations not directly exposed by BiocRefgetStore. 
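+
+The returned object can also be handed back to the low-level `RefgetGenome()`
+constructor to open a second collection held in the same store. A minimal
+sketch (the digest here is a placeholder, not a real collection):
+
+```{r store-reuse, eval=FALSE}
+# Materialize another collection backed by the same underlying store
+other_genome <- RefgetGenome(store(genome), digest = "another_collection_digest")
+```
+
+A few of the gtars calls you can make on the store directly: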
+ +```{r store-advanced, eval=FALSE} +s <- store(genome) + +# List all aliases in the store +gtars::get_aliases(s) + +# Compare two sequence collections +gtars::compare_seqcols(s, digest_a, digest_b) + +# Get FHR (FASTA Header Record) metadata +gtars::get_fhr(s, collection_digest(genome)) + +# Access level 2 data (raw attribute arrays) +level2 <- gtars::get_level2(s, collection_digest(genome)) +level2$names # sequence names +level2$lengths # sequence lengths +level2$sequences # sequence digests +``` + +### show + +```r +show(object) +``` + +Display method for `RefgetGenome`. Prints the number of sequences, collection +digest, and first few sequence names. + +```{r show, eval=FALSE} +genome +#> RefgetGenome with 24 sequences +#> collection_digest: abc123... +#> seqnames: chr1, chr2, chr3, chr4, chr5 ... (19 more) +``` From 190fd6557d560a0e1e485672d146fe0cd16c55d3 Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 2 Mar 2026 13:44:05 -0500 Subject: [PATCH 14/31] add py alias docstring and tests --- refget/store.py | 8 ++ tests/local/test_aliases.py | 162 ++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 tests/local/test_aliases.py diff --git a/refget/store.py b/refget/store.py index 042b90c..30379e5 100644 --- a/refget/store.py +++ b/refget/store.py @@ -3,6 +3,14 @@ This module re-exports the Rust-based gtars.refget components for local sequence collection storage and FASTA processing. + +RefgetStore also provides namespace-based alias management: + Sequence aliases: add_sequence_alias, get_sequence_by_alias, + get_aliases_for_sequence, list_sequence_alias_namespaces, + list_sequence_aliases, remove_sequence_alias, load_sequence_aliases + Collection aliases: add_collection_alias, get_collection_by_alias, + get_aliases_for_collection, list_collection_alias_namespaces, + list_collection_aliases, remove_collection_alias, load_collection_aliases """ from .const import GTARS_INSTALLED diff --git a/tests/local/test_aliases.py b/tests/local/test_aliases.py new file mode 100644 index 0000000..8d077c4 --- /dev/null +++ b/tests/local/test_aliases.py @@ -0,0 +1,162 @@ +"""Tests for RefgetStore alias functionality.""" + +import os +import tempfile + +import pytest + +from refget.store import RefgetStore + +try: + from gtars.refget import RefgetStore as _check + + _RUST_BINDINGS_AVAILABLE = True +except ImportError: + _RUST_BINDINGS_AVAILABLE = False + +FASTA_PATH = "test_fasta/base.fa" + + +@pytest.fixture +def store(): + """Create an in-memory RefgetStore with base.fa loaded.""" + s = RefgetStore.in_memory() + s.disable_encoding() + s.add_sequence_collection_from_fasta(FASTA_PATH) + return s + + +@pytest.fixture +def seq_digest(store): + """Return the sha512t24u digest of the first sequence in the store.""" + return store.list_sequences()[0].sha512t24u + + +@pytest.fixture +def col_digest(store): + """Return the digest of the first collection in the store.""" + return store.list_collections()[0].digest + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +class TestSequenceAliases: + def test_add_and_retrieve(self, store, seq_digest): + store.add_sequence_alias("chromosomes", "chr1", seq_digest) + result = store.get_sequence_by_alias("chromosomes", "chr1") + assert result is not None + assert result.metadata.sha512t24u == seq_digest + + def test_list_namespaces(self, store, seq_digest): + store.add_sequence_alias("ucsc", "chrX", seq_digest) + namespaces = store.list_sequence_alias_namespaces() + assert "ucsc" in namespaces + + 
def test_list_aliases_in_namespace(self, store, seq_digest): + store.add_sequence_alias("ucsc", "chr1", seq_digest) + store.add_sequence_alias("ucsc", "chr2", seq_digest) + aliases = store.list_sequence_aliases("ucsc") + assert "chr1" in aliases + assert "chr2" in aliases + + def test_reverse_lookup(self, store, seq_digest): + store.add_sequence_alias("ucsc", "chr1", seq_digest) + result = store.get_aliases_for_sequence(seq_digest) + assert ("ucsc", "chr1") in result + + def test_remove_alias(self, store, seq_digest): + store.add_sequence_alias("ucsc", "chr1", seq_digest) + removed = store.remove_sequence_alias("ucsc", "chr1") + assert removed is True + result = store.get_sequence_by_alias("ucsc", "chr1") + assert result is None + + def test_remove_nonexistent_returns_false(self, store): + removed = store.remove_sequence_alias("fake", "fake") + assert removed is False + + def test_get_nonexistent_returns_none(self, store): + result = store.get_sequence_by_alias("fake_ns", "fake_alias") + assert result is None + + def test_multiple_namespaces_same_digest(self, store, seq_digest): + store.add_sequence_alias("ucsc", "chr1", seq_digest) + store.add_sequence_alias("ensembl", "1", seq_digest) + aliases = store.get_aliases_for_sequence(seq_digest) + namespaces = {ns for ns, _ in aliases} + assert "ucsc" in namespaces + assert "ensembl" in namespaces + + def test_load_from_tsv(self, store, seq_digest): + with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as f: + f.write(f"chr1\t{seq_digest}\n") + f.write(f"chr2\t{seq_digest}\n") + tsv_path = f.name + try: + count = store.load_sequence_aliases("from_file", tsv_path) + assert count == 2 + result = store.get_sequence_by_alias("from_file", "chr1") + assert result is not None + finally: + os.unlink(tsv_path) + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +class TestCollectionAliases: + def test_add_and_retrieve(self, store, col_digest): + store.add_collection_alias("genomes", "hg38", col_digest) + result = store.get_collection_by_alias("genomes", "hg38") + assert result is not None + assert result.digest == col_digest + + def test_list_namespaces(self, store, col_digest): + store.add_collection_alias("genomes", "hg38", col_digest) + namespaces = store.list_collection_alias_namespaces() + assert "genomes" in namespaces + + def test_list_aliases_in_namespace(self, store, col_digest): + store.add_collection_alias("genomes", "hg38", col_digest) + store.add_collection_alias("genomes", "GRCh38", col_digest) + aliases = store.list_collection_aliases("genomes") + assert "hg38" in aliases + assert "GRCh38" in aliases + + def test_reverse_lookup(self, store, col_digest): + store.add_collection_alias("genomes", "hg38", col_digest) + result = store.get_aliases_for_collection(col_digest) + assert ("genomes", "hg38") in result + + def test_remove_alias(self, store, col_digest): + store.add_collection_alias("genomes", "hg38", col_digest) + removed = store.remove_collection_alias("genomes", "hg38") + assert removed is True + result = store.get_collection_by_alias("genomes", "hg38") + assert result is None + + def test_remove_nonexistent_returns_false(self, store): + removed = store.remove_collection_alias("fake", "fake") + assert removed is False + + def test_get_nonexistent_returns_none(self, store): + result = store.get_collection_by_alias("fake_ns", "fake_alias") + assert result is None + + def test_multiple_namespaces_same_digest(self, store, col_digest): + store.add_collection_alias("ucsc", "hg38", 
col_digest) + store.add_collection_alias("ncbi", "GRCh38", col_digest) + aliases = store.get_aliases_for_collection(col_digest) + namespaces = {ns for ns, _ in aliases} + assert "ucsc" in namespaces + assert "ncbi" in namespaces + + def test_load_from_tsv(self, store, col_digest): + with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as f: + f.write(f"hg38\t{col_digest}\n") + f.write(f"GRCh38\t{col_digest}\n") + tsv_path = f.name + try: + count = store.load_collection_aliases("from_file", tsv_path) + assert count == 2 + result = store.get_collection_by_alias("from_file", "hg38") + assert result is not None + finally: + os.unlink(tsv_path) From 142ccc195c8075de5e2442f72ff7ac8680d6fc07 Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 2 Mar 2026 18:01:41 -0500 Subject: [PATCH 15/31] clean up for new gtars updates --- data_loaders/demo_remote_store.py | 17 +- .../riva_pangenome_analysis/README.md | 4 +- examples/remote_store.py | 11 +- refget/cli/fasta.py | 110 ++++++- refget/cli/seqcol.py | 120 ++++--- refget/cli/store.py | 140 ++------ refget/clients.py | 12 +- tests/local/test_aliases.py | 179 ++++------ tests/local/test_remove_collection.py | 38 +++ tests/local/test_store_seqcol_features.py | 101 ++++++ tests/test_cli/test_fasta_commands.py | 307 +++++------------- 11 files changed, 499 insertions(+), 540 deletions(-) create mode 100644 tests/local/test_remove_collection.py create mode 100644 tests/local/test_store_seqcol_features.py diff --git a/data_loaders/demo_remote_store.py b/data_loaders/demo_remote_store.py index 137fe52..af575e5 100644 --- a/data_loaders/demo_remote_store.py +++ b/data_loaders/demo_remote_store.py @@ -39,7 +39,7 @@ def main(): print(f"\n1. Loading remote store from:\n {REMOTE_URL}") print(f" Cache directory: {CACHE_DIR}\n") - store = RefgetStore.load_remote(cache_path=str(CACHE_DIR), remote_url=REMOTE_URL) + store = RefgetStore.open_remote(cache_path=str(CACHE_DIR), remote_url=REMOTE_URL) print(f" Loaded! {len(store)} sequences available (metadata only)") @@ -51,9 +51,8 @@ def main(): # 3. List sequences (first 5) print(f"\n3. Listing sequences (first 5 of {len(store)}):") - records = store.sequence_records() - for i, rec in enumerate(records[:5]): - m = rec.metadata + records = store.list_sequences() + for i, m in enumerate(records[:5]): print(f" {i+1}. {m.name[:50]}...") print(f" sha512t24u: {m.sha512t24u}") print(f" length: {m.length:,} bp") @@ -61,7 +60,7 @@ def main(): # 4. Fetch a sequence by ID (downloads sequence data on first access) seq_digest = "du4GiRD_OcmdmCn_RmImyb71YZ4XoCdk" print(f"\n4. Get sequence record by ID (fetches from remote):") - record = store.get_sequence_by_id(seq_digest) + record = store.get_sequence(seq_digest) if record: print(f" Name: {record.metadata.name}") print(f" Length: {record.metadata.length:,} bp") @@ -107,7 +106,7 @@ def main(): print(f" Collection: {EXAMPLE_COLLECTION}") print(f" Sequence: {EXAMPLE_SEQ_NAME[:50]}...") - record = store.get_sequence_by_collection_and_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) + record = store.get_sequence_by_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) if record: print(f" Found! 
Length: {record.metadata.length:,} bp") print(f" Digest: {record.metadata.sha512t24u}") @@ -149,9 +148,9 @@ def main(): print(f"\nCache directory: {CACHE_DIR}") print(f"Temp files: {temp_dir}") print("\nKey features demonstrated:") - print(" - load_remote(): Load store from URL, fetch sequences on-demand") - print(" - get_sequence_by_id(): Lookup by SHA-512/24u or MD5 digest") - print(" - get_sequence_by_collection_and_name(): Lookup by sequence name") + print(" - open_remote(): Load store from URL, fetch sequences on-demand") + print(" - get_sequence(): Lookup by SHA-512/24u or MD5 digest") + print(" - get_sequence_by_name(): Lookup by collection digest + sequence name") print(" - substrings_from_regions(): Batch retrieval from BED file") print(" - export_fasta_by_digests(): Export sequences by digest") print(" - export_fasta_from_regions(): Export BED regions to FASTA") diff --git a/data_loaders/riva_pangenome_analysis/README.md b/data_loaders/riva_pangenome_analysis/README.md index 9ecce89..44d108a 100644 --- a/data_loaders/riva_pangenome_analysis/README.md +++ b/data_loaders/riva_pangenome_analysis/README.md @@ -38,7 +38,7 @@ cm = store.get_collection_metadata("s0nMiOFHPsIBrm2bd3PkzWXKLKWQZq70") EXAMPLE_COLLECTION = "0aHV7I-94paL9Z1H4LNlqsW3WxJhlou5" EXAMPLE_SEQ_NAME = "JAGYVX010000006.1 unmasked:primary_assembly HG03540.pri.mat.f1_v2:JAGYVX010000006.1:1:96320881:1" -record = store.get_sequence_by_collection_and_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) +record = store.get_sequence_by_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) ## Upload to S3 @@ -74,7 +74,7 @@ seq s1 seq.decode() store.get_collection_metadata(col1.digest) -col1_loaded.is_loaded() +store.is_collection_loaded(col1.digest) ``` diff --git a/examples/remote_store.py b/examples/remote_store.py index 6db0447..aa7d904 100644 --- a/examples/remote_store.py +++ b/examples/remote_store.py @@ -29,7 +29,7 @@ # The store metadata (~1.5 MB) is fetched; sequences are loaded on-demand. # %% -store = RefgetStore.load_remote(cache_path=str(CACHE_DIR), remote_url=REMOTE_URL) +store = RefgetStore.open_remote(cache_path=str(CACHE_DIR), remote_url=REMOTE_URL) print(f"Loaded {len(store)} sequences from {REMOTE_URL}") @@ -45,9 +45,8 @@ # ## 3. List Sequences # %% -records = store.sequence_records() -for i, rec in enumerate(records[:5]): - m = rec.metadata +records = store.list_sequences() +for i, m in enumerate(records[:5]): print(f"{i+1}. {m.name[:60]}...") print(f" sha512t24u: {m.sha512t24u}, length: {m.length:,} bp") @@ -58,7 +57,7 @@ # %% seq_digest = "du4GiRD_OcmdmCn_RmImyb71YZ4XoCdk" -record = store.get_sequence_by_id(seq_digest) +record = store.get_sequence(seq_digest) if record: print(f"Name: {record.metadata.name}") print(f"Length: {record.metadata.length:,} bp") @@ -99,7 +98,7 @@ # Look up sequences by collection digest + sequence name. 
# %% -record = store.get_sequence_by_collection_and_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) +record = store.get_sequence_by_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) if record: print(f"Collection: {EXAMPLE_COLLECTION}") print(f"Sequence: {EXAMPLE_SEQ_NAME[:50]}...") diff --git a/refget/cli/fasta.py b/refget/cli/fasta.py index 364c0cc..5c32442 100644 --- a/refget/cli/fasta.py +++ b/refget/cli/fasta.py @@ -25,7 +25,6 @@ EXIT_FILE_NOT_FOUND, EXIT_FAILURE, EXIT_SUCCESS, - not_implemented, print_error, print_json, print_success, @@ -67,6 +66,8 @@ def index( - genome.fa.fai (FASTA index, samtools-compatible) - genome.seqcol.json (Sequence collection JSON) - genome.chrom.sizes (Chromosome sizes) + - genome.rgsi (RefgetStore sequence index) + - genome.rgci (RefgetStore collection index) Prints the seqcol digest to stdout. """ @@ -137,7 +138,36 @@ def index( with open(chrom_sizes_path, "w") as f: f.write(chrom_sizes_content) - files_created = [str(fai_path), str(seqcol_path), str(chrom_sizes_path)] + # Write RGSI file + stem = base_name + for ext in [".fa.gz", ".fasta.gz", ".fa", ".fasta"]: + if stem.endswith(ext): + stem = stem[: -len(ext)] + break + rgsi_path = out_dir / f"{stem}.rgsi" + sc.write_rgsi(str(rgsi_path)) + + # Write RGCI file + rgci_path = out_dir / f"{stem}.rgci" + with open(rgci_path, "w") as f: + meta = sc.metadata + f.write( + "#digest\tn_sequences\tnames_digest\tsequences_digest" + "\tlengths_digest\tname_length_pairs_digest" + "\tsorted_name_length_pairs_digest\tsorted_sequences_digest\n" + ) + f.write( + f"{meta.digest}\t{meta.n_sequences}\t{meta.names_digest}" + f"\t{meta.sequences_digest}\t{meta.lengths_digest}" + f"\t{meta.name_length_pairs_digest or ''}" + f"\t{meta.sorted_name_length_pairs_digest or ''}" + f"\t{meta.sorted_sequences_digest or ''}\n" + ) + + files_created = [ + str(fai_path), str(seqcol_path), str(chrom_sizes_path), + str(rgsi_path), str(rgci_path), + ] if json_output: print_json( @@ -364,11 +394,34 @@ def rgsi( """ Compute .rgsi (RefgetStore sequence index) from a FASTA file. - The .rgsi is a binary index file used by RefgetStore for efficient - on-disk sequence storage and retrieval. It maps sequence digests to - byte offsets. + The .rgsi is a TSV index file containing collection-level digest headers + and per-sequence metadata (name, length, alphabet, digests). Used by + RefgetStore for efficient collection storage and as a FASTA digest cache. """ - not_implemented("fasta rgsi") + from gtars.refget import digest_fasta + + try: + # Determine output path + if output is None: + # Replace FASTA extensions with .rgsi + stem = file.name + for ext in [".fa.gz", ".fasta.gz", ".fa", ".fasta"]: + if stem.endswith(ext): + stem = stem[: -len(ext)] + break + output = file.parent / f"{stem}.rgsi" + + # Digest the FASTA file + with suppress_stdout(): + sc = digest_fasta(str(file)) + + # Write RGSI file using gtars binding + sc.write_rgsi(str(output)) + + print_success(f"Wrote RGSI index to {output}") + raise typer.Exit(EXIT_SUCCESS) + except OSError as e: + print_error(f"Error processing FASTA file: {e}", EXIT_FAILURE) @app.command() @@ -389,10 +442,49 @@ def rgci( """ Compute .rgci (RefgetStore collection index) from a FASTA file. - The .rgci is a binary index file used by RefgetStore to store - collection metadata. + The .rgci is a TSV index file listing collection metadata (digest, + sequence count, and level 1 digests). Used by RefgetStore as a + master index of all collections. 
""" - not_implemented("fasta rgci") + from gtars.refget import digest_fasta + + try: + # Determine output path + if output is None: + stem = file.name + for ext in [".fa.gz", ".fasta.gz", ".fa", ".fasta"]: + if stem.endswith(ext): + stem = stem[: -len(ext)] + break + output = file.parent / f"{stem}.rgci" + + # Digest the FASTA file + with suppress_stdout(): + sc = digest_fasta(str(file)) + + meta = sc.metadata + + # Write RGCI file (matches store.rs write_collections_rgci format) + with open(output, "w") as f: + # Header + f.write( + "#digest\tn_sequences\tnames_digest\tsequences_digest" + "\tlengths_digest\tname_length_pairs_digest" + "\tsorted_name_length_pairs_digest\tsorted_sequences_digest\n" + ) + # Single collection row + f.write( + f"{meta.digest}\t{meta.n_sequences}\t{meta.names_digest}" + f"\t{meta.sequences_digest}\t{meta.lengths_digest}" + f"\t{meta.name_length_pairs_digest or ''}" + f"\t{meta.sorted_name_length_pairs_digest or ''}" + f"\t{meta.sorted_sequences_digest or ''}\n" + ) + + print_success(f"Wrote RGCI index to {output}") + raise typer.Exit(EXIT_SUCCESS) + except OSError as e: + print_error(f"Error processing FASTA file: {e}", EXIT_FAILURE) @app.command() diff --git a/refget/cli/seqcol.py b/refget/cli/seqcol.py index 0bc3881..69dc486 100644 --- a/refget/cli/seqcol.py +++ b/refget/cli/seqcol.py @@ -67,43 +67,14 @@ def _collection_to_seqcol_dict(store, digest: str, level: int = 2) -> Optional[d Returns: Seqcol dict in API format, or None if collection not found. """ - from refget.utils import canonical_str - from refget.digests import sha512t24u_digest - - names = [] - lengths = [] - sequences = [] - - for coll in store.iter_collections(): - if coll.digest == digest: - for seq in coll.sequences: - m = seq.metadata - names.append(m.name) - lengths.append(m.length) - sequences.append("SQ." + m.sha512t24u) - break - else: - # Collection not found in iteration - return None - - if not names: + try: + if level == 1: + return store.get_collection_level1(digest) + else: + return store.get_collection_level2(digest) + except Exception: return None - if level == 1: - # Return digests of arrays instead of arrays themselves - return { - "names": sha512t24u_digest(canonical_str(names)), - "lengths": sha512t24u_digest(canonical_str(lengths)), - "sequences": sha512t24u_digest(canonical_str(sequences)), - } - else: - # Level 2: return full arrays - return { - "names": names, - "lengths": lengths, - "sequences": sequences, - } - def _get_local_seqcol(digest: str, level: int = 2) -> Optional[dict]: """ @@ -123,28 +94,15 @@ def _get_local_seqcol(digest: str, level: int = 2) -> Optional[dict]: return None store_path = get_store_path() - rgstore_path = store_path / "rgstore.json" # Check if store exists - if not store_path.exists() or not rgstore_path.exists(): + if not RefgetStore.store_exists(str(store_path)): return None try: store = RefgetStore.open_local(str(store_path)) store.set_quiet(True) - - # Check if collection exists - collection_digests = {meta.digest for meta in store.list_collections()} - if digest not in collection_digests: - return None - - # Load the collection (triggers lazy loading if needed) - if not store.is_collection_loaded(digest): - store.get_collection(digest) - - # Convert to seqcol dict format return _collection_to_seqcol_dict(store, digest, level) - except Exception: # Any error (store corruption, etc.) 
- fall back to remote return None @@ -419,6 +377,44 @@ def list_collections( raise typer.Exit(EXIT_SUCCESS) +def _search_local_store(filters: dict) -> Optional[list]: + """Search the local RefgetStore for collections matching attribute filters.""" + try: + from refget.store import RefgetStore + except ImportError: + return None + + store_path = get_store_path() + + if not RefgetStore.store_exists(str(store_path)): + return None + + try: + store = RefgetStore.open_local(str(store_path)) + store.set_quiet(True) + + # Search each filter; results must match ALL filters (intersection) + result_sets = [] + for attr_name, attr_digest in filters.items(): + matches = store.find_collections_by_attribute(attr_name, attr_digest) + result_sets.append(set(matches)) + + if not result_sets: + return None + + # Intersection of all filter results + matching = result_sets[0] + for s in result_sets[1:]: + matching &= s + + if not matching: + return None + + return [{"digest": d} for d in sorted(matching)] + except Exception: + return None + + @app.command() def search( names: Optional[str] = typer.Option( @@ -442,6 +438,16 @@ def search( "-s", help="Server URL override", ), + local: bool = typer.Option( + False, + "--local", + help="Search only the local store (skip remote)", + ), + no_local: bool = typer.Option( + False, + "--no-local", + help="Skip local store and search remote only", + ), ) -> None: """ Find collections that share an attribute. @@ -449,6 +455,9 @@ def search( The attribute digest is the digest of an attribute array (e.g., the names array digest from level 1 output). + By default, searches the local store first, then falls back to remote. + Use --local to search only locally, or --no-local to skip local search. + Example workflow: # Get names digest from level 1 names_digest=$(refget fasta seqcol genome.fa --level 1 | jq -r '.names') @@ -471,6 +480,19 @@ def search( ) return + # Try local store first (unless --no-local) + if not no_local: + local_results = _search_local_store(filters) + if local_results is not None: + print_json(local_results) + raise typer.Exit(EXIT_SUCCESS) + + if local: + # --local flag set but no results found locally + print_error("No matching collections found in local store", EXIT_FAILURE) + return + + # Fall back to remote server client = _get_client(server) try: diff --git a/refget/cli/store.py b/refget/cli/store.py index 9334630..fc04971 100644 --- a/refget/cli/store.py +++ b/refget/cli/store.py @@ -100,11 +100,8 @@ def _load_store(path: Optional[Path], must_exist: bool = True, remote: Optional[ if must_exist: if not store_path.exists(): print_error(f"Store not found at {store_path}", EXIT_FILE_NOT_FOUND) - # Check if rgstore.json exists - if not, it's an empty store that needs on_disk - # The store uses rgstore.json as its manifest file - rgstore_path = store_path / "rgstore.json" - if not rgstore_path.exists(): - # Empty store - use on_disk which handles initialization + if not RefgetStore.store_exists(str(store_path)): + # Empty directory - use on_disk which handles initialization return RefgetStore.on_disk(str(store_path)) return RefgetStore.open_local(str(store_path)) else: @@ -394,39 +391,13 @@ def get( print(seq_data) else: # Collection retrieval mode (default) - # Check if collection exists - if digest not in _get_collection_digests(store): + try: + result = store.get_collection_level2(digest) + except Exception: print_error(f"Collection not found: {digest}", EXIT_FAILURE) return - # Ensure collection is loaded - _ensure_collection_loaded(store, digest) - 
- # Get collection data - names = [] - lengths = [] - sequences = [] - - for coll in store.iter_collections(): - if coll.digest == digest: - for seq in coll.sequences: - m = seq.metadata - names.append(m.name) - lengths.append(m.length) - sequences.append("SQ." + m.sha512t24u) - break - - if not names: - print_error(f"Collection not found: {digest}", EXIT_FAILURE) - return - - print_json( - { - "names": names, - "lengths": lengths, - "sequences": sequences, - } - ) + print_json(result) raise typer.Exit(EXIT_SUCCESS) @@ -562,7 +533,7 @@ def pull( # Check local store first local_collections: set = set() - if store_path.exists() and (store_path / "rgstore.json").exists(): + if RefgetStore.store_exists(str(store_path)): try: local_store = RefgetStore.open_local(str(store_path)) local_collections = _get_collection_digests(local_store) @@ -765,24 +736,15 @@ def fai( """ store = _load_store(path, remote=remote) - # Ensure collection is loaded - _ensure_collection_loaded(store, digest) + try: + lvl2 = store.get_collection_level2(digest) + except Exception: + print_error(f"Collection not found: {digest}", EXIT_FAILURE) + return lines = [] - - # Find the collection and get its sequences - for coll in store.iter_collections(): - if coll.digest == digest: - for seq in coll.sequences: - m = seq.metadata - # FAI format: name, length, offset, linebases, linewidth - # Since we don't have a specific FASTA file, offset is 0 - # Using default line width of 80 - lines.append(f"{m.name}\t{m.length}\t0\t80\t81") - break - - if not lines: - print_error(f"Collection not found: {digest}", EXIT_FAILURE) + for name, length in zip(lvl2["names"], lvl2["lengths"]): + lines.append(f"{name}\t{length}\t0\t80\t81") fai_content = "\n".join(lines) if lines: @@ -828,21 +790,15 @@ def chrom_sizes( """ store = _load_store(path, remote=remote) - # Ensure collection is loaded - _ensure_collection_loaded(store, digest) + try: + lvl2 = store.get_collection_level2(digest) + except Exception: + print_error(f"Collection not found: {digest}", EXIT_FAILURE) + return lines = [] - - # Find the collection and get its sequences - for coll in store.iter_collections(): - if coll.digest == digest: - for seq in coll.sequences: - m = seq.metadata - lines.append(f"{m.name}\t{m.length}") - break - - if not lines: - print_error(f"Collection not found: {digest}", EXIT_FAILURE) + for name, length in zip(lvl2["names"], lvl2["lengths"]): + lines.append(f"{name}\t{length}") sizes_content = "\n".join(lines) if lines: @@ -911,52 +867,6 @@ def stats( raise typer.Exit(EXIT_SUCCESS) -def _remove_collection_from_store(store_path: Path, digest: str) -> bool: - """ - Remove a collection from the store by manipulating store files. - - gtars RefgetStore doesn't provide a remove_collection method, so we - implement it by modifying the collections index file directly. - - Args: - store_path: Path to the store directory - digest: Collection digest to remove - - Returns: - True if removed, False if not found - """ - # Validate digest to prevent path traversal - if "/" in digest or "\\" in digest or ".." 
in digest: - return False - - # Remove from collections index (TSV file) - collections_idx = store_path / "collections.rgci" - if collections_idx.exists(): - lines = collections_idx.read_text().splitlines() - new_lines = [] - found = False - for line in lines: - if line.startswith("#") or not line.strip(): - new_lines.append(line) - elif line.startswith(digest + "\t"): - found = True # Skip this line (remove it) - else: - new_lines.append(line) - if found: - collections_idx.write_text("\n".join(new_lines) + "\n" if new_lines else "") - - # Remove the collection's .rgsi file - collection_file = store_path / "collections" / f"{digest}.rgsi" - if collection_file.exists(): - collection_file.unlink() - - # Remove the FHR metadata sidecar file (if it exists) - fhr_file = store_path / "collections" / f"{digest}.fhr.json" - fhr_file.unlink(missing_ok=True) - - return True - - @app.command() def remove( digest: str = typer.Argument( @@ -978,15 +888,11 @@ def remove( with other collections. """ store = _load_store(path) - store_path = _get_store_path(path) - # Check if collection exists - if digest not in _get_collection_digests(store): + removed = store.remove_collection(digest) + if not removed: print_error(f"Collection not found: {digest}", EXIT_FAILURE) - # Remove the collection by manipulating store files - _remove_collection_from_store(store_path, digest) - print_json( { "digest": digest, diff --git a/refget/clients.py b/refget/clients.py index 0e0030b..977eb68 100644 --- a/refget/clients.py +++ b/refget/clients.py @@ -199,9 +199,9 @@ def download_fasta_to_store( ImportError: If gtars/RefgetStore is not available Example: - >>> from refget.store import RefgetStore, StorageMode + >>> from refget.store import RefgetStore >>> from refget.clients import SequenceCollectionClient - >>> store = RefgetStore(StorageMode.Encoded) + >>> store = RefgetStore.in_memory() >>> client = SequenceCollectionClient() >>> collection_digest = client.download_fasta_to_store("abc123", store) >>> # Now you can retrieve sequences by digest from the local store @@ -440,7 +440,7 @@ def get_refget_store(self, cache_dir: str) -> "RefgetStore": except ImportError: raise ImportError("gtars is required: pip install gtars") - return RefgetStore.load_remote(cache_dir, url) + return RefgetStore.open_remote(cache_dir, url) class PangenomeClient(RefgetClient): @@ -597,8 +597,8 @@ def download_to_store( ImportError: If gtars/RefgetStore is not available Example: - >>> from refget.store import RefgetStore, StorageMode - >>> store = RefgetStore(StorageMode.Encoded) + >>> from refget.store import RefgetStore + >>> store = RefgetStore.in_memory() >>> client = FastaDrsClient() >>> collection_digest = client.download_to_store("abc123", store) """ @@ -627,7 +627,7 @@ def download_to_store( _LOGGER.info(f"Downloaded FASTA to {downloaded_path}") # Import into store - store.import_fasta(downloaded_path) + store.add_sequence_collection_from_fasta(downloaded_path) _LOGGER.info(f"Imported FASTA into RefgetStore: {digest}") return digest diff --git a/tests/local/test_aliases.py b/tests/local/test_aliases.py index 8d077c4..251cea4 100644 --- a/tests/local/test_aliases.py +++ b/tests/local/test_aliases.py @@ -1,4 +1,4 @@ -"""Tests for RefgetStore alias functionality.""" +"""Smoke tests for RefgetStore alias functionality via Python bindings.""" import os import tempfile @@ -28,135 +28,72 @@ def store(): @pytest.fixture def seq_digest(store): - """Return the sha512t24u digest of the first sequence in the store.""" return 
store.list_sequences()[0].sha512t24u @pytest.fixture def col_digest(store): - """Return the digest of the first collection in the store.""" return store.list_collections()[0].digest @pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") -class TestSequenceAliases: - def test_add_and_retrieve(self, store, seq_digest): - store.add_sequence_alias("chromosomes", "chr1", seq_digest) - result = store.get_sequence_by_alias("chromosomes", "chr1") - assert result is not None - assert result.metadata.sha512t24u == seq_digest - - def test_list_namespaces(self, store, seq_digest): - store.add_sequence_alias("ucsc", "chrX", seq_digest) - namespaces = store.list_sequence_alias_namespaces() - assert "ucsc" in namespaces - - def test_list_aliases_in_namespace(self, store, seq_digest): - store.add_sequence_alias("ucsc", "chr1", seq_digest) - store.add_sequence_alias("ucsc", "chr2", seq_digest) - aliases = store.list_sequence_aliases("ucsc") - assert "chr1" in aliases - assert "chr2" in aliases - - def test_reverse_lookup(self, store, seq_digest): - store.add_sequence_alias("ucsc", "chr1", seq_digest) - result = store.get_aliases_for_sequence(seq_digest) - assert ("ucsc", "chr1") in result - - def test_remove_alias(self, store, seq_digest): - store.add_sequence_alias("ucsc", "chr1", seq_digest) - removed = store.remove_sequence_alias("ucsc", "chr1") - assert removed is True - result = store.get_sequence_by_alias("ucsc", "chr1") - assert result is None - - def test_remove_nonexistent_returns_false(self, store): - removed = store.remove_sequence_alias("fake", "fake") - assert removed is False - - def test_get_nonexistent_returns_none(self, store): - result = store.get_sequence_by_alias("fake_ns", "fake_alias") - assert result is None - - def test_multiple_namespaces_same_digest(self, store, seq_digest): - store.add_sequence_alias("ucsc", "chr1", seq_digest) - store.add_sequence_alias("ensembl", "1", seq_digest) - aliases = store.get_aliases_for_sequence(seq_digest) - namespaces = {ns for ns, _ in aliases} - assert "ucsc" in namespaces - assert "ensembl" in namespaces - - def test_load_from_tsv(self, store, seq_digest): - with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as f: - f.write(f"chr1\t{seq_digest}\n") - f.write(f"chr2\t{seq_digest}\n") - tsv_path = f.name - try: - count = store.load_sequence_aliases("from_file", tsv_path) - assert count == 2 - result = store.get_sequence_by_alias("from_file", "chr1") - assert result is not None - finally: - os.unlink(tsv_path) +def test_sequence_alias_round_trip(store, seq_digest): + """Add, retrieve, and remove a sequence alias; verify None for missing aliases.""" + # Not found returns None + assert store.get_sequence_by_alias("ucsc", "chr1") is None + + # Add and retrieve + store.add_sequence_alias("ucsc", "chr1", seq_digest) + result = store.get_sequence_by_alias("ucsc", "chr1") + assert result is not None + assert result.metadata.sha512t24u == seq_digest + + # Remove + assert store.remove_sequence_alias("ucsc", "chr1") is True + assert store.get_sequence_by_alias("ucsc", "chr1") is None + assert store.remove_sequence_alias("ucsc", "chr1") is False + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_collection_alias_round_trip(store, col_digest): + """Add, retrieve, and remove a collection alias; verify None for missing aliases.""" + assert store.get_collection_by_alias("genomes", "hg38") is None + + store.add_collection_alias("genomes", "hg38", col_digest) + result = 
store.get_collection_by_alias("genomes", "hg38") + assert result is not None + assert result.digest == col_digest + + assert store.remove_collection_alias("genomes", "hg38") is True + assert store.get_collection_by_alias("genomes", "hg38") is None + assert store.remove_collection_alias("genomes", "hg38") is False + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_load_sequence_aliases_from_tsv(store, seq_digest): + """Load aliases from TSV; verify count return and post-load lookup.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as f: + f.write(f"chr1\t{seq_digest}\n") + f.write(f"chr2\t{seq_digest}\n") + tsv_path = f.name + try: + count = store.load_sequence_aliases("from_file", tsv_path) + assert count == 2 + assert store.get_sequence_by_alias("from_file", "chr1") is not None + finally: + os.unlink(tsv_path) @pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") -class TestCollectionAliases: - def test_add_and_retrieve(self, store, col_digest): - store.add_collection_alias("genomes", "hg38", col_digest) - result = store.get_collection_by_alias("genomes", "hg38") - assert result is not None - assert result.digest == col_digest - - def test_list_namespaces(self, store, col_digest): - store.add_collection_alias("genomes", "hg38", col_digest) - namespaces = store.list_collection_alias_namespaces() - assert "genomes" in namespaces - - def test_list_aliases_in_namespace(self, store, col_digest): - store.add_collection_alias("genomes", "hg38", col_digest) - store.add_collection_alias("genomes", "GRCh38", col_digest) - aliases = store.list_collection_aliases("genomes") - assert "hg38" in aliases - assert "GRCh38" in aliases - - def test_reverse_lookup(self, store, col_digest): - store.add_collection_alias("genomes", "hg38", col_digest) - result = store.get_aliases_for_collection(col_digest) - assert ("genomes", "hg38") in result - - def test_remove_alias(self, store, col_digest): - store.add_collection_alias("genomes", "hg38", col_digest) - removed = store.remove_collection_alias("genomes", "hg38") - assert removed is True - result = store.get_collection_by_alias("genomes", "hg38") - assert result is None - - def test_remove_nonexistent_returns_false(self, store): - removed = store.remove_collection_alias("fake", "fake") - assert removed is False - - def test_get_nonexistent_returns_none(self, store): - result = store.get_collection_by_alias("fake_ns", "fake_alias") - assert result is None - - def test_multiple_namespaces_same_digest(self, store, col_digest): - store.add_collection_alias("ucsc", "hg38", col_digest) - store.add_collection_alias("ncbi", "GRCh38", col_digest) - aliases = store.get_aliases_for_collection(col_digest) - namespaces = {ns for ns, _ in aliases} - assert "ucsc" in namespaces - assert "ncbi" in namespaces - - def test_load_from_tsv(self, store, col_digest): - with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as f: - f.write(f"hg38\t{col_digest}\n") - f.write(f"GRCh38\t{col_digest}\n") - tsv_path = f.name - try: - count = store.load_collection_aliases("from_file", tsv_path) - assert count == 2 - result = store.get_collection_by_alias("from_file", "hg38") - assert result is not None - finally: - os.unlink(tsv_path) +def test_load_collection_aliases_from_tsv(store, col_digest): + """Load aliases from TSV; verify count return and post-load lookup.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as f: + 
f.write(f"hg38\t{col_digest}\n") + f.write(f"GRCh38\t{col_digest}\n") + tsv_path = f.name + try: + count = store.load_collection_aliases("from_file", tsv_path) + assert count == 2 + assert store.get_collection_by_alias("from_file", "hg38") is not None + finally: + os.unlink(tsv_path) diff --git a/tests/local/test_remove_collection.py b/tests/local/test_remove_collection.py new file mode 100644 index 0000000..88fb208 --- /dev/null +++ b/tests/local/test_remove_collection.py @@ -0,0 +1,38 @@ +"""Smoke test for RefgetStore.remove_collection() Python binding.""" + +import os +import tempfile + +import pytest + +from refget.store import RefgetStore + +try: + from gtars.refget import RefgetStore as _check + + _RUST_BINDINGS_AVAILABLE = True +except ImportError: + _RUST_BINDINGS_AVAILABLE = False + +FASTA_PATH = "test_fasta/base.fa" + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_remove_collection_round_trip(): + """Add a collection, remove it with orphan cleanup, verify store is empty.""" + store = RefgetStore.in_memory() + store.set_quiet(True) + store.add_sequence_collection_from_fasta(FASTA_PATH) + + assert len(store.list_collections()) == 1 + assert len(store.list_sequences()) > 0 + + digest = store.list_collections()[0].digest + + # Nonexistent returns False + assert store.remove_collection("nonexistent") is False + + # Real removal with orphan cleanup + assert store.remove_collection(digest, remove_orphan_sequences=True) is True + assert len(store.list_collections()) == 0 + assert len(store.list_sequences()) == 0 diff --git a/tests/local/test_store_seqcol_features.py b/tests/local/test_store_seqcol_features.py new file mode 100644 index 0000000..3779eb1 --- /dev/null +++ b/tests/local/test_store_seqcol_features.py @@ -0,0 +1,101 @@ +""" +Tests for RefgetStore seqcol features: level1/level2, compare, find_collections_by_attribute. 
+ +Only tests that verify Python-specific behavior beyond what Rust tests cover: +- Rust/Python parity for compare() +- Multi-collection attribute search +- Basic level1/level2 smoke test +""" + +import json +import pytest +from pathlib import Path + +try: + from refget.store import RefgetStore + + _RUST_BINDINGS_AVAILABLE = True +except ImportError: + _RUST_BINDINGS_AVAILABLE = False + +TEST_FASTA_DIR = Path("test_fasta") +BASE_FASTA = TEST_FASTA_DIR / "base.fa" +DIFFERENT_NAMES_FASTA = TEST_FASTA_DIR / "different_names.fa" + +with open(TEST_FASTA_DIR / "test_fasta_digests.json") as fp: + TEST_DIGESTS = json.load(fp) + +BASE_DIGEST = TEST_DIGESTS["base.fa"]["top_level_digest"] +BASE_LEVEL1 = TEST_DIGESTS["base.fa"]["level1"] +BASE_LEVEL2 = TEST_DIGESTS["base.fa"]["level2"] +DIFFERENT_NAMES_DIGEST = TEST_DIGESTS["different_names.fa"]["top_level_digest"] + + +@pytest.fixture +def store_with_base(): + """Create an in-memory store with base.fa loaded.""" + store = RefgetStore.in_memory() + store.add_sequence_collection_from_fasta(str(BASE_FASTA)) + return store + + +@pytest.fixture +def store_with_two(): + """Create an in-memory store with base.fa and different_names.fa loaded.""" + store = RefgetStore.in_memory() + store.add_sequence_collection_from_fasta(str(BASE_FASTA)) + store.add_sequence_collection_from_fasta(str(DIFFERENT_NAMES_FASTA)) + return store + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_level1_and_level2_smoke(store_with_base): + """Level1 returns digests, level2 returns arrays, both have required keys.""" + lvl1 = store_with_base.get_collection_level1(BASE_DIGEST) + lvl2 = store_with_base.get_collection_level2(BASE_DIGEST) + + for key in ("names", "lengths", "sequences"): + assert key in lvl1 + assert key in lvl2 + # Level1 values are digest strings, level2 values are lists + assert isinstance(lvl1[key], str) + assert isinstance(lvl2[key], list) + + # Verify level2 matches expected values + assert sorted(lvl2["names"]) == sorted(BASE_LEVEL2["names"]) + assert sorted(lvl2["lengths"]) == sorted(BASE_LEVEL2["lengths"]) + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_compare_matches_python_implementation(store_with_two): + """Verify store.compare() (Rust) agrees with compare_seqcols() (Python) on core attributes.""" + from refget.utils import compare_seqcols + + lvl2_a = store_with_two.get_collection_level2(BASE_DIGEST) + lvl2_b = store_with_two.get_collection_level2(DIFFERENT_NAMES_DIGEST) + + python_result = compare_seqcols(lvl2_a, lvl2_b) + rust_result = store_with_two.compare(BASE_DIGEST, DIFFERENT_NAMES_DIGEST) + + core_attrs = {"names", "lengths", "sequences"} + assert core_attrs <= set(python_result["attributes"]["a_and_b"]) + assert core_attrs <= set(rust_result["attributes"]["a_and_b"]) + + for attr in core_attrs: + assert ( + rust_result["array_elements"]["a_and_b_count"][attr] + == python_result["array_elements"]["a_and_b_count"][attr] + ) + assert ( + rust_result["array_elements"]["a_and_b_same_order"][attr] + == python_result["array_elements"]["a_and_b_same_order"][attr] + ) + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +def test_shared_attribute_returns_multiple(store_with_two): + """base.fa and different_names.fa share lengths; searching by lengths returns both.""" + lengths_digest = BASE_LEVEL1["lengths"] + results = store_with_two.find_collections_by_attribute("lengths", lengths_digest) + assert BASE_DIGEST in results + 
assert DIFFERENT_NAMES_DIGEST in results diff --git a/tests/test_cli/test_fasta_commands.py b/tests/test_cli/test_fasta_commands.py index df5c698..2f3ea6e 100644 --- a/tests/test_cli/test_fasta_commands.py +++ b/tests/test_cli/test_fasta_commands.py @@ -3,7 +3,7 @@ """ Tests for refget fasta CLI commands. -These test the CLI wrapper behavior: output formatting, exit codes, argument parsing. +These test CLI-specific behavior: output formatting, exit codes, argument parsing. """ import pytest @@ -26,20 +26,14 @@ class TestFastaDigest: """Tests for: refget fasta digest """ - def test_outputs_json(self, cli, sample_fasta): - """Output is valid JSON with digest.""" - result = cli("fasta", "digest", str(sample_fasta)) - - data = assert_json_output(result, ["digest"]) - assert_valid_digest(data["digest"]) - - def test_digest_with_file_key(self, cli, sample_fasta): - """Output may include file path.""" - result = cli("fasta", "digest", str(sample_fasta)) + def test_known_digest(self, cli): + """Verify digest matches expected value for known file.""" + result = cli("fasta", "digest", str(BASE_FASTA)) assert result.exit_code == 0 data = json.loads(result.stdout) - assert "digest" in data + expected_digest = TEST_FASTA_DIGESTS["base.fa"]["top_level_digest"] + assert data["digest"] == expected_digest def test_gzipped_file(self, cli, sample_fasta_gz): """Handles gzipped files seamlessly.""" @@ -52,34 +46,13 @@ def test_gzipped_file(self, cli, sample_fasta_gz): def test_file_not_found_exit_code(self, cli): """Returns non-zero exit code for missing file.""" result = cli("fasta", "digest", "/nonexistent/file.fa") - - assert result.exit_code != 0 - # Error message goes to stderr (correct Unix behavior) - assert "not found" in result.stderr.lower() or "error" in result.stderr.lower() - - def test_missing_argument(self, cli): - """Returns non-zero exit for missing argument.""" - result = cli("fasta", "digest") - assert result.exit_code != 0 - def test_known_digest(self, cli): - """Verify digest matches expected value for known file.""" - result = cli("fasta", "digest", str(BASE_FASTA)) - - assert result.exit_code == 0 - data = json.loads(result.stdout) - expected_digest = TEST_FASTA_DIGESTS["base.fa"]["top_level_digest"] - assert data["digest"] == expected_digest - def test_different_files_different_digests(self, cli): """Different files produce different digests.""" result1 = cli("fasta", "digest", str(BASE_FASTA)) result2 = cli("fasta", "digest", str(DIFFERENT_NAMES_FASTA)) - assert result1.exit_code == 0 - assert result2.exit_code == 0 - digest1 = json.loads(result1.stdout)["digest"] digest2 = json.loads(result2.stdout)["digest"] assert digest1 != digest2 @@ -88,38 +61,6 @@ def test_different_files_different_digests(self, cli): class TestFastaSeqcol: """Tests for: refget fasta seqcol """ - def test_outputs_seqcol_json(self, cli, sample_fasta): - """Output is valid seqcol JSON.""" - result = cli("fasta", "seqcol", str(sample_fasta)) - - data = assert_json_output(result, ["names", "lengths", "sequences"]) - assert isinstance(data["names"], list) - assert isinstance(data["lengths"], list) - assert isinstance(data["sequences"], list) - - def test_seqcol_array_lengths_match(self, cli, sample_fasta): - """All seqcol arrays have same length.""" - result = cli("fasta", "seqcol", str(sample_fasta)) - - assert result.exit_code == 0 - data = json.loads(result.stdout) - n_seqs = len(data["names"]) - assert len(data["lengths"]) == n_seqs - assert len(data["sequences"]) == n_seqs - - def test_output_to_file(self, cli, 
sample_fasta, tmp_path): - """Writes to file with -o option.""" - output = tmp_path / "out.seqcol.json" - result = cli("fasta", "seqcol", str(sample_fasta), "-o", str(output)) - - assert result.exit_code == 0 - assert output.exists() - - data = json.loads(output.read_text()) - assert "names" in data - assert "lengths" in data - assert "sequences" in data - def test_known_seqcol(self, cli): """Verify seqcol matches expected values for known file.""" result = cli("fasta", "seqcol", str(BASE_FASTA)) @@ -131,41 +72,20 @@ def test_known_seqcol(self, cli): assert data["names"] == expected["names"] assert data["lengths"] == expected["lengths"] - def test_gzipped_file(self, cli, sample_fasta_gz): - """Handles gzipped FASTA files.""" - result = cli("fasta", "seqcol", str(sample_fasta_gz)) + def test_output_to_file(self, cli, sample_fasta, tmp_path): + """Writes to file with -o option.""" + output = tmp_path / "out.seqcol.json" + result = cli("fasta", "seqcol", str(sample_fasta), "-o", str(output)) assert result.exit_code == 0 - data = json.loads(result.stdout) + assert output.exists() + data = json.loads(output.read_text()) assert "names" in data class TestFastaFai: """Tests for: refget fasta fai """ - def test_outputs_fai_format(self, cli, sample_fasta, tmp_path): - """Outputs valid FAI format.""" - output = tmp_path / "test.fa.fai" - result = cli("fasta", "fai", str(sample_fasta), "-o", str(output)) - - assert result.exit_code == 0 - assert output.exists() - - # FAI format: name\tlength\toffset\tline_bases\tline_width - lines = output.read_text().strip().split("\n") - assert len(lines) > 0 - for line in lines: - parts = line.split("\t") - assert len(parts) >= 2 # At least name and length - - def test_fai_to_stdout(self, cli, sample_fasta): - """Outputs FAI to stdout when no -o specified.""" - result = cli("fasta", "fai", str(sample_fasta)) - - assert result.exit_code == 0 - lines = result.stdout.strip().split("\n") - assert len(lines) > 0 - def test_fai_sequence_count(self, cli, multi_seq_fasta, tmp_path): """FAI has one line per sequence.""" output = tmp_path / "test.fa.fai" @@ -179,42 +99,13 @@ def test_fai_sequence_count(self, cli, multi_seq_fasta, tmp_path): class TestFastaChromSizes: """Tests for: refget fasta chrom-sizes """ - def test_outputs_chrom_sizes(self, cli, sample_fasta, tmp_path): - """Outputs valid chrom.sizes format.""" - output = tmp_path / "test.chrom.sizes" - result = cli("fasta", "chrom-sizes", str(sample_fasta), "-o", str(output)) - - assert result.exit_code == 0 - assert output.exists() - - # Format: name\tlength - lines = output.read_text().strip().split("\n") - for line in lines: - parts = line.split("\t") - assert len(parts) == 2 - assert parts[1].isdigit() - - def test_chrom_sizes_to_stdout(self, cli, sample_fasta): - """Outputs chrom.sizes to stdout when no -o specified.""" - result = cli("fasta", "chrom-sizes", str(sample_fasta)) - - assert result.exit_code == 0 - lines = result.stdout.strip().split("\n") - assert len(lines) > 0 - for line in lines: - parts = line.split("\t") - assert len(parts) == 2 - def test_chrom_sizes_values(self, cli): """Verify chrom.sizes values for known file.""" result = cli("fasta", "chrom-sizes", str(BASE_FASTA)) assert result.exit_code == 0 - lines = result.stdout.strip().split("\n") - - # base.fa has chrX(8), chr1(4), chr2(4) sizes = {} - for line in lines: + for line in result.stdout.strip().split("\n"): name, length = line.split("\t") sizes[name] = int(length) @@ -226,62 +117,32 @@ def test_chrom_sizes_values(self, cli): class 
TestFastaIndex: """Tests for: refget fasta index """ - def test_creates_fai_file(self, cli, sample_fasta): - """Creates .fai file.""" - result = cli("fasta", "index", str(sample_fasta)) - - assert result.exit_code == 0 - fai_path = Path(str(sample_fasta) + ".fai") - assert fai_path.exists() - - def test_creates_seqcol_file(self, cli, sample_fasta): - """Creates .seqcol.json file.""" - result = cli("fasta", "index", str(sample_fasta)) + def test_index_creates_all_files(self, cli, sample_fasta): + """Index with --json lists all 5 created files.""" + result = cli("fasta", "index", str(sample_fasta), "--json") assert result.exit_code == 0 - seqcol_path = sample_fasta.parent / f"{sample_fasta.stem}.seqcol.json" - assert seqcol_path.exists() - - data = json.loads(seqcol_path.read_text()) - assert "names" in data - - def test_creates_chrom_sizes_file(self, cli, sample_fasta): - """Creates .chrom.sizes file.""" - result = cli("fasta", "index", str(sample_fasta)) - - assert result.exit_code == 0 - sizes_path = sample_fasta.parent / f"{sample_fasta.stem}.chrom.sizes" - assert sizes_path.exists() - - def test_index_summary_output(self, cli, sample_fasta): - """Index command provides summary output.""" - result = cli("fasta", "index", str(sample_fasta)) - - assert result.exit_code == 0 - # Should indicate files created - assert len(result.stdout) > 0 + data = json.loads(result.stdout) + assert len(data["files_created"]) == 5 + extensions = [Path(f).suffix for f in data["files_created"]] + assert ".fai" in extensions + assert ".json" in extensions + assert ".rgsi" in extensions + assert ".rgci" in extensions class TestFastaStats: """Tests for: refget fasta stats """ - def test_outputs_stats_json(self, cli, sample_fasta): - """Outputs statistics in JSON format.""" - result = cli("fasta", "stats", str(sample_fasta), "--json") - - data = assert_json_output(result, ["sequences", "total_length"]) - assert isinstance(data["sequences"], int) - assert data["sequences"] > 0 - - def test_stats_values(self, cli, sample_fasta): - """Stats values are correct.""" - result = cli("fasta", "stats", str(sample_fasta), "--json") + def test_stats_known_file(self, cli): + """Stats for known test file.""" + result = cli("fasta", "stats", str(BASE_FASTA), "--json") assert result.exit_code == 0 data = json.loads(result.stdout) - # sample_fasta has 2 sequences, each 8 bases - assert data["sequences"] == 2 + # base.fa: chrX(8), chr1(4), chr2(4) = 16 total + assert data["sequences"] == 3 assert data["total_length"] == 16 def test_stats_plain_output(self, cli, sample_fasta): @@ -289,20 +150,8 @@ def test_stats_plain_output(self, cli, sample_fasta): result = cli("fasta", "stats", str(sample_fasta)) assert result.exit_code == 0 - # Should have some output assert len(result.stdout.strip()) > 0 - def test_stats_known_file(self, cli): - """Stats for known test file.""" - result = cli("fasta", "stats", str(BASE_FASTA), "--json") - - assert result.exit_code == 0 - data = json.loads(result.stdout) - - # base.fa: chrX(8), chr1(4), chr2(4) = 16 total - assert data["sequences"] == 3 - assert data["total_length"] == 16 - class TestFastaValidate: """Tests for: refget fasta validate """ @@ -310,57 +159,73 @@ class TestFastaValidate: def test_valid_fasta(self, cli, sample_fasta): """Valid FASTA passes validation.""" result = cli("fasta", "validate", str(sample_fasta)) - assert result.exit_code == 0 def test_invalid_fasta_exits_nonzero(self, cli, tmp_path): """Invalid FASTA fails validation.""" invalid = tmp_path / "invalid.fa" 
invalid.write_text("This is not a valid FASTA file\nNo headers here\n") - result = cli("fasta", "validate", str(invalid)) - - # Should fail with non-zero exit code assert result.exit_code != 0 -class TestFastaErrorHandling: - """Test error handling for fasta commands.""" +class TestFastaRgsi: + """Tests for: refget fasta rgsi """ - def test_nonexistent_file(self, cli): - """Graceful error for nonexistent file.""" - result = cli("fasta", "digest", "/path/to/nonexistent.fa") + def test_rgsi_format_and_content(self, cli, sample_fasta): + """Creates .rgsi with correct headers, columns, and sequence data.""" + result = cli("fasta", "rgsi", str(sample_fasta)) - assert result.exit_code != 0 - # Should have informative error message - assert ( - len(result.stdout) > 0 or len(result.stderr if hasattr(result, "stderr") else "") > 0 - ) - - def test_empty_fasta(self, cli, tmp_path): - """Handle empty FASTA file.""" - empty = tmp_path / "empty.fa" - empty.write_text("") - - result = cli("fasta", "stats", str(empty), "--json") - - # May succeed with 0 sequences or fail gracefully - if result.exit_code == 0: - data = json.loads(result.stdout) - assert data["sequences"] == 0 - - def test_permission_denied(self, cli, tmp_path): - """Handle permission denied.""" - # This test may be skipped on systems where we can't change permissions - protected = tmp_path / "protected.fa" - protected.write_text(">chr1\nACGT\n") - - import os - import stat - - try: - os.chmod(protected, 0o000) - result = cli("fasta", "digest", str(protected)) - assert result.exit_code != 0 - finally: - os.chmod(protected, stat.S_IRUSR | stat.S_IWUSR) + assert result.exit_code == 0 + rgsi_path = sample_fasta.parent / f"{sample_fasta.stem}.rgsi" + assert rgsi_path.exists() + + content = rgsi_path.read_text() + assert "##seqcol_digest=" in content + assert "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription" in content + + data_lines = [l for l in content.strip().split("\n") if not l.startswith("#")] + assert len(data_lines) == 2 # sample_fasta has 2 sequences + + # Verify first sequence + cols = data_lines[0].split("\t") + assert len(cols) == 6 + assert cols[0] == "chr1" + assert cols[1] == "8" + + def test_rgsi_custom_output(self, cli, sample_fasta, tmp_path): + """Writes to a custom output path with -o.""" + custom_output = tmp_path / "custom.rgsi" + result = cli("fasta", "rgsi", str(sample_fasta), "-o", str(custom_output)) + + assert result.exit_code == 0 + assert custom_output.exists() + + +class TestFastaRgci: + """Tests for: refget fasta rgci """ + + def test_rgci_format_and_digest(self, cli, sample_fasta): + """Creates .rgci with correct columns, and digest matches fasta digest.""" + # Get expected digest + digest_result = cli("fasta", "digest", str(sample_fasta)) + expected_digest = json.loads(digest_result.stdout)["digest"] + + # Generate RGCI + result = cli("fasta", "rgci", str(sample_fasta)) + assert result.exit_code == 0 + + rgci_path = sample_fasta.parent / f"{sample_fasta.stem}.rgci" + content = rgci_path.read_text() + lines = content.strip().split("\n") + + # Header has 8 columns + header_cols = lines[0].lstrip("#").split("\t") + assert len(header_cols) == 8 + assert header_cols[0] == "digest" + + # Data row: correct column count, digest matches, n_sequences correct + data_cols = lines[1].split("\t") + assert len(data_cols) == 8 + assert data_cols[0] == expected_digest + assert data_cols[1] == "2" # sample_fasta has 2 sequences From 49529744370dbdd75e16e329049bd42c17f84d6e Mon Sep 17 00:00:00 2001 From: nsheff Date: Tue, 3 
Mar 2026 14:27:17 -0500 Subject: [PATCH 16/31] Add inventory_genomes.py for brickyard FASTA inventory Stdlib-only script that walks the brickyard directory tree, extracts metadata (accession, group, source, build), cross-references PEP, and outputs refgenomes_inventory.csv. --- changelog.md | 64 +++++++ .../ref-genome-analysis/inventory_genomes.py | 179 ++++++++++++++++++ refget/cli/store.py | 1 + tests/conftest.py | 2 +- tests/test_cli/test_store_commands.py | 2 +- 5 files changed, 246 insertions(+), 2 deletions(-) create mode 100644 changelog.md create mode 100644 data_loaders/ref-genome-analysis/inventory_genomes.py diff --git a/changelog.md b/changelog.md new file mode 100644 index 0000000..65c3702 --- /dev/null +++ b/changelog.md @@ -0,0 +1,64 @@ +# Changelog + +All notable changes to the refget package will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.11.0] - 2026-02-28 + +This is a major release with significant restructuring, new features, and improved tooling. + +### Added + +- **CLI overhaul**: New `refget` CLI built with Typer, including subcommands for `store`, `seqcol`, `fasta`, `config`, and `admin` +- **Local store**: `refget store pull` command to pull sequence collections from remote servers to a local store +- **FASTA digesting**: `refget fasta digest` CLI command for computing sequence collection digests from FASTA files +- **Sequence collection similarities**: `calc_similarities` and `calc_similarities_from_json` functions with Jaccard similarity metrics and API endpoint +- **FASTA DRS objects**: `FastaDrsObject` model for serving FASTA files via DRS endpoints +- **Comparison interpreter**: Local sequence collection comparison interpretation module (SCIM) +- **Species filtering**: Filter similarities endpoint by species +- **Human-readable names**: `human_readable_name` field on `SequenceCollection` model +- **Pydantic API models**: Structured response models for API endpoints (fixes #33) +- **Swagger documentation**: API query parameter documentation +- **Frontend features**: Strip plots, one-to-many comparison view, FASTA digest tool, species selector, SCIM integration, dynamic version display +- **Compliance testing**: Comprehensive API compliance test suite +- **Integration test framework**: New integration test infrastructure with ephemeral databases +- **CLI test suite**: Extensive CLI tests covering store, seqcol, fasta, config, admin, and help commands +- **Service info**: `/service-info` endpoints for fasta_drs and refget_store features +- **Attribute listing**: `/list/attributes` endpoint per GA4GH paging guide +- **Bulk query**: Preload and bulk query support for sequence collections +- **R package**: First pass at `refget-r` R bindings (experimental) + +### Changed + +- **Switched to gtars**: Replaced pyfaidx and henge with gtars for FASTA parsing and digest computation +- **Major code restructure**: Consolidated schemas, reorganized modules, reduced code duplication +- **Improved error messages**: Better dependency error messages (fixes #49), clearer import errors +- **Performance optimizations**: Faster level 2 retrieval using `get_many`, optimized similarity calculations +- **Updated GA4GH compliance**: Aligned with latest refget sequence collections specification +- **Schema consolidation**: Single unified schema replacing multiple schema files +- **Collated attribute validation**: Validation 
for collated attributes in sequence collections +- **Frontend overhaul**: Updated comparison view, heatmap aliases, loading states, error handling + +### Removed + +- **Henge dependency**: Removed henge and biopython requirements +- **Legacy code**: Removed old flags code, duplicate functions, unused yacman imports + +### Fixed + +- `from_PySequenceCollection` construction and associated tests +- Circular dependency import issues in utilities +- Level 1 model representation +- Comparison links +- Cancel handling in frontend +- Various linting and type hint improvements + +### Security + +- Bumped frontend dependencies: vite, minimatch, rollup, esbuild, js-yaml, vega + +## [0.10.1] - 2025-06-01 + +Previous release. See git history for details. diff --git a/data_loaders/ref-genome-analysis/inventory_genomes.py b/data_loaders/ref-genome-analysis/inventory_genomes.py new file mode 100644 index 0000000..d741601 --- /dev/null +++ b/data_loaders/ref-genome-analysis/inventory_genomes.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +""" +Inventory all FASTA files in the brickyard refgenomes directory. + +Walks the brickyard directory tree, extracts structured metadata from paths +and filenames, cross-references against the PEP project, and produces a +master CSV inventory. + +Zero non-stdlib dependencies. + +Usage: + python inventory_genomes.py + python inventory_genomes.py --dry-run --no-pep + python inventory_genomes.py --root /tmp/mock_brickyard --dry-run --no-pep +""" + +import argparse +import csv +import json +import os +import os.path +import pathlib +import re +import sys +import urllib.error +import urllib.request + +BRICKYARD_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" +PEP_URL = "https://pephub-api.databio.org/api/v1/projects/donaldcampbelljr/human_mouse_fasta_brickyard/samples?tag=default" +OUTPUT_FILE = os.path.join(BRICKYARD_ROOT, "refgenomes_inventory.csv") +FASTA_EXTENSIONS = {".fa", ".fa.gz", ".fna", ".fna.gz", ".fasta", ".fasta.gz"} +ACCESSION_PATTERN = re.compile(r"(GC[AF]_\d+\.\d+)") + + +def fetch_pep_samples(): + """Fetch PEP samples from the PEPHub API. + + Returns a dict mapping absolute fasta path to sample_name. + Falls back to an empty dict if the API is unreachable. + """ + try: + with urllib.request.urlopen(PEP_URL) as response: + data = json.loads(response.read().decode("utf-8")) + lookup = {} + for item in data.get("items", []): + fasta_path = item.get("fasta", "") + sample_name = item.get("sample_name", "") + if fasta_path: + lookup[fasta_path] = sample_name + print(f"Fetched {len(lookup)} PEP samples.", file=sys.stderr) + return lookup + except urllib.error.URLError as e: + print(f"Warning: Could not fetch PEP samples: {e}", file=sys.stderr) + return {} + + +def walk_fasta_files(root): + """Walk the directory tree and yield absolute paths of FASTA files.""" + for dirpath, _dirnames, filenames in os.walk(root): + for name in filenames: + if any(name.endswith(ext) for ext in FASTA_EXTENSIONS): + yield os.path.join(dirpath, name) + + +def extract_metadata(filepath, root): + """Extract structured metadata from a FASTA file path. + + Returns a dict with: path, filename, accession, group, source, build. 
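+
+    Example (hypothetical path): with root=/data/refgenomes and
+    filepath=/data/refgenomes/human/ncbi/GRCh38/GCA_000001405.15_genomic.fna.gz,
+    the relative parts are ("human", "ncbi", "GRCh38", filename), so
+    group="human", source="ncbi", build="GRCh38", and the accession pattern
+    captures "GCA_000001405.15" from the filename.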
+ """ + filename = os.path.basename(filepath) + match = ACCESSION_PATTERN.search(filename) + accession = match.group(1) if match else "" + + rel = os.path.relpath(filepath, root) + parts = pathlib.PurePosixPath(rel).parts + # parts[0] = group, parts[1] = source, parts[2] = build (or subdir), parts[-1] = filename + group = parts[0] if len(parts) > 1 else "" + source = parts[1] if len(parts) > 2 else "" + build = parts[2] if len(parts) > 3 else "" + + return { + "path": filepath, + "filename": filename, + "accession": accession, + "group": group, + "source": source, + "build": build, + } + + +def add_pep_info(record, pep_lookup): + """Add PEP sample name to a record if it exists in the lookup.""" + record["pep_sample_name"] = pep_lookup.get(record["path"], "") + + +def write_inventory(records, output_path): + """Write the inventory records to a CSV file.""" + fieldnames = ["path", "filename", "accession", "group", "source", "build", "pep_sample_name"] + with open(output_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(records) + print(f"Wrote {len(records)} records to {output_path}", file=sys.stderr) + + +def main(): + parser = argparse.ArgumentParser( + description="Inventory FASTA files in the brickyard refgenomes directory." + ) + parser.add_argument( + "--root", + default=BRICKYARD_ROOT, + help=f"Root directory to scan (default: {BRICKYARD_ROOT})", + ) + parser.add_argument( + "--output", + default=None, + help=f"Output CSV path (default: /refgenomes_inventory.csv)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print the first 10 rows to stdout instead of writing CSV.", + ) + parser.add_argument( + "--no-pep", + action="store_true", + help="Skip PEP fetching (useful for offline HPC nodes).", + ) + args = parser.parse_args() + + root = args.root + output_path = args.output if args.output else os.path.join(root, "refgenomes_inventory.csv") + + # Step 1: Fetch PEP samples + if args.no_pep: + pep_lookup = {} + print("Skipping PEP fetch (--no-pep).", file=sys.stderr) + else: + pep_lookup = fetch_pep_samples() + + # Step 2: Walk and collect FASTA files + print(f"Scanning {root} ...", file=sys.stderr) + records = [] + for filepath in walk_fasta_files(root): + record = extract_metadata(filepath, root) + add_pep_info(record, pep_lookup) + records.append(record) + + # Step 3: Sort for deterministic output + records.sort(key=lambda r: r["path"]) + + # Step 4: Output + if args.dry_run: + fieldnames = ["path", "filename", "accession", "group", "source", "build", "pep_sample_name"] + writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames) + writer.writeheader() + for row in records[:10]: + writer.writerow(row) + else: + write_inventory(records, output_path) + + # Step 5: Summary stats + total = len(records) + with_accession = sum(1 for r in records if r["accession"]) + in_pep = sum(1 for r in records if r["pep_sample_name"]) + unique_groups = len({r["group"] for r in records if r["group"]}) + unique_sources = len({r["source"] for r in records if r["source"]}) + + print(f"\nSummary:", file=sys.stderr) + print(f" Total FASTA files: {total}", file=sys.stderr) + print(f" Files with accessions: {with_accession}", file=sys.stderr) + print(f" Files in PEP: {in_pep}", file=sys.stderr) + print(f" Unique groups: {unique_groups}", file=sys.stderr) + print(f" Unique sources: {unique_sources}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/refget/cli/store.py b/refget/cli/store.py index 
9334630..6b97082 100644 --- a/refget/cli/store.py +++ b/refget/cli/store.py @@ -488,6 +488,7 @@ def pull( ), remote: Optional[str] = typer.Option( None, + "--server", "--remote", "-r", help="Remote store URL (default: try configured remote_stores)", diff --git a/tests/conftest.py b/tests/conftest.py index 536bc8c..1d26369 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,7 +38,7 @@ @pytest.fixture def runner(): """Typer CLI test runner.""" - return CliRunner() + return CliRunner(mix_stderr=False) @pytest.fixture diff --git a/tests/test_cli/test_store_commands.py b/tests/test_cli/test_store_commands.py index 120a53d..5544151 100644 --- a/tests/test_cli/test_store_commands.py +++ b/tests/test_cli/test_store_commands.py @@ -502,7 +502,7 @@ def test_metadata_no_fhr_set(self, cli, tmp_path): result = cli("store", "metadata", digest, "--path", str(store_path)) assert result.exit_code != 0 - assert "No FHR metadata" in result.stdout + assert "No FHR metadata" in result.stderr def test_metadata_set_from_json_file(self, cli, tmp_path): """Happy path: set FHR metadata from a JSON file.""" From c49e55f4c6c3002537d8e53bca4e2971fbf424bd Mon Sep 17 00:00:00 2001 From: nsheff Date: Tue, 3 Mar 2026 15:50:01 -0500 Subject: [PATCH 17/31] parallel encoding for loading fasta --- changelog.md | 64 --------- .../ref-genome-analysis/build_refgetstore.py | 129 ++++++++++++++++++ .../riva_pangenome_analysis/README.md | 28 +++- refget/cli/store.py | 10 +- 4 files changed, 162 insertions(+), 69 deletions(-) delete mode 100644 changelog.md create mode 100644 data_loaders/ref-genome-analysis/build_refgetstore.py diff --git a/changelog.md b/changelog.md deleted file mode 100644 index 65c3702..0000000 --- a/changelog.md +++ /dev/null @@ -1,64 +0,0 @@ -# Changelog - -All notable changes to the refget package will be documented in this file. - -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - -## [0.11.0] - 2026-02-28 - -This is a major release with significant restructuring, new features, and improved tooling. 
- -### Added - -- **CLI overhaul**: New `refget` CLI built with Typer, including subcommands for `store`, `seqcol`, `fasta`, `config`, and `admin` -- **Local store**: `refget store pull` command to pull sequence collections from remote servers to a local store -- **FASTA digesting**: `refget fasta digest` CLI command for computing sequence collection digests from FASTA files -- **Sequence collection similarities**: `calc_similarities` and `calc_similarities_from_json` functions with Jaccard similarity metrics and API endpoint -- **FASTA DRS objects**: `FastaDrsObject` model for serving FASTA files via DRS endpoints -- **Comparison interpreter**: Local sequence collection comparison interpretation module (SCIM) -- **Species filtering**: Filter similarities endpoint by species -- **Human-readable names**: `human_readable_name` field on `SequenceCollection` model -- **Pydantic API models**: Structured response models for API endpoints (fixes #33) -- **Swagger documentation**: API query parameter documentation -- **Frontend features**: Strip plots, one-to-many comparison view, FASTA digest tool, species selector, SCIM integration, dynamic version display -- **Compliance testing**: Comprehensive API compliance test suite -- **Integration test framework**: New integration test infrastructure with ephemeral databases -- **CLI test suite**: Extensive CLI tests covering store, seqcol, fasta, config, admin, and help commands -- **Service info**: `/service-info` endpoints for fasta_drs and refget_store features -- **Attribute listing**: `/list/attributes` endpoint per GA4GH paging guide -- **Bulk query**: Preload and bulk query support for sequence collections -- **R package**: First pass at `refget-r` R bindings (experimental) - -### Changed - -- **Switched to gtars**: Replaced pyfaidx and henge with gtars for FASTA parsing and digest computation -- **Major code restructure**: Consolidated schemas, reorganized modules, reduced code duplication -- **Improved error messages**: Better dependency error messages (fixes #49), clearer import errors -- **Performance optimizations**: Faster level 2 retrieval using `get_many`, optimized similarity calculations -- **Updated GA4GH compliance**: Aligned with latest refget sequence collections specification -- **Schema consolidation**: Single unified schema replacing multiple schema files -- **Collated attribute validation**: Validation for collated attributes in sequence collections -- **Frontend overhaul**: Updated comparison view, heatmap aliases, loading states, error handling - -### Removed - -- **Henge dependency**: Removed henge and biopython requirements -- **Legacy code**: Removed old flags code, duplicate functions, unused yacman imports - -### Fixed - -- `from_PySequenceCollection` construction and associated tests -- Circular dependency import issues in utilities -- Level 1 model representation -- Comparison links -- Cancel handling in frontend -- Various linting and type hint improvements - -### Security - -- Bumped frontend dependencies: vite, minimatch, rollup, esbuild, js-yaml, vega - -## [0.10.1] - 2025-06-01 - -Previous release. See git history for details. diff --git a/data_loaders/ref-genome-analysis/build_refgetstore.py b/data_loaders/ref-genome-analysis/build_refgetstore.py new file mode 100644 index 0000000..7f3e09a --- /dev/null +++ b/data_loaders/ref-genome-analysis/build_refgetstore.py @@ -0,0 +1,129 @@ +""" +Build a RefgetStore from the refgenomes inventory CSV. 
+ +Reads refgenomes_inventory.csv and populates a RefgetStore with all FASTA +files. No alias registration -- that is a separate, deliberate step. + +Usage: + python build_refgetstore.py [--inventory PATH] [--store-path PATH] [--output PATH] [--limit N] +""" + +import argparse +import csv +import sys +import time + +from refget.store import RefgetStore + +STORE_PATH = "/project/shefflab/brickyard/refget_store" +INVENTORY_CSV = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refgenomes_inventory.csv" +OUTPUT_CSV = "digest_map.csv" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Build RefgetStore from inventory CSV") + parser.add_argument("--inventory", default=INVENTORY_CSV, help="Input inventory CSV") + parser.add_argument("--store-path", default=STORE_PATH, help="RefgetStore path") + parser.add_argument("--output", default=OUTPUT_CSV, help="Output digest map CSV") + parser.add_argument("--limit", type=int, default=None, help="Process only first N rows (for testing)") + return parser.parse_args() + + +def read_inventory(csv_path): + """Read inventory CSV and return list of row dicts.""" + rows = [] + with open(csv_path, newline="") as f: + reader = csv.DictReader(f) + if reader.fieldnames is None: + print(f"ERROR: {csv_path} appears to be empty", file=sys.stderr) + sys.exit(1) + if "path" not in reader.fieldnames: + print(f"ERROR: {csv_path} missing required 'path' column", file=sys.stderr) + sys.exit(1) + for row in reader: + rows.append(row) + return rows + + +def write_digest_map(output_path, results): + """Write results to digest_map.csv.""" + fieldnames = ["path", "filename", "digest", "n_sequences", "was_new", "error"] + with open(output_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(results) + + +def main(): + args = parse_args() + + inventory = read_inventory(args.inventory) + if args.limit: + inventory = inventory[:args.limit] + print(f"Limited to first {args.limit} records") + total = len(inventory) + print(f"Processing {total} records from {args.inventory}") + + store = RefgetStore.on_disk(args.store_path) + store.set_quiet(True) + print(f"Store initialized at {args.store_path}") + + results = [] + n_success = 0 + n_failed = 0 + n_new = 0 + t_start = time.time() + + for i, row in enumerate(inventory, 1): + fasta_path = row["path"] + filename = row.get("filename", "") + + t0 = time.time() + print(f"[{i}/{total}] {filename}...", end=" ", flush=True) + + try: + meta, was_new = store.add_sequence_collection_from_fasta(fasta_path) + elapsed = time.time() - t0 + status = "NEW" if was_new else "exists" + if was_new: + n_new += 1 + print(f"{meta.digest} ({meta.n_sequences} seqs, {status}, {elapsed:.1f}s)") + n_success += 1 + results.append({ + "path": fasta_path, + "filename": filename, + "digest": meta.digest, + "n_sequences": meta.n_sequences, + "was_new": was_new, + "error": "", + }) + except Exception as e: + error_msg = f"{type(e).__name__}: {e}" + print(f"FAILED: {error_msg}") + n_failed += 1 + results.append({ + "path": fasta_path, + "filename": filename, + "digest": "", + "n_sequences": 0, + "was_new": False, + "error": error_msg, + }) + + write_digest_map(args.output, results) + + total_time = time.time() - t_start + print(f"\nDone in {total_time:.1f}s. 
{n_success}/{total} succeeded, {n_new} new, {n_failed} failed.") + print(f"Digest map written to {args.output}") + print(f"\nStore stats: {store.stats()}") + + if n_failed > 0: + print(f"\nFailed files:") + for r in results: + if r["error"]: + print(f" {r['filename']}: {r['error']}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/riva_pangenome_analysis/README.md b/data_loaders/riva_pangenome_analysis/README.md index 9ecce89..3acd622 100644 --- a/data_loaders/riva_pangenome_analysis/README.md +++ b/data_loaders/riva_pangenome_analysis/README.md @@ -1,5 +1,23 @@ # RIVA Pangenome RefgetStore +## Prep + + +```sh +# Build gtars +cd ~/code/gtars +git checkout refgetstore +git pull +cd gtars-python +python -m pip install -e . + +# Next, install local refget: +cd ~/code/refget +git checkout dev +git pull +python -m pip install -e . +``` + ## Build the store ```python @@ -27,7 +45,7 @@ import os from pathlib import Path from refget.store import RefgetStore -store_dir = Path(os.path.expandvars("$BRICKYARD/datasets_downloaded/pangenome_fasta/refget_store2")) +store_dir = Path(os.path.expandvars("$BRICKYARD/datasets_downloaded/pangenome_fasta/refget_store")) store = RefgetStore.on_disk(str(store_dir)) @@ -35,10 +53,12 @@ store.list_collections() cm = store.get_collection_metadata("s0nMiOFHPsIBrm2bd3PkzWXKLKWQZq70") -EXAMPLE_COLLECTION = "0aHV7I-94paL9Z1H4LNlqsW3WxJhlou5" -EXAMPLE_SEQ_NAME = "JAGYVX010000006.1 unmasked:primary_assembly HG03540.pri.mat.f1_v2:JAGYVX010000006.1:1:96320881:1" +EXAMPLE_COLLECTION = "L5fggdWYz5tCr4v8XbPYoOwv79Sqmf1W" +EXAMPLE_SEQ_NAME = "JAGYVI010000261.1" + + +record = store.get_sequence_by_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) -record = store.get_sequence_by_collection_and_name(EXAMPLE_COLLECTION, EXAMPLE_SEQ_NAME) ## Upload to S3 diff --git a/refget/cli/store.py b/refget/cli/store.py index 6b97082..d2eb603 100644 --- a/refget/cli/store.py +++ b/refget/cli/store.py @@ -190,6 +190,12 @@ def add( "-q", help="Suppress progress output", ), + threads: Optional[int] = typer.Option( + None, + "--threads", + "-t", + help="Number of threads for parallel encoding (default: all CPUs)", + ), ) -> None: """ Import a FASTA file to the local store. 
@@ -220,7 +226,9 @@ def add( store.set_encoding_mode(StorageMode.Encoded) # Add the FASTA file - returns (metadata, was_new) with all info we need - metadata, was_new = store.add_sequence_collection_from_fasta(str(fasta.resolve())) + metadata, was_new = store.add_sequence_collection_from_fasta( + str(fasta.resolve()), threads=threads + ) print_json( { From ba6f35f2a3af69561d7e19195b466737760c8c27 Mon Sep 17 00:00:00 2001 From: nsheff Date: Tue, 3 Mar 2026 21:43:18 -0500 Subject: [PATCH 18/31] add explorer, store builders --- data_loaders/load_demo_seqcols.py | 2 +- .../process-all-genomes.sbatch | 15 + .../ref-genome-analysis/verify_refgetstore.py | 444 ++++++++++++++++++ frontend/src/components/CliSnippet.jsx | 136 ++++++ frontend/src/components/StoreNav.jsx | 222 +++++++++ frontend/src/main.jsx | 38 ++ frontend/src/pages/StoreAliases.jsx | 198 ++++++++ frontend/src/pages/StoreCollection.jsx | 279 +++++++++++ frontend/src/pages/StoreExplorer.jsx | 135 ++++++ frontend/src/pages/StoreOverview.jsx | 309 ++++++++++++ frontend/src/pages/StoreSequences.jsx | 337 +++++++++++++ frontend/src/services/fetchData.jsx | 28 +- frontend/src/services/storeService.js | 217 +++++++++ frontend/src/stores/explorerStore.js | 113 +++++ 14 files changed, 2462 insertions(+), 11 deletions(-) create mode 100644 data_loaders/ref-genome-analysis/process-all-genomes.sbatch create mode 100644 data_loaders/ref-genome-analysis/verify_refgetstore.py create mode 100644 frontend/src/components/CliSnippet.jsx create mode 100644 frontend/src/components/StoreNav.jsx create mode 100644 frontend/src/pages/StoreAliases.jsx create mode 100644 frontend/src/pages/StoreCollection.jsx create mode 100644 frontend/src/pages/StoreExplorer.jsx create mode 100644 frontend/src/pages/StoreOverview.jsx create mode 100644 frontend/src/pages/StoreSequences.jsx create mode 100644 frontend/src/services/storeService.js create mode 100644 frontend/src/stores/explorerStore.js diff --git a/data_loaders/load_demo_seqcols.py b/data_loaders/load_demo_seqcols.py index 21c9499..cb49246 100644 --- a/data_loaders/load_demo_seqcols.py +++ b/data_loaders/load_demo_seqcols.py @@ -19,7 +19,7 @@ DEMO_FASTA = json.load(open("test_fasta/test_fasta_digests.json")) # Storage locations from environment (if set, will upload; otherwise use demo defaults with skip_upload) -ENV_STORAGE = json.loads(os.environ.get("FASTA_STORAGE_LOCATIONS", "[]")) +ENV_STORAGE = json.loads(os.environ.get("FASTA_STORAGE_LOCATIONS") or "[]") if ENV_STORAGE: DEMO_STORAGE = ENV_STORAGE SKIP_UPLOAD = False diff --git a/data_loaders/ref-genome-analysis/process-all-genomes.sbatch b/data_loaders/ref-genome-analysis/process-all-genomes.sbatch new file mode 100644 index 0000000..28f3de7 --- /dev/null +++ b/data_loaders/ref-genome-analysis/process-all-genomes.sbatch @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name=refgetstore +#SBATCH --output=refgetstore_%j.log +#SBATCH --error=refgetstore_%j.log +#SBATCH --partition=standard +#SBATCH --time=24:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=4 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget/data_loaders/ref-genome-analysis + +python build_refgetstore.py --store-path /project/shefflab/brickyard/refget_store diff --git a/data_loaders/ref-genome-analysis/verify_refgetstore.py b/data_loaders/ref-genome-analysis/verify_refgetstore.py new file mode 100644 index 0000000..fc53a59 --- /dev/null +++ b/data_loaders/ref-genome-analysis/verify_refgetstore.py @@ 
-0,0 +1,444 @@ +#!/usr/bin/env python3 +""" +Verification script for the brickyard RefgetStore. + +Runs automated checks against the store at STORE_PATH and produces a +structured pass/fail report. Designed to work with a partial store +(not all files loaded yet) and without aliases (alias registration +has not been done yet). + +Usage: + python verify_refgetstore.py + python verify_refgetstore.py --store-path /alt/path --limit 5 + +Expected results (update after first successful run): +- collections: ~XXX unique (out of ~1,147 input FASTAs processed so far) +- sequences: ~XXX unique +- roundtrip digest match: PASS for at least one collection +""" + +import argparse +import csv +import json +import os +import subprocess +import sys +import tempfile +import time + +STORE_PATH = "/project/shefflab/brickyard/refget_store" +INVENTORY_CSV = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refgenomes_inventory.csv" +DIGEST_MAP_CSV = "/home/nsheff/Dropbox/workspaces/refgenie/repos/refget/data_loaders/ref-genome-analysis/digest_map.csv" + +results = [] + + +def check(name, passed, detail=""): + """Record and print a check result.""" + status = "PASS" if passed else "FAIL" + results.append({"name": name, "status": status, "detail": detail}) + print(f"[{status}] {name}" + (f" -- {detail}" if detail else "")) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Verify brickyard RefgetStore") + parser.add_argument("--store-path", default=STORE_PATH, help="RefgetStore path") + parser.add_argument("--inventory", default=INVENTORY_CSV, help="Inventory CSV path") + parser.add_argument("--digest-map", default=DIGEST_MAP_CSV, help="Digest map CSV path") + parser.add_argument( + "--limit", + type=int, + default=3, + help="Number of collections to test for round-trip export (default: 3)", + ) + parser.add_argument( + "--skip-roundtrip", + action="store_true", + help="Skip round-trip FASTA export checks (slow for large genomes)", + ) + return parser.parse_args() + + +# ── Check 1: Store opens and stats are valid ─────────────────────────── + + +def check_store_opens(store_path): + """Open the store and verify basic stats.""" + try: + from refget.store import RefgetStore + + store = RefgetStore.open_local(store_path) + check("store_opens", True, f"path={store_path}") + except Exception as e: + check("store_opens", False, f"path={store_path}, error={e}") + return None + + # Count collections and sequences + try: + collections = list(store.list_collections()) + n_collections = len(collections) + except Exception as e: + check("list_collections", False, f"error={e}") + n_collections = 0 + + try: + sequences = list(store.list_sequences()) + n_sequences = len(sequences) + except Exception as e: + check("list_sequences", False, f"error={e}") + n_sequences = 0 + + check("collections_nonzero", n_collections > 0, f"collections={n_collections}") + check("sequences_nonzero", n_sequences > 0, f"sequences={n_sequences}") + + # Stats object + try: + stats = store.stats() + check("stats_callable", True, f"stats={stats}") + except Exception as e: + check("stats_callable", False, f"error={e}") + + return store + + +# ── Check 2: Digest map coverage ────────────────────────────────────── + + +def check_digest_map(store, digest_map_path): + """Verify that digests in the digest map are present in the store.""" + if not os.path.exists(digest_map_path): + check("digest_map_exists", False, f"not found: {digest_map_path}") + return + + check("digest_map_exists", True, f"path={digest_map_path}") + + # 
Read digest map + rows = [] + with open(digest_map_path, newline="") as f: + reader = csv.DictReader(f) + for row in reader: + rows.append(row) + + total = len(rows) + with_digest = [r for r in rows if r.get("digest")] + with_error = [r for r in rows if r.get("error")] + + check( + "digest_map_stats", + len(with_digest) > 0, + f"total_rows={total}, with_digest={len(with_digest)}, with_error={len(with_error)}", + ) + + # Get store collection digests for comparison + store_digests = {meta.digest for meta in store.list_collections()} + + # Check how many digest_map digests are in the store + matched = 0 + missing = [] + for row in with_digest: + d = row["digest"] + if d in store_digests: + matched += 1 + else: + missing.append(d[:16] + "...") + + check( + "digest_map_coverage", + matched == len(with_digest), + f"in_store={matched}/{len(with_digest)}" + + (f", missing_sample={missing[:5]}" if missing else ""), + ) + + +# ── Check 3: Collection level2 data integrity ───────────────────────── + + +def check_level2_integrity(store, n_to_check=3): + """Verify level2 data for a sample of collections.""" + collections = list(store.list_collections()) + if not collections: + check("level2_integrity", False, "no collections to check") + return + + sample = collections[:n_to_check] + all_ok = True + details = [] + + for meta in sample: + digest = meta.digest + try: + level2 = store.get_collection_level2(digest) + names = level2.get("names", []) + lengths = level2.get("lengths", []) + sequences = level2.get("sequences", []) + + arrays_ok = ( + len(names) == len(lengths) == len(sequences) and len(names) > 0 + ) + lengths_ok = all(l > 0 for l in lengths) if lengths else False + + if not arrays_ok or not lengths_ok: + all_ok = False + details.append( + f"{digest[:16]}: names={len(names)} lengths={len(lengths)} " + f"sequences={len(sequences)} lengths_positive={lengths_ok}" + ) + else: + details.append( + f"{digest[:16]}: {len(names)} seqs, OK" + ) + except Exception as e: + all_ok = False + details.append(f"{digest[:16]}: ERROR {e}") + + check( + "level2_arrays_valid", + all_ok, + f"checked={len(sample)}, results=[{'; '.join(details)}]", + ) + + +# ── Check 4: Round-trip FASTA export and digest comparison ───────────── + + +def check_roundtrip_export(store, store_path, digest_map_path, inventory_path, limit=3): + """Export FASTAs from the store and compare digests to originals.""" + try: + from gtars.refget import digest_fasta + except ImportError: + check("roundtrip_export", False, "gtars.refget.digest_fasta not available") + return + + # Build a mapping from digest -> original path using digest_map + inventory + digest_to_original = {} + + if os.path.exists(digest_map_path) and os.path.exists(inventory_path): + # Read inventory to get path -> accession mapping (for reference) + inv_lookup = {} + with open(inventory_path, newline="") as f: + for row in csv.DictReader(f): + inv_lookup[row["path"]] = row + + # Read digest_map to get digest -> path mapping + with open(digest_map_path, newline="") as f: + for row in csv.DictReader(f): + if row.get("digest") and row.get("path"): + # Only keep the first mapping per digest (avoid duplicates) + if row["digest"] not in digest_to_original: + digest_to_original[row["digest"]] = row["path"] + + if not digest_to_original: + check("roundtrip_export", False, "no digest-to-path mappings found") + return + + # Pick a sample of collections that have original files + collections = list(store.list_collections()) + test_pairs = [] + for meta in collections: + if meta.digest 
in digest_to_original: + original_path = digest_to_original[meta.digest] + if os.path.exists(original_path): + test_pairs.append((meta.digest, original_path)) + if len(test_pairs) >= limit: + break + + if not test_pairs: + check("roundtrip_export", False, "no original FASTA files accessible for comparison") + return + + all_match = True + details = [] + + for digest, original_path in test_pairs: + fd, tmp_path = tempfile.mkstemp(suffix=".fa") + os.close(fd) + try: + store.export_fasta(digest, tmp_path, None, 80) + + exported_sc = digest_fasta(tmp_path) + original_sc = digest_fasta(original_path) + + match = exported_sc.digest == original_sc.digest + if not match: + all_match = False + basename = os.path.basename(original_path) + details.append( + f"{basename}: {'MATCH' if match else 'MISMATCH'} " + f"(exported={exported_sc.digest[:16]}... " + f"original={original_sc.digest[:16]}...)" + ) + except Exception as e: + all_match = False + basename = os.path.basename(original_path) + details.append(f"{basename}: ERROR {e}") + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + check( + "roundtrip_digest_match", + all_match, + f"tested={len(test_pairs)}, results=[{'; '.join(details)}]", + ) + + +# ── Check 5: CLI stats command works ────────────────────────────────── + + +def check_cli_stats(store_path): + """Verify the CLI stats command runs against the store.""" + try: + result = subprocess.run( + ["refget", "store", "stats", "--path", store_path], + capture_output=True, + text=True, + timeout=60, + ) + if result.returncode == 0: + check("cli_stats_runs", True, f"stdout={result.stdout.strip()[:200]}") + else: + check( + "cli_stats_runs", + False, + f"returncode={result.returncode}, stderr={result.stderr.strip()[:200]}", + ) + except FileNotFoundError: + check("cli_stats_runs", False, "refget CLI not found in PATH") + except subprocess.TimeoutExpired: + check("cli_stats_runs", False, "timed out after 60s") + except Exception as e: + check("cli_stats_runs", False, f"error={e}") + + +# ── Check 6: Inventory cross-reference ──────────────────────────────── + + +def check_inventory_crossref(store, inventory_path, digest_map_path): + """Cross-check inventory against digest_map to verify completeness.""" + if not os.path.exists(inventory_path): + check("inventory_exists", False, f"not found: {inventory_path}") + return + if not os.path.exists(digest_map_path): + check("inventory_crossref", False, f"digest_map not found: {digest_map_path}") + return + + # Count inventory rows + with open(inventory_path, newline="") as f: + inv_rows = list(csv.DictReader(f)) + + # Count digest_map rows + with open(digest_map_path, newline="") as f: + dm_rows = list(csv.DictReader(f)) + + inv_paths = {r["path"] for r in inv_rows} + dm_paths = {r["path"] for r in dm_rows} + + # How many inventory files have been processed? 
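+    # Set arithmetic on absolute path strings; assumes the inventory and the
+    # digest map record the same unnormalized path for a given file.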
+ processed = inv_paths & dm_paths + unprocessed = inv_paths - dm_paths + + check( + "inventory_processing_coverage", + True, # Always pass -- partial is expected + f"inventory={len(inv_rows)}, digest_map={len(dm_rows)}, " + f"processed={len(processed)}, unprocessed={len(unprocessed)}", + ) + + # Check error rate in digest_map + errors = [r for r in dm_rows if r.get("error")] + check( + "digest_map_error_rate", + len(errors) == 0, + f"errors={len(errors)}/{len(dm_rows)}" + + (f", samples={[r['filename'] + ': ' + r['error'] for r in errors[:3]]}" if errors else ""), + ) + + +# ── Summary and report ──────────────────────────────────────────────── + + +def print_summary(store_path): + """Print summary and write JSON report.""" + print("\n" + "=" * 60) + print("VERIFICATION SUMMARY") + print("=" * 60) + passed = sum(1 for r in results if r["status"] == "PASS") + failed = sum(1 for r in results if r["status"] == "FAIL") + print(f"Passed: {passed}") + print(f"Failed: {failed}") + print(f"Total: {passed + failed}") + + if failed > 0: + print("\nFailed checks:") + for r in results: + if r["status"] == "FAIL": + print(f" - {r['name']}: {r['detail']}") + + # Write JSON report next to the store + report_dir = os.path.dirname(os.path.abspath(__file__)) + report_path = os.path.join(report_dir, "verification_report.json") + with open(report_path, "w") as f: + json.dump( + {"results": results, "passed": passed, "failed": failed}, + f, + indent=2, + ) + print(f"\nJSON report: {report_path}") + + return failed + + +def main(): + args = parse_args() + store_path = args.store_path + + print(f"Verifying RefgetStore at: {store_path}") + print(f"Inventory CSV: {args.inventory}") + print(f"Digest map CSV: {args.digest_map}") + print("=" * 60) + + t_start = time.time() + + # Check 1: Store opens and stats + print("\n── Check 1: Store opens and stats ──") + store = check_store_opens(store_path) + if store is None: + print("\nStore failed to open. Cannot continue.") + print_summary(store_path) + sys.exit(1) + + # Check 2: Digest map coverage + print("\n── Check 2: Digest map coverage ──") + check_digest_map(store, args.digest_map) + + # Check 3: Level2 data integrity + print("\n── Check 3: Collection level2 data integrity ──") + check_level2_integrity(store, n_to_check=min(args.limit, 5)) + + # Check 4: Round-trip FASTA export + if args.skip_roundtrip: + print("\n── Check 4: Round-trip export (SKIPPED) ──") + check("roundtrip_digest_match", True, "skipped via --skip-roundtrip") + else: + print("\n── Check 4: Round-trip FASTA export ──") + check_roundtrip_export( + store, store_path, args.digest_map, args.inventory, limit=args.limit + ) + + # Check 5: CLI stats command + print("\n── Check 5: CLI stats command ──") + check_cli_stats(store_path) + + # Check 6: Inventory cross-reference + print("\n── Check 6: Inventory cross-reference ──") + check_inventory_crossref(store, args.inventory, args.digest_map) + + elapsed = time.time() - t_start + print(f"\nVerification completed in {elapsed:.1f}s") + + failed = print_summary(store_path) + sys.exit(1 if failed > 0 else 0) + + +if __name__ == "__main__": + main() diff --git a/frontend/src/components/CliSnippet.jsx b/frontend/src/components/CliSnippet.jsx new file mode 100644 index 0000000..61e5ec5 --- /dev/null +++ b/frontend/src/components/CliSnippet.jsx @@ -0,0 +1,136 @@ +import { useState } from 'react'; + +/** + * A copyable CLI command snippet. + * Shows a monospace command with a copy button. 
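+ * Props:
+ *   command: shell command string to display and copy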
+ */ +const CliCommand = ({ command }) => { + const [copied, setCopied] = useState(false); + + const handleCopy = () => { + navigator.clipboard.writeText(command).then(() => { + setCopied(true); + setTimeout(() => setCopied(false), 1500); + }); + }; + + return ( +
+
{command}
+ +
+ ); +}; + +/** + * A collapsible panel of CLI commands for a given context. + * Props: + * commands: [{label, command}] + */ +const CliSnippet = ({ commands }) => { + const [open, setOpen] = useState(false); + + if (!commands || commands.length === 0) return null; + + return ( +
+ + {open && ( +
+ {commands.map(({ label, command }, i) => ( +
+ {label && {label}} + +
+ ))} + + Install: pip install refget + +
+ )} +
+ ); +}; + +/** + * A small icon button for table rows that opens a modal with CLI/Python snippets. + * Props: + * snippets: [{ label, cli, python }] + * title: modal title + */ +const RowCodeButton = ({ snippets, title = 'Code' }) => { + const [show, setShow] = useState(false); + const [tab, setTab] = useState('cli'); + + return ( + <> + + {show && ( + <> +
setShow(false)} /> +
setShow(false)}> +
e.stopPropagation()}> +
+
+
+ + {title} +
+
+
+
    +
  • + +
  • +
  • + +
  • +
+ {snippets.map((snippet, i) => ( +
+ {snippet.label && {snippet.label}} + +
+ ))} +
+
+
+
+ + )} + + ); +}; + +export { CliSnippet, CliCommand, RowCodeButton }; diff --git a/frontend/src/components/StoreNav.jsx b/frontend/src/components/StoreNav.jsx new file mode 100644 index 0000000..f24d50f --- /dev/null +++ b/frontend/src/components/StoreNav.jsx @@ -0,0 +1,222 @@ +import { useState } from 'react'; +import { Link } from 'react-router-dom'; +import { useExplorerStore } from '../stores/explorerStore.js'; +import { CliCommand } from './CliSnippet.jsx'; + +const StoreNav = ({ active, storeUrlParam, collectionDigest }) => { + const [showCode, setShowCode] = useState(false); + const [codeTab, setCodeTab] = useState('cli'); + const { storeUrl } = useExplorerStore(); + + const remote = storeUrl || new URLSearchParams(storeUrlParam).get('url') || ''; + + const items = [ + { key: 'overview', label: 'Overview', path: '/explore/store', icon: 'bi-house' }, + { key: 'sequences', label: 'Sequences', path: '/explore/store/sequences', icon: 'bi-list-ol' }, + { key: 'aliases', label: 'Aliases', path: '/explore/store/aliases', icon: 'bi-tag' }, + ]; + + const snippetGroups = [ + { + heading: 'Setup', + snippets: [ + { + label: 'Subscribe to this remote store', + cli: `refget config add store \\ + ${remote}`, + python: `import refget + +refget.config.add("store", "${remote}")`, + }, + ], + }, + { + heading: 'Browse', + snippets: [ + { + label: 'List collections', + cli: `refget store list \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.list()`, + }, + { + label: 'List sequences', + cli: `refget store list --sequences \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.list(sequences=True)`, + }, + { + label: 'Store statistics', + cli: `refget store stats \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +print(store)`, + }, + ], + }, + ]; + + if (collectionDigest) { + snippetGroups.push({ + heading: 'Collection', + snippets: [ + { + label: 'Get collection metadata', + cli: `refget store get \\ + ${collectionDigest} \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.get("${collectionDigest}")`, + }, + { + label: 'Pull collection to local cache', + cli: `refget store pull \\ + ${collectionDigest} \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.pull("${collectionDigest}")`, + }, + { + label: 'Export as FASTA', + cli: `refget store export \\ + ${collectionDigest} \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.export("${collectionDigest}")`, + }, + { + label: 'Generate .fai index', + cli: `refget store fai \\ + ${collectionDigest} \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.fai("${collectionDigest}")`, + }, + { + label: 'Generate chrom.sizes', + cli: `refget store chrom-sizes \\ + ${collectionDigest} \\ + --remote ${remote}`, + python: `import refget + +store = refget.RefgetStore("${remote}") +store.chrom_sizes("${collectionDigest}")`, + }, + ], + }); + } + + return ( +
+
+

+ + RefgetStore Explorer +

+
+ + + + Change Store + +
+
+ + {/* Code Snippets Modal */} + {showCode && ( + <> +
setShowCode(false)} /> +
setShowCode(false)}> +
e.stopPropagation()}> +
+
+
+ + Code Snippets +
+
+
+
    +
  • + +
  • +
  • + +
  • +
+ + {snippetGroups.map((group, gi) => ( +
+
{group.heading}
+ {group.snippets.map((snippet, i) => ( +
+ {snippet.label} + +
+ ))} +
+ ))} +
+ + Install: pip install refget + +
+
+
+
+ + )} + +
    + {items.map((item) => ( +
  • + + + {item.label} + +
  • + ))} +
+
+ ); +}; + +export { StoreNav }; diff --git a/frontend/src/main.jsx b/frontend/src/main.jsx index 4a65f6b..5a6f613 100644 --- a/frontend/src/main.jsx +++ b/frontend/src/main.jsx @@ -22,6 +22,11 @@ import { HPRCGenomes } from './pages/HPRCGenomes.jsx'; import { HumanReferencesView } from './pages/HumanReferences.jsx'; import { DigestPage } from './pages/DigestPage.jsx'; import { CompliancePage } from './pages/CompliancePage.jsx'; +import { StoreExplorer } from './pages/StoreExplorer.jsx'; +import { StoreOverview } from './pages/StoreOverview.jsx'; +import { StoreSequences } from './pages/StoreSequences.jsx'; +import { StoreCollection } from './pages/StoreCollection.jsx'; +import { StoreAliases } from './pages/StoreAliases.jsx'; import { fetchServiceInfo, @@ -127,6 +132,14 @@ const Nav = () => { Compliance +
  • + navigate('/explore')} + className={`nav-link cursor-pointer ${location.startsWith('explore') ? 'fw-medium text-black' : 'fw-light'}`} + > + Explore Store + +
  • , loader: (request) => fetchPangenomeLevels(request.params.digest), }, + { + path: '/explore', + element: , + errorElement: , + }, + { + path: '/explore/store', + element: , + errorElement: , + }, + { + path: '/explore/store/sequences', + element: , + errorElement: , + }, + { + path: '/explore/store/collection/:digest', + element: , + errorElement: , + }, + { + path: '/explore/store/aliases', + element: , + errorElement: , + }, ], }, ]); diff --git a/frontend/src/pages/StoreAliases.jsx b/frontend/src/pages/StoreAliases.jsx new file mode 100644 index 0000000..beec357 --- /dev/null +++ b/frontend/src/pages/StoreAliases.jsx @@ -0,0 +1,198 @@ +import { useState, useEffect } from 'react'; +import { Link, useSearchParams } from 'react-router-dom'; +import { useExplorerStore } from '../stores/explorerStore.js'; +import { StoreNav } from '../components/StoreNav.jsx'; + +const AliasNamespacePanel = ({ type, storeUrlParam, availableNamespaces }) => { + const { loadAliases } = useExplorerStore(); + const [namespace, setNamespace] = useState(''); + const [aliases, setAliases] = useState(null); + const [loading, setLoading] = useState(false); + const [error, setError] = useState(null); + const [filter, setFilter] = useState(''); + + const handleLoad = async (e) => { + e?.preventDefault(); + if (!namespace.trim()) return; + setLoading(true); + setError(null); + try { + const data = await loadAliases(type, namespace.trim()); + if (!data) { + setError(`Namespace "${namespace}" not found.`); + setAliases(null); + } else { + setAliases(data); + } + } catch (err) { + setError(err.message); + } finally { + setLoading(false); + } + }; + + const handleNamespaceClick = (ns) => { + setNamespace(ns); + setFilter(''); + setError(null); + setLoading(true); + loadAliases(type, ns) + .then((data) => { + if (!data) { + setError(`Namespace "${ns}" not found.`); + setAliases(null); + } else { + setAliases(data); + } + }) + .catch((err) => setError(err.message)) + .finally(() => setLoading(false)); + }; + + const filtered = aliases + ? aliases.filter( + (a) => + !filter || + a.alias.toLowerCase().includes(filter.toLowerCase()) || + a.digest.toLowerCase().includes(filter.toLowerCase()), + ) + : null; + + const linkPrefix = + type === 'sequences' + ? null // sequences don't have a detail page in the explorer + : `/explore/store/collection/`; + + return ( +
    +
    +
    + + {type} aliases +
    +
    +
    + {availableNamespaces && availableNamespaces.length > 0 ? ( +
    + Namespaces: + {availableNamespaces.map((ns) => ( + + ))} + {loading && } +
    + ) : ( +

    + + No {type} alias namespaces found in this store. +

    + )} + + {error && ( +
    {error}
    + )} + + {filtered && ( + <> +
    + + {filtered.length} aliases in "{namespace}" + + setFilter(e.target.value)} + /> +
    +
    +
    + + + + + + + + {filtered.map((a, i) => ( + + + + + ))} + +
    AliasDigest
    {a.alias} + {linkPrefix ? ( + + {a.digest} + + ) : ( + a.digest + )} +
    +

    + + )} +
    +
    + ); +}; + +const StoreAliases = () => { + const [searchParams] = useSearchParams(); + const { storeUrl, metadata, loading, loadStore } = useExplorerStore(); + + const urlParam = searchParams.get('url'); + const storeUrlParam = `?url=${encodeURIComponent(storeUrl || urlParam)}`; + + useEffect(() => { + if (urlParam && !metadata && !loading) { + loadStore(urlParam).catch(() => {}); + } + }, [urlParam]); // eslint-disable-line react-hooks/exhaustive-deps + + if (!metadata && !loading) { + return ( +
    + No store loaded.{' '} + Go back to enter a store URL. +
    + ); + } + + if (loading) { + return ( +
    +
    +
    + ); + } + + return ( +
    + + +

    + Aliases map human-readable names to digests. Select a namespace to + browse its alias mappings. +

    + + + +
    + ); +}; + +export { StoreAliases }; diff --git a/frontend/src/pages/StoreCollection.jsx b/frontend/src/pages/StoreCollection.jsx new file mode 100644 index 0000000..4b9ea21 --- /dev/null +++ b/frontend/src/pages/StoreCollection.jsx @@ -0,0 +1,279 @@ +import { useState, useEffect } from 'react'; +import { Link, useParams, useSearchParams } from 'react-router-dom'; +import { useExplorerStore } from '../stores/explorerStore.js'; +import { StoreNav } from '../components/StoreNav.jsx'; +import { CliCommand } from '../components/CliSnippet.jsx'; + +const StoreCollection = () => { + const { digest } = useParams(); + const [searchParams] = useSearchParams(); + const { storeUrl, metadata, loadStore, loadCollection, loadFhrMetadata, loading } = + useExplorerStore(); + const [collection, setCollection] = useState(null); + const [fhr, setFhr] = useState(undefined); + const [error, setError] = useState(null); + const [loadingCol, setLoadingCol] = useState(true); + const [selectedSeq, setSelectedSeq] = useState(null); + const [seqCodeTab, setSeqCodeTab] = useState('cli'); + + const urlParam = searchParams.get('url'); + const storeUrlParam = `?url=${encodeURIComponent(storeUrl || urlParam)}`; + + useEffect(() => { + const load = async () => { + try { + // Ensure store is loaded + if (!metadata && urlParam) { + await loadStore(urlParam); + } + const col = await loadCollection(digest); + setCollection(col); + const fhrData = await loadFhrMetadata(digest); + setFhr(fhrData); + } catch (err) { + setError(err.message); + } finally { + setLoadingCol(false); + } + }; + load(); + }, [digest, urlParam]); // eslint-disable-line react-hooks/exhaustive-deps + + if (!metadata && !loading && !loadingCol) { + return ( +
    + No store loaded.{' '} + Go back to enter a store URL. +
    + ); + } + + if (loading || loadingCol) { + return ( +
    +
    +

    Loading collection...

    +
    + ); + } + + if (error) { + return ( +
    + +
    {error}
    +
    + ); + } + + const { metadata: colMeta, sequences } = collection; + const totalBases = sequences.reduce((sum, s) => sum + s.length, 0); + const alphabetCounts = {}; + sequences.forEach((s) => { + alphabetCounts[s.alphabet] = (alphabetCounts[s.alphabet] || 0) + 1; + }); + + return ( +
    + + +
    {digest}
    + + {/* Summary stats */} +
    +
    +
    +
    + Sequences + {sequences.length.toLocaleString()} +
    +
    +
    +
    +
    +
    + Total bases + {totalBases.toLocaleString()} +
    +
    +
    + {Object.keys(alphabetCounts).length > 0 && ( +
    +
    +
    + Alphabets + + + {Object.entries(alphabetCounts).map(([alph, count]) => ( + + + + + ))} + +
    {alph}{count.toLocaleString()}
    +
    +
    +
    + )} +
    + + {/* Collection metadata from ## headers */} + {Object.keys(colMeta).length > 0 && ( +
    +
    +
    Collection Metadata
    +
    +
    + + + {Object.entries(colMeta).map(([key, value]) => ( + + + + + ))} + +
    {key}{value}
    +
    +
    + )} + + {/* FHR metadata */} + {fhr ? ( +
    +
    +
    + + FHR Metadata +
    +
    +
    +
    +              {JSON.stringify(fhr, null, 2)}
    +            
    +
    +
    + ) : fhr === null ? ( +

    + + No FHR metadata sidecar found for this collection. +

    + ) : null} + + {/* Sequence table */} +
    +
    +
    Sequences in this collection
    +
    +
    +
    + + + + + + + + + + {sequences.map((seq, i) => ( + setSelectedSeq(seq)} + > + + + + + ))} + +
    NameLengthSHA-512/24u
    {seq.name} + {seq.length.toLocaleString()} + {seq.sha512t24u}
    +
    +
    +
    + + {/* Sequence detail modal */} + {selectedSeq && ( + <> +
    setSelectedSeq(null)} /> +
    setSelectedSeq(null)}> +
    e.stopPropagation()}> +
    +
    +
    {selectedSeq.name}
    +
    +
    + + + + + + + + + + + + + + + + + + + {selectedSeq.description && ( + + + + + )} + +
    Length{selectedSeq.length.toLocaleString()}
    Alphabet{selectedSeq.alphabet}
    SHA-512/24u{selectedSeq.sha512t24u}
    MD5{selectedSeq.md5}
    Description{selectedSeq.description}
    + +
    Code
    +
      +
    • + +
    • +
    • + +
    • +
    + Get sequence + +
    +
    +
    +
    + + )} +
    + ); +}; + +export { StoreCollection }; diff --git a/frontend/src/pages/StoreExplorer.jsx b/frontend/src/pages/StoreExplorer.jsx new file mode 100644 index 0000000..810ef30 --- /dev/null +++ b/frontend/src/pages/StoreExplorer.jsx @@ -0,0 +1,135 @@ +import { useState, useEffect } from 'react'; +import { useNavigate, useSearchParams } from 'react-router-dom'; +import { useExplorerStore } from '../stores/explorerStore.js'; + +const RECENT_STORES_KEY = 'refget-explorer-recent-stores'; +const MAX_RECENT = 5; + +const getRecentStores = () => { + try { + return JSON.parse(localStorage.getItem(RECENT_STORES_KEY)) || []; + } catch { + return []; + } +}; + +const saveRecentStore = (url) => { + const recent = getRecentStores().filter((u) => u !== url); + recent.unshift(url); + localStorage.setItem( + RECENT_STORES_KEY, + JSON.stringify(recent.slice(0, MAX_RECENT)), + ); +}; + +const StoreExplorer = () => { + const [searchParams] = useSearchParams(); + const navigate = useNavigate(); + const { loadStore, loading, error, storeUrl } = useExplorerStore(); + const [url, setUrl] = useState(searchParams.get('url') || ''); + const [localError, setLocalError] = useState(null); + const recentStores = getRecentStores(); + + // Auto-load if URL param provided + useEffect(() => { + const paramUrl = searchParams.get('url'); + if (paramUrl && paramUrl !== storeUrl) { + handleExplore(paramUrl); + } + }, []); // eslint-disable-line react-hooks/exhaustive-deps + + const handleExplore = async (targetUrl) => { + const trimmed = (targetUrl || url).trim(); + if (!trimmed) return; + setLocalError(null); + try { + await loadStore(trimmed); + saveRecentStore(trimmed); + navigate(`/explore/store?url=${encodeURIComponent(trimmed)}`); + } catch (err) { + setLocalError(err.message); + } + }; + + const handleSubmit = (e) => { + e.preventDefault(); + handleExplore(); + }; + + return ( +
    +

    + + RefgetStore Explorer +

    +

    + Browse the contents of any RefgetStore — sequences, collections, aliases, + and metadata. Enter the URL of a store hosted on any HTTP server. +

    + +
    +
    + setUrl(e.target.value)} + required + /> + +
    + + + {(localError || error) && ( +
    + Failed to load store: {localError || error} +

    + Make sure the URL points to a valid RefgetStore directory with an{' '} + rgstore.json file. The server must allow cross-origin + requests (CORS). +

    +
    + )} + + {recentStores.length > 0 && ( +
    +
    Recent stores
    +
    + {recentStores.map((recentUrl) => ( + + ))} +
    +
    + )} +
    + ); +}; + +export { StoreExplorer }; diff --git a/frontend/src/pages/StoreOverview.jsx b/frontend/src/pages/StoreOverview.jsx new file mode 100644 index 0000000..fb17740 --- /dev/null +++ b/frontend/src/pages/StoreOverview.jsx @@ -0,0 +1,309 @@ +import { useState, useEffect } from 'react'; +import { Link, useNavigate, useSearchParams } from 'react-router-dom'; +import { useExplorerStore } from '../stores/explorerStore.js'; +import { StoreNav } from '../components/StoreNav.jsx'; +import { RowCodeButton } from '../components/CliSnippet.jsx'; + +const StoreOverview = () => { + const [searchParams] = useSearchParams(); + const navigate = useNavigate(); + const { storeUrl, metadata, sequenceIndex, collections, loading, loadStore, loadSequenceIndex } = + useExplorerStore(); + const [seqLoading, setSeqLoading] = useState(false); + + const urlParam = searchParams.get('url'); + + // If we have a URL param but no loaded store, load it + useEffect(() => { + const init = async () => { + if (urlParam && !metadata && !loading) { + await loadStore(urlParam).catch(() => {}); + } + }; + init(); + }, [urlParam]); // eslint-disable-line react-hooks/exhaustive-deps + + // Auto-load sequence index (fetchSequenceIndex handles size check internally) + useEffect(() => { + if (metadata && !sequenceIndex && !seqLoading) { + setSeqLoading(true); + loadSequenceIndex() + .catch(() => {}) + .finally(() => setSeqLoading(false)); + } + }, [metadata]); // eslint-disable-line react-hooks/exhaustive-deps + + if (!metadata && !loading) { + return ( +
    + No store loaded.{' '} + Go back to enter a store URL. +
    + ); + } + + if (loading) { + return ( +
    +
    +

    Loading store...

    +
    + ); + } + + const totalBases = sequenceIndex + ? sequenceIndex.reduce((sum, s) => sum + s.length, 0) + : 0; + + const alphabetCounts = {}; + if (sequenceIndex) { + sequenceIndex.forEach((s) => { + alphabetCounts[s.alphabet] = (alphabetCounts[s.alphabet] || 0) + 1; + }); + } + + const storeUrlParam = `?url=${encodeURIComponent(storeUrl || urlParam)}`; + + return ( +
    + + +
    + {/* Store info card */} +
    +
    +
    +
    + + Store Info +
    +
    +
    + + + + + + + + + + + + + + + {metadata.created_at && ( + + + + + )} + +
    URL + {storeUrl || urlParam} +
    Version{metadata.version}
    Storage Mode + + {metadata.mode} + +
    Created{new Date(metadata.created_at).toLocaleString()}
    +
    +
    +
    + + {/* Sequences summary card */} +
    +
    +
    +
    + + Sequences +
    + + Browse all + +
    +
    + {sequenceIndex ? ( + + + + + + + + + + + + + + + +
    Total sequences{sequenceIndex.length.toLocaleString()}
    Total bases{totalBases.toLocaleString()}
    Alphabets + {Object.entries(alphabetCounts).map(([alph, count]) => ( + + {alph}: {count} + + ))} +
    + ) : seqLoading ? ( +
    + + Loading sequence index... +
    + ) : ( +

    + Sequence index not available. +

    + )} +
    +
    +
    +
    + + {/* Collections */} +
    +
    +
    + + Collections +
    +
    +
    + {collections && collections.length > 0 ? ( +
    + + + + + + + + + + {collections.map((col) => ( + + + + + + ))} + +
    DigestSequences
    + + {col.digest} + + {col.n_sequences} + +
    +
    + ) : ( +

    + No collection index (collections.rgci) found. Individual + collections can still be viewed if you know the digest. +

    + )} +
    +
    + + {/* Aliases section */} +
    +
    +
    + + Aliases +
    + + Browse aliases + +
    +
    + {(metadata.sequence_alias_namespaces?.length > 0 || metadata.collection_alias_namespaces?.length > 0) ? ( + + + {metadata.sequence_alias_namespaces?.length > 0 && ( + + + + + )} + {metadata.collection_alias_namespaces?.length > 0 && ( + + + + + )} + +
    Sequence namespaces + {metadata.sequence_alias_namespaces.map((ns) => ( + + {ns} + + ))} +
    Collection namespaces + {metadata.collection_alias_namespaces.map((ns) => ( + + {ns} + + ))} +
    + ) : ( +

    + No alias namespace information available. +

    + )} +
    +
    + +
    + ); +}; + +export { StoreOverview }; diff --git a/frontend/src/pages/StoreSequences.jsx b/frontend/src/pages/StoreSequences.jsx new file mode 100644 index 0000000..da0fca5 --- /dev/null +++ b/frontend/src/pages/StoreSequences.jsx @@ -0,0 +1,337 @@ +import { useState, useMemo, useEffect } from 'react'; +import { Link, useSearchParams } from 'react-router-dom'; +import { useExplorerStore } from '../stores/explorerStore.js'; +import { StoreNav } from '../components/StoreNav.jsx'; +import { CliCommand } from '../components/CliSnippet.jsx'; + +const PAGE_SIZE = 50; + +const formatBytes = (bytes) => { + if (bytes < 1024) return `${bytes} B`; + if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`; + return `${(bytes / (1024 * 1024)).toFixed(1)} MB`; +}; + +const StoreSequences = () => { + const [searchParams] = useSearchParams(); + const { + storeUrl, sequenceIndex, sequenceIndexPartial, sequenceIndexTotalSize, + metadata, loading, loadStore, loadSequenceIndex, + } = useExplorerStore(); + const [filter, setFilter] = useState(''); + const [sortCol, setSortCol] = useState(null); + const [sortAsc, setSortAsc] = useState(true); + const [page, setPage] = useState(0); + const [seqLoading, setSeqLoading] = useState(false); + const [seqError, setSeqError] = useState(null); + const [selectedSeq, setSelectedSeq] = useState(null); + const [seqCodeTab, setSeqCodeTab] = useState('cli'); + + const urlParam = searchParams.get('url'); + const storeUrlParam = `?url=${encodeURIComponent(storeUrl || urlParam)}`; + + // Auto-load on mount — fetchSequenceIndex handles the size check internally + useEffect(() => { + const init = async () => { + if (urlParam && !metadata && !loading) { + await loadStore(urlParam).catch(() => {}); + } + if (!sequenceIndex && !seqLoading) { + setSeqLoading(true); + try { + await loadSequenceIndex(); + } catch (err) { + setSeqError(err.message); + } finally { + setSeqLoading(false); + } + } + }; + init(); + }, [urlParam, metadata]); // eslint-disable-line react-hooks/exhaustive-deps + + const handleLoadMore = async (maxBytes) => { + setSeqLoading(true); + setSeqError(null); + try { + await loadSequenceIndex(maxBytes ? { maxBytes } : {}); + } catch (err) { + setSeqError(err.message); + } finally { + setSeqLoading(false); + } + }; + + const filtered = useMemo(() => { + if (!sequenceIndex) return []; + const term = filter.toLowerCase(); + return sequenceIndex.filter( + (s) => + !term || + s.name?.toLowerCase().includes(term) || + s.sha512t24u?.toLowerCase().includes(term) || + s.md5?.toLowerCase().includes(term) || + s.description?.toLowerCase().includes(term), + ); + }, [sequenceIndex, filter]); + + const sorted = useMemo(() => { + if (!sortCol) return filtered; + return [...filtered].sort((a, b) => { + const va = a[sortCol]; + const vb = b[sortCol]; + if (typeof va === 'number' && typeof vb === 'number') { + return sortAsc ? va - vb : vb - va; + } + return sortAsc + ? String(va).localeCompare(String(vb)) + : String(vb).localeCompare(String(va)); + }); + }, [filtered, sortCol, sortAsc]); + + const totalPages = Math.ceil(sorted.length / PAGE_SIZE); + const paged = sorted.slice(page * PAGE_SIZE, (page + 1) * PAGE_SIZE); + + const handleSort = (col) => { + if (sortCol === col) { + setSortAsc(!sortAsc); + } else { + setSortCol(col); + setSortAsc(true); + } + setPage(0); + }; + + const SortIcon = ({ col }) => { + if (sortCol !== col) return null; + return ; + }; + + if (!metadata && !loading) { + return ( +
    + No store loaded.{' '} + Go back to enter a store URL. +
    + ); + } + + if (loading || seqLoading) { + return ( +
    +
    +

    + {seqLoading ? 'Loading sequence index...' : 'Loading store...'} +

    +
    + ); + } + + if (seqError) { + return ( +
    + +
    {seqError}
    +
    + ); + } + + if (!sequenceIndex) { + return ( +
    + +
    + No sequence index (sequences.rgsi) found in this store. +
    +
    + ); + } + + const columns = [ + { key: 'name', label: 'Name' }, + { key: 'length', label: 'Length' }, + { key: 'sha512t24u', label: 'SHA-512/24u' }, + ]; + + return ( +
    + + + {/* Partial load banner */} + {sequenceIndexPartial && ( +
    + + + Sequence index is {formatBytes(sequenceIndexTotalSize)} — showing first{' '} + {sequenceIndex.length.toLocaleString()} sequences. + Sorting and filtering apply only to loaded data. + + +
    + )} + +
    + + {filtered.length.toLocaleString()} sequences + {filter && ` (filtered from ${sequenceIndex.length.toLocaleString()})`} + {sequenceIndexPartial && ' (partial)'} + + { + setFilter(e.target.value); + setPage(0); + }} + /> +
    + +
    + + + + {columns.map((col) => ( + + ))} + + + + {paged.map((seq, i) => ( + setSelectedSeq(seq)} + > + + + + + ))} + +
    handleSort(col.key)} + style={{ cursor: 'pointer' }} + className={col.key === 'length' ? 'text-end' : ''} + > + {col.label} + +
    {seq.name} + {seq.length.toLocaleString()} + {seq.sha512t24u}
    +
    + + {/* Sequence detail modal */} + {selectedSeq && ( + <> +
    setSelectedSeq(null)} /> +
    setSelectedSeq(null)}> +
    e.stopPropagation()}> +
    +
    +
    {selectedSeq.name}
    +
    +
    + + + + + + + + + + + + + + + + + + + {selectedSeq.description && ( + + + + + )} + +
    Length{selectedSeq.length.toLocaleString()}
    Alphabet{selectedSeq.alphabet}
    SHA-512/24u{selectedSeq.sha512t24u}
    MD5{selectedSeq.md5}
    Description{selectedSeq.description}
    + +
    Code
    +
      +
    • + +
    • +
    • + +
    • +
    + Get sequence + +
    +
    +
    +
    + + )} + + {totalPages > 1 && ( + + )} +
    + ); +}; + +export { StoreSequences }; diff --git a/frontend/src/services/fetchData.jsx b/frontend/src/services/fetchData.jsx index ac6fbab..f911b93 100644 --- a/frontend/src/services/fetchData.jsx +++ b/frontend/src/services/fetchData.jsx @@ -56,19 +56,27 @@ export const fetchPangenomeLevels = async (digest) => { }; export const fetchSeqColList = async () => { - const urls = [ - `${API_BASE}/list/collection?page_size=10&page=0`, - `${API_BASE}/list/pangenome?page_size=5`, - `${API_BASE}/list/attributes/name_length_pairs?page_size=5`, - ]; + const fetchRequired = async (url) => { + const response = await fetch(url); + await checkResponse(response, url); + return response.json(); + }; - return Promise.all( - urls.map(async (url) => { + const fetchOptional = async (url) => { + try { const response = await fetch(url); - await checkResponse(response, url); + if (!response.ok) return null; return response.json(); - }), - ); + } catch { + return null; + } + }; + + return Promise.all([ + fetchRequired(`${API_BASE}/list/collection?page_size=10&page=0`), + fetchOptional(`${API_BASE}/list/pangenome?page_size=5`), + fetchRequired(`${API_BASE}/list/attributes/name_length_pairs?page_size=5`), + ]); }; export const fetchAllSeqCols = async () => { diff --git a/frontend/src/services/storeService.js b/frontend/src/services/storeService.js new file mode 100644 index 0000000..4ac46b2 --- /dev/null +++ b/frontend/src/services/storeService.js @@ -0,0 +1,217 @@ +/** + * Service for fetching and parsing RefgetStore static files. + * A RefgetStore is a directory of static TSV/JSON files — no backend needed. + */ + +// Ensure URL ends without trailing slash +const normalizeUrl = (url) => url.replace(/\/+$/, ''); + +/** + * Parse TSV text into array of objects. + * Handles # comment header lines and ## metadata headers. + * Returns { metadata: {key: value}, rows: [{col: val}] } + */ +const parseTsv = (text) => { + const lines = text.split('\n').filter((l) => l.length > 0); + const metadata = {}; + let headerCols = null; + const rows = []; + + for (const line of lines) { + if (line.startsWith('##')) { + // Metadata header: ##key=value + const eq = line.indexOf('='); + if (eq > 2) { + metadata[line.substring(2, eq)] = line.substring(eq + 1); + } + } else if (line.startsWith('#')) { + // Column header + headerCols = line.substring(1).split('\t'); + } else if (headerCols) { + const fields = line.split('\t'); + const row = {}; + headerCols.forEach((col, i) => { + row[col] = fields[i] ?? ''; + }); + rows.push(row); + } + } + + return { metadata, rows }; +}; + +/** + * Parse a two-column TSV (alias files have no header). + * Returns [{alias, digest}] + */ +const parseAliasTsv = (text) => { + return text + .split('\n') + .filter((l) => l.length > 0 && !l.startsWith('#')) + .map((line) => { + const [alias, digest] = line.split('\t'); + return { alias, digest }; + }); +}; + +/** + * Parse collections.rgci — a TSV with #header row. + * Columns: digest, n_sequences, names_digest, sequences_digest, lengths_digest, + * name_length_pairs_digest, sorted_name_length_pairs_digest, sorted_sequences_digest + */ +const parseRgci = (text) => { + const { rows } = parseTsv(text); + return rows.map((r) => ({ + ...r, + n_sequences: r.n_sequences ? 
parseInt(r.n_sequences, 10) : 0, + })); +}; + +/** Fetch with error handling */ +const fetchFile = async (url) => { + const response = await fetch(url); + if (!response.ok) { + if (response.status === 404 || response.status === 403) return null; + throw new Error(`HTTP ${response.status} fetching ${url}`); + } + return response; +}; + +/** GET rgstore.json → parsed JSON */ +export const fetchStoreMetadata = async (baseUrl) => { + const url = `${normalizeUrl(baseUrl)}/rgstore.json`; + const response = await fetchFile(url); + if (!response) throw new Error('rgstore.json not found at this URL'); + return response.json(); +}; + +/** Size threshold for auto-loading sequence index (10 MB) */ +const AUTO_LOAD_THRESHOLD = 10 * 1024 * 1024; +/** Default partial load size (2 MB) */ +const PARTIAL_LOAD_SIZE = 2 * 1024 * 1024; + +const parseSequenceRows = (text) => { + const { rows } = parseTsv(text); + return rows.map((r) => ({ + ...r, + length: r.length ? parseInt(r.length, 10) : 0, + })); +}; + +/** + * Check the size of sequences.rgsi via HEAD request. + * Returns { url, size } or null if not found. + */ +export const checkSequenceIndexSize = async (baseUrl) => { + const url = `${normalizeUrl(baseUrl)}/sequences.rgsi`; + try { + const response = await fetch(url, { method: 'HEAD' }); + if (!response.ok) return null; + const size = parseInt(response.headers.get('content-length') || '0', 10); + return { url, size }; + } catch { + return null; + } +}; + +/** + * Fetch sequences.rgsi — auto-loads if small, otherwise requires explicit call. + * Returns { rows, partial, totalSize } + * partial: true if only a prefix was loaded + * totalSize: file size in bytes + */ +export const fetchSequenceIndex = async (baseUrl, { maxBytes } = {}) => { + const url = `${normalizeUrl(baseUrl)}/sequences.rgsi`; + + // Check file size first + let totalSize = 0; + try { + const head = await fetch(url, { method: 'HEAD' }); + if (!head.ok) { + if (head.status === 404 || head.status === 403) throw new Error('sequences.rgsi not found'); + throw new Error(`HTTP ${head.status} fetching ${url}`); + } + totalSize = parseInt(head.headers.get('content-length') || '0', 10); + } catch (err) { + if (err.message.includes('not found')) throw err; + // HEAD failed (CORS?), fall back to full fetch + const response = await fetchFile(url); + if (!response) throw new Error('sequences.rgsi not found'); + const text = await response.text(); + return { rows: parseSequenceRows(text), partial: false, totalSize: text.length }; + } + + const limit = maxBytes || (totalSize <= AUTO_LOAD_THRESHOLD ? totalSize : PARTIAL_LOAD_SIZE); + const loadFull = limit >= totalSize; + + if (loadFull) { + const response = await fetchFile(url); + if (!response) throw new Error('sequences.rgsi not found'); + const text = await response.text(); + return { rows: parseSequenceRows(text), partial: false, totalSize }; + } + + // Partial load via Range header + const response = await fetch(url, { + headers: { Range: `bytes=0-${limit - 1}` }, + }); + if (!response.ok && response.status !== 206) { + // Server doesn't support Range — fall back to full fetch + const fullResponse = await fetchFile(url); + if (!fullResponse) throw new Error('sequences.rgsi not found'); + const text = await fullResponse.text(); + return { rows: parseSequenceRows(text), partial: false, totalSize }; + } + const text = await response.text(); + // Discard last partial line + const lastNewline = text.lastIndexOf('\n'); + const cleanText = lastNewline > 0 ? 
text.substring(0, lastNewline) : text; + return { rows: parseSequenceRows(cleanText), partial: true, totalSize }; +}; + +/** GET collections.rgci → array of collection summaries */ +export const fetchCollectionIndex = async (baseUrl) => { + const url = `${normalizeUrl(baseUrl)}/collections.rgci`; + const response = await fetchFile(url); + if (!response) return null; // No collection index available + const text = await response.text(); + return parseRgci(text); +}; + +/** GET collections/{digest}.rgsi → {metadata, sequences} */ +export const fetchCollection = async (baseUrl, digest) => { + const base = normalizeUrl(baseUrl); + // Try .rgsi first (format spec default), then .rgci + let response = await fetchFile(`${base}/collections/${digest}.rgsi`); + if (!response) { + response = await fetchFile(`${base}/collections/${digest}.rgci`); + } + if (!response) + throw new Error(`Collection ${digest} not found`); + const text = await response.text(); + const { metadata, rows } = parseTsv(text); + return { + metadata, + sequences: rows.map((r) => ({ + ...r, + length: r.length ? parseInt(r.length, 10) : 0, + })), + }; +}; + +/** GET aliases/{type}/{namespace}.tsv → [{alias, digest}] */ +export const fetchAliases = async (baseUrl, type, namespace) => { + const url = `${normalizeUrl(baseUrl)}/aliases/${type}/${namespace}.tsv`; + const response = await fetchFile(url); + if (!response) return null; + const text = await response.text(); + return parseAliasTsv(text); +}; + +/** GET collections/{digest}.fhr.json → parsed JSON or null */ +export const fetchFhrMetadata = async (baseUrl, digest) => { + const url = `${normalizeUrl(baseUrl)}/collections/${digest}.fhr.json`; + const response = await fetchFile(url); + if (!response) return null; + return response.json(); +}; diff --git a/frontend/src/stores/explorerStore.js b/frontend/src/stores/explorerStore.js new file mode 100644 index 0000000..9edd6c1 --- /dev/null +++ b/frontend/src/stores/explorerStore.js @@ -0,0 +1,113 @@ +import { create } from 'zustand'; +import { + fetchStoreMetadata, + fetchSequenceIndex, + fetchCollectionIndex, + fetchCollection, + fetchAliases, + fetchFhrMetadata, +} from '../services/storeService.js'; + +export const useExplorerStore = create((set, get) => ({ + storeUrl: null, + metadata: null, + sequenceIndex: null, // array of sequence rows (or null if not loaded) + sequenceIndexPartial: false, // true if only a prefix was loaded + sequenceIndexTotalSize: 0, // total file size in bytes + collections: null, + loadedCollections: {}, + aliases: {}, + fhrMetadata: {}, + loading: false, + error: null, + + setStoreUrl: (url) => set({ storeUrl: url }), + + /** Fetch store metadata + collection index (sequence index is lazy-loaded) */ + loadStore: async (url) => { + set({ loading: true, error: null, storeUrl: url }); + try { + const metadata = await fetchStoreMetadata(url); + set({ metadata }); + + const collections = await fetchCollectionIndex(url).catch(() => null); + + set({ + sequenceIndex: null, + sequenceIndexPartial: false, + sequenceIndexTotalSize: 0, + collections, + loading: false, + loadedCollections: {}, + aliases: {}, + fhrMetadata: {}, + }); + } catch (err) { + set({ loading: false, error: err.message }); + throw err; + } + }, + + /** Fetch and cache the sequence index (lazy — only when needed). + * Options: { maxBytes } to limit partial load size. 
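+   * Example (illustrative): await loadSequenceIndex({ maxBytes: 5 * 1024 * 1024 })
+   * fetches roughly the first 5 MB of sequences.rgsi via an HTTP Range request.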
*/ + loadSequenceIndex: async (options) => { + const { storeUrl, sequenceIndex } = get(); + // If already fully loaded, return cached + if (sequenceIndex && !get().sequenceIndexPartial) return sequenceIndex; + + const { rows, partial, totalSize } = await fetchSequenceIndex(storeUrl, options); + set({ + sequenceIndex: rows, + sequenceIndexPartial: partial, + sequenceIndexTotalSize: totalSize, + }); + return rows; + }, + + /** Fetch and cache a single collection */ + loadCollection: async (digest) => { + const { storeUrl, loadedCollections } = get(); + if (loadedCollections[digest]) return loadedCollections[digest]; + + const data = await fetchCollection(storeUrl, digest); + set({ loadedCollections: { ...get().loadedCollections, [digest]: data } }); + return data; + }, + + /** Fetch and cache aliases for a type/namespace */ + loadAliases: async (type, namespace) => { + const { storeUrl, aliases } = get(); + const key = `${type}/${namespace}`; + if (aliases[key]) return aliases[key]; + + const data = await fetchAliases(storeUrl, type, namespace); + set({ aliases: { ...get().aliases, [key]: data } }); + return data; + }, + + /** Fetch and cache FHR metadata for a collection */ + loadFhrMetadata: async (digest) => { + const { storeUrl, fhrMetadata } = get(); + if (fhrMetadata[digest] !== undefined) return fhrMetadata[digest]; + + const data = await fetchFhrMetadata(storeUrl, digest); + set({ fhrMetadata: { ...get().fhrMetadata, [digest]: data } }); + return data; + }, + + /** Reset store state */ + reset: () => + set({ + storeUrl: null, + metadata: null, + sequenceIndex: null, + sequenceIndexPartial: false, + sequenceIndexTotalSize: 0, + collections: null, + loadedCollections: {}, + aliases: {}, + fhrMetadata: {}, + loading: false, + error: null, + }), +})); From 6e8f97879fc012112b039fbb9dd267cc77d07caf Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 13 Mar 2026 07:29:28 -0400 Subject: [PATCH 19/31] Modernize build system, expand store CLI, and add store-backed backend - Migrate from setup.py to pyproject.toml with hatchling - Replace Black with Ruff, update GH Actions, use trusted publishing - Remove legacy requirements/ files and codecov workflow - Add SeqColBackend protocol and RefgetStoreBackend (database-free API serving) - Expand store CLI subcommands - Update seqcolapi configuration and compliance checks - Add tests for backend, store crate, and updated CLI commands --- .github/workflows/black.yml | 10 +- .github/workflows/python-publish.yml | 9 +- .github/workflows/run-codecov.yml | 21 - .github/workflows/run-pytest.yml | 11 +- MANIFEST.in | 3 - examples/remote_store.py | 2 +- pyproject.toml | 71 +++- refget/__init__.py | 18 +- refget/agents.py | 101 +++-- refget/backend.py | 144 +++++++ refget/cli/admin.py | 6 +- refget/cli/config.py | 20 +- refget/cli/config_manager.py | 2 +- refget/cli/fasta.py | 9 +- refget/cli/main.py | 8 +- refget/cli/seqcol.py | 2 +- refget/cli/store.py | 364 ++++++++++++++++-- refget/clients.py | 11 +- refget/compliance.py | 106 +++-- refget/const.py | 2 +- refget/digests.py | 5 +- refget/examples.py | 2 +- refget/models.py | 28 +- refget/router.py | 106 ++--- refget/store.py | 4 +- refget/utils.py | 10 +- requirements/requirements-all.txt | 7 - requirements/requirements-dev.txt | 1 - requirements/requirements-docs.txt | 2 - requirements/requirements-seqcolapi.txt | 6 - requirements/requirements-test.txt | 3 - seqcolapi/__main__.py | 1 + seqcolapi/const.py | 5 +- seqcolapi/examples.py | 2 +- seqcolapi/main.py | 65 +++- setup.py | 56 --- tests/api/conftest.py | 
3 +- tests/api/test_compliance.py | 25 +- tests/conftest.py | 17 +- tests/integration/conftest.py | 9 +- .../integration/test_cli_admin_integration.py | 2 - .../test_cli_seqcol_integration.py | 1 - tests/integration/test_run_compliance.py | 1 + tests/local/test_aliases.py | 4 +- tests/local/test_backend.py | 216 +++++++++++ tests/local/test_digest_functions.py | 13 +- tests/local/test_local_models.py | 5 +- tests/local/test_local_models_gtars.py | 22 +- tests/local/test_refget_clients.py | 2 +- tests/local/test_remove_collection.py | 11 +- tests/local/test_store_seqcol_features.py | 3 +- tests/test_cli/test_admin_commands.py | 3 - tests/test_cli/test_config_commands.py | 7 +- tests/test_cli/test_fasta_commands.py | 26 +- tests/test_cli/test_help.py | 2 - tests/test_cli/test_seqcol_commands.py | 26 +- tests/test_cli/test_store_commands.py | 111 ++++-- tests/test_cli/test_store_crate.py | 305 +++++++++++++++ tests/test_cli/test_store_pull.py | 105 +++-- tests/test_cli_integration/test_workflows.py | 24 +- 60 files changed, 1667 insertions(+), 499 deletions(-) delete mode 100644 .github/workflows/run-codecov.yml delete mode 100644 MANIFEST.in create mode 100644 refget/backend.py delete mode 100644 requirements/requirements-all.txt delete mode 100644 requirements/requirements-dev.txt delete mode 100644 requirements/requirements-docs.txt delete mode 100644 requirements/requirements-seqcolapi.txt delete mode 100644 requirements/requirements-test.txt delete mode 100644 setup.py create mode 100644 tests/local/test_backend.py create mode 100644 tests/test_cli/test_store_crate.py diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index 8b48ddf..f329b68 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -6,6 +6,10 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - - uses: psf/black@stable + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install ruff + - run: ruff check . + - run: ruff format --check . 
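The database-free backend described in the commit message above can be exercised directly. A minimal sketch, assuming an existing on-disk store (the path and digests are placeholders; method names follow refget/backend.py in this patch):

    from refget import RefgetStore, RefgetStoreBackend

    # open an existing local store (hypothetical path)
    store = RefgetStore.on_disk("/data/refget_store")
    backend = RefgetStoreBackend(store)

    print(backend.capabilities())                  # backend_type, n_collections, ...
    level2 = backend.get_collection("<digest>", level=2)
    page = backend.list_collections(page=0, page_size=100)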
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index e54ad87..b8cb119 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -1,4 +1,4 @@ -# This workflows will upload a Python Package using Twine when a release is created +# This workflow uploads a Python Package using trusted publishing when a release is created # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries name: Upload Python Package @@ -23,10 +23,9 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel twine - - name: Build and publish + pip install build + - name: Build package run: | - python setup.py sdist bdist_wheel + python -m build - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 - diff --git a/.github/workflows/run-codecov.yml b/.github/workflows/run-codecov.yml deleted file mode 100644 index de9e8f6..0000000 --- a/.github/workflows/run-codecov.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: Run codecov - -on: - pull_request: - branches: [master] - -jobs: - pytest: - runs-on: ${{ matrix.os }} - strategy: - matrix: - python-version: ["3.13"] - os: [ubuntu-latest] - - steps: - - uses: actions/checkout@v2 - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v2 - with: - file: ./coverage.xml - name: py-${{ matrix.python-version }}-${{ matrix.os }} diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 446f4dc..637e616 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -9,7 +9,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ["3.10", "3.13"] + python-version: ["3.10", "3.14"] os: [ubuntu-latest] steps: @@ -20,13 +20,10 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install test dependencies - run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi - - - name: Install package + - name: Install package with test extras env: PYO3_USE_ABI3_FORWARD_COMPATIBILITY: 1 - run: python -m pip install . + run: python -m pip install ".[test]" - name: Run pytest tests - run: pytest -x -vv --cov=./ --cov-report=xml \ No newline at end of file + run: pytest -x -vv --cov=./ --cov-report=xml diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 9c9f250..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include requirements/* -include README.md -include refget/schemas/* \ No newline at end of file diff --git a/examples/remote_store.py b/examples/remote_store.py index aa7d904..09c8f79 100644 --- a/examples/remote_store.py +++ b/examples/remote_store.py @@ -47,7 +47,7 @@ # %% records = store.list_sequences() for i, m in enumerate(records[:5]): - print(f"{i+1}. {m.name[:60]}...") + print(f"{i + 1}. 
{m.name[:60]}...") print(f" sha512t24u: {m.sha512t24u}, length: {m.length:,} bp") # %% [markdown] diff --git a/pyproject.toml b/pyproject.toml index 458f68e..47a1411 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,72 @@ -[tool.black] +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "refget" +dynamic = ["version"] +description = "GA4GH refget - reference sequence and sequence collection tools" +readme = "README.md" +license = "BSD-2-Clause" +requires-python = ">=3.10" +authors = [ + { name = "Nathan Sheffield", email = "nathan@code.databio.org" }, + { name = "Michal Stolarczyk" }, +] +keywords = ["genome", "assembly", "bioinformatics", "reference", "sequence"] +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] +dependencies = [ + "gtars>=0.7.0", + "jsonschema", + "pyyaml", + "requests", + "sqlmodel", + "tomli_w", + "typer>=0.9.0", +] + +[project.scripts] +refget = "refget.cli:main" + +[project.optional-dependencies] +test = ["pytest", "pytest-cov>=6.0.0"] +seqcolapi = [ + "fastapi", + "psycopg2-binary", + "sqlmodel", + "uvicorn>=0.30.0", + "ubiquerg>=0.6.1", +] + +[project.urls] +Homepage = "https://github.com/refgenie/refget" + +[tool.hatch.version] +path = "refget/_version.py" + +[tool.ruff] line-length = 99 -target-version = ['py38', 'py311'] -include = '\.pyi?$' +exclude = [ + "array_overlap.py", + "create_compliance_answers.py", + "data_loaders", + "interactive_tests.py", +] + +[tool.ruff.lint] +select = ["E", "F", "I"] +ignore = ["F403", "F405", "E501"] + +[tool.ruff.lint.isort] +known-first-party = ["refget"] [tool.pytest.ini_options] testpaths = ["tests/local"] diff --git a/refget/__init__.py b/refget/__init__.py index e739c06..196ab19 100644 --- a/refget/__init__.py +++ b/refget/__init__.py @@ -11,12 +11,20 @@ """ from ._version import __version__ -from .exceptions import InvalidSeqColError +from .backend import RefgetStoreBackend, SeqColBackend +from .clients import SequenceCollectionClient +from .compliance import run_compliance from .const import GTARS_INSTALLED +from .exceptions import InvalidSeqColError +from .store import ( + RefgetStore, + SequenceCollection, + StorageMode, + compute_fai, + digest_fasta, + digest_sequence, +) from .utils import canonical_str -from .store import RefgetStore, StorageMode, digest_fasta, compute_fai, digest_sequence, SequenceCollection -from .compliance import run_compliance -from .clients import SequenceCollectionClient __all__ = [ "__version__", @@ -30,5 +38,7 @@ "digest_sequence", "SequenceCollection", "run_compliance", + "SeqColBackend", + "RefgetStoreBackend", "SequenceCollectionClient", ] diff --git a/refget/agents.py b/refget/agents.py index e98c837..e40ad1d 100644 --- a/refget/agents.py +++ b/refget/agents.py @@ -2,42 +2,43 @@ import json import os -import requests - from typing import TYPE_CHECKING -from sqlmodel import create_engine, select, Session, delete, func, SQLModel + +import requests +from sqlmodel import Session, SQLModel, create_engine, delete, func, select if TYPE_CHECKING: import peppy -from sqlalchemy.orm import selectinload +from typing import List, Optional + from sqlalchemy import URL from sqlalchemy.engine import Engine as SqlalchemyDatabaseEngine -from typing import Optional, List +from 
sqlalchemy.orm import selectinload +from .const import _LOGGER, DEFAULT_INHERENT_ATTRS, SEQCOL_SCHEMA_PATH from .models import ( - Sequence, - SequenceCollection, - Pangenome, - NamesAttr, - LengthsAttr, - SequencesAttr, - SortedSequencesAttr, - NameLengthPairsAttr, + AccessMethod, + AccessURL, CollectionNamesAttr, + FastaDrsObject, HumanReadableNames, + LengthsAttr, + NameLengthPairsAttr, + NamesAttr, PaginationResult, + Pangenome, ResultsSequenceCollections, - FastaDrsObject, - AccessMethod, - AccessURL, + Sequence, + SequenceCollection, + SequencesAttr, + SortedSequencesAttr, ) from .utils import ( - compare_seqcols, build_pangenome_model, calc_jaccard_similarities, + compare_seqcols, fasta_to_seqcol_dict, ) -from .const import _LOGGER, DEFAULT_INHERENT_ATTRS, SEQCOL_SCHEMA_PATH ATTR_TYPE_MAP = { "sequences": SequencesAttr, @@ -304,7 +305,6 @@ def add(self, seqcol: SequenceCollection, update: bool = False) -> SequenceColle for name_model in seqcol.human_readable_names: if name_model.human_readable_name not in existing_names: - new_name = HumanReadableNames( human_readable_name=name_model.human_readable_name, digest=existing.digest, @@ -659,7 +659,6 @@ def list(self, attribute_type: str, offset: int = 0, limit: int = 50) -> dict: } def search(self, attribute_type: str, digest: str, offset: int = 0, limit: int = 50) -> dict: - Attribute = ATTR_TYPE_MAP[attribute_type] with Session(self.engine) as session: list_stmt = ( select(SequenceCollection) @@ -825,11 +824,53 @@ def __init__( self.__attribute = AttributeAgent(self.engine) self.__fasta_drs = FastaDrsAgent(self.engine, fasta_drs_url_prefix) + # ========================================================================= + # SeqColBackend protocol methods + # ========================================================================= + + def get_collection(self, digest: str, level: int = 2) -> dict: + format_map = {1: "level1", 2: "level2"} + return self.seqcol.get(digest, return_format=format_map.get(level, "level2")) + + def get_collection_attribute(self, digest: str, attribute: str) -> list: + return self.seqcol.get(digest, attribute=attribute) + + def get_collection_itemwise(self, digest: str, limit: int | None = None) -> list[dict]: + return self.seqcol.get(digest, return_format="itemwise", itemwise_limit=limit) + + def get_attribute(self, attribute_name: str, attribute_digest: str) -> list: + return self.attribute.get(attribute_name, attribute_digest) + def compare_digests(self, digestA: str, digestB: str) -> dict: A = self.seqcol.get(digestA, return_format="level2") B = self.seqcol.get(digestB, return_format="level2") return compare_seqcols(A, B) + def compare_digest_with_level2(self, digest: str, level2_b: dict) -> dict: + return self.compare_1_digest(digest, level2_b) + + def list_collections( + self, page: int = 0, page_size: int = 100, filters: dict | None = None + ) -> dict: + if filters: + return self.seqcol.search_by_attributes( + filters, limit=page_size, offset=page * page_size + ) + return self.seqcol.list_by_offset(limit=page_size, offset=page * page_size) + + def collection_count(self) -> int: + result = self.seqcol.list_by_offset(limit=1, offset=0) + return result["pagination"]["total"] + + def capabilities(self) -> dict: + return { + "backend_type": "database", + "n_collections": self.collection_count(), + "has_sequence_data": True, # database always has sequences + "collection_alias_namespaces": [], + "sequence_alias_namespaces": [], + } + def calc_similarities(self, digestA: str, digestB: str) -> dict: """ Calculates 
the Jaccard similarity between two sequence collections. @@ -910,20 +951,12 @@ def truncate(self) -> int: with Session(self.engine) as session: statement = delete(SequenceCollection) result1 = session.exec(statement) - statement = delete(Pangenome) - result = session.exec(statement) - statement = delete(NamesAttr) - result = session.exec(statement) - statement = delete(LengthsAttr) - result = session.exec(statement) - statement = delete(SequencesAttr) - result = session.exec(statement) - # statement = delete(SortedNameLengthPairsAttr) - # result = session.exec(statement) - statement = delete(NameLengthPairsAttr) - result = session.exec(statement) - statement = delete(SortedSequencesAttr) - result = session.exec(statement) + session.exec(delete(Pangenome)) + session.exec(delete(NamesAttr)) + session.exec(delete(LengthsAttr)) + session.exec(delete(SequencesAttr)) + session.exec(delete(NameLengthPairsAttr)) + session.exec(delete(SortedSequencesAttr)) session.commit() return result1.rowcount diff --git a/refget/backend.py b/refget/backend.py new file mode 100644 index 0000000..9408230 --- /dev/null +++ b/refget/backend.py @@ -0,0 +1,144 @@ +""" +SeqColBackend protocol and RefgetStoreBackend implementation. + +The SeqColBackend protocol defines the interface for serving seqcol API endpoints. +Two implementations: +- RefgetDBAgent (PostgreSQL) — full features including similarities, pangenomes, DRS +- RefgetStoreBackend (RefgetStore) — core seqcol operations, no database required +""" + +from __future__ import annotations + +from typing import Protocol, runtime_checkable + +from .utils import compare_seqcols + + +@runtime_checkable +class SeqColBackend(Protocol): + """Backend protocol for serving seqcol API endpoints.""" + + def get_collection(self, digest: str, level: int = 2) -> dict: + """Get a collection at level 1 or 2. Raises ValueError if not found.""" + ... + + def get_collection_attribute(self, digest: str, attribute: str) -> list: + """Get a single attribute array from a collection. Raises ValueError if not found.""" + ... + + def get_collection_itemwise(self, digest: str, limit: int | None = None) -> list[dict]: + """Get collection in itemwise format. Raises ValueError if not found.""" + ... + + def get_attribute(self, attribute_name: str, attribute_digest: str) -> list: + """Get an attribute by its own digest. Raises KeyError if not found.""" + ... + + def compare_digests(self, digest_a: str, digest_b: str) -> dict: + """Compare two collections by digest. Raises ValueError if not found.""" + ... + + def compare_digest_with_level2(self, digest: str, level2_b: dict) -> dict: + """Compare a stored collection with a POSTed level2 dict. Raises ValueError if not found.""" + ... + + def list_collections( + self, page: int = 0, page_size: int = 100, filters: dict | None = None + ) -> dict: + """List collections with pagination and optional attribute filters. + Returns {"results": [...], "pagination": {...}}""" + ... + + def collection_count(self) -> int: + """Total number of collections.""" + ... + + def capabilities(self) -> dict: + """Return backend capabilities for service-info.""" + ... + + +class RefgetStoreBackend: + """SeqColBackend backed by a RefgetStore (no database).""" + + def __init__(self, store): + """ + Args: + store: A RefgetStore or ReadonlyRefgetStore instance from gtars. 
+ """ + self._store = store + + def get_collection(self, digest: str, level: int = 2) -> dict: + try: + if level == 1: + result = self._store.get_collection_level1(digest) + else: + result = self._store.get_collection_level2(digest) + except (OSError, IOError): + raise ValueError(f"Collection '{digest}' not found") + if result is None: + raise ValueError(f"Collection '{digest}' not found") + return result + + def get_collection_attribute(self, digest: str, attribute: str) -> list: + level2 = self.get_collection(digest, level=2) + if attribute not in level2: + raise ValueError(f"Attribute '{attribute}' not found") + return level2[attribute] + + def get_collection_itemwise(self, digest: str, limit: int | None = None) -> list[dict]: + level2 = self.get_collection(digest, level=2) + # Transpose: {"names": [a,b], "lengths": [1,2]} -> [{"names": a, "lengths": 1}, ...] + keys = list(level2.keys()) + n = len(level2[keys[0]]) + if limit: + n = min(n, limit) + return [{k: level2[k][i] for k in keys} for i in range(n)] + + def get_attribute(self, attribute_name: str, attribute_digest: str) -> list: + result = self._store.get_attribute(attribute_name, attribute_digest) + if result is None: + raise KeyError(f"Attribute {attribute_name}/{attribute_digest} not found") + return result + + def compare_digests(self, digest_a: str, digest_b: str) -> dict: + try: + result = self._store.compare(digest_a, digest_b) + except (OSError, IOError): + raise ValueError("Collection not found") + if result is None: + raise ValueError("Collection not found") + return result + + def compare_digest_with_level2(self, digest: str, level2_b: dict) -> dict: + """Compare a stored collection with a POSTed level2 dict. + + The store does not have a native compare_with_level2, so we retrieve + level2 for the stored collection and use the Python compare utility. 
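+
+        Illustrative call (digest and attribute values are placeholders):
+
+            backend.compare_digest_with_level2(
+                "<stored-digest>",
+                {"names": ["chr1"], "lengths": [248956422], "sequences": ["<seq-digest>"]},
+            )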
+ """ + level2_a = self.get_collection(digest, level=2) + return compare_seqcols(level2_a, level2_b) + + def list_collections( + self, page: int = 0, page_size: int = 100, filters: dict | None = None + ) -> dict: + if filters: + raise ValueError("Filtering by attribute is not supported by RefgetStore backend") + return self._store.list_collections(page=page, page_size=page_size) + + def collection_count(self) -> int: + result = self._store.list_collections(page=0, page_size=1) + return result["pagination"]["total"] + + def capabilities(self) -> dict: + stats = self._store.stats() + n_collections = int(stats.get("n_collections", 0)) + n_sequences = int(stats.get("n_sequences", 0)) + return { + "backend_type": "refget_store", + "n_collections": n_collections, + "n_sequences": n_sequences, + "has_sequence_data": n_sequences > 0, + "collection_alias_namespaces": self._store.list_collection_alias_namespaces(), + "sequence_alias_namespaces": self._store.list_sequence_alias_namespaces(), + } diff --git a/refget/cli/admin.py b/refget/cli/admin.py index 1787c82..9e6b067 100644 --- a/refget/cli/admin.py +++ b/refget/cli/admin.py @@ -15,20 +15,18 @@ import json import os from pathlib import Path -from typing import Optional, List, Dict, Any +from typing import Any, Dict, List, Optional import typer from refget.cli.output import ( EXIT_CONFIG_ERROR, - EXIT_FILE_NOT_FOUND, EXIT_FAILURE, - EXIT_SUCCESS, + EXIT_FILE_NOT_FOUND, print_error, print_info, print_json, print_success, - print_warning, ) # Heavy imports (sqlmodel) are done lazily inside functions that need them diff --git a/refget/cli/config.py b/refget/cli/config.py index 6f5756f..7614ad5 100644 --- a/refget/cli/config.py +++ b/refget/cli/config.py @@ -14,14 +14,6 @@ import typer -from refget.cli.output import ( - EXIT_CONFIG_ERROR, - EXIT_FAILURE, - EXIT_SUCCESS, - print_error, - print_json, - print_success, -) from refget.cli.config_manager import ( DEFAULTS, get_config_path, @@ -30,6 +22,14 @@ save_config, set_value, ) +from refget.cli.output import ( + EXIT_CONFIG_ERROR, + EXIT_FAILURE, + EXIT_SUCCESS, + print_error, + print_json, + print_success, +) app = typer.Typer( name="config", @@ -220,7 +220,7 @@ def add( if resource_type not in RESOURCE_TYPE_MAP: valid_types = ", ".join(RESOURCE_TYPE_MAP.keys()) print_error( - f"Invalid resource type '{resource_type}'.\n" f"Valid types: {valid_types}", + f"Invalid resource type '{resource_type}'.\nValid types: {valid_types}", EXIT_CONFIG_ERROR, ) return # Unreachable, but clarifies control flow @@ -314,7 +314,7 @@ def remove( if resource_type not in RESOURCE_TYPE_MAP: valid_types = ", ".join(RESOURCE_TYPE_MAP.keys()) print_error( - f"Invalid resource type '{resource_type}'.\n" f"Valid types: {valid_types}", + f"Invalid resource type '{resource_type}'.\nValid types: {valid_types}", EXIT_CONFIG_ERROR, ) return # Unreachable, but clarifies control flow diff --git a/refget/cli/config_manager.py b/refget/cli/config_manager.py index aff7c8b..1eb968f 100644 --- a/refget/cli/config_manager.py +++ b/refget/cli/config_manager.py @@ -105,7 +105,7 @@ def save_config(config: Dict[str, Any]) -> None: """ if tomli_w is None: raise ImportError( - "tomli_w is required to save configuration.\n" "Install with: pip install tomli-w" + "tomli_w is required to save configuration.\nInstall with: pip install tomli-w" ) config_path = get_config_path() diff --git a/refget/cli/fasta.py b/refget/cli/fasta.py index 5c32442..d5f3313 100644 --- a/refget/cli/fasta.py +++ b/refget/cli/fasta.py @@ -22,8 +22,8 @@ import typer from 
refget.cli.output import ( - EXIT_FILE_NOT_FOUND, EXIT_FAILURE, + EXIT_FILE_NOT_FOUND, EXIT_SUCCESS, print_error, print_json, @@ -165,8 +165,11 @@ def index( ) files_created = [ - str(fai_path), str(seqcol_path), str(chrom_sizes_path), - str(rgsi_path), str(rgci_path), + str(fai_path), + str(seqcol_path), + str(chrom_sizes_path), + str(rgsi_path), + str(rgci_path), ] if json_output: diff --git a/refget/cli/main.py b/refget/cli/main.py index 61bb2a2..39129e8 100644 --- a/refget/cli/main.py +++ b/refget/cli/main.py @@ -4,16 +4,16 @@ This module defines the main CLI app and registers all command groups. """ -import typer from typing import Optional -from refget._version import __version__ +import typer +from refget._version import __version__ +from refget.cli.admin import app as admin_app from refget.cli.config import app as config_app from refget.cli.fasta import app as fasta_app -from refget.cli.store import app as store_app from refget.cli.seqcol import app as seqcol_app -from refget.cli.admin import app as admin_app +from refget.cli.store import app as store_app app = typer.Typer( name="refget", diff --git a/refget/cli/seqcol.py b/refget/cli/seqcol.py index 69dc486..14cc4c0 100644 --- a/refget/cli/seqcol.py +++ b/refget/cli/seqcol.py @@ -118,8 +118,8 @@ def _compute_snlp_digest(seqcol_dict: dict) -> str: Returns: The snlp digest (coordinate system identifier) """ - from refget.utils import build_sorted_name_length_pairs, canonical_str from refget.digests import sha512t24u_digest + from refget.utils import build_sorted_name_length_pairs, canonical_str snlp_digests = build_sorted_name_length_pairs(seqcol_dict) return sha512t24u_digest(canonical_str(snlp_digests)) diff --git a/refget/cli/store.py b/refget/cli/store.py index 904e1ce..0fe33e3 100644 --- a/refget/cli/store.py +++ b/refget/cli/store.py @@ -27,8 +27,8 @@ from refget.cli.config_manager import get_remote_stores, get_seqcol_servers, get_store_path from refget.cli.output import ( - EXIT_FILE_NOT_FOUND, EXIT_FAILURE, + EXIT_FILE_NOT_FOUND, EXIT_SUCCESS, check_dependency, print_error, @@ -71,7 +71,7 @@ def _get_store_path(path: Optional[Path]) -> Path: def _get_collection_digests(store) -> set: """Get the set of collection digest strings from a store.""" - return {meta.digest for meta in store.list_collections()} + return {meta.digest for meta in store.list_collections()["results"]} def _load_store(path: Optional[Path], must_exist: bool = True, remote: Optional[str] = None): @@ -150,7 +150,7 @@ def init( store_path.parent.mkdir(parents=True, exist_ok=True) # Initialize the store (creates index files) - store = RefgetStore.on_disk(str(store_path)) + RefgetStore.on_disk(str(store_path)) print_json( { @@ -187,12 +187,6 @@ def add( "-q", help="Suppress progress output", ), - threads: Optional[int] = typer.Option( - None, - "--threads", - "-t", - help="Number of threads for parallel encoding (default: 1)", - ), ) -> None: """ Import a FASTA file to the local store. 
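A minimal sketch (not part of the patch) of the paginated shape that store.list_collections() is assumed to return, inferred from the call sites updated above; the keys, keyword arguments, and the .digest attribute are assumptions drawn from those call sites rather than from gtars documentation:

    from refget.store import RefgetStore

    store = RefgetStore.in_memory()
    store.add_sequence_collection_from_fasta("test_fasta/base.fa")  # illustrative path

    page = store.list_collections(page=0, page_size=100)
    for meta in page["results"]:        # per-collection metadata objects
        print(meta.digest)              # top-level collection digest
    print(page["pagination"]["total"])  # total count, as used by `refget store stats`
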
@@ -223,9 +217,7 @@ def add( store.set_encoding_mode(StorageMode.Encoded) # Add the FASTA file - returns (metadata, was_new) with all info we need - metadata, was_new = store.add_sequence_collection_from_fasta( - str(fasta.resolve()), threads=threads - ) + metadata, was_new = store.add_sequence_collection_from_fasta(str(fasta.resolve())) print_json( { @@ -283,7 +275,7 @@ def list_items( print_json({"sequences": items}) else: collections = [] - for meta in store.list_collections(): + for meta in store.list_collections()["results"]: collections.append( { "digest": meta.digest, @@ -353,7 +345,9 @@ def get( store = _load_store(path, remote=remote) if sequence: - # Sequence retrieval mode + # Sequence retrieval mode — load sequence data + store.load_all_collections() + store.load_all_sequences() seq_data = None if name is not None: @@ -676,8 +670,9 @@ def export( """ store = _load_store(path, remote=remote) - # Ensure collection is loaded (required for export) + # Ensure collection and sequence data are loaded (required for export) _ensure_collection_loaded(store, digest) + store.load_all_sequences() def _do_export(output_path: str) -> None: """Perform the actual export to a file path.""" @@ -870,7 +865,7 @@ def stats( stats_dict["collections"] = int(stats_dict["collections"]) else: # Fallback: count collections ourselves - stats_dict["collections"] = len(store.list_collections()) + stats_dict["collections"] = store.list_collections()["pagination"]["total"] print_json(stats_dict) raise typer.Exit(EXIT_SUCCESS) @@ -914,9 +909,7 @@ def remove( @app.command() def metadata( digest: str = typer.Argument(help="Collection digest"), - path: Optional[Path] = typer.Option( - None, "--path", "-p", help="Store path" - ), + path: Optional[Path] = typer.Option(None, "--path", "-p", help="Store path"), ): """Show FHR metadata for a collection.""" store = _load_store(path) @@ -933,12 +926,339 @@ def metadata( def metadata_set( digest: str = typer.Argument(help="Collection digest"), file: Path = typer.Argument(help="Path to FHR JSON file"), - path: Optional[Path] = typer.Option( - None, "--path", "-p", help="Store path" - ), + path: Optional[Path] = typer.Option(None, "--path", "-p", help="Store path"), ): """Set FHR metadata for a collection from a JSON file.""" store = _load_store(path) store.load_fhr_metadata(digest, str(file)) print(f"Set FHR metadata for collection {digest}") raise typer.Exit(EXIT_SUCCESS) + + +@app.command("crate") +def crate( + path: Optional[Path] = typer.Option( + None, + "--path", + "-p", + help="Store path (default: from config)", + ), + name: str = typer.Option( + ..., + "--name", + "-n", + help="Name for the RO-Crate root dataset", + ), + description: Optional[str] = typer.Option( + None, + "--description", + "-d", + help="Description of the store", + ), + author: Optional[str] = typer.Option( + None, + "--author", + "-a", + help='Author in "Name " format, e.g. "Jane Doe "', + ), + license: Optional[str] = typer.Option( + None, + "--license", + "-l", + help="License URL, e.g. https://creativecommons.org/publicdomain/zero/1.0/", + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output path (default: /ro-crate-metadata.json)", + ), +) -> None: + """Generate an RO-Crate metadata file for a RefgetStore. + + Creates a ro-crate-metadata.json describing the store as a FAIR + research object, including structure, provenance, and statistics. 
+ + Examples: + refget store crate --path /store --name "My genomes" --author "J Doe " + refget store crate -p /store -n "Store" -l https://creativecommons.org/publicdomain/zero/1.0/ + """ + import json + import re + from datetime import datetime, timezone + + from refget._version import __version__ + + store = _load_store(path) + store_path = _get_store_path(path) + + # Gather stats + stats_obj = store.stats() + stats_dict = {} + if hasattr(stats_obj, "__iter__"): + for key, value in stats_obj.items(): + stats_dict[key] = value + elif hasattr(stats_obj, "__dict__"): + stats_dict = vars(stats_obj) + + storage_mode = stats_dict.get("storage_mode", "Unknown") + seq_count = int(stats_dict.get("n_sequences", stats_dict.get("sequences", 0))) + + # Count collections + try: + coll_count = store.list_collections()["pagination"]["total"] + except Exception: + coll_count = 0 + + # Build the @graph + now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + today = datetime.now(timezone.utc).strftime("%Y-%m-%d") + + graph = [ + # Metadata descriptor + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "conformsTo": [ + {"@id": "https://w3id.org/ro/crate/1.2"}, + {"@id": "https://w3id.org/ga4gh/refget/refgetstore-crate/0.1"}, + ], + "about": {"@id": "./"}, + }, + ] + + # Root dataset + root = { + "@id": "./", + "@type": "Dataset", + "name": name, + "datePublished": today, + "conformsTo": {"@id": "https://w3id.org/ga4gh/refget/refgetstore-crate/0.1"}, + "hasPart": [ + {"@id": "rgstore.json"}, + {"@id": "sequences.rgsi"}, + {"@id": "sequences/"}, + {"@id": "collections/"}, + ], + "additionalProperty": [ + {"@id": "#prop-storageMode"}, + {"@id": "#prop-sequenceCount"}, + {"@id": "#prop-collectionCount"}, + {"@id": "#prop-refgetDigestAlgorithm"}, + ], + } + if description: + root["description"] = description + if license: + root["license"] = {"@id": license} + if author: + # Parse "Name " format + match = re.match(r"^(.+?)\s*<(.+?)>\s*$", author) + if match: + author_name = match.group(1).strip() + author_url = match.group(2).strip() + root["author"] = {"@id": author_url} + else: + author_name = author.strip() + author_url = None + root["author"] = {"@id": f"#author-{author_name.replace(' ', '-').lower()}"} + + # Add aliases/ if it exists + aliases_path = store_path / "aliases" + if aliases_path.exists() and aliases_path.is_dir(): + root["hasPart"].append({"@id": "aliases/"}) + + graph.append(root) + + # Data entities + graph.extend([ + { + "@id": "rgstore.json", + "@type": "File", + "name": "Store configuration", + "description": "Operational configuration for RefgetStore: path templates, storage mode, format version.", + "encodingFormat": "application/json", + }, + { + "@id": "sequences.rgsi", + "@type": "File", + "name": "Master sequence index", + "description": "Tab-separated index of all sequences in the store with names, lengths, alphabets, and GA4GH digests.", + "encodingFormat": "text/tab-separated-values", + }, + { + "@id": "sequences/", + "@type": "Dataset", + "name": "Sequence data", + "description": "Content-addressable sequence files organized by digest prefix.", + }, + { + "@id": "collections/", + "@type": "Dataset", + "name": "Sequence collections", + "description": "GA4GH sequence collection metadata. 
Each .rgsi file defines a collection with its member sequences and digests.", + }, + ]) + + if aliases_path.exists() and aliases_path.is_dir(): + graph.append({ + "@id": "aliases/", + "@type": "Dataset", + "name": "Alias namespaces", + "description": "Human-readable name mappings for sequences and collections.", + }) + + # PropertyValue entities + graph.extend([ + { + "@id": "#prop-storageMode", + "@type": "PropertyValue", + "propertyID": "storageMode", + "name": "Storage Mode", + "value": storage_mode, + }, + { + "@id": "#prop-sequenceCount", + "@type": "PropertyValue", + "propertyID": "sequenceCount", + "name": "Sequence Count", + "value": seq_count, + }, + { + "@id": "#prop-collectionCount", + "@type": "PropertyValue", + "propertyID": "collectionCount", + "name": "Collection Count", + "value": coll_count, + }, + { + "@id": "#prop-refgetDigestAlgorithm", + "@type": "PropertyValue", + "propertyID": "refgetDigestAlgorithm", + "name": "Refget Digest Algorithm", + "value": "sha512t24u", + }, + ]) + + # CreateAction provenance + graph.extend([ + { + "@id": "#crate-creation", + "@type": "CreateAction", + "name": "Generate RO-Crate metadata for RefgetStore", + "endTime": now, + "instrument": {"@id": "#refget-software"}, + "result": {"@id": "./"}, + }, + { + "@id": "#refget-software", + "@type": "SoftwareApplication", + "name": "refget", + "version": __version__, + "url": "https://github.com/refgenie/refget", + "description": "Python package implementing GA4GH refget standards for sequences and sequence collections.", + }, + ]) + + # Add agent to CreateAction if author provided + if author: + graph[-2]["agent"] = root["author"] + + # Profile entity + graph.append({ + "@id": "https://w3id.org/ga4gh/refget/refgetstore-crate/0.1", + "@type": ["CreativeWork", "Profile"], + "name": "RefgetStore RO-Crate Profile", + "version": "0.1", + "description": "Profile for RO-Crates containing GA4GH RefgetStore sequence databases.", + }) + + # Author entity + if author: + match = re.match(r"^(.+?)\s*<(.+?)>\s*$", author) + if match: + graph.append({ + "@id": author_url, + "@type": "Person", + "name": author_name, + }) + else: + graph.append({ + "@id": root["author"]["@id"], + "@type": "Person", + "name": author_name, + }) + + # License entity + if license: + graph.append({ + "@id": license, + "@type": "CreativeWork", + "name": license.rstrip("/").split("/")[-1] or "License", + }) + + crate = { + "@context": "https://w3id.org/ro/crate/1.2/context", + "@graph": graph, + } + + # Write output + output_path = output or (store_path / "ro-crate-metadata.json") + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(crate, indent=2) + "\n") + + print_json({ + "output": str(output_path), + "status": "created", + "entities": len(graph), + }) + raise typer.Exit(EXIT_SUCCESS) + + +@app.command("serve") +def serve( + path: Optional[Path] = typer.Option(None, "--path", "-p", help="Local store path"), + remote: Optional[str] = typer.Option( + None, "--remote", "-r", help="Remote store URL (e.g. s3://bucket/store/)" + ), + port: int = typer.Option(8000, "--port", help="Port to serve on"), + host: str = typer.Option("0.0.0.0", "--host", help="Host to bind to"), +): + """Serve a seqcol API backed by a RefgetStore (no database required). 
+ + Examples: + refget store serve --path /path/to/store --port 8000 + refget store serve --remote s3://bucket/store/ --port 8000 + """ + try: + import uvicorn + except ImportError: + print_error("uvicorn is required: pip install uvicorn", EXIT_FAILURE) + + from refget.backend import RefgetStoreBackend + + if remote: + store = _load_store(path=None, remote=remote) + elif path: + store = _load_store(path) + else: + store = _load_store(None) + + backend = RefgetStoreBackend(store.into_readonly()) + + from fastapi import FastAPI + + from refget.router import create_refget_router + + app = FastAPI(title="Sequence Collections API (Store-backed)") + app.state.backend = backend + router = create_refget_router( + sequences=False, + pangenomes=False, + refget_store_url=remote, + ) + app.include_router(router) + + typer.echo(f"Serving store-backed seqcol API on {host}:{port}") + uvicorn.run(app, host=host, port=port) + raise typer.Exit(EXIT_SUCCESS) diff --git a/refget/clients.py b/refget/clients.py index 977eb68..6ad9196 100644 --- a/refget/clients.py +++ b/refget/clients.py @@ -1,8 +1,13 @@ +from __future__ import annotations + import logging import re +from typing import TYPE_CHECKING, Optional + import requests -from typing import Optional +if TYPE_CHECKING: + from .store import RefgetStore _LOGGER = logging.getLogger(__name__) @@ -602,12 +607,12 @@ def download_to_store( >>> client = FastaDrsClient() >>> collection_digest = client.download_to_store("abc123", store) """ - import tempfile import os + import tempfile # Verify store is available try: - from .store import RefgetStore as RefgetStoreClass + from .store import RefgetStore as RefgetStoreClass # noqa: F401 except ImportError: raise ImportError("gtars is required for download_to_store functionality") diff --git a/refget/compliance.py b/refget/compliance.py index 7020d9e..eaa64f7 100644 --- a/refget/compliance.py +++ b/refget/compliance.py @@ -15,7 +15,7 @@ import json import logging import time -from dataclasses import dataclass, field, asdict +from dataclasses import asdict, dataclass, field from datetime import datetime, timezone from pathlib import Path @@ -96,11 +96,17 @@ def _timed_check(name: str, func, *args, **kwargs) -> CheckResult: try: func(*args, **kwargs) elapsed = (time.monotonic() - start) * 1000 - return CheckResult(name=name, passed=True, duration_ms=round(elapsed, 2), description=description) + return CheckResult( + name=name, passed=True, duration_ms=round(elapsed, 2), description=description + ) except AssertionError as e: elapsed = (time.monotonic() - start) * 1000 return CheckResult( - name=name, passed=False, duration_ms=round(elapsed, 2), description=description, error=str(e) + name=name, + passed=False, + duration_ms=round(elapsed, 2), + description=description, + error=str(e), ) except requests.exceptions.RequestException as e: elapsed = (time.monotonic() - start) * 1000 @@ -163,9 +169,9 @@ def check_list_attributes(api_root, attribute_name): res = requests.get(f"{api_root}/list/attributes/{attribute_name}", timeout=COMPLIANCE_TIMEOUT) data = res.json() assert "results" in data, f"list/attributes/{attribute_name} missing 'results' field" - assert isinstance( - data["results"], list - ), f"list/attributes/{attribute_name} 'results' should be a list" + assert isinstance(data["results"], list), ( + f"list/attributes/{attribute_name} 'results' should be a list" + ) def check_openapi_available(api_root): @@ -185,7 +191,9 @@ def check_collection_level1(api_root, fa_name, bundle): """Level 1 response returns digest 
strings for all attributes.""" digest = bundle["top_level_digest"] res = requests.get(f"{api_root}/collection/{digest}?level=1", timeout=COMPLIANCE_TIMEOUT) - assert res.status_code == 200, f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + assert res.status_code == 200, ( + f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + ) data = res.json() for attr in ["names", "lengths", "sequences"]: assert isinstance(data[attr], str), ( @@ -201,7 +209,9 @@ def check_collection_level2(api_root, fa_name, bundle): """Level 2 response returns arrays matching expected content.""" digest = bundle["top_level_digest"] res = requests.get(f"{api_root}/collection/{digest}?level=2", timeout=COMPLIANCE_TIMEOUT) - assert res.status_code == 200, f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + assert res.status_code == 200, ( + f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + ) data = res.json() for attr in ["names", "lengths", "sequences"]: assert isinstance(data[attr], list), ( @@ -210,14 +220,18 @@ def check_collection_level2(api_root, fa_name, bundle): assert data[attr] == bundle["level2"][attr], ( f"Level 2 {attr} for {fa_name}: expected {bundle['level2'][attr]}, got {data[attr]}" ) - assert "sorted_name_length_pairs" not in data, "Level 2 should not have sorted_name_length_pairs" + assert "sorted_name_length_pairs" not in data, ( + "Level 2 should not have sorted_name_length_pairs" + ) def check_default_level_returns_level2(api_root, fa_name, bundle): """Collection without ?level= param returns level 2 arrays (spec default).""" digest = bundle["top_level_digest"] res = requests.get(f"{api_root}/collection/{digest}", timeout=COMPLIANCE_TIMEOUT) - assert res.status_code == 200, f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + assert res.status_code == 200, ( + f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + ) data = res.json() for attr in ["names", "lengths", "sequences"]: assert isinstance(data[attr], list), ( @@ -229,13 +243,13 @@ def check_sorted_name_length_pairs(api_root, fa_name, bundle): """Level 1 sorted_name_length_pairs digest matches expected value.""" digest = bundle["top_level_digest"] res = requests.get(f"{api_root}/collection/{digest}?level=1", timeout=COMPLIANCE_TIMEOUT) - assert res.status_code == 200, f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + assert res.status_code == 200, ( + f"Collection {digest} returned HTTP {res.status_code} (expected 200)" + ) data = res.json() expected = bundle["sorted_name_length_pairs_digest"] actual = data.get("sorted_name_length_pairs") - assert actual == expected, ( - f"SNLP for {fa_name}: expected {expected}, got {actual}" - ) + assert actual == expected, f"SNLP for {fa_name}: expected {expected}, got {actual}" # ============================================================ @@ -263,13 +277,17 @@ def check_transient_attribute_not_served(api_root): """Transient attributes (sorted_name_length_pairs) return 404 from /attribute.""" bundle = DIGEST_TESTS[0][1] digest = bundle["top_level_digest"] - level1 = requests.get(f"{api_root}/collection/{digest}?level=1", timeout=COMPLIANCE_TIMEOUT).json() + level1 = requests.get( + f"{api_root}/collection/{digest}?level=1", timeout=COMPLIANCE_TIMEOUT + ).json() snlp_digest = level1["sorted_name_length_pairs"] res = requests.get( f"{api_root}/attribute/collection/sorted_name_length_pairs/{snlp_digest}", timeout=COMPLIANCE_TIMEOUT, ) - assert res.status_code == 404, 
"Transient attributes should not be served by /attribute endpoint" + assert res.status_code == 404, ( + "Transient attributes should not be served by /attribute endpoint" + ) # ============================================================ @@ -411,33 +429,53 @@ def build_checks(api_root: str) -> list[tuple[str, callable, list]]: # Collection content checks (per FASTA file) for fa_name, bundle in DIGEST_TESTS: tag = fa_name.replace(".fa", "") - checks.append((f"collection_level1_{tag}", check_collection_level1, [api_root, fa_name, bundle])) - checks.append((f"collection_level2_{tag}", check_collection_level2, [api_root, fa_name, bundle])) - checks.append((f"default_level2_{tag}", check_default_level_returns_level2, [api_root, fa_name, bundle])) - checks.append((f"snlp_digest_{tag}", check_sorted_name_length_pairs, [api_root, fa_name, bundle])) + checks.append( + (f"collection_level1_{tag}", check_collection_level1, [api_root, fa_name, bundle]) + ) + checks.append( + (f"collection_level2_{tag}", check_collection_level2, [api_root, fa_name, bundle]) + ) + checks.append( + ( + f"default_level2_{tag}", + check_default_level_returns_level2, + [api_root, fa_name, bundle], + ) + ) + checks.append( + (f"snlp_digest_{tag}", check_sorted_name_length_pairs, [api_root, fa_name, bundle]) + ) # Attribute retrieval checks (per FASTA, per attribute) for fa_name, bundle in DIGEST_TESTS: tag = fa_name.replace(".fa", "") for attr in ["lengths", "names", "sequences"]: - checks.append(( - f"attribute_{attr}_{tag}", - check_attribute_retrieval, - [api_root, fa_name, bundle, attr], - )) + checks.append( + ( + f"attribute_{attr}_{tag}", + check_attribute_retrieval, + [api_root, fa_name, bundle, attr], + ) + ) # Attribute filtering checks - checks.append(("transient_attribute_not_served", check_transient_attribute_not_served, [api_root])) - checks.append(("multi_attribute_filter_and", check_list_multi_attribute_filter_and, [api_root])) + checks.append( + ("transient_attribute_not_served", check_transient_attribute_not_served, [api_root]) + ) + checks.append( + ("multi_attribute_filter_and", check_list_multi_attribute_filter_and, [api_root]) + ) # List filter checks (base.fa, filter by each attribute) base_name, base_bundle = DIGEST_TESTS[0] for attr in ["lengths", "names", "sequences"]: - checks.append(( - f"list_filter_{attr}", - check_list_filter_by_attribute, - [api_root, base_name, base_bundle, attr], - )) + checks.append( + ( + f"list_filter_{attr}", + check_list_filter_by_attribute, + [api_root, base_name, base_bundle, attr], + ) + ) # Comparison checks checks.append(("comparison_structure", check_comparison_structure, [api_root])) @@ -446,7 +484,9 @@ def build_checks(api_root: str) -> list[tuple[str, callable, list]]: for fixture_name, expected in COMPARISON_FIXTURES.items(): tag = fixture_name.replace("compare_", "").replace(".json", "") checks.append((f"comparison_{tag}", check_comparison, [api_root, fixture_name, expected])) - checks.append((f"comparison_post_{tag}", check_comparison_post, [api_root, fixture_name, expected])) + checks.append( + (f"comparison_post_{tag}", check_comparison_post, [api_root, fixture_name, expected]) + ) return checks diff --git a/refget/const.py b/refget/const.py index 68104fe..66f3175 100644 --- a/refget/const.py +++ b/refget/const.py @@ -1,5 +1,5 @@ -import os import logging +import os _LOGGER = logging.getLogger(__name__) diff --git a/refget/digests.py b/refget/digests.py index b72ba0c..6ffa265 100644 --- a/refget/digests.py +++ b/refget/digests.py @@ -4,9 +4,8 @@ When gtars 
is not available, falls back to pure Python implementations (slower). """ -import hashlib import base64 - +import hashlib from typing import Callable, Union from .const import GTARS_INSTALLED @@ -34,7 +33,7 @@ def py_md5_digest(seq) -> str: # Default exports - use gtars if available, else Python fallback if GTARS_INSTALLED: - from gtars.refget import sha512t24u_digest, md5_digest + from gtars.refget import md5_digest, sha512t24u_digest else: sha512t24u_digest = py_sha512t24u_digest md5_digest = py_md5_digest diff --git a/refget/examples.py b/refget/examples.py index 94ac812..064c30b 100644 --- a/refget/examples.py +++ b/refget/examples.py @@ -1,7 +1,7 @@ # Models # Used for documentation examples in OpenAPI -from fastapi import Path, Body +from fastapi import Body, Path example_digest = Path( ..., diff --git a/refget/models.py b/refget/models.py index 5476e86..b6e4e87 100644 --- a/refget/models.py +++ b/refget/models.py @@ -2,12 +2,11 @@ import logging from copy import copy from datetime import datetime, timezone -from sqlalchemy.types import TypeDecorator -from sqlmodel import Field, SQLModel, Column, Relationship -from sqlmodel import JSON -from typing import List, Optional, Dict, Any, Literal, TYPE_CHECKING -from pydantic import BaseModel, field_validator, field_serializer +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional +from pydantic import BaseModel, field_serializer, field_validator +from sqlalchemy.types import TypeDecorator +from sqlmodel import JSON, Column, Field, Relationship, SQLModel from .digests import sha512t24u_digest @@ -38,19 +37,18 @@ def _serialize_item(self, item): return item -from .const import ( +from .const import ( # noqa: E402 DEFAULT_INHERENT_ATTRS, - DEFAULT_PASSTHRU_ATTRS, - SEQCOL_SCHEMA_PATH, GTARS_INSTALLED, + SEQCOL_SCHEMA_PATH, ) -from .exceptions import InvalidSeqColError -from .utils import ( - canonical_str, +from .exceptions import InvalidSeqColError # noqa: E402 +from .utils import ( # noqa: E402 build_name_length_pairs, - seqcol_dict_to_level1_dict, - level1_dict_to_seqcol_digest, + canonical_str, fasta_to_seqcol_dict, + level1_dict_to_seqcol_digest, + seqcol_dict_to_level1_dict, ) _LOGGER = logging.getLogger(__name__) @@ -71,9 +69,9 @@ def create_fasta_drs_object(fasta_file: str, digest: str = None) -> "FastaDrsObj Raises: ImportError: If gtars is not installed (required for FASTA processing) """ - import os import hashlib - from datetime import datetime, timezone + import os + from datetime import datetime if not GTARS_INSTALLED: raise ImportError( diff --git a/refget/router.py b/refget/router.py index eeb76ef..a2af04d 100644 --- a/refget/router.py +++ b/refget/router.py @@ -7,25 +7,26 @@ by the main app. 
To use, first import it, then attach it to the app, -then create a dbagent object to connect to the database, -and attach it to the app state like this: +then create a backend object and attach it to the app state like this: from refget.router import create_refget_router from refget.agents import RefgetDBAgent router = create_refget_router(sequences=False, collections=True, pangenomes=False) app.include_router(router, prefix="/seqcol") -app.state.dbagent = RefgetDBAgent() +dbagent = RefgetDBAgent() +app.state.backend = dbagent # RefgetDBAgent satisfies SeqColBackend +app.state.dbagent = dbagent # For DB-only endpoints (similarities, pangenomes, DRS) """ import logging -from fastapi import APIRouter, Response, HTTPException, Request, Depends, Query +from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response from fastapi.responses import StreamingResponse -from .models import Similarities, PaginationResult, PaginatedDigestList -from .agents import RefgetDBAgent +from .backend import SeqColBackend from .examples import * +from .models import PaginatedDigestList, PaginationResult, Similarities _LOGGER = logging.getLogger(__name__) @@ -36,9 +37,17 @@ _ROUTER_CONFIG: dict = {} -# dbagent is a RefgetDBAgent, which handles connection to the POSTGRES database -async def get_dbagent(request: Request) -> RefgetDBAgent: - return request.app.state.dbagent +async def get_backend(request: Request) -> SeqColBackend: + """Get the SeqColBackend from the app state.""" + return request.app.state.backend + + +async def get_dbagent(request: Request): + """Get the RefgetDBAgent for DB-only endpoints. Returns None if not configured.""" + dbagent = getattr(request.app.state, "dbagent", None) + if dbagent is None: + raise HTTPException(status_code=501, detail="This endpoint requires database backend") + return dbagent def create_refget_router( @@ -103,10 +112,10 @@ def create_refget_router( tags=["Retrieving data"], ) async def sequence( - dbagent=Depends(get_dbagent), sequence_digest: str = example_sequence, start: int | None = Query(None, description="Start position (0-based, inclusive)"), end: int | None = Query(None, description="End position (0-based, exclusive)"), + dbagent=Depends(get_dbagent), ): return Response(content=dbagent.seq.get(sequence_digest, start, end), media_type="text/plain") @@ -116,7 +125,7 @@ async def sequence( summary="Retrieve metadata for a sequence", tags=["Retrieving data"], ) -async def seq_metadata(dbagent=Depends(get_dbagent), sequence_digest: str = example_sequence): +async def seq_metadata(sequence_digest: str = example_sequence, dbagent=Depends(get_dbagent)): raise HTTPException(status_code=501, detail="Metadata retrieval not yet implemented.") @@ -129,13 +138,15 @@ async def seq_metadata(dbagent=Depends(get_dbagent), sequence_digest: str = exam tags=["Retrieving data"], ) async def collection( - dbagent=Depends(get_dbagent), collection_digest: str = example_collection_digest, level: int | None = Query(None, description="Recursion depth (1 or 2)", ge=1, le=2), collated: bool = Query(True, description="Return collated format (arrays) vs itemwise"), - attribute: str | None = Query(None, description="Return only this attribute (e.g., 'names', 'lengths')"), + attribute: str | None = Query( + None, description="Return only this attribute (e.g., 'names', 'lengths')" + ), + backend=Depends(get_backend), ): - if level == None: + if level is None: level = 2 if level > 2: raise HTTPException( @@ -144,16 +155,10 @@ async def collection( ) try: if not collated: - return 
dbagent.seqcol.get( - collection_digest, return_format="itemwise", itemwise_limit=10000 - ) + return backend.get_collection_itemwise(collection_digest, limit=10000) if attribute: - return dbagent.seqcol.get(collection_digest, attribute=attribute) - if level == 1: - return dbagent.seqcol.get(collection_digest, return_format="level1") - if level == 2: - return dbagent.seqcol.get(collection_digest, return_format="level2") - return {"error": "Invalid level specified."} + return backend.get_collection_attribute(collection_digest, attribute) + return backend.get_collection(collection_digest, level=level) except ValueError as e: raise HTTPException( status_code=404, @@ -167,18 +172,18 @@ async def collection( tags=["Retrieving data"], ) async def attribute( - dbagent=Depends(get_dbagent), attribute_name: str = "names", attribute_digest: str = example_attribute_digest, + backend=Depends(get_backend), ): try: - return dbagent.attribute.get(attribute_name, attribute_digest) - except KeyError as e: + return backend.get_attribute(attribute_name, attribute_digest) + except KeyError: raise HTTPException( status_code=404, detail="Error: attribute not found. Check the attribute and try again.", ) - except AttributeError as e: + except AttributeError: raise HTTPException( status_code=404, detail="Digest not found. Check the digest and try again.", @@ -191,15 +196,15 @@ async def attribute( tags=["Comparing sequence collections"], ) async def compare_2_digests( - dbagent=Depends(get_dbagent), collection_digest1: str = example_digest_hg38, collection_digest2: str = example_digest_hg38_primary, + backend=Depends(get_backend), ): _LOGGER.info("Comparing two digests...") result = {} result["digests"] = {"a": collection_digest1, "b": collection_digest2} try: - result.update(dbagent.compare_digests(collection_digest1, collection_digest2)) + result.update(backend.compare_digests(collection_digest1, collection_digest2)) except ValueError as e: _LOGGER.debug(e) raise HTTPException( @@ -319,9 +324,9 @@ async def calc_similarities_from_json( tags=["Comparing sequence collections"], ) async def compare_1_digest( - dbagent=Depends(get_dbagent), collection_digest1: str = example_digest_hg38, seqcolB: dict = example_hg38_sc, + backend=Depends(get_backend), ): _LOGGER.info("Comparing one digests and one POSTed seqcol...") _LOGGER.info(f"digest1: {collection_digest1}") @@ -329,7 +334,7 @@ async def compare_1_digest( result = {} result["digests"] = {"a": collection_digest1, "b": "POSTed seqcol"} try: - result.update(dbagent.compare_1_digest(collection_digest1, seqcolB)) + result.update(backend.compare_digest_with_level2(collection_digest1, seqcolB)) except ValueError as e: _LOGGER.debug(e) raise HTTPException( @@ -346,7 +351,6 @@ async def compare_1_digest( response_model=PaginatedDigestList, ) async def list_collections_by_offset( - dbagent=Depends(get_dbagent), page_size: int = Query(100, description="Number of results per page"), page: int = Query(0, description="Page number (0-indexed)"), names: str | None = Query(None, description="Filter by names attribute digest"), @@ -354,32 +358,28 @@ async def list_collections_by_offset( sequences: str | None = Query(None, description="Filter by sequences attribute digest"), name_length_pairs: str | None = Query(None, description="Filter by name_length_pairs digest"), sorted_sequences: str | None = Query(None, description="Filter by sorted_sequences digest"), + backend=Depends(get_backend), ): # Build filters from explicit parameters filters = { - k: v for k, v in { + k: v + for k, 
v in { "names": names, "lengths": lengths, "sequences": sequences, "name_length_pairs": name_length_pairs, "sorted_sequences": sorted_sequences, - }.items() if v is not None + }.items() + if v is not None } - if filters: - try: - # Multi-attribute filtering with AND logic - res = dbagent.seqcol.search_by_attributes( - filters, limit=page_size, offset=page * page_size - ) - except ValueError as e: - # Invalid attribute name - raise HTTPException(status_code=400, detail=str(e)) - else: - # No filters, return all collections - res = dbagent.seqcol.list_by_offset(limit=page_size, offset=page * page_size) + try: + res = backend.list_collections(page=page, page_size=page_size, filters=filters or None) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) - res["results"] = [x.digest for x in res["results"]] + # Normalize results to digest strings (DB backend returns model objects) + res["results"] = [x.digest if hasattr(x, "digest") else x for x in res["results"]] return res @@ -399,7 +399,7 @@ async def list_attributes( res = dbagent.attribute.list(attribute, limit=page_size, offset=page * page_size) res["results"] = [x.digest for x in res["results"]] return res - except KeyError as e: + except KeyError: raise HTTPException( status_code=404, detail="Error: attribute not found. Check the attribute and try again.", @@ -438,7 +438,7 @@ async def pangenome( level: int | None = Query(None, description="Recursion depth (1-4)", ge=1, le=4), collated: bool = Query(True, description="Return collated format (arrays) vs itemwise"), ): - if level == None: + if level is None: level = 2 try: if not collated: @@ -579,7 +579,9 @@ async def get_fasta_index( ) def run_compliance_endpoint( request: Request, - target_url: str | None = Query(None, description="Target server URL to test (defaults to self)"), + target_url: str | None = Query( + None, description="Target server URL to test (defaults to self)" + ), ): """ Run GA4GH SeqCol compliance structure tests against a server. @@ -606,7 +608,9 @@ def run_compliance_endpoint( ) def stream_compliance_endpoint( request: Request, - target_url: str | None = Query(None, description="Target server URL to test (defaults to self)"), + target_url: str | None = Query( + None, description="Target server URL to test (defaults to self)" + ), ): """ Stream compliance check results in real-time via Server-Sent Events. 
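The router now resolves the core seqcol endpoints through app.state.backend (anything satisfying SeqColBackend), while app.state.dbagent is kept only for database-specific routes. A minimal wiring sketch, assuming gtars is installed and that /data/store is an existing on-disk store (the path is illustrative); it mirrors the `refget store serve` command and create_store_app elsewhere in this patch:

    from fastapi import FastAPI

    from refget.backend import RefgetStoreBackend
    from refget.router import create_refget_router
    from refget.store import RefgetStore

    app = FastAPI(title="Store-backed seqcol API")
    store = RefgetStore.on_disk("/data/store")  # illustrative path
    app.state.backend = RefgetStoreBackend(store.into_readonly())
    # app.state.dbagent is intentionally not set, so DB-only routes
    # (similarities, pangenomes, /list/attributes) respond with HTTP 501.
    app.include_router(create_refget_router(sequences=False, pangenomes=False))
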
diff --git a/refget/store.py b/refget/store.py index 30379e5..41f79ee 100644 --- a/refget/store.py +++ b/refget/store.py @@ -18,11 +18,11 @@ if GTARS_INSTALLED: from gtars.refget import ( RefgetStore, + SequenceCollection, StorageMode, - digest_fasta, compute_fai, + digest_fasta, digest_sequence, - SequenceCollection, ) else: RefgetStore = None diff --git a/refget/utils.py b/refget/utils.py index 7c73799..c3b3e16 100644 --- a/refget/utils.py +++ b/refget/utils.py @@ -1,19 +1,19 @@ import json import logging - -from jsonschema import Draft7Validator from pathlib import Path from typing import Optional, Union +from jsonschema import Draft7Validator + from .const import ( - SeqColDict, DEFAULT_INHERENT_ATTRS, DEFAULT_PASSTHRU_ATTRS, - SEQCOL_SCHEMA_PATH, GTARS_INSTALLED, + SEQCOL_SCHEMA_PATH, + SeqColDict, ) +from .digests import DigestFunction, sha512t24u_digest from .exceptions import InvalidSeqColError -from .digests import sha512t24u_digest, DigestFunction _LOGGER = logging.getLogger(__name__) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt deleted file mode 100644 index 1d1c6e3..0000000 --- a/requirements/requirements-all.txt +++ /dev/null @@ -1,7 +0,0 @@ -jsonschema -gtars>=0.7.0 -pyyaml -requests -sqlmodel -tomli_w -typer>=0.9.0 diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt deleted file mode 100644 index 60c9958..0000000 --- a/requirements/requirements-dev.txt +++ /dev/null @@ -1 +0,0 @@ --e git+git://github.com/databio/henge@master#egg=henge \ No newline at end of file diff --git a/requirements/requirements-docs.txt b/requirements/requirements-docs.txt deleted file mode 100644 index b2a9546..0000000 --- a/requirements/requirements-docs.txt +++ /dev/null @@ -1,2 +0,0 @@ -https://github.com/refgenie/refget/archive/master.zip -https://github.com/databio/mkdocs-databio/archive/master.zip diff --git a/requirements/requirements-seqcolapi.txt b/requirements/requirements-seqcolapi.txt deleted file mode 100644 index bbd3811..0000000 --- a/requirements/requirements-seqcolapi.txt +++ /dev/null @@ -1,6 +0,0 @@ -fastapi -psycopg2-binary -refget -sqlmodel -uvicorn>=0.30.0 -ubiquerg>=0.6.1 diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt deleted file mode 100644 index aadcdae..0000000 --- a/requirements/requirements-test.txt +++ /dev/null @@ -1,3 +0,0 @@ --r requirements-all.txt -coveralls>=1.1 -pytest-cov>=6.0.0 \ No newline at end of file diff --git a/seqcolapi/__main__.py b/seqcolapi/__main__.py index 2e4396a..3ea99ba 100644 --- a/seqcolapi/__main__.py +++ b/seqcolapi/__main__.py @@ -1,4 +1,5 @@ import sys + from .main import main if __name__ == "__main__": diff --git a/seqcolapi/const.py b/seqcolapi/const.py index ccf3895..34e6aac 100644 --- a/seqcolapi/const.py +++ b/seqcolapi/const.py @@ -1,8 +1,9 @@ import os +from platform import python_version -from refget._version import __version__ as refget_version from gtars import __version__ as gtars_version -from platform import python_version + +from refget._version import __version__ as refget_version ALL_VERSIONS = { "refget_version": refget_version, diff --git a/seqcolapi/examples.py b/seqcolapi/examples.py index 2704252..032b863 100644 --- a/seqcolapi/examples.py +++ b/seqcolapi/examples.py @@ -1,7 +1,7 @@ # Models # Used for documentation examples in OpenAPI -from fastapi import Path, Body +from fastapi import Body, Path example_digest = Path( ..., diff --git a/seqcolapi/main.py b/seqcolapi/main.py index b13cc72..c21c995 100644 --- 
a/seqcolapi/main.py +++ b/seqcolapi/main.py @@ -1,22 +1,20 @@ import logging +from contextlib import asynccontextmanager -from fastapi import FastAPI, Depends -from fastapi import HTTPException +from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, FileResponse, HTMLResponse -from refget.router import create_refget_router, get_dbagent +from fastapi.responses import FileResponse, HTMLResponse, JSONResponse +from sqlmodel import Session, select from starlette.requests import Request from starlette.staticfiles import StaticFiles -from sqlmodel import Session, select -from contextlib import asynccontextmanager -from .const import ALL_VERSIONS, STATIC_PATH, STATIC_DIRNAME +from refget.agents import RefgetDBAgent from refget.const import HUMANS_SAMPLE_LIST, MOUSE_SAMPLES_LIST from refget.models import HumanReadableNames -from .examples import * +from refget.router import _ROUTER_CONFIG, _SAMPLE_DIGESTS, create_refget_router -from refget.router import _SAMPLE_DIGESTS, _ROUTER_CONFIG -from refget.agents import RefgetDBAgent +from .const import ALL_VERSIONS, STATIC_DIRNAME, STATIC_PATH +from .examples import * global _LOGGER _LOGGER = logging.getLogger(__name__) @@ -35,6 +33,7 @@ async def lifespan_loader(app): # Initialize database agent and store in app state dbagent = RefgetDBAgent() app.state.dbagent = dbagent + app.state.backend = dbagent # RefgetDBAgent satisfies SeqColBackend species_samples = {"human": HUMANS_SAMPLE_LIST, "mouse": MOUSE_SAMPLES_LIST} @@ -121,13 +120,13 @@ async def http_exception_handler(request: Request, exc: HTTPException): @app.exception_handler(ValueError) -async def generic_exception_handler(request: Request, exc: Exception): +async def value_error_handler(request: Request, exc: Exception): raise HTTPException(status_code=404, detail=str(exc)) @app.get("favicon.ico", include_in_schema=False) async def favicon(): - return FileResponse(f"/static/favicon.ico") + return FileResponse("/static/favicon.ico") @app.get("/", summary="Home page", tags=["General endpoints"], response_class=HTMLResponse) @@ -149,10 +148,14 @@ async def service_info(): "fasta_drs": {"enabled": _ROUTER_CONFIG.get("fasta_drs", False)}, } + # Get backend capabilities + backend = getattr(app.state, "backend", None) + caps = backend.capabilities() if backend and hasattr(backend, "capabilities") else {} + # Add refget_store info store_url = _ROUTER_CONFIG.get("refget_store_url") if store_url: - seqcol_info["refget_store"] = {"enabled": True, "url": store_url} + seqcol_info["refget_store"] = {"enabled": True, "url": store_url, **caps} else: seqcol_info["refget_store"] = {"enabled": False} @@ -176,7 +179,7 @@ async def service_info(): # Mount statics after other routes for lower precedence -app.mount(f"/", StaticFiles(directory=STATIC_PATH), name=STATIC_DIRNAME) +app.mount("/", StaticFiles(directory=STATIC_PATH), name=STATIC_DIRNAME) def create_global_dbagent(): @@ -188,5 +191,37 @@ def create_global_dbagent(): return dbagent +def create_store_app(store_path: str, remote: bool = False, cache_dir: str = "/tmp/seqcol_cache"): + """Create a seqcolapi FastAPI app backed by a RefgetStore (no database). + + Args: + store_path: Path to store on disk, or S3 URL for remote stores. + remote: If True, open as a remote (S3) store. + cache_dir: Local cache directory for remote stores. + + Returns: + FastAPI app with store-backed seqcol endpoints. 
+ """ + from refget.backend import RefgetStoreBackend + from refget.store import RefgetStore + + if remote: + store = RefgetStore.open_remote(cache_dir, store_path) + else: + store = RefgetStore.on_disk(store_path) + + backend = RefgetStoreBackend(store.into_readonly()) + + store_app = FastAPI(title="Sequence Collections API (Store-backed)") + store_app.state.backend = backend + router = create_refget_router( + sequences=False, pangenomes=False, refget_store_url=store_path if remote else None + ) + store_app.include_router(router) + return store_app + + if __name__ != "__main__": - app.state.dbagent = create_global_dbagent() + _dbagent = create_global_dbagent() + app.state.dbagent = _dbagent + app.state.backend = _dbagent # RefgetDBAgent satisfies SeqColBackend diff --git a/setup.py b/setup.py deleted file mode 100644 index 4e22f29..0000000 --- a/setup.py +++ /dev/null @@ -1,56 +0,0 @@ -#! /usr/bin/env python - -import os -from setuptools import setup, find_packages -import sys - -PACKAGE = "refget" - -# Additional keyword arguments for setup(). -extra = {} - -# Ordinary dependencies -DEPENDENCIES = [] -with open("requirements/requirements-all.txt", "r") as reqs_file: - for line in reqs_file: - if not line.strip(): - continue - DEPENDENCIES.append(line) - -extra["install_requires"] = DEPENDENCIES - -with open("{}/_version.py".format(PACKAGE), "r") as versionfile: - version = versionfile.readline().split()[-1].strip("\"'\n") - -long_description = open("README.md").read() - -setup( - name=PACKAGE, - packages=find_packages(include=[PACKAGE, f"{PACKAGE}.*"]), - version=version, - description="Python client for refget", - long_description=long_description, - long_description_content_type="text/markdown", - classifiers=[ - "Development Status :: 4 - Beta", - "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - ], - keywords="genome, assembly, bioinformatics, reference, sequence", - url="https://github.com/refgenie/refget", - author="Nathan Sheffield, Michal Stolarczyk", - author_email="nathan@code.databio.org", - license="BSD2", - entry_points={ - "console_scripts": ["refget = refget.cli:main"], - }, - # package_data={"refget": [os.path.join("refget", "*")]}, - include_package_data=True, - test_suite="tests", - tests_require=(["mock", "pytest"]), - setup_requires=(["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []), - **extra, -) diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 9ebaaca..49ea738 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -1,6 +1,7 @@ -import pytest from pathlib import Path +import pytest + @pytest.fixture(scope="session") def test_data_root(): diff --git a/tests/api/test_compliance.py b/tests/api/test_compliance.py index 0e8652d..8960493 100644 --- a/tests/api/test_compliance.py +++ b/tests/api/test_compliance.py @@ -10,25 +10,26 @@ # ./scripts/test-integration.sh import pytest + from refget.compliance import ( - DIGEST_TESTS, COMPARISON_FIXTURES, - check_service_info, - check_list_collections, - check_list_attributes, - check_openapi_available, + DIGEST_TESTS, + check_attribute_retrieval, check_collection_level1, check_collection_level2, + check_comparison, + check_comparison_post, + check_comparison_same_order_values, + check_comparison_structure, check_default_level_returns_level2, - check_sorted_name_length_pairs, - check_attribute_retrieval, - 
check_transient_attribute_not_served, + check_list_attributes, + check_list_collections, check_list_filter_by_attribute, check_list_multi_attribute_filter_and, - check_comparison, - check_comparison_structure, - check_comparison_same_order_values, - check_comparison_post, + check_openapi_available, + check_service_info, + check_sorted_name_length_pairs, + check_transient_attribute_not_served, ) diff --git a/tests/conftest.py b/tests/conftest.py index 1d26369..2e2f637 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,7 @@ import json import os from pathlib import Path + import pytest from typer.testing import CliRunner @@ -38,7 +39,7 @@ @pytest.fixture def runner(): """Typer CLI test runner.""" - return CliRunner(mix_stderr=False) + return CliRunner() @pytest.fixture @@ -70,6 +71,8 @@ def invoke(*args): def test_data_root(): """Provides the absolute path to the test_fasta directory.""" return TEST_DATA_DIR + + DIFFERENT_NAMES_FASTA = TEST_DATA_DIR / "different_names.fa" DIFFERENT_ORDER_FASTA = TEST_DATA_DIR / "different_order.fa" PAIR_SWAP_FASTA = TEST_DATA_DIR / "pair_swap.fa" @@ -253,8 +256,12 @@ def pytest_configure(config): config.addinivalue_line("markers", "requires_network: mark test as requiring network access") config.addinivalue_line("markers", "requires_db: mark test as requiring database access") config.addinivalue_line("markers", "slow: mark test as slow running") - config.addinivalue_line("markers", "recommended: mark test as RECOMMENDED (not REQUIRED) by GA4GH spec") - config.addinivalue_line("markers", "require_service: mark test as requiring a running seqcol service") + config.addinivalue_line( + "markers", "recommended: mark test as RECOMMENDED (not REQUIRED) by GA4GH spec" + ) + config.addinivalue_line( + "markers", "require_service: mark test as requiring a running seqcol service" + ) def pytest_collection_modifyitems(config, items): @@ -286,7 +293,9 @@ def pytest_collection_modifyitems(config, items): # Skip require_service tests if no api_root or test_server available api_root = config.getoption("api_root") if api_root is None: - skip_service = pytest.mark.skip(reason="No --api-root provided and not running via integration test_server") + skip_service = pytest.mark.skip( + reason="No --api-root provided and not running via integration test_server" + ) for item in items: if "require_service" in item.keywords: # Only skip if this is the base TestAPI class, not a subclass with test_server diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 7f7cdff..d840250 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -7,12 +7,13 @@ """ import os -import pytest import socket import threading import time from pathlib import Path +import pytest + # Set environment variables BEFORE any app imports # Must match test-db.sh settings os.environ["POSTGRES_HOST"] = "localhost" @@ -78,8 +79,8 @@ def loaded_dbagent(test_dbagent, test_fasta_path): @pytest.fixture(scope="session") def client(loaded_dbagent): """Create TestClient with test database""" - from seqcolapi.main import app from refget.router import get_dbagent + from seqcolapi.main import app def override_get_dbagent(): return loaded_dbagent @@ -131,8 +132,9 @@ def test_server(request): loaded_dbagent = request.getfixturevalue("loaded_dbagent") import uvicorn - from seqcolapi.main import app + from refget.router import get_dbagent + from seqcolapi.main import app def override_get_dbagent(): return loaded_dbagent @@ -173,6 +175,7 @@ def override_get_dbagent(): 
def cli_runner(): """CLI runner for integration tests.""" from typer.testing import CliRunner + from refget.cli.main import app runner = CliRunner() diff --git a/tests/integration/test_cli_admin_integration.py b/tests/integration/test_cli_admin_integration.py index f3dc832..9642803 100644 --- a/tests/integration/test_cli_admin_integration.py +++ b/tests/integration/test_cli_admin_integration.py @@ -7,8 +7,6 @@ """ import pytest -import json -from pathlib import Path from typer.testing import CliRunner from refget.cli.main import app diff --git a/tests/integration/test_cli_seqcol_integration.py b/tests/integration/test_cli_seqcol_integration.py index 9f13a1a..da6ab12 100644 --- a/tests/integration/test_cli_seqcol_integration.py +++ b/tests/integration/test_cli_seqcol_integration.py @@ -6,7 +6,6 @@ Run with: ./scripts/test-integration.sh """ -import pytest import json diff --git a/tests/integration/test_run_compliance.py b/tests/integration/test_run_compliance.py index d240024..6cb7df7 100644 --- a/tests/integration/test_run_compliance.py +++ b/tests/integration/test_run_compliance.py @@ -1,6 +1,7 @@ """Run the standalone compliance suite against the integration test server.""" import pytest + from tests.api.test_compliance import TestAPI diff --git a/tests/local/test_aliases.py b/tests/local/test_aliases.py index 251cea4..58aeacc 100644 --- a/tests/local/test_aliases.py +++ b/tests/local/test_aliases.py @@ -8,7 +8,7 @@ from refget.store import RefgetStore try: - from gtars.refget import RefgetStore as _check + from gtars.refget import RefgetStore as _check # noqa: F401 _RUST_BINDINGS_AVAILABLE = True except ImportError: @@ -33,7 +33,7 @@ def seq_digest(store): @pytest.fixture def col_digest(store): - return store.list_collections()[0].digest + return store.list_collections()["results"][0].digest @pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") diff --git a/tests/local/test_backend.py b/tests/local/test_backend.py new file mode 100644 index 0000000..e52f309 --- /dev/null +++ b/tests/local/test_backend.py @@ -0,0 +1,216 @@ +""" +Tests for SeqColBackend protocol and RefgetStoreBackend implementation. 
+ +Verifies that: +- RefgetStoreBackend wraps RefgetStore correctly +- All SeqColBackend protocol methods work +- Error handling (ValueError, KeyError) works properly +""" + +import json +from pathlib import Path + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +try: + from refget.backend import RefgetStoreBackend, SeqColBackend + from refget.store import RefgetStore + + _RUST_BINDINGS_AVAILABLE = True +except ImportError: + _RUST_BINDINGS_AVAILABLE = False + +from refget.router import create_refget_router + +TEST_FASTA_DIR = Path("test_fasta") +BASE_FASTA = TEST_FASTA_DIR / "base.fa" +DIFFERENT_NAMES_FASTA = TEST_FASTA_DIR / "different_names.fa" + +with open(TEST_FASTA_DIR / "test_fasta_digests.json") as fp: + TEST_DIGESTS = json.load(fp) + +BASE_DIGEST = TEST_DIGESTS["base.fa"]["top_level_digest"] +BASE_LEVEL1 = TEST_DIGESTS["base.fa"]["level1"] +BASE_LEVEL2 = TEST_DIGESTS["base.fa"]["level2"] +DIFFERENT_NAMES_DIGEST = TEST_DIGESTS["different_names.fa"]["top_level_digest"] + + +@pytest.fixture +def backend(): + """Create a RefgetStoreBackend with base.fa and different_names.fa loaded.""" + store = RefgetStore.in_memory() + store.add_sequence_collection_from_fasta(str(BASE_FASTA)) + store.add_sequence_collection_from_fasta(str(DIFFERENT_NAMES_FASTA)) + return RefgetStoreBackend(store.into_readonly()) + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +class TestRefgetStoreBackend: + """Tests for RefgetStoreBackend.""" + + def test_satisfies_protocol(self, backend): + """RefgetStoreBackend satisfies the SeqColBackend protocol.""" + assert isinstance(backend, SeqColBackend) + + def test_get_collection_level2(self, backend): + """get_collection returns level2 by default.""" + result = backend.get_collection(BASE_DIGEST) + assert "names" in result + assert "lengths" in result + assert "sequences" in result + assert isinstance(result["names"], list) + + def test_get_collection_level1(self, backend): + """get_collection with level=1 returns digest strings.""" + result = backend.get_collection(BASE_DIGEST, level=1) + assert "names" in result + assert isinstance(result["names"], str) + + def test_get_collection_not_found(self, backend): + """get_collection raises ValueError for missing digest.""" + with pytest.raises(ValueError, match="not found"): + backend.get_collection("nonexistent_digest") + + def test_get_collection_attribute(self, backend): + """get_collection_attribute returns a single attribute array matching level2.""" + names = backend.get_collection_attribute(BASE_DIGEST, "names") + assert isinstance(names, list) + # Should match what get_collection returns + level2 = backend.get_collection(BASE_DIGEST, level=2) + assert names == level2["names"] + + def test_get_collection_attribute_not_found(self, backend): + """get_collection_attribute raises ValueError for missing attribute.""" + with pytest.raises(ValueError, match="not found"): + backend.get_collection_attribute(BASE_DIGEST, "nonexistent_attr") + + def test_get_collection_itemwise(self, backend): + """get_collection_itemwise returns transposed list of dicts.""" + items = backend.get_collection_itemwise(BASE_DIGEST) + assert isinstance(items, list) + assert len(items) > 0 + for item in items: + assert "names" in item + assert "lengths" in item + + def test_get_collection_itemwise_with_limit(self, backend): + """get_collection_itemwise respects limit parameter.""" + items = backend.get_collection_itemwise(BASE_DIGEST, limit=1) + assert len(items) == 1 + 
+ def test_get_attribute(self, backend): + """get_attribute returns attribute by its own digest.""" + names_digest = BASE_LEVEL1["names"] + result = backend.get_attribute("names", names_digest) + assert isinstance(result, list) + + def test_get_attribute_not_found(self, backend): + """get_attribute raises KeyError for missing attribute.""" + with pytest.raises(KeyError): + backend.get_attribute("names", "nonexistent_digest") + + def test_compare_digests(self, backend): + """compare_digests returns comparison dict.""" + result = backend.compare_digests(BASE_DIGEST, DIFFERENT_NAMES_DIGEST) + assert "attributes" in result + assert "array_elements" in result + + def test_compare_digests_not_found(self, backend): + """compare_digests raises ValueError for missing digest.""" + with pytest.raises(ValueError): + backend.compare_digests("nonexistent", DIFFERENT_NAMES_DIGEST) + + def test_compare_digest_with_level2(self, backend): + """compare_digest_with_level2 compares stored vs POSTed collection.""" + level2_b = backend.get_collection(DIFFERENT_NAMES_DIGEST, level=2) + result = backend.compare_digest_with_level2(BASE_DIGEST, level2_b) + assert "attributes" in result + assert "array_elements" in result + + def test_list_collections(self, backend): + """list_collections returns paginated results.""" + result = backend.list_collections() + assert "results" in result + assert "pagination" in result + assert result["pagination"]["total"] >= 2 + + def test_list_collections_pagination(self, backend): + """list_collections respects page_size.""" + result = backend.list_collections(page=0, page_size=1) + assert len(result["results"]) <= 1 + + def test_collection_count(self, backend): + """collection_count returns total number of collections.""" + count = backend.collection_count() + assert count >= 2 + + def test_capabilities(self, backend): + """capabilities returns expected keys for RefgetStoreBackend.""" + caps = backend.capabilities() + assert caps["backend_type"] == "refget_store" + assert "n_collections" in caps + assert "n_sequences" in caps + assert "has_sequence_data" in caps + assert isinstance(caps["collection_alias_namespaces"], list) + assert isinstance(caps["sequence_alias_namespaces"], list) + assert caps["n_collections"] >= 2 + + +@pytest.mark.skipif(not _RUST_BINDINGS_AVAILABLE, reason="gtars is not installed") +class TestStoreBackend501: + """Verify DB-only endpoints return 501 when only RefgetStoreBackend is configured.""" + + @pytest.fixture + def store_client(self): + """Create a TestClient with RefgetStoreBackend but no dbagent.""" + app = FastAPI() + router = create_refget_router(sequences=False, collections=True, pangenomes=False) + app.include_router(router, prefix="/seqcol") + + store = RefgetStore.in_memory() + store.add_sequence_collection_from_fasta(str(BASE_FASTA)) + backend = RefgetStoreBackend(store.into_readonly()) + app.state.backend = backend + # Deliberately do NOT set app.state.dbagent + return TestClient(app) + + def test_list_attributes_returns_501(self, store_client): + """GET /list/attributes/names returns 501 without dbagent.""" + response = store_client.get("/seqcol/list/attributes/names") + assert response.status_code == 501 + assert "database backend" in response.json()["detail"].lower() + + def test_similarities_post_returns_501(self, store_client): + """POST /similarities/{digest} returns 501 without dbagent.""" + response = store_client.post( + f"/seqcol/similarities/{BASE_DIGEST}", + params={"species": "human"}, + ) + assert response.status_code == 501 + + 
def test_similarities_json_post_returns_501(self, store_client): + """POST /similarities/ returns 501 without dbagent.""" + response = store_client.post( + "/seqcol/similarities/", + json={"names": ["chr1"], "lengths": [100], "sequences": ["abc"]}, + ) + assert response.status_code == 501 + + def test_backend_endpoints_still_work(self, store_client): + """Backend-powered endpoints work fine without dbagent.""" + # GET /collection/{digest} uses get_backend, should work + response = store_client.get(f"/seqcol/collection/{BASE_DIGEST}") + assert response.status_code == 200 + data = response.json() + assert "names" in data + assert "lengths" in data + + def test_list_collections_still_works(self, store_client): + """GET /list/collection uses get_backend, should work.""" + response = store_client.get("/seqcol/list/collection") + assert response.status_code == 200 + data = response.json() + assert "results" in data + assert "pagination" in data diff --git a/tests/local/test_digest_functions.py b/tests/local/test_digest_functions.py index da6fed8..d3b4b34 100644 --- a/tests/local/test_digest_functions.py +++ b/tests/local/test_digest_functions.py @@ -1,15 +1,20 @@ +from pathlib import Path + import pytest from refget import GTARS_INSTALLED -from refget.digests import ga4gh_digest, py_sha512t24u_digest, py_md5_digest -from pathlib import Path +from refget.digests import ga4gh_digest, py_md5_digest, py_sha512t24u_digest if GTARS_INSTALLED: from gtars.refget import ( - sha512t24u_digest as gtars_sha512t24u_digest, - md5_digest as gtars_md5_digest, digest_fasta, ) + from gtars.refget import ( + md5_digest as gtars_md5_digest, + ) + from gtars.refget import ( + sha512t24u_digest as gtars_sha512t24u_digest, + ) @pytest.mark.skipif(not GTARS_INSTALLED, reason="gtars is not installed") diff --git a/tests/local/test_local_models.py b/tests/local/test_local_models.py index cc84e4f..75157f1 100644 --- a/tests/local/test_local_models.py +++ b/tests/local/test_local_models.py @@ -1,11 +1,12 @@ import json import os + import pytest + from refget import InvalidSeqColError from refget.models import SequenceCollection from refget.utils import compare_seqcols, validate_seqcol - -from tests.conftest import DEMO_FILES, DIGEST_TESTS, API_TEST_DIR +from tests.conftest import API_TEST_DIR, DEMO_FILES, DIGEST_TESTS # Pairs of files to compare, with the "correct" compare response COMPARE_TESTS = [ diff --git a/tests/local/test_local_models_gtars.py b/tests/local/test_local_models_gtars.py index 8ac2445..e0b2122 100644 --- a/tests/local/test_local_models_gtars.py +++ b/tests/local/test_local_models_gtars.py @@ -1,24 +1,26 @@ -import pytest import logging - -_LOGGER = logging.getLogger(__name__) from pathlib import Path +import pytest + from refget.models import SequenceCollection as pythonSequenceCollection +from refget.store import RefgetStore -from refget.store import RefgetStore, StorageMode +_LOGGER = logging.getLogger(__name__) try: - from gtars.refget import ( + from gtars.refget import ( # noqa: F401 SequenceCollection as gtarsSequenceCollection, + ) + from gtars.refget import ( digest_fasta, ) _RUST_BINDINGS_AVAILABLE = True -except ImportError as e: +except ImportError: _LOGGER.warning( - f"Could not import gtars python bindings. `from_PySequenceCollection` will not be available." + "Could not import gtars python bindings. `from_PySequenceCollection` will not be available." 
) _RUST_BINDINGS_AVAILABLE = False @@ -35,9 +37,9 @@ def test_pysequencecollection(self): bridged_seq_col = pythonSequenceCollection.from_PySequenceCollection( gtars_seq_col=gtars_digested_seq_col ) - assert ( - bridged_seq_col.digest == python_seq_col.digest == gtars_digested_seq_col.digest - ), "Top-level digest mismatch!" + assert bridged_seq_col.digest == python_seq_col.digest == gtars_digested_seq_col.digest, ( + "Top-level digest mismatch!" + ) assert bridged_seq_col.sequences.digest == python_seq_col.sequences.digest assert bridged_seq_col.sequences.value == python_seq_col.sequences.value diff --git a/tests/local/test_refget_clients.py b/tests/local/test_refget_clients.py index 13b81e4..77941df 100644 --- a/tests/local/test_refget_clients.py +++ b/tests/local/test_refget_clients.py @@ -8,7 +8,7 @@ see tests/integration/test_seqcolapi_client.py """ -from refget.clients import SequenceCollectionClient, FastaDrsClient +from refget.clients import FastaDrsClient, SequenceCollectionClient class TestClientConstruction: diff --git a/tests/local/test_remove_collection.py b/tests/local/test_remove_collection.py index 88fb208..b998770 100644 --- a/tests/local/test_remove_collection.py +++ b/tests/local/test_remove_collection.py @@ -1,14 +1,11 @@ """Smoke test for RefgetStore.remove_collection() Python binding.""" -import os -import tempfile - import pytest from refget.store import RefgetStore try: - from gtars.refget import RefgetStore as _check + from gtars.refget import RefgetStore as _check # noqa: F401 _RUST_BINDINGS_AVAILABLE = True except ImportError: @@ -24,15 +21,15 @@ def test_remove_collection_round_trip(): store.set_quiet(True) store.add_sequence_collection_from_fasta(FASTA_PATH) - assert len(store.list_collections()) == 1 + assert len(store.list_collections()["results"]) == 1 assert len(store.list_sequences()) > 0 - digest = store.list_collections()[0].digest + digest = store.list_collections()["results"][0].digest # Nonexistent returns False assert store.remove_collection("nonexistent") is False # Real removal with orphan cleanup assert store.remove_collection(digest, remove_orphan_sequences=True) is True - assert len(store.list_collections()) == 0 + assert len(store.list_collections()["results"]) == 0 assert len(store.list_sequences()) == 0 diff --git a/tests/local/test_store_seqcol_features.py b/tests/local/test_store_seqcol_features.py index 3779eb1..8a9bae2 100644 --- a/tests/local/test_store_seqcol_features.py +++ b/tests/local/test_store_seqcol_features.py @@ -8,9 +8,10 @@ """ import json -import pytest from pathlib import Path +import pytest + try: from refget.store import RefgetStore diff --git a/tests/test_cli/test_admin_commands.py b/tests/test_cli/test_admin_commands.py index f9d80de..3761d8f 100644 --- a/tests/test_cli/test_admin_commands.py +++ b/tests/test_cli/test_admin_commands.py @@ -7,9 +7,6 @@ Database-dependent admin tests are in tests/integration/test_cli_admin_integration.py """ -import pytest -import json - class TestAdminStatus: """Tests for: refget admin status diff --git a/tests/test_cli/test_config_commands.py b/tests/test_cli/test_config_commands.py index 8e9f78a..666864e 100644 --- a/tests/test_cli/test_config_commands.py +++ b/tests/test_cli/test_config_commands.py @@ -2,7 +2,6 @@ """Tests for refget config CLI commands.""" -import pytest import json @@ -101,11 +100,14 @@ def test_creates_config_file(self, cli, tmp_path, monkeypatch): # Provide minimal input for interactive prompts from typer.testing import CliRunner + from refget.cli import app 
runner = CliRunner() result = runner.invoke( - app, ["config", "init"], input=f"{tmp_path}/store\n\n\n" # Store path + defaults + app, + ["config", "init"], + input=f"{tmp_path}/store\n\n\n", # Store path + defaults ) # Config init should succeed or prompt for input @@ -116,6 +118,7 @@ def test_init_no_overwrite(self, cli, temp_config, monkeypatch): monkeypatch.setenv("REFGET_CONFIG", str(temp_config)) from typer.testing import CliRunner + from refget.cli import app runner = CliRunner() diff --git a/tests/test_cli/test_fasta_commands.py b/tests/test_cli/test_fasta_commands.py index 2f3ea6e..99060c3 100644 --- a/tests/test_cli/test_fasta_commands.py +++ b/tests/test_cli/test_fasta_commands.py @@ -6,21 +6,23 @@ These test CLI-specific behavior: output formatting, exit codes, argument parsing. """ -import pytest +import importlib.util import json -from pathlib import Path - -import sys import os +from pathlib import Path -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from conftest import ( - BASE_FASTA, - DIFFERENT_NAMES_FASTA, - TEST_FASTA_DIGESTS, - assert_json_output, - assert_valid_digest, +_conftest_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "conftest.py" ) +_spec = importlib.util.spec_from_file_location("tests_conftest", _conftest_path) +_conftest = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_conftest) + +BASE_FASTA = _conftest.BASE_FASTA +DIFFERENT_NAMES_FASTA = _conftest.DIFFERENT_NAMES_FASTA +TEST_FASTA_DIGESTS = _conftest.TEST_FASTA_DIGESTS +assert_json_output = _conftest.assert_json_output +assert_valid_digest = _conftest.assert_valid_digest class TestFastaDigest: @@ -184,7 +186,7 @@ def test_rgsi_format_and_content(self, cli, sample_fasta): assert "##seqcol_digest=" in content assert "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription" in content - data_lines = [l for l in content.strip().split("\n") if not l.startswith("#")] + data_lines = [line for line in content.strip().split("\n") if not line.startswith("#")] assert len(data_lines) == 2 # sample_fasta has 2 sequences # Verify first sequence diff --git a/tests/test_cli/test_help.py b/tests/test_cli/test_help.py index b1e599b..f80ff4d 100644 --- a/tests/test_cli/test_help.py +++ b/tests/test_cli/test_help.py @@ -2,8 +2,6 @@ """Tests for CLI help output.""" -import pytest - class TestHelpOutput: """Verify help text displays correctly.""" diff --git a/tests/test_cli/test_seqcol_commands.py b/tests/test_cli/test_seqcol_commands.py index 88887d6..398fc47 100644 --- a/tests/test_cli/test_seqcol_commands.py +++ b/tests/test_cli/test_seqcol_commands.py @@ -7,21 +7,23 @@ Network-dependent tests are in tests/integration/test_cli_seqcol_integration.py """ -import pytest +import importlib.util import json -import sys import os -from pathlib import Path - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from conftest import ( - BASE_FASTA, - DIFFERENT_NAMES_FASTA, - DIFFERENT_ORDER_FASTA, - SUBSET_FASTA, - TEST_FASTA_DIGESTS, - assert_json_output, + +_conftest_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "conftest.py" ) +_spec = importlib.util.spec_from_file_location("tests_conftest", _conftest_path) +_conftest = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_conftest) + +BASE_FASTA = _conftest.BASE_FASTA +DIFFERENT_NAMES_FASTA = _conftest.DIFFERENT_NAMES_FASTA +DIFFERENT_ORDER_FASTA = _conftest.DIFFERENT_ORDER_FASTA +SUBSET_FASTA = 
_conftest.SUBSET_FASTA +TEST_FASTA_DIGESTS = _conftest.TEST_FASTA_DIGESTS +assert_json_output = _conftest.assert_json_output class TestSeqcolCompare: diff --git a/tests/test_cli/test_store_commands.py b/tests/test_cli/test_store_commands.py index 5544151..dd3c60a 100644 --- a/tests/test_cli/test_store_commands.py +++ b/tests/test_cli/test_store_commands.py @@ -2,21 +2,23 @@ """Tests for refget store CLI commands.""" -import pytest +import importlib.util import json -import sys import os -from pathlib import Path - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from conftest import ( - BASE_FASTA, - DIFFERENT_NAMES_FASTA, - DIFFERENT_ORDER_FASTA, - SAMPLE_FHR_JSON, - TEST_FASTA_DIGESTS, - assert_json_output, + +_conftest_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "conftest.py" ) +_spec = importlib.util.spec_from_file_location("tests_conftest", _conftest_path) +_conftest = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_conftest) + +BASE_FASTA = _conftest.BASE_FASTA +DIFFERENT_NAMES_FASTA = _conftest.DIFFERENT_NAMES_FASTA +DIFFERENT_ORDER_FASTA = _conftest.DIFFERENT_ORDER_FASTA +SAMPLE_FHR_JSON = _conftest.SAMPLE_FHR_JSON +TEST_FASTA_DIGESTS = _conftest.TEST_FASTA_DIGESTS +assert_json_output = _conftest.assert_json_output class TestStoreInit: @@ -215,7 +217,7 @@ def test_get_collection(self, cli, tmp_path): result = cli("store", "get", digest, "--path", str(store_path)) - data = assert_json_output(result, ["names", "lengths", "sequences"]) + assert_json_output(result, ["names", "lengths", "sequences"]) def test_get_nonexistent_digest(self, cli, tmp_path): """Returns error for nonexistent digest.""" @@ -290,9 +292,7 @@ def test_gets_sequence_by_name(self, cli, tmp_path): add_result = cli("store", "add", str(BASE_FASTA), "--path", str(store_path)) digest = json.loads(add_result.stdout)["digest"] - result = cli( - "store", "get", digest, "-s", "--name", "chr1", "--path", str(store_path) - ) + result = cli("store", "get", digest, "-s", "--name", "chr1", "--path", str(store_path)) assert result.exit_code == 0 # Output should be sequence (GGAA for chr1 in base.fa) @@ -509,8 +509,12 @@ def test_metadata_set_from_json_file(self, cli, tmp_path): store_path, digest = _setup_store_with_fasta(cli, tmp_path) result = cli( - "store", "metadata-set", digest, str(SAMPLE_FHR_JSON), - "--path", str(store_path), + "store", + "metadata-set", + digest, + str(SAMPLE_FHR_JSON), + "--path", + str(store_path), ) assert result.exit_code == 0 @@ -521,8 +525,12 @@ def test_metadata_read_after_set(self, cli, tmp_path): store_path, digest = _setup_store_with_fasta(cli, tmp_path) cli( - "store", "metadata-set", digest, str(SAMPLE_FHR_JSON), - "--path", str(store_path), + "store", + "metadata-set", + digest, + str(SAMPLE_FHR_JSON), + "--path", + str(store_path), ) result = cli("store", "metadata", digest, "--path", str(store_path)) @@ -539,8 +547,12 @@ def test_metadata_output_is_valid_json(self, cli, tmp_path): store_path, digest = _setup_store_with_fasta(cli, tmp_path) cli( - "store", "metadata-set", digest, str(SAMPLE_FHR_JSON), - "--path", str(store_path), + "store", + "metadata-set", + digest, + str(SAMPLE_FHR_JSON), + "--path", + str(store_path), ) result = cli("store", "metadata", digest, "--path", str(store_path)) @@ -564,8 +576,12 @@ def test_metadata_set_nonexistent_file(self, cli, tmp_path): store_path, digest = _setup_store_with_fasta(cli, tmp_path) result = cli( - "store", "metadata-set", digest, 
"/nonexistent/fhr.json", - "--path", str(store_path), + "store", + "metadata-set", + digest, + "/nonexistent/fhr.json", + "--path", + str(store_path), ) assert result.exit_code != 0 @@ -576,8 +592,11 @@ def test_metadata_nonexistent_digest(self, cli, tmp_path): cli("store", "init", "--path", str(store_path)) result = cli( - "store", "metadata", "nonexistent_digest_123", - "--path", str(store_path), + "store", + "metadata", + "nonexistent_digest_123", + "--path", + str(store_path), ) assert result.exit_code != 0 @@ -588,23 +607,35 @@ def test_metadata_set_then_overwrite(self, cli, tmp_path): # Set original metadata cli( - "store", "metadata-set", digest, str(SAMPLE_FHR_JSON), - "--path", str(store_path), + "store", + "metadata-set", + digest, + str(SAMPLE_FHR_JSON), + "--path", + str(store_path), ) # Create updated FHR JSON updated_fhr = tmp_path / "updated_fhr.json" - updated_fhr.write_text(json.dumps({ - "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", - "schemaVersion": 1.0, - "genome": "Updated organism", - "version": "v2.0", - })) + updated_fhr.write_text( + json.dumps( + { + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1.0, + "genome": "Updated organism", + "version": "v2.0", + } + ) + ) # Overwrite cli( - "store", "metadata-set", digest, str(updated_fhr), - "--path", str(store_path), + "store", + "metadata-set", + digest, + str(updated_fhr), + "--path", + str(store_path), ) result = cli("store", "metadata", digest, "--path", str(store_path)) @@ -619,8 +650,12 @@ def test_metadata_removed_with_collection(self, cli, tmp_path): # Set metadata cli( - "store", "metadata-set", digest, str(SAMPLE_FHR_JSON), - "--path", str(store_path), + "store", + "metadata-set", + digest, + str(SAMPLE_FHR_JSON), + "--path", + str(store_path), ) # Remove the collection diff --git a/tests/test_cli/test_store_crate.py b/tests/test_cli/test_store_crate.py new file mode 100644 index 0000000..2be442d --- /dev/null +++ b/tests/test_cli/test_store_crate.py @@ -0,0 +1,305 @@ +# tests/test_cli/test_store_crate.py + +"""Tests for refget store crate CLI command.""" + +import importlib.util +import json +import os + +_conftest_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "conftest.py" +) +_spec = importlib.util.spec_from_file_location("tests_conftest", _conftest_path) +_conftest = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_conftest) + +BASE_FASTA = _conftest.BASE_FASTA +assert_json_output = _conftest.assert_json_output + + +def _init_and_add(cli, tmp_path): + """Initialize a store and add a FASTA, return store_path.""" + store_path = tmp_path / "store" + cli("store", "init", "--path", str(store_path)) + cli("store", "add", str(BASE_FASTA), "--path", str(store_path)) + return store_path + + +class TestStoreCrate: + """Tests for: refget store crate""" + + def test_produces_valid_json(self, cli, tmp_path): + """Crate command produces valid JSON output file.""" + store_path = _init_and_add(cli, tmp_path) + + result = cli( + "store", "crate", + "--path", str(store_path), + "--name", "Test Store", + ) + + assert result.exit_code == 0 + crate_path = store_path / "ro-crate-metadata.json" + assert crate_path.exists() + + crate = json.loads(crate_path.read_text()) + assert "@context" in crate + assert "@graph" in crate + assert isinstance(crate["@graph"], list) + + def test_has_must_entities(self, cli, tmp_path): + """Crate contains all MUST entities per 
the profile."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        cli(
+            "store", "crate",
+            "--path", str(store_path),
+            "--name", "Test Store",
+        )
+
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+        ids = {e["@id"] for e in crate["@graph"]}
+
+        # MUST entities
+        assert "ro-crate-metadata.json" in ids
+        assert "./" in ids
+        assert "rgstore.json" in ids
+        assert "sequences.rgsi" in ids
+        assert "sequences/" in ids
+        assert "collections/" in ids
+
+    def test_metadata_descriptor_conformsto(self, cli, tmp_path):
+        """Metadata descriptor has correct conformsTo."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        cli(
+            "store", "crate",
+            "--path", str(store_path),
+            "--name", "Test Store",
+        )
+
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+        descriptor = next(e for e in crate["@graph"] if e["@id"] == "ro-crate-metadata.json")
+
+        conforms = [c["@id"] for c in descriptor["conformsTo"]]
+        assert "https://w3id.org/ro/crate/1.2" in conforms
+        assert "https://w3id.org/ga4gh/refget/refgetstore-crate/0.1" in conforms
+
+    def test_root_dataset_name(self, cli, tmp_path):
+        """Root dataset has the specified name."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        cli(
+            "store", "crate",
+            "--path", str(store_path),
+            "--name", "My Genome Store",
+        )
+
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+        root = next(e for e in crate["@graph"] if e["@id"] == "./")
+        assert root["name"] == "My Genome Store"
+
+    def test_property_values(self, cli, tmp_path):
+        """Crate contains PropertyValue entities with correct stats."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        cli(
+            "store", "crate",
+            "--path", str(store_path),
+            "--name", "Test Store",
+        )
+
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+        props = {
+            e["propertyID"]: e["value"]
+            for e in crate["@graph"]
+            if e.get("@type") == "PropertyValue"
+        }
+
+        assert "storageMode" in props
+        assert "sequenceCount" in props
+        assert props["sequenceCount"] > 0
+        assert "collectionCount" in props
+        assert props["collectionCount"] >= 1
+        assert props["refgetDigestAlgorithm"] == "sha512t24u"
+
+    def test_author_parsing_orcid(self, cli, tmp_path):
+        """Parses 'Name <URL>' format into Person entity."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        cli(
+            "store", "crate",
+            "--path", str(store_path),
+            "--name", "Test Store",
+            "--author", "Jane Doe <https://orcid.org/0000-0001-1234-5678>",
+        )
+
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+
+        # Find Person entity
+        person = next(
+            (e for e in crate["@graph"] if e.get("@type") == "Person"),
+            None,
+        )
+        assert person is not None
+        assert person["@id"] == "https://orcid.org/0000-0001-1234-5678"
+        assert person["name"] == "Jane Doe"
+
+        # Root dataset references author
+        root = next(e for e in crate["@graph"] if e["@id"] == "./")
+        assert root["author"]["@id"] == "https://orcid.org/0000-0001-1234-5678"
+
+    def test_author_plain_name(self, cli, tmp_path):
+        """Handles plain name without URL."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        cli(
+            "store", "crate",
+            "--path", str(store_path),
+            "--name", "Test Store",
+            "--author", "John Smith",
+        )
+
+        crate = json.loads((store_path / "ro-crate-metadata.json").read_text())
+        person = next(
+            (e for e in crate["@graph"] if e.get("@type") == "Person"),
+            None,
+        )
+        assert person is not None
+        assert person["name"] == "John Smith"
+
+    def test_license(self, cli, tmp_path):
+        """License creates a CreativeWork entity."""
+        store_path = _init_and_add(cli, tmp_path)
+
+        
cli( + "store", "crate", + "--path", str(store_path), + "--name", "Test Store", + "--license", "https://creativecommons.org/publicdomain/zero/1.0/", + ) + + crate = json.loads((store_path / "ro-crate-metadata.json").read_text()) + + root = next(e for e in crate["@graph"] if e["@id"] == "./") + assert root["license"]["@id"] == "https://creativecommons.org/publicdomain/zero/1.0/" + + license_entity = next( + (e for e in crate["@graph"] + if e["@id"] == "https://creativecommons.org/publicdomain/zero/1.0/"), + None, + ) + assert license_entity is not None + assert license_entity["@type"] == "CreativeWork" + + def test_custom_output_path(self, cli, tmp_path): + """Writes to custom output path.""" + store_path = _init_and_add(cli, tmp_path) + output_path = tmp_path / "custom" / "crate.json" + + result = cli( + "store", "crate", + "--path", str(store_path), + "--name", "Test Store", + "--output", str(output_path), + ) + + assert result.exit_code == 0 + assert output_path.exists() + + crate = json.loads(output_path.read_text()) + assert "@graph" in crate + + def test_no_aliases_when_absent(self, cli, tmp_path): + """Does not include aliases/ when directory doesn't exist.""" + store_path = _init_and_add(cli, tmp_path) + + # Remove aliases dir if it exists + aliases = store_path / "aliases" + if aliases.exists(): + import shutil + shutil.rmtree(aliases) + + cli( + "store", "crate", + "--path", str(store_path), + "--name", "Test Store", + ) + + crate = json.loads((store_path / "ro-crate-metadata.json").read_text()) + ids = {e["@id"] for e in crate["@graph"]} + assert "aliases/" not in ids + + def test_create_action_provenance(self, cli, tmp_path): + """Crate includes CreateAction with refget version.""" + store_path = _init_and_add(cli, tmp_path) + + cli( + "store", "crate", + "--path", str(store_path), + "--name", "Test Store", + ) + + crate = json.loads((store_path / "ro-crate-metadata.json").read_text()) + + action = next( + (e for e in crate["@graph"] if e.get("@type") == "CreateAction"), + None, + ) + assert action is not None + assert "endTime" in action + assert action["instrument"]["@id"] == "#refget-software" + + sw = next( + (e for e in crate["@graph"] if e["@id"] == "#refget-software"), + None, + ) + assert sw is not None + assert sw["@type"] == "SoftwareApplication" + assert "version" in sw + + def test_description_optional(self, cli, tmp_path): + """Description is included when provided, absent when not.""" + store_path = _init_and_add(cli, tmp_path) + + # Without description + cli( + "store", "crate", + "--path", str(store_path), + "--name", "Test Store", + ) + crate = json.loads((store_path / "ro-crate-metadata.json").read_text()) + root = next(e for e in crate["@graph"] if e["@id"] == "./") + assert "description" not in root + + # With description + cli( + "store", "crate", + "--path", str(store_path), + "--name", "Test Store", + "--description", "A test store for genomes", + ) + crate = json.loads((store_path / "ro-crate-metadata.json").read_text()) + root = next(e for e in crate["@graph"] if e["@id"] == "./") + assert root["description"] == "A test store for genomes" + + def test_empty_store(self, cli, tmp_path): + """Crate works for empty store with zero counts.""" + store_path = tmp_path / "store" + cli("store", "init", "--path", str(store_path)) + + result = cli( + "store", "crate", + "--path", str(store_path), + "--name", "Empty Store", + ) + + assert result.exit_code == 0 + crate = json.loads((store_path / "ro-crate-metadata.json").read_text()) + props = { + e["propertyID"]: 
e["value"] + for e in crate["@graph"] + if e.get("@type") == "PropertyValue" + } + assert props["sequenceCount"] == 0 + assert props["collectionCount"] == 0 diff --git a/tests/test_cli/test_store_pull.py b/tests/test_cli/test_store_pull.py index c90cd72..0b737a4 100644 --- a/tests/test_cli/test_store_pull.py +++ b/tests/test_cli/test_store_pull.py @@ -7,22 +7,25 @@ would deadlock a Python-thread-based HTTP server. """ +import importlib.util import json import os -import signal import socket import subprocess import sys import time -from pathlib import Path import pytest -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from conftest import ( - BASE_FASTA, - DIFFERENT_NAMES_FASTA, +_conftest_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "conftest.py" ) +_spec = importlib.util.spec_from_file_location("tests_conftest", _conftest_path) +_conftest = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_conftest) + +BASE_FASTA = _conftest.BASE_FASTA +DIFFERENT_NAMES_FASTA = _conftest.DIFFERENT_NAMES_FASTA # Skip entire module if gtars is not installed pytest.importorskip("gtars") @@ -168,8 +171,7 @@ def test_pull_eager_fetches_sequences(self, cli, tmp_path, remote_store_server): cli("store", "init", "--path", str(local_store)) result = cli( - "store", "pull", digest, "--server", server_url, - "--path", str(local_store), "--eager" + "store", "pull", digest, "--server", server_url, "--path", str(local_store), "--eager" ) assert result.exit_code == 0, f"Eager pull failed: {result.stdout}" @@ -204,8 +206,14 @@ def test_pull_from_file(self, cli, tmp_path, multi_remote_store_server): digest_file.write_text(f"{digest1}\n{digest2}\n") result = cli( - "store", "pull", "--file", str(digest_file), - "--server", server_url, "--path", str(local_store) + "store", + "pull", + "--file", + str(digest_file), + "--server", + server_url, + "--path", + str(local_store), ) assert result.exit_code == 0, f"Batch pull failed: {result.stdout}" @@ -223,8 +231,14 @@ def test_pull_file_with_blank_lines(self, cli, tmp_path, remote_store_server): digest_file.write_text(f"\n \n{digest}\n\n \n") result = cli( - "store", "pull", "--file", str(digest_file), - "--server", server_url, "--path", str(local_store) + "store", + "pull", + "--file", + str(digest_file), + "--server", + server_url, + "--path", + str(local_store), ) assert result.exit_code == 0 @@ -239,8 +253,14 @@ def test_pull_file_not_found(self, cli, tmp_path): cli("store", "init", "--path", str(local_store)) result = cli( - "store", "pull", "--file", "/nonexistent/digests.txt", - "--server", "http://127.0.0.1:1", "--path", str(local_store) + "store", + "pull", + "--file", + "/nonexistent/digests.txt", + "--server", + "http://127.0.0.1:1", + "--path", + str(local_store), ) assert result.exit_code != 0 @@ -255,8 +275,14 @@ def test_pull_empty_file(self, cli, tmp_path, remote_store_server): digest_file.write_text("") result = cli( - "store", "pull", "--file", str(digest_file), - "--server", server_url, "--path", str(local_store) + "store", + "pull", + "--file", + str(digest_file), + "--server", + server_url, + "--path", + str(local_store), ) assert result.exit_code != 0 @@ -292,8 +318,13 @@ def test_pull_nonexistent_digest(self, cli, tmp_path, remote_store_server): cli("store", "init", "--path", str(local_store)) result = cli( - "store", "pull", "NONEXISTENT_DIGEST_12345678901234", - "--server", server_url, "--path", str(local_store) + "store", + "pull", + 
"NONEXISTENT_DIGEST_12345678901234", + "--server", + server_url, + "--path", + str(local_store), ) assert result.exit_code != 0 @@ -306,8 +337,13 @@ def test_pull_unreachable_server(self, cli, tmp_path): cli("store", "init", "--path", str(local_store)) result = cli( - "store", "pull", "some_digest_abc123", - "--server", "http://127.0.0.1:1", "--path", str(local_store) + "store", + "pull", + "some_digest_abc123", + "--server", + "http://127.0.0.1:1", + "--path", + str(local_store), ) assert result.exit_code != 0 @@ -317,10 +353,7 @@ def test_pull_no_digest_or_file(self, cli, tmp_path): local_store = tmp_path / "noarg_store" cli("store", "init", "--path", str(local_store)) - result = cli( - "store", "pull", - "--server", "http://127.0.0.1:1", "--path", str(local_store) - ) + result = cli("store", "pull", "--server", "http://127.0.0.1:1", "--path", str(local_store)) assert result.exit_code != 0 @@ -333,9 +366,15 @@ def test_pull_both_digest_and_file(self, cli, tmp_path): digest_file.write_text("some_digest\n") result = cli( - "store", "pull", "some_digest", - "--file", str(digest_file), - "--server", "http://127.0.0.1:1", "--path", str(local_store) + "store", + "pull", + "some_digest", + "--file", + str(digest_file), + "--server", + "http://127.0.0.1:1", + "--path", + str(local_store), ) assert result.exit_code != 0 @@ -346,15 +385,9 @@ def test_pull_no_server_configured(self, cli, tmp_path, monkeypatch): cli("store", "init", "--path", str(local_store)) # Patch _find_remote_urls to return empty list - monkeypatch.setattr( - "refget.cli.store._find_remote_urls", - lambda server_override=None: [] - ) + monkeypatch.setattr("refget.cli.store._find_remote_urls", lambda server_override=None: []) - result = cli( - "store", "pull", "some_digest", - "--path", str(local_store) - ) + result = cli("store", "pull", "some_digest", "--path", str(local_store)) assert result.exit_code != 0 @@ -383,7 +416,7 @@ def test_pull_tries_next_remote_on_failure( # Patch to return empty server first, then the populated one monkeypatch.setattr( "refget.cli.store._find_remote_urls", - lambda server_override=None: [empty_url, server_url] + lambda server_override=None: [empty_url, server_url], ) result = cli("store", "pull", digest, "--path", str(local_store), "--quiet") diff --git a/tests/test_cli_integration/test_workflows.py b/tests/test_cli_integration/test_workflows.py index 2fce20c..8872f6a 100644 --- a/tests/test_cli_integration/test_workflows.py +++ b/tests/test_cli_integration/test_workflows.py @@ -6,20 +6,22 @@ These tests verify that commands work together correctly in typical usage patterns. 
""" -import pytest +import importlib.util import json -import sys import os -from pathlib import Path - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from conftest import ( - BASE_FASTA, - DIFFERENT_NAMES_FASTA, - DIFFERENT_ORDER_FASTA, - SUBSET_FASTA, - TEST_FASTA_DIGESTS, + +_conftest_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "conftest.py" ) +_spec = importlib.util.spec_from_file_location("tests_conftest", _conftest_path) +_conftest = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_conftest) + +BASE_FASTA = _conftest.BASE_FASTA +DIFFERENT_NAMES_FASTA = _conftest.DIFFERENT_NAMES_FASTA +DIFFERENT_ORDER_FASTA = _conftest.DIFFERENT_ORDER_FASTA +SUBSET_FASTA = _conftest.SUBSET_FASTA +TEST_FASTA_DIGESTS = _conftest.TEST_FASTA_DIGESTS class TestDigestAndCompare: From 82e41553f906754c6a33d1f4fc8b6c35ee25916d Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 13 Mar 2026 08:10:22 -0400 Subject: [PATCH 20/31] Reorganize data_loaders into task-specific subdirectories --- data_loaders/ref-genome-analysis/README.md | 60 +++ .../aliases/build_ncbi_alias_table.py | 381 ++++++++++++++++++ .../aliases/register_aliases.sbatch | 15 + .../aliases/register_ncbi_aliases.py | 231 +++++++++++ .../ref-genome-analysis/build_refgetstore.py | 132 ------ .../examples/test_20_genomes.py | 165 ++++++++ .../fhr/batch_generate_fhr.py | 98 +++++ .../fhr/genomeark_to_fhr.py | 214 ++++++++++ .../fhr/load_fhr_metadata.py | 97 +++++ .../fhr/metadata/GCA_000001405.29.fhr.json | 36 ++ .../fhr/metadata/GCA_000001405.fhr.json | 36 ++ .../fhr/metadata/GCA_964261635.1.fhr.json | 41 ++ .../fhr/metadata/GCA_964263255.1.fhr.json | 41 ++ .../{ => inventory}/inventory_genomes.py | 0 .../process-all-genomes.sbatch | 15 - .../profiling/profile_all.py | 48 +++ .../profiling/profile_all.sbatch | 15 + .../profiling/profile_batch.py | 28 ++ .../profiling/profile_memory.py | 87 ++++ .../profiling/profile_memory.sbatch | 15 + .../profiling/profile_newt.py | 57 +++ .../profiling/profile_newt.sbatch | 15 + .../profiling/profile_normal.py | 34 ++ .../profiling/profile_normal.sbatch | 15 + .../{ => verify}/verify_refgetstore.py | 15 +- .../riva_pangenome_analysis/update-gtars.sh | 25 ++ 26 files changed, 1762 insertions(+), 154 deletions(-) create mode 100644 data_loaders/ref-genome-analysis/README.md create mode 100644 data_loaders/ref-genome-analysis/aliases/build_ncbi_alias_table.py create mode 100644 data_loaders/ref-genome-analysis/aliases/register_aliases.sbatch create mode 100644 data_loaders/ref-genome-analysis/aliases/register_ncbi_aliases.py delete mode 100644 data_loaders/ref-genome-analysis/build_refgetstore.py create mode 100644 data_loaders/ref-genome-analysis/examples/test_20_genomes.py create mode 100755 data_loaders/ref-genome-analysis/fhr/batch_generate_fhr.py create mode 100755 data_loaders/ref-genome-analysis/fhr/genomeark_to_fhr.py create mode 100644 data_loaders/ref-genome-analysis/fhr/load_fhr_metadata.py create mode 100755 data_loaders/ref-genome-analysis/fhr/metadata/GCA_000001405.29.fhr.json create mode 100755 data_loaders/ref-genome-analysis/fhr/metadata/GCA_000001405.fhr.json create mode 100644 data_loaders/ref-genome-analysis/fhr/metadata/GCA_964261635.1.fhr.json create mode 100644 data_loaders/ref-genome-analysis/fhr/metadata/GCA_964263255.1.fhr.json rename data_loaders/ref-genome-analysis/{ => inventory}/inventory_genomes.py (100%) delete mode 100644 data_loaders/ref-genome-analysis/process-all-genomes.sbatch create mode 
100644 data_loaders/ref-genome-analysis/profiling/profile_all.py create mode 100644 data_loaders/ref-genome-analysis/profiling/profile_all.sbatch create mode 100644 data_loaders/ref-genome-analysis/profiling/profile_batch.py create mode 100644 data_loaders/ref-genome-analysis/profiling/profile_memory.py create mode 100644 data_loaders/ref-genome-analysis/profiling/profile_memory.sbatch create mode 100644 data_loaders/ref-genome-analysis/profiling/profile_newt.py create mode 100644 data_loaders/ref-genome-analysis/profiling/profile_newt.sbatch create mode 100644 data_loaders/ref-genome-analysis/profiling/profile_normal.py create mode 100644 data_loaders/ref-genome-analysis/profiling/profile_normal.sbatch rename data_loaders/ref-genome-analysis/{ => verify}/verify_refgetstore.py (97%) create mode 100644 data_loaders/riva_pangenome_analysis/update-gtars.sh diff --git a/data_loaders/ref-genome-analysis/README.md b/data_loaders/ref-genome-analysis/README.md new file mode 100644 index 0000000..6f71ee6 --- /dev/null +++ b/data_loaders/ref-genome-analysis/README.md @@ -0,0 +1,60 @@ +# ref-genome-analysis + +Pipeline for loading reference genome FASTA files into a RefgetStore and enriching them with NCBI aliases and FHR provenance metadata. + +## Pipeline stages + +Execute in order: + +``` +inventory --> build --> aliases --> fhr --> verify +``` + +| Stage | Directory | Purpose | +|---|---|---| +| **inventory** | `inventory/` | Scan brickyard FASTA files, produce `refgenomes_inventory.csv` | +| **build** | `build/` | Load FASTAs into RefgetStore, produce `digest_map.csv` | +| **aliases** | `aliases/` | Download NCBI assembly reports, build alias table, register sequence/collection aliases | +| **fhr** | `fhr/` | Generate and attach FHR provenance metadata (species, taxon, accession, submitter, etc.) | +| **verify** | `verify/` | Automated pass/fail checks against the store | +| **profiling** | `profiling/` | Memory and timing benchmarks | +| **examples** | `examples/` | End-to-end test scripts (e.g., load 20 genomes with FHR) | + +## Rivanna paths + +All data lives within the `refgenomes_fasta` brickyard brick: + +``` +/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/ +├── homo_sapiens/... # Source FASTAs +├── mus_musculus/... +├── refgenomes_inventory.csv # Inventory of all FASTAs +├── refget_store/ # The RefgetStore (fixed-format, don't modify manually) +└── refget_staging/ # Pipeline intermediates + ├── assembly_reports/ # Downloaded NCBI assembly_report.txt files + ├── ncbi_alias_table.csv # Parsed alias table (367K sequence rows) + ├── fhr_metadata/ # Generated FHR provenance JSON files + └── digest_map.csv # Build output mapping FASTAs to digests +``` + +- **Store**: `.../refgenomes_fasta/refget_store` +- **Staging**: `.../refgenomes_fasta/refget_staging` +- **This pipeline**: `.../refgenomes_fasta/refget/data_loaders/ref-genome-analysis/` + +## Quick start (Rivanna) + +```bash +module load miniforge/24.3.0-py3.11 + +# 1. Build store +sbatch build/build_refgetstore.sbatch + +# 2. Register NCBI aliases +sbatch aliases/register_aliases.sbatch + +# 3. Attach FHR metadata +cd fhr && python load_fhr_metadata.py --store-path /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget_store --fhr-dir metadata/ + +# 4. 
Verify +cd verify && python verify_refgetstore.py +``` diff --git a/data_loaders/ref-genome-analysis/aliases/build_ncbi_alias_table.py b/data_loaders/ref-genome-analysis/aliases/build_ncbi_alias_table.py new file mode 100644 index 0000000..8c0d07c --- /dev/null +++ b/data_loaders/ref-genome-analysis/aliases/build_ncbi_alias_table.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python3 +""" +Build NCBI alias mapping table from assembly reports. + +Downloads NCBI assembly_report.txt files for each accession in the inventory +CSV and parses them into a flat CSV mapping sequence names to accessions. + +This is Phase A of the alias registration pipeline -- it produces a standalone +CSV with no store dependency. Needs only the inventory CSV and internet access. + +Usage: + python build_ncbi_alias_table.py --inventory refgenomes_inventory.csv + python build_ncbi_alias_table.py --inventory refgenomes_inventory.csv --limit 3 + python build_ncbi_alias_table.py --inventory refgenomes_inventory.csv --download-only +""" + +import argparse +import csv +import os +import re +import sys +import time +import urllib.error +import urllib.request + +BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" +INVENTORY_CSV = f"{BRICK_ROOT}/refgenomes_inventory.csv" +STAGING_DIR = f"{BRICK_ROOT}/refget_staging" +ACCESSION_PATTERN = re.compile(r"(GC[AF]_\d+\.\d+)") +NCBI_FTP_BASE = "https://ftp.ncbi.nlm.nih.gov/genomes/all" + +OUTPUT_COLUMNS = [ + "accession", + "sequence_name", + "sequence_length", + "refseq_accn", + "genbank_accn", + "ucsc_name", + "genbank_assembly_accn", + "refseq_assembly_accn", +] + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Download NCBI assembly reports and build alias mapping table." + ) + parser.add_argument( + "--inventory", default=INVENTORY_CSV, help="Path to refgenomes_inventory.csv" + ) + parser.add_argument( + "--report-cache", + default=f"{STAGING_DIR}/assembly_reports", + help="Directory to cache downloaded assembly_report.txt files", + ) + parser.add_argument( + "--output", + default=f"{STAGING_DIR}/ncbi_alias_table.csv", + help="Output CSV path", + ) + parser.add_argument( + "--limit", type=int, default=None, help="Process only first N accessions" + ) + parser.add_argument( + "--offset", type=int, default=0, help="Skip first N accessions" + ) + parser.add_argument( + "--download-only", + action="store_true", + help="Download reports but don't parse into table", + ) + return parser.parse_args() + + +# --------------------------------------------------------------------------- +# Step A2: Read inventory and extract accessions +# --------------------------------------------------------------------------- + +def read_accessions_from_inventory(csv_path): + """Read inventory CSV and return list of (accession, filename) pairs. + + Filters to rows with a non-empty accession matching the GCF_/GCA_ pattern. 
+ """ + pairs = [] + seen_accessions = set() + with open(csv_path, newline="") as f: + reader = csv.DictReader(f) + if reader.fieldnames is None: + print(f"ERROR: {csv_path} appears to be empty", file=sys.stderr) + sys.exit(1) + for row in reader: + accession = row.get("accession", "").strip() + filename = row.get("filename", "").strip() + if not accession or not ACCESSION_PATTERN.match(accession): + continue + if accession in seen_accessions: + continue + seen_accessions.add(accession) + pairs.append((accession, filename)) + return pairs + + +# --------------------------------------------------------------------------- +# Step A3: Construct NCBI FTP URLs from filename +# --------------------------------------------------------------------------- + +def derive_assembly_name(accession, filename): + """Derive the assembly name from the FASTA filename. + + Example: + accession = "GCF_000001405.40" + filename = "GCF_000001405.40_GRCh38.p14_genomic.fna.gz" + returns "GRCh38.p14" + + The filename pattern is: {accession}_{assembly_name}_genomic.fna[.gz] + """ + # Strip the accession prefix and _genomic.fna[.gz] suffix + prefix = accession + "_" + if not filename.startswith(prefix): + return None + rest = filename[len(prefix):] + # Remove _genomic.fna, _genomic.fna.gz, _genomic.fa.gz, etc. + rest = re.sub(r"_genomic\.(fna|fa|fasta)(\.gz)?$", "", rest) + if not rest: + return None + return rest + + +def accession_to_ftp_dir(accession): + """Convert an accession to its NCBI FTP parent directory URL. + + GCF_963692335.1 -> https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/963/692/335/ + """ + match = re.match(r"(GC[AF])_(\d+)\.\d+", accession) + if not match: + return None + prefix = match.group(1) + numeric = match.group(2).zfill(9) + d1, d2, d3 = numeric[0:3], numeric[3:6], numeric[6:9] + return f"{NCBI_FTP_BASE}/{prefix}/{d1}/{d2}/{d3}/" + + +def lookup_assembly_name_from_ftp(accession): + """Scrape the NCBI FTP directory listing to find the assembly name. + + The directory contains a single subdirectory like GCF_963692335.1_fOsmEpe2.1/. + We extract the assembly name from that. + """ + dir_url = accession_to_ftp_dir(accession) + if not dir_url: + return None + try: + req = urllib.request.Request(dir_url, headers={"User-Agent": "refget-alias-builder/1.0"}) + with urllib.request.urlopen(req, timeout=15) as response: + html = response.read().decode("utf-8", errors="replace") + # Look for a link like GCF_963692335.1_fOsmEpe2.1/ + pattern = re.escape(accession) + r"_([^/\"]+)/" + m = re.search(pattern, html) + if m: + return m.group(1) + except (urllib.error.URLError, urllib.error.HTTPError, OSError): + pass + return None + + +def construct_report_url(accession, assembly_name): + """Construct the NCBI FTP URL for an assembly_report.txt. + + URL pattern: + https://ftp.ncbi.nlm.nih.gov/genomes/all/{GCF|GCA}/{d1}/{d2}/{d3}/ + {accession}_{assembly_name}/{accession}_{assembly_name}_assembly_report.txt + + Where d1/d2/d3 are 3-char chunks of the numeric part of the accession + (the digits between the underscore and the dot). 
+ """ + dir_url = accession_to_ftp_dir(accession) + if not dir_url: + return None + stem = f"{accession}_{assembly_name}" + return f"{dir_url}{stem}/{stem}_assembly_report.txt" + + +# --------------------------------------------------------------------------- +# Step A4: Download with caching and rate limiting +# --------------------------------------------------------------------------- + +def download_report(accession, filename, cache_dir, sleep_sec=0.3): + """Download assembly_report.txt for a given accession. + + Returns (cache_path, status) where status is one of: + "cached" - already existed in cache + "downloaded" - freshly downloaded + "failed" - download failed (logged to stderr) + "skipped" - could not derive assembly name from filename + """ + cache_path = os.path.join(cache_dir, f"{accession}_assembly_report.txt") + + # Check cache first + if os.path.exists(cache_path) and os.path.getsize(cache_path) > 0: + return cache_path, "cached" + + # Derive assembly name from filename, fall back to FTP directory lookup + assembly_name = derive_assembly_name(accession, filename) + if not assembly_name: + assembly_name = lookup_assembly_name_from_ftp(accession) + if assembly_name: + time.sleep(sleep_sec) # Rate limit the directory lookup too + else: + return cache_path, "skipped" + + url = construct_report_url(accession, assembly_name) + if not url: + print(f" WARNING: Cannot construct URL for {accession}", file=sys.stderr) + return cache_path, "skipped" + + # Download + try: + req = urllib.request.Request(url, headers={"User-Agent": "refget-alias-builder/1.0"}) + with urllib.request.urlopen(req, timeout=30) as response: + data = response.read() + with open(cache_path, "wb") as f: + f.write(data) + time.sleep(sleep_sec) + return cache_path, "downloaded" + except (urllib.error.URLError, urllib.error.HTTPError, OSError) as e: + print(f" FAILED: {accession} ({url}): {e}", file=sys.stderr) + return cache_path, "failed" + + +# --------------------------------------------------------------------------- +# Step A5: Parse reports into flat CSV +# --------------------------------------------------------------------------- + +def parse_assembly_report(filepath, accession): + """Parse an assembly_report.txt file into a list of row dicts. + + Returns (rows, genbank_assembly_accn, refseq_assembly_accn). 
+ """ + genbank_assembly_accn = "" + refseq_assembly_accn = "" + rows = [] + + with open(filepath, "r", errors="replace") as f: + for line in f: + line = line.rstrip("\n") + # Parse header metadata + if line.startswith("#"): + if "GenBank assembly accession:" in line: + m = ACCESSION_PATTERN.search(line) + if m: + genbank_assembly_accn = m.group(1) + elif "RefSeq assembly accession:" in line: + m = ACCESSION_PATTERN.search(line) + if m: + refseq_assembly_accn = m.group(1) + continue + + # Data rows: tab-separated, 10 columns + fields = line.split("\t") + if len(fields) < 9: + continue + + sequence_name = fields[0].strip() + genbank_accn = fields[4].strip() if len(fields) > 4 else "na" + refseq_accn = fields[6].strip() if len(fields) > 6 else "na" + sequence_length = fields[8].strip() if len(fields) > 8 else "na" + ucsc_name = fields[9].strip() if len(fields) > 9 else "na" + + # Normalize "na" to empty string + if genbank_accn == "na": + genbank_accn = "" + if refseq_accn == "na": + refseq_accn = "" + if ucsc_name == "na": + ucsc_name = "" + if sequence_length == "na": + sequence_length = "" + + rows.append({ + "accession": accession, + "sequence_name": sequence_name, + "sequence_length": sequence_length, + "refseq_accn": refseq_accn, + "genbank_accn": genbank_accn, + "ucsc_name": ucsc_name, + "genbank_assembly_accn": genbank_assembly_accn, + "refseq_assembly_accn": refseq_assembly_accn, + }) + + return rows + + +def write_alias_table(output_path, all_rows): + """Write the alias table CSV.""" + with open(output_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=OUTPUT_COLUMNS) + writer.writeheader() + writer.writerows(all_rows) + + +def main(): + args = parse_args() + + # Step A2: Read inventory and extract accessions + print(f"Reading inventory from {args.inventory}", file=sys.stderr) + pairs = read_accessions_from_inventory(args.inventory) + print(f"Found {len(pairs)} unique accessions", file=sys.stderr) + + # Apply offset and limit + if args.offset: + pairs = pairs[args.offset:] + print(f"Skipped first {args.offset} accessions", file=sys.stderr) + if args.limit: + pairs = pairs[: args.limit] + print(f"Limited to {args.limit} accessions", file=sys.stderr) + + # Create cache directory + os.makedirs(args.report_cache, exist_ok=True) + + # Step A4: Download reports + n_cached = 0 + n_downloaded = 0 + n_failed = 0 + n_skipped = 0 + downloaded_reports = [] # (accession, cache_path) + + print(f"\nDownloading assembly reports...", file=sys.stderr) + for i, (accession, filename) in enumerate(pairs, 1): + print( + f"[{i}/{len(pairs)}] {accession}...", + end=" ", + flush=True, + file=sys.stderr, + ) + cache_path, status = download_report(accession, filename, args.report_cache) + print(status, file=sys.stderr) + + if status == "cached": + n_cached += 1 + downloaded_reports.append((accession, cache_path)) + elif status == "downloaded": + n_downloaded += 1 + downloaded_reports.append((accession, cache_path)) + elif status == "failed": + n_failed += 1 + elif status == "skipped": + n_skipped += 1 + + print( + f"\nDownload summary: {n_downloaded} downloaded, {n_cached} cached, " + f"{n_failed} failed, {n_skipped} skipped", + file=sys.stderr, + ) + + if args.download_only: + print("--download-only specified, stopping before parsing.", file=sys.stderr) + return + + # Step A5: Parse reports into flat CSV + print(f"\nParsing assembly reports...", file=sys.stderr) + all_rows = [] + n_parsed = 0 + for accession, cache_path in downloaded_reports: + if not os.path.exists(cache_path) or 
os.path.getsize(cache_path) == 0: + continue + rows = parse_assembly_report(cache_path, accession) + all_rows.extend(rows) + n_parsed += 1 + + write_alias_table(args.output, all_rows) + + # Summary + print(f"\nResults:", file=sys.stderr) + print(f" Accessions processed: {len(pairs)}", file=sys.stderr) + print(f" Reports parsed: {n_parsed}", file=sys.stderr) + print(f" Total sequence rows: {len(all_rows)}", file=sys.stderr) + print(f" Output written to: {args.output}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/aliases/register_aliases.sbatch b/data_loaders/ref-genome-analysis/aliases/register_aliases.sbatch new file mode 100644 index 0000000..10dfe48 --- /dev/null +++ b/data_loaders/ref-genome-analysis/aliases/register_aliases.sbatch @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name=ncbi_aliases +#SBATCH --output=ncbi_aliases_%j.log +#SBATCH --error=ncbi_aliases_%j.log +#SBATCH --partition=standard +#SBATCH --time=4:00:00 +#SBATCH --mem=8G +#SBATCH --cpus-per-task=1 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget/data_loaders/ref-genome-analysis/aliases + +python register_ncbi_aliases.py --store-path /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget_store diff --git a/data_loaders/ref-genome-analysis/aliases/register_ncbi_aliases.py b/data_loaders/ref-genome-analysis/aliases/register_ncbi_aliases.py new file mode 100644 index 0000000..c344d75 --- /dev/null +++ b/data_loaders/ref-genome-analysis/aliases/register_ncbi_aliases.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 +""" +Register NCBI sequence and collection aliases in a RefgetStore. + +Phase B of the alias registration pipeline. Reads the ncbi_alias_table.csv +(from Phase A), matches sequences to store digests, and bulk-loads aliases +via temporary TSV files. 
+ +Usage: + python register_ncbi_aliases.py --store-path /path/to/store + python register_ncbi_aliases.py --store-path /path/to/store --dry-run + python register_ncbi_aliases.py --store-path /path/to/store --limit 5 +""" + +import argparse +import csv +import os +import sys +import tempfile +import time +from collections import defaultdict + +from refget.store import RefgetStore + +BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" +STORE_PATH = f"{BRICK_ROOT}/refget_store" +INVENTORY_CSV = f"{BRICK_ROOT}/refgenomes_inventory.csv" +ALIAS_TABLE_CSV = f"{BRICK_ROOT}/refget_staging/ncbi_alias_table.csv" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Register NCBI aliases in RefgetStore") + parser.add_argument("--store-path", default=STORE_PATH, help="Path to RefgetStore") + parser.add_argument("--alias-table", default=ALIAS_TABLE_CSV, help="Path to ncbi_alias_table.csv") + parser.add_argument("--inventory", default=INVENTORY_CSV, help="Path to refgenomes_inventory.csv") + parser.add_argument("--dry-run", action="store_true", help="Parse and match but don't register") + parser.add_argument("--limit", type=int, default=None, help="Process only first N accessions") + parser.add_argument("--offset", type=int, default=0, help="Skip first N accessions") + return parser.parse_args() + + +def read_inventory(csv_path): + """Read inventory CSV, return accession -> path mapping.""" + acc_to_path = {} + with open(csv_path, newline="") as f: + for row in csv.DictReader(f): + acc = row.get("accession", "").strip() + path = row.get("path", "").strip() + if acc and path: + acc_to_path[acc] = path + return acc_to_path + + +def read_alias_table(csv_path): + """Read alias table CSV, return accession -> list of row dicts.""" + acc_to_rows = defaultdict(list) + with open(csv_path, newline="") as f: + for row in csv.DictReader(f): + acc = row.get("accession", "").strip() + if acc: + acc_to_rows[acc].append(row) + return acc_to_rows + + +def write_tsv(path, pairs): + """Write alias\tdigest pairs to a TSV file.""" + with open(path, "w") as f: + for alias, digest in pairs: + f.write(f"{alias}\t{digest}\n") + + +def main(): + args = parse_args() + + # Read inputs + print(f"Reading inventory from {args.inventory}") + acc_to_path = read_inventory(args.inventory) + print(f" {len(acc_to_path)} accessions with paths") + + print(f"Reading alias table from {args.alias_table}") + acc_to_rows = read_alias_table(args.alias_table) + print(f" {len(acc_to_rows)} accessions, {sum(len(v) for v in acc_to_rows.values())} sequence rows") + + # Filter to accessions present in both + common_accessions = sorted(set(acc_to_path) & set(acc_to_rows)) + print(f" {len(common_accessions)} accessions in both inventory and alias table") + + if args.offset: + common_accessions = common_accessions[args.offset:] + print(f" Skipped first {args.offset}") + if args.limit: + common_accessions = common_accessions[:args.limit] + print(f" Limited to {args.limit}") + + # Open store + store = RefgetStore.on_disk(args.store_path) + store.set_quiet(True) + print(f"Store opened: {store.stats()}") + + # Accumulate all aliases in memory, then bulk-load at the end + seq_aliases = {"refseq": [], "insdc": [], "ucsc": []} + coll_aliases = {"refseq": [], "insdc": []} + + n_collections = 0 + n_matched = 0 + n_unmatched = 0 + n_skipped_files = 0 + t_start = time.time() + + for i, accession in enumerate(common_accessions, 1): + fasta_path = acc_to_path[accession] + alias_rows = acc_to_rows[accession] + + 
print(f"[{i}/{len(common_accessions)}] {accession} ({len(alias_rows)} seqs)...", end=" ", flush=True) + + # Get collection digest by loading (returns immediately if exists) + if not os.path.exists(fasta_path): + print("SKIP (file missing)") + n_skipped_files += 1 + continue + + try: + meta, was_new = store.add_sequence_collection_from_fasta(fasta_path) + except Exception as e: + print(f"SKIP ({e})") + n_skipped_files += 1 + continue + + coll_digest = meta.digest + n_collections += 1 + + # Collection-level aliases from report header + first_row = alias_rows[0] + genbank_acc = first_row.get("genbank_assembly_accn", "").strip() + refseq_acc = first_row.get("refseq_assembly_accn", "").strip() + + if refseq_acc: + coll_aliases["refseq"].append((refseq_acc, coll_digest)) + if genbank_acc: + coll_aliases["insdc"].append((genbank_acc, coll_digest)) + + # Get collection's sequences to match against alias table + level2 = store.get_collection_level2(coll_digest) + names = level2.get("names", []) + lengths = level2.get("lengths", []) + sequences = level2.get("sequences", []) + + # Build name -> (seq_digest, length) lookup + name_to_info = {} + for name, length, seq_digest in zip(names, lengths, sequences): + name_to_info[name] = (seq_digest, int(length)) + + # Match alias table rows to store sequences + matched_this = 0 + unmatched_this = 0 + for row in alias_rows: + seq_name = row.get("sequence_name", "").strip() + seq_length_str = row.get("sequence_length", "").strip() + refseq_accn = row.get("refseq_accn", "").strip() + genbank_accn = row.get("genbank_accn", "").strip() + ucsc_name = row.get("ucsc_name", "").strip() + + seq_length = int(seq_length_str) if seq_length_str else None + + # Try matching by sequence_name, then refseq_accn, then genbank_accn, then ucsc_name + seq_digest = None + for candidate in [seq_name, refseq_accn, genbank_accn, ucsc_name]: + if candidate and candidate in name_to_info: + store_digest, store_length = name_to_info[candidate] + if seq_length is None or store_length == seq_length: + seq_digest = store_digest + break + + if seq_digest is None: + unmatched_this += 1 + continue + + matched_this += 1 + + if refseq_accn: + seq_aliases["refseq"].append((refseq_accn, seq_digest)) + if genbank_accn: + seq_aliases["insdc"].append((genbank_accn, seq_digest)) + if ucsc_name: + seq_aliases["ucsc"].append((ucsc_name, seq_digest)) + + n_matched += matched_this + n_unmatched += unmatched_this + print(f"{coll_digest[:12]}... 
{matched_this}/{len(alias_rows)} matched") + + match_elapsed = time.time() - t_start + + # Summary of what was collected + n_seq_aliases = sum(len(v) for v in seq_aliases.values()) + n_coll_aliases = sum(len(v) for v in coll_aliases.values()) + print(f"\nMatching done in {match_elapsed:.1f}s") + print(f" Collections: {n_collections}, skipped: {n_skipped_files}") + print(f" Sequences matched: {n_matched}, unmatched: {n_unmatched}") + print(f" Sequence aliases to register: {n_seq_aliases}") + print(f" Collection aliases to register: {n_coll_aliases}") + + if args.dry_run: + print("\n[DRY RUN] Skipping alias registration.") + return + + # Bulk-load aliases via temp TSV files + print(f"\nRegistering aliases...") + with tempfile.TemporaryDirectory() as tmpdir: + for namespace, pairs in seq_aliases.items(): + if not pairs: + continue + tsv_path = os.path.join(tmpdir, f"seq_{namespace}.tsv") + write_tsv(tsv_path, pairs) + n = store.load_sequence_aliases(namespace, tsv_path) + print(f" sequences/{namespace}: {n} aliases loaded") + + for namespace, pairs in coll_aliases.items(): + if not pairs: + continue + tsv_path = os.path.join(tmpdir, f"coll_{namespace}.tsv") + write_tsv(tsv_path, pairs) + n = store.load_collection_aliases(namespace, tsv_path) + print(f" collections/{namespace}: {n} aliases loaded") + + total_elapsed = time.time() - t_start + print(f"\nDone in {total_elapsed:.1f}s") + print(f" Store stats: {store.stats()}") + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/build_refgetstore.py b/data_loaders/ref-genome-analysis/build_refgetstore.py deleted file mode 100644 index 68ef281..0000000 --- a/data_loaders/ref-genome-analysis/build_refgetstore.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -Build a RefgetStore from the refgenomes inventory CSV. - -Reads refgenomes_inventory.csv and populates a RefgetStore with all FASTA -files. No alias registration -- that is a separate, deliberate step. 
- -Usage: - python build_refgetstore.py [--inventory PATH] [--store-path PATH] [--output PATH] [--limit N] -""" - -import argparse -import csv -import sys -import time - -from refget.store import RefgetStore - -STORE_PATH = "/project/shefflab/brickyard/refget_store" -INVENTORY_CSV = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refgenomes_inventory.csv" -OUTPUT_CSV = "digest_map.csv" - - -def parse_args(): - parser = argparse.ArgumentParser(description="Build RefgetStore from inventory CSV") - parser.add_argument("--inventory", default=INVENTORY_CSV, help="Input inventory CSV") - parser.add_argument("--store-path", default=STORE_PATH, help="RefgetStore path") - parser.add_argument("--output", default=OUTPUT_CSV, help="Output digest map CSV") - parser.add_argument("--limit", type=int, default=None, help="Process only first N rows (for testing)") - parser.add_argument("--offset", type=int, default=0, help="Skip first N rows") - return parser.parse_args() - - -def read_inventory(csv_path): - """Read inventory CSV and return list of row dicts.""" - rows = [] - with open(csv_path, newline="") as f: - reader = csv.DictReader(f) - if reader.fieldnames is None: - print(f"ERROR: {csv_path} appears to be empty", file=sys.stderr) - sys.exit(1) - if "path" not in reader.fieldnames: - print(f"ERROR: {csv_path} missing required 'path' column", file=sys.stderr) - sys.exit(1) - for row in reader: - rows.append(row) - return rows - - -def write_digest_map(output_path, results): - """Write results to digest_map.csv.""" - fieldnames = ["path", "filename", "digest", "n_sequences", "was_new", "error"] - with open(output_path, "w", newline="") as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(results) - - -def main(): - args = parse_args() - - inventory = read_inventory(args.inventory) - if args.offset: - inventory = inventory[args.offset:] - print(f"Skipped first {args.offset} records") - if args.limit: - inventory = inventory[:args.limit] - print(f"Limited to {args.limit} records") - total = len(inventory) - print(f"Processing {total} records from {args.inventory}") - - store = RefgetStore.on_disk(args.store_path) - print(f"Store initialized at {args.store_path}") - - results = [] - n_success = 0 - n_failed = 0 - n_new = 0 - t_start = time.time() - - for i, row in enumerate(inventory, 1): - fasta_path = row["path"] - filename = row.get("filename", "") - - t0 = time.time() - print(f"[{i}/{total}] {filename}...", end=" ", flush=True) - - try: - meta, was_new = store.add_sequence_collection_from_fasta(fasta_path, threads=4) - elapsed = time.time() - t0 - status = "NEW" if was_new else "exists" - if was_new: - n_new += 1 - print(f"{meta.digest} ({meta.n_sequences} seqs, {status}, {elapsed:.1f}s)") - n_success += 1 - results.append({ - "path": fasta_path, - "filename": filename, - "digest": meta.digest, - "n_sequences": meta.n_sequences, - "was_new": was_new, - "error": "", - }) - except Exception as e: - error_msg = f"{type(e).__name__}: {e}" - print(f"FAILED: {error_msg}") - n_failed += 1 - results.append({ - "path": fasta_path, - "filename": filename, - "digest": "", - "n_sequences": 0, - "was_new": False, - "error": error_msg, - }) - - write_digest_map(args.output, results) - - total_time = time.time() - t_start - print(f"\nDone in {total_time:.1f}s. 
{n_success}/{total} succeeded, {n_new} new, {n_failed} failed.") - print(f"Digest map written to {args.output}") - print(f"\nStore stats: {store.stats()}") - - if n_failed > 0: - print(f"\nFailed files:") - for r in results: - if r["error"]: - print(f" {r['filename']}: {r['error']}") - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/data_loaders/ref-genome-analysis/examples/test_20_genomes.py b/data_loaders/ref-genome-analysis/examples/test_20_genomes.py new file mode 100644 index 0000000..6e98e85 --- /dev/null +++ b/data_loaders/ref-genome-analysis/examples/test_20_genomes.py @@ -0,0 +1,165 @@ +""" +Quick test: load 20 genomes into a RefgetStore and attach FHR metadata. + +Usage: + python test_20_genomes.py [--inventory PATH] [--limit N] +""" + +import argparse +import csv +import json +import os +import re +import sys +import tempfile +import time + +from gtars.refget import RefgetStore + +BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" +INVENTORY_CSV = f"{BRICK_ROOT}/refgenomes_inventory.csv" +FHR_DIR = f"{BRICK_ROOT}/refget_staging/fhr_metadata" +STORE_PATH = "/scratch/ns5bc/test_refget_store_20" + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--inventory", default=INVENTORY_CSV) + parser.add_argument("--store-path", default=STORE_PATH) + parser.add_argument("--limit", type=int, default=20) + args = parser.parse_args() + + # Read inventory + with open(args.inventory, newline="") as f: + rows = list(csv.DictReader(f)) + rows = rows[:args.limit] + + print(f"Loading {len(rows)} genomes into {args.store_path}") + os.makedirs(args.store_path, exist_ok=True) + store = RefgetStore.on_disk(args.store_path) + + # Phase 1: Load FASTAs + print("\n=== Phase 1: Load FASTAs ===") + digest_map = {} # filename -> digest + t_start = time.time() + + for i, row in enumerate(rows, 1): + fasta_path = row["path"] + filename = row.get("filename", os.path.basename(fasta_path)) + t0 = time.time() + print(f"[{i}/{len(rows)}] {filename}...", end=" ", flush=True) + try: + meta, was_new = store.add_sequence_collection_from_fasta(fasta_path) + elapsed = time.time() - t0 + status = "NEW" if was_new else "exists" + print(f"{meta.digest} ({meta.n_sequences} seqs, {status}, {elapsed:.1f}s)") + digest_map[filename] = meta.digest + except Exception as e: + print(f"FAILED: {e}") + + t_fasta = time.time() - t_start + print(f"\nPhase 1 done: {len(digest_map)} loaded in {t_fasta:.1f}s") + + # Phase 2: Load FHR metadata (provenance only, no vitalStats) for all collections + # Map build names to GCA accessions for known species + BUILD_TO_ACCESSION = { + ("homo_sapiens", "hg19"): "GCA_000001405", + ("homo_sapiens", "hg38"): "GCA_000001405", + ("mus_musculus", "mm9"): "GCA_000001635", + ("mus_musculus", "mm10"): "GCA_000001635", + ("mus_musculus", "mm39"): "GCA_000001635", + } + + print("\n=== Phase 2: Load FHR metadata ===") + fhr_loaded = 0 + + # Build accession -> set of digests from inventory metadata + accession_digests = {} # accession -> set of digests + for row in rows: + filename = row.get("filename", "") + if filename not in digest_map: + continue + digest = digest_map[filename] + + # Try explicit accession column first + accession = row.get("accession", "").strip() + + # Try extracting from filename + if not accession: + m = re.search(r'(GCA_\d+(?:\.\d+)?)', filename) + if m: + accession = m.group(1) + + # Fall back to group+build mapping + if not accession: + group = row.get("group", "").strip() + build = row.get("build", "").strip() + 
accession = BUILD_TO_ACCESSION.get((group, build), "") + + if accession: + accession_digests.setdefault(accession, set()).add(digest) + + print(f" Found {len(accession_digests)} accessions across {sum(len(v) for v in accession_digests.values())} collections") + + def load_fhr_for_accession(store, accession, fhr_data, digests): + """Strip vitalStats and attach provenance FHR to all matching collections.""" + provenance = {k: v for k, v in fhr_data.items() if k != "vitalStats"} + loaded = 0 + with tempfile.NamedTemporaryFile(mode="w", suffix=".fhr.json", delete=False) as tmp: + json.dump(provenance, tmp, indent=2) + tmp_path = tmp.name + try: + for digest in digests: + store.load_fhr_metadata(digest, tmp_path) + print(f" {accession} -> {digest}") + loaded += 1 + finally: + os.unlink(tmp_path) + return loaded + + for accession, digests in sorted(accession_digests.items()): + # Check for pre-generated FHR file + fhr_path = os.path.join(FHR_DIR, f"{accession}.fhr.json") + if os.path.exists(fhr_path): + with open(fhr_path) as f: + fhr_data = json.load(f) + print(f" {accession}: loading from file ({len(digests)} collections)") + fhr_loaded += load_fhr_for_accession(store, accession, fhr_data, digests) + continue + + # Try NCBI API + try: + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "fhr")) + from genomeark_to_fhr import fetch_ncbi_report, ncbi_to_fhr + print(f" {accession}: fetching from NCBI...", end=" ", flush=True) + report = fetch_ncbi_report(accession) + fhr_data = ncbi_to_fhr(report) + # Save full FHR (with vitalStats) for reference + os.makedirs(FHR_DIR, exist_ok=True) + with open(fhr_path, "w") as f: + json.dump(fhr_data, f, indent=2) + print(f"OK ({len(digests)} collections)") + fhr_loaded += load_fhr_for_accession(store, accession, fhr_data, digests) + except Exception as e: + print(f" {accession}: SKIP ({e})") + + print(f"\nPhase 2 done: {fhr_loaded} FHR entries loaded") + + # Summary + print("\n=== Summary ===") + store_stats = store.stats() + print(f"Store stats: {store_stats}") + fhr_digests = store.list_fhr_metadata() + print(f"FHR entries: {len(fhr_digests)}") + + # Verify FHR data is readable + for digest in fhr_digests: + fhr = store.get_fhr_metadata(digest) + print(f" {digest}: genome={fhr.genome}, version={fhr.version}") + + print(f"\nStore path: {args.store_path}") + print("Done!") + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/fhr/batch_generate_fhr.py b/data_loaders/ref-genome-analysis/fhr/batch_generate_fhr.py new file mode 100755 index 0000000..63e3d81 --- /dev/null +++ b/data_loaders/ref-genome-analysis/fhr/batch_generate_fhr.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +"""Batch-generate FHR metadata for all VGP vertebrate genomes. + +Reads the inventory CSV, extracts unique GCA accessions for vertebrate genomes, +and fetches FHR metadata from NCBI for each. Skips accessions that already have +an FHR file in the output directory, so it's safe to re-run. 
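+Requests are throttled with a short pause between accessions and retried with
+backoff when NCBI responds with HTTP 429.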
+ +Usage: + python batch_generate_fhr.py --inventory /path/to/inventory.csv --output-dir /path/to/fhr_metadata/ + python batch_generate_fhr.py --inventory /path/to/inventory.csv --output-dir /path/to/fhr_metadata/ --group vertebrates +""" + +import argparse +import csv +import re +import sys +import os +import time + +from genomeark_to_fhr import process_accession + + +def main(): + parser = argparse.ArgumentParser(description="Batch-generate FHR metadata from inventory CSV") + parser.add_argument("--inventory", required=True, help="Path to refgenomes_inventory.csv") + parser.add_argument("--output-dir", required=True, help="Output directory for .fhr.json files") + parser.add_argument("--group", default="vertebrates", help="Filter by group column (default: vertebrates)") + parser.add_argument("--limit", type=int, default=None, help="Process only first N accessions") + parser.add_argument("--skip-genomeark", action="store_true", help="Skip GenomeArk YAML fetch (faster)") + args = parser.parse_args() + + # Read inventory and extract unique accessions for the target group + with open(args.inventory, newline="") as f: + rows = list(csv.DictReader(f)) + + accessions = set() + for row in rows: + if row.get("group", "").strip() != args.group: + continue + acc = row.get("accession", "").strip() + if not acc: + m = re.search(r'(GCA_\d+(?:\.\d+)?)', row.get("filename", "")) + if m: + acc = m.group(1) + if acc: + accessions.add(acc) + + accessions = sorted(accessions) + if args.limit: + accessions = accessions[:args.limit] + + # Check which ones already exist + os.makedirs(args.output_dir, exist_ok=True) + existing = {f.replace(".fhr.json", "") for f in os.listdir(args.output_dir) if f.endswith(".fhr.json")} + todo = [a for a in accessions if a not in existing] + + print(f"Group: {args.group}", file=sys.stderr) + print(f"Total accessions: {len(accessions)}", file=sys.stderr) + print(f"Already done: {len(accessions) - len(todo)}", file=sys.stderr) + print(f"To process: {len(todo)}", file=sys.stderr) + + if not todo: + print("Nothing to do!", file=sys.stderr) + return + + n_ok = 0 + n_fail = 0 + t_start = time.time() + + for i, acc in enumerate(todo, 1): + output_path = os.path.join(args.output_dir, f"{acc}.fhr.json") + ok = False + for attempt in range(3): + try: + print(f"[{i}/{len(todo)}] ", end="", file=sys.stderr) + process_accession(acc, output_path) + n_ok += 1 + ok = True + break + except Exception as e: + if "429" in str(e) and attempt < 2: + wait = 5 * (attempt + 1) + print(f"[{i}/{len(todo)}] {acc}: rate limited, waiting {wait}s...", file=sys.stderr) + time.sleep(wait) + else: + print(f"[{i}/{len(todo)}] {acc}: FAILED ({e})", file=sys.stderr) + n_fail += 1 + break + + # Throttle to ~3 requests/sec (NCBI + GenomeArk = 2 requests per accession) + time.sleep(0.3) + + elapsed = time.time() - t_start + print(f"\nDone in {elapsed:.0f}s: {n_ok} OK, {n_fail} failed out of {len(todo)}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/fhr/genomeark_to_fhr.py b/data_loaders/ref-genome-analysis/fhr/genomeark_to_fhr.py new file mode 100755 index 0000000..e8a1e72 --- /dev/null +++ b/data_loaders/ref-genome-analysis/fhr/genomeark_to_fhr.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +"""Generate FHR metadata JSON files from GenomeArk + NCBI Datasets API. + +Given a GCA accession, fetches: + 1. Assembly metadata from NCBI Datasets API (taxonomy, stats, sequencing tech) + 2. 
Species metadata from GenomeArk GitHub repo (common name, genome size, project) + +Outputs an FHR-compatible JSON file that can be loaded into a RefgetStore via +store.load_fhr_metadata(digest, path). + +Usage: + python genomeark_to_fhr.py GCA_964261635.1 [output.fhr.json] + python genomeark_to_fhr.py GCA_964261635.1 GCA_964263255.1 # multiple accessions +""" + +import json +import sys +import urllib.request +from pathlib import Path + + +def fetch_ncbi_report(accession: str) -> dict: + """Fetch assembly report from NCBI Datasets API.""" + url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{accession}/dataset_report" + req = urllib.request.Request(url, headers={"Accept": "application/json"}) + with urllib.request.urlopen(req) as resp: + data = json.loads(resp.read()) + reports = data.get("reports", []) + if not reports: + raise ValueError(f"No assembly report found for {accession}") + return reports[0] + + +def fetch_genomeark_yaml(species_name: str) -> dict | None: + """Fetch species YAML from genomeark-metadata GitHub repo.""" + filename = species_name.replace(" ", "_") + url = f"https://raw.githubusercontent.com/genomeark/genomeark-metadata/main/species/{filename}.yaml" + try: + import yaml + except ImportError: + # Fall back to basic parsing if PyYAML not available + try: + with urllib.request.urlopen(url) as resp: + text = resp.read().decode() + # Basic extraction without full YAML parsing + result = {"_raw": text} + for line in text.split("\n"): + line = line.strip() + if line.startswith("common_name:"): + result["common_name"] = line.split(":", 1)[1].strip().strip("'\"") + elif line.startswith("genome_size:"): + try: + result["genome_size"] = int(line.split(":", 1)[1].strip()) + except ValueError: + pass + elif line.startswith("project:"): + result["project"] = line.split(":", 1)[1].strip() + return result + except Exception: + return None + + try: + with urllib.request.urlopen(url) as resp: + return yaml.safe_load(resp.read()) + except Exception: + return None + + +def ncbi_to_fhr(report: dict, genomeark: dict | None = None) -> dict: + """Convert NCBI assembly report + GenomeArk data to FHR metadata.""" + organism = report.get("organism", {}) + assembly = report.get("assembly_info", {}) + stats = report.get("assembly_stats", {}) + + species_name = organism.get("organism_name", "") + tax_id = organism.get("tax_id") + common_name = organism.get("common_name", "") + + # GenomeArk may have a better common name + if genomeark: + species = genomeark.get("species", genomeark) + common_name = common_name or species.get("common_name", "") + + fhr = { + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1, + "genome": species_name, + "version": assembly.get("assembly_name", ""), + "dateCreated": assembly.get("release_date", ""), + } + + # Taxonomy + if tax_id: + fhr["taxon"] = { + "name": species_name, + "uri": f"https://identifiers.org/taxonomy:{tax_id}", + } + + # Common name as synonym + if common_name: + fhr["genomeSynonym"] = [common_name] + + # Accession + accession = report.get("accession", "") + if accession: + fhr["accessionID"] = { + "name": accession, + "url": f"https://www.ncbi.nlm.nih.gov/datasets/genome/{accession}/", + } + + # Submitter as assembly author + submitter = assembly.get("submitter", "") + if submitter: + fhr["assemblyAuthor"] = [{"name": submitter}] + + # Sequencing technology + seq_tech = assembly.get("sequencing_tech", "") + if seq_tech: + fhr["instrument"] = [t.strip() for t in 
seq_tech.split(",")] + + # Assembly method + method = assembly.get("assembly_method", "") + if method and method != "various": + fhr["assemblySoftware"] = method + + # Vital statistics + vital = {} + if stats.get("contig_n50"): + vital["N50"] = stats["contig_n50"] + if stats.get("contig_l50"): + vital["L50"] = stats["contig_l50"] + if stats.get("total_sequence_length"): + vital["totalBasePairs"] = int(stats["total_sequence_length"]) + if stats.get("number_of_contigs"): + vital["numberContigs"] = stats["number_of_contigs"] + if stats.get("number_of_scaffolds"): + vital["numberScaffolds"] = stats["number_of_scaffolds"] + if stats.get("scaffold_n50"): + vital["scaffoldN50"] = stats["scaffold_n50"] + if vital: + fhr["vitalStats"] = vital + + # Related links + links = [] + links.append(f"https://www.genomeark.org/genomeark-all/{species_name.replace(' ', '_')}.html") + if accession: + links.append(f"https://www.ncbi.nlm.nih.gov/datasets/genome/{accession}/") + fhr["relatedLink"] = links + + # BioProject lineage — note VGP/DToL/EBP affiliations + projects = [] + for lineage in assembly.get("bioproject_lineage", []): + for bp in lineage.get("bioprojects", []): + title = bp.get("title", "") + if any(kw in title.lower() for kw in ["vertebrate genomes", "darwin tree", "earth biogenome"]): + projects.append(title) + if projects: + fhr["documentation"] = "Projects: " + "; ".join(projects) + + # License + fhr["license"] = "https://www.genomeark.org/documentation/data-use-policy.html" + + return fhr + + +def process_accession(accession: str, output_path: str | None = None) -> str: + """Process a single accession and write FHR JSON.""" + print(f"Fetching NCBI report for {accession}...", file=sys.stderr) + report = fetch_ncbi_report(accession) + + species_name = report.get("organism", {}).get("organism_name", "") + print(f" Species: {species_name}", file=sys.stderr) + + print(f" Fetching GenomeArk metadata...", file=sys.stderr) + genomeark = fetch_genomeark_yaml(species_name) if species_name else None + + fhr = ncbi_to_fhr(report, genomeark) + + if output_path is None: + output_path = f"{accession}.fhr.json" + + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w") as f: + json.dump(fhr, f, indent=2) + + print(f" Wrote: {output_path}", file=sys.stderr) + return output_path + + +def main(): + if len(sys.argv) < 2: + print("Usage: genomeark_to_fhr.py [accession2 ...] [--output-dir DIR]") + print(" genomeark_to_fhr.py GCA_964261635.1") + print(" genomeark_to_fhr.py GCA_964261635.1 GCA_964263255.1 --output-dir fhr/") + sys.exit(1) + + args = sys.argv[1:] + output_dir = None + + if "--output-dir" in args: + idx = args.index("--output-dir") + output_dir = args[idx + 1] + args = args[:idx] + args[idx + 2:] + + for accession in args: + if output_dir: + output_path = f"{output_dir}/{accession}.fhr.json" + else: + output_path = f"{accession}.fhr.json" + process_accession(accession, output_path) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/fhr/load_fhr_metadata.py b/data_loaders/ref-genome-analysis/fhr/load_fhr_metadata.py new file mode 100644 index 0000000..78a9bfd --- /dev/null +++ b/data_loaders/ref-genome-analysis/fhr/load_fhr_metadata.py @@ -0,0 +1,97 @@ +""" +Load FHR metadata JSON files into an existing RefgetStore. + +Resolves accessions to collection digests via the store's 'insdc' alias +namespace. Strips vitalStats before loading, since those describe the source +assembly, not the specific sequence collection. 
+ +Usage: + python load_fhr_metadata.py --store-path /path/to/store --fhr-dir fhr_metadata/ + python load_fhr_metadata.py --store-path /path/to/store --fhr file.fhr.json --digest abc123 +""" + +import argparse +import glob +import json +import os +import sys +import tempfile + +from gtars.refget import RefgetStore + + +def strip_vital_stats(fhr_path): + """Write a temp FHR file with vitalStats removed. Returns temp path.""" + with open(fhr_path) as f: + fhr_data = json.load(f) + provenance = {k: v for k, v in fhr_data.items() if k != "vitalStats"} + tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".fhr.json", delete=False) + json.dump(provenance, tmp, indent=2) + tmp.close() + return tmp.name + + +def load_fhr_dir(store, fhr_dir, namespaces=("insdc", "refseq")): + """Load all .fhr.json files, resolving accession -> digest via alias namespaces.""" + fhr_files = sorted(glob.glob(os.path.join(fhr_dir, "*.fhr.json"))) + if not fhr_files: + print(f"No .fhr.json files found in {fhr_dir}", file=sys.stderr) + return + + print(f"Loading {len(fhr_files)} FHR files, resolving via {namespaces} aliases...", file=sys.stderr) + + n_loaded = 0 + n_skipped = 0 + for fhr_path in fhr_files: + basename = os.path.basename(fhr_path) + accession = basename.replace(".fhr.json", "") + + meta = None + for ns in namespaces: + meta = store.get_collection_metadata_by_alias(ns, accession) + if meta is not None: + break + + if meta is None: + n_skipped += 1 + continue + + tmp_path = strip_vital_stats(fhr_path) + try: + store.load_fhr_metadata(meta.digest, tmp_path) + finally: + os.unlink(tmp_path) + n_loaded += 1 + + if n_loaded % 100 == 0: + print(f" ... {n_loaded} loaded", file=sys.stderr) + + print(f"\nLoaded {n_loaded}, skipped {n_skipped} (no alias match)", file=sys.stderr) + + +def main(): + parser = argparse.ArgumentParser(description="Load FHR metadata into RefgetStore") + parser.add_argument("--store-path", required=True, help="Path to RefgetStore") + parser.add_argument("--fhr-dir", help="Directory of .fhr.json files") + parser.add_argument("--fhr", help="Single .fhr.json file") + parser.add_argument("--digest", help="Collection digest (required with --fhr)") + args = parser.parse_args() + + store = RefgetStore.on_disk(args.store_path) + + if args.fhr_dir: + load_fhr_dir(store, args.fhr_dir) + elif args.fhr and args.digest: + tmp_path = strip_vital_stats(args.fhr) + try: + store.load_fhr_metadata(args.digest, tmp_path) + finally: + os.unlink(tmp_path) + print(f"Loaded {args.fhr} -> {args.digest}", file=sys.stderr) + else: + print("Provide --fhr-dir or --fhr + --digest", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/fhr/metadata/GCA_000001405.29.fhr.json b/data_loaders/ref-genome-analysis/fhr/metadata/GCA_000001405.29.fhr.json new file mode 100755 index 0000000..4c3b322 --- /dev/null +++ b/data_loaders/ref-genome-analysis/fhr/metadata/GCA_000001405.29.fhr.json @@ -0,0 +1,36 @@ +{ + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1, + "genome": "Homo sapiens", + "version": "GRCh38.p14", + "dateCreated": "2022-02-03", + "taxon": { + "name": "Homo sapiens", + "uri": "https://identifiers.org/taxonomy:9606" + }, + "genomeSynonym": [ + "human" + ], + "accessionID": { + "name": "GCA_000001405.29", + "url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_000001405.29/" + }, + "assemblyAuthor": [ + { + "name": "Genome Reference Consortium" + } + ], + "vitalStats": { + "N50": 
57879411, + "L50": 18, + "totalBasePairs": 3099734149, + "numberContigs": 999, + "numberScaffolds": 473, + "scaffoldN50": 67794873 + }, + "relatedLink": [ + "https://www.genomeark.org/genomeark-all/Homo_sapiens.html", + "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_000001405.29/" + ], + "license": "https://www.genomeark.org/documentation/data-use-policy.html" +} \ No newline at end of file diff --git a/data_loaders/ref-genome-analysis/fhr/metadata/GCA_000001405.fhr.json b/data_loaders/ref-genome-analysis/fhr/metadata/GCA_000001405.fhr.json new file mode 100755 index 0000000..4c3b322 --- /dev/null +++ b/data_loaders/ref-genome-analysis/fhr/metadata/GCA_000001405.fhr.json @@ -0,0 +1,36 @@ +{ + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1, + "genome": "Homo sapiens", + "version": "GRCh38.p14", + "dateCreated": "2022-02-03", + "taxon": { + "name": "Homo sapiens", + "uri": "https://identifiers.org/taxonomy:9606" + }, + "genomeSynonym": [ + "human" + ], + "accessionID": { + "name": "GCA_000001405.29", + "url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_000001405.29/" + }, + "assemblyAuthor": [ + { + "name": "Genome Reference Consortium" + } + ], + "vitalStats": { + "N50": 57879411, + "L50": 18, + "totalBasePairs": 3099734149, + "numberContigs": 999, + "numberScaffolds": 473, + "scaffoldN50": 67794873 + }, + "relatedLink": [ + "https://www.genomeark.org/genomeark-all/Homo_sapiens.html", + "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_000001405.29/" + ], + "license": "https://www.genomeark.org/documentation/data-use-policy.html" +} \ No newline at end of file diff --git a/data_loaders/ref-genome-analysis/fhr/metadata/GCA_964261635.1.fhr.json b/data_loaders/ref-genome-analysis/fhr/metadata/GCA_964261635.1.fhr.json new file mode 100644 index 0000000..37fdd6f --- /dev/null +++ b/data_loaders/ref-genome-analysis/fhr/metadata/GCA_964261635.1.fhr.json @@ -0,0 +1,41 @@ +{ + "schema": "https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1, + "genome": "Lissotriton helveticus", + "version": "aLisHel1.1", + "dateCreated": "2024-10-17", + "taxon": { + "name": "Lissotriton helveticus", + "uri": "https://identifiers.org/taxonomy:256425" + }, + "genomeSynonym": [ + "palmate newt" + ], + "accessionID": { + "name": "GCA_964261635.1", + "url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_964261635.1/" + }, + "assemblyAuthor": [ + { + "name": "WELLCOME SANGER INSTITUTE" + } + ], + "instrument": [ + "PacBio", + "Arima2" + ], + "vitalStats": { + "N50": 7795245, + "L50": 941, + "totalBasePairs": 23170028842, + "numberContigs": 5693, + "numberScaffolds": 448, + "scaffoldN50": 2132484007 + }, + "relatedLink": [ + "https://www.genomeark.org/genomeark-all/Lissotriton_helveticus.html", + "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_964261635.1/" + ], + "documentation": "Projects: Vertebrate Genomes Project; Darwin Tree of Life Project: Genome Data and Assemblies; Earth BioGenome Project (EBP)", + "license": "https://www.genomeark.org/documentation/data-use-policy.html" +} \ No newline at end of file diff --git a/data_loaders/ref-genome-analysis/fhr/metadata/GCA_964263255.1.fhr.json b/data_loaders/ref-genome-analysis/fhr/metadata/GCA_964263255.1.fhr.json new file mode 100644 index 0000000..e8a6bda --- /dev/null +++ b/data_loaders/ref-genome-analysis/fhr/metadata/GCA_964263255.1.fhr.json @@ -0,0 +1,41 @@ +{ + "schema": 
"https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json", + "schemaVersion": 1, + "genome": "Lissotriton vulgaris", + "version": "aLisVul1.1", + "dateCreated": "2024-10-17", + "taxon": { + "name": "Lissotriton vulgaris", + "uri": "https://identifiers.org/taxonomy:8324" + }, + "genomeSynonym": [ + "common newt" + ], + "accessionID": { + "name": "GCA_964263255.1", + "url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_964263255.1/" + }, + "assemblyAuthor": [ + { + "name": "WELLCOME SANGER INSTITUTE" + } + ], + "instrument": [ + "PacBio", + "Arima2" + ], + "vitalStats": { + "N50": 6568731, + "L50": 1102, + "totalBasePairs": 24226223864, + "numberContigs": 19295, + "numberScaffolds": 15265, + "scaffoldN50": 1925992481 + }, + "relatedLink": [ + "https://www.genomeark.org/genomeark-all/Lissotriton_vulgaris.html", + "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_964263255.1/" + ], + "documentation": "Projects: Vertebrate Genomes Project; Darwin Tree of Life Project: Genome Data and Assemblies; Earth BioGenome Project (EBP)", + "license": "https://www.genomeark.org/documentation/data-use-policy.html" +} \ No newline at end of file diff --git a/data_loaders/ref-genome-analysis/inventory_genomes.py b/data_loaders/ref-genome-analysis/inventory/inventory_genomes.py similarity index 100% rename from data_loaders/ref-genome-analysis/inventory_genomes.py rename to data_loaders/ref-genome-analysis/inventory/inventory_genomes.py diff --git a/data_loaders/ref-genome-analysis/process-all-genomes.sbatch b/data_loaders/ref-genome-analysis/process-all-genomes.sbatch deleted file mode 100644 index 28f3de7..0000000 --- a/data_loaders/ref-genome-analysis/process-all-genomes.sbatch +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=refgetstore -#SBATCH --output=refgetstore_%j.log -#SBATCH --error=refgetstore_%j.log -#SBATCH --partition=standard -#SBATCH --time=24:00:00 -#SBATCH --mem=16G -#SBATCH --cpus-per-task=4 -#SBATCH --account=shefflab - -module load miniforge/24.3.0-py3.11 - -cd /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget/data_loaders/ref-genome-analysis - -python build_refgetstore.py --store-path /project/shefflab/brickyard/refget_store diff --git a/data_loaders/ref-genome-analysis/profiling/profile_all.py b/data_loaders/ref-genome-analysis/profiling/profile_all.py new file mode 100644 index 0000000..e9cf60b --- /dev/null +++ b/data_loaders/ref-genome-analysis/profiling/profile_all.py @@ -0,0 +1,48 @@ +"""Profile RefgetStore on 5 genomes: newt + 4 normal. 
Compare timing and memory.""" +import time +import resource + +def peak_mb(): + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +def rss_mb(): + try: + with open("/proc/self/status") as f: + for line in f: + if line.startswith("VmRSS:"): + return int(line.split()[1]) / 1024 + except: + pass + return peak_mb() + +from gtars.refget import RefgetStore + +BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" +STORE_PATH = f"{BRICK_ROOT}/refget_store" +GENOMES = [ + # (path, old_total_time, n_seqs, label) + ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964261635.1.fa.gz", 183.7, 448, "newt (2GB chr)"), + ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964263255.1.fa.gz", 213.2, 15265, "15K seqs"), + ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964263955.1.fa.gz", 42.7, 11150, "11K seqs"), + ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964264875.2.fa.gz", 27.4, 585, "585 seqs"), + ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964266715.1.fa.gz", 17.0, 1581, "1.6K seqs"), +] + +store = RefgetStore.on_disk(STORE_PATH) +print(f"Store opened. Stats: {store.stats()}") +print(f"RSS after open: {rss_mb():.0f} MB\n") + +print(f"{'Genome':<30} {'Seqs':>6} {'New(s)':>8} {'Old(s)':>8} {'Ratio':>7} {'Peak MB':>8}") +print("-" * 75) + +for fasta, old_total, old_nseqs, label in GENOMES: + name = fasta.split("/")[-1] + t0 = time.time() + meta, was_new = store.add_sequence_collection_from_fasta(fasta) + elapsed = time.time() - t0 + ratio = elapsed / old_total + status = "NEW" if was_new else "SKIP" + print(f"{label:<30} {meta.n_sequences:>6} {elapsed:>7.1f}s {old_total:>7.1f}s {ratio:>6.2f}x {peak_mb():>7.0f}", flush=True) + +print(f"\nFinal RSS: {rss_mb():.0f} MB, Peak: {peak_mb():.0f} MB") +print(f"Store stats: {store.stats()}") diff --git a/data_loaders/ref-genome-analysis/profiling/profile_all.sbatch b/data_loaders/ref-genome-analysis/profiling/profile_all.sbatch new file mode 100644 index 0000000..e1edf19 --- /dev/null +++ b/data_loaders/ref-genome-analysis/profiling/profile_all.sbatch @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name=profile_all +#SBATCH --output=profile_all_%j.log +#SBATCH --error=profile_all_%j.log +#SBATCH --partition=standard +#SBATCH --time=2:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=4 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget/data_loaders/ref-genome-analysis/profiling + +python profile_all.py diff --git a/data_loaders/ref-genome-analysis/profiling/profile_batch.py b/data_loaders/ref-genome-analysis/profiling/profile_batch.py new file mode 100644 index 0000000..b291e78 --- /dev/null +++ b/data_loaders/ref-genome-analysis/profiling/profile_batch.py @@ -0,0 +1,28 @@ +"""Profile RefgetStore on several normal genomes for timing comparison.""" +import time +import resource + +def peak_mb(): + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +from gtars.refget import RefgetStore + +BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" +STORE_PATH = f"{BRICK_ROOT}/refget_store" +GENOMES = [ + # (path, old_pipeline_time, old_total_time, n_seqs) + ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964263255.1.fa.gz", 203.1, 213.2, 15265), + 
("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964263955.1.fa.gz", 32.6, 42.7, 11150), + ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964266715.1.fa.gz", 7.2, 17.0, 1581), +] + +store = RefgetStore.on_disk(STORE_PATH) +print(f"Store opened. Stats: {store.stats()}\n") + +for fasta, old_pipe, old_total, old_nseqs in GENOMES: + name = fasta.split("/")[-1] + t0 = time.time() + meta, was_new = store.add_sequence_collection_from_fasta(fasta) + elapsed = time.time() - t0 + status = "NEW" if was_new else "SKIP" + print(f"{status} {name}: {meta.n_sequences} seqs, {elapsed:.1f}s (old: {old_pipe:.1f}s pipe / {old_total:.1f}s total), Peak={peak_mb():.0f} MB") diff --git a/data_loaders/ref-genome-analysis/profiling/profile_memory.py b/data_loaders/ref-genome-analysis/profiling/profile_memory.py new file mode 100644 index 0000000..5511032 --- /dev/null +++ b/data_loaders/ref-genome-analysis/profiling/profile_memory.py @@ -0,0 +1,87 @@ +"""Profile RefgetStore memory usage on Rivanna.""" +import os +import sys +import time +import resource +import csv + +def rss_mb(): + """Current RSS in MB from /proc/self/status (more accurate than ru_maxrss).""" + try: + with open("/proc/self/status") as f: + for line in f: + if line.startswith("VmRSS:"): + return int(line.split()[1]) / 1024 # KB to MB + except: + pass + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +def peak_mb(): + """Peak RSS (high-water mark).""" + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +def print_mem(label): + print(f"[MEM] {label}: RSS={rss_mb():.1f} MB, Peak={peak_mb():.1f} MB", flush=True) + +print_mem("startup") + +from gtars.refget import RefgetStore +print_mem("after import") + +BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" +STORE_PATH = f"{BRICK_ROOT}/refget_store" +INVENTORY_CSV = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refgenomes_inventory.csv" + +# Open the store +t0 = time.time() +store = RefgetStore.on_disk(STORE_PATH) +t1 = time.time() +print_mem(f"after open_local ({t1-t0:.1f}s)") +print(f"Store stats: {store.stats()}") + +# Read inventory +rows = [] +with open(INVENTORY_CSV) as f: + reader = csv.DictReader(f) + for row in reader: + rows.append(row) + +# Use offset to skip to unprocessed files +OFFSET = int(sys.argv[1]) if len(sys.argv) > 1 else 0 +TARGET_NEW = int(sys.argv[2]) if len(sys.argv) > 2 else 5 + +if OFFSET: + rows = rows[OFFSET:] + print(f"Skipped to offset {OFFSET}, {len(rows)} remaining") + +print(f"Total rows to process: {len(rows)}, targeting {TARGET_NEW} new files") +print_mem("before processing loop") + +n_new = 0 +n_skipped = 0 + +for i, row in enumerate(rows): + fasta_path = row["path"] + filename = row.get("filename", "") + + t0 = time.time() + try: + meta, was_new = store.add_sequence_collection_from_fasta(fasta_path, threads=4) + elapsed = time.time() - t0 + + if was_new: + n_new += 1 + print(f"\n[{OFFSET+i+1}] NEW: {filename} -> {meta.digest} ({meta.n_sequences} seqs, {elapsed:.1f}s)") + print_mem(f"after NEW #{n_new}") + print(f"Store stats: {store.stats()}") + if n_new >= TARGET_NEW: + break + else: + n_skipped += 1 + if n_skipped % 50 == 0: + print_mem(f"skipping... 
({n_skipped} skipped, row {OFFSET+i+1})") + except Exception as e: + print(f"[{OFFSET+i+1}] FAILED {filename}: {e}") + +print(f"\nDone: {n_new} new, {n_skipped} skipped") +print_mem("final") diff --git a/data_loaders/ref-genome-analysis/profiling/profile_memory.sbatch b/data_loaders/ref-genome-analysis/profiling/profile_memory.sbatch new file mode 100644 index 0000000..b6ddc84 --- /dev/null +++ b/data_loaders/ref-genome-analysis/profiling/profile_memory.sbatch @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name=profile_mem +#SBATCH --output=profile_mem_%j.log +#SBATCH --error=profile_mem_%j.log +#SBATCH --partition=standard +#SBATCH --time=2:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=4 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget/data_loaders/ref-genome-analysis/profiling + +python profile_memory.py 850 5 diff --git a/data_loaders/ref-genome-analysis/profiling/profile_newt.py b/data_loaders/ref-genome-analysis/profiling/profile_newt.py new file mode 100644 index 0000000..5343eba --- /dev/null +++ b/data_loaders/ref-genome-analysis/profiling/profile_newt.py @@ -0,0 +1,57 @@ +"""Profile RefgetStore memory on the palmate newt genome (GCA_964261635.1). + +This genome has a single 2 GB chromosome — the worst case for pipeline memory. +Run via sbatch after removing the genome from the store to force re-processing. +""" +import os +import sys +import time +import resource + +def rss_mb(): + """Current RSS in MB from /proc/self/status.""" + try: + with open("/proc/self/status") as f: + for line in f: + if line.startswith("VmRSS:"): + return int(line.split()[1]) / 1024 + except: + pass + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +def peak_mb(): + """Peak RSS (high-water mark).""" + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +def print_mem(label): + print(f"[MEM] {label}: RSS={rss_mb():.1f} MB, Peak={peak_mb():.1f} MB", flush=True) + +print_mem("startup") + +from gtars.refget import RefgetStore +print_mem("after import") + +BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" +STORE_PATH = f"{BRICK_ROOT}/refget_store" +NEWT_FASTA = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964261635.1.fa.gz" + +# Open the store +t0 = time.time() +store = RefgetStore.on_disk(STORE_PATH) +t1 = time.time() +print_mem(f"after open_local ({t1-t0:.1f}s)") +print(f"Store stats: {store.stats()}") + +# Process the newt genome +print(f"\nProcessing newt genome: {NEWT_FASTA}") +t0 = time.time() +meta, was_new = store.add_sequence_collection_from_fasta(NEWT_FASTA) +elapsed = time.time() - t0 + +status = "NEW" if was_new else "SKIPPED (already exists)" +print(f"\nResult: {status}") +print(f"Digest: {meta.digest}") +print(f"Sequences: {meta.n_sequences}") +print(f"Time: {elapsed:.1f}s") +print_mem("after processing") +print(f"Store stats: {store.stats()}") diff --git a/data_loaders/ref-genome-analysis/profiling/profile_newt.sbatch b/data_loaders/ref-genome-analysis/profiling/profile_newt.sbatch new file mode 100644 index 0000000..e3595af --- /dev/null +++ b/data_loaders/ref-genome-analysis/profiling/profile_newt.sbatch @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name=profile_newt +#SBATCH --output=profile_newt_%j.log +#SBATCH --error=profile_newt_%j.log +#SBATCH --partition=standard +#SBATCH --time=2:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=4 +#SBATCH --account=shefflab + +module load 
miniforge/24.3.0-py3.11 + +cd /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget/data_loaders/ref-genome-analysis/profiling + +python profile_newt.py diff --git a/data_loaders/ref-genome-analysis/profiling/profile_normal.py b/data_loaders/ref-genome-analysis/profiling/profile_normal.py new file mode 100644 index 0000000..ddec583 --- /dev/null +++ b/data_loaders/ref-genome-analysis/profiling/profile_normal.py @@ -0,0 +1,34 @@ +"""Profile RefgetStore on a normal-sized genome (GCA_964264875.2, 585 seqs). +Compare timing with old code (17.7s pipeline time).""" +import time +import resource + +def peak_mb(): + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + +def print_mem(label): + print(f"[MEM] {label}: Peak={peak_mb():.1f} MB", flush=True) + +print_mem("startup") + +from gtars.refget import RefgetStore +print_mem("after import") + +BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" +STORE_PATH = f"{BRICK_ROOT}/refget_store" +FASTA = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964264875.2.fa.gz" + +t0 = time.time() +store = RefgetStore.on_disk(STORE_PATH) +print_mem(f"after open_local ({time.time()-t0:.1f}s)") + +print(f"\nProcessing: {FASTA}") +t0 = time.time() +meta, was_new = store.add_sequence_collection_from_fasta(FASTA) +elapsed = time.time() - t0 + +print(f"Result: {'NEW' if was_new else 'SKIPPED'}") +print(f"Digest: {meta.digest}") +print(f"Sequences: {meta.n_sequences}") +print(f"Time: {elapsed:.1f}s (old code: 17.7s pipeline / 27.4s total)") +print_mem("after processing") diff --git a/data_loaders/ref-genome-analysis/profiling/profile_normal.sbatch b/data_loaders/ref-genome-analysis/profiling/profile_normal.sbatch new file mode 100644 index 0000000..a6e7b88 --- /dev/null +++ b/data_loaders/ref-genome-analysis/profiling/profile_normal.sbatch @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name=profile_normal +#SBATCH --output=profile_normal_%j.log +#SBATCH --error=profile_normal_%j.log +#SBATCH --partition=standard +#SBATCH --time=0:30:00 +#SBATCH --mem=8G +#SBATCH --cpus-per-task=4 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget/data_loaders/ref-genome-analysis/profiling + +python profile_normal.py diff --git a/data_loaders/ref-genome-analysis/verify_refgetstore.py b/data_loaders/ref-genome-analysis/verify/verify_refgetstore.py similarity index 97% rename from data_loaders/ref-genome-analysis/verify_refgetstore.py rename to data_loaders/ref-genome-analysis/verify/verify_refgetstore.py index fc53a59..e054393 100644 --- a/data_loaders/ref-genome-analysis/verify_refgetstore.py +++ b/data_loaders/ref-genome-analysis/verify/verify_refgetstore.py @@ -26,9 +26,10 @@ import tempfile import time -STORE_PATH = "/project/shefflab/brickyard/refget_store" -INVENTORY_CSV = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refgenomes_inventory.csv" -DIGEST_MAP_CSV = "/home/nsheff/Dropbox/workspaces/refgenie/repos/refget/data_loaders/ref-genome-analysis/digest_map.csv" +BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" +STORE_PATH = f"{BRICK_ROOT}/refget_store" +INVENTORY_CSV = f"{BRICK_ROOT}/refgenomes_inventory.csv" +DIGEST_MAP_CSV = f"{BRICK_ROOT}/refget_staging/digest_map.csv" results = [] @@ -75,7 +76,7 @@ def check_store_opens(store_path): # Count collections and sequences try: - collections = list(store.list_collections()) + collections = 
list(store.list_collections()["results"]) n_collections = len(collections) except Exception as e: check("list_collections", False, f"error={e}") @@ -130,7 +131,7 @@ def check_digest_map(store, digest_map_path): ) # Get store collection digests for comparison - store_digests = {meta.digest for meta in store.list_collections()} + store_digests = {meta.digest for meta in store.list_collections()["results"]} # Check how many digest_map digests are in the store matched = 0 @@ -155,7 +156,7 @@ def check_digest_map(store, digest_map_path): def check_level2_integrity(store, n_to_check=3): """Verify level2 data for a sample of collections.""" - collections = list(store.list_collections()) + collections = list(store.list_collections()["results"]) if not collections: check("level2_integrity", False, "no collections to check") return @@ -232,7 +233,7 @@ def check_roundtrip_export(store, store_path, digest_map_path, inventory_path, l return # Pick a sample of collections that have original files - collections = list(store.list_collections()) + collections = list(store.list_collections()["results"]) test_pairs = [] for meta in collections: if meta.digest in digest_to_original: diff --git a/data_loaders/riva_pangenome_analysis/update-gtars.sh b/data_loaders/riva_pangenome_analysis/update-gtars.sh new file mode 100644 index 0000000..44d22c5 --- /dev/null +++ b/data_loaders/riva_pangenome_analysis/update-gtars.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Reinstall gtars and refget on rivanna + +ssh riva 'bash --login -s' << 'EOF' +set -e +source /etc/profile.d/modules.sh +module load miniforge/24.3.0-py3.11 + +# Build gtars (refget module only) +cd ~/code/gtars +git checkout refgetstore +git pull +cd gtars-python +rm -f ../target/wheels/gtars-*.whl +maturin build --release --no-default-features --features refget +pip install ../target/wheels/gtars-*.whl --force-reinstall --no-deps + +# Install local refget +cd ~/code/refget +git checkout dev +git pull +python -m pip install -e . + +echo "Done!" 
+EOF From e09ccdd3222d5fc52d27e26a2990d287e885c89f Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 16 Mar 2026 22:20:41 -0400 Subject: [PATCH 21/31] build up store-backed seqcolapi --- .github/workflows/deploy_store.yml | 56 ++++++++++++ deployment/seqcolapi-store/Dockerfile | 5 ++ deployment/seqcolapi-store/production.env | 2 + deployment/seqcolapi-store/task_def.json | 101 ++++++++++++++++++++++ refget-r/R/getSeq-methods.R | 42 +++++++-- refget/middleware.py | 59 +++++++++++++ refget/router.py | 31 +++++-- seqcolapi/main.py | 81 ++++++++++++----- 8 files changed, 339 insertions(+), 38 deletions(-) create mode 100644 .github/workflows/deploy_store.yml create mode 100644 deployment/seqcolapi-store/Dockerfile create mode 100644 deployment/seqcolapi-store/production.env create mode 100644 deployment/seqcolapi-store/task_def.json create mode 100644 refget/middleware.py diff --git a/.github/workflows/deploy_store.yml b/.github/workflows/deploy_store.yml new file mode 100644 index 0000000..6a88016 --- /dev/null +++ b/.github/workflows/deploy_store.yml @@ -0,0 +1,56 @@ +on: + workflow_dispatch: + inputs: null + workflow_run: + workflows: ["Deploy to Dockerhub on release"] + types: + - completed + +name: Deploy store-backed seqcolapi to Amazon ECS + +jobs: + deploy: + name: Deploy + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v1 + + - name: Build, tag, and push image to Amazon ECR + id: build-image + env: + ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + ECR_REPOSITORY: seqcolapi-store + IMAGE_TAG: ${{ github.sha }} + run: | + cd deployment/seqcolapi-store/ + docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -f Dockerfile . 
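+          # Push the freshly built image and expose its full tag for the
+          # task-definition render step below.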
+ docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG + echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" + + - name: Fill in the new image ID in the Amazon ECS task definition + id: task-def + uses: aws-actions/amazon-ecs-render-task-definition@v1 + with: + task-definition: deployment/seqcolapi-store/task_def.json + container-name: seqcolapi-store + image: ${{ steps.build-image.outputs.image }} + + - name: Deploy Amazon ECS task definition + uses: aws-actions/amazon-ecs-deploy-task-definition@v1 + with: + task-definition: ${{ steps.task-def.outputs.task-definition }} + service: seqcolapi-store-service + cluster: yeti + wait-for-service-stability: true diff --git a/deployment/seqcolapi-store/Dockerfile b/deployment/seqcolapi-store/Dockerfile new file mode 100644 index 0000000..e138239 --- /dev/null +++ b/deployment/seqcolapi-store/Dockerfile @@ -0,0 +1,5 @@ +FROM tiangolo/uvicorn-gunicorn:python3.11-slim +LABEL authors="Nathan Sheffield" +RUN pip install https://github.com/refgenie/refget/archive/dev.zip +RUN pip install gtars +CMD ["uvicorn", "seqcolapi.main:store_app", "--host", "0.0.0.0", "--port", "80"] diff --git a/deployment/seqcolapi-store/production.env b/deployment/seqcolapi-store/production.env new file mode 100644 index 0000000..fa32c22 --- /dev/null +++ b/deployment/seqcolapi-store/production.env @@ -0,0 +1,2 @@ +export REFGET_STORE_URL="s3://seqcolapi-store/refget/" +export SERVER_ENV="production" diff --git a/deployment/seqcolapi-store/task_def.json b/deployment/seqcolapi-store/task_def.json new file mode 100644 index 0000000..4a3d22a --- /dev/null +++ b/deployment/seqcolapi-store/task_def.json @@ -0,0 +1,101 @@ +{ + "ipcMode": null, + "executionRoleArn": "arn:aws:iam::235728444054:role/ecsTaskExecutionRole", + "containerDefinitions": [ + { + "dnsSearchDomains": null, + "environmentFiles": null, + "logConfiguration": null, + "entryPoint": null, + "portMappings": [ + { + "hostPort": 8106, + "protocol": "tcp", + "containerPort": 80 + } + ], + "command": null, + "linuxParameters": null, + "cpu": 0, + "environment": [ + { + "name": "REFGET_STORE_URL", + "value": "s3://seqcolapi-store/refget/" + } + ], + "resourceRequirements": null, + "ulimits": null, + "dnsServers": null, + "mountPoints": [], + "workingDirectory": null, + "secrets": [], + "dockerSecurityOptions": null, + "memory": 2048, + "memoryReservation": 512, + "volumesFrom": [], + "stopTimeout": null, + "image": "235728444054.dkr.ecr.us-east-1.amazonaws.com/my-ecr-repo:latest", + "startTimeout": null, + "firelensConfiguration": null, + "dependsOn": null, + "disableNetworking": null, + "interactive": null, + "healthCheck": null, + "essential": true, + "links": null, + "hostname": null, + "extraHosts": null, + "pseudoTerminal": null, + "user": null, + "readonlyRootFilesystem": null, + "dockerLabels": null, + "systemControls": null, + "privileged": null, + "name": "seqcolapi-store" + } + ], + "placementConstraints": [], + "memory": null, + "taskRoleArn": "ecsTaskExecutionRole", + "compatibilities": [ + "EC2" + ], + "family": "seqcolapi-store-task", + "requiresAttributes": [ + { + "targetId": null, + "targetType": null, + "value": null, + "name": "com.amazonaws.ecs.capability.ecr-auth" + }, + { + "targetId": null, + "targetType": null, + "value": null, + "name": "com.amazonaws.ecs.capability.docker-remote-api.1.21" + }, + { + "targetId": null, + "targetType": null, + "value": null, + "name": "com.amazonaws.ecs.capability.task-iam-role" + }, + { + "targetId": null, + "targetType": null, + "value": null, 
+ "name": "ecs.capability.execution-role-ecr-pull" + } + ], + "pidMode": null, + "requiresCompatibilities": [ + "EC2" + ], + "networkMode": "bridge", + "cpu": "128", + "revision": 1, + "status": "ACTIVE", + "inferenceAccelerators": null, + "proxyConfiguration": null, + "volumes": [] +} diff --git a/refget-r/R/getSeq-methods.R b/refget-r/R/getSeq-methods.R index 7b0f9f2..0cab648 100644 --- a/refget-r/R/getSeq-methods.R +++ b/refget-r/R/getSeq-methods.R @@ -113,17 +113,43 @@ setMethod("getSeq", "RefgetGenome", stop("Length mismatch: names, start, end, and strand must have compatible lengths") } - # Extract each sequence - seqs <- vapply(seq_len(n), function(i) { - .getSeq_single(genome, names[i], start[i], end[i], strand[i], as.character = TRUE) - }, character(1)) + # Use bulk BED extraction if all regions have coordinates + if (!any(is.na(start)) && !any(is.na(end))) { + # Write temp BED file (convert 1-based closed to 0-based half-open) + bed_file <- tempfile(fileext = ".bed") + on.exit(unlink(bed_file), add = TRUE) + + bed_df <- data.frame( + chrom = names, + start = as.integer(start - 1L), + end = as.integer(end) + ) + write.table(bed_df, bed_file, sep = "\t", row.names = FALSE, + col.names = FALSE, quote = FALSE) + + # Single Rust FFI call for all regions + retrieved <- gtars::get_seqs_bed_file_to_vec( + genome@store, genome@collection_digest, bed_file + ) + + seqs <- vapply(retrieved, function(r) r@sequence, character(1)) + + # Handle negative strand + minus_idx <- which(strand == "-") + if (length(minus_idx) > 0) { + seqs[minus_idx] <- vapply(seqs[minus_idx], .reverse_complement, character(1)) + } - # Name the results - if (all(is.na(start)) || all(is.na(end))) { - result_names <- names - } else { result_names <- sprintf("%s:%d-%d", names, start, end) + } else { + # Fallback: full chromosome extraction (no coordinates) + seqs <- vapply(seq_len(n), function(i) { + .getSeq_single(genome, names[i], start[i], end[i], strand[i], as.character = TRUE) + }, character(1)) + + result_names <- names } + names(seqs) <- result_names # Convert to DNAStringSet if requested diff --git a/refget/middleware.py b/refget/middleware.py new file mode 100644 index 0000000..a32d37d --- /dev/null +++ b/refget/middleware.py @@ -0,0 +1,59 @@ +""" +Middleware for store-backed seqcolapi deployments. + +StoreFreshnessMiddleware periodically checks if the remote store has changed +(via rgstore.json digest) and reloads the backend when new data is available. +""" + +import json +import logging +import time +import urllib.request + +from starlette.middleware.base import BaseHTTPMiddleware + +_LOGGER = logging.getLogger(__name__) + + +class StoreFreshnessMiddleware(BaseHTTPMiddleware): + """On each request, if >N seconds since last check, fetch rgstore.json + and compare collections_digest. If changed, re-open the store and + swap the backend. 
Lazy, request-triggered, no background threads.""" + + def __init__(self, app, store_url: str, cache_dir: str, check_interval: int = 300): + super().__init__(app) + self.store_url = store_url + self.cache_dir = cache_dir + self.check_interval = check_interval + self.last_check = time.time() + self.last_digest = None + + async def dispatch(self, request, call_next): + now = time.time() + if now - self.last_check > self.check_interval: + self.last_check = now + self._check_and_reload(request.app) + return await call_next(request) + + def _check_and_reload(self, app): + try: + metadata = self._fetch_metadata() + digest = metadata.get("collections_digest") + if digest and digest != self.last_digest: + self.last_digest = digest + self._reload_backend(app) + except Exception as e: + _LOGGER.warning(f"Store freshness check failed: {e}") + + def _fetch_metadata(self) -> dict: + url = self.store_url.rstrip("/") + "/rgstore.json" + with urllib.request.urlopen(url) as resp: + return json.loads(resp.read()) + + def _reload_backend(self, app): + from refget.backend import RefgetStoreBackend + from refget.store import RefgetStore + + _LOGGER.info(f"Store changed, reloading from {self.store_url}") + store = RefgetStore.open_remote(self.cache_dir, self.store_url) + app.state.backend = RefgetStoreBackend(store.into_readonly()) diff --git a/refget/router.py b/refget/router.py index a2af04d..815c1e3 100644 --- a/refget/router.py +++ b/refget/router.py @@ -6,17 +6,14 @@ This router does not supply the /service-info endpoint, which should be created by the main app. -To use, first import it, then attach it to the app, -then create a backend object and attach it to the app state like this: +To use, import the router and setup_backend, then wire them up: -from refget.router import create_refget_router -from refget.agents import RefgetDBAgent +from refget.router import create_refget_router, setup_backend router = create_refget_router(sequences=False, collections=True, pangenomes=False) app.include_router(router, prefix="/seqcol") -dbagent = RefgetDBAgent() -app.state.backend = dbagent # RefgetDBAgent satisfies SeqColBackend -app.state.dbagent = dbagent # For DB-only endpoints (similarities, pangenomes, DRS) +setup_backend(app, store=my_store) # RefgetStore backend (no database) +# OR: setup_backend(app, engine=engine) # PostgreSQL via RefgetDBAgent """ import logging @@ -37,6 +34,26 @@ _ROUTER_CONFIG: dict = {} +def setup_backend(app, store=None, engine=None): + """Configure the seqcol backend on a FastAPI app. + + Pass a RefgetStore to serve from the store (default, no database needed). + Pass a SQLAlchemy engine to serve from PostgreSQL via RefgetDBAgent. 
+ """ + if store is not None: + from .backend import RefgetStoreBackend + + app.state.backend = RefgetStoreBackend(store.into_readonly()) + elif engine is not None: + from .agents import RefgetDBAgent + + dbagent = RefgetDBAgent(engine=engine) + app.state.dbagent = dbagent + app.state.backend = dbagent + else: + raise ValueError("setup_backend requires either store or engine") + + async def get_backend(request: Request) -> SeqColBackend: """Get the SeqColBackend from the app state.""" return request.app.state.backend diff --git a/seqcolapi/main.py b/seqcolapi/main.py index c21c995..04b302f 100644 --- a/seqcolapi/main.py +++ b/seqcolapi/main.py @@ -11,7 +11,7 @@ from refget.agents import RefgetDBAgent from refget.const import HUMANS_SAMPLE_LIST, MOUSE_SAMPLES_LIST from refget.models import HumanReadableNames -from refget.router import _ROUTER_CONFIG, _SAMPLE_DIGESTS, create_refget_router +from refget.router import _ROUTER_CONFIG, _SAMPLE_DIGESTS, create_refget_router, setup_backend from .const import ALL_VERSIONS, STATIC_DIRNAME, STATIC_PATH from .examples import * @@ -30,10 +30,8 @@ async def lifespan_loader(app): """ _LOGGER.info("Starting lifespan: Loading sample data...") - # Initialize database agent and store in app state - dbagent = RefgetDBAgent() - app.state.dbagent = dbagent - app.state.backend = dbagent # RefgetDBAgent satisfies SeqColBackend + # Initialize backend via setup_backend + setup_backend(app, engine=RefgetDBAgent().engine) species_samples = {"human": HUMANS_SAMPLE_LIST, "mouse": MOUSE_SAMPLES_LIST} @@ -41,7 +39,7 @@ async def lifespan_loader(app): try: _LOGGER.info(f"Loading {len(sample_names)} sample names for {species}") - with Session(dbagent.engine) as session: + with Session(app.state.dbagent.engine) as session: statement = select(HumanReadableNames).where( HumanReadableNames.human_readable_name.in_(sample_names) ) @@ -143,7 +141,7 @@ async def index(request: Request): async def service_info(): # Build seqcol capabilities object seqcol_info = { - "schema": dbagent.schema_dict, + "schema": getattr(app.state.dbagent, "schema_dict", None) if hasattr(app.state, "dbagent") else None, "sorted_name_length_pairs": True, "fasta_drs": {"enabled": _ROUTER_CONFIG.get("fasta_drs", False)}, } @@ -182,15 +180,6 @@ async def service_info(): app.mount("/", StaticFiles(directory=STATIC_PATH), name=STATIC_DIRNAME) -def create_global_dbagent(): - """ - Create a global database agent for use in the app. - """ - global dbagent - dbagent = RefgetDBAgent() # Configured via env vars - return dbagent - - def create_store_app(store_path: str, remote: bool = False, cache_dir: str = "/tmp/seqcol_cache"): """Create a seqcolapi FastAPI app backed by a RefgetStore (no database). @@ -202,7 +191,6 @@ def create_store_app(store_path: str, remote: bool = False, cache_dir: str = "/t Returns: FastAPI app with store-backed seqcol endpoints. 
""" - from refget.backend import RefgetStoreBackend from refget.store import RefgetStore if remote: @@ -210,18 +198,65 @@ def create_store_app(store_path: str, remote: bool = False, cache_dir: str = "/t else: store = RefgetStore.on_disk(store_path) - backend = RefgetStoreBackend(store.into_readonly()) + store_app = FastAPI( + title="Sequence Collections API (Store-backed)", + version=ALL_VERSIONS["refget_version"], + ) + + store_app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) - store_app = FastAPI(title="Sequence Collections API (Store-backed)") - store_app.state.backend = backend + setup_backend(store_app, store=store) router = create_refget_router( sequences=False, pangenomes=False, refget_store_url=store_path if remote else None ) store_app.include_router(router) + + if remote: + from refget.middleware import StoreFreshnessMiddleware + + store_app.add_middleware( + StoreFreshnessMiddleware, + store_url=store_path, + cache_dir=cache_dir, + ) + + @store_app.get("/service-info", summary="GA4GH service info", tags=["General endpoints"]) + async def store_service_info(): + backend = getattr(store_app.state, "backend", None) + caps = backend.capabilities() if backend and hasattr(backend, "capabilities") else {} + return { + "id": "org.databio.seqcolapi.store", + "name": "Sequence collections (store-backed)", + "type": { + "group": "org.ga4gh", + "artifact": "refget-seqcol", + "version": ALL_VERSIONS["seqcol_spec_version"], + }, + "description": "Store-backed API providing metadata for collections of reference sequences", + "organization": {"name": "Databio Lab", "url": "https://databio.org"}, + "contactUrl": "https://github.com/refgenie/refget/issues", + "version": ALL_VERSIONS, + "seqcol": { + "refget_store": {"enabled": True, "url": store_path, **caps}, + }, + } + return store_app +import os + +_STORE_URL_ENV = os.environ.get("REFGET_STORE_URL") + +if _STORE_URL_ENV: + store_app = create_store_app(_STORE_URL_ENV, remote=True) + + if __name__ != "__main__": - _dbagent = create_global_dbagent() - app.state.dbagent = _dbagent - app.state.backend = _dbagent # RefgetDBAgent satisfies SeqColBackend + setup_backend(app, engine=RefgetDBAgent().engine) From be71c4e2c4898ef8a4dee7cfb422f28bc1d0e5b9 Mon Sep 17 00:00:00 2001 From: nsheff Date: Tue, 17 Mar 2026 08:06:42 -0400 Subject: [PATCH 22/31] restructure data loading --- data_loaders/ref-genome-analysis/.gitignore | 3 + data_loaders/ref-genome-analysis/CLAUDE.md | 49 ++++++ data_loaders/ref-genome-analysis/README.md | 70 +++++---- .../ref-genome-analysis/env/deploy-deps.sh | 32 ++++ .../ref-genome-analysis/env/mutagen-setup.sh | 55 +++++++ .../ref-genome-analysis/env/on-cluster.env | 6 + .../ref-genome-analysis/env/remote-hpc.env | 9 ++ .../01_inventory}/inventory_genomes.py | 4 +- .../02_aliases}/build_ncbi_alias_table.py | 6 +- .../02_aliases}/register_aliases.sbatch | 5 +- .../02_aliases}/register_ncbi_aliases.py | 9 +- .../src/02_build/build_digest_map.py | 135 ++++++++++++++++ .../src/02_build/build_digest_map.sbatch | 16 ++ .../{fhr => src/03_fhr}/batch_generate_fhr.py | 0 .../{fhr => src/03_fhr}/genomeark_to_fhr.py | 0 .../{fhr => src/03_fhr}/load_fhr_metadata.py | 0 .../metadata/GCA_000001405.29.fhr.json | 0 .../03_fhr}/metadata/GCA_000001405.fhr.json | 0 .../03_fhr}/metadata/GCA_964261635.1.fhr.json | 0 .../03_fhr}/metadata/GCA_964263255.1.fhr.json | 0 .../04_verify}/verify_refgetstore.py | 14 +- .../05_profiling}/profile_all.py | 15 +- 
.../05_profiling}/profile_all.sbatch | 5 +- .../05_profiling}/profile_batch.py | 11 +- .../05_profiling}/profile_memory.py | 6 +- .../05_profiling}/profile_memory.sbatch | 5 +- .../05_profiling}/profile_newt.py | 6 +- .../05_profiling}/profile_newt.sbatch | 5 +- .../05_profiling}/profile_normal.py | 7 +- .../05_profiling}/profile_normal.sbatch | 5 +- .../ref-genome-analysis/src/90_split_store.py | 145 ++++++++++++++++++ .../src/90_split_store.sbatch | 16 ++ .../{ => src}/examples/test_20_genomes.py | 9 +- .../riva_pangenome_analysis/update-gtars.sh | 2 +- 34 files changed, 565 insertions(+), 85 deletions(-) create mode 100644 data_loaders/ref-genome-analysis/.gitignore create mode 100644 data_loaders/ref-genome-analysis/CLAUDE.md create mode 100644 data_loaders/ref-genome-analysis/env/deploy-deps.sh create mode 100755 data_loaders/ref-genome-analysis/env/mutagen-setup.sh create mode 100644 data_loaders/ref-genome-analysis/env/on-cluster.env create mode 100644 data_loaders/ref-genome-analysis/env/remote-hpc.env rename data_loaders/ref-genome-analysis/{inventory => src/01_inventory}/inventory_genomes.py (97%) rename data_loaders/ref-genome-analysis/{aliases => src/02_aliases}/build_ncbi_alias_table.py (98%) rename data_loaders/ref-genome-analysis/{aliases => src/02_aliases}/register_aliases.sbatch (53%) rename data_loaders/ref-genome-analysis/{aliases => src/02_aliases}/register_ncbi_aliases.py (96%) create mode 100644 data_loaders/ref-genome-analysis/src/02_build/build_digest_map.py create mode 100644 data_loaders/ref-genome-analysis/src/02_build/build_digest_map.sbatch rename data_loaders/ref-genome-analysis/{fhr => src/03_fhr}/batch_generate_fhr.py (100%) rename data_loaders/ref-genome-analysis/{fhr => src/03_fhr}/genomeark_to_fhr.py (100%) rename data_loaders/ref-genome-analysis/{fhr => src/03_fhr}/load_fhr_metadata.py (100%) rename data_loaders/ref-genome-analysis/{fhr => src/03_fhr}/metadata/GCA_000001405.29.fhr.json (100%) rename data_loaders/ref-genome-analysis/{fhr => src/03_fhr}/metadata/GCA_000001405.fhr.json (100%) rename data_loaders/ref-genome-analysis/{fhr => src/03_fhr}/metadata/GCA_964261635.1.fhr.json (100%) rename data_loaders/ref-genome-analysis/{fhr => src/03_fhr}/metadata/GCA_964263255.1.fhr.json (100%) rename data_loaders/ref-genome-analysis/{verify => src/04_verify}/verify_refgetstore.py (97%) rename data_loaders/ref-genome-analysis/{profiling => src/05_profiling}/profile_all.py (61%) rename data_loaders/ref-genome-analysis/{profiling => src/05_profiling}/profile_all.sbatch (66%) rename data_loaders/ref-genome-analysis/{profiling => src/05_profiling}/profile_batch.py (60%) rename data_loaders/ref-genome-analysis/{profiling => src/05_profiling}/profile_memory.py (91%) rename data_loaders/ref-genome-analysis/{profiling => src/05_profiling}/profile_memory.sbatch (64%) rename data_loaders/ref-genome-analysis/{profiling => src/05_profiling}/profile_newt.py (86%) rename data_loaders/ref-genome-analysis/{profiling => src/05_profiling}/profile_newt.sbatch (66%) rename data_loaders/ref-genome-analysis/{profiling => src/05_profiling}/profile_normal.py (79%) rename data_loaders/ref-genome-analysis/{profiling => src/05_profiling}/profile_normal.sbatch (66%) create mode 100644 data_loaders/ref-genome-analysis/src/90_split_store.py create mode 100644 data_loaders/ref-genome-analysis/src/90_split_store.sbatch rename data_loaders/ref-genome-analysis/{ => src}/examples/test_20_genomes.py (94%) diff --git a/data_loaders/ref-genome-analysis/.gitignore 
b/data_loaders/ref-genome-analysis/.gitignore new file mode 100644 index 0000000..ff7f203 --- /dev/null +++ b/data_loaders/ref-genome-analysis/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +*.pyc +*.log diff --git a/data_loaders/ref-genome-analysis/CLAUDE.md b/data_loaders/ref-genome-analysis/CLAUDE.md new file mode 100644 index 0000000..cbe896b --- /dev/null +++ b/data_loaders/ref-genome-analysis/CLAUDE.md @@ -0,0 +1,49 @@ +# ref-genome-analysis + +Pipeline for building a RefgetStore from reference genome FASTA files. Inventories genomes from the brickyard, loads them into a refget store, registers NCBI aliases, generates FAIR Header Representation (FHR) metadata, and verifies the result. + +## Setup + +Source the environment for your compute target: +- HPC (from laptop): `source env/remote-hpc.env` +- HPC (direct): `source env/on-cluster.env` + +To start mutagen sync: `./env/mutagen-setup.sh` + +## Pipeline Phases + +Execute in order: + +1. **01_inventory** -- Scan brickyard, generate CSV inventory of all FASTA files + - `python src/01_inventory/inventory_genomes.py` + +2. **02_aliases** -- Download NCBI assembly reports, build alias table, register in store + - Phase A: `python src/02_aliases/build_ncbi_alias_table.py` (downloads from NCBI, slow) + - Phase B: `sbatch src/02_aliases/register_aliases.sbatch` + +3. **03_fhr** -- Generate FAIR Header Representation metadata, load into store + - `python src/03_fhr/batch_generate_fhr.py --inventory $INVENTORY_CSV --output-dir $STAGING/fhr_metadata` + - `python src/03_fhr/load_fhr_metadata.py --store-path $STORE_PATH --fhr-dir $STAGING/fhr_metadata` + +4. **04_verify** -- Validate store integrity + - `python src/04_verify/verify_refgetstore.py` + +## Key Environment Variables + +- `BRICK_ROOT` -- Root of the refgenomes_fasta brick +- `STORE_PATH` -- Path to the RefgetStore database +- `STAGING` -- Staging area for intermediates (assembly reports, alias tables, FHR JSON) +- `INVENTORY_CSV` -- Path to the genome inventory CSV + +## Dependencies + +- Python 3.11+ (via `module load miniforge/24.3.0-py3.11` on Rivanna) +- `refget` or `gtars` Python package (for RefgetStore) +- Internet access for NCBI API calls (phases 2 and 3) + +## Notes + +- All phases are resumable -- cached downloads, idempotent store operations +- Phase 2A rate-limits NCBI requests (0.3s between calls) +- `src/05_profiling/` contains memory/timing benchmarks (not part of the main pipeline) +- `src/examples/` contains a 20-genome integration test diff --git a/data_loaders/ref-genome-analysis/README.md b/data_loaders/ref-genome-analysis/README.md index 6f71ee6..1ccf999 100644 --- a/data_loaders/ref-genome-analysis/README.md +++ b/data_loaders/ref-genome-analysis/README.md @@ -2,6 +2,14 @@ Pipeline for loading reference genome FASTA files into a RefgetStore and enriching them with NCBI aliases and FHR provenance metadata. 
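For orientation, here is a minimal, illustrative sketch of the core loading step that the build and alias stages automate for each inventory entry. It is not one of the pipeline scripts: it assumes the RefgetStore API used by `data_loaders/demo_build_store.py` (`on_disk`, `add_sequence_collection_from_fasta`, `add_collection_alias`), and the FASTA path and alias value are placeholders.

```python
# Illustrative sketch only -- not a pipeline script.
# Assumes the RefgetStore API used in data_loaders/demo_build_store.py.
import os
from refget.store import RefgetStore

store = RefgetStore.on_disk(os.environ["STORE_PATH"])

fasta = "example.fa.gz"  # placeholder; the pipeline iterates over $INVENTORY_CSV rows
result = store.add_sequence_collection_from_fasta(fasta)
meta = result[0] if isinstance(result, tuple) else result  # handle tuple or bare metadata
store.add_collection_alias("fasta_filename", "example", meta.digest)  # placeholder alias
print(f"{fasta} -> {meta.digest}")
```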
+## Setup + +```bash +source env/on-cluster.env # on Rivanna directly +source env/remote-hpc.env # from laptop, targeting Rivanna +./env/mutagen-setup.sh # start file sync (laptop only) +``` + ## Pipeline stages Execute in order: @@ -10,51 +18,45 @@ Execute in order: inventory --> build --> aliases --> fhr --> verify ``` -| Stage | Directory | Purpose | +| Stage | Location | Purpose | |---|---|---| -| **inventory** | `inventory/` | Scan brickyard FASTA files, produce `refgenomes_inventory.csv` | -| **build** | `build/` | Load FASTAs into RefgetStore, produce `digest_map.csv` | -| **aliases** | `aliases/` | Download NCBI assembly reports, build alias table, register sequence/collection aliases | -| **fhr** | `fhr/` | Generate and attach FHR provenance metadata (species, taxon, accession, submitter, etc.) | -| **verify** | `verify/` | Automated pass/fail checks against the store | -| **profiling** | `profiling/` | Memory and timing benchmarks | -| **examples** | `examples/` | End-to-end test scripts (e.g., load 20 genomes with FHR) | - -## Rivanna paths - -All data lives within the `refgenomes_fasta` brickyard brick: - -``` -/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/ -├── homo_sapiens/... # Source FASTAs -├── mus_musculus/... -├── refgenomes_inventory.csv # Inventory of all FASTAs -├── refget_store/ # The RefgetStore (fixed-format, don't modify manually) -└── refget_staging/ # Pipeline intermediates - ├── assembly_reports/ # Downloaded NCBI assembly_report.txt files - ├── ncbi_alias_table.csv # Parsed alias table (367K sequence rows) - ├── fhr_metadata/ # Generated FHR provenance JSON files - └── digest_map.csv # Build output mapping FASTAs to digests -``` - -- **Store**: `.../refgenomes_fasta/refget_store` -- **Staging**: `.../refgenomes_fasta/refget_staging` -- **This pipeline**: `.../refgenomes_fasta/refget/data_loaders/ref-genome-analysis/` +| **inventory** | `src/01_inventory/` | Scan brickyard FASTA files, produce `refgenomes_inventory.csv` | +| **build** | `src/02_build/` | Compute seqcol digests for all FASTAs, produce `digest_map.csv` | +| **aliases** | `src/02_aliases/` | Download NCBI assembly reports, build alias table, register sequence/collection aliases | +| **fhr** | `src/03_fhr/` | Generate and attach FHR provenance metadata (species, taxon, accession, submitter, etc.) | +| **verify** | `src/04_verify/` | Automated pass/fail checks against the store | +| **profiling** | `src/05_profiling/` | Memory and timing benchmarks | +| **split** | `src/90_split_store.py` | Split combined store into VGP and reference genome stores | +| **examples** | `src/examples/` | End-to-end test scripts (e.g., load 20 genomes with FHR) | + +## Environment variables + +All paths come from environment variables set by sourcing an env file. No hardcoded paths in scripts. + +| Variable | Purpose | +|---|---| +| `BRICKYARD` | Lab-wide brickyard root | +| `BRICK_ROOT` | This project's brick (`$BRICKYARD/datasets_downloaded/refgenomes_fasta`) | +| `STORE_PATH` | The RefgetStore database | +| `STAGING` | Pipeline intermediates (assembly reports, alias tables, FHR JSON) | +| `INVENTORY_CSV` | Inventory of all FASTAs | ## Quick start (Rivanna) ```bash +source env/on-cluster.env module load miniforge/24.3.0-py3.11 -# 1. Build store -sbatch build/build_refgetstore.sbatch +# 1. Inventory +python src/01_inventory/inventory_genomes.py # 2. Register NCBI aliases -sbatch aliases/register_aliases.sbatch +sbatch src/02_aliases/register_aliases.sbatch # 3. 
Attach FHR metadata -cd fhr && python load_fhr_metadata.py --store-path /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget_store --fhr-dir metadata/ +python src/03_fhr/batch_generate_fhr.py --inventory $INVENTORY_CSV --output-dir $STAGING/fhr_metadata +python src/03_fhr/load_fhr_metadata.py --store-path $STORE_PATH --fhr-dir $STAGING/fhr_metadata # 4. Verify -cd verify && python verify_refgetstore.py +python src/04_verify/verify_refgetstore.py ``` diff --git a/data_loaders/ref-genome-analysis/env/deploy-deps.sh b/data_loaders/ref-genome-analysis/env/deploy-deps.sh new file mode 100644 index 0000000..5052be2 --- /dev/null +++ b/data_loaders/ref-genome-analysis/env/deploy-deps.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# env/deploy-deps.sh — Build and install dependencies on Rivanna from mutagen-synced source +# +# Requires: DEPLOY_HOST and DEPLOY_DIR set in env file +# Requires: mutagen syncs running (via mutagen-setup.sh) + +if [ -z "$DEPLOY_HOST" ] || [ -z "$DEPLOY_DIR" ]; then + echo "DEPLOY_HOST and DEPLOY_DIR must be set. Source your env file first." + exit 1 +fi + +ssh "$DEPLOY_HOST" 'bash --login -s' << EOF +set -e +source /etc/profile.d/modules.sh +module load miniforge/24.3.0-py3.11 + +# Build gtars from synced source +cd ${DEPLOY_DIR}/gtars/gtars-python +rm -f ../target/wheels/gtars-*.whl +echo "Building gtars..." +maturin build --release --no-default-features --features refget +pip install ../target/wheels/gtars-*.whl --force-reinstall --no-deps +echo "gtars installed." + +# Install refget from synced source +cd ${DEPLOY_DIR}/refget +echo "Installing refget..." +python -m pip install -e . +echo "refget installed." + +echo "Done!" +EOF diff --git a/data_loaders/ref-genome-analysis/env/mutagen-setup.sh b/data_loaders/ref-genome-analysis/env/mutagen-setup.sh new file mode 100755 index 0000000..e17922f --- /dev/null +++ b/data_loaders/ref-genome-analysis/env/mutagen-setup.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# env/mutagen-setup.sh — Start mutagen sync for this project and its dependencies + +if [ -z "$SYNC_REMOTE" ]; then + echo "SYNC_REMOTE is not set. Set it in your env file to enable sync." + echo "Example: export SYNC_REMOTE=user@host:/path/to/project" + exit 0 +fi + +if [ -z "$PROJECT_NAME" ]; then + PROJECT_NAME=$(basename "$PWD") +fi + +# Sync the project itself +mutagen sync create \ + --name="${PROJECT_NAME}-pipeline" \ + --ignore=__pycache__ \ + --ignore="*.pyc" \ + --ignore="*.log" \ + --ignore=.git \ + . 
"$SYNC_REMOTE" + +echo "Sync started: ${PROJECT_NAME}-pipeline → $SYNC_REMOTE" + +# Sync dependencies for deployment +if [ -n "$DEPLOY_HOST" ] && [ -n "$DEPLOY_DIR" ]; then + # gtars — local source synced to remote deploy dir + GTARS_LOCAL="$HOME/Dropbox/workspaces/intervals/repos/gtars" + if [ -d "$GTARS_LOCAL" ]; then + mutagen sync create \ + --name="deploy-gtars" \ + --ignore=target \ + --ignore=__pycache__ \ + --ignore="*.pyc" \ + --ignore=.git \ + "$GTARS_LOCAL" "${DEPLOY_HOST}:${DEPLOY_DIR}/gtars" + echo "Sync started: deploy-gtars → ${DEPLOY_HOST}:${DEPLOY_DIR}/gtars" + else + echo "Warning: $GTARS_LOCAL not found, skipping gtars sync" + fi + + # refget — local source synced to remote deploy dir + REFGET_LOCAL="$HOME/Dropbox/workspaces/intervals/repos/refget" + if [ -d "$REFGET_LOCAL" ]; then + mutagen sync create \ + --name="deploy-refget" \ + --ignore=__pycache__ \ + --ignore="*.pyc" \ + --ignore=.git \ + "$REFGET_LOCAL" "${DEPLOY_HOST}:${DEPLOY_DIR}/refget" + echo "Sync started: deploy-refget → ${DEPLOY_HOST}:${DEPLOY_DIR}/refget" + else + echo "Warning: $REFGET_LOCAL not found, skipping refget sync" + fi +fi diff --git a/data_loaders/ref-genome-analysis/env/on-cluster.env b/data_loaders/ref-genome-analysis/env/on-cluster.env new file mode 100644 index 0000000..167b5e4 --- /dev/null +++ b/data_loaders/ref-genome-analysis/env/on-cluster.env @@ -0,0 +1,6 @@ +export PROJECT_NAME="ref-genome-analysis" +export BRICKYARD=/project/shefflab/brickyard +export BRICK_ROOT=$BRICKYARD/datasets_downloaded/refgenomes_fasta +export STORE_PATH=$BRICK_ROOT/refget_store +export STAGING=$BRICK_ROOT/refget_staging +export INVENTORY_CSV=$BRICK_ROOT/refgenomes_inventory.csv diff --git a/data_loaders/ref-genome-analysis/env/remote-hpc.env b/data_loaders/ref-genome-analysis/env/remote-hpc.env new file mode 100644 index 0000000..6504138 --- /dev/null +++ b/data_loaders/ref-genome-analysis/env/remote-hpc.env @@ -0,0 +1,9 @@ +export PROJECT_NAME="ref-genome-analysis" +export BRICKYARD=/project/shefflab/brickyard +export BRICK_ROOT=$BRICKYARD/datasets_downloaded/refgenomes_fasta +export STORE_PATH=$BRICK_ROOT/refget_store +export STAGING=$BRICK_ROOT/refget_staging +export INVENTORY_CSV=$BRICK_ROOT/refgenomes_inventory.csv +export SYNC_REMOTE=ns5bc@login.hpc.virginia.edu:/home/ns5bc/code/ref-genome-analysis +export DEPLOY_HOST=ns5bc@login.hpc.virginia.edu +export DEPLOY_DIR=/home/ns5bc/deploy diff --git a/data_loaders/ref-genome-analysis/inventory/inventory_genomes.py b/data_loaders/ref-genome-analysis/src/01_inventory/inventory_genomes.py similarity index 97% rename from data_loaders/ref-genome-analysis/inventory/inventory_genomes.py rename to data_loaders/ref-genome-analysis/src/01_inventory/inventory_genomes.py index d741601..b7881b8 100644 --- a/data_loaders/ref-genome-analysis/inventory/inventory_genomes.py +++ b/data_loaders/ref-genome-analysis/src/01_inventory/inventory_genomes.py @@ -25,9 +25,9 @@ import urllib.error import urllib.request -BRICKYARD_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" +BRICKYARD_ROOT = os.environ["BRICK_ROOT"] PEP_URL = "https://pephub-api.databio.org/api/v1/projects/donaldcampbelljr/human_mouse_fasta_brickyard/samples?tag=default" -OUTPUT_FILE = os.path.join(BRICKYARD_ROOT, "refgenomes_inventory.csv") +OUTPUT_FILE = os.environ.get("INVENTORY_CSV", os.path.join(BRICKYARD_ROOT, "refgenomes_inventory.csv")) FASTA_EXTENSIONS = {".fa", ".fa.gz", ".fna", ".fna.gz", ".fasta", ".fasta.gz"} ACCESSION_PATTERN = re.compile(r"(GC[AF]_\d+\.\d+)") diff --git 
a/data_loaders/ref-genome-analysis/aliases/build_ncbi_alias_table.py b/data_loaders/ref-genome-analysis/src/02_aliases/build_ncbi_alias_table.py similarity index 98% rename from data_loaders/ref-genome-analysis/aliases/build_ncbi_alias_table.py rename to data_loaders/ref-genome-analysis/src/02_aliases/build_ncbi_alias_table.py index 8c0d07c..4ceca3f 100644 --- a/data_loaders/ref-genome-analysis/aliases/build_ncbi_alias_table.py +++ b/data_loaders/ref-genome-analysis/src/02_aliases/build_ncbi_alias_table.py @@ -23,9 +23,9 @@ import urllib.error import urllib.request -BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" -INVENTORY_CSV = f"{BRICK_ROOT}/refgenomes_inventory.csv" -STAGING_DIR = f"{BRICK_ROOT}/refget_staging" +BRICK_ROOT = os.environ["BRICK_ROOT"] +INVENTORY_CSV = os.environ.get("INVENTORY_CSV", f"{BRICK_ROOT}/refgenomes_inventory.csv") +STAGING_DIR = os.environ.get("STAGING", f"{BRICK_ROOT}/refget_staging") ACCESSION_PATTERN = re.compile(r"(GC[AF]_\d+\.\d+)") NCBI_FTP_BASE = "https://ftp.ncbi.nlm.nih.gov/genomes/all" diff --git a/data_loaders/ref-genome-analysis/aliases/register_aliases.sbatch b/data_loaders/ref-genome-analysis/src/02_aliases/register_aliases.sbatch similarity index 53% rename from data_loaders/ref-genome-analysis/aliases/register_aliases.sbatch rename to data_loaders/ref-genome-analysis/src/02_aliases/register_aliases.sbatch index 10dfe48..dc50581 100644 --- a/data_loaders/ref-genome-analysis/aliases/register_aliases.sbatch +++ b/data_loaders/ref-genome-analysis/src/02_aliases/register_aliases.sbatch @@ -10,6 +10,7 @@ module load miniforge/24.3.0-py3.11 -cd /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget/data_loaders/ref-genome-analysis/aliases +cd /home/ns5bc/code/ref-genome-analysis +source env/on-cluster.env -python register_ncbi_aliases.py --store-path /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget_store +python src/02_aliases/register_ncbi_aliases.py diff --git a/data_loaders/ref-genome-analysis/aliases/register_ncbi_aliases.py b/data_loaders/ref-genome-analysis/src/02_aliases/register_ncbi_aliases.py similarity index 96% rename from data_loaders/ref-genome-analysis/aliases/register_ncbi_aliases.py rename to data_loaders/ref-genome-analysis/src/02_aliases/register_ncbi_aliases.py index c344d75..f3f545b 100644 --- a/data_loaders/ref-genome-analysis/aliases/register_ncbi_aliases.py +++ b/data_loaders/ref-genome-analysis/src/02_aliases/register_ncbi_aliases.py @@ -22,10 +22,11 @@ from refget.store import RefgetStore -BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" -STORE_PATH = f"{BRICK_ROOT}/refget_store" -INVENTORY_CSV = f"{BRICK_ROOT}/refgenomes_inventory.csv" -ALIAS_TABLE_CSV = f"{BRICK_ROOT}/refget_staging/ncbi_alias_table.csv" +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") +INVENTORY_CSV = os.environ.get("INVENTORY_CSV", f"{BRICK_ROOT}/refgenomes_inventory.csv") +STAGING = os.environ.get("STAGING", f"{BRICK_ROOT}/refget_staging") +ALIAS_TABLE_CSV = f"{STAGING}/ncbi_alias_table.csv" def parse_args(): diff --git a/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.py b/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.py new file mode 100644 index 0000000..f4c2487 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +""" +Build a complete digest_map.csv from the inventory CSV. 
+ +For each FASTA in the inventory, reads the seqcol digest from the .rgsi cache +file next to it (instant — just reads the first line). Falls back to computing +the digest with digest_fasta() if no .rgsi exists. + +Outputs: $STAGING/digest_map.csv with columns: + path, filename, digest, n_sequences, group + +Usage: + python src/02_build/build_digest_map.py + python src/02_build/build_digest_map.py --dry-run +""" + +import argparse +import csv +import os +import re +import sys +import time + +BRICK_ROOT = os.environ["BRICK_ROOT"] +STAGING = os.environ.get("STAGING", os.path.join(BRICK_ROOT, "refget_staging")) +INVENTORY_CSV = os.environ.get("INVENTORY_CSV", os.path.join(BRICK_ROOT, "refgenomes_inventory.csv")) +OUTPUT_CSV = os.path.join(STAGING, "digest_map.csv") + +# Pattern to strip FASTA extensions and get the RGSI path +FASTA_EXTS = re.compile(r'\.(fa|fasta|fna)(\.gz)?$') + + +def rgsi_path_for(fasta_path: str) -> str: + """Get the .rgsi cache path for a FASTA file.""" + return FASTA_EXTS.sub('.rgsi', fasta_path) + + +def read_rgsi_digest(rgsi_path: str) -> tuple[str, int] | None: + """Read seqcol digest and sequence count from an .rgsi file. + + Returns (digest, n_sequences) or None if file doesn't exist or is malformed. + """ + if not os.path.exists(rgsi_path): + return None + digest = None + n_sequences = 0 + with open(rgsi_path) as f: + for line in f: + if line.startswith("##seqcol_digest="): + digest = line.strip().split("=", 1)[1] + elif not line.startswith("#"): + n_sequences += 1 + if digest: + return digest, n_sequences + return None + + +def build_digest_map(inventory_path: str, output_path: str, dry_run: bool = False): + with open(inventory_path) as f: + rows = list(csv.DictReader(f)) + + total = len(rows) + print(f"Inventory: {total} FASTAs from {inventory_path}") + + if dry_run: + # Just count how many have .rgsi files + have_rgsi = sum(1 for r in rows if os.path.exists(rgsi_path_for(r["path"]))) + print(f"FASTAs with .rgsi cache: {have_rgsi}/{total}") + print("--dry-run: stopping here.") + return + + results = [] + from_cache = 0 + skipped = 0 + t0 = time.time() + + for i, row in enumerate(rows, 1): + fasta_path = row["path"] + group = row.get("group", "") + filename = row.get("filename", os.path.basename(fasta_path)) + + # Try .rgsi cache first + rgsi = rgsi_path_for(fasta_path) + cached = read_rgsi_digest(rgsi) + if cached: + digest, n_sequences = cached + from_cache += 1 + results.append({ + "path": fasta_path, + "filename": filename, + "digest": digest, + "n_sequences": n_sequences, + "group": group, + }) + print(f" [{i}/{total}] (cache) {group}/{filename} -> {digest}") + continue + + # No cache — skip (these FASTAs were never successfully loaded) + print(f" [{i}/{total}] NO CACHE: {group}/{filename}", file=sys.stderr) + skipped += 1 + + elapsed = time.time() - t0 + + # Write output + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=["path", "filename", "digest", "n_sequences", "group"]) + writer.writeheader() + writer.writerows(results) + + print(f"\nDone in {elapsed:.1f}s") + print(f" Written: {len(results)} entries to {output_path}") + print(f" From cache: {from_cache}") + print(f" No cache: {skipped}") + + # Summary by group + from collections import Counter + group_counts = Counter(r["group"] for r in results) + print(f"\nBy group:") + for group, count in sorted(group_counts.items(), key=lambda x: -x[1]): + print(f" {group}: {count}") + + +def main(): + parser = 
argparse.ArgumentParser(description="Build complete digest_map.csv from inventory.") + parser.add_argument("--inventory", default=INVENTORY_CSV) + parser.add_argument("--output", default=OUTPUT_CSV) + parser.add_argument("--dry-run", action="store_true") + args = parser.parse_args() + + build_digest_map(args.inventory, args.output, args.dry_run) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.sbatch b/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.sbatch new file mode 100644 index 0000000..d4c042b --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.sbatch @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=build_digest_map +#SBATCH --output=build_digest_map_%j.log +#SBATCH --error=build_digest_map_%j.log +#SBATCH --partition=standard +#SBATCH --time=4:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=1 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd /home/ns5bc/code/ref-genome-analysis +source env/on-cluster.env + +python src/02_build/build_digest_map.py diff --git a/data_loaders/ref-genome-analysis/fhr/batch_generate_fhr.py b/data_loaders/ref-genome-analysis/src/03_fhr/batch_generate_fhr.py similarity index 100% rename from data_loaders/ref-genome-analysis/fhr/batch_generate_fhr.py rename to data_loaders/ref-genome-analysis/src/03_fhr/batch_generate_fhr.py diff --git a/data_loaders/ref-genome-analysis/fhr/genomeark_to_fhr.py b/data_loaders/ref-genome-analysis/src/03_fhr/genomeark_to_fhr.py similarity index 100% rename from data_loaders/ref-genome-analysis/fhr/genomeark_to_fhr.py rename to data_loaders/ref-genome-analysis/src/03_fhr/genomeark_to_fhr.py diff --git a/data_loaders/ref-genome-analysis/fhr/load_fhr_metadata.py b/data_loaders/ref-genome-analysis/src/03_fhr/load_fhr_metadata.py similarity index 100% rename from data_loaders/ref-genome-analysis/fhr/load_fhr_metadata.py rename to data_loaders/ref-genome-analysis/src/03_fhr/load_fhr_metadata.py diff --git a/data_loaders/ref-genome-analysis/fhr/metadata/GCA_000001405.29.fhr.json b/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_000001405.29.fhr.json similarity index 100% rename from data_loaders/ref-genome-analysis/fhr/metadata/GCA_000001405.29.fhr.json rename to data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_000001405.29.fhr.json diff --git a/data_loaders/ref-genome-analysis/fhr/metadata/GCA_000001405.fhr.json b/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_000001405.fhr.json similarity index 100% rename from data_loaders/ref-genome-analysis/fhr/metadata/GCA_000001405.fhr.json rename to data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_000001405.fhr.json diff --git a/data_loaders/ref-genome-analysis/fhr/metadata/GCA_964261635.1.fhr.json b/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_964261635.1.fhr.json similarity index 100% rename from data_loaders/ref-genome-analysis/fhr/metadata/GCA_964261635.1.fhr.json rename to data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_964261635.1.fhr.json diff --git a/data_loaders/ref-genome-analysis/fhr/metadata/GCA_964263255.1.fhr.json b/data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_964263255.1.fhr.json similarity index 100% rename from data_loaders/ref-genome-analysis/fhr/metadata/GCA_964263255.1.fhr.json rename to data_loaders/ref-genome-analysis/src/03_fhr/metadata/GCA_964263255.1.fhr.json diff --git a/data_loaders/ref-genome-analysis/verify/verify_refgetstore.py 
b/data_loaders/ref-genome-analysis/src/04_verify/verify_refgetstore.py similarity index 97% rename from data_loaders/ref-genome-analysis/verify/verify_refgetstore.py rename to data_loaders/ref-genome-analysis/src/04_verify/verify_refgetstore.py index e054393..b7b4686 100644 --- a/data_loaders/ref-genome-analysis/verify/verify_refgetstore.py +++ b/data_loaders/ref-genome-analysis/src/04_verify/verify_refgetstore.py @@ -26,10 +26,11 @@ import tempfile import time -BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" -STORE_PATH = f"{BRICK_ROOT}/refget_store" -INVENTORY_CSV = f"{BRICK_ROOT}/refgenomes_inventory.csv" -DIGEST_MAP_CSV = f"{BRICK_ROOT}/refget_staging/digest_map.csv" +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") +INVENTORY_CSV = os.environ.get("INVENTORY_CSV", f"{BRICK_ROOT}/refgenomes_inventory.csv") +STAGING = os.environ.get("STAGING", f"{BRICK_ROOT}/refget_staging") +DIGEST_MAP_CSV = f"{STAGING}/digest_map.csv" results = [] @@ -375,8 +376,9 @@ def print_summary(store_path): if r["status"] == "FAIL": print(f" - {r['name']}: {r['detail']}") - # Write JSON report next to the store - report_dir = os.path.dirname(os.path.abspath(__file__)) + # Write JSON report to staging area + report_dir = STAGING + os.makedirs(report_dir, exist_ok=True) report_path = os.path.join(report_dir, "verification_report.json") with open(report_path, "w") as f: json.dump( diff --git a/data_loaders/ref-genome-analysis/profiling/profile_all.py b/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.py similarity index 61% rename from data_loaders/ref-genome-analysis/profiling/profile_all.py rename to data_loaders/ref-genome-analysis/src/05_profiling/profile_all.py index e9cf60b..3fac641 100644 --- a/data_loaders/ref-genome-analysis/profiling/profile_all.py +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.py @@ -17,15 +17,16 @@ def rss_mb(): from gtars.refget import RefgetStore -BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" -STORE_PATH = f"{BRICK_ROOT}/refget_store" +import os +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") GENOMES = [ # (path, old_total_time, n_seqs, label) - ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964261635.1.fa.gz", 183.7, 448, "newt (2GB chr)"), - ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964263255.1.fa.gz", 213.2, 15265, "15K seqs"), - ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964263955.1.fa.gz", 42.7, 11150, "11K seqs"), - ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964264875.2.fa.gz", 27.4, 585, "585 seqs"), - ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964266715.1.fa.gz", 17.0, 1581, "1.6K seqs"), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964261635.1.fa.gz", 183.7, 448, "newt (2GB chr)"), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964263255.1.fa.gz", 213.2, 15265, "15K seqs"), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964263955.1.fa.gz", 42.7, 11150, "11K seqs"), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964264875.2.fa.gz", 27.4, 585, "585 seqs"), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964266715.1.fa.gz", 17.0, 1581, "1.6K seqs"), ] store = RefgetStore.on_disk(STORE_PATH) diff --git a/data_loaders/ref-genome-analysis/profiling/profile_all.sbatch 
b/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.sbatch similarity index 66% rename from data_loaders/ref-genome-analysis/profiling/profile_all.sbatch rename to data_loaders/ref-genome-analysis/src/05_profiling/profile_all.sbatch index e1edf19..d96a2c0 100644 --- a/data_loaders/ref-genome-analysis/profiling/profile_all.sbatch +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.sbatch @@ -10,6 +10,7 @@ module load miniforge/24.3.0-py3.11 -cd /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget/data_loaders/ref-genome-analysis/profiling +cd /home/ns5bc/code/ref-genome-analysis +source env/on-cluster.env -python profile_all.py +python src/05_profiling/profile_all.py diff --git a/data_loaders/ref-genome-analysis/profiling/profile_batch.py b/data_loaders/ref-genome-analysis/src/05_profiling/profile_batch.py similarity index 60% rename from data_loaders/ref-genome-analysis/profiling/profile_batch.py rename to data_loaders/ref-genome-analysis/src/05_profiling/profile_batch.py index b291e78..760d64c 100644 --- a/data_loaders/ref-genome-analysis/profiling/profile_batch.py +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_batch.py @@ -7,13 +7,14 @@ def peak_mb(): from gtars.refget import RefgetStore -BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" -STORE_PATH = f"{BRICK_ROOT}/refget_store" +import os +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") GENOMES = [ # (path, old_pipeline_time, old_total_time, n_seqs) - ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964263255.1.fa.gz", 203.1, 213.2, 15265), - ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964263955.1.fa.gz", 32.6, 42.7, 11150), - ("/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964266715.1.fa.gz", 7.2, 17.0, 1581), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964263255.1.fa.gz", 203.1, 213.2, 15265), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964263955.1.fa.gz", 32.6, 42.7, 11150), + (f"{BRICK_ROOT}/vertebrates/fasta/GCA_964266715.1.fa.gz", 7.2, 17.0, 1581), ] store = RefgetStore.on_disk(STORE_PATH) diff --git a/data_loaders/ref-genome-analysis/profiling/profile_memory.py b/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.py similarity index 91% rename from data_loaders/ref-genome-analysis/profiling/profile_memory.py rename to data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.py index 5511032..07a3ad7 100644 --- a/data_loaders/ref-genome-analysis/profiling/profile_memory.py +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.py @@ -28,9 +28,9 @@ def print_mem(label): from gtars.refget import RefgetStore print_mem("after import") -BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" -STORE_PATH = f"{BRICK_ROOT}/refget_store" -INVENTORY_CSV = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refgenomes_inventory.csv" +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") +INVENTORY_CSV = os.environ.get("INVENTORY_CSV", f"{BRICK_ROOT}/refgenomes_inventory.csv") # Open the store t0 = time.time() diff --git a/data_loaders/ref-genome-analysis/profiling/profile_memory.sbatch b/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.sbatch similarity index 64% rename from 
data_loaders/ref-genome-analysis/profiling/profile_memory.sbatch rename to data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.sbatch index b6ddc84..0e70a7b 100644 --- a/data_loaders/ref-genome-analysis/profiling/profile_memory.sbatch +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.sbatch @@ -10,6 +10,7 @@ module load miniforge/24.3.0-py3.11 -cd /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget/data_loaders/ref-genome-analysis/profiling +cd /home/ns5bc/code/ref-genome-analysis +source env/on-cluster.env -python profile_memory.py 850 5 +python src/05_profiling/profile_memory.py 850 5 diff --git a/data_loaders/ref-genome-analysis/profiling/profile_newt.py b/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.py similarity index 86% rename from data_loaders/ref-genome-analysis/profiling/profile_newt.py rename to data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.py index 5343eba..7d2285a 100644 --- a/data_loaders/ref-genome-analysis/profiling/profile_newt.py +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.py @@ -31,9 +31,9 @@ def print_mem(label): from gtars.refget import RefgetStore print_mem("after import") -BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" -STORE_PATH = f"{BRICK_ROOT}/refget_store" -NEWT_FASTA = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964261635.1.fa.gz" +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") +NEWT_FASTA = f"{BRICK_ROOT}/vertebrates/fasta/GCA_964261635.1.fa.gz" # Open the store t0 = time.time() diff --git a/data_loaders/ref-genome-analysis/profiling/profile_newt.sbatch b/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.sbatch similarity index 66% rename from data_loaders/ref-genome-analysis/profiling/profile_newt.sbatch rename to data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.sbatch index e3595af..ee37f56 100644 --- a/data_loaders/ref-genome-analysis/profiling/profile_newt.sbatch +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.sbatch @@ -10,6 +10,7 @@ module load miniforge/24.3.0-py3.11 -cd /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget/data_loaders/ref-genome-analysis/profiling +cd /home/ns5bc/code/ref-genome-analysis +source env/on-cluster.env -python profile_newt.py +python src/05_profiling/profile_newt.py diff --git a/data_loaders/ref-genome-analysis/profiling/profile_normal.py b/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.py similarity index 79% rename from data_loaders/ref-genome-analysis/profiling/profile_normal.py rename to data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.py index ddec583..6c414ca 100644 --- a/data_loaders/ref-genome-analysis/profiling/profile_normal.py +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.py @@ -14,9 +14,10 @@ def print_mem(label): from gtars.refget import RefgetStore print_mem("after import") -BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" -STORE_PATH = f"{BRICK_ROOT}/refget_store" -FASTA = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/vertebrates/fasta/GCA_964264875.2.fa.gz" +import os +BRICK_ROOT = os.environ["BRICK_ROOT"] +STORE_PATH = os.environ.get("STORE_PATH", f"{BRICK_ROOT}/refget_store") +FASTA = f"{BRICK_ROOT}/vertebrates/fasta/GCA_964264875.2.fa.gz" t0 = time.time() store = 
RefgetStore.on_disk(STORE_PATH) diff --git a/data_loaders/ref-genome-analysis/profiling/profile_normal.sbatch b/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.sbatch similarity index 66% rename from data_loaders/ref-genome-analysis/profiling/profile_normal.sbatch rename to data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.sbatch index a6e7b88..dd7eb04 100644 --- a/data_loaders/ref-genome-analysis/profiling/profile_normal.sbatch +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.sbatch @@ -10,6 +10,7 @@ module load miniforge/24.3.0-py3.11 -cd /project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta/refget/data_loaders/ref-genome-analysis/profiling +cd /home/ns5bc/code/ref-genome-analysis +source env/on-cluster.env -python profile_normal.py +python src/05_profiling/profile_normal.py diff --git a/data_loaders/ref-genome-analysis/src/90_split_store.py b/data_loaders/ref-genome-analysis/src/90_split_store.py new file mode 100644 index 0000000..86a4d0c --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/90_split_store.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +""" +Split the combined refget store into two stores: VGP vertebrates and reference genomes. + +Reads digest_map.csv (produced by 02_build/build_digest_map.py) which has a 'group' +column for every FASTA. Collections with group='vertebrates' go to the VGP store, +everything else goes to the ref store. + +Usage: + python src/90_split_store.py --dry-run + python src/90_split_store.py +""" + +import argparse +import csv +import os +import sys +import time + +from refget.store import RefgetStore + +BRICK_ROOT = os.environ["BRICK_ROOT"] +DEFAULT_SOURCE = os.environ.get("STORE_PATH", os.path.join(BRICK_ROOT, "refget_store")) +STAGING = os.environ.get("STAGING", os.path.join(BRICK_ROOT, "refget_staging")) +DEFAULT_DIGEST_MAP = os.path.join(STAGING, "digest_map.csv") +DEFAULT_VGP_OUTPUT = os.path.join(BRICK_ROOT, "vgp_reference_store") +DEFAULT_REF_OUTPUT = os.path.join(BRICK_ROOT, "refgenome_jungle_store") + +VGP_GROUPS = {"vertebrates"} + + +def load_digest_map(digest_map_path: str) -> dict[str, set[str]]: + """Read digest_map.csv and return group -> set of digests.""" + groups: dict[str, set[str]] = {} + with open(digest_map_path) as f: + for row in csv.DictReader(f): + digest = row.get("digest", "").strip() + group = row.get("group", "unknown").strip() + if digest: + groups.setdefault(group, set()).add(digest) + return groups + + +def split_store( + source_path: str, + digest_map_path: str, + vgp_output: str, + ref_output: str, + dry_run: bool = False, +): + # Load group -> digest mapping + group_digests = load_digest_map(digest_map_path) + + vgp_digests = set() + ref_digests = set() + for group, digests in group_digests.items(): + label = "VGP" if group in VGP_GROUPS else "ref" + print(f" {group}: {len(digests)} collections ({label})") + if group in VGP_GROUPS: + vgp_digests |= digests + else: + ref_digests |= digests + + # Open source store and load all collections (metadata only) + print(f"\nOpening source store: {source_path}") + source = RefgetStore.on_disk(source_path) + source.load_all_collections() + + # Get all store digests + all_store_digests = set() + page = 0 + while True: + result = source.list_collections(page, 1000) + for c in result["results"]: + all_store_digests.add(c.digest) + if len(result["results"]) < 1000: + break + page += 1 + + vgp_in_store = vgp_digests & all_store_digests + ref_in_store = ref_digests & all_store_digests + unaccounted = 
all_store_digests - vgp_digests - ref_digests + + print(f"\nTotal in store: {len(all_store_digests)}") + print(f"VGP to import: {len(vgp_in_store)}") + print(f"Ref to import: {len(ref_in_store)}") + if unaccounted: + print(f"Unaccounted: {len(unaccounted)} (in store but not in digest_map)") + + if vgp_digests - all_store_digests: + print(f"Warning: {len(vgp_digests - all_store_digests)} VGP digests not in store", file=sys.stderr) + if ref_digests - all_store_digests: + print(f"Warning: {len(ref_digests - all_store_digests)} ref digests not in store", file=sys.stderr) + + if dry_run: + print("\n--dry-run: stopping here.") + return + + # Import VGP collections + print(f"\nCreating VGP store: {vgp_output}") + vgp_store = RefgetStore.on_disk(vgp_output) + print(f"Importing {len(vgp_in_store)} VGP collections...") + t0 = time.time() + for i, digest in enumerate(sorted(vgp_in_store), 1): + print(f" [{i}/{len(vgp_in_store)}] {digest}") + vgp_store.import_collection(source, digest) + print(f"VGP import done in {time.time() - t0:.1f}s") + + # Import ref collections + print(f"\nCreating ref store: {ref_output}") + ref_store = RefgetStore.on_disk(ref_output) + print(f"Importing {len(ref_in_store)} ref genome collections...") + t0 = time.time() + for i, digest in enumerate(sorted(ref_in_store), 1): + print(f" [{i}/{len(ref_in_store)}] {digest}") + ref_store.import_collection(source, digest) + print(f"Ref import done in {time.time() - t0:.1f}s") + + print("\nDone!") + print(f" VGP store: {vgp_output}") + print(f" Ref store: {ref_output}") + + +def main(): + parser = argparse.ArgumentParser( + description="Split combined refget store into VGP and ref genome stores." + ) + parser.add_argument("--source", default=DEFAULT_SOURCE) + parser.add_argument("--digest-map", default=DEFAULT_DIGEST_MAP) + parser.add_argument("--vgp-output", default=DEFAULT_VGP_OUTPUT) + parser.add_argument("--ref-output", default=DEFAULT_REF_OUTPUT) + parser.add_argument("--dry-run", action="store_true") + args = parser.parse_args() + + split_store( + source_path=args.source, + digest_map_path=args.digest_map, + vgp_output=args.vgp_output, + ref_output=args.ref_output, + dry_run=args.dry_run, + ) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/90_split_store.sbatch b/data_loaders/ref-genome-analysis/src/90_split_store.sbatch new file mode 100644 index 0000000..83e19e4 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/90_split_store.sbatch @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=split_store +#SBATCH --output=split_store_%j.log +#SBATCH --error=split_store_%j.log +#SBATCH --partition=standard +#SBATCH --time=4:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=1 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd /home/ns5bc/code/ref-genome-analysis +source env/on-cluster.env + +python src/90_split_store.py diff --git a/data_loaders/ref-genome-analysis/examples/test_20_genomes.py b/data_loaders/ref-genome-analysis/src/examples/test_20_genomes.py similarity index 94% rename from data_loaders/ref-genome-analysis/examples/test_20_genomes.py rename to data_loaders/ref-genome-analysis/src/examples/test_20_genomes.py index 6e98e85..3b0eda7 100644 --- a/data_loaders/ref-genome-analysis/examples/test_20_genomes.py +++ b/data_loaders/ref-genome-analysis/src/examples/test_20_genomes.py @@ -16,10 +16,11 @@ from gtars.refget import RefgetStore -BRICK_ROOT = "/project/shefflab/brickyard/datasets_downloaded/refgenomes_fasta" -INVENTORY_CSV = 
f"{BRICK_ROOT}/refgenomes_inventory.csv" -FHR_DIR = f"{BRICK_ROOT}/refget_staging/fhr_metadata" -STORE_PATH = "/scratch/ns5bc/test_refget_store_20" +BRICK_ROOT = os.environ["BRICK_ROOT"] +INVENTORY_CSV = os.environ.get("INVENTORY_CSV", f"{BRICK_ROOT}/refgenomes_inventory.csv") +STAGING = os.environ.get("STAGING", f"{BRICK_ROOT}/refget_staging") +FHR_DIR = f"{STAGING}/fhr_metadata" +STORE_PATH = os.environ.get("STORE_PATH", "/scratch/ns5bc/test_refget_store_20") def main(): diff --git a/data_loaders/riva_pangenome_analysis/update-gtars.sh b/data_loaders/riva_pangenome_analysis/update-gtars.sh index 44d22c5..2664b00 100644 --- a/data_loaders/riva_pangenome_analysis/update-gtars.sh +++ b/data_loaders/riva_pangenome_analysis/update-gtars.sh @@ -8,7 +8,7 @@ module load miniforge/24.3.0-py3.11 # Build gtars (refget module only) cd ~/code/gtars -git checkout refgetstore +git checkout dev git pull cd gtars-python rm -f ../target/wheels/gtars-*.whl From 8324650af623c238a872eaf2058b1d1d6eb4b06f Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 18 Mar 2026 09:27:22 -0400 Subject: [PATCH 23/31] major work on building stores, frontend --- README.md | 45 +- data_loaders/demo_build_store.py | 9 +- data_loaders/ref-genome-analysis/README.md | 39 ++ .../docs/missing_seqcolapi_collections.md | 22 + .../docs/pephubclient-issues.md | 47 ++ .../ref-genome-analysis/env/mutagen-setup.sh | 2 +- .../ref-genome-analysis/env/on-cluster.env | 13 + .../ref-genome-analysis/env/remote-hpc.env | 21 +- .../src/02_aliases/register_aliases.sbatch | 2 +- .../src/02_build/build_digest_map.sbatch | 2 +- .../src/05_profiling/profile_all.sbatch | 2 +- .../src/05_profiling/profile_memory.sbatch | 2 +- .../src/05_profiling/profile_newt.sbatch | 2 +- .../src/05_profiling/profile_normal.sbatch | 2 +- .../ref-genome-analysis/src/90_split_store.py | 27 +- .../src/90_split_store.sbatch | 2 +- .../src/backfill_sequence_aliases.py | 208 +++++++ .../src/backfill_sequence_aliases.sbatch | 22 + .../src/examples/test_20_genomes.py | 2 +- .../ref-genome-analysis/src/push_to_s3.sh | 56 ++ .../src/validate_split_stores.py | 538 ++++++++++++++++++ .../src/validate_split_stores.sbatch | 16 + deployment/seqcolapi-store/production.env | 2 +- deployment/seqcolapi-store/task_def.json | 2 +- deployment/store_demo/store_demo.env | 3 + deployment/store_demo_up.sh | 61 ++ frontend/src/components/APINav.jsx | 50 ++ frontend/src/components/CompareTable.jsx | 2 +- frontend/src/components/CopyableDigest.jsx | 29 + frontend/src/components/ExplorerNav.jsx | 37 ++ frontend/src/components/SequenceTable.jsx | 188 ++++++ frontend/src/components/StoreNav.jsx | 8 +- frontend/src/main.jsx | 305 ++++++---- frontend/src/pages/APICollectionView.jsx | 147 +++++ frontend/src/pages/APICollections.jsx | 99 ++++ frontend/src/pages/APICompare.jsx | 26 + frontend/src/pages/APICompliance.jsx | 26 + frontend/src/pages/APIExplorer.jsx | 115 ++++ frontend/src/pages/Explorer.jsx | 228 ++++++++ frontend/src/pages/ExplorerAliases.jsx | 172 ++++++ frontend/src/pages/ExplorerCollection.jsx | 334 +++++++++++ frontend/src/pages/ExplorerSequences.jsx | 152 +++++ frontend/src/pages/LandingPage.jsx | 231 ++++++++ frontend/src/pages/StoreAliases.jsx | 4 +- frontend/src/pages/StoreCollection.jsx | 2 +- frontend/src/pages/StoreExplorer.jsx | 34 +- frontend/src/pages/StoreOverview.jsx | 14 +- frontend/src/pages/StoreSequences.jsx | 2 +- frontend/src/services/fetchData.jsx | 53 +- frontend/src/stores/apiExplorerStore.js | 55 ++ frontend/src/stores/unifiedStore.js | 67 +++ refget/__init__.py | 2 - 
refget/agents.py | 5 + refget/backend.py | 137 ++++- refget/cli/store.py | 2 +- refget/compliance.py | 53 +- refget/middleware.py | 2 +- refget/router.py | 107 ++-- scripts/test-store-integration.sh | 11 + seqcolapi/main.py | 59 +- test_fasta/pair_swap.rgsi | 11 + test_fasta/swap_wo_coords.rgsi | 11 + tests/api/test_compliance.py | 8 +- tests/conftest.py | 2 +- tests/integration/conftest.py | 112 ++++ tests/integration/test_store_compliance.py | 102 ++++ tests/local/test_backend.py | 14 +- 67 files changed, 3870 insertions(+), 297 deletions(-) create mode 100644 data_loaders/ref-genome-analysis/docs/missing_seqcolapi_collections.md create mode 100644 data_loaders/ref-genome-analysis/docs/pephubclient-issues.md create mode 100644 data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.py create mode 100644 data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.sbatch create mode 100644 data_loaders/ref-genome-analysis/src/push_to_s3.sh create mode 100644 data_loaders/ref-genome-analysis/src/validate_split_stores.py create mode 100644 data_loaders/ref-genome-analysis/src/validate_split_stores.sbatch create mode 100644 deployment/store_demo/store_demo.env create mode 100755 deployment/store_demo_up.sh create mode 100644 frontend/src/components/APINav.jsx create mode 100644 frontend/src/components/CopyableDigest.jsx create mode 100644 frontend/src/components/ExplorerNav.jsx create mode 100644 frontend/src/components/SequenceTable.jsx create mode 100644 frontend/src/pages/APICollectionView.jsx create mode 100644 frontend/src/pages/APICollections.jsx create mode 100644 frontend/src/pages/APICompare.jsx create mode 100644 frontend/src/pages/APICompliance.jsx create mode 100644 frontend/src/pages/APIExplorer.jsx create mode 100644 frontend/src/pages/Explorer.jsx create mode 100644 frontend/src/pages/ExplorerAliases.jsx create mode 100644 frontend/src/pages/ExplorerCollection.jsx create mode 100644 frontend/src/pages/ExplorerSequences.jsx create mode 100644 frontend/src/pages/LandingPage.jsx create mode 100644 frontend/src/stores/apiExplorerStore.js create mode 100644 frontend/src/stores/unifiedStore.js create mode 100755 scripts/test-store-integration.sh create mode 100644 test_fasta/pair_swap.rgsi create mode 100644 test_fasta/swap_wo_coords.rgsi create mode 100644 tests/integration/test_store_compliance.py diff --git a/README.md b/README.md index dec9086..c595362 100644 --- a/README.md +++ b/README.md @@ -43,9 +43,48 @@ This starts the test database, runs tests, and cleans up automatically. ## Development and deployment: Backend -### Easy-peasy way +### Store-backed (no database) -In a moment I'll show you how to do these steps individually, but if you're in a hurry, the easy way get a development API running for testing is to just use my very simple shell script like this (no data persistence, just loads demo data): +The store-backed seqcolapi uses a RefgetStore (local files) instead of PostgreSQL. This is the simplest way to run the API: + +#### Quick start + +```console +bash deployment/store_demo_up.sh +``` + +This will: +- Build a local RefgetStore from test FASTA files +- Run the store-backed seqcolapi with uvicorn +- Block the terminal until you press Ctrl+C, which cleans up + +No Docker or database required. + +#### Step-by-step + +1. Build a store from FASTA files: + +```console +python data_loaders/demo_build_store.py test_fasta /tmp/refget_demo_store +``` + +2. 
Start the store-backed API: + +```console +REFGET_STORE_PATH=/tmp/refget_demo_store uvicorn seqcolapi.main:store_app --reload --port 8100 +``` + +#### Remote store + +To run against a remote (S3) store: + +```console +REFGET_STORE_URL=https://example.com/store uvicorn seqcolapi.main:store_app --port 8100 +``` + +### DB-backed (PostgreSQL) + +If you need a database-backed instance (e.g., for mutable data, advanced queries), use the DB-backed workflow. In a moment I'll show you how to do these steps individually, but if you're in a hurry, the easy way to get a development API running for testing is to just use my very simple shell script like this (no data persistence, just loads demo data): ```console bash deployment/demo_up.sh @@ -58,7 +97,7 @@ This will: - load up the demo data - block the terminal until you press Ctrl+C, which will shut down all services. -### Step-by-step process +### Step-by-step process (DB-backed) Alternatively, if you want to run each step separately to see what's really going on, start here. diff --git a/data_loaders/demo_build_store.py b/data_loaders/demo_build_store.py index 39ae6c0..4fa1669 100644 --- a/data_loaders/demo_build_store.py +++ b/data_loaders/demo_build_store.py @@ -38,7 +38,14 @@ def main(): store = RefgetStore.on_disk(store_path) for fasta in fasta_files: - store.add_sequence_collection_from_fasta(fasta) + result = store.add_sequence_collection_from_fasta(fasta) + # Register the filename (without extension) as a collection alias + basename = os.path.basename(fasta) + name = basename.split(".")[0] # strip .fa, .fasta, .fa.gz, etc. + meta = result[0] if isinstance(result, tuple) else result + if meta: + store.add_collection_alias("fasta_filename", name, meta.digest) + print(f" {name} → {meta.digest}") print(f"Done. Store at: {store_path}") print(f"Stats: {store.stats()}") diff --git a/data_loaders/ref-genome-analysis/README.md b/data_loaders/ref-genome-analysis/README.md index 1ccf999..a93b618 100644 --- a/data_loaders/ref-genome-analysis/README.md +++ b/data_loaders/ref-genome-analysis/README.md @@ -27,6 +27,9 @@ inventory --> build --> aliases --> fhr --> verify | **verify** | `src/04_verify/` | Automated pass/fail checks against the store | | **profiling** | `src/05_profiling/` | Memory and timing benchmarks | | **split** | `src/90_split_store.py` | Split combined store into VGP and reference genome stores | +| **backfill** | `src/backfill_sequence_aliases.py` | Re-register aliases into split stores from NCBI alias table | +| **validate** | `src/validate_split_stores.py` | Validate split stores (counts, aliases, FHR, sequences, cross-store) | +| **push** | `src/push_to_s3.sh` | Push split stores to S3 (`s3://refgenie/`) | | **examples** | `src/examples/` | End-to-end test scripts (e.g., load 20 genomes with FHR) | ## Environment variables @@ -59,4 +62,40 @@ python src/03_fhr/load_fhr_metadata.py --store-path $STORE_PATH --fhr-dir $STAGI # 4. Verify python src/04_verify/verify_refgetstore.py + +# 5. Split into VGP + ref stores +sbatch src/90_split_store.sbatch + +# 6. Backfill aliases into split stores +python src/backfill_sequence_aliases.py --target $BRICK_ROOT/vgp_reference_store +python src/backfill_sequence_aliases.py --target $BRICK_ROOT/refgenome_jungle_store + +# 7. Validate split stores +sbatch src/validate_split_stores.sbatch + +# 8. Push to S3 (requires GPG agent forwarding: ssh riva1_gpg) +bash src/push_to_s3.sh both +``` + +## S3 deployment + +Requires GPG agent forwarding for `pass` credentials (see `ssh riva1_gpg` in SSH config). 
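+The `riva1_gpg` host entry is assumed to forward your local GPG agent's extra socket to the remote agent socket, so `pass` can decrypt the AWS credentials on the cluster. A minimal sketch of such an entry (hostname, UID, and socket paths are placeholders; find the local path with `gpgconf --list-dir agent-extra-socket`):
+
+```
+Host riva1_gpg
+    HostName <cluster login node>
+    RemoteForward /run/user/<remote-uid>/gnupg/S.gpg-agent <local agent-extra-socket>
+```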
+ +```bash +ssh riva1_gpg +cd code/ref-genome-analysis +source env/on-cluster.env +bash src/push_to_s3.sh vgp # or: ref, both, "vgp --dry-run" +``` + +Stores are pushed to `s3://refgenie/refget-store/vgp` and `s3://refgenie/refget-store/jungle`. + +To load from S3: + +```python +from refget.store import RefgetStore +store = RefgetStore.open_remote( + "~/.refget/vgp_cache", + "https://refgenie.s3.us-east-1.amazonaws.com/refget-store/vgp" +) ``` diff --git a/data_loaders/ref-genome-analysis/docs/missing_seqcolapi_collections.md b/data_loaders/ref-genome-analysis/docs/missing_seqcolapi_collections.md new file mode 100644 index 0000000..3ba8d34 --- /dev/null +++ b/data_loaders/ref-genome-analysis/docs/missing_seqcolapi_collections.md @@ -0,0 +1,22 @@ +# Missing seqcolapi collections + +8 collections hosted on seqcolapi.databio.org are not in any RefgetStore. +These were loaded into the PostgreSQL-backed seqcolapi directly from +`fasta/pangenome_reference/` FASTAs that weren't included in the combined store build. + +## TODO + +Load these into the jungle store. 7 of 8 are confirmed in `$BRICK_ROOT/fasta/pangenome_reference/`: + +| Digest | Seqs | FASTA | +|---|---|---| +| `2WhejNO718T5jvB4DVTAz-A_JF03iIkz` | 25 | `GCA_009914755.4_CHM13_T2T_v2.0_genomic.fna.gz` | +| `6DfkalgYxFZiYAKpJf19dbpnS-dGzi4m` | 24 | `chm13.draft_v1.1.fasta.gz` | +| `Hve5dblWYLxu1p9Cp930NB8twHGCsf6X` | 640 | `GCA_000001405.28_GRCh38.p13_genomic.fa.gz` | +| `VDUOdAUYpXHUhvU-MNmOTgYQAl67yRMs` | 445 | `Homo_sapiens.GRCh38.dna.alt.fa.gz` | +| `WwIG41XDzO0BTmEpzT7nPXv6Dfx7h4ju` | 1 | `CM000663.2.fasta.gz` | +| `awlJ5Q7EPDVlwXWH8LPN93oJ5jY2uajW` | 24 | `T2T-CHM13v2.0.unmasked.fa.gz` | +| `qJ79liNTAD-LShR3j_2xntOEt-eC3vhM` | 639 | `Homo_sapiens.GRCh38.dna.toplevel.fa.gz` | +| `gHcfbUVnFzHv3QSqz2sSqVHdUQbDO8N5` | 3366 | Not in pangenome_reference. Likely `GRCh38_full_analysis_set_plus_decoy_hla.fa.gz` from `fasta/jungle/homo_sapiens/` | + +These are needed for seqcol compliance testing since they're currently served by the API. diff --git a/data_loaders/ref-genome-analysis/docs/pephubclient-issues.md b/data_loaders/ref-genome-analysis/docs/pephubclient-issues.md new file mode 100644 index 0000000..bdeb978 --- /dev/null +++ b/data_loaders/ref-genome-analysis/docs/pephubclient-issues.md @@ -0,0 +1,47 @@ +# PEPhub Client: Issues Encountered + +## 1. `--force` doesn't update samples on existing projects + +**Problem:** `phc push --force` and `phc.upload(force=True)` return success (202) but silently fail to update the sample table when the project already exists. The config/metadata may update, but samples remain unchanged. + +**Workaround:** Delete the project first, then push fresh: + +```python +import requests +from pephubclient import PEPHubClient + +phc = PEPHubClient() +jwt = phc._PEPHubClient__jwt_data +headers = {"Authorization": f"Bearer {jwt}"} + +requests.delete( + "https://pephub-api.databio.org/api/v1/projects/NAMESPACE/PROJECT", + params={"tag": "TAG"}, + headers=headers, +) +``` + +Then push normally with `phc push`. + +## 2. Bare CSV push fails with 400 + +**Problem:** The CLI help says `CFG` accepts "Project config file (YAML) or sample table (CSV/TSV)", but pushing a bare CSV fails with `Unexpected Response Error. 400`. + +**Workaround:** Always push a YAML config that references the CSV: + +```yaml +# project_config.yaml +pep_version: "2.1.0" +sample_table: samples.csv +name: my_project +``` + +```bash +phc push --namespace NS --name NAME --tag TAG project_config.yaml +``` + +## 3. 
`phc.upload()` with peppy Project reports success but uploads empty samples + +**Problem:** Loading a project with `phc.load_project()`, modifying `sample_table` in-place, then calling `phc.upload()` reports success but the server receives no samples. The `project.to_dict()` output is correct (verified locally), so the issue is server-side. + +**Workaround:** Write the modified sample table to a CSV, create a YAML config referencing it, and use `phc push` with the YAML. diff --git a/data_loaders/ref-genome-analysis/env/mutagen-setup.sh b/data_loaders/ref-genome-analysis/env/mutagen-setup.sh index e17922f..8a147eb 100755 --- a/data_loaders/ref-genome-analysis/env/mutagen-setup.sh +++ b/data_loaders/ref-genome-analysis/env/mutagen-setup.sh @@ -40,7 +40,7 @@ if [ -n "$DEPLOY_HOST" ] && [ -n "$DEPLOY_DIR" ]; then fi # refget — local source synced to remote deploy dir - REFGET_LOCAL="$HOME/Dropbox/workspaces/intervals/repos/refget" + REFGET_LOCAL="$HOME/Dropbox/workspaces/refgenie/repos/refget" if [ -d "$REFGET_LOCAL" ]; then mutagen sync create \ --name="deploy-refget" \ diff --git a/data_loaders/ref-genome-analysis/env/on-cluster.env b/data_loaders/ref-genome-analysis/env/on-cluster.env index 167b5e4..f00236d 100644 --- a/data_loaders/ref-genome-analysis/env/on-cluster.env +++ b/data_loaders/ref-genome-analysis/env/on-cluster.env @@ -4,3 +4,16 @@ export BRICK_ROOT=$BRICKYARD/datasets_downloaded/refgenomes_fasta export STORE_PATH=$BRICK_ROOT/refget_store export STAGING=$BRICK_ROOT/refget_staging export INVENTORY_CSV=$BRICK_ROOT/refgenomes_inventory.csv +export S3_BUCKET=s3://refgenie + +# vgp store +export VGP_STORE_PATH=$BRICK_ROOT/refget-store/vgp +export VGP_S3_PATH=$S3_BUCKET/refget-store/vgp + +# jungle store +export REF_STORE_PATH=$BRICK_ROOT/refget-store/jungle +export REF_S3_PATH=$S3_BUCKET/refget-store/jungle + +# pangenome store +export PANGENOME_STORE_PATH=$BRICK_ROOT/refget-store/pangenome +export PANGENOME_S3_PATH=$S3_BUCKET/refget-store/pangenome diff --git a/data_loaders/ref-genome-analysis/env/remote-hpc.env b/data_loaders/ref-genome-analysis/env/remote-hpc.env index 6504138..3ec463f 100644 --- a/data_loaders/ref-genome-analysis/env/remote-hpc.env +++ b/data_loaders/ref-genome-analysis/env/remote-hpc.env @@ -4,6 +4,21 @@ export BRICK_ROOT=$BRICKYARD/datasets_downloaded/refgenomes_fasta export STORE_PATH=$BRICK_ROOT/refget_store export STAGING=$BRICK_ROOT/refget_staging export INVENTORY_CSV=$BRICK_ROOT/refgenomes_inventory.csv -export SYNC_REMOTE=ns5bc@login.hpc.virginia.edu:/home/ns5bc/code/ref-genome-analysis -export DEPLOY_HOST=ns5bc@login.hpc.virginia.edu -export DEPLOY_DIR=/home/ns5bc/deploy +export S3_BUCKET=s3://refgenie + +# vgp store +export VGP_STORE_PATH=$BRICK_ROOT/refget-store/vgp +export VGP_S3_PATH=$S3_BUCKET/refget-store/vgp + +# jungle store +export REF_STORE_PATH=$BRICK_ROOT/refget-store/jungle +export REF_S3_PATH=$S3_BUCKET/refget-store/jungle + +# pangenome store +export PANGENOME_STORE_PATH=$BRICK_ROOT/refget-store/pangenome +export PANGENOME_S3_PATH=$S3_BUCKET/refget-store/pangenome + +# remote deployment +export SYNC_REMOTE=riva:~/code/ref-genome-analysis +export DEPLOY_HOST=riva +export DEPLOY_DIR=~/deploy diff --git a/data_loaders/ref-genome-analysis/src/02_aliases/register_aliases.sbatch b/data_loaders/ref-genome-analysis/src/02_aliases/register_aliases.sbatch index dc50581..6b831e2 100644 --- a/data_loaders/ref-genome-analysis/src/02_aliases/register_aliases.sbatch +++ b/data_loaders/ref-genome-analysis/src/02_aliases/register_aliases.sbatch @@ -10,7 
+10,7 @@ module load miniforge/24.3.0-py3.11 -cd /home/ns5bc/code/ref-genome-analysis +cd $HOME/code/ref-genome-analysis source env/on-cluster.env python src/02_aliases/register_ncbi_aliases.py diff --git a/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.sbatch b/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.sbatch index d4c042b..523f1f7 100644 --- a/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.sbatch +++ b/data_loaders/ref-genome-analysis/src/02_build/build_digest_map.sbatch @@ -10,7 +10,7 @@ module load miniforge/24.3.0-py3.11 -cd /home/ns5bc/code/ref-genome-analysis +cd $HOME/code/ref-genome-analysis source env/on-cluster.env python src/02_build/build_digest_map.py diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.sbatch b/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.sbatch index d96a2c0..5d9e57e 100644 --- a/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.sbatch +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_all.sbatch @@ -10,7 +10,7 @@ module load miniforge/24.3.0-py3.11 -cd /home/ns5bc/code/ref-genome-analysis +cd $HOME/code/ref-genome-analysis source env/on-cluster.env python src/05_profiling/profile_all.py diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.sbatch b/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.sbatch index 0e70a7b..2eee915 100644 --- a/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.sbatch +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_memory.sbatch @@ -10,7 +10,7 @@ module load miniforge/24.3.0-py3.11 -cd /home/ns5bc/code/ref-genome-analysis +cd $HOME/code/ref-genome-analysis source env/on-cluster.env python src/05_profiling/profile_memory.py 850 5 diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.sbatch b/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.sbatch index ee37f56..3db0284 100644 --- a/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.sbatch +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_newt.sbatch @@ -10,7 +10,7 @@ module load miniforge/24.3.0-py3.11 -cd /home/ns5bc/code/ref-genome-analysis +cd $HOME/code/ref-genome-analysis source env/on-cluster.env python src/05_profiling/profile_newt.py diff --git a/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.sbatch b/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.sbatch index dd7eb04..c5ba2ca 100644 --- a/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.sbatch +++ b/data_loaders/ref-genome-analysis/src/05_profiling/profile_normal.sbatch @@ -10,7 +10,7 @@ module load miniforge/24.3.0-py3.11 -cd /home/ns5bc/code/ref-genome-analysis +cd $HOME/code/ref-genome-analysis source env/on-cluster.env python src/05_profiling/profile_normal.py diff --git a/data_loaders/ref-genome-analysis/src/90_split_store.py b/data_loaders/ref-genome-analysis/src/90_split_store.py index 86a4d0c..7f87cdd 100644 --- a/data_loaders/ref-genome-analysis/src/90_split_store.py +++ b/data_loaders/ref-genome-analysis/src/90_split_store.py @@ -29,6 +29,17 @@ VGP_GROUPS = {"vertebrates"} +def _paginate(store): + """Yield pages of collection results from a store.""" + page = 0 + while True: + result = store.list_collections(page, 1000) + yield result["results"] + if len(result["results"]) < 1000: + break + page += 1 + + def load_digest_map(digest_map_path: str) -> dict[str, set[str]]: """Read digest_map.csv and return group -> set 
of digests.""" groups: dict[str, set[str]] = {} @@ -99,20 +110,24 @@ def split_store( # Import VGP collections print(f"\nCreating VGP store: {vgp_output}") vgp_store = RefgetStore.on_disk(vgp_output) - print(f"Importing {len(vgp_in_store)} VGP collections...") + existing_vgp = {c.digest for p in _paginate(vgp_store) for c in p} + to_import_vgp = sorted(vgp_in_store - existing_vgp) + print(f"VGP: {len(vgp_in_store)} total, {len(existing_vgp)} already imported, {len(to_import_vgp)} remaining") t0 = time.time() - for i, digest in enumerate(sorted(vgp_in_store), 1): - print(f" [{i}/{len(vgp_in_store)}] {digest}") + for i, digest in enumerate(to_import_vgp, 1): + print(f" [{i}/{len(to_import_vgp)}] {digest}") vgp_store.import_collection(source, digest) print(f"VGP import done in {time.time() - t0:.1f}s") # Import ref collections print(f"\nCreating ref store: {ref_output}") ref_store = RefgetStore.on_disk(ref_output) - print(f"Importing {len(ref_in_store)} ref genome collections...") + existing_ref = {c.digest for p in _paginate(ref_store) for c in p} + to_import_ref = sorted(ref_in_store - existing_ref) + print(f"Ref: {len(ref_in_store)} total, {len(existing_ref)} already imported, {len(to_import_ref)} remaining") t0 = time.time() - for i, digest in enumerate(sorted(ref_in_store), 1): - print(f" [{i}/{len(ref_in_store)}] {digest}") + for i, digest in enumerate(to_import_ref, 1): + print(f" [{i}/{len(to_import_ref)}] {digest}") ref_store.import_collection(source, digest) print(f"Ref import done in {time.time() - t0:.1f}s") diff --git a/data_loaders/ref-genome-analysis/src/90_split_store.sbatch b/data_loaders/ref-genome-analysis/src/90_split_store.sbatch index 83e19e4..eb83e26 100644 --- a/data_loaders/ref-genome-analysis/src/90_split_store.sbatch +++ b/data_loaders/ref-genome-analysis/src/90_split_store.sbatch @@ -10,7 +10,7 @@ module load miniforge/24.3.0-py3.11 -cd /home/ns5bc/code/ref-genome-analysis +cd $HOME/code/ref-genome-analysis source env/on-cluster.env python src/90_split_store.py diff --git a/data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.py b/data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.py new file mode 100644 index 0000000..0b95b75 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +""" +Backfill sequence and collection aliases into a split store. + +Matches accessions to target store collections via digest_map (path join), +then registers aliases from the NCBI alias table by matching sequence names +in level2 data. Does NOT load any FASTAs — read-only against the target store +except for alias registration. 
+ +Usage: + source env/on-cluster.env + python src/backfill_sequence_aliases.py --target $VGP_STORE_PATH + python src/backfill_sequence_aliases.py --target $REF_STORE_PATH + python src/backfill_sequence_aliases.py --target $VGP_STORE_PATH --dry-run +""" + +import argparse +import csv +import os +import tempfile +import time +from collections import defaultdict + +BRICK_ROOT = os.environ["BRICK_ROOT"] +STAGING = os.environ.get("STAGING", os.path.join(BRICK_ROOT, "refget_staging")) +INVENTORY_CSV = os.environ.get("INVENTORY_CSV", os.path.join(BRICK_ROOT, "refgenomes_inventory.csv")) +ALIAS_TABLE_CSV = os.path.join(STAGING, "ncbi_alias_table.csv") +DIGEST_MAP_CSV = os.path.join(STAGING, "digest_map.csv") + + +def get_all_collection_digests(store): + digests = set() + page = 0 + while True: + result = store.list_collections(page, 1000) + for c in result["results"]: + digests.add(c.digest) + if len(result["results"]) < 1000: + break + page += 1 + return digests + + +def main(): + parser = argparse.ArgumentParser( + description="Backfill aliases into a split store from NCBI alias table." + ) + parser.add_argument("--target", required=True, help="Target RefgetStore path") + parser.add_argument("--alias-table", default=ALIAS_TABLE_CSV) + parser.add_argument("--inventory", default=INVENTORY_CSV) + parser.add_argument("--digest-map", default=DIGEST_MAP_CSV) + parser.add_argument("--dry-run", action="store_true") + args = parser.parse_args() + + from refget.store import RefgetStore + + print(f"Target store: {args.target}") + print(f"Alias table: {args.alias_table}") + print(f"Inventory: {args.inventory}") + print(f"Digest map: {args.digest_map}") + print(f"Dry run: {args.dry_run}") + print() + + # Open target store (read-only for collection lookup, then alias writes) + store = RefgetStore.open_local(args.target) + target_digests = get_all_collection_digests(store) + print(f"Target has {len(target_digests)} collections") + + # Build path -> accession from inventory + path_to_accession = {} + with open(args.inventory, newline="") as f: + for row in csv.DictReader(f): + acc = row.get("accession", "").strip() + path = row.get("path", "").strip() + if acc and path: + path_to_accession[path] = acc + + # Build digest -> accession via digest_map (join on path) + digest_to_accession = {} + with open(args.digest_map, newline="") as f: + for row in csv.DictReader(f): + digest = row.get("digest", "").strip() + path = row.get("path", "").strip() + if digest and path and path in path_to_accession: + digest_to_accession[digest] = path_to_accession[path] + + # Filter to accessions whose digest is in the target store + target_acc_to_digest = {} + for digest in target_digests: + acc = digest_to_accession.get(digest) + if acc: + target_acc_to_digest[acc] = digest + + print(f"Accessions in target with alias data: {len(target_acc_to_digest)}") + + # Read alias table, filtered to target accessions + acc_to_rows = defaultdict(list) + with open(args.alias_table, newline="") as f: + for row in csv.DictReader(f): + acc = row.get("accession", "").strip() + if acc and acc in target_acc_to_digest: + acc_to_rows[acc].append(row) + + common = sorted(target_acc_to_digest.keys() & acc_to_rows.keys()) + print(f"Accessions with alias table entries: {len(common)}") + + # Re-open as on_disk for writing aliases + store = RefgetStore.on_disk(args.target) + store.set_quiet(True) + + seq_aliases = {"refseq": [], "insdc": [], "ucsc": []} + coll_aliases = {"refseq": [], "insdc": []} + n_matched = 0 + n_unmatched = 0 + t_start = time.time() + 
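+    # For each accession shared by the target store and the alias table:
+    # collect collection-level aliases from the assembly accessions, then map
+    # each alias row to a sequence digest by matching sequence_name (falling
+    # back to refseq_accn, genbank_accn, ucsc_name) against the collection's
+    # level2 names, checking sequence_length when it is present.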
+ for i, accession in enumerate(common, 1): + coll_digest = target_acc_to_digest[accession] + alias_rows = acc_to_rows[accession] + + print(f"[{i}/{len(common)}] {accession} ({len(alias_rows)} seqs)...", end=" ", flush=True) + + # Collection-level aliases + first_row = alias_rows[0] + genbank_acc = first_row.get("genbank_assembly_accn", "").strip() + refseq_acc = first_row.get("refseq_assembly_accn", "").strip() + if refseq_acc: + coll_aliases["refseq"].append((refseq_acc, coll_digest)) + if genbank_acc: + coll_aliases["insdc"].append((genbank_acc, coll_digest)) + + # Sequence-level aliases via name matching in level2 + level2 = store.get_collection_level2(coll_digest) + names = level2.get("names", []) + lengths = level2.get("lengths", []) + sequences = level2.get("sequences", []) + name_to_info = {n: (s, int(l)) for n, l, s in zip(names, lengths, sequences)} + + matched_this = 0 + for row in alias_rows: + seq_name = row.get("sequence_name", "").strip() + seq_length_str = row.get("sequence_length", "").strip() + refseq_accn = row.get("refseq_accn", "").strip() + genbank_accn = row.get("genbank_accn", "").strip() + ucsc_name = row.get("ucsc_name", "").strip() + seq_length = int(seq_length_str) if seq_length_str else None + + seq_digest = None + for candidate in [seq_name, refseq_accn, genbank_accn, ucsc_name]: + if candidate and candidate in name_to_info: + sd, sl = name_to_info[candidate] + if seq_length is None or sl == seq_length: + seq_digest = sd + break + + if seq_digest is None: + n_unmatched += 1 + continue + + matched_this += 1 + if refseq_accn: + seq_aliases["refseq"].append((refseq_accn, seq_digest)) + if genbank_accn: + seq_aliases["insdc"].append((genbank_accn, seq_digest)) + if ucsc_name: + seq_aliases["ucsc"].append((ucsc_name, seq_digest)) + + n_matched += matched_this + print(f"{matched_this}/{len(alias_rows)} matched") + + elapsed = time.time() - t_start + n_seq = sum(len(v) for v in seq_aliases.values()) + n_coll = sum(len(v) for v in coll_aliases.values()) + print(f"\nMatching done in {elapsed:.1f}s") + print(f" Matched: {n_matched}, unmatched: {n_unmatched}") + print(f" Seq aliases: {n_seq}, coll aliases: {n_coll}") + + if args.dry_run: + print("\n[DRY RUN] Skipping registration.") + return + + print("\nRegistering aliases...") + with tempfile.TemporaryDirectory() as tmpdir: + for ns, pairs in seq_aliases.items(): + if not pairs: + continue + tsv = os.path.join(tmpdir, f"seq_{ns}.tsv") + with open(tsv, "w") as f: + for alias, digest in pairs: + f.write(f"{alias}\t{digest}\n") + n = store.load_sequence_aliases(ns, tsv) + print(f" sequences/{ns}: {n} aliases loaded") + + for ns, pairs in coll_aliases.items(): + if not pairs: + continue + tsv = os.path.join(tmpdir, f"coll_{ns}.tsv") + with open(tsv, "w") as f: + for alias, digest in pairs: + f.write(f"{alias}\t{digest}\n") + n = store.load_collection_aliases(ns, tsv) + print(f" collections/{ns}: {n} aliases loaded") + + print(f"\nDone! 
Store stats: {store.stats()}") + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.sbatch b/data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.sbatch new file mode 100644 index 0000000..c6c976d --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/backfill_sequence_aliases.sbatch @@ -0,0 +1,22 @@ +#!/bin/bash +#SBATCH --job-name=backfill_aliases +#SBATCH --output=backfill_aliases_%j.log +#SBATCH --error=backfill_aliases_%j.log +#SBATCH --partition=standard +#SBATCH --time=2:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=1 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd $HOME/code/ref-genome-analysis +source env/on-cluster.env + +# Backfill VGP store +python src/backfill_sequence_aliases.py --target $BRICK_ROOT/vgp_reference_store + +# Backfill ref store (if it exists) +if [ -d "$BRICK_ROOT/refgenome_jungle_store" ]; then + python src/backfill_sequence_aliases.py --target $BRICK_ROOT/refgenome_jungle_store +fi diff --git a/data_loaders/ref-genome-analysis/src/examples/test_20_genomes.py b/data_loaders/ref-genome-analysis/src/examples/test_20_genomes.py index 3b0eda7..3937965 100644 --- a/data_loaders/ref-genome-analysis/src/examples/test_20_genomes.py +++ b/data_loaders/ref-genome-analysis/src/examples/test_20_genomes.py @@ -20,7 +20,7 @@ INVENTORY_CSV = os.environ.get("INVENTORY_CSV", f"{BRICK_ROOT}/refgenomes_inventory.csv") STAGING = os.environ.get("STAGING", f"{BRICK_ROOT}/refget_staging") FHR_DIR = f"{STAGING}/fhr_metadata" -STORE_PATH = os.environ.get("STORE_PATH", "/scratch/ns5bc/test_refget_store_20") +STORE_PATH = os.environ.get("STORE_PATH", "/scratch/$USER/test_refget_store_20") def main(): diff --git a/data_loaders/ref-genome-analysis/src/push_to_s3.sh b/data_loaders/ref-genome-analysis/src/push_to_s3.sh new file mode 100644 index 0000000..920ccff --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/push_to_s3.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Push refget stores to S3 via Rivanna. +# +# Clears stale GPG socket, connects with agent forwarding, decrypts +# credentials, and runs aws s3 sync. +# +# Usage (from laptop): +# source env/remote-hpc.env +# bash src/push_to_s3.sh vgp +# bash src/push_to_s3.sh ref +# bash src/push_to_s3.sh pangenome +# bash src/push_to_s3.sh all +# bash src/push_to_s3.sh vgp --dry-run + +set -euo pipefail + +STORE=${1:-all} +DRYRUN_FLAG="${2:-}" + +: "${VGP_STORE_PATH:?Set VGP_STORE_PATH in env}" +: "${REF_STORE_PATH:?Set REF_STORE_PATH in env}" +: "${PANGENOME_STORE_PATH:?Set PANGENOME_STORE_PATH in env}" +: "${VGP_S3_PATH:?Set VGP_S3_PATH in env}" +: "${REF_S3_PATH:?Set REF_S3_PATH in env}" +: "${PANGENOME_S3_PATH:?Set PANGENOME_S3_PATH in env}" + +# Clear stale GPG socket, then connect with forwarding +ssh riva1 "rm -f /run/user/\$(id -u)/gnupg/S.gpg-agent" + +ssh riva1_gpg " + source /etc/profile.d/modules.sh + module load awscli + + export AWS_ACCESS_KEY_ID=\$(pass databio/refgenie/s3_access_key_id) + export AWS_SECRET_ACCESS_KEY=\$(pass databio/refgenie/s3_secret_access_key) + + if [ \"$STORE\" = \"vgp\" ] || [ \"$STORE\" = \"all\" ]; then + echo 'Pushing VGP store to $VGP_S3_PATH ...' + aws s3 sync '$VGP_STORE_PATH' '$VGP_S3_PATH' $DRYRUN_FLAG + echo 'VGP push complete.' + fi + + if [ \"$STORE\" = \"ref\" ] || [ \"$STORE\" = \"all\" ]; then + echo 'Pushing ref store to $REF_S3_PATH ...' + aws s3 sync '$REF_STORE_PATH' '$REF_S3_PATH' $DRYRUN_FLAG + echo 'Ref push complete.' 
+ fi + + if [ \"$STORE\" = \"pangenome\" ] || [ \"$STORE\" = \"all\" ]; then + echo 'Pushing pangenome store to $PANGENOME_S3_PATH ...' + aws s3 sync '$PANGENOME_STORE_PATH' '$PANGENOME_S3_PATH' $DRYRUN_FLAG + echo 'Pangenome push complete.' + fi + + echo 'Done!' +" diff --git a/data_loaders/ref-genome-analysis/src/validate_split_stores.py b/data_loaders/ref-genome-analysis/src/validate_split_stores.py new file mode 100644 index 0000000..085d7f4 --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/validate_split_stores.py @@ -0,0 +1,538 @@ +#!/usr/bin/env python3 +""" +Validate the VGP and ref genome stores produced by 90_split_store.py. + +Checks that the split stores are complete, internally consistent, and +that every collection from the source store ended up in exactly one +output store. + +Usage: + source env/on-cluster.env + python src/validate_split_stores.py # validate both + python src/validate_split_stores.py --store vgp # VGP only + python src/validate_split_stores.py --store ref # ref only + python src/validate_split_stores.py --thorough # deep checks (slow) +""" + +import argparse +import csv +import json +import os +import sys +import tempfile +import time + +BRICK_ROOT = os.environ["BRICK_ROOT"] +STAGING = os.environ.get("STAGING", os.path.join(BRICK_ROOT, "refget_staging")) +SOURCE_PATH = os.environ.get("STORE_PATH", os.path.join(BRICK_ROOT, "refget_store")) +VGP_PATH = os.path.join(BRICK_ROOT, "vgp_reference_store") +REF_PATH = os.path.join(BRICK_ROOT, "refgenome_jungle_store") +DIGEST_MAP = os.path.join(STAGING, "digest_map.csv") + +VGP_GROUPS = {"vertebrates"} + +results = [] + + +def check(name, passed, detail=""): + status = "PASS" if passed else "FAIL" + results.append({"name": name, "status": status, "detail": detail}) + print(f" [{'PASS' if passed else 'FAIL'}] {name}" + (f" -- {detail}" if detail else "")) + + +def load_digest_map(path): + """Return (group->set_of_digests, all_rows).""" + groups = {} + rows = [] + with open(path) as f: + for row in csv.DictReader(f): + rows.append(row) + digest = row.get("digest", "").strip() + group = row.get("group", "unknown").strip() + if digest: + groups.setdefault(group, set()).add(digest) + return groups, rows + + +def get_all_collection_digests(store): + """Paginate through list_collections to get all digests.""" + digests = set() + page = 0 + while True: + result = store.list_collections(page, 1000) + for c in result["results"]: + digests.add(c.digest) + if len(result["results"]) < 1000: + break + page += 1 + return digests + + +# ── Test 1: Store opens and basic stats ──────────────────────────────── + + +def test_store_opens(store_path, label): + """Verify store opens and has non-zero collections/sequences.""" + from refget.store import RefgetStore + + print(f"\n── {label}: Store opens and stats ──") + + try: + store = RefgetStore.open_local(store_path) + check(f"{label}_opens", True, f"path={store_path}") + except Exception as e: + check(f"{label}_opens", False, f"error={e}") + return None + + try: + stats = store.stats() + check(f"{label}_stats", True, f"stats={stats}") + except Exception as e: + check(f"{label}_stats", False, f"error={e}") + + digests = get_all_collection_digests(store) + check(f"{label}_has_collections", len(digests) > 0, f"n={len(digests)}") + + try: + seqs = store.list_sequences() + n_seqs = len(seqs) + check(f"{label}_has_sequences", n_seqs > 0, f"n={n_seqs}") + except Exception as e: + check(f"{label}_has_sequences", False, f"error={e}") + + return store + + +# ── Test 2: Collection counts 
match digest map ──────────────────────── + + +def test_collection_counts(store, label, expected_digests): + """Verify the store has exactly the expected collections.""" + print(f"\n── {label}: Collection count vs digest map ──") + + store_digests = get_all_collection_digests(store) + + check( + f"{label}_count_match", + len(store_digests) == len(expected_digests), + f"store={len(store_digests)}, expected={len(expected_digests)}", + ) + + missing = expected_digests - store_digests + extra = store_digests - expected_digests + + check( + f"{label}_no_missing", + len(missing) == 0, + f"missing={len(missing)}" + (f", sample={list(missing)[:3]}" if missing else ""), + ) + check( + f"{label}_no_extra", + len(extra) == 0, + f"extra={len(extra)}" + (f", sample={list(extra)[:3]}" if extra else ""), + ) + + return store_digests + + +# ── Test 3: Level2 integrity for all collections ────────────────────── + + +def test_level2_integrity(store, label, digests, limit=None): + """Verify level2 arrays are aligned and valid for every collection.""" + print(f"\n── {label}: Level2 data integrity ──") + + to_check = sorted(digests) + if limit: + to_check = to_check[:limit] + + ok_count = 0 + fail_count = 0 + fail_details = [] + + for digest in to_check: + try: + level2 = store.get_collection_level2(digest) + names = level2.get("names", []) + lengths = level2.get("lengths", []) + sequences = level2.get("sequences", []) + + arrays_aligned = len(names) == len(lengths) == len(sequences) and len(names) > 0 + lengths_positive = all(l > 0 for l in lengths) if lengths else False + seqs_nonempty = all(s and len(s) > 0 for s in sequences) if sequences else False + + if arrays_aligned and lengths_positive and seqs_nonempty: + ok_count += 1 + else: + fail_count += 1 + fail_details.append( + f"{digest[:16]}: names={len(names)} lengths={len(lengths)} " + f"seqs={len(sequences)} aligned={arrays_aligned} " + f"lengths_ok={lengths_positive} seqs_ok={seqs_nonempty}" + ) + except Exception as e: + fail_count += 1 + fail_details.append(f"{digest[:16]}: ERROR {e}") + + total = ok_count + fail_count + check( + f"{label}_level2_all_valid", + fail_count == 0, + f"ok={ok_count}/{total}" + (f", failures=[{'; '.join(fail_details[:5])}]" if fail_details else ""), + ) + + +# ── Test 4: Aliases were imported ───────────────────────────────────── + + +def test_aliases(store, label, digests): + """Check that alias namespaces exist and at least some collections have aliases.""" + print(f"\n── {label}: Alias integrity ──") + + # Check namespaces exist + try: + coll_ns = store.list_collection_alias_namespaces() + check(f"{label}_collection_alias_namespaces", len(coll_ns) > 0, f"namespaces={coll_ns}") + except Exception as e: + check(f"{label}_collection_alias_namespaces", False, f"error={e}") + coll_ns = [] + + try: + seq_ns = store.list_sequence_alias_namespaces() + check(f"{label}_sequence_alias_namespaces", len(seq_ns) > 0, f"namespaces={seq_ns}") + except Exception as e: + check(f"{label}_sequence_alias_namespaces", False, f"error={e}") + seq_ns = [] + + # Sample: check that some collections have aliases + sample = sorted(digests)[:20] + with_aliases = 0 + for digest in sample: + try: + aliases = store.get_aliases_for_collection(digest) + if aliases and len(aliases) > 0: + with_aliases += 1 + except Exception: + pass + + check( + f"{label}_collections_have_aliases", + with_aliases > 0, + f"with_aliases={with_aliases}/{len(sample)} (sampled)", + ) + + # For each namespace, count total aliases + for ns in coll_ns: + try: + aliases = 
store.list_collection_aliases(ns) + check(f"{label}_coll_alias_count_{ns}", len(aliases) > 0, f"n={len(aliases)}") + except Exception as e: + check(f"{label}_coll_alias_count_{ns}", False, f"error={e}") + + # Forward lookup: pick an alias and verify it resolves + for ns in coll_ns[:1]: # test first namespace + try: + aliases = store.list_collection_aliases(ns) + if aliases: + alias = aliases[0] + resolved = store.get_collection_by_alias(ns, alias) + check( + f"{label}_coll_alias_forward_lookup_{ns}", + resolved is not None, + f"alias={alias}, resolved={resolved.digest[:16] if resolved else None}", + ) + except Exception as e: + check(f"{label}_coll_alias_forward_lookup_{ns}", False, f"error={e}") + + # Sequence alias count proportionality check + for ns in seq_ns: + try: + aliases = store.list_sequence_aliases(ns) + n_aliases = len(aliases) if aliases else 0 + check(f"{label}_seq_alias_count_{ns}", n_aliases > 0, f"n={n_aliases}") + except Exception as e: + check(f"{label}_seq_alias_count_{ns}", False, f"error={e}") + + +# ── Test 5: FHR metadata was imported ───────────────────────────────── + + +def test_fhr_metadata(store, label, digests): + """Check that FHR metadata exists for collections.""" + print(f"\n── {label}: FHR metadata ──") + + try: + fhr_digests = store.list_fhr_metadata() + n_fhr = len(fhr_digests) + check(f"{label}_fhr_exists", n_fhr > 0, f"n_with_fhr={n_fhr}") + except Exception as e: + check(f"{label}_fhr_exists", False, f"error={e}") + return + + # Verify FHR digests are in this store + fhr_set = set(fhr_digests) + orphan_fhr = fhr_set - digests + check( + f"{label}_fhr_no_orphans", + len(orphan_fhr) == 0, + f"orphaned_fhr={len(orphan_fhr)}" + (f", sample={list(orphan_fhr)[:3]}" if orphan_fhr else ""), + ) + + # Sample: read a few FHR records + sample = list(fhr_set & digests)[:5] + readable = 0 + for digest in sample: + try: + fhr = store.get_fhr_metadata(digest) + if fhr is not None: + readable += 1 + except Exception: + pass + + check( + f"{label}_fhr_readable", + readable == len(sample), + f"readable={readable}/{len(sample)}", + ) + + +# ── Test 6: Sequence retrieval works ────────────────────────────────── + + +def test_sequence_retrieval(store, label, digests): + """Verify sequences can be retrieved for sampled collections.""" + print(f"\n── {label}: Sequence retrieval ──") + + sample = sorted(digests)[:5] + ok_count = 0 + fail_details = [] + + for coll_digest in sample: + try: + level2 = store.get_collection_level2(coll_digest) + seq_digests = level2.get("sequences", []) + lengths = level2.get("lengths", []) + if not seq_digests: + fail_details.append(f"{coll_digest[:16]}: no sequences") + continue + + # Test first sequence in collection + seq = store.get_sequence(seq_digests[0]) + if seq is not None: + ok_count += 1 + else: + fail_details.append(f"{coll_digest[:16]}: get_sequence returned None") + except Exception as e: + fail_details.append(f"{coll_digest[:16]}: {e}") + + check( + f"{label}_sequence_retrieval", + ok_count == len(sample), + f"ok={ok_count}/{len(sample)}" + (f", failures=[{'; '.join(fail_details[:3])}]" if fail_details else ""), + ) + + +# ── Test 7: No overlap between VGP and ref stores ──────────────────── + + +def test_no_overlap(vgp_store, ref_store): + """Verify no collection appears in both stores.""" + print("\n── Cross-store: No overlap ──") + + vgp_digests = get_all_collection_digests(vgp_store) + ref_digests = get_all_collection_digests(ref_store) + + overlap = vgp_digests & ref_digests + check( + "no_collection_overlap", + 
len(overlap) == 0, + f"overlap={len(overlap)}" + (f", sample={list(overlap)[:3]}" if overlap else ""), + ) + + +# ── Test 8: Full coverage — VGP + ref = source ─────────────────────── + + +def test_full_coverage(vgp_store, ref_store, source_store): + """Verify VGP + ref stores together contain all source collections.""" + print("\n── Cross-store: Full coverage ──") + + vgp_digests = get_all_collection_digests(vgp_store) + ref_digests = get_all_collection_digests(ref_store) + source_digests = get_all_collection_digests(source_store) + + combined = vgp_digests | ref_digests + missing = source_digests - combined + extra = combined - source_digests + + check( + "combined_equals_source", + len(missing) == 0 and len(extra) == 0, + f"source={len(source_digests)}, vgp={len(vgp_digests)}, ref={len(ref_digests)}, " + f"combined={len(combined)}, missing={len(missing)}, extra={len(extra)}", + ) + + +# ── Test 9: Roundtrip FASTA export ─────────────────────────────────── + + +def test_roundtrip_fasta(store, label, digests, limit=3): + """Export a few collections to FASTA and verify digest matches.""" + print(f"\n── {label}: Roundtrip FASTA export ──") + + try: + from gtars.refget import digest_fasta + except ImportError: + check(f"{label}_roundtrip", False, "gtars.refget.digest_fasta not available") + return + + sample = sorted(digests)[:limit] + ok_count = 0 + fail_details = [] + + for digest in sample: + fd, tmp_path = tempfile.mkstemp(suffix=".fa") + os.close(fd) + try: + store.export_fasta(digest, tmp_path, None, 80) + exported_sc = digest_fasta(tmp_path) + match = exported_sc.digest == digest + if match: + ok_count += 1 + else: + fail_details.append( + f"{digest[:16]}: exported={exported_sc.digest[:16]} != original" + ) + except Exception as e: + fail_details.append(f"{digest[:16]}: {e}") + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + check( + f"{label}_roundtrip_fasta", + ok_count == len(sample), + f"ok={ok_count}/{len(sample)}" + (f", failures=[{'; '.join(fail_details)}]" if fail_details else ""), + ) + + +# ── Main ────────────────────────────────────────────────────────────── + + +def validate_store(store_path, label, expected_digests, thorough=False): + """Run all single-store validations.""" + from refget.store import RefgetStore + + store = test_store_opens(store_path, label) + if store is None: + return None + + store_digests = test_collection_counts(store, label, expected_digests) + + # Level2: check all in thorough mode, sample otherwise + limit = None if thorough else 20 + test_level2_integrity(store, label, store_digests, limit=limit) + + test_aliases(store, label, store_digests) + test_fhr_metadata(store, label, store_digests) + test_sequence_retrieval(store, label, store_digests) + + if thorough: + test_roundtrip_fasta(store, label, store_digests, limit=5) + + return store + + +def main(): + parser = argparse.ArgumentParser(description="Validate split RefgetStores") + parser.add_argument( + "--store", + choices=["vgp", "ref", "both"], + default="both", + help="Which store to validate (default: both)", + ) + parser.add_argument( + "--thorough", + action="store_true", + help="Run deep checks: all level2, roundtrip FASTA (slow)", + ) + parser.add_argument("--vgp-path", default=VGP_PATH) + parser.add_argument("--ref-path", default=REF_PATH) + parser.add_argument("--source-path", default=SOURCE_PATH) + parser.add_argument("--digest-map", default=DIGEST_MAP) + args = parser.parse_args() + + print(f"Validating split stores") + print(f" Source: {args.source_path}") + 
print(f" VGP: {args.vgp_path}") + print(f" Ref: {args.ref_path}") + print(f" Digest map: {args.digest_map}") + print(f" Thorough: {args.thorough}") + print("=" * 60) + + t_start = time.time() + + # Load digest map to compute expected sets + group_digests, dm_rows = load_digest_map(args.digest_map) + vgp_expected = set() + ref_expected = set() + for group, digests in group_digests.items(): + if group in VGP_GROUPS: + vgp_expected |= digests + else: + ref_expected |= digests + + print(f"\nDigest map: {len(dm_rows)} rows, " + f"VGP expected={len(vgp_expected)}, ref expected={len(ref_expected)}") + + vgp_store = None + ref_store = None + + if args.store in ("vgp", "both"): + vgp_store = validate_store(args.vgp_path, "vgp", vgp_expected, args.thorough) + + if args.store in ("ref", "both"): + ref_store = validate_store(args.ref_path, "ref", ref_expected, args.thorough) + + # Cross-store checks (only if both stores validated) + if vgp_store and ref_store: + test_no_overlap(vgp_store, ref_store) + + # Full coverage against source + from refget.store import RefgetStore + if RefgetStore.store_exists(args.source_path): + source_store = RefgetStore.open_local(args.source_path) + test_full_coverage(vgp_store, ref_store, source_store) + else: + check("full_coverage", False, f"source store not found: {args.source_path}") + + # Summary + elapsed = time.time() - t_start + print(f"\n{'=' * 60}") + print("VALIDATION SUMMARY") + print("=" * 60) + passed = sum(1 for r in results if r["status"] == "PASS") + failed = sum(1 for r in results if r["status"] == "FAIL") + print(f"Passed: {passed}") + print(f"Failed: {failed}") + print(f"Total: {passed + failed}") + print(f"Time: {elapsed:.1f}s") + + if failed > 0: + print("\nFailed checks:") + for r in results: + if r["status"] == "FAIL": + print(f" - {r['name']}: {r['detail']}") + + # Write JSON report + report_path = os.path.join(STAGING, "split_validation_report.json") + os.makedirs(STAGING, exist_ok=True) + with open(report_path, "w") as f: + json.dump({"results": results, "passed": passed, "failed": failed}, f, indent=2) + print(f"\nJSON report: {report_path}") + + sys.exit(1 if failed > 0 else 0) + + +if __name__ == "__main__": + main() diff --git a/data_loaders/ref-genome-analysis/src/validate_split_stores.sbatch b/data_loaders/ref-genome-analysis/src/validate_split_stores.sbatch new file mode 100644 index 0000000..d9c3b4c --- /dev/null +++ b/data_loaders/ref-genome-analysis/src/validate_split_stores.sbatch @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=validate_split +#SBATCH --output=validate_split_%j.log +#SBATCH --error=validate_split_%j.log +#SBATCH --partition=standard +#SBATCH --time=2:00:00 +#SBATCH --mem=16G +#SBATCH --cpus-per-task=1 +#SBATCH --account=shefflab + +module load miniforge/24.3.0-py3.11 + +cd $HOME/code/ref-genome-analysis +source env/on-cluster.env + +python src/validate_split_stores.py "$@" diff --git a/deployment/seqcolapi-store/production.env b/deployment/seqcolapi-store/production.env index fa32c22..ee36462 100644 --- a/deployment/seqcolapi-store/production.env +++ b/deployment/seqcolapi-store/production.env @@ -1,2 +1,2 @@ -export REFGET_STORE_URL="s3://seqcolapi-store/refget/" +export REFGET_STORE_URL="https://refgenie.s3.us-east-1.amazonaws.com/refget-store/vgp/" export SERVER_ENV="production" diff --git a/deployment/seqcolapi-store/task_def.json b/deployment/seqcolapi-store/task_def.json index 4a3d22a..7e74531 100644 --- a/deployment/seqcolapi-store/task_def.json +++ b/deployment/seqcolapi-store/task_def.json @@ -20,7 +20,7 
@@ "environment": [ { "name": "REFGET_STORE_URL", - "value": "s3://seqcolapi-store/refget/" + "value": "https://refgenie.s3.us-east-1.amazonaws.com/refget-store/vgp/" } ], "resourceRequirements": null, diff --git a/deployment/store_demo/store_demo.env b/deployment/store_demo/store_demo.env new file mode 100644 index 0000000..05ac069 --- /dev/null +++ b/deployment/store_demo/store_demo.env @@ -0,0 +1,3 @@ +export REFGET_STORE_PATH="/tmp/refget_demo_store" +export SEQCOLAPI_PORT="8100" +export SERVER_ENV="dev" diff --git a/deployment/store_demo_up.sh b/deployment/store_demo_up.sh new file mode 100755 index 0000000..7e3bb24 --- /dev/null +++ b/deployment/store_demo_up.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# This script starts a local store-backed demo of the SeqCol API service + +# Use local source instead of installed package +export PYTHONPATH="$(pwd):$PYTHONPATH" + +# Function to handle cleanup on Ctrl+C +cleanup() { + echo "Stopping uvicorn (PID: $PID)..." + kill -15 $PID 2>/dev/null + wait $PID 2>/dev/null + echo "Uvicorn stopped." + if [ -n "$STORE_HTTP_PID" ]; then + echo "Stopping store HTTP server (PID: $STORE_HTTP_PID)..." + kill -15 $STORE_HTTP_PID 2>/dev/null + wait $STORE_HTTP_PID 2>/dev/null + fi + echo "Cleaning up demo store at $REFGET_STORE_PATH..." + rm -rf "$REFGET_STORE_PATH" + exit 0 +} + +# Load environment variables +source deployment/store_demo/store_demo.env + +echo "Building demo store from test FASTA files..." +python data_loaders/demo_build_store.py test_fasta "$REFGET_STORE_PATH" + +STORE_HTTP_PORT=8200 +echo "Starting HTTP file server for store on port $STORE_HTTP_PORT..." +STORE_DIR="$REFGET_STORE_PATH" STORE_PORT="$STORE_HTTP_PORT" python -c ' +import http.server, socketserver, os + +class CORSHandler(http.server.SimpleHTTPRequestHandler): + def end_headers(self): + self.send_header("Access-Control-Allow-Origin", "*") + super().end_headers() + def __init__(self, *args, **kwargs): + super().__init__(*args, directory=os.environ["STORE_DIR"], **kwargs) + +socketserver.TCPServer(("", int(os.environ["STORE_PORT"])), CORSHandler).serve_forever() +' & +STORE_HTTP_PID=$! +export REFGET_STORE_HTTP_URL="http://localhost:$STORE_HTTP_PORT" + +echo "Running store-backed uvicorn API service..." +uvicorn seqcolapi.main:store_app --reload --port ${SEQCOLAPI_PORT:-8100} & +PID=$! + +echo "" +echo "Store-backed seqcolapi is running at http://localhost:${SEQCOLAPI_PORT:-8100}" +echo " API docs: http://localhost:${SEQCOLAPI_PORT:-8100}/docs" +echo " Service info: http://localhost:${SEQCOLAPI_PORT:-8100}/service-info" +echo " Store files: $REFGET_STORE_HTTP_URL" +echo "" + +# Set up cleanup on Ctrl+C +trap cleanup SIGINT EXIT + +# Wait indefinitely until Ctrl+C is pressed +wait $PID diff --git a/frontend/src/components/APINav.jsx b/frontend/src/components/APINav.jsx new file mode 100644 index 0000000..5ca752f --- /dev/null +++ b/frontend/src/components/APINav.jsx @@ -0,0 +1,50 @@ +import { Link } from 'react-router-dom'; +import { useApiExplorerStore } from '../stores/apiExplorerStore.js'; + +const APINav = ({ active }) => { + const { apiUrl } = useApiExplorerStore(); + const urlParam = apiUrl ? `?url=${encodeURIComponent(apiUrl)}` : ''; + + const items = [ + { key: 'collections', label: 'Collections', path: '/explore-api/collections', icon: 'bi-collection' }, + { key: 'compare', label: 'Compare (SCIM)', path: '/explore-api/compare', icon: 'bi-arrows-angle-contract' }, + ]; + + return ( +
    +
    +

    + + API Explorer +

    + + + Change API + +
    + + {apiUrl && ( +
    + + {apiUrl} +
    + )} + +
      + {items.map((item) => ( +
    • + + + {item.label} + +
    • + ))} +
    +
    + ); +}; + +export { APINav }; diff --git a/frontend/src/components/CompareTable.jsx b/frontend/src/components/CompareTable.jsx index 4b349b3..0ad9ca4 100644 --- a/frontend/src/components/CompareTable.jsx +++ b/frontend/src/components/CompareTable.jsx @@ -36,7 +36,7 @@ const CompareTable = ({ seqColDict }) => { '=' ) : ( { + const [copied, setCopied] = useState(false); + const handleCopy = (e) => { + e.stopPropagation(); + navigator.clipboard.writeText(value).then(() => { + setCopied(true); + setTimeout(() => setCopied(false), 1500); + }); + }; + return ( + + {value} + + + ); +}; + +export { CopyableDigest }; diff --git a/frontend/src/components/ExplorerNav.jsx b/frontend/src/components/ExplorerNav.jsx new file mode 100644 index 0000000..6562691 --- /dev/null +++ b/frontend/src/components/ExplorerNav.jsx @@ -0,0 +1,37 @@ +import { Link } from 'react-router-dom'; +import { useUnifiedStore } from '../stores/unifiedStore.js'; + +const ExplorerNav = ({ active }) => { + const { hasStore, hasAPI } = useUnifiedStore(); + + const items = [ + { key: 'collections', label: 'Collections', path: '/collections', icon: 'bi-collection' }, + { key: 'sequences', label: 'Sequences', path: '/sequences', icon: 'bi-list-ol', requireStore: true }, + { key: 'aliases', label: 'Aliases', path: '/aliases', icon: 'bi-tag', requireStore: true }, + { key: 'compare', label: 'Compare', path: '/compare', icon: 'bi-arrows-angle-contract', requireAPI: true }, + ]; + + const visibleItems = items.filter((item) => { + if (item.requireStore && !hasStore) return false; + if (item.requireAPI && !hasAPI) return false; + return true; + }); + + return ( +
      + {visibleItems.map((item) => ( +
    • + + + {item.label} + +
    • + ))} +
    + ); +}; + +export { ExplorerNav }; diff --git a/frontend/src/components/SequenceTable.jsx b/frontend/src/components/SequenceTable.jsx new file mode 100644 index 0000000..1df6700 --- /dev/null +++ b/frontend/src/components/SequenceTable.jsx @@ -0,0 +1,188 @@ +import { useState, useMemo } from 'react'; +import { CopyableDigest } from './CopyableDigest.jsx'; +import { CliCommand } from './CliSnippet.jsx'; + +const PAGE_SIZE = 50; + +/** + * Paginated sequence table with detail modal. + * + * Props: + * sequences: array of {name, length, sha512t24u, md5, alphabet, description} + * storeUrl: optional store URL for code snippets in modal + * sortable: if true, column headers are clickable to sort + */ +const SequenceTable = ({ sequences, storeUrl, sortable = false }) => { + const [page, setPage] = useState(0); + const [selectedSeq, setSelectedSeq] = useState(null); + const [codeTab, setCodeTab] = useState('cli'); + const [sortCol, setSortCol] = useState(null); + const [sortAsc, setSortAsc] = useState(true); + + const handleSort = (col) => { + if (!sortable) return; + if (sortCol === col) setSortAsc(!sortAsc); + else { setSortCol(col); setSortAsc(true); } + setPage(0); + }; + + const sorted = useMemo(() => { + if (!sortable || !sortCol) return sequences; + return [...sequences].sort((a, b) => { + const va = a[sortCol]; + const vb = b[sortCol]; + if (typeof va === 'number' && typeof vb === 'number') + return sortAsc ? va - vb : vb - va; + return sortAsc + ? String(va).localeCompare(String(vb)) + : String(vb).localeCompare(String(va)); + }); + }, [sequences, sortCol, sortAsc, sortable]); + + const totalPages = Math.ceil(sorted.length / PAGE_SIZE); + const paged = sorted.slice(page * PAGE_SIZE, (page + 1) * PAGE_SIZE); + + const SortIcon = ({ col }) => { + if (!sortable || sortCol !== col) return null; + return ; + }; + + const thStyle = sortable ? { cursor: 'pointer' } : {}; + + return ( + <> +
    + + + + + + + + + + + {paged.map((seq, i) => ( + + + + + + + ))} + +
    handleSort('name')}> + Name + handleSort('length')}> + Length + handleSort('sha512t24u')}> + SHA-512/24u +
    {seq.name}{seq.length.toLocaleString()} + +
    +
    + + {totalPages > 1 && ( +
    + +
    + )} + + {/* Sequence detail modal */} + {selectedSeq && ( + <> +
    setSelectedSeq(null)} /> +
    setSelectedSeq(null)}> +
    e.stopPropagation()}> +
    +
    +
    {selectedSeq.name}
    +
    +
    + + + + + + + + + + + + + + + + + + + {selectedSeq.description && ( + + + + + )} + +
    Length{selectedSeq.length.toLocaleString()}
    Alphabet{selectedSeq.alphabet}
    SHA-512/24u
    MD5
    Description{selectedSeq.description}
    + {storeUrl && ( + <> +
    Code
    +
      +
    • + +
    • +
    • + +
    • +
    + Get sequence + + + )} +
    +
    +
    +
    + + )} + + ); +}; + +export { SequenceTable }; diff --git a/frontend/src/components/StoreNav.jsx b/frontend/src/components/StoreNav.jsx index f24d50f..1ba2cd5 100644 --- a/frontend/src/components/StoreNav.jsx +++ b/frontend/src/components/StoreNav.jsx @@ -11,9 +11,9 @@ const StoreNav = ({ active, storeUrlParam, collectionDigest }) => { const remote = storeUrl || new URLSearchParams(storeUrlParam).get('url') || ''; const items = [ - { key: 'overview', label: 'Overview', path: '/explore/store', icon: 'bi-house' }, - { key: 'sequences', label: 'Sequences', path: '/explore/store/sequences', icon: 'bi-list-ol' }, - { key: 'aliases', label: 'Aliases', path: '/explore/store/aliases', icon: 'bi-tag' }, + { key: 'overview', label: 'Overview', path: '/explore-store/overview', icon: 'bi-house' }, + { key: 'sequences', label: 'Sequences', path: '/explore-store/sequences', icon: 'bi-list-ol' }, + { key: 'aliases', label: 'Aliases', path: '/explore-store/aliases', icon: 'bi-tag' }, ]; const snippetGroups = [ @@ -137,7 +137,7 @@ store.chrom_sizes("${collectionDigest}")`, Code - + Change Store diff --git a/frontend/src/main.jsx b/frontend/src/main.jsx index 5a6f613..db616d1 100644 --- a/frontend/src/main.jsx +++ b/frontend/src/main.jsx @@ -11,27 +11,43 @@ import 'bootstrap/dist/css/bootstrap.css'; import 'bootstrap/dist/js/bootstrap.bundle.js'; import 'bootstrap-icons/font/bootstrap-icons.css'; -import { CollectionView } from './pages/CollectionView.jsx'; +import { useUnifiedStore } from './stores/unifiedStore.js'; + +// Unified Explorer pages +import { LandingPage } from './pages/LandingPage.jsx'; +import { Explorer } from './pages/Explorer.jsx'; +import { ExplorerCollection } from './pages/ExplorerCollection.jsx'; +import { ExplorerSequences } from './pages/ExplorerSequences.jsx'; +import { ExplorerAliases } from './pages/ExplorerAliases.jsx'; + +// API Explorer pages +import { APIExplorer } from './pages/APIExplorer.jsx'; +import { APICollections } from './pages/APICollections.jsx'; +import { APICollectionView } from './pages/APICollectionView.jsx'; +import { APICompare } from './pages/APICompare.jsx'; +import { APICompliance } from './pages/APICompliance.jsx'; + +// Store Explorer pages +import { StoreExplorer } from './pages/StoreExplorer.jsx'; +import { StoreOverview } from './pages/StoreOverview.jsx'; +import { StoreSequences } from './pages/StoreSequences.jsx'; +import { StoreCollection } from './pages/StoreCollection.jsx'; +import { StoreAliases } from './pages/StoreAliases.jsx'; + +// Site-specific pages import { PangenomeView } from './pages/PangenomeView.jsx'; import { AttributeView } from './pages/AttributeView.jsx'; import { DemoPage } from './pages/DemoPage.jsx'; import { SCIM } from './pages/SCIM.jsx'; import { SCOM } from './pages/SCOM.jsx'; -import { HomePage } from './pages/HomePage.jsx'; import { HPRCGenomes } from './pages/HPRCGenomes.jsx'; import { HumanReferencesView } from './pages/HumanReferences.jsx'; import { DigestPage } from './pages/DigestPage.jsx'; import { CompliancePage } from './pages/CompliancePage.jsx'; -import { StoreExplorer } from './pages/StoreExplorer.jsx'; -import { StoreOverview } from './pages/StoreOverview.jsx'; -import { StoreSequences } from './pages/StoreSequences.jsx'; -import { StoreCollection } from './pages/StoreCollection.jsx'; -import { StoreAliases } from './pages/StoreAliases.jsx'; import { fetchServiceInfo, fetchPangenomeLevels, - fetchSeqColList, fetchAllSeqCols, fetchCollectionLevels, fetchComparison, @@ -52,9 +68,38 @@ import { import { 
API_BASE } from './utilities.jsx'; +const NavItem = ({ path, label, location, navigate, isDropdown }) => { + const active = path === '/' + ? location === '' + : location.startsWith(path.substring(1)); + + return ( +
  • + navigate(path)} + className={`nav-link cursor-pointer ${active ? 'fw-medium text-black' : 'fw-light'}`} + > + {label} + +
  • + ); +}; + const Nav = () => { const navigate = useNavigate(); const location = useLocation().pathname.substring(1) || ''; + const { serviceInfo } = useUnifiedStore(); + const scomEnabled = serviceInfo?.seqcol?.scom?.enabled; + + const navTo = (path) => { + navigate(path); + // Close any open Bootstrap dropdown + document.querySelectorAll('.dropdown-menu.show').forEach((el) => { + el.classList.remove('show'); + el.previousElementSibling?.classList.remove('show'); + el.previousElementSibling?.setAttribute('aria-expanded', 'false'); + }); + }; return (
    @@ -212,35 +247,47 @@ class ReactErrorBoundary extends React.Component { const App = () => { const loaderData = useLoaderData(); + const apiAvailable = loaderData != null; + const version = loaderData?.version; + return ( <>