From a70e456318bd1752a8cb6f5decec1775bffd4767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20L=C3=B3pez=20L=C3=B3pez?= Date: Tue, 24 Mar 2026 12:23:58 +0100 Subject: [PATCH 1/8] feat: add SampleCarrier model --- src/afquery/models.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/afquery/models.py b/src/afquery/models.py index 4b7cf3a..12002b3 100644 --- a/src/afquery/models.py +++ b/src/afquery/models.py @@ -73,3 +73,15 @@ class QueryResult: N_HOM_ALT: int = 0 N_HOM_REF: int = 0 N_FAIL: int = 0 + + +@dataclass +class SampleCarrier: + """A sample carrying a given variant, with its associated metadata.""" + sample_id: int + sample_name: str + sex: str # 'male' | 'female' + tech_name: str + phenotypes: list[str] + genotype: str # 'het' | 'hom' | 'alt' (FILTER≠PASS, ploidy unknown) + filter_pass: bool | None # None = v1 DB (no fail_bitmap) From 15d10c354ae5c7050f48a04416acd4ef47617b7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20L=C3=B3pez=20L=C3=B3pez?= Date: Tue, 24 Mar 2026 12:33:24 +0100 Subject: [PATCH 2/8] feat: add variant_info engine method with phenotype cache --- src/afquery/models.py | 2 +- src/afquery/query.py | 115 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 115 insertions(+), 2 deletions(-) diff --git a/src/afquery/models.py b/src/afquery/models.py index 12002b3..996a5ac 100644 --- a/src/afquery/models.py +++ b/src/afquery/models.py @@ -84,4 +84,4 @@ class SampleCarrier: tech_name: str phenotypes: list[str] genotype: str # 'het' | 'hom' | 'alt' (FILTER≠PASS, ploidy unknown) - filter_pass: bool | None # None = v1 DB (no fail_bitmap) + filter_pass: bool # False = FILTER≠PASS diff --git a/src/afquery/query.py b/src/afquery/query.py index 1f1dd76..72a5a41 100644 --- a/src/afquery/query.py +++ b/src/afquery/query.py @@ -9,7 +9,7 @@ from .bitmaps import deserialize from .capture import CaptureIndex, load_capture_indices from .constants import normalize_chrom, ALL_CHROMS, CHROM_ORDER -from .models import AfqueryWarning, QueryParams, QueryResult, VariantKey, Sample, Technology, SampleFilter +from .models import AfqueryWarning, QueryParams, QueryResult, SampleCarrier, VariantKey, Sample, Technology, SampleFilter from .ploidy import compute_AN, split_ploidy BATCH_IN_THRESHOLD = 10_000 @@ -43,9 +43,18 @@ def __init__(self, db_path: str): "SELECT tech_id, tech_name, bed_path FROM technologies" ).fetchall() ] + + # Phenotype cache: sample_id → list of phenotype codes + self._sample_phenotypes: dict[int, list[str]] = {} + for row in con.execute( + "SELECT sample_id, phenotype_code FROM sample_phenotype" + ).fetchall(): + self._sample_phenotypes.setdefault(row[0], []).append(row[1]) + con.close() self._samples = samples + self._samples_by_id: dict[int, Sample] = {s.sample_id: s for s in samples} self._tech_map = {t.tech_id: t for t in techs} sex_bms = self._bitmaps.get("sex", {}) self._male_bm = sex_bms.get("male", BitMap()) @@ -549,3 +558,107 @@ def query_region( sample_bm = self._build_sample_bitmap(sf) return self._query_region_inner(chrom, start, end, sample_bm) + + def variant_info(self, params: QueryParams) -> list[SampleCarrier]: + """Return all samples carrying the variant at params.chrom:params.pos. + + Each returned :class:`~afquery.models.SampleCarrier` includes the + sample's name, sex, technology, phenotype codes, genotype (het/hom/alt), + and FILTER status. ``filter_pass=None`` is returned for v1 databases + that do not store a ``fail_bitmap``. + + Args: + params: Query parameters (chrom, pos, optional ref/alt, sample filter). + + Returns: + List of :class:`~afquery.models.SampleCarrier`, sorted by + ``sample_id``. Empty if the variant is absent or no eligible + carrier exists. + """ + chrom = normalize_chrom(params.chrom) + pos = params.pos + + if chrom not in self._all_known_chroms: + warnings.warn( + f"Chromosome {chrom!r} has no data in this database. " + f"Available: {sorted(self._all_known_chroms)!r}", + AfqueryWarning, stacklevel=3, + ) + return [] + + sample_bm = self._build_sample_bitmap(params.filter) + parquet_path = self._parquet_path(chrom, pos) + if parquet_path is None: + return [] + + con = duckdb.connect(config={"temp_directory": "/tmp"}) + rows = con.execute( + "SELECT pos, ref, alt, het_bitmap, hom_bitmap, fail_bitmap" + " FROM read_parquet(?) WHERE pos = ?", + [str(parquet_path), pos], + ).fetchall() + con.close() + + if not rows: + return [] + + if params.ref is not None: + rows = [r for r in rows if r[1] == params.ref] + if params.alt is not None: + rows = [r for r in rows if r[2] == params.alt] + if not rows: + return [] + + # Warn if multiple alleles found without explicit ref/alt filter + if len(rows) > 1 and params.ref is None and params.alt is None: + alts = ", ".join(r[2] for r in rows) + warnings.warn( + f"Multiple alleles found at {chrom}:{pos} ({alts}). " + "Use --ref/--alt to restrict to a specific variant.", + AfqueryWarning, stacklevel=3, + ) + + carriers: list[SampleCarrier] = [] + for row in rows: + row_pos, ref, alt, het_bytes, hom_bytes, fail_bytes = row + het_bm = deserialize(bytes(het_bytes)) + hom_bm = deserialize(bytes(hom_bytes)) + fail_bm = deserialize(bytes(fail_bytes)) + het_elig = het_bm & sample_bm + hom_elig = hom_bm & sample_bm + fail_elig = fail_bm & sample_bm + + seen: set[int] = set() + for sid in sorted(hom_elig): + seen.add(sid) + carriers.append(self._make_carrier(sid, "hom", True)) + for sid in sorted(het_elig): + if sid not in seen: + seen.add(sid) + carriers.append(self._make_carrier(sid, "het", True)) + for sid in sorted(fail_elig): + if sid not in seen: + seen.add(sid) + carriers.append(self._make_carrier(sid, "alt", False)) + + carriers.sort(key=lambda c: c.sample_id) + return carriers + + def _make_carrier( + self, + sample_id: int, + genotype: str, + filter_pass: bool, + ) -> SampleCarrier: + s = self._samples_by_id[sample_id] + tech_name = self._tech_map[s.tech_id].tech_name + phenotypes = self._sample_phenotypes.get(sample_id, []) + return SampleCarrier( + sample_id=sample_id, + sample_name=s.sample_name, + sex=s.sex, + tech_name=tech_name, + phenotypes=sorted(phenotypes), + genotype=genotype, + filter_pass=filter_pass, + ) From 7143dcae20a325d8f62f9bf21e510af327e5903c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20L=C3=B3pez=20L=C3=B3pez?= Date: Tue, 24 Mar 2026 12:34:46 +0100 Subject: [PATCH 3/8] feat: expose variant_info in Database public API --- src/afquery/__init__.py | 5 ++-- src/afquery/database.py | 36 ++++++++++++++++++++++++++- src/afquery/variant_info.py | 49 +++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 src/afquery/variant_info.py diff --git a/src/afquery/__init__.py b/src/afquery/__init__.py index 137c149..0103038 100644 --- a/src/afquery/__init__.py +++ b/src/afquery/__init__.py @@ -1,5 +1,6 @@ from afquery._version import __version__ from .database import Database -from .models import QueryResult, AfqueryWarning +from .models import QueryResult, SampleCarrier, AfqueryWarning +from .variant_info import variant_info -__all__ = ["Database", "QueryResult", "AfqueryWarning", "__version__"] +__all__ = ["Database", "QueryResult", "SampleCarrier", "AfqueryWarning", "variant_info", "__version__"] diff --git a/src/afquery/database.py b/src/afquery/database.py index dbd3e99..12f2c95 100644 --- a/src/afquery/database.py +++ b/src/afquery/database.py @@ -2,7 +2,7 @@ import sqlite3 from pathlib import Path -from .models import QueryParams, QueryResult, SampleFilter +from .models import QueryParams, QueryResult, SampleCarrier, SampleFilter from .query import QueryEngine @@ -86,6 +86,40 @@ def query_region( sf = self._make_filter(phenotype, sex, tech) return self._engine.query_region(chrom, start, end, sf) + def variant_info( + self, + chrom: str, + pos: int, + ref: str | None = None, + alt: str | None = None, + phenotype: list[str] | None = None, + sex: str = "both", + tech: list[str] | None = None, + ) -> list[SampleCarrier]: + """Return all samples carrying a variant, with their metadata. + + Each element includes sample name, sex, technology, phenotype codes, + genotype (``"het"``, ``"hom"``, or ``"alt"`` for FILTER≠PASS samples), + and FILTER status. + + Args: + chrom: Chromosome (e.g. ``"chr1"`` or ``"1"``). + pos: Position, 1-based. + ref: Reference allele filter. If omitted, all alleles at *pos* are returned. + alt: Alternate allele filter. If omitted, all alleles at *pos* are returned. + phenotype: Phenotype filter tokens. Use ``"^CODE"`` prefix to exclude. + sex: ``"both"`` (default), ``"male"``, or ``"female"``. + tech: Technology filter tokens. Use ``"^TECH"`` prefix to exclude. + + Returns: + List of :class:`~afquery.models.SampleCarrier` sorted by + ``sample_id``. Empty list if the variant is absent or no eligible + carrier exists. + """ + sf = self._make_filter(phenotype, sex, tech) + params = QueryParams(chrom=chrom, pos=pos, filter=sf, ref=ref, alt=alt) + return self._engine.variant_info(params) + def query_region_multi( self, regions: list[tuple[str, int, int]], diff --git a/src/afquery/variant_info.py b/src/afquery/variant_info.py new file mode 100644 index 0000000..b07fd0a --- /dev/null +++ b/src/afquery/variant_info.py @@ -0,0 +1,49 @@ +"""variant_info — retrieve sample carriers for a specific variant. + +Thin convenience wrapper around :class:`~afquery.database.Database` for +programmatic use without instantiating the class directly. +""" +from .database import Database +from .models import SampleCarrier + + +def variant_info( + db_path: str, + chrom: str, + pos: int, + ref: str | None = None, + alt: str | None = None, + phenotype: list[str] | None = None, + sex: str = "both", + tech: list[str] | None = None, +) -> list[SampleCarrier]: + """Return all samples carrying the given variant, with their metadata. + + Args: + db_path: Path to the AFQuery database directory. + chrom: Chromosome (e.g. ``"chr1"`` or ``"1"``). + pos: Position, 1-based. + ref: Reference allele filter. If omitted, all alleles at *pos* are returned. + alt: Alternate allele filter. If omitted, all alleles at *pos* are returned. + phenotype: Phenotype filter tokens. Use ``"^CODE"`` prefix to exclude. + sex: ``"both"`` (default), ``"male"``, or ``"female"``. + tech: Technology filter tokens. Use ``"^TECH"`` prefix to exclude. + + Returns: + List of :class:`~afquery.models.SampleCarrier` sorted by + ``sample_id``. Empty list if the variant is absent or no eligible + carrier exists. + """ + db = Database(db_path) + return db.variant_info( + chrom=chrom, + pos=pos, + ref=ref, + alt=alt, + phenotype=phenotype, + sex=sex, + tech=tech, + ) + + +__all__ = ["variant_info"] From 402664f5bce2feccc6750a4a8d696271f6f23c12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20L=C3=B3pez=20L=C3=B3pez?= Date: Tue, 24 Mar 2026 12:37:14 +0100 Subject: [PATCH 4/8] feat: add variant-info CLI command --- src/afquery/cli.py | 94 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/src/afquery/cli.py b/src/afquery/cli.py index e13212b..97e978e 100644 --- a/src/afquery/cli.py +++ b/src/afquery/cli.py @@ -109,6 +109,57 @@ def _print_results(results, fmt: str) -> None: ) +def _print_carriers(carriers, variant_key, fmt: str) -> None: + """Print variant_info results in the requested format.""" + if fmt == "json": + out = { + "variant": { + "chrom": variant_key[0], "pos": variant_key[1], + "ref": variant_key[2], "alt": variant_key[3], + }, + "samples": [ + { + "sample_id": c.sample_id, + "sample_name": c.sample_name, + "sex": c.sex, + "tech": c.tech_name, + "phenotypes": c.phenotypes, + "genotype": c.genotype, + "filter": "PASS" if c.filter_pass else "FAIL", + } + for c in carriers + ], + } + click.echo(json.dumps(out, indent=2)) + elif fmt == "tsv": + click.echo("sample_id\tsample_name\tsex\ttech\tphenotypes\tgenotype\tfilter") + for c in carriers: + click.echo( + f"{c.sample_id}\t{c.sample_name}\t{c.sex}\t{c.tech_name}\t" + f"{','.join(c.phenotypes)}\t{c.genotype}\t" + f"{'PASS' if c.filter_pass else 'FAIL'}" + ) + else: # text + if not carriers: + click.echo("No carriers found for the given filters.") + return + headers = ["sample_id", "sample_name", "sex", "tech", "phenotypes", "genotype", "filter"] + rows = [ + [ + str(c.sample_id), c.sample_name, c.sex, c.tech_name, + ",".join(c.phenotypes) if c.phenotypes else ".", + c.genotype, "PASS" if c.filter_pass else "FAIL", + ] + for c in carriers + ] + widths = [max(len(h), max((len(r[i]) for r in rows), default=0)) for i, h in enumerate(headers)] + fmt_row = " ".join(f"{{:<{w}}}" for w in widths) + click.echo(fmt_row.format(*headers)) + click.echo(" ".join("-" * w for w in widths)) + for row in rows: + click.echo(fmt_row.format(*row)) + + @click.group() def cli(): """AFQuery: bitmap-indexed allele frequency engine for local genomic cohorts. @@ -116,7 +167,7 @@ def cli(): Enables fast AC/AN/AF queries on user-defined subcohorts (phenotype, sex, technology) without rescanning VCFs. - Commands: query, annotate, dump, info, version, create-db, update-db, check, benchmark + Commands: query, variant-info, annotate, dump, info, version, create-db, update-db, check, benchmark """ @@ -178,6 +229,47 @@ def query(db, locus, region, from_file, phenotype, sex, ref, alt, tech, fmt, no_ _print_results(results, fmt) +@cli.command("variant-info") +@click.option("--db", required=True, help="Path to database directory.") +@click.option("--locus", required=True, help="Position as CHROM:POS (e.g., chr1:925952).") +@click.option("--ref", default=None, help="Filter to specific reference allele.") +@click.option("--alt", default=None, help="Filter to specific alternate allele.") +@click.option("--phenotype", multiple=True, help="Phenotype to include. Repeatable; comma-separated or multiple flags. Use ^ prefix to exclude. (default: include all samples)") +@click.option("--sex", default="both", type=click.Choice(["male", "female", "both"]), help="Restrict to specified sex. Options: male, female, both. (default: both)") +@click.option("--tech", multiple=True, help="Technology filter. Repeatable; comma-separated or multiple flags. Use ^ prefix to exclude. (default: include all samples)") +@click.option("--format", "fmt", default="text", type=click.Choice(["text", "json", "tsv"]), help="Output format. Options: text, json, tsv. (default: text)") +@click.option("--no-warn", is_flag=True, default=False, help="Suppress AfqueryWarning messages.") +def variant_info_cmd(db, locus, ref, alt, phenotype, sex, tech, fmt, no_warn): + """List samples carrying a specific variant, with their metadata. + + Returns one row per carrier sample with genotype (het/hom/alt) and FILTER + status (PASS/FAIL). Use --ref/--alt to restrict to a specific allele when + multiple alleles share the same position. + + \b + Examples: + afquery variant-info --db ./db --locus chr1:925952 + afquery variant-info --db ./db --locus chr1:925952 --ref A --alt T + afquery variant-info --db ./db --locus chrX:2700000 --sex female --format tsv + """ + if no_warn: + import warnings + from .models import AfqueryWarning + warnings.filterwarnings("ignore", category=AfqueryWarning) + + chrom, pos = _parse_locus(locus) + database = Database(db) + carriers = database.variant_info( + chrom=chrom, pos=pos, + ref=ref, alt=alt, + phenotype=_expand_tokens(phenotype), sex=sex, + tech=_expand_tokens(tech), + ) + + _variant_key = (chrom, pos, ref or ".", alt or ".") + _print_carriers(carriers, _variant_key, fmt) + + @cli.command() @click.option("--db", required=True, help="Path to database directory.") @click.option("--input", "input_vcf", required=True, help="Input VCF file (plain or .gz).") From 8ed6b4a53cec3ac5f21c1b9112b9b4a29c1e6249 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20L=C3=B3pez=20L=C3=B3pez?= Date: Tue, 24 Mar 2026 12:40:38 +0100 Subject: [PATCH 5/8] test: add variant_info tests --- src/afquery/query.py | 6 +- tests/test_variant_info.py | 358 +++++++++++++++++++++++++++++++++++++ 2 files changed, 362 insertions(+), 2 deletions(-) create mode 100644 tests/test_variant_info.py diff --git a/src/afquery/query.py b/src/afquery/query.py index 72a5a41..086d287 100644 --- a/src/afquery/query.py +++ b/src/afquery/query.py @@ -631,11 +631,13 @@ def variant_info(self, params: QueryParams) -> list[SampleCarrier]: seen: set[int] = set() for sid in sorted(hom_elig): seen.add(sid) - carriers.append(self._make_carrier(sid, "hom", True)) + is_fail = sid in fail_elig + carriers.append(self._make_carrier(sid, "alt" if is_fail else "hom", not is_fail)) for sid in sorted(het_elig): if sid not in seen: seen.add(sid) - carriers.append(self._make_carrier(sid, "het", True)) + is_fail = sid in fail_elig + carriers.append(self._make_carrier(sid, "alt" if is_fail else "het", not is_fail)) for sid in sorted(fail_elig): if sid not in seen: seen.add(sid) diff --git a/tests/test_variant_info.py b/tests/test_variant_info.py new file mode 100644 index 0000000..f215791 --- /dev/null +++ b/tests/test_variant_info.py @@ -0,0 +1,358 @@ +"""Tests for variant_info command: sample carrier retrieval with metadata.""" +import json +import warnings + +import pytest +from click.testing import CliRunner + +from afquery import Database, SampleCarrier +from afquery.cli import cli +from afquery.models import AfqueryWarning + +# --------------------------------------------------------------------------- +# Reference data from conftest.py +# --------------------------------------------------------------------------- +# VARIANTS = [ +# ("chr1", 1500, "A", "T", het=[0,5], hom=[2], fail=[0]), # S00 in both het+fail +# ("chr1", 3500, "G", "C", het=[7], hom=[], fail=[]), +# ("chr1", 5000, "T", "G", het=[0,1], hom=[3], fail=[1,3]), +# ("chrX", 5000000, "A", "G", het=[0,2], hom=[], fail=[]), +# ("chrY", 500000, "T", "C", het=[0,1], hom=[4], fail=[]), +# ("chrM", 100, "C", "A", het=[0,2,5], hom=[], fail=[5]), +# ] +# SAMPLES (id, name, sex, tech_id): +# 0=S00/male/WGS, 1=S01/male/WGS, 2=S02/female/WGS, 3=S03/female/WGS, +# 4=S04/male/WES_kit_A, 5=S05/female/WES_kit_A, 6=S06/female/WES_kit_A, +# 7=S07/male/WES_kit_B, 8=S08/male/WES_kit_B, 9=S09/female/WES_kit_B +# --------------------------------------------------------------------------- + + +# --------------------------------------------------------------------------- +# 1. Basic retrieval +# --------------------------------------------------------------------------- + +def test_basic_returns_carriers(test_db): + db = Database(test_db) + carriers = db.variant_info("chr1", 3500, ref="G", alt="C") + assert len(carriers) == 1 + assert carriers[0].sample_id == 7 + assert carriers[0].sample_name == "S07" + assert carriers[0].genotype == "het" + assert carriers[0].filter_pass is True + + +def test_hom_genotype(test_db): + db = Database(test_db) + carriers = db.variant_info("chr1", 1500, ref="A", alt="T") + by_id = {c.sample_id: c for c in carriers} + # S02 is hom/PASS + assert by_id[2].genotype == "hom" + assert by_id[2].filter_pass is True + + +def test_fail_genotype(test_db): + """S00 is in both het_bitmap and fail_bitmap → should be reported as alt/FAIL.""" + db = Database(test_db) + carriers = db.variant_info("chr1", 1500, ref="A", alt="T") + by_id = {c.sample_id: c for c in carriers} + assert by_id[0].genotype == "alt" + assert by_id[0].filter_pass is False + + +def test_pass_het_genotype(test_db): + """S05 is in het_bitmap but not fail_bitmap → het/PASS.""" + db = Database(test_db) + carriers = db.variant_info("chr1", 1500, ref="A", alt="T") + by_id = {c.sample_id: c for c in carriers} + assert by_id[5].genotype == "het" + assert by_id[5].filter_pass is True + + +def test_sorted_by_sample_id(test_db): + db = Database(test_db) + carriers = db.variant_info("chr1", 1500, ref="A", alt="T") + ids = [c.sample_id for c in carriers] + assert ids == sorted(ids) + + +# --------------------------------------------------------------------------- +# 2. Metadata included +# --------------------------------------------------------------------------- + +def test_sex_in_metadata(test_db): + db = Database(test_db) + carriers = db.variant_info("chr1", 3500, ref="G", alt="C") + by_id = {c.sample_id: c for c in carriers} + assert by_id[7].sex == "male" + + +def test_tech_in_metadata(test_db): + db = Database(test_db) + carriers = db.variant_info("chr1", 3500, ref="G", alt="C") + by_id = {c.sample_id: c for c in carriers} + assert by_id[7].tech_name == "WES_kit_B" + + +def test_phenotypes_in_metadata(test_db): + """S07 has phenotypes E11.9 and J45 (from conftest SAMPLE_PHENOTYPE).""" + db = Database(test_db) + carriers = db.variant_info("chr1", 3500, ref="G", alt="C") + by_id = {c.sample_id: c for c in carriers} + assert set(by_id[7].phenotypes) == {"E11.9", "J45"} + + +def test_phenotypes_sorted(test_db): + db = Database(test_db) + carriers = db.variant_info("chr1", 3500, ref="G", alt="C") + for c in carriers: + assert c.phenotypes == sorted(c.phenotypes) + + +# --------------------------------------------------------------------------- +# 3. All fail_bitmap cases — chr1:5000 T>G +# het=[0,1], hom=[3], fail=[1,3] +# → S01: het+fail → alt/FAIL +# → S03: hom+fail → alt/FAIL +# → S00: het only → het/PASS +# --------------------------------------------------------------------------- + +def test_fail_chr1_5000(test_db): + db = Database(test_db) + carriers = db.variant_info("chr1", 5000, ref="T", alt="G") + by_id = {c.sample_id: c for c in carriers} + # S00: het, no fail + assert by_id[0].genotype == "het" + assert by_id[0].filter_pass is True + # S01: het + fail + assert by_id[1].genotype == "alt" + assert by_id[1].filter_pass is False + # S03: hom + fail + assert by_id[3].genotype == "alt" + assert by_id[3].filter_pass is False + + +# --------------------------------------------------------------------------- +# 4. Sample filter: phenotype +# --------------------------------------------------------------------------- + +def test_phenotype_filter(test_db): + """chr1:1500 carriers with E11.9 only: S00(0), S05(5) [S02 has E11.9 too].""" + db = Database(test_db) + # E11.9 carriers at chr1:1500: S00(0), S02(2), S05(5) + carriers_all = db.variant_info("chr1", 1500, ref="A", alt="T") + carriers_ph = db.variant_info("chr1", 1500, ref="A", alt="T", phenotype=["E11.9"]) + ids_all = {c.sample_id for c in carriers_all} + ids_ph = {c.sample_id for c in carriers_ph} + # Filtered set must be a subset + assert ids_ph <= ids_all + # Only samples with E11.9 should appear + for c in carriers_ph: + assert "E11.9" in c.phenotypes + + +def test_phenotype_exclude(test_db): + """Exclude E11.9: S00(0) and S05(5) should be absent, S02(2) without E11.9 would remain, but S02 has E11.9 too. Let's check S02 is gone.""" + db = Database(test_db) + carriers = db.variant_info("chr1", 1500, ref="A", alt="T", phenotype=["^E11.9"]) + ids = {c.sample_id for c in carriers} + # S00, S02, S05 all have E11.9 → all excluded → empty + for sid in [0, 2, 5]: + assert sid not in ids + + +# --------------------------------------------------------------------------- +# 5. Sample filter: sex +# --------------------------------------------------------------------------- + +def test_sex_filter_male(test_db): + """chr1:1500 — male carriers only. S00(male), S05(female), S02(female). → only S00.""" + db = Database(test_db) + carriers = db.variant_info("chr1", 1500, ref="A", alt="T", sex="male") + ids = {c.sample_id for c in carriers} + assert ids == {0} + + +def test_sex_filter_female(test_db): + db = Database(test_db) + carriers = db.variant_info("chr1", 1500, ref="A", alt="T", sex="female") + for c in carriers: + assert c.sex == "female" + + +# --------------------------------------------------------------------------- +# 6. Sample filter: technology +# --------------------------------------------------------------------------- + +def test_tech_filter(test_db): + """chr1:1500 — WGS only. Carriers S00(WGS), S02(WGS), S05(WES_kit_A). → S00, S02.""" + db = Database(test_db) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", AfqueryWarning) + carriers = db.variant_info("chr1", 1500, ref="A", alt="T", tech=["WGS"]) + ids = {c.sample_id for c in carriers} + assert 5 not in ids # S05 is WES_kit_A → excluded + for c in carriers: + assert c.tech_name == "WGS" + + +# --------------------------------------------------------------------------- +# 7. Ref/alt specificity +# --------------------------------------------------------------------------- + +def test_ref_alt_specificity(test_db): + db = Database(test_db) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", AfqueryWarning) + carriers = db.variant_info("chr1", 1500, ref="A", alt="T") + assert all(isinstance(c, SampleCarrier) for c in carriers) + assert len(carriers) > 0 + + +def test_pos_only_returns_all_alleles(test_db): + """Without ref/alt, all alleles at the locus are returned.""" + db = Database(test_db) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", AfqueryWarning) + carriers_all = db.variant_info("chr1", 1500) + carriers_specific = db.variant_info("chr1", 1500, ref="A", alt="T") + # chr1:1500 has only one allele in the test DB, so both should match + assert {c.sample_id for c in carriers_all} == {c.sample_id for c in carriers_specific} + + +# --------------------------------------------------------------------------- +# 8. Edge case: variant not in database +# --------------------------------------------------------------------------- + +def test_variant_not_in_db(test_db): + db = Database(test_db) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", AfqueryWarning) + carriers = db.variant_info("chr1", 9999999, ref="A", alt="T") + assert carriers == [] + + +def test_unknown_chrom_returns_empty(test_db): + db = Database(test_db) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", AfqueryWarning) + carriers = db.variant_info("chr99", 1000) + assert carriers == [] + + +# --------------------------------------------------------------------------- +# 9. Edge case: no carriers in filtered population +# --------------------------------------------------------------------------- + +def test_no_carriers_after_filter(test_db): + """chr1:3500 only has S07 (WES_kit_B/male). Filtering to female → empty.""" + db = Database(test_db) + carriers = db.variant_info("chr1", 3500, ref="G", alt="C", sex="female") + assert carriers == [] + + +# --------------------------------------------------------------------------- +# 10. CLI: text output +# --------------------------------------------------------------------------- + +def test_cli_text_output(test_db): + runner = CliRunner() + result = runner.invoke( + cli, + ["variant-info", "--db", test_db, "--locus", "chr1:3500", + "--ref", "G", "--alt", "C", "--no-warn"], + ) + assert result.exit_code == 0, result.output + assert "S07" in result.output + assert "het" in result.output + assert "PASS" in result.output + + +def test_cli_text_empty(test_db): + """No carriers → prints 'No carriers found' message.""" + runner = CliRunner() + result = runner.invoke( + cli, + ["variant-info", "--db", test_db, "--locus", "chr1:3500", + "--ref", "G", "--alt", "C", "--sex", "female", "--no-warn"], + ) + assert result.exit_code == 0, result.output + assert "No carriers found" in result.output + + +# --------------------------------------------------------------------------- +# 11. CLI: TSV output +# --------------------------------------------------------------------------- + +def test_cli_tsv_output(test_db): + runner = CliRunner() + result = runner.invoke( + cli, + ["variant-info", "--db", test_db, "--locus", "chr1:3500", + "--ref", "G", "--alt", "C", "--format", "tsv", "--no-warn"], + ) + assert result.exit_code == 0, result.output + lines = result.output.strip().split("\n") + header = lines[0].split("\t") + assert header == ["sample_id", "sample_name", "sex", "tech", "phenotypes", "genotype", "filter"] + assert len(lines) == 2 # header + 1 carrier + fields = lines[1].split("\t") + assert fields[1] == "S07" + assert fields[5] == "het" + assert fields[6] == "PASS" + + +def test_cli_tsv_fail_sample(test_db): + """chr1:1500 — S00 is alt/FAIL.""" + runner = CliRunner() + result = runner.invoke( + cli, + ["variant-info", "--db", test_db, "--locus", "chr1:1500", + "--ref", "A", "--alt", "T", "--format", "tsv", "--no-warn"], + ) + assert result.exit_code == 0, result.output + lines = result.output.strip().split("\n") + rows = {l.split("\t")[1]: l.split("\t") for l in lines[1:]} + assert rows["S00"][5] == "alt" + assert rows["S00"][6] == "FAIL" + + +# --------------------------------------------------------------------------- +# 12. CLI: JSON output +# --------------------------------------------------------------------------- + +def test_cli_json_output(test_db): + runner = CliRunner() + result = runner.invoke( + cli, + ["variant-info", "--db", test_db, "--locus", "chr1:3500", + "--ref", "G", "--alt", "C", "--format", "json", "--no-warn"], + ) + assert result.exit_code == 0, result.output + data = json.loads(result.output) + assert "variant" in data + assert "samples" in data + assert data["variant"]["chrom"] == "chr1" + assert data["variant"]["pos"] == 3500 + assert len(data["samples"]) == 1 + s = data["samples"][0] + assert s["sample_name"] == "S07" + assert s["genotype"] == "het" + assert s["filter"] == "PASS" + assert isinstance(s["phenotypes"], list) + + +def test_cli_json_fail_sample(test_db): + """chr1:1500 JSON — S00 should have filter='FAIL'.""" + runner = CliRunner() + result = runner.invoke( + cli, + ["variant-info", "--db", test_db, "--locus", "chr1:1500", + "--ref", "A", "--alt", "T", "--format", "json", "--no-warn"], + ) + assert result.exit_code == 0, result.output + data = json.loads(result.output) + by_name = {s["sample_name"]: s for s in data["samples"]} + assert by_name["S00"]["filter"] == "FAIL" + assert by_name["S00"]["genotype"] == "alt" + assert by_name["S02"]["filter"] == "PASS" + assert by_name["S02"]["genotype"] == "hom" From 5a8958751e42e7d7cea6f80612e4dd3dc4c4f8cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20L=C3=B3pez=20L=C3=B3pez?= Date: Tue, 24 Mar 2026 12:41:37 +0100 Subject: [PATCH 6/8] docs: add variant-info page and update nav --- docs/guides/variant-info.md | 182 ++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 183 insertions(+) create mode 100644 docs/guides/variant-info.md diff --git a/docs/guides/variant-info.md b/docs/guides/variant-info.md new file mode 100644 index 0000000..9047da6 --- /dev/null +++ b/docs/guides/variant-info.md @@ -0,0 +1,182 @@ +# Variant Info + +`afquery variant-info` returns the list of samples carrying a specific variant, together with each sample's metadata: sex, sequencing technology, phenotype codes, genotype (het/hom), and FILTER status (PASS/FAIL). + +This avoids the need to re-query raw VCF files when inspecting individual variant carriers. + +--- + +## Basic usage + +```bash +afquery variant-info --db ./db/ --locus chr1:925952 +``` + +By default all samples are queried and results are printed as an aligned text table: + +``` +sample_id sample_name sex tech phenotypes genotype filter +--------- ----------- ------ --------- ------------ -------- ------ +3 P003 male WGS E11.9,J45 het PASS +17 P017 female WES_kit_A E11.9 hom PASS +42 P042 male WGS I10 alt FAIL +``` + +--- + +## Filtering to a specific allele + +When multiple alleles exist at the same position, use `--ref` and `--alt` to restrict to one: + +```bash +afquery variant-info --db ./db/ --locus chr17:41245466 --ref A --alt T +``` + +Without `--ref`/`--alt`, carriers for all alleles at the locus are returned and a warning is emitted if more than one allele is found. + +--- + +## Sample filters + +`variant-info` accepts the same sample filters as `query`: + +```bash +# Only female carriers +afquery variant-info --db ./db/ --locus chr1:925952 --sex female + +# Only carriers with phenotype E11.9 +afquery variant-info --db ./db/ --locus chr1:925952 --phenotype E11.9 + +# Exclude phenotype I10 +afquery variant-info --db ./db/ --locus chr1:925952 --phenotype ^I10 + +# Restrict to WGS samples +afquery variant-info --db ./db/ --locus chr1:925952 --tech WGS + +# Combine filters +afquery variant-info --db ./db/ --locus chr1:925952 \ + --sex female --phenotype E11.9 --tech WGS,WES_kit_A +``` + +See [Sample Filtering](sample-filtering.md) for the full filter syntax. + +--- + +## Output formats + +### TSV + +Machine-readable tab-separated output, suitable for downstream processing: + +```bash +afquery variant-info --db ./db/ --locus chr1:925952 --format tsv > carriers.tsv +``` + +``` +sample_id sample_name sex tech phenotypes genotype filter +3 P003 male WGS E11.9,J45 het PASS +17 P017 female WES_kit_A E11.9 hom PASS +42 P042 male WGS I10 alt FAIL +``` + +### JSON + +Structured output with variant metadata and a sample list: + +```bash +afquery variant-info --db ./db/ --locus chr1:925952 --format json +``` + +```json +{ + "variant": { + "chrom": "chr1", + "pos": 925952, + "ref": ".", + "alt": "." + }, + "samples": [ + { + "sample_id": 3, + "sample_name": "P003", + "sex": "male", + "tech": "WGS", + "phenotypes": ["E11.9", "J45"], + "genotype": "het", + "filter": "PASS" + }, + { + "sample_id": 42, + "sample_name": "P042", + "sex": "male", + "tech": "WGS", + "phenotypes": ["I10"], + "genotype": "alt", + "filter": "FAIL" + } + ] +} +``` + +When `--ref` and `--alt` are specified, the `variant` block contains the actual alleles. Otherwise, `"."` is used as a placeholder. + +--- + +## Genotype values + +| Value | Meaning | +|---|---| +| `het` | Heterozygous carrier, FILTER=PASS | +| `hom` | Homozygous alt carrier, FILTER=PASS | +| `alt` | Non-ref carrier with FILTER≠PASS (ploidy unknown) | + +--- + +## Python API + +```python +from afquery import Database + +db = Database("./db/") + +# All carriers at a locus +carriers = db.variant_info("chr1", 925952) + +# With allele and sample filters +carriers = db.variant_info( + chrom="chr17", + pos=41245466, + ref="A", + alt="T", + phenotype=["E11.9"], + sex="female", + tech=["WGS"], +) + +for c in carriers: + print(c.sample_name, c.genotype, c.filter_pass, c.phenotypes) +``` + +The `variant_info` function is also available at the top level for one-off use: + +```python +from afquery import variant_info + +carriers = variant_info("./db/", "chr1", 925952, ref="G", alt="A") +``` + +--- + +## All options + +| Option | Default | Description | +|---|---|---| +| `--db` | required | Path to database directory | +| `--locus` | required | `CHROM:POS` (e.g. `chr1:925952`) | +| `--ref` | — | Filter to specific reference allele | +| `--alt` | — | Filter to specific alternate allele | +| `--phenotype` | all | Include phenotype (repeatable; `^CODE` excludes) | +| `--sex` | `both` | `male`, `female`, or `both` | +| `--tech` | all | Include technology (repeatable; `^NAME` excludes) | +| `--format` | `text` | `text`, `tsv`, or `json` | +| `--no-warn` | off | Suppress `AfqueryWarning` messages | diff --git a/mkdocs.yml b/mkdocs.yml index 6dfd8ff..d7eb52d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -84,6 +84,7 @@ nav: - Query Allele Frequencies: guides/query.md - Annotate a VCF: guides/annotate-vcf.md - Bulk Export: guides/dump-export.md + - Variant Info: guides/variant-info.md - Sample Filtering: guides/sample-filtering.md - 🏥 Clinical Workflows: - ACMG Criteria (BA1/PM2/PS4): use-cases/acmg-use-cases.md From dd11505a3e61c1cd6a5617d4f6dc046844696cb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20L=C3=B3pez=20L=C3=B3pez?= Date: Tue, 24 Mar 2026 14:44:06 +0100 Subject: [PATCH 7/8] refactor: remove stale v1/v2 schema references --- docs/advanced/filter-pass-tracking.md | 13 ------------- docs/troubleshooting.md | 10 ---------- src/afquery/cli.py | 2 +- src/afquery/query.py | 3 +-- tests/test_dump.py | 4 ++-- tests/test_pass_filter.py | 18 +++++++++--------- 6 files changed, 13 insertions(+), 37 deletions(-) diff --git a/docs/advanced/filter-pass-tracking.md b/docs/advanced/filter-pass-tracking.md index 45875b3..1248e29 100644 --- a/docs/advanced/filter-pass-tracking.md +++ b/docs/advanced/filter-pass-tracking.md @@ -85,19 +85,6 @@ afquery annotate --db ./db/ --input variants.vcf --output annotated.vcf --- -## Schema Version Compatibility - -The `fail_bitmap` and `N_FAIL` tracking requires schema version 2.0 or later. Databases created with older versions of AFQuery do not contain `fail_bitmap` data. - -| Database schema | N_FAIL behavior | -|----------------|-----------------| -| ≥ 2.0 | `N_FAIL` is always an integer (0 or more) | -| < 2.0 (legacy) | `N_FAIL` is `None` in Python API results | - -To upgrade a legacy database, rebuild it with `afquery create-db`. There is no in-place migration. - ---- - ## PASS-Only Enforcement AF reflects the quality-filtered allele frequency — the frequency of the alt allele among high-quality calls. This is appropriate for most clinical and research use cases. PASS-only ingestion is always enforced. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 6b93444..af9d921 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -119,16 +119,6 @@ afquery info --db ./db/ --samples | grep SAMP --- -## Version Compatibility - -If you get `"Unsupported schema_version"` errors, upgrade AFQuery: - -```bash -pip install --upgrade afquery -``` - ---- - ## WES Technology Treated as WGS **Symptom:** AN for WES samples is much higher than expected; positions outside the capture panel return results. diff --git a/src/afquery/cli.py b/src/afquery/cli.py index 97e978e..18450a7 100644 --- a/src/afquery/cli.py +++ b/src/afquery/cli.py @@ -293,7 +293,7 @@ def annotate(db, input_vcf, output_vcf, phenotype, sex, tech, threads, verbose, AFQUERY_N_HET heterozygous carrier count (per ALT) AFQUERY_N_HOM_ALT homozygous alt count (per ALT) AFQUERY_N_HOM_REF homozygous ref count (per ALT) - AFQUERY_N_FAIL samples with FILTER!=PASS (v2 databases only) + AFQUERY_N_FAIL samples with FILTER!=PASS """ if no_warn: import warnings diff --git a/src/afquery/query.py b/src/afquery/query.py index 086d287..450dd97 100644 --- a/src/afquery/query.py +++ b/src/afquery/query.py @@ -564,8 +564,7 @@ def variant_info(self, params: QueryParams) -> list[SampleCarrier]: Each returned :class:`~afquery.models.SampleCarrier` includes the sample's name, sex, technology, phenotype codes, genotype (het/hom/alt), - and FILTER status. ``filter_pass=None`` is returned for v1 databases - that do not store a ``fail_bitmap``. + and FILTER status. Args: params: Query parameters (chrom, pos, optional ref/alt, sample filter). diff --git a/tests/test_dump.py b/tests/test_dump.py index 58a9f67..fc8ac78 100644 --- a/tests/test_dump.py +++ b/tests/test_dump.py @@ -142,7 +142,7 @@ def test_csv_header_base_columns(self, test_db): assert "N_HET" in rows[0] assert "N_HOM_ALT" in rows[0] assert "N_HOM_REF" in rows[0] - assert "N_FAIL" in rows[0] # v2 DB + assert "N_FAIL" in rows[0] def test_ac_positive_filter(self, test_db): """By default all exported rows must have AC > 0.""" @@ -360,7 +360,7 @@ def test_by_sex_and_phenotype_cartesian_labels(self, test_db): assert "AC_female_E11.9" in rows[0] def test_n_fail_in_group_columns(self, test_db): - """v2 DB: N_FAIL_