From c13e21be260688ee93425f1266badae5eb8a94df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20L=C3=B3pez=20L=C3=B3pez?= Date: Thu, 14 May 2026 17:36:20 +0200 Subject: [PATCH 1/3] docs: correct AC/AN, FILTER and bitmap inaccuracies across the docs A consistency review against the source code surfaced several technical claims that no longer matched behaviour. The most significant: FILTER!=PASS samples were described as excluded from both AC and AN, when the code keeps them in the eligible set so they count toward AN (only AC excludes them). Also corrected: - genotype-accounting invariant now includes N_NO_COVERAGE everywhere - fail_bitmap definition (any FILTER!=PASS call, including ./. at failed sites), not "non-ref genotype AND FILTER!=PASS" - hemizygous males counted in N_HOM_ALT, not N_HET - Parquet schema lists all five bitmap columns; bucket files are bucket_N.parquet, not bucket_N/data.parquet - bucket-id SQL uses integer division (//), not float division - manifest.json field table; ingest temp store is Parquet, not SQLite - N_NO_COVERAGE added to annotation INFO field lists --- docs/advanced/debugging-results.md | 4 +- docs/advanced/filter-pass-tracking.md | 29 ++++++++------- docs/advanced/multi-cohort-strategies.md | 10 +++-- docs/advanced/performance.md | 4 +- docs/advanced/ploidy-and-sex-chroms.md | 6 +-- docs/faq.md | 2 +- docs/getting-started/concepts.md | 14 +++---- docs/getting-started/quickstart.md | 1 + docs/getting-started/tutorial.md | 4 +- docs/getting-started/understanding-output.md | 4 +- docs/guides/annotate-vcf.md | 2 +- docs/guides/create-database.md | 14 +++---- docs/guides/sample-filtering.md | 2 +- docs/guides/update-database.md | 2 +- docs/index.md | 4 +- docs/reference/data-model.md | 39 +++++++++++--------- docs/reference/glossary.md | 4 +- docs/reference/python-api.md | 2 +- docs/troubleshooting.md | 2 +- docs/use-cases/acmg-use-cases.md | 2 +- docs/use-cases/sex-specific-af.md | 8 ++-- 21 files changed, 85 insertions(+), 74 deletions(-) diff --git a/docs/advanced/debugging-results.md b/docs/advanced/debugging-results.md index 256439b..23f2a04 100644 --- a/docs/advanced/debugging-results.md +++ b/docs/advanced/debugging-results.md @@ -34,13 +34,13 @@ A variant you expect to find is not in the database: | Check | Details | |-------|---------| | Was it in the source VCFs? | AFQuery only stores variants present in ingested VCFs | -| Was it FILTER=PASS? | Default ingestion skips non-PASS variants. Check with `afquery info` for `pass_only_filter` | +| Was the alt allele ever called? | A variant is stored only if at least one sample carries the alt allele or has a failed call there. A site seen only as `0/0` in every sample is genuinely absent. Note that non-PASS calls *are* kept (in `fail_bitmap`) — such a variant is not missing, it just shows `AC=0`. | | Multiallelic sites | AFQuery stores each ALT separately. Query the specific ALT allele, not just position | | Chromosome naming | Ensure consistent `chr` prefix usage | ### 4. Unexpected N_FAIL > 0 -`N_FAIL > 0` means some eligible samples had the alt allele called but with `FILTER≠PASS`. These samples are excluded from AC/AN. This is usually benign (1–2 samples), but a high N_FAIL warrants investigation: +`N_FAIL > 0` means some eligible samples had a call with `FILTER≠PASS` at this position. These samples are excluded from AC, but they remain eligible and still count in AN. This is usually benign (1–2 samples), but a high N_FAIL warrants investigation: | N_FAIL relative to n_eligible | Likely cause | |---|---| diff --git a/docs/advanced/filter-pass-tracking.md b/docs/advanced/filter-pass-tracking.md index cc18c61..11d03ca 100644 --- a/docs/advanced/filter-pass-tracking.md +++ b/docs/advanced/filter-pass-tracking.md @@ -11,9 +11,10 @@ In VCF format, the FILTER column indicates whether a variant call passed quality - `PASS` or `.` (missing) — the variant passed all filters - Any other value (e.g., `LowQual`, `VQSRTrancheSNP99.90to100.00`) — the variant failed one or more filters -AFQuery default behavior: +AFQuery's default behavior: -- **PASS-only**: only `FILTER=PASS` variants are counted in AC/AN. This is always enforced. +- Only `FILTER=PASS` calls contribute to **AC** (and therefore to **AF**). This is always enforced. +- **AN** is not affected by the filter — it counts every eligible sample at the position, failed calls included. --- @@ -21,19 +22,19 @@ AFQuery default behavior: AFQuery stores a third bitmap per variant alongside `het_bitmap` and `hom_bitmap`: -- **`fail_bitmap`** — bit set for each sample that has a non-ref genotype (AC>0) AND `FILTER≠PASS` +- **`fail_bitmap`** — bit set for each sample whose call at this site has `FILTER≠PASS`. In practice this is two cases: a sample that carries the alt allele but whose call failed a filter, and a sample with a missing genotype (`./.`) at a site that itself failed a filter. -This means: +What this means for a sample in `fail_bitmap`: -- A sample in `fail_bitmap` was genotyped with the alt allele but the call failed QC -- Such samples are **not** counted in AC/AN (they don't affect AF) -- Their count is exposed as `N_FAIL` +- Its alt alleles are **not** counted in AC, so it does not raise AF. +- It is still an eligible sample, so it **is** counted in AN. The failed call lowers AF by sitting in the denominator without contributing to the numerator. +- Its count is exposed separately as `N_FAIL`. --- ## Database Creation -The `fail_bitmap` is always written and always tracks PASS-only ingestion: +The `fail_bitmap` is always written, regardless of build options: ```bash afquery create-db --manifest manifest.tsv --output-dir ./db/ --genome-build GRCh38 @@ -52,10 +53,10 @@ afquery query --db ./db/ --locus chr1:925952 ``` ``` -chr1:925952 G>A AC=142 AN=2742 AF=0.0518 n_eligible=1371 N_HET=138 N_HOM_ALT=2 N_HOM_REF=1231 N_FAIL=7 +chr1:925952 G>A AC=142 AN=2742 AF=0.0518 n_eligible=1371 N_HET=138 N_HOM_ALT=2 N_HOM_REF=1224 N_FAIL=7 ``` -`N_FAIL=7` means 7 eligible samples had the alt allele called but with FILTER≠PASS. +`N_FAIL=7` means 7 eligible samples had a call with FILTER≠PASS at this site. They are part of `n_eligible` (and of AN), so the genotype counts still add up: 138 + 2 + 1224 + 7 = 1371. ### Python API @@ -83,21 +84,23 @@ Each carrier row shows its `filter` column as `PASS` or `FAIL`, along with sampl ## VCF Annotation -AFQuery adds an additional INFO field to annotated VCFs: +Among the INFO fields `afquery annotate` writes, one reports the failed-call count: | Field | Type | Description | |-------|------|-------------| -| `AFQUERY_N_FAIL` | Integer | Eligible samples with FILTER≠PASS at this variant | +| `AFQUERY_N_FAIL` | Integer | Eligible samples with `FILTER≠PASS` at this variant | ```bash afquery annotate --db ./db/ --input variants.vcf --output annotated.vcf ``` +See [Understanding Output](../getting-started/understanding-output.md#vcf-annotation-fields) for the full set of `AFQUERY_*` fields. + --- ## PASS-Only Enforcement -AF reflects the quality-filtered allele frequency — the frequency of the alt allele among high-quality calls. This is appropriate for most clinical and research use cases. PASS-only ingestion is always enforced. +PASS-only counting is always enforced and cannot be turned off. It applies to the numerator only: a failed call never adds an alt allele to AC, but the sample stays in the eligible set and so remains in AN. The result is a deliberately conservative AF — a failed carrier weighs on the denominator without lifting the numerator, which is the safe direction for clinical and research use. --- diff --git a/docs/advanced/multi-cohort-strategies.md b/docs/advanced/multi-cohort-strategies.md index b87578a..6ea03fd 100644 --- a/docs/advanced/multi-cohort-strategies.md +++ b/docs/advanced/multi-cohort-strategies.md @@ -61,7 +61,7 @@ All samples in one database, with phenotype codes distinguishing cohorts. ### Manifest Design ```tsv -sample_id vcf_path sex technology phenotype_codes +sample_name vcf_path sex tech_name phenotype_codes CARD_001 /data/card/001.vcf.gz female wgs cardiology,EUR,control CARD_002 /data/card/002.vcf.gz male wgs cardiology,EUR,case_HCM NEURO_001 /data/neuro/001.vcf.gz female wes neurology,AFR,case_epilepsy @@ -99,7 +99,7 @@ afquery query --db ./merged/ --locus chr1:12345678 --ref C --alt T \ |-----------|-------------| | Single database to maintain | Rebuilding requires all VCFs accessible | | Cross-cohort queries via phenotype filters | Phenotype code design must be planned upfront | -| One annotation pass covers all cohorts | Adding a new cohort requires `afquery update` | +| One annotation pass covers all cohorts | Adding a new cohort requires `afquery update-db` | | Flexible ad-hoc stratification | Larger database, longer rebuild time | --- @@ -134,7 +134,11 @@ Maintain both per-cohort and merged databases. afquery annotate --db /databases/institutional/cardiology/ \ --input patient.vcf.gz --output step1.vcf.gz -# Annotate against shared controls (use a different INFO prefix via Python API) +# Annotate against shared controls in a second pass +# Note: the AFQUERY_* INFO fields are overwritten on each pass — if you need +# both sets of frequencies, extract the values from step1.vcf.gz first. +afquery annotate --db /databases/shared/combined_controls/ \ + --input step1.vcf.gz --output step2.vcf.gz ``` --- diff --git a/docs/advanced/performance.md b/docs/advanced/performance.md index 43657cf..ace0553 100644 --- a/docs/advanced/performance.md +++ b/docs/advanced/performance.md @@ -50,7 +50,7 @@ Expected scaling (50K samples, 2,500 buckets, GRCh38): | 32 | ~18 min | ~24× | | 52 | ~13 min | ~38× | -Scaling is near-linear up to ~32 cores; beyond that, I/O and SQLite contention limit further gains. +Scaling is near-linear up to ~32 cores; beyond that, disk I/O contention limits further gains. Total RAM required: `build_threads × build_memory` @@ -80,7 +80,7 @@ afquery create-db ... --threads 32 --build-threads 16 --build-memory 4GB graph TD A["Query Request
chr1:925952"] B["Open DuckDB
connection"] - C["Locate Parquet
bucket_0/data.parquet"] + C["Locate Parquet
bucket_0.parquet"] D["Read rows
matching pos"] E["Deserialize
bitmaps"] F["Bitmap AND
with eligible
sample set"] diff --git a/docs/advanced/ploidy-and-sex-chroms.md b/docs/advanced/ploidy-and-sex-chroms.md index 030693a..987b991 100644 --- a/docs/advanced/ploidy-and-sex-chroms.md +++ b/docs/advanced/ploidy-and-sex-chroms.md @@ -103,12 +103,12 @@ afquery query --db ./db/ --locus chrM:3243 For every query result, the following identity holds: -**N_HET + N_HOM_ALT + N_HOM_REF + N_FAIL = n_eligible** +**N_HET + N_HOM_ALT + N_HOM_REF + N_FAIL + N_NO_COVERAGE = n_eligible** -This can be used to validate results. N_HOM_REF is the number of eligible samples that are homozygous reference (i.e., do not carry the alt allele and passed quality filters). +This can be used to validate results. N_HOM_REF is the number of eligible samples that are homozygous reference (i.e., do not carry the alt allele and passed quality filters). N_NO_COVERAGE is 0 unless a coverage-evidence filter is active — see [Coverage Evidence](coverage-evidence.md). !!! note "Mutual exclusivity" - N_HET, N_HOM_ALT, N_HOM_REF, and N_FAIL are mutually exclusive. A sample with a non-ref allele but FILTER≠PASS is counted in N_FAIL only — it does not appear in N_HET or N_HOM_ALT. Likewise, N_HOM_REF counts only PASS-filtered samples. + N_HET, N_HOM_ALT, N_HOM_REF, N_FAIL, and N_NO_COVERAGE are mutually exclusive. A sample with a non-ref allele but FILTER≠PASS is counted in N_FAIL only — it does not appear in N_HET or N_HOM_ALT. Likewise, N_HOM_REF counts only PASS-filtered samples. ### chrX non-PAR diff --git a/docs/faq.md b/docs/faq.md index 33fd8ed..1d7acff 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -149,7 +149,7 @@ On chrX non-PAR positions, males contribute AN=1 and females contribute AN=2. Wh ## What does N_FAIL mean in query output? -`N_FAIL` is the count of eligible samples that had the alt allele called but with `FILTER≠PASS`. Shown as `N_FAIL=N` in text output. These samples are not counted in AC/AN. See [FILTER=PASS Tracking](advanced/filter-pass-tracking.md). +`N_FAIL` is the count of eligible samples whose call had `FILTER≠PASS` at the position. Shown as `N_FAIL=N` in text output. These samples are excluded from AC, but they stay eligible and still count toward AN. See [FILTER=PASS Tracking](advanced/filter-pass-tracking.md). --- diff --git a/docs/getting-started/concepts.md b/docs/getting-started/concepts.md index 41a324b..fb1d5be 100644 --- a/docs/getting-started/concepts.md +++ b/docs/getting-started/concepts.md @@ -126,13 +126,13 @@ graph TB ### Per-Variant Bitmaps -For each variant row, AFQuery stores three [Roaring Bitmaps](https://roaringbitmap.org/): +For each variant row, AFQuery stores per-sample genotype information as [Roaring Bitmaps](https://roaringbitmap.org/). Three of them drive every query: -- **`het_bitmap`** — bit set for each sample that is heterozygous (GT=0/1 or 1/0) -- **`hom_bitmap`** — bit set for each sample that is homozygous alt (GT=1/1 or GT=1) -- **`fail_bitmap`** bit set for each sample called with FILTER≠PASS +- **`het_bitmap`** — sample is heterozygous (`GT=0/1`) with `FILTER=PASS` +- **`hom_bitmap`** — sample is homozygous alt (`GT=1/1`, or `GT=1` on haploid regions) with `FILTER=PASS` +- **`fail_bitmap`** — the sample's call at this site has `FILTER≠PASS` -Each sample has a stable integer ID (0-indexed). The bit position in the bitmap equals the sample ID. +Each sample has a stable integer ID (0-indexed). The bit position in the bitmap equals the sample ID. Databases built with coverage-quality filters carry two additional bitmaps — see [Data Model](../reference/data-model.md#parquet-schema). ### Parquet Storage @@ -141,8 +141,8 @@ Bitmaps are serialized and stored in Parquet files, partitioned by chromosome an ``` variants/ chr1/ - bucket_0/ ← positions 0–999,999 - bucket_1/ ← positions 1,000,000–1,999,999 + bucket_0.parquet ← positions 0–999,999 + bucket_1.parquet ← positions 1,000,000–1,999,999 ... chr2/ ... diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index cb9b1bd..1a3ace0 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -135,6 +135,7 @@ The output VCF gains INFO fields (see [Annotate a VCF](../guides/annotate-vcf.md | `AFQUERY_N_HOM_ALT` | A (per ALT) | Homozygous alt sample count | | `AFQUERY_N_HOM_REF` | A (per ALT) | Homozygous ref sample count | | `AFQUERY_N_FAIL` | 1 (per site) | Samples with FILTER≠PASS | +| `AFQUERY_N_NO_COVERAGE` | A (per ALT) | Eligible samples lacking coverage evidence (0 unless a coverage-evidence filter is active) | --- diff --git a/docs/getting-started/tutorial.md b/docs/getting-started/tutorial.md index 92f6431..df85551 100644 --- a/docs/getting-started/tutorial.md +++ b/docs/getting-started/tutorial.md @@ -86,7 +86,7 @@ Every query result includes the following fields: | **AC** | Allele count — alt allele copies in eligible samples | | **AN** | Allele number — total alleles examined (adjusted for coverage and ploidy) | | **AF** | Allele frequency — `AC / AN` | -| **n_eligible** | Samples in the filtered set (before coverage or ploidy adjustments). Here, 10 = no filter applied. | +| **n_eligible** | Eligible samples at this position — those passing the metadata filters *and* covered here. Ploidy is applied afterwards, to AN. Here, 10 = no filter applied and every sample is covered. | | **N_HET** | Eligible samples heterozygous for the alt allele | | **N_HOM_ALT** | Eligible samples homozygous for the alt allele | | **N_HOM_REF** | Eligible samples homozygous reference | @@ -94,7 +94,7 @@ Every query result includes the following fields: For `chr1:925952`: AN=20 = 10 diploid samples × 2 alleles; AC=6 = 4 het samples (1 copy each) + 1 hom-alt (2 copies). -**Accounting identity:** `n_eligible = N_HET + N_HOM_ALT + N_HOM_REF + N_FAIL` always holds (10 = 4+1+5+0 ✓). +**Accounting identity:** `n_eligible = N_HET + N_HOM_ALT + N_HOM_REF + N_FAIL + N_NO_COVERAGE` always holds. `N_NO_COVERAGE` is 0 here (it only becomes non-zero with a coverage-evidence filter), so 10 = 4+1+5+0+0 ✓. ### N_FAIL in practice diff --git a/docs/getting-started/understanding-output.md b/docs/getting-started/understanding-output.md index ecf6c7f..e1528b9 100644 --- a/docs/getting-started/understanding-output.md +++ b/docs/getting-started/understanding-output.md @@ -14,8 +14,8 @@ This page explains what each field in AFQuery output means and how to interpret | **N_HET** | int | Number of eligible samples heterozygous for the alt allele (GT=0/1) | | **N_HOM_ALT** | int | Number of eligible samples homozygous for the alt allele (GT=1/1 or GT=1). Includes haploid carriers on sex chromosomes and chrM. See [Ploidy](../advanced/ploidy-and-sex-chroms.md#genotype-counting). | | **N_HOM_REF** | int | Number of eligible samples homozygous reference (GT=0/0 or GT=0) | -| **n_eligible** | int | Number of samples in the eligible set (after sex/phenotype/tech filters) | -| **N_FAIL** | int | Number of eligible samples with a non-ref allele called but FILTER≠PASS at this position. These samples are counted *only* in N_FAIL — not in N_HET, N_HOM_ALT, or N_HOM_REF. | +| **n_eligible** | int | Number of eligible samples — those passing the sex/phenotype/tech filters *and* covered at this position | +| **N_FAIL** | int | Number of eligible samples whose call at this position had FILTER≠PASS. These samples are counted *only* in N_FAIL — not in N_HET, N_HOM_ALT, or N_HOM_REF — but they stay eligible and still count toward AN. | | **N_NO_COVERAGE** | int | Number of eligible samples whose tech lacks coverage evidence at this position. Excluded from `N_HOM_REF` to keep AC/AN conservative. Always `0` unless a coverage-evidence filter is active. See [Coverage Evidence](../advanced/coverage-evidence.md). | diff --git a/docs/guides/annotate-vcf.md b/docs/guides/annotate-vcf.md index 758916f..01b6df5 100644 --- a/docs/guides/annotate-vcf.md +++ b/docs/guides/annotate-vcf.md @@ -26,7 +26,7 @@ afquery annotate \ | `AFQUERY_N_HET` | Integer | A (per ALT) | Heterozygous sample count | | `AFQUERY_N_HOM_ALT` | Integer | A (per ALT) | Homozygous alt sample count | | `AFQUERY_N_HOM_REF` | Integer | A (per ALT) | Homozygous ref sample count | -| `AFQUERY_N_FAIL` | Integer | 1 (per site) | Samples with FILTER≠PASS and alt allele called. Mutually exclusive with N_HET/N_HOM_ALT/N_HOM_REF. | +| `AFQUERY_N_FAIL` | Integer | 1 (per site) | Eligible samples whose call had FILTER≠PASS. Excluded from AC, but still counted in AN. Mutually exclusive with N_HET/N_HOM_ALT/N_HOM_REF. | | `AFQUERY_N_NO_COVERAGE` | Integer | A (per ALT) | Eligible samples whose tech lacks coverage evidence at this position. Excluded from `N_HOM_REF` to keep AC/AN conservative. Always `0` unless a coverage-evidence filter is active. See [Coverage Evidence](../advanced/coverage-evidence.md). | !!! note "Multi-allelic sites" diff --git a/docs/guides/create-database.md b/docs/guides/create-database.md index 9e2c3c4..6e48a08 100644 --- a/docs/guides/create-database.md +++ b/docs/guides/create-database.md @@ -27,8 +27,8 @@ afquery create-db \ ## What Happens -1. **Ingest phase** — Each VCF is parsed with cyvcf2. Genotypes and INFO fields are written to a SQLite temporary database, one row per variant per sample. -2. **Build phase** — DuckDB reads the SQLite data, aggregates per 1-Mbp bucket, and writes Roaring Bitmap Parquet files partitioned by chromosome/bucket. +1. **Ingest phase** — Each VCF is parsed with cyvcf2. Genotypes and quality fields are written to a temporary per-sample Parquet file, one row per variant per sample. +2. **Build phase** — DuckDB reads the temporary Parquet files, aggregates per 1-Mbp bucket, and writes Roaring Bitmap Parquet files partitioned by chromosome and bucket. 3. **Finalize** — `manifest.json` and `metadata.sqlite` are written to the output directory. --- @@ -39,12 +39,10 @@ afquery create-db \ ./db/ ├── manifest.json # Build configuration (genome build, schema version, etc.) ├── metadata.sqlite # Sample/phenotype/technology/changelog metadata -├── variants/ # Hive-partitioned Parquet files +├── variants/ # Parquet files partitioned by chromosome and bucket │ ├── chr1/ -│ │ ├── bucket_0/ # Positions 0–999,999 -│ │ │ └── data.parquet -│ │ ├── bucket_1/ # Positions 1,000,000–1,999,999 -│ │ │ └── data.parquet +│ │ ├── bucket_0.parquet # Positions 0–999,999 +│ │ ├── bucket_1.parquet # Positions 1,000,000–1,999,999 │ │ └── ... │ ├── chr2/ │ └── ... @@ -95,7 +93,7 @@ afquery create-db --manifest manifest.tsv --output-dir ./db/ --genome-build GRCh ## FILTER=PASS Behavior -By default, only variants with `FILTER=PASS` (or no FILTER field) are counted in AC/AN. Variants that fail filters are tracked in `fail_bitmap`. PASS-only ingestion is always enforced — there is currently no CLI option to change this behaviour. +Only `FILTER=PASS` calls (or calls with no FILTER field) contribute to AC, and therefore to AF. AN is not affected — it counts every eligible sample, failed calls included. Calls that fail a filter are tracked in `fail_bitmap` and surfaced as `N_FAIL`. PASS-only counting is always enforced — there is currently no CLI option to change this behaviour. See [FILTER=PASS Tracking](../advanced/filter-pass-tracking.md) for details. diff --git a/docs/guides/sample-filtering.md b/docs/guides/sample-filtering.md index e7657b2..10c32cf 100644 --- a/docs/guides/sample-filtering.md +++ b/docs/guides/sample-filtering.md @@ -1,6 +1,6 @@ # Sample Filtering -AFQuery supports flexible metadata-based selection of samples for AF computation. Filters are available on all query commands (`query`, `annotate`, `dump`). +AFQuery supports flexible metadata-based selection of samples for AF computation. Filters are available on all query commands (`query`, `annotate`, `dump`, `variant-info`). **Phenotype codes are arbitrary string labels** — you define them in your manifest. They can be ICD-10 codes, HPO terms, project tags (`control`, `pilot`), or any strings meaningful to your cohort. The filtering system does not interpret the codes — it matches them exactly as stored. diff --git a/docs/guides/update-database.md b/docs/guides/update-database.md index 7df1dfb..b0b2717 100644 --- a/docs/guides/update-database.md +++ b/docs/guides/update-database.md @@ -1,6 +1,6 @@ # Update a Database -`afquery update-db` supports three operations: adding new samples, removing existing samples, and compacting the database to reclaim space. +`afquery update-db` adds new samples, removes existing samples, updates sample metadata, and compacts the database to reclaim space. ### Update Timeline diff --git a/docs/index.md b/docs/index.md index a634a4d..1ab7a88 100644 --- a/docs/index.md +++ b/docs/index.md @@ -31,7 +31,7 @@ AFQuery pre-indexes genotypes as [Roaring Bitmaps](https://roaringbitmap.org/) i - **Ploidy-aware** — correct AN on chrX PAR/non-PAR, chrY, and chrM. - **Technology-aware AN** — per-position capture BED intersection across WGS, WES kits, and panels. - **Carrier lookup** — list samples carrying any variant with full metadata (sex, tech, phenotypes, genotype, FILTER status). -- **VCF annotation** — add `AFQUERY_AC/AN/AF/N_HET/N_HOM_ALT/N_HOM_REF/N_FAIL` INFO fields from any sample subset. +- **VCF annotation** — add `AFQUERY_AC/AN/AF/N_HET/N_HOM_ALT/N_HOM_REF/N_FAIL/N_NO_COVERAGE` INFO fields from any sample subset. - **Audit changelog** — every database operation is recorded for reproducibility. --- @@ -43,7 +43,7 @@ graph TD A["🔍 Input VCFs
single-sample"] B["📥 Ingest
cyvcf2 reads VCFs →
per-sample Parquet files emitted"] C["🏗️ Build
DuckDB aggregates per bucket →
Roaring Bitmaps → Parquet"] - D["💾 Database on Disk
variants/chr*/bucket_*/
capture/*.pkl
metadata.sqlite
manifest.json"] + D["💾 Database on Disk
variants/chr*/bucket_*.parquet
capture/*.pkl
metadata.sqlite
manifest.json"] E["⚡ Query Engine
Load bitmap → filter samples →
compute AC/AN/AF
~10-100ms"] F["📊 Annotate"] G["🔄 Update"] diff --git a/docs/reference/data-model.md b/docs/reference/data-model.md index cb8c6f1..de1ce99 100644 --- a/docs/reference/data-model.md +++ b/docs/reference/data-model.md @@ -10,12 +10,10 @@ This page documents the on-disk layout of an AFQuery database, including file fo / ├── manifest.json # Build configuration ├── metadata.sqlite # Sample/phenotype/technology/changelog metadata -├── variants/ # Hive-partitioned Parquet variant data +├── variants/ # Parquet variant data, partitioned by chromosome and bucket │ ├── chr1/ -│ │ ├── bucket_0/ # Positions 0–999,999 -│ │ │ └── data.parquet -│ │ ├── bucket_1/ # Positions 1,000,000–1,999,999 -│ │ │ └── data.parquet +│ │ ├── bucket_0.parquet # Positions 0–999,999 +│ │ ├── bucket_1.parquet # Positions 1,000,000–1,999,999 │ │ └── ... │ ├── chr2/ │ └── ... @@ -33,11 +31,13 @@ Stores the build configuration used during `create-db`. | Field | Type | Description | |-------|------|-------------| | `genome_build` | string | `"GRCh37"` or `"GRCh38"` | -| `schema_version` | string | `"2.0"` | -| `pass_only_filter` | bool | Whether FILTER=PASS was enforced during ingest | +| `version` | string | manifest.json format version | | `db_version` | string | User-specified version label | +| `sample_count` | int | Number of samples at build time | +| `schema_version` | string | `"2.0"`, or `"3.0"` when the database was built with coverage-quality filters | +| `pass_only_filter` | bool | Always `true` — PASS-only counting is always enforced | +| `coverage_filter` | object | The `min_dp` / `min_gq` / `min_qual` / `min_covered` thresholds used at build time (all `0` when unused) | | `created_at` | string | ISO 8601 timestamp | -| `manifest_path` | string | Path to original manifest TSV | --- @@ -102,12 +102,16 @@ Each bucket Parquet file has this schema: | `pos` | `uint32` | 1-based genomic position | | `ref` | `large_utf8` | Reference allele | | `alt` | `large_utf8` | Alternate allele | -| `het_bitmap` | `large_binary` | Serialized Roaring Bitmap of heterozygous sample IDs | -| `hom_bitmap` | `large_binary` | Serialized Roaring Bitmap of homozygous alt sample IDs | -| `fail_bitmap` | `large_binary` | Serialized Roaring Bitmap of FILTER≠PASS sample IDs | +| `het_bitmap` | `large_binary` | Roaring Bitmap of heterozygous, `FILTER=PASS` sample IDs | +| `hom_bitmap` | `large_binary` | Roaring Bitmap of homozygous-alt, `FILTER=PASS` sample IDs | +| `fail_bitmap` | `large_binary` | Roaring Bitmap of sample IDs whose call has `FILTER≠PASS` at this site | +| `filtered_bitmap` | `large_binary` | Roaring Bitmap of WES non-carriers with uncertain coverage | +| `quality_pass_bitmap` | `large_binary` | Roaring Bitmap of carriers meeting the DP/GQ/QUAL thresholds | Rows are sorted by `(pos, alt)` within each bucket. +`filtered_bitmap` and `quality_pass_bitmap` back the [coverage-evidence](../advanced/coverage-evidence.md) feature. The columns are always present, but they are empty unless the database was built with coverage-quality filters (schema version `3.0`). + !!! important "large_utf8 / large_binary" AFQuery uses `large_utf8` and `large_binary` (64-bit offsets) rather than `utf8` / `binary` (32-bit). This is required for compatibility with DuckDB's Parquet reader on large chromosomes. @@ -118,9 +122,10 @@ Rows are sorted by `(pos, alt)` within each bucket. Bitmaps use the [Roaring Bitmap](https://roaringbitmap.org/) format, serialized by pyroaring's portable serialization. - Bit position = sample ID (0-indexed integer) -- `het_bitmap`: bit set iff sample is heterozygous at this variant -- `hom_bitmap`: bit set iff sample is homozygous alt at this variant -- `fail_bitmap`: bit set iff sample has genotype AC>0 AND FILTER≠PASS +- `het_bitmap`: bit set iff the sample is heterozygous with `FILTER=PASS` +- `hom_bitmap`: bit set iff the sample is homozygous alt with `FILTER=PASS` +- `fail_bitmap`: bit set iff the sample's call has `FILTER≠PASS` — either a carrier whose call failed a filter, or a missing genotype (`./.`) at a failed site +- `filtered_bitmap`, `quality_pass_bitmap`: see the Parquet schema above; both are empty unless the database was built with coverage-quality filters To deserialize in Python: ```python @@ -152,11 +157,11 @@ bucket_id = pos // 1_000_000 ``` !!! warning "DuckDB integer arithmetic" - When computing bucket IDs in DuckDB SQL, always use: + When computing bucket IDs in DuckDB SQL, always use the integer-division operator: ```sql - CAST(pos AS BIGINT) / 1000000 + CAST(pos AS BIGINT) // 1000000 ``` - Not `CAST(pos / 1000000 AS BIGINT)` — DuckDB performs float division first and rounds, producing wrong bucket IDs. + Not `CAST(pos / 1000000 AS BIGINT)` — the `/` operator does float division, and casting the rounded result afterwards produces wrong bucket IDs. --- diff --git a/docs/reference/glossary.md b/docs/reference/glossary.md index e9f8547..0db3816 100644 --- a/docs/reference/glossary.md +++ b/docs/reference/glossary.md @@ -34,7 +34,7 @@ A sample that passes all query filters (sex, phenotype, technology) and has cove ## N_FAIL -The count of eligible samples at a position where the genotype call had FILTER≠PASS in the source VCF. These samples are not counted in AC/AN. See [FILTER=PASS Tracking](../advanced/filter-pass-tracking.md). +The count of eligible samples whose call at a position had FILTER≠PASS in the source VCF. These samples are excluded from AC, but they remain eligible and so still count toward AN. See [FILTER=PASS Tracking](../advanced/filter-pass-tracking.md). ## N_NO_COVERAGE @@ -66,7 +66,7 @@ A compressed bitset data structure that efficiently stores sets of integers. AFQ ## Schema Version -The AFQuery database format version stored in `manifest.json`. Each variant stores `het_bitmap`, `hom_bitmap`, and `fail_bitmap`. See [Data Model](data-model.md). +The AFQuery database format version stored in `manifest.json` — `2.0`, or `3.0` when the database was built with coverage-quality filters. See [Data Model](data-model.md). ## Technology diff --git a/docs/reference/python-api.md b/docs/reference/python-api.md index 114ce4b..3937e12 100644 --- a/docs/reference/python-api.md +++ b/docs/reference/python-api.md @@ -453,7 +453,7 @@ class QueryResult: N_HET: int # Heterozygous count N_HOM_ALT: int # Homozygous alt count N_HOM_REF: int # Homozygous ref count - N_FAIL: int # Samples with alt allele called but FILTER≠PASS + N_FAIL: int # Eligible samples whose call had FILTER≠PASS (excluded from AC, kept in AN) N_NO_COVERAGE: int # Eligible samples whose tech lacks evidence (excluded from N_HOM_REF) ``` diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index af9d921..6eee06f 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -77,7 +77,7 @@ AFQuery requires DuckDB to use Parquet for all temporary files. Arrow IPC is not **Cause:** A DuckDB float-division rounding bug was present in older AFQuery versions. When computing bucket IDs, `CAST(pos / 1000000 AS BIGINT)` rounds floats incorrectly (e.g., position 1,500,000 → bucket 2 instead of 1). -**Fix:** Upgrade to the latest AFQuery version. The fix uses `CAST(pos AS BIGINT) / 1000000` for correct integer division. Rebuild the database after upgrading. +**Fix:** Upgrade to the latest AFQuery version. The fix uses `CAST(pos AS BIGINT) // 1000000` — the integer-division operator — for correct bucket IDs. Rebuild the database after upgrading. --- diff --git a/docs/use-cases/acmg-use-cases.md b/docs/use-cases/acmg-use-cases.md index 66132a9..5ebb3b7 100644 --- a/docs/use-cases/acmg-use-cases.md +++ b/docs/use-cases/acmg-use-cases.md @@ -207,7 +207,7 @@ On chrX non-PAR regions, males contribute AN=1 and females AN=2. A cohort with 8 ### High N_FAIL at the Variant Site -`N_FAIL` counts eligible samples that were genotyped with the alt allele but failed quality filters (`FILTER≠PASS`). These samples are excluded from AC/AN, so AF is not directly inflated. However, a high N_FAIL indicates the site has systematic QC problems — and warrants caution before applying ACMG criteria. +`N_FAIL` counts eligible samples whose call at the site failed quality filters (`FILTER≠PASS`). These samples are excluded from AC but stay in AN, so they cannot inflate AF — if anything they make it slightly more conservative. A high N_FAIL still indicates the site has systematic QC problems, and warrants caution before applying ACMG criteria. | N_FAIL / n_eligible | Interpretation | |---|---| diff --git a/docs/use-cases/sex-specific-af.md b/docs/use-cases/sex-specific-af.md index b8a6dfd..b649303 100644 --- a/docs/use-cases/sex-specific-af.md +++ b/docs/use-cases/sex-specific-af.md @@ -34,10 +34,10 @@ afquery query --db ./db/ --locus chrX:153296777 ``` ``` -chrX:153296777 C>T AC=15 AN=2500 AF=0.0060 n_eligible=1500 N_HET=5 N_HOM_ALT=0 N_HOM_REF=1495 N_FAIL=0 +chrX:153296777 C>T AC=15 AN=2500 AF=0.0060 n_eligible=1500 N_HET=5 N_HOM_ALT=10 N_HOM_REF=1485 N_FAIL=0 ``` -AN=2500: 500 males × 1 + 1000 females × 2 = 2500 (mixed ploidy) +AN=2500: 500 males × 1 + 1000 females × 2 = 2500 (mixed ploidy). The 10 hemizygous male carriers land in N_HOM_ALT, not N_HET — on haploid positions there is no heterozygous state. ### 2. Query males only (hemizygous frequency) @@ -46,10 +46,10 @@ afquery query --db ./db/ --locus chrX:153296777 --sex male ``` ``` -chrX:153296777 C>T AC=10 AN=500 AF=0.0200 n_eligible=500 N_HET=10 N_HOM_ALT=0 N_HOM_REF=490 N_FAIL=0 +chrX:153296777 C>T AC=10 AN=500 AF=0.0200 n_eligible=500 N_HET=0 N_HOM_ALT=10 N_HOM_REF=490 N_FAIL=0 ``` -Hemizygous rate: 2% (N_HET here represents hemizygous males, GT=1) +Hemizygous rate: 2%. The carriers appear in N_HOM_ALT, not N_HET — males are haploid at chrX non-PAR, so `GT=1` carriers are counted as homozygous-alt. ### 3. Query females only (carrier frequency) From 4c90b2cffae9f8495a513ca93fd07d50e2a3b7c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20L=C3=B3pez=20L=C3=B3pez?= Date: Thu, 14 May 2026 17:36:31 +0200 Subject: [PATCH 2/3] fix(cli): emit N_NO_COVERAGE in query output The query command computed N_NO_COVERAGE but never printed it, so text, TSV and JSON output stopped at N_FAIL while annotate, dump and the Python API all exposed the field. Add it to all three formats and to the annotate help text. --- src/afquery/cli.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/afquery/cli.py b/src/afquery/cli.py index 0beea90..b722565 100644 --- a/src/afquery/cli.py +++ b/src/afquery/cli.py @@ -77,6 +77,7 @@ def _print_results(results, fmt: str) -> None: } if r.N_FAIL is not None: entry["N_FAIL"] = r.N_FAIL + entry["N_NO_COVERAGE"] = r.N_NO_COVERAGE out.append(entry) click.echo(json.dumps(out, indent=2)) elif fmt == "tsv": @@ -84,6 +85,7 @@ def _print_results(results, fmt: str) -> None: header = "chrom\tpos\tref\talt\tAC\tAN\tAF\tn_eligible\tN_HET\tN_HOM_ALT\tN_HOM_REF" if has_fail: header += "\tN_FAIL" + header += "\tN_NO_COVERAGE" click.echo(header) for r in results: af = f"{r.AF:.6f}" if r.AF is not None else "NA" @@ -94,6 +96,7 @@ def _print_results(results, fmt: str) -> None: ) if has_fail: line += f"\t{r.N_FAIL if r.N_FAIL is not None else '?'}" + line += f"\t{r.N_NO_COVERAGE}" click.echo(line) else: # text if not results: @@ -105,7 +108,8 @@ def _print_results(results, fmt: str) -> None: click.echo( f"{r.variant.chrom}:{r.variant.pos} {r.variant.ref}>{r.variant.alt} " f"AC={r.AC} AN={r.AN} AF={af} n_eligible={r.n_samples_eligible} " - f"N_HET={r.N_HET} N_HOM_ALT={r.N_HOM_ALT} N_HOM_REF={r.N_HOM_REF}{fail_str}" + f"N_HET={r.N_HET} N_HOM_ALT={r.N_HOM_ALT} N_HOM_REF={r.N_HOM_REF}" + f"{fail_str} N_NO_COVERAGE={r.N_NO_COVERAGE}" ) @@ -324,7 +328,8 @@ def annotate(db, input_vcf, output_vcf, phenotype, sex, tech, threads, verbose, AFQUERY_N_HET heterozygous carrier count (per ALT) AFQUERY_N_HOM_ALT homozygous alt count (per ALT) AFQUERY_N_HOM_REF homozygous ref count (per ALT) - AFQUERY_N_FAIL samples with FILTER!=PASS + AFQUERY_N_FAIL samples with FILTER!=PASS (per site) + AFQUERY_N_NO_COVERAGE eligible samples lacking coverage evidence (per ALT) """ if no_warn: import warnings From 710185e764f86361c9bb3792273cce6c27fb1bb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20L=C3=B3pez=20L=C3=B3pez?= Date: Thu, 14 May 2026 17:48:46 +0200 Subject: [PATCH 3/3] refactor(cli): drop dead N_FAIL conditionals in query output QueryResult.N_FAIL is a plain int defaulting to 0, never None, so the is-not-None guards in _print_results were always-true dead branches. N_FAIL now prints unconditionally and symmetrically with N_NO_COVERAGE across text/tsv/json. Adds a regression test pinning that both fields appear in all three formats and that TSV columns stay aligned. --- src/afquery/cli.py | 23 ++++++++--------------- tests/test_cli.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/src/afquery/cli.py b/src/afquery/cli.py index b722565..dda602e 100644 --- a/src/afquery/cli.py +++ b/src/afquery/cli.py @@ -74,29 +74,23 @@ def _print_results(results, fmt: str) -> None: "ref": r.variant.ref, "alt": r.variant.alt, "AC": r.AC, "AN": r.AN, "AF": r.AF, "n_eligible": r.n_samples_eligible, "N_HET": r.N_HET, "N_HOM_ALT": r.N_HOM_ALT, "N_HOM_REF": r.N_HOM_REF, + "N_FAIL": r.N_FAIL, "N_NO_COVERAGE": r.N_NO_COVERAGE, } - if r.N_FAIL is not None: - entry["N_FAIL"] = r.N_FAIL - entry["N_NO_COVERAGE"] = r.N_NO_COVERAGE out.append(entry) click.echo(json.dumps(out, indent=2)) elif fmt == "tsv": - has_fail = any(r.N_FAIL is not None for r in results) - header = "chrom\tpos\tref\talt\tAC\tAN\tAF\tn_eligible\tN_HET\tN_HOM_ALT\tN_HOM_REF" - if has_fail: - header += "\tN_FAIL" - header += "\tN_NO_COVERAGE" + header = ( + "chrom\tpos\tref\talt\tAC\tAN\tAF\tn_eligible\t" + "N_HET\tN_HOM_ALT\tN_HOM_REF\tN_FAIL\tN_NO_COVERAGE" + ) click.echo(header) for r in results: af = f"{r.AF:.6f}" if r.AF is not None else "NA" line = ( f"{r.variant.chrom}\t{r.variant.pos}\t{r.variant.ref}\t" f"{r.variant.alt}\t{r.AC}\t{r.AN}\t{af}\t{r.n_samples_eligible}\t" - f"{r.N_HET}\t{r.N_HOM_ALT}\t{r.N_HOM_REF}" + f"{r.N_HET}\t{r.N_HOM_ALT}\t{r.N_HOM_REF}\t{r.N_FAIL}\t{r.N_NO_COVERAGE}" ) - if has_fail: - line += f"\t{r.N_FAIL if r.N_FAIL is not None else '?'}" - line += f"\t{r.N_NO_COVERAGE}" click.echo(line) else: # text if not results: @@ -104,12 +98,11 @@ def _print_results(results, fmt: str) -> None: return for r in results: af = f"{r.AF:.4f}" if r.AF is not None else "NA" - fail_str = f" N_FAIL={r.N_FAIL}" if r.N_FAIL is not None else "" click.echo( f"{r.variant.chrom}:{r.variant.pos} {r.variant.ref}>{r.variant.alt} " f"AC={r.AC} AN={r.AN} AF={af} n_eligible={r.n_samples_eligible} " - f"N_HET={r.N_HET} N_HOM_ALT={r.N_HOM_ALT} N_HOM_REF={r.N_HOM_REF}" - f"{fail_str} N_NO_COVERAGE={r.N_NO_COVERAGE}" + f"N_HET={r.N_HET} N_HOM_ALT={r.N_HOM_ALT} N_HOM_REF={r.N_HOM_REF} " + f"N_FAIL={r.N_FAIL} N_NO_COVERAGE={r.N_NO_COVERAGE}" ) diff --git a/tests/test_cli.py b/tests/test_cli.py index b8c4526..4251286 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -435,6 +435,34 @@ def test_query_tsv_with_fail_col(runner, test_db): assert "N_FAIL" in lines[0] +def test_query_output_includes_n_no_coverage_all_formats(runner, test_db): + """N_FAIL and N_NO_COVERAGE appear in text/tsv/json output, and TSV columns line up.""" + base = ["query", "--db", test_db, "--locus", "chr1:1500"] + + text = runner.invoke(cli, base) + assert text.exit_code == 0 + assert "N_FAIL=" in text.output + assert "N_NO_COVERAGE=" in text.output + + tsv = runner.invoke(cli, base + ["--format", "tsv"]) + assert tsv.exit_code == 0 + tsv_lines = tsv.output.strip().splitlines() + assert len(tsv_lines) >= 2 + header = tsv_lines[0].split("\t") + assert "N_FAIL" in header + assert header[-1] == "N_NO_COVERAGE" + for row in tsv_lines[1:]: + assert len(row.split("\t")) == len(header) + + js = runner.invoke(cli, base + ["--format", "json"]) + assert js.exit_code == 0 + data = json.loads(js.output) + assert len(data) >= 1 + for entry in data: + assert isinstance(entry["N_FAIL"], int) + assert isinstance(entry["N_NO_COVERAGE"], int) + + # --- afquery annotate: verbose flag --- def test_annotate_verbose(runner, test_db, tmp_path):