From b44284f0966972341c2ed7b84d72573d141b1d34 Mon Sep 17 00:00:00 2001 From: "Cleber F. Carvalho" Date: Mon, 30 Mar 2026 15:03:41 -0300 Subject: [PATCH] Add genomic summary helper functions for missing rate and informative sites --- malariagen_data/util.py | 69 +++++++++++++++++++++++++++++++++++++++++ tests/test_genomics.py | 12 +++++++ 2 files changed, 81 insertions(+) create mode 100644 tests/test_genomics.py diff --git a/malariagen_data/util.py b/malariagen_data/util.py index 2ab75ec56..d8d831335 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -13,6 +13,8 @@ from typing import IO, Dict, Hashable, List, Mapping, Optional, Tuple, Union, Callable from urllib.parse import unquote_plus from numpy.testing import assert_allclose, assert_array_equal +import numpy as np + try: from google import colab # type: ignore @@ -1696,3 +1698,70 @@ def _distributed_client(): except ValueError: client = None return client + + +def compute_missing_rate(gt_array: np.ndarray) -> float: + """Compute proportion of missing genotype calls (-1).""" + return np.mean(gt_array == -1) + + +def compute_informative_sites(gt_array: np.ndarray) -> int: + """Count sites with at least one non-missing allele.""" + return int(np.sum(np.any(gt_array != -1, axis=1))) + + +def summarize_chromosome(root, sample: str, chrom: str, n_sites: int = 10000) -> dict: + """ + Summarize genotype data for a given chromosome. + + Parameters + ---------- + root : zarr.Group + Zarr root group. + sample : str + Sample identifier. + chrom : str + Chromosome name. + n_sites : int + Number of variants to sample. + + Returns + ------- + dict + Summary statistics. + """ + gt = root[f"{sample}/{chrom}/calldata/GT"][:n_sites, 0, :] + + return { + "missing_rate": compute_missing_rate(gt), + "informative_sites": compute_informative_sites(gt), + "n_sites": gt.shape[0], + } + + +def summarize_sample(root, sample: str, n_sites: int = 10000) -> dict: + """ + Summarize all chromosomes for a given sample. + + Parameters + ---------- + root : zarr.Group + Zarr root group. + sample : str + Sample identifier. + + Returns + ------- + dict + Dictionary with chromosome-level summaries. + """ + chromosomes = list(root[sample].keys()) + + summary = {} + for chrom in chromosomes: + try: + summary[chrom] = summarize_chromosome(root, sample, chrom, n_sites) + except Exception: + continue + + return summary \ No newline at end of file diff --git a/tests/test_genomics.py b/tests/test_genomics.py new file mode 100644 index 000000000..c4194081d --- /dev/null +++ b/tests/test_genomics.py @@ -0,0 +1,12 @@ +import numpy as np +from malariagen_data.util import compute_missing_rate, compute_informative_sites + + +def test_compute_missing_rate(): + gt = np.array([[0, 1], [-1, -1]]) + assert compute_missing_rate(gt) == 0.5 + + +def test_compute_informative_sites(): + gt = np.array([[0, 1], [-1, -1]]) + assert compute_informative_sites(gt) == 1 \ No newline at end of file