Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions malariagen_data/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from typing import IO, Dict, Hashable, List, Mapping, Optional, Tuple, Union, Callable
from urllib.parse import unquote_plus
from numpy.testing import assert_allclose, assert_array_equal
import numpy as np


try:
from google import colab # type: ignore
Expand Down Expand Up @@ -1696,3 +1698,70 @@ def _distributed_client():
except ValueError:
client = None
return client


def compute_missing_rate(gt_array: np.ndarray) -> float:
"""Compute proportion of missing genotype calls (-1)."""
return np.mean(gt_array == -1)


def compute_informative_sites(gt_array: np.ndarray) -> int:
"""Count sites with at least one non-missing allele."""
return int(np.sum(np.any(gt_array != -1, axis=1)))


def summarize_chromosome(root, sample: str, chrom: str, n_sites: int = 10000) -> dict:
"""
Summarize genotype data for a given chromosome.

Parameters
----------
root : zarr.Group
Zarr root group.
sample : str
Sample identifier.
chrom : str
Chromosome name.
n_sites : int
Number of variants to sample.

Returns
-------
dict
Summary statistics.
"""
gt = root[f"{sample}/{chrom}/calldata/GT"][:n_sites, 0, :]

return {
"missing_rate": compute_missing_rate(gt),
"informative_sites": compute_informative_sites(gt),
"n_sites": gt.shape[0],
}


def summarize_sample(root, sample: str, n_sites: int = 10000) -> dict:
"""
Summarize all chromosomes for a given sample.

Parameters
----------
root : zarr.Group
Zarr root group.
sample : str
Sample identifier.

Returns
-------
dict
Dictionary with chromosome-level summaries.
"""
chromosomes = list(root[sample].keys())

summary = {}
for chrom in chromosomes:
try:
summary[chrom] = summarize_chromosome(root, sample, chrom, n_sites)
except Exception:
continue

return summary
12 changes: 12 additions & 0 deletions tests/test_genomics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import numpy as np
from malariagen_data.util import compute_missing_rate, compute_informative_sites


def test_compute_missing_rate():
gt = np.array([[0, 1], [-1, -1]])
assert compute_missing_rate(gt) == 0.5


def test_compute_informative_sites():
gt = np.array([[0, 1], [-1, -1]])
assert compute_informative_sites(gt) == 1