Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion malariagen_data/anoph/fst.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from typing import Tuple, Optional

import numpy as np
Expand Down Expand Up @@ -81,6 +82,26 @@ def _fst_gwss(
chunks=chunks,
).compute()

n_snps = len(pos)
_min_snps_threshold = 1000
_window_adjustment_factor = 10
if n_snps < _min_snps_threshold:
raise ValueError(
f"Too few SNP sites ({n_snps}) available for Fst GWSS. "
f"At least {_min_snps_threshold} sites are required. "
"Try a larger genomic region or different site selection criteria."
)
if window_size >= n_snps:
adjusted_window_size = max(1, n_snps // _window_adjustment_factor)
warnings.warn(
f"window_size ({window_size}) is >= the number of SNP sites "
f"available ({n_snps}); automatically adjusting window_size to "
f"{adjusted_window_size} (= {n_snps} // {_window_adjustment_factor}).",
UserWarning,
stacklevel=2,
)
window_size = adjusted_window_size

with self._spinner(desc="Compute Fst"):
with np.errstate(divide="ignore", invalid="ignore"):
fst = allel.moving_hudson_fst(ac1, ac2, size=window_size)
Expand All @@ -96,7 +117,10 @@ def _fst_gwss(
@doc(
summary="""
Run a Fst genome-wide scan to investigate genetic differentiation
between two cohorts.
between two cohorts. If window_size is >= the number of available
SNP sites, a UserWarning is issued and window_size is automatically
adjusted to number_of_snps // 10. A ValueError is raised if the
number of available SNP sites is below 1000.
""",
returns=dict(
x="An array containing the window centre point genomic positions",
Expand Down
26 changes: 26 additions & 0 deletions tests/anoph/test_fst.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import itertools
import random
import pytest
from pytest_cases import parametrize_with_cases
import numpy as np
Expand Down Expand Up @@ -139,6 +140,31 @@ def test_fst_gwss(fixture, api: AnophelesFstAnalysis):
assert isinstance(fig, bokeh.models.GridPlot)


@parametrize_with_cases("fixture,api", cases=".")
def test_fst_gwss_window_size_too_large(fixture, api: AnophelesFstAnalysis):
# When window_size exceeds available SNPs, a UserWarning must be issued and
# the function must still return a valid result using the adjusted window_size.
all_sample_sets = api.sample_sets()["sample_set"].to_list()
all_countries = api.sample_metadata()["country"].dropna().unique().tolist()
countries = random.sample(all_countries, 2)
cohort1_query = f"country == {countries[0]!r}"
cohort2_query = f"country == {countries[1]!r}"
with pytest.warns(UserWarning, match="window_size"):
x, fst = api.fst_gwss(
contig=random.choice(api.contigs),
sample_sets=all_sample_sets,
cohort1_query=cohort1_query,
cohort2_query=cohort2_query,
site_mask=random.choice(api.site_mask_ids),
window_size=10_000_000, # far larger than any fixture SNP count
min_cohort_size=1,
)
assert isinstance(x, np.ndarray)
assert isinstance(fst, np.ndarray)
assert len(x) > 0
assert x.shape == fst.shape


@parametrize_with_cases("fixture,api", cases=".")
def test_average_fst(fixture, api: AnophelesFstAnalysis):
# Set up test parameters.
Expand Down
Loading