diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py index 7cd77d63a..e14b598db 100644 --- a/malariagen_data/anoph/fst.py +++ b/malariagen_data/anoph/fst.py @@ -1,3 +1,4 @@ +import warnings from typing import Tuple, Optional import numpy as np @@ -81,6 +82,26 @@ def _fst_gwss( chunks=chunks, ).compute() + n_snps = len(pos) + _min_snps_threshold = 1000 + _window_adjustment_factor = 10 + if n_snps < _min_snps_threshold: + raise ValueError( + f"Too few SNP sites ({n_snps}) available for Fst GWSS. " + f"At least {_min_snps_threshold} sites are required. " + "Try a larger genomic region or different site selection criteria." + ) + if window_size >= n_snps: + adjusted_window_size = max(1, n_snps // _window_adjustment_factor) + warnings.warn( + f"window_size ({window_size}) is >= the number of SNP sites " + f"available ({n_snps}); automatically adjusting window_size to " + f"{adjusted_window_size} (= {n_snps} // {_window_adjustment_factor}).", + UserWarning, + stacklevel=2, + ) + window_size = adjusted_window_size + with self._spinner(desc="Compute Fst"): with np.errstate(divide="ignore", invalid="ignore"): fst = allel.moving_hudson_fst(ac1, ac2, size=window_size) @@ -96,7 +117,10 @@ def _fst_gwss( @doc( summary=""" Run a Fst genome-wide scan to investigate genetic differentiation - between two cohorts. + between two cohorts. If window_size is >= the number of available + SNP sites, a UserWarning is issued and window_size is automatically + adjusted to number_of_snps // 10. A ValueError is raised if the + number of available SNP sites is below 1000. """, returns=dict( x="An array containing the window centre point genomic positions", diff --git a/tests/anoph/test_fst.py b/tests/anoph/test_fst.py index 790eb11b1..053e991fa 100644 --- a/tests/anoph/test_fst.py +++ b/tests/anoph/test_fst.py @@ -1,4 +1,5 @@ import itertools +import random import pytest from pytest_cases import parametrize_with_cases import numpy as np @@ -139,6 +140,31 @@ def test_fst_gwss(fixture, api: AnophelesFstAnalysis): assert isinstance(fig, bokeh.models.GridPlot) +@parametrize_with_cases("fixture,api", cases=".") +def test_fst_gwss_window_size_too_large(fixture, api: AnophelesFstAnalysis): + # When window_size exceeds available SNPs, a UserWarning must be issued and + # the function must still return a valid result using the adjusted window_size. + all_sample_sets = api.sample_sets()["sample_set"].to_list() + all_countries = api.sample_metadata()["country"].dropna().unique().tolist() + countries = random.sample(all_countries, 2) + cohort1_query = f"country == {countries[0]!r}" + cohort2_query = f"country == {countries[1]!r}" + with pytest.warns(UserWarning, match="window_size"): + x, fst = api.fst_gwss( + contig=random.choice(api.contigs), + sample_sets=all_sample_sets, + cohort1_query=cohort1_query, + cohort2_query=cohort2_query, + site_mask=random.choice(api.site_mask_ids), + window_size=10_000_000, # far larger than any fixture SNP count + min_cohort_size=1, + ) + assert isinstance(x, np.ndarray) + assert isinstance(fst, np.ndarray) + assert len(x) > 0 + assert x.shape == fst.shape + + @parametrize_with_cases("fixture,api", cases=".") def test_average_fst(fixture, api: AnophelesFstAnalysis): # Set up test parameters.