From 472d074a81cf15147558c17183734fdbee9646fd Mon Sep 17 00:00:00 2001 From: Suhrid Marwah Date: Wed, 18 Mar 2026 15:07:12 +0000 Subject: [PATCH 1/4] fix: raise ValueError when window_size exceeds available SNPs in fst_gwss Signed-off-by: Suhrid Marwah --- malariagen_data/anoph/fst.py | 7 +++++++ tests/anoph/test_fst.py | 21 +++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py index 4b24a7625..bfd97ecea 100644 --- a/malariagen_data/anoph/fst.py +++ b/malariagen_data/anoph/fst.py @@ -88,6 +88,13 @@ def _fst_gwss( fst = np.clip(fst, a_min=clip_min, a_max=1) x = allel.moving_statistic(pos, statistic=np.mean, size=window_size) + if len(x) == 0: + raise ValueError( + f"No Fst windows could be computed: window_size={window_size!r} is " + f"larger than the number of SNP sites available ({len(pos)}) in the " + "selected region. Try reducing window_size or selecting a larger region." + ) + results = dict(x=x, fst=fst) return results diff --git a/tests/anoph/test_fst.py b/tests/anoph/test_fst.py index 9a65d7f6f..41100406d 100644 --- a/tests/anoph/test_fst.py +++ b/tests/anoph/test_fst.py @@ -140,6 +140,27 @@ def test_fst_gwss(fixture, api: AnophelesFstAnalysis): assert isinstance(fig, bokeh.models.GridPlot) +@parametrize_with_cases("fixture,api", cases=".") +def test_fst_gwss_window_size_too_large(fixture, api: AnophelesFstAnalysis): + # Use a window_size larger than the number of available SNPs to trigger the + # ValueError guard added in _fst_gwss. 
+ all_sample_sets = api.sample_sets()["sample_set"].to_list() + all_countries = api.sample_metadata()["country"].dropna().unique().tolist() + countries = random.sample(all_countries, 2) + cohort1_query = f"country == {countries[0]!r}" + cohort2_query = f"country == {countries[1]!r}" + with pytest.raises(ValueError, match="window_size"): + api.fst_gwss( + contig=random.choice(api.contigs), + sample_sets=all_sample_sets, + cohort1_query=cohort1_query, + cohort2_query=cohort2_query, + site_mask=random.choice(api.site_mask_ids), + window_size=10_000_000, # far larger than any fixture SNP count + min_cohort_size=1, + ) + + @parametrize_with_cases("fixture,api", cases=".") def test_average_fst(fixture, api: AnophelesFstAnalysis): # Set up test parameters. From a05bab0d1f916e33e13f005b7b1354bc797e0fb6 Mon Sep 17 00:00:00 2001 From: Suhrid Marwah Date: Thu, 19 Mar 2026 06:07:32 +0000 Subject: [PATCH 2/4] fix: gracefully handle oversized window_size with warning and adjustment Replace hard error when window_size exceeds available SNPs with a warning, and automatically adjust window_size to the maximum valid value so that computation can proceed. 
Signed-off-by: Suhrid Marwah --- malariagen_data/anoph/fst.py | 22 ++++++++++++++-------- tests/anoph/test_fst.py | 12 ++++++++---- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py index bfd97ecea..35c18bc00 100644 --- a/malariagen_data/anoph/fst.py +++ b/malariagen_data/anoph/fst.py @@ -1,3 +1,4 @@ +import warnings from typing import Tuple, Optional import numpy as np @@ -81,6 +82,16 @@ def _fst_gwss( chunks=chunks, ).compute() + n_snps = len(pos) + if window_size > n_snps: + warnings.warn( + f"window_size ({window_size}) is larger than the number of SNP sites " + f"available ({n_snps}); adjusting window_size to {n_snps}.", + UserWarning, + stacklevel=2, + ) + window_size = n_snps + with self._spinner(desc="Compute Fst"): with np.errstate(divide="ignore", invalid="ignore"): fst = allel.moving_hudson_fst(ac1, ac2, size=window_size) @@ -88,13 +99,6 @@ def _fst_gwss( fst = np.clip(fst, a_min=clip_min, a_max=1) x = allel.moving_statistic(pos, statistic=np.mean, size=window_size) - if len(x) == 0: - raise ValueError( - f"No Fst windows could be computed: window_size={window_size!r} is " - f"larger than the number of SNP sites available ({len(pos)}) in the " - "selected region. Try reducing window_size or selecting a larger region." - ) - results = dict(x=x, fst=fst) return results @@ -103,7 +107,9 @@ def _fst_gwss( @doc( summary=""" Run a Fst genome-wide scan to investigate genetic differentiation - between two cohorts. + between two cohorts. If window_size exceeds the number of available + SNP sites, a UserWarning is issued and window_size is automatically + reduced to the number of available sites. 
""", returns=dict( x="An array containing the window centre point genomic positions", diff --git a/tests/anoph/test_fst.py b/tests/anoph/test_fst.py index 41100406d..06f9a80e2 100644 --- a/tests/anoph/test_fst.py +++ b/tests/anoph/test_fst.py @@ -142,15 +142,15 @@ def test_fst_gwss(fixture, api: AnophelesFstAnalysis): @parametrize_with_cases("fixture,api", cases=".") def test_fst_gwss_window_size_too_large(fixture, api: AnophelesFstAnalysis): - # Use a window_size larger than the number of available SNPs to trigger the - # ValueError guard added in _fst_gwss. + # When window_size exceeds available SNPs, a UserWarning must be issued and + # the function must still return a valid result using the adjusted window_size. all_sample_sets = api.sample_sets()["sample_set"].to_list() all_countries = api.sample_metadata()["country"].dropna().unique().tolist() countries = random.sample(all_countries, 2) cohort1_query = f"country == {countries[0]!r}" cohort2_query = f"country == {countries[1]!r}" - with pytest.raises(ValueError, match="window_size"): - api.fst_gwss( + with pytest.warns(UserWarning, match="window_size"): + x, fst = api.fst_gwss( contig=random.choice(api.contigs), sample_sets=all_sample_sets, cohort1_query=cohort1_query, @@ -159,6 +159,10 @@ def test_fst_gwss_window_size_too_large(fixture, api: AnophelesFstAnalysis): window_size=10_000_000, # far larger than any fixture SNP count min_cohort_size=1, ) + assert isinstance(x, np.ndarray) + assert isinstance(fst, np.ndarray) + assert len(x) > 0 + assert x.shape == fst.shape @parametrize_with_cases("fixture,api", cases=".") From c3f42abd718e30fba5424763742c99caf35ddfdb Mon Sep 17 00:00:00 2001 From: suhr25 Date: Sun, 29 Mar 2026 09:54:13 +0000 Subject: [PATCH 3/4] fix: gracefully handle oversized window_size by auto-adjusting When window_size is >= the number of available SNP sites, issue a UserWarning and reduce window_size to n_snps // 10 (previously n_snps). Also raise a ValueError when fewer than 1000 SNP sites are available. Tests are not changed by this patch. --- malariagen_data/anoph/fst.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py index 
35c18bc00..d20d1f0c0 100644 --- a/malariagen_data/anoph/fst.py +++ b/malariagen_data/anoph/fst.py @@ -83,14 +83,24 @@ def _fst_gwss( ).compute() n_snps = len(pos) - if window_size > n_snps: + _min_snps_threshold = 1000 + _window_adjustment_factor = 10 + if n_snps < _min_snps_threshold: + raise ValueError( + f"Too few SNP sites ({n_snps}) available for Fst GWSS. " + f"At least {_min_snps_threshold} sites are required. " + "Try a larger genomic region or different site selection criteria." + ) + if window_size >= n_snps: + adjusted_window_size = max(1, n_snps // _window_adjustment_factor) warnings.warn( - f"window_size ({window_size}) is larger than the number of SNP sites " - f"available ({n_snps}); adjusting window_size to {n_snps}.", + f"window_size ({window_size}) is >= the number of SNP sites " + f"available ({n_snps}); automatically adjusting window_size to " + f"{adjusted_window_size} (= {n_snps} // {_window_adjustment_factor}).", UserWarning, stacklevel=2, ) - window_size = n_snps + window_size = adjusted_window_size with self._spinner(desc="Compute Fst"): with np.errstate(divide="ignore", invalid="ignore"): @@ -107,9 +117,10 @@ def _fst_gwss( @doc( summary=""" Run a Fst genome-wide scan to investigate genetic differentiation - between two cohorts. If window_size exceeds the number of available + between two cohorts. If window_size is >= the number of available SNP sites, a UserWarning is issued and window_size is automatically - reduced to the number of available sites. + adjusted to number_of_snps // 10. A ValueError is raised if the + number of available SNP sites is below 1000. 
""", returns=dict( x="An array containing the window centre point genomic positions", From f2f4f917913d65ac0c77f39df3d1b577a3d5a970 Mon Sep 17 00:00:00 2001 From: suhr25 Date: Fri, 3 Apr 2026 18:59:49 +0000 Subject: [PATCH 4/4] fix: restore missing import random in test_fst.py lost during master merge --- tests/anoph/test_fst.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/anoph/test_fst.py b/tests/anoph/test_fst.py index af031b0ee..053e991fa 100644 --- a/tests/anoph/test_fst.py +++ b/tests/anoph/test_fst.py @@ -1,4 +1,5 @@ import itertools +import random import pytest from pytest_cases import parametrize_with_cases import numpy as np