From a21a7e1628a3f992b954495170b86d5806c31231 Mon Sep 17 00:00:00 2001 From: Kevin Thornton Date: Sun, 7 Dec 2025 07:57:44 -0800 Subject: [PATCH 01/16] add ::testing --- src/lib.rs | 2 ++ src/testing/mod.rs | 0 2 files changed, 2 insertions(+) create mode 100644 src/testing/mod.rs diff --git a/src/lib.rs b/src/lib.rs index 45a9811..ecd510b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,8 @@ pub mod iter; pub mod stats; mod util; +#[cfg(test)] +mod testing; #[cfg(test)] mod naivecalculations; #[cfg(test)] diff --git a/src/testing/mod.rs b/src/testing/mod.rs new file mode 100644 index 0000000..e69de29 From 35d1244447eafb3f786b5c3ce714ffccfcc94562 Mon Sep 17 00:00:00 2001 From: Kevin Thornton Date: Sun, 7 Dec 2025 08:03:27 -0800 Subject: [PATCH 02/16] pass --- src/lib.rs | 6 ---- src/testing/mod.rs | 3 ++ src/{ => testing}/naivecalculations.rs | 4 +-- src/{ => testing}/test.rs | 41 +++++++++++++++----------- src/{ => testing}/testdata.rs | 0 5 files changed, 29 insertions(+), 25 deletions(-) rename src/{ => testing}/naivecalculations.rs (95%) rename src/{ => testing}/test.rs (93%) rename src/{ => testing}/testdata.rs (100%) diff --git a/src/lib.rs b/src/lib.rs index ecd510b..f1629e4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,12 +11,6 @@ mod util; #[cfg(test)] mod testing; -#[cfg(test)] -mod naivecalculations; -#[cfg(test)] -mod test; -#[cfg(test)] -mod testdata; pub use counts::*; diff --git a/src/testing/mod.rs b/src/testing/mod.rs index e69de29..e1fbc6f 100644 --- a/src/testing/mod.rs +++ b/src/testing/mod.rs @@ -0,0 +1,3 @@ +mod naivecalculations; +mod test; +mod testdata; diff --git a/src/naivecalculations.rs b/src/testing/naivecalculations.rs similarity index 95% rename from src/naivecalculations.rs rename to src/testing/naivecalculations.rs index 4cb7c1c..9e23066 100644 --- a/src/naivecalculations.rs +++ b/src/testing/naivecalculations.rs @@ -1,5 +1,5 @@ -use crate::testdata::GenotypeData; -use crate::testdata::Site; +use crate::testing::testdata::GenotypeData; +use crate::testing::testdata::Site; fn flatten_to_alleles(genotypes: &mut dyn Iterator) -> Vec { let mut alleles = vec![]; diff --git a/src/test.rs b/src/testing/test.rs similarity index 93% rename from src/test.rs rename to src/testing/test.rs index 7b0ab66..d8c6d4d 100644 --- a/src/test.rs +++ b/src/testing/test.rs @@ -540,14 +540,15 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# // make 10 random sites. // No missing data, etc.. for _ in 0..10 { - let site = crate::testdata::random_site_rng(10, ploidy, &freqs, None, &mut rng); + let site = + crate::testing::testdata::random_site_rng(10, ploidy, &freqs, None, &mut rng); sites.push(site); } // convert to our normal format - let counts = crate::testdata::single_pop_counts(&mut sites.iter()); + let counts = crate::testing::testdata::single_pop_counts(&mut sites.iter()); // get the calcs let pi_from_counts = GlobalPi::from_iter_sites(counts.iter()); - let pi_naive = crate::naivecalculations::pi(&mut sites.iter_mut()); + let pi_naive = crate::testing::naivecalculations::pi(&mut sites.iter_mut()); // compare if pi_naive.is_nan() { assert!(pi_from_counts.as_raw().is_nan()); @@ -573,11 +574,11 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# // make 10 random sites. // No missing data, etc.. for _ in 0..10 { - let site = crate::testdata::random_site_rng( + let site = crate::testing::testdata::random_site_rng( 10, ploidy, &freqs, - Some(crate::testdata::RandomSiteOptions { + Some(crate::testing::testdata::RandomSiteOptions { missing_data_rate: Some(rate), }), &mut rng, @@ -585,10 +586,10 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# sites.push(site); } // convert to our normal format - let counts = crate::testdata::single_pop_counts(&mut sites.iter()); + let counts = crate::testing::testdata::single_pop_counts(&mut sites.iter()); // get the calcs let pi_from_counts = GlobalPi::from_iter_sites(counts.iter()); - let pi_naive = crate::naivecalculations::pi(&mut sites.iter_mut()); + let pi_naive = crate::testing::naivecalculations::pi(&mut sites.iter_mut()); // compare if pi_naive.is_nan() { assert!(pi_from_counts.as_raw().is_nan()); @@ -609,13 +610,16 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# let mut rng = StdRng::seed_from_u64(54321); for ploidy in [1, 2, 4] { for num_alleles in [2, 3, 4] { - let freqs_iter = crate::testdata::FixedMutationIterator::new(num_alleles); + let freqs_iter = crate::testing::testdata::FixedMutationIterator::new(num_alleles); for freqs in freqs_iter.iter() { assert_eq!(freqs.len(), num_alleles); - let site = crate::testdata::random_site_rng(10, ploidy, &freqs, None, &mut rng); + let site = crate::testing::testdata::random_site_rng( + 10, ploidy, &freqs, None, &mut rng, + ); // convert to our normal format - let counts = crate::testdata::single_pop_counts(&mut std::iter::once(&site)); + let counts = + crate::testing::testdata::single_pop_counts(&mut std::iter::once(&site)); let pi_from_counts = GlobalPi::from_iter_sites(counts.iter()); assert_eq!(pi_from_counts.as_raw(), 0.); } @@ -717,13 +721,15 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# // make 10 random sites. // No missing data, etc.. for _ in 0..10 { - let site = crate::testdata::random_site_rng(10, ploidy, &freqs, None, &mut rng); + let site = + crate::testing::testdata::random_site_rng(10, ploidy, &freqs, None, &mut rng); sites.push(site); } // convert to our normal format - let counts = crate::testdata::single_pop_counts(&mut sites.iter()); + let counts = crate::testing::testdata::single_pop_counts(&mut sites.iter()); let theta = WattersonTheta::from_iter_sites(counts.iter()); - let theta_naive = crate::naivecalculations::watterson_theta(&mut sites.iter_mut()); + let theta_naive = + crate::testing::naivecalculations::watterson_theta(&mut sites.iter_mut()); if theta_naive.is_nan() { assert!(theta.as_raw().is_nan()) } else { @@ -745,11 +751,11 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# // make 10 random sites. // No missing data, etc.. for _ in 0..10 { - let site = crate::testdata::random_site_rng( + let site = crate::testing::testdata::random_site_rng( 10, ploidy, &freqs, - Some(crate::testdata::RandomSiteOptions { + Some(crate::testing::testdata::RandomSiteOptions { missing_data_rate: Some(rate), }), &mut rng, @@ -757,10 +763,11 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# sites.push(site); } // convert to our normal format - let counts = crate::testdata::single_pop_counts(&mut sites.iter()); + let counts = crate::testing::testdata::single_pop_counts(&mut sites.iter()); // get the calcs let theta = WattersonTheta::from_iter_sites(counts.iter()); - let theta_naive = crate::naivecalculations::watterson_theta(&mut sites.iter_mut()); + let theta_naive = + crate::testing::naivecalculations::watterson_theta(&mut sites.iter_mut()); // compare if theta_naive.is_nan() { assert!(theta.as_raw().is_nan()); diff --git a/src/testdata.rs b/src/testing/testdata.rs similarity index 100% rename from src/testdata.rs rename to src/testing/testdata.rs From 280f0cea592cce8fc73ced3961762b65414c45b1 Mon Sep 17 00:00:00 2001 From: Kevin Thornton Date: Sun, 7 Dec 2025 08:04:24 -0800 Subject: [PATCH 03/16] todo --- src/testing/mod.rs | 2 ++ src/testing/pi.rs | 4 ++++ 2 files changed, 6 insertions(+) create mode 100644 src/testing/pi.rs diff --git a/src/testing/mod.rs b/src/testing/mod.rs index e1fbc6f..b196efd 100644 --- a/src/testing/mod.rs +++ b/src/testing/mod.rs @@ -1,3 +1,5 @@ mod naivecalculations; mod test; mod testdata; + +mod pi; diff --git a/src/testing/pi.rs b/src/testing/pi.rs new file mode 100644 index 0000000..09f480d --- /dev/null +++ b/src/testing/pi.rs @@ -0,0 +1,4 @@ +#[test] +fn test() { + todo!("move test for pi here") +} From daf802eb93d335cfbefa14891999725efb61d195 Mon Sep 17 00:00:00 2001 From: Kevin Thornton Date: Sun, 7 Dec 2025 08:51:48 -0800 Subject: [PATCH 04/16] move pi --- src/testing/pi.rs | 106 +++++++++++++++- src/testing/test.rs | 303 +++++++++++++++----------------------------- 2 files changed, 203 insertions(+), 206 deletions(-) diff --git a/src/testing/pi.rs b/src/testing/pi.rs index 09f480d..2b67d25 100644 --- a/src/testing/pi.rs +++ b/src/testing/pi.rs @@ -1,4 +1,106 @@ +use crate::stats::GlobalPi; +use crate::stats::GlobalStatistic; + +// The basic logic here is that we should +// be able to make random data, convert it +// to our normal format, and get stats +// from both data inputs and get the same +// answer. +// If this is not possible then we have bugs +// in one of several possible places... #[test] -fn test() { - todo!("move test for pi here") +fn pi_from_random_data() { + use rand::prelude::*; + + let mut rng = StdRng::seed_from_u64(54321); + for ploidy in [1, 2, 3, 4] { + let freqs = [0.25, 0.5, 0.25]; // fixed allele freqs per site + let mut sites = vec![]; + // make 10 random sites. + // No missing data, etc.. + for _ in 0..10 { + let site = + crate::testing::testdata::random_site_rng(10, ploidy, &freqs, None, &mut rng); + sites.push(site); + } + // convert to our normal format + let counts = crate::testing::testdata::single_pop_counts(&mut sites.iter()); + // get the calcs + let pi_from_counts = GlobalPi::from_iter_sites(counts.iter()); + let pi_naive = crate::testing::naivecalculations::pi(&mut sites.iter_mut()); + // compare + if pi_naive.is_nan() { + assert!(pi_from_counts.as_raw().is_nan()); + } else { + assert!( + (pi_from_counts.as_raw() - pi_naive).abs() <= 1e-10, + "{pi_from_counts:?} != {pi_naive}" + ); + } + } +} + +#[test] +fn pi_from_random_data_with_missing_data() { + use rand::prelude::*; + + let mut rng = StdRng::seed_from_u64(54321); + + for ploidy in [1, 2, 4] { + for rate in [0.01, 0.1, 0.5, 0.9] { + let freqs = [0.25, 0.5, 0.25]; // fixed allele freqs per site + let mut sites = vec![]; + // make 10 random sites. + // No missing data, etc.. + for _ in 0..10 { + let site = crate::testing::testdata::random_site_rng( + 10, + ploidy, + &freqs, + Some(crate::testing::testdata::RandomSiteOptions { + missing_data_rate: Some(rate), + }), + &mut rng, + ); + sites.push(site); + } + // convert to our normal format + let counts = crate::testing::testdata::single_pop_counts(&mut sites.iter()); + // get the calcs + let pi_from_counts = GlobalPi::from_iter_sites(counts.iter()); + let pi_naive = crate::testing::naivecalculations::pi(&mut sites.iter_mut()); + // compare + if pi_naive.is_nan() { + assert!(pi_from_counts.as_raw().is_nan()); + } else { + assert!( + (pi_from_counts.as_raw() - pi_naive).abs() <= 1e-10, + "{pi_from_counts:?} != {pi_naive}" + ); + } + } + } +} + +#[test] +fn pi_allele_frequency_of_one() { + use rand::prelude::*; + + let mut rng = StdRng::seed_from_u64(54321); + for ploidy in [1, 2, 4] { + for num_alleles in [2, 3, 4] { + let freqs_iter = crate::testing::testdata::FixedMutationIterator::new(num_alleles); + + for freqs in freqs_iter.iter() { + assert_eq!(freqs.len(), num_alleles); + let site = + crate::testing::testdata::random_site_rng(10, ploidy, &freqs, None, &mut rng); + // convert to our normal format + let counts = + crate::testing::testdata::single_pop_counts(&mut std::iter::once(&site)); + let pi_from_counts = GlobalPi::from_iter_sites(counts.iter()); + assert_eq!(pi_from_counts.as_raw(), 0.); + } + } + } } diff --git a/src/testing/test.rs b/src/testing/test.rs index d8c6d4d..72d9d65 100644 --- a/src/testing/test.rs +++ b/src/testing/test.rs @@ -9,9 +9,9 @@ mod tests { use rand::rngs::ThreadRng; use rand::seq::SliceRandom; use std::iter::repeat_n; - use triangle_matrix::{ - SimpleLowerTri, SymmetricUpperTri, SymmetricUpperTriMut, Triangle, TriangleMut, - }; + // use triangle_matrix::{ + // SimpleLowerTri, SymmetricUpperTri, SymmetricUpperTriMut, Triangle, TriangleMut, + // }; #[cfg(feature = "noodles")] mod vcf { @@ -281,25 +281,25 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# } } - struct TriVec(usize, Vec); + // struct TriVec(usize, Vec); - impl Triangle for TriVec { - type Inner = Vec; + // impl Triangle for TriVec { + // type Inner = Vec; - fn n(&self) -> usize { - self.0 - } + // fn n(&self) -> usize { + // self.0 + // } - fn inner(&self) -> &Self::Inner { - &self.1 - } - } + // fn inner(&self) -> &Self::Inner { + // &self.1 + // } + // } - impl TriangleMut for TriVec { - fn inner_mut(&mut self) -> &mut Self::Inner { - &mut self.1 - } - } + // impl TriangleMut for TriVec { + // fn inner_mut(&mut self) -> &mut Self::Inner { + // &mut self.1 + // } + // } fn shuffled_site( ids: impl Iterator, usize)>, @@ -317,40 +317,40 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# /// inefficient O(n^2) computation of pairwise diversity at a site /// /// hidden in test module because nobody should use this; just want to verify without magic numbers that calculations are correct - fn pi_from_matrix(alleles: &[Option]) -> f64 { - use SymmetricUpperTri; - use SymmetricUpperTriMut; - - let total_alleles = alleles.len(); - let mut mat = TriVec( - total_alleles, - Vec::from_iter(repeat_n( - None::, - total_alleles * (total_alleles + 1) / 2, - )), - ); - - for (ind_a, allele_a) in alleles.iter().enumerate() { - for (ind_b, allele_b) in alleles.iter().enumerate().skip(ind_a + 1) { - *SymmetricUpperTriMut::get_element_mut(&mut mat, ind_a, ind_b) = match *allele_a { - None => None, - Some(allele_id_a) => match *allele_b { - None => None, - Some(allele_id_b) => match allele_id_a.eq(&allele_id_b) { - false => Some(1), - true => Some(0), - }, - }, - } - } - } - - let make_iter = || { - mat.iter_triangle_indices() - .filter_map(|(i, j)| *SymmetricUpperTri::get_element(&mat, i, j)) - }; - make_iter().sum::() as f64 / make_iter().count() as f64 - } + // fn pi_from_matrix(alleles: &[Option]) -> f64 { + // use SymmetricUpperTri; + // use SymmetricUpperTriMut; + + // let total_alleles = alleles.len(); + // let mut mat = TriVec( + // total_alleles, + // Vec::from_iter(repeat_n( + // None::, + // total_alleles * (total_alleles + 1) / 2, + // )), + // ); + + // for (ind_a, allele_a) in alleles.iter().enumerate() { + // for (ind_b, allele_b) in alleles.iter().enumerate().skip(ind_a + 1) { + // *SymmetricUpperTriMut::get_element_mut(&mut mat, ind_a, ind_b) = match *allele_a { + // None => None, + // Some(allele_id_a) => match *allele_b { + // None => None, + // Some(allele_id_b) => match allele_id_a.eq(&allele_id_b) { + // false => Some(1), + // true => Some(0), + // }, + // }, + // } + // } + // } + + // let make_iter = || { + // mat.iter_triangle_indices() + // .filter_map(|(i, j)| *SymmetricUpperTri::get_element(&mat, i, j)) + // }; + // make_iter().sum::() as f64 / make_iter().count() as f64 + // } #[test] fn load_raw() { @@ -420,53 +420,53 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# counts.add_site_from_counts([1, 2, 3], 1).unwrap(); } - #[test] - fn global_pi() { - let mut rng = rng(); - let sites = vec![ - shuffled_site( - vec![ - (Some(AlleleID::from(0)), 35), - (Some(AlleleID::from(1)), 6), - (None, 3), - ] - .into_iter(), - &mut rng, - ), - shuffled_site( - vec![ - (Some(AlleleID::from(0)), 2), - (Some(AlleleID::from(1)), 14), - (Some(AlleleID::from(2)), 155), - ] - .into_iter(), - &mut rng, - ), - ]; - - let expect_site_0 = pi_from_matrix(&sites[0]); - let expect_site_1 = pi_from_matrix(&sites[1]); - - let allele_counts = MultiSiteCounts::from_tabular(sites); - - assert!( - (GlobalPi::from_iter_sites(allele_counts.iter().take(1)).as_raw() - expect_site_0) - .abs() - < f64::EPSILON - ); - assert!( - (GlobalPi::from_iter_sites(allele_counts.iter().skip(1).take(1)).as_raw() - - expect_site_1) - .abs() - < f64::EPSILON - ); - assert!( - (GlobalPi::from_iter_sites(allele_counts.iter()).as_raw() - - (expect_site_0 + expect_site_1)) - .abs() - < f64::EPSILON - ); - } + // #[test] + // fn global_pi() { + // let mut rng = rng(); + // let sites = vec![ + // shuffled_site( + // vec![ + // (Some(AlleleID::from(0)), 35), + // (Some(AlleleID::from(1)), 6), + // (None, 3), + // ] + // .into_iter(), + // &mut rng, + // ), + // shuffled_site( + // vec![ + // (Some(AlleleID::from(0)), 2), + // (Some(AlleleID::from(1)), 14), + // (Some(AlleleID::from(2)), 155), + // ] + // .into_iter(), + // &mut rng, + // ), + // ]; + + // let expect_site_0 = pi_from_matrix(&sites[0]); + // let expect_site_1 = pi_from_matrix(&sites[1]); + + // let allele_counts = MultiSiteCounts::from_tabular(sites); + + // assert!( + // (GlobalPi::from_iter_sites(allele_counts.iter().take(1)).as_raw() - expect_site_0) + // .abs() + // < f64::EPSILON + // ); + // assert!( + // (GlobalPi::from_iter_sites(allele_counts.iter().skip(1).take(1)).as_raw() + // - expect_site_1) + // .abs() + // < f64::EPSILON + // ); + // assert!( + // (GlobalPi::from_iter_sites(allele_counts.iter()).as_raw() + // - (expect_site_0 + expect_site_1)) + // .abs() + // < f64::EPSILON + // ); + // } #[test] fn watterson_theta() { @@ -522,111 +522,6 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# assert!((tajima.as_raw() - -0.15474069911037955).abs() < f64::EPSILON); } - // The basic logic here is that we should - // be able to make random data, convert it - // to our normal format, and get stats - // from both data inputs and get the same - // answer. - // If this is not possible then we have bugs - // in one of several possible places... - #[test] - fn pi_from_random_data() { - use rand::prelude::*; - - let mut rng = StdRng::seed_from_u64(54321); - for ploidy in [1, 2, 3, 4] { - let freqs = [0.25, 0.5, 0.25]; // fixed allele freqs per site - let mut sites = vec![]; - // make 10 random sites. - // No missing data, etc.. - for _ in 0..10 { - let site = - crate::testing::testdata::random_site_rng(10, ploidy, &freqs, None, &mut rng); - sites.push(site); - } - // convert to our normal format - let counts = crate::testing::testdata::single_pop_counts(&mut sites.iter()); - // get the calcs - let pi_from_counts = GlobalPi::from_iter_sites(counts.iter()); - let pi_naive = crate::testing::naivecalculations::pi(&mut sites.iter_mut()); - // compare - if pi_naive.is_nan() { - assert!(pi_from_counts.as_raw().is_nan()); - } else { - assert!( - (pi_from_counts.as_raw() - pi_naive).abs() <= 1e-10, - "{pi_from_counts:?} != {pi_naive}" - ); - } - } - } - - #[test] - fn pi_from_random_data_with_missing_data() { - use rand::prelude::*; - - let mut rng = StdRng::seed_from_u64(54321); - - for ploidy in [1, 2, 4] { - for rate in [0.01, 0.1, 0.5, 0.9] { - let freqs = [0.25, 0.5, 0.25]; // fixed allele freqs per site - let mut sites = vec![]; - // make 10 random sites. - // No missing data, etc.. - for _ in 0..10 { - let site = crate::testing::testdata::random_site_rng( - 10, - ploidy, - &freqs, - Some(crate::testing::testdata::RandomSiteOptions { - missing_data_rate: Some(rate), - }), - &mut rng, - ); - sites.push(site); - } - // convert to our normal format - let counts = crate::testing::testdata::single_pop_counts(&mut sites.iter()); - // get the calcs - let pi_from_counts = GlobalPi::from_iter_sites(counts.iter()); - let pi_naive = crate::testing::naivecalculations::pi(&mut sites.iter_mut()); - // compare - if pi_naive.is_nan() { - assert!(pi_from_counts.as_raw().is_nan()); - } else { - assert!( - (pi_from_counts.as_raw() - pi_naive).abs() <= 1e-10, - "{pi_from_counts:?} != {pi_naive}" - ); - } - } - } - } - - #[test] - fn pi_allele_frequency_of_one() { - use rand::prelude::*; - - let mut rng = StdRng::seed_from_u64(54321); - for ploidy in [1, 2, 4] { - for num_alleles in [2, 3, 4] { - let freqs_iter = crate::testing::testdata::FixedMutationIterator::new(num_alleles); - - for freqs in freqs_iter.iter() { - assert_eq!(freqs.len(), num_alleles); - let site = crate::testing::testdata::random_site_rng( - 10, ploidy, &freqs, None, &mut rng, - ); - // convert to our normal format - let counts = - crate::testing::testdata::single_pop_counts(&mut std::iter::once(&site)); - let pi_from_counts = GlobalPi::from_iter_sites(counts.iter()); - assert_eq!(pi_from_counts.as_raw(), 0.); - } - } - } - } - #[test] fn f_st_empty() { fn ok(populations: &MultiPopulationCounts) { From 8274a02bee063aeda872f25856fff3e6849fde41 Mon Sep 17 00:00:00 2001 From: Kevin Thornton Date: Sun, 7 Dec 2025 08:52:13 -0800 Subject: [PATCH 05/16] drop triangle_matrix dev dependency --- Cargo.lock | 7 ------- Cargo.toml | 1 - 2 files changed, 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9df0fca..90801e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -442,7 +442,6 @@ dependencies = [ "noodles-core", "rand", "rand_distr", - "triangle_matrix", "tskit", ] @@ -635,12 +634,6 @@ dependencies = [ "syn", ] -[[package]] -name = "triangle_matrix" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d2c16bd91b39350657d29da25c9cf9ecbc3d95506529c37373f6ae350241ff1" - [[package]] name = "tskit" version = "0.15.0-alpha.2" diff --git a/Cargo.toml b/Cargo.toml index 41ee372..c7e8793 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,6 @@ tskit = { version = "0.15.0-alpha.2", optional = true, features=["bindings"] } [dev-dependencies] rand = "0.9.0" noodles-core = "0.16.0" -triangle_matrix = "0.4.0" noodles = { version = "0.92.0", features = ["vcf", "csi", "tabix"] } rand_distr = "0.5.1" From 54f9f28306835fe73809b5ae3cdb7a66b61513e8 Mon Sep 17 00:00:00 2001 From: Kevin Thornton Date: Sun, 7 Dec 2025 08:55:29 -0800 Subject: [PATCH 06/16] move watterson theta tests --- src/testing/mod.rs | 1 + src/testing/test.rs | 101 -------------------------------------------- 2 files changed, 1 insertion(+), 101 deletions(-) diff --git a/src/testing/mod.rs b/src/testing/mod.rs index b196efd..d74eea6 100644 --- a/src/testing/mod.rs +++ b/src/testing/mod.rs @@ -3,3 +3,4 @@ mod test; mod testdata; mod pi; +mod wattersons_theta; diff --git a/src/testing/test.rs b/src/testing/test.rs index 72d9d65..6c3c5d4 100644 --- a/src/testing/test.rs +++ b/src/testing/test.rs @@ -468,37 +468,6 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# // ); // } - #[test] - fn watterson_theta() { - let sites = vec![ - vec![Some(0), Some(0), Some(1), Some(1), Some(1), Some(2)], - vec![Some(0), Some(1), Some(1), Some(1), Some(2), None], - ] - .into_iter() - .map(|site| { - site.into_iter() - .map(|sam| sam.map(AlleleID::from)) - .collect::>() - }) - .collect::>(); - - let allele_counts = MultiSiteCounts::from_tabular(sites); - let theta = WattersonTheta::from_iter_sites(allele_counts.iter()); - - let mut expected = 0f64; - for site in allele_counts.iter() { - let num_variants = site.counts.iter().filter(|c| **c > 0).count(); - let total_samples = site.counts.iter().filter(|c| **c > 0).sum::(); - - if num_variants > 1 { - let numerator = (num_variants - 1) as f64; - let denominator = (1..total_samples).map(|i| 1f64 / i as f64).sum::(); - expected += numerator / denominator; - } - } - - assert!((theta.as_raw() - expected).abs() < f64::EPSILON); - } #[test] fn tajima_d() { @@ -605,74 +574,4 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# ); } - #[test] - fn watterson_theta_from_random_data() { - use rand::prelude::*; - - let mut rng = StdRng::seed_from_u64(54321); - for ploidy in [1, 2, 3, 4] { - let freqs = [0.25, 0.5, 0.25]; // fixed allele freqs per site - let mut sites = vec![]; - // make 10 random sites. - // No missing data, etc.. - for _ in 0..10 { - let site = - crate::testing::testdata::random_site_rng(10, ploidy, &freqs, None, &mut rng); - sites.push(site); - } - // convert to our normal format - let counts = crate::testing::testdata::single_pop_counts(&mut sites.iter()); - let theta = WattersonTheta::from_iter_sites(counts.iter()); - let theta_naive = - crate::testing::naivecalculations::watterson_theta(&mut sites.iter_mut()); - if theta_naive.is_nan() { - assert!(theta.as_raw().is_nan()) - } else { - assert!((theta.as_raw() - theta_naive).abs() <= 1e-10) - } - } - } - - #[test] - fn watterson_theta_from_random_data_with_missing_data() { - use rand::prelude::*; - - let mut rng = StdRng::seed_from_u64(54321); - - for ploidy in [1, 2, 4] { - for rate in [0.01, 0.1, 0.5, 0.9] { - let freqs = [0.25, 0.5, 0.25]; // fixed allele freqs per site - let mut sites = vec![]; - // make 10 random sites. - // No missing data, etc.. - for _ in 0..10 { - let site = crate::testing::testdata::random_site_rng( - 10, - ploidy, - &freqs, - Some(crate::testing::testdata::RandomSiteOptions { - missing_data_rate: Some(rate), - }), - &mut rng, - ); - sites.push(site); - } - // convert to our normal format - let counts = crate::testing::testdata::single_pop_counts(&mut sites.iter()); - // get the calcs - let theta = WattersonTheta::from_iter_sites(counts.iter()); - let theta_naive = - crate::testing::naivecalculations::watterson_theta(&mut sites.iter_mut()); - // compare - if theta_naive.is_nan() { - assert!(theta.as_raw().is_nan()); - } else { - assert!( - (theta.as_raw() - theta_naive).abs() <= 1e-10, - "{theta:?} != {theta_naive}" - ); - } - } - } - } } From 9d0087d0041c77c0b29c07af4f871522d248348f Mon Sep 17 00:00:00 2001 From: Kevin Thornton Date: Sun, 7 Dec 2025 09:01:18 -0800 Subject: [PATCH 07/16] move watterson and noodles tests, add files this time... --- src/testing/mod.rs | 9 ++ src/testing/noodles_vcf.rs | 255 ++++++++++++++++++++++++++++++ src/testing/test.rs | 270 -------------------------------- src/testing/wattersons_theta.rs | 107 +++++++++++++ 4 files changed, 371 insertions(+), 270 deletions(-) create mode 100644 src/testing/noodles_vcf.rs create mode 100644 src/testing/wattersons_theta.rs diff --git a/src/testing/mod.rs b/src/testing/mod.rs index d74eea6..00755c6 100644 --- a/src/testing/mod.rs +++ b/src/testing/mod.rs @@ -2,5 +2,14 @@ mod naivecalculations; mod test; mod testdata; +// tests of CALCULATIONS go below + mod pi; mod wattersons_theta; + +// tests of OTHER THINGS, like +// input formats, follow + +#[cfg(feature = "noodles")] +mod noodles_vcf; + diff --git a/src/testing/noodles_vcf.rs b/src/testing/noodles_vcf.rs new file mode 100644 index 0000000..6adec5e --- /dev/null +++ b/src/testing/noodles_vcf.rs @@ -0,0 +1,255 @@ +use crate::adapter::vcf::{record_to_genotypes_adapter, VCFToPopulationsAdapter, WhichPopulation}; +use crate::counts::MultiSiteCounts; +use crate::{AlleleID, PopgenResult}; +use noodles::vcf::header::record::value::map::{Contig, Format}; +use noodles::vcf::header::record::value::Map; +use noodles::vcf::variant::io::Write; +use noodles::vcf::variant::record::samples::keys::key; +use noodles::vcf::variant::record::samples::series::value::genotype::Phasing::Unphased; +use noodles::vcf::variant::record_buf::samples::sample::value::genotype::Allele; +use noodles::vcf::variant::record_buf::samples::sample::value::Genotype; +use noodles::vcf::variant::record_buf::samples::sample::Value; +use noodles::vcf::variant::record_buf::samples::Keys; +use noodles::vcf::variant::record_buf::{AlternateBases, Samples}; +use noodles::vcf::variant::RecordBuf; +use rand::prelude::SliceRandom; +use rand::rng; +use std::borrow::Cow; +use std::collections::HashMap; + +// SiteVariant is to be a slice like ["A", "AG"] for a sample with these two genotypes +// the appropriate IDs, number of samples, etc. will be calculated +#[allow(dead_code)] +fn make_record( + seq_name: &str, + site_genotypes: SiteGenotypes, +) -> Option +where + SiteGenotypes: AsRef<[(SiteVariant, usize)]>, + SiteVariant: AsRef<[Option]>, + InnerAllele: AsRef, +{ + let site_genotypes = site_genotypes.as_ref(); + if site_genotypes.len() < 2 { + return None; + } + + let mut alleles_seen = Vec::new(); + for (genotype, _) in site_genotypes { + for gt_str in genotype + .as_ref() + .iter() + .filter_map(|gt_str| gt_str.as_ref()) + .map(|gt_str| gt_str.as_ref()) + { + if !alleles_seen.iter().any(|allele| gt_str.eq(*allele)) { + alleles_seen.push(gt_str); + } + } + } + + Some( + RecordBuf::builder() + .set_reference_sequence_name(seq_name) + .set_variant_start(noodles_core::Position::MIN) + .set_reference_bases(alleles_seen[0]) + .set_alternate_bases(AlternateBases::from( + alleles_seen[1..] + .iter() + .map(|s| String::from(*s)) + .collect::>(), + )) + .set_samples(Samples::new( + Keys::from_iter(vec![String::from(key::GENOTYPE)]), + // for each genotype and its count, create that many samples accordingly + { + let mut all_samples = site_genotypes + .iter() + .flat_map(|(genotype, count)| { + vec![ + // for each sample at this site, there will only be the GT field and no other hence another layer of Vec here + vec![Some(Value::from(Genotype::from_iter( + genotype.as_ref().iter() + .map(|sample_variant| Allele::new( + sample_variant.as_ref().map(|some| alleles_seen.iter() + .enumerate() + .find(|(_, variant)| **variant == some.as_ref()) + .unwrap().0 + ), + Unphased // TODO: make this configurable? + )), + )))]; + *count + ] + }) + .collect::>(); + all_samples.shuffle(&mut rng()); + all_samples + }, + )) + .build(), + ) +} + +#[allow(dead_code)] +fn make_mock_vcf(sites: Sites) -> Option +where + Sites: AsRef<[SiteGenotypes]>, + SiteGenotypes: AsRef<[(SiteVariant, usize)]>, + SiteVariant: AsRef<[Option]>, + Allele: AsRef, +{ + let mut buf = Vec::new(); + let mut writer = noodles::vcf::io::writer::Builder::default().build_from_writer(&mut buf); + + let seq_name = "chr0"; + // how many samples are needed to contain the genotypes of the sample with the most genotypes? + let num_samples = sites + .as_ref() + .iter() + .map(|site_alleles| site_alleles.as_ref().iter().map(|(_, count)| *count).sum()) + .max()?; + + let header = { + let id = key::GENOTYPE; + let format = Map::::from(id); + + let mut header = noodles::vcf::Header::builder() + .add_contig(seq_name, Map::::new()) + .add_format(id, format.clone()); + + for num in 0..num_samples { + header = header.add_sample_name(format!("s{num}")); + } + + header.build() + }; + + writer.write_header(&header).unwrap(); + + for site in sites.as_ref() { + let record = make_record(seq_name, site).unwrap(); + writer.write_variant_record(&header, &record).unwrap(); + } + + // kick the writer out of the buffer so we can move it + drop(writer); + + Some(String::from_utf8(buf).unwrap()) +} + +fn counts_from_vcf(vcf_buf: &str, ploidy: usize) -> (Vec>>, MultiSiteCounts) { + let mut reader = noodles::vcf::io::reader::Builder::default() + .build_from_reader(vcf_buf.as_bytes()) + .unwrap(); + + let header = reader.read_header().unwrap(); + + let all_alleles = reader + .records() + .map(Result::unwrap) + .map(|rec| record_to_genotypes_adapter(&header, rec, ploidy)) + .collect::>>() + .unwrap(); + let counts = MultiSiteCounts::from_tabular(all_alleles.iter().cloned()); + (all_alleles, counts) +} + +fn make_vcf() -> &'static str { + // let vcf_buf = make_mock_vcf(vec![ + // vec![ + // (vec![Some("A")], 11), + // (vec![Some("C")], 7), + // ], + // vec![ + // (vec![Some("G")], 7), + // (vec![None], 3), + // (vec![Some("A")], 8), + // ] + // ]).unwrap(); + // + // println!("{}", &vcf_buf); + + r#"##fileformat=VCFv4.5 +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 s16 s17 +chr0 1 . A C . . . GT /0 /1 /1 /0 /1 /0 /1 /0 /0 /0 /0 /0 /0 /1 /0 /1 /1 /0 +chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# +} + +#[test] +fn load_vcf() { + let (_all_alleles, allele_counts) = counts_from_vcf(make_vcf(), 1); + let mut iter = allele_counts.iter(); + let counts_0 = iter.next().unwrap(); + let counts_1 = iter.next().unwrap(); + assert!(iter.next().is_none()); + + assert_eq!(counts_0.counts(), &[11, 7]); + assert_eq!(counts_0.total_alleles(), 18); + assert_eq!(counts_1.counts(), &[7, 8]); + assert_eq!(counts_1.total_alleles(), 18); +} + +#[test] +fn load_vcf_multi_population() { + // use this overly verbose and inefficient map to verify lifetime correctness + let map = (0..18) + .map(|n| (format!("s{}", n), ["A", "B"][n % 2].to_owned())) + .collect::>(); + + struct MapBasedMapper(HashMap); + + impl WhichPopulation<()> for MapBasedMapper { + fn which_population<'s>(&'s mut self, sample_name: &'_ str) -> Result, ()> { + self.0 + .get(sample_name) + .map(String::as_str) + .map(Cow::Borrowed) + .ok_or(()) + } + } + + let mut vcf_reader = noodles::vcf::io::reader::Builder::default() + .build_from_reader(make_vcf().as_bytes()) + .unwrap(); + + let header = vcf_reader.read_header().unwrap(); + + let mut mapper = MapBasedMapper(map); + let mut adapter = VCFToPopulationsAdapter::new(&header, None, &mut mapper).unwrap(); + + for record in vcf_reader.records() { + let record = record.unwrap(); + adapter.add_record(&record).unwrap(); + } + + let (population_name_to_idx, counts) = adapter.build(); + assert_eq!(population_name_to_idx.get("A").unwrap(), &0); + assert_eq!(population_name_to_idx.get("B").unwrap(), &1); + + { + let first_pop = &counts[0]; + let mut pop_iter = first_pop.iter(); + let first_site = pop_iter.next().unwrap(); + assert_eq!(first_site.counts(), &[5, 4]); + assert_eq!(first_site.total_alleles(), 9); + + let second_site = pop_iter.next().unwrap(); + assert_eq!(second_site.counts(), &[4, 4]); + assert_eq!(second_site.total_alleles(), 9); + } + + { + let second_pop = &counts[1]; + let mut pop_iter = second_pop.iter(); + let first_site = pop_iter.next().unwrap(); + assert_eq!(first_site.counts(), &[6, 3]); + assert_eq!(first_site.total_alleles(), 9); + + let second_site = pop_iter.next().unwrap(); + assert_eq!(second_site.counts(), &[3, 4]); + assert_eq!(second_site.total_alleles(), 9); + } +} diff --git a/src/testing/test.rs b/src/testing/test.rs index 6c3c5d4..77a75bc 100644 --- a/src/testing/test.rs +++ b/src/testing/test.rs @@ -13,274 +13,6 @@ mod tests { // SimpleLowerTri, SymmetricUpperTri, SymmetricUpperTriMut, Triangle, TriangleMut, // }; - #[cfg(feature = "noodles")] - mod vcf { - use crate::adapter::vcf::{ - record_to_genotypes_adapter, VCFToPopulationsAdapter, WhichPopulation, - }; - use crate::counts::MultiSiteCounts; - use crate::{AlleleID, PopgenResult}; - use noodles::vcf::header::record::value::map::{Contig, Format}; - use noodles::vcf::header::record::value::Map; - use noodles::vcf::variant::io::Write; - use noodles::vcf::variant::record::samples::keys::key; - use noodles::vcf::variant::record::samples::series::value::genotype::Phasing::Unphased; - use noodles::vcf::variant::record_buf::samples::sample::value::genotype::Allele; - use noodles::vcf::variant::record_buf::samples::sample::value::Genotype; - use noodles::vcf::variant::record_buf::samples::sample::Value; - use noodles::vcf::variant::record_buf::samples::Keys; - use noodles::vcf::variant::record_buf::{AlternateBases, Samples}; - use noodles::vcf::variant::RecordBuf; - use rand::prelude::SliceRandom; - use rand::rng; - use std::borrow::Cow; - use std::collections::HashMap; - - // SiteVariant is to be a slice like ["A", "AG"] for a sample with these two genotypes - // the appropriate IDs, number of samples, etc. will be calculated - #[allow(dead_code)] - fn make_record( - seq_name: &str, - site_genotypes: SiteGenotypes, - ) -> Option - where - SiteGenotypes: AsRef<[(SiteVariant, usize)]>, - SiteVariant: AsRef<[Option]>, - InnerAllele: AsRef, - { - let site_genotypes = site_genotypes.as_ref(); - if site_genotypes.len() < 2 { - return None; - } - - let mut alleles_seen = Vec::new(); - for (genotype, _) in site_genotypes { - for gt_str in genotype - .as_ref() - .iter() - .filter_map(|gt_str| gt_str.as_ref()) - .map(|gt_str| gt_str.as_ref()) - { - if !alleles_seen.iter().any(|allele| gt_str.eq(*allele)) { - alleles_seen.push(gt_str); - } - } - } - - Some( - RecordBuf::builder() - .set_reference_sequence_name(seq_name) - .set_variant_start(noodles_core::Position::MIN) - .set_reference_bases(alleles_seen[0]) - .set_alternate_bases(AlternateBases::from( - alleles_seen[1..] - .iter() - .map(|s| String::from(*s)) - .collect::>(), - )) - .set_samples(Samples::new( - Keys::from_iter(vec![String::from(key::GENOTYPE)]), - // for each genotype and its count, create that many samples accordingly - { - let mut all_samples = site_genotypes - .iter() - .flat_map(|(genotype, count)| { - vec![ - // for each sample at this site, there will only be the GT field and no other hence another layer of Vec here - vec![Some(Value::from(Genotype::from_iter( - genotype.as_ref().iter() - .map(|sample_variant| Allele::new( - sample_variant.as_ref().map(|some| alleles_seen.iter() - .enumerate() - .find(|(_, variant)| **variant == some.as_ref()) - .unwrap().0 - ), - Unphased // TODO: make this configurable? - )), - )))]; - *count - ] - }) - .collect::>(); - all_samples.shuffle(&mut rng()); - all_samples - }, - )) - .build(), - ) - } - - #[allow(dead_code)] - fn make_mock_vcf(sites: Sites) -> Option - where - Sites: AsRef<[SiteGenotypes]>, - SiteGenotypes: AsRef<[(SiteVariant, usize)]>, - SiteVariant: AsRef<[Option]>, - Allele: AsRef, - { - let mut buf = Vec::new(); - let mut writer = - noodles::vcf::io::writer::Builder::default().build_from_writer(&mut buf); - - let seq_name = "chr0"; - // how many samples are needed to contain the genotypes of the sample with the most genotypes? - let num_samples = sites - .as_ref() - .iter() - .map(|site_alleles| site_alleles.as_ref().iter().map(|(_, count)| *count).sum()) - .max()?; - - let header = { - let id = key::GENOTYPE; - let format = Map::::from(id); - - let mut header = noodles::vcf::Header::builder() - .add_contig(seq_name, Map::::new()) - .add_format(id, format.clone()); - - for num in 0..num_samples { - header = header.add_sample_name(format!("s{num}")); - } - - header.build() - }; - - writer.write_header(&header).unwrap(); - - for site in sites.as_ref() { - let record = make_record(seq_name, site).unwrap(); - writer.write_variant_record(&header, &record).unwrap(); - } - - // kick the writer out of the buffer so we can move it - drop(writer); - - Some(String::from_utf8(buf).unwrap()) - } - - fn counts_from_vcf( - vcf_buf: &str, - ploidy: usize, - ) -> (Vec>>, MultiSiteCounts) { - let mut reader = noodles::vcf::io::reader::Builder::default() - .build_from_reader(vcf_buf.as_bytes()) - .unwrap(); - - let header = reader.read_header().unwrap(); - - let all_alleles = reader - .records() - .map(Result::unwrap) - .map(|rec| record_to_genotypes_adapter(&header, rec, ploidy)) - .collect::>>() - .unwrap(); - let counts = MultiSiteCounts::from_tabular(all_alleles.iter().cloned()); - (all_alleles, counts) - } - - fn make_vcf() -> &'static str { - // let vcf_buf = make_mock_vcf(vec![ - // vec![ - // (vec![Some("A")], 11), - // (vec![Some("C")], 7), - // ], - // vec![ - // (vec![Some("G")], 7), - // (vec![None], 3), - // (vec![Some("A")], 8), - // ] - // ]).unwrap(); - // - // println!("{}", &vcf_buf); - - r#"##fileformat=VCFv4.5 -##FORMAT= -##contig= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 s16 s17 -chr0 1 . A C . . . GT /0 /1 /1 /0 /1 /0 /1 /0 /0 /0 /0 /0 /0 /1 /0 /1 /1 /0 -chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# - } - - #[test] - fn load_vcf() { - let (_all_alleles, allele_counts) = counts_from_vcf(make_vcf(), 1); - let mut iter = allele_counts.iter(); - let counts_0 = iter.next().unwrap(); - let counts_1 = iter.next().unwrap(); - assert!(iter.next().is_none()); - - assert_eq!(counts_0.counts(), &[11, 7]); - assert_eq!(counts_0.total_alleles(), 18); - assert_eq!(counts_1.counts(), &[7, 8]); - assert_eq!(counts_1.total_alleles(), 18); - } - - #[test] - fn load_vcf_multi_population() { - // use this overly verbose and inefficient map to verify lifetime correctness - let map = (0..18) - .map(|n| (format!("s{}", n), ["A", "B"][n % 2].to_owned())) - .collect::>(); - - struct MapBasedMapper(HashMap); - - impl WhichPopulation<()> for MapBasedMapper { - fn which_population<'s>( - &'s mut self, - sample_name: &'_ str, - ) -> Result, ()> { - self.0 - .get(sample_name) - .map(String::as_str) - .map(Cow::Borrowed) - .ok_or(()) - } - } - - let mut vcf_reader = noodles::vcf::io::reader::Builder::default() - .build_from_reader(make_vcf().as_bytes()) - .unwrap(); - - let header = vcf_reader.read_header().unwrap(); - - let mut mapper = MapBasedMapper(map); - let mut adapter = VCFToPopulationsAdapter::new(&header, None, &mut mapper).unwrap(); - - for record in vcf_reader.records() { - let record = record.unwrap(); - adapter.add_record(&record).unwrap(); - } - - let (population_name_to_idx, counts) = adapter.build(); - assert_eq!(population_name_to_idx.get("A").unwrap(), &0); - assert_eq!(population_name_to_idx.get("B").unwrap(), &1); - - { - let first_pop = &counts[0]; - let mut pop_iter = first_pop.iter(); - let first_site = pop_iter.next().unwrap(); - assert_eq!(first_site.counts(), &[5, 4]); - assert_eq!(first_site.total_alleles(), 9); - - let second_site = pop_iter.next().unwrap(); - assert_eq!(second_site.counts(), &[4, 4]); - assert_eq!(second_site.total_alleles(), 9); - } - - { - let second_pop = &counts[1]; - let mut pop_iter = second_pop.iter(); - let first_site = pop_iter.next().unwrap(); - assert_eq!(first_site.counts(), &[6, 3]); - assert_eq!(first_site.total_alleles(), 9); - - let second_site = pop_iter.next().unwrap(); - assert_eq!(second_site.counts(), &[3, 4]); - assert_eq!(second_site.total_alleles(), 9); - } - } - } - // struct TriVec(usize, Vec); // impl Triangle for TriVec { @@ -468,7 +200,6 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# // ); // } - #[test] fn tajima_d() { let mut rng = rng(); @@ -573,5 +304,4 @@ chr0 1 . G A . . . GT /0 /1 /1 /0 /1 /1 /0 /0 /. /. /0 /0 /1 /1 /1 /1 /0 /."# < f64::EPSILON ); } - } diff --git a/src/testing/wattersons_theta.rs b/src/testing/wattersons_theta.rs new file mode 100644 index 0000000..2828c54 --- /dev/null +++ b/src/testing/wattersons_theta.rs @@ -0,0 +1,107 @@ +use crate::stats::GlobalStatistic; +use crate::stats::WattersonTheta; +use crate::AlleleID; +use crate::Count; +use crate::MultiSiteCounts; + +#[test] +fn watterson_theta() { + let sites = vec![ + vec![Some(0), Some(0), Some(1), Some(1), Some(1), Some(2)], + vec![Some(0), Some(1), Some(1), Some(1), Some(2), None], + ] + .into_iter() + .map(|site| { + site.into_iter() + .map(|sam| sam.map(AlleleID::from)) + .collect::>() + }) + .collect::>(); + + let allele_counts = MultiSiteCounts::from_tabular(sites); + let theta = WattersonTheta::from_iter_sites(allele_counts.iter()); + + let mut expected = 0f64; + for site in allele_counts.iter() { + let num_variants = site.counts.iter().filter(|c| **c > 0).count(); + let total_samples = site.counts.iter().filter(|c| **c > 0).sum::(); + + if num_variants > 1 { + let numerator = (num_variants - 1) as f64; + let denominator = (1..total_samples).map(|i| 1f64 / i as f64).sum::(); + expected += numerator / denominator; + } + } + + assert!((theta.as_raw() - expected).abs() < f64::EPSILON); +} + +#[test] +fn watterson_theta_from_random_data() { + use rand::prelude::*; + + let mut rng = StdRng::seed_from_u64(54321); + for ploidy in [1, 2, 3, 4] { + let freqs = [0.25, 0.5, 0.25]; // fixed allele freqs per site + let mut sites = vec![]; + // make 10 random sites. + // No missing data, etc.. + for _ in 0..10 { + let site = + crate::testing::testdata::random_site_rng(10, ploidy, &freqs, None, &mut rng); + sites.push(site); + } + // convert to our normal format + let counts = crate::testing::testdata::single_pop_counts(&mut sites.iter()); + let theta = WattersonTheta::from_iter_sites(counts.iter()); + let theta_naive = crate::testing::naivecalculations::watterson_theta(&mut sites.iter_mut()); + if theta_naive.is_nan() { + assert!(theta.as_raw().is_nan()) + } else { + assert!((theta.as_raw() - theta_naive).abs() <= 1e-10) + } + } +} + +#[test] +fn watterson_theta_from_random_data_with_missing_data() { + use rand::prelude::*; + + let mut rng = StdRng::seed_from_u64(54321); + + for ploidy in [1, 2, 4] { + for rate in [0.01, 0.1, 0.5, 0.9] { + let freqs = [0.25, 0.5, 0.25]; // fixed allele freqs per site + let mut sites = vec![]; + // make 10 random sites. + // No missing data, etc.. + for _ in 0..10 { + let site = crate::testing::testdata::random_site_rng( + 10, + ploidy, + &freqs, + Some(crate::testing::testdata::RandomSiteOptions { + missing_data_rate: Some(rate), + }), + &mut rng, + ); + sites.push(site); + } + // convert to our normal format + let counts = crate::testing::testdata::single_pop_counts(&mut sites.iter()); + // get the calcs + let theta = WattersonTheta::from_iter_sites(counts.iter()); + let theta_naive = + crate::testing::naivecalculations::watterson_theta(&mut sites.iter_mut()); + // compare + if theta_naive.is_nan() { + assert!(theta.as_raw().is_nan()); + } else { + assert!( + (theta.as_raw() - theta_naive).abs() <= 1e-10, + "{theta:?} != {theta_naive}" + ); + } + } + } +} From 2702902e2404eb5de179a195452b0787be3cef0b Mon Sep 17 00:00:00 2001 From: Kevin Thornton Date: Sun, 7 Dec 2025 09:05:52 -0800 Subject: [PATCH 08/16] fmt --- src/testing/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/testing/mod.rs b/src/testing/mod.rs index 00755c6..966fc03 100644 --- a/src/testing/mod.rs +++ b/src/testing/mod.rs @@ -12,4 +12,3 @@ mod wattersons_theta; #[cfg(feature = "noodles")] mod noodles_vcf; - From 87210d0dd9942fa4e990edccb754797867241329 Mon Sep 17 00:00:00 2001 From: Kevin Thornton Date: Sun, 7 Dec 2025 09:10:59 -0800 Subject: [PATCH 09/16] fst is next --- src/testing/fst.rs | 4 ++++ src/testing/mod.rs | 1 + 2 files changed, 5 insertions(+) create mode 100644 src/testing/fst.rs diff --git a/src/testing/fst.rs b/src/testing/fst.rs new file mode 100644 index 0000000..a8d5c74 --- /dev/null +++ b/src/testing/fst.rs @@ -0,0 +1,4 @@ +#[test] +fn fail() { + todo!() +} diff --git a/src/testing/mod.rs b/src/testing/mod.rs index 966fc03..f19a225 100644 --- a/src/testing/mod.rs +++ b/src/testing/mod.rs @@ -4,6 +4,7 @@ mod testdata; // tests of CALCULATIONS go below +mod fst; mod pi; mod wattersons_theta; From 7e797110e4070e4370c05584135378b4518e7830 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Sun, 7 Dec 2025 09:58:36 -0800 Subject: [PATCH 10/16] fst --- src/testing/fst.rs | 87 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 2 deletions(-) diff --git a/src/testing/fst.rs b/src/testing/fst.rs index a8d5c74..dc1d19b 100644 --- a/src/testing/fst.rs +++ b/src/testing/fst.rs @@ -1,4 +1,87 @@ +use crate::stats::FStatisticParts; +use crate::stats::GlobalPi; +use crate::stats::GlobalStatistic; +use crate::MultiPopulationCounts; + #[test] -fn fail() { - todo!() +fn f_st_empty() { + fn ok(populations: &MultiPopulationCounts) { + // this is the one case where these fail; let's make sure that is the case + let f_st = populations.f_st_if(|_| None); + assert_eq!(f_st.pi_S(), None); + assert_eq!(f_st.pi_B(), None); + assert_eq!(f_st.pi_D(), None); + } + + ok(&MultiPopulationCounts::of_empty_populations(0)); + ok(&MultiPopulationCounts::of_empty_populations(1)); + ok(&MultiPopulationCounts::of_empty_populations(5)); +} + +#[test] +fn f_st() { + let mut populations = MultiPopulationCounts::of_empty_populations(3); + + let data = [([1, 2, 0], 3), ([3, 0, 0], 3), ([0, 1, 2], 3)]; + let weights = [1.0, 2.0, 3.0]; + + populations + .extend_populations_from_site(|i| (&data[i].0, data[i].1)) + .unwrap(); + + let f_st = populations.f_st_if(|i| Some(weights[i])); + + #[allow(non_snake_case)] + let (pi_B_top, pi_B_bottom) = f_st.pi_B_parts(); + + assert!( + (pi_B_top + - ( + // differences between (0, 1) + 6.0 * weights[0] * weights[1] + // (0, 2) + + 7.0 * weights[0] * weights[2] + // (1, 2) + + 9.0 * weights[1] * weights[2] + ) + // number of comparisons between any two populations + / 9.) + .abs() + // this comparison seems a little finicky + < 10.0_f64.powi(-10) + ); + + assert_eq!( + pi_B_bottom, + weights + .iter() + .enumerate() + .flat_map(|(i, w1)| weights.iter().skip(i + 1).map(move |w2| w1 * w2)) + .sum::() + ); + + #[allow(non_snake_case)] + let (pi_S_top, pi_S_bottom) = f_st.pi_S_parts(); + + assert!( + (pi_S_top + - populations + .iter() + .enumerate() + .map(|(i, pop)| { + // sum of weight * weight * pi within this population + GlobalPi::from_iter_sites(pop.iter()).as_raw() * (weights[i]).powi(2) + }) + .sum::()) + .abs() + < f64::EPSILON + ); + + assert!( + (pi_S_bottom + // denominator should be sum of squares of weights + - weights.iter().map(|w| w.powi(2)).sum::()) + .abs() + < f64::EPSILON + ); } From 7bef773a56d0f4fc533a017e277b87dc31cb1a27 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Sun, 7 Dec 2025 10:05:56 -0800 Subject: [PATCH 11/16] tajimas d. drop itertools. some bad copy/paste happening --- src/testing/mod.rs | 1 + src/testing/tajimasd.rs | 43 +++++++++++++ src/testing/test.rs | 132 ++++------------------------------------ 3 files changed, 57 insertions(+), 119 deletions(-) create mode 100644 src/testing/tajimasd.rs diff --git a/src/testing/mod.rs b/src/testing/mod.rs index f19a225..70b3395 100644 --- a/src/testing/mod.rs +++ b/src/testing/mod.rs @@ -6,6 +6,7 @@ mod testdata; mod fst; mod pi; +mod tajimasd; mod wattersons_theta; // tests of OTHER THINGS, like diff --git a/src/testing/tajimasd.rs b/src/testing/tajimasd.rs new file mode 100644 index 0000000..ad4e026 --- /dev/null +++ b/src/testing/tajimasd.rs @@ -0,0 +1,43 @@ +use crate::stats::GlobalStatistic; +use crate::stats::TajimaD; +use crate::AlleleID; +use crate::MultiSiteCounts; +use rand::rng; +use rand::rngs::ThreadRng; +use rand::seq::SliceRandom; +use std::iter::repeat_n; + +fn shuffled_site( + ids: impl Iterator, usize)>, + rng: &mut ThreadRng, +) -> Vec> { + let mut site = vec![]; + ids.for_each(|(id, count)| { + site.extend(repeat_n(id, count)); + }); + + site.shuffle(rng); + site +} + +#[test] +fn tajima_d() { + let mut rng = rng(); + + let sites = vec![ + // common mutation + vec![(Some(AlleleID::from(0)), 11), (Some(AlleleID::from(1)), 7)], + // rare mutation + vec![(Some(AlleleID::from(0)), 16), (Some(AlleleID::from(1)), 2)], + // rare mutation + vec![(Some(AlleleID::from(0)), 1), (Some(AlleleID::from(1)), 17)], + ] + .into_iter() + .map(|site| shuffled_site(site.into_iter(), &mut rng)) + .collect::>(); + + let allele_counts = MultiSiteCounts::from_tabular(sites); + + let tajima = TajimaD::from_iter_sites(allele_counts.iter()); + assert!((tajima.as_raw() - -0.15474069911037955).abs() < f64::EPSILON); +} diff --git a/src/testing/test.rs b/src/testing/test.rs index 77a75bc..13f37bd 100644 --- a/src/testing/test.rs +++ b/src/testing/test.rs @@ -4,7 +4,6 @@ mod tests { use crate::counts::{MultiPopulationCounts, MultiSiteCounts}; use crate::iter::SiteCounts; use crate::stats::{FStatisticParts, GlobalPi, GlobalStatistic, TajimaD, WattersonTheta}; - use itertools::Itertools; use rand::rng; use rand::rngs::ThreadRng; use rand::seq::SliceRandom; @@ -33,19 +32,6 @@ mod tests { // } // } - fn shuffled_site( - ids: impl Iterator, usize)>, - rng: &mut ThreadRng, - ) -> Vec> { - let mut site = vec![]; - ids.for_each(|(id, count)| { - site.extend(repeat_n(id, count)); - }); - - site.shuffle(rng); - site - } - /// inefficient O(n^2) computation of pairwise diversity at a site /// /// hidden in test module because nobody should use this; just want to verify without magic numbers that calculations are correct @@ -86,6 +72,19 @@ mod tests { #[test] fn load_raw() { + // NOTE: copy-paste of a fn present elsewhere + fn shuffled_site( + ids: impl Iterator, usize)>, + rng: &mut ThreadRng, + ) -> Vec> { + let mut site = vec![]; + ids.for_each(|(id, count)| { + site.extend(repeat_n(id, count)); + }); + + site.shuffle(rng); + site + } let mut rng = rng(); let sites = vec![ @@ -199,109 +198,4 @@ mod tests { // < f64::EPSILON // ); // } - - #[test] - fn tajima_d() { - let mut rng = rng(); - - let sites = vec![ - // common mutation - vec![(Some(AlleleID::from(0)), 11), (Some(AlleleID::from(1)), 7)], - // rare mutation - vec![(Some(AlleleID::from(0)), 16), (Some(AlleleID::from(1)), 2)], - // rare mutation - vec![(Some(AlleleID::from(0)), 1), (Some(AlleleID::from(1)), 17)], - ] - .into_iter() - .map(|site| shuffled_site(site.into_iter(), &mut rng)) - .collect_vec(); - - let allele_counts = MultiSiteCounts::from_tabular(sites); - - let tajima = TajimaD::from_iter_sites(allele_counts.iter()); - assert!((tajima.as_raw() - -0.15474069911037955).abs() < f64::EPSILON); - } - - #[test] - fn f_st_empty() { - fn ok(populations: &MultiPopulationCounts) { - // this is the one case where these fail; let's make sure that is the case - let f_st = populations.f_st_if(|_| None); - assert_eq!(f_st.pi_S(), None); - assert_eq!(f_st.pi_B(), None); - assert_eq!(f_st.pi_D(), None); - } - - ok(&MultiPopulationCounts::of_empty_populations(0)); - ok(&MultiPopulationCounts::of_empty_populations(1)); - ok(&MultiPopulationCounts::of_empty_populations(5)); - } - - #[test] - fn f_st() { - let mut populations = MultiPopulationCounts::of_empty_populations(3); - - let data = [([1, 2, 0], 3), ([3, 0, 0], 3), ([0, 1, 2], 3)]; - let weights = [1.0, 2.0, 3.0]; - - populations - .extend_populations_from_site(|i| (&data[i].0, data[i].1)) - .unwrap(); - - let f_st = populations.f_st_if(|i| Some(weights[i])); - - #[allow(non_snake_case)] - let (pi_B_top, pi_B_bottom) = f_st.pi_B_parts(); - - assert!( - (pi_B_top - - ( - // differences between (0, 1) - 6.0 * weights[0] * weights[1] - // (0, 2) - + 7.0 * weights[0] * weights[2] - // (1, 2) - + 9.0 * weights[1] * weights[2] - ) - // number of comparisons between any two populations - / 9.) - .abs() - // this comparison seems a little finicky - < 10.0_f64.powi(-10) - ); - - assert_eq!( - pi_B_bottom, - weights - .iter() - .enumerate() - .flat_map(|(i, w1)| weights.iter().skip(i + 1).map(move |w2| w1 * w2)) - .sum::() - ); - - #[allow(non_snake_case)] - let (pi_S_top, pi_S_bottom) = f_st.pi_S_parts(); - - assert!( - (pi_S_top - - populations - .iter() - .enumerate() - .map(|(i, pop)| { - // sum of weight * weight * pi within this population - GlobalPi::from_iter_sites(pop.iter()).as_raw() * (weights[i]).powi(2) - }) - .sum::()) - .abs() - < f64::EPSILON - ); - - assert!( - (pi_S_bottom - // denominator should be sum of squares of weights - - weights.iter().map(|w| w.powi(2)).sum::()) - .abs() - < f64::EPSILON - ); - } } From 0741224a5b85dd6ca797e615fced4e195bdb2461 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Sun, 7 Dec 2025 10:25:51 -0800 Subject: [PATCH 12/16] move copy/pasted fn --- src/testing/tajimasd.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/testing/tajimasd.rs b/src/testing/tajimasd.rs index ad4e026..bb09758 100644 --- a/src/testing/tajimasd.rs +++ b/src/testing/tajimasd.rs @@ -7,21 +7,21 @@ use rand::rngs::ThreadRng; use rand::seq::SliceRandom; use std::iter::repeat_n; -fn shuffled_site( - ids: impl Iterator, usize)>, - rng: &mut ThreadRng, -) -> Vec> { - let mut site = vec![]; - ids.for_each(|(id, count)| { - site.extend(repeat_n(id, count)); - }); - - site.shuffle(rng); - site -} - #[test] fn tajima_d() { + fn shuffled_site( + ids: impl Iterator, usize)>, + rng: &mut ThreadRng, + ) -> Vec> { + let mut site = vec![]; + ids.for_each(|(id, count)| { + site.extend(repeat_n(id, count)); + }); + + site.shuffle(rng); + site + } + let mut rng = rng(); let sites = vec![ From ade832e2c896fd79d8d0e4229ab73695b53af16f Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Sun, 7 Dec 2025 12:56:18 -0800 Subject: [PATCH 13/16] note --- src/testing/tajimasd.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/testing/tajimasd.rs b/src/testing/tajimasd.rs index bb09758..302404b 100644 --- a/src/testing/tajimasd.rs +++ b/src/testing/tajimasd.rs @@ -7,6 +7,10 @@ use rand::rngs::ThreadRng; use rand::seq::SliceRandom; use std::iter::repeat_n; +// NOTE: the copy/pase of shuffled_site can be +// considered okay b/c this test will go away +// once we have a naive implementation in place +// and can test using our random data API. #[test] fn tajima_d() { fn shuffled_site( From 5c4adae1db0d097aeb4ecadde51f8a5d141d1925 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Sun, 7 Dec 2025 12:59:27 -0800 Subject: [PATCH 14/16] move type tests to new .rs file --- src/testing/mod.rs | 3 ++ src/testing/multisitecounts.rs | 89 ++++++++++++++++++++++++++++++++ src/testing/test.rs | 92 ++-------------------------------- 3 files changed, 97 insertions(+), 87 deletions(-) create mode 100644 src/testing/multisitecounts.rs diff --git a/src/testing/mod.rs b/src/testing/mod.rs index 70b3395..b90fb02 100644 --- a/src/testing/mod.rs +++ b/src/testing/mod.rs @@ -2,6 +2,9 @@ mod naivecalculations; mod test; mod testdata; +// tests of TYPES go below +mod multisitecounts; + // tests of CALCULATIONS go below mod fst; diff --git a/src/testing/multisitecounts.rs b/src/testing/multisitecounts.rs new file mode 100644 index 0000000..76e9dac --- /dev/null +++ b/src/testing/multisitecounts.rs @@ -0,0 +1,89 @@ +use crate::AlleleID; + +use crate::counts::MultiSiteCounts; +use crate::iter::SiteCounts; +use rand::rng; +use rand::rngs::ThreadRng; +use rand::seq::SliceRandom; +use std::iter::repeat_n; + +#[test] +fn load_raw() { + // NOTE: copy-paste of a fn present elsewhere + fn shuffled_site( + ids: impl Iterator, usize)>, + rng: &mut ThreadRng, + ) -> Vec> { + let mut site = vec![]; + ids.for_each(|(id, count)| { + site.extend(repeat_n(id, count)); + }); + + site.shuffle(rng); + site + } + let mut rng = rng(); + + let sites = vec![ + shuffled_site( + vec![(Some(AlleleID(0)), 8), (Some(AlleleID(1)), 7), (None, 4)].into_iter(), + &mut rng, + ), + shuffled_site( + vec![ + (Some(AlleleID(0)), 341), + (Some(AlleleID::from(1)), 69), + (Some(AlleleID::from(2)), 926), + (None, 300), + ] + .into_iter(), + &mut rng, + ), + ]; + + let counts = MultiSiteCounts::from_tabular(sites); + + assert_eq!(counts.len(), 2); + assert!(!counts.is_empty()); + + let mut iter = counts.iter(); + assert_eq!( + iter.next().unwrap(), + SiteCounts { + counts: &[8, 7], + total_alleles: 8 + 7 + 4, + } + ); + assert_eq!( + iter.next().unwrap(), + SiteCounts { + counts: &[341, 69, 926], + total_alleles: 341 + 69 + 926 + 300, + } + ); + assert!(iter.next().is_none()); +} + +#[test] +fn empty_counts() { + let counts = MultiSiteCounts::default(); + + assert!(counts.is_empty()); + assert_eq!(counts.len(), 0); +} + +#[test] +#[should_panic] +fn bad_site_negative_count() { + let mut counts = MultiSiteCounts::default(); + + counts.add_site_from_counts([-1, -2, -3], 100).unwrap(); +} + +#[test] +#[should_panic] +fn bad_site_deficient_total() { + let mut counts = MultiSiteCounts::default(); + + counts.add_site_from_counts([1, 2, 3], 1).unwrap(); +} diff --git a/src/testing/test.rs b/src/testing/test.rs index 13f37bd..005a1cb 100644 --- a/src/testing/test.rs +++ b/src/testing/test.rs @@ -1,9 +1,8 @@ mod tests { - use crate::{AlleleID, Count}; + use crate::AlleleID; - use crate::counts::{MultiPopulationCounts, MultiSiteCounts}; + use crate::counts::MultiSiteCounts; use crate::iter::SiteCounts; - use crate::stats::{FStatisticParts, GlobalPi, GlobalStatistic, TajimaD, WattersonTheta}; use rand::rng; use rand::rngs::ThreadRng; use rand::seq::SliceRandom; @@ -32,9 +31,9 @@ mod tests { // } // } - /// inefficient O(n^2) computation of pairwise diversity at a site - /// - /// hidden in test module because nobody should use this; just want to verify without magic numbers that calculations are correct + // inefficient O(n^2) computation of pairwise diversity at a site + // + // hidden in test module because nobody should use this; just want to verify without magic numbers that calculations are correct // fn pi_from_matrix(alleles: &[Option]) -> f64 { // use SymmetricUpperTri; // use SymmetricUpperTriMut; @@ -70,87 +69,6 @@ mod tests { // make_iter().sum::() as f64 / make_iter().count() as f64 // } - #[test] - fn load_raw() { - // NOTE: copy-paste of a fn present elsewhere - fn shuffled_site( - ids: impl Iterator, usize)>, - rng: &mut ThreadRng, - ) -> Vec> { - let mut site = vec![]; - ids.for_each(|(id, count)| { - site.extend(repeat_n(id, count)); - }); - - site.shuffle(rng); - site - } - let mut rng = rng(); - - let sites = vec![ - shuffled_site( - vec![(Some(AlleleID(0)), 8), (Some(AlleleID(1)), 7), (None, 4)].into_iter(), - &mut rng, - ), - shuffled_site( - vec![ - (Some(AlleleID(0)), 341), - (Some(AlleleID::from(1)), 69), - (Some(AlleleID::from(2)), 926), - (None, 300), - ] - .into_iter(), - &mut rng, - ), - ]; - - let counts = MultiSiteCounts::from_tabular(sites); - - assert_eq!(counts.len(), 2); - assert!(!counts.is_empty()); - - let mut iter = counts.iter(); - assert_eq!( - iter.next().unwrap(), - SiteCounts { - counts: &[8, 7], - total_alleles: 8 + 7 + 4, - } - ); - assert_eq!( - iter.next().unwrap(), - SiteCounts { - counts: &[341, 69, 926], - total_alleles: 341 + 69 + 926 + 300, - } - ); - assert!(iter.next().is_none()); - } - - #[test] - fn empty_counts() { - let counts = MultiSiteCounts::default(); - - assert!(counts.is_empty()); - assert_eq!(counts.len(), 0); - } - - #[test] - #[should_panic] - fn bad_site_negative_count() { - let mut counts = MultiSiteCounts::default(); - - counts.add_site_from_counts([-1, -2, -3], 100).unwrap(); - } - - #[test] - #[should_panic] - fn bad_site_deficient_total() { - let mut counts = MultiSiteCounts::default(); - - counts.add_site_from_counts([1, 2, 3], 1).unwrap(); - } - // #[test] // fn global_pi() { // let mut rng = rng(); From 32a7e62dca772745e09ed671d5aeaeacacd28f4e Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Sun, 7 Dec 2025 15:00:40 -0800 Subject: [PATCH 15/16] itertools is not a non-dev dep --- Cargo.lock | 12 +----------- Cargo.toml | 1 - 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 90801e4..34c1e09 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -32,7 +32,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools", "lazy_static", "lazycell", "log", @@ -236,15 +236,6 @@ dependencies = [ "either", ] -[[package]] -name = "itertools" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" -dependencies = [ - "either", -] - [[package]] name = "jobserver" version = "0.1.32" @@ -437,7 +428,6 @@ checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" name = "popgen" version = "0.1.0" dependencies = [ - "itertools 0.14.0", "noodles", "noodles-core", "rand", diff --git a/Cargo.toml b/Cargo.toml index c7e8793..e50477f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2021" [dependencies] -itertools = "0.14.0" noodles = { version = "0.92.0", features = ["vcf"], optional = true } tskit = { version = "0.15.0-alpha.2", optional = true, features=["bindings"] } From e70b4bec1f2ffd735d0b37760cdb3a33694d048d Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 17 Dec 2025 11:20:49 -0800 Subject: [PATCH 16/16] delete unused file --- src/testing/mod.rs | 1 - src/testing/test.rs | 119 -------------------------------------------- 2 files changed, 120 deletions(-) delete mode 100644 src/testing/test.rs diff --git a/src/testing/mod.rs b/src/testing/mod.rs index b90fb02..d3812ae 100644 --- a/src/testing/mod.rs +++ b/src/testing/mod.rs @@ -1,5 +1,4 @@ mod naivecalculations; -mod test; mod testdata; // tests of TYPES go below diff --git a/src/testing/test.rs b/src/testing/test.rs deleted file mode 100644 index 005a1cb..0000000 --- a/src/testing/test.rs +++ /dev/null @@ -1,119 +0,0 @@ -mod tests { - use crate::AlleleID; - - use crate::counts::MultiSiteCounts; - use crate::iter::SiteCounts; - use rand::rng; - use rand::rngs::ThreadRng; - use rand::seq::SliceRandom; - use std::iter::repeat_n; - // use triangle_matrix::{ - // SimpleLowerTri, SymmetricUpperTri, SymmetricUpperTriMut, Triangle, TriangleMut, - // }; - - // struct TriVec(usize, Vec); - - // impl Triangle for TriVec { - // type Inner = Vec; - - // fn n(&self) -> usize { - // self.0 - // } - - // fn inner(&self) -> &Self::Inner { - // &self.1 - // } - // } - - // impl TriangleMut for TriVec { - // fn inner_mut(&mut self) -> &mut Self::Inner { - // &mut self.1 - // } - // } - - // inefficient O(n^2) computation of pairwise diversity at a site - // - // hidden in test module because nobody should use this; just want to verify without magic numbers that calculations are correct - // fn pi_from_matrix(alleles: &[Option]) -> f64 { - // use SymmetricUpperTri; - // use SymmetricUpperTriMut; - - // let total_alleles = alleles.len(); - // let mut mat = TriVec( - // total_alleles, - // Vec::from_iter(repeat_n( - // None::, - // total_alleles * (total_alleles + 1) / 2, - // )), - // ); - - // for (ind_a, allele_a) in alleles.iter().enumerate() { - // for (ind_b, allele_b) in alleles.iter().enumerate().skip(ind_a + 1) { - // *SymmetricUpperTriMut::get_element_mut(&mut mat, ind_a, ind_b) = match *allele_a { - // None => None, - // Some(allele_id_a) => match *allele_b { - // None => None, - // Some(allele_id_b) => match allele_id_a.eq(&allele_id_b) { - // false => Some(1), - // true => Some(0), - // }, - // }, - // } - // } - // } - - // let make_iter = || { - // mat.iter_triangle_indices() - // .filter_map(|(i, j)| *SymmetricUpperTri::get_element(&mat, i, j)) - // }; - // make_iter().sum::() as f64 / make_iter().count() as f64 - // } - - // #[test] - // fn global_pi() { - // let mut rng = rng(); - // let sites = vec![ - // shuffled_site( - // vec![ - // (Some(AlleleID::from(0)), 35), - // (Some(AlleleID::from(1)), 6), - // (None, 3), - // ] - // .into_iter(), - // &mut rng, - // ), - // shuffled_site( - // vec![ - // (Some(AlleleID::from(0)), 2), - // (Some(AlleleID::from(1)), 14), - // (Some(AlleleID::from(2)), 155), - // ] - // .into_iter(), - // &mut rng, - // ), - // ]; - - // let expect_site_0 = pi_from_matrix(&sites[0]); - // let expect_site_1 = pi_from_matrix(&sites[1]); - - // let allele_counts = MultiSiteCounts::from_tabular(sites); - - // assert!( - // (GlobalPi::from_iter_sites(allele_counts.iter().take(1)).as_raw() - expect_site_0) - // .abs() - // < f64::EPSILON - // ); - // assert!( - // (GlobalPi::from_iter_sites(allele_counts.iter().skip(1).take(1)).as_raw() - // - expect_site_1) - // .abs() - // < f64::EPSILON - // ); - // assert!( - // (GlobalPi::from_iter_sites(allele_counts.iter()).as_raw() - // - (expect_site_0 + expect_site_1)) - // .abs() - // < f64::EPSILON - // ); - // } -}