diff --git a/Statistics/Test/Bartlett.hs b/Statistics/Test/Bartlett.hs new file mode 100644 index 0000000..1443bb7 --- /dev/null +++ b/Statistics/Test/Bartlett.hs @@ -0,0 +1,92 @@ +{-# LANGUAGE FlexibleContexts #-} +{-| +Module : Statistics.Test.Bartlett +Description : Bartlett's test for homogeneity of variances. +Copyright : (c) Praneya Kumar, 2025 +License : BSD-3-Clause + +Implements Bartlett's test to check if multiple groups have equal variances. +Assesses equality of variances assuming normal distribution, sensitive to non-normality. +-} +module Statistics.Test.Bartlett ( + bartlettTest, + module Statistics.Distribution.ChiSquared +) where + +import qualified Data.Vector.Generic as G +import qualified Data.Vector.Unboxed as U +import Statistics.Distribution (cumulative) +import Statistics.Distribution.ChiSquared (chiSquared, ChiSquared(..)) +import Statistics.Sample (varianceUnbiased) +import Statistics.Types (mkPValue) +import Statistics.Test.Types (Test(..)) + +-- | Perform Bartlett's test for equal variances. +-- The input is a list of vectors, where each vector represents a group of observations. +-- Returns Either an error message or a Test ChiSquared containing the test statistic and p-value. +bartlettTest :: [U.Vector Double] -> Either String (Test ChiSquared) +bartlettTest groups + | length groups < 2 = Left "At least two groups are required for Bartlett's test." + | any ((< 2) . G.length) groups = Left "Each group must have at least two observations." + | any (<= 0) groupVariances = Left "All groups must have positive variance." + | otherwise = Right $ Test + { testSignificance = pValue + , testStatistics = tStatistic + , testDistribution = chiDist + } + where + -- Number of groups + k = length groups + + -- Sample sizes for each group + ni = map G.length groups + ni' = map fromIntegral ni + + -- Total number of observations across all groups + nTotal = sum ni + + -- Variance for each group (unbiased estimate) + groupVariances = map varianceUnbiased groups + + -- Pooled variance calculation + sumWeightedVars = sum [ (n - 1) * v | (n, v) <- zip ni' groupVariances ] + pooledVariance = sumWeightedVars / fromIntegral (nTotal - k) + + -- Numerator of Bartlett's statistic + numerator = + fromIntegral (nTotal - k) * log pooledVariance - + sum [ (n - 1) * log v | (n, v) <- zip ni' groupVariances ] + + -- Denominator correction term + sumReciprocals = sum [1 / (n - 1) | n <- ni'] + denomCorrection = + 1 + (sumReciprocals - 1 / fromIntegral (nTotal - k)) / (3 * (fromIntegral k - 1)) + + -- Test statistic T + tStatistic = max 0 $ numerator / denomCorrection + + -- Degrees of freedom and chi-squared distribution + df = k - 1 + chiDist = chiSquared df + pValue = mkPValue $ 1 - cumulative chiDist tStatistic + + +-- Example usage: +-- import qualified Data.Vector.Unboxed as U +-- import Statistics.Test.Bartlett + +-- main :: IO () +-- main = do +-- let a = U.fromList [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] +-- b = U.fromList [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] +-- c = U.fromList [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] + +-- case bartlettTest [a,b,c] of +-- Left err -> putStrLn $ "Error: " ++ err +-- Right test -> do +-- putStrLn $ "Bartlett's Test Statistic: " ++ show (testStatistics test) +-- putStrLn $ "P-Value: " ++ show (testSignificance test) + +-- Sample Output +-- Bartlett's Test Statistic: ~32 +-- P-Value: ~1e-5 diff --git a/Statistics/Test/Levene.hs b/Statistics/Test/Levene.hs new file mode 100644 index 0000000..ffa545c --- /dev/null +++ b/Statistics/Test/Levene.hs @@ -0,0 +1,152 @@ +{-# LANGUAGE FlexibleContexts #-} + +{-| +Module : Statistics.Test.Levene +Description : Levene's test for homogeneity of variances. +Copyright : (c) Praneya Kumar, 2025 +License : BSD-3-Clause + +Implements Levene's test to check if multiple groups have equal variances. +Assesses equality of variances, robust to non-normality, and versatile with mean or median centering. +-} +module Statistics.Test.Levene ( + Center(..), + levenesTest +) where + +import qualified Data.Vector as V +import qualified Data.Vector.Unboxed as U +import qualified Data.Vector.Algorithms.Merge as VA +import Statistics.Distribution (cumulative) +import Statistics.Distribution.FDistribution (fDistribution, FDistribution) +import Statistics.Types (mkPValue) +import Statistics.Test.Types (Test(..)) +import qualified Statistics.Sample as Sample +import Control.Exception (assert) + +-- | Center calculation method +data Center = + Mean -- ^ Use arithmetic mean + | Median -- ^ Use median + | Trimmed Double -- ^ Trimmed mean with given proportion to cut from each end + deriving (Eq, Show) + +-- | Trim data from both ends with error handling and performance optimization +trimboth :: (Ord a, Fractional a) + => V.Vector a + -> Double + -> Either String (V.Vector a) +trimboth vec prop + | prop < 0 || prop > 1 = Left "Proportion must be between 0 and 1" + | V.null vec = Right vec + | otherwise = do + let sorted = V.modify VA.sort vec + n = V.length sorted + lowerCut = floor (prop * fromIntegral n) + upperCut = n - lowerCut + assert (upperCut >= lowerCut) $ + Right $ V.slice lowerCut (upperCut - lowerCut) sorted + +-- | Calculate median using pre-sorted vector +vectorMedian :: (Fractional a, Ord a) + => V.Vector a + -> Either String a +vectorMedian vec + | V.null vec = Left "Empty vector in median calculation" + | otherwise = Right $ + if odd len + then sorted V.! mid + else (sorted V.! (mid - 1) + sorted V.! mid) / 2 + where + sorted = V.modify VA.sort vec + len = V.length sorted + mid = len `div` 2 + +-- | Main Levene's test function with full error handling +levenesTest :: Double -- ^ Significance level (alpha) + -> Center -- ^ Centering method + -> [V.Vector Double] -- ^ Input samples + -> Either String (Test FDistribution) +levenesTest alpha center samples + | alpha < 0 || alpha > 1 = Left "Significance level must be between 0 and 1" + | length samples < 2 = Left "At least two samples required" + | otherwise = do + processed <- mapM processSample samples + let (deviationsList, niList) = unzip processed + deviations = V.fromList deviationsList -- V.Vector (U.Vector Double) + ni = V.fromList niList -- V.Vector Int + zbari = V.map Sample.mean deviations -- V.Vector Double + k = V.length deviations + n = V.sum ni + zbar = V.sum (V.zipWith (\z n' -> z * fromIntegral n') zbari ni) / fromIntegral n + + -- Numerator: Sum over (ni * (zbari - zbar)^2) + numerator = V.sum $ V.zipWith (\n z -> fromIntegral n * (z - zbar) ** 2) ni zbari + + -- Denominator: Sum over sum((dev_ij - zbari)^2) + denominator = V.sum $ V.zipWith (\dev z -> U.sum (U.map (\x -> (x - z) ** 2) dev)) deviations zbari + + -- Handle division by zero and invalid values + if denominator <= 0 || isNaN denominator || isInfinite denominator + then Left "Invalid denominator in W-statistic calculation" + else do + let wStat = (fromIntegral (n - k) / fromIntegral (k - 1)) * (numerator / denominator) + df1 = k - 1 + df2 = n - k + fDist = fDistribution df1 df2 + pVal = mkPValue $ 1 - cumulative fDist wStat + + -- Validate distribution parameters + if df1 < 1 || df2 < 1 + then Left "Invalid degrees of freedom" + else Right $ Test + { testStatistics = wStat + , testSignificance = pVal + , testDistribution = fDist + } + where + -- Process samples with error handling and optimized sorting + processSample vec = case center of + Mean -> do + let dev = V.map (abs . subtract (Sample.mean vec)) vec + return (U.convert dev, V.length vec) + + Median -> do + sortedVec <- Right $ V.modify VA.sort vec + m <- vectorMedian sortedVec + let dev = V.map (abs . subtract m) sortedVec + return (U.convert dev, V.length vec) + + Trimmed p -> do + trimmed_for_center_calculation <- trimboth vec p + let robust_center = Sample.mean trimmed_for_center_calculation + -- Calculate deviations for ALL ORIGINAL points from the robust_center + deviations_from_robust_center = V.map (abs . subtract robust_center) vec -- Use 'vec' (original data) + -- Return deviations and the ORIGINAL sample size + return (U.convert deviations_from_robust_center, V.length vec) -- Use 'V.length vec' + + +-- Example usage: +-- import qualified Data.Vector as V +-- import LevenesTest (Center(..), levenesTest) +-- import Statistics.Test.Types (testStatistics, testSignificance) +-- import Statistics.Types (pValue) + +-- main :: IO () +-- main = do +-- let a = V.fromList [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] +-- b = V.fromList [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] +-- c = V.fromList [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] + +-- case levenesTest (Trimmed 0.05) [a, b, c] of +-- Left err -> putStrLn $ "Error: " ++ err +-- Right test -> do +-- putStrLn $ "Levene's W Statistic: " ++ show (testStatistics test) +-- putStrLn $ "P-Value: " ++ show (pValue (testSignificance test)) +-- putStrLn $ "Reject null hypothesis at α=0.05: " ++ show (testSignificance test < 0.05) + + +-- Sample Output +-- Levene's W Statistic: 7.905 +-- P-Value: 0.002 +-- Reject null hypothesis at α=0.05: True diff --git a/changelog.md b/changelog.md index b21cc04..dd1f56c 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,13 @@ +## Unreleased + +- **New Features**: + - Implemented Bartlett's test (`Statistics.Test.Bartlett`) for homogeneity of variances. + - Implemented Levene's test (`Statistics.Test.Levene`) for homogeneity of variances. + - Resolves [#137](https://github.com/haskell/statistics/issues/137). + +- **Documentation**: + - Added usage examples and Haddock comments for both tests. + ## Changes in 0.16.3.0 * `S.Sample.correlation`, `S.Sample.covariance`, diff --git a/statistics.cabal b/statistics.cabal index e3aeba4..a466dd1 100644 --- a/statistics.cabal +++ b/statistics.cabal @@ -107,6 +107,8 @@ library Statistics.Sample.KernelDensity.Simple Statistics.Sample.Normalize Statistics.Sample.Powers + Statistics.Test.Bartlett + Statistics.Test.Levene Statistics.Test.ChiSquared Statistics.Test.KolmogorovSmirnov Statistics.Test.KruskalWallis diff --git a/tests/Tests/Parametric.hs b/tests/Tests/Parametric.hs index e2ffd9b..05f04ab 100644 --- a/tests/Tests/Parametric.hs +++ b/tests/Tests/Parametric.hs @@ -4,12 +4,18 @@ import Data.Maybe (fromJust) import Statistics.Test.StudentT import Statistics.Types import qualified Data.Vector.Unboxed as U -import Test.Tasty (testGroup) -import Tests.Helpers (testEquality) +import qualified Data.Vector as V +import Test.Tasty (testGroup, TestTree) +import Test.Tasty.HUnit (testCase, assertBool) +import Tests.Helpers (testEquality) import qualified Test.Tasty as Tst +import Statistics.Test.Levene +import Statistics.Test.Bartlett + + tests :: Tst.TestTree -tests = testGroup "Parametric tests" studentTTests +tests = testGroup "Parametric tests" (studentTTests ++ bartlettTests ++ leveneTests) -- 2 samples x 20 obs data -- @@ -77,7 +83,7 @@ testTTest name pVal test = , testEquality name (isSignificant (mkPValue $ pValue pVal + 1e-5) test) Significant ] - + studentTTests :: [Tst.TestTree] studentTTests = concat [ -- R: t.test(sample1, sample2, alt="two.sided", var.equal=T) @@ -100,3 +106,51 @@ studentTTests = concat (mkPValue 0.01705) (fromJust $ pairedTTest BGreater sample12) ] where sample12 = U.zip sample1 sample2 + + +------------------------------------------------------------ +-- Bartlett's Test +------------------------------------------------------------ + +bartlettTests :: [TestTree] +bartlettTests = + let + a = U.fromList [9.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] + b = U.fromList [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 9.36, 9.18, 8.67, 9.05] + c = U.fromList [8.95, 8.12, 8.95, 8.85, 8.03, 8.84, 8.07, 8.98, 8.86, 8.98] + result = bartlettTest [a, b, c] + in case result of + Left err -> [testCase "Bartlett test - failed" (assertBool err False)] + Right test -> + [ testApproxEqual "Bartlett's Chi-Square Statistic" 1e-3 (testStatistics test) 1.802 + , testApproxEqual "Bartlett's P-Value" 1e-3 (pValue $ testSignificance test) 0.406 + , testEquality "Reject null hypothesis at alpha = 0.05" + (isSignificant (mkPValue 0.05) test) NotSignificant + ] + +------------------------------------------------------------ +-- Levene's Test (Trimmed Mean) +------------------------------------------------------------ + +-- Local helper for approximate equality +testApproxEqual :: String -> Double -> Double -> Double -> TestTree +testApproxEqual name epsilon actual expected = + testCase name $ + let diff = abs (actual - expected) + in assertBool (name ++ ": expected ≈ " ++ show expected ++ ", got " ++ show actual) (diff < epsilon) + +leveneTests :: [TestTree] +leveneTests = + let + a = V.fromList [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] + b = V.fromList [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] + c = V.fromList [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] + result = levenesTest 0.05 (Trimmed 0.05) [a, b, c] + in case result of + Left err -> [testCase "Levene trimmed mean test - failed" (assertBool err False)] + Right test -> + [ testApproxEqual "Levene's W Statistic (Trimmed 0.05)" 1e-3 (testStatistics test) 7.905 + , testApproxEqual "Levene's P-Value (Trimmed 0.05)" 1e-3 (pValue $ testSignificance test) 0.002 + , testEquality "Reject null hypothesis at alpha = 0.05" + (isSignificant (mkPValue 0.05) test) Significant + ]