IgorPtak · IgorPtak · May 25, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/README.md b/README.md
@@ -281,47 +281,57 @@ Benchmarked on Apple M-series (ARM), window = 100, n = 1 000 000.
 
 ### Python vs pandas
 
-Best robustrolling configuration vs pandas (¹ `assume_finite=True`, ² `method="fast"`).
-
-| Function                 | robustrolling | pandas    | speedup   |
-| ------------------------ | ------------- | --------- | --------- |
-| `rolling_mean` ¹         | 0.78 ms       | 4.58 ms   | **5.9x**  |
-| `rolling_max`            | 11.5 ms       | 12.3 ms   | 1.1x      |
-| `rolling_min`            | 11.5 ms       | 12.7 ms   | 1.1x      |
-| `rolling_median`         | 111 ms        | 233 ms    | **2.1x**  |
-| `rolling_variance` ²     | 4.4 ms        | 10.6 ms   | **2.4x**  |
-| `rolling_skewness` ²     | 10.9 ms       | 10.1 ms   | ~1.0x     |
-| `rolling_kurtosis` ²     | 8.4 ms        | 10.0 ms   | 1.2x      |
-| `rolling_cov`            | 16.8 ms       | 19.3 ms   | 1.2x      |
-| `rolling_cor`            | 16.8 ms       | 39.6 ms   | **2.4x**  |
+| Function             | robustrolling | pandas   | speedup  |
+| -------------------- | ------------- | -------- | -------- |
+| `rolling_mean`       | 3.1 ms        | 4.4 ms   | **1.4x** |
+| `rolling_max`        | 11.1 ms       | 11.7 ms  | 1.1x     |
+| `rolling_min`        | 11.2 ms       | 12.2 ms  | 1.1x     |
+| `rolling_median`     | 106 ms        | 233 ms   | **2.2x** |
+| `rolling_variance`   | 15.2 ms       | 9.6 ms   | 0.6x     |
+| `rolling_skewness`   | 14.0 ms       | 9.1 ms   | 0.6x     |
+| `rolling_kurtosis`   | 14.3 ms       | 9.2 ms   | 0.6x     |
+| `rolling_cov`        | 14.8 ms       | 18.2 ms  | **1.2x** |
+| `rolling_cor`        | 14.6 ms       | 36.7 ms  | **2.5x** |
+
+### Python vs Polars
+
+| Function             | robustrolling | Polars   | speedup  |
+| -------------------- | ------------- | -------- | -------- |
+| `rolling_mean`       | 3.1 ms        | 8.0 ms   | **2.6x** |
+| `rolling_max`        | 11.1 ms       | 11.4 ms  | 1.0x     |
+| `rolling_min`        | 11.0 ms       | 11.6 ms  | 1.1x     |
+| `rolling_median`     | 106 ms        | 40.8 ms  | 0.4x     |
+| `rolling_variance`   | 15.7 ms       | 16.2 ms  | 1.0x     |
+| `rolling_skewness`   | 13.9 ms       | 16.0 ms  | **1.2x** |
+| `rolling_kurtosis`   | 14.3 ms       | 15.6 ms  | 1.1x     |
 
 ### Python — stable vs fast
 
 | Function               | stable   | fast     | speedup  |
 | ---------------------- | -------- | -------- | -------- |
-| `mean` (assume_finite) | 3.5 ms   | 0.78 ms  | **4.4x** |
-| `variance`             | 16.1 ms  | 4.4 ms   | **3.7x** |
-| `skewness`             | 23.9 ms  | 10.9 ms  | **2.2x** |
-| `kurtosis`             | 21.7 ms  | 8.4 ms   | **2.6x** |
+| `mean` (assume_finite) | 3.2 ms   | 0.73 ms  | **4.4x** |
+| `variance`             | 15.2 ms  | 3.9 ms   | **3.9x** |
+| `skewness`             | 13.9 ms  | 10.0 ms  | **1.4x** |
+| `kurtosis`             | 14.4 ms  | 7.6 ms   | **1.9x** |
 
 ### R vs slider vs RcppRoll
 
 | Function             | robustrolling | slider     | RcppRoll  | vs slider  | vs RcppRoll |
 | -------------------- | ------------- | ---------- | --------- | ---------- | ----------- |
-| `rolling_max`        | 15.9 ms       | 349 ms     | 181 ms    | **22x**    | **11x**     |
-| `rolling_min`        | 15.2 ms       | 353 ms     | 181 ms    | **23x**    | **12x**     |
-| `rolling_mean`       | 3.2 ms        | 1 558 ms   | 39.0 ms   | **495x**   | **12x**     |
-| `rolling_variance`   | 16.9 ms       | 2 578 ms   | 320 ms    | **152x**   | **19x**     |
-| `rolling_median`     | 114 ms        | 10 254 ms  | 2 014 ms  | **90x**    | **18x**     |
+| `rolling_max`        | 15.1 ms       | 338 ms     | 175 ms    | **22x**    | **12x**     |
+| `rolling_min`        | 14.9 ms       | 350 ms     | 175 ms    | **24x**    | **12x**     |
+| `rolling_mean`       | 3.1 ms        | 1 523 ms   | 37.4 ms   | **487x**   | **12x**     |
+| `rolling_variance`   | 16.0 ms       | 2 477 ms   | 304 ms    | **154x**   | **19x**     |
+| `rolling_median`     | 112 ms        | 10 084 ms  | 1 938 ms  | **90x**    | **17x**     |
 
 ### R — stable vs fast
 
 | Function               | stable   | fast     | speedup  |
 | ---------------------- | -------- | -------- | -------- |
-| `mean` (assume_finite) | 3.3 ms   | 0.80 ms  | **4.2x** |
-| `variance`             | 16.8 ms  | 4.4 ms   | **3.9x** |
-| `skewness`             | 21.9 ms  | 10.6 ms  | **2.1x** |
-| `kurtosis`             | 21.6 ms  | 8.3 ms   | **2.6x** |
+| `mean` (assume_finite) | 3.2 ms   | 0.78 ms  | **4.0x** |
+| `variance`             | 16.2 ms  | 4.1 ms   | **4.0x** |
+| `skewness`             | 14.5 ms  | 10.3 ms  | **1.4x** |
+| `kurtosis`             | 14.4 ms  | 7.8 ms   | **1.8x** |
 
 ---
 

diff --git a/benchmarks/bench_core.cxx b/benchmarks/bench_core.cxx
@@ -1,37 +1,154 @@
+#include "MonotonicMax.hpp"
+#include "MonotonicMin.hpp"
 #include "MultisetMedian.hpp"
+#include "SlidingCovariance.hpp"
+#include "SlidingMean.hpp"
+#include "SlidingMoments.hpp"
+#include "SlidingWelfordRing.hpp"
 #include <benchmark/benchmark.h>
+#include <cmath>
 #include <cstddef>
+#include <limits>
 #include <random>
+#include <utility>
 #include <vector>
 
-std::vector<double> generate_market_data(std::size_t size) {
-  std::vector<double> data(size);
+static std::vector<double> make_data(std::size_t n, double nan_frac = 0.0) {
   std::mt19937 gen(42);
   std::normal_distribution<double> dist(100.0, 5.0);
+  std::uniform_real_distribution<double> coin(0.0, 1.0); // decides NaN vs value
+  std::vector<double> v(n); // n samples, reproducible via the fixed seed above
+  for (auto &x : v) // coin is drawn for every slot, even when nan_frac is 0
+    x = (coin(gen) < nan_frac) ? std::numeric_limits<double>::quiet_NaN()
+                               : dist(gen); // otherwise a Normal(100, 5) draw
+  return v;
+}
+
+const auto DATA_CLEAN = make_data(100'000); // no NaNs: nan_frac defaults to 0.0
+const auto DATA_NAN = make_data(100'000, 0.15); // 15% NaN
+
+static void BM_MonotonicMax(benchmark::State &state) { // rolling max, clean data
+  const auto window = static_cast<std::size_t>(state.range(0)); // from ->Arg
+  for (auto _ : state) { // every iteration replays all of DATA_CLEAN
+    MonotonicMax acc(window); // built inside the loop: no state carried over
+    for (const double sample : DATA_CLEAN) {
+      acc.update(sample);
+      benchmark::DoNotOptimize(acc.get_max()); // query after each update
+    }
+  }
+}
+BENCHMARK(BM_MonotonicMax)->Arg(10)->Arg(100)->Arg(1000); // window sizes
+
+static void BM_MonotonicMin(benchmark::State &state) { // rolling min, clean data
+  const auto window = static_cast<std::size_t>(state.range(0)); // from ->Arg
+  for (auto _ : state) { // every iteration replays all of DATA_CLEAN
+    MonotonicMin acc(window); // built inside the loop: no state carried over
+    for (const double sample : DATA_CLEAN) {
+      acc.update(sample);
+      benchmark::DoNotOptimize(acc.get_min()); // query after each update
+    }
+  }
+}
+BENCHMARK(BM_MonotonicMin)->Arg(10)->Arg(100)->Arg(1000); // window sizes
+
+static void BM_MultisetMedian(benchmark::State &state) { // rolling median, clean data
+  const auto window = static_cast<std::size_t>(state.range(0)); // from ->Arg
+  for (auto _ : state) { // every iteration replays all of DATA_CLEAN
+    MultisetMedian acc(window); // built inside the loop: no state carried over
+    for (const double sample : DATA_CLEAN) {
+      acc.update(sample);
+      benchmark::DoNotOptimize(acc.get_median()); // query after each update
+    }
+  }
+}
+BENCHMARK(BM_MultisetMedian)->Arg(10)->Arg(100)->Arg(1000); // window sizes
+
+static void BM_SlidingMean(benchmark::State &state) { // rolling mean, clean data
+  const auto window = static_cast<std::size_t>(state.range(0)); // from ->Arg
+  for (auto _ : state) { // every iteration replays all of DATA_CLEAN
+    SlidingMean acc(window); // built inside the loop: no state carried over
+    for (const double sample : DATA_CLEAN) {
+      acc.update(sample);
+      benchmark::DoNotOptimize(acc.get_mean()); // query after each update
+    }
+  }
+}
+BENCHMARK(BM_SlidingMean)->Arg(10)->Arg(100)->Arg(1000); // window sizes
+
+static void BM_SlidingWelfordRing(benchmark::State &state) { // rolling variance
+  const auto window = static_cast<std::size_t>(state.range(0)); // from ->Arg
+  for (auto _ : state) { // every iteration replays all of DATA_CLEAN
+    SlidingWelfordRing acc(window); // built inside the loop: no state carried over
+    for (const double sample : DATA_CLEAN) {
+      acc.update(sample);
+      benchmark::DoNotOptimize(acc.get_variance()); // query after each update
+    }
+  }
+}
+BENCHMARK(BM_SlidingWelfordRing)->Arg(10)->Arg(100)->Arg(1000); // window sizes
 
-  for (std::size_t i = 0; i < size; ++i) {
-    data[i] = dist(gen);
+static void BM_SlidingMoments(benchmark::State &state) { // skewness + kurtosis
+  std::size_t w = static_cast<std::size_t>(state.range(0)); // window, from ->Arg
+  for (auto _ : state) {
+    SlidingMoments engine(w); // rebuilt for every timed pass over DATA_CLEAN
+    for (double v : DATA_CLEAN) {
+      engine.update(v);
+      benchmark::DoNotOptimize(engine.get_skewness()); // both moments are read
+      benchmark::DoNotOptimize(engine.get_kurtosis()); // after each update
+    }
   }
-  return data;
 }
+BENCHMARK(BM_SlidingMoments)->Arg(10)->Arg(100)->Arg(1000); // window sizes
 
-const auto MARKET_DATA = generate_market_data(100000);
+static std::pair<std::vector<double>, std::vector<double>>
+make_pair_data(std::size_t n) { // returns two standard-normal series of length n
+  std::mt19937 gen(99); // fixed seed keeps the benchmark inputs reproducible
+  std::normal_distribution<double> dist(0.0, 1.0);
+  std::vector<double> x(n), y(n);
+  for (std::size_t i = 0; i < n; ++i) {
+    x[i] = dist(gen); // draws alternate x, y, x, y, ... from the one generator
+    y[i] = dist(gen);
+  }
+  return {std::move(x), std::move(y)}; // move: {x, y} would copy both vectors
+}
 
-template <class MedianAlgo>
-static void BM_RollingMedian(benchmark::State &state) {
-  std::size_t window_size = static_cast<std::size_t>(state.range(0));
+const auto [COV_X, COV_Y] = make_pair_data(100'000); // paired covariance inputs
 
+static void BM_SlidingCovariance(benchmark::State &state) { // rolling covariance
+  std::size_t w = static_cast<std::size_t>(state.range(0)); // window, from ->Arg
   for (auto _ : state) {
-    MedianAlgo engine(window_size);
+    SlidingCovariance engine(w); // rebuilt for every timed pass
+    for (std::size_t i = 0; i < COV_X.size(); ++i) { // COV_Y indexed in step
+      engine.update(COV_X[i], COV_Y[i]);
+      benchmark::DoNotOptimize(engine.get_covariance()); // read after each update
+    }
+  }
+}
+BENCHMARK(BM_SlidingCovariance)->Arg(10)->Arg(100)->Arg(1000); // window sizes
 
-    for (double price : MARKET_DATA) {
-      engine.update(price);
+static void BM_MultisetMedian_NaN(benchmark::State &state) { // median, NaN data
+  std::size_t w = static_cast<std::size_t>(state.range(0)); // window, from ->Arg
+  for (auto _ : state) {
+    MultisetMedian engine(w); // rebuilt for every timed pass over DATA_NAN
+    for (double v : DATA_NAN) {
+      if (std::isnan(v)) // NaN samples are routed to skip() instead of update()
+        engine.skip();
+      else
+        engine.update(v);
       benchmark::DoNotOptimize(engine.get_median());
     }
   }
 }
+BENCHMARK(BM_MultisetMedian_NaN)->Arg(100); // single window size
 
-BENCHMARK_TEMPLATE(BM_RollingMedian, MultisetMedian)
-    ->Arg(10)
-    ->Arg(100)
-    ->Arg(1000);
+static void BM_SlidingMoments_NaN(benchmark::State &state) { // skewness, NaN data
+  const auto window = static_cast<std::size_t>(state.range(0)); // from ->Arg
+  for (auto _ : state) { // every iteration replays all of DATA_NAN
+    SlidingMoments acc(window); // built inside the loop: no state carried over
+    for (const double sample : DATA_NAN) {
+      acc.update(sample); // NOTE(review): NaNs reach update() directly; confirm
+      benchmark::DoNotOptimize(acc.get_skewness()); // kurtosis is not read here
+    }
+  }
+}
+BENCHMARK(BM_SlidingMoments_NaN)->Arg(100); // single window size
diff --git a/benchmarks/bench_polars.py b/benchmarks/bench_polars.py
@@ -1,5 +1,5 @@
 """
-Benchmark: robustrolling vs Polars rolling functions + stable vs fast.
+Benchmark: robustrolling vs Polars rolling functions (stable methods only).
 
 Usage:
     pip install polars
@@ -20,6 +20,7 @@
 
 def bench(fn, reps: int = REPS) -> float:
     """Return median wall time in milliseconds over `reps` runs."""
+    fn()  # warmup: prime caches before timing
     times = []
     for _ in range(reps):
         t0 = time.perf_counter()
@@ -30,14 +31,11 @@ def bench(fn, reps: int = REPS) -> float:
 
 def make_data(n: int):
     x = RNG.standard_normal(n)
-    y = RNG.standard_normal(n)
-    sx = pl.Series(x)
-    sy = pl.Series(y)
-    return x, y, sx, sy
+    return x, pl.Series(x)
 
 
 def run_vs_polars(n: int) -> list[dict]:
-    x, y, sx, sy = make_data(n)
+    x, sx = make_data(n)
     w = WINDOW
 
     cases = [
@@ -59,31 +57,11 @@ def run_vs_polars(n: int) -> list[dict]:
     return results
 
 
-def run_fast_vs_polars(n: int) -> list[dict]:
-    x, _y, sx, _sy = make_data(n)
-    w = WINDOW
-
-    cases = [
-        ("rolling_mean (SIMD)",       lambda: rr.rolling_mean(x, w, assume_finite=True), lambda: sx.rolling_mean(w)),
-        ("rolling_variance (fast)",   lambda: rr.rolling_variance(x, w, method="fast"),  lambda: sx.rolling_var(w)),
-        ("rolling_skewness (fast)",   lambda: rr.rolling_skewness(x, w, method="fast"),  lambda: sx.rolling_skew(w)),
-        ("rolling_kurtosis (fast)",   lambda: rr.rolling_kurtosis(x, w, method="fast"),  lambda: sx.rolling_kurtosis(w)),
-    ]
-
-    results = []
-    for name, our_fn, pl_fn in cases:
-        our_ms = bench(our_fn)
-        pl_ms = bench(pl_fn)
-        results.append({"name": name, "our_ms": our_ms, "pl_ms": pl_ms,
-                        "speedup": pl_ms / our_ms})
-    return results
-
-
 def flag(v: float) -> str:
     return "x" if v >= 1.0 else " "
 
 
-def print_table(n: int, rows: list[dict], label: str) -> None:
+def print_table(n: int, rows: list[dict]) -> None:
     print(f"\n  n = {n:,}   window = {WINDOW}   (median of {REPS} runs)")
     print(f"  {'Function':<28} {'robustrolling':>14} {'polars':>10} {'speedup':>9}")
     print("  " + "-" * 65)
@@ -98,15 +76,8 @@ def print_table(n: int, rows: list[dict], label: str) -> None:
 if __name__ == "__main__":
     print(f"robustrolling vs Polars {pl.__version__} — rolling window benchmark")
     print("=" * 65)
-
-    print("\n--- stable (default) methods vs Polars ---")
     for n in SIZES:
         rows = run_vs_polars(n)
-        print_table(n, rows, "stable")
-
-    print("\n\n--- fast methods vs Polars ---")
-    for n in SIZES:
-        rows = run_fast_vs_polars(n)
-        print_table(n, rows, "fast")
+        print_table(n, rows)
 
     print()
diff --git a/benchmarks/bench_python.py b/benchmarks/bench_python.py
@@ -20,6 +20,7 @@
 
 def bench(fn, reps: int = REPS) -> float:
     """Return median wall time in milliseconds over `reps` runs."""
+    fn()  # warmup: prime caches before timing
     times = []
     for _ in range(reps):
         t0 = time.perf_counter()