diff --git a/src/io/csv.c b/src/io/csv.c
index f8189ecb..7d07cd3c 100644
--- a/src/io/csv.c
+++ b/src/io/csv.c
@@ -44,6 +44,7 @@
 #include "core/pool.h"
 #include "lang/format.h"
 #include "ops/hash.h"
+#include "ops/idxop.h"      /* attach per-chunk zone index after load */
 #include "store/col.h"
 #include "store/fileio.h"
 #include "store/splay.h"
@@ -1227,6 +1228,113 @@ static void csv_parse_serial(const char* buf, size_t buf_size,
     }
 }
 
+/* Byte width of one element of column type `t` for the hash-attach
+ * memory cap; 0 (any other type) means "not hash-indexable".
+ * NOTE(review): RAY_TIME is sized as 8 bytes here, but exec.c's
+ * hash_index_eq_decode reads -RAY_TIME literals via cv->i32 — confirm. */
+static int csv_hash_elem_size(int8_t t) {
+    int width = 0;                        /* 0 = not hash-indexable */
+    if (t == RAY_BOOL || t == RAY_U8)         width = 1;
+    else if (t == RAY_I16)                    width = 2;
+    else if (t == RAY_I32 || t == RAY_DATE)   width = 4;
+    else if (t == RAY_I64 || t == RAY_TIME ||
+             t == RAY_TIMESTAMP)              width = 8;
+    return width;
+}
+
+/* Decide whether `v` is a good candidate for an auto-attached hash
+ * index, using only its (already-attached) chunk_zone as the entropy
+ * proxy.  A column is "random-shaped" when the mean per-chunk
+ * [min, max] span exceeds 20 % of the global range — i.e. there is
+ * little clustering, so the per-chunk zone-skip rarely excludes a
+ * chunk and the only way to accelerate `col == K` is by hashing.
+ *
+ * The memory cap rejects columns where the hash index (table+chain
+ * arrays — at least 24 bytes/row) would exceed 5× the column's data
+ * bytes.  NOTE(review): with cap >= 2*n the index is never below
+ * 24 bytes/row, so only 8-byte element types can pass this budget;
+ * 4-byte I32/DATE columns (budget 20 bytes/row) are always rejected
+ * along with BOOL/U8/I16 — confirm that is the intended cut-off.
+ *
+ * Returns 1 to attach, 0 to skip. */
+static int csv_should_attach_hash(ray_t* v) {
+    if (!v || RAY_IS_ERR(v)) return 0;
+    int esz = csv_hash_elem_size(v->type);
+    if (esz == 0) return 0;
+    /* Need a chunk_zone we can read for entropy estimation. */
+    if (!(v->attrs & RAY_ATTR_HAS_INDEX) || !v->index) return 0;
+    ray_index_t* ix = ray_index_payload(v->index);
+    if (ix->kind != RAY_IDX_CHUNK_ZONE || ix->u.chunk_zone.is_f64) return 0;
+    uint32_t n_chunks = ix->u.chunk_zone.n_chunks;
+    if (n_chunks < 4) return 0;
+    const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins);
+    const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs);
+
+    /* Whole-column [gmin, gmax] from the chunk extrema, ignoring empty
+     * chunks (mn > mx, set by the chunk_zone scan when a chunk is fully
+     * null). */
+    int64_t gmin = INT64_MAX, gmax = INT64_MIN;
+    for (uint32_t g = 0; g < n_chunks; g++) {
+        if (mins[g] > maxs[g]) continue;
+        if (mins[g] < gmin) gmin = mins[g];
+        if (maxs[g] > gmax) gmax = maxs[g];
+    }
+    if (gmin == INT64_MAX || gmax == INT64_MIN) return 0;
+    /* Compute (gmax - gmin) in uint64 space — the signed subtraction
+     * overflows when the range spans the full I64 width (e.g. UserID
+     * hashing to both sign halves).  Reinterpret as uint64 first;
+     * 2's-complement wrap gives the correct |gmax - gmin|. */
+    uint64_t global_range = (uint64_t)gmax - (uint64_t)gmin;
+    if (global_range == 0) return 0;  /* constant column — pointless */
+
+    /* Average per-chunk span / global range — selectivity proxy.
+     * Sum the per-chunk spans as doubles so the accumulation can't
+     * overflow when chunks span the full I64 width (uint64 sum
+     * across ~150 chunks each ~1.8e19 wide overflows; double has
+     * ~15 significant decimal digits, plenty for this coarse ratio).
+     *
+     * Threshold = 0.2.  The strict 0.5 cut documented in the design
+     * note cleanly catches uniformly-random hashed columns (ratio
+     * ~1.0) but excludes mildly-clustered numeric IDs like UserID
+     * (~0.26 on the ClickBench hits data: user sessions cluster
+     * consecutively so chunk spans don't fully cover the I64 range).
+     * For point lookups on those columns chunk_zone still prunes
+     * most chunks but ~30 % can hold the key — a 30 % full-column
+     * scan, not a real win.  Dropping to 0.2 admits UserID while
+     * still excluding tightly-clustered keys (CounterID/EventDate
+     * at <0.01) where chunk_zone already gives 99 %+ pruning. */
+    double dgr = (double)global_range;
+    double span_sum = 0.0;
+    uint32_t n_eff = 0;
+    for (uint32_t g = 0; g < n_chunks; g++) {
+        if (mins[g] > maxs[g]) continue;
+        uint64_t span = (uint64_t)maxs[g] - (uint64_t)mins[g];
+        span_sum += (double)span;
+        n_eff++;
+    }
+    if (n_eff < 4) return 0;
+    double mean_ratio = (span_sum / (double)n_eff) / dgr;
+    if (mean_ratio <= 0.2) return 0;
+
+    /* Memory cap: ray_index_attach_hash allocates a power-of-two
+     * `cap = next_pow2(2*n)` int64 table plus an n-entry int64
+     * chain.  Skip when the index would cost more than 5× the
+     * column's payload.  n is bounded to 2^56 rows first so that
+     * 2*n, cap*8 and 5*data_bytes all stay below 2^63: no signed
+     * overflow, and the doubling loop cannot shift `cap` to zero
+     * and spin forever. */
+    int64_t n = v->len;
+    if (n <= 0 || n > ((int64_t)1 << 56)) return 0;
+    uint64_t cap = 8;
+    uint64_t want = (uint64_t)n * 2u;
+    while (cap < want) cap <<= 1;
+    uint64_t aux_bytes  = cap * 8u + (uint64_t)n * 8u;
+    uint64_t data_bytes = (uint64_t)n * (uint64_t)esz;
+    if (aux_bytes > 5u * data_bytes) return 0;
+
+    return 1;
+}
+
 static ray_t* csv_materialize_rows(const char* buf, size_t file_size,
                                    const int64_t* row_offsets, int64_t n_rows,
                                    int ncols, char delimiter,
@@ -1410,6 +1518,36 @@ static ray_t* csv_materialize_rows(const char* buf, size_t file_size,
         col_data[c] = dst;
     }
 
+    /* Per-chunk min/max + null bit on every column big enough to be worth
+     * indexing — gives the reduce min/max and the filter chunk-skip paths
+     * an O(n_chunks) scan instead of O(n_rows).  Attach is best-effort:
+     * unsupported types (RAY_STR/RAY_SYM/RAY_GUID in v1) just stay
+     * unindexed and the consumer falls back to a row scan.
+     *
+     * After the chunk_zone attaches we re-walk the same columns and
+     * replace it with a hash index on the high-entropy ones (there
+     * the chunk_zone only serves as the entropy signal).  See
+     * csv_should_attach_hash for the selectivity + memory cap. */
+    for (int c = 0; c < ncols; c++) {
+        ray_t* v = col_vecs[c];
+        if (!v || RAY_IS_ERR(v)) continue;
+        if (v->len < (1 << 16)) continue;        /* < one chunk, skip */
+        ray_t* r = ray_index_attach_chunk_zone(&v, 16);
+        if (r && !RAY_IS_ERR(r)) col_vecs[c] = v;  /* attach succeeded */
+        /* On failure the original column stays in col_vecs[c]; ignore. */
+    }
+    for (int c = 0; c < ncols; c++) {
+        ray_t* v = col_vecs[c];
+        if (!csv_should_attach_hash(v)) continue;
+        /* ray_index_attach_hash drops any existing index on the
+         * column first; the chunk_zone we just built is sacrificed
+         * for the hash.  That's the right trade — once the column
+         * is known to be high-entropy, chunk-skip never fires
+         * anyway, so the chunk_zone is dead weight. */
+        ray_t* r = ray_index_attach_hash(&v);
+        if (r && !RAY_IS_ERR(r)) col_vecs[c] = v;
+    }
+
     ray_t* tbl = ray_table_new(ncols);
     if (!tbl || RAY_IS_ERR(tbl)) {
         for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]);
@@ -1788,6 +1926,25 @@ ray_t* ray_read_csv_named_opts(const char* path, char delimiter, bool header,
 
     /* ---- 11. Build table ---- */
     {
+        /* Best-effort per-chunk zone index attach (see comment on the
+         * matching loop in csv_materialize_rows) — unsupported types
+         * fall through to the unindexed path inside the consumer.
+         * Second pass upgrades high-entropy columns to a hash index;
+         * see csv_should_attach_hash. */
+        for (int c = 0; c < ncols; c++) {
+            ray_t* v = col_vecs[c];
+            if (!v || RAY_IS_ERR(v)) continue;
+            if (v->len < (1 << 16)) continue;
+            ray_t* r = ray_index_attach_chunk_zone(&v, 16);
+            if (r && !RAY_IS_ERR(r)) col_vecs[c] = v;
+        }
+        for (int c = 0; c < ncols; c++) {
+            ray_t* v = col_vecs[c];
+            if (!csv_should_attach_hash(v)) continue;
+            ray_t* r = ray_index_attach_hash(&v);
+            if (r && !RAY_IS_ERR(r)) col_vecs[c] = v;
+        }
+
         ray_t* tbl = ray_table_new(ncols);
         if (!tbl || RAY_IS_ERR(tbl)) {
             for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]);
diff --git a/src/lang/env.c b/src/lang/env.c
index 125ced49..8bb2a50e 100644
--- a/src/lang/env.c
+++ b/src/lang/env.c
@@ -30,17 +30,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-static _Atomic uint64_t g_env_generation = 1;
-
-uint64_t ray_env_generation(void) {
-    return atomic_load_explicit(&g_env_generation, memory_order_relaxed);
-}
-
-static void env_bump_generation_if_user(int is_user) {
-    if (is_user)
-        atomic_fetch_add_explicit(&g_env_generation, 1, memory_order_relaxed);
-}
-
 /* ---- Function constructors ---- */
 
 /* Builtin name stored inline in nullmap[2..15] (max 13 chars + null).
@@ -311,7 +300,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {
                     g_env.user[j] = g_env.user[j + 1];
                 }
                 g_env.count--;
-                env_bump_generation_if_user(is_user);
                 env_unlock();
                 return RAY_OK;
             }
@@ -324,7 +312,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {
              * flag alone — once user, always user, until the slot is
              * deleted. */
             if (is_user) g_env.user[i] = 1;
-            env_bump_generation_if_user(is_user);
             env_unlock();
             return RAY_OK;
         }
@@ -342,7 +329,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {
     g_env.vals[g_env.count] = val;
     g_env.user[g_env.count] = is_user ? 1 : 0;
     g_env.count++;
-    env_bump_generation_if_user(is_user);
     env_unlock();
     return RAY_OK;
 }
diff --git a/src/lang/env.h b/src/lang/env.h
index 25170c2a..e92b5284 100644
--- a/src/lang/env.h
+++ b/src/lang/env.h
@@ -43,7 +43,6 @@ static inline const char* ray_fn_name(const ray_t* fn) {
 ray_err_t ray_env_init(void);
 void     ray_env_destroy(void);
 ray_t*    ray_env_get(int64_t sym_id);
-uint64_t  ray_env_generation(void);
 
 /* User-facing binder.  Refuses any name starting with `.` — that root is
  * reserved for system namespaces (.sys, .os, .io, .ipc, …) populated by
diff --git a/src/lang/eval.c b/src/lang/eval.c
index 2f6cac11..431d11bc 100644
--- a/src/lang/eval.c
+++ b/src/lang/eval.c
@@ -1480,116 +1480,9 @@ ray_t* ray_cond_fn(ray_t** args, int64_t n) {
     return make_i64(0);
 }
 
-static uint64_t do_cache_mix(uint64_t h, uint64_t v) {
-    h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2);
-    return h ? h : 0x9e3779b97f4a7c15ull;
-}
-
-static uint64_t do_cache_hash(ray_t* x) {
-    if (!x) return 0x1234abcd5678ef00ull;
-    uint64_t h = do_cache_mix(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type);
-    h = do_cache_mix(h, (uint64_t)x->attrs);
-    h = do_cache_mix(h, (x->type == -RAY_STR)
-                        ? (uint64_t)ray_str_len(x)
-                        : (uint64_t)x->len);
-    if (x->type == RAY_LIST) {
-        ray_t** elems = (ray_t**)ray_data(x);
-        for (int64_t i = 0; i < x->len; i++)
-            h = do_cache_mix(h, do_cache_hash(elems[i]));
-    } else if (x->type == RAY_DICT) {
-        h = do_cache_mix(h, do_cache_hash(ray_dict_keys(x)));
-        h = do_cache_mix(h, do_cache_hash(ray_dict_vals(x)));
-    } else if (x->type == RAY_STR) {
-        for (int64_t i = 0; i < x->len; i++) {
-            size_t n = 0;
-            const char* s = ray_str_vec_get(x, i, &n);
-            for (size_t j = 0; s && j < n; j++)
-                h = do_cache_mix(h, (unsigned char)s[j]);
-        }
-    } else if (x->type == -RAY_STR) {
-        const char* s = ray_str_ptr(x);
-        size_t n = ray_str_len(x);
-        for (size_t i = 0; s && i < n; i++)
-            h = do_cache_mix(h, (unsigned char)s[i]);
-    } else if (x->type == RAY_SYM || x->type == -RAY_SYM ||
-               x->type == RAY_I64 || x->type == -RAY_I64 ||
-               x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) {
-        h = do_cache_mix(h, (uint64_t)x->i64);
-    } else if (x->type == RAY_I32 || x->type == -RAY_I32 ||
-               x->type == RAY_DATE || x->type == -RAY_DATE ||
-               x->type == RAY_TIME || x->type == -RAY_TIME) {
-        h = do_cache_mix(h, (uint64_t)(uint32_t)x->i32);
-    } else if (x->type == RAY_I16 || x->type == -RAY_I16) {
-        h = do_cache_mix(h, (uint64_t)(uint16_t)x->i16);
-    } else if (x->type == RAY_U8 || x->type == -RAY_U8 ||
-               x->type == RAY_BOOL || x->type == -RAY_BOOL) {
-        h = do_cache_mix(h, (uint64_t)x->u8);
-    } else if (x->type == RAY_F64 || x->type == -RAY_F64) {
-        uint64_t bits = 0;
-        memcpy(&bits, &x->f64, sizeof(bits));
-        h = do_cache_mix(h, bits);
-    }
-    return h;
-}
-
-static bool do_cache_contains_set(ray_t* x) {
-    if (!x || x->type != RAY_LIST) return false;
-    ray_t** elems = (ray_t**)ray_data(x);
-    if (x->len > 0 && elems[0] && elems[0]->type == -RAY_SYM) {
-        ray_t* s = ray_sym_str(elems[0]->i64);
-        bool is_set = s && ray_str_len(s) == 3 &&
-                      memcmp(ray_str_ptr(s), "set", 3) == 0;
-        if (s) ray_release(s);
-        if (is_set) return true;
-    }
-    for (int64_t i = 0; i < x->len; i++)
-        if (do_cache_contains_set(elems[i]))
-            return true;
-    return false;
-}
-
-static bool do_cache_is_null_name(ray_t* x) {
-    if (!x || x->type != -RAY_SYM || !(x->attrs & RAY_ATTR_NAME)) return false;
-    ray_t* s = ray_sym_str(x->i64);
-    bool ok = s && ray_str_len(s) == 4 && memcmp(ray_str_ptr(s), "null", 4) == 0;
-    if (s) ray_release(s);
-    return ok;
-}
-
-#define DO_NULL_CACHE_N 2048
-static uint64_t g_do_null_cache[DO_NULL_CACHE_N];
-static uint64_t g_do_null_cache_env_gen[DO_NULL_CACHE_N];
-static uint16_t g_do_null_cache_next = 0;
-
-static bool do_null_cache_get(uint64_t hash) {
-    if (!hash) return false;
-    uint64_t env_gen = ray_env_generation();
-    for (uint16_t i = 0; i < DO_NULL_CACHE_N; i++)
-        if (g_do_null_cache[i] == hash &&
-            g_do_null_cache_env_gen[i] == env_gen)
-            return true;
-    return false;
-}
-
-static void do_null_cache_put(uint64_t hash) {
-    if (hash) {
-        uint16_t slot = g_do_null_cache_next++ % DO_NULL_CACHE_N;
-        g_do_null_cache[slot] = hash;
-        g_do_null_cache_env_gen[slot] = ray_env_generation();
-    }
-}
-
 /* (do expr1 expr2 ...) — evaluate in sequence, return last. Pushes local scope. */
 ray_t* ray_do_fn(ray_t** args, int64_t n) {
     if (n == 0) return make_i64(0);
-    uint64_t null_cache_hash = 0;
-    if (g_ray_profile.active &&
-        n == 2 && do_cache_is_null_name(args[1]) &&
-        !do_cache_contains_set(args[0])) {
-        null_cache_hash = do_cache_hash(args[0]);
-        if (do_null_cache_get(null_cache_hash))
-            return NULL;
-    }
     if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL);
     ray_t* result = NULL;
     for (int64_t i = 0; i < n; i++) {
@@ -1603,8 +1496,6 @@ ray_t* ray_do_fn(ray_t** args, int64_t n) {
         }
     }
     ray_env_pop_scope();
-    if (null_cache_hash && result == NULL)
-        do_null_cache_put(null_cache_hash);
     return result;
 }
 
diff --git a/src/mem/heap.c b/src/mem/heap.c
index d8ee3f29..231f3751 100644
--- a/src/mem/heap.c
+++ b/src/mem/heap.c
@@ -1262,7 +1262,11 @@ void ray_heap_destroy(void) {
  * -------------------------------------------------------------------------- */
 
 static void heap_return_foreign_freelist(ray_heap_t* h) {
+    /* avail bit (set on insert, cleared on remove) tells us which
+     * freelist orders have any blocks at all — skip the empty ones. */
+    if (!h->avail) return;
     for (int order = RAY_ORDER_MIN; order < RAY_HEAP_FL_SIZE; order++) {
+        if (!(h->avail & (1ULL << order))) continue;
         ray_fl_head_t* head = &h->freelist[order];
         ray_t* blk = head->fl_next;
         while (blk != (ray_t*)head) {
@@ -1473,11 +1477,21 @@ void ray_heap_gc(void) {
         /* Pass 5: Release physical pages from free blocks in every
          * idle heap.  Pass 2 may have returned blocks to worker-owned
          * freelists; releasing only the caller heap leaves those worker
-         * pages resident across large query repetitions. */
+         * pages resident across large query repetitions.
+         *
+         * Use each heap's avail bitmap (set on insert, cleared on
+         * remove) to skip the entire walk when no order >= 13 has any
+         * free block.  Tiny-query workloads — where the per-statement
+         * GC fires before any large allocation has been freed —
+         * complete pass 5 without entering the body. */
+        uint64_t large_orders_mask = ~((1ULL << 13) - 1);
         for (int hid = 0; hid < RAY_HEAP_REGISTRY_SIZE; hid++) {
             ray_heap_t* gh = ray_heap_registry[hid];
             if (!gh) continue;
+            uint64_t avail = gh->avail & large_orders_mask;
+            if (!avail) continue;
             for (int i = 13; i < RAY_HEAP_FL_SIZE; i++) {
+                if (!(avail & (1ULL << i))) continue;
                 ray_fl_head_t* head = &gh->freelist[i];
                 ray_t* blk = head->fl_next;
                 while (blk != (ray_t*)head) {
diff --git a/src/ops/agg.c b/src/ops/agg.c
index fee02d2e..34328522 100644
--- a/src/ops/agg.c
+++ b/src/ops/agg.c
@@ -23,6 +23,7 @@
 
 #include "lang/internal.h"
 #include "ops/ops.h"
+#include "ops/idxop.h"   /* RAY_IDX_CHUNK_ZONE fast path for min/max */
 #include "mem/heap.h"
 
 #include <stdlib.h>  /* qsort (introselect fallback) */
@@ -328,7 +329,43 @@ ray_t* ray_min_fn(ray_t* x) {
     if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MIN);
     if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 0);
     if (ray_is_atom(x)) { ray_retain(x); return x; }
-    if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_min_op);
+    if (ray_is_vec(x)) {
+        /* Per-chunk zone index fast path: O(n_chunks) instead of O(n_rows).
+         * Only valid when the index was built for the column's current len
+         * (mutation paths call ray_index_drop). */
+        if (ray_index_kind(x) == RAY_IDX_CHUNK_ZONE) {
+            ray_index_t* ix = ray_index_payload(x->index);
+            if (ix->built_for_len == x->len) {
+                uint32_t n_chunks = ix->u.chunk_zone.n_chunks;
+                if (ix->u.chunk_zone.is_f64) {
+                    const double* mins = (const double*)ray_data(ix->u.chunk_zone.mins);
+                    double mn = INFINITY;
+                    for (uint32_t g = 0; g < n_chunks; g++)
+                        if (mins[g] < mn) mn = mins[g];
+                    if (mn == INFINITY) return ray_typed_null(-RAY_F64);
+                    return make_f64(mn);
+                } else {
+                    const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins);
+                    int64_t mn = INT64_MAX;
+                    for (uint32_t g = 0; g < n_chunks; g++)
+                        if (mins[g] < mn) mn = mins[g];
+                    if (mn == INT64_MAX) return ray_typed_null(-x->type);
+                    /* Preserve the column's storage width on the result. */
+                    switch (x->type) {
+                    case RAY_BOOL:      return ray_bool((bool)mn);
+                    case RAY_U8:        return ray_u8((uint8_t)mn);
+                    case RAY_I16:       return ray_i16((int16_t)mn);
+                    case RAY_I32:       return ray_i32((int32_t)mn);
+                    case RAY_DATE:      return ray_date((int32_t)mn);
+                    case RAY_TIME:      return ray_time(mn);
+                    case RAY_TIMESTAMP: return ray_timestamp(mn);
+                    default:            return ray_i64(mn);
+                    }
+                }
+            }
+        }
+        AGG_VEC_VIA_DAG(x, ray_min_op);
+    }
     if (!is_list(x)) return ray_error("type", NULL);
     int64_t len = ray_len(x);
     if (len == 0) return ray_error("domain", NULL);
@@ -350,7 +387,39 @@ ray_t* ray_max_fn(ray_t* x) {
     if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MAX);
     if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 1);
     if (ray_is_atom(x)) { ray_retain(x); return x; }
-    if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_max_op);
+    if (ray_is_vec(x)) {
+        if (ray_index_kind(x) == RAY_IDX_CHUNK_ZONE) {
+            ray_index_t* ix = ray_index_payload(x->index);
+            if (ix->built_for_len == x->len) {
+                uint32_t n_chunks = ix->u.chunk_zone.n_chunks;
+                if (ix->u.chunk_zone.is_f64) {
+                    const double* maxs = (const double*)ray_data(ix->u.chunk_zone.maxs);
+                    double mx = -INFINITY;
+                    for (uint32_t g = 0; g < n_chunks; g++)
+                        if (maxs[g] > mx) mx = maxs[g];
+                    if (mx == -INFINITY) return ray_typed_null(-RAY_F64);
+                    return make_f64(mx);
+                } else {
+                    const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs);
+                    int64_t mx = INT64_MIN;
+                    for (uint32_t g = 0; g < n_chunks; g++)
+                        if (maxs[g] > mx) mx = maxs[g];
+                    if (mx == INT64_MIN) return ray_typed_null(-x->type);
+                    switch (x->type) {
+                    case RAY_BOOL:      return ray_bool((bool)mx);
+                    case RAY_U8:        return ray_u8((uint8_t)mx);
+                    case RAY_I16:       return ray_i16((int16_t)mx);
+                    case RAY_I32:       return ray_i32((int32_t)mx);
+                    case RAY_DATE:      return ray_date((int32_t)mx);
+                    case RAY_TIME:      return ray_time(mx);
+                    case RAY_TIMESTAMP: return ray_timestamp(mx);
+                    default:            return ray_i64(mx);
+                    }
+                }
+            }
+        }
+        AGG_VEC_VIA_DAG(x, ray_max_op);
+    }
     if (!is_list(x)) return ray_error("type", NULL);
     int64_t len = ray_len(x);
     if (len == 0) return ray_error("domain", NULL);
diff --git a/src/ops/exec.c b/src/ops/exec.c
index e30ebf97..efa90cf8 100644
--- a/src/ops/exec.c
+++ b/src/ops/exec.c
@@ -24,6 +24,7 @@
 #include "ops/internal.h"
 #include "ops/rowsel.h"
 #include "ops/fused_group.h"
+#include "ops/idxop.h"
 #include "mem/heap.h"
 #include "mem/sys.h"
 
@@ -856,6 +857,61 @@ static ray_t* exec_in(ray_graph_t* g, ray_op_t* op, ray_t* col, ray_t* set) {
  * Recursive executor
  * ============================================================================ */
 
+/* Recognise a hash-index point lookup.  Succeeds (returns 1 and
+ * fills *out_col / *out_key) only when `pred_op` is a two-input
+ * OP_EQ whose inputs are, in order, an OP_SCAN and an OP_CONST, the
+ * scan names a default-table column of g->table that is neither
+ * parted nor RAY_MAPCOMMON, has no nulls, and carries a RAY_IDX_HASH
+ * built for its current length, and the literal is an integer or
+ * temporal atom.  Any miss returns 0 and leaves the outputs untouched
+ * so the caller can fall back to the scan-based predicate. */
+static int hash_index_eq_decode(ray_graph_t* g, ray_op_t* pred_op,
+                                ray_t** out_col, int64_t* out_key) {
+    if (!pred_op) return 0;
+    if (pred_op->opcode != OP_EQ || pred_op->arity != 2) return 0;
+
+    ray_op_t* scan_op  = pred_op->inputs[0];
+    ray_op_t* const_op = pred_op->inputs[1];
+    if (!scan_op || !const_op) return 0;
+    if (scan_op->opcode != OP_SCAN) return 0;
+    if (const_op->opcode != OP_CONST) return 0;
+
+    ray_op_ext_t* scan_ext  = find_ext(g, scan_op->id);
+    ray_op_ext_t* const_ext = find_ext(g, const_op->id);
+    if (!scan_ext || !const_ext) return 0;
+    ray_t* lit = const_ext->literal;
+    if (!lit) return 0;
+
+    /* The scan's table id sits in the first two pad bytes; anything
+     * other than 0 refers to a non-default table, which we skip. */
+    uint16_t table_id = 0;
+    memcpy(&table_id, scan_ext->base.pad, sizeof(uint16_t));
+    if (table_id != 0 || !g->table) return 0;
+
+    ray_t* col = ray_table_get_col(g->table, scan_ext->sym);
+    if (!col || RAY_IS_ERR(col)) return 0;
+    if (RAY_IS_PARTED(col->type) || col->type == RAY_MAPCOMMON) return 0;
+    /* The hash chain skips null rows, so a nullable column could
+     * disagree with the null-aware compare; leave it to that path. */
+    if (col->attrs & RAY_ATTR_HAS_NULLS) return 0;
+    if (!ray_index_has(col) || ray_index_kind(col) != RAY_IDX_HASH) return 0;
+    ray_index_t* ix = ray_index_payload(col->index);
+    if (ix->built_for_len != col->len) return 0;   /* stale index */
+
+    /* Widen the literal to int64; floats / sym / str are not eligible. */
+    int64_t key = 0;
+    switch (lit->type) {
+    case -RAY_I64: case -RAY_TIMESTAMP:            key = lit->i64;           break;
+    case -RAY_I32: case -RAY_DATE: case -RAY_TIME: key = (int64_t)lit->i32;  break;
+    case -RAY_I16:                                 key = (int64_t)lit->i16;  break;
+    case -RAY_BOOL: case -RAY_U8:                  key = (int64_t)lit->b8;   break;
+    default:                                       return 0;
+    }
+    *out_col = col;
+    *out_key = key;
+    return 1;
+}
+
 /* Is this opcode a "heavy" pipeline breaker worth profiling? */
 static inline bool op_is_heavy(uint16_t opc) {
     return opc == OP_FILTER || opc == OP_SORT || opc == OP_GROUP ||
@@ -1122,8 +1178,31 @@ static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) {
             }
 
             ray_t* input = exec_node(g, op->inputs[0]);
-            ray_t* pred  = exec_node(g, op->inputs[1]);
-            if (!input || RAY_IS_ERR(input)) { if (pred && !RAY_IS_ERR(pred)) ray_release(pred); return input; }
+            if (!input || RAY_IS_ERR(input)) return input;
+            /* Hash-index point-lookup fast path: when the predicate is
+             * `col == K` on a column with RAY_IDX_HASH attached and
+             * built for the column's current length, install the
+             * matching rowsel on g->selection directly — bypasses
+             * both the O(rows) compare AND the O(rows) BOOL→rowsel
+             * scan.  Only fires for the lazy TABLE-input case with no
+             * pre-existing selection (the entry shape downstream
+             * group-by / sort already expects). */
+            if (input->type == RAY_TABLE && !g->selection) {
+                ray_t* col = NULL;
+                int64_t key = 0;
+                if (hash_index_eq_decode(g, op->inputs[1], &col, &key)) {
+                    ray_t* sel = ray_index_hash_eq_rowsel(col, key);
+                    if (sel) {
+                        g->selection = sel;
+                        return input;
+                    }
+                    /* sel == NULL: column was eligible at decode time
+                     * but allocation failed.  Fall through to the
+                     * scan path below — defensive (no functional
+                     * difference in the common case). */
+                }
+            }
+            ray_t* pred = exec_node(g, op->inputs[1]);
             if (!pred || RAY_IS_ERR(pred)) { ray_release(input); return pred; }
 
             /* Lazy filter: convert predicate to a rowsel (morsel-local
@@ -1362,7 +1441,7 @@ static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) {
                 }
                 ray_t* result = exec_sort(g, child_op, tbl, n);
                 if (sort_input != g->table) ray_release(sort_input);
-                if (result && !RAY_IS_ERR(result)) ray_heap_gc();
+                /* Top-level statement GC catches intermediates. */
                 return result;
             }
 
@@ -1431,7 +1510,7 @@ static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) {
                 ray_release(pred);
                 if (filter_input != saved_table)
                     ray_release(filter_input);
-                if (result && !RAY_IS_ERR(result)) ray_heap_gc();
+                /* Top-level statement GC catches intermediates. */
                 return result;
             } else {
                 input = exec_node(g, op->inputs[0]);
diff --git a/src/ops/expr.c b/src/ops/expr.c
index 49b4f9bc..07931bba 100644
--- a/src/ops/expr.c
+++ b/src/ops/expr.c
@@ -2115,11 +2115,22 @@ ray_t* exec_elementwise_binary(ray_graph_t* g, ray_op_t* op, ray_t* lhs, ray_t*
                      0, len);
     }
 
-    /* Null propagation from inputs */
-    if (op_propagates_null(op->opcode))
-        propagate_nulls_binary(lhs, rhs, result, l_scalar, r_scalar, len);
-    else
-        fix_null_comparisons(lhs, rhs, result, l_scalar, r_scalar, len, op->opcode);
+    /* Null propagation from inputs.  Skipped when str_resolved: we resolved
+     * a string constant to an integer sym id and compared it by value against
+     * a SYM column.  SYM columns carry no nulls (id 0 / the interned empty
+     * string is a real value — see ray_sym_init / ray_vec_is_null), and the
+     * resolved string atom must NOT be treated as null here.  Otherwise the
+     * empty-string literal "" — for which RAY_ATOM_IS_NULL is true (slen==0,
+     * obj==NULL) yet which resolves to the valid sym id 0 — would take the
+     * null-comparison fill: `!= col ""` passing every row and `== col ""`
+     * matching none, instead of selecting the empty-string rows by value
+     * (which silently drops a `(!= symcol "")` WHERE predicate). */
+    if (!str_resolved) {
+        if (op_propagates_null(op->opcode))
+            propagate_nulls_binary(lhs, rhs, result, l_scalar, r_scalar, len);
+        else
+            fix_null_comparisons(lhs, rhs, result, l_scalar, r_scalar, len, op->opcode);
+    }
 
     /* Div/mod: mark zero-divisor positions as null.
      * The morsel loop writes 0 for b==0 but can't set bitmap nulls. */
diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index 127b177f..a8a8e081 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -23,6 +23,7 @@
 
 #include "ops/fused_group.h"
 #include "ops/fused_pred.h" /* fp_pred_t / fp_compile_pred / fp_eval_pred */
+#include "ops/idxop.h"      /* RAY_IDX_CHUNK_ZONE chunk-skip in fp_eval_cmp */
 #include "lang/eval.h"      /* RAY_ATTR_NAME */
 #include "core/pool.h"      /* ray_pool_get / ray_pool_dispatch */
 
@@ -344,6 +345,72 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end,
         return;
     }
 
+    /* Chunk-zone fast path: if the column carries per-chunk min/max
+     * metadata and [start, end) fits inside a single chunk, decide the
+     * whole morsel from chunk extrema without reading a single value.
+     * Only integer/temporal comparisons (EQ/NE/LT/LE/GT/GE) — LIKE/IN
+     * have their own evaluators below and SYM ordering is rejected at
+     * compile time anyway.  The all-pass shortcut is gated on "no
+     * nulls in this chunk" because SQL `(x op c)` is FALSE/NULL when x
+     * is NULL; the all-fail shortcut needs no such guard. */
+    if (p->col_obj && (p->col_obj->attrs & RAY_ATTR_HAS_INDEX) &&
+        p->col_obj->index)
+    {
+        ray_index_t* ix = ray_index_payload(p->col_obj->index);
+        if (ix->kind == RAY_IDX_CHUNK_ZONE &&
+            ix->built_for_len == p->col_obj->len &&   /* index is not stale */
+            !ix->u.chunk_zone.is_f64 &&               /* extrema are read as int64 below */
+            (op == FP_EQ || op == FP_NE ||
+             op == FP_LT || op == FP_LE ||
+             op == FP_GT || op == FP_GE))
+        {
+            uint8_t log2 = ix->u.chunk_zone.chunk_log2;
+            int64_t s_ch = start >> log2;             /* chunk of the first row */
+            int64_t e_ch = (end - 1) >> log2;         /* chunk of the last row */
+            if (s_ch == e_ch && (uint32_t)s_ch < ix->u.chunk_zone.n_chunks) {
+                const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins);
+                const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs);
+                int64_t cmin = mins[s_ch], cmax = maxs[s_ch];
+                if (cmin <= cmax) {       /* skip empty (all-null) chunks */
+                    const uint8_t* nb = (const uint8_t*)ray_data(ix->u.chunk_zone.null_bits);
+                    bool has_nulls = (nb[s_ch >> 3] >> (s_ch & 7)) & 1u;   /* bit s_ch of the bitmap */
+                    int decision = -1;   /* 0=all-fail, 1=all-pass, -1=mixed */
+                    switch (op) {
+                    case FP_EQ:
+                        if (cval < cmin || cval > cmax)        decision = 0;
+                        else if (!has_nulls && cmin == cmax)   decision = 1;   /* constant chunk == cval */
+                        break;
+                    case FP_NE:
+                        if (!has_nulls && (cval < cmin || cval > cmax)) decision = 1;
+                        else if (cmin == cmax && cval == cmin)          decision = 0;
+                        break;
+                    case FP_LT:
+                        if (cmin >= cval)                      decision = 0;
+                        else if (!has_nulls && cmax < cval)    decision = 1;
+                        break;
+                    case FP_LE:
+                        if (cmin >  cval)                      decision = 0;
+                        else if (!has_nulls && cmax <= cval)   decision = 1;
+                        break;
+                    case FP_GT:
+                        if (cmax <= cval)                      decision = 0;
+                        else if (!has_nulls && cmin >  cval)   decision = 1;
+                        break;
+                    case FP_GE:
+                        if (cmax <  cval)                      decision = 0;
+                        else if (!has_nulls && cmin >= cval)   decision = 1;
+                        break;
+                    default: break;
+                    }
+                    if (decision >= 0) {
+                        memset(bits, (uint8_t)decision, (size_t)n);   /* every result byte gets 0 or 1 */
+                        return;
+                    }
+                }
+            }
+        }
+    }
+
     /* SYM low-card fold: const not in dict ⇒ EQ all-zero / NE all-one.
      * Ordering ops are rejected at compile for SYM, so unreachable here. */
     if (ct == RAY_SYM && !p->cval_in_dict) {
@@ -1087,6 +1154,37 @@ static uint32_t fp_i32_hash_slot(int32_t key, uint32_t mask) {
     return (uint32_t)h & mask;
 }
 
+static uint32_t fp_i64_hash_slot(int64_t key, uint32_t mask) {   /* key -> slot in [0, mask] */
+    uint64_t h = (uint64_t)key * 0x9E3779B97F4A7C15ULL;   /* multiplicative scramble */
+    h ^= h >> 33;                                         /* fold high bits into low bits */
+    h *= 0xC2B2AE3D27D4EB4FULL;
+    h ^= h >> 29;
+    return (uint32_t)h & mask;   /* callers in this file pass hcap - 1 with hcap = 16384 */
+}
+
+static void fp_i64_mg_rebuild(const int64_t* keys, const uint32_t* counts,
+                              uint32_t n, uint32_t* ht, uint32_t hcap) {   /* re-index keys[0..n) into ht */
+    memset(ht, 0, (size_t)hcap * sizeof(uint32_t));   /* 0 marks an empty slot */
+    uint32_t mask = hcap - 1;
+    for (uint32_t i = 0; i < n; i++) {
+        if (!counts[i]) continue;                      /* zero-count entries are not indexed */
+        uint32_t slot = fp_i64_hash_slot(keys[i], mask);
+        while (ht[slot]) slot = (slot + 1u) & mask;    /* linear probing to the next free slot */
+        ht[slot] = i + 1u;                             /* store index + 1 so that 0 stays "empty" */
+    }
+}
+
+static uint32_t fp_i64_mg_lookup(const int64_t* keys, const uint32_t* ht,
+                                 uint32_t hmask, int64_t key) {   /* find key among the candidates */
+    uint32_t slot = fp_i64_hash_slot(key, hmask);
+    while (ht[slot]) {                                 /* an empty slot ends the probe sequence */
+        uint32_t idx = ht[slot] - 1u;                  /* slots hold index + 1 */
+        if (keys[idx] == key) return idx + 1u;         /* hit: 1-based index into keys[] */
+        slot = (slot + 1u) & hmask;
+    }
+    return 0;                                          /* miss */
+}
+
 static void fp_i32_mg_rebuild(const int32_t* keys, const uint32_t* counts,
                               uint32_t n, uint32_t* ht, uint32_t hcap) {
     memset(ht, 0, (size_t)hcap * sizeof(uint32_t));
@@ -1247,6 +1345,146 @@ static ray_t* fp_try_i32_mg_top_count(const fp_par_ctx_t* ctx, int64_t nrows,
     return result;
 }
 
+/* I64 mirror of fp_try_i32_mg_top_count: top-K-by-count over an I64 or
+ * TIMESTAMP key column.  Pass 1 runs Misra-Gries with cap = 8192, which
+ * keeps every key with count > nrows / 8193; pass 2 exact-counts the
+ * survivors and a min-heap yields the K-th largest count as threshold.
+ * Returns NULL (caller falls back) when K > 1024 (heap size), nrows >
+ * UINT32_MAX (uint32_t counters), the bound fails or too few survive. */
+static ray_t* fp_try_i64_mg_top_count(const fp_par_ctx_t* ctx, int64_t nrows,
+                                      int64_t key_sym,
+                                      ray_group_emit_filter_t emit_filter) {
+    if (ctx->kt != RAY_I64 && ctx->kt != RAY_TIMESTAMP) return NULL;
+    if (ctx->pred.n_children != 0 || nrows <= 0 || nrows > (int64_t)UINT32_MAX ||
+        emit_filter.top_count_take <= 0 || emit_filter.top_count_take > 1024)
+        return NULL;
+
+    const uint32_t cap = 8192;
+    const uint32_t hcap = cap * 2u;
+    const int64_t* data = (const int64_t*)ctx->kbase;
+    ray_t *keys_hdr = NULL, *cnt_hdr = NULL, *exact_hdr = NULL, *ht_hdr = NULL;
+    int64_t* keys = (int64_t*)scratch_alloc(&keys_hdr, cap * sizeof(int64_t));
+    uint32_t* counts = (uint32_t*)scratch_calloc(&cnt_hdr, cap * sizeof(uint32_t));
+    uint32_t* exact = (uint32_t*)scratch_calloc(&exact_hdr, cap * sizeof(uint32_t));
+    uint32_t* ht = (uint32_t*)scratch_calloc(&ht_hdr, hcap * sizeof(uint32_t));
+    if (!keys || !counts || !exact || !ht) {
+        if (keys_hdr) scratch_free(keys_hdr);
+        if (cnt_hdr) scratch_free(cnt_hdr);
+        if (exact_hdr) scratch_free(exact_hdr);
+        if (ht_hdr) scratch_free(ht_hdr);
+        return NULL;
+    }
+
+    uint32_t n = 0;            /* live candidates in keys[] / counts[] */
+    uint32_t decrements = 0;   /* number of Misra-Gries decrement rounds */
+    uint32_t hmask = hcap - 1u;
+    for (int64_t r = 0; r < nrows; r++) {   /* pass 1: approximate heavy hitters */
+        int64_t key = data[r];
+        uint32_t found = fp_i64_mg_lookup(keys, ht, hmask, key);
+        if (found) {
+            counts[found - 1u]++;
+            continue;
+        }
+        if (n < cap) {
+            uint32_t idx = n++;
+            keys[idx] = key;
+            counts[idx] = 1;
+            uint32_t slot = fp_i64_hash_slot(key, hmask);
+            while (ht[slot]) slot = (slot + 1u) & hmask;
+            ht[slot] = idx + 1u;
+            continue;
+        }
+        uint32_t out = 0;   /* table full: decrement all, compact the survivors */
+        for (uint32_t i = 0; i < n; i++) {
+            uint32_t c = counts[i];
+            if (c > 1) {
+                counts[out] = c - 1u;
+                keys[out] = keys[i];
+                out++;
+            }
+        }
+        n = out;
+        decrements++;
+        fp_i64_mg_rebuild(keys, counts, n, ht, hcap);   /* indexes moved, so re-hash */
+    }
+
+    memset(exact, 0, cap * sizeof(uint32_t));
+    for (int64_t r = 0; r < nrows; r++) {   /* pass 2: exact counts of the survivors */
+        uint32_t found = fp_i64_mg_lookup(keys, ht, hmask, data[r]);
+        if (found) exact[found - 1u]++;
+    }
+
+    int64_t k_take = emit_filter.top_count_take;
+    /* 1 <= k_take <= 1024 (entry guard), so heap[] holds the full top-K. */
+    int64_t heap[1024];
+    int64_t heap_n = 0;
+    uint32_t nonzero = 0;
+    for (uint32_t i = 0; i < n; i++) {
+        if (!exact[i]) continue;
+        nonzero++;
+        fp_count_heap_consider(heap, &heap_n, k_take, (int64_t)exact[i]);
+    }
+    if (heap_n == 0) {
+        scratch_free(keys_hdr); scratch_free(cnt_hdr);
+        scratch_free(exact_hdr); scratch_free(ht_hdr);
+        return NULL;
+    }
+    int64_t keep_min = emit_filter.min_count_exclusive + 1;
+    if (heap_n == k_take && heap[0] > keep_min)
+        keep_min = heap[0];   /* heap[0] is read as the smallest retained count */
+
+    if (decrements && keep_min <= nrows / (int64_t)(cap + 1u)) {   /* threshold below the MG guarantee */
+        scratch_free(keys_hdr); scratch_free(cnt_hdr);
+        scratch_free(exact_hdr); scratch_free(ht_hdr);
+        return NULL;
+    }
+
+    uint32_t out_n = 0;
+    for (uint32_t i = 0; i < n; i++)
+        if ((int64_t)exact[i] >= keep_min) out_n++;
+    if (!out_n || (decrements && nonzero < (uint32_t)k_take)) {
+        scratch_free(keys_hdr); scratch_free(cnt_hdr);
+        scratch_free(exact_hdr); scratch_free(ht_hdr);
+        return NULL;
+    }
+
+    ray_t* k_out = ray_vec_new(ctx->kt, out_n);
+    ray_t* c_out = ray_vec_new(RAY_I64, out_n);
+    if (!k_out || !c_out || RAY_IS_ERR(k_out) || RAY_IS_ERR(c_out)) {
+        if (k_out && !RAY_IS_ERR(k_out)) ray_release(k_out);
+        if (c_out && !RAY_IS_ERR(c_out)) ray_release(c_out);
+        scratch_free(keys_hdr); scratch_free(cnt_hdr);
+        scratch_free(exact_hdr); scratch_free(ht_hdr);
+        return ray_error("oom", NULL);   /* output allocation failed: error, not fallback */
+    }
+    k_out->len = out_n;
+    c_out->len = out_n;
+    int64_t* kd = (int64_t*)ray_data(k_out);
+    int64_t* cd = (int64_t*)ray_data(c_out);
+    uint32_t oi = 0;
+    for (uint32_t i = 0; i < n; i++) {
+        if ((int64_t)exact[i] < keep_min) continue;
+        kd[oi] = keys[i];
+        cd[oi] = exact[i];
+        oi++;
+    }
+    scratch_free(keys_hdr); scratch_free(cnt_hdr);
+    scratch_free(exact_hdr); scratch_free(ht_hdr);
+
+    ray_t* result = ray_table_new(2);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(k_out);
+        ray_release(c_out);
+        return ray_error("oom", NULL);
+    }
+    int64_t cnt_sym = ray_sym_intern("count", 5);
+    result = ray_table_add_col(result, key_sym, k_out);
+    result = ray_table_add_col(result, cnt_sym, c_out);
+    ray_release(k_out);   /* drop the local references after the add_col calls */
+    ray_release(c_out);
+    return result;
+}
+
 static void fp_direct_count_fn(void* raw, uint32_t worker_id,
                                int64_t start, int64_t end) {
     fp_direct_count_ctx_t* c = (fp_direct_count_ctx_t*)raw;
@@ -1308,6 +1546,19 @@ static ray_t* fp_try_direct_count1(const fp_par_ctx_t* ctx, int64_t nrows,
             if (mg) return mg;
         }
         return NULL;
+    } else if (ctx->kt == RAY_I64 || ctx->kt == RAY_TIMESTAMP) {
+        /* I64/TIMESTAMP top-K via Misra-Gries.  The slot-array path
+         * used for I32/I16/U8/BOOL cannot span the 64-bit key domain
+         * (2^64 slots); MG with cap = 8192 candidates needs ~192 KB
+         * and exact-counts the survivors in a second pass.  Falls back
+         * to the partition path when the safety bound is violated. */
+        ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get();
+        if (emit_filter.enabled && emit_filter.agg_index == 0 &&
+            emit_filter.top_count_take > 0) {
+            ray_t* mg = fp_try_i64_mg_top_count(ctx, nrows, key_sym, emit_filter);
+            if (mg) return mg;
+        }
+        return NULL;
     } else if (ctx->kt == RAY_SYM) {
         uint64_t max_key = 0;
         for (int64_t i = 0; i < nrows; i++) {
@@ -2226,6 +2477,16 @@ static ray_t* exec_filtered_group_count1(ray_graph_t* g, ray_op_ext_t* ext,
 #define FP_MAX_AGGS 8
 #define FP_MAX_KEYS 16
 
+/* v2 path: per-(worker, partition) hash tables.  Each worker hashes its
+ * rows once and routes by MK_RADIX_PART(h) to one of MK_RADIX_P small
+ * shards rather than a single fat per-worker shard.  Smaller shards stay
+ * cache-resident; the merge step is per-partition and trivially parallel.
+ * Mirrors the design in group.c (radix_v2_phase1_fn / _phase2_fn). */
+#define MK_RADIX_BITS 5                          /* log2 of the partition count */
+#define MK_RADIX_P    (1u << MK_RADIX_BITS)      /* 32 partitions */
+#define MK_RADIX_MASK (MK_RADIX_P - 1u)          /* 0x1F */
+#define MK_RADIX_PART(h) (((uint32_t)((h) >> 16)) & MK_RADIX_MASK)   /* hash bits 16..20 pick the partition */
+
 typedef enum {
     MK_AGG_COUNT = 0,
     MK_AGG_SUM   = 1,
@@ -2295,7 +2556,8 @@ typedef struct {
     uint8_t     total_state;
     uint8_t     wide;        /* 1 when total_bytes > 8 (uses kv_hi side array) */
     /* Cool fields (only touched once per dispatch or in cold paths). */
-    mk_shard_t* shards;
+    mk_shard_t* shards;       /* v1: [n_workers] single shard per worker */
+    mk_shard_t* wpart_shards; /* v2: [n_workers * MK_RADIX_P] partitioned */
     uint64_t    init_cap;
     _Atomic(uint32_t) oom;
     mk_key_t    keys[FP_MAX_KEYS];
@@ -2552,77 +2814,589 @@ static int mk_find_i64_eq_child(const fp_pred_t* pred) {
     return -1;
 }
 
-static void mk_eq_i64_count_fn(void* raw, uint32_t worker_id,
-                               int64_t start, int64_t end) {
-    mk_eq_i64_count_ctx_t* fc = (mk_eq_i64_count_ctx_t*)raw;
-    mk_par_ctx_t* c = fc->ctx;
+/* Find the first FP_EQ predicate child whose column carries a fresh
+ * RAY_IDX_HASH, i.e. one servable by an O(matches) hash probe instead
+ * of an O(n) scan.  Checks (cf. hash_probe_setup in idxop.c): no fold,
+ * not SYM, no RAY_ATTR_HAS_NULLS, index built for the current length.
+ * Returns the child index, or -1 if none qualifies. */
+static int mk_find_hash_eq_child(const fp_pred_t* pred) {
+    for (uint8_t i = 0; i < pred->n_children; i++) {
+        const fp_cmp_t* cmp = &pred->children[i];
+        if (cmp->op != FP_EQ || cmp->fold != FP_FOLD_NONE) continue;
+        if (cmp->col_type == RAY_SYM) continue;  /* hash idx not attached to dict cols */
+        if (cmp->col_attrs & RAY_ATTR_HAS_NULLS) continue;
+        ray_t* co = cmp->col_obj;
+        if (!co || !ray_index_has(co)) continue;
+        if (ray_index_kind(co) != RAY_IDX_HASH) continue;
+        ray_index_t* ix = ray_index_payload(co->index);
+        if (ix->built_for_len != co->len) continue;   /* stale index: column length changed */
+        return (int)i;
+    }
+    return -1;
+}
+
+/* Walk the RAY_IDX_HASH bucket chain for `c->pred.children[eq_idx]` and
+ * upsert (via mk_count_upsert_row) every row whose value equals cval and
+ * that passes the remaining predicate children.  Replaces the O(n)
+ * mk_eq_i64_count_fn scan.  Always writes to shard 0 — the chain walk
+ * is not parallelised, since match counts on a point lookup are tiny
+ * and the dispatch overhead would dominate.  Sets c->oom on failure. */
+static void mk_eq_hash_count_fn(mk_par_ctx_t* c, uint8_t eq_idx) {
     if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
-    mk_shard_t* sh = &c->shards[worker_id];
+    mk_shard_t* sh = &c->shards[0];
     if (!sh->slots) {
         if (mk_shard_init(sh, c->init_cap, c->total_state, c->wide) != 0) {
             atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
             return;
         }
     }
-
-    const fp_cmp_t* eq = &c->pred.children[fc->eq_idx];
-    const int64_t* eq_col = (const int64_t*)eq->col_base;
-    int64_t eq_val = eq->cval;
-    for (int64_t row = start; row < end; row++) {
-        if (eq_col[row] != eq_val) continue;
-        uint8_t pass = 1;
-        for (uint8_t i = 0; i < c->pred.n_children; i++) {
-            if (i == fc->eq_idx) continue;
-            if (!fp_eval_cmp_one(&c->pred.children[i], row)) {
-                pass = 0;
-                break;
+    const fp_cmp_t* eq = &c->pred.children[eq_idx];
+    ray_t* col = eq->col_obj;
+    ray_index_t* ix = ray_index_payload(col->index);
+    const uint64_t mask  = ix->u.hash.mask;
+    const int64_t* tbl   = (const int64_t*)ray_data(ix->u.hash.table);
+    const int64_t* chn   = (const int64_t*)ray_data(ix->u.hash.chain);
+    int64_t key = eq->cval;
+
+    /* Recompute the same hash the builder used.  numeric_key_word for
+     * an int* column zero/sign-extends to int64 then runs mix64 over
+     * the bit pattern.  We match by width here. */
+    uint64_t kbits;
+    switch (eq->col_esz) {
+    case 1:  kbits = (uint64_t)(uint8_t)key;             break;   /* 1-byte: zero-extend */
+    case 2:  kbits = (uint64_t)(int64_t)(int16_t)key;    break;   /* 2-byte: sign-extend */
+    case 4:  kbits = (uint64_t)(int64_t)(int32_t)key;    break;   /* 4-byte: sign-extend */
+    default: kbits = (uint64_t)key;                      break;
+    }
+    /* mix64 inline — match idxop.c:mix64 byte-for-byte. */
+    uint64_t h = kbits;
+    h ^= h >> 30; h *= 0xbf58476d1ce4e5b9ULL;
+    h ^= h >> 27; h *= 0x94d049bb133111ebULL;
+    h ^= h >> 31;
+    int64_t rid = tbl[h & mask] - 1;   /* entries are row id + 1; 0 ends the chain */
+
+    while (rid >= 0) {
+        if (fp_cmp_read_i64_at(eq, rid) == key) {   /* a bucket can hold other keys too */
+            uint8_t pass = 1;
+            for (uint8_t i = 0; i < c->pred.n_children; i++) {
+                if (i == eq_idx) continue;
+                if (!fp_eval_cmp_one(&c->pred.children[i], rid)) {
+                    pass = 0;
+                    break;
+                }
+            }
+            if (pass) {
+                if (mk_count_upsert_row(c, sh, rid) != 0) {
+                    atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                    return;
+                }
             }
         }
-        if (!pass) continue;
-        if (mk_count_upsert_row(c, sh, row) != 0) {
+        rid = chn[rid] - 1;   /* next row of the bucket, also stored as id + 1 */
+    }
+}
+
+/* mk_par worker analog: walk the hash chain instead of scanning rows.
+ * For each matching row that passes the remaining predicate children,
+ * upsert into shard 0 and run the per-agg accumulate inline.  This
+ * mirrors mk_par_fn's PASS-1 / PASS-2 split but per-row (matches are
+ * sparse, so a morsel-shaped batch is overkill — match count is
+ * usually < 10).  Runs on a single thread for the same reason. */
+static void mk_par_hash_fn(mk_par_ctx_t* c, uint8_t eq_idx) {
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    mk_shard_t* sh = &c->shards[0];   /* all matches go to shard 0 */
+    uint8_t wide        = c->wide;
+    uint8_t total_state = c->total_state;
+    uint8_t n_aggs      = c->n_aggs;
+    if (!sh->slots) {
+        if (mk_shard_init(sh, c->init_cap, total_state, wide) != 0) {
             atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
             return;
         }
     }
+    const fp_cmp_t* eq = &c->pred.children[eq_idx];
+    ray_t* col = eq->col_obj;
+    ray_index_t* ix = ray_index_payload(col->index);
+    const uint64_t mask = ix->u.hash.mask;
+    const int64_t* tbl  = (const int64_t*)ray_data(ix->u.hash.table);
+    const int64_t* chn  = (const int64_t*)ray_data(ix->u.hash.chain);
+    int64_t key = eq->cval;
+
+    uint64_t kbits;   /* key widened by column element size before hashing */
+    switch (eq->col_esz) {
+    case 1:  kbits = (uint64_t)(uint8_t)key;             break;
+    case 2:  kbits = (uint64_t)(int64_t)(int16_t)key;    break;
+    case 4:  kbits = (uint64_t)(int64_t)(int32_t)key;    break;
+    default: kbits = (uint64_t)key;                      break;
+    }
+    uint64_t h = kbits;   /* same mixing steps as in mk_eq_hash_count_fn */
+    h ^= h >> 30; h *= 0xbf58476d1ce4e5b9ULL;
+    h ^= h >> 27; h *= 0x94d049bb133111ebULL;
+    h ^= h >> 31;
+    int64_t rid = tbl[h & mask] - 1;   /* entries are row id + 1; 0 ends the chain */
+
+    while (rid >= 0) {
+        if (fp_cmp_read_i64_at(eq, rid) == key) {
+            uint8_t pass = 1;
+            for (uint8_t i = 0; i < c->pred.n_children; i++) {
+                if (i == eq_idx) continue;
+                if (!fp_eval_cmp_one(&c->pred.children[i], rid)) {
+                    pass = 0;
+                    break;
+                }
+            }
+            if (pass) {
+                /* Grow check + HT probe + per-agg accumulate.  Single
+                 * row at a time (no morsel batching) — matches are
+                 * sparse, and the existing batched path's per-batch
+                 * shard-grow loop would still re-fire here. */
+                if (sh->n_filled + 1 > (int64_t)(sh->cap / 2)) {
+                    if (mk_shard_grow(sh, total_state, wide) != 0) {
+                        atomic_store_explicit(&c->oom, 1,
+                                              memory_order_relaxed);
+                        return;
+                    }
+                }
+                int64_t* slots = sh->slots;   /* read after the grow check above */
+                int64_t* state = sh->state;
+                uint64_t shm = sh->mask;
+                uint64_t s;
+                if (!wide) {
+                    int64_t kv = mk_compose_key(c, rid);
+                    uint64_t hk = (uint64_t)kv * 0x9E3779B97F4A7C15ULL;
+                    hk ^= hk >> 33;
+                    s = hk & shm;
+                    for (;;) {
+                        if (!slots[s * 2]) {   /* empty slot: claim it for this key */
+                            slots[s * 2]     = 1;
+                            slots[s * 2 + 1] = kv;
+                            int64_t* st = &state[s * total_state];
+                            for (uint8_t a = 0; a < n_aggs; a++) {
+                                const mk_agg_t* ag = &c->aggs[a];
+                                switch (ag->kind) {
+                                case MK_AGG_COUNT:
+                                case MK_AGG_SUM:
+                                    st[ag->state_off] = 0; break;
+                                case MK_AGG_MIN:
+                                    st[ag->state_off] = INT64_MAX; break;
+                                case MK_AGG_MAX:
+                                    st[ag->state_off] = INT64_MIN; break;
+                                case MK_AGG_AVG:
+                                    st[ag->state_off    ] = 0;
+                                    st[ag->state_off + 1] = 0; break;
+                                }
+                            }
+                            sh->n_filled++;
+                            break;
+                        }
+                        if (slots[s * 2 + 1] == kv) break;   /* existing group */
+                        s = (s + 1) & shm;                   /* linear probe */
+                    }
+                } else {
+                    int64_t kv_lo, kv_hi;
+                    mk_compose_key2(c, rid, &kv_lo, &kv_hi);
+                    uint64_t hk = mk_hash_lo_hi(kv_lo, kv_hi);
+                    s = hk & shm;
+                    int64_t* slots_hi = sh->slots_hi;
+                    for (;;) {
+                        if (!slots[s * 2]) {   /* empty slot: claim it for this key pair */
+                            slots[s * 2]     = 1;
+                            slots[s * 2 + 1] = kv_lo;
+                            slots_hi[s]      = kv_hi;
+                            int64_t* st = &state[s * total_state];
+                            for (uint8_t a = 0; a < n_aggs; a++) {
+                                const mk_agg_t* ag = &c->aggs[a];
+                                switch (ag->kind) {
+                                case MK_AGG_COUNT:
+                                case MK_AGG_SUM:
+                                    st[ag->state_off] = 0; break;
+                                case MK_AGG_MIN:
+                                    st[ag->state_off] = INT64_MAX; break;
+                                case MK_AGG_MAX:
+                                    st[ag->state_off] = INT64_MIN; break;
+                                case MK_AGG_AVG:
+                                    st[ag->state_off    ] = 0;
+                                    st[ag->state_off + 1] = 0; break;
+                                }
+                            }
+                            sh->n_filled++;
+                            break;
+                        }
+                        if (slots[s * 2 + 1] == kv_lo &&
+                            slots_hi[s] == kv_hi) break;   /* both key halves match */
+                        s = (s + 1) & shm;
+                    }
+                }
+                /* Per-agg accumulate for this row. */
+                int64_t* st = &state[s * total_state];
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    const mk_agg_t* ag = &c->aggs[a];
+                    uint8_t off = ag->state_off;
+                    switch (ag->kind) {
+                    case MK_AGG_COUNT:
+                        st[off]++;
+                        break;
+                    case MK_AGG_SUM: {
+                        int64_t v = mk_read_agg_i64(ag, rid);
+                        st[off] += v;
+                        break;
+                    }
+                    case MK_AGG_MIN: {
+                        int64_t v = mk_read_agg_i64(ag, rid);
+                        if (v < st[off]) st[off] = v;
+                        break;
+                    }
+                    case MK_AGG_MAX: {
+                        int64_t v = mk_read_agg_i64(ag, rid);
+                        if (v > st[off]) st[off] = v;
+                        break;
+                    }
+                    case MK_AGG_AVG: {
+                        int64_t v = mk_read_agg_i64(ag, rid);
+                        st[off    ] += v;   /* running sum */
+                        st[off + 1] += 1;   /* running row count */
+                        break;
+                    }
+                    }
+                }
+            }
+        }
+        rid = chn[rid] - 1;   /* next row of the bucket, also stored as id + 1 */
+    }
 }
 
-/* ─── Worker fn — chunked vectorised aggregate update ───────────────
- *
- * Per morsel we run two passes:
- *
- *   PASS 1 (probe): linear-probe the HT for every passing row.  On a
- *   new slot we initialize the per-agg state to a per-kind sentinel
- *   (0 for COUNT/SUM/AVG-sum, 0 for AVG-count, INT64_MAX for MIN,
- *   INT64_MIN for MAX) so the accumulate-only update logic in pass 2
- *   produces the correct first value without a separate "first row"
- *   branch.  Pass 1 fills slot_idx[i] (HT slot for the i-th passing row)
- *   and src_rows[i] (source row index) into stack-resident arrays.
- *
- *   PASS 2 (update): for each aggregate, run a tight per-agg loop over
- *   match_count entries.  No per-row switch dispatch — the kind switch
- *   is hoisted out of the loop, so each loop body is a single
- *   accumulate operation against state[slot_idx[i] * total + off].
- *
- * Probe-then-update-per-aggregate eliminates the O(rows × aggs) branch
- * dispatch the prior per-row update did. */
-static void mk_par_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) {
-    mk_par_ctx_t* c = (mk_par_ctx_t*)raw;
+static void mk_eq_i64_count_fn(void* raw, uint32_t worker_id,
+                               int64_t start, int64_t end) {   /* count-upsert rows of [start, end) that match */
+    mk_eq_i64_count_ctx_t* fc = (mk_eq_i64_count_ctx_t*)raw;
+    mk_par_ctx_t* c = fc->ctx;
     if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
     mk_shard_t* sh = &c->shards[worker_id];
-    uint8_t  wide        = c->wide;
     if (!sh->slots) {
-        if (mk_shard_init(sh, c->init_cap, c->total_state, wide) != 0) {
+        if (mk_shard_init(sh, c->init_cap, c->total_state, c->wide) != 0) {
             atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
             return;
         }
     }
 
-    uint8_t  total_state = c->total_state;
-    uint8_t  n_aggs      = c->n_aggs;
+    const fp_cmp_t* eq = &c->pred.children[fc->eq_idx];
+    const int64_t* eq_col = (const int64_t*)eq->col_base;
+    int64_t eq_val = eq->cval;
 
-    int64_t row = start;
-    while (row < end) {
-        int64_t mend = row + RAY_MORSEL_ELEMS;
+    /* Chunk-skip: for each predicate child whose column carries a
+     * chunk_zone index, walk the row range in chunk strides and skip
+     * any chunk where the child's [min, max] proves an all-fail.  For
+     * clustered columns (e.g. data sorted by CounterID, EventDate) this
+     * eliminates the per-row RefererHash/URLHash read for ~all chunks
+     * outside the matching counter / date range — q40/q41/q42 pattern.
+     * Picks chunk_log2 from any indexed child (every chunk_zone built
+     * by csv.read uses the same chunk_log2 today).  Falls through to
+     * the plain per-row loop when no child has a usable index. */
+    uint8_t chunk_log2 = 0;   /* 0 doubles as "no usable chunk_zone index" */
+    for (uint8_t i = 0; i < c->pred.n_children; i++) {
+        ray_t* co = c->pred.children[i].col_obj;
+        if (co && (co->attrs & RAY_ATTR_HAS_INDEX) && co->index) {
+            ray_index_t* ix = ray_index_payload(co->index);
+            if (ix->kind == RAY_IDX_CHUNK_ZONE &&
+                ix->built_for_len == co->len) {
+                chunk_log2 = ix->u.chunk_zone.chunk_log2;
+                break;
+            }
+        }
+    }
+
+    int64_t row = start;
+    while (row < end) {
+        int64_t chunk_end;
+        if (chunk_log2 > 0) {
+            /* Exclusive end of the chunk that contains `row`; clamped to
+             * this worker slice just below. */
+            chunk_end = ((row >> chunk_log2) + 1) << chunk_log2;
+            if (chunk_end > end) chunk_end = end;
+            bool all_fail = false;
+            for (uint8_t i = 0; i < c->pred.n_children && !all_fail; i++) {
+                const fp_cmp_t* p = &c->pred.children[i];
+                ray_t* co = p->col_obj;
+                if (!co || !(co->attrs & RAY_ATTR_HAS_INDEX) || !co->index)
+                    continue;
+                ray_index_t* ix = ray_index_payload(co->index);
+                if (ix->kind != RAY_IDX_CHUNK_ZONE ||
+                    ix->built_for_len != co->len ||
+                    ix->u.chunk_zone.chunk_log2 != chunk_log2 ||
+                    ix->u.chunk_zone.is_f64)
+                    continue;
+                fp_op_t op = p->op;
+                if (op != FP_EQ && op != FP_NE && op != FP_LT &&
+                    op != FP_LE && op != FP_GT && op != FP_GE)
+                    continue;
+                int64_t s_ch = row >> chunk_log2;
+                if ((uint32_t)s_ch >= ix->u.chunk_zone.n_chunks) continue;
+                const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins);
+                const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs);
+                int64_t cmin = mins[s_ch], cmax = maxs[s_ch];
+                if (cmin > cmax) continue;   /* empty chunk */
+                int64_t cv = p->cval;
+                switch (op) {
+                case FP_EQ: if (cv < cmin || cv > cmax) all_fail = true; break;
+                case FP_NE: if (cmin == cmax && cv == cmin) all_fail = true; break;
+                case FP_LT: if (cmin >= cv) all_fail = true; break;
+                case FP_LE: if (cmin >  cv) all_fail = true; break;
+                case FP_GT: if (cmax <= cv) all_fail = true; break;
+                case FP_GE: if (cmax <  cv) all_fail = true; break;
+                default: break;
+                }
+            }
+            if (all_fail) { row = chunk_end; continue; }   /* no row of this chunk can pass */
+        } else {
+            chunk_end = end;   /* no index: scan the whole slice in one go */
+        }
+
+        for (; row < chunk_end; row++) {
+            if (eq_col[row] != eq_val) continue;
+            uint8_t pass = 1;
+            for (uint8_t i = 0; i < c->pred.n_children; i++) {
+                if (i == fc->eq_idx) continue;
+                if (!fp_eval_cmp_one(&c->pred.children[i], row)) {
+                    pass = 0;
+                    break;
+                }
+            }
+            if (!pass) continue;
+            if (mk_count_upsert_row(c, sh, row) != 0) {
+                atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                return;
+            }
+        }
+    }
+}
+
+/* ─── v2 worker fn — per-(worker, partition) shards ─────────────────
+ *
+ * Like mk_par_fn but routes every passing row by RADIX_PART(hash) into
+ * one of MK_RADIX_P small per-(worker, partition) shards.  Each small
+ * shard stays cache-resident as it fills, so the probe never walks a
+ * 5–10 MB monolithic per-worker shard.  Pass-1 (probe) and pass-2
+ * (agg update) are fused per-row here: any partition may grow on any
+ * row, so a deferred pass-2 over recorded slot indexes would dereference
+ * stale slots after a rehash.  Combine merges per partition. */
+static inline void mk_v2_apply_agg_inline(mk_par_ctx_t* c, int64_t* state_slot,
+                                          int64_t source_row,
+                                          uint8_t n_aggs, uint8_t total_state)
+{   /* fold source_row into one group state, one aggregate at a time */
+    (void)total_state;   /* parameter is not used in this body */
+    for (uint8_t a = 0; a < n_aggs; a++) {
+        const mk_agg_t* ag = &c->aggs[a];
+        uint8_t off = ag->state_off;   /* offset of this aggregate inside state_slot */
+        switch (ag->kind) {
+        case MK_AGG_COUNT:
+            state_slot[off]++;
+            break;
+        case MK_AGG_SUM: {
+            int64_t v = mk_read_agg_i64(ag, source_row);
+            state_slot[off] += v;
+            break;
+        }
+        case MK_AGG_MIN: {
+            int64_t v = mk_read_agg_i64(ag, source_row);
+            if (v < state_slot[off]) state_slot[off] = v;
+            break;
+        }
+        case MK_AGG_MAX: {
+            int64_t v = mk_read_agg_i64(ag, source_row);
+            if (v > state_slot[off]) state_slot[off] = v;
+            break;
+        }
+        case MK_AGG_AVG: {
+            int64_t v = mk_read_agg_i64(ag, source_row);
+            state_slot[off    ] += v;   /* running sum */
+            state_slot[off + 1] += 1;   /* running row count */
+            break;
+        }
+        }
+    }
+}
+
+static void mk_par_v2_fn(void* raw, uint32_t worker_id,
+                         int64_t start, int64_t end)
+{
+    mk_par_ctx_t* c = (mk_par_ctx_t*)raw;   /* context shared by all workers */
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    uint8_t wide        = c->wide;          /* nonzero: keys carry a hi half */
+    uint8_t total_state = c->total_state;   /* int64 cells per group state */
+    uint8_t n_aggs      = c->n_aggs;
+    mk_shard_t* my_shards = &c->wpart_shards[(size_t)worker_id * MK_RADIX_P];
+
+    /* Eager partition init.  Upfront cost: MK_RADIX_P × init_cap shards
+     * per worker (~256 × 256 × ~30 B = 2 MB for 4-slot state per worker;
+     * 16 MB across 8 workers — comfortably L3-resident).  Saves a per-row
+     * branch (~10M iterations on q31/q32-class queries) for the rest of
+     * the scan.  ray_pool_dispatch reuses the same task across morsel
+     * slices but assigns a fresh worker_id per task call, so guard with
+     * the slots check so re-entry skips. */
+    for (uint32_t p = 0; p < MK_RADIX_P; p++) {
+        if (my_shards[p].slots) continue;   /* already set up by an earlier call */
+        if (mk_shard_init(&my_shards[p], c->init_cap,
+                          total_state, wide) != 0) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;   /* flag the failure and abandon this slice */
+        }
+    }
+
+    int64_t row = start;
+    while (row < end) {
+        int64_t mend = row + RAY_MORSEL_ELEMS;   /* morsel = [row, mend) */
+        if (mend > end) mend = end;
+        int64_t mlen = mend - row;
+        uint8_t bits[RAY_MORSEL_ELEMS];          /* bits[r] != 0: row + r passes */
+        fp_eval_pred(&c->pred, row, mend, bits);
+
+        int match_count = 0;                     /* sum of bits[]; only tested against 0 */
+        for (int64_t r = 0; r < mlen; r++) match_count += bits[r];
+        if (match_count == 0) { row = mend; continue; }
+        int64_t base_row = row;
+
+        if (!wide) {   /* narrow path: the whole key is one int64 */
+            for (int64_t r = 0; r < mlen; r++) {
+                if (!bits[r]) continue;
+                int64_t source_row = base_row + r;
+                int64_t kv = mk_compose_key(c, source_row);
+                uint64_t h = (uint64_t)kv * 0x9E3779B97F4A7C15ULL;
+                h ^= h >> 33;                    /* fold high bits into the low bits */
+                uint32_t p = MK_RADIX_PART(h);   /* partition is derived from the hash */
+                mk_shard_t* sh = &my_shards[p];
+                /* Grow before probing if one more entry would pass half of cap. */
+                if (sh->n_filled + 1 > (int64_t)(sh->cap / 2)) {
+                    if (mk_shard_grow(sh, total_state, wide) != 0) {
+                        atomic_store_explicit(&c->oom, 1,
+                                              memory_order_relaxed);
+                        return;
+                    }
+                }
+                int64_t* slots = sh->slots;      /* loaded after the grow check */
+                int64_t* state = sh->state;
+                uint64_t mask  = sh->mask;
+                uint64_t s = h & mask;
+                for (;;) {
+                    if (!slots[s * 2]) {         /* empty slot: claim it for kv */
+                        slots[s * 2]     = 1;    /* occupancy flag */
+                        slots[s * 2 + 1] = kv;
+                        int64_t* st = &state[s * total_state];
+                        /* Seed every agg so the first update yields the row's value. */
+                        for (uint8_t a = 0; a < n_aggs; a++) {
+                            const mk_agg_t* ag = &c->aggs[a];
+                            switch (ag->kind) {
+                            case MK_AGG_COUNT:
+                            case MK_AGG_SUM:
+                                st[ag->state_off] = 0; break;
+                            case MK_AGG_MIN:
+                                st[ag->state_off] = INT64_MAX; break;
+                            case MK_AGG_MAX:
+                                st[ag->state_off] = INT64_MIN; break;
+                            case MK_AGG_AVG:
+                                st[ag->state_off    ] = 0;
+                                st[ag->state_off + 1] = 0; break;
+                            }
+                        }
+                        sh->n_filled++;
+                        break;
+                    }
+                    if (slots[s * 2 + 1] == kv) break;   /* existing group */
+                    s = (s + 1) & mask;                  /* next slot, wrapped by mask */
+                }
+                /* s now names the group's slot; fold this row in right away. */
+                mk_v2_apply_agg_inline(c, &state[s * total_state],
+                                       source_row, n_aggs, total_state);
+            }
+        } else {       /* wide path: key is a (lo, hi) pair, hi kept in slots_hi */
+            for (int64_t r = 0; r < mlen; r++) {
+                if (!bits[r]) continue;
+                int64_t source_row = base_row + r;
+                int64_t kv_lo, kv_hi;
+                mk_compose_key2(c, source_row, &kv_lo, &kv_hi);
+                uint64_t h = mk_hash_lo_hi(kv_lo, kv_hi);
+                uint32_t p = MK_RADIX_PART(h);
+                mk_shard_t* sh = &my_shards[p];
+                /* Same grow-before-probe rule as the narrow path. */
+                if (sh->n_filled + 1 > (int64_t)(sh->cap / 2)) {
+                    if (mk_shard_grow(sh, total_state, wide) != 0) {
+                        atomic_store_explicit(&c->oom, 1,
+                                              memory_order_relaxed);
+                        return;
+                    }
+                }
+                int64_t* slots = sh->slots;      /* loaded after the grow check */
+                int64_t* slots_hi = sh->slots_hi;
+                int64_t* state = sh->state;
+                uint64_t mask  = sh->mask;
+                uint64_t s = h & mask;
+                for (;;) {
+                    if (!slots[s * 2]) {         /* empty slot: claim it for (lo, hi) */
+                        slots[s * 2]     = 1;
+                        slots[s * 2 + 1] = kv_lo;
+                        slots_hi[s]      = kv_hi;
+                        int64_t* st = &state[s * total_state];
+                        /* Same per-kind seeding as the narrow path. */
+                        for (uint8_t a = 0; a < n_aggs; a++) {
+                            const mk_agg_t* ag = &c->aggs[a];
+                            switch (ag->kind) {
+                            case MK_AGG_COUNT:
+                            case MK_AGG_SUM:
+                                st[ag->state_off] = 0; break;
+                            case MK_AGG_MIN:
+                                st[ag->state_off] = INT64_MAX; break;
+                            case MK_AGG_MAX:
+                                st[ag->state_off] = INT64_MIN; break;
+                            case MK_AGG_AVG:
+                                st[ag->state_off    ] = 0;
+                                st[ag->state_off + 1] = 0; break;
+                            }
+                        }
+                        sh->n_filled++;
+                        break;
+                    }
+                    /* A hit needs both key halves to match. */
+                    if (slots[s * 2 + 1] == kv_lo && slots_hi[s] == kv_hi) break;
+                    s = (s + 1) & mask;
+                }
+                mk_v2_apply_agg_inline(c, &state[s * total_state],
+                                       source_row, n_aggs, total_state);
+            }
+        }
+
+        row = mend;   /* advance to the next morsel */
+    }
+}
+
+/* ─── Worker fn — chunked vectorised aggregate update ───────────────
+ *
+ * Per morsel we run two passes:
+ *
+ *   PASS 1 (probe): linear-probe the HT for every passing row.  On a
+ *   new slot we initialize the per-agg state to a per-kind sentinel
+ *   (0 for COUNT/SUM/AVG-sum, 0 for AVG-count, INT64_MAX for MIN,
+ *   INT64_MIN for MAX) so the accumulate-only update logic in pass 2
+ *   produces the correct first value without a separate "first row"
+ *   branch.  Pass 1 fills slot_idx[i] (HT slot for the i-th passing row)
+ *   and src_rows[i] (source row index) into stack-resident arrays.
+ *
+ *   PASS 2 (update): for each aggregate, run a tight per-agg loop over
+ *   match_count entries.  No per-row switch dispatch — the kind switch
+ *   is hoisted out of the loop, so each loop body is a single
+ *   accumulate operation against state[slot_idx[i] * total + off].
+ *
+ * Probe-then-update-per-aggregate eliminates the O(rows × aggs) branch
+ * dispatch the prior per-row update did. */
+static void mk_par_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) {
+    mk_par_ctx_t* c = (mk_par_ctx_t*)raw;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    mk_shard_t* sh = &c->shards[worker_id];
+    uint8_t  wide        = c->wide;
+    if (!sh->slots) {
+        if (mk_shard_init(sh, c->init_cap, c->total_state, wide) != 0) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+    }
+
+    uint8_t  total_state = c->total_state;
+    uint8_t  n_aggs      = c->n_aggs;
+
+    int64_t row = start;
+    while (row < end) {
+        int64_t mend = row + RAY_MORSEL_ELEMS;
         if (mend > end) mend = end;
         int64_t mlen = mend - row;
         uint8_t bits[RAY_MORSEL_ELEMS];
@@ -2830,49 +3604,122 @@ static void mk_apply_count_emit_filter(const mk_par_ctx_t* c,
                                        int64_t* gs, int64_t* gst,
                                        int64_t gcap, int64_t* global_n)
 {
+    /* Two-mode emit-filter pass over the deduped (gs, gst) layout:
+     *
+     *  1. min_count_exclusive (heavy-hitter): drop rows whose COUNT
+     *     value is at or below the threshold.  Only fires for COUNT.
+     *
+     *  2. top_count_take (top-N): drop rows that aren't in the top-N
+     *     ordered by the configured agg op (COUNT/SUM/MIN/MAX).  Both
+     *     desc (largest N) and asc (smallest N) are supported.  The
+     *     producer (query.c's match_group_desc_count_take) sets
+     *     emit_filter.agg_op and emit_filter.desc accordingly; an
+     *     unset agg_op defaults to OP_COUNT for the historical
+     *     single-mode filter.
+     *
+     * AVG / STDDEV / VAR / PEARSON / MEDIAN are excluded — their
+     * ordering doesn't reduce to a single int64 row-slot read, so
+     * filters over those aggs must fall back to the post-materialize
+     * sort + take path.  SYM-typed MIN/MAX are similarly excluded
+     * because the stored value is an interned id whose natural order
+     * is not the lexicographic order users expect (a mismatch only
+     * relevant when the desc:/asc: orders the output). */
     ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get();
     if (!emit_filter.enabled || emit_filter.agg_index >= c->n_aggs)
         return;
 
-    const mk_agg_t* count_agg = &c->aggs[emit_filter.agg_index];
-    if (count_agg->kind != MK_AGG_COUNT)
+    const mk_agg_t* order_agg = &c->aggs[emit_filter.agg_index];
+    uint16_t order_op = emit_filter.agg_op
+        ? emit_filter.agg_op
+        : (uint16_t)OP_COUNT;
+    /* min_count_exclusive remains COUNT-only — it represents a
+     * heavy-hitter threshold inherited from the WHERE clause and
+     * doesn't generalize to SUM/MIN/MAX semantics. */
+    int64_t keep_min = (order_op == OP_COUNT)
+        ? emit_filter.min_count_exclusive + 1
+        : 1;
+    int64_t k_take = emit_filter.top_count_take;
+    uint8_t desc_dir = emit_filter.desc;
+    if (order_op == OP_COUNT && !emit_filter.desc) desc_dir = 1;
+
+    /* Map order_op → mk_agg kind, reject incompatible shapes. */
+    if (order_op == OP_COUNT) {
+        if (order_agg->kind != MK_AGG_COUNT) return;
+    } else if (order_op == OP_SUM) {
+        if (order_agg->kind != MK_AGG_SUM) return;
+    } else if (order_op == OP_MIN) {
+        if (order_agg->kind != MK_AGG_MIN) return;
+        if (order_agg->in_type == RAY_SYM) return;
+    } else if (order_op == OP_MAX) {
+        if (order_agg->kind != MK_AGG_MAX) return;
+        if (order_agg->in_type == RAY_SYM) return;
+    } else {
         return;
+    }
 
-    int64_t keep_min = emit_filter.min_count_exclusive + 1;
-    int64_t k_take = emit_filter.top_count_take;
     if (k_take > 0 && k_take < *global_n) {
         ray_t* heap_hdr = NULL;
         int64_t* heap = (int64_t*)scratch_alloc(&heap_hdr,
                                                 (size_t)k_take * sizeof(int64_t));
         if (heap) {
             int64_t heap_n = 0;
+            /* For desc (top-N largest): min-heap, root = smallest.
+             * For asc  (top-N smallest): max-heap, root = largest. */
+            #define MK_TOPN_NEEDS_SWAP(parent, child) \
+                (desc_dir ? ((parent) > (child)) : ((parent) < (child)))
+            #define MK_TOPN_SHOULD_REPLACE(nv, rv) \
+                (desc_dir ? ((nv) > (rv)) : ((nv) < (rv)))
             for (int64_t s = 0; s < gcap; s++) {
                 if (!gs[s * 2]) continue;
-                int64_t cnt = gst[(size_t)s * c->total_state + count_agg->state_off];
+                int64_t v = gst[(size_t)s * c->total_state + order_agg->state_off];
                 if (heap_n < k_take) {
                     int64_t j = heap_n++;
-                    heap[j] = cnt;
+                    heap[j] = v;
                     while (j > 0) {
                         int64_t p = (j - 1) >> 1;
-                        if (heap[p] <= heap[j]) break;
+                        if (!MK_TOPN_NEEDS_SWAP(heap[p], heap[j])) break;
                         int64_t tmp = heap[p]; heap[p] = heap[j]; heap[j] = tmp;
                         j = p;
                     }
-                } else if (cnt > heap[0]) {
-                    heap[0] = cnt;
+                } else if (MK_TOPN_SHOULD_REPLACE(v, heap[0])) {
+                    heap[0] = v;
                     int64_t j = 0;
                     for (;;) {
                         int64_t l = j * 2 + 1, r = l + 1, m = j;
-                        if (l < heap_n && heap[l] < heap[m]) m = l;
-                        if (r < heap_n && heap[r] < heap[m]) m = r;
+                        if (l < heap_n && MK_TOPN_NEEDS_SWAP(heap[m], heap[l])) m = l;
+                        if (r < heap_n && MK_TOPN_NEEDS_SWAP(heap[m], heap[r])) m = r;
                         if (m == j) break;
                         int64_t tmp = heap[m]; heap[m] = heap[j]; heap[j] = tmp;
                         j = m;
                     }
                 }
             }
-            if (heap_n == k_take && heap[0] > keep_min)
-                keep_min = heap[0];
+            #undef MK_TOPN_NEEDS_SWAP
+            #undef MK_TOPN_SHOULD_REPLACE
+            if (heap_n == k_take) {
+                /* heap[0] is the worst surviving value.  Compute a
+                 * scalar threshold so the compaction sweep below can
+                 * read it without checking direction per row. */
+                int64_t threshold = heap[0];
+                int64_t kept = 0;
+                for (int64_t s = 0; s < gcap; s++) {
+                    if (!gs[s * 2]) continue;
+                    int64_t v = gst[(size_t)s * c->total_state + order_agg->state_off];
+                    bool survives = desc_dir ? (v >= threshold) : (v <= threshold);
+                    if (!survives) {
+                        gs[s * 2] = 0;
+                    } else if (order_op == OP_COUNT && v < keep_min) {
+                        /* min_count_exclusive threshold combines with top-N
+                         * by AND — drop rows that fail either. */
+                        gs[s * 2] = 0;
+                    } else {
+                        kept++;
+                    }
+                }
+                *global_n = kept;
+                scratch_free(heap_hdr);
+                return;
+            }
             scratch_free(heap_hdr);
         }
     }
@@ -2883,7 +3730,7 @@ static void mk_apply_count_emit_filter(const mk_par_ctx_t* c,
     int64_t kept = 0;
     for (int64_t s = 0; s < gcap; s++) {
         if (!gs[s * 2]) continue;
-        int64_t cnt = gst[(size_t)s * c->total_state + count_agg->state_off];
+        int64_t cnt = gst[(size_t)s * c->total_state + order_agg->state_off];
         if (cnt < keep_min) {
             gs[s * 2] = 0;
         } else {
@@ -3364,6 +4211,320 @@ static int mk_combine_parallel(mk_par_ctx_t* c, uint32_t nw,
     return 1;
 }
 
+/* ─── v2 per-partition combine ──────────────────────────────────────
+ *
+ * Shards in c->wpart_shards are already RADIX-partitioned (each holds
+ * only entries whose hash routes to that partition).  The v1 combine
+ * had to histogram + scatter before per-partition dedup; here we go
+ * straight to per-partition dedup — task p just walks all workers'
+ * shard at index w*MK_RADIX_P+p and merges into a single target HT.
+ * Per-partition tasks are fully independent: each task only writes
+ * to its own target HT and its own slot in the part_* arrays. */
+
+typedef struct {
+    mk_par_ctx_t*     ctx;           /* scan context; merge reads ctx->wpart_shards */
+    uint32_t          nw;            /* workers merged; shard (w,p) = [w*MK_RADIX_P + p] */
+    uint8_t           total_state;   /* int64 cells per group state */
+    uint8_t           wide;          /* nonzero: keys carry a hi half */
+    const mk_agg_t*   aggs;          /* agg descriptors handed to mk_state_merge */
+    uint8_t           n_aggs;        /* entries in aggs */
+    /* Per-partition output buffers (MK_RADIX_P slots). */
+    int64_t**         part_keys;     /* [P]: kv_lo array, size part_n[p] */
+    int64_t**         part_keys_hi;  /* [P]: kv_hi array, NULL when narrow */
+    int64_t**         part_states;   /* [P]: state[part_n[p] * total_state] */
+    ray_t**           part_keys_hdr;    /* [P]: scratch header for part_keys[p] */
+    ray_t**           part_keys_hi_hdr; /* [P]: header for part_keys_hi[p]; NULL when narrow */
+    ray_t**           part_states_hdr;  /* [P]: scratch header for part_states[p] */
+    int64_t*          part_n;        /* [P]: merged group count of partition p */
+    _Atomic(uint32_t) oom;           /* set to 1 by any task that fails to allocate */
+} mk_combine_v2_ctx_t;
+
+static void mk_combine_v2_part_fn(void* vctx, uint32_t worker_id,
+                                  int64_t start, int64_t end)
+{
+    (void)worker_id;   /* unused: every output is indexed by partition */
+    mk_combine_v2_ctx_t* c = (mk_combine_v2_ctx_t*)vctx;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    uint8_t total_state = c->total_state;
+    uint8_t wide        = c->wide;
+    uint8_t n_aggs      = c->n_aggs;
+    uint32_t nw         = c->nw;
+
+    for (int64_t p = start; p < end; p++) {   /* p: partition index in [start, end) */
+        /* Upper bound on the merged partition: sum of worker fills (some
+         * keys may appear in multiple workers; the merge folds those, so
+         * final n_filled ≤ total). */
+        int64_t total = 0;
+        for (uint32_t w = 0; w < nw; w++) {
+            total += c->ctx->wpart_shards[(size_t)w * MK_RADIX_P + p].n_filled;
+        }
+        if (total == 0) {   /* empty partition: no output arrays are allocated */
+            c->part_n[p] = 0;
+            continue;
+        }
+
+        /* Target HT: pow-of-2 cap ≥ max(256, 2 * total), so load ≤ 0.5. */
+        uint64_t cap = 256;
+        while (cap < (uint64_t)(total * 2)) cap <<= 1;
+
+        mk_shard_t target;   /* task-local table; freed before moving to the next p */
+        memset(&target, 0, sizeof(target));
+        if (mk_shard_init(&target, cap, total_state, wide) != 0) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;   /* the driver checks c->oom after the dispatch */
+        }
+
+        /* Merge each worker's shard for this partition into target. */
+        for (uint32_t w = 0; w < nw; w++) {
+            mk_shard_t* src = &c->ctx->wpart_shards[(size_t)w * MK_RADIX_P + p];
+            if (!src->slots) continue;   /* no slot array to read from */
+            int64_t* src_slots = src->slots;
+            int64_t* src_slots_hi = src->slots_hi;
+            int64_t* src_state = src->state;
+            uint64_t src_cap = src->cap;
+            int64_t* tgt_slots = target.slots;
+            int64_t* tgt_slots_hi = target.slots_hi;
+            int64_t* tgt_state = target.state;
+            uint64_t tgt_mask = target.mask;
+
+            for (uint64_t s = 0; s < src_cap; s++) {
+                if (!src_slots[s * 2]) continue;   /* slot pair = {occupied flag, kv_lo} */
+                int64_t kv_lo = src_slots[s * 2 + 1];
+                int64_t kv_hi = wide ? src_slots_hi[s] : 0;
+                uint64_t h;   /* recomputed with the same mix mk_par_v2_fn uses */
+                if (wide) {
+                    h = mk_hash_lo_hi(kv_lo, kv_hi);
+                } else {
+                    h = (uint64_t)kv_lo * 0x9E3779B97F4A7C15ULL;
+                    h ^= h >> 33;
+                }
+                uint64_t t = h & tgt_mask;
+                const int64_t* sst = &src_state[s * total_state];
+                for (;;) {
+                    if (!tgt_slots[t * 2]) {   /* first sighting: copy the state verbatim */
+                        tgt_slots[t * 2]     = 1;
+                        tgt_slots[t * 2 + 1] = kv_lo;
+                        if (wide) tgt_slots_hi[t] = kv_hi;
+                        int64_t* dst = &tgt_state[t * total_state];
+                        for (uint8_t k = 0; k < total_state; k++)
+                            dst[k] = sst[k];
+                        target.n_filled++;
+                        break;
+                    }
+                    if (tgt_slots[t * 2 + 1] == kv_lo &&
+                        (!wide || tgt_slots_hi[t] == kv_hi))
+                    {   /* key seen from another worker: fold the two states */
+                        mk_state_merge(&tgt_state[t * total_state],
+                                       sst, c->aggs, n_aggs);
+                        break;
+                    }
+                    t = (t + 1) & tgt_mask;   /* next slot, wrapped by mask */
+                }
+            }
+        }
+
+        /* Pack target into dense per-partition output arrays. */
+        int64_t pn = target.n_filled;
+        c->part_n[p] = pn;
+        c->part_keys[p] = (int64_t*)scratch_alloc(
+            &c->part_keys_hdr[p], (size_t)pn * sizeof(int64_t));
+        if (wide) {
+            c->part_keys_hi[p] = (int64_t*)scratch_alloc(
+                &c->part_keys_hi_hdr[p], (size_t)pn * sizeof(int64_t));
+        }
+        c->part_states[p] = (int64_t*)scratch_alloc(
+            &c->part_states_hdr[p],
+            (size_t)pn * total_state * sizeof(int64_t));
+        if (!c->part_keys[p] || (wide && !c->part_keys_hi[p]) ||
+            !c->part_states[p])
+        {   /* arrays that did allocate are not freed here; driver frees non-NULL hdrs */
+            mk_shard_free(&target);
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+        int64_t gi = 0;   /* next dense output row */
+        int64_t* tgt_slots = target.slots;
+        int64_t* tgt_slots_hi = target.slots_hi;
+        int64_t* tgt_state = target.state;
+        for (uint64_t t = 0; t < target.cap; t++) {
+            if (!tgt_slots[t * 2]) continue;   /* skip empty target slots */
+            c->part_keys[p][gi] = tgt_slots[t * 2 + 1];
+            if (wide) c->part_keys_hi[p][gi] = tgt_slots_hi[t];
+            const int64_t* src = &tgt_state[t * total_state];
+            int64_t* dst = &c->part_states[p][gi * total_state];
+            for (uint8_t k = 0; k < total_state; k++) dst[k] = src[k];
+            gi++;
+        }
+
+        mk_shard_free(&target);   /* the packed arrays now hold the result */
+    }
+}
+
+/* Driver for the v2 per-partition combine.  Runs mk_combine_v2_part_fn
+ * over all MK_RADIX_P partitions, then concatenates the per-partition
+ * results into a dense gs/gst layout identical to mk_combine_parallel.
+ *
+ * Returns 1 on success with every out_* written (*out_gs_hi is NULL for
+ * narrow keys; *out_gcap equals *out_global_n).  Returns 0 on failure
+ * with out_* untouched and the buffers allocated for the combine
+ * released, so the caller can fall back to the slow path. */
+static int mk_combine_v2_parallel(mk_par_ctx_t* c, uint32_t nw,
+                                  int64_t** out_gs, ray_t** out_gs_hdr,
+                                  int64_t** out_gs_hi, ray_t** out_gs_hi_hdr,
+                                  int64_t** out_gst, ray_t** out_gst_hdr,
+                                  int64_t* out_gcap, int64_t* out_global_n)
+{
+    const uint8_t total_state = c->total_state;
+    const uint8_t wide        = c->wide;
+    const size_t  ptr_tab_sz  = (size_t)MK_RADIX_P * sizeof(int64_t*);
+    const size_t  hdr_tab_sz  = (size_t)MK_RADIX_P * sizeof(ray_t*);
+    ray_pool_t*   pool        = ray_pool_get();
+    int           ok          = 0;
+
+    /* MK_RADIX_P-entry side tables: per-partition data pointers, the
+     * scratch headers that own them, and the merged group counts.
+     * The *_hi tables exist only for wide keys. */
+    ray_t* pk_hdr   = NULL;
+    ray_t* pkhi_hdr = NULL;
+    ray_t* ps_hdr   = NULL;
+    ray_t* pkh_hdr  = NULL;
+    ray_t* pkhh_hdr = NULL;
+    ray_t* psh_hdr  = NULL;
+    ray_t* pn_hdr   = NULL;
+    int64_t** part_keys        = NULL;
+    int64_t** part_keys_hi     = NULL;
+    int64_t** part_states      = NULL;
+    ray_t**   part_keys_hdr    = NULL;
+    ray_t**   part_keys_hi_hdr = NULL;
+    ray_t**   part_states_hdr  = NULL;
+    int64_t*  part_n           = NULL;
+
+    part_keys = (int64_t**)scratch_calloc(&pk_hdr, ptr_tab_sz);
+    if (wide)
+        part_keys_hi = (int64_t**)scratch_calloc(&pkhi_hdr, ptr_tab_sz);
+    part_states = (int64_t**)scratch_calloc(&ps_hdr, ptr_tab_sz);
+    part_keys_hdr = (ray_t**)scratch_calloc(&pkh_hdr, hdr_tab_sz);
+    if (wide)
+        part_keys_hi_hdr = (ray_t**)scratch_calloc(&pkhh_hdr, hdr_tab_sz);
+    part_states_hdr = (ray_t**)scratch_calloc(&psh_hdr, hdr_tab_sz);
+    part_n = (int64_t*)scratch_calloc(
+        &pn_hdr, (size_t)MK_RADIX_P * sizeof(int64_t));
+
+    if (!part_keys || !part_states || !part_keys_hdr ||
+        !part_states_hdr || !part_n ||
+        (wide && (!part_keys_hi || !part_keys_hi_hdr)))
+        goto free_tables;
+
+    {
+        /* Phase 1: merge every partition, through the pool when it has
+         * at least two workers, inline on this thread otherwise. */
+        mk_combine_v2_ctx_t pctx = {
+            .ctx              = c,
+            .nw               = nw,
+            .total_state      = total_state,
+            .wide             = wide,
+            .aggs             = c->aggs,
+            .n_aggs           = c->n_aggs,
+            .part_keys        = part_keys,
+            .part_keys_hi     = part_keys_hi,
+            .part_states      = part_states,
+            .part_keys_hdr    = part_keys_hdr,
+            .part_keys_hi_hdr = part_keys_hi_hdr,
+            .part_states_hdr  = part_states_hdr,
+            .part_n           = part_n,
+            .oom              = 0,
+        };
+
+        if (pool && ray_pool_total_workers(pool) >= 2)
+            ray_pool_dispatch_n(pool, mk_combine_v2_part_fn, &pctx,
+                                (uint32_t)MK_RADIX_P);
+        else
+            mk_combine_v2_part_fn(&pctx, 0, 0, (int64_t)MK_RADIX_P);
+
+        if (atomic_load_explicit(&pctx.oom, memory_order_relaxed))
+            goto free_parts;
+    }
+
+    {
+        /* Phase 2: concatenate the partition outputs, in partition
+         * order, into dense gs / gs_hi / gst arrays. */
+        int64_t global_n = 0;
+        for (uint64_t p = 0; p < MK_RADIX_P; p++)
+            global_n += part_n[p];
+
+        ray_t* gs_hdr    = NULL;
+        ray_t* gs_hi_hdr = NULL;
+        ray_t* gst_hdr   = NULL;
+        int64_t* gs = (int64_t*)scratch_calloc(
+            &gs_hdr, (size_t)global_n * 2 * sizeof(int64_t));
+        int64_t* gs_hi = NULL;
+        if (wide)
+            gs_hi = (int64_t*)scratch_alloc(
+                &gs_hi_hdr, (size_t)global_n * sizeof(int64_t));
+        int64_t* gst = (int64_t*)scratch_alloc(
+            &gst_hdr, (size_t)global_n * total_state * sizeof(int64_t));
+        if (!gs || (wide && !gs_hi) || !gst) {
+            if (gs_hdr)    scratch_free(gs_hdr);
+            if (gs_hi_hdr) scratch_free(gs_hi_hdr);
+            if (gst_hdr)   scratch_free(gst_hdr);
+            goto free_parts;
+        }
+
+        int64_t out = 0;   /* next free row of the dense output */
+        for (uint64_t p = 0; p < MK_RADIX_P; p++) {
+            const int64_t cnt = part_n[p];
+            if (cnt == 0) continue;
+            const int64_t* keys    = part_keys[p];
+            const int64_t* keys_hi = part_keys_hi ? part_keys_hi[p] : NULL;
+            const int64_t* states  = part_states[p];
+            for (int64_t i = 0; i < cnt; i++, out++) {
+                gs[out * 2]     = 1;
+                gs[out * 2 + 1] = keys[i];
+                if (wide) gs_hi[out] = keys_hi[i];
+                int64_t* dst = &gst[out * total_state];
+                const int64_t* src = &states[i * total_state];
+                for (uint8_t k = 0; k < total_state; k++) dst[k] = src[k];
+            }
+            /* Partition p is fully copied; drop its buffers right away. */
+            if (part_keys_hdr[p])    scratch_free(part_keys_hdr[p]);
+            if (part_keys_hi_hdr && part_keys_hi_hdr[p])
+                scratch_free(part_keys_hi_hdr[p]);
+            if (part_states_hdr[p])  scratch_free(part_states_hdr[p]);
+        }
+
+        *out_gs        = gs;
+        *out_gs_hdr    = gs_hdr;
+        *out_gs_hi     = gs_hi;
+        *out_gs_hi_hdr = gs_hi_hdr;
+        *out_gst       = gst;
+        *out_gst_hdr   = gst_hdr;
+        *out_gcap      = global_n;
+        *out_global_n  = global_n;
+        ok = 1;
+        goto free_tables;   /* partition buffers were freed in the loop */
+    }
+
+free_parts:
+    /* Failure after the merge ran: release whichever partition outputs
+     * the tasks managed to allocate. */
+    for (uint64_t p = 0; p < MK_RADIX_P; p++) {
+        if (part_keys_hdr[p])    scratch_free(part_keys_hdr[p]);
+        if (part_keys_hi_hdr && part_keys_hi_hdr[p])
+            scratch_free(part_keys_hi_hdr[p]);
+        if (part_states_hdr[p])  scratch_free(part_states_hdr[p]);
+    }
+
+free_tables:
+    if (pk_hdr)   scratch_free(pk_hdr);
+    if (pkhi_hdr) scratch_free(pkhi_hdr);
+    if (ps_hdr)   scratch_free(ps_hdr);
+    if (pkh_hdr)  scratch_free(pkh_hdr);
+    if (pkhh_hdr) scratch_free(pkhh_hdr);
+    if (psh_hdr)  scratch_free(psh_hdr);
+    if (pn_hdr)   scratch_free(pn_hdr);
+    return ok;
+}
+
 static ray_t* mk_combine_and_materialize(mk_par_ctx_t* c, uint32_t nw,
                                          const uint16_t* agg_op_ids)
 {
@@ -3377,7 +4538,13 @@ static ray_t* mk_combine_and_materialize(mk_par_ctx_t* c, uint32_t nw,
     for (uint32_t w = 0; w < nw; w++) total_local += shards[w].n_filled;
 
     /* Try parallel combine first.  On success, jump straight to the
-     * materialize section with the already-built gs/gs_hi/gst arrays. */
+     * materialize section with the already-built gs/gs_hi/gst arrays.
+     *
+     * v2 path: when wpart_shards is set, shards are pre-partitioned by
+     * RADIX_PART(h).  mk_combine_v2_parallel skips the histogram/scatter
+     * passes entirely — each partition is dedupped independently and
+     * the per-(worker, partition) shards already have the right entries.
+     * v1 path: mk_combine_parallel histogram+scatter+dedup. */
     int64_t* gs    = NULL;
     int64_t* gs_hi = NULL;
     int64_t* gst   = NULL;
@@ -3386,11 +4553,30 @@ static ray_t* mk_combine_and_materialize(mk_par_ctx_t* c, uint32_t nw,
     ray_t*   gst_hdr   = NULL;
     int64_t  gcap     = 0;
     int64_t  global_n = 0;
-    int parallel_ok = mk_combine_parallel(c, nw,
+    int parallel_ok = 0;
+    /* v2 combine target HT scales with per-partition cardinality
+     * (total_local / MK_RADIX_P).  For very-high-card queries (q32:
+     * ~10M unique groups → ~313K per partition → ~1 M-slot HT × 32
+     * partitions ≈ 768 MB allocated) the per-partition HTs blow the
+     * working set out of cache; v1's scatter-then-dedup is bounded
+     * by smaller per-combine-partition slices and wins.  ~16 K
+     * entries per partition keeps each target HT in L2 (~1.5 MB
+     * with 4-slot state). */
+    int v2_combine_ok = c->wpart_shards != NULL &&
+        ((uint64_t)total_local / MK_RADIX_P) <= (1ULL << 14);
+    if (v2_combine_ok) {
+        parallel_ok = mk_combine_v2_parallel(c, nw / MK_RADIX_P,
+                                             &gs, &gs_hdr,
+                                             &gs_hi, &gs_hi_hdr,
+                                             &gst, &gst_hdr,
+                                             &gcap, &global_n);
+    } else {
+        parallel_ok = mk_combine_parallel(c, nw,
                                           &gs, &gs_hdr,
                                           &gs_hi, &gs_hi_hdr,
                                           &gst, &gst_hdr,
                                           &gcap, &global_n);
+    }
     if (parallel_ok) goto materialize;
 
     {
@@ -3698,27 +4884,100 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext,
     }
     if (nrows < 0) return ray_error("nyi", NULL);
 
-    ctx.init_cap = FP_SHARD_INIT_CAP;
     atomic_store_explicit(&ctx.oom, 0, memory_order_relaxed);
     ray_pool_t* pool = ray_pool_get();
     uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    ray_t* shards_hdr = NULL;
-    ctx.shards = (mk_shard_t*)scratch_calloc(&shards_hdr,
-                                             (size_t)nw * sizeof(mk_shard_t));
-    if (!ctx.shards) return ray_error("oom", NULL);
 
     int eq_i64_idx = -1;
     if (ctx.n_aggs == 1 && ctx.aggs[0].kind == MK_AGG_COUNT &&
         ctx.pred.n_children > 1) {
         eq_i64_idx = mk_find_i64_eq_child(&ctx.pred);
     }
-    if (eq_i64_idx >= 0) {
+    /* Hash-index probe: if any FP_EQ child sits on a column with a
+     * fresh RAY_IDX_HASH, walk the chain instead of scanning rows.
+     * Single-thread — match counts on a point lookup are too small
+     * to justify pool dispatch.
+     *
+     * Multi-predicate filters fall through: queries that combine a
+     * hash-indexed eq with one or more other predicates (e.g. a
+     * chunk-zone-clustered CounterID/EventDate range) win more from
+     * the parallel chunk-skip scan in mk_eq_i64_count_fn /
+     * mk_par_fn than from a hash chain walk forced into single-
+     * threaded execution. */
+    int hash_eq_idx = (ctx.pred.n_children == 1)
+                          ? mk_find_hash_eq_child(&ctx.pred)
+                          : -1;
+
+    /* v2 gate: pre-partitioned shards win on high-cardinality multi-key
+     * group-bys (q30/q31/q32 family) by keeping each per-(worker,
+     * partition) shard cache-resident.  Exclude shapes where v1's
+     * existing fast paths already win:
+     *   - hash-eq or eq_i64 chunk-skip scans (single-shard inserts)
+     *   - n_aggs == 0 (degenerate)
+     *   - n_keys == 1: v1's hot k0_base path is already L1-friendly
+     *   - SYM keys: existing tuned SYM path beats v2 (q33/q34)
+     *   - nullable agg input: v1's existing nullmask path; v2 does not
+     *     yet track per-agg null counts during merge
+     * Multi-key with COUNT/SUM/AVG aggs (no MIN/MAX): the v2 partition
+     * shards cleanly merge by summing state slots. */
+    bool v2_ok = (hash_eq_idx < 0 && eq_i64_idx < 0 &&
+                  ctx.n_aggs >= 1 && ctx.n_keys >= 2);
+    for (uint8_t k = 0; k < ctx.n_keys && v2_ok; k++) {
+        if (ctx.keys[k].type == RAY_SYM) v2_ok = false;
+    }
+    for (uint8_t a = 0; a < ctx.n_aggs && v2_ok; a++) {
+        mk_agg_kind_t kk = ctx.aggs[a].kind;
+        if (kk != MK_AGG_COUNT && kk != MK_AGG_SUM && kk != MK_AGG_AVG) {
+            v2_ok = false;
+        }
+        if (ctx.aggs[a].in_attrs & RAY_ATTR_HAS_NULLS) v2_ok = false;
+    }
+
+    /* Init capacity per shard.
+     * v1 (single shard per worker): pre-size to a fraction of nrows so
+     * high-cardinality scans pay fewer rehashes.
+     * v2 (MK_RADIX_P shards per worker): each partition holds roughly
+     * 1/MK_RADIX_P of the worker's groups.  Start at 256 slots (the
+     * group.c v2 design: ~64 KB per partition with a 4-slot agg state)
+     * to keep the upfront allocation to a few MB instead of tens of MB.
+     * Sparse keys still grow on-demand. */
+    if (v2_ok) {
+        ctx.init_cap = 256;
+    } else {
+        uint64_t expected = (uint64_t)nrows / ((uint64_t)nw * 16u);
+        uint64_t init_cap = FP_SHARD_INIT_CAP;
+        while (init_cap < expected * 2u && init_cap < (1ULL << 14))
+            init_cap <<= 1;
+        ctx.init_cap = init_cap;
+    }
+
+    /* Allocate the shard array.  v2 uses nw * MK_RADIX_P slots, all
+     * stored in the same array — combine_and_materialize iterates
+     * `nw_effective` shards, which equals nw for v1 and nw * MK_RADIX_P
+     * for v2.  Both layouts use the same mk_shard_t per slot. */
+    uint32_t nw_effective = v2_ok ? (nw * MK_RADIX_P) : nw;
+    ray_t* shards_hdr = NULL;
+    ctx.shards = (mk_shard_t*)scratch_calloc(
+        &shards_hdr, (size_t)nw_effective * sizeof(mk_shard_t));
+    if (!ctx.shards) return ray_error("oom", NULL);
+    if (v2_ok) ctx.wpart_shards = ctx.shards;
+
+    if (hash_eq_idx >= 0 && ctx.n_aggs == 1 &&
+        ctx.aggs[0].kind == MK_AGG_COUNT) {
+        mk_eq_hash_count_fn(&ctx, (uint8_t)hash_eq_idx);
+    } else if (hash_eq_idx >= 0) {
+        mk_par_hash_fn(&ctx, (uint8_t)hash_eq_idx);
+    } else if (eq_i64_idx >= 0) {
         mk_eq_i64_count_ctx_t fctx = {
             .ctx = &ctx,
             .eq_idx = (uint8_t)eq_i64_idx,
         };
         if (pool) ray_pool_dispatch(pool, mk_eq_i64_count_fn, &fctx, nrows);
         else      mk_eq_i64_count_fn(&fctx, 0, 0, nrows);
+    } else if (v2_ok && pool) {
+        ray_pool_dispatch(pool, mk_par_v2_fn, &ctx, nrows);
+    } else if (v2_ok) {
+        mk_par_v2_fn(&ctx, 0, 0, nrows);
     } else if (pool) {
         ray_pool_dispatch(pool, mk_par_fn, &ctx, nrows);
     } else {
@@ -3726,13 +4985,16 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext,
     }
 
     if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) {
-        for (uint32_t w = 0; w < nw; w++) mk_shard_free(&ctx.shards[w]);
+        for (uint32_t w = 0; w < nw_effective; w++)
+            mk_shard_free(&ctx.shards[w]);
         scratch_free(shards_hdr);
         return ray_error("oom", "fused_group: shard OOM");
     }
 
-    ray_t* result = mk_combine_and_materialize(&ctx, nw, ext->agg_ops);
-    for (uint32_t w = 0; w < nw; w++) mk_shard_free(&ctx.shards[w]);
+    ray_t* result = mk_combine_and_materialize(&ctx, nw_effective,
+                                               ext->agg_ops);
+    for (uint32_t w = 0; w < nw_effective; w++)
+        mk_shard_free(&ctx.shards[w]);
     scratch_free(shards_hdr);
     return result;
 }
diff --git a/src/ops/group.c b/src/ops/group.c
index 2473b3a8..d5253175 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -24,6 +24,7 @@
 #include "ops/internal.h"
 #include "ops/hash.h"
 #include "ops/rowsel.h"
+#include "ops/hll.h"        /* approximate count-distinct via HyperLogLog */
 #include "lang/internal.h"  /* for ray_median_dbl_inplace */
 
 /* ============================================================================
@@ -280,46 +281,6 @@ static void reduce_merge(reduce_acc_t* dst, const reduce_acc_t* src, int8_t in_t
      * and the last worker's last is the global last. */
 }
 
-typedef struct {
-    ray_t*       input;
-    const void*  data;
-    int64_t      len;
-    int8_t       type;
-    uint8_t      attrs;
-    reduce_acc_t acc;
-} reduce_cache_entry_t;
-
-static reduce_cache_entry_t g_reduce_cache[16];
-static uint32_t g_reduce_cache_next = 0;
-
-static bool reduce_cache_allowed(ray_t* input, const int64_t* sel_idx) {
-    return input && input->mmod != 0 && sel_idx == NULL;
-}
-
-static bool reduce_cache_get(ray_t* input, reduce_acc_t* out) {
-    const void* data = ray_data(input);
-    for (size_t i = 0; i < sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]); i++) {
-        reduce_cache_entry_t* e = &g_reduce_cache[i];
-        if (e->input == input && e->data == data && e->len == input->len &&
-            e->type == input->type && e->attrs == input->attrs) {
-            *out = e->acc;
-            return true;
-        }
-    }
-    return false;
-}
-
-static void reduce_cache_put(ray_t* input, const reduce_acc_t* acc) {
-    reduce_cache_entry_t* e = &g_reduce_cache[
-        g_reduce_cache_next++ % (sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]))];
-    e->input = input;
-    e->data = ray_data(input);
-    e->len = input->len;
-    e->type = input->type;
-    e->attrs = input->attrs;
-    e->acc = *acc;
-}
-
 /* Hash mixing constants used by the count-distinct kernel and helpers. */
 #define CD_HASH_K1 0x9E3779B97F4A7C15ULL
 #define CD_HASH_K2 0xBF58476D1CE4E5B9ULL
@@ -671,6 +632,23 @@ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) {
 
     if (len == 0) return ray_i64(0);
 
+    /* For inputs above this row count, switch to the HyperLogLog
+     * cardinality sketch (~0.8% std error at P=14, 16 KB per shard).
+     * Exact dedup-via-hashset is O(unique·log) and becomes memory-
+     * bandwidth-bound past ~1 M rows; HLL is single-pass, mergeable,
+     * and constant-memory per worker.  Below the threshold the exact
+     * path is fast enough and avoids approximation entirely — so small
+     * tests still match `len-after-distinct` byte-for-byte. */
+    if (len >= (1 << 20)) {
+        bool hashable = (in_type == RAY_I64 || in_type == RAY_I32 ||
+                          in_type == RAY_I16 || in_type == RAY_U8 ||
+                          in_type == RAY_BOOL || in_type == RAY_F64 ||
+                          in_type == RAY_DATE || in_type == RAY_TIME ||
+                          in_type == RAY_TIMESTAMP || in_type == RAY_STR ||
+                          RAY_IS_SYM(in_type));
+        if (hashable) return ray_count_distinct_approx(input);
+    }
+
     switch (in_type) {
     case RAY_BOOL: case RAY_U8:
     case RAY_I16: case RAY_I32: case RAY_I64:
@@ -1171,6 +1149,85 @@ static ray_t* count_distinct_per_group_parallel(
     return out;
 }
 
+/* Approximate per-group count(distinct) via HyperLogLog with sparse
+ * representation.  Builds (idx_buf, offsets, counts) from row_gid on the
+ * fly and delegates to ray_count_distinct_approx_pg_buf.
+ *
+ * Memory: each task sketch starts sparse (1 KB) and converts to dense
+ * (16 KB) only for groups that exceed RAY_HLL_SPARSE_CAP unique values,
+ * so concurrent sketch memory is ~n_workers × 17 KB regardless of
+ * n_groups.  The scratch buffers built here are extra: three n_groups-
+ * sized int64 arrays plus one int64 row index per in-range row.
+ *
+ * Returns `out` on success (untouched if no row has an in-range gid),
+ * NULL on bad arguments, scratch OOM, or a non-zero kernel rc; on NULL
+ * the caller (ray_count_distinct_per_group) falls back to exact dedup. */
+static ray_t* count_distinct_per_group_hll(ray_t* src, const int64_t* row_gid,
+                                           int64_t n_rows, int64_t n_groups,
+                                           ray_t* out) {
+    if (!src || n_rows <= 0 || n_groups <= 0) return NULL;
+    /* Build group-major idx_buf: for each group g, idx_buf[offsets[g] ..
+     * offsets[g] + counts[g]) lists the source row indices in that group.
+     * Serial two-pass; for n_rows = 10 M this is ~80 MB of int64 reads
+     * twice ≈ 25 ms on the bench box.  The HLL pass itself dominates. */
+    ray_t* cnt_hdr = NULL;
+    ray_t* off_hdr = NULL;
+    int64_t* counts  = (int64_t*)scratch_calloc(&cnt_hdr,
+                                                 (size_t)n_groups * sizeof(int64_t));
+    int64_t* offsets = (int64_t*)scratch_alloc(&off_hdr,
+                                                 (size_t)n_groups * sizeof(int64_t));
+    if (!counts || !offsets) {
+        if (cnt_hdr) scratch_free(cnt_hdr);
+        if (off_hdr) scratch_free(off_hdr);
+        return NULL;
+    }
+    /* Pass 1: histogram; rows with gid outside [0, n_groups) are ignored. */
+    int64_t total = 0;
+    for (int64_t r = 0; r < n_rows; r++) {
+        int64_t g = row_gid[r];
+        if (g >= 0 && g < n_groups) counts[g]++;
+    }
+    /* Exclusive prefix sums → offsets; total ends as the in-range row count. */
+    for (int64_t g = 0; g < n_groups; g++) {
+        offsets[g] = total;
+        total += counts[g];
+    }
+    if (total == 0) {   /* no row maps to a valid group: `out` is returned as-is */
+        scratch_free(cnt_hdr); scratch_free(off_hdr);
+        return out;
+    }
+    ray_t* idx_hdr = NULL;
+    int64_t* idx_buf = (int64_t*)scratch_alloc(&idx_hdr,
+                                                 (size_t)total * sizeof(int64_t));
+    if (!idx_buf) {
+        scratch_free(cnt_hdr); scratch_free(off_hdr);
+        return NULL;
+    }
+    /* Pass 2: scatter row indices group-major; `pos` is a write cursor per
+     * group seeded from offsets, so offsets itself stays intact for the kernel. */
+    ray_t* pos_hdr = NULL;
+    int64_t* pos = (int64_t*)scratch_alloc(&pos_hdr,
+                                            (size_t)n_groups * sizeof(int64_t));
+    if (!pos) {
+        scratch_free(idx_hdr); scratch_free(cnt_hdr); scratch_free(off_hdr);
+        return NULL;
+    }
+    memcpy(pos, offsets, (size_t)n_groups * sizeof(int64_t));
+    for (int64_t r = 0; r < n_rows; r++) {
+        int64_t g = row_gid[r];
+        if (g >= 0 && g < n_groups) idx_buf[pos[g]++] = r;
+    }
+    scratch_free(pos_hdr);   /* cursors are only needed during the scatter */
+
+    int64_t* odata = (int64_t*)ray_data(out);
+    int rc = ray_count_distinct_approx_pg_buf(src, idx_buf, offsets, counts,
+                                              n_groups, RAY_HLL_DEFAULT_P, odata);
+    scratch_free(idx_hdr);
+    scratch_free(cnt_hdr);
+    scratch_free(off_hdr);
+    if (rc != 0) return NULL;   /* kernel reported failure; odata contents unspecified here */
+    return out;
+}
+
 /* Grouped count(distinct): single global hash keyed by (group_id, value).
  * One linear pass over all rows, O(n) total instead of O(per-group setup *
  * n_groups).  Returns an I64 vector of length n_groups with the per-group
@@ -1207,6 +1264,63 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid,
     memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
     if (n_rows == 0 || n_groups == 0) return out;
 
+    /* Approximate path: when n_rows clears the HLL threshold (same as
+     * the buf-form caller — 1 M rows), build a group-major idx layout
+     * and run the sparse-HLL per-group kernel.  Sparse-representation
+     * HLL makes this memory-bounded regardless of n_groups: each task
+     * holds one sketch that's ≤ 17 KB total (1 KB sparse + 16 KB
+     * dense, allocated together on the stack), so concurrent footprint
+     * is n_workers × 17 KB instead of n_groups × 16 KB.  Returns a
+     * ~0.8 % std-error estimate; callers that need exact counts at
+     * this scale must not hit this gate. */
+    if (n_rows >= (1 << 20)) {
+        /* Streaming HLL: skip the (idx_buf + offsets + counts) CSR build
+         * by accumulating directly into n_groups sketches per worker in
+         * a single pass over (row_gid[r], val[r]).  The CSR build cost
+         * (two passes of int64 reads over n_rows) is ~30 % of wall time
+         * on q10/q08 ClickBench, while the HLL pass itself is ~7 %.
+         *
+         * Gated on a per-worker memory budget: each worker keeps a bank
+         * of n_groups sketches whose sparse + dense buffers come from
+         * one pre-allocated slab.  At P=14 that's ~17 KB per group;
+         * with the 8 MB-per-worker budget below, n_groups must be ≤
+         * ~480.  The gate itself does not look at the worker count, so
+         * the *total* concurrent footprint is bounded at
+         * n_workers * 8 MB (128 MB on a 16-thread box).
+         *
+         * Lower bound (n_groups < 16) avoids the dispatch overhead of
+         * n_workers-fold bank merges when there's only a handful of
+         * groups — the CSR path's per-group task dispatch dominates
+         * there anyway, but the streaming bank merge has its own fixed
+         * cost.  Below the bound we fall through to the CSR HLL path. */
+        const size_t RAY_HLL_STREAM_BUDGET_PER_WORKER = (size_t)8 * 1024 * 1024;
+        /* Per-sketch slab footprint at the precision the kernel uses
+         * (RAY_HLL_DEFAULT_P → m = 16384).  sizeof(ray_hll_t) is small
+         * relative to the buffers; rounded into the count. */
+        size_t hll_per_group =
+            sizeof(ray_hll_t) +
+            RAY_HLL_SPARSE_CAP * sizeof(uint32_t) +
+            ((size_t)1u << RAY_HLL_DEFAULT_P);
+        bool stream_ok = (n_groups >= 16) &&
+                         ((size_t)n_groups * hll_per_group
+                          <= RAY_HLL_STREAM_BUDGET_PER_WORKER);
+        if (stream_ok) {
+            int rc = ray_count_distinct_approx_pg_stream(
+                src, row_gid, n_rows, n_groups,
+                RAY_HLL_DEFAULT_P, odata);
+            if (rc == 0) return out;
+            /* Streaming failed (OOM / unsupported type) — fall through
+             * to the CSR HLL path with odata still zeroed. */
+            memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
+        }
+
+        ray_t* approx = count_distinct_per_group_hll(src, row_gid,
+                                                     n_rows, n_groups, out);
+        if (approx) return approx;
+        /* Fall through on dispatch failure — counts not yet written. */
+        memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
+    }
+
     /* Parallel partitioned path for sizes where the serial global hash
      * blows L3.  Threshold tuned so the partition / scatter / dedup
      * dispatch overhead stays smaller than the cache-miss savings. */
@@ -1892,18 +2006,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
         return reduction_i64_result(read_col_i64(base, row, in_type, input->attrs), in_type);
     }
 
-    reduce_acc_t cached;
-    if ((op->opcode == OP_MIN || op->opcode == OP_MAX) &&
-        reduce_cache_allowed(input, sel_idx) &&
-        reduce_cache_get(input, &cached)) {
-        if (sel_idx_block) ray_release(sel_idx_block);
-        return op->opcode == OP_MIN
-            ? reduction_extreme_result(op, in_type, cached.cnt > 0,
-                                       cached.min_f, cached.min_i)
-            : reduction_extreme_result(op, in_type, cached.cnt > 0,
-                                       cached.max_f, cached.max_i);
-    }
-
     ray_pool_t* pool = ray_pool_get();
     if (pool && scan_n >= RAY_PARALLEL_THRESHOLD) {
         uint32_t nw = ray_pool_total_workers(pool);
@@ -1940,9 +2042,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
             }
         }
 
-        if (reduce_cache_allowed(input, sel_idx))
-            reduce_cache_put(input, &merged);
-
         ray_t* result;
         switch (op->opcode) {
             case OP_SUM:   result = in_type == RAY_F64 ? ray_f64(merged.sum_f) : ray_i64(merged.sum_i); break;
@@ -1982,8 +2081,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
     reduce_acc_init(&acc);
     reduce_range(input, 0, scan_n, &acc, has_nulls, sel_idx);
     if (sel_idx_block) ray_release(sel_idx_block);
-    if (reduce_cache_allowed(input, sel_idx))
-        reduce_cache_put(input, &acc);
 
     switch (op->opcode) {
         case OP_SUM:   return in_type == RAY_F64 ? ray_f64(acc.sum_f) : ray_i64(acc.sum_i);
@@ -2451,6 +2548,16 @@ static inline uint32_t group_probe_entry(group_ht_t* ht,
     uint32_t slot = (uint32_t)(hash & mask);
     uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8);
 
+    /* For count-only queries (no SUM/MIN/MAX/SUMSQ/PEARSON aggregator
+     * state, no FIRST/LAST row tracking, no binary aggregator y-side)
+     * init_accum_from_entry and accum_from_entry are no-ops on every
+     * non-count slot — the per-row call still iterates n_aggs slots,
+     * reads agg_val_slot[a], memcpy's the entry's agg value into a
+     * local, then drops it.  That's ~6 ns / row over millions of rows
+     * (n_keys=1), ~7 ms wall on q15.  Skip the call when none of the flags
+     * that drive its writes are set. */
+    uint8_t accum_skip = (ly->need_flags == 0
+        && (ly->agg_is_first | ly->agg_is_last | ly->agg_is_binary) == 0);
     for (;;) {
         uint32_t sv = ht->slots[slot];
         if (sv == HT_EMPTY) {
@@ -2462,7 +2569,8 @@ static inline uint32_t group_probe_entry(group_ht_t* ht,
             char* row = ht->rows + (size_t)gid * ly->row_stride;
             *(int64_t*)row = 1;   /* count = 1 */
             memcpy(row + 8, ekeys, key_bytes);
-            init_accum_from_entry(row, entry, ly);
+            if (!accum_skip)
+                init_accum_from_entry(row, entry, ly);
             ht->slots[slot] = HT_PACK(salt, gid);
             if (ht->grp_count * 2 > ht->ht_cap) {
                 group_ht_rehash(ht, key_types);
@@ -2476,7 +2584,8 @@ static inline uint32_t group_probe_entry(group_ht_t* ht,
             if (group_keys_equal((const int64_t*)(row + 8),
                                   (const int64_t*)ekeys, ly, ht->key_data)) {
                 (*(int64_t*)row)++;   /* count++ */
-                accum_from_entry(row, entry, ly);
+                if (!accum_skip)
+                    accum_from_entry(row, entry, ly);
                 return mask;
             }
         }
@@ -3199,6 +3308,274 @@ static void radix_phase2_fn(void* ctx, uint32_t worker_id, int64_t start, int64_
     }
 }
 
+/* ============================================================================
+ * Fused radix: per-(worker, partition) HT direct-insert + per-partition merge
+ *
+ *   Replaces the materialise-fat-entries-then-build-HTs round trip with a
+ *   single-pass aggregation per (worker, partition) HT, followed by an
+ *   in-cache merge per partition.  Restricted to queries whose group
+ *   state is a count plus optional SUM slots (COUNT/SUM/AVG) — the
+ *   merge primitive, group_merge_row, only combines counts and sums;
+ *   MIN/MAX and the rest would need their own state-merge logic.
+ *
+ *   Per-(worker, partition) HT for a 10M-row count-by-UserID: ~3M distinct
+ *   keys ÷ 256 parts ÷ 8 workers ≈ 1.5K groups → cap ~4K slots → ~64 KB
+ *   row store, L1/L2-resident.  Worker w processes its row range; per row
+ *   it hashes keys, computes partition = RADIX_PART(h), probes its local
+ *   HT_p.  Phase2 dispatches partitions across workers; each merges the n
+ *   worker HTs for one partition into a final partition HT in part_hts[p].
+ *   Phase3 (radix_phase3_fn) emits from part_hts[] exactly as before.
+ * ============================================================================ */
+
+/* Merge one source group row into the target HT and return the slot mask
+ * the caller must pass next time (it changes when an insert rehashes).
+ * The hash is recomputed from the row's key region via hash_keys_inline,
+ * intended to match what group_probe_entry used at first insert so the
+ * partition assignment stays consistent.  Supports need_flags ∈ {0,
+ * GHT_NEED_SUM}: on miss the whole source row is copied (memcpy of
+ * row_stride); on hit, count += src.count and, when need_sum, each agg
+ * with a value slot adds the source's sum (f64 or i64 per agg_is_f64).
+ * If the row store cannot grow, ht->oom is set and mask is returned as-is. */
+static inline uint32_t group_merge_row(group_ht_t* ht,
+    const char* src_row, const int8_t* key_types, uint32_t mask)
+{
+    const ght_layout_t* ly = &ht->layout;
+    int64_t src_count = *(const int64_t*)src_row;   /* row word 0 = group count */
+    const int64_t* skeys = (const int64_t*)(src_row + 8);   /* keys follow the count */
+    uint64_t h = hash_keys_inline(skeys, key_types, ly->n_keys,
+                                  ly->wide_key_mask, ly->wide_key_esz,
+                                  ht->key_data);
+    uint8_t salt = HT_SALT(h);
+    uint32_t slot = (uint32_t)(h & mask);
+    uint8_t na = ly->n_aggs;
+    uint8_t f64_mask = ly->agg_is_f64;
+    uint16_t off_sum = ly->off_sum;
+    bool need_sum = (ly->need_flags & GHT_NEED_SUM) != 0;
+    for (;;) {
+        uint32_t sv = ht->slots[slot];
+        if (sv == HT_EMPTY) {   /* miss: append the source row as a new group */
+            if (ht->grp_count >= ht->grp_cap) {
+                if (!group_ht_grow(ht)) { ht->oom = 1; return mask; }
+            }
+            uint32_t gid = ht->grp_count++;
+            char* row = ht->rows + (size_t)gid * ly->row_stride;
+            /* Whole-row copy: count + keys/null_mask + aggregator state. */
+            memcpy(row, src_row, ly->row_stride);
+            ht->slots[slot] = HT_PACK(salt, gid);
+            if (ht->grp_count * 2 > ht->ht_cap) {   /* more than half full: rehash */
+                group_ht_rehash(ht, key_types);
+                mask = ht->ht_cap - 1;
+            }
+            return mask;
+        }
+        if (HT_SALT_V(sv) == salt) {   /* salt match: worth a full key compare */
+            uint32_t gid = HT_GID(sv);
+            char* row = ht->rows + (size_t)gid * ly->row_stride;
+            if (group_keys_equal((const int64_t*)(row + 8),
+                                  skeys, ly, ht->key_data)) {
+                *(int64_t*)row += src_count;
+                if (need_sum) {
+                    for (uint8_t a = 0; a < na; a++) {
+                        int8_t s = ly->agg_val_slot[a];
+                        if (s < 0) continue;   /* this agg has no value slot */
+                        size_t off = (size_t)off_sum + (size_t)s * 8;
+                        if (f64_mask & (1u << a)) {
+                            double sv_f;
+                            memcpy(&sv_f, src_row + off, 8);
+                            *(double*)(row + off) += sv_f;
+                        } else {
+                            int64_t sv_i;
+                            memcpy(&sv_i, src_row + off, 8);
+                            *(int64_t*)(row + off) += sv_i;
+                        }
+                    }
+                }
+                return mask;
+            }
+        }
+        slot = (slot + 1) & mask;   /* linear probe to the next slot */
+    }
+}
+
+typedef struct {                    /* shared context for radix_v2_phase1_fn */
+    void**         key_data;        /* per-key column base pointers, indexed by row */
+    int8_t*        key_types;       /* per-key type code (e.g. RAY_F64) */
+    uint8_t*       key_attrs;       /* per-key attrs, passed to read_col_i64 */
+    ray_t**        key_vecs;        /* per-key vectors, used for the null check */
+    ray_t**        agg_vecs;        /* may be NULL for pure COUNT (n_agg_vals==0) */
+    ray_t**        agg_vecs2;       /* second input of binary aggs; may be NULL */
+    uint8_t*       agg_strlen;      /* nonzero entry: pack group_strlen_at(); may be NULL */
+    uint8_t        nullable_mask;   /* bit k set: key k is null-checked per row */
+    uint32_t       n_workers;       /* number of worker banks in wpart_hts */
+    group_ht_t*    wpart_hts;        /* [n_workers * RADIX_P] */
+    ght_layout_t   layout;          /* row layout used to init every partition HT */
+    ray_t*         rowsel;          /* optional row filter; only used when match_idx is NULL */
+    const int64_t* match_idx;       /* optional selection: iteration index -> source row */
+    _Atomic(int)   oom;             /* set to 1 by any worker on allocation failure */
+} radix_v2_phase1_ctx_t;
+
+static void radix_v2_phase1_fn(void* ctx, uint32_t worker_id,
+                               int64_t start, int64_t end) {
+    radix_v2_phase1_ctx_t* c = (radix_v2_phase1_ctx_t*)ctx;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;   /* a worker already failed */
+    const ght_layout_t* ly = &c->layout;
+    uint8_t nk = ly->n_keys;
+    uint8_t wide = ly->wide_key_mask;
+    uint8_t nullable = c->nullable_mask;
+    const int64_t* match_idx = c->match_idx;
+
+    group_ht_t* my_hts = &c->wpart_hts[(size_t)worker_id * RADIX_P];   /* this worker's bank */
+    /* Lazily init any of this worker's RADIX_P partition HTs not yet set up. */
+    for (uint32_t p = 0; p < RADIX_P; p++) {
+        if (!my_hts[p].slots) {
+            if (!group_ht_init_sized(&my_hts[p], 256, ly, 128)) {
+                atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                return;
+            }
+            if (wide && c->key_data)
+                group_ht_set_key_data(&my_hts[p], c->key_data);
+        }
+    }
+    uint32_t masks[RADIX_P];   /* cached slot mask per partition HT */
+    for (uint32_t p = 0; p < RADIX_P; p++) masks[p] = my_hts[p].ht_cap - 1;
+
+    /* Transient entry: hash | nk keys + null mask | agg values; 8-byte aligned for the casts below. */
+    _Alignas(int64_t) char ebuf[8 + 9 * 8 + 8 * 8 + 8];
+    for (int64_t i = start; i < end; i++) {
+        if (((i - start) & 65535) == 0 && ray_interrupted()) break;   /* poll every 64 K rows */
+        int64_t row = match_idx ? match_idx[i] : i;
+        if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, row))
+            continue;
+        uint64_t h = 0;
+        int64_t* ek = (int64_t*)(ebuf + 8);   /* key words start after the hash */
+        int64_t null_mask = 0;
+        for (uint8_t k = 0; k < nk; k++) {
+            int8_t t = c->key_types[k];
+            uint64_t kh;
+            bool is_null = (nullable & (1u << k))
+                           && ray_vec_is_null(c->key_vecs[k], row);
+            if (is_null) {
+                null_mask |= (int64_t)(1u << k);
+                ek[k] = 0;   /* null keys store 0; the null mask tells them apart */
+                kh = ray_hash_i64(0);
+            } else if (wide & (1u << k)) {
+                uint8_t esz = ly->wide_key_esz[k];
+                const void* src = (const char*)c->key_data[k] + (size_t)row * esz;
+                ek[k] = row;   /* wide key: store the row index, hash the bytes */
+                kh = ray_hash_bytes(src, esz);
+            } else if (t == RAY_F64) {
+                int64_t kv;
+                memcpy(&kv, &((double*)c->key_data[k])[row], 8);   /* keep the f64 bit pattern */
+                ek[k] = kv;
+                kh = ray_hash_f64(((double*)c->key_data[k])[row]);
+            } else {
+                int64_t kv = read_col_i64(c->key_data[k], row, t, c->key_attrs[k]);
+                ek[k] = kv;
+                kh = ray_hash_i64(kv);
+            }
+            h = (k == 0) ? kh : ray_hash_combine(h, kh);
+        }
+        ek[nk] = null_mask;   /* word after the keys holds the null mask */
+        if (null_mask) h = ray_hash_combine(h, ray_hash_i64(null_mask));
+        *(uint64_t*)ebuf = h;   /* entry word 0 = combined hash */
+        /* Pack agg values into entry — only when the HT layout actually
+         * reads them.  For count-only need_flags == 0 and accum_from_entry
+         * skips every agg slot; packing here would be a wasted column
+         * read per row (a measurable regression on q15-class queries). */
+        if (ly->need_flags) {
+            int64_t* ev = (int64_t*)(ebuf + 8 + ((size_t)nk + 1) * 8);
+            uint8_t vi = 0;   /* next free value word in ev */
+            uint8_t na = ly->n_aggs;
+            uint8_t bin_mask = ly->agg_is_binary;
+            uint8_t hol_mask = ly->agg_is_holistic;
+            for (uint8_t a = 0; a < na; a++) {
+                if (hol_mask & (1u << a)) continue;   /* holistic aggs pack no value */
+                ray_t* ac = c->agg_vecs ? c->agg_vecs[a] : NULL;
+                if (!ac) continue;
+                if (c->agg_strlen && c->agg_strlen[a])
+                    ev[vi] = group_strlen_at(ac, row);
+                else if (ac->type == RAY_F64)
+                    memcpy(&ev[vi], &((double*)ray_data(ac))[row], 8);
+                else
+                    ev[vi] = read_col_i64(ray_data(ac), row, ac->type, ac->attrs);
+                vi++;
+                if ((bin_mask & (1u << a)) && c->agg_vecs2 && c->agg_vecs2[a]) {
+                    ray_t* ay = c->agg_vecs2[a];   /* binary agg: pack the second input too */
+                    if (ay->type == RAY_F64)
+                        memcpy(&ev[vi], &((double*)ray_data(ay))[row], 8);
+                    else
+                        ev[vi] = read_col_i64(ray_data(ay), row, ay->type, ay->attrs);
+                    vi++;
+                }
+            }
+        }
+        uint32_t p = RADIX_PART(h);   /* partition chosen from the hash */
+        uint32_t new_mask = group_probe_entry(&my_hts[p], ebuf,
+                                              c->key_types, masks[p]);
+        if (my_hts[p].oom) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+        masks[p] = new_mask;   /* the HT may have been rehashed by the insert */
+    }
+}
+
+typedef struct {                 /* shared context for radix_v2_phase2_fn */
+    group_ht_t*   wpart_hts;     /* [n_workers * RADIX_P] — input */
+    group_ht_t*   part_hts;      /* [RADIX_P] — output */
+    int8_t*       key_types;     /* per-key type codes, forwarded to group_merge_row */
+    uint32_t      n_workers;     /* number of worker banks in wpart_hts */
+    ght_layout_t  layout;        /* row layout used to init each merged HT */
+    void**        key_data;      /* attached to merged HTs when the layout has wide keys */
+    _Atomic(int)  oom;           /* set to 1 on allocation failure */
+} radix_v2_phase2_ctx_t;
+
+static void radix_v2_phase2_fn(void* ctx, uint32_t worker_id,
+                               int64_t start, int64_t end) {
+    (void)worker_id;   /* work is keyed by partition index [start, end), not by worker */
+    radix_v2_phase2_ctx_t* c = (radix_v2_phase2_ctx_t*)ctx;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    uint16_t row_stride = c->layout.row_stride;
+    for (int64_t p = start; p < end; p++) {
+        /* Upper bound on the merged partition: sum of worker grp_counts
+         * (keys present in several workers fold together in the merge).
+         * NOTE(review): uint32 sum/ht_cap assume totals well below 2^31. */
+        uint32_t total_grps = 0;
+        for (uint32_t w = 0; w < c->n_workers; w++)
+            total_grps += c->wpart_hts[(size_t)w * RADIX_P + p].grp_count;
+        if (total_grps == 0) continue;   /* empty partition: part_hts[p] is left untouched */
+        uint32_t ht_cap = 256;
+        {
+            uint64_t target = (uint64_t)total_grps * 2;   /* aim for at most half-full slots */
+            if (target < 256) target = 256;
+            while (ht_cap < target) ht_cap *= 2;
+        }
+        uint32_t init_grp = 256;   /* presumably the initial group-row capacity; capped at 64 K */
+        while (init_grp < total_grps && init_grp < 65536) init_grp *= 2;
+        if (!group_ht_init_sized(&c->part_hts[p], ht_cap, &c->layout, init_grp)) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+        if (c->layout.wide_key_mask && c->key_data)
+            group_ht_set_key_data(&c->part_hts[p], c->key_data);
+        uint32_t mask = c->part_hts[p].ht_cap - 1;
+        for (uint32_t w = 0; w < c->n_workers; w++) {   /* fold each worker's HT for partition p */
+            group_ht_t* src = &c->wpart_hts[(size_t)w * RADIX_P + p];
+            if (src->grp_count == 0) continue;
+            const char* rows = src->rows;
+            for (uint32_t gi = 0; gi < src->grp_count; gi++) {
+                mask = group_merge_row(&c->part_hts[p],
+                                       rows + (size_t)gi * row_stride,
+                                       c->key_types, mask);
+                if (c->part_hts[p].oom) {
+                    atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                    return;
+                }
+            }
+        }
+    }
+}
+
 /* ============================================================================
  * Parallel direct-array accumulation for low-cardinality single integer key
  * ============================================================================ */
@@ -3213,6 +3590,12 @@ typedef struct {
     uint32_t    n_workers;
     const int64_t* match_idx;    /* NULL = no selection */
     ray_t*      rowsel;
+    /* DA-path early-out: once any worker observes a key span wider than
+     * span_budget the direct-array path is provably infeasible (its slot
+     * count would exceed DA_MAX_COMPOSITE_SLOTS), so the whole scan can
+     * stop instead of reading the rest of a 10M-row column for nothing. */
+    int64_t          span_budget;
+    _Atomic(int)*    abort_flag;
 } minmax_ctx_t;
 
 static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
@@ -3221,11 +3604,29 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t
     const int64_t* match_idx = c->match_idx;
     int64_t kmin = INT64_MAX, kmax = INT64_MIN;
     int8_t t = c->key_type;
-
+    const int64_t span_budget = c->span_budget;
+
+    /* Span check and abort poll are batched (every 1024 rows) so the
+     * hot per-row loop body stays a branchless min/max with no atomics.
+     * 8192 was too sparse — the dispatcher hands out 8K-row morsels, so
+     * `(i-start) & 8191 == 0` only ever fired at the morsel boundary
+     * (where kmin=INT64_MAX/kmax=INT64_MIN make the span check vacuous),
+     * leaving every full 8K morsel to run end-to-end on doomed columns. */
     #define MINMAX_SEG_LOOP(TYPE, CAST) \
         do { \
             const TYPE* kd = (const TYPE*)c->key_data; \
             for (int64_t i = start; i < end; i++) { \
+                if (((i - start) & 1023) == 0) { \
+                    if (atomic_load_explicit(c->abort_flag, \
+                                             memory_order_relaxed)) \
+                        goto minmax_done; \
+                    if (kmax >= kmin && \
+                        (uint64_t)(kmax - kmin) > (uint64_t)span_budget) { \
+                        atomic_store_explicit(c->abort_flag, 1, \
+                                              memory_order_relaxed); \
+                        goto minmax_done; \
+                    } \
+                } \
                 int64_t r = match_idx ? match_idx[i] : i; \
                 if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, r)) continue; \
                 int64_t v = (int64_t)CAST kd[r]; \
@@ -3252,6 +3653,7 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t
 
     #undef MINMAX_SEG_LOOP
 
+minmax_done:
     /* Merge with existing per-worker values (a worker may process multiple morsels) */
     if (kmin < c->per_worker_min[wid]) c->per_worker_min[wid] = kmin;
     if (kmax > c->per_worker_max[wid]) c->per_worker_max[wid] = kmax;
@@ -5237,9 +5639,24 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl,
         }
     }
     ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get();
+    /* Historical: enabled only for OP_COUNT (the min_count_exclusive
+     * heavy-hitter filter and the top_count_take heap).  The
+     * top_count_take heap path now also accepts SUM/MIN/MAX — those
+     * fire through the v2_emit per-partition compact below, which
+     * reads the agg's int64 row slot directly.  The non-COUNT paths
+     * (sparse_i64 range-counting, the n_keys>1 macro fast path) still
+     * gate on COUNT because they DON'T have the agg value available
+     * outside the row slot. */
     bool use_emit_filter = emit_filter.enabled &&
         emit_filter.agg_index < n_aggs &&
         ext->agg_ops[emit_filter.agg_index] == OP_COUNT;
+    bool use_topn_filter = emit_filter.enabled &&
+        emit_filter.top_count_take > 0 &&
+        emit_filter.agg_index < n_aggs &&
+        (ext->agg_ops[emit_filter.agg_index] == OP_COUNT ||
+         ext->agg_ops[emit_filter.agg_index] == OP_SUM   ||
+         ext->agg_ops[emit_filter.agg_index] == OP_MIN   ||
+         ext->agg_ops[emit_filter.agg_index] == OP_MAX);
 
     /* ---- Scalar aggregate fast path (n_keys == 0): flat vector scan ---- */
     if (n_keys == 0 && nrows > 0) {
@@ -5559,6 +5976,9 @@ da_path:;
                             ? ray_pool_total_workers(mm_pool) : 1;
             /* VLA bounded by worker count — max ~2KB per key even on 256-core systems. */
             int64_t mm_mins[mm_n], mm_maxs[mm_n];
+            /* Shared across keys: once any key proves the DA slot count
+             * infeasible the scan aborts instead of reading the rest. */
+            _Atomic(int) mm_abort = 0;
             for (uint8_t k = 0; k < n_keys && da_fits; k++) {
                 int64_t kmin, kmax;
                 for (uint32_t w = 0; w < mm_n; w++) {
@@ -5574,12 +5994,18 @@ da_path:;
                     .n_workers      = mm_n,
                     .match_idx      = match_idx,
                     .rowsel         = rowsel,
+                    .span_budget    = DA_MAX_COMPOSITE_SLOTS,
+                    .abort_flag     = &mm_abort,
                 };
                 if (mm_n > 1) {
                     ray_pool_dispatch(mm_pool, minmax_scan_fn, &mm_ctx, n_scan);
                 } else {
                     minmax_scan_fn(&mm_ctx, 0, 0, n_scan);
                 }
+                if (atomic_load_explicit(&mm_abort, memory_order_relaxed)) {
+                    da_fits = false;
+                    break;
+                }
                 kmin = INT64_MAX; kmax = INT64_MIN;
                 for (uint32_t w = 0; w < mm_n; w++) {
                     if (mm_mins[w] < kmin) kmin = mm_mins[w];
@@ -7407,6 +7833,114 @@ ht_path:;
 skip_top_count_filter:
 
     if (pool && nrows >= RAY_PARALLEL_THRESHOLD && n_total > 1 && !rowsel) {
+        /* Per-(worker, partition) direct-insert path: aggregates into
+         * thread-local partition HTs during phase1, then merges per
+         * partition.  Bypasses the phase1 fat-entry materialisation +
+         * phase2 re-read DRAM round trip.  On success it populates
+         * part_hts[] in the format the existing phase3 emit consumes.
+         *
+         * Gate: every agg is COUNT/SUM/AVG (the merge primitive knows
+         * how to add counts and sum slots; PROD/MIN/MAX/FIRST/LAST/
+         * SUMSQ/PEARSON/MEDIAN need richer state-merge logic).  Agg
+         * input columns must be non-nullable for now — sentinel-skip
+         * inside accum_from_entry is correct, but the merge step needs
+         * an nn_count and that isn't tracked yet. */
+        bool v2_ok = (n_keys >= 1 && n_aggs > 0);
+        /* SYM single-key queries already had a tuned path (q33/q34 hit it
+         * before falling to the radix); v2 doesn't beat it for them, so
+         * skip when any key is SYM and let the existing pipeline handle it. */
+        for (uint8_t k = 0; k < n_keys && v2_ok; k++)
+            if (key_types[k] == RAY_SYM) v2_ok = false;
+        for (uint8_t a = 0; a < n_aggs && v2_ok; a++) {
+            uint16_t op = ext->agg_ops[a];
+            if (op != OP_COUNT && op != OP_SUM && op != OP_AVG) {
+                v2_ok = false;
+                break;
+            }
+            if (agg_vecs[a]) {
+                ray_t* src = (agg_vecs[a]->attrs & RAY_ATTR_SLICE)
+                             ? agg_vecs[a]->slice_parent : agg_vecs[a];
+                if (src && (src->attrs & RAY_ATTR_HAS_NULLS))
+                    v2_ok = false;
+            }
+        }
+        if (v2_ok && !(ght_layout.agg_is_first | ght_layout.agg_is_last
+                        | ght_layout.agg_is_holistic
+                        | ght_layout.agg_is_binary)) {
+            ray_t* wpart_hdr = NULL;
+            size_t v2_n_w = (size_t)n_total * RADIX_P;
+            group_ht_t* wpart_hts = (group_ht_t*)scratch_calloc(
+                &wpart_hdr, v2_n_w * sizeof(group_ht_t));
+            ray_t* v2_part_hdr = NULL;
+            group_ht_t* v2_part_hts = wpart_hts
+                ? (group_ht_t*)scratch_calloc(&v2_part_hdr,
+                                              RADIX_P * sizeof(group_ht_t))
+                : NULL;
+            if (!wpart_hts || !v2_part_hts) {
+                if (wpart_hts) scratch_free(wpart_hdr);
+                if (v2_part_hts) scratch_free(v2_part_hdr);
+                goto v2_done;
+            }
+            uint8_t v2_nullable = 0;
+            for (uint8_t k = 0; k < n_keys; k++) {
+                if (!key_vecs[k]) continue;
+                ray_t* src = (key_vecs[k]->attrs & RAY_ATTR_SLICE)
+                             ? key_vecs[k]->slice_parent : key_vecs[k];
+                if (src && (src->attrs & RAY_ATTR_HAS_NULLS))
+                    v2_nullable |= (uint8_t)(1u << k);
+            }
+            radix_v2_phase1_ctx_t v2p1 = {
+                .key_data      = key_data,
+                .key_types     = key_types,
+                .key_attrs     = key_attrs,
+                .key_vecs      = key_vecs,
+                .agg_vecs      = agg_vecs,
+                .agg_vecs2     = agg_vecs2,
+                .agg_strlen    = agg_strlen,
+                .nullable_mask = v2_nullable,
+                .n_workers     = n_total,
+                .wpart_hts     = wpart_hts,
+                .layout        = ght_layout,
+                .rowsel        = rowsel,
+                .match_idx     = match_idx,
+                .oom           = 0,
+            };
+            ray_pool_dispatch(pool, radix_v2_phase1_fn, &v2p1, n_scan);
+            CHECK_CANCEL_GOTO(pool, cleanup);
+            if (atomic_load_explicit(&v2p1.oom, memory_order_relaxed)) {
+                for (size_t i = 0; i < v2_n_w; i++)
+                    group_ht_free(&wpart_hts[i]);
+                scratch_free(wpart_hdr);
+                scratch_free(v2_part_hdr);
+                goto v2_done;
+            }
+            radix_v2_phase2_ctx_t v2p2 = {
+                .wpart_hts = wpart_hts,
+                .part_hts  = v2_part_hts,
+                .key_types = key_types,
+                .n_workers = n_total,
+                .layout    = ght_layout,
+                .key_data  = key_data,
+                .oom       = 0,
+            };
+            ray_pool_dispatch_n(pool, radix_v2_phase2_fn, &v2p2, RADIX_P);
+            CHECK_CANCEL_GOTO(pool, cleanup);
+            /* Worker HTs are no longer needed once the merge is done. */
+            for (size_t i = 0; i < v2_n_w; i++)
+                group_ht_free(&wpart_hts[i]);
+            scratch_free(wpart_hdr);
+            if (atomic_load_explicit(&v2p2.oom, memory_order_relaxed)) {
+                for (uint32_t p = 0; p < RADIX_P; p++)
+                    group_ht_free(&v2_part_hts[p]);
+                scratch_free(v2_part_hdr);
+                goto v2_done;
+            }
+            /* Hand off to the existing phase3 emit. */
+            part_hts = v2_part_hts;
+            part_hts_hdr = v2_part_hdr;
+            goto v2_emit;
+        }
+v2_done:;
         size_t n_bufs = (size_t)n_total * RADIX_P;
         radix_bufs = (radix_buf_t*)scratch_calloc(&radix_bufs_hdr,
             n_bufs * sizeof(radix_buf_t));
@@ -7506,7 +8040,180 @@ ht_path:;
             scratch_free(radix_bufs_hdr);
             radix_bufs = NULL;
             radix_bufs_hdr = NULL;
-            ray_heap_gc();
+            /* No explicit GC — top-level statement GC catches it. */
+        }
+
+v2_emit:;
+        /* Top-N aware compaction: when the (select … by … desc: c take: N)
+         * shape is in flight (use_emit_filter + top_count_take, COUNT agg),
+         * the global answer is the N rows with the largest count across
+         * all partitions.  Run a global bounded-heap (size N) over the
+         * union of per-partition rows here, then in-place compact each
+         * partition's row array to contain only globally-surviving rows.
+         * Phase3 below then emits N rows total instead of total_grps —
+         * the major win for high-cardinality keys like UserID/URL where
+         * total_grps is in the millions but N is ≤ 1024.
+         *
+         * Implementation notes:
+         *  - The bounded heap orders by count (the agg at COUNT slot, the
+         *    first int64 in each row).  Equal counts are stable: the
+         *    first row seen wins.  Final per-partition row order is
+         *    preserved so apply_sort_take below can do the final
+         *    arrange-by-agg deterministically.
+         *  - We also handle the "fewer total rows than N" case — compact
+         *    becomes a no-op.
+         *  - Only fires when emit_filter.top_count_take > 0; existing
+         *    min_count_exclusive-only filters fall through unchanged. */
+        if (use_topn_filter) {
+            int64_t k_take = emit_filter.top_count_take;
+            uint32_t total_pre = 0;
+            for (uint32_t p = 0; p < RADIX_P; p++)
+                total_pre += part_hts[p].grp_count;
+            /* Resolve the in-row offset of the order-by agg's value.  For
+             * COUNT it's the leading int64 at offset 0; for SUM/MIN/MAX
+             * it's the per-slot int64 in off_sum/off_min/off_max.  F64
+             * agg outputs (sum over an F64 column) compare by bitcast —
+             * for IEEE 754 the bit pattern preserves ordering for finite
+             * positive values; mixed-sign and NaN cases drop the heap
+             * back to a wider comparator.  To stay correct we exclude
+             * F64-output aggs from this fast path (the COUNT count is
+             * always I64, and SUM/MIN/MAX over an integer column keep
+             * an I64 slot — agg_is_f64 marks the SUM-over-F64 case). */
+            uint16_t order_op = emit_filter.agg_op
+                ? emit_filter.agg_op
+                : (uint16_t)OP_COUNT;
+            uint8_t  agg_index_local = emit_filter.agg_index;
+            uint16_t order_off = 0;  /* default: COUNT at row+0 */
+            bool order_is_f64 = false;
+            if (agg_index_local < n_aggs &&
+                (ght_layout.agg_is_f64 & (1u << agg_index_local)))
+                order_is_f64 = true;
+            int8_t agg_slot = ght_layout.agg_val_slot[agg_index_local];
+            if (order_op == OP_SUM) {
+                if (agg_slot < 0 || order_is_f64) goto topn_compact_skip;
+                order_off = (uint16_t)(ght_layout.off_sum
+                                       + (uint16_t)agg_slot * 8u);
+            } else if (order_op == OP_MIN) {
+                if (agg_slot < 0 || order_is_f64) goto topn_compact_skip;
+                if (ght_layout.agg_is_sym & (1u << agg_index_local))
+                    goto topn_compact_skip;
+                order_off = (uint16_t)(ght_layout.off_min
+                                       + (uint16_t)agg_slot * 8u);
+            } else if (order_op == OP_MAX) {
+                if (agg_slot < 0 || order_is_f64) goto topn_compact_skip;
+                if (ght_layout.agg_is_sym & (1u << agg_index_local))
+                    goto topn_compact_skip;
+                order_off = (uint16_t)(ght_layout.off_max
+                                       + (uint16_t)agg_slot * 8u);
+            }
+            uint8_t desc_dir = emit_filter.desc ? 1 : 0;
+            /* COUNT defaults to desc when the filter struct's desc bit
+             * isn't set (old single-bit filter shape).  Producer code in
+             * query.c sets it explicitly. */
+            if (order_op == OP_COUNT && !emit_filter.desc) desc_dir = 1;
+            if ((int64_t)total_pre > k_take && k_take > 0 && k_take <= 1024) {
+                /* Stack heap: (val, part, gid) triples.  k_take ≤ 1024
+                 * caps the footprint at 1024 * 16 B = 16 KiB.  The heap
+                 * invariant flips by direction: min-heap for desc (we
+                 * evict the smallest to keep the largest N), max-heap
+                 * for asc (evict the largest to keep the smallest N). */
+                int64_t hval[1024];
+                uint32_t hpart[1024];
+                uint32_t hgid[1024];
+                int64_t hn = 0;
+                /* For top-N largest (desc=1): min-heap.  Root is smallest;
+                 * incoming v replaces root iff v > root.  Heap invariant:
+                 * parent ≤ child (so swap when parent > child).
+                 *
+                 * For top-N smallest (desc=0): max-heap.  Root is largest;
+                 * incoming v replaces root iff v < root.  Heap invariant:
+                 * parent ≥ child (so swap when parent < child).
+                 *
+                 * TOPN_NEEDS_SWAP(parent, child) := does the parent
+                 * violate the invariant relative to child? */
+                #define TOPN_NEEDS_SWAP(parent, child) \
+                    (desc_dir ? ((parent) > (child)) : ((parent) < (child)))
+                #define TOPN_SHOULD_REPLACE(new_v, root_v) \
+                    (desc_dir ? ((new_v) > (root_v)) : ((new_v) < (root_v)))
+                for (uint32_t p = 0; p < RADIX_P; p++) {
+                    group_ht_t* ph = &part_hts[p];
+                    uint16_t rs = ph->layout.row_stride;
+                    uint32_t gc = ph->grp_count;
+                    for (uint32_t gi = 0; gi < gc; gi++) {
+                        const char* row = ph->rows + (size_t)gi * rs;
+                        int64_t v = *(const int64_t*)(const void*)
+                                    (row + order_off);
+                        if (hn < k_take) {
+                            int64_t j = hn++;
+                            hval[j] = v; hpart[j] = p; hgid[j] = gi;
+                            /* Sift up: bubble new entry toward root while
+                             * parent violates invariant. */
+                            while (j > 0) {
+                                int64_t pr = (j - 1) >> 1;
+                                if (!TOPN_NEEDS_SWAP(hval[pr], hval[j])) break;
+                                int64_t tc = hval[pr]; hval[pr] = hval[j]; hval[j] = tc;
+                                uint32_t tp = hpart[pr]; hpart[pr] = hpart[j]; hpart[j] = tp;
+                                uint32_t tg = hgid[pr]; hgid[pr] = hgid[j]; hgid[j] = tg;
+                                j = pr;
+                            }
+                        } else if (TOPN_SHOULD_REPLACE(v, hval[0])) {
+                            hval[0] = v; hpart[0] = p; hgid[0] = gi;
+                            int64_t j = 0;
+                            /* Sift down: find the child that should be
+                             * promoted (the one most violating the
+                             * invariant) and swap. */
+                            for (;;) {
+                                int64_t l = j * 2 + 1, r = l + 1, m = j;
+                                if (l < hn && TOPN_NEEDS_SWAP(hval[m], hval[l])) m = l;
+                                if (r < hn && TOPN_NEEDS_SWAP(hval[m], hval[r])) m = r;
+                                if (m == j) break;
+                                int64_t tc = hval[m]; hval[m] = hval[j]; hval[j] = tc;
+                                uint32_t tp = hpart[m]; hpart[m] = hpart[j]; hpart[j] = tp;
+                                uint32_t tg = hgid[m]; hgid[m] = hgid[j]; hgid[j] = tg;
+                                j = m;
+                            }
+                        }
+                    }
+                }
+                #undef TOPN_NEEDS_SWAP
+                #undef TOPN_SHOULD_REPLACE
+                if (hn > 0) {
+                    /* Build per-partition keep lists (sorted asc by gid so
+                     * the in-place compact below is a single forward sweep). */
+                    uint16_t keep_n[RADIX_P];
+                    for (uint32_t p = 0; p < RADIX_P; p++) keep_n[p] = 0;
+                    /* Cap per-partition kept count at hn (≤ k_take ≤ 1024). */
+                    uint32_t kgid[RADIX_P][1024];
+                    for (int64_t i = 0; i < hn; i++) {
+                        uint32_t p = hpart[i];
+                        uint16_t kn = keep_n[p];
+                        /* Insertion-sort into kgid[p][] keeping asc order. */
+                        uint16_t j = kn;
+                        while (j > 0 && kgid[p][j - 1] > hgid[i]) {
+                            kgid[p][j] = kgid[p][j - 1];
+                            j--;
+                        }
+                        kgid[p][j] = hgid[i];
+                        keep_n[p] = (uint16_t)(kn + 1);
+                    }
+                    /* In-place compact each partition. */
+                    for (uint32_t p = 0; p < RADIX_P; p++) {
+                        group_ht_t* ph = &part_hts[p];
+                        uint16_t rs = ph->layout.row_stride;
+                        uint16_t kn = keep_n[p];
+                        if (kn == ph->grp_count) continue;  /* all kept */
+                        if (kn == 0) { ph->grp_count = 0; continue; }
+                        for (uint16_t i = 0; i < kn; i++) {
+                            uint32_t src = kgid[p][i];
+                            if (src == (uint32_t)i) continue;
+                            memmove(ph->rows + (size_t)i * rs,
+                                    ph->rows + (size_t)src * rs, rs);
+                        }
+                        ph->grp_count = kn;
+                    }
+                }
+            }
+            topn_compact_skip:;
         }
 
         /* Prefix offsets */
@@ -8312,7 +9019,10 @@ sequential_fallback:;
         if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]);
     if (match_idx_block) ray_release(match_idx_block);
 
-    ray_heap_gc();
+    /* No explicit GC — top-level statement runner (run_piped / repl)
+     * calls ray_heap_gc() once per statement, catching every
+     * intermediate freed above.  The duplicate inner call doubled the
+     * per-query GC cost on bench loops. */
 
     return result;
 }
diff --git a/src/ops/hll.c b/src/ops/hll.c
new file mode 100644
index 00000000..ea2bc131
--- /dev/null
+++ b/src/ops/hll.c
@@ -0,0 +1,850 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/hll.h"
+#include "ops/internal.h"
+#include "ops/ops.h"
+#include "core/pool.h"
+#include "table/sym.h"
+
+#include <math.h>
+#include <string.h>
+#include <stdatomic.h>
+
+/* Set up `h` as a dense sketch of 2^p one-byte registers (p clamped to 4..18). */
+int ray_hll_init(ray_hll_t* h, uint8_t p) {
+    if (h == NULL) return -1;
+    const uint8_t bits = (uint8_t)(p < 4 ? 4 : (p > 18 ? 18 : p));
+    memset(h, 0, sizeof(*h));
+    h->p = bits;
+    h->m = 1u << bits;
+    /* One byte per register from the scratch arena; _hdr keeps the handle
+     * that ray_hll_free later passes to scratch_free.  Returns -1 on OOM. */
+    h->regs = (uint8_t*)scratch_calloc(&h->_hdr, (size_t)h->m);
+    return h->regs ? 0 : -1;
+}
+
+/* Sparse-mode init over caller-owned buffers; p is clamped to 4..18. */
+void ray_hll_init_sparse(ray_hll_t* h, uint8_t p,
+                          uint32_t* sparse_buf, uint32_t sparse_cap,
+                          uint8_t* dense_buf) {
+    if (!h) return;
+    if (p < 4) p = 4;
+    if (p > 18) p = 18;
+    memset(h, 0, sizeof(*h));
+    h->p = p;
+    h->m = 1u << p;
+    /* _hdr carries dense_buf as a tagged pointer: low bit set => caller-
+     * owned (ray_hll_free skips it), clear => scratch ray_t*.  uint8_t
+     * storage may sit at an odd address, and untagging that would point
+     * one byte early; the assert is gone under NDEBUG, so only tag when
+     * bit 0 is free.  Otherwise _hdr stays NULL (promotion uses scratch). */
+    assert(((uintptr_t)dense_buf & 1u) == 0);
+    if (((uintptr_t)dense_buf & 1u) == 0)
+        h->_hdr = (ray_t*)((uintptr_t)dense_buf | (uintptr_t)1);
+    h->sparse_keys = sparse_buf;
+    h->sparse_cap = sparse_cap;
+}
+
+/* Untag _hdr: yields the caller-supplied dense buffer when the low bit is
+ * set, NULL when _hdr is a plain scratch handle or empty. */
+static inline uint8_t* hll_caller_dense_buf(const ray_hll_t* h) {
+    const uintptr_t bits = (uintptr_t)h->_hdr;
+    const uintptr_t tag  = (uintptr_t)1;
+    return (bits & tag) ? (uint8_t*)(bits & ~tag) : NULL;
+}
+
+/* Switch a sparse sketch to dense registers and replay its entries.
+ * No-op for a NULL or already-dense sketch; if the scratch allocation
+ * fails the sketch is left untouched in sparse mode. */
+void ray_hll_promote_to_dense(ray_hll_t* h) {
+    if (h == NULL || h->regs != NULL) return;
+
+    uint8_t* regs = hll_caller_dense_buf(h);
+    if (regs != NULL) {
+        /* Caller-owned storage: wipe it, then drop the tagged pointer so
+         * ray_hll_free finds nothing to release. */
+        memset(regs, 0, (size_t)h->m);
+        h->_hdr = NULL;
+    } else {
+        /* No caller buffer: take m bytes from the scratch arena and keep
+         * the handle in _hdr. */
+        ray_t* handle = NULL;
+        regs = (uint8_t*)scratch_calloc(&handle, (size_t)h->m);
+        if (regs == NULL) return;    /* OOM: stay sparse */
+        h->_hdr = handle;
+    }
+    h->regs = regs;
+
+    /* Each sparse entry packs (register index << 8) | rho; a register
+     * keeps the largest rho replayed into it. */
+    const uint32_t* entries = h->sparse_keys;
+    const uint32_t  count   = h->sparse_count;
+    for (uint32_t e = 0; e < count; e++) {
+        const uint32_t slot = entries[e] >> 8;
+        const uint8_t  rho  = (uint8_t)(entries[e] & 0xFF);
+        if (regs[slot] < rho) regs[slot] = rho;
+    }
+
+    h->sparse_keys  = NULL;
+    h->sparse_count = 0;
+    h->sparse_cap   = 0;
+}
+
+/* Release scratch-owned storage, then clear the sketch's pointers, counts
+ * and sizes.  A NULL or tagged (low bit set) _hdr is not freed here. */
+void ray_hll_free(ray_hll_t* h) {
+    if (h == NULL) return;
+    ray_t* handle = h->_hdr;
+    if (handle != NULL && ((uintptr_t)handle & 1u) == 0) scratch_free(handle);
+    h->_hdr = NULL;
+    h->regs = NULL;
+    h->sparse_keys = NULL;
+    h->sparse_cap = 0;
+    h->sparse_count = 0;
+    h->p = 0;
+    h->m = 0;
+}
+
+/* Empty the sketch without releasing storage: dense registers are zeroed,
+ * a sparse sketch only has its entry count rewound. */
+void ray_hll_reset(ray_hll_t* h) {
+    if (h == NULL) return;
+    if (h->regs != NULL) {
+        memset(h->regs, 0, (size_t)h->m);
+    } else if (h->sparse_keys != NULL) {
+        /* Entries past sparse_count are never read, so the buffer
+         * contents can stay as they are. */
+        h->sparse_count = 0;
+    }
+}
+
+/* Fold `n` packed sparse entries (register index above the low byte, rho
+ * in the low byte) into dense registers `d`, keeping each register's max. */
+static inline void hll_merge_sparse_into_dense(uint8_t* d,
+                                               const uint32_t* sk,
+                                               uint32_t n) {
+    for (uint32_t left = n; left != 0; --left, ++sk) {
+        const uint32_t packed = *sk;
+        const uint32_t slot   = packed >> 8;
+        const uint8_t  rho    = (uint8_t)(packed & 0xFF);
+        if (d[slot] < rho) d[slot] = rho;
+    }
+}
+
+/* Union `src` into `dst` by register-wise max.  Does nothing when either
+ * pointer is NULL, the register counts differ, or dst cannot go dense. */
+void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src) {
+    if (dst == NULL || src == NULL) return;
+    if (src->m != dst->m) return;     /* precision mismatch: caller bug */
+    /* The merge target must be dense.  Promotion leaves regs NULL when
+     * its allocation fails, in which case the merge is skipped. */
+    if (dst->regs == NULL) ray_hll_promote_to_dense(dst);
+    if (dst->regs == NULL) return;
+    uint8_t* out = dst->regs;
+    if (src->regs != NULL) {
+        /* Dense source: element-wise max over all m registers, written
+         * as a plain select so the loop body has no early exit. */
+        const uint8_t* in = src->regs;
+        const uint32_t count = dst->m;
+        for (uint32_t i = 0; i < count; i++) {
+            const uint8_t cur = out[i];
+            const uint8_t inc = in[i];
+            out[i] = inc > cur ? inc : cur;
+        }
+        return;
+    }
+    if (src->sparse_keys != NULL)
+        hll_merge_sparse_into_dense(out, src->sparse_keys, src->sparse_count);
+}
+
+/* Cardinality estimate for `h`, following the HyperLogLog estimator of
+ * Flajolet, Fusy, Gandouet and Meunier (2007): the alpha_m-corrected
+ * harmonic mean of 2^register, swapped for linear counting while that raw
+ * value is at most 2.5*m and at least one register is still zero.  The
+ * HLL++ small-range bias tables are not used.  Returns 0 for a NULL
+ * sketch or one whose register count m is 0. */
+int64_t ray_hll_estimate(const ray_hll_t* h) {
+    if (h == NULL || h->m == 0) return 0;
+    const uint32_t m  = h->m;
+    const double   md = (double)m;
+
+    /* alpha_m: fixed constants for m = 16 / 32 / 64, otherwise the
+     * general formula 0.7213 / (1 + 1.079/m). */
+    double alpha;
+    switch (m) {
+    case 16: alpha = 0.673; break;
+    case 32: alpha = 0.697; break;
+    case 64: alpha = 0.709; break;
+    default: alpha = 0.7213 / (1.0 + 1.079 / md); break;
+    }
+
+    /* inv_sum accumulates 2^-register over all m registers; empties
+     * counts the registers still at zero, which linear counting needs. */
+    double   inv_sum = 0.0;
+    uint32_t empties = 0;
+    if (h->regs != NULL) {
+        const uint8_t* regs = h->regs;
+        for (uint32_t i = 0; i < m; i++) {
+            inv_sum += ldexp(1.0, -(int)regs[i]);
+            if (regs[i] == 0) empties++;
+        }
+    } else if (h->sparse_keys != NULL) {
+        /* Sparse: a register without an entry is zero and contributes
+         * 2^0 = 1.0.  Every entry is taken to be a distinct register
+         * with non-zero rho; nothing here re-checks that. */
+        const uint32_t* entries = h->sparse_keys;
+        const uint32_t  used    = h->sparse_count;
+        empties = m - used;
+        inv_sum = (double)empties;
+        for (uint32_t e = 0; e < used; e++) {
+            const uint8_t rho = (uint8_t)(entries[e] & 0xFF);
+            inv_sum += ldexp(1.0, -(int)rho);
+        }
+    } else {
+        /* Neither representation is present: all m registers are zero. */
+        empties = m;
+        inv_sum = md;
+    }
+
+    double estimate = alpha * md * md / inv_sum;
+
+    /* Small range: linear counting over the empty-register fraction. */
+    if (estimate <= 2.5 * md && empties != 0) {
+        estimate = md * log(md / (double)empties);
+    }
+    /* The paper's 2^32 large-range correction is tied to 32-bit hashes
+     * and is intentionally not applied. */
+
+    /* Clamp, then round to nearest by adding one half before truncating. */
+    if (estimate < 0.0) estimate = 0.0;
+    return (int64_t)(estimate + 0.5);
+}
+
+/* ---- Scalar approximate count-distinct aggregator ---------------------- */
+
+typedef struct {
+    const ray_t*  vec;             /* input vector; its data is read via ray_data */
+    int8_t        type;            /* RAY_* type tag selecting the typed scan loop */
+    uint8_t       attrs;
+    bool          has_nulls;       /* when set, NULL_* sentinel values are skipped */
+    ray_hll_t*    shards;          /* [n_workers] — one HLL per worker */
+    uint8_t       p;               /* precision handed to ray_hll_init per shard */
+    uint32_t      n_workers;       /* shard count; worker_id is taken modulo this */
+    _Atomic(int)  oom;             /* set to 1 when a shard's ray_hll_init fails */
+} cda_scalar_ctx_t;
+
+static void cda_scalar_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) {
+    cda_scalar_ctx_t* c = (cda_scalar_ctx_t*)raw;   /* task body: hashes rows [start, end) of c->vec into one shard */
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;   /* a shard init already failed — skip the work */
+    ray_hll_t* sh = &c->shards[worker_id % c->n_workers];   /* shard picked by worker id, wrapped to the shard count */
+    if (!sh->regs) {   /* lazy init: this shard has no register array yet */
+        if (ray_hll_init(sh, c->p) != 0) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);   /* ray_count_distinct_approx turns this into an "oom" error */
+            return;
+        }
+    }
+    const ray_t* v = c->vec;
+    const void* base = ray_data((ray_t*)v);
+    int8_t  t = c->type;
+    bool    hn = c->has_nulls;
+    const int64_t CHK = 65535;   /* mask: ray_interrupted() is polled once per 65536 rows */
+
+    if (t == RAY_I64 || t == RAY_TIMESTAMP) {
+        const int64_t* d = (const int64_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;   /* interrupted: stop early, shard stays partial */
+            int64_t v_i = d[r];
+            if (hn && v_i == NULL_I64) continue;   /* null sentinel — not counted */
+            ray_hll_add(sh, ray_hash_i64(v_i));
+        }
+    } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) {
+        const int32_t* d = (const int32_t*)base;   /* NOTE(review): RAY_TIME is read as 32-bit here, while csv_hash_elem_size (src/io/csv.c) sizes it at 8 bytes — confirm the width */
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int32_t v_i = d[r];
+            if (hn && v_i == NULL_I32) continue;
+            ray_hll_add(sh, ray_hash_i64((int64_t)v_i));   /* widened so every integer width shares ray_hash_i64 */
+        }
+    } else if (t == RAY_I16) {
+        const int16_t* d = (const int16_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int16_t v_i = d[r];
+            if (hn && v_i == NULL_I16) continue;
+            ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+        }
+    } else if (t == RAY_BOOL || t == RAY_U8) {
+        const uint8_t* d = (const uint8_t*)base;   /* no null sentinel is checked for the 1-byte types */
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            ray_hll_add(sh, ray_hash_i64((int64_t)d[r]));
+        }
+    } else if (t == RAY_F64) {
+        const double* d = (const double*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            double v_f = d[r];
+            if (v_f != v_f) continue;     /* NaN = null in F64 column */
+            ray_hll_add(sh, ray_hash_f64(v_f));
+        }
+    } else if (RAY_IS_SYM(t)) {
+        /* SYM is width-encoded — sym id 0 is the canonical empty-string
+         * sentinel (treat as null), every other id is a real distinct
+         * value, so hash the id directly. */
+        uint8_t w = c->attrs & RAY_SYM_W_MASK;   /* storage width of the sym ids */
+        if (w == RAY_SYM_W64) {
+            const int64_t* d = (const int64_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                int64_t v_i = d[r];
+                if (v_i == 0) continue;
+                ray_hll_add(sh, ray_hash_i64(v_i));
+            }
+        } else if (w == RAY_SYM_W32) {
+            const uint32_t* d = (const uint32_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                uint32_t v_i = d[r];
+                if (v_i == 0) continue;
+                ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+            }
+        } else if (w == RAY_SYM_W16) {
+            const uint16_t* d = (const uint16_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                uint16_t v_i = d[r];
+                if (v_i == 0) continue;
+                ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+            }
+        } else {   /* any other width value is read as 1-byte ids */
+            const uint8_t* d = (const uint8_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                uint8_t v_i = d[r];
+                if (v_i == 0) continue;
+                ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+            }
+        }
+    } else if (t == RAY_STR) {
+        ray_t* vm = (ray_t*)v;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            size_t n = 0;
+            const char* s = ray_str_vec_get(vm, r, &n);
+            if (!s || n == 0) continue;   /* NULL pointer or empty string is not counted */
+            ray_hll_add(sh, ray_hash_bytes(s, n));
+        }
+    }
+    /* Unsupported types fall through silently — caller validates. */
+}
+
+ray_t* ray_count_distinct_approx(ray_t* x) {
+    /* Approximate distinct count of x as an i64 atom: one HLL shard per worker,
+     * merged on the calling thread.  NULL / error inputs are returned unchanged. */
+    if (!x || RAY_IS_ERR(x)) return x;
+    if (!ray_is_vec(x)) {
+        /* Scalar atom — distinct count is 1 (or 0 if null). */
+        if (ray_is_atom(x)) return ray_i64(RAY_ATOM_IS_NULL(x) ? 0 : 1);
+        return ray_error("type", "count_distinct_approx: vec expected");
+    }
+    int8_t t = x->type;
+    /* Reject types we don't hash. */
+    if (t != RAY_I64 && t != RAY_I32 && t != RAY_I16 && t != RAY_U8 &&
+        t != RAY_BOOL && t != RAY_F64 && t != RAY_DATE && t != RAY_TIME &&
+        t != RAY_TIMESTAMP && t != RAY_STR && !RAY_IS_SYM(t))
+        return ray_error("type", "count_distinct_approx: unsupported element type");
+    int64_t n = x->len;
+    if (n == 0) return ray_i64(0);
+
+    ray_pool_t* pool = ray_pool_get();
+    uint32_t nw = (pool && n >= RAY_PARALLEL_THRESHOLD)
+                  ? ray_pool_total_workers(pool) : 1;
+
+    ray_t* shards_hdr = NULL;
+    ray_hll_t* shards = (ray_hll_t*)scratch_calloc(
+        &shards_hdr, (size_t)nw * sizeof(ray_hll_t));
+    if (!shards) return ray_error("oom", NULL);
+
+    cda_scalar_ctx_t ctx = {
+        .vec = x,
+        .type = t,
+        .attrs = x->attrs,
+        .has_nulls = (x->attrs & RAY_ATTR_HAS_NULLS) != 0,
+        .shards = shards,
+        .p = RAY_HLL_DEFAULT_P,
+        .n_workers = nw,
+        .oom = 0,
+    };
+    if (nw > 1) {
+        ray_pool_dispatch(pool, cda_scalar_fn, &ctx, n);
+    } else {
+        cda_scalar_fn(&ctx, 0, 0, n);
+    }
+    if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) {
+        for (uint32_t w = 0; w < nw; w++) ray_hll_free(&shards[w]);
+        scratch_free(shards_hdr);
+        return ray_error("oom", "count_distinct_approx: HLL alloc failed");
+    }
+    /* Merge into the first initialised shard — shards[0] stays zeroed if worker 0 ran no range. */
+    ray_hll_t* acc = NULL;
+    for (uint32_t w = 0; w < nw; w++) {
+        if (!shards[w].regs) continue;   /* shard never touched by a task */
+        if (acc) ray_hll_merge(acc, &shards[w]); else acc = &shards[w];
+    }
+    int64_t est = acc ? ray_hll_estimate(acc) : 0;   /* no shard initialised → nothing was hashed */
+    for (uint32_t w = 0; w < nw; w++) ray_hll_free(&shards[w]);
+    scratch_free(shards_hdr);
+    return ray_i64(est);
+}
+
+/* ---- Per-group HLL --------------------------------------------------- */
+
+typedef struct {
+    const ray_t*   vec;          /* source column whose values are hashed */
+    int8_t         type;         /* element type of vec */
+    uint8_t        attrs;        /* vec attrs; the SYM width bits are read from here */
+    bool           has_nulls;    /* gates the NULL_I64 / NULL_I32 / NULL_I16 sentinel checks */
+    const int64_t* idx_buf;      /* row indices into vec, laid out group after group */
+    const int64_t* offsets;      /* offsets[g] = start of group g inside idx_buf */
+    const int64_t* counts;       /* per-group length — offsets has only n_groups entries */
+    uint8_t        p;            /* HLL precision used for every per-group sketch */
+    uint32_t       m;            /* 1 << p; not read by cda_pg_buf_task */
+    int64_t*       out;          /* out[g] receives group g's estimate */
+    _Atomic(int)   oom;          /* polled by the task and the entry point; the task itself never sets it */
+} cda_pg_buf_ctx_t;
+
+static void cda_pg_buf_task(void* raw, uint32_t worker_id, int64_t start, int64_t end) {
+    (void)worker_id;   /* unused: each task works on its own stack sketch */
+    cda_pg_buf_ctx_t* c = (cda_pg_buf_ctx_t*)raw;   /* task body: estimates groups [start, end) one at a time */
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    const void* base = ray_data((ray_t*)c->vec);
+    int8_t  t  = c->type;
+    bool    hn = c->has_nulls;
+
+    /* One private HLL per task (allocated on stack so we never touch
+     * the shared scratch arena from a worker thread).  P≤14 → m≤16384,
+     * fits comfortably in the default 8 MiB worker stack.
+     *
+     * Sparse start: the sketch begins in sparse mode using sparse_buf
+     * (256 entries, 1 KB).  Groups with few distinct values never touch
+     * the dense register array; once the sparse cap is hit on a group,
+     * promote_to_dense moves it into the stack regs[] buffer.  The
+     * dense buffer is unconditionally allocated on the stack so the
+     * promotion path is alloc-free. */
+    uint8_t  regs[1u << 14];   /* dense buffer sized for the largest precision (p = 14) */
+    uint32_t sparse_buf[RAY_HLL_SPARSE_CAP];
+    ray_hll_t sk;
+
+    for (int64_t g = start; g < end; g++) {
+        ray_hll_init_sparse(&sk, c->p, sparse_buf,
+                            RAY_HLL_SPARSE_CAP, regs);   /* fresh empty sketch for every group */
+        int64_t s = c->offsets[g];
+        int64_t e = s + c->counts[g];   /* group g owns idx_buf[s .. e) */
+        if (t == RAY_I64 || t == RAY_TIMESTAMP) {
+            const int64_t* d = (const int64_t*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];   /* row index into the column; not range-checked here */
+                int64_t v = d[r];
+                if (hn && v == NULL_I64) continue;   /* null sentinel — not counted */
+                ray_hll_add(&sk, ray_hash_i64(v));
+            }
+        } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) {
+            const int32_t* d = (const int32_t*)base;   /* NOTE(review): RAY_TIME is read as 32-bit here, while csv_hash_elem_size (src/io/csv.c) sizes it at 8 bytes — confirm the width */
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                int32_t v = d[r];
+                if (hn && v == NULL_I32) continue;
+                ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+            }
+        } else if (t == RAY_I16) {
+            const int16_t* d = (const int16_t*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                int16_t v = d[r];
+                if (hn && v == NULL_I16) continue;
+                ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+            }
+        } else if (t == RAY_BOOL || t == RAY_U8) {
+            const uint8_t* d = (const uint8_t*)base;   /* no null sentinel is checked for the 1-byte types */
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                ray_hll_add(&sk, ray_hash_i64((int64_t)d[r]));
+            }
+        } else if (t == RAY_F64) {
+            const double* d = (const double*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                double v = d[r];
+                if (v != v) continue;   /* NaN is skipped */
+                ray_hll_add(&sk, ray_hash_f64(v));
+            }
+        } else if (RAY_IS_SYM(t)) {
+            uint8_t w = c->attrs & RAY_SYM_W_MASK;   /* storage width of the sym ids; id 0 is skipped in every width */
+            if (w == RAY_SYM_W64) {
+                const int64_t* d = (const int64_t*)base;
+                for (int64_t k = s; k < e; k++) {
+                    int64_t r = c->idx_buf[k];
+                    int64_t v = d[r]; if (v == 0) continue;
+                    ray_hll_add(&sk, ray_hash_i64(v));
+                }
+            } else if (w == RAY_SYM_W32) {
+                const uint32_t* d = (const uint32_t*)base;
+                for (int64_t k = s; k < e; k++) {
+                    int64_t r = c->idx_buf[k];
+                    uint32_t v = d[r]; if (v == 0) continue;
+                    ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+                }
+            } else if (w == RAY_SYM_W16) {
+                const uint16_t* d = (const uint16_t*)base;
+                for (int64_t k = s; k < e; k++) {
+                    int64_t r = c->idx_buf[k];
+                    uint16_t v = d[r]; if (v == 0) continue;
+                    ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+                }
+            } else {   /* any other width value is read as 1-byte ids */
+                const uint8_t* d = (const uint8_t*)base;
+                for (int64_t k = s; k < e; k++) {
+                    int64_t r = c->idx_buf[k];
+                    uint8_t v = d[r]; if (v == 0) continue;
+                    ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+                }
+            }
+        }
+        c->out[g] = ray_hll_estimate(&sk);   /* estimate for group g */
+    }
+}
+
+int ray_count_distinct_approx_pg_buf(ray_t* src,
+                                      const int64_t* idx_buf,
+                                      const int64_t* offsets,
+                                      const int64_t* counts,
+                                      int64_t n_groups,
+                                      uint8_t p, int64_t* out)
+{
+    /* Writes one approximate distinct count per group into out[0..n_groups).
+     * Returns -1 for a missing argument or an unsupported type, otherwise 0. */
+    if (!src || RAY_IS_ERR(src)) return -1;
+    if (!idx_buf || !offsets || !counts || !out) return -1;
+    const int8_t elem = src->type;
+    if (elem != RAY_I64 && elem != RAY_I32 && elem != RAY_I16 &&
+        elem != RAY_U8 && elem != RAY_BOOL && elem != RAY_F64 &&
+        elem != RAY_DATE && elem != RAY_TIME && elem != RAY_TIMESTAMP &&
+        !RAY_IS_SYM(elem))
+        return -1;
+    if (n_groups <= 0) return 0;
+    const uint8_t prec = (uint8_t)(p < 4 ? 4 : (p > 14 ? 14 : p));   /* clamp to [4, 14] */
+
+    cda_pg_buf_ctx_t job = {
+        .vec = src,
+        .type = elem,
+        .attrs = src->attrs,
+        .has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0,
+        .idx_buf = idx_buf,
+        .offsets = offsets,
+        .counts = counts,
+        .p = prec,
+        .m = 1u << prec,
+        .out = out,
+        .oom = 0,
+    };
+
+    ray_pool_t* pool = ray_pool_get();
+    const bool parallel = pool && ray_pool_total_workers(pool) >= 2 && n_groups >= 4;
+    /* Serial when there is no pool, fewer than two workers or fewer than
+     * four groups.  Otherwise dispatch_n issues exactly n_groups tasks of
+     * [i, i+1), but the task ring is hard-capped at 65536 and would silently
+     * drop trailing groups beyond that; larger group counts therefore use
+     * element-based dispatch, where each worker handles a range of groups
+     * serially and reuses its stack sketch across the range. */
+    if (!parallel) {
+        cda_pg_buf_task(&job, 0, 0, n_groups);
+    } else if (n_groups > 65536) {
+        ray_pool_dispatch(pool, cda_pg_buf_task, &job, n_groups);
+    } else {
+        ray_pool_dispatch_n(pool, cda_pg_buf_task, &job, (uint32_t)n_groups);
+    }
+
+    return atomic_load_explicit(&job.oom, memory_order_relaxed) ? -1 : 0;
+}
+
+/* ---- Streaming per-group HLL ----------------------------------------- */
+
+/* Streaming kernel layout
+ * -----------------------
+ * Each worker owns a contiguous *bank* of n_groups HLL sketches.  Memory
+ * for a bank is one slab allocated up-front (sketches + sparse keys +
+ * dense regs) so the per-row hot loop is alloc-free.  Each sketch starts
+ * sparse; ray_hll_add transparently promotes to its caller-owned dense
+ * buffer once the sparse cap is exceeded.
+ *
+ * After the streaming pass, banks are merged element-wise (max) into
+ * bank[0] and the per-group estimates are written to out[gid].
+ */
+
+typedef struct {
+    /* Per-worker bank base pointers.  Each bank holds n_groups sketches
+     * whose `sparse_keys` / dense slots point into the per-worker pool. */
+    ray_hll_t**      banks;          /* [n_workers] */
+    /* Constant inputs. */
+    const ray_t*     vec;            /* source column whose values are hashed */
+    const int64_t*   row_gid;        /* row_gid[r] = group id of row r; ids outside [0, n_groups) are skipped */
+    int64_t          n_rows;         /* row count; not read by cda_pg_stream_task */
+    int64_t          n_groups;       /* sketches per bank */
+    int8_t           type;           /* element type of vec */
+    uint8_t          attrs;          /* vec attrs; the SYM width bits are read from here */
+    bool             has_nulls;      /* gates the NULL_I64 / NULL_I32 / NULL_I16 sentinel checks */
+    uint8_t          p;              /* HLL precision; not read by cda_pg_stream_task */
+    uint32_t         m;              /* 1 << p; not read by cda_pg_stream_task */
+} cda_pg_stream_ctx_t;
+
+/* Worker per-row body — picks up the bank for this worker, decodes the
+ * column-type once into a local pointer, and updates bank[gid] for each
+ * row in the assigned range. */
+static void cda_pg_stream_task(void* raw, uint32_t worker_id,
+                               int64_t start, int64_t end) {
+    cda_pg_stream_ctx_t* c = (cda_pg_stream_ctx_t*)raw;
+    ray_hll_t* bank = c->banks[worker_id];   /* indexed directly — assumes worker_id < number of banks; TODO confirm against the pool */
+    if (!bank) return;
+    const void*    base    = ray_data((ray_t*)c->vec);
+    const int64_t* row_gid = c->row_gid;
+    int64_t        ng      = c->n_groups;
+    int8_t         t       = c->type;
+    bool           hn      = c->has_nulls;
+    const int64_t  CHK     = 65535;   /* mask: ray_interrupted() is polled once per 65536 rows */
+
+    if (t == RAY_I64 || t == RAY_TIMESTAMP) {
+        const int64_t* d = (const int64_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;   /* interrupted: stop early, bank stays partial */
+            int64_t gid = row_gid[r];
+            if (gid < 0 || gid >= ng) continue;   /* row carries no valid group id — ignored */
+            int64_t v = d[r];
+            if (hn && v == NULL_I64) continue;   /* null sentinel — not counted */
+            ray_hll_add(&bank[gid], ray_hash_i64(v));
+        }
+    } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) {
+        const int32_t* d = (const int32_t*)base;   /* NOTE(review): RAY_TIME is read as 32-bit here, while csv_hash_elem_size (src/io/csv.c) sizes it at 8 bytes — confirm the width */
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int64_t gid = row_gid[r];
+            if (gid < 0 || gid >= ng) continue;
+            int32_t v = d[r];
+            if (hn && v == NULL_I32) continue;
+            ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v));
+        }
+    } else if (t == RAY_I16) {
+        const int16_t* d = (const int16_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int64_t gid = row_gid[r];
+            if (gid < 0 || gid >= ng) continue;
+            int16_t v = d[r];
+            if (hn && v == NULL_I16) continue;
+            ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v));
+        }
+    } else if (t == RAY_BOOL || t == RAY_U8) {
+        const uint8_t* d = (const uint8_t*)base;   /* no null sentinel is checked for the 1-byte types */
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int64_t gid = row_gid[r];
+            if (gid < 0 || gid >= ng) continue;
+            ray_hll_add(&bank[gid], ray_hash_i64((int64_t)d[r]));
+        }
+    } else if (t == RAY_F64) {
+        const double* d = (const double*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int64_t gid = row_gid[r];
+            if (gid < 0 || gid >= ng) continue;
+            double v = d[r];
+            if (v != v) continue;   /* NaN is skipped */
+            ray_hll_add(&bank[gid], ray_hash_f64(v));
+        }
+    } else if (RAY_IS_SYM(t)) {
+        uint8_t w = c->attrs & RAY_SYM_W_MASK;   /* storage width of the sym ids; id 0 is skipped in every width */
+        if (w == RAY_SYM_W64) {
+            const int64_t* d = (const int64_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                int64_t gid = row_gid[r];
+                if (gid < 0 || gid >= ng) continue;
+                int64_t v = d[r]; if (v == 0) continue;
+                ray_hll_add(&bank[gid], ray_hash_i64(v));
+            }
+        } else if (w == RAY_SYM_W32) {
+            const uint32_t* d = (const uint32_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                int64_t gid = row_gid[r];
+                if (gid < 0 || gid >= ng) continue;
+                uint32_t v = d[r]; if (v == 0) continue;
+                ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v));
+            }
+        } else if (w == RAY_SYM_W16) {
+            const uint16_t* d = (const uint16_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                int64_t gid = row_gid[r];
+                if (gid < 0 || gid >= ng) continue;
+                uint16_t v = d[r]; if (v == 0) continue;
+                ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v));
+            }
+        } else {   /* any other width value is read as 1-byte ids */
+            const uint8_t* d = (const uint8_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                int64_t gid = row_gid[r];
+                if (gid < 0 || gid >= ng) continue;
+                uint8_t v = d[r]; if (v == 0) continue;
+                ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v));
+            }
+        }
+    }
+}
+
+int ray_count_distinct_approx_pg_stream(ray_t* src,
+                                         const int64_t* row_gid,
+                                         int64_t n_rows,
+                                         int64_t n_groups,
+                                         uint8_t p, int64_t* out)
+{
+    /* Hashes row r of src into group row_gid[r] and writes each group's estimate
+     * to out[g].  Returns 0, or -1 on bad arguments, unsupported type or OOM. */
+    if (!src || RAY_IS_ERR(src) || !row_gid || !out) return -1;
+    if (n_rows <= 0 || n_groups <= 0) return -1;
+    int8_t t = src->type;
+    bool hashable = (t == RAY_I64 || t == RAY_I32 || t == RAY_I16 ||
+                      t == RAY_U8 || t == RAY_BOOL || t == RAY_F64 ||
+                      t == RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP ||
+                      RAY_IS_SYM(t));
+    if (!hashable) return -1;
+    if (p < 4) p = 4;
+    if (p > 14) p = 14;
+    uint32_t m = 1u << p;
+    /* Each group costs one sketch + its sparse slice + its dense slice per
+     * bank; refuse group counts whose bank size would not fit in size_t
+     * (reachable on 32-bit targets), since a wrapped size under-allocates. */
+    size_t per_group = sizeof(ray_hll_t) +
+                       RAY_HLL_SPARSE_CAP * sizeof(uint32_t) + (size_t)m;
+    if ((uint64_t)n_groups > (uint64_t)((size_t)-1) / per_group) return -1;
+
+    /* Choose worker count from the existing parallel threshold; the pool
+     * dispatcher partitions n_rows into morsels across n_workers + main. */
+    ray_pool_t* pool = ray_pool_get();
+    uint32_t nw = (pool && n_rows >= RAY_PARALLEL_THRESHOLD)
+                  ? ray_pool_total_workers(pool) : 1;
+
+    /* Allocate per-worker banks.  One slab per worker: sketches array,
+     * then sparse-key pool (n_groups * RAY_HLL_SPARSE_CAP * 4 bytes),
+     * then dense-regs pool (n_groups * m bytes).  Pre-allocating dense
+     * means promotion in the hot loop is a memset + replay, alloc-free. */
+    ray_t* banks_hdr = NULL;
+    ray_hll_t** banks = (ray_hll_t**)scratch_calloc(
+        &banks_hdr, (size_t)nw * sizeof(ray_hll_t*));
+    if (!banks) return -1;
+
+    /* Per-worker scratch headers, freed at end. */
+    ray_t* slab_hdrs_hdr = NULL;
+    ray_t** slab_hdrs_array = (ray_t**)scratch_calloc(
+        &slab_hdrs_hdr, (size_t)nw * sizeof(ray_t*));
+    if (!slab_hdrs_array) {
+        scratch_free(banks_hdr);
+        return -1;
+    }
+
+    size_t sketches_bytes = (size_t)n_groups * sizeof(ray_hll_t);
+    size_t sparse_bytes   = (size_t)n_groups * RAY_HLL_SPARSE_CAP * sizeof(uint32_t);
+    size_t per_worker     = (size_t)n_groups * per_group;   /* sketches + sparse pool + dense pool */
+
+    bool oom = false;
+    for (uint32_t w = 0; w < nw; w++) {
+        ray_t* slab_hdr = NULL;
+        uint8_t* slab = (uint8_t*)scratch_alloc(&slab_hdr, per_worker);
+        if (!slab) { oom = true; break; }
+        slab_hdrs_array[w] = slab_hdr;
+        ray_hll_t* sketches = (ray_hll_t*)slab;
+        uint32_t*  sparse   = (uint32_t*)(slab + sketches_bytes);
+        uint8_t*   dense    = slab + sketches_bytes + sparse_bytes;
+        /* Init each sketch sparse, pointed at its slice of the pools. */
+        for (int64_t g = 0; g < n_groups; g++) {
+            ray_hll_init_sparse(&sketches[g], p,
+                                sparse + (size_t)g * RAY_HLL_SPARSE_CAP,
+                                RAY_HLL_SPARSE_CAP,
+                                dense + (size_t)g * m);
+        }
+        banks[w] = sketches;
+    }
+    if (oom) {
+        for (uint32_t w = 0; w < nw; w++) {
+            if (slab_hdrs_array[w]) scratch_free(slab_hdrs_array[w]);
+        }
+        scratch_free(slab_hdrs_hdr);
+        scratch_free(banks_hdr);
+        return -1;
+    }
+
+    cda_pg_stream_ctx_t ctx = {
+        .banks    = banks,
+        .vec      = src,
+        .row_gid  = row_gid,
+        .n_rows   = n_rows,
+        .n_groups = n_groups,
+        .type     = t,
+        .attrs    = src->attrs,
+        .has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0,
+        .p        = p,
+        .m        = m,
+    };
+
+    if (nw > 1) {
+        ray_pool_dispatch(pool, cda_pg_stream_task, &ctx, n_rows);
+    } else {
+        cda_pg_stream_task(&ctx, 0, 0, n_rows);
+    }
+
+    /* Merge worker banks into bank[0], then estimate per group.  Per gid,
+     * bank[1..nw-1][gid] is merged into bank[0][gid]; ray_hll_merge handles
+     * every (sparse|dense) × (sparse|dense) combination and promotes dst as
+     * needed.  Merging gid-by-gid (rather than worker-by-worker over all
+     * gids) keeps a finished dst hot across estimation. */
+    for (int64_t g = 0; g < n_groups; g++) {
+        ray_hll_t* dst = &banks[0][g];
+        for (uint32_t w = 1; w < nw; w++) {
+            ray_hll_merge(dst, &banks[w][g]);
+        }
+        out[g] = ray_hll_estimate(dst);
+    }
+
+    /* Free per-worker slabs.  The sparse + dense buffers live inside each
+     * slab, so ray_hll_free is expected to be a no-op per sketch; it is
+     * still called in case promote_to_dense attached an arena buffer to a
+     * sketch's _hdr, which ray_hll_free then releases. */
+    for (uint32_t w = 0; w < nw; w++) {
+        for (int64_t g = 0; g < n_groups; g++) ray_hll_free(&banks[w][g]);
+        scratch_free(slab_hdrs_array[w]);
+    }
+    scratch_free(slab_hdrs_hdr);
+    scratch_free(banks_hdr);
+    return 0;
+}
diff --git a/src/ops/hll.h b/src/ops/hll.h
new file mode 100644
index 00000000..b996d21b
--- /dev/null
+++ b/src/ops/hll.h
@@ -0,0 +1,220 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_OPS_HLL_H
+#define RAY_OPS_HLL_H
+
+/**
+ * Probabilistic cardinality sketch (HyperLogLog).
+ *
+ * Each sketch holds 2^P registers; each register stores the maximum
+ * leading-zero count (rho) seen for any hash whose top P bits index
+ * that register.  Cardinality is then read off the harmonic mean of
+ * 2^reg over all registers, with linear counting as the small-range
+ * correction.  Standard error ≈ 1.04 / sqrt(2^P).  P=14 → ≈ 0.8 %.
+ *
+ * Memory: 1 byte per register (rho never exceeds 64-P+1, so 8 bits is
+ * more than the 6 bits a packed implementation would need; the extra few
+ * KB buys a tighter hot loop).  At P=14 a sketch is 16 KB and lives
+ * in L2 for the duration of one query.
+ *
+ * Sparse representation:
+ *   Per-group HLL at high group counts wants to amortise the 16 KB
+ *   sketch across groups that may only see a handful of hashes each
+ *   (q13 SearchPhrase × UserID: many groups with < 50 uniques).  In
+ *   sparse mode the sketch stores only the registers that have been
+ *   written, as 32-bit `(reg_idx << 8) | rho` entries in a small
+ *   caller-provided buffer.  The estimate / merge paths transparently
+ *   support both modes; sparse converts to dense when the entry count
+ *   exceeds the cap (caller-supplied; the per-group kernel uses 256).
+ *
+ * The sketch is mergeable element-wise (max), which is the property
+ * the per-group / per-worker aggregation paths rely on: each worker
+ * builds a local sketch and the planner merges them at finalisation.
+ */
+
+#include "rayforce.h"
+#include "core/platform.h"
+#include "ops/hash.h"
+
+/* Default precision: 14 (16384 registers, ~0.81 % std error, 16 KB). */
+#define RAY_HLL_DEFAULT_P  14
+
+/* Sparse cap for per-group sketches.  Each entry is 4 bytes, so the
+ * sparse buffer is 1 KB at this cap — well inside L1 and 16× smaller
+ * than the dense register array.  Above the cap, sparse is converted
+ * to dense in place (caller supplies both buffers on the stack). */
+#define RAY_HLL_SPARSE_CAP 256
+
+typedef struct {
+    uint8_t   p;             /* precision: register count = 1 << p */
+    uint32_t  m;             /* register count */
+    uint8_t*  regs;          /* dense: [m] register array (NULL in sparse mode) */
+    /* Sparse mode (active when sparse_keys != NULL && regs == NULL):
+     * sparse_keys[i] = (reg_idx << 8) | rho — unsorted linear-probe set
+     * over reg_idx (rho updated in-place on duplicate idx). */
+    uint32_t* sparse_keys;   /* entry buffer handed to ray_hll_init_sparse by the caller */
+    uint32_t  sparse_count;  /* entries currently stored in sparse_keys */
+    uint32_t  sparse_cap;    /* entry limit; ray_hll_add promotes to dense once it is reached */
+    ray_t*    _hdr;          /* scratch handle for regs (sparse uses caller buf) */
+} ray_hll_t;
+
+/* Initialise an empty *dense* sketch with `p` precision bits.  Allocates
+ * regs via scratch_alloc; the caller frees with ray_hll_free.  Returns
+ * 0 on success, -1 on OOM. */
+int  ray_hll_init(ray_hll_t* h, uint8_t p);
+
+/* Initialise an empty *sparse* sketch with caller-provided buffers.
+ *   sparse_buf — buffer of size sparse_cap entries, used as the sparse
+ *                set until conversion to dense.
+ *   dense_buf  — buffer of size 1<<p bytes, populated on conversion.
+ * Both buffers are typically stack-allocated by the worker task.  The
+ * sketch starts sparse (regs == NULL).  No allocation occurs; this
+ * never fails.  Caller does not need to call ray_hll_free. */
+void ray_hll_init_sparse(ray_hll_t* h, uint8_t p,
+                          uint32_t* sparse_buf, uint32_t sparse_cap,
+                          uint8_t* dense_buf);
+
+/* Free the regs allocation.  Safe on a zeroed (uninitialised) sketch.
+ * Sparse sketches with caller-provided buffers have _hdr == NULL and
+ * are a no-op here — they're freed implicitly when the stack frame
+ * unwinds. */
+void ray_hll_free(ray_hll_t* h);
+
+/* Zero all registers (clears the sketch — same effect as init with the
+ * same p, but in-place; useful when reusing a sketch across calls).
+ * Resets to sparse mode if a sparse buffer is attached. */
+void ray_hll_reset(ray_hll_t* h);
+
+/* Sparse → dense conversion.  Replays sparse_keys into the (already-
+ * attached) dense buffer, zeros remaining registers, clears sparse_count.
+ * Out-of-line: only called when the sparse cap is hit. */
+void ray_hll_promote_to_dense(ray_hll_t* h);
+
+/* Add a 64-bit hash to the sketch.  Caller is responsible for hashing
+ * its value type before invoking — see ray_hash_i64 / ray_hash_bytes
+ * in ops/hash.h.  Hot path; kept fully inline.  Dense fast path is
+ * marked likely; the sparse arm is the fallback for per-group sketches
+ * that haven't yet exceeded RAY_HLL_SPARSE_CAP. */
+static inline void ray_hll_add(ray_hll_t* h, uint64_t hash) {
+    uint32_t idx = (uint32_t)(hash >> (64u - h->p));   /* top p bits pick the register; needs 1 <= p <= 63 or the shifts are undefined */
+    /* The low (64-p) bits hold the value we scan for the leading-zero
+     * run.  After the shift, a sentinel at bit (p-1) keeps the rho value
+     * in [1, 64-p+1] without a branch on all-zero. */
+    uint64_t rest = (hash << h->p) | (1ULL << (h->p - 1));
+    uint8_t  rho  = (uint8_t)(__builtin_clzll(rest) + 1u);
+
+    if (RAY_LIKELY(h->regs != NULL)) {   /* dense mode: keep the per-register maximum */
+        if (rho > h->regs[idx]) h->regs[idx] = rho;
+        return;
+    }
+    /* Sparse path — linear scan over up to RAY_HLL_SPARSE_CAP entries.
+     * Cap is small (256) so the inner loop is L1-resident; the compiler
+     * folds it into a SIMD-friendly compare-and-mask sequence. */
+    uint32_t* sk = h->sparse_keys;
+    uint32_t  n  = h->sparse_count;
+    uint32_t  enc = (idx << 8) | rho;   /* entry layout: register index above the low 8 rho bits */
+    for (uint32_t i = 0; i < n; i++) {
+        uint32_t cur = sk[i];
+        if ((cur >> 8) == idx) {
+            /* Same register — keep max rho. */
+            if (rho > (cur & 0xFF)) sk[i] = enc;
+            return;
+        }
+    }
+    if (n < h->sparse_cap) {   /* register not seen yet and there is room: append */
+        sk[n] = enc;
+        h->sparse_count = n + 1;
+        return;
+    }
+    /* Cap hit — promote and re-insert. */
+    ray_hll_promote_to_dense(h);
+    if (rho > h->regs[idx]) h->regs[idx] = rho;   /* relies on promotion having set h->regs */
+}
+
+/* Merge src into dst (element-wise max).  src and dst must share the
+ * same precision p.  Handles all four (dense/sparse)×(dense/sparse)
+ * combinations; sparse+sparse promotes dst to dense first so the
+ * merged sketch remains a valid dense register array. */
+void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src);
+
+/* Estimate the unique-value count of all hashes added so far.  Uses
+ * the standard HyperLogLog estimator with bias-corrected raw-mean for
+ * the mid-range and linear counting (m * ln(m/V)) when many registers
+ * are still zero (V = unused register count).  Branches on mode:
+ * dense scans the register array; sparse iterates the entry set and
+ * accounts for (m - sparse_count) unset registers analytically. */
+int64_t ray_hll_estimate(const ray_hll_t* h);
+
+/* Scalar approximate `count(distinct …)` over a vec, ~0.8 % standard
+ * error.  Handles I64/I32/I16/I8/U8/BOOL/F64/DATE/TIME/TIMESTAMP/SYM/
+ * STR.  Nulls are skipped (matches the SQL `count distinct` semantics).
+ * Parallelised: each worker builds a private sketch over its row range
+ * and the main thread merges them before extracting the estimate.
+ * Wired into `exec_count_distinct` above an input-row threshold. */
+ray_t* ray_count_distinct_approx(ray_t* x);
+
+/* Per-group approximate `count(distinct …)` over a buffered row-index
+ * layout: group g owns the row indices
+ *   idx_buf[offsets[g] .. offsets[g] + counts[g]).
+ * Parallelised across groups — each task uses a private stack-resident
+ * HLL that starts in sparse mode (1 KB) and converts to dense (16 KB)
+ * on overflow.  Sparse mode keeps the memset / estimate cost bounded
+ * by `min(unique_in_group, sparse_cap)` instead of m, which is the
+ * decisive win at high group counts where the average group has few
+ * unique values.
+ *
+ * Callers holding a row_gid layout instead build idx_buf+offsets+counts
+ * once and call this; there's a single per-group kernel.  Writes the
+ * estimate to out[gid].  Returns 0 on success, -1 on unsupported type
+ * (caller falls back to exact). */
+int ray_count_distinct_approx_pg_buf(ray_t* src,
+                                      const int64_t* idx_buf,
+                                      const int64_t* offsets,
+                                      const int64_t* counts,
+                                      int64_t n_groups,
+                                      uint8_t p, int64_t* out);
+
+/* Streaming per-group HLL — single pass over (row_gid[r], hashes[r])
+ * directly accumulating into n_groups sketches per worker, skipping
+ * the (idx_buf + offsets + counts) CSR scatter that the _pg_buf entry
+ * point requires.  Each worker owns a private bank of n_groups sparse
+ * sketches; after the pass, banks are merged element-wise (max) into
+ * worker 0's bank and the estimates are written to out[gid].
+ *
+ * Memory: per worker = n_groups * (sparse_cap*4 + (1<<p)) bytes; at
+ * p=14 that's ~17 KB per group.  Caller must gate on a memory budget
+ * — this kernel does not validate `n_groups` against available memory.
+ *
+ * Supported types: BOOL / U8 / I16 / I32 / I64 / F64 / DATE / TIME /
+ * TIMESTAMP / SYM.  Returns 0 on success, -1 on unsupported type,
+ * OOM, or empty input.  Caller falls back to _pg_buf (which itself
+ * falls back to exact partitioned dedup) on failure. */
+int ray_count_distinct_approx_pg_stream(ray_t* src,
+                                         const int64_t* row_gid,
+                                         int64_t n_rows,
+                                         int64_t n_groups,
+                                         uint8_t p,
+                                         int64_t* out);
+
+#endif /* RAY_OPS_HLL_H */
diff --git a/src/ops/idxop.c b/src/ops/idxop.c
index 3f74476b..65263971 100644
--- a/src/ops/idxop.c
+++ b/src/ops/idxop.c
@@ -29,8 +29,10 @@
 #include "table/sym.h"
 #include "lang/eval.h"
 #include "ops/ops.h"
+#include "ops/rowsel.h"
 #include <math.h>
 #include <string.h>
+#include <stdlib.h>
 
 /* Width of one element of a numeric vector type, or 0 if unsupported. */
 static int numeric_elem_size(int8_t t) {
@@ -154,6 +156,17 @@ void ray_index_release_payload(ray_index_t* ix) {
             ray_release(ix->u.bloom.bits);
         ix->u.bloom.bits = NULL;
         break;
+    case RAY_IDX_CHUNK_ZONE:
+        if (ix->u.chunk_zone.mins && !RAY_IS_ERR(ix->u.chunk_zone.mins))
+            ray_release(ix->u.chunk_zone.mins);
+        if (ix->u.chunk_zone.maxs && !RAY_IS_ERR(ix->u.chunk_zone.maxs))
+            ray_release(ix->u.chunk_zone.maxs);
+        if (ix->u.chunk_zone.null_bits && !RAY_IS_ERR(ix->u.chunk_zone.null_bits))
+            ray_release(ix->u.chunk_zone.null_bits);
+        ix->u.chunk_zone.mins = NULL;
+        ix->u.chunk_zone.maxs = NULL;
+        ix->u.chunk_zone.null_bits = NULL;
+        break;
     case RAY_IDX_ZONE:
     case RAY_IDX_NONE:
         break;
@@ -176,6 +189,14 @@ void ray_index_retain_payload(ray_index_t* ix) {
         if (ix->u.bloom.bits && !RAY_IS_ERR(ix->u.bloom.bits))
             ray_retain(ix->u.bloom.bits);
         break;
+    case RAY_IDX_CHUNK_ZONE:
+        if (ix->u.chunk_zone.mins && !RAY_IS_ERR(ix->u.chunk_zone.mins))
+            ray_retain(ix->u.chunk_zone.mins);
+        if (ix->u.chunk_zone.maxs && !RAY_IS_ERR(ix->u.chunk_zone.maxs))
+            ray_retain(ix->u.chunk_zone.maxs);
+        if (ix->u.chunk_zone.null_bits && !RAY_IS_ERR(ix->u.chunk_zone.null_bits))
+            ray_retain(ix->u.chunk_zone.null_bits);
+        break;
     case RAY_IDX_ZONE:
     case RAY_IDX_NONE:
         break;
@@ -262,6 +283,107 @@ static ray_err_t zone_scan(ray_t* v, ray_index_t* ix) {
     }
 }
 
+/* --------------------------------------------------------------------------
+ * Chunk-zone scan -- per-(1<<chunk_log2)-row min/max + null flag
+ *
+ * For each chunk g in [0, n_chunks) the scan computes the chunk's min and
+ * max value across its row range and sets the chunk's null-bit if any row
+ * in that chunk is a null sentinel.  Whole-column extrema fall out as
+ * min(mins[*]) / max(maxs[*]) so the reduce min/max path can consume this
+ * index without needing a separate column-wide zone.
+ * -------------------------------------------------------------------------- */
+
+static ray_err_t chunk_zone_scan_int(ray_t* v, ray_index_t* ix,
+                                     int elem_size) {
+    /* Integer/temporal flavour: widen every non-null element to int64 and
+     * record each chunk's extrema plus an "any null" bit. */
+    const uint8_t* data      = (const uint8_t*)ray_data(v);
+    int64_t*       lo_out    = (int64_t*)ray_data(ix->u.chunk_zone.mins);
+    int64_t*       hi_out    = (int64_t*)ray_data(ix->u.chunk_zone.maxs);
+    uint8_t*       null_map  = (uint8_t*)ray_data(ix->u.chunk_zone.null_bits);
+    const int64_t  total     = v->len;
+    const int64_t  chunk_len = 1LL << ix->u.chunk_zone.chunk_log2;
+    const uint32_t chunks    = ix->u.chunk_zone.n_chunks;
+
+    for (uint32_t c = 0; c < chunks; c++) {
+        const int64_t first = (int64_t)c * chunk_len;
+        const int64_t limit = (first + chunk_len > total) ? total : first + chunk_len;
+        /* Identity values: an all-null chunk stays at INT64_MAX / INT64_MIN
+         * so a later min(mins[*]) / max(maxs[*]) reduction ignores it. */
+        int64_t lo = INT64_MAX, hi = INT64_MIN;
+        bool saw_null = false;
+        for (int64_t r = first; r < limit; r++) {
+            if (ray_vec_is_null(v, r)) { saw_null = true; continue; }
+            int64_t x;
+            if (elem_size == 1)      { x = (int64_t)data[r]; }
+            else if (elem_size == 2) { int16_t w; memcpy(&w, data + r*2, 2); x = (int64_t)w; }
+            else if (elem_size == 4) { int32_t w; memcpy(&w, data + r*4, 4); x = (int64_t)w; }
+            else if (elem_size == 8) { memcpy(&x, data + r*8, 8); }
+            else                     { return RAY_ERR_TYPE; }
+            lo = (x < lo) ? x : lo;
+            hi = (x > hi) ? x : hi;
+        }
+        lo_out[c] = lo;
+        hi_out[c] = hi;
+        if (saw_null)
+            null_map[c >> 3] |= (uint8_t)(1u << (c & 7));
+    }
+    return RAY_OK;
+}
+
+static ray_err_t chunk_zone_scan_float(ray_t* v, ray_index_t* ix,
+                                       int elem_size) {
+    /* Floating-point flavour: F32 is widened to double; NaN elements are
+     * treated like nulls (flagged, excluded from the extrema). */
+    const uint8_t* data      = (const uint8_t*)ray_data(v);
+    double*        lo_out    = (double*)ray_data(ix->u.chunk_zone.mins);
+    double*        hi_out    = (double*)ray_data(ix->u.chunk_zone.maxs);
+    uint8_t*       null_map  = (uint8_t*)ray_data(ix->u.chunk_zone.null_bits);
+    const int64_t  total     = v->len;
+    const int64_t  chunk_len = 1LL << ix->u.chunk_zone.chunk_log2;
+    const uint32_t chunks    = ix->u.chunk_zone.n_chunks;
+
+    for (uint32_t c = 0; c < chunks; c++) {
+        const int64_t first = (int64_t)c * chunk_len;
+        const int64_t limit = (first + chunk_len > total) ? total : first + chunk_len;
+        /* +inf / -inf identities: empty chunks drop out of later min/max. */
+        double lo = INFINITY, hi = -INFINITY;
+        bool saw_null = false;
+        for (int64_t r = first; r < limit; r++) {
+            double x = 0.0;
+            bool skip = ray_vec_is_null(v, r);
+            if (!skip) {
+                if (elem_size == 4) { float f; memcpy(&f, data + r*4, 4); x = (double)f; }
+                else                { memcpy(&x, data + r*8, 8); }
+                skip = isnan(x);
+            }
+            if (skip) { saw_null = true; continue; }
+            lo = (x < lo) ? x : lo;
+            hi = (x > hi) ? x : hi;
+        }
+        lo_out[c] = lo;
+        hi_out[c] = hi;
+        if (saw_null) null_map[c >> 3] |= (uint8_t)(1u << (c & 7));
+    }
+    return RAY_OK;
+}
+
+static ray_err_t chunk_zone_scan(ray_t* v, ray_index_t* ix) {
+    /* Pick the scan kernel from the column type: floats go to the
+     * double-based scan, integer-like types to the int64-based scan
+     * with their storage width; anything else is RAY_ERR_NYI. */
+    const int ty = v->type;
+    if (ty == RAY_F32) return chunk_zone_scan_float(v, ix, 4);
+    if (ty == RAY_F64) return chunk_zone_scan_float(v, ix, 8);
+
+    int width = 0;
+    if      (ty == RAY_BOOL || ty == RAY_U8)                         width = 1;
+    else if (ty == RAY_I16)                                          width = 2;
+    else if (ty == RAY_I32 || ty == RAY_DATE)                        width = 4;
+    else if (ty == RAY_I64 || ty == RAY_TIME || ty == RAY_TIMESTAMP) width = 8;
+    return width ? chunk_zone_scan_int(v, ix, width) : RAY_ERR_NYI;
+}
+
 /* --------------------------------------------------------------------------
  * Attach
  *
@@ -335,6 +457,59 @@ ray_t* ray_index_attach_zone(ray_t** vp) {
     return attach_finalize(v, idx);
 }
 
+ray_t* ray_index_attach_chunk_zone(ray_t** vp, uint8_t chunk_log2) {
+    ray_t* v = prepare_attach(vp, "chunk_zone");
+    if (RAY_IS_ERR(v)) return v;
+
+    /* 0 selects the default of 64 K rows per chunk (shift = 16). */
+    const uint8_t shift = chunk_log2 ? chunk_log2 : 16;
+    if (shift < 8 || shift > 22)
+        return ray_error("domain", "chunk_zone: chunk_log2 out of range [8, 22]");
+    const int64_t chunk_len = 1LL << shift;
+    /* Columns shorter than one chunk are rejected, not indexed. */
+    if (v->len < chunk_len)
+        return ray_error("domain", "chunk_zone: column has fewer rows than one chunk");
+
+    const uint32_t chunks   = (uint32_t)((v->len + chunk_len - 1) / chunk_len);
+    const int      is_float = (v->type == RAY_F64 || v->type == RAY_F32);
+    ray_t* idx = ray_index_alloc(RAY_IDX_CHUNK_ZONE, v->type, v->len);
+    if (!idx || RAY_IS_ERR(idx)) return idx;
+    ray_index_t* ix = ray_index_payload(idx);
+    ix->u.chunk_zone.n_chunks   = chunks;
+    ix->u.chunk_zone.chunk_log2 = shift;
+    ix->u.chunk_zone.is_f64     = is_float ? 1 : 0;
+    /* Extrema are stored widened (F64 or I64); null map is 1 bit/chunk. */
+    const int8_t  stat_type = is_float ? RAY_F64 : RAY_I64;
+    const int64_t map_bytes = (int64_t)((chunks + 7) / 8);
+    ray_t* lo  = ray_vec_new(stat_type, (int64_t)chunks);
+    ray_t* hi  = ray_vec_new(stat_type, (int64_t)chunks);
+    ray_t* map = ray_vec_new(RAY_U8, map_bytes);
+    const int lo_ok  = lo  && !RAY_IS_ERR(lo);
+    const int hi_ok  = hi  && !RAY_IS_ERR(hi);
+    const int map_ok = map && !RAY_IS_ERR(map);
+    if (!(lo_ok && hi_ok && map_ok)) {
+        if (lo_ok)  ray_release(lo);
+        if (hi_ok)  ray_release(hi);
+        if (map_ok) ray_release(map);
+        ray_release(idx);
+        return ray_error("oom", "chunk_zone: arrays alloc");
+    }
+    lo->len  = (int64_t)chunks;
+    hi->len  = (int64_t)chunks;
+    map->len = map_bytes;
+    memset(ray_data(map), 0, (size_t)map_bytes);
+    ix->u.chunk_zone.mins      = lo;
+    ix->u.chunk_zone.maxs      = hi;
+    ix->u.chunk_zone.null_bits = map;
+
+    const ray_err_t rc = chunk_zone_scan(v, ix);
+    if (rc == RAY_OK) return attach_finalize(v, idx);
+    /* Scan failed: releasing idx also releases lo/hi/map via release_payload. */
+    ray_release(idx);
+    return ray_error(ray_err_code_str(rc),
+                     "chunk_zone scan failed for type %d", (int)v->type);
+}
+
 /* --------------------------------------------------------------------------
  * Hash index — chained open addressing
  *
@@ -399,6 +574,207 @@ ray_t* ray_index_attach_hash(ray_t** vp) {
     return attach_finalize(v, idx);
 }
 
+/* --------------------------------------------------------------------------
+ * Hash-index point-lookup probe — public entry point for the eq-filter
+ * fast path (ray_index_hash_eq_rowsel).
+ *
+ * Callers present the index with an int64 key; we mix64 it with the
+ * same hash the builder used, walk the bucket chain, collect matches,
+ * and emit a ray_rowsel sized for O(matches) memory (no intermediate
+ * row-wide BOOL pred vec).
+ *
+ * Type matrix.  An index built on column type T accepts a key only
+ * when T's storage width covers it without truncation — i.e. asking
+ * for `u8_col == 300` would never match, so we fail eligibility and
+ * the caller falls back to the scan (which folds out-of-range via
+ * fp_fold_t).  Float keys are not supported here — equality on
+ * F32/F64 has NaN / -0 semantics the unfused engine handles. */
+
+static int hash_key_in_range(int8_t t, int64_t k) {
+    /* A key is probe-eligible only if the column's storage width can
+     * represent it exactly; unsupported types are never eligible. */
+    if (t == RAY_I64 || t == RAY_TIME || t == RAY_TIMESTAMP) return 1;
+    if (t == RAY_I32 || t == RAY_DATE)
+        return INT32_MIN <= k && k <= INT32_MAX;
+    if (t == RAY_I16) return INT16_MIN <= k && k <= INT16_MAX;
+    if (t == RAY_BOOL || t == RAY_U8)
+        return 0 <= k && k <= UINT8_MAX;
+    return 0;
+}
+
+/* Read row `i` of a numeric column as int64 for equality compare. */
+static int64_t hash_col_read_i64(const uint8_t* base, int8_t t, int64_t i) {
+    /* 1-byte types are read unsigned; wider ones are memcpy'd out (no
+     * alignment assumption) and sign-extended.  Other types read as 0. */
+    if (t == RAY_BOOL || t == RAY_U8)
+        return (int64_t)base[i];
+    if (t == RAY_I16) {
+        int16_t w; memcpy(&w, base + i*2, 2);
+        return (int64_t)w;
+    }
+    if (t == RAY_I32 || t == RAY_DATE) {
+        int32_t w; memcpy(&w, base + i*4, 4);
+        return (int64_t)w;
+    }
+    if (t != RAY_I64 && t != RAY_TIME && t != RAY_TIMESTAMP)
+        return 0;
+    int64_t q; memcpy(&q, base + i*8, 8);
+    return q;
+}
+
+/* Validate eligibility, return the index payload + computed start row.
+ * On miss leaves *start = -1 so the caller can short-circuit. */
+static ray_index_t* hash_probe_setup(ray_t* col, int64_t key,
+                                     int64_t* start_rid) {
+    *start_rid = -1;
+    /* Eligibility: a live vec carrying a current, fully-built hash index
+     * whose column type can represent `key`. */
+    if (!col || RAY_IS_ERR(col) || !ray_is_vec(col)) return NULL;
+    if (!(col->attrs & RAY_ATTR_HAS_INDEX) || !col->index) return NULL;
+    ray_index_t* ix = ray_index_payload(col->index);
+    if (ix->kind != RAY_IDX_HASH) return NULL;
+    if (ix->built_for_len != col->len) return NULL;
+    if (!hash_key_in_range(col->type, key)) return NULL;
+    const int width = numeric_elem_size(col->type);
+    if (width == 0) return NULL;
+    if (!ix->u.hash.table || !ix->u.hash.chain) return NULL;
+
+    /* Canonical hash input = the key at storage width: zero-extended
+     * for 1-byte types, sign-extended for 2/4-byte, raw for 8-byte.
+     * Intended to mirror what the builder fed mix64 per row. */
+    uint64_t bits;
+    if (width == 1)      bits = (uint64_t)(uint8_t)key;
+    else if (width == 2) bits = (uint64_t)(int64_t)(int16_t)key;
+    else if (width == 4) bits = (uint64_t)(int64_t)(int32_t)key;
+    else                 bits = (uint64_t)key;
+
+    /* Slots appear to store (row id + 1), so a 0 slot becomes -1 and
+     * the caller's `rid >= 0` walk ends immediately — TODO confirm. */
+    const int64_t* heads = (const int64_t*)ray_data(ix->u.hash.table);
+    *start_rid = heads[mix64(bits) & ix->u.hash.mask] - 1;
+    return ix;
+}
+
+/* qsort comparator: ascending int64 row ids, used by the rowsel
+ * builder to put matches into per-segment order. */
+static int hash_match_cmp_i64(const void* a, const void* b) {
+    const int64_t lhs = *(const int64_t*)a, rhs = *(const int64_t*)b;
+    if (lhs < rhs) return -1;
+    return lhs > rhs ? 1 : 0;
+}
+
+ray_t* ray_index_hash_eq_rowsel(ray_t* col, int64_t key) {
+    int64_t rid = -1;
+    ray_index_t* ix = hash_probe_setup(col, key, &rid);
+    if (!ix) return NULL;
+
+    int64_t n = col->len;
+    /* Collect matching row ids.  The chain length is bounded by the
+     * bucket fill factor; for keys appearing rarely the bound is tight
+     * (~1 row).  For highly-duplicated keys it can degenerate to O(n)
+     * — but only if the value really occurs that many times, in which
+     * case the existing scan path also reads the same number of rows.
+     * We size the collect buffer dynamically; cap at n to bound memory
+     * in the pathological case. */
+    const int64_t* chn  = (const int64_t*)ray_data(ix->u.hash.chain);
+    const uint8_t* base = (const uint8_t*)ray_data(col);
+    int8_t t = col->type;
+
+    int64_t mcap = 16;
+    int64_t mcnt = 0;
+    ray_t* match_hdr = ray_alloc(mcap * (int64_t)sizeof(int64_t));
+    if (!match_hdr) return NULL;
+    int64_t* matches = (int64_t*)ray_data(match_hdr);
+
+    while (rid >= 0) {
+        if (hash_col_read_i64(base, t, rid) == key) {
+            if (mcnt == mcap) {
+                int64_t new_cap = mcap * 2;
+                if (new_cap > n) new_cap = n + 1;  /* defensive bound */
+                ray_t* new_hdr = ray_alloc(new_cap * (int64_t)sizeof(int64_t));
+                if (!new_hdr) { ray_release(match_hdr); return NULL; }
+                memcpy(ray_data(new_hdr), matches,
+                       (size_t)mcnt * sizeof(int64_t));
+                ray_release(match_hdr);
+                match_hdr = new_hdr;
+                matches = (int64_t*)ray_data(match_hdr);
+                mcap = new_cap;
+            }
+            matches[mcnt++] = rid;
+        }
+        rid = chn[rid] - 1;
+    }
+
+    /* Sort ascending so we can fill seg_flags / seg_offsets / idx[]
+     * in a single linear pass.  qsort dominates only when matches are
+     * many — in that case the hash probe itself is the larger cost
+     * and this is still O(matches log matches). */
+    if (mcnt > 1)
+        qsort(matches, (size_t)mcnt, sizeof(int64_t), hash_match_cmp_i64);
+
+    /* Allocate the rowsel sized for mcnt passing rows and mcnt idx[]
+     * slots.  A segment ends up NONE (no matches), MIX (some rows
+     * match) or ALL (every row in it matches).  ALL is not exotic:
+     * a short tail segment — e.g. a single trailing row that matches
+     * — qualifies, so the sweep below must handle it correctly.  ALL
+     * segments use no idx[] slots, so mcnt is an upper bound. */
+    ray_t* block = ray_rowsel_new(n, mcnt, mcnt);
+    if (!block) { ray_release(match_hdr); return NULL; }
+
+    uint32_t n_segs = ray_rowsel_meta(block)->n_segs;
+    uint8_t*  seg_flags   = ray_rowsel_flags(block);
+    uint32_t* seg_offsets = ray_rowsel_offsets(block);
+    uint16_t* idx_arr     = ray_rowsel_idx(block);
+
+    /* All segments default to NONE; the loop below flips MIX where
+     * a match lands.  ray_alloc does NOT zero the data area
+     * (only the 32-byte header), so explicit init is required. */
+    memset(seg_flags, RAY_SEL_NONE, (size_t)n_segs);
+    /* seg_offsets needs no pre-init: the sweep below stores entries
+     * [0, n_segs) and the line after the loop stores the terminating
+     * entry seg_offsets[n_segs]. */
+
+    /* Single sweep over the sorted matches: emit per-segment offsets
+     * and morsel-local indices into idx_arr.  `cum` is the number of
+     * idx[] slots committed so far; it only advances for MIX segments. */
+    int64_t mi = 0;
+    uint32_t cum = 0;
+    for (uint32_t s = 0; s < n_segs; s++) {
+        seg_offsets[s] = cum;
+        int64_t seg_start = (int64_t)s * RAY_MORSEL_ELEMS;
+        int64_t seg_end   = seg_start + RAY_MORSEL_ELEMS;
+        if (seg_end > n) seg_end = n;
+        uint32_t pc = 0;
+        while (mi < mcnt && matches[mi] < seg_end) {
+            idx_arr[cum + pc] = (uint16_t)(matches[mi] - seg_start);
+            pc++;
+            mi++;
+        }
+        if (pc == 0) {
+            seg_flags[s] = RAY_SEL_NONE;
+        } else if ((int64_t)pc == seg_end - seg_start) {
+            seg_flags[s] = RAY_SEL_ALL;
+            /* ALL segments contribute zero idx[] entries in the rowsel
+             * contract.  `cum` was never advanced for this segment, so
+             * leaving it untouched IS the rollback: the scratch writes
+             * at idx_arr[cum..cum+pc) are overwritten by the next MIX
+             * segment (idx_count was sized for all matches). */
+        } else {
+            seg_flags[s] = RAY_SEL_MIX;
+            cum += pc;
+        }
+    }
+    seg_offsets[n_segs] = cum;
+    /* ALL-segment rows count toward total_pass but not toward idx[]
+     * usage, so total_pass stays mcnt while only `cum` idx entries
+     * are live (seg_offsets[n_segs] records that). */
+    ray_rowsel_meta(block)->total_pass = mcnt;
+    (void)cum;
+
+    ray_release(match_hdr);
+    return block;
+}
+
 /* --------------------------------------------------------------------------
  * Sort index — ascending permutation of row ids
  *
@@ -540,11 +916,12 @@ ray_t* ray_index_drop(ray_t** vp) {
 
 static const char* kind_name(ray_idx_kind_t k) {
     switch (k) {
-    case RAY_IDX_HASH:  return "hash";
-    case RAY_IDX_SORT:  return "sort";
-    case RAY_IDX_ZONE:  return "zone";
-    case RAY_IDX_BLOOM: return "bloom";
-    default:            return "none";
+    case RAY_IDX_HASH:       return "hash";
+    case RAY_IDX_SORT:       return "sort";
+    case RAY_IDX_ZONE:       return "zone";
+    case RAY_IDX_BLOOM:      return "bloom";
+    case RAY_IDX_CHUNK_ZONE: return "chunk_zone";
+    default:                 return "none";
     }
 }
 
@@ -627,6 +1004,14 @@ ray_t* ray_index_info(ray_t* v) {
         r = dict_append_sym_i64(&keys, &vals, "n_keys", ix->u.bloom.n_keys);
         if (RAY_IS_ERR(r)) goto fail;
         break;
+    case RAY_IDX_CHUNK_ZONE:
+        r = dict_append_sym_i64(&keys, &vals, "n_chunks",
+                                (int64_t)ix->u.chunk_zone.n_chunks);
+        if (RAY_IS_ERR(r)) goto fail;
+        r = dict_append_sym_i64(&keys, &vals, "chunk_log2",
+                                (int64_t)ix->u.chunk_zone.chunk_log2);
+        if (RAY_IS_ERR(r)) goto fail;
+        break;
     case RAY_IDX_NONE:
         break;
     }
diff --git a/src/ops/idxop.h b/src/ops/idxop.h
index 2703ddea..025b51ce 100644
--- a/src/ops/idxop.h
+++ b/src/ops/idxop.h
@@ -47,11 +47,20 @@
 
 /* Index kinds.  Stored in ray_index_t.kind. */
 typedef enum {
-    RAY_IDX_NONE  = 0,
-    RAY_IDX_HASH  = 1,
-    RAY_IDX_SORT  = 2,
-    RAY_IDX_ZONE  = 3,
-    RAY_IDX_BLOOM = 4,
+    RAY_IDX_NONE       = 0,
+    RAY_IDX_HASH       = 1,
+    RAY_IDX_SORT       = 2,
+    RAY_IDX_ZONE       = 3,
+    RAY_IDX_BLOOM      = 4,
+    /* Per-chunk min/max + null bit, one entry per (1 << chunk_log2) rows.
+     * The whole-column zone is derivable as
+     *   min(chunk_mins)/max(chunk_maxs) over the entries, so this
+     *   subsumes RAY_IDX_ZONE wherever it's used in the reduce path.
+     * Built at column ingest (csv.read); read by the min/max reduce
+     * and by the predicate planner to skip chunks whose [min,max]
+     * provably excludes/includes the constant.  See chunk_zone arm
+     * of ray_index_t.u below. */
+    RAY_IDX_CHUNK_ZONE = 5,
 } ray_idx_kind_t;
 
 /* The payload stored inside data[] of a RAY_INDEX ray_t. */
@@ -99,6 +108,19 @@ typedef struct {
             uint32_t _pad;
             int64_t  n_keys;    /* number of non-null rows added */
         } bloom;
+        struct {                /* RAY_IDX_CHUNK_ZONE */
+            /* mins / maxs hold n_chunks entries.  For integer / temporal
+             * column types they are RAY_I64 vecs storing the per-chunk
+             * extrema as int64; for RAY_F32 / RAY_F64 columns they are
+             * RAY_F64 vecs (F32 widened).  is_f64 marks the float case. */
+            ray_t*   mins;
+            ray_t*   maxs;
+            ray_t*   null_bits;   /* RAY_U8 vec, packed: bit i = chunk i has any null */
+            uint32_t n_chunks;
+            uint8_t  chunk_log2;  /* chunk size = 1 << chunk_log2 (default 16 → 64 K rows) */
+            uint8_t  is_f64;
+            uint8_t  _pad[2];
+        } chunk_zone;
     } u;
 } ray_index_t;
 
@@ -118,6 +140,10 @@ ray_t* ray_index_attach_zone (ray_t** vp);
 ray_t* ray_index_attach_hash (ray_t** vp);
 ray_t* ray_index_attach_sort (ray_t** vp);
 ray_t* ray_index_attach_bloom(ray_t** vp);
+/* Build per-chunk min/max + null bit at chunk_size = 1 << chunk_log2
+ * (0 → default 16 = 64 K rows; else must lie in [8, 22]).  Needs a numeric
+ * or temporal vec of >= one chunk; SYM/STR/GUID return RAY_ERR_NYI. */
+ray_t* ray_index_attach_chunk_zone(ray_t** vp, uint8_t chunk_log2);
 
 /* Drop any attached index from *vp.  No-op if none.  Restores the
  * pre-attach nullmap state byte-for-byte.  Returns *vp. */
@@ -141,6 +167,31 @@ static inline ray_idx_kind_t ray_index_kind(const ray_t* v) {
  * or RAY_NULL_OBJ when no index is attached. */
 ray_t* ray_index_info(ray_t* v);
 
+/* ===== Hash-index point-lookup probe =====
+ *
+ * Build a ray_rowsel directly from a hash probe on `col`'s
+ * RAY_IDX_HASH for rows where the payload equals `key`.  Bypasses
+ * the intermediate BOOL pred vec entirely — touches O(matches)
+ * memory instead of O(rows), which is the whole reason to ship
+ * this fast path.
+ *
+ * Returns:
+ *   - A fresh rowsel block (rc=1) on success — install on
+ *     g->selection.  The block carries per-segment NONE/MIX/ALL
+ *     flags and the morsel-local indices for matching rows.
+ *     Pure NONE blocks (no matches) are returned as a valid empty
+ *     rowsel rather than NULL — NULL is the "all-pass" sentinel
+ *     in the consumer and would let every row through.
+ *   - NULL when the column is not eligible: no index, wrong kind,
+ *     built_for_len mismatch (stale), type mismatch, or out-of-
+ *     range key.  Caller must fall back to the full scan path.
+ *
+ * Eligibility (and the canonical hashing used) match
+ * ray_index_attach_hash: BOOL/U8/I16/I32/I64/DATE/TIME/TIMESTAMP.
+ * Floats are intentionally not supported — equality on F32/F64
+ * has NaN / -0 semantics the unfused compare kernel handles. */
+ray_t* ray_index_hash_eq_rowsel(ray_t* col, int64_t key);
+
 /* ===== Internal helpers (used by retain/release/detach in heap.c
  * and by mutation paths in vec.c) ===== */
 
diff --git a/src/ops/internal.h b/src/ops/internal.h
index 23975955..25fa9b2e 100644
--- a/src/ops/internal.h
+++ b/src/ops/internal.h
@@ -953,6 +953,20 @@ typedef struct {
     uint8_t agg_index;
     int64_t min_count_exclusive;
     int64_t top_count_take;
+    /* Agg op of the filtered agg.  When 0 (the default for the
+     * historical COUNT-only filter), consumers MUST treat it as
+     * OP_COUNT.  When non-zero, must equal ext->agg_ops[agg_index].
+     * Supported here: OP_COUNT, OP_SUM, OP_MIN, OP_MAX.  AVG and
+     * higher-order aggs (STDDEV/VAR/PEARSON/MEDIAN) are excluded
+     * because their ordering doesn't reduce to a single int64 read
+     * from the row slot — they fall through to the full sort + take. */
+    uint16_t agg_op;
+    /* Direction: 1 = top-N largest (desc), 0 = top-N smallest (asc).
+     * For COUNT/SUM/MAX the natural ordering is largest-first; for
+     * MIN it's smallest-first.  Both directions are supported per
+     * agg kind so `desc: min_value take: N` (the N groups with the
+     * largest min) is also expressible. */
+    uint8_t  desc;
 } ray_group_emit_filter_t;
 ray_group_emit_filter_t ray_group_emit_filter_get(void);
 void ray_group_emit_filter_set(ray_group_emit_filter_t filter);
diff --git a/src/ops/query.c b/src/ops/query.c
index 451d4baf..3b08415c 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -34,6 +34,7 @@
 #include "ops/rowsel.h"
 #include "ops/fused_group.h"
 #include "ops/fused_topk.h"
+#include "ops/hll.h"
 #include "ops/temporal.h"
 #include "core/profile.h"
 #include "table/sym.h"
@@ -87,147 +88,6 @@ static int64_t dict_key_id(ray_t* dict, const char* key) {
     return -1;
 }
 
-typedef struct {
-    ray_t*   tbl;
-    int64_t  nrows;
-    uint64_t hash;
-    uint64_t from_hash;
-    uint64_t env_gen;
-    ray_t*   result;
-} select_cache_entry_t;
-
-#define SELECT_CACHE_N 512
-static select_cache_entry_t g_select_cache[SELECT_CACHE_N];
-static uint16_t g_select_cache_next = 0;
-
-static uint64_t hash_mix_u64(uint64_t h, uint64_t v) {
-    h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2);
-    return h ? h : 0x9e3779b97f4a7c15ull;
-}
-
-static uint64_t ray_expr_hash(ray_t* x) {
-    if (!x) return 0x1234abcd5678ef00ull;
-    uint64_t h = hash_mix_u64(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type);
-    h = hash_mix_u64(h, (uint64_t)x->attrs);
-    h = hash_mix_u64(h, (x->type == -RAY_STR)
-                        ? (uint64_t)ray_str_len(x)
-                        : (uint64_t)x->len);
-    if (x->type == RAY_LIST) {
-        ray_t** elems = (ray_t**)ray_data(x);
-        for (int64_t i = 0; i < x->len; i++)
-            h = hash_mix_u64(h, ray_expr_hash(elems[i]));
-    } else if (x->type == RAY_DICT) {
-        ray_t* keys = ray_dict_keys(x);
-        ray_t* vals = ray_dict_vals(x);
-        h = hash_mix_u64(h, ray_expr_hash(keys));
-        h = hash_mix_u64(h, ray_expr_hash(vals));
-    } else if (x->type == RAY_STR) {
-        size_t n = 0;
-        const char* s = ray_str_vec_get(x, 0, &n);
-        for (size_t i = 0; s && i < n; i++)
-            h = hash_mix_u64(h, (unsigned char)s[i]);
-    } else if (x->type == -RAY_STR) {
-        const char* s = ray_str_ptr(x);
-        size_t n = ray_str_len(x);
-        for (size_t i = 0; s && i < n; i++)
-            h = hash_mix_u64(h, (unsigned char)s[i]);
-    } else if (x->type == RAY_SYM || x->type == -RAY_SYM ||
-               x->type == RAY_I64 || x->type == -RAY_I64 ||
-               x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) {
-        h = hash_mix_u64(h, (uint64_t)x->i64);
-    } else if (x->type == RAY_I32 || x->type == -RAY_I32 ||
-               x->type == RAY_DATE || x->type == -RAY_DATE ||
-               x->type == RAY_TIME || x->type == -RAY_TIME) {
-        h = hash_mix_u64(h, (uint64_t)(uint32_t)x->i32);
-    } else if (x->type == RAY_I16 || x->type == -RAY_I16) {
-        h = hash_mix_u64(h, (uint64_t)(uint16_t)x->i16);
-    } else if (x->type == RAY_U8 || x->type == -RAY_U8 ||
-               x->type == RAY_BOOL || x->type == -RAY_BOOL) {
-        h = hash_mix_u64(h, (uint64_t)x->u8);
-    } else if (x->type == RAY_F64 || x->type == -RAY_F64) {
-        uint64_t bits = 0;
-        memcpy(&bits, &x->f64, sizeof(bits));
-        h = hash_mix_u64(h, bits);
-    }
-    return h;
-}
-
-static ray_t* select_cache_get(ray_t* tbl, int64_t nrows,
-                               uint64_t hash, uint64_t from_hash) {
-    if (!g_ray_profile.active) return NULL;
-    if (!hash) return NULL;
-    for (uint16_t i = 0; i < SELECT_CACHE_N; i++) {
-        select_cache_entry_t* e = &g_select_cache[i];
-        if (e->result && e->env_gen == ray_env_generation() &&
-            e->nrows == nrows && e->hash == hash &&
-            (e->tbl == tbl || (from_hash && e->from_hash == from_hash))) {
-            ray_retain(e->result);
-            return e->result;
-        }
-    }
-    return NULL;
-}
-
-static void select_expr_cache_put(uint64_t hash, uint64_t from_hash,
-                                  ray_t* result);
-
-static void select_cache_put(ray_t* tbl, int64_t nrows,
-                             uint64_t hash, uint64_t from_hash,
-                             ray_t* result) {
-    if (!g_ray_profile.active) return;
-    if (!tbl || !hash || !result || RAY_IS_ERR(result)) return;
-    select_cache_entry_t* e =
-        &g_select_cache[g_select_cache_next++ % SELECT_CACHE_N];
-    if (e->result) ray_release(e->result);
-    e->tbl = tbl;
-    e->nrows = nrows;
-    e->hash = hash;
-    e->from_hash = from_hash;
-    e->env_gen = ray_env_generation();
-    e->result = result;
-    ray_retain(e->result);
-    select_expr_cache_put(hash, from_hash, result);
-}
-
-typedef struct {
-    uint64_t hash;
-    uint64_t from_hash;
-    uint64_t env_gen;
-    ray_t*   result;
-} select_expr_cache_entry_t;
-
-#define SELECT_EXPR_CACHE_N 1024
-static select_expr_cache_entry_t g_select_expr_cache[SELECT_EXPR_CACHE_N];
-static uint16_t g_select_expr_cache_next = 0;
-
-static ray_t* select_expr_cache_get(uint64_t hash, uint64_t from_hash) {
-    if (!g_ray_profile.active) return NULL;
-    if (!hash) return NULL;
-    for (uint16_t i = 0; i < SELECT_EXPR_CACHE_N; i++) {
-        select_expr_cache_entry_t* e = &g_select_expr_cache[i];
-        if (e->result && e->env_gen == ray_env_generation() &&
-            e->hash == hash && e->from_hash == from_hash) {
-            ray_retain(e->result);
-            return e->result;
-        }
-    }
-    return NULL;
-}
-
-static void select_expr_cache_put(uint64_t hash, uint64_t from_hash,
-                                  ray_t* result) {
-    if (!g_ray_profile.active) return;
-    if (!hash || !result || RAY_IS_ERR(result)) return;
-    select_expr_cache_entry_t* e =
-        &g_select_expr_cache[g_select_expr_cache_next++ % SELECT_EXPR_CACHE_N];
-    if (e->result) ray_release(e->result);
-    e->hash = hash;
-    e->from_hash = from_hash;
-    e->env_gen = ray_env_generation();
-    e->result = result;
-    ray_retain(e->result);
-}
-
 /* Flatten a RAY_DICT (keys SYM vec + vals LIST) into a transient
  * [k0,v0,k1,v1,...] array view so the existing dict-walking loops in
  * ray_select_fn et al. can iterate without rewriting every site.
@@ -565,14 +425,17 @@ static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n,
             rng->len = 2;
             ray_t* sliced = ray_take_fn(result, rng);
             ray_release(result);
-            ray_heap_gc();
+            /* No explicit GC here — every top-level statement (run_piped
+             * / repl) finishes with a ray_heap_gc() that catches the
+             * freed intermediates anyway.  The inner call only repeated
+             * that work, which showed up on benchmark loops where the
+             * same query runs back-to-back. */
             ray_release(rng);
             return sliced;
         }
         if (ray_is_vec(tv) && (tv->type == RAY_I64 || tv->type == RAY_I32) && tv->len == 2) {
             ray_t* sliced = ray_take_fn(result, tv);
             ray_release(result);
-            ray_heap_gc();
             ray_release(tv);
             return sliced;
         }
@@ -671,7 +534,9 @@ static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n,
                         }
                         if (topk && !RAY_IS_ERR(topk)) {
                             ray_release(result);
-                            ray_heap_gc();
+                            /* No explicit GC — the top-level statement
+                             * runner's ray_heap_gc() reclaims the freed
+                             * intermediates when the statement ends. */
                             return topk;
                         }
                         if (topk && RAY_IS_ERR(topk)) ray_release(topk);
@@ -1634,1260 +1499,6 @@ static int atom_i64_const(ray_t* v, int64_t* out) {
     }
 }
 
-typedef struct {
-    const void* base;
-    int8_t type;
-    uint8_t attrs;
-    int op;
-    int64_t rhs;
-} xbar_count_clause_t;
-
-typedef struct {
-    int64_t key;
-    int64_t count;
-} xbar_count_pair_t;
-
-typedef struct {
-    uint32_t key;
-    uint32_t count;
-} i16x2_count_pair_t;
-
-typedef struct {
-    int32_t key;
-    uint32_t count;
-} i32_count_pair_t;
-
-typedef struct {
-    int16_t key;
-    uint32_t count;
-} i16_count_pair_t;
-
-typedef struct {
-    const int64_t* key_data;
-    int64_t bucket;
-    xbar_count_clause_t clauses[16];
-    uint8_t n_clauses;
-    uint32_t cap;
-    int64_t* keys;
-    uint32_t* counts;
-    uint8_t* used;
-    _Atomic int overflow;
-} xbar_count_ctx_t;
-
-typedef struct {
-    const int16_t* key0;
-    const int16_t* key1;
-    xbar_count_clause_t clauses[16];
-    uint8_t n_clauses;
-    uint32_t cap;
-    uint32_t* keys;
-    uint32_t* counts;
-    uint8_t* used;
-    _Atomic int overflow;
-} i16x2_count_ctx_t;
-
-typedef struct {
-    const int16_t* key;
-    uint32_t* counts;
-} i16_ne0_count_ctx_t;
-
-typedef struct {
-    const int32_t* group;
-    const int64_t* distinct;
-    uint32_t cap;
-    int32_t* groups;
-    int64_t* values;
-    uint8_t* used;
-    _Atomic int overflow;
-} i32_i64_cd_ctx_t;
-
-static int xbar_count_pair_cmp(const void* a, const void* b) {
-    const xbar_count_pair_t* pa = (const xbar_count_pair_t*)a;
-    const xbar_count_pair_t* pb = (const xbar_count_pair_t*)b;
-    return (pa->key > pb->key) - (pa->key < pb->key);
-}
-
-static int i16x2_count_pair_desc_cmp(const void* a, const void* b) {
-    const i16x2_count_pair_t* pa = (const i16x2_count_pair_t*)a;
-    const i16x2_count_pair_t* pb = (const i16x2_count_pair_t*)b;
-    if (pa->count != pb->count)
-        return (pa->count < pb->count) - (pa->count > pb->count);
-    return (pa->key > pb->key) - (pa->key < pb->key);
-}
-
-static int i32_count_pair_desc_cmp(const void* a, const void* b) {
-    const i32_count_pair_t* pa = (const i32_count_pair_t*)a;
-    const i32_count_pair_t* pb = (const i32_count_pair_t*)b;
-    if (pa->count != pb->count)
-        return (pa->count < pb->count) - (pa->count > pb->count);
-    return (pa->key > pb->key) - (pa->key < pb->key);
-}
-
-static int i16_count_pair_desc_cmp(const void* a, const void* b) {
-    const i16_count_pair_t* pa = (const i16_count_pair_t*)a;
-    const i16_count_pair_t* pb = (const i16_count_pair_t*)b;
-    if (pa->count != pb->count)
-        return (pa->count < pb->count) - (pa->count > pb->count);
-    return (pa->key > pb->key) - (pa->key < pb->key);
-}
-
-static uint64_t xbar_count_hash_i64(int64_t v) {
-    uint64_t h = (uint64_t)v;
-    h ^= h >> 33;
-    h *= 0xff51afd7ed558ccdULL;
-    h ^= h >> 33;
-    h *= 0xc4ceb9fe1a85ec53ULL;
-    h ^= h >> 33;
-    return h;
-}
-
-static uint32_t count_hash_u32(uint32_t v) {
-    uint32_t h = v;
-    h ^= h >> 16;
-    h *= 0x7feb352dU;
-    h ^= h >> 15;
-    h *= 0x846ca68bU;
-    h ^= h >> 16;
-    return h;
-}
-
-static uint64_t count_hash_i32_i64(int32_t g, int64_t v) {
-    uint64_t h = (uint64_t)(uint32_t)g * 0x9E3779B97F4A7C15ULL;
-    uint64_t x = (uint64_t)v;
-    x ^= x >> 33;
-    x *= 0xff51afd7ed558ccdULL;
-    x ^= x >> 33;
-    h ^= x + 0xBF58476D1CE4E5B9ULL + (h << 6) + (h >> 2);
-    h ^= h >> 33;
-    return h;
-}
-
-static void xbar_count_worker_fn(void* raw, uint32_t worker_id,
-                                 int64_t start, int64_t end) {
-    xbar_count_ctx_t* ctx = (xbar_count_ctx_t*)raw;
-    uint32_t cap = ctx->cap;
-    uint32_t mask = cap - 1u;
-    int64_t* keys = ctx->keys + (size_t)worker_id * cap;
-    uint32_t* counts = ctx->counts + (size_t)worker_id * cap;
-    uint8_t* used = ctx->used + (size_t)worker_id * cap;
-    int64_t n_groups = 0;
-    int64_t bucket = ctx->bucket;
-
-    for (int64_t r = start; r < end; r++) {
-        uint8_t pass = 1;
-        for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) {
-            const xbar_count_clause_t* c = &ctx->clauses[ci];
-            int64_t v = read_col_i64(c->base, r, c->type, c->attrs);
-            if (c->op == 1) pass &= (uint8_t)(v == c->rhs);
-            else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs);
-            else pass &= (uint8_t)(v <= c->rhs);
-            if (!pass) break;
-        }
-        if (!pass) continue;
-        int64_t ts = ctx->key_data[r];
-        int64_t q = ts / bucket;
-        if ((ts ^ bucket) < 0 && q * bucket != ts) q--;
-        int64_t k = q * bucket;
-        uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask;
-        while (used[slot] && keys[slot] != k)
-            slot = (slot + 1u) & mask;
-        if (!used[slot]) {
-            if (n_groups >= (int64_t)(cap / 2)) {
-                atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed);
-                return;
-            }
-            used[slot] = 1;
-            keys[slot] = k;
-            n_groups++;
-        }
-        counts[slot]++;
-    }
-}
-
-static void i16x2_count_worker_fn(void* raw, uint32_t worker_id,
-                                  int64_t start, int64_t end) {
-    i16x2_count_ctx_t* ctx = (i16x2_count_ctx_t*)raw;
-    uint32_t cap = ctx->cap;
-    uint32_t mask = cap - 1u;
-    uint32_t* keys = ctx->keys + (size_t)worker_id * cap;
-    uint32_t* counts = ctx->counts + (size_t)worker_id * cap;
-    uint8_t* used = ctx->used + (size_t)worker_id * cap;
-    int64_t n_groups = 0;
-
-    for (int64_t r = start; r < end; r++) {
-        uint8_t pass = 1;
-        for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) {
-            const xbar_count_clause_t* c = &ctx->clauses[ci];
-            int64_t v = read_col_i64(c->base, r, c->type, c->attrs);
-            if (c->op == 1) pass &= (uint8_t)(v == c->rhs);
-            else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs);
-            else pass &= (uint8_t)(v <= c->rhs);
-            if (!pass) break;
-        }
-        if (!pass) continue;
-        uint32_t k = ((uint32_t)(uint16_t)ctx->key0[r] << 16) |
-                     (uint32_t)(uint16_t)ctx->key1[r];
-        uint32_t slot = count_hash_u32(k) & mask;
-        while (used[slot] && keys[slot] != k)
-            slot = (slot + 1u) & mask;
-        if (!used[slot]) {
-            if (n_groups >= (int64_t)(cap / 2)) {
-                atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed);
-                return;
-            }
-            used[slot] = 1;
-            keys[slot] = k;
-            n_groups++;
-        }
-        counts[slot]++;
-    }
-}
-
-static void i16_ne0_count_worker_fn(void* raw, uint32_t worker_id,
-                                    int64_t start, int64_t end) {
-    i16_ne0_count_ctx_t* ctx = (i16_ne0_count_ctx_t*)raw;
-    uint32_t* counts = ctx->counts + (size_t)worker_id * 65536u;
-    const int16_t* key = ctx->key;
-    for (int64_t r = start; r < end; r++) {
-        int16_t v = key[r];
-        if (v)
-            counts[(uint32_t)((int32_t)v + 32768)]++;
-    }
-}
-
-static void i32_i64_cd_worker_fn(void* raw, uint32_t worker_id,
-                                 int64_t start, int64_t end) {
-    i32_i64_cd_ctx_t* ctx = (i32_i64_cd_ctx_t*)raw;
-    uint32_t cap = ctx->cap;
-    uint32_t mask = cap - 1u;
-    int32_t* groups = ctx->groups + (size_t)worker_id * cap;
-    int64_t* values = ctx->values + (size_t)worker_id * cap;
-    uint8_t* used = ctx->used + (size_t)worker_id * cap;
-    int64_t n_filled = 0;
-
-    for (int64_t r = start; r < end; r++) {
-        int32_t g = ctx->group[r];
-        int64_t v = ctx->distinct[r];
-        uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & mask;
-        while (used[slot] && (groups[slot] != g || values[slot] != v))
-            slot = (slot + 1u) & mask;
-        if (!used[slot]) {
-            if (n_filled >= (int64_t)(cap * 7u / 10u)) {
-                atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed);
-                return;
-            }
-            used[slot] = 1;
-            groups[slot] = g;
-            values[slot] = v;
-            n_filled++;
-        }
-    }
-}
-
-static int sym_name_eq(int64_t sym, const char* name, size_t len) {
-    ray_t* s = ray_sym_str(sym);
-    return s && ray_str_len(s) == len &&
-           memcmp(ray_str_ptr(s), name, len) == 0;
-}
-
-static int parse_xbar_count_clause(ray_t* tbl, ray_t* expr,
-                                   xbar_count_clause_t* clauses,
-                                   uint8_t* n_clauses) {
-    if (!expr || expr->type != RAY_LIST || ray_len(expr) < 3) return 0;
-    ray_t** elems = (ray_t**)ray_data(expr);
-    if (!elems[0] || elems[0]->type != -RAY_SYM) return 0;
-    ray_t* head = ray_sym_str(elems[0]->i64);
-    if (!head) return 0;
-    const char* hn = ray_str_ptr(head);
-    size_t hl = ray_str_len(head);
-    if (hl == 3 && memcmp(hn, "and", 3) == 0) {
-        for (int64_t i = 1; i < ray_len(expr); i++)
-            if (!parse_xbar_count_clause(tbl, elems[i], clauses, n_clauses))
-                return 0;
-        return 1;
-    }
-    if (ray_len(expr) != 3 || *n_clauses >= 16) return 0;
-    int op = 0;
-    if (hl == 2 && memcmp(hn, "==", 2) == 0) op = 1;
-    else if (hl == 2 && memcmp(hn, ">=", 2) == 0) op = 2;
-    else if (hl == 2 && memcmp(hn, "<=", 2) == 0) op = 3;
-    else return 0;
-
-    ray_t* lhs = elems[1];
-    ray_t* rhs = elems[2];
-    int64_t rhs_i = 0;
-    if (!lhs || lhs->type != -RAY_SYM || !(lhs->attrs & RAY_ATTR_NAME) ||
-        !atom_i64_const(rhs, &rhs_i))
-        return 0;
-    ray_t* col = ray_table_get_col(tbl, lhs->i64);
-    if (!col || !ray_is_vec(col) || RAY_IS_PARTED(col->type) ||
-        col->type == RAY_MAPCOMMON || (col->attrs & RAY_ATTR_HAS_NULLS))
-        return 0;
-    int8_t ct = col->type;
-    if (ct != RAY_BOOL && ct != RAY_U8 && ct != RAY_I16 &&
-        ct != RAY_I32 && ct != RAY_I64 && ct != RAY_DATE &&
-        ct != RAY_TIME && ct != RAY_TIMESTAMP)
-        return 0;
-    clauses[*n_clauses] = (xbar_count_clause_t){
-        .base = ray_data(col),
-        .type = ct,
-        .attrs = col->attrs,
-        .op = op,
-        .rhs = rhs_i,
-    };
-    (*n_clauses)++;
-    return 1;
-}
-
-static int count_clause_score(const xbar_count_clause_t* c) {
-    if (c->op == 1 && ray_sym_elem_size(c->type, c->attrs) >= 8) return 0;
-    if (c->op == 1) return 1;
-    return 2;
-}
-
-static void order_count_clauses(xbar_count_clause_t* clauses, uint8_t n) {
-    for (uint8_t i = 1; i < n; i++) {
-        xbar_count_clause_t v = clauses[i];
-        int vs = count_clause_score(&v);
-        uint8_t j = i;
-        while (j > 0 && count_clause_score(&clauses[j - 1]) > vs) {
-            clauses[j] = clauses[j - 1];
-            j--;
-        }
-        clauses[j] = v;
-    }
-}
-
-static int xbar_clause_cache_eq(const xbar_count_clause_t* a, uint8_t an,
-                                const xbar_count_clause_t* b, uint8_t bn) {
-    if (an != bn) return 0;
-    for (uint8_t i = 0; i < an; i++) {
-        if (a[i].base != b[i].base || a[i].type != b[i].type ||
-            a[i].attrs != b[i].attrs || a[i].op != b[i].op ||
-            a[i].rhs != b[i].rhs)
-            return 0;
-    }
-    return 1;
-}
-
-static int match_i16_key_ne_zero(ray_t* where_expr, int64_t key_sym) {
-    if (!where_expr || where_expr->type != RAY_LIST || ray_len(where_expr) != 3)
-        return 0;
-    ray_t** e = (ray_t**)ray_data(where_expr);
-    if (!e[0] || e[0]->type != -RAY_SYM ||
-        !sym_name_eq(e[0]->i64, "!=", 2))
-        return 0;
-    ray_t* lhs = e[1];
-    int64_t rhs = 0;
-    return lhs && lhs->type == -RAY_SYM && (lhs->attrs & RAY_ATTR_NAME) &&
-           lhs->i64 == key_sym && atom_i64_const(e[2], &rhs) && rhs == 0;
-}
-
-static ray_t* try_i16_ne0_count_desc_select(ray_t* tbl, ray_t* where_expr,
-                                            ray_t* by_expr, ray_t* take_expr,
-                                            ray_t** dict_elems,
-                                            int64_t dict_n,
-                                            int64_t from_id,
-                                            int64_t where_id,
-                                            int64_t by_id,
-                                            int64_t take_id,
-                                            int64_t asc_id,
-                                            int64_t desc_id,
-                                            int64_t nearest_id) {
-    if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr ||
-        !take_expr || by_expr->type != -RAY_SYM ||
-        !(by_expr->attrs & RAY_ATTR_NAME))
-        return NULL;
-    int64_t key_sym = by_expr->i64;
-    int64_t take_n = 0;
-    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024)
-        return NULL;
-    if (!match_i16_key_ne_zero(where_expr, key_sym))
-        return NULL;
-
-    int64_t count_alias = -1;
-    int saw_desc = 0;
-    int saw_key_projection = 0;
-    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
-        int64_t kid = dict_elems[i]->i64;
-        ray_t* v = dict_elems[i + 1];
-        if (kid == from_id || kid == where_id || kid == by_id ||
-            kid == take_id || kid == nearest_id)
-            continue;
-        if (kid == desc_id) {
-            if (!v || v->type != -RAY_SYM)
-                return NULL;
-            saw_desc = 1;
-            continue;
-        }
-        if (kid == asc_id) return NULL;
-        if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) &&
-            kid == key_sym && v->i64 == key_sym) {
-            saw_key_projection = 1;
-            continue;
-        }
-        if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2)
-            return NULL;
-        ray_t** ae = (ray_t**)ray_data(v);
-        if (!ae[0] || ae[0]->type != -RAY_SYM ||
-            !sym_name_eq(ae[0]->i64, "count", 5))
-            return NULL;
-        ray_t* arg = ae[1];
-        if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME) ||
-            arg->i64 != key_sym)
-            return NULL;
-        count_alias = kid;
-    }
-    if (!saw_desc || !saw_key_projection || count_alias < 0)
-        return NULL;
-
-    ray_t* col = ray_table_get_col(tbl, key_sym);
-    if (!col || !ray_is_vec(col) || col->type != RAY_I16 ||
-        (col->attrs & RAY_ATTR_HAS_NULLS))
-        return NULL;
-
-    static ray_t* cache_result = NULL;
-    static ray_t* cache_tbl = NULL;
-    static ray_t* cache_col = NULL;
-    static int64_t cache_len = -1;
-    static int64_t cache_key_sym = -1;
-    static int64_t cache_count_alias = -1;
-    static int64_t cache_take = -1;
-    if (cache_result && cache_tbl == tbl && cache_col == col &&
-        cache_len == col->len && cache_key_sym == key_sym &&
-        cache_count_alias == count_alias && cache_take == take_n) {
-        ray_retain(cache_result);
-        return cache_result;
-    }
-
-    ray_pool_t* pool = ray_pool_get();
-    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    if (nw == 0) nw = 1;
-    ray_t* counts_hdr = NULL;
-    uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
-        (size_t)nw * 65536u * sizeof(uint32_t));
-    if (!counts)
-        return ray_error("oom", NULL);
-
-    i16_ne0_count_ctx_t ctx = {
-        .key = (const int16_t*)ray_data(col),
-        .counts = counts,
-    };
-    int64_t nrows = ray_table_nrows(tbl);
-    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
-        ray_pool_dispatch(pool, i16_ne0_count_worker_fn, &ctx, nrows);
-    else
-        i16_ne0_count_worker_fn(&ctx, 0, 0, nrows);
-
-    i16_count_pair_t top[1024];
-    int64_t top_n = 0;
-    for (uint32_t s = 0; s < 65536u; s++) {
-        uint32_t total = 0;
-        for (uint32_t w = 0; w < nw; w++)
-            total += counts[(size_t)w * 65536u + s];
-        if (!total) continue;
-        i16_count_pair_t cand = {
-            .key = (int16_t)((int32_t)s - 32768),
-            .count = total,
-        };
-        if (top_n < take_n) {
-            top[top_n++] = cand;
-            continue;
-        }
-        int64_t min_i = 0;
-        for (int64_t i = 1; i < top_n; i++) {
-            if (top[i].count < top[min_i].count ||
-                (top[i].count == top[min_i].count && top[i].key > top[min_i].key))
-                min_i = i;
-        }
-        if (cand.count > top[min_i].count ||
-            (cand.count == top[min_i].count && cand.key < top[min_i].key))
-            top[min_i] = cand;
-    }
-    scratch_free(counts_hdr);
-    qsort(top, (size_t)top_n, sizeof(i16_count_pair_t),
-          i16_count_pair_desc_cmp);
-
-    int64_t out_n = top_n;
-    ray_t* key_out = ray_vec_new(RAY_I16, out_n);
-    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
-    if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) {
-        if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out);
-        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
-        return ray_error("oom", NULL);
-    }
-    key_out->len = out_n;
-    cnt_out->len = out_n;
-    int16_t* ko = (int16_t*)ray_data(key_out);
-    int64_t* co = (int64_t*)ray_data(cnt_out);
-    for (int64_t i = 0; i < out_n; i++) {
-        ko[i] = top[i].key;
-        co[i] = (int64_t)top[i].count;
-    }
-
-    ray_t* out = ray_table_new(2);
-    if (!out || RAY_IS_ERR(out)) {
-        ray_release(key_out); ray_release(cnt_out);
-        return out ? out : ray_error("oom", NULL);
-    }
-    out = ray_table_add_col(out, key_sym, key_out);
-    out = ray_table_add_col(out, count_alias, cnt_out);
-    ray_release(key_out); ray_release(cnt_out);
-    if (cache_result)
-        ray_release(cache_result);
-    cache_result = out;
-    cache_tbl = tbl;
-    cache_col = col;
-    cache_len = col->len;
-    cache_key_sym = key_sym;
-    cache_count_alias = count_alias;
-    cache_take = take_n;
-    ray_retain(cache_result);
-    return out;
-}
-
-static ray_t* try_i32_i64_count_distinct_select(ray_t* tbl, ray_t* where_expr,
-                                                ray_t* by_expr,
-                                                ray_t* take_expr,
-                                                ray_t** dict_elems,
-                                                int64_t dict_n,
-                                                int64_t from_id,
-                                                int64_t where_id,
-                                                int64_t by_id,
-                                                int64_t take_id,
-                                                int64_t asc_id,
-                                                int64_t desc_id,
-                                                int64_t nearest_id) {
-    if (!tbl || tbl->type != RAY_TABLE || where_expr || !by_expr ||
-        !take_expr || by_expr->type != -RAY_SYM ||
-        !(by_expr->attrs & RAY_ATTR_NAME))
-        return NULL;
-
-    int64_t take_n = 0;
-    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024)
-        return NULL;
-
-    int64_t group_sym = by_expr->i64;
-    int64_t distinct_sym = -1;
-    int64_t count_alias = -1;
-    int saw_desc = 0;
-    int saw_group_projection = 0;
-    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
-        int64_t kid = dict_elems[i]->i64;
-        ray_t* v = dict_elems[i + 1];
-        if (kid == from_id || kid == where_id || kid == by_id ||
-            kid == take_id || kid == nearest_id)
-            continue;
-        if (kid == desc_id) {
-            if (!v || v->type != -RAY_SYM)
-                return NULL;
-            saw_desc = 1;
-            continue;
-        }
-        if (kid == asc_id) return NULL;
-        if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) &&
-            kid == group_sym && v->i64 == group_sym) {
-            saw_group_projection = 1;
-            continue;
-        }
-        if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2)
-            return NULL;
-        ray_t** ae = (ray_t**)ray_data(v);
-        if (!ae[0] || ae[0]->type != -RAY_SYM ||
-            !sym_name_eq(ae[0]->i64, "count", 5))
-            return NULL;
-        ray_t* inner = ae[1];
-        if (!inner || inner->type != RAY_LIST || ray_len(inner) != 2)
-            return NULL;
-        ray_t** ie = (ray_t**)ray_data(inner);
-        if (!ie[0] || ie[0]->type != -RAY_SYM ||
-            !sym_name_eq(ie[0]->i64, "distinct", 8))
-            return NULL;
-        ray_t* arg = ie[1];
-        if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME))
-            return NULL;
-        distinct_sym = arg->i64;
-        count_alias = kid;
-    }
-    if (!saw_desc || !saw_group_projection || count_alias < 0 ||
-        distinct_sym < 0)
-        return NULL;
-
-    ray_t* gcol = ray_table_get_col(tbl, group_sym);
-    ray_t* dcol = ray_table_get_col(tbl, distinct_sym);
-    if (!gcol || !dcol || !ray_is_vec(gcol) || !ray_is_vec(dcol) ||
-        gcol->type != RAY_I32 || dcol->type != RAY_I64 ||
-        (gcol->attrs & RAY_ATTR_HAS_NULLS) ||
-        (dcol->attrs & RAY_ATTR_HAS_NULLS))
-        return NULL;
-
-    static ray_t* cache_result = NULL;
-    static ray_t* cache_tbl = NULL;
-    static int64_t cache_len = -1;
-    static int64_t cache_group_sym = -1;
-    static int64_t cache_distinct_sym = -1;
-    static int64_t cache_count_alias = -1;
-    static int64_t cache_take = -1;
-    if (cache_result && cache_tbl == tbl && cache_len == gcol->len &&
-        cache_group_sym == group_sym && cache_distinct_sym == distinct_sym &&
-        cache_count_alias == count_alias && cache_take == take_n) {
-        ray_retain(cache_result);
-        return cache_result;
-    }
-
-    int64_t nrows = ray_table_nrows(tbl);
-    ray_pool_t* pool = ray_pool_get();
-    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    if (nw == 0) nw = 1;
-    const uint32_t local_cap = 1u << 20;
-    ray_t *lg_hdr = NULL, *lv_hdr = NULL, *lu_hdr = NULL;
-    int32_t* lg = (int32_t*)scratch_calloc(&lg_hdr,
-        (size_t)nw * local_cap * sizeof(int32_t));
-    int64_t* lv = (int64_t*)scratch_calloc(&lv_hdr,
-        (size_t)nw * local_cap * sizeof(int64_t));
-    uint8_t* lu = (uint8_t*)scratch_calloc(&lu_hdr, (size_t)nw * local_cap);
-    if (!lg || !lv || !lu) {
-        if (lg_hdr) scratch_free(lg_hdr);
-        if (lv_hdr) scratch_free(lv_hdr);
-        if (lu_hdr) scratch_free(lu_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    i32_i64_cd_ctx_t ctx = {
-        .group = (const int32_t*)ray_data(gcol),
-        .distinct = (const int64_t*)ray_data(dcol),
-        .cap = local_cap,
-        .groups = lg,
-        .values = lv,
-        .used = lu,
-    };
-    atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed);
-    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
-        ray_pool_dispatch(pool, i32_i64_cd_worker_fn, &ctx, nrows);
-    else
-        i32_i64_cd_worker_fn(&ctx, 0, 0, nrows);
-    if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) {
-        scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
-        return NULL;
-    }
-
-    const uint32_t gcap = 1u << 23;
-    const uint32_t gmask = gcap - 1u;
-    ray_t *gg_hdr = NULL, *gv_hdr = NULL, *gu_hdr = NULL;
-    int32_t* gg = (int32_t*)scratch_calloc(&gg_hdr, (size_t)gcap * sizeof(int32_t));
-    int64_t* gv = (int64_t*)scratch_calloc(&gv_hdr, (size_t)gcap * sizeof(int64_t));
-    uint8_t* gu = (uint8_t*)scratch_calloc(&gu_hdr, (size_t)gcap);
-    if (!gg || !gv || !gu) {
-        scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
-        if (gg_hdr) scratch_free(gg_hdr);
-        if (gv_hdr) scratch_free(gv_hdr);
-        if (gu_hdr) scratch_free(gu_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    int64_t global_n = 0;
-    for (uint32_t w = 0; w < nw; w++) {
-        int32_t* wg = lg + (size_t)w * local_cap;
-        int64_t* wv = lv + (size_t)w * local_cap;
-        uint8_t* wu = lu + (size_t)w * local_cap;
-        for (uint32_t s = 0; s < local_cap; s++) {
-            if (!wu[s]) continue;
-            int32_t g = wg[s];
-            int64_t v = wv[s];
-            uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & gmask;
-            while (gu[slot] && (gg[slot] != g || gv[slot] != v))
-                slot = (slot + 1u) & gmask;
-            if (!gu[slot]) {
-                if (global_n >= (int64_t)(gcap * 7u / 10u)) {
-                    scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr);
-                    scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
-                    return NULL;
-                }
-                gu[slot] = 1;
-                gg[slot] = g;
-                gv[slot] = v;
-                global_n++;
-            }
-        }
-    }
-    scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
-
-    const uint32_t rcap = 4096;
-    const uint32_t rmask = rcap - 1u;
-    int32_t rkeys[4096];
-    uint32_t rcounts[4096];
-    uint8_t rused[4096];
-    memset(rused, 0, sizeof(rused));
-    int64_t region_n = 0;
-    for (uint32_t s = 0; s < gcap; s++) {
-        if (!gu[s]) continue;
-        int32_t g = gg[s];
-        uint32_t slot = count_hash_u32((uint32_t)g) & rmask;
-        while (rused[slot] && rkeys[slot] != g)
-            slot = (slot + 1u) & rmask;
-        if (!rused[slot]) {
-            if (region_n >= (int64_t)(rcap / 2)) {
-                scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr);
-                return NULL;
-            }
-            rused[slot] = 1;
-            rkeys[slot] = g;
-            rcounts[slot] = 0;
-            region_n++;
-        }
-        rcounts[slot]++;
-    }
-    scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr);
-
-    ray_t* pairs_hdr = NULL;
-    i32_count_pair_t* pairs = (i32_count_pair_t*)scratch_alloc(
-        &pairs_hdr, (size_t)region_n * sizeof(i32_count_pair_t));
-    if (!pairs && region_n > 0)
-        return ray_error("oom", NULL);
-    int64_t pi = 0;
-    for (uint32_t s = 0; s < rcap; s++) {
-        if (!rused[s]) continue;
-        pairs[pi++] = (i32_count_pair_t){ .key = rkeys[s], .count = rcounts[s] };
-    }
-    qsort(pairs, (size_t)region_n, sizeof(i32_count_pair_t),
-          i32_count_pair_desc_cmp);
-
-    int64_t out_n = region_n < take_n ? region_n : take_n;
-    ray_t* key_out = ray_vec_new(RAY_I32, out_n);
-    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
-    if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) {
-        if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out);
-        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
-        scratch_free(pairs_hdr);
-        return ray_error("oom", NULL);
-    }
-    key_out->len = out_n;
-    cnt_out->len = out_n;
-    int32_t* ko = (int32_t*)ray_data(key_out);
-    int64_t* co = (int64_t*)ray_data(cnt_out);
-    for (int64_t i = 0; i < out_n; i++) {
-        ko[i] = pairs[i].key;
-        co[i] = (int64_t)pairs[i].count;
-    }
-    scratch_free(pairs_hdr);
-
-    ray_t* out = ray_table_new(2);
-    if (!out || RAY_IS_ERR(out)) {
-        ray_release(key_out); ray_release(cnt_out);
-        return out ? out : ray_error("oom", NULL);
-    }
-    out = ray_table_add_col(out, group_sym, key_out);
-    out = ray_table_add_col(out, count_alias, cnt_out);
-    ray_release(key_out); ray_release(cnt_out);
-    if (cache_result)
-        ray_release(cache_result);
-    cache_result = out;
-    cache_tbl = tbl;
-    cache_len = gcol->len;
-    cache_group_sym = group_sym;
-    cache_distinct_sym = distinct_sym;
-    cache_count_alias = count_alias;
-    cache_take = take_n;
-    ray_retain(cache_result);
-    return out;
-}
-
-static ray_t* try_i16x2_count_desc_select(ray_t* tbl, ray_t* where_expr,
-                                          ray_t* by_expr, ray_t* take_expr,
-                                          ray_t** dict_elems, int64_t dict_n,
-                                          int64_t from_id, int64_t where_id,
-                                          int64_t by_id, int64_t take_id,
-                                          int64_t asc_id, int64_t desc_id,
-                                          int64_t nearest_id) {
-    if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr ||
-        !take_expr || by_expr->type != RAY_DICT)
-        return NULL;
-
-    int64_t take_n = 0;
-    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000)
-        return NULL;
-
-    DICT_VIEW_DECL(bv);
-    DICT_VIEW_OPEN(by_expr, bv);
-    if (DICT_VIEW_OVERFLOW(bv) || bv_n != 4) return NULL;
-    ray_t* key0_atom = bv[0];
-    ray_t* key0_val = bv[1];
-    ray_t* key1_atom = bv[2];
-    ray_t* key1_val = bv[3];
-    if (!key0_atom || key0_atom->type != -RAY_SYM ||
-        !key1_atom || key1_atom->type != -RAY_SYM ||
-        !key0_val || key0_val->type != -RAY_SYM ||
-        !key1_val || key1_val->type != -RAY_SYM ||
-        !(key0_val->attrs & RAY_ATTR_NAME) ||
-        !(key1_val->attrs & RAY_ATTR_NAME) ||
-        key0_atom->i64 != key0_val->i64 ||
-        key1_atom->i64 != key1_val->i64)
-        return NULL;
-
-    int64_t count_alias = -1;
-    int saw_desc = 0;
-    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
-        int64_t kid = dict_elems[i]->i64;
-        ray_t* v = dict_elems[i + 1];
-        if (kid == from_id || kid == where_id || kid == by_id ||
-            kid == take_id || kid == nearest_id)
-            continue;
-        if (kid == desc_id) {
-            if (!v || v->type != -RAY_SYM)
-                return NULL;
-            saw_desc = 1;
-            continue;
-        }
-        if (kid == asc_id) return NULL;
-        if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL;
-        ray_t** ae = (ray_t**)ray_data(v);
-        if (!ae[0] || ae[0]->type != -RAY_SYM ||
-            !sym_name_eq(ae[0]->i64, "count", 5))
-            return NULL;
-        count_alias = kid;
-    }
-    if (!saw_desc || count_alias < 0) return NULL;
-
-    ray_t* col0 = ray_table_get_col(tbl, key0_atom->i64);
-    ray_t* col1 = ray_table_get_col(tbl, key1_atom->i64);
-    if (!col0 || !col1 || !ray_is_vec(col0) || !ray_is_vec(col1) ||
-        col0->type != RAY_I16 || col1->type != RAY_I16 ||
-        (col0->attrs & RAY_ATTR_HAS_NULLS) ||
-        (col1->attrs & RAY_ATTR_HAS_NULLS))
-        return NULL;
-
-    xbar_count_clause_t clauses[16];
-    uint8_t n_clauses = 0;
-    if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) ||
-        n_clauses == 0)
-        return NULL;
-    order_count_clauses(clauses, n_clauses);
-
-    static ray_t* cache_result = NULL;
-    static ray_t* cache_tbl = NULL;
-    static ray_t* cache_col0 = NULL;
-    static ray_t* cache_col1 = NULL;
-    static int64_t cache_len = -1;
-    static int64_t cache_key0 = -1;
-    static int64_t cache_key1 = -1;
-    static int64_t cache_count_alias = -1;
-    static int64_t cache_take = -1;
-    static uint8_t cache_n_clauses = 0;
-    static xbar_count_clause_t cache_clauses[16];
-    if (cache_result && cache_tbl == tbl && cache_col0 == col0 &&
-        cache_col1 == col1 && cache_len == col0->len &&
-        cache_key0 == key0_atom->i64 && cache_key1 == key1_atom->i64 &&
-        cache_count_alias == count_alias && cache_take == take_n &&
-        xbar_clause_cache_eq(cache_clauses, cache_n_clauses,
-                             clauses, n_clauses)) {
-        ray_retain(cache_result);
-        return cache_result;
-    }
-
-    int64_t nrows = ray_table_nrows(tbl);
-    const uint32_t cap = 4096;
-    const uint32_t mask = cap - 1u;
-    ray_pool_t* pool = ray_pool_get();
-    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    if (nw == 0) nw = 1;
-
-    ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL;
-    uint32_t* keys = (uint32_t*)scratch_calloc(&keys_hdr,
-        (size_t)nw * cap * sizeof(uint32_t));
-    uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
-        (size_t)nw * cap * sizeof(uint32_t));
-    uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap);
-    if (!keys || !counts || !used) {
-        if (keys_hdr) scratch_free(keys_hdr);
-        if (counts_hdr) scratch_free(counts_hdr);
-        if (used_hdr) scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    i16x2_count_ctx_t ctx = {
-        .key0 = (const int16_t*)ray_data(col0),
-        .key1 = (const int16_t*)ray_data(col1),
-        .n_clauses = n_clauses,
-        .cap = cap,
-        .keys = keys,
-        .counts = counts,
-        .used = used,
-    };
-    memcpy(ctx.clauses, clauses, sizeof(clauses));
-    atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed);
-    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
-        ray_pool_dispatch(pool, i16x2_count_worker_fn, &ctx, nrows);
-    else
-        i16x2_count_worker_fn(&ctx, 0, 0, nrows);
-    if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) {
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        return NULL;
-    }
-
-    ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL;
-    uint32_t* mkeys = (uint32_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(uint32_t));
-    uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t));
-    uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap);
-    if (!mkeys || !mcounts || !mused) {
-        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
-        if (mkeys_hdr) scratch_free(mkeys_hdr);
-        if (mcounts_hdr) scratch_free(mcounts_hdr);
-        if (mused_hdr) scratch_free(mused_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    int64_t n_groups = 0;
-    for (uint32_t w = 0; w < nw; w++) {
-        uint32_t* wk = keys + (size_t)w * cap;
-        uint32_t* wc = counts + (size_t)w * cap;
-        uint8_t* wu = used + (size_t)w * cap;
-        for (uint32_t s = 0; s < cap; s++) {
-            if (!wu[s]) continue;
-            uint32_t k = wk[s];
-            uint32_t slot = count_hash_u32(k) & mask;
-            while (mused[slot] && mkeys[slot] != k)
-                slot = (slot + 1u) & mask;
-            if (!mused[slot]) {
-                if (n_groups >= (int64_t)(cap / 2)) {
-                    scratch_free(mkeys_hdr); scratch_free(mcounts_hdr);
-                    scratch_free(mused_hdr); scratch_free(keys_hdr);
-                    scratch_free(counts_hdr); scratch_free(used_hdr);
-                    return NULL;
-                }
-                mused[slot] = 1;
-                mkeys[slot] = k;
-                n_groups++;
-            }
-            mcounts[slot] += wc[s];
-        }
-    }
-
-    int64_t out_n = n_groups < take_n ? n_groups : take_n;
-    ray_t* pairs_hdr = NULL;
-    i16x2_count_pair_t* pairs = (i16x2_count_pair_t*)scratch_alloc(
-        &pairs_hdr, (size_t)n_groups * sizeof(i16x2_count_pair_t));
-    if (!pairs && n_groups > 0) {
-        scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
-        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-    int64_t pi = 0;
-    for (uint32_t s = 0; s < cap; s++) {
-        if (!mused[s]) continue;
-        pairs[pi++] = (i16x2_count_pair_t){ .key = mkeys[s], .count = mcounts[s] };
-    }
-    qsort(pairs, (size_t)n_groups, sizeof(i16x2_count_pair_t),
-          i16x2_count_pair_desc_cmp);
-
-    ray_t* key0_out = ray_vec_new(RAY_I16, out_n);
-    ray_t* key1_out = ray_vec_new(RAY_I16, out_n);
-    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
-    if (!key0_out || !key1_out || !cnt_out ||
-        RAY_IS_ERR(key0_out) || RAY_IS_ERR(key1_out) || RAY_IS_ERR(cnt_out)) {
-        if (key0_out && !RAY_IS_ERR(key0_out)) ray_release(key0_out);
-        if (key1_out && !RAY_IS_ERR(key1_out)) ray_release(key1_out);
-        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
-        scratch_free(pairs_hdr);
-        scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
-        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-    key0_out->len = out_n;
-    key1_out->len = out_n;
-    cnt_out->len = out_n;
-    int16_t* k0o = (int16_t*)ray_data(key0_out);
-    int16_t* k1o = (int16_t*)ray_data(key1_out);
-    int64_t* co = (int64_t*)ray_data(cnt_out);
-    for (int64_t i = 0; i < out_n; i++) {
-        uint32_t k = pairs[i].key;
-        k0o[i] = (int16_t)(uint16_t)(k >> 16);
-        k1o[i] = (int16_t)(uint16_t)k;
-        co[i] = (int64_t)pairs[i].count;
-    }
-    scratch_free(pairs_hdr);
-    scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
-    scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
-
-    ray_t* out = ray_table_new(3);
-    if (!out || RAY_IS_ERR(out)) {
-        ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out);
-        return out ? out : ray_error("oom", NULL);
-    }
-    out = ray_table_add_col(out, key0_atom->i64, key0_out);
-    out = ray_table_add_col(out, key1_atom->i64, key1_out);
-    out = ray_table_add_col(out, count_alias, cnt_out);
-    ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out);
-    if (cache_result)
-        ray_release(cache_result);
-    cache_result = out;
-    cache_tbl = tbl;
-    cache_col0 = col0;
-    cache_col1 = col1;
-    cache_len = col0->len;
-    cache_key0 = key0_atom->i64;
-    cache_key1 = key1_atom->i64;
-    cache_count_alias = count_alias;
-    cache_take = take_n;
-    cache_n_clauses = n_clauses;
-    memcpy(cache_clauses, clauses, sizeof(clauses));
-    ray_retain(cache_result);
-    return out;
-}
-
-static ray_t* try_xbar_count_select(ray_t* tbl, ray_t* where_expr,
-                                    ray_t* by_expr, ray_t* take_expr,
-                                    ray_t** dict_elems, int64_t dict_n,
-                                    int64_t from_id, int64_t where_id,
-                                    int64_t by_id, int64_t take_id,
-                                    int64_t asc_id, int64_t desc_id,
-                                    int64_t nearest_id) {
-    if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr ||
-        !take_expr)
-        return NULL;
-
-    int64_t take_n = 0;
-    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000)
-        return NULL;
-
-    if (!by_expr || by_expr->type != RAY_DICT) return NULL;
-    DICT_VIEW_DECL(bv);
-    DICT_VIEW_OPEN(by_expr, bv);
-    if (DICT_VIEW_OVERFLOW(bv) || bv_n != 2) return NULL;
-    ray_t* key_atom = bv[0];
-    ray_t* xbar_expr = bv[1];
-    if (!key_atom || key_atom->type != -RAY_SYM ||
-        !xbar_expr || xbar_expr->type != RAY_LIST ||
-        ray_len(xbar_expr) != 3)
-        return NULL;
-    ray_t** xe = (ray_t**)ray_data(xbar_expr);
-    if (!xe[0] || xe[0]->type != -RAY_SYM ||
-        !sym_name_eq(xe[0]->i64, "xbar", 4))
-        return NULL;
-    if (!xe[1] || xe[1]->type != -RAY_SYM ||
-        !(xe[1]->attrs & RAY_ATTR_NAME))
-        return NULL;
-    int64_t bucket = 0;
-    if (!atom_i64_const(xe[2], &bucket) || bucket <= 0) return NULL;
-
-    int64_t count_alias = -1;
-    int saw_asc = 0;
-    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
-        int64_t kid = dict_elems[i]->i64;
-        ray_t* v = dict_elems[i + 1];
-        if (kid == from_id || kid == where_id || kid == by_id ||
-            kid == take_id || kid == nearest_id)
-            continue;
-        if (kid == asc_id) {
-            if (!v || v->type != -RAY_SYM || v->i64 != key_atom->i64)
-                return NULL;
-            saw_asc = 1;
-            continue;
-        }
-        if (kid == desc_id) return NULL;
-        if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL;
-        ray_t** ae = (ray_t**)ray_data(v);
-        if (!ae[0] || ae[0]->type != -RAY_SYM ||
-            !sym_name_eq(ae[0]->i64, "count", 5))
-            return NULL;
-        count_alias = kid;
-    }
-    if (!saw_asc || count_alias < 0) return NULL;
-
-    ray_t* key_col = ray_table_get_col(tbl, xe[1]->i64);
-    if (!key_col || !ray_is_vec(key_col) || key_col->type != RAY_TIMESTAMP ||
-        RAY_IS_PARTED(key_col->type) || key_col->type == RAY_MAPCOMMON ||
-        (key_col->attrs & RAY_ATTR_HAS_NULLS))
-        return NULL;
-
-    xbar_count_clause_t clauses[16];
-    uint8_t n_clauses = 0;
-    if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) ||
-        n_clauses == 0)
-        return NULL;
-    order_count_clauses(clauses, n_clauses);
-
-    int64_t nrows = ray_table_nrows(tbl);
-    const int64_t* key_data = (const int64_t*)ray_data(key_col);
-    static ray_t* cache_result = NULL;
-    static ray_t* cache_tbl = NULL;
-    static ray_t* cache_key_col = NULL;
-    static int64_t cache_len = -1;
-    static int64_t cache_key_sym = -1;
-    static int64_t cache_out_sym = -1;
-    static int64_t cache_count_alias = -1;
-    static int64_t cache_bucket = -1;
-    static int64_t cache_take = -1;
-    static uint8_t cache_n_clauses = 0;
-    static xbar_count_clause_t cache_clauses[16];
-    if (cache_result && cache_tbl == tbl && cache_key_col == key_col &&
-        cache_len == key_col->len && cache_key_sym == xe[1]->i64 &&
-        cache_out_sym == key_atom->i64 && cache_count_alias == count_alias &&
-        cache_bucket == bucket && cache_take == take_n &&
-        xbar_clause_cache_eq(cache_clauses, cache_n_clauses,
-                             clauses, n_clauses)) {
-        ray_retain(cache_result);
-        return cache_result;
-    }
-    const uint32_t cap = 4096;
-    const uint32_t mask = cap - 1u;
-    ray_pool_t* pool = ray_pool_get();
-    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    if (nw == 0) nw = 1;
-    ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL;
-    int64_t* keys = (int64_t*)scratch_calloc(&keys_hdr,
-        (size_t)nw * cap * sizeof(int64_t));
-    uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
-        (size_t)nw * cap * sizeof(uint32_t));
-    uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap);
-    if (!keys || !counts || !used) {
-        if (keys_hdr) scratch_free(keys_hdr);
-        if (counts_hdr) scratch_free(counts_hdr);
-        if (used_hdr) scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    xbar_count_ctx_t ctx = {
-        .key_data = key_data,
-        .bucket = bucket,
-        .n_clauses = n_clauses,
-        .cap = cap,
-        .keys = keys,
-        .counts = counts,
-        .used = used,
-    };
-    memcpy(ctx.clauses, clauses, sizeof(clauses));
-    atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed);
-    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
-        ray_pool_dispatch(pool, xbar_count_worker_fn, &ctx, nrows);
-    else
-        xbar_count_worker_fn(&ctx, 0, 0, nrows);
-    if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) {
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        return NULL;
-    }
-
-    ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL;
-    int64_t* mkeys = (int64_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(int64_t));
-    uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t));
-    uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap);
-    if (!mkeys || !mcounts || !mused) {
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        if (mkeys_hdr) scratch_free(mkeys_hdr);
-        if (mcounts_hdr) scratch_free(mcounts_hdr);
-        if (mused_hdr) scratch_free(mused_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    int64_t n_groups = 0;
-    for (uint32_t w = 0; w < nw; w++) {
-        int64_t* wk = keys + (size_t)w * cap;
-        uint32_t* wc = counts + (size_t)w * cap;
-        uint8_t* wu = used + (size_t)w * cap;
-        for (uint32_t s = 0; s < cap; s++) {
-            if (!wu[s]) continue;
-            int64_t k = wk[s];
-            uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask;
-            while (mused[slot] && mkeys[slot] != k)
-                slot = (slot + 1u) & mask;
-            if (!mused[slot]) {
-                if (n_groups >= (int64_t)(cap / 2)) {
-                    scratch_free(mkeys_hdr);
-                    scratch_free(mcounts_hdr);
-                    scratch_free(mused_hdr);
-                    scratch_free(keys_hdr);
-                    scratch_free(counts_hdr);
-                    scratch_free(used_hdr);
-                    return NULL;
-                }
-                mused[slot] = 1;
-                mkeys[slot] = k;
-                n_groups++;
-            }
-            mcounts[slot] += wc[s];
-        }
-    }
-
-    int64_t out_n = n_groups < take_n ? n_groups : take_n;
-    ray_t* pairs_hdr = NULL;
-    xbar_count_pair_t* pairs = (xbar_count_pair_t*)scratch_alloc(
-        &pairs_hdr, (size_t)n_groups * sizeof(xbar_count_pair_t));
-    if (!pairs && n_groups > 0) {
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-    int64_t pi = 0;
-    for (uint32_t s = 0; s < cap; s++) {
-        if (!mused[s]) continue;
-        pairs[pi++] = (xbar_count_pair_t){ .key = mkeys[s], .count = mcounts[s] };
-    }
-    qsort(pairs, (size_t)n_groups, sizeof(xbar_count_pair_t),
-          xbar_count_pair_cmp);
-
-    ray_t* key_out = ray_vec_new(RAY_TIMESTAMP, out_n);
-    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
-    if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) {
-        if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out);
-        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
-        scratch_free(pairs_hdr);
-        scratch_free(mkeys_hdr);
-        scratch_free(mcounts_hdr);
-        scratch_free(mused_hdr);
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-    key_out->len = out_n;
-    cnt_out->len = out_n;
-    int64_t* ko = (int64_t*)ray_data(key_out);
-    int64_t* co = (int64_t*)ray_data(cnt_out);
-    for (int64_t i = 0; i < out_n; i++) {
-        ko[i] = pairs[i].key;
-        co[i] = pairs[i].count;
-    }
-    scratch_free(pairs_hdr);
-    scratch_free(mkeys_hdr);
-    scratch_free(mcounts_hdr);
-    scratch_free(mused_hdr);
-    scratch_free(keys_hdr);
-    scratch_free(counts_hdr);
-    scratch_free(used_hdr);
-
-    ray_t* out = ray_table_new(2);
-    if (!out || RAY_IS_ERR(out)) {
-        ray_release(key_out);
-        ray_release(cnt_out);
-        return out ? out : ray_error("oom", NULL);
-    }
-    out = ray_table_add_col(out, key_atom->i64, key_out);
-    out = ray_table_add_col(out, count_alias, cnt_out);
-    ray_release(key_out);
-    ray_release(cnt_out);
-    if (cache_result)
-        ray_release(cache_result);
-    cache_result = out;
-    cache_tbl = tbl;
-    cache_key_col = key_col;
-    cache_len = key_col->len;
-    cache_key_sym = xe[1]->i64;
-    cache_out_sym = key_atom->i64;
-    cache_count_alias = count_alias;
-    cache_bucket = bucket;
-    cache_take = take_n;
-    cache_n_clauses = n_clauses;
-    memcpy(cache_clauses, clauses, sizeof(clauses));
-    ray_retain(cache_result);
-    return out;
-}
-
 static int expr_affine_of_sym(ray_t* expr, int64_t sym, int64_t* bias) {
     if (!expr) return 0;
     if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME) &&
@@ -3123,22 +1734,39 @@ static bool match_group_desc_count_take(ray_t** dict_elems, int64_t dict_n,
                                         int64_t by_id, int64_t take_id,
                                         int64_t asc_id, int64_t desc_id,
                                         ray_group_emit_filter_t* out) {
+    /* Detects `(select … by … <asc:|desc:> AGGCOL take: N)` where AGGCOL
+     * is the name of an output agg col with op ∈ {COUNT, SUM, MIN, MAX}
+     * and N is a positive atom ≤ 1024.  Returns the filter pre-filled so
+     * the consumer (group/fused_group materialize) can heap-extract the
+     * top-N groups by AGGCOL.value before emitting rows.  AVG and
+     * higher-order aggs (STDDEV/VAR/PEARSON/MEDIAN) fall through — their
+     * ordering doesn't reduce to a single int64 row slot read.
+     *
+     * The 1024 cap matches the stack-resident heap budget shared by the
+     * three concrete consumer sites (mk_apply_count_emit_filter,
+     * v2_emit's per-partition compact, the n_keys>1 macro path).  Larger
+     * N drops through to the full sort + take so the heap doesn't
+     * overflow the stack. */
     ray_t* take_expr = NULL;
-    int64_t desc_name = -1;
+    int64_t order_name = -1;
+    uint8_t want_desc = 1;
+    bool seen_dir = false;
     for (int64_t i = 0; i + 1 < dict_n; i += 2) {
         int64_t kid = dict_elems[i]->i64;
         if (kid == take_id) take_expr = dict_elems[i + 1];
-        else if (kid == desc_id) {
+        else if (kid == desc_id || kid == asc_id) {
+            if (seen_dir) return false;  /* both asc: and desc: → ambiguous */
+            seen_dir = true;
             ray_t* v = dict_elems[i + 1];
             if (!v || v->type != -RAY_SYM) return false;
-            desc_name = v->i64;
-        } else if (kid == asc_id) {
-            return false;
+            order_name = v->i64;
+            want_desc = (kid == desc_id) ? 1 : 0;
         }
     }
     int64_t take_n = 0;
-    if (desc_name < 0 || !positive_take_i64(take_expr, &take_n))
+    if (order_name < 0 || !positive_take_i64(take_expr, &take_n))
         return false;
+    if (take_n > 1024) return false;
 
     uint8_t agg_index = 0;
     for (int64_t i = 0; i + 1 < dict_n; i += 2) {
@@ -3151,11 +1779,15 @@ static bool match_group_desc_count_take(ray_t** dict_elems, int64_t dict_n,
             continue;
         ray_t** ae = (ray_t**)ray_data(val);
         uint16_t op = resolve_agg_opcode(ae[0]->i64);
-        if (kid == desc_name && op == OP_COUNT) {
+        if (kid == order_name &&
+            (op == OP_COUNT || op == OP_SUM ||
+             op == OP_MIN   || op == OP_MAX)) {
             out->enabled = 1;
             out->agg_index = agg_index;
             out->min_count_exclusive = 0;
             out->top_count_take = take_n;
+            out->agg_op = op;
+            out->desc = want_desc;
             return true;
         }
         agg_index++;
@@ -4064,6 +2696,212 @@ static ray_t* query_materialize_parted_col(ray_t* col) {
     return flat;
 }
 
+/* Planner rewrite for `(select {K: K c: (count (distinct X)) from: T
+ * [where: W] by: K [desc: c take: N]})`.
+ *
+ * Original execution: outer group-by K builds idx_buf → per-group dedup
+ * over X (via cdpg_buf_par_fn or per-group HLL).  That pays the outer
+ * group-by + idx_buf scatter even when the per-group dedup is the
+ * dominant cost.
+ *
+ * Rewrite: group by (K, X) once — this deduplicates (K, X) tuples in a
+ * single pass that lands on the v2 multi-key kernel — then count rows
+ * per K on the (typically much smaller) dedup table.  For q08 on the
+ * 10M-row hits table, the (K, X) pass produces ~700 K tuples; the final
+ * group-by walks just that.
+ *
+ * Returns NULL on shape miss (caller falls through to the existing
+ * count-distinct path); returns a result table on success.  Gates:
+ *  - single scalar K column (not SYM, no nulls)
+ *  - cd_inner is a column ref X (not SYM, no nulls)
+ *  - K and X are integer/temporal (BOOL/U8/I16/I32/I64/DATE/TIME/TIMESTAMP)
+ *  - K + X ≤ 16 bytes packed (v2's wide-key cap)
+ *  - WHERE optional; if present, must be supported by the fused predicate
+ *  - asc/desc/take optional; asc/desc must name the cd output column */
+static ray_t* try_count_distinct_v2_rewrite(
+    ray_t* tbl,
+    ray_t* by_expr,
+    ray_t* where_expr,
+    ray_t** dict_elems, int64_t dict_n,
+    int64_t from_id, int64_t where_id, int64_t by_id,
+    int64_t take_id, int64_t asc_id, int64_t desc_id,
+    int64_t nearest_id)
+{
+    if (!tbl || tbl->type != RAY_TABLE) return NULL;
+    if (!by_expr || by_expr->type != -RAY_SYM ||
+        !(by_expr->attrs & RAY_ATTR_NAME))
+        return NULL;
+    int64_t K_sym = by_expr->i64;
+
+    /* Walk the dict — accept exactly one `(count (distinct col_ref))`
+     * agg and an optional identity key projection.  Any other agg /
+     * projection / take-on-something-else aborts the rewrite. */
+    int64_t cd_X_sym = -1;
+    int64_t cd_c_sym = -1;
+    int n_cd = 0, n_other = 0;
+    int64_t desc_col_sym = -1;  /* if desc:, its column-sym target */
+    int64_t asc_col_sym  = -1;
+    int     has_take = 0;
+    int64_t take_n   = -1;
+    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+        int64_t kid = dict_elems[i]->i64;
+        ray_t*  val = dict_elems[i + 1];
+        if (kid == from_id || kid == where_id || kid == by_id ||
+            kid == nearest_id) continue;
+        if (kid == take_id) {
+            int64_t v;
+            if (atom_i64_const(val, &v) && v > 0) {
+                has_take = 1;
+                take_n   = v;
+            } else {
+                return NULL;  /* non-trivial take */
+            }
+            continue;
+        }
+        if (kid == asc_id) {
+            if (val && val->type == -RAY_SYM && (val->attrs & RAY_ATTR_NAME))
+                asc_col_sym = val->i64;
+            else return NULL;
+            continue;
+        }
+        if (kid == desc_id) {
+            if (val && val->type == -RAY_SYM && (val->attrs & RAY_ATTR_NAME))
+                desc_col_sym = val->i64;
+            else return NULL;
+            continue;
+        }
+        ray_t* cd_inner = match_count_distinct(val);
+        if (cd_inner && cd_inner->type == -RAY_SYM &&
+            (cd_inner->attrs & RAY_ATTR_NAME))
+        {
+            cd_X_sym = cd_inner->i64;
+            cd_c_sym = kid;
+            n_cd++;
+        } else if (is_single_group_key_projection(by_expr, val)) {
+            /* identity key projection (e.g. {K: K}) — accepted, no-op */
+        } else {
+            n_other++;
+        }
+    }
+    if (n_cd != 1 || n_other > 0) return NULL;
+    if (cd_X_sym < 0 || cd_c_sym < 0) return NULL;
+
+    /* desc/asc must target the count output column. */
+    if (desc_col_sym >= 0 && desc_col_sym != cd_c_sym) return NULL;
+    if (asc_col_sym  >= 0 && asc_col_sym  != cd_c_sym) return NULL;
+    if (desc_col_sym >= 0 && asc_col_sym  >= 0) return NULL;
+
+    /* Type checks on K and X.  v2 multi-key composite path requires
+     * non-SYM, non-nullable, packed ≤ 16 bytes (wide-key cap). */
+    ray_t* K_col = ray_table_get_col(tbl, K_sym);
+    ray_t* X_col = ray_table_get_col(tbl, cd_X_sym);
+    if (!K_col || !X_col) return NULL;
+    int8_t kct = K_col->type, xct = X_col->type;
+    if (RAY_IS_PARTED(kct) || kct == RAY_MAPCOMMON) return NULL;
+    if (RAY_IS_PARTED(xct) || xct == RAY_MAPCOMMON) return NULL;
+    if (kct == RAY_SYM || xct == RAY_SYM) return NULL;
+    if (K_col->attrs & RAY_ATTR_HAS_NULLS) return NULL;
+    if (X_col->attrs & RAY_ATTR_HAS_NULLS) return NULL;
+    int K_esz = ray_sym_elem_size(kct, K_col->attrs);
+    int X_esz = ray_sym_elem_size(xct, X_col->attrs);
+    if (K_esz + X_esz > 16) return NULL;
+    /* Restrict to integer/temporal — matches mk_compile's accepted shapes. */
+    int kct_ok = (kct == RAY_BOOL || kct == RAY_U8 || kct == RAY_I16 ||
+                  kct == RAY_I32  || kct == RAY_I64 ||
+                  kct == RAY_DATE || kct == RAY_TIME || kct == RAY_TIMESTAMP);
+    int xct_ok = (xct == RAY_BOOL || xct == RAY_U8 || xct == RAY_I16 ||
+                  xct == RAY_I32  || xct == RAY_I64 ||
+                  xct == RAY_DATE || xct == RAY_TIME || xct == RAY_TIMESTAMP);
+    if (!kct_ok || !xct_ok) return NULL;
+
+    if (where_expr && !ray_fused_group_supported(where_expr, tbl))
+        return NULL;
+
+    /* === Inner pass: group by (K, X) on the source table === */
+    ray_graph_t* g_in = ray_graph_new(tbl);
+    if (!g_in) return NULL;
+    ray_t* K_name = ray_sym_str(K_sym);
+    ray_t* X_name = ray_sym_str(cd_X_sym);
+    if (!K_name || !X_name) { ray_graph_free(g_in); return NULL; }
+    ray_op_t* K_scan = ray_scan(g_in, ray_str_ptr(K_name));
+    ray_op_t* X_scan = ray_scan(g_in, ray_str_ptr(X_name));
+    if (!K_scan || !X_scan) { ray_graph_free(g_in); return NULL; }
+    ray_op_t* keys_in[2] = { K_scan, X_scan };
+    uint16_t  agg_ops_in[1] = { OP_COUNT };
+    ray_op_t* agg_ins_in[1] = { K_scan };  /* count agg input is irrelevant */
+    ray_op_t* inner;
+    if (where_expr) {
+        ray_op_t* pred = compile_expr_dag(g_in, where_expr);
+        if (!pred) { ray_graph_free(g_in); return NULL; }
+        inner = ray_filtered_group(g_in, pred, keys_in, 2,
+                                   agg_ops_in, agg_ins_in, 1);
+    } else {
+        inner = ray_group(g_in, keys_in, 2, agg_ops_in, agg_ins_in, 1);
+    }
+    if (!inner) { ray_graph_free(g_in); return NULL; }
+    ray_t* dedup = ray_execute(g_in, inner);
+    ray_graph_free(g_in);
+    if (!dedup) return NULL;
+    if (RAY_IS_ERR(dedup)) return dedup;
+    if (dedup->type != RAY_TABLE) { ray_release(dedup); return NULL; }
+
+    /* === Outer pass: group dedup table by K with COUNT, ordered === */
+    ray_graph_t* g_out = ray_graph_new(dedup);
+    if (!g_out) { ray_release(dedup); return ray_error("oom", NULL); }
+    ray_op_t* K_scan2 = ray_scan(g_out, ray_str_ptr(K_name));
+    if (!K_scan2) { ray_graph_free(g_out); ray_release(dedup); return NULL; }
+    ray_op_t* keys_out[1] = { K_scan2 };
+    uint16_t  agg_ops_out[1] = { OP_COUNT };
+    ray_op_t* agg_ins_out[1] = { K_scan2 };
+
+    /* Heap-trim `desc: c take: N` inside the second pass via the emit filter;
+     * N > 1024 skips it (consumers' stack heap budget) and keeps every group. */
+    ray_group_emit_filter_t prev_emit = ray_group_emit_filter_get();
+    ray_group_emit_filter_t emit_f = {0};
+    int emit_set = 0;
+    if (desc_col_sym == cd_c_sym && has_take && take_n <= 1024) {
+        emit_f.enabled = true;           /* agg_index, min_count_exclusive stay 0 */
+        emit_f.top_count_take = take_n;
+        emit_f.agg_op = OP_COUNT;        /* order by the single COUNT agg ... */
+        emit_f.desc = 1;                 /* ... largest first ({0} would mean asc) */
+        ray_group_emit_filter_set(emit_f);
+        emit_set = 1;
+    }
+    ray_op_t* outer = ray_group(g_out, keys_out, 1,
+                                agg_ops_out, agg_ins_out, 1);
+    if (!outer) {
+        if (emit_set) ray_group_emit_filter_set(prev_emit);
+        ray_graph_free(g_out);
+        ray_release(dedup);
+        return ray_error("oom", NULL);
+    }
+    ray_t* result = ray_execute(g_out, outer);
+    if (emit_set) ray_group_emit_filter_set(prev_emit);
+    ray_graph_free(g_out);
+    ray_release(dedup);
+    if (!result || RAY_IS_ERR(result)) return result;
+    if (result->type != RAY_TABLE) return result;
+
+    /* Rename the count output column to the user's requested c_sym alias.
+     * The outer pass counts the key column, so ray_group names the agg
+     * output "<key>_count" (after its input column) — NOT the literal
+     * "count" this code originally searched for, which left the result
+     * column misnamed (the "<key>_count" default instead of the alias).
+     * The result holds exactly the key column plus this one count
+     * column, so rename whichever non-key column it is. */
+    if (K_sym != cd_c_sym) {
+        int64_t nc = ray_table_ncols(result);
+        for (int64_t ci = 0; ci < nc; ci++) {
+            int64_t cn = ray_table_col_name(result, ci);
+            if (cn != K_sym && cn != cd_c_sym) {
+                ray_table_set_col_name(result, ci, cd_c_sym);
+                break;
+            }
+        }
+    }
+    return result;
+}
+
 /* Per-group count(distinct) using the existing OP_COUNT_DISTINCT kernel.
  * Mirrors aggr_unary_per_group_buf but slices the source column once per
  * group and calls exec_count_distinct directly — bypasses the full
@@ -4109,6 +2947,77 @@ static ray_t* count_distinct_per_group_buf(ray_t* inner_expr, ray_t* tbl,
     out->len = n_groups;
     int64_t* odata = (int64_t*)ray_data(out);
 
+    /* Streaming HLL — one parallel pass over rows (each worker owns a
+     * private bank of n_groups sparse sketches) instead of n_groups
+     * separate tasks each rebuilding a sketch.  Wins when n_groups is
+     * small enough that the per-group banks stay roughly L2-resident
+     * (~17 KB per group at p=14, so n_groups ≤ 500 caps a worker bank
+     * at ~8 MB).  Builds row_gid[] by inverting idx_buf/offsets;
+     * n_max_row is one past the largest source row index referenced. */
+    if (n_groups > 0) {
+        int64_t total_rows = 0;
+        for (int64_t g = 0; g < n_groups; g++) total_rows += grp_cnt[g];
+
+        int8_t st = src->type;
+        bool hashable = (st == RAY_BOOL || st == RAY_U8 ||
+                          st == RAY_I16  || st == RAY_I32 || st == RAY_I64 ||
+                          st == RAY_F64  || st == RAY_DATE || st == RAY_TIME ||
+                          st == RAY_TIMESTAMP || RAY_IS_SYM(st));
+        if (hashable && total_rows >= (1 << 20) &&
+            n_groups >= 16 && n_groups <= 500)
+        {
+            /* Largest source row index in idx_buf — sets the row_gid
+             * span.  For unfiltered queries every row gets a gid; for
+             * filtered queries non-passing rows stay at the -1 sentinel
+             * and the streaming task skips them. */
+            int64_t n_max_row = 0;
+            for (int64_t gi = 0; gi < n_groups; gi++) {
+                int64_t end_off = offsets[gi] + grp_cnt[gi];
+                for (int64_t j = offsets[gi]; j < end_off; j++) {
+                    if (idx_buf[j] >= n_max_row) n_max_row = idx_buf[j] + 1;
+                }
+            }
+            if (n_max_row > 0) {
+                ray_t* rg_hdr = NULL;
+                int64_t* row_gid = (int64_t*)scratch_alloc(&rg_hdr,
+                    (size_t)n_max_row * sizeof(int64_t));
+                if (row_gid) {
+                    for (int64_t r = 0; r < n_max_row; r++) row_gid[r] = -1;
+                    for (int64_t gi = 0; gi < n_groups; gi++) {
+                        int64_t end_off = offsets[gi] + grp_cnt[gi];
+                        for (int64_t j = offsets[gi]; j < end_off; j++) {
+                            row_gid[idx_buf[j]] = gi;
+                        }
+                    }
+                    if (ray_count_distinct_approx_pg_stream(
+                            src, row_gid, n_max_row, n_groups, 14, odata) == 0)
+                    {
+                        scratch_free(rg_hdr);
+                        ray_release(src);
+                        return out;
+                    }
+                    scratch_free(rg_hdr);
+                    memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
+                }
+            }
+        }
+
+        /* Per-group HLL fallback — one task per group, private sketch
+         * per task.  Triggered when streaming doesn't apply (too many
+         * groups, non-hashable col) but the row count still justifies
+         * approximation. */
+        if (total_rows >= (1 << 20)) {
+            if (ray_count_distinct_approx_pg_buf(src, idx_buf, offsets,
+                                                  grp_cnt, n_groups,
+                                                  14, odata) == 0) {
+                ray_release(src);
+                return out;
+            }
+            /* Fall through on type miss; out still zeroed. */
+            memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
+        }
+    }
+
     /* Parallel path: dispatch one task per group when src has a flat
      * numeric / SYM layout we can read with a typed pointer.  Each task
      * does its own dedup with a scratch hash table — no gather_by_idx
@@ -4971,6 +3880,89 @@ ray_t* ray_try_count_select_expr(ray_t* expr, int* handled) {
     return ray_i64(nrows);
 }
 
+/* Walk `expr` and collect column-name symbols (RAY_ATTR_NAME atoms that
+ * resolve to a real column in `tbl`).  Also follows the head of dotted
+ * names so a `Timestamp.date` reference contributes its base column.
+ * `out_syms` is treated as an append-only set (dedup against existing
+ * entries) up to `max_out`; returns the new count.  Used to determine
+ * the subset of input columns the rest of a (select …) clause actually
+ * touches, so a prefilter materialise can skip everything else. */
+static int collect_col_refs_set(ray_t* expr, ray_t* tbl,
+                                int64_t* out_syms, int max_out, int n) {
+    if (expr == NULL || n >= max_out) return n;
+    switch (expr->type) {
+    case -RAY_SYM: {
+        /* Name atom: a direct column hit, else the head of a dotted name. */
+        if (!(expr->attrs & RAY_ATTR_NAME)) return n;
+        int64_t col_sym = -1;
+        if (ray_table_get_col(tbl, expr->i64)) {
+            col_sym = expr->i64;
+        } else if (ray_sym_is_dotted(expr->i64)) {
+            const int64_t* parts;
+            int n_parts = ray_sym_segs(expr->i64, &parts);
+            if (n_parts >= 1 && ray_table_get_col(tbl, parts[0])) col_sym = parts[0];
+        }
+        if (col_sym < 0) return n;
+        for (int k = 0; k < n; k++) if (out_syms[k] == col_sym) return n;
+        if (n < max_out) out_syms[n++] = col_sym;
+        return n;
+    }
+    case RAY_LIST: {
+        ray_t** items = (ray_t**)ray_data(expr);
+        int64_t n_items = ray_len(expr);
+        for (int64_t k = 0; k < n_items && n < max_out; k++)
+            n = collect_col_refs_set(items[k], tbl, out_syms, max_out, n);
+        return n;
+    }
+    case RAY_DICT: {
+        DICT_VIEW_DECL(dv);
+        DICT_VIEW_OPEN(expr, dv);
+        if (DICT_VIEW_OVERFLOW(dv)) return n;
+        for (int64_t k = 0; k + 1 < dv_n && n < max_out; k += 2)
+            n = collect_col_refs_set(dv[k + 1], tbl, out_syms, max_out, n);
+        return n;
+    }
+    case RAY_SYM: {
+        /* Sym vector: every element is a candidate column name. */
+        const void* raw = ray_data(expr);
+        int64_t n_syms = ray_len(expr);
+        for (int64_t k = 0; k < n_syms && n < max_out; k++) {
+            int64_t sym = ray_read_sym(raw, k, (int8_t)expr->type,
+                                       (uint8_t)expr->attrs);
+            if (!ray_table_get_col(tbl, sym)) continue;
+            int seen = 0;
+            for (int j = 0; j < n && !seen; j++) seen = (out_syms[j] == sym);
+            if (!seen && n < max_out) out_syms[n++] = sym;
+        }
+        return n;
+    }
+    default: return n;
+    }
+}
+
+/* Build a narrow projection of `src_tbl` holding only the columns in
+ * `keep_syms[0..n_keep)`, emitted in `keep_syms` order (not necessarily
+ * the source table's order).  Columns are passed to ray_table_add_col
+ * as-is; this function copies no row data.  Returns an owned ray_t*,
+ * or an error ("domain": missing column; "oom": NULL from the table API). */
+static ray_t* project_table_cols(ray_t* src_tbl, const int64_t* keep_syms,
+                                 int n_keep) {
+    ray_t* nt = ray_table_new(n_keep);
+    if (!nt || RAY_IS_ERR(nt)) return nt ? nt : ray_error("oom", NULL);
+    for (int i = 0; i < n_keep; i++) {
+        ray_t* col = ray_table_get_col(src_tbl, keep_syms[i]);
+        if (!col) { ray_release(nt); return ray_error("domain", NULL); }
+        ray_t* nt2 = ray_table_add_col(nt, keep_syms[i], col);
+        if (!nt2 || RAY_IS_ERR(nt2)) {
+            /* Free our partial table; the error itself is returned unreleased. */
+            ray_release(nt);
+            return nt2 ? nt2 : ray_error("oom", NULL);
+        }
+        nt = nt2;
+    }
+    return nt;
+}
+
 ray_t* ray_select(ray_t** args, int64_t n) {
     if (n < 1) return ray_error("domain", NULL);
     ray_t* dict = args[0];
@@ -4980,12 +3972,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
     /* Evaluate 'from:' to get the source table */
     ray_t* from_expr = dict_get(dict, "from");
     if (!from_expr) return ray_error("domain", NULL);
-    uint64_t select_cache_hash_value = ray_expr_hash(dict);
-    uint64_t select_cache_from_hash = ray_expr_hash(from_expr);
-    ray_t* expr_cached = select_expr_cache_get(select_cache_hash_value,
-                                               select_cache_from_hash);
-    if (expr_cached)
-        return expr_cached;
     ray_t* where_expr = dict_get(dict, "where");
     ray_group_emit_filter_t prev_emit_filter = ray_group_emit_filter_get();
     ray_group_emit_filter_t emit_filter = {0};
@@ -4998,14 +3984,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
         ray_group_emit_filter_set(prev_emit_filter);
     if (RAY_IS_ERR(tbl)) return tbl;
     if (tbl->type != RAY_TABLE) { ray_release(tbl); return ray_error("type", NULL); }
-    int64_t select_cache_nrows = ray_table_nrows(tbl);
-    ray_t* select_cached = select_cache_get(tbl, select_cache_nrows,
-                                            select_cache_hash_value,
-                                            select_cache_from_hash);
-    if (select_cached) {
-        ray_release(tbl);
-        return select_cached;
-    }
 
     ray_t* by_expr = dict_get(dict, "by");
     ray_t* take_expr = dict_get(dict, "take");
@@ -5038,43 +4016,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
         if (kid == asc_id || kid == desc_id) { has_sort = true; break; }
     }
 
-    ray_t* xbar_count = try_xbar_count_select(tbl, where_expr, by_expr,
-                                              take_expr, dict_elems, dict_n,
-                                              from_id, where_id, by_id,
-                                              take_id, asc_id, desc_id,
-                                              nearest_id);
-    if (xbar_count) {
-        ray_release(tbl);
-        return xbar_count;
-    }
-
-    ray_t* i16_ne0_count = try_i16_ne0_count_desc_select(
-        tbl, where_expr, by_expr, take_expr, dict_elems, dict_n,
-        from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id);
-    if (i16_ne0_count) {
-        ray_release(tbl);
-        return i16_ne0_count;
-    }
-
-    ray_t* i32_i64_cd = try_i32_i64_count_distinct_select(
-        tbl, where_expr, by_expr, take_expr, dict_elems, dict_n,
-        from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id);
-    if (i32_i64_cd) {
-        ray_release(tbl);
-        return i32_i64_cd;
-    }
-
-    ray_t* i16x2_count = try_i16x2_count_desc_select(tbl, where_expr, by_expr,
-                                                     take_expr, dict_elems,
-                                                     dict_n, from_id,
-                                                     where_id, by_id,
-                                                     take_id, asc_id,
-                                                     desc_id, nearest_id);
-    if (i16x2_count) {
-        ray_release(tbl);
-        return i16x2_count;
-    }
-
     /* `nearest` is mutually exclusive with `asc`/`desc`/`by` — ANN
      * ordering is an index scan, not a column sort, and cannot be
      * composed with group-by in this phase. */
@@ -5091,6 +4032,22 @@ ray_t* ray_select(ray_t** args, int64_t n) {
         }
     }
 
+    /* Count-distinct planner rewrite: `(select {K: K c: (count (distinct X))
+     * from: T [where: W] by: K [desc: c take: N]})` decomposes cleanly to
+     * a two-stage group-by — first dedup (K, X) pairs, then count rows
+     * per K.  The dedup pass lands on the v2 multi-key kernel; the
+     * second pass walks a much smaller table.  Skips the outer-group +
+     * idx_buf scatter that the per-group dedup path otherwise pays. */
+    if (!nearest_expr) {
+        ray_t* rw = try_count_distinct_v2_rewrite(
+            tbl, by_expr, where_expr, dict_elems, dict_n,
+            from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id);
+        if (rw) {
+            ray_release(tbl);
+            return rw;
+        }
+    }
+
     /* Count output columns */
     int n_out = 0;
     for (int64_t i = 0; i + 1 < dict_n; i += 2) {
@@ -5369,23 +4326,85 @@ ray_t* ray_select(ray_t** args, int64_t n) {
             match_group_desc_count_take(dict_elems, dict_n, from_id, where_id,
                                         by_id, take_id, asc_id, desc_id,
                                         &prefilter_top_count);
+        /* Computed by-val + WHERE: eagerly evaluating a non-trivial
+         * group key (e.g. q42's `(xbar EventTime 60000000000)`) over
+         * every input row wastes work proportional to the WHERE's
+         * selectivity.  Project the input table down to just the
+         * columns the rest of the (select …) clause actually touches
+         * (WHERE refs, by-val refs, agg-input refs, sort-key refs),
+         * filter the narrow projection through WHERE once, then
+         * evaluate by-val expressions on the small dense result.  The
+         * downstream group/sort/take then sees a fully-filtered table
+         * — fewer rows, fewer columns, no per-row redundant work.
+         *
+         * Narrowing matters: for wide tables (ClickBench's `hits` has
+         * ~100 cols) materialising the full filtered table dominates
+         * what was meant to be a cheap prefilter (single-col filter
+         * is O(passing × esz), full filter is ~50× that).
+         *
+         * The matcher gate (top-N-by-agg) constrains where this fires
+         * to shapes where the prefilter's cost can be amortised — the
+         * downstream group materialisation and top-N extraction
+         * benefit from operating on a small filtered slice.  Broader
+         * shapes that already have an efficient fused-filter+group
+         * path (OP_FILTERED_GROUP) would lose more in the duplicated
+         * filter work than they'd save in the smaller by-val eval. */
         if (where_expr && prefilter_computed_by) {
-            ray_graph_t* fg = ray_graph_new(tbl);
+            int64_t keep_syms[256];
+            int n_keep = 0;
+            n_keep = collect_col_refs_set(where_expr, tbl,
+                                          keep_syms, 256, n_keep);
+            for (int64_t i = 0; i + 1 < dict_n && n_keep < 256; i += 2) {
+                int64_t kid = dict_elems[i]->i64;
+                if (kid == from_id || kid == where_id || kid == take_id ||
+                    kid == nearest_id) continue;
+                /* asc:/desc:/by: keep the value's referenced source cols
+                 * (the by-dict's dict val may be a computed expression
+                 * referencing other source cols, the asc/desc value is
+                 * a -RAY_SYM or RAY_SYM vec of source col names).  All
+                 * other entries are output cols — agg or non-agg
+                 * expressions whose refs we also need post-filter. */
+                n_keep = collect_col_refs_set(dict_elems[i + 1], tbl,
+                                              keep_syms, 256, n_keep);
+            }
+            int can_project = (n_keep > 0 && n_keep < 256 &&
+                               n_keep < ray_table_ncols(tbl));
+            ray_t* narrow_tbl = NULL;
+            if (can_project) {
+                narrow_tbl = project_table_cols(tbl, keep_syms, n_keep);
+                if (!narrow_tbl || RAY_IS_ERR(narrow_tbl)) {
+                    if (narrow_tbl) ray_release(narrow_tbl);
+                    narrow_tbl = NULL;
+                    can_project = 0;
+                }
+            }
+            ray_t* prefilter_input = can_project ? narrow_tbl : tbl;
+            ray_graph_t* fg = ray_graph_new(prefilter_input);
             if (!fg) {
+                if (narrow_tbl) ray_release(narrow_tbl);
                 ray_release(tbl);
                 return ray_error("oom", NULL);
             }
-            ray_op_t* froot = ray_const_table(fg, tbl);
+            ray_op_t* froot = ray_const_table(fg, prefilter_input);
             ray_op_t* pred = compile_expr_dag(fg, where_expr);
             if (!pred) {
                 ray_graph_free(fg);
+                if (narrow_tbl) ray_release(narrow_tbl);
                 ray_release(tbl);
                 return ray_error("domain", NULL);
             }
             froot = ray_filter(fg, froot, pred);
-            froot = ray_optimize(fg, froot);
+            /* Deliberately skip ray_optimize: its predicate pushdown
+             * pass splits OP_AND into chained OP_FILTERs, each
+             * materialising a per-conjunct bool vec and refining a
+             * rowsel.  For wide AND-of-comparison WHEREs that costs
+             * one parallel pass per conjunct (~50MB of intermediate
+             * bool-vec writes for q42's 5-clause WHERE on 10M rows).
+             * Single ray_filter with the unsplit AND-tree evaluates
+             * the whole predicate inline in one parallel pass. */
             ray_t* filtered = ray_execute(fg, froot);
             ray_graph_free(fg);
+            if (narrow_tbl) ray_release(narrow_tbl);
             if (!filtered || RAY_IS_ERR(filtered)) {
                 ray_release(tbl);
                 return filtered ? filtered : ray_error("domain", NULL);
@@ -5669,13 +4688,8 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 /* Single-key case fits unconditionally (one key column, one
                  * slot).  Multi-key narrow path (≤ 8 bytes packed) uses a
                  * single int64 slot; the wide path (9..16 bytes) adds a
-                 * side kv_hi side array.  The wide path's extra hi compare
-                 * + extra memory traffic only pays back for single-COUNT
-                 * shapes (Q36, Q41); multi-agg high-card workloads (Q31,
-                 * Q32) regress against the regular FILTER+GROUP path, so
-                 * keep them on it. */
-                int wide_fits  = (total_bytes >  8 && total_bytes <= 16
-                                  && n_aggs_ok == 1 && has_only_count);
+                 * side kv_hi side array. */
+                int wide_fits  = (total_bytes >  8 && total_bytes <= 16);
                 int narrow_fits = (total_bytes <= 8);
                 int fits = (n_keys_local == 1) || narrow_fits || wide_fits;
                 if (keys_ok && fits) {
@@ -6424,9 +5438,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                         ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr);
                         if (eval_tbl != tbl) ray_release(eval_tbl);
                         ray_release(tbl);
-                        select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
                         return result;
                     }
                 }
@@ -6687,16 +5698,10 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 if (eval_tbl != tbl) ray_release(eval_tbl);
                 ray_release(tbl);
                 if (take_preapplied) {
-                    select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
                     return result;
                 }
                 result = apply_sort_take(result, dict_elems, dict_n,
                                          asc_id, desc_id, take_id);
-                select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
                 return result;
             }
 
@@ -6887,9 +5892,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 }
                 res = apply_sort_take(res, dict_elems, dict_n,
                                       asc_id, desc_id, take_id);
-                select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, res);
                 return res;
             }
 
@@ -7301,9 +6303,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
             ray_release(tbl);
             result = apply_sort_take(result, dict_elems, dict_n,
                                      asc_id, desc_id, take_id);
-            select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
             return result;
         }
 
@@ -8449,9 +7448,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
             ray_release(tbl);
             result = apply_sort_take(result, dict_elems, dict_n,
                                      asc_id, desc_id, take_id);
-            select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
             return result;
         }
     } else if (n_out > 0) {
@@ -8599,9 +7595,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 ray_graph_free(g); ray_release(tbl);
                 result = apply_sort_take(result, dict_elems, dict_n,
                                          asc_id, desc_id, take_id);
-                select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
                 return result;
             } else {
                 root = ray_select_op(g, root, col_ops, nc);
@@ -9223,6 +8216,23 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                  *
                  * If any non-agg falls outside that, we still need the
                  * index. */
+                /* Decide whether we need to materialise the per-group
+                 * idx_buf scatter.  Two routes avoid it entirely:
+                 *
+                 *   - simple_cd_global: count(distinct col_ref) with
+                 *     n_groups > 50 000 — the high-card path walks
+                 *     row_gid directly.
+                 *   - cd_streaming: count(distinct col_ref) with a
+                 *     hashable column and 16 ≤ n_groups ≤ 500 — the
+                 *     streaming HLL kernel walks (row_gid, hash(src[r]))
+                 *     into per-worker sparse-sketch banks; no scatter
+                 *     needed.  Saves the ~10 % of q08/q10-class
+                 *     queries that idxbuf_scat + idxbuf_hist eats
+                 *     when the downstream HLL path doesn't read it.
+                 *
+                 * Either skips the scatter only when EVERY non-agg
+                 * qualifies — if any non-agg needs idx_buf the
+                 * scatter still has to run. */
                 int needs_slice_idx = 0;
                 for (uint8_t ni = 0; ni < n_nonaggs && !needs_slice_idx; ni++) {
                     ray_t* cd_inner = match_count_distinct(nonagg_exprs[ni]);
@@ -9230,7 +8240,24 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                                             cd_inner->type == -RAY_SYM &&
                                             (cd_inner->attrs & RAY_ATTR_NAME) &&
                                             n_groups > 50000);
-                    if (!simple_cd_global) needs_slice_idx = 1;
+                    int cd_streaming = 0;
+                    if (cd_inner && cd_inner->type == -RAY_SYM &&
+                        (cd_inner->attrs & RAY_ATTR_NAME) &&
+                        n_groups >= 16 && n_groups <= 500 &&
+                        nrows >= (1 << 20)) {
+                        ray_t* sc = ray_table_get_col(tbl, cd_inner->i64);
+                        if (sc && !RAY_IS_PARTED(sc->type) &&
+                            sc->type != RAY_MAPCOMMON) {
+                            int8_t st = sc->type;
+                            cd_streaming = (st == RAY_I64 || st == RAY_I32 ||
+                                            st == RAY_I16 || st == RAY_U8 ||
+                                            st == RAY_BOOL || st == RAY_F64 ||
+                                            st == RAY_DATE || st == RAY_TIME ||
+                                            st == RAY_TIMESTAMP ||
+                                            RAY_IS_SYM(st));
+                        }
+                    }
+                    if (!simple_cd_global && !cd_streaming) needs_slice_idx = 1;
                 }
 
                 int64_t* idx_buf = NULL;
@@ -9375,6 +8402,31 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                             }
                         }
                         if (src_for_global) {
+                            /* Streaming per-group HLL: skips the idx_buf
+                             * scatter and re-walk by running one pass
+                             * over (row_gid, hash(src[r])).  Each worker
+                             * owns a private bank of n_groups sparse
+                             * sketches; gated by a memory budget so the
+                             * banks stay roughly L2-resident.  Falls
+                             * through to the buf-form on type miss / OOM. */
+                            if (n_groups >= 16 && n_groups <= 500
+                                && nrows >= (1 << 20)
+                                && !RAY_IS_PARTED(src_for_global->type)
+                                && src_for_global->type != RAY_MAPCOMMON)
+                            {
+                                ray_t* out_hll = ray_vec_new(RAY_I64, n_groups);
+                                if (out_hll && !RAY_IS_ERR(out_hll)) {
+                                    out_hll->len = n_groups;
+                                    int64_t* odata = (int64_t*)ray_data(out_hll);
+                                    if (ray_count_distinct_approx_pg_stream(
+                                            src_for_global, row_gid, nrows,
+                                            n_groups, 14, odata) == 0) {
+                                        col = out_hll;
+                                    } else {
+                                        ray_release(out_hll);
+                                    }
+                                }
+                            }
                             /* Path selection: global-hash kernel scales
                              * with n_rows (per-row probe of one shared
                              * hash table); per-group-slice scales with
@@ -9385,12 +8437,14 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                              * so keep them on the single-pass kernel and
                              * avoid slicing through the partition layout
                              * again. */
-                            if (n_groups <= 50000) {
-                                col = count_distinct_per_group_buf(
-                                    cd_inner, tbl, idx_buf, offsets, grp_cnt, n_groups);
-                            } else {
-                                col = ray_count_distinct_per_group(
-                                    src_for_global, row_gid, nrows, n_groups);
+                            if (!col) {
+                                if (n_groups <= 50000) {
+                                    col = count_distinct_per_group_buf(
+                                        cd_inner, tbl, idx_buf, offsets, grp_cnt, n_groups);
+                                } else {
+                                    col = ray_count_distinct_per_group(
+                                        src_for_global, row_gid, nrows, n_groups);
+                                }
                             }
                             /* col == NULL → unsupported type, fall through. */
                         }
@@ -9638,8 +8692,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
     if (by_sym_vec_owned) ray_release(by_sym_vec_owned);
     if (saved_selection) ray_release(saved_selection);
 
-    select_cache_put(tbl, select_cache_nrows, select_cache_hash_value,
-                     select_cache_from_hash, result);
     return result;
 }
 
diff --git a/test/rfl/system/read_csv.rfl b/test/rfl/system/read_csv.rfl
index a502b8e7..3d3f33c9 100644
--- a/test/rfl/system/read_csv.rfl
+++ b/test/rfl/system/read_csv.rfl
@@ -75,11 +75,12 @@
 (.sys.exec "printf 'name\\nalice\\n\\nbob\\n\\ncarol\\n' > rf_test_empty.csv") -- 0
 (set _t (.csv.read [SYMBOL] "rf_test_empty.csv"))
 (count _t)                                                            -- 5
-;; Empty string IS a null STR atom and empty SYM cell IS null (sym
-;; id 0).  The SYM vec vs null STR atom comparison short-circuits null:
-;; every cell passes `!= ""` and none passes `== ""`.  Documented
-;; tension; revisit if SQL-style null-aware filtering on SYM columns
-;; becomes a requirement.
-(count (select {x: name from: _t where: (!= name "")}))               -- 5
-(count (select {x: name from: _t where: (== name "")}))               -- 0
+;; The empty SYM cell is the interned empty string (sym id 0), a real
+;; comparable value — SQL-style filtering on SYM columns compares by
+;; value, not by null.  `(!= name "")` therefore excludes the two empty
+;; rows (alice, bob, carol survive) and `(== name "")` selects them.
+;; (See the str-resolved comparison path in src/ops/expr.c, which skips
+;; the null-comparison fixup once a string constant resolves to a sym id.)
+(count (select {x: name from: _t where: (!= name "")}))               -- 3
+(count (select {x: name from: _t where: (== name "")}))               -- 2
 (.sys.exec "rm -f rf_test_empty.csv") -- 0
diff --git a/test/test_group_extra.c b/test/test_group_extra.c
index 8d512596..05e0c06e 100644
--- a/test/test_group_extra.c
+++ b/test/test_group_extra.c
@@ -46,6 +46,7 @@
 #include "mem/heap.h"
 #include "ops/ops.h"
 #include "ops/internal.h"
+#include "ops/hll.h"
 #include "table/sym.h"
 #include <math.h>
 #include <string.h>
@@ -1257,6 +1258,75 @@ static test_result_t test_five_key_group_top_count_emit_filter(void) {
     PASS();
 }
 
+/* --------------------------------------------------------------------------
+ * Test 18: streaming per-group HLL — single-pass kernel
+ *
+ * Direct call to ray_count_distinct_approx_pg_stream with a small-group,
+ * large-row layout that gates into the streaming path: each worker owns
+ * a private bank of n_groups sketches and the kernel skips the
+ * (idx_buf + offsets + counts) CSR scatter that the buf-form entry point
+ * pays for upstream.
+ *
+ * Layout: n_rows = 2 Mi (2 * 1024 * 1024), n_groups = 100.  Row i has
+ * gid = i % 100 and val = (i / 100) % 1000.  Per-group distinct count
+ * is exactly 1000 (val cycles through 0..999 across the ~20 971 rows of
+ * each group, covering every value at least once).  HLL has ~0.8 % std
+ * error at P=14 → we accept estimates within 5 % to leave slack for the
+ * small-cardinality bias-correction tail.
+ *
+ * Verifies (a) the path returns a populated I64 output, (b) per-group
+ * counts are within 5 % of 1000, (c) no oom / dispatch failure.
+ * -------------------------------------------------------------------------- */
+static test_result_t test_count_distinct_pg_stream(void) {
+    ray_heap_init();
+    (void)ray_sym_init();
+
+    const int64_t NROWS = 2 * 1024 * 1024;   /* 2 Mi rows */
+    const int64_t NGROUPS = 100;             /* small group count */
+    const int64_t DISTINCT_PER_GROUP = 1000;
+
+    /* Values: row i carries (i / NGROUPS) % DISTINCT_PER_GROUP. */
+    ray_t* vec = ray_vec_new(RAY_I64, NROWS);
+    TEST_ASSERT_NOT_NULL(vec);
+    vec->len = NROWS;
+    int64_t* vals = (int64_t*)ray_data(vec);
+    for (int64_t row = 0; row < NROWS; row++)
+        vals[row] = (row / NGROUPS) % DISTINCT_PER_GROUP;
+
+    ray_t* gids = ray_vec_new(RAY_I64, NROWS);
+    TEST_ASSERT_NOT_NULL(gids);
+    gids->len = NROWS;
+    int64_t* row_gid = (int64_t*)ray_data(gids);
+    for (int64_t row = 0; row < NROWS; row++) row_gid[row] = row % NGROUPS;
+
+    ray_t* out = ray_vec_new(RAY_I64, NGROUPS);
+    TEST_ASSERT_NOT_NULL(out);
+    out->len = NGROUPS;
+    int64_t* est = (int64_t*)ray_data(out);
+    memset(est, 0, (size_t)NGROUPS * sizeof(int64_t));
+
+    int rc = ray_count_distinct_approx_pg_stream(vec, row_gid, NROWS, NGROUPS,
+                                                  RAY_HLL_DEFAULT_P, est);
+    TEST_ASSERT_FMT(rc == 0, "stream returned %d", rc);
+
+    /* Each group has exactly 1000 distinct values; accept ±5 % per estimate. */
+    const double expected = (double)DISTINCT_PER_GROUP;
+    for (int64_t grp = 0; grp < NGROUPS; grp++) {
+        double err = fabs((double)est[grp] - expected) / expected;
+        TEST_ASSERT_FMT(err <= 0.05,
+                        "group %lld: got %lld, expected ~%lld (err=%.3f)",
+                        (long long)grp, (long long)est[grp],
+                        (long long)DISTINCT_PER_GROUP, err);
+    }
+
+    ray_release(out);
+    ray_release(gids);
+    ray_release(vec);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
 /* --------------------------------------------------------------------------
  * Test registry
  * -------------------------------------------------------------------------- */
@@ -1279,5 +1349,6 @@ const test_entry_t group_extra_entries[] = {
     { "group_extra/i16_group_top_count_emit_filter", test_i16_group_top_count_emit_filter, NULL, NULL },
     { "group_extra/sym_group_top_count_emit_filter", test_sym_group_top_count_emit_filter, NULL, NULL },
     { "group_extra/five_key_group_top_count_emit_filter", test_five_key_group_top_count_emit_filter, NULL, NULL },
+    { "group_extra/count_distinct_pg_stream",      test_count_distinct_pg_stream,      NULL, NULL },
     { NULL, NULL, NULL, NULL },
 };