From 8c4f7ed5b1769ad137cc193a0ba7670727d3f72b Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 16:05:59 +0200
Subject: [PATCH 01/36] revert: remove fraudulent profiling-gated result caches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The select_cache / select_expr_cache helpers in query.c were gated on
`g_ray_profile.active` and would return memoised results during
benchmark runs without executing the query. A bench that runs each
query 3x and keeps the min would see runs 2-3 return the cached
result in microseconds — fake wins, not real speed.

Removed entirely:
  - select_cache_entry_t / g_select_cache + select_cache_get/put
  - select_expr_cache_entry_t / g_select_expr_cache + get/put
  - ray_expr_hash + hash_mix_u64 (only fed the above)
  - all 10 call sites in ray_select_fn

Queries now always compute their results; no cross-call memoisation.

Test suite: 2818 of 2820 passed (2 skipped, 0 failed).
---
 src/ops/query.c | 178 ------------------------------------------------
 1 file changed, 178 deletions(-)

diff --git a/src/ops/query.c b/src/ops/query.c
index 451d4baf..a4544414 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -87,147 +87,6 @@ static int64_t dict_key_id(ray_t* dict, const char* key) {
     return -1;
 }
 
-typedef struct {
-    ray_t*   tbl;
-    int64_t  nrows;
-    uint64_t hash;
-    uint64_t from_hash;
-    uint64_t env_gen;
-    ray_t*   result;
-} select_cache_entry_t;
-
-#define SELECT_CACHE_N 512
-static select_cache_entry_t g_select_cache[SELECT_CACHE_N];
-static uint16_t g_select_cache_next = 0;
-
-static uint64_t hash_mix_u64(uint64_t h, uint64_t v) {
-    h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2);
-    return h ? h : 0x9e3779b97f4a7c15ull;
-}
-
-static uint64_t ray_expr_hash(ray_t* x) {
-    if (!x) return 0x1234abcd5678ef00ull;
-    uint64_t h = hash_mix_u64(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type);
-    h = hash_mix_u64(h, (uint64_t)x->attrs);
-    h = hash_mix_u64(h, (x->type == -RAY_STR)
-                        ? (uint64_t)ray_str_len(x)
-                        : (uint64_t)x->len);
-    if (x->type == RAY_LIST) {
-        ray_t** elems = (ray_t**)ray_data(x);
-        for (int64_t i = 0; i < x->len; i++)
-            h = hash_mix_u64(h, ray_expr_hash(elems[i]));
-    } else if (x->type == RAY_DICT) {
-        ray_t* keys = ray_dict_keys(x);
-        ray_t* vals = ray_dict_vals(x);
-        h = hash_mix_u64(h, ray_expr_hash(keys));
-        h = hash_mix_u64(h, ray_expr_hash(vals));
-    } else if (x->type == RAY_STR) {
-        size_t n = 0;
-        const char* s = ray_str_vec_get(x, 0, &n);
-        for (size_t i = 0; s && i < n; i++)
-            h = hash_mix_u64(h, (unsigned char)s[i]);
-    } else if (x->type == -RAY_STR) {
-        const char* s = ray_str_ptr(x);
-        size_t n = ray_str_len(x);
-        for (size_t i = 0; s && i < n; i++)
-            h = hash_mix_u64(h, (unsigned char)s[i]);
-    } else if (x->type == RAY_SYM || x->type == -RAY_SYM ||
-               x->type == RAY_I64 || x->type == -RAY_I64 ||
-               x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) {
-        h = hash_mix_u64(h, (uint64_t)x->i64);
-    } else if (x->type == RAY_I32 || x->type == -RAY_I32 ||
-               x->type == RAY_DATE || x->type == -RAY_DATE ||
-               x->type == RAY_TIME || x->type == -RAY_TIME) {
-        h = hash_mix_u64(h, (uint64_t)(uint32_t)x->i32);
-    } else if (x->type == RAY_I16 || x->type == -RAY_I16) {
-        h = hash_mix_u64(h, (uint64_t)(uint16_t)x->i16);
-    } else if (x->type == RAY_U8 || x->type == -RAY_U8 ||
-               x->type == RAY_BOOL || x->type == -RAY_BOOL) {
-        h = hash_mix_u64(h, (uint64_t)x->u8);
-    } else if (x->type == RAY_F64 || x->type == -RAY_F64) {
-        uint64_t bits = 0;
-        memcpy(&bits, &x->f64, sizeof(bits));
-        h = hash_mix_u64(h, bits);
-    }
-    return h;
-}
-
-static ray_t* select_cache_get(ray_t* tbl, int64_t nrows,
-                               uint64_t hash, uint64_t from_hash) {
-    if (!g_ray_profile.active) return NULL;
-    if (!hash) return NULL;
-    for (uint16_t i = 0; i < SELECT_CACHE_N; i++) {
-        select_cache_entry_t* e = &g_select_cache[i];
-        if (e->result && e->env_gen == ray_env_generation() &&
-            e->nrows == nrows && e->hash == hash &&
-            (e->tbl == tbl || (from_hash && e->from_hash == from_hash))) {
-            ray_retain(e->result);
-            return e->result;
-        }
-    }
-    return NULL;
-}
-
-static void select_expr_cache_put(uint64_t hash, uint64_t from_hash,
-                                  ray_t* result);
-
-static void select_cache_put(ray_t* tbl, int64_t nrows,
-                             uint64_t hash, uint64_t from_hash,
-                             ray_t* result) {
-    if (!g_ray_profile.active) return;
-    if (!tbl || !hash || !result || RAY_IS_ERR(result)) return;
-    select_cache_entry_t* e =
-        &g_select_cache[g_select_cache_next++ % SELECT_CACHE_N];
-    if (e->result) ray_release(e->result);
-    e->tbl = tbl;
-    e->nrows = nrows;
-    e->hash = hash;
-    e->from_hash = from_hash;
-    e->env_gen = ray_env_generation();
-    e->result = result;
-    ray_retain(e->result);
-    select_expr_cache_put(hash, from_hash, result);
-}
-
-typedef struct {
-    uint64_t hash;
-    uint64_t from_hash;
-    uint64_t env_gen;
-    ray_t*   result;
-} select_expr_cache_entry_t;
-
-#define SELECT_EXPR_CACHE_N 1024
-static select_expr_cache_entry_t g_select_expr_cache[SELECT_EXPR_CACHE_N];
-static uint16_t g_select_expr_cache_next = 0;
-
-static ray_t* select_expr_cache_get(uint64_t hash, uint64_t from_hash) {
-    if (!g_ray_profile.active) return NULL;
-    if (!hash) return NULL;
-    for (uint16_t i = 0; i < SELECT_EXPR_CACHE_N; i++) {
-        select_expr_cache_entry_t* e = &g_select_expr_cache[i];
-        if (e->result && e->env_gen == ray_env_generation() &&
-            e->hash == hash && e->from_hash == from_hash) {
-            ray_retain(e->result);
-            return e->result;
-        }
-    }
-    return NULL;
-}
-
-static void select_expr_cache_put(uint64_t hash, uint64_t from_hash,
-                                  ray_t* result) {
-    if (!g_ray_profile.active) return;
-    if (!hash || !result || RAY_IS_ERR(result)) return;
-    select_expr_cache_entry_t* e =
-        &g_select_expr_cache[g_select_expr_cache_next++ % SELECT_EXPR_CACHE_N];
-    if (e->result) ray_release(e->result);
-    e->hash = hash;
-    e->from_hash = from_hash;
-    e->env_gen = ray_env_generation();
-    e->result = result;
-    ray_retain(e->result);
-}
-
 /* Flatten a RAY_DICT (keys SYM vec + vals LIST) into a transient
  * [k0,v0,k1,v1,...] array view so the existing dict-walking loops in
  * ray_select_fn et al. can iterate without rewriting every site.
@@ -4980,12 +4839,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
     /* Evaluate 'from:' to get the source table */
     ray_t* from_expr = dict_get(dict, "from");
     if (!from_expr) return ray_error("domain", NULL);
-    uint64_t select_cache_hash_value = ray_expr_hash(dict);
-    uint64_t select_cache_from_hash = ray_expr_hash(from_expr);
-    ray_t* expr_cached = select_expr_cache_get(select_cache_hash_value,
-                                               select_cache_from_hash);
-    if (expr_cached)
-        return expr_cached;
     ray_t* where_expr = dict_get(dict, "where");
     ray_group_emit_filter_t prev_emit_filter = ray_group_emit_filter_get();
     ray_group_emit_filter_t emit_filter = {0};
@@ -4998,14 +4851,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
         ray_group_emit_filter_set(prev_emit_filter);
     if (RAY_IS_ERR(tbl)) return tbl;
     if (tbl->type != RAY_TABLE) { ray_release(tbl); return ray_error("type", NULL); }
-    int64_t select_cache_nrows = ray_table_nrows(tbl);
-    ray_t* select_cached = select_cache_get(tbl, select_cache_nrows,
-                                            select_cache_hash_value,
-                                            select_cache_from_hash);
-    if (select_cached) {
-        ray_release(tbl);
-        return select_cached;
-    }
 
     ray_t* by_expr = dict_get(dict, "by");
     ray_t* take_expr = dict_get(dict, "take");
@@ -6424,9 +6269,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                         ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr);
                         if (eval_tbl != tbl) ray_release(eval_tbl);
                         ray_release(tbl);
-                        select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
                         return result;
                     }
                 }
@@ -6687,16 +6529,10 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 if (eval_tbl != tbl) ray_release(eval_tbl);
                 ray_release(tbl);
                 if (take_preapplied) {
-                    select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
                     return result;
                 }
                 result = apply_sort_take(result, dict_elems, dict_n,
                                          asc_id, desc_id, take_id);
-                select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
                 return result;
             }
 
@@ -6887,9 +6723,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 }
                 res = apply_sort_take(res, dict_elems, dict_n,
                                       asc_id, desc_id, take_id);
-                select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, res);
                 return res;
             }
 
@@ -7301,9 +7134,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
             ray_release(tbl);
             result = apply_sort_take(result, dict_elems, dict_n,
                                      asc_id, desc_id, take_id);
-            select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
             return result;
         }
 
@@ -8449,9 +8279,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
             ray_release(tbl);
             result = apply_sort_take(result, dict_elems, dict_n,
                                      asc_id, desc_id, take_id);
-            select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
             return result;
         }
     } else if (n_out > 0) {
@@ -8599,9 +8426,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 ray_graph_free(g); ray_release(tbl);
                 result = apply_sort_take(result, dict_elems, dict_n,
                                          asc_id, desc_id, take_id);
-                select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
                 return result;
             } else {
                 root = ray_select_op(g, root, col_ops, nc);
@@ -9638,8 +9462,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
     if (by_sym_vec_owned) ray_release(by_sym_vec_owned);
     if (saved_selection) ray_release(saved_selection);
 
-    select_cache_put(tbl, select_cache_nrows, select_cache_hash_value,
-                     select_cache_from_hash, result);
     return result;
 }
 

From d4da302f262cb5a185554ba7c5f02829b61ab1c6 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 16:09:22 +0200
Subject: [PATCH 02/36] refactor(query): remove benchmark-shaped query
 fast-paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

try_xbar_count_select / try_i16_ne0_count_desc_select /
try_i32_i64_count_distinct_select / try_i16x2_count_desc_select
pattern-matched exact query shapes (i16 "!= 0" filter + count desc +
take; two i16 keys; i32/i64 count-distinct; xbar time-bucket count) and
ran hand-written kernels, bypassing the general select/group-by planner.

These are query-shape special cases, not general optimisations — removed
along with their exclusive supporting infrastructure:
  - xbar_count_clause_t / xbar_count_pair_t / i16x2_count_pair_t /
    i32_count_pair_t / i16_count_pair_t typedefs
  - xbar_count_ctx_t / i16x2_count_ctx_t / i16_ne0_count_ctx_t /
    i32_i64_cd_ctx_t worker-context typedefs
  - per-shape comparators (xbar_count_pair_cmp etc.)
  - per-shape hashes (xbar_count_hash_i64, count_hash_u32,
    count_hash_i32_i64)
  - per-shape worker fns and their cache-equality / clause-parsing
    helpers (parse_xbar_count_clause, order_count_clauses,
    xbar_clause_cache_eq, match_i16_key_ne_zero, sym_name_eq)
  - the four dispatch sites in ray_select_fn

Queries of these shapes now run through the normal select path.

Test suite: 2818 of 2820 passed (2 skipped, 0 failed).
---
 src/ops/query.c | 1291 -----------------------------------------------
 1 file changed, 1291 deletions(-)

diff --git a/src/ops/query.c b/src/ops/query.c
index a4544414..e8effbcf 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -1493,1260 +1493,6 @@ static int atom_i64_const(ray_t* v, int64_t* out) {
     }
 }
 
-typedef struct {
-    const void* base;
-    int8_t type;
-    uint8_t attrs;
-    int op;
-    int64_t rhs;
-} xbar_count_clause_t;
-
-typedef struct {
-    int64_t key;
-    int64_t count;
-} xbar_count_pair_t;
-
-typedef struct {
-    uint32_t key;
-    uint32_t count;
-} i16x2_count_pair_t;
-
-typedef struct {
-    int32_t key;
-    uint32_t count;
-} i32_count_pair_t;
-
-typedef struct {
-    int16_t key;
-    uint32_t count;
-} i16_count_pair_t;
-
-typedef struct {
-    const int64_t* key_data;
-    int64_t bucket;
-    xbar_count_clause_t clauses[16];
-    uint8_t n_clauses;
-    uint32_t cap;
-    int64_t* keys;
-    uint32_t* counts;
-    uint8_t* used;
-    _Atomic int overflow;
-} xbar_count_ctx_t;
-
-typedef struct {
-    const int16_t* key0;
-    const int16_t* key1;
-    xbar_count_clause_t clauses[16];
-    uint8_t n_clauses;
-    uint32_t cap;
-    uint32_t* keys;
-    uint32_t* counts;
-    uint8_t* used;
-    _Atomic int overflow;
-} i16x2_count_ctx_t;
-
-typedef struct {
-    const int16_t* key;
-    uint32_t* counts;
-} i16_ne0_count_ctx_t;
-
-typedef struct {
-    const int32_t* group;
-    const int64_t* distinct;
-    uint32_t cap;
-    int32_t* groups;
-    int64_t* values;
-    uint8_t* used;
-    _Atomic int overflow;
-} i32_i64_cd_ctx_t;
-
-static int xbar_count_pair_cmp(const void* a, const void* b) {
-    const xbar_count_pair_t* pa = (const xbar_count_pair_t*)a;
-    const xbar_count_pair_t* pb = (const xbar_count_pair_t*)b;
-    return (pa->key > pb->key) - (pa->key < pb->key);
-}
-
-static int i16x2_count_pair_desc_cmp(const void* a, const void* b) {
-    const i16x2_count_pair_t* pa = (const i16x2_count_pair_t*)a;
-    const i16x2_count_pair_t* pb = (const i16x2_count_pair_t*)b;
-    if (pa->count != pb->count)
-        return (pa->count < pb->count) - (pa->count > pb->count);
-    return (pa->key > pb->key) - (pa->key < pb->key);
-}
-
-static int i32_count_pair_desc_cmp(const void* a, const void* b) {
-    const i32_count_pair_t* pa = (const i32_count_pair_t*)a;
-    const i32_count_pair_t* pb = (const i32_count_pair_t*)b;
-    if (pa->count != pb->count)
-        return (pa->count < pb->count) - (pa->count > pb->count);
-    return (pa->key > pb->key) - (pa->key < pb->key);
-}
-
-static int i16_count_pair_desc_cmp(const void* a, const void* b) {
-    const i16_count_pair_t* pa = (const i16_count_pair_t*)a;
-    const i16_count_pair_t* pb = (const i16_count_pair_t*)b;
-    if (pa->count != pb->count)
-        return (pa->count < pb->count) - (pa->count > pb->count);
-    return (pa->key > pb->key) - (pa->key < pb->key);
-}
-
-static uint64_t xbar_count_hash_i64(int64_t v) {
-    uint64_t h = (uint64_t)v;
-    h ^= h >> 33;
-    h *= 0xff51afd7ed558ccdULL;
-    h ^= h >> 33;
-    h *= 0xc4ceb9fe1a85ec53ULL;
-    h ^= h >> 33;
-    return h;
-}
-
-static uint32_t count_hash_u32(uint32_t v) {
-    uint32_t h = v;
-    h ^= h >> 16;
-    h *= 0x7feb352dU;
-    h ^= h >> 15;
-    h *= 0x846ca68bU;
-    h ^= h >> 16;
-    return h;
-}
-
-static uint64_t count_hash_i32_i64(int32_t g, int64_t v) {
-    uint64_t h = (uint64_t)(uint32_t)g * 0x9E3779B97F4A7C15ULL;
-    uint64_t x = (uint64_t)v;
-    x ^= x >> 33;
-    x *= 0xff51afd7ed558ccdULL;
-    x ^= x >> 33;
-    h ^= x + 0xBF58476D1CE4E5B9ULL + (h << 6) + (h >> 2);
-    h ^= h >> 33;
-    return h;
-}
-
-static void xbar_count_worker_fn(void* raw, uint32_t worker_id,
-                                 int64_t start, int64_t end) {
-    xbar_count_ctx_t* ctx = (xbar_count_ctx_t*)raw;
-    uint32_t cap = ctx->cap;
-    uint32_t mask = cap - 1u;
-    int64_t* keys = ctx->keys + (size_t)worker_id * cap;
-    uint32_t* counts = ctx->counts + (size_t)worker_id * cap;
-    uint8_t* used = ctx->used + (size_t)worker_id * cap;
-    int64_t n_groups = 0;
-    int64_t bucket = ctx->bucket;
-
-    for (int64_t r = start; r < end; r++) {
-        uint8_t pass = 1;
-        for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) {
-            const xbar_count_clause_t* c = &ctx->clauses[ci];
-            int64_t v = read_col_i64(c->base, r, c->type, c->attrs);
-            if (c->op == 1) pass &= (uint8_t)(v == c->rhs);
-            else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs);
-            else pass &= (uint8_t)(v <= c->rhs);
-            if (!pass) break;
-        }
-        if (!pass) continue;
-        int64_t ts = ctx->key_data[r];
-        int64_t q = ts / bucket;
-        if ((ts ^ bucket) < 0 && q * bucket != ts) q--;
-        int64_t k = q * bucket;
-        uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask;
-        while (used[slot] && keys[slot] != k)
-            slot = (slot + 1u) & mask;
-        if (!used[slot]) {
-            if (n_groups >= (int64_t)(cap / 2)) {
-                atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed);
-                return;
-            }
-            used[slot] = 1;
-            keys[slot] = k;
-            n_groups++;
-        }
-        counts[slot]++;
-    }
-}
-
-static void i16x2_count_worker_fn(void* raw, uint32_t worker_id,
-                                  int64_t start, int64_t end) {
-    i16x2_count_ctx_t* ctx = (i16x2_count_ctx_t*)raw;
-    uint32_t cap = ctx->cap;
-    uint32_t mask = cap - 1u;
-    uint32_t* keys = ctx->keys + (size_t)worker_id * cap;
-    uint32_t* counts = ctx->counts + (size_t)worker_id * cap;
-    uint8_t* used = ctx->used + (size_t)worker_id * cap;
-    int64_t n_groups = 0;
-
-    for (int64_t r = start; r < end; r++) {
-        uint8_t pass = 1;
-        for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) {
-            const xbar_count_clause_t* c = &ctx->clauses[ci];
-            int64_t v = read_col_i64(c->base, r, c->type, c->attrs);
-            if (c->op == 1) pass &= (uint8_t)(v == c->rhs);
-            else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs);
-            else pass &= (uint8_t)(v <= c->rhs);
-            if (!pass) break;
-        }
-        if (!pass) continue;
-        uint32_t k = ((uint32_t)(uint16_t)ctx->key0[r] << 16) |
-                     (uint32_t)(uint16_t)ctx->key1[r];
-        uint32_t slot = count_hash_u32(k) & mask;
-        while (used[slot] && keys[slot] != k)
-            slot = (slot + 1u) & mask;
-        if (!used[slot]) {
-            if (n_groups >= (int64_t)(cap / 2)) {
-                atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed);
-                return;
-            }
-            used[slot] = 1;
-            keys[slot] = k;
-            n_groups++;
-        }
-        counts[slot]++;
-    }
-}
-
-static void i16_ne0_count_worker_fn(void* raw, uint32_t worker_id,
-                                    int64_t start, int64_t end) {
-    i16_ne0_count_ctx_t* ctx = (i16_ne0_count_ctx_t*)raw;
-    uint32_t* counts = ctx->counts + (size_t)worker_id * 65536u;
-    const int16_t* key = ctx->key;
-    for (int64_t r = start; r < end; r++) {
-        int16_t v = key[r];
-        if (v)
-            counts[(uint32_t)((int32_t)v + 32768)]++;
-    }
-}
-
-static void i32_i64_cd_worker_fn(void* raw, uint32_t worker_id,
-                                 int64_t start, int64_t end) {
-    i32_i64_cd_ctx_t* ctx = (i32_i64_cd_ctx_t*)raw;
-    uint32_t cap = ctx->cap;
-    uint32_t mask = cap - 1u;
-    int32_t* groups = ctx->groups + (size_t)worker_id * cap;
-    int64_t* values = ctx->values + (size_t)worker_id * cap;
-    uint8_t* used = ctx->used + (size_t)worker_id * cap;
-    int64_t n_filled = 0;
-
-    for (int64_t r = start; r < end; r++) {
-        int32_t g = ctx->group[r];
-        int64_t v = ctx->distinct[r];
-        uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & mask;
-        while (used[slot] && (groups[slot] != g || values[slot] != v))
-            slot = (slot + 1u) & mask;
-        if (!used[slot]) {
-            if (n_filled >= (int64_t)(cap * 7u / 10u)) {
-                atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed);
-                return;
-            }
-            used[slot] = 1;
-            groups[slot] = g;
-            values[slot] = v;
-            n_filled++;
-        }
-    }
-}
-
-static int sym_name_eq(int64_t sym, const char* name, size_t len) {
-    ray_t* s = ray_sym_str(sym);
-    return s && ray_str_len(s) == len &&
-           memcmp(ray_str_ptr(s), name, len) == 0;
-}
-
-static int parse_xbar_count_clause(ray_t* tbl, ray_t* expr,
-                                   xbar_count_clause_t* clauses,
-                                   uint8_t* n_clauses) {
-    if (!expr || expr->type != RAY_LIST || ray_len(expr) < 3) return 0;
-    ray_t** elems = (ray_t**)ray_data(expr);
-    if (!elems[0] || elems[0]->type != -RAY_SYM) return 0;
-    ray_t* head = ray_sym_str(elems[0]->i64);
-    if (!head) return 0;
-    const char* hn = ray_str_ptr(head);
-    size_t hl = ray_str_len(head);
-    if (hl == 3 && memcmp(hn, "and", 3) == 0) {
-        for (int64_t i = 1; i < ray_len(expr); i++)
-            if (!parse_xbar_count_clause(tbl, elems[i], clauses, n_clauses))
-                return 0;
-        return 1;
-    }
-    if (ray_len(expr) != 3 || *n_clauses >= 16) return 0;
-    int op = 0;
-    if (hl == 2 && memcmp(hn, "==", 2) == 0) op = 1;
-    else if (hl == 2 && memcmp(hn, ">=", 2) == 0) op = 2;
-    else if (hl == 2 && memcmp(hn, "<=", 2) == 0) op = 3;
-    else return 0;
-
-    ray_t* lhs = elems[1];
-    ray_t* rhs = elems[2];
-    int64_t rhs_i = 0;
-    if (!lhs || lhs->type != -RAY_SYM || !(lhs->attrs & RAY_ATTR_NAME) ||
-        !atom_i64_const(rhs, &rhs_i))
-        return 0;
-    ray_t* col = ray_table_get_col(tbl, lhs->i64);
-    if (!col || !ray_is_vec(col) || RAY_IS_PARTED(col->type) ||
-        col->type == RAY_MAPCOMMON || (col->attrs & RAY_ATTR_HAS_NULLS))
-        return 0;
-    int8_t ct = col->type;
-    if (ct != RAY_BOOL && ct != RAY_U8 && ct != RAY_I16 &&
-        ct != RAY_I32 && ct != RAY_I64 && ct != RAY_DATE &&
-        ct != RAY_TIME && ct != RAY_TIMESTAMP)
-        return 0;
-    clauses[*n_clauses] = (xbar_count_clause_t){
-        .base = ray_data(col),
-        .type = ct,
-        .attrs = col->attrs,
-        .op = op,
-        .rhs = rhs_i,
-    };
-    (*n_clauses)++;
-    return 1;
-}
-
-static int count_clause_score(const xbar_count_clause_t* c) {
-    if (c->op == 1 && ray_sym_elem_size(c->type, c->attrs) >= 8) return 0;
-    if (c->op == 1) return 1;
-    return 2;
-}
-
-static void order_count_clauses(xbar_count_clause_t* clauses, uint8_t n) {
-    for (uint8_t i = 1; i < n; i++) {
-        xbar_count_clause_t v = clauses[i];
-        int vs = count_clause_score(&v);
-        uint8_t j = i;
-        while (j > 0 && count_clause_score(&clauses[j - 1]) > vs) {
-            clauses[j] = clauses[j - 1];
-            j--;
-        }
-        clauses[j] = v;
-    }
-}
-
-static int xbar_clause_cache_eq(const xbar_count_clause_t* a, uint8_t an,
-                                const xbar_count_clause_t* b, uint8_t bn) {
-    if (an != bn) return 0;
-    for (uint8_t i = 0; i < an; i++) {
-        if (a[i].base != b[i].base || a[i].type != b[i].type ||
-            a[i].attrs != b[i].attrs || a[i].op != b[i].op ||
-            a[i].rhs != b[i].rhs)
-            return 0;
-    }
-    return 1;
-}
-
-static int match_i16_key_ne_zero(ray_t* where_expr, int64_t key_sym) {
-    if (!where_expr || where_expr->type != RAY_LIST || ray_len(where_expr) != 3)
-        return 0;
-    ray_t** e = (ray_t**)ray_data(where_expr);
-    if (!e[0] || e[0]->type != -RAY_SYM ||
-        !sym_name_eq(e[0]->i64, "!=", 2))
-        return 0;
-    ray_t* lhs = e[1];
-    int64_t rhs = 0;
-    return lhs && lhs->type == -RAY_SYM && (lhs->attrs & RAY_ATTR_NAME) &&
-           lhs->i64 == key_sym && atom_i64_const(e[2], &rhs) && rhs == 0;
-}
-
-static ray_t* try_i16_ne0_count_desc_select(ray_t* tbl, ray_t* where_expr,
-                                            ray_t* by_expr, ray_t* take_expr,
-                                            ray_t** dict_elems,
-                                            int64_t dict_n,
-                                            int64_t from_id,
-                                            int64_t where_id,
-                                            int64_t by_id,
-                                            int64_t take_id,
-                                            int64_t asc_id,
-                                            int64_t desc_id,
-                                            int64_t nearest_id) {
-    if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr ||
-        !take_expr || by_expr->type != -RAY_SYM ||
-        !(by_expr->attrs & RAY_ATTR_NAME))
-        return NULL;
-    int64_t key_sym = by_expr->i64;
-    int64_t take_n = 0;
-    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024)
-        return NULL;
-    if (!match_i16_key_ne_zero(where_expr, key_sym))
-        return NULL;
-
-    int64_t count_alias = -1;
-    int saw_desc = 0;
-    int saw_key_projection = 0;
-    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
-        int64_t kid = dict_elems[i]->i64;
-        ray_t* v = dict_elems[i + 1];
-        if (kid == from_id || kid == where_id || kid == by_id ||
-            kid == take_id || kid == nearest_id)
-            continue;
-        if (kid == desc_id) {
-            if (!v || v->type != -RAY_SYM)
-                return NULL;
-            saw_desc = 1;
-            continue;
-        }
-        if (kid == asc_id) return NULL;
-        if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) &&
-            kid == key_sym && v->i64 == key_sym) {
-            saw_key_projection = 1;
-            continue;
-        }
-        if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2)
-            return NULL;
-        ray_t** ae = (ray_t**)ray_data(v);
-        if (!ae[0] || ae[0]->type != -RAY_SYM ||
-            !sym_name_eq(ae[0]->i64, "count", 5))
-            return NULL;
-        ray_t* arg = ae[1];
-        if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME) ||
-            arg->i64 != key_sym)
-            return NULL;
-        count_alias = kid;
-    }
-    if (!saw_desc || !saw_key_projection || count_alias < 0)
-        return NULL;
-
-    ray_t* col = ray_table_get_col(tbl, key_sym);
-    if (!col || !ray_is_vec(col) || col->type != RAY_I16 ||
-        (col->attrs & RAY_ATTR_HAS_NULLS))
-        return NULL;
-
-    static ray_t* cache_result = NULL;
-    static ray_t* cache_tbl = NULL;
-    static ray_t* cache_col = NULL;
-    static int64_t cache_len = -1;
-    static int64_t cache_key_sym = -1;
-    static int64_t cache_count_alias = -1;
-    static int64_t cache_take = -1;
-    if (cache_result && cache_tbl == tbl && cache_col == col &&
-        cache_len == col->len && cache_key_sym == key_sym &&
-        cache_count_alias == count_alias && cache_take == take_n) {
-        ray_retain(cache_result);
-        return cache_result;
-    }
-
-    ray_pool_t* pool = ray_pool_get();
-    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    if (nw == 0) nw = 1;
-    ray_t* counts_hdr = NULL;
-    uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
-        (size_t)nw * 65536u * sizeof(uint32_t));
-    if (!counts)
-        return ray_error("oom", NULL);
-
-    i16_ne0_count_ctx_t ctx = {
-        .key = (const int16_t*)ray_data(col),
-        .counts = counts,
-    };
-    int64_t nrows = ray_table_nrows(tbl);
-    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
-        ray_pool_dispatch(pool, i16_ne0_count_worker_fn, &ctx, nrows);
-    else
-        i16_ne0_count_worker_fn(&ctx, 0, 0, nrows);
-
-    i16_count_pair_t top[1024];
-    int64_t top_n = 0;
-    for (uint32_t s = 0; s < 65536u; s++) {
-        uint32_t total = 0;
-        for (uint32_t w = 0; w < nw; w++)
-            total += counts[(size_t)w * 65536u + s];
-        if (!total) continue;
-        i16_count_pair_t cand = {
-            .key = (int16_t)((int32_t)s - 32768),
-            .count = total,
-        };
-        if (top_n < take_n) {
-            top[top_n++] = cand;
-            continue;
-        }
-        int64_t min_i = 0;
-        for (int64_t i = 1; i < top_n; i++) {
-            if (top[i].count < top[min_i].count ||
-                (top[i].count == top[min_i].count && top[i].key > top[min_i].key))
-                min_i = i;
-        }
-        if (cand.count > top[min_i].count ||
-            (cand.count == top[min_i].count && cand.key < top[min_i].key))
-            top[min_i] = cand;
-    }
-    scratch_free(counts_hdr);
-    qsort(top, (size_t)top_n, sizeof(i16_count_pair_t),
-          i16_count_pair_desc_cmp);
-
-    int64_t out_n = top_n;
-    ray_t* key_out = ray_vec_new(RAY_I16, out_n);
-    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
-    if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) {
-        if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out);
-        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
-        return ray_error("oom", NULL);
-    }
-    key_out->len = out_n;
-    cnt_out->len = out_n;
-    int16_t* ko = (int16_t*)ray_data(key_out);
-    int64_t* co = (int64_t*)ray_data(cnt_out);
-    for (int64_t i = 0; i < out_n; i++) {
-        ko[i] = top[i].key;
-        co[i] = (int64_t)top[i].count;
-    }
-
-    ray_t* out = ray_table_new(2);
-    if (!out || RAY_IS_ERR(out)) {
-        ray_release(key_out); ray_release(cnt_out);
-        return out ? out : ray_error("oom", NULL);
-    }
-    out = ray_table_add_col(out, key_sym, key_out);
-    out = ray_table_add_col(out, count_alias, cnt_out);
-    ray_release(key_out); ray_release(cnt_out);
-    if (cache_result)
-        ray_release(cache_result);
-    cache_result = out;
-    cache_tbl = tbl;
-    cache_col = col;
-    cache_len = col->len;
-    cache_key_sym = key_sym;
-    cache_count_alias = count_alias;
-    cache_take = take_n;
-    ray_retain(cache_result);
-    return out;
-}
-
-static ray_t* try_i32_i64_count_distinct_select(ray_t* tbl, ray_t* where_expr,
-                                                ray_t* by_expr,
-                                                ray_t* take_expr,
-                                                ray_t** dict_elems,
-                                                int64_t dict_n,
-                                                int64_t from_id,
-                                                int64_t where_id,
-                                                int64_t by_id,
-                                                int64_t take_id,
-                                                int64_t asc_id,
-                                                int64_t desc_id,
-                                                int64_t nearest_id) {
-    if (!tbl || tbl->type != RAY_TABLE || where_expr || !by_expr ||
-        !take_expr || by_expr->type != -RAY_SYM ||
-        !(by_expr->attrs & RAY_ATTR_NAME))
-        return NULL;
-
-    int64_t take_n = 0;
-    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024)
-        return NULL;
-
-    int64_t group_sym = by_expr->i64;
-    int64_t distinct_sym = -1;
-    int64_t count_alias = -1;
-    int saw_desc = 0;
-    int saw_group_projection = 0;
-    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
-        int64_t kid = dict_elems[i]->i64;
-        ray_t* v = dict_elems[i + 1];
-        if (kid == from_id || kid == where_id || kid == by_id ||
-            kid == take_id || kid == nearest_id)
-            continue;
-        if (kid == desc_id) {
-            if (!v || v->type != -RAY_SYM)
-                return NULL;
-            saw_desc = 1;
-            continue;
-        }
-        if (kid == asc_id) return NULL;
-        if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) &&
-            kid == group_sym && v->i64 == group_sym) {
-            saw_group_projection = 1;
-            continue;
-        }
-        if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2)
-            return NULL;
-        ray_t** ae = (ray_t**)ray_data(v);
-        if (!ae[0] || ae[0]->type != -RAY_SYM ||
-            !sym_name_eq(ae[0]->i64, "count", 5))
-            return NULL;
-        ray_t* inner = ae[1];
-        if (!inner || inner->type != RAY_LIST || ray_len(inner) != 2)
-            return NULL;
-        ray_t** ie = (ray_t**)ray_data(inner);
-        if (!ie[0] || ie[0]->type != -RAY_SYM ||
-            !sym_name_eq(ie[0]->i64, "distinct", 8))
-            return NULL;
-        ray_t* arg = ie[1];
-        if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME))
-            return NULL;
-        distinct_sym = arg->i64;
-        count_alias = kid;
-    }
-    if (!saw_desc || !saw_group_projection || count_alias < 0 ||
-        distinct_sym < 0)
-        return NULL;
-
-    ray_t* gcol = ray_table_get_col(tbl, group_sym);
-    ray_t* dcol = ray_table_get_col(tbl, distinct_sym);
-    if (!gcol || !dcol || !ray_is_vec(gcol) || !ray_is_vec(dcol) ||
-        gcol->type != RAY_I32 || dcol->type != RAY_I64 ||
-        (gcol->attrs & RAY_ATTR_HAS_NULLS) ||
-        (dcol->attrs & RAY_ATTR_HAS_NULLS))
-        return NULL;
-
-    static ray_t* cache_result = NULL;
-    static ray_t* cache_tbl = NULL;
-    static int64_t cache_len = -1;
-    static int64_t cache_group_sym = -1;
-    static int64_t cache_distinct_sym = -1;
-    static int64_t cache_count_alias = -1;
-    static int64_t cache_take = -1;
-    if (cache_result && cache_tbl == tbl && cache_len == gcol->len &&
-        cache_group_sym == group_sym && cache_distinct_sym == distinct_sym &&
-        cache_count_alias == count_alias && cache_take == take_n) {
-        ray_retain(cache_result);
-        return cache_result;
-    }
-
-    int64_t nrows = ray_table_nrows(tbl);
-    ray_pool_t* pool = ray_pool_get();
-    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    if (nw == 0) nw = 1;
-    const uint32_t local_cap = 1u << 20;
-    ray_t *lg_hdr = NULL, *lv_hdr = NULL, *lu_hdr = NULL;
-    int32_t* lg = (int32_t*)scratch_calloc(&lg_hdr,
-        (size_t)nw * local_cap * sizeof(int32_t));
-    int64_t* lv = (int64_t*)scratch_calloc(&lv_hdr,
-        (size_t)nw * local_cap * sizeof(int64_t));
-    uint8_t* lu = (uint8_t*)scratch_calloc(&lu_hdr, (size_t)nw * local_cap);
-    if (!lg || !lv || !lu) {
-        if (lg_hdr) scratch_free(lg_hdr);
-        if (lv_hdr) scratch_free(lv_hdr);
-        if (lu_hdr) scratch_free(lu_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    i32_i64_cd_ctx_t ctx = {
-        .group = (const int32_t*)ray_data(gcol),
-        .distinct = (const int64_t*)ray_data(dcol),
-        .cap = local_cap,
-        .groups = lg,
-        .values = lv,
-        .used = lu,
-    };
-    atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed);
-    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
-        ray_pool_dispatch(pool, i32_i64_cd_worker_fn, &ctx, nrows);
-    else
-        i32_i64_cd_worker_fn(&ctx, 0, 0, nrows);
-    if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) {
-        scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
-        return NULL;
-    }
-
-    const uint32_t gcap = 1u << 23;
-    const uint32_t gmask = gcap - 1u;
-    ray_t *gg_hdr = NULL, *gv_hdr = NULL, *gu_hdr = NULL;
-    int32_t* gg = (int32_t*)scratch_calloc(&gg_hdr, (size_t)gcap * sizeof(int32_t));
-    int64_t* gv = (int64_t*)scratch_calloc(&gv_hdr, (size_t)gcap * sizeof(int64_t));
-    uint8_t* gu = (uint8_t*)scratch_calloc(&gu_hdr, (size_t)gcap);
-    if (!gg || !gv || !gu) {
-        scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
-        if (gg_hdr) scratch_free(gg_hdr);
-        if (gv_hdr) scratch_free(gv_hdr);
-        if (gu_hdr) scratch_free(gu_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    int64_t global_n = 0;
-    for (uint32_t w = 0; w < nw; w++) {
-        int32_t* wg = lg + (size_t)w * local_cap;
-        int64_t* wv = lv + (size_t)w * local_cap;
-        uint8_t* wu = lu + (size_t)w * local_cap;
-        for (uint32_t s = 0; s < local_cap; s++) {
-            if (!wu[s]) continue;
-            int32_t g = wg[s];
-            int64_t v = wv[s];
-            uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & gmask;
-            while (gu[slot] && (gg[slot] != g || gv[slot] != v))
-                slot = (slot + 1u) & gmask;
-            if (!gu[slot]) {
-                if (global_n >= (int64_t)(gcap * 7u / 10u)) {
-                    scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr);
-                    scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
-                    return NULL;
-                }
-                gu[slot] = 1;
-                gg[slot] = g;
-                gv[slot] = v;
-                global_n++;
-            }
-        }
-    }
-    scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
-
-    const uint32_t rcap = 4096;
-    const uint32_t rmask = rcap - 1u;
-    int32_t rkeys[4096];
-    uint32_t rcounts[4096];
-    uint8_t rused[4096];
-    memset(rused, 0, sizeof(rused));
-    int64_t region_n = 0;
-    for (uint32_t s = 0; s < gcap; s++) {
-        if (!gu[s]) continue;
-        int32_t g = gg[s];
-        uint32_t slot = count_hash_u32((uint32_t)g) & rmask;
-        while (rused[slot] && rkeys[slot] != g)
-            slot = (slot + 1u) & rmask;
-        if (!rused[slot]) {
-            if (region_n >= (int64_t)(rcap / 2)) {
-                scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr);
-                return NULL;
-            }
-            rused[slot] = 1;
-            rkeys[slot] = g;
-            rcounts[slot] = 0;
-            region_n++;
-        }
-        rcounts[slot]++;
-    }
-    scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr);
-
-    ray_t* pairs_hdr = NULL;
-    i32_count_pair_t* pairs = (i32_count_pair_t*)scratch_alloc(
-        &pairs_hdr, (size_t)region_n * sizeof(i32_count_pair_t));
-    if (!pairs && region_n > 0)
-        return ray_error("oom", NULL);
-    int64_t pi = 0;
-    for (uint32_t s = 0; s < rcap; s++) {
-        if (!rused[s]) continue;
-        pairs[pi++] = (i32_count_pair_t){ .key = rkeys[s], .count = rcounts[s] };
-    }
-    qsort(pairs, (size_t)region_n, sizeof(i32_count_pair_t),
-          i32_count_pair_desc_cmp);
-
-    int64_t out_n = region_n < take_n ? region_n : take_n;
-    ray_t* key_out = ray_vec_new(RAY_I32, out_n);
-    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
-    if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) {
-        if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out);
-        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
-        scratch_free(pairs_hdr);
-        return ray_error("oom", NULL);
-    }
-    key_out->len = out_n;
-    cnt_out->len = out_n;
-    int32_t* ko = (int32_t*)ray_data(key_out);
-    int64_t* co = (int64_t*)ray_data(cnt_out);
-    for (int64_t i = 0; i < out_n; i++) {
-        ko[i] = pairs[i].key;
-        co[i] = (int64_t)pairs[i].count;
-    }
-    scratch_free(pairs_hdr);
-
-    ray_t* out = ray_table_new(2);
-    if (!out || RAY_IS_ERR(out)) {
-        ray_release(key_out); ray_release(cnt_out);
-        return out ? out : ray_error("oom", NULL);
-    }
-    out = ray_table_add_col(out, group_sym, key_out);
-    out = ray_table_add_col(out, count_alias, cnt_out);
-    ray_release(key_out); ray_release(cnt_out);
-    if (cache_result)
-        ray_release(cache_result);
-    cache_result = out;
-    cache_tbl = tbl;
-    cache_len = gcol->len;
-    cache_group_sym = group_sym;
-    cache_distinct_sym = distinct_sym;
-    cache_count_alias = count_alias;
-    cache_take = take_n;
-    ray_retain(cache_result);
-    return out;
-}
-
-static ray_t* try_i16x2_count_desc_select(ray_t* tbl, ray_t* where_expr,
-                                          ray_t* by_expr, ray_t* take_expr,
-                                          ray_t** dict_elems, int64_t dict_n,
-                                          int64_t from_id, int64_t where_id,
-                                          int64_t by_id, int64_t take_id,
-                                          int64_t asc_id, int64_t desc_id,
-                                          int64_t nearest_id) {
-    if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr ||
-        !take_expr || by_expr->type != RAY_DICT)
-        return NULL;
-
-    int64_t take_n = 0;
-    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000)
-        return NULL;
-
-    DICT_VIEW_DECL(bv);
-    DICT_VIEW_OPEN(by_expr, bv);
-    if (DICT_VIEW_OVERFLOW(bv) || bv_n != 4) return NULL;
-    ray_t* key0_atom = bv[0];
-    ray_t* key0_val = bv[1];
-    ray_t* key1_atom = bv[2];
-    ray_t* key1_val = bv[3];
-    if (!key0_atom || key0_atom->type != -RAY_SYM ||
-        !key1_atom || key1_atom->type != -RAY_SYM ||
-        !key0_val || key0_val->type != -RAY_SYM ||
-        !key1_val || key1_val->type != -RAY_SYM ||
-        !(key0_val->attrs & RAY_ATTR_NAME) ||
-        !(key1_val->attrs & RAY_ATTR_NAME) ||
-        key0_atom->i64 != key0_val->i64 ||
-        key1_atom->i64 != key1_val->i64)
-        return NULL;
-
-    int64_t count_alias = -1;
-    int saw_desc = 0;
-    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
-        int64_t kid = dict_elems[i]->i64;
-        ray_t* v = dict_elems[i + 1];
-        if (kid == from_id || kid == where_id || kid == by_id ||
-            kid == take_id || kid == nearest_id)
-            continue;
-        if (kid == desc_id) {
-            if (!v || v->type != -RAY_SYM)
-                return NULL;
-            saw_desc = 1;
-            continue;
-        }
-        if (kid == asc_id) return NULL;
-        if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL;
-        ray_t** ae = (ray_t**)ray_data(v);
-        if (!ae[0] || ae[0]->type != -RAY_SYM ||
-            !sym_name_eq(ae[0]->i64, "count", 5))
-            return NULL;
-        count_alias = kid;
-    }
-    if (!saw_desc || count_alias < 0) return NULL;
-
-    ray_t* col0 = ray_table_get_col(tbl, key0_atom->i64);
-    ray_t* col1 = ray_table_get_col(tbl, key1_atom->i64);
-    if (!col0 || !col1 || !ray_is_vec(col0) || !ray_is_vec(col1) ||
-        col0->type != RAY_I16 || col1->type != RAY_I16 ||
-        (col0->attrs & RAY_ATTR_HAS_NULLS) ||
-        (col1->attrs & RAY_ATTR_HAS_NULLS))
-        return NULL;
-
-    xbar_count_clause_t clauses[16];
-    uint8_t n_clauses = 0;
-    if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) ||
-        n_clauses == 0)
-        return NULL;
-    order_count_clauses(clauses, n_clauses);
-
-    static ray_t* cache_result = NULL;
-    static ray_t* cache_tbl = NULL;
-    static ray_t* cache_col0 = NULL;
-    static ray_t* cache_col1 = NULL;
-    static int64_t cache_len = -1;
-    static int64_t cache_key0 = -1;
-    static int64_t cache_key1 = -1;
-    static int64_t cache_count_alias = -1;
-    static int64_t cache_take = -1;
-    static uint8_t cache_n_clauses = 0;
-    static xbar_count_clause_t cache_clauses[16];
-    if (cache_result && cache_tbl == tbl && cache_col0 == col0 &&
-        cache_col1 == col1 && cache_len == col0->len &&
-        cache_key0 == key0_atom->i64 && cache_key1 == key1_atom->i64 &&
-        cache_count_alias == count_alias && cache_take == take_n &&
-        xbar_clause_cache_eq(cache_clauses, cache_n_clauses,
-                             clauses, n_clauses)) {
-        ray_retain(cache_result);
-        return cache_result;
-    }
-
-    int64_t nrows = ray_table_nrows(tbl);
-    const uint32_t cap = 4096;
-    const uint32_t mask = cap - 1u;
-    ray_pool_t* pool = ray_pool_get();
-    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    if (nw == 0) nw = 1;
-
-    ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL;
-    uint32_t* keys = (uint32_t*)scratch_calloc(&keys_hdr,
-        (size_t)nw * cap * sizeof(uint32_t));
-    uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
-        (size_t)nw * cap * sizeof(uint32_t));
-    uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap);
-    if (!keys || !counts || !used) {
-        if (keys_hdr) scratch_free(keys_hdr);
-        if (counts_hdr) scratch_free(counts_hdr);
-        if (used_hdr) scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    i16x2_count_ctx_t ctx = {
-        .key0 = (const int16_t*)ray_data(col0),
-        .key1 = (const int16_t*)ray_data(col1),
-        .n_clauses = n_clauses,
-        .cap = cap,
-        .keys = keys,
-        .counts = counts,
-        .used = used,
-    };
-    memcpy(ctx.clauses, clauses, sizeof(clauses));
-    atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed);
-    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
-        ray_pool_dispatch(pool, i16x2_count_worker_fn, &ctx, nrows);
-    else
-        i16x2_count_worker_fn(&ctx, 0, 0, nrows);
-    if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) {
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        return NULL;
-    }
-
-    ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL;
-    uint32_t* mkeys = (uint32_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(uint32_t));
-    uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t));
-    uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap);
-    if (!mkeys || !mcounts || !mused) {
-        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
-        if (mkeys_hdr) scratch_free(mkeys_hdr);
-        if (mcounts_hdr) scratch_free(mcounts_hdr);
-        if (mused_hdr) scratch_free(mused_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    int64_t n_groups = 0;
-    for (uint32_t w = 0; w < nw; w++) {
-        uint32_t* wk = keys + (size_t)w * cap;
-        uint32_t* wc = counts + (size_t)w * cap;
-        uint8_t* wu = used + (size_t)w * cap;
-        for (uint32_t s = 0; s < cap; s++) {
-            if (!wu[s]) continue;
-            uint32_t k = wk[s];
-            uint32_t slot = count_hash_u32(k) & mask;
-            while (mused[slot] && mkeys[slot] != k)
-                slot = (slot + 1u) & mask;
-            if (!mused[slot]) {
-                if (n_groups >= (int64_t)(cap / 2)) {
-                    scratch_free(mkeys_hdr); scratch_free(mcounts_hdr);
-                    scratch_free(mused_hdr); scratch_free(keys_hdr);
-                    scratch_free(counts_hdr); scratch_free(used_hdr);
-                    return NULL;
-                }
-                mused[slot] = 1;
-                mkeys[slot] = k;
-                n_groups++;
-            }
-            mcounts[slot] += wc[s];
-        }
-    }
-
-    int64_t out_n = n_groups < take_n ? n_groups : take_n;
-    ray_t* pairs_hdr = NULL;
-    i16x2_count_pair_t* pairs = (i16x2_count_pair_t*)scratch_alloc(
-        &pairs_hdr, (size_t)n_groups * sizeof(i16x2_count_pair_t));
-    if (!pairs && n_groups > 0) {
-        scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
-        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-    int64_t pi = 0;
-    for (uint32_t s = 0; s < cap; s++) {
-        if (!mused[s]) continue;
-        pairs[pi++] = (i16x2_count_pair_t){ .key = mkeys[s], .count = mcounts[s] };
-    }
-    qsort(pairs, (size_t)n_groups, sizeof(i16x2_count_pair_t),
-          i16x2_count_pair_desc_cmp);
-
-    ray_t* key0_out = ray_vec_new(RAY_I16, out_n);
-    ray_t* key1_out = ray_vec_new(RAY_I16, out_n);
-    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
-    if (!key0_out || !key1_out || !cnt_out ||
-        RAY_IS_ERR(key0_out) || RAY_IS_ERR(key1_out) || RAY_IS_ERR(cnt_out)) {
-        if (key0_out && !RAY_IS_ERR(key0_out)) ray_release(key0_out);
-        if (key1_out && !RAY_IS_ERR(key1_out)) ray_release(key1_out);
-        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
-        scratch_free(pairs_hdr);
-        scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
-        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-    key0_out->len = out_n;
-    key1_out->len = out_n;
-    cnt_out->len = out_n;
-    int16_t* k0o = (int16_t*)ray_data(key0_out);
-    int16_t* k1o = (int16_t*)ray_data(key1_out);
-    int64_t* co = (int64_t*)ray_data(cnt_out);
-    for (int64_t i = 0; i < out_n; i++) {
-        uint32_t k = pairs[i].key;
-        k0o[i] = (int16_t)(uint16_t)(k >> 16);
-        k1o[i] = (int16_t)(uint16_t)k;
-        co[i] = (int64_t)pairs[i].count;
-    }
-    scratch_free(pairs_hdr);
-    scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
-    scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
-
-    ray_t* out = ray_table_new(3);
-    if (!out || RAY_IS_ERR(out)) {
-        ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out);
-        return out ? out : ray_error("oom", NULL);
-    }
-    out = ray_table_add_col(out, key0_atom->i64, key0_out);
-    out = ray_table_add_col(out, key1_atom->i64, key1_out);
-    out = ray_table_add_col(out, count_alias, cnt_out);
-    ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out);
-    if (cache_result)
-        ray_release(cache_result);
-    cache_result = out;
-    cache_tbl = tbl;
-    cache_col0 = col0;
-    cache_col1 = col1;
-    cache_len = col0->len;
-    cache_key0 = key0_atom->i64;
-    cache_key1 = key1_atom->i64;
-    cache_count_alias = count_alias;
-    cache_take = take_n;
-    cache_n_clauses = n_clauses;
-    memcpy(cache_clauses, clauses, sizeof(clauses));
-    ray_retain(cache_result);
-    return out;
-}
-
-static ray_t* try_xbar_count_select(ray_t* tbl, ray_t* where_expr,
-                                    ray_t* by_expr, ray_t* take_expr,
-                                    ray_t** dict_elems, int64_t dict_n,
-                                    int64_t from_id, int64_t where_id,
-                                    int64_t by_id, int64_t take_id,
-                                    int64_t asc_id, int64_t desc_id,
-                                    int64_t nearest_id) {
-    if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr ||
-        !take_expr)
-        return NULL;
-
-    int64_t take_n = 0;
-    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000)
-        return NULL;
-
-    if (!by_expr || by_expr->type != RAY_DICT) return NULL;
-    DICT_VIEW_DECL(bv);
-    DICT_VIEW_OPEN(by_expr, bv);
-    if (DICT_VIEW_OVERFLOW(bv) || bv_n != 2) return NULL;
-    ray_t* key_atom = bv[0];
-    ray_t* xbar_expr = bv[1];
-    if (!key_atom || key_atom->type != -RAY_SYM ||
-        !xbar_expr || xbar_expr->type != RAY_LIST ||
-        ray_len(xbar_expr) != 3)
-        return NULL;
-    ray_t** xe = (ray_t**)ray_data(xbar_expr);
-    if (!xe[0] || xe[0]->type != -RAY_SYM ||
-        !sym_name_eq(xe[0]->i64, "xbar", 4))
-        return NULL;
-    if (!xe[1] || xe[1]->type != -RAY_SYM ||
-        !(xe[1]->attrs & RAY_ATTR_NAME))
-        return NULL;
-    int64_t bucket = 0;
-    if (!atom_i64_const(xe[2], &bucket) || bucket <= 0) return NULL;
-
-    int64_t count_alias = -1;
-    int saw_asc = 0;
-    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
-        int64_t kid = dict_elems[i]->i64;
-        ray_t* v = dict_elems[i + 1];
-        if (kid == from_id || kid == where_id || kid == by_id ||
-            kid == take_id || kid == nearest_id)
-            continue;
-        if (kid == asc_id) {
-            if (!v || v->type != -RAY_SYM || v->i64 != key_atom->i64)
-                return NULL;
-            saw_asc = 1;
-            continue;
-        }
-        if (kid == desc_id) return NULL;
-        if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL;
-        ray_t** ae = (ray_t**)ray_data(v);
-        if (!ae[0] || ae[0]->type != -RAY_SYM ||
-            !sym_name_eq(ae[0]->i64, "count", 5))
-            return NULL;
-        count_alias = kid;
-    }
-    if (!saw_asc || count_alias < 0) return NULL;
-
-    ray_t* key_col = ray_table_get_col(tbl, xe[1]->i64);
-    if (!key_col || !ray_is_vec(key_col) || key_col->type != RAY_TIMESTAMP ||
-        RAY_IS_PARTED(key_col->type) || key_col->type == RAY_MAPCOMMON ||
-        (key_col->attrs & RAY_ATTR_HAS_NULLS))
-        return NULL;
-
-    xbar_count_clause_t clauses[16];
-    uint8_t n_clauses = 0;
-    if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) ||
-        n_clauses == 0)
-        return NULL;
-    order_count_clauses(clauses, n_clauses);
-
-    int64_t nrows = ray_table_nrows(tbl);
-    const int64_t* key_data = (const int64_t*)ray_data(key_col);
-    static ray_t* cache_result = NULL;
-    static ray_t* cache_tbl = NULL;
-    static ray_t* cache_key_col = NULL;
-    static int64_t cache_len = -1;
-    static int64_t cache_key_sym = -1;
-    static int64_t cache_out_sym = -1;
-    static int64_t cache_count_alias = -1;
-    static int64_t cache_bucket = -1;
-    static int64_t cache_take = -1;
-    static uint8_t cache_n_clauses = 0;
-    static xbar_count_clause_t cache_clauses[16];
-    if (cache_result && cache_tbl == tbl && cache_key_col == key_col &&
-        cache_len == key_col->len && cache_key_sym == xe[1]->i64 &&
-        cache_out_sym == key_atom->i64 && cache_count_alias == count_alias &&
-        cache_bucket == bucket && cache_take == take_n &&
-        xbar_clause_cache_eq(cache_clauses, cache_n_clauses,
-                             clauses, n_clauses)) {
-        ray_retain(cache_result);
-        return cache_result;
-    }
-    const uint32_t cap = 4096;
-    const uint32_t mask = cap - 1u;
-    ray_pool_t* pool = ray_pool_get();
-    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    if (nw == 0) nw = 1;
-    ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL;
-    int64_t* keys = (int64_t*)scratch_calloc(&keys_hdr,
-        (size_t)nw * cap * sizeof(int64_t));
-    uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
-        (size_t)nw * cap * sizeof(uint32_t));
-    uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap);
-    if (!keys || !counts || !used) {
-        if (keys_hdr) scratch_free(keys_hdr);
-        if (counts_hdr) scratch_free(counts_hdr);
-        if (used_hdr) scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    xbar_count_ctx_t ctx = {
-        .key_data = key_data,
-        .bucket = bucket,
-        .n_clauses = n_clauses,
-        .cap = cap,
-        .keys = keys,
-        .counts = counts,
-        .used = used,
-    };
-    memcpy(ctx.clauses, clauses, sizeof(clauses));
-    atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed);
-    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
-        ray_pool_dispatch(pool, xbar_count_worker_fn, &ctx, nrows);
-    else
-        xbar_count_worker_fn(&ctx, 0, 0, nrows);
-    if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) {
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        return NULL;
-    }
-
-    ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL;
-    int64_t* mkeys = (int64_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(int64_t));
-    uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t));
-    uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap);
-    if (!mkeys || !mcounts || !mused) {
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        if (mkeys_hdr) scratch_free(mkeys_hdr);
-        if (mcounts_hdr) scratch_free(mcounts_hdr);
-        if (mused_hdr) scratch_free(mused_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    int64_t n_groups = 0;
-    for (uint32_t w = 0; w < nw; w++) {
-        int64_t* wk = keys + (size_t)w * cap;
-        uint32_t* wc = counts + (size_t)w * cap;
-        uint8_t* wu = used + (size_t)w * cap;
-        for (uint32_t s = 0; s < cap; s++) {
-            if (!wu[s]) continue;
-            int64_t k = wk[s];
-            uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask;
-            while (mused[slot] && mkeys[slot] != k)
-                slot = (slot + 1u) & mask;
-            if (!mused[slot]) {
-                if (n_groups >= (int64_t)(cap / 2)) {
-                    scratch_free(mkeys_hdr);
-                    scratch_free(mcounts_hdr);
-                    scratch_free(mused_hdr);
-                    scratch_free(keys_hdr);
-                    scratch_free(counts_hdr);
-                    scratch_free(used_hdr);
-                    return NULL;
-                }
-                mused[slot] = 1;
-                mkeys[slot] = k;
-                n_groups++;
-            }
-            mcounts[slot] += wc[s];
-        }
-    }
-
-    int64_t out_n = n_groups < take_n ? n_groups : take_n;
-    ray_t* pairs_hdr = NULL;
-    xbar_count_pair_t* pairs = (xbar_count_pair_t*)scratch_alloc(
-        &pairs_hdr, (size_t)n_groups * sizeof(xbar_count_pair_t));
-    if (!pairs && n_groups > 0) {
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-    int64_t pi = 0;
-    for (uint32_t s = 0; s < cap; s++) {
-        if (!mused[s]) continue;
-        pairs[pi++] = (xbar_count_pair_t){ .key = mkeys[s], .count = mcounts[s] };
-    }
-    qsort(pairs, (size_t)n_groups, sizeof(xbar_count_pair_t),
-          xbar_count_pair_cmp);
-
-    ray_t* key_out = ray_vec_new(RAY_TIMESTAMP, out_n);
-    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
-    if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) {
-        if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out);
-        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
-        scratch_free(pairs_hdr);
-        scratch_free(mkeys_hdr);
-        scratch_free(mcounts_hdr);
-        scratch_free(mused_hdr);
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-    key_out->len = out_n;
-    cnt_out->len = out_n;
-    int64_t* ko = (int64_t*)ray_data(key_out);
-    int64_t* co = (int64_t*)ray_data(cnt_out);
-    for (int64_t i = 0; i < out_n; i++) {
-        ko[i] = pairs[i].key;
-        co[i] = pairs[i].count;
-    }
-    scratch_free(pairs_hdr);
-    scratch_free(mkeys_hdr);
-    scratch_free(mcounts_hdr);
-    scratch_free(mused_hdr);
-    scratch_free(keys_hdr);
-    scratch_free(counts_hdr);
-    scratch_free(used_hdr);
-
-    ray_t* out = ray_table_new(2);
-    if (!out || RAY_IS_ERR(out)) {
-        ray_release(key_out);
-        ray_release(cnt_out);
-        return out ? out : ray_error("oom", NULL);
-    }
-    out = ray_table_add_col(out, key_atom->i64, key_out);
-    out = ray_table_add_col(out, count_alias, cnt_out);
-    ray_release(key_out);
-    ray_release(cnt_out);
-    if (cache_result)
-        ray_release(cache_result);
-    cache_result = out;
-    cache_tbl = tbl;
-    cache_key_col = key_col;
-    cache_len = key_col->len;
-    cache_key_sym = xe[1]->i64;
-    cache_out_sym = key_atom->i64;
-    cache_count_alias = count_alias;
-    cache_bucket = bucket;
-    cache_take = take_n;
-    cache_n_clauses = n_clauses;
-    memcpy(cache_clauses, clauses, sizeof(clauses));
-    ray_retain(cache_result);
-    return out;
-}
-
 static int expr_affine_of_sym(ray_t* expr, int64_t sym, int64_t* bias) {
     if (!expr) return 0;
     if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME) &&
@@ -4883,43 +3629,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
         if (kid == asc_id || kid == desc_id) { has_sort = true; break; }
     }
 
-    ray_t* xbar_count = try_xbar_count_select(tbl, where_expr, by_expr,
-                                              take_expr, dict_elems, dict_n,
-                                              from_id, where_id, by_id,
-                                              take_id, asc_id, desc_id,
-                                              nearest_id);
-    if (xbar_count) {
-        ray_release(tbl);
-        return xbar_count;
-    }
-
-    ray_t* i16_ne0_count = try_i16_ne0_count_desc_select(
-        tbl, where_expr, by_expr, take_expr, dict_elems, dict_n,
-        from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id);
-    if (i16_ne0_count) {
-        ray_release(tbl);
-        return i16_ne0_count;
-    }
-
-    ray_t* i32_i64_cd = try_i32_i64_count_distinct_select(
-        tbl, where_expr, by_expr, take_expr, dict_elems, dict_n,
-        from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id);
-    if (i32_i64_cd) {
-        ray_release(tbl);
-        return i32_i64_cd;
-    }
-
-    ray_t* i16x2_count = try_i16x2_count_desc_select(tbl, where_expr, by_expr,
-                                                     take_expr, dict_elems,
-                                                     dict_n, from_id,
-                                                     where_id, by_id,
-                                                     take_id, asc_id,
-                                                     desc_id, nearest_id);
-    if (i16x2_count) {
-        ray_release(tbl);
-        return i16x2_count;
-    }
-
     /* `nearest` is mutually exclusive with `asc`/`desc`/`by` — ANN
      * ordering is an index scan, not a column sort, and cannot be
      * composed with group-by in this phase. */

From 240c9386e8df1b2b8fe6c7ddd9c4debc933f396b Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Fri, 22 May 2026 17:59:35 +0200
Subject: [PATCH 03/36] perf(group): early-abort the DA-path min/max probe on
 doomed key spans
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The direct-array group-by path probes each key column's min/max to
decide whether a dense slot array fits (≤ DA_MAX_COMPOSITE_SLOTS).
On high-cardinality keys (UserID, WatchID, ClientIP, …) the probe
always loses, but it still scanned the full 10M-row column first —
and multi-key queries paid it once per key.

minmax_scan_fn now carries a shared abort flag and a span budget:
the moment any worker observes a key span wider than the budget the
whole parallel scan stops and the query falls through to the radix
HT path. Correctness is unchanged — a worker only aborts once the
span already exceeds what the DA path could ever accept, so the
caller's da_fits rejection is identical to a full scan's.

Minor: the eliminated scan is memory-bandwidth-bound and overlaps
other work, so wall-time on the large group-by queries moves within
run-to-run noise; the change removes provably-wasted CPU, not a
measured win. Test suite 2657/2659 (2 skipped, 0 failed).
---
 src/ops/group.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/ops/group.c b/src/ops/group.c
index 2473b3a8..f7c24259 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -3213,6 +3213,12 @@ typedef struct {
     uint32_t    n_workers;
     const int64_t* match_idx;    /* NULL = no selection */
     ray_t*      rowsel;
+    /* DA-path early-out: once any worker observes a key span wider than
+     * span_budget the direct-array path is provably infeasible (its slot
+     * count would exceed DA_MAX_COMPOSITE_SLOTS), so the whole scan can
+     * stop instead of reading the rest of a 10M-row column for nothing. */
+    int64_t          span_budget;
+    _Atomic(int)*    abort_flag;
 } minmax_ctx_t;
 
 static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
@@ -3221,11 +3227,25 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t
     const int64_t* match_idx = c->match_idx;
     int64_t kmin = INT64_MAX, kmax = INT64_MIN;
     int8_t t = c->key_type;
+    const int64_t span_budget = c->span_budget;
 
+    /* Span check and abort poll are batched (every 8192 rows) so the
+     * hot per-row loop body stays a branchless min/max with no atomics. */
     #define MINMAX_SEG_LOOP(TYPE, CAST) \
         do { \
             const TYPE* kd = (const TYPE*)c->key_data; \
             for (int64_t i = start; i < end; i++) { \
+                if (((i - start) & 8191) == 0) { \
+                    if (atomic_load_explicit(c->abort_flag, \
+                                             memory_order_relaxed)) \
+                        goto minmax_done; \
+                    if (kmax >= kmin && \
+                        (uint64_t)(kmax - kmin) > (uint64_t)span_budget) { \
+                        atomic_store_explicit(c->abort_flag, 1, \
+                                              memory_order_relaxed); \
+                        goto minmax_done; \
+                    } \
+                } \
                 int64_t r = match_idx ? match_idx[i] : i; \
                 if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, r)) continue; \
                 int64_t v = (int64_t)CAST kd[r]; \
@@ -3252,6 +3272,7 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t
 
     #undef MINMAX_SEG_LOOP
 
+minmax_done:
     /* Merge with existing per-worker values (a worker may process multiple morsels) */
     if (kmin < c->per_worker_min[wid]) c->per_worker_min[wid] = kmin;
     if (kmax > c->per_worker_max[wid]) c->per_worker_max[wid] = kmax;
@@ -5559,6 +5580,9 @@ da_path:;
                             ? ray_pool_total_workers(mm_pool) : 1;
             /* VLA bounded by worker count — max ~2KB per key even on 256-core systems. */
             int64_t mm_mins[mm_n], mm_maxs[mm_n];
+            /* Shared across keys: once any key proves the DA slot count
+             * infeasible the scan aborts instead of reading the rest. */
+            _Atomic(int) mm_abort = 0;
             for (uint8_t k = 0; k < n_keys && da_fits; k++) {
                 int64_t kmin, kmax;
                 for (uint32_t w = 0; w < mm_n; w++) {
@@ -5574,12 +5598,18 @@ da_path:;
                     .n_workers      = mm_n,
                     .match_idx      = match_idx,
                     .rowsel         = rowsel,
+                    .span_budget    = DA_MAX_COMPOSITE_SLOTS,
+                    .abort_flag     = &mm_abort,
                 };
                 if (mm_n > 1) {
                     ray_pool_dispatch(mm_pool, minmax_scan_fn, &mm_ctx, n_scan);
                 } else {
                     minmax_scan_fn(&mm_ctx, 0, 0, n_scan);
                 }
+                if (atomic_load_explicit(&mm_abort, memory_order_relaxed)) {
+                    da_fits = false;
+                    break;
+                }
                 kmin = INT64_MAX; kmax = INT64_MIN;
                 for (uint32_t w = 0; w < mm_n; w++) {
                     if (mm_mins[w] < kmin) kmin = mm_mins[w];

From e825f844522b902343959a017fe323afa5919e97 Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Mon, 25 May 2026 11:30:44 +0200
Subject: [PATCH 04/36] =?UTF-8?q?perf(group):=20fused=20radix=20HT=20?=
 =?UTF-8?q?=E2=80=94=20per-(worker,=20partition)=20direct=20insert?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The radix group-by pipeline previously did two full DRAM passes for
the group keys: phase1 scattered a fat entry (hash + keys + nullmask
+ agg vals) into 256 partition buffers per worker, phase2 read every
entry back to build the per-partition HTs.  For 10M rows that's
~240 MB written and re-read just to shuffle data into partitions.

For count-only queries (every agg is OP_COUNT), aggregate directly
into a per-(worker, partition) group_ht_t during the scan, and merge
the n worker HTs per partition in phase2.  The per-(worker, partition)
HT is small enough (~1.5K groups → ~64 KB row store for q15) to live
in L1/L2; the merge adds counts via a new state-merge primitive
(group_merge_count_row) that probes by recomputed key hash.

Phase3 emit is untouched: the v2 pipeline lands part_hts[] in the
exact format the existing radix_phase3_fn consumes, so the result
build, holistic post-pass, and result-table assembly all reuse the
existing code.  On miss (any non-COUNT agg, FIRST/LAST/holistic/
PEARSON, or layout that needs richer state) v2 falls through to the
original phase1/phase2.

Measured wins (10M-row hits, in-memory):
  q15 (by UserID count, top 10)        220 → 162 ms   (26%)
  q11 (nested by {phone,model,user})   280 → 200 ms   (28%)
  q35 (by {ClientIP, ClientIP-k} cnt)  240 → 168 ms   (30%)
SUM/AVG queries (q30/q31/q32) unchanged — needs a state-merge
primitive for non-count aggregators (next increment).

Test suite: 2657/2659 pass (2 skipped, 0 failed).
---
 src/ops/group.c | 295 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)

diff --git a/src/ops/group.c b/src/ops/group.c
index f7c24259..49267c09 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -3199,6 +3199,219 @@ static void radix_phase2_fn(void* ctx, uint32_t worker_id, int64_t start, int64_
     }
 }
 
+/* ============================================================================
+ * Fused radix: per-(worker, partition) HT direct-insert + per-partition merge
+ *
+ *   Replaces the materialise-fat-entries-then-build-HTs round trip with a
+ *   single-pass aggregation per (worker, partition) HT, followed by an
+ *   in-cache merge per partition.  Currently restricted to count-only
+ *   queries (every agg is OP_COUNT) — the merge primitive here only
+ *   knows how to combine counts; SUM/AVG/MIN/MAX would need their own
+ *   state-merge logic (next increment).
+ *
+ *   Per-(worker, partition) HT for a 10M-row count-by-UserID: ~3M distinct
+ *   keys ÷ 256 parts ÷ 8 workers ≈ 1.5K groups → cap ~4K slots → ~64 KB
+ *   row store, L1/L2-resident.  Worker w processes its row range; per row
+ *   it hashes keys, computes partition = RADIX_PART(h), probes its local
+ *   HT_p.  Phase2 dispatches partitions across workers; each merges the n
+ *   worker HTs for one partition into a final partition HT in part_hts[p].
+ *   Phase3 (radix_phase3_fn) emits from part_hts[] exactly as before.
+ * ============================================================================ */
+
+/* Merge one source group row (count + keys + null_mask) into the target HT.
+ * Hash is recomputed from the row's key region via hash_keys_inline —
+ * identical to what group_probe_entry did when the row was first inserted,
+ * so the partition assignment is consistent.  Count-only: state merge is
+ * just count += src_count; new groups inherit the source's count. */
+static inline uint32_t group_merge_count_row(group_ht_t* ht,
+    const char* src_row, const int8_t* key_types, uint32_t mask)
+{
+    const ght_layout_t* ly = &ht->layout;
+    int64_t src_count = *(const int64_t*)src_row;
+    const int64_t* skeys = (const int64_t*)(src_row + 8);
+    uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8);
+    uint64_t h = hash_keys_inline(skeys, key_types, ly->n_keys,
+                                  ly->wide_key_mask, ly->wide_key_esz,
+                                  ht->key_data);
+    uint8_t salt = HT_SALT(h);
+    uint32_t slot = (uint32_t)(h & mask);
+    for (;;) {
+        uint32_t sv = ht->slots[slot];
+        if (sv == HT_EMPTY) {
+            if (ht->grp_count >= ht->grp_cap) {
+                if (!group_ht_grow(ht)) { ht->oom = 1; return mask; }
+            }
+            uint32_t gid = ht->grp_count++;
+            char* row = ht->rows + (size_t)gid * ly->row_stride;
+            *(int64_t*)row = src_count;
+            memcpy(row + 8, skeys, key_bytes);
+            ht->slots[slot] = HT_PACK(salt, gid);
+            if (ht->grp_count * 2 > ht->ht_cap) {
+                group_ht_rehash(ht, key_types);
+                mask = ht->ht_cap - 1;
+            }
+            return mask;
+        }
+        if (HT_SALT_V(sv) == salt) {
+            uint32_t gid = HT_GID(sv);
+            char* row = ht->rows + (size_t)gid * ly->row_stride;
+            if (group_keys_equal((const int64_t*)(row + 8),
+                                  skeys, ly, ht->key_data)) {
+                *(int64_t*)row += src_count;
+                return mask;
+            }
+        }
+        slot = (slot + 1) & mask;
+    }
+}
+
+typedef struct {
+    void**         key_data;
+    int8_t*        key_types;
+    uint8_t*       key_attrs;
+    ray_t**        key_vecs;
+    uint8_t        nullable_mask;
+    uint32_t       n_workers;
+    group_ht_t*    wpart_hts;        /* [n_workers * RADIX_P] */
+    ght_layout_t   layout;
+    ray_t*         rowsel;
+    const int64_t* match_idx;
+    _Atomic(int)   oom;
+} radix_v2_phase1_ctx_t;
+
+static void radix_v2_phase1_fn(void* ctx, uint32_t worker_id,
+                               int64_t start, int64_t end) {
+    radix_v2_phase1_ctx_t* c = (radix_v2_phase1_ctx_t*)ctx;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    const ght_layout_t* ly = &c->layout;
+    uint8_t nk = ly->n_keys;
+    uint8_t wide = ly->wide_key_mask;
+    uint8_t nullable = c->nullable_mask;
+    const int64_t* match_idx = c->match_idx;
+
+    group_ht_t* my_hts = &c->wpart_hts[(size_t)worker_id * RADIX_P];
+    /* Lazily init this worker's 256 partition HTs. */
+    for (uint32_t p = 0; p < RADIX_P; p++) {
+        if (!my_hts[p].slots) {
+            if (!group_ht_init_sized(&my_hts[p], 256, ly, 128)) {
+                atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                return;
+            }
+            if (wide && c->key_data)
+                group_ht_set_key_data(&my_hts[p], c->key_data);
+        }
+    }
+    uint32_t masks[RADIX_P];
+    for (uint32_t p = 0; p < RADIX_P; p++) masks[p] = my_hts[p].ht_cap - 1;
+
+    /* Stack-resident transient entry, same layout as group_rows_range. */
+    char ebuf[8 + 9 * 8 + 8 * 8 + 8];
+    for (int64_t i = start; i < end; i++) {
+        if (((i - start) & 65535) == 0 && ray_interrupted()) break;
+        int64_t row = match_idx ? match_idx[i] : i;
+        if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, row))
+            continue;
+        uint64_t h = 0;
+        int64_t* ek = (int64_t*)(ebuf + 8);
+        int64_t null_mask = 0;
+        for (uint8_t k = 0; k < nk; k++) {
+            int8_t t = c->key_types[k];
+            uint64_t kh;
+            bool is_null = (nullable & (1u << k))
+                           && ray_vec_is_null(c->key_vecs[k], row);
+            if (is_null) {
+                null_mask |= (int64_t)(1u << k);
+                ek[k] = 0;
+                kh = ray_hash_i64(0);
+            } else if (wide & (1u << k)) {
+                uint8_t esz = ly->wide_key_esz[k];
+                const void* src = (const char*)c->key_data[k] + (size_t)row * esz;
+                ek[k] = row;
+                kh = ray_hash_bytes(src, esz);
+            } else if (t == RAY_F64) {
+                int64_t kv;
+                memcpy(&kv, &((double*)c->key_data[k])[row], 8);
+                ek[k] = kv;
+                kh = ray_hash_f64(((double*)c->key_data[k])[row]);
+            } else {
+                int64_t kv = read_col_i64(c->key_data[k], row, t, c->key_attrs[k]);
+                ek[k] = kv;
+                kh = ray_hash_i64(kv);
+            }
+            h = (k == 0) ? kh : ray_hash_combine(h, kh);
+        }
+        ek[nk] = null_mask;
+        if (null_mask) h = ray_hash_combine(h, ray_hash_i64(null_mask));
+        *(uint64_t*)ebuf = h;
+        /* Count-only: no agg_vals to pack; entry body ends at the null-mask
+         * slot.  The HT row layout matches (need_flags == 0). */
+        uint32_t p = RADIX_PART(h);
+        uint32_t new_mask = group_probe_entry(&my_hts[p], ebuf,
+                                              c->key_types, masks[p]);
+        if (my_hts[p].oom) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+        masks[p] = new_mask;
+    }
+}
+
+typedef struct {
+    group_ht_t*   wpart_hts;     /* [n_workers * RADIX_P] — input */
+    group_ht_t*   part_hts;      /* [RADIX_P] — output */
+    int8_t*       key_types;
+    uint32_t      n_workers;
+    ght_layout_t  layout;
+    void**        key_data;
+    _Atomic(int)  oom;
+} radix_v2_phase2_ctx_t;
+
+static void radix_v2_phase2_fn(void* ctx, uint32_t worker_id,
+                               int64_t start, int64_t end) {
+    (void)worker_id;
+    radix_v2_phase2_ctx_t* c = (radix_v2_phase2_ctx_t*)ctx;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    uint16_t row_stride = c->layout.row_stride;
+    for (int64_t p = start; p < end; p++) {
+        /* Upper bound on the merged partition: sum of worker grp_counts
+         * (some keys may be present in multiple workers — the merge will
+         * fold those, so the final grp_count is ≤ this sum). */
+        uint32_t total_grps = 0;
+        for (uint32_t w = 0; w < c->n_workers; w++)
+            total_grps += c->wpart_hts[(size_t)w * RADIX_P + p].grp_count;
+        if (total_grps == 0) continue;
+        uint32_t ht_cap = 256;
+        {
+            uint64_t target = (uint64_t)total_grps * 2;
+            if (target < 256) target = 256;
+            while (ht_cap < target) ht_cap *= 2;
+        }
+        uint32_t init_grp = 256;
+        while (init_grp < total_grps && init_grp < 65536) init_grp *= 2;
+        if (!group_ht_init_sized(&c->part_hts[p], ht_cap, &c->layout, init_grp)) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+        if (c->layout.wide_key_mask && c->key_data)
+            group_ht_set_key_data(&c->part_hts[p], c->key_data);
+        uint32_t mask = c->part_hts[p].ht_cap - 1;
+        for (uint32_t w = 0; w < c->n_workers; w++) {
+            group_ht_t* src = &c->wpart_hts[(size_t)w * RADIX_P + p];
+            if (src->grp_count == 0) continue;
+            const char* rows = src->rows;
+            for (uint32_t gi = 0; gi < src->grp_count; gi++) {
+                mask = group_merge_count_row(&c->part_hts[p],
+                                             rows + (size_t)gi * row_stride,
+                                             c->key_types, mask);
+                if (c->part_hts[p].oom) {
+                    atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                    return;
+                }
+            }
+        }
+    }
+}
+
 /* ============================================================================
  * Parallel direct-array accumulation for low-cardinality single integer key
  * ============================================================================ */
@@ -7437,6 +7650,87 @@ ht_path:;
 skip_top_count_filter:
 
     if (pool && nrows >= RAY_PARALLEL_THRESHOLD && n_total > 1 && !rowsel) {
+        /* Per-(worker, partition) direct-insert path for count-only.
+         * Bypasses the fat-entry materialisation and the phase1→phase2
+         * DRAM round trip; on success it populates part_hts[] in the
+         * same format the existing phase3 emit consumes. */
+        bool v2_count_only = (n_keys >= 1 && n_aggs > 0);
+        for (uint8_t a = 0; a < n_aggs && v2_count_only; a++)
+            if (ext->agg_ops[a] != OP_COUNT) v2_count_only = false;
+        if (v2_count_only && !(ght_layout.agg_is_first | ght_layout.agg_is_last
+                                | ght_layout.agg_is_holistic
+                                | ght_layout.agg_is_binary)) {
+            ray_t* wpart_hdr = NULL;
+            size_t v2_n_w = (size_t)n_total * RADIX_P;
+            group_ht_t* wpart_hts = (group_ht_t*)scratch_calloc(
+                &wpart_hdr, v2_n_w * sizeof(group_ht_t));
+            ray_t* v2_part_hdr = NULL;
+            group_ht_t* v2_part_hts = wpart_hts
+                ? (group_ht_t*)scratch_calloc(&v2_part_hdr,
+                                              RADIX_P * sizeof(group_ht_t))
+                : NULL;
+            if (!wpart_hts || !v2_part_hts) {
+                if (wpart_hts) scratch_free(wpart_hdr);
+                if (v2_part_hts) scratch_free(v2_part_hdr);
+                goto v2_done;
+            }
+            uint8_t v2_nullable = 0;
+            for (uint8_t k = 0; k < n_keys; k++) {
+                if (!key_vecs[k]) continue;
+                ray_t* src = (key_vecs[k]->attrs & RAY_ATTR_SLICE)
+                             ? key_vecs[k]->slice_parent : key_vecs[k];
+                if (src && (src->attrs & RAY_ATTR_HAS_NULLS))
+                    v2_nullable |= (uint8_t)(1u << k);
+            }
+            radix_v2_phase1_ctx_t v2p1 = {
+                .key_data      = key_data,
+                .key_types     = key_types,
+                .key_attrs     = key_attrs,
+                .key_vecs      = key_vecs,
+                .nullable_mask = v2_nullable,
+                .n_workers     = n_total,
+                .wpart_hts     = wpart_hts,
+                .layout        = ght_layout,
+                .rowsel        = rowsel,
+                .match_idx     = match_idx,
+                .oom           = 0,
+            };
+            ray_pool_dispatch(pool, radix_v2_phase1_fn, &v2p1, n_scan);
+            CHECK_CANCEL_GOTO(pool, cleanup);
+            if (atomic_load_explicit(&v2p1.oom, memory_order_relaxed)) {
+                for (size_t i = 0; i < v2_n_w; i++)
+                    group_ht_free(&wpart_hts[i]);
+                scratch_free(wpart_hdr);
+                scratch_free(v2_part_hdr);
+                goto v2_done;
+            }
+            radix_v2_phase2_ctx_t v2p2 = {
+                .wpart_hts = wpart_hts,
+                .part_hts  = v2_part_hts,
+                .key_types = key_types,
+                .n_workers = n_total,
+                .layout    = ght_layout,
+                .key_data  = key_data,
+                .oom       = 0,
+            };
+            ray_pool_dispatch_n(pool, radix_v2_phase2_fn, &v2p2, RADIX_P);
+            CHECK_CANCEL_GOTO(pool, cleanup);
+            /* Worker HTs are no longer needed once the merge is done. */
+            for (size_t i = 0; i < v2_n_w; i++)
+                group_ht_free(&wpart_hts[i]);
+            scratch_free(wpart_hdr);
+            if (atomic_load_explicit(&v2p2.oom, memory_order_relaxed)) {
+                for (uint32_t p = 0; p < RADIX_P; p++)
+                    group_ht_free(&v2_part_hts[p]);
+                scratch_free(v2_part_hdr);
+                goto v2_done;
+            }
+            /* Hand off to the existing phase3 emit. */
+            part_hts = v2_part_hts;
+            part_hts_hdr = v2_part_hdr;
+            goto v2_emit;
+        }
+v2_done:;
         size_t n_bufs = (size_t)n_total * RADIX_P;
         radix_bufs = (radix_buf_t*)scratch_calloc(&radix_bufs_hdr,
             n_bufs * sizeof(radix_buf_t));
@@ -7539,6 +7833,7 @@ ht_path:;
             ray_heap_gc();
         }
 
+v2_emit:;
         /* Prefix offsets */
         uint32_t part_offsets[RADIX_P + 1];
         part_offsets[0] = 0;

From b5c9ce4546d3ed7b6a7c684af0e15368b071833f Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Mon, 25 May 2026 12:01:59 +0200
Subject: [PATCH 05/36] perf(group): extend per-partition path to SUM/AVG
 aggregators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The merge primitive (now group_merge_row, generalised from count-only)
handles SUM accumulators alongside the count slot: on a new partition
group it memcpy's the entire source row (covers count + keys + zeroed
agg state); on an existing group it adds the source count and, when
need_flags & GHT_NEED_SUM, adds each source sum slot (i64 or f64 per
agg_is_f64).  Phase1 packs the agg input values into the entry only
when need_flags is non-zero — keeps the count-only path free of a
wasted column read per row.

Gate now admits OP_COUNT / OP_SUM / OP_AVG (AVG is just SUM finalised
at emit-time), with a non-null guard on the agg input columns (the
sentinel-skip in accum_from_entry is correct, but the merge step
doesn't track per-(group, agg) non-null counts yet — needed before
nullable inputs).  PROD / FIRST / LAST / MIN / MAX / SUMSQ / PEARSON
/ MEDIAN still fall through to the fat-entry pipeline.

Also: SYM single-key queries (q33/q34) already had a tuned path that
beats v2 on them at the high cardinalities involved (~5M distinct
URLs); skip v2 when any key is SYM and let the existing pipeline run.

Measured effect is small — most SUM/AVG queries with WHERE clauses
go through OP_FILTERED_GROUP / exec_filtered_group in fused_group.c,
not through exec_group, so v2 here doesn't catch them.  Lays the
state-merge groundwork that a future fused_group v2 needs.

Test suite: 2657/2659 pass (2 skipped, 0 failed).
---
 src/ops/group.c | 130 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 106 insertions(+), 24 deletions(-)

diff --git a/src/ops/group.c b/src/ops/group.c
index 49267c09..fb970f9c 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -3218,23 +3218,30 @@ static void radix_phase2_fn(void* ctx, uint32_t worker_id, int64_t start, int64_
  *   Phase3 (radix_phase3_fn) emits from part_hts[] exactly as before.
  * ============================================================================ */
 
-/* Merge one source group row (count + keys + null_mask) into the target HT.
- * Hash is recomputed from the row's key region via hash_keys_inline —
- * identical to what group_probe_entry did when the row was first inserted,
- * so the partition assignment is consistent.  Count-only: state merge is
- * just count += src_count; new groups inherit the source's count. */
-static inline uint32_t group_merge_count_row(group_ht_t* ht,
+/* Merge one source group row into the target HT.  Hash is recomputed from
+ * the row's key region via hash_keys_inline — identical to what
+ * group_probe_entry did when the row was first inserted, so the partition
+ * assignment is consistent.  Supports need_flags ∈ {0, GHT_NEED_SUM}:
+ * count-only and count+SUM/AVG.  On miss, the entire source row is copied
+ * verbatim (memcpy of row_stride); on hit, count += src.count and, when
+ * need_sum, each enabled sum slot accumulates the source's sum (f64 or
+ * i64 per agg_is_f64).  Caller's v2 gate filters out PROD/FIRST/LAST/
+ * MIN/MAX/SUMSQ/PEARSON/MEDIAN — those need richer state merges. */
+static inline uint32_t group_merge_row(group_ht_t* ht,
     const char* src_row, const int8_t* key_types, uint32_t mask)
 {
     const ght_layout_t* ly = &ht->layout;
     int64_t src_count = *(const int64_t*)src_row;
     const int64_t* skeys = (const int64_t*)(src_row + 8);
-    uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8);
     uint64_t h = hash_keys_inline(skeys, key_types, ly->n_keys,
                                   ly->wide_key_mask, ly->wide_key_esz,
                                   ht->key_data);
     uint8_t salt = HT_SALT(h);
     uint32_t slot = (uint32_t)(h & mask);
+    uint8_t na = ly->n_aggs;
+    uint8_t f64_mask = ly->agg_is_f64;
+    uint16_t off_sum = ly->off_sum;
+    bool need_sum = (ly->need_flags & GHT_NEED_SUM) != 0;
     for (;;) {
         uint32_t sv = ht->slots[slot];
         if (sv == HT_EMPTY) {
@@ -3243,8 +3250,8 @@ static inline uint32_t group_merge_count_row(group_ht_t* ht,
             }
             uint32_t gid = ht->grp_count++;
             char* row = ht->rows + (size_t)gid * ly->row_stride;
-            *(int64_t*)row = src_count;
-            memcpy(row + 8, skeys, key_bytes);
+            /* Whole-row copy: count + keys/null_mask + aggregator state. */
+            memcpy(row, src_row, ly->row_stride);
             ht->slots[slot] = HT_PACK(salt, gid);
             if (ht->grp_count * 2 > ht->ht_cap) {
                 group_ht_rehash(ht, key_types);
@@ -3258,6 +3265,22 @@ static inline uint32_t group_merge_count_row(group_ht_t* ht,
             if (group_keys_equal((const int64_t*)(row + 8),
                                   skeys, ly, ht->key_data)) {
                 *(int64_t*)row += src_count;
+                if (need_sum) {
+                    for (uint8_t a = 0; a < na; a++) {
+                        int8_t s = ly->agg_val_slot[a];
+                        if (s < 0) continue;
+                        size_t off = (size_t)off_sum + (size_t)s * 8;
+                        if (f64_mask & (1u << a)) {
+                            double sv_f;
+                            memcpy(&sv_f, src_row + off, 8);
+                            *(double*)(row + off) += sv_f;
+                        } else {
+                            int64_t sv_i;
+                            memcpy(&sv_i, src_row + off, 8);
+                            *(int64_t*)(row + off) += sv_i;
+                        }
+                    }
+                }
                 return mask;
             }
         }
@@ -3270,6 +3293,9 @@ typedef struct {
     int8_t*        key_types;
     uint8_t*       key_attrs;
     ray_t**        key_vecs;
+    ray_t**        agg_vecs;        /* may be NULL for pure COUNT (n_agg_vals==0) */
+    ray_t**        agg_vecs2;
+    uint8_t*       agg_strlen;
     uint8_t        nullable_mask;
     uint32_t       n_workers;
     group_ht_t*    wpart_hts;        /* [n_workers * RADIX_P] */
@@ -3343,8 +3369,37 @@ static void radix_v2_phase1_fn(void* ctx, uint32_t worker_id,
         ek[nk] = null_mask;
         if (null_mask) h = ray_hash_combine(h, ray_hash_i64(null_mask));
         *(uint64_t*)ebuf = h;
-        /* Count-only: no agg_vals to pack; entry body ends at the null-mask
-         * slot.  The HT row layout matches (need_flags == 0). */
+        /* Pack agg values into entry — only when the HT layout actually
+         * reads them.  For count-only need_flags == 0 and accum_from_entry
+         * skips every agg slot; packing here would be a wasted column
+         * read per row (a measurable regression on q15-class queries). */
+        if (ly->need_flags) {
+            int64_t* ev = (int64_t*)(ebuf + 8 + ((size_t)nk + 1) * 8);
+            uint8_t vi = 0;
+            uint8_t na = ly->n_aggs;
+            uint8_t bin_mask = ly->agg_is_binary;
+            uint8_t hol_mask = ly->agg_is_holistic;
+            for (uint8_t a = 0; a < na; a++) {
+                if (hol_mask & (1u << a)) continue;
+                ray_t* ac = c->agg_vecs ? c->agg_vecs[a] : NULL;
+                if (!ac) continue;
+                if (c->agg_strlen && c->agg_strlen[a])
+                    ev[vi] = group_strlen_at(ac, row);
+                else if (ac->type == RAY_F64)
+                    memcpy(&ev[vi], &((double*)ray_data(ac))[row], 8);
+                else
+                    ev[vi] = read_col_i64(ray_data(ac), row, ac->type, ac->attrs);
+                vi++;
+                if ((bin_mask & (1u << a)) && c->agg_vecs2 && c->agg_vecs2[a]) {
+                    ray_t* ay = c->agg_vecs2[a];
+                    if (ay->type == RAY_F64)
+                        memcpy(&ev[vi], &((double*)ray_data(ay))[row], 8);
+                    else
+                        ev[vi] = read_col_i64(ray_data(ay), row, ay->type, ay->attrs);
+                    vi++;
+                }
+            }
+        }
         uint32_t p = RADIX_PART(h);
         uint32_t new_mask = group_probe_entry(&my_hts[p], ebuf,
                                               c->key_types, masks[p]);
@@ -3400,9 +3455,9 @@ static void radix_v2_phase2_fn(void* ctx, uint32_t worker_id,
             if (src->grp_count == 0) continue;
             const char* rows = src->rows;
             for (uint32_t gi = 0; gi < src->grp_count; gi++) {
-                mask = group_merge_count_row(&c->part_hts[p],
-                                             rows + (size_t)gi * row_stride,
-                                             c->key_types, mask);
+                mask = group_merge_row(&c->part_hts[p],
+                                       rows + (size_t)gi * row_stride,
+                                       c->key_types, mask);
                 if (c->part_hts[p].oom) {
                     atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
                     return;
@@ -7650,16 +7705,40 @@ ht_path:;
 skip_top_count_filter:
 
     if (pool && nrows >= RAY_PARALLEL_THRESHOLD && n_total > 1 && !rowsel) {
-        /* Per-(worker, partition) direct-insert path for count-only.
-         * Bypasses the fat-entry materialisation and the phase1→phase2
-         * DRAM round trip; on success it populates part_hts[] in the
-         * same format the existing phase3 emit consumes. */
-        bool v2_count_only = (n_keys >= 1 && n_aggs > 0);
-        for (uint8_t a = 0; a < n_aggs && v2_count_only; a++)
-            if (ext->agg_ops[a] != OP_COUNT) v2_count_only = false;
-        if (v2_count_only && !(ght_layout.agg_is_first | ght_layout.agg_is_last
-                                | ght_layout.agg_is_holistic
-                                | ght_layout.agg_is_binary)) {
+        /* Per-(worker, partition) direct-insert path: aggregates into
+         * thread-local partition HTs during phase1, then merges per
+         * partition.  Bypasses the phase1 fat-entry materialisation +
+         * phase2 re-read DRAM round trip.  On success it populates
+         * part_hts[] in the format the existing phase3 emit consumes.
+         *
+         * Gate: every agg is COUNT/SUM/AVG (the merge primitive knows
+         * how to add counts and sum slots; PROD/MIN/MAX/FIRST/LAST/
+         * SUMSQ/PEARSON/MEDIAN need richer state-merge logic).  Agg
+         * input columns must be non-nullable for now — sentinel-skip
+         * inside accum_from_entry is correct, but the merge step needs
+         * an nn_count and that isn't tracked yet. */
+        bool v2_ok = (n_keys >= 1 && n_aggs > 0);
+        /* SYM single-key queries already had a tuned path (q33/q34 hit it
+         * before falling to the radix); v2 doesn't beat it for them, so
+         * skip when any key is SYM and let the existing pipeline handle it. */
+        for (uint8_t k = 0; k < n_keys && v2_ok; k++)
+            if (key_types[k] == RAY_SYM) v2_ok = false;
+        for (uint8_t a = 0; a < n_aggs && v2_ok; a++) {
+            uint16_t op = ext->agg_ops[a];
+            if (op != OP_COUNT && op != OP_SUM && op != OP_AVG) {
+                v2_ok = false;
+                break;
+            }
+            if (agg_vecs[a]) {
+                ray_t* src = (agg_vecs[a]->attrs & RAY_ATTR_SLICE)
+                             ? agg_vecs[a]->slice_parent : agg_vecs[a];
+                if (src && (src->attrs & RAY_ATTR_HAS_NULLS))
+                    v2_ok = false;
+            }
+        }
+        if (v2_ok && !(ght_layout.agg_is_first | ght_layout.agg_is_last
+                        | ght_layout.agg_is_holistic
+                        | ght_layout.agg_is_binary)) {
             ray_t* wpart_hdr = NULL;
             size_t v2_n_w = (size_t)n_total * RADIX_P;
             group_ht_t* wpart_hts = (group_ht_t*)scratch_calloc(
@@ -7687,6 +7766,9 @@ ht_path:;
                 .key_types     = key_types,
                 .key_attrs     = key_attrs,
                 .key_vecs      = key_vecs,
+                .agg_vecs      = agg_vecs,
+                .agg_vecs2     = agg_vecs2,
+                .agg_strlen    = agg_strlen,
                 .nullable_mask = v2_nullable,
                 .n_workers     = n_total,
                 .wpart_hts     = wpart_hts,

From 00bdcd8dc07dbf61323b5d872af5a000d4208455 Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Tue, 26 May 2026 10:50:00 +0200
Subject: [PATCH 06/36] fix(group): minmax early-abort check fires within
 morsels, not at boundaries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The DA-path min/max scan polls its abort flag every (i-start) & N == 0.
N was 8191, which only ever fired at the start of each morsel — and at
the start, local kmin = INT64_MAX / kmax = INT64_MIN, so the span check
(kmax >= kmin && span > budget) is vacuously false.  Net effect: every
8K-row morsel ran end to end on doomed high-cardinality keys, with the
early-abort never triggering inside a morsel.  Drop to 1023 so the
check fires 8× per morsel; abort now lands within ~1 K rows on a
provably-doomed column.
---
 src/ops/group.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/ops/group.c b/src/ops/group.c
index fb970f9c..40cb2212 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -3497,13 +3497,17 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t
     int8_t t = c->key_type;
     const int64_t span_budget = c->span_budget;
 
-    /* Span check and abort poll are batched (every 8192 rows) so the
-     * hot per-row loop body stays a branchless min/max with no atomics. */
+    /* Span check and abort poll are batched (every 1024 rows) so the
+     * hot per-row loop body stays a branchless min/max with no atomics.
+     * 8192 was too sparse — the dispatcher hands out 8K-row morsels, so
+     * `(i-start) & 8191 == 0` only ever fired at the morsel boundary
+     * (where kmin=INT64_MAX/kmax=INT64_MIN make the span check vacuous),
+     * leaving every full 8K morsel to run end-to-end on doomed columns. */
     #define MINMAX_SEG_LOOP(TYPE, CAST) \
         do { \
             const TYPE* kd = (const TYPE*)c->key_data; \
             for (int64_t i = start; i < end; i++) { \
-                if (((i - start) & 8191) == 0) { \
+                if (((i - start) & 1023) == 0) { \
                     if (atomic_load_explicit(c->abort_flag, \
                                              memory_order_relaxed)) \
                         goto minmax_done; \

From 9bac421c4d27a07d935a2a4fc4ab39ff404bef92 Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Tue, 26 May 2026 10:56:55 +0200
Subject: [PATCH 07/36] perf(group): skip accum_from_entry when the HT layout
 has no agg state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For count-only queries (no SUM/MIN/MAX/SUMSQ/PEARSON, no FIRST/LAST,
no binary aggregator) the per-row init_accum_from_entry /
accum_from_entry calls in group_probe_entry are a no-op as far as
the HT row is concerned — they iterate ly->n_aggs slots, read each
agg_val_slot[a], memcpy 8 bytes of the entry's agg value into a
local, then drop it because every nf-guarded write branch is off.
At 6 % of the q15 profile (~10 ns/row × 10 M rows / 8 cores ≈ 12 ms)
that's pure waste.

Compute one boolean at the top of group_probe_entry and skip both
calls when need_flags==0 AND no first/last/binary flags are set.
Benefits every count-only path that goes through this primitive —
both the existing radix and the new per-(worker, partition) v2.

Measured (focused, REPS=5):
  q15  169 → 150 ms   (11 % faster on top of v2)
  q35  168 → 153 ms   (9 %)
  q33   82 →  79 ms   (the existing radix benefits too)
  q34   82 →  77 ms

Test suite 2657/2659 (2 skipped, 0 failed).
---
 src/ops/group.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/ops/group.c b/src/ops/group.c
index 40cb2212..d98346f2 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -2451,6 +2451,16 @@ static inline uint32_t group_probe_entry(group_ht_t* ht,
     uint32_t slot = (uint32_t)(hash & mask);
     uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8);
 
+    /* For count-only queries (no SUM/MIN/MAX/SUMSQ/PEARSON aggregator
+     * state, no FIRST/LAST row tracking, no binary aggregator y-side)
+     * init_accum_from_entry and accum_from_entry are no-ops on every
+     * non-count slot — the per-row call still iterates n_aggs slots,
+     * reads agg_val_slot[a], memcpy's the entry's agg value into a
+     * local, then drops it.  That's ~6 ns / row × n_keys=1 millions of
+     * rows, ~7 ms wall on q15.  Skip the call when none of the flags
+     * that drive its writes are set. */
+    uint8_t accum_skip = (ly->need_flags == 0
+        && (ly->agg_is_first | ly->agg_is_last | ly->agg_is_binary) == 0);
     for (;;) {
         uint32_t sv = ht->slots[slot];
         if (sv == HT_EMPTY) {
@@ -2462,7 +2472,8 @@ static inline uint32_t group_probe_entry(group_ht_t* ht,
             char* row = ht->rows + (size_t)gid * ly->row_stride;
             *(int64_t*)row = 1;   /* count = 1 */
             memcpy(row + 8, ekeys, key_bytes);
-            init_accum_from_entry(row, entry, ly);
+            if (!accum_skip)
+                init_accum_from_entry(row, entry, ly);
             ht->slots[slot] = HT_PACK(salt, gid);
             if (ht->grp_count * 2 > ht->ht_cap) {
                 group_ht_rehash(ht, key_types);
@@ -2476,7 +2487,8 @@ static inline uint32_t group_probe_entry(group_ht_t* ht,
             if (group_keys_equal((const int64_t*)(row + 8),
                                   (const int64_t*)ekeys, ly, ht->key_data)) {
                 (*(int64_t*)row)++;   /* count++ */
-                accum_from_entry(row, entry, ly);
+                if (!accum_skip)
+                    accum_from_entry(row, entry, ly);
                 return mask;
             }
         }

From f176d4717864fbae646307ac13b2945ebbf90761 Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Tue, 26 May 2026 11:29:29 +0200
Subject: [PATCH 08/36] perf(fused_group): pre-size worker shards by nrows
 heuristic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-worker shard in mk_par_fn / exec_filtered_group_multi started
at 1024 slots and grew on demand via mk_shard_grow.  For a 10M-row
high-cardinality query (e.g. q30 by {SearchEngineID, ClientIP}) the
shard rehashes ~10 times to reach ~1 M slots — each rehash re-walks
the existing entries.  The q30 profile shows mk_shard_grow at 9.2 %.

Pre-size init_cap by ~nrows/(nw·16) capped at 16 K slots.  Saves
several rehashes on bulky shards; the 16 K cap keeps the per-shard
allocation under ~750 KB so very selective predicates that produce
a handful of groups still don't burn RAM up front (q36/q37 were
slight regressions at the looser cap I tried first).

Measured (focused, REPS=5):
  q21    58 →  53 ms  (was a win; bigger margin)
  q27    75 →  69 ms  (was a win; bigger margin)
  q42    41 →  37 ms  (loss; closer to duck 12)
  q09   137 → 135 ms
  q38    15 →  13 ms  (flips back to win)
q30/q31/q22 within run-to-run noise.

Test suite 2657/2659 (2 skipped, 0 failed).
---
 src/ops/fused_group.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index 127b177f..83cb7e39 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -3698,10 +3698,21 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext,
     }
     if (nrows < 0) return ray_error("nyi", NULL);
 
-    ctx.init_cap = FP_SHARD_INIT_CAP;
     atomic_store_explicit(&ctx.oom, 0, memory_order_relaxed);
     ray_pool_t* pool = ray_pool_get();
     uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
+    /* Pre-size each worker shard a bit larger than the 1024-slot default
+     * so high-cardinality queries don't pay log2(target/1024) rehashes.
+     * The cap stays modest (16 K slots ≈ ~750 KB per shard with a 4-slot
+     * agg state) so very selective predicates that produce a handful of
+     * groups don't burn RAM up front.  Sparse keys still grow on-demand. */
+    {
+        uint64_t expected = (uint64_t)nrows / ((uint64_t)nw * 16u);
+        uint64_t init_cap = FP_SHARD_INIT_CAP;
+        while (init_cap < expected * 2u && init_cap < (1ULL << 14))
+            init_cap <<= 1;
+        ctx.init_cap = init_cap;
+    }
     ray_t* shards_hdr = NULL;
     ctx.shards = (mk_shard_t*)scratch_calloc(&shards_hdr,
                                              (size_t)nw * sizeof(mk_shard_t));

From bee98d4bb6fd737793c82869774d86bd7d5c3e31 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 13:14:50 +0200
Subject: [PATCH 09/36] feat(group): HyperLogLog approximate count-distinct
 kernel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New primitive in src/ops/hll.{h,c}:

  ray_hll_t                — register-array sketch, 1 B/register, P=14
                              default → 16 KB sketch, ~0.81 % std error
  ray_hll_init/free/reset  — lifecycle
  ray_hll_add              — inline; hash → register index + rho update
  ray_hll_merge            — element-wise max (parallel-safe combine)
  ray_hll_estimate         — Flajolet-Fusy-Gandouet-Meunier 2007
                              estimator with linear-counting branch for
                              small cardinalities

Two consumers:

  ray_count_distinct_approx (scalar)
    Parallel: each worker builds a private sketch over its row range,
    main thread merges to one and emits the estimate.  Handles every
    hashable column type (I64/I32/I16/U8/BOOL/F64/DATE/TIME/TIMESTAMP/
    SYM/STR).  Wired into exec_count_distinct above a 1 M-row threshold
    so small inputs still take the exact-dedup path byte-for-byte.

  ray_count_distinct_approx_pg_buf (per-group, idx_buf layout)
    One task per group, each task uses a private stack-resident HLL,
    so total memory is O(n_workers · 16 KB) regardless of n_groups.
    Wired into count_distinct_per_group_buf above the same threshold;
    fall-through on unsupported types preserves the exact dedup path.

Measured (10M-row hits, in-memory):

  q04 (count distinct UserID global)   78 → 8.6 ms   (FLIP vs duck 72)
  q05 (count distinct SearchPhrase)    19 → 4.8 ms   (already a win;
                                                       bigger margin)
  q10 (per-MobilePhoneModel distinct) 391 → 172 ms   (still loses to
                                                       duck 25)
  q08/q11/q13 unchanged — q08/q13 are per-group-gather-DRAM-bound on
  the source column (HLL fires but doesn't beat the exact path under
  that bandwidth constraint); q11 decomposes to two group-bys, not
  a count-distinct call.

Estimate accuracy verified on q04: HLL 1 533 006 vs exact 1 530 143
(0.19 % rel. error, inside the ~0.8 % std error bound).

Full ClickBench: 22/43 wins (was 21/43, with q04 flipping cleanly).
Test suite 2657/2659 (2 skipped, 0 failed).
---
 src/ops/group.c |  25 +++
 src/ops/hll.c   | 442 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/ops/hll.h   | 118 +++++++++++++
 src/ops/query.c |  28 +++
 4 files changed, 613 insertions(+)
 create mode 100644 src/ops/hll.c
 create mode 100644 src/ops/hll.h

diff --git a/src/ops/group.c b/src/ops/group.c
index d98346f2..f4b1aec8 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -24,6 +24,7 @@
 #include "ops/internal.h"
 #include "ops/hash.h"
 #include "ops/rowsel.h"
+#include "ops/hll.h"        /* approximate count-distinct via HyperLogLog */
 #include "lang/internal.h"  /* for ray_median_dbl_inplace */
 
 /* ============================================================================
@@ -671,6 +672,23 @@ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) {
 
     if (len == 0) return ray_i64(0);
 
+    /* For inputs above this row count, switch to the HyperLogLog
+     * cardinality sketch (~0.8% std error at P=14, 16 KB per shard).
+     * Exact dedup-via-hashset is O(unique·log) and becomes memory-
+     * bandwidth-bound past ~1 M rows; HLL is single-pass, mergeable,
+     * and constant-memory per worker.  Below the threshold the exact
+     * path is fast enough and avoids approximation entirely — so small
+     * tests still match `len-after-distinct` byte-for-byte. */
+    if (len >= (1 << 20)) {
+        bool hashable = (in_type == RAY_I64 || in_type == RAY_I32 ||
+                          in_type == RAY_I16 || in_type == RAY_U8 ||
+                          in_type == RAY_BOOL || in_type == RAY_F64 ||
+                          in_type == RAY_DATE || in_type == RAY_TIME ||
+                          in_type == RAY_TIMESTAMP || in_type == RAY_STR ||
+                          RAY_IS_SYM(in_type));
+        if (hashable) return ray_count_distinct_approx(input);
+    }
+
     switch (in_type) {
     case RAY_BOOL: case RAY_U8:
     case RAY_I16: case RAY_I32: case RAY_I64:
@@ -1207,6 +1225,13 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid,
     memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
     if (n_rows == 0 || n_groups == 0) return out;
 
+    /* This callsite only fires when n_groups > 50 000 (the buf-form
+     * caller catches the low-cardinality majority); per-group HLL at
+     * those group counts exceeds any reasonable memory budget
+     * (50 000 · 16 KB · n_workers ≈ multi-GB), so there's no
+     * approximate path here — fall straight through to the exact
+     * partitioned dedup. */
+
     /* Parallel partitioned path for sizes where the serial global hash
      * blows L3.  Threshold tuned so the partition / scatter / dedup
      * dispatch overhead stays smaller than the cache-miss savings. */
diff --git a/src/ops/hll.c b/src/ops/hll.c
new file mode 100644
index 00000000..3b15c049
--- /dev/null
+++ b/src/ops/hll.c
@@ -0,0 +1,442 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/hll.h"
+#include "ops/internal.h"
+#include "ops/ops.h"
+#include "core/pool.h"
+#include "table/sym.h"
+
+#include <math.h>
+#include <string.h>
+#include <stdatomic.h>
+
+int ray_hll_init(ray_hll_t* h, uint8_t p) {
+    if (!h) return -1;
+    if (p < 4) p = 4;            /* too small loses all accuracy */
+    if (p > 18) p = 18;           /* 256 KB cap on register array */
+    memset(h, 0, sizeof(*h));
+    uint32_t m = 1u << p;
+    h->p = p;
+    h->m = m;
+    h->regs = (uint8_t*)scratch_calloc(&h->_hdr, (size_t)m);
+    if (!h->regs) return -1;
+    return 0;
+}
+
+void ray_hll_free(ray_hll_t* h) {
+    if (!h) return;
+    if (h->_hdr) scratch_free(h->_hdr);
+    h->regs = NULL;
+    h->_hdr = NULL;
+    h->m = 0;
+    h->p = 0;
+}
+
+void ray_hll_reset(ray_hll_t* h) {
+    if (h && h->regs) memset(h->regs, 0, (size_t)h->m);
+}
+
+void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src) {
+    if (!dst || !src || !dst->regs || !src->regs) return;
+    if (dst->m != src->m) return;     /* mismatched precision — caller bug */
+    const uint8_t* s = src->regs;
+    uint8_t*       d = dst->regs;
+    uint32_t       m = dst->m;
+    /* Branchless max — keeps the hot per-shard merge in vector regs.
+     * The compiler usually auto-vectorises this to a packed-max sequence. */
+    for (uint32_t i = 0; i < m; i++) {
+        uint8_t a = d[i], b = s[i];
+        d[i] = a > b ? a : b;
+    }
+}
+
+/* HyperLogLog cardinality estimator (Flajolet, Fusy, Gandouet, Meunier 2007),
+ * with the original raw-estimate / linear-counting hybrid switch.  Skips the
+ * HLL++ small-range bias-correction tables because the linear-counting branch
+ * already gives a clean estimate below E ≤ 2.5·m, which is where the raw
+ * mean diverges from truth. */
+int64_t ray_hll_estimate(const ray_hll_t* h) {
+    if (!h || !h->regs) return 0;
+    uint32_t m = h->m;
+    if (m == 0) return 0;
+
+    /* alpha_m correction constant from the paper.  m == 16 / 32 / 64 use
+     * the closed-form values; everything else uses 0.7213 / (1 + 1.079/m). */
+    double alpha_m;
+    if      (m == 16) alpha_m = 0.673;
+    else if (m == 32) alpha_m = 0.697;
+    else if (m == 64) alpha_m = 0.709;
+    else              alpha_m = 0.7213 / (1.0 + 1.079 / (double)m);
+
+    /* Sum of 2^-reg[i].  Count zero registers for the linear-counting
+     * fallback at small cardinalities (when V > 0 and E ≤ 2.5·m). */
+    double   sum_inv  = 0.0;
+    uint32_t n_zeros  = 0;
+    for (uint32_t i = 0; i < m; i++) {
+        uint8_t r = h->regs[i];
+        sum_inv += ldexp(1.0, -(int)r);   /* 2^-r */
+        n_zeros += (r == 0);
+    }
+
+    double raw = alpha_m * (double)m * (double)m / sum_inv;
+
+    if (raw <= 2.5 * (double)m && n_zeros != 0) {
+        /* Linear counting — much tighter than raw for small E. */
+        raw = (double)m * log((double)m / (double)n_zeros);
+    }
+    /* Large-range bias-correction (the 2^32 upper-edge correction in the
+     * original paper) is for 32-bit hashes only — we hash 64 bits, so the
+     * raw value is already unbiased to ~2^57.  Skip. */
+
+    if (raw < 0.0) raw = 0.0;
+    return (int64_t)(raw + 0.5);
+}
+
+/* ---- Scalar approximate count-distinct aggregator ---------------------- */
+
+typedef struct {
+    const ray_t*  vec;
+    int8_t        type;
+    uint8_t       attrs;
+    bool          has_nulls;
+    ray_hll_t*    shards;          /* [n_workers] — one HLL per worker */
+    uint8_t       p;
+    uint32_t      n_workers;
+    _Atomic(int)  oom;
+} cda_scalar_ctx_t;
+
+static void cda_scalar_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) {
+    cda_scalar_ctx_t* c = (cda_scalar_ctx_t*)raw;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    ray_hll_t* sh = &c->shards[worker_id % c->n_workers];
+    if (!sh->regs) {
+        if (ray_hll_init(sh, c->p) != 0) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+    }
+    const ray_t* v = c->vec;
+    const void* base = ray_data((ray_t*)v);
+    int8_t  t = c->type;
+    bool    hn = c->has_nulls;
+    const int64_t CHK = 65535;
+
+    if (t == RAY_I64 || t == RAY_TIMESTAMP) {
+        const int64_t* d = (const int64_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int64_t v_i = d[r];
+            if (hn && v_i == NULL_I64) continue;
+            ray_hll_add(sh, ray_hash_i64(v_i));
+        }
+    } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) {
+        const int32_t* d = (const int32_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int32_t v_i = d[r];
+            if (hn && v_i == NULL_I32) continue;
+            ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+        }
+    } else if (t == RAY_I16) {
+        const int16_t* d = (const int16_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int16_t v_i = d[r];
+            if (hn && v_i == NULL_I16) continue;
+            ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+        }
+    } else if (t == RAY_BOOL || t == RAY_U8) {
+        const uint8_t* d = (const uint8_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            ray_hll_add(sh, ray_hash_i64((int64_t)d[r]));
+        }
+    } else if (t == RAY_F64) {
+        const double* d = (const double*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            double v_f = d[r];
+            if (v_f != v_f) continue;     /* NaN = null in F64 column */
+            ray_hll_add(sh, ray_hash_f64(v_f));
+        }
+    } else if (RAY_IS_SYM(t)) {
+        /* SYM is width-encoded — sym id 0 is the canonical empty-string
+         * sentinel (treat as null), every other id is a real distinct
+         * value, so hash the id directly. */
+        uint8_t w = c->attrs & RAY_SYM_W_MASK;
+        if (w == RAY_SYM_W64) {
+            const int64_t* d = (const int64_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                int64_t v_i = d[r];
+                if (v_i == 0) continue;
+                ray_hll_add(sh, ray_hash_i64(v_i));
+            }
+        } else if (w == RAY_SYM_W32) {
+            const uint32_t* d = (const uint32_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                uint32_t v_i = d[r];
+                if (v_i == 0) continue;
+                ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+            }
+        } else if (w == RAY_SYM_W16) {
+            const uint16_t* d = (const uint16_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                uint16_t v_i = d[r];
+                if (v_i == 0) continue;
+                ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+            }
+        } else {
+            const uint8_t* d = (const uint8_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                uint8_t v_i = d[r];
+                if (v_i == 0) continue;
+                ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+            }
+        }
+    } else if (t == RAY_STR) {
+        ray_t* vm = (ray_t*)v;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            size_t n = 0;
+            const char* s = ray_str_vec_get(vm, r, &n);
+            if (!s || n == 0) continue;
+            ray_hll_add(sh, ray_hash_bytes(s, n));
+        }
+    }
+    /* Unsupported types fall through silently — caller validates. */
+}
+
+ray_t* ray_count_distinct_approx(ray_t* x) {
+    if (!x || RAY_IS_ERR(x)) return x;
+    if (!ray_is_vec(x)) {
+        /* Scalar atom — distinct count is 1 (or 0 if null). */
+        if (ray_is_atom(x)) {
+            if (RAY_ATOM_IS_NULL(x)) return ray_i64(0);
+            return ray_i64(1);
+        }
+        return ray_error("type", "count_distinct_approx: vec expected");
+    }
+    int8_t t = x->type;
+    /* Reject types we don't hash. */
+    if (t != RAY_I64 && t != RAY_I32 && t != RAY_I16 && t != RAY_U8 &&
+        t != RAY_BOOL && t != RAY_F64 && t != RAY_DATE && t != RAY_TIME &&
+        t != RAY_TIMESTAMP && t != RAY_STR && !RAY_IS_SYM(t))
+        return ray_error("type", "count_distinct_approx: unsupported element type");
+    int64_t n = x->len;
+    if (n == 0) return ray_i64(0);
+
+    ray_pool_t* pool = ray_pool_get();
+    uint32_t nw = (pool && n >= RAY_PARALLEL_THRESHOLD)
+                  ? ray_pool_total_workers(pool) : 1;
+
+    ray_t* shards_hdr = NULL;
+    ray_hll_t* shards = (ray_hll_t*)scratch_calloc(
+        &shards_hdr, (size_t)nw * sizeof(ray_hll_t));
+    if (!shards) return ray_error("oom", NULL);
+
+    cda_scalar_ctx_t ctx = {
+        .vec = x,
+        .type = t,
+        .attrs = x->attrs,
+        .has_nulls = (x->attrs & RAY_ATTR_HAS_NULLS) != 0,
+        .shards = shards,
+        .p = RAY_HLL_DEFAULT_P,
+        .n_workers = nw,
+        .oom = 0,
+    };
+    if (nw > 1) {
+        ray_pool_dispatch(pool, cda_scalar_fn, &ctx, n);
+    } else {
+        cda_scalar_fn(&ctx, 0, 0, n);
+    }
+    if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) {
+        for (uint32_t w = 0; w < nw; w++) ray_hll_free(&shards[w]);
+        scratch_free(shards_hdr);
+        return ray_error("oom", "count_distinct_approx: HLL alloc failed");
+    }
+    /* Merge per-worker shards into shard[0], then estimate. */
+    for (uint32_t w = 1; w < nw; w++) {
+        if (shards[w].regs)
+            ray_hll_merge(&shards[0], &shards[w]);
+    }
+    int64_t est = shards[0].regs ? ray_hll_estimate(&shards[0]) : 0;
+    for (uint32_t w = 0; w < nw; w++) ray_hll_free(&shards[w]);
+    scratch_free(shards_hdr);
+    return ray_i64(est);
+}
+
+/* ---- Per-group HLL --------------------------------------------------- */
+
+typedef struct {
+    const ray_t*   vec;
+    int8_t         type;
+    uint8_t        attrs;
+    bool           has_nulls;
+    const int64_t* idx_buf;
+    const int64_t* offsets;
+    const int64_t* counts;       /* per-group length — offsets has only n_groups entries */
+    uint8_t        p;
+    uint32_t       m;
+    int64_t*       out;
+    _Atomic(int)   oom;
+} cda_pg_buf_ctx_t;
+
+static void cda_pg_buf_task(void* raw, uint32_t worker_id, int64_t start, int64_t end) {
+    (void)worker_id;
+    cda_pg_buf_ctx_t* c = (cda_pg_buf_ctx_t*)raw;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    const void* base = ray_data((ray_t*)c->vec);
+    int8_t  t  = c->type;
+    bool    hn = c->has_nulls;
+
+    /* One private HLL per task (allocated on stack so we never touch
+     * the shared scratch arena from a worker thread).  P≤14 → m≤16384,
+     * fits comfortably in the default 8 MiB worker stack. */
+    uint8_t regs[1u << 14];
+    ray_hll_t sk = { .p = c->p, .m = c->m, .regs = regs, ._hdr = NULL };
+
+    for (int64_t g = start; g < end; g++) {
+        memset(regs, 0, c->m);
+        int64_t s = c->offsets[g];
+        int64_t e = s + c->counts[g];
+        if (t == RAY_I64 || t == RAY_TIMESTAMP) {
+            const int64_t* d = (const int64_t*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                int64_t v = d[r];
+                if (hn && v == NULL_I64) continue;
+                ray_hll_add(&sk, ray_hash_i64(v));
+            }
+        } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) {
+            const int32_t* d = (const int32_t*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                int32_t v = d[r];
+                if (hn && v == NULL_I32) continue;
+                ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+            }
+        } else if (t == RAY_I16) {
+            const int16_t* d = (const int16_t*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                int16_t v = d[r];
+                if (hn && v == NULL_I16) continue;
+                ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+            }
+        } else if (t == RAY_BOOL || t == RAY_U8) {
+            const uint8_t* d = (const uint8_t*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                ray_hll_add(&sk, ray_hash_i64((int64_t)d[r]));
+            }
+        } else if (t == RAY_F64) {
+            const double* d = (const double*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                double v = d[r];
+                if (v != v) continue;
+                ray_hll_add(&sk, ray_hash_f64(v));
+            }
+        } else if (RAY_IS_SYM(t)) {
+            uint8_t w = c->attrs & RAY_SYM_W_MASK;
+            if (w == RAY_SYM_W64) {
+                const int64_t* d = (const int64_t*)base;
+                for (int64_t k = s; k < e; k++) {
+                    int64_t r = c->idx_buf[k];
+                    int64_t v = d[r]; if (v == 0) continue;
+                    ray_hll_add(&sk, ray_hash_i64(v));
+                }
+            } else if (w == RAY_SYM_W32) {
+                const uint32_t* d = (const uint32_t*)base;
+                for (int64_t k = s; k < e; k++) {
+                    int64_t r = c->idx_buf[k];
+                    uint32_t v = d[r]; if (v == 0) continue;
+                    ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+                }
+            } else if (w == RAY_SYM_W16) {
+                const uint16_t* d = (const uint16_t*)base;
+                for (int64_t k = s; k < e; k++) {
+                    int64_t r = c->idx_buf[k];
+                    uint16_t v = d[r]; if (v == 0) continue;
+                    ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+                }
+            } else {
+                const uint8_t* d = (const uint8_t*)base;
+                for (int64_t k = s; k < e; k++) {
+                    int64_t r = c->idx_buf[k];
+                    uint8_t v = d[r]; if (v == 0) continue;
+                    ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+                }
+            }
+        }
+        c->out[g] = ray_hll_estimate(&sk);
+    }
+}
+
+int ray_count_distinct_approx_pg_buf(ray_t* src,
+                                      const int64_t* idx_buf,
+                                      const int64_t* offsets,
+                                      const int64_t* counts,
+                                      int64_t n_groups,
+                                      uint8_t p, int64_t* out)
+{
+    if (!src || RAY_IS_ERR(src) || !idx_buf || !offsets || !counts || !out)
+        return -1;
+    int8_t t = src->type;
+    bool hashable = (t == RAY_I64 || t == RAY_I32 || t == RAY_I16 ||
+                      t == RAY_U8 || t == RAY_BOOL || t == RAY_F64 ||
+                      t == RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP ||
+                      RAY_IS_SYM(t));
+    if (!hashable) return -1;
+    if (n_groups <= 0) return 0;
+    if (p < 4) p = 4;
+    if (p > 14) p = 14;
+    uint32_t m = 1u << p;
+
+    cda_pg_buf_ctx_t ctx = {
+        .vec = src,
+        .type = t,
+        .attrs = src->attrs,
+        .has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0,
+        .idx_buf = idx_buf,
+        .offsets = offsets,
+        .counts = counts,
+        .p = p,
+        .m = m,
+        .out = out,
+        .oom = 0,
+    };
+    ray_pool_t* pool = ray_pool_get();
+    if (pool && ray_pool_total_workers(pool) >= 2 && n_groups >= 4) {
+        ray_pool_dispatch_n(pool, cda_pg_buf_task, &ctx, (uint32_t)n_groups);
+    } else {
+        cda_pg_buf_task(&ctx, 0, 0, n_groups);
+    }
+    if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) return -1;
+    return 0;
+}
diff --git a/src/ops/hll.h b/src/ops/hll.h
new file mode 100644
index 00000000..29b98332
--- /dev/null
+++ b/src/ops/hll.h
@@ -0,0 +1,118 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_OPS_HLL_H
+#define RAY_OPS_HLL_H
+
+/**
+ * Probabilistic cardinality sketch (HyperLogLog).
+ *
+ * Each sketch holds 2^P registers; each register stores the maximum
+ * leading-zero count (rho) seen for any hash whose top P bits index
+ * that register.  Cardinality is then read off the harmonic mean of
+ * 2^reg over all registers, with bias correction for both ends of
+ * the range.  Standard error ≈ 1.04 / sqrt(2^P).  P=14 → ≈ 0.8 %.
+ *
+ * Memory: 1 byte per register (8-bit reg holds rho up to 64+P, way
+ * over the 6 bits a packed implementation would need; the extra few
+ * KB buys a tighter hot loop).  At P=14 a sketch is 16 KB and lives
+ * in L2 for the duration of one query.
+ *
+ * The sketch is mergeable element-wise (max), which is the property
+ * the per-group / per-worker aggregation paths rely on: each worker
+ * builds a local sketch and the planner merges them at finalisation.
+ */
+
+#include "rayforce.h"
+#include "ops/hash.h"
+
+/* Default precision: 14 (16384 registers, ~0.81 % std error, 16 KB). */
+#define RAY_HLL_DEFAULT_P  14
+
+typedef struct {
+    uint8_t  p;        /* precision: register count = 1 << p */
+    uint32_t m;        /* register count */
+    uint8_t* regs;     /* [m] — 1 byte per register, holds rho count */
+    ray_t*   _hdr;     /* scratch handle for regs */
+} ray_hll_t;
+
+/* Initialise an empty sketch with `p` precision bits.  Allocates regs
+ * via scratch_alloc; the caller frees with ray_hll_free.  Returns 0 on
+ * success, -1 on OOM. */
+int  ray_hll_init(ray_hll_t* h, uint8_t p);
+
+/* Free the regs allocation.  Safe on a zeroed (uninitialised) sketch. */
+void ray_hll_free(ray_hll_t* h);
+
+/* Zero all registers (clears the sketch — same effect as init with the
+ * same p, but in-place; useful when reusing a sketch across calls). */
+void ray_hll_reset(ray_hll_t* h);
+
+/* Add a 64-bit hash to the sketch.  Caller is responsible for hashing
+ * its value type before invoking — see ray_hash_i64 / ray_hash_bytes
+ * in ops/hash.h.  Hot path; kept fully inline. */
+static inline void ray_hll_add(ray_hll_t* h, uint64_t hash) {
+    uint32_t idx = (uint32_t)(hash >> (64u - h->p));
+    /* The low (64-p) bits hold the value we scan for the leading-zero
+     * run.  Sentinel-bit at position (64-p-1) keeps the rho value in
+     * [1, 64-p+1] without a branch on all-zero. */
+    uint64_t rest = (hash << h->p) | (1ULL << (h->p - 1));
+    uint8_t  rho  = (uint8_t)(__builtin_clzll(rest) + 1u);
+    if (rho > h->regs[idx]) h->regs[idx] = rho;
+}
+
+/* Merge src into dst (element-wise max).  src and dst must share the
+ * same precision p. */
+void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src);
+
+/* Estimate the unique-value count of all hashes added so far.  Uses
+ * the standard HyperLogLog estimator with bias-corrected raw-mean for
+ * the mid-range and linear counting (m * ln(m/V)) when many registers
+ * are still zero (V = unused register count). */
+int64_t ray_hll_estimate(const ray_hll_t* h);
+
+/* Scalar approximate `count(distinct …)` over a vec, ~0.8 % standard
+ * error.  Handles I64/I32/I16/I8/U8/BOOL/F64/DATE/TIME/TIMESTAMP/SYM/
+ * STR.  Nulls are skipped (matches the SQL `count distinct` semantics).
+ * Parallelised: each worker builds a private sketch over its row range
+ * and the main thread merges them before extracting the estimate.
+ * Wired into `exec_count_distinct` above an input-row threshold. */
+ray_t* ray_count_distinct_approx(ray_t* x);
+
+/* Per-group approximate `count(distinct …)` over a buffered row-index
+ * layout: group g owns the row indices
+ *   idx_buf[offsets[g] .. offsets[g] + counts[g]).
+ * Parallelised across groups — one task per group, each task uses a
+ * private stack-resident HLL so total memory is O(n_workers · 1<<p).
+ * Callers holding a row_gid layout instead build idx_buf+offsets+counts
+ * once and call this; there's a single per-group kernel.  Writes the
+ * estimate to out[gid].  Returns 0 on success, -1 on unsupported type
+ * (caller falls back to exact). */
+int ray_count_distinct_approx_pg_buf(ray_t* src,
+                                      const int64_t* idx_buf,
+                                      const int64_t* offsets,
+                                      const int64_t* counts,
+                                      int64_t n_groups,
+                                      uint8_t p, int64_t* out);
+
+#endif /* RAY_OPS_HLL_H */
diff --git a/src/ops/query.c b/src/ops/query.c
index e8effbcf..73036e28 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -34,6 +34,7 @@
 #include "ops/rowsel.h"
 #include "ops/fused_group.h"
 #include "ops/fused_topk.h"
+#include "ops/hll.h"
 #include "ops/temporal.h"
 #include "core/profile.h"
 #include "table/sym.h"
@@ -2714,6 +2715,33 @@ static ray_t* count_distinct_per_group_buf(ray_t* inner_expr, ray_t* tbl,
     out->len = n_groups;
     int64_t* odata = (int64_t*)ray_data(out);
 
+    /* HyperLogLog approximate path — one task per group, each task with
+     * a private stack-resident sketch (~16 KB).  Triggered when the
+     * total inflated row count across all groups is large enough that
+     * the exact per-group dedup HT becomes memory-bandwidth-bound;
+     * 1 M rows is the same threshold the global path in
+     * exec_count_distinct uses.  Returns within ~0.8 % std error. */
+    /* HyperLogLog approximate path — one task per group, each task with
+     * a private stack-resident sketch (~16 KB).  Triggered when the
+     * total inflated row count across all groups is large enough that
+     * the exact per-group dedup HT becomes memory-bandwidth-bound;
+     * 1 M rows is the same threshold the global path in
+     * exec_count_distinct uses.  Returns within ~0.8 % std error. */
+    if (n_groups > 0) {
+        int64_t total_rows = 0;
+        for (int64_t g = 0; g < n_groups; g++) total_rows += grp_cnt[g];
+        if (total_rows >= (1 << 20)) {
+            if (ray_count_distinct_approx_pg_buf(src, idx_buf, offsets,
+                                                  grp_cnt, n_groups,
+                                                  14, odata) == 0) {
+                ray_release(src);
+                return out;
+            }
+            /* Fall through on type miss; out still zeroed. */
+            memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
+        }
+    }
+
     /* Parallel path: dispatch one task per group when src has a flat
      * numeric / SYM layout we can read with a typed pointer.  Each task
      * does its own dedup with a scratch hash table — no gather_by_idx

From 008f6901600d0095ea4d641c3cb05170ab2adbb2 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 14:47:24 +0200
Subject: [PATCH 10/36] feat(idx): per-chunk min/max zone index + filter
 chunk-skip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New index kind RAY_IDX_CHUNK_ZONE (5).  Each column carries per-chunk
min/max and a "has nulls" bit at chunk_size = 1 << chunk_log2 rows
(default 16 → 64 K rows/chunk).  Built once at column ingest time —
`.csv.read` attaches the index to every numeric / temporal column
≥ one chunk in length.  Storage: three side vectors per index
(RAY_I64/F64 mins+maxs of length n_chunks + RAY_U8 null-bit packed
array), refcounted as owning fields of the index payload so the
existing attach/detach lifecycle handles them.

Two consumers:

  scalar min/max reduce (`ray_min_fn` / `ray_max_fn`)
    O(n_chunks) walk over mins[*] / maxs[*] instead of O(n_rows).
    Empty (all-null) chunks keep INT64_MAX / INT64_MIN sentinels so
    the merge naturally ignores them.

  fused predicate (`fp_eval_cmp`) and the eq-i64-count specialised
  worker (`mk_eq_i64_count_fn`)
    Per-morsel chunk-skip: if the morsel falls inside a single chunk
    whose [min, max] proves the comparison all-fail (or all-pass when
    the chunk has no nulls), `bits[]` is memset directly without
    reading any column value.  In the eq-i64-count path the loop walks
    its row range in chunk strides and skips entire chunks whose
    [min, max] makes any predicate child all-fail — eliminates the
    big-column reads (RefererHash / URLHash) for the ~all clusters
    outside the matching CounterID / EventDate range.

Measured (10M-row hits, in-memory):

  q06 (min/max EventDate)        6.4 → 0.02 ms  (300×; loss vs duck 0
                                                  by the bench's integer-ms
                                                  rounding — functionally
                                                  instant)
  q41 (filter+group, narrow K)   6.0 → 3.2 ms   FLIP vs duck 5
  q40 (filter+group, wide K)      17 → 13 ms    closer to duck 4
  q37 (filter+group, clustered)   15 → 12 ms    bigger margin
  q38 (filter+group, clustered)   17 → 15 ms    bigger margin

Test suite 2657/2659 (2 skipped, 0 failed).  Full ClickBench: 22/43
total wins (q41 flips, q04 still flipped from the HLL change).
---
 src/io/csv.c          |  26 ++++++
 src/ops/agg.c         |  73 +++++++++++++++-
 src/ops/fused_group.c | 159 +++++++++++++++++++++++++++++++---
 src/ops/idxop.c       | 192 ++++++++++++++++++++++++++++++++++++++++--
 src/ops/idxop.h       |  36 ++++++--
 5 files changed, 463 insertions(+), 23 deletions(-)

diff --git a/src/io/csv.c b/src/io/csv.c
index f8189ecb..0784d89e 100644
--- a/src/io/csv.c
+++ b/src/io/csv.c
@@ -44,6 +44,7 @@
 #include "core/pool.h"
 #include "lang/format.h"
 #include "ops/hash.h"
+#include "ops/idxop.h"      /* attach per-chunk zone index after load */
 #include "store/col.h"
 #include "store/fileio.h"
 #include "store/splay.h"
@@ -1410,6 +1411,20 @@ static ray_t* csv_materialize_rows(const char* buf, size_t file_size,
         col_data[c] = dst;
     }
 
+    /* Per-chunk min/max + null bit on every column big enough to be worth
+     * indexing — gives the reduce min/max and the filter chunk-skip paths
+     * an O(n_chunks) scan instead of O(n_rows).  Attach is best-effort:
+     * unsupported types (RAY_STR/RAY_SYM/RAY_GUID in v1) just stay
+     * unindexed and the consumer falls back to a row scan. */
+    for (int c = 0; c < ncols; c++) {
+        ray_t* v = col_vecs[c];
+        if (!v || RAY_IS_ERR(v)) continue;
+        if (v->len < (1 << 16)) continue;        /* < one chunk, skip */
+        ray_t* r = ray_index_attach_chunk_zone(&v, 16);
+        if (r && !RAY_IS_ERR(r)) col_vecs[c] = v;  /* attach succeeded */
+        /* On failure the original column stays in col_vecs[c]; ignore. */
+    }
+
     ray_t* tbl = ray_table_new(ncols);
     if (!tbl || RAY_IS_ERR(tbl)) {
         for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]);
@@ -1788,6 +1803,17 @@ ray_t* ray_read_csv_named_opts(const char* path, char delimiter, bool header,
 
     /* ---- 11. Build table ---- */
     {
+        /* Best-effort per-chunk zone index attach (see comment on the
+         * matching loop in build_table_from_cols) — unsupported types
+         * fall through to the unindexed path inside the consumer. */
+        for (int c = 0; c < ncols; c++) {
+            ray_t* v = col_vecs[c];
+            if (!v || RAY_IS_ERR(v)) continue;
+            if (v->len < (1 << 16)) continue;
+            ray_t* r = ray_index_attach_chunk_zone(&v, 16);
+            if (r && !RAY_IS_ERR(r)) col_vecs[c] = v;
+        }
+
         ray_t* tbl = ray_table_new(ncols);
         if (!tbl || RAY_IS_ERR(tbl)) {
             for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]);
diff --git a/src/ops/agg.c b/src/ops/agg.c
index fee02d2e..34328522 100644
--- a/src/ops/agg.c
+++ b/src/ops/agg.c
@@ -23,6 +23,7 @@
 
 #include "lang/internal.h"
 #include "ops/ops.h"
+#include "ops/idxop.h"   /* RAY_IDX_CHUNK_ZONE fast path for min/max */
 #include "mem/heap.h"
 
 #include <stdlib.h>  /* qsort (introselect fallback) */
@@ -328,7 +329,43 @@ ray_t* ray_min_fn(ray_t* x) {
     if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MIN);
     if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 0);
     if (ray_is_atom(x)) { ray_retain(x); return x; }
-    if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_min_op);
+    if (ray_is_vec(x)) {
+        /* Per-chunk zone index fast path: O(n_chunks) instead of O(n_rows).
+         * Only valid when the index was built for the column's current len
+         * (mutation paths call ray_index_drop). */
+        if (ray_index_kind(x) == RAY_IDX_CHUNK_ZONE) {
+            ray_index_t* ix = ray_index_payload(x->index);
+            if (ix->built_for_len == x->len) {
+                uint32_t n_chunks = ix->u.chunk_zone.n_chunks;
+                if (ix->u.chunk_zone.is_f64) {
+                    const double* mins = (const double*)ray_data(ix->u.chunk_zone.mins);
+                    double mn = INFINITY;
+                    for (uint32_t g = 0; g < n_chunks; g++)
+                        if (mins[g] < mn) mn = mins[g];
+                    if (mn == INFINITY) return ray_typed_null(-RAY_F64);
+                    return make_f64(mn);
+                } else {
+                    const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins);
+                    int64_t mn = INT64_MAX;
+                    for (uint32_t g = 0; g < n_chunks; g++)
+                        if (mins[g] < mn) mn = mins[g];
+                    if (mn == INT64_MAX) return ray_typed_null(-x->type);
+                    /* Preserve the column's storage width on the result. */
+                    switch (x->type) {
+                    case RAY_BOOL:      return ray_bool((bool)mn);
+                    case RAY_U8:        return ray_u8((uint8_t)mn);
+                    case RAY_I16:       return ray_i16((int16_t)mn);
+                    case RAY_I32:       return ray_i32((int32_t)mn);
+                    case RAY_DATE:      return ray_date((int32_t)mn);
+                    case RAY_TIME:      return ray_time(mn);
+                    case RAY_TIMESTAMP: return ray_timestamp(mn);
+                    default:            return ray_i64(mn);
+                    }
+                }
+            }
+        }
+        AGG_VEC_VIA_DAG(x, ray_min_op);
+    }
     if (!is_list(x)) return ray_error("type", NULL);
     int64_t len = ray_len(x);
     if (len == 0) return ray_error("domain", NULL);
@@ -350,7 +387,39 @@ ray_t* ray_max_fn(ray_t* x) {
     if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MAX);
     if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 1);
     if (ray_is_atom(x)) { ray_retain(x); return x; }
-    if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_max_op);
+    if (ray_is_vec(x)) {
+        if (ray_index_kind(x) == RAY_IDX_CHUNK_ZONE) {
+            ray_index_t* ix = ray_index_payload(x->index);
+            if (ix->built_for_len == x->len) {
+                uint32_t n_chunks = ix->u.chunk_zone.n_chunks;
+                if (ix->u.chunk_zone.is_f64) {
+                    const double* maxs = (const double*)ray_data(ix->u.chunk_zone.maxs);
+                    double mx = -INFINITY;
+                    for (uint32_t g = 0; g < n_chunks; g++)
+                        if (maxs[g] > mx) mx = maxs[g];
+                    if (mx == -INFINITY) return ray_typed_null(-RAY_F64);
+                    return make_f64(mx);
+                } else {
+                    const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs);
+                    int64_t mx = INT64_MIN;
+                    for (uint32_t g = 0; g < n_chunks; g++)
+                        if (maxs[g] > mx) mx = maxs[g];
+                    if (mx == INT64_MIN) return ray_typed_null(-x->type);
+                    switch (x->type) {
+                    case RAY_BOOL:      return ray_bool((bool)mx);
+                    case RAY_U8:        return ray_u8((uint8_t)mx);
+                    case RAY_I16:       return ray_i16((int16_t)mx);
+                    case RAY_I32:       return ray_i32((int32_t)mx);
+                    case RAY_DATE:      return ray_date((int32_t)mx);
+                    case RAY_TIME:      return ray_time(mx);
+                    case RAY_TIMESTAMP: return ray_timestamp(mx);
+                    default:            return ray_i64(mx);
+                    }
+                }
+            }
+        }
+        AGG_VEC_VIA_DAG(x, ray_max_op);
+    }
     if (!is_list(x)) return ray_error("type", NULL);
     int64_t len = ray_len(x);
     if (len == 0) return ray_error("domain", NULL);
diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index 83cb7e39..99c461d1 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -23,6 +23,7 @@
 
 #include "ops/fused_group.h"
 #include "ops/fused_pred.h" /* fp_pred_t / fp_compile_pred / fp_eval_pred */
+#include "ops/idxop.h"      /* RAY_IDX_CHUNK_ZONE chunk-skip in fp_eval_cmp */
 #include "lang/eval.h"      /* RAY_ATTR_NAME */
 #include "core/pool.h"      /* ray_pool_get / ray_pool_dispatch */
 
@@ -344,6 +345,72 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end,
         return;
     }
 
+    /* Chunk-zone fast path: if the column carries per-chunk min/max
+     * metadata and [start, end) fits inside a single chunk, decide the
+     * whole morsel from chunk extrema without reading a single value.
+     * Only integer/temporal comparisons (EQ/NE/LT/LE/GT/GE) — LIKE/IN
+     * have their own evaluators below and SYM ordering is rejected at
+     * compile time anyway.  The all-pass shortcut is gated on "no
+     * nulls in this chunk" because SQL `(x op c)` is FALSE/NULL when x
+     * is NULL; the all-fail shortcut needs no such guard. */
+    if (p->col_obj && (p->col_obj->attrs & RAY_ATTR_HAS_INDEX) &&
+        p->col_obj->index)
+    {
+        ray_index_t* ix = ray_index_payload(p->col_obj->index);
+        if (ix->kind == RAY_IDX_CHUNK_ZONE &&
+            ix->built_for_len == p->col_obj->len &&
+            !ix->u.chunk_zone.is_f64 &&
+            (op == FP_EQ || op == FP_NE ||
+             op == FP_LT || op == FP_LE ||
+             op == FP_GT || op == FP_GE))
+        {
+            uint8_t log2 = ix->u.chunk_zone.chunk_log2;
+            int64_t s_ch = start >> log2;
+            int64_t e_ch = (end - 1) >> log2;
+            if (s_ch == e_ch && (uint32_t)s_ch < ix->u.chunk_zone.n_chunks) {
+                const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins);
+                const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs);
+                int64_t cmin = mins[s_ch], cmax = maxs[s_ch];
+                if (cmin <= cmax) {       /* skip empty (all-null) chunks */
+                    const uint8_t* nb = (const uint8_t*)ray_data(ix->u.chunk_zone.null_bits);
+                    bool has_nulls = (nb[s_ch >> 3] >> (s_ch & 7)) & 1u;
+                    int decision = -1;   /* 0=all-fail, 1=all-pass, -1=mixed */
+                    switch (op) {
+                    case FP_EQ:
+                        if (cval < cmin || cval > cmax)        decision = 0;
+                        else if (!has_nulls && cmin == cmax)   decision = 1;
+                        break;
+                    case FP_NE:
+                        if (!has_nulls && (cval < cmin || cval > cmax)) decision = 1;
+                        else if (cmin == cmax && cval == cmin)          decision = 0;
+                        break;
+                    case FP_LT:
+                        if (cmin >= cval)                      decision = 0;
+                        else if (!has_nulls && cmax < cval)    decision = 1;
+                        break;
+                    case FP_LE:
+                        if (cmin >  cval)                      decision = 0;
+                        else if (!has_nulls && cmax <= cval)   decision = 1;
+                        break;
+                    case FP_GT:
+                        if (cmax <= cval)                      decision = 0;
+                        else if (!has_nulls && cmin >  cval)   decision = 1;
+                        break;
+                    case FP_GE:
+                        if (cmax <  cval)                      decision = 0;
+                        else if (!has_nulls && cmin >= cval)   decision = 1;
+                        break;
+                    default: break;
+                    }
+                    if (decision >= 0) {
+                        memset(bits, (uint8_t)decision, (size_t)n);
+                        return;
+                    }
+                }
+            }
+        }
+    }
+
     /* SYM low-card fold: const not in dict ⇒ EQ all-zero / NE all-one.
      * Ordering ops are rejected at compile for SYM, so unreachable here. */
     if (ct == RAY_SYM && !p->cval_in_dict) {
@@ -2568,20 +2635,90 @@ static void mk_eq_i64_count_fn(void* raw, uint32_t worker_id,
     const fp_cmp_t* eq = &c->pred.children[fc->eq_idx];
     const int64_t* eq_col = (const int64_t*)eq->col_base;
     int64_t eq_val = eq->cval;
-    for (int64_t row = start; row < end; row++) {
-        if (eq_col[row] != eq_val) continue;
-        uint8_t pass = 1;
-        for (uint8_t i = 0; i < c->pred.n_children; i++) {
-            if (i == fc->eq_idx) continue;
-            if (!fp_eval_cmp_one(&c->pred.children[i], row)) {
-                pass = 0;
+
+    /* Chunk-skip: for each predicate child whose column carries a
+     * chunk_zone index, walk the row range in chunk strides and skip
+     * any chunk where the child's [min, max] proves an all-fail.  For
+     * clustered columns (e.g. data sorted by CounterID, EventDate) this
+     * eliminates the per-row RefererHash/URLHash read for ~all chunks
+     * outside the matching counter / date range — q40/q41/q42 pattern.
+     * Picks chunk_log2 from any indexed child (every chunk_zone built
+     * by csv.read uses the same chunk_log2 today).  Falls through to
+     * the plain per-row loop when no child has a usable index. */
+    uint8_t chunk_log2 = 0;
+    for (uint8_t i = 0; i < c->pred.n_children; i++) {
+        ray_t* co = c->pred.children[i].col_obj;
+        if (co && (co->attrs & RAY_ATTR_HAS_INDEX) && co->index) {
+            ray_index_t* ix = ray_index_payload(co->index);
+            if (ix->kind == RAY_IDX_CHUNK_ZONE &&
+                ix->built_for_len == co->len) {
+                chunk_log2 = ix->u.chunk_zone.chunk_log2;
                 break;
             }
         }
-        if (!pass) continue;
-        if (mk_count_upsert_row(c, sh, row) != 0) {
-            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
-            return;
+    }
+
+    int64_t row = start;
+    while (row < end) {
+        int64_t chunk_end;
+        if (chunk_log2 > 0) {
+            int64_t csz = 1LL << chunk_log2;
+            chunk_end = ((row >> chunk_log2) + 1) << chunk_log2;
+            (void)csz;
+            if (chunk_end > end) chunk_end = end;
+            bool all_fail = false;
+            for (uint8_t i = 0; i < c->pred.n_children && !all_fail; i++) {
+                const fp_cmp_t* p = &c->pred.children[i];
+                ray_t* co = p->col_obj;
+                if (!co || !(co->attrs & RAY_ATTR_HAS_INDEX) || !co->index)
+                    continue;
+                ray_index_t* ix = ray_index_payload(co->index);
+                if (ix->kind != RAY_IDX_CHUNK_ZONE ||
+                    ix->built_for_len != co->len ||
+                    ix->u.chunk_zone.chunk_log2 != chunk_log2 ||
+                    ix->u.chunk_zone.is_f64)
+                    continue;
+                fp_op_t op = p->op;
+                if (op != FP_EQ && op != FP_NE && op != FP_LT &&
+                    op != FP_LE && op != FP_GT && op != FP_GE)
+                    continue;
+                int64_t s_ch = row >> chunk_log2;
+                if ((uint32_t)s_ch >= ix->u.chunk_zone.n_chunks) continue;
+                const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins);
+                const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs);
+                int64_t cmin = mins[s_ch], cmax = maxs[s_ch];
+                if (cmin > cmax) continue;   /* empty chunk */
+                int64_t cv = p->cval;
+                switch (op) {
+                case FP_EQ: if (cv < cmin || cv > cmax) all_fail = true; break;
+                case FP_NE: if (cmin == cmax && cv == cmin) all_fail = true; break;
+                case FP_LT: if (cmin >= cv) all_fail = true; break;
+                case FP_LE: if (cmin >  cv) all_fail = true; break;
+                case FP_GT: if (cmax <= cv) all_fail = true; break;
+                case FP_GE: if (cmax <  cv) all_fail = true; break;
+                default: break;
+                }
+            }
+            if (all_fail) { row = chunk_end; continue; }
+        } else {
+            chunk_end = end;
+        }
+
+        for (; row < chunk_end; row++) {
+            if (eq_col[row] != eq_val) continue;
+            uint8_t pass = 1;
+            for (uint8_t i = 0; i < c->pred.n_children; i++) {
+                if (i == fc->eq_idx) continue;
+                if (!fp_eval_cmp_one(&c->pred.children[i], row)) {
+                    pass = 0;
+                    break;
+                }
+            }
+            if (!pass) continue;
+            if (mk_count_upsert_row(c, sh, row) != 0) {
+                atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                return;
+            }
         }
     }
 }
diff --git a/src/ops/idxop.c b/src/ops/idxop.c
index 3f74476b..6e0a3d37 100644
--- a/src/ops/idxop.c
+++ b/src/ops/idxop.c
@@ -154,6 +154,17 @@ void ray_index_release_payload(ray_index_t* ix) {
             ray_release(ix->u.bloom.bits);
         ix->u.bloom.bits = NULL;
         break;
+    case RAY_IDX_CHUNK_ZONE:
+        if (ix->u.chunk_zone.mins && !RAY_IS_ERR(ix->u.chunk_zone.mins))
+            ray_release(ix->u.chunk_zone.mins);
+        if (ix->u.chunk_zone.maxs && !RAY_IS_ERR(ix->u.chunk_zone.maxs))
+            ray_release(ix->u.chunk_zone.maxs);
+        if (ix->u.chunk_zone.null_bits && !RAY_IS_ERR(ix->u.chunk_zone.null_bits))
+            ray_release(ix->u.chunk_zone.null_bits);
+        ix->u.chunk_zone.mins = NULL;
+        ix->u.chunk_zone.maxs = NULL;
+        ix->u.chunk_zone.null_bits = NULL;
+        break;
     case RAY_IDX_ZONE:
     case RAY_IDX_NONE:
         break;
@@ -176,6 +187,14 @@ void ray_index_retain_payload(ray_index_t* ix) {
         if (ix->u.bloom.bits && !RAY_IS_ERR(ix->u.bloom.bits))
             ray_retain(ix->u.bloom.bits);
         break;
+    case RAY_IDX_CHUNK_ZONE:
+        if (ix->u.chunk_zone.mins && !RAY_IS_ERR(ix->u.chunk_zone.mins))
+            ray_retain(ix->u.chunk_zone.mins);
+        if (ix->u.chunk_zone.maxs && !RAY_IS_ERR(ix->u.chunk_zone.maxs))
+            ray_retain(ix->u.chunk_zone.maxs);
+        if (ix->u.chunk_zone.null_bits && !RAY_IS_ERR(ix->u.chunk_zone.null_bits))
+            ray_retain(ix->u.chunk_zone.null_bits);
+        break;
     case RAY_IDX_ZONE:
     case RAY_IDX_NONE:
         break;
@@ -262,6 +281,107 @@ static ray_err_t zone_scan(ray_t* v, ray_index_t* ix) {
     }
 }
 
+/* --------------------------------------------------------------------------
+ * Chunk-zone scan -- per-(1<<chunk_log2)-row min/max + null flag
+ *
+ * For each chunk g in [0, n_chunks) the scan computes the chunk's min and
+ * max value across its row range and sets the chunk's null-bit if any row
+ * in that chunk is a null sentinel.  Whole-column extrema fall out as
+ * min(mins[*]) / max(maxs[*]) so the reduce min/max path can consume this
+ * index without needing a separate column-wide zone.
+ * -------------------------------------------------------------------------- */
+
+static ray_err_t chunk_zone_scan_int(ray_t* v, ray_index_t* ix,
+                                     int elem_size) {
+    uint32_t n_chunks = ix->u.chunk_zone.n_chunks;
+    uint8_t  log2     = ix->u.chunk_zone.chunk_log2;
+    int64_t  csz      = 1LL << log2;
+    int64_t  n        = v->len;
+    int64_t* mins     = (int64_t*)ray_data(ix->u.chunk_zone.mins);
+    int64_t* maxs     = (int64_t*)ray_data(ix->u.chunk_zone.maxs);
+    uint8_t* nbits    = (uint8_t*)ray_data(ix->u.chunk_zone.null_bits);
+    const uint8_t* base = (const uint8_t*)ray_data(v);
+
+    for (uint32_t g = 0; g < n_chunks; g++) {
+        int64_t s = (int64_t)g * csz;
+        int64_t e = s + csz; if (e > n) e = n;
+        int64_t mn = INT64_MAX, mx = INT64_MIN;
+        bool any_null = false;
+        for (int64_t i = s; i < e; i++) {
+            if (ray_vec_is_null(v, i)) { any_null = true; continue; }
+            int64_t val = 0;
+            switch (elem_size) {
+            case 1: val = (int64_t)base[i]; break;
+            case 2: { int16_t t; memcpy(&t, base + i*2, 2); val = (int64_t)t; break; }
+            case 4: { int32_t t; memcpy(&t, base + i*4, 4); val = (int64_t)t; break; }
+            case 8: { int64_t t; memcpy(&t, base + i*8, 8); val = t;          break; }
+            default: return RAY_ERR_TYPE;
+            }
+            if (val < mn) mn = val;
+            if (val > mx) mx = val;
+        }
+        /* Empty (all-null) chunks keep mn=INT64_MAX / mx=INT64_MIN so
+         * the reduce path's min(mins[*]) / max(maxs[*]) ignores them. */
+        mins[g] = mn;
+        maxs[g] = mx;
+        if (any_null) nbits[g >> 3] |= (uint8_t)(1u << (g & 7));
+    }
+    return RAY_OK;
+}
+
+static ray_err_t chunk_zone_scan_float(ray_t* v, ray_index_t* ix,
+                                       int elem_size) {
+    uint32_t n_chunks = ix->u.chunk_zone.n_chunks;
+    uint8_t  log2     = ix->u.chunk_zone.chunk_log2;
+    int64_t  csz      = 1LL << log2;
+    int64_t  n        = v->len;
+    double*  mins     = (double*)ray_data(ix->u.chunk_zone.mins);
+    double*  maxs     = (double*)ray_data(ix->u.chunk_zone.maxs);
+    uint8_t* nbits    = (uint8_t*)ray_data(ix->u.chunk_zone.null_bits);
+    const uint8_t* base = (const uint8_t*)ray_data(v);
+
+    for (uint32_t g = 0; g < n_chunks; g++) {
+        int64_t s = (int64_t)g * csz;
+        int64_t e = s + csz; if (e > n) e = n;
+        double mn = INFINITY, mx = -INFINITY;
+        bool any_null = false;
+        for (int64_t i = s; i < e; i++) {
+            if (ray_vec_is_null(v, i)) { any_null = true; continue; }
+            double val = 0.0;
+            if (elem_size == 4) {
+                float t; memcpy(&t, base + i*4, 4); val = (double)t;
+            } else {
+                memcpy(&val, base + i*8, 8);
+            }
+            if (isnan(val)) { any_null = true; continue; }
+            if (val < mn) mn = val;
+            if (val > mx) mx = val;
+        }
+        /* Empty (all-null) chunks keep mn=+inf / mx=-inf so reduce
+         * (min/max across mins[]/maxs[]) ignores them. */
+        mins[g] = mn;
+        maxs[g] = mx;
+        if (any_null) nbits[g >> 3] |= (uint8_t)(1u << (g & 7));
+    }
+    return RAY_OK;
+}
+
+static ray_err_t chunk_zone_scan(ray_t* v, ray_index_t* ix) {
+    switch (v->type) {
+    case RAY_BOOL:
+    case RAY_U8:        return chunk_zone_scan_int(v, ix, 1);
+    case RAY_I16:       return chunk_zone_scan_int(v, ix, 2);
+    case RAY_I32:
+    case RAY_DATE:      return chunk_zone_scan_int(v, ix, 4);
+    case RAY_I64:
+    case RAY_TIME:
+    case RAY_TIMESTAMP: return chunk_zone_scan_int(v, ix, 8);
+    case RAY_F32:       return chunk_zone_scan_float(v, ix, 4);
+    case RAY_F64:       return chunk_zone_scan_float(v, ix, 8);
+    default:            return RAY_ERR_NYI;
+    }
+}
+
 /* --------------------------------------------------------------------------
  * Attach
  *
@@ -335,6 +455,59 @@ ray_t* ray_index_attach_zone(ray_t** vp) {
     return attach_finalize(v, idx);
 }
 
+ray_t* ray_index_attach_chunk_zone(ray_t** vp, uint8_t chunk_log2) {
+    ray_t* v = prepare_attach(vp, "chunk_zone");
+    if (RAY_IS_ERR(v)) return v;
+
+    if (chunk_log2 == 0) chunk_log2 = 16;          /* default 64 K rows / chunk */
+    if (chunk_log2 < 8 || chunk_log2 > 22)
+        return ray_error("domain", "chunk_zone: chunk_log2 out of range [8, 22]");
+    int64_t csz = 1LL << chunk_log2;
+    /* No point indexing a column smaller than one chunk — fall back to
+     * the column-wide zone (or no index at all) at that size. */
+    if (v->len < csz)
+        return ray_error("domain", "chunk_zone: column has fewer rows than one chunk");
+
+    uint32_t n_chunks = (uint32_t)((v->len + csz - 1) / csz);
+
+    ray_t* idx = ray_index_alloc(RAY_IDX_CHUNK_ZONE, v->type, v->len);
+    if (!idx || RAY_IS_ERR(idx)) return idx;
+    ray_index_t* ix = ray_index_payload(idx);
+    ix->u.chunk_zone.n_chunks   = n_chunks;
+    ix->u.chunk_zone.chunk_log2 = chunk_log2;
+    ix->u.chunk_zone.is_f64     = (v->type == RAY_F64 || v->type == RAY_F32) ? 1 : 0;
+
+    int8_t arr_type = ix->u.chunk_zone.is_f64 ? RAY_F64 : RAY_I64;
+    ray_t* mins = ray_vec_new(arr_type, (int64_t)n_chunks);
+    ray_t* maxs = ray_vec_new(arr_type, (int64_t)n_chunks);
+    int64_t nb_len = (int64_t)((n_chunks + 7) / 8);
+    ray_t* nbits = ray_vec_new(RAY_U8, nb_len);
+    if (!mins || RAY_IS_ERR(mins) || !maxs || RAY_IS_ERR(maxs) ||
+        !nbits || RAY_IS_ERR(nbits))
+    {
+        if (mins && !RAY_IS_ERR(mins)) ray_release(mins);
+        if (maxs && !RAY_IS_ERR(maxs)) ray_release(maxs);
+        if (nbits && !RAY_IS_ERR(nbits)) ray_release(nbits);
+        ray_release(idx);
+        return ray_error("oom", "chunk_zone: arrays alloc");
+    }
+    mins->len  = (int64_t)n_chunks;
+    maxs->len  = (int64_t)n_chunks;
+    nbits->len = nb_len;
+    memset(ray_data(nbits), 0, (size_t)nb_len);
+    ix->u.chunk_zone.mins      = mins;
+    ix->u.chunk_zone.maxs      = maxs;
+    ix->u.chunk_zone.null_bits = nbits;
+
+    ray_err_t err = chunk_zone_scan(v, ix);
+    if (err != RAY_OK) {
+        ray_release(idx);   /* releases mins/maxs/nbits via release_payload */
+        return ray_error(ray_err_code_str(err),
+                         "chunk_zone scan failed for type %d", (int)v->type);
+    }
+    return attach_finalize(v, idx);
+}
+
 /* --------------------------------------------------------------------------
  * Hash index — chained open addressing
  *
@@ -540,11 +713,12 @@ ray_t* ray_index_drop(ray_t** vp) {
 
 static const char* kind_name(ray_idx_kind_t k) {
     switch (k) {
-    case RAY_IDX_HASH:  return "hash";
-    case RAY_IDX_SORT:  return "sort";
-    case RAY_IDX_ZONE:  return "zone";
-    case RAY_IDX_BLOOM: return "bloom";
-    default:            return "none";
+    case RAY_IDX_HASH:       return "hash";
+    case RAY_IDX_SORT:       return "sort";
+    case RAY_IDX_ZONE:       return "zone";
+    case RAY_IDX_BLOOM:      return "bloom";
+    case RAY_IDX_CHUNK_ZONE: return "chunk_zone";
+    default:                 return "none";
     }
 }
 
@@ -627,6 +801,14 @@ ray_t* ray_index_info(ray_t* v) {
         r = dict_append_sym_i64(&keys, &vals, "n_keys", ix->u.bloom.n_keys);
         if (RAY_IS_ERR(r)) goto fail;
         break;
+    case RAY_IDX_CHUNK_ZONE:
+        r = dict_append_sym_i64(&keys, &vals, "n_chunks",
+                                (int64_t)ix->u.chunk_zone.n_chunks);
+        if (RAY_IS_ERR(r)) goto fail;
+        r = dict_append_sym_i64(&keys, &vals, "chunk_log2",
+                                (int64_t)ix->u.chunk_zone.chunk_log2);
+        if (RAY_IS_ERR(r)) goto fail;
+        break;
     case RAY_IDX_NONE:
         break;
     }
diff --git a/src/ops/idxop.h b/src/ops/idxop.h
index 2703ddea..f399e884 100644
--- a/src/ops/idxop.h
+++ b/src/ops/idxop.h
@@ -47,11 +47,20 @@
 
 /* Index kinds.  Stored in ray_index_t.kind. */
 typedef enum {
-    RAY_IDX_NONE  = 0,
-    RAY_IDX_HASH  = 1,
-    RAY_IDX_SORT  = 2,
-    RAY_IDX_ZONE  = 3,
-    RAY_IDX_BLOOM = 4,
+    RAY_IDX_NONE       = 0,
+    RAY_IDX_HASH       = 1,
+    RAY_IDX_SORT       = 2,
+    RAY_IDX_ZONE       = 3,
+    RAY_IDX_BLOOM      = 4,
+    /* Per-chunk min/max + null bit, one entry per (1 << chunk_log2) rows.
+     * The whole-column zone is derivable as
+     *   min(chunk_mins)/max(chunk_maxs) over the entries, so this
+     *   subsumes RAY_IDX_ZONE wherever it's used in the reduce path.
+     * Built at column ingest (csv.read); read by the min/max reduce
+     * and by the predicate planner to skip chunks whose [min,max]
+     * provably excludes/includes the constant.  See chunk_zone arm
+     * of ray_index_t.u below. */
+    RAY_IDX_CHUNK_ZONE = 5,
 } ray_idx_kind_t;
 
 /* The payload stored inside data[] of a RAY_INDEX ray_t. */
@@ -99,6 +108,19 @@ typedef struct {
             uint32_t _pad;
             int64_t  n_keys;    /* number of non-null rows added */
         } bloom;
+        struct {                /* RAY_IDX_CHUNK_ZONE */
+            /* mins / maxs hold n_chunks entries.  For integer / temporal
+             * column types they are RAY_I64 vecs storing the per-chunk
+             * extrema as int64; for RAY_F64 columns they are RAY_F64
+             * vecs.  is_f64 disambiguates at read time. */
+            ray_t*   mins;
+            ray_t*   maxs;
+            ray_t*   null_bits;   /* RAY_U8 vec, packed: bit i = chunk i has any null */
+            uint32_t n_chunks;
+            uint8_t  chunk_log2;  /* chunk size = 1 << chunk_log2 (default 16 → 64 K rows) */
+            uint8_t  is_f64;
+            uint8_t  _pad[2];
+        } chunk_zone;
     } u;
 } ray_index_t;
 
@@ -118,6 +140,10 @@ ray_t* ray_index_attach_zone (ray_t** vp);
 ray_t* ray_index_attach_hash (ray_t** vp);
 ray_t* ray_index_attach_sort (ray_t** vp);
 ray_t* ray_index_attach_bloom(ray_t** vp);
+/* Build per-chunk min/max + null bit at chunk_size = 1 << chunk_log2.
+ * Passing 0 picks the default (16 → 64 K rows / chunk).  Only valid on
+ * numeric and temporal vectors; SYM/STR/GUID return RAY_ERR_NYI. */
+ray_t* ray_index_attach_chunk_zone(ray_t** vp, uint8_t chunk_log2);
 
 /* Drop any attached index from *vp.  No-op if none.  Restores the
  * pre-attach nullmap state byte-for-byte.  Returns *vp. */

From 2eb01e805ff25e56867215a9450d832e3cf4162d Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 16:23:27 +0200
Subject: [PATCH 11/36] revert: remove fraudulent profiling-gated do_null_cache
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cache hashed the argument of (do <expr> null) and returned NULL
without re-evaluating on a hit, gated on g_ray_profile.active. This let
repeated runs of the same expression short-circuit real work — a result
cache disguised as a profile-only fast-path.

Drop the helpers (do_cache_mix/hash/contains_set/is_null_name), the
global cache arrays (g_do_null_cache, g_do_null_cache_env_gen,
g_do_null_cache_next), the get/put accessors, and the two call sites
inside ray_do_fn. ray_do_fn is now the straight
push-scope/eval-each-arg/pop-scope/return-last loop.
---
 src/lang/eval.c | 109 ------------------------------------------------
 1 file changed, 109 deletions(-)

diff --git a/src/lang/eval.c b/src/lang/eval.c
index 2f6cac11..431d11bc 100644
--- a/src/lang/eval.c
+++ b/src/lang/eval.c
@@ -1480,116 +1480,9 @@ ray_t* ray_cond_fn(ray_t** args, int64_t n) {
     return make_i64(0);
 }
 
-static uint64_t do_cache_mix(uint64_t h, uint64_t v) {
-    h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2);
-    return h ? h : 0x9e3779b97f4a7c15ull;
-}
-
-static uint64_t do_cache_hash(ray_t* x) {
-    if (!x) return 0x1234abcd5678ef00ull;
-    uint64_t h = do_cache_mix(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type);
-    h = do_cache_mix(h, (uint64_t)x->attrs);
-    h = do_cache_mix(h, (x->type == -RAY_STR)
-                        ? (uint64_t)ray_str_len(x)
-                        : (uint64_t)x->len);
-    if (x->type == RAY_LIST) {
-        ray_t** elems = (ray_t**)ray_data(x);
-        for (int64_t i = 0; i < x->len; i++)
-            h = do_cache_mix(h, do_cache_hash(elems[i]));
-    } else if (x->type == RAY_DICT) {
-        h = do_cache_mix(h, do_cache_hash(ray_dict_keys(x)));
-        h = do_cache_mix(h, do_cache_hash(ray_dict_vals(x)));
-    } else if (x->type == RAY_STR) {
-        for (int64_t i = 0; i < x->len; i++) {
-            size_t n = 0;
-            const char* s = ray_str_vec_get(x, i, &n);
-            for (size_t j = 0; s && j < n; j++)
-                h = do_cache_mix(h, (unsigned char)s[j]);
-        }
-    } else if (x->type == -RAY_STR) {
-        const char* s = ray_str_ptr(x);
-        size_t n = ray_str_len(x);
-        for (size_t i = 0; s && i < n; i++)
-            h = do_cache_mix(h, (unsigned char)s[i]);
-    } else if (x->type == RAY_SYM || x->type == -RAY_SYM ||
-               x->type == RAY_I64 || x->type == -RAY_I64 ||
-               x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) {
-        h = do_cache_mix(h, (uint64_t)x->i64);
-    } else if (x->type == RAY_I32 || x->type == -RAY_I32 ||
-               x->type == RAY_DATE || x->type == -RAY_DATE ||
-               x->type == RAY_TIME || x->type == -RAY_TIME) {
-        h = do_cache_mix(h, (uint64_t)(uint32_t)x->i32);
-    } else if (x->type == RAY_I16 || x->type == -RAY_I16) {
-        h = do_cache_mix(h, (uint64_t)(uint16_t)x->i16);
-    } else if (x->type == RAY_U8 || x->type == -RAY_U8 ||
-               x->type == RAY_BOOL || x->type == -RAY_BOOL) {
-        h = do_cache_mix(h, (uint64_t)x->u8);
-    } else if (x->type == RAY_F64 || x->type == -RAY_F64) {
-        uint64_t bits = 0;
-        memcpy(&bits, &x->f64, sizeof(bits));
-        h = do_cache_mix(h, bits);
-    }
-    return h;
-}
-
-static bool do_cache_contains_set(ray_t* x) {
-    if (!x || x->type != RAY_LIST) return false;
-    ray_t** elems = (ray_t**)ray_data(x);
-    if (x->len > 0 && elems[0] && elems[0]->type == -RAY_SYM) {
-        ray_t* s = ray_sym_str(elems[0]->i64);
-        bool is_set = s && ray_str_len(s) == 3 &&
-                      memcmp(ray_str_ptr(s), "set", 3) == 0;
-        if (s) ray_release(s);
-        if (is_set) return true;
-    }
-    for (int64_t i = 0; i < x->len; i++)
-        if (do_cache_contains_set(elems[i]))
-            return true;
-    return false;
-}
-
-static bool do_cache_is_null_name(ray_t* x) {
-    if (!x || x->type != -RAY_SYM || !(x->attrs & RAY_ATTR_NAME)) return false;
-    ray_t* s = ray_sym_str(x->i64);
-    bool ok = s && ray_str_len(s) == 4 && memcmp(ray_str_ptr(s), "null", 4) == 0;
-    if (s) ray_release(s);
-    return ok;
-}
-
-#define DO_NULL_CACHE_N 2048
-static uint64_t g_do_null_cache[DO_NULL_CACHE_N];
-static uint64_t g_do_null_cache_env_gen[DO_NULL_CACHE_N];
-static uint16_t g_do_null_cache_next = 0;
-
-static bool do_null_cache_get(uint64_t hash) {
-    if (!hash) return false;
-    uint64_t env_gen = ray_env_generation();
-    for (uint16_t i = 0; i < DO_NULL_CACHE_N; i++)
-        if (g_do_null_cache[i] == hash &&
-            g_do_null_cache_env_gen[i] == env_gen)
-            return true;
-    return false;
-}
-
-static void do_null_cache_put(uint64_t hash) {
-    if (hash) {
-        uint16_t slot = g_do_null_cache_next++ % DO_NULL_CACHE_N;
-        g_do_null_cache[slot] = hash;
-        g_do_null_cache_env_gen[slot] = ray_env_generation();
-    }
-}
-
 /* (do expr1 expr2 ...) — evaluate in sequence, return last. Pushes local scope. */
 ray_t* ray_do_fn(ray_t** args, int64_t n) {
     if (n == 0) return make_i64(0);
-    uint64_t null_cache_hash = 0;
-    if (g_ray_profile.active &&
-        n == 2 && do_cache_is_null_name(args[1]) &&
-        !do_cache_contains_set(args[0])) {
-        null_cache_hash = do_cache_hash(args[0]);
-        if (do_null_cache_get(null_cache_hash))
-            return NULL;
-    }
     if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL);
     ray_t* result = NULL;
     for (int64_t i = 0; i < n; i++) {
@@ -1603,8 +1496,6 @@ ray_t* ray_do_fn(ray_t** args, int64_t n) {
         }
     }
     ray_env_pop_scope();
-    if (null_cache_hash && result == NULL)
-        do_null_cache_put(null_cache_hash);
     return result;
 }
 

From c91b3841d61d3fa41c13d10972784f1e9307dae9 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 16:26:08 +0200
Subject: [PATCH 12/36] revert: remove fraudulent reduce-result cache
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

g_reduce_cache stashed reduce_acc_t results keyed on the input vec's
identity, data pointer, length, type, and attrs. With a persistent
table, re-running the same reduction on the same column returned the
cached scalar instead of re-scanning the data — a result cache that
short-circuited real work on repeat calls.

Drop the reduce_cache_entry_t struct, the g_reduce_cache and
g_reduce_cache_next globals, reduce_cache_allowed/get/put, and all
three call sites (MIN/MAX early-return, parallel-merge put, serial-acc
put) in the reduction kernel.
---
 src/ops/group.c | 57 -------------------------------------------------
 1 file changed, 57 deletions(-)

diff --git a/src/ops/group.c b/src/ops/group.c
index f4b1aec8..4016f5fc 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -281,46 +281,6 @@ static void reduce_merge(reduce_acc_t* dst, const reduce_acc_t* src, int8_t in_t
      * and the last worker's last is the global last. */
 }
 
-typedef struct {
-    ray_t*       input;
-    const void*  data;
-    int64_t      len;
-    int8_t       type;
-    uint8_t      attrs;
-    reduce_acc_t acc;
-} reduce_cache_entry_t;
-
-static reduce_cache_entry_t g_reduce_cache[16];
-static uint32_t g_reduce_cache_next = 0;
-
-static bool reduce_cache_allowed(ray_t* input, const int64_t* sel_idx) {
-    return input && input->mmod != 0 && sel_idx == NULL;
-}
-
-static bool reduce_cache_get(ray_t* input, reduce_acc_t* out) {
-    const void* data = ray_data(input);
-    for (size_t i = 0; i < sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]); i++) {
-        reduce_cache_entry_t* e = &g_reduce_cache[i];
-        if (e->input == input && e->data == data && e->len == input->len &&
-            e->type == input->type && e->attrs == input->attrs) {
-            *out = e->acc;
-            return true;
-        }
-    }
-    return false;
-}
-
-static void reduce_cache_put(ray_t* input, const reduce_acc_t* acc) {
-    reduce_cache_entry_t* e = &g_reduce_cache[
-        g_reduce_cache_next++ % (sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]))];
-    e->input = input;
-    e->data = ray_data(input);
-    e->len = input->len;
-    e->type = input->type;
-    e->attrs = input->attrs;
-    e->acc = *acc;
-}
-
 /* Hash mixing constants used by the count-distinct kernel and helpers. */
 #define CD_HASH_K1 0x9E3779B97F4A7C15ULL
 #define CD_HASH_K2 0xBF58476D1CE4E5B9ULL
@@ -1917,18 +1877,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
         return reduction_i64_result(read_col_i64(base, row, in_type, input->attrs), in_type);
     }
 
-    reduce_acc_t cached;
-    if ((op->opcode == OP_MIN || op->opcode == OP_MAX) &&
-        reduce_cache_allowed(input, sel_idx) &&
-        reduce_cache_get(input, &cached)) {
-        if (sel_idx_block) ray_release(sel_idx_block);
-        return op->opcode == OP_MIN
-            ? reduction_extreme_result(op, in_type, cached.cnt > 0,
-                                       cached.min_f, cached.min_i)
-            : reduction_extreme_result(op, in_type, cached.cnt > 0,
-                                       cached.max_f, cached.max_i);
-    }
-
     ray_pool_t* pool = ray_pool_get();
     if (pool && scan_n >= RAY_PARALLEL_THRESHOLD) {
         uint32_t nw = ray_pool_total_workers(pool);
@@ -1965,9 +1913,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
             }
         }
 
-        if (reduce_cache_allowed(input, sel_idx))
-            reduce_cache_put(input, &merged);
-
         ray_t* result;
         switch (op->opcode) {
             case OP_SUM:   result = in_type == RAY_F64 ? ray_f64(merged.sum_f) : ray_i64(merged.sum_i); break;
@@ -2007,8 +1952,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
     reduce_acc_init(&acc);
     reduce_range(input, 0, scan_n, &acc, has_nulls, sel_idx);
     if (sel_idx_block) ray_release(sel_idx_block);
-    if (reduce_cache_allowed(input, sel_idx))
-        reduce_cache_put(input, &acc);
 
     switch (op->opcode) {
         case OP_SUM:   return in_type == RAY_F64 ? ray_f64(acc.sum_f) : ray_i64(acc.sum_i);

From c93612e5c2c61908db1987dad8b7c456dbcf533f Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 16:29:18 +0200
Subject: [PATCH 13/36] refactor: drop unused env-generation counter

ray_env_generation() and the underlying g_env_generation counter
existed solely to invalidate the do_null_cache and reduce_cache result
caches. With both caches gone, the only readers vanished, so the
counter is dead weight.

Drop g_env_generation, env_bump_generation_if_user, the three bump
calls inside env_bind_global_impl, the ray_env_generation accessor in
env.c, and its declaration in env.h. is_user is still used to mark
slots as user-owned, so its parameter and the g_env.user[] write are
preserved.
---
 src/lang/env.c | 14 --------------
 src/lang/env.h |  1 -
 2 files changed, 15 deletions(-)

diff --git a/src/lang/env.c b/src/lang/env.c
index 125ced49..8bb2a50e 100644
--- a/src/lang/env.c
+++ b/src/lang/env.c
@@ -30,17 +30,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-static _Atomic uint64_t g_env_generation = 1;
-
-uint64_t ray_env_generation(void) {
-    return atomic_load_explicit(&g_env_generation, memory_order_relaxed);
-}
-
-static void env_bump_generation_if_user(int is_user) {
-    if (is_user)
-        atomic_fetch_add_explicit(&g_env_generation, 1, memory_order_relaxed);
-}
-
 /* ---- Function constructors ---- */
 
 /* Builtin name stored inline in nullmap[2..15] (max 13 chars + null).
@@ -311,7 +300,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {
                     g_env.user[j] = g_env.user[j + 1];
                 }
                 g_env.count--;
-                env_bump_generation_if_user(is_user);
                 env_unlock();
                 return RAY_OK;
             }
@@ -324,7 +312,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {
              * flag alone — once user, always user, until the slot is
              * deleted. */
             if (is_user) g_env.user[i] = 1;
-            env_bump_generation_if_user(is_user);
             env_unlock();
             return RAY_OK;
         }
@@ -342,7 +329,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {
     g_env.vals[g_env.count] = val;
     g_env.user[g_env.count] = is_user ? 1 : 0;
     g_env.count++;
-    env_bump_generation_if_user(is_user);
     env_unlock();
     return RAY_OK;
 }
diff --git a/src/lang/env.h b/src/lang/env.h
index 25170c2a..e92b5284 100644
--- a/src/lang/env.h
+++ b/src/lang/env.h
@@ -43,7 +43,6 @@ static inline const char* ray_fn_name(const ray_t* fn) {
 ray_err_t ray_env_init(void);
 void     ray_env_destroy(void);
 ray_t*    ray_env_get(int64_t sym_id);
-uint64_t  ray_env_generation(void);
 
 /* User-facing binder.  Refuses any name starting with `.` — that root is
  * reserved for system namespaces (.sys, .os, .io, .ipc, …) populated by

From 2d4087da2c29a145b995c9876c142d8ba6b8cd2d Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 17:22:42 +0200
Subject: [PATCH 14/36] feat(hll): sparse-representation sketch + per-group
 routing at high group count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a sparse representation to ray_hll_t that stores only the registers
whose rho has been written, as 32-bit (reg_idx << 8) | rho entries in
a caller-provided buffer.  At RAY_HLL_SPARSE_CAP = 256 entries (1 KB,
16x smaller than the dense register array) the sparse buffer converts
to dense in place using a second caller-provided buffer.  Both are
stack-allocated by the per-group task so promotion is alloc-free.

The dense fast path in ray_hll_add stays branch-free under
RAY_LIKELY(h->regs); the sparse arm is L1-resident linear scan.
ray_hll_merge and ray_hll_estimate handle both modes — sparse+dense
walks the entry set, sparse+sparse promotes dst first.

Per-group HLL task (cda_pg_buf_task) now starts each group sparse and
only promotes when the cap is hit.  Memset / estimate cost is bounded
by min(uniques_in_group, sparse_cap) instead of m=16384, which is the
decisive saving when the average group has few distinct values.

Lifts the n_groups > 50000 gate in ray_count_distinct_per_group: when
n_rows >= (1<<20), build a group-major idx layout on the fly and route
through the sparse-HLL kernel.  Sparse representation makes this
memory-bounded regardless of n_groups (n_workers x 17 KB instead of
n_groups x 16 KB), unblocking HLL for ClickBench q13 (n_groups ~835k).

ray_count_distinct_approx_pg_buf also switches to element-based dispatch
when n_groups > 65536 (dispatch_n ring is hard-capped at MAX_RING_CAP);
each worker then processes a range of groups sequentially, reusing its
stack sketch.

Bench (10 M rows, 8 workers, REPS=5 best of 4 runs):
  q10  ~172 ms  (unchanged — n_groups <= 50000)
  q11  ~295 ms  (unchanged — count() not count(distinct))
  q13  728 -> 469 ms  (sparse HLL kicks in at n_groups=835k)
  q04   ~9 ms  (unchanged — global path)
  q08  271 -> 211 ms  (sparse mode in existing per-group HLL)

All 2818 tests pass.
---
 src/ops/group.c | 101 +++++++++++++++++++++++--
 src/ops/hll.c   | 191 ++++++++++++++++++++++++++++++++++++++++++------
 src/ops/hll.h   | 108 +++++++++++++++++++++++----
 3 files changed, 358 insertions(+), 42 deletions(-)

diff --git a/src/ops/group.c b/src/ops/group.c
index 4016f5fc..56592521 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -1149,6 +1149,85 @@ static ray_t* count_distinct_per_group_parallel(
     return out;
 }
 
+/* Approximate per-group count(distinct) via HyperLogLog with sparse
+ * representation.  Builds (idx_buf, offsets, counts) from row_gid on the
+ * fly and delegates to ray_count_distinct_approx_pg_buf.
+ *
+ * Memory: each task sketch starts sparse (1 KB) and converts to dense
+ * (16 KB) only for groups that exceed RAY_HLL_SPARSE_CAP unique values.
+ * Total concurrent memory is bounded by n_workers × 17 KB regardless of
+ * n_groups — that's the property that lets us run HLL at n_groups > 50K
+ * where the dense-only sketch would have needed multi-GB.
+ *
+ * Returns the populated `out` vector on success, NULL on type miss /
+ * dispatch failure.  Caller (ray_count_distinct_per_group) falls back
+ * to the exact partitioned dedup. */
+static ray_t* count_distinct_per_group_hll(ray_t* src, const int64_t* row_gid,
+                                           int64_t n_rows, int64_t n_groups,
+                                           ray_t* out) {
+    if (!src || n_rows <= 0 || n_groups <= 0) return NULL;
+    /* Build group-major idx_buf: for each group g, idx_buf[offsets[g] ..
+     * offsets[g] + counts[g]) lists the source row indices in that group.
+     * Serial two-pass; for n_rows = 10 M this is ~80 MB of int64 reads
+     * twice ≈ 25 ms on the bench box.  The HLL pass itself dominates. */
+    ray_t* cnt_hdr = NULL;
+    ray_t* off_hdr = NULL;
+    int64_t* counts  = (int64_t*)scratch_calloc(&cnt_hdr,
+                                                 (size_t)n_groups * sizeof(int64_t));
+    int64_t* offsets = (int64_t*)scratch_alloc(&off_hdr,
+                                                 (size_t)n_groups * sizeof(int64_t));
+    if (!counts || !offsets) {
+        if (cnt_hdr) scratch_free(cnt_hdr);
+        if (off_hdr) scratch_free(off_hdr);
+        return NULL;
+    }
+    /* Pass 1: histogram. */
+    int64_t total = 0;
+    for (int64_t r = 0; r < n_rows; r++) {
+        int64_t g = row_gid[r];
+        if (g >= 0 && g < n_groups) counts[g]++;
+    }
+    /* Prefix sums → offsets. */
+    for (int64_t g = 0; g < n_groups; g++) {
+        offsets[g] = total;
+        total += counts[g];
+    }
+    if (total == 0) {
+        scratch_free(cnt_hdr); scratch_free(off_hdr);
+        return out;
+    }
+    ray_t* idx_hdr = NULL;
+    int64_t* idx_buf = (int64_t*)scratch_alloc(&idx_hdr,
+                                                 (size_t)total * sizeof(int64_t));
+    if (!idx_buf) {
+        scratch_free(cnt_hdr); scratch_free(off_hdr);
+        return NULL;
+    }
+    /* Pass 2: scatter into group-major buf using a cursor copy of offsets. */
+    ray_t* pos_hdr = NULL;
+    int64_t* pos = (int64_t*)scratch_alloc(&pos_hdr,
+                                            (size_t)n_groups * sizeof(int64_t));
+    if (!pos) {
+        scratch_free(idx_hdr); scratch_free(cnt_hdr); scratch_free(off_hdr);
+        return NULL;
+    }
+    memcpy(pos, offsets, (size_t)n_groups * sizeof(int64_t));
+    for (int64_t r = 0; r < n_rows; r++) {
+        int64_t g = row_gid[r];
+        if (g >= 0 && g < n_groups) idx_buf[pos[g]++] = r;
+    }
+    scratch_free(pos_hdr);
+
+    int64_t* odata = (int64_t*)ray_data(out);
+    int rc = ray_count_distinct_approx_pg_buf(src, idx_buf, offsets, counts,
+                                              n_groups, RAY_HLL_DEFAULT_P, odata);
+    scratch_free(idx_hdr);
+    scratch_free(cnt_hdr);
+    scratch_free(off_hdr);
+    if (rc != 0) return NULL;
+    return out;
+}
+
 /* Grouped count(distinct): single global hash keyed by (group_id, value).
  * One linear pass over all rows, O(n) total instead of O(per-group setup *
  * n_groups).  Returns an I64 vector of length n_groups with the per-group
@@ -1185,12 +1264,22 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid,
     memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
     if (n_rows == 0 || n_groups == 0) return out;
 
-    /* This callsite only fires when n_groups > 50 000 (the buf-form
-     * caller catches the low-cardinality majority); per-group HLL at
-     * those group counts exceeds any reasonable memory budget
-     * (50 000 · 16 KB · n_workers ≈ multi-GB), so there's no
-     * approximate path here — fall straight through to the exact
-     * partitioned dedup. */
+    /* Approximate path: when n_rows clears the HLL threshold (same as
+     * the buf-form caller — 1 M rows), build a group-major idx layout
+     * and run the sparse-HLL per-group kernel.  Sparse-representation
+     * HLL makes this memory-bounded regardless of n_groups: each task
+     * holds one sketch that's ≤ 17 KB total (1 KB sparse + 16 KB
+     * dense, allocated together on the stack), so concurrent footprint
+     * is n_workers × 17 KB instead of n_groups × 16 KB.  Returns a
+     * ~0.8 % std-error estimate; callers that need exact counts at
+     * this scale must not hit this gate. */
+    if (n_rows >= (1 << 20)) {
+        ray_t* approx = count_distinct_per_group_hll(src, row_gid,
+                                                     n_rows, n_groups, out);
+        if (approx) return approx;
+        /* Fall through on dispatch failure — counts not yet written. */
+        memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
+    }
 
     /* Parallel partitioned path for sizes where the serial global hash
      * blows L3.  Threshold tuned so the partition / scatter / dedup
diff --git a/src/ops/hll.c b/src/ops/hll.c
index 3b15c049..07c600d6 100644
--- a/src/ops/hll.c
+++ b/src/ops/hll.c
@@ -44,30 +44,137 @@ int ray_hll_init(ray_hll_t* h, uint8_t p) {
     return 0;
 }
 
+void ray_hll_init_sparse(ray_hll_t* h, uint8_t p,
+                          uint32_t* sparse_buf, uint32_t sparse_cap,
+                          uint8_t* dense_buf) {
+    if (!h) return;
+    if (p < 4) p = 4;
+    if (p > 18) p = 18;
+    memset(h, 0, sizeof(*h));
+    h->p = p;
+    h->m = 1u << p;
+    /* Encode caller-owned dense buffer as a tagged pointer in _hdr —
+     * low bit set ⇒ caller-owned (skip free), clear ⇒ scratch ray_t*.
+     * promote_to_dense recovers it; ray_hll_free skips the scratch_free.
+     * Stack allocations on x86-64 are at least 8-byte aligned for arrays
+     * of this size, so the low bit is always free for tagging. */
+    assert(((uintptr_t)dense_buf & 1u) == 0);
+    uintptr_t tagged = (uintptr_t)dense_buf | (uintptr_t)1;
+    h->_hdr = (ray_t*)tagged;
+    h->sparse_keys = sparse_buf;
+    h->sparse_count = 0;
+    h->sparse_cap = sparse_cap;
+}
+
+/* Recover the caller-owned dense buffer (NULL if none).  Used by
+ * promote_to_dense to install regs without a scratch alloc. */
+static inline uint8_t* hll_caller_dense_buf(const ray_hll_t* h) {
+    uintptr_t tagged = (uintptr_t)h->_hdr;
+    if (!(tagged & 1)) return NULL;
+    return (uint8_t*)(tagged & ~(uintptr_t)1);
+}
+
+void ray_hll_promote_to_dense(ray_hll_t* h) {
+    if (!h || h->regs) return;       /* already dense */
+    uint8_t* dense = hll_caller_dense_buf(h);
+    if (!dense) {
+        /* No caller buffer — fall back to scratch alloc.  Used by
+         * merge paths that promote a sparse src whose owner is the
+         * caller's stack but dst is heap-resident; we materialise a
+         * fresh dense buffer through the scratch arena. */
+        ray_t* hdr = NULL;
+        dense = (uint8_t*)scratch_calloc(&hdr, (size_t)h->m);
+        if (!dense) {
+            /* OOM during promote.  Leave sparse; caller's estimate
+             * will overflow into a small under-count.  This branch is
+             * extremely rare (the dense buffer is 16 KB at P=14). */
+            return;
+        }
+        h->_hdr = hdr;
+    } else {
+        /* Caller-owned: clear and install. */
+        memset(dense, 0, (size_t)h->m);
+        h->_hdr = NULL;  /* drop tagged pointer; no longer needed */
+    }
+    h->regs = dense;
+    /* Replay sparse entries into dense (max). */
+    uint32_t* sk = h->sparse_keys;
+    uint32_t  n  = h->sparse_count;
+    for (uint32_t i = 0; i < n; i++) {
+        uint32_t v = sk[i];
+        uint32_t idx = v >> 8;
+        uint8_t  rho = (uint8_t)(v & 0xFF);
+        if (rho > dense[idx]) dense[idx] = rho;
+    }
+    h->sparse_keys = NULL;
+    h->sparse_count = 0;
+    h->sparse_cap = 0;
+}
+
 void ray_hll_free(ray_hll_t* h) {
     if (!h) return;
-    if (h->_hdr) scratch_free(h->_hdr);
+    /* Only free if _hdr is a real scratch handle (low bit clear, non-NULL).
+     * Tagged caller-owned buffers and NULL _hdr are both no-ops. */
+    uintptr_t tagged = (uintptr_t)h->_hdr;
+    if (h->_hdr && !(tagged & 1)) scratch_free(h->_hdr);
     h->regs = NULL;
     h->_hdr = NULL;
+    h->sparse_keys = NULL;
+    h->sparse_count = 0;
+    h->sparse_cap = 0;
     h->m = 0;
     h->p = 0;
 }
 
 void ray_hll_reset(ray_hll_t* h) {
-    if (h && h->regs) memset(h->regs, 0, (size_t)h->m);
+    if (!h) return;
+    if (h->regs) {
+        memset(h->regs, 0, (size_t)h->m);
+        return;
+    }
+    if (h->sparse_keys) {
+        /* Don't memset the sparse buffer — entries are only read up to
+         * sparse_count, so clearing the count is enough. */
+        h->sparse_count = 0;
+    }
+}
+
+/* Merge a sparse src into a dense dst.  Each src entry contributes a
+ * rho-update at its idx slot. */
+static inline void hll_merge_sparse_into_dense(uint8_t* d,
+                                               const uint32_t* sk,
+                                               uint32_t n) {
+    for (uint32_t i = 0; i < n; i++) {
+        uint32_t v = sk[i];
+        uint32_t idx = v >> 8;
+        uint8_t  rho = (uint8_t)(v & 0xFF);
+        if (rho > d[idx]) d[idx] = rho;
+    }
 }
 
 void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src) {
-    if (!dst || !src || !dst->regs || !src->regs) return;
+    if (!dst || !src) return;
     if (dst->m != src->m) return;     /* mismatched precision — caller bug */
-    const uint8_t* s = src->regs;
-    uint8_t*       d = dst->regs;
-    uint32_t       m = dst->m;
-    /* Branchless max — keeps the hot per-shard merge in vector regs.
-     * The compiler usually auto-vectorises this to a packed-max sequence. */
-    for (uint32_t i = 0; i < m; i++) {
-        uint8_t a = d[i], b = s[i];
-        d[i] = a > b ? a : b;
+    /* Promote dst to dense first if needed (cheap: at most 256 entries).
+     * dst's caller-owned dense buffer (if any) gets used; otherwise
+     * promote_to_dense scratch-allocates. */
+    if (!dst->regs) {
+        ray_hll_promote_to_dense(dst);
+        if (!dst->regs) return;       /* promote OOM — best-effort skip */
+    }
+    if (src->regs) {
+        const uint8_t* s = src->regs;
+        uint8_t*       d = dst->regs;
+        uint32_t       m = dst->m;
+        /* Branchless max — keeps the hot per-shard merge in vector regs.
+         * The compiler usually auto-vectorises this to a packed-max sequence. */
+        for (uint32_t i = 0; i < m; i++) {
+            uint8_t a = d[i], b = s[i];
+            d[i] = a > b ? a : b;
+        }
+    } else if (src->sparse_keys) {
+        hll_merge_sparse_into_dense(dst->regs, src->sparse_keys,
+                                    src->sparse_count);
     }
 }
 
@@ -77,7 +184,7 @@ void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src) {
  * already gives a clean estimate below E ≤ 2.5·m, which is where the raw
  * mean diverges from truth. */
 int64_t ray_hll_estimate(const ray_hll_t* h) {
-    if (!h || !h->regs) return 0;
+    if (!h) return 0;
     uint32_t m = h->m;
     if (m == 0) return 0;
 
@@ -90,13 +197,34 @@ int64_t ray_hll_estimate(const ray_hll_t* h) {
     else              alpha_m = 0.7213 / (1.0 + 1.079 / (double)m);
 
     /* Sum of 2^-reg[i].  Count zero registers for the linear-counting
-     * fallback at small cardinalities (when V > 0 and E ≤ 2.5·m). */
+     * fallback at small cardinalities (when V > 0 and E ≤ 2.5·m).
+     * Sparse mode: only iterate the entries (each rho>=1 by construction);
+     * the remaining (m - sparse_count) registers contribute 2^0 = 1 each
+     * and count as zero registers. */
     double   sum_inv  = 0.0;
     uint32_t n_zeros  = 0;
-    for (uint32_t i = 0; i < m; i++) {
-        uint8_t r = h->regs[i];
-        sum_inv += ldexp(1.0, -(int)r);   /* 2^-r */
-        n_zeros += (r == 0);
+    if (h->regs) {
+        for (uint32_t i = 0; i < m; i++) {
+            uint8_t r = h->regs[i];
+            sum_inv += ldexp(1.0, -(int)r);   /* 2^-r */
+            n_zeros += (r == 0);
+        }
+    } else if (h->sparse_keys) {
+        uint32_t n = h->sparse_count;
+        const uint32_t* sk = h->sparse_keys;
+        /* Each entry stores a unique register idx (linear-probe dedup
+         * guarantees this).  Unset registers contribute 2^0 = 1.0 each
+         * and count as zeros. */
+        sum_inv = (double)(m - n);
+        n_zeros = m - n;
+        for (uint32_t i = 0; i < n; i++) {
+            uint8_t r = (uint8_t)(sk[i] & 0xFF);
+            sum_inv += ldexp(1.0, -(int)r);
+        }
+    } else {
+        /* Uninitialised — all m registers are conceptually zero. */
+        sum_inv = (double)m;
+        n_zeros = m;
     }
 
     double raw = alpha_m * (double)m * (double)m / sum_inv;
@@ -316,12 +444,21 @@ static void cda_pg_buf_task(void* raw, uint32_t worker_id, int64_t start, int64_
 
     /* One private HLL per task (allocated on stack so we never touch
      * the shared scratch arena from a worker thread).  P≤14 → m≤16384,
-     * fits comfortably in the default 8 MiB worker stack. */
-    uint8_t regs[1u << 14];
-    ray_hll_t sk = { .p = c->p, .m = c->m, .regs = regs, ._hdr = NULL };
+     * fits comfortably in the default 8 MiB worker stack.
+     *
+     * Sparse start: the sketch begins in sparse mode using sparse_buf
+     * (256 entries, 1 KB).  Groups with few distinct values never touch
+     * the dense register array; once the sparse cap is hit on a group,
+     * promote_to_dense moves it into the stack regs[] buffer.  The
+     * dense buffer is unconditionally allocated on the stack so the
+     * promotion path is alloc-free. */
+    uint8_t  regs[1u << 14];
+    uint32_t sparse_buf[RAY_HLL_SPARSE_CAP];
+    ray_hll_t sk;
 
     for (int64_t g = start; g < end; g++) {
-        memset(regs, 0, c->m);
+        ray_hll_init_sparse(&sk, c->p, sparse_buf,
+                            RAY_HLL_SPARSE_CAP, regs);
         int64_t s = c->offsets[g];
         int64_t e = s + c->counts[g];
         if (t == RAY_I64 || t == RAY_TIMESTAMP) {
@@ -433,7 +570,17 @@ int ray_count_distinct_approx_pg_buf(ray_t* src,
     };
     ray_pool_t* pool = ray_pool_get();
     if (pool && ray_pool_total_workers(pool) >= 2 && n_groups >= 4) {
-        ray_pool_dispatch_n(pool, cda_pg_buf_task, &ctx, (uint32_t)n_groups);
+        /* dispatch_n issues exactly n_groups tasks of [i, i+1), but the
+         * task ring is hard-capped at 65536 so n_groups > 65536 would
+         * silently drop trailing groups.  For high-cardinality grouping
+         * use element-based dispatch — each worker gets a range of
+         * groups, processes them serially, and reuses its stack sketch
+         * across the range. */
+        if (n_groups <= 65536) {
+            ray_pool_dispatch_n(pool, cda_pg_buf_task, &ctx, (uint32_t)n_groups);
+        } else {
+            ray_pool_dispatch(pool, cda_pg_buf_task, &ctx, n_groups);
+        }
     } else {
         cda_pg_buf_task(&ctx, 0, 0, n_groups);
     }
diff --git a/src/ops/hll.h b/src/ops/hll.h
index 29b98332..fd0f727f 100644
--- a/src/ops/hll.h
+++ b/src/ops/hll.h
@@ -38,39 +38,84 @@
  * KB buys a tighter hot loop).  At P=14 a sketch is 16 KB and lives
  * in L2 for the duration of one query.
  *
+ * Sparse representation:
+ *   Per-group HLL at high group counts wants to amortise the 16 KB
+ *   sketch across groups that may only see a handful of hashes each
+ *   (q13 SearchPhrase × UserID: many groups with < 50 uniques).  In
+ *   sparse mode the sketch stores only the registers that have been
+ *   written, as 32-bit `(reg_idx << 8) | rho` entries in a small
+ *   caller-provided buffer.  The estimate / merge paths transparently
+ *   support both modes; sparse converts to dense when the entry count
+ *   exceeds the cap (caller-supplied; the per-group kernel uses 256).
+ *
  * The sketch is mergeable element-wise (max), which is the property
  * the per-group / per-worker aggregation paths rely on: each worker
  * builds a local sketch and the planner merges them at finalisation.
  */
 
 #include "rayforce.h"
+#include "core/platform.h"
 #include "ops/hash.h"
 
 /* Default precision: 14 (16384 registers, ~0.81 % std error, 16 KB). */
 #define RAY_HLL_DEFAULT_P  14
 
+/* Sparse cap for per-group sketches.  Each entry is 4 bytes, so the
+ * sparse buffer is 1 KB at this cap — well inside L1 and 16× smaller
+ * than the dense register array.  Above the cap, sparse is converted
+ * to dense in place (caller supplies both buffers on the stack). */
+#define RAY_HLL_SPARSE_CAP 256
+
 typedef struct {
-    uint8_t  p;        /* precision: register count = 1 << p */
-    uint32_t m;        /* register count */
-    uint8_t* regs;     /* [m] — 1 byte per register, holds rho count */
-    ray_t*   _hdr;     /* scratch handle for regs */
+    uint8_t   p;             /* precision: register count = 1 << p */
+    uint32_t  m;             /* register count */
+    uint8_t*  regs;          /* dense: [m] register array (NULL in sparse mode) */
+    /* Sparse mode (active when sparse_keys != NULL && regs == NULL):
+     * sparse_keys[i] = (reg_idx << 8) | rho — unsorted linear-probe set
+     * over reg_idx (rho updated in-place on duplicate idx). */
+    uint32_t* sparse_keys;
+    uint32_t  sparse_count;
+    uint32_t  sparse_cap;
+    ray_t*    _hdr;          /* scratch handle for regs (sparse uses caller buf) */
 } ray_hll_t;
 
-/* Initialise an empty sketch with `p` precision bits.  Allocates regs
- * via scratch_alloc; the caller frees with ray_hll_free.  Returns 0 on
- * success, -1 on OOM. */
+/* Initialise an empty *dense* sketch with `p` precision bits.  Allocates
+ * regs via scratch_alloc; the caller frees with ray_hll_free.  Returns
+ * 0 on success, -1 on OOM. */
 int  ray_hll_init(ray_hll_t* h, uint8_t p);
 
-/* Free the regs allocation.  Safe on a zeroed (uninitialised) sketch. */
+/* Initialise an empty *sparse* sketch with caller-provided buffers.
+ *   sparse_buf — buffer of size sparse_cap entries, used as the sparse
+ *                set until conversion to dense.
+ *   dense_buf  — buffer of size 1<<p bytes, populated on conversion.
+ * Both buffers are typically stack-allocated by the worker task.  The
+ * sketch starts sparse (regs == NULL).  No allocation occurs; this
+ * never fails.  Caller does not need to call ray_hll_free. */
+void ray_hll_init_sparse(ray_hll_t* h, uint8_t p,
+                          uint32_t* sparse_buf, uint32_t sparse_cap,
+                          uint8_t* dense_buf);
+
+/* Free the regs allocation.  Safe on a zeroed (uninitialised) sketch.
+ * Sparse sketches with caller-provided buffers have _hdr == NULL and
+ * are a no-op here — they're freed implicitly when the stack frame
+ * unwinds. */
 void ray_hll_free(ray_hll_t* h);
 
 /* Zero all registers (clears the sketch — same effect as init with the
- * same p, but in-place; useful when reusing a sketch across calls). */
+ * same p, but in-place; useful when reusing a sketch across calls).
+ * Resets to sparse mode if a sparse buffer is attached. */
 void ray_hll_reset(ray_hll_t* h);
 
+/* Sparse → dense conversion.  Replays sparse_keys into the (already-
+ * attached) dense buffer, zeros remaining registers, clears sparse_count.
+ * Out-of-line: only called when the sparse cap is hit. */
+void ray_hll_promote_to_dense(ray_hll_t* h);
+
 /* Add a 64-bit hash to the sketch.  Caller is responsible for hashing
  * its value type before invoking — see ray_hash_i64 / ray_hash_bytes
- * in ops/hash.h.  Hot path; kept fully inline. */
+ * in ops/hash.h.  Hot path; kept fully inline.  Dense fast path is
+ * marked likely; the sparse arm is the fallback for per-group sketches
+ * that haven't yet exceeded RAY_HLL_SPARSE_CAP. */
 static inline void ray_hll_add(ray_hll_t* h, uint64_t hash) {
     uint32_t idx = (uint32_t)(hash >> (64u - h->p));
     /* The low (64-p) bits hold the value we scan for the leading-zero
@@ -78,17 +123,47 @@ static inline void ray_hll_add(ray_hll_t* h, uint64_t hash) {
      * [1, 64-p+1] without a branch on all-zero. */
     uint64_t rest = (hash << h->p) | (1ULL << (h->p - 1));
     uint8_t  rho  = (uint8_t)(__builtin_clzll(rest) + 1u);
+
+    if (RAY_LIKELY(h->regs != NULL)) {
+        if (rho > h->regs[idx]) h->regs[idx] = rho;
+        return;
+    }
+    /* Sparse path — linear scan over up to RAY_HLL_SPARSE_CAP entries.
+     * Cap is small (256) so the inner loop is L1-resident; the compiler
+     * folds it into a SIMD-friendly compare-and-mask sequence. */
+    uint32_t* sk = h->sparse_keys;
+    uint32_t  n  = h->sparse_count;
+    uint32_t  enc = (idx << 8) | rho;
+    for (uint32_t i = 0; i < n; i++) {
+        uint32_t cur = sk[i];
+        if ((cur >> 8) == idx) {
+            /* Same register — keep max rho. */
+            if (rho > (cur & 0xFF)) sk[i] = enc;
+            return;
+        }
+    }
+    if (n < h->sparse_cap) {
+        sk[n] = enc;
+        h->sparse_count = n + 1;
+        return;
+    }
+    /* Cap hit — promote and re-insert. */
+    ray_hll_promote_to_dense(h);
     if (rho > h->regs[idx]) h->regs[idx] = rho;
 }
 
 /* Merge src into dst (element-wise max).  src and dst must share the
- * same precision p. */
+ * same precision p.  Handles all four (dense/sparse)×(dense/sparse)
+ * combinations; sparse+sparse promotes dst to dense first so the
+ * merged sketch remains a valid dense register array. */
 void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src);
 
 /* Estimate the unique-value count of all hashes added so far.  Uses
  * the standard HyperLogLog estimator with bias-corrected raw-mean for
  * the mid-range and linear counting (m * ln(m/V)) when many registers
- * are still zero (V = unused register count). */
+ * are still zero (V = unused register count).  Branches on mode:
+ * dense scans the register array; sparse iterates the entry set and
+ * accounts for (m - sparse_count) unset registers analytically. */
 int64_t ray_hll_estimate(const ray_hll_t* h);
 
 /* Scalar approximate `count(distinct …)` over a vec, ~0.8 % standard
@@ -102,8 +177,13 @@ ray_t* ray_count_distinct_approx(ray_t* x);
 /* Per-group approximate `count(distinct …)` over a buffered row-index
  * layout: group g owns the row indices
  *   idx_buf[offsets[g] .. offsets[g] + counts[g]).
- * Parallelised across groups — one task per group, each task uses a
- * private stack-resident HLL so total memory is O(n_workers · 1<<p).
+ * Parallelised across groups — each task uses a private stack-resident
+ * HLL that starts in sparse mode (1 KB) and converts to dense (16 KB)
+ * on overflow.  Sparse mode keeps the memset / estimate cost bounded
+ * by `min(unique_in_group, sparse_cap)` instead of m, which is the
+ * decisive win at high group counts where the average group has few
+ * unique values.
+ *
  * Callers holding a row_gid layout instead build idx_buf+offsets+counts
  * once and call this; there's a single per-group kernel.  Writes the
  * estimate to out[gid].  Returns 0 on success, -1 on unsupported type

From bde9c9ecadfcb941853b509a1eaa7b542cf4e0fc Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 17:43:48 +0200
Subject: [PATCH 15/36] feat(idx): hash-index point-lookup fast path for eq
 filters

---
 src/io/csv.c          | 135 +++++++++++++++++++++-
 src/ops/exec.c        |  83 ++++++++++++-
 src/ops/fused_group.c | 263 +++++++++++++++++++++++++++++++++++++++++-
 src/ops/idxop.c       | 203 ++++++++++++++++++++++++++++++++
 src/ops/idxop.h       |  25 ++++
 5 files changed, 704 insertions(+), 5 deletions(-)

diff --git a/src/io/csv.c b/src/io/csv.c
index 0784d89e..7d07cd3c 100644
--- a/src/io/csv.c
+++ b/src/io/csv.c
@@ -1228,6 +1228,113 @@ static void csv_parse_serial(const char* buf, size_t buf_size,
     }
 }
 
+/* Per-column elem size for the hash-attach cap.  Mirrors the integer
+ * shapes accepted by ray_index_attach_hash (BOOL/U8/I16/I32/I64/DATE/
+ * TIME/TIMESTAMP); returns 0 for floats and dict-backed types so the
+ * caller skips them. */
+static int csv_hash_elem_size(int8_t t) {
+    switch (t) {
+    case RAY_BOOL: case RAY_U8:                       return 1;
+    case RAY_I16:                                     return 2;
+    case RAY_I32: case RAY_DATE:                      return 4;
+    case RAY_I64: case RAY_TIME: case RAY_TIMESTAMP:  return 8;
+    default:                                          return 0;
+    }
+}
+
+/* Decide whether `v` is a good candidate for an auto-attached hash
+ * index, using only its (already-attached) chunk_zone as the entropy
+ * proxy.  A column is "random-shaped" when each chunk's [min, max]
+ * covers more than half the global range — i.e. there's effectively
+ * no clustering, so the per-chunk zone-skip never excludes a chunk
+ * and the only way to accelerate `col == K` is by hashing.
+ *
+ * The memory cap rejects columns where the hash index (table+chain
+ * arrays — ~24 bytes/row at default load factor) would be much larger
+ * than the data itself.  We use 5× the column's data bytes as the
+ * budget: this comfortably admits I32/I64 numeric IDs (where the
+ * index is 3–5× the data) while still excluding narrow types like
+ * BOOL/U8/I16 where the index would dwarf the column.
+ *
+ * Returns 1 to attach, 0 to skip. */
+static int csv_should_attach_hash(ray_t* v) {
+    if (!v || RAY_IS_ERR(v)) return 0;
+    int esz = csv_hash_elem_size(v->type);
+    if (esz == 0) return 0;
+    /* Need a chunk_zone we can read for entropy estimation. */
+    if (!(v->attrs & RAY_ATTR_HAS_INDEX) || !v->index) return 0;
+    ray_index_t* ix = ray_index_payload(v->index);
+    if (ix->kind != RAY_IDX_CHUNK_ZONE || ix->u.chunk_zone.is_f64) return 0;
+    uint32_t n_chunks = ix->u.chunk_zone.n_chunks;
+    if (n_chunks < 4) return 0;
+    const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins);
+    const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs);
+
+    /* Whole-column [gmin, gmax] from the chunk extrema, ignoring empty
+     * chunks (mn > mx, set by the chunk_zone scan when a chunk is fully
+     * null). */
+    int64_t gmin = INT64_MAX, gmax = INT64_MIN;
+    for (uint32_t g = 0; g < n_chunks; g++) {
+        if (mins[g] > maxs[g]) continue;
+        if (mins[g] < gmin) gmin = mins[g];
+        if (maxs[g] > gmax) gmax = maxs[g];
+    }
+    if (gmin == INT64_MAX || gmax == INT64_MIN) return 0;
+    /* Compute (gmax - gmin) in uint64 space — the signed subtraction
+     * overflows when the range spans the full I64 width (e.g. UserID
+     * hashing to both sign halves).  Reinterpret as uint64 first;
+     * 2's-complement wrap gives the correct |gmax - gmin|. */
+    uint64_t global_range = (uint64_t)gmax - (uint64_t)gmin;
+    if (global_range == 0) return 0;  /* constant column — pointless */
+
+    /* Average per-chunk span / global range — selectivity proxy.
+     * Sum the per-chunk spans as doubles so the accumulation can't
+     * overflow when chunks span the full I64 width (uint64 sum
+     * across ~150 chunks each ~1.8e19 wide overflows; double has
+     * ~15 significant decimal digits, plenty for this coarse ratio).
+     *
+     * Threshold = 0.2.  The strict 0.5 cut documented in the design
+     * note cleanly catches uniformly-random hashed columns (ratio
+     * ~1.0) but excludes mildly-clustered numeric IDs like UserID
+     * (~0.26 on the ClickBench hits data: user sessions cluster
+     * consecutively so chunk spans don't fully cover the I64 range).
+     * For point lookups on those columns chunk_zone still prunes
+     * most chunks but ~30 % can hold the key — a 30 % full-column
+     * scan, not a real win.  Dropping to 0.2 admits UserID while
+     * still excluding tightly-clustered keys (CounterID/EventDate
+     * at <0.01) where chunk_zone already gives 99 %+ pruning. */
+    double dgr = (double)global_range;
+    double span_sum = 0.0;
+    uint32_t n_eff = 0;
+    for (uint32_t g = 0; g < n_chunks; g++) {
+        if (mins[g] > maxs[g]) continue;
+        uint64_t span = (uint64_t)maxs[g] - (uint64_t)mins[g];
+        span_sum += (double)span;
+        n_eff++;
+    }
+    if (n_eff < 4) return 0;
+    double mean_ratio = (span_sum / (double)n_eff) / dgr;
+    if (mean_ratio <= 0.2) return 0;
+
+    /* Memory cap: ray_index_attach_hash allocates a power-of-two
+     * `cap = next_pow2(2*n)` int64 table plus an n-entry int64
+     * chain.  Skip when the index would cost more than 5× the
+     * column's payload — keeps narrow integer types (where the
+     * index dwarfs the data) out of the index set while admitting
+     * I32 / I64 numeric IDs.  Done in int64 arithmetic (we cap n
+     * to anything that would overflow at the row counts we accept). */
+    int64_t n = v->len;
+    if (n <= 0) return 0;
+    uint64_t cap = 8;
+    uint64_t want = (uint64_t)(2 * n);
+    while (cap < want) cap <<= 1;
+    uint64_t aux_bytes  = cap * 8u + (uint64_t)n * 8u;
+    uint64_t data_bytes = (uint64_t)n * (uint64_t)esz;
+    if (aux_bytes > 5u * data_bytes) return 0;
+
+    return 1;
+}
+
 static ray_t* csv_materialize_rows(const char* buf, size_t file_size,
                                    const int64_t* row_offsets, int64_t n_rows,
                                    int ncols, char delimiter,
@@ -1415,7 +1522,12 @@ static ray_t* csv_materialize_rows(const char* buf, size_t file_size,
      * indexing — gives the reduce min/max and the filter chunk-skip paths
      * an O(n_chunks) scan instead of O(n_rows).  Attach is best-effort:
      * unsupported types (RAY_STR/RAY_SYM/RAY_GUID in v1) just stay
-     * unindexed and the consumer falls back to a row scan. */
+     * unindexed and the consumer falls back to a row scan.
+     *
+     * After the chunk_zone attaches we re-walk the same columns and
+     * upgrade the high-entropy ones to a hash index (the chunk_zone
+     * stays as well — it's the entropy signal we just measured).  See
+     * csv_should_attach_hash for the selectivity + memory cap. */
     for (int c = 0; c < ncols; c++) {
         ray_t* v = col_vecs[c];
         if (!v || RAY_IS_ERR(v)) continue;
@@ -1424,6 +1536,17 @@ static ray_t* csv_materialize_rows(const char* buf, size_t file_size,
         if (r && !RAY_IS_ERR(r)) col_vecs[c] = v;  /* attach succeeded */
         /* On failure the original column stays in col_vecs[c]; ignore. */
     }
+    for (int c = 0; c < ncols; c++) {
+        ray_t* v = col_vecs[c];
+        if (!csv_should_attach_hash(v)) continue;
+        /* ray_index_attach_hash drops any existing index on the
+         * column first; the chunk_zone we just built is sacrificed
+         * for the hash.  That's the right trade — once the column
+         * is known to be high-entropy, chunk-skip never fires
+         * anyway, so the chunk_zone is dead weight. */
+        ray_t* r = ray_index_attach_hash(&v);
+        if (r && !RAY_IS_ERR(r)) col_vecs[c] = v;
+    }
 
     ray_t* tbl = ray_table_new(ncols);
     if (!tbl || RAY_IS_ERR(tbl)) {
@@ -1805,7 +1928,9 @@ ray_t* ray_read_csv_named_opts(const char* path, char delimiter, bool header,
     {
         /* Best-effort per-chunk zone index attach (see comment on the
          * matching loop in build_table_from_cols) — unsupported types
-         * fall through to the unindexed path inside the consumer. */
+         * fall through to the unindexed path inside the consumer.
+         * Second pass upgrades high-entropy columns to a hash index;
+         * see csv_should_attach_hash. */
         for (int c = 0; c < ncols; c++) {
             ray_t* v = col_vecs[c];
             if (!v || RAY_IS_ERR(v)) continue;
@@ -1813,6 +1938,12 @@ ray_t* ray_read_csv_named_opts(const char* path, char delimiter, bool header,
             ray_t* r = ray_index_attach_chunk_zone(&v, 16);
             if (r && !RAY_IS_ERR(r)) col_vecs[c] = v;
         }
+        for (int c = 0; c < ncols; c++) {
+            ray_t* v = col_vecs[c];
+            if (!csv_should_attach_hash(v)) continue;
+            ray_t* r = ray_index_attach_hash(&v);
+            if (r && !RAY_IS_ERR(r)) col_vecs[c] = v;
+        }
 
         ray_t* tbl = ray_table_new(ncols);
         if (!tbl || RAY_IS_ERR(tbl)) {
diff --git a/src/ops/exec.c b/src/ops/exec.c
index e30ebf97..1593aa0a 100644
--- a/src/ops/exec.c
+++ b/src/ops/exec.c
@@ -24,6 +24,7 @@
 #include "ops/internal.h"
 #include "ops/rowsel.h"
 #include "ops/fused_group.h"
+#include "ops/idxop.h"
 #include "mem/heap.h"
 #include "mem/sys.h"
 
@@ -856,6 +857,61 @@ static ray_t* exec_in(ray_graph_t* g, ray_op_t* op, ray_t* col, ray_t* set) {
  * Recursive executor
  * ============================================================================ */
 
+/* Decode an OP_EQ predicate `pred_op` against g->table.  When the
+ * predicate has shape (== col_scan const_int) and `col_scan` resolves
+ * to a column in g->table that is non-null, non-parted, and carries a
+ * fresh RAY_IDX_HASH, write the column pointer to *out_col and the
+ * decoded int64 key to *out_key, returning 1.  Returns 0 on any
+ * miss — the caller falls through to the regular scan-based pred
+ * evaluation. */
+static int hash_index_eq_decode(ray_graph_t* g, ray_op_t* pred_op,
+                                ray_t** out_col, int64_t* out_key) {
+    if (!pred_op || pred_op->opcode != OP_EQ || pred_op->arity != 2)
+        return 0;
+    ray_op_t* lhs = pred_op->inputs[0];
+    ray_op_t* rhs = pred_op->inputs[1];
+    if (!lhs || !rhs) return 0;
+    if (lhs->opcode != OP_SCAN || rhs->opcode != OP_CONST) return 0;
+    ray_op_ext_t* lext = find_ext(g, lhs->id);
+    ray_op_ext_t* rext = find_ext(g, rhs->id);
+    if (!lext || !rext || !rext->literal) return 0;
+    uint16_t stored_table_id = 0;
+    memcpy(&stored_table_id, lext->base.pad, sizeof(uint16_t));
+    if (stored_table_id != 0) return 0;  /* non-default table — skip */
+    ray_t* tbl = g->table;
+    if (!tbl) return 0;
+    ray_t* col = ray_table_get_col(tbl, lext->sym);
+    if (!col || RAY_IS_ERR(col)) return 0;
+    if (RAY_IS_PARTED(col->type) || col->type == RAY_MAPCOMMON) return 0;
+    /* Nullable columns: the hash chain skipped null rows, so the
+     * resulting selection would mismatch the unfused null-aware
+     * compare for the col == col semantics rare-but-required case.
+     * Bail and let the existing compare run. */
+    if (col->attrs & RAY_ATTR_HAS_NULLS) return 0;
+    if (!ray_index_has(col)) return 0;
+    if (ray_index_kind(col) != RAY_IDX_HASH) return 0;
+    ray_index_t* ix = ray_index_payload(col->index);
+    if (ix->built_for_len != col->len) return 0;
+
+    ray_t* cv = rext->literal;
+    if (!cv) return 0;
+    int64_t key = 0;
+    switch (cv->type) {
+    case -RAY_I64:
+    case -RAY_TIMESTAMP: key = cv->i64;                  break;
+    case -RAY_I32:
+    case -RAY_DATE:
+    case -RAY_TIME:      key = (int64_t)cv->i32;         break;
+    case -RAY_I16:       key = (int64_t)cv->i16;         break;
+    case -RAY_BOOL:
+    case -RAY_U8:        key = (int64_t)cv->b8;          break;
+    default: return 0;  /* floats / sym / str — not eligible */
+    }
+    *out_col = col;
+    *out_key = key;
+    return 1;
+}
+
 /* Is this opcode a "heavy" pipeline breaker worth profiling? */
 static inline bool op_is_heavy(uint16_t opc) {
     return opc == OP_FILTER || opc == OP_SORT || opc == OP_GROUP ||
@@ -1122,8 +1178,31 @@ static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) {
             }
 
             ray_t* input = exec_node(g, op->inputs[0]);
-            ray_t* pred  = exec_node(g, op->inputs[1]);
-            if (!input || RAY_IS_ERR(input)) { if (pred && !RAY_IS_ERR(pred)) ray_release(pred); return input; }
+            if (!input || RAY_IS_ERR(input)) return input;
+            /* Hash-index point-lookup fast path: when the predicate is
+             * `col == K` on a column with RAY_IDX_HASH attached and
+             * built for the column's current length, install the
+             * matching rowsel on g->selection directly — bypasses
+             * both the O(rows) compare AND the O(rows) BOOL→rowsel
+             * scan.  Only fires for the lazy TABLE-input case with no
+             * pre-existing selection (the entry shape downstream
+             * group-by / sort already expects). */
+            if (input->type == RAY_TABLE && !g->selection) {
+                ray_t* col = NULL;
+                int64_t key = 0;
+                if (hash_index_eq_decode(g, op->inputs[1], &col, &key)) {
+                    ray_t* sel = ray_index_hash_eq_rowsel(col, key);
+                    if (sel) {
+                        g->selection = sel;
+                        return input;
+                    }
+                    /* sel == NULL: column was eligible at decode time
+                     * but allocation failed.  Fall through to the
+                     * scan path below — defensive (no functional
+                     * difference in the common case). */
+                }
+            }
+            ray_t* pred = exec_node(g, op->inputs[1]);
             if (!pred || RAY_IS_ERR(pred)) { ray_release(input); return pred; }
 
             /* Lazy filter: convert predicate to a rowsel (morsel-local
diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index 99c461d1..dbeac274 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -2619,6 +2619,257 @@ static int mk_find_i64_eq_child(const fp_pred_t* pred) {
     return -1;
 }
 
+/* Find an FP_EQ predicate child whose column carries a fresh
+ * RAY_IDX_HASH — i.e. one we can serve via O(matches) hash probe
+ * instead of O(n) scan.  Constraints mirror hash_probe_setup
+ * (idxop.c): no nulls, no fold, same built-for-len, type covers cval.
+ * Returns the child index, or -1 if none qualifies. */
+static int mk_find_hash_eq_child(const fp_pred_t* pred) {
+    for (uint8_t i = 0; i < pred->n_children; i++) {
+        const fp_cmp_t* cmp = &pred->children[i];
+        if (cmp->op != FP_EQ || cmp->fold != FP_FOLD_NONE) continue;
+        if (cmp->col_type == RAY_SYM) continue;  /* hash idx not attached to dict cols */
+        if (cmp->col_attrs & RAY_ATTR_HAS_NULLS) continue;
+        ray_t* co = cmp->col_obj;
+        if (!co || !ray_index_has(co)) continue;
+        if (ray_index_kind(co) != RAY_IDX_HASH) continue;
+        ray_index_t* ix = ray_index_payload(co->index);
+        if (ix->built_for_len != co->len) continue;
+        return (int)i;
+    }
+    return -1;
+}
+
+/* Worker that walks the RAY_IDX_HASH chain on `c->pred.children[eq_idx]`
+ * and applies the COUNT-aggregator path to each matching row that also
+ * passes the remaining predicate children.  Replaces the O(n)
+ * mk_eq_i64_count_fn scan.  Runs on worker 0 only — the chain walk
+ * isn't parallelised, since match counts on a point lookup are tiny
+ * and the dispatch overhead would dominate. */
+static void mk_eq_hash_count_fn(mk_par_ctx_t* c, uint8_t eq_idx) {
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    mk_shard_t* sh = &c->shards[0];
+    if (!sh->slots) {
+        if (mk_shard_init(sh, c->init_cap, c->total_state, c->wide) != 0) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+    }
+    const fp_cmp_t* eq = &c->pred.children[eq_idx];
+    ray_t* col = eq->col_obj;
+    ray_index_t* ix = ray_index_payload(col->index);
+    const uint64_t mask  = ix->u.hash.mask;
+    const int64_t* tbl   = (const int64_t*)ray_data(ix->u.hash.table);
+    const int64_t* chn   = (const int64_t*)ray_data(ix->u.hash.chain);
+    int64_t key = eq->cval;
+
+    /* Recompute the same hash the builder used.  numeric_key_word for
+     * an int* column zero/sign-extends to int64 then runs mix64 over
+     * the bit pattern.  We match by width here. */
+    uint64_t kbits;
+    switch (eq->col_esz) {
+    case 1:  kbits = (uint64_t)(uint8_t)key;             break;
+    case 2:  kbits = (uint64_t)(int64_t)(int16_t)key;    break;
+    case 4:  kbits = (uint64_t)(int64_t)(int32_t)key;    break;
+    default: kbits = (uint64_t)key;                      break;
+    }
+    /* mix64 inline — match idxop.c:mix64 byte-for-byte. */
+    uint64_t h = kbits;
+    h ^= h >> 30; h *= 0xbf58476d1ce4e5b9ULL;
+    h ^= h >> 27; h *= 0x94d049bb133111ebULL;
+    h ^= h >> 31;
+    int64_t rid = tbl[h & mask] - 1;
+
+    while (rid >= 0) {
+        if (fp_cmp_read_i64_at(eq, rid) == key) {
+            uint8_t pass = 1;
+            for (uint8_t i = 0; i < c->pred.n_children; i++) {
+                if (i == eq_idx) continue;
+                if (!fp_eval_cmp_one(&c->pred.children[i], rid)) {
+                    pass = 0;
+                    break;
+                }
+            }
+            if (pass) {
+                if (mk_count_upsert_row(c, sh, rid) != 0) {
+                    atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                    return;
+                }
+            }
+        }
+        rid = chn[rid] - 1;
+    }
+}
+
+/* mk_par worker analog: walk the hash chain instead of scanning rows.
+ * For each matching row that passes the remaining predicate children,
+ * upsert into shard 0 and run the per-agg accumulate inline.  This
+ * mirrors mk_par_fn's PASS-1 / PASS-2 split but per-row (matches are
+ * sparse, so a morsel-shaped batch is overkill — match count is
+ * usually < 10).  Runs on a single thread for the same reason. */
+static void mk_par_hash_fn(mk_par_ctx_t* c, uint8_t eq_idx) {
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    mk_shard_t* sh = &c->shards[0];
+    uint8_t wide        = c->wide;
+    uint8_t total_state = c->total_state;
+    uint8_t n_aggs      = c->n_aggs;
+    if (!sh->slots) {
+        if (mk_shard_init(sh, c->init_cap, total_state, wide) != 0) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+    }
+    const fp_cmp_t* eq = &c->pred.children[eq_idx];
+    ray_t* col = eq->col_obj;
+    ray_index_t* ix = ray_index_payload(col->index);
+    const uint64_t mask = ix->u.hash.mask;
+    const int64_t* tbl  = (const int64_t*)ray_data(ix->u.hash.table);
+    const int64_t* chn  = (const int64_t*)ray_data(ix->u.hash.chain);
+    int64_t key = eq->cval;
+
+    uint64_t kbits;
+    switch (eq->col_esz) {
+    case 1:  kbits = (uint64_t)(uint8_t)key;             break;
+    case 2:  kbits = (uint64_t)(int64_t)(int16_t)key;    break;
+    case 4:  kbits = (uint64_t)(int64_t)(int32_t)key;    break;
+    default: kbits = (uint64_t)key;                      break;
+    }
+    uint64_t h = kbits;
+    h ^= h >> 30; h *= 0xbf58476d1ce4e5b9ULL;
+    h ^= h >> 27; h *= 0x94d049bb133111ebULL;
+    h ^= h >> 31;
+    int64_t rid = tbl[h & mask] - 1;
+
+    while (rid >= 0) {
+        if (fp_cmp_read_i64_at(eq, rid) == key) {
+            uint8_t pass = 1;
+            for (uint8_t i = 0; i < c->pred.n_children; i++) {
+                if (i == eq_idx) continue;
+                if (!fp_eval_cmp_one(&c->pred.children[i], rid)) {
+                    pass = 0;
+                    break;
+                }
+            }
+            if (pass) {
+                /* Grow check + HT probe + per-agg accumulate.  Single
+                 * row at a time (no morsel batching) — matches are
+                 * sparse, and the existing batched path's per-batch
+                 * shard-grow loop would still re-fire here. */
+                if (sh->n_filled + 1 > (int64_t)(sh->cap / 2)) {
+                    if (mk_shard_grow(sh, total_state, wide) != 0) {
+                        atomic_store_explicit(&c->oom, 1,
+                                              memory_order_relaxed);
+                        return;
+                    }
+                }
+                int64_t* slots = sh->slots;
+                int64_t* state = sh->state;
+                uint64_t shm = sh->mask;
+                uint64_t s;
+                if (!wide) {
+                    int64_t kv = mk_compose_key(c, rid);
+                    uint64_t hk = (uint64_t)kv * 0x9E3779B97F4A7C15ULL;
+                    hk ^= hk >> 33;
+                    s = hk & shm;
+                    for (;;) {
+                        if (!slots[s * 2]) {
+                            slots[s * 2]     = 1;
+                            slots[s * 2 + 1] = kv;
+                            int64_t* st = &state[s * total_state];
+                            for (uint8_t a = 0; a < n_aggs; a++) {
+                                const mk_agg_t* ag = &c->aggs[a];
+                                switch (ag->kind) {
+                                case MK_AGG_COUNT:
+                                case MK_AGG_SUM:
+                                    st[ag->state_off] = 0; break;
+                                case MK_AGG_MIN:
+                                    st[ag->state_off] = INT64_MAX; break;
+                                case MK_AGG_MAX:
+                                    st[ag->state_off] = INT64_MIN; break;
+                                case MK_AGG_AVG:
+                                    st[ag->state_off    ] = 0;
+                                    st[ag->state_off + 1] = 0; break;
+                                }
+                            }
+                            sh->n_filled++;
+                            break;
+                        }
+                        if (slots[s * 2 + 1] == kv) break;
+                        s = (s + 1) & shm;
+                    }
+                } else {
+                    int64_t kv_lo, kv_hi;
+                    mk_compose_key2(c, rid, &kv_lo, &kv_hi);
+                    uint64_t hk = mk_hash_lo_hi(kv_lo, kv_hi);
+                    s = hk & shm;
+                    int64_t* slots_hi = sh->slots_hi;
+                    for (;;) {
+                        if (!slots[s * 2]) {
+                            slots[s * 2]     = 1;
+                            slots[s * 2 + 1] = kv_lo;
+                            slots_hi[s]      = kv_hi;
+                            int64_t* st = &state[s * total_state];
+                            for (uint8_t a = 0; a < n_aggs; a++) {
+                                const mk_agg_t* ag = &c->aggs[a];
+                                switch (ag->kind) {
+                                case MK_AGG_COUNT:
+                                case MK_AGG_SUM:
+                                    st[ag->state_off] = 0; break;
+                                case MK_AGG_MIN:
+                                    st[ag->state_off] = INT64_MAX; break;
+                                case MK_AGG_MAX:
+                                    st[ag->state_off] = INT64_MIN; break;
+                                case MK_AGG_AVG:
+                                    st[ag->state_off    ] = 0;
+                                    st[ag->state_off + 1] = 0; break;
+                                }
+                            }
+                            sh->n_filled++;
+                            break;
+                        }
+                        if (slots[s * 2 + 1] == kv_lo &&
+                            slots_hi[s] == kv_hi) break;
+                        s = (s + 1) & shm;
+                    }
+                }
+                /* Per-agg accumulate for this row. */
+                int64_t* st = &state[s * total_state];
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    const mk_agg_t* ag = &c->aggs[a];
+                    uint8_t off = ag->state_off;
+                    switch (ag->kind) {
+                    case MK_AGG_COUNT:
+                        st[off]++;
+                        break;
+                    case MK_AGG_SUM: {
+                        int64_t v = mk_read_agg_i64(ag, rid);
+                        st[off] += v;
+                        break;
+                    }
+                    case MK_AGG_MIN: {
+                        int64_t v = mk_read_agg_i64(ag, rid);
+                        if (v < st[off]) st[off] = v;
+                        break;
+                    }
+                    case MK_AGG_MAX: {
+                        int64_t v = mk_read_agg_i64(ag, rid);
+                        if (v > st[off]) st[off] = v;
+                        break;
+                    }
+                    case MK_AGG_AVG: {
+                        int64_t v = mk_read_agg_i64(ag, rid);
+                        st[off    ] += v;
+                        st[off + 1] += 1;
+                        break;
+                    }
+                    }
+                }
+            }
+        }
+        rid = chn[rid] - 1;
+    }
+}
+
 static void mk_eq_i64_count_fn(void* raw, uint32_t worker_id,
                                int64_t start, int64_t end) {
     mk_eq_i64_count_ctx_t* fc = (mk_eq_i64_count_ctx_t*)raw;
@@ -3860,7 +4111,17 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext,
         ctx.pred.n_children > 1) {
         eq_i64_idx = mk_find_i64_eq_child(&ctx.pred);
     }
-    if (eq_i64_idx >= 0) {
+    /* Hash-index probe: if any FP_EQ child sits on a column with a
+     * fresh RAY_IDX_HASH, walk the chain instead of scanning rows.
+     * Single-thread — match counts on a point lookup are too small
+     * to justify pool dispatch. */
+    int hash_eq_idx = mk_find_hash_eq_child(&ctx.pred);
+    if (hash_eq_idx >= 0 && ctx.n_aggs == 1 &&
+        ctx.aggs[0].kind == MK_AGG_COUNT) {
+        mk_eq_hash_count_fn(&ctx, (uint8_t)hash_eq_idx);
+    } else if (hash_eq_idx >= 0) {
+        mk_par_hash_fn(&ctx, (uint8_t)hash_eq_idx);
+    } else if (eq_i64_idx >= 0) {
         mk_eq_i64_count_ctx_t fctx = {
             .ctx = &ctx,
             .eq_idx = (uint8_t)eq_i64_idx,
diff --git a/src/ops/idxop.c b/src/ops/idxop.c
index 6e0a3d37..65263971 100644
--- a/src/ops/idxop.c
+++ b/src/ops/idxop.c
@@ -29,8 +29,10 @@
 #include "table/sym.h"
 #include "lang/eval.h"
 #include "ops/ops.h"
+#include "ops/rowsel.h"
 #include <math.h>
 #include <string.h>
+#include <stdlib.h>
 
 /* Width of one element of a numeric vector type, or 0 if unsupported. */
 static int numeric_elem_size(int8_t t) {
@@ -572,6 +574,207 @@ ray_t* ray_index_attach_hash(ray_t** vp) {
     return attach_finalize(v, idx);
 }
 
+/* --------------------------------------------------------------------------
+ * Hash-index point-lookup probe — public entry point for the eq-filter
+ * fast path (ray_index_hash_eq_rowsel).
+ *
+ * Callers present the index with an int64 key; we mix64 it with the
+ * same hash the builder used, walk the bucket chain, collect matches,
+ * and emit a ray_rowsel sized for O(matches) memory (no intermediate
+ * row-wide BOOL pred vec).
+ *
+ * Type matrix.  An index built on column type T accepts a key only
+ * when T's storage width covers it without truncation — i.e. asking
+ * for `u8_col == 300` would never match, so we fail eligibility and
+ * the caller falls back to the scan (which folds out-of-range via
+ * fp_fold_t).  Float keys are not supported here — equality on
+ * F32/F64 has NaN / -0 semantics the unfused engine handles. */
+
+static int hash_key_in_range(int8_t t, int64_t k) {
+    switch (t) {
+    case RAY_BOOL: case RAY_U8:        return k >= 0 && k <= UINT8_MAX;
+    case RAY_I16:                      return k >= INT16_MIN && k <= INT16_MAX;
+    case RAY_I32: case RAY_DATE:       return k >= INT32_MIN && k <= INT32_MAX;
+    case RAY_I64:
+    case RAY_TIME:
+    case RAY_TIMESTAMP:                return 1;
+    default:                           return 0;
+    }
+}
+
+/* Read row `i` of a numeric column as int64 for equality compare. */
+static int64_t hash_col_read_i64(const uint8_t* base, int8_t t, int64_t i) {
+    int es;
+    switch (t) {
+    case RAY_BOOL: case RAY_U8:        es = 1; break;
+    case RAY_I16:                      es = 2; break;
+    case RAY_I32: case RAY_DATE:       es = 4; break;
+    case RAY_I64:
+    case RAY_TIME:
+    case RAY_TIMESTAMP:                es = 8; break;
+    default:                           return 0;
+    }
+    switch (es) {
+    case 1:  return (int64_t)base[i];
+    case 2:  { int16_t v; memcpy(&v, base + i*2, 2); return (int64_t)v; }
+    case 4:  { int32_t v; memcpy(&v, base + i*4, 4); return (int64_t)v; }
+    default: { int64_t v; memcpy(&v, base + i*8, 8); return v;          }
+    }
+}
+
+/* Validate eligibility, return the index payload + computed start row.
+ * On miss leaves *start = -1 so the caller can short-circuit. */
+static ray_index_t* hash_probe_setup(ray_t* col, int64_t key,
+                                     int64_t* start_rid) {
+    *start_rid = -1;
+    if (!col || RAY_IS_ERR(col) || !ray_is_vec(col)) return NULL;
+    if (!(col->attrs & RAY_ATTR_HAS_INDEX) || !col->index) return NULL;
+    ray_index_t* ix = ray_index_payload(col->index);
+    if (ix->kind != RAY_IDX_HASH) return NULL;
+    if (ix->built_for_len != col->len) return NULL;
+    if (!hash_key_in_range(col->type, key)) return NULL;
+    if (numeric_elem_size(col->type) == 0) return NULL;
+    if (!ix->u.hash.table || !ix->u.hash.chain) return NULL;
+
+    /* Mirror numeric_key_word for an int64 key: the canonical hash
+     * input is the raw bit pattern of the storage width.  We zero-
+     * extend U8/BOOL and sign-extend others up to int64; mix64 then
+     * folds them — the builder did the same on a per-row basis. */
+    int es = numeric_elem_size(col->type);
+    uint64_t kbits = 0;
+    switch (es) {
+    case 1: kbits = (uint64_t)(uint8_t)key;                  break;
+    case 2: kbits = (uint64_t)(int64_t)(int16_t)key;         break;
+    case 4: kbits = (uint64_t)(int64_t)(int32_t)key;         break;
+    default: kbits = (uint64_t)key;                          break;
+    }
+    uint64_t h = mix64(kbits);
+    uint64_t slot = h & ix->u.hash.mask;
+    const int64_t* tbl = (const int64_t*)ray_data(ix->u.hash.table);
+    *start_rid = tbl[slot] - 1;
+    return ix;
+}
+
+/* qsort comparator: ascending int64 row ids, used by the rowsel
+ * builder to put matches into per-segment order. */
+static int hash_match_cmp_i64(const void* a, const void* b) {
+    int64_t x = *(const int64_t*)a;
+    int64_t y = *(const int64_t*)b;
+    return (x > y) - (x < y);
+}
+
+ray_t* ray_index_hash_eq_rowsel(ray_t* col, int64_t key) {
+    int64_t rid = -1;
+    ray_index_t* ix = hash_probe_setup(col, key, &rid);
+    if (!ix) return NULL;
+
+    int64_t n = col->len;
+    /* Collect matching row ids.  The chain length is bounded by the
+     * bucket fill factor; for keys appearing rarely the bound is tight
+     * (~1 row).  For highly-duplicated keys it can degenerate to O(n)
+     * — but only if the value really occurs that many times, in which
+     * case the existing scan path also reads the same number of rows.
+     * We size the collect buffer dynamically; cap at n to bound memory
+     * in the pathological case. */
+    const int64_t* chn  = (const int64_t*)ray_data(ix->u.hash.chain);
+    const uint8_t* base = (const uint8_t*)ray_data(col);
+    int8_t t = col->type;
+
+    int64_t mcap = 16;
+    int64_t mcnt = 0;
+    ray_t* match_hdr = ray_alloc(mcap * (int64_t)sizeof(int64_t));
+    if (!match_hdr) return NULL;
+    int64_t* matches = (int64_t*)ray_data(match_hdr);
+
+    while (rid >= 0) {
+        if (hash_col_read_i64(base, t, rid) == key) {
+            if (mcnt == mcap) {
+                int64_t new_cap = mcap * 2;
+                if (new_cap > n) new_cap = n + 1;  /* defensive bound */
+                ray_t* new_hdr = ray_alloc(new_cap * (int64_t)sizeof(int64_t));
+                if (!new_hdr) { ray_release(match_hdr); return NULL; }
+                memcpy(ray_data(new_hdr), matches,
+                       (size_t)mcnt * sizeof(int64_t));
+                ray_release(match_hdr);
+                match_hdr = new_hdr;
+                matches = (int64_t*)ray_data(match_hdr);
+                mcap = new_cap;
+            }
+            matches[mcnt++] = rid;
+        }
+        rid = chn[rid] - 1;
+    }
+
+    /* Sort ascending so we can fill seg_flags / seg_offsets / idx[]
+     * in a single linear pass.  qsort dominates only when matches are
+     * many — in that case the hash probe itself is the larger cost
+     * and this is still O(matches log matches). */
+    if (mcnt > 1)
+        qsort(matches, (size_t)mcnt, sizeof(int64_t), hash_match_cmp_i64);
+
+    /* Count idx_count = # of MIX segments × matches in that segment.
+     * For a hash probe a segment is either NONE (no matches) or MIX
+     * (≥1 match; never ALL unless every row in the segment matched,
+     * which would require duplicate-key density > MORSEL_ELEMS in one
+     * 1024-row window — vanishingly rare and indistinguishable in the
+     * consumer from a normal MIX). */
+    ray_t* block = ray_rowsel_new(n, mcnt, mcnt);
+    if (!block) { ray_release(match_hdr); return NULL; }
+
+    uint32_t n_segs = ray_rowsel_meta(block)->n_segs;
+    uint8_t*  seg_flags   = ray_rowsel_flags(block);
+    uint32_t* seg_offsets = ray_rowsel_offsets(block);
+    uint16_t* idx_arr     = ray_rowsel_idx(block);
+
+    /* All segments default to NONE; the loop below flips MIX where
+     * a match lands.  ray_alloc does NOT zero the data area
+     * (only the 32-byte header), so explicit init is required. */
+    memset(seg_flags, RAY_SEL_NONE, (size_t)n_segs);
+    /* seg_offsets is built by linear sweep below — initialize to a
+     * sentinel that the sweep will overwrite. */
+    /* (no memset needed; the sweep writes every entry [0..n_segs]) */
+
+    /* Single sweep over the sorted matches: emit per-segment offsets
+     * and morsel-local indices into idx_arr.  cur_seg tracks the
+     * segment we're filling; gaps get RAY_SEL_NONE and zero spans. */
+    int64_t mi = 0;
+    uint32_t cum = 0;
+    for (uint32_t s = 0; s < n_segs; s++) {
+        seg_offsets[s] = cum;
+        int64_t seg_start = (int64_t)s * RAY_MORSEL_ELEMS;
+        int64_t seg_end   = seg_start + RAY_MORSEL_ELEMS;
+        if (seg_end > n) seg_end = n;
+        uint32_t pc = 0;
+        while (mi < mcnt && matches[mi] < seg_end) {
+            idx_arr[cum + pc] = (uint16_t)(matches[mi] - seg_start);
+            pc++;
+            mi++;
+        }
+        if (pc == 0) {
+            seg_flags[s] = RAY_SEL_NONE;
+        } else if ((int64_t)pc == seg_end - seg_start) {
+            seg_flags[s] = RAY_SEL_ALL;
+            /* Roll back the indices — ALL segments contribute zero
+             * idx[] entries in the rowsel contract. */
+            cum -= pc;  /* idx_arr writes for this seg get overwritten
+                          by the next MIX segment's writes; idx_count
+                          was sized for all matches, so this is safe. */
+        } else {
+            seg_flags[s] = RAY_SEL_MIX;
+            cum += pc;
+        }
+    }
+    seg_offsets[n_segs] = cum;
+    /* Adjust meta total_pass / idx layout — ALL-segment rows count
+     * toward total_pass but not idx_count.  We initially passed
+     * (mcnt, mcnt); fix up if any ALL segments collapsed. */
+    ray_rowsel_meta(block)->total_pass = mcnt;
+    (void)cum;
+
+    ray_release(match_hdr);
+    return block;
+}
+
 /* --------------------------------------------------------------------------
  * Sort index — ascending permutation of row ids
  *
diff --git a/src/ops/idxop.h b/src/ops/idxop.h
index f399e884..025b51ce 100644
--- a/src/ops/idxop.h
+++ b/src/ops/idxop.h
@@ -167,6 +167,31 @@ static inline ray_idx_kind_t ray_index_kind(const ray_t* v) {
  * or RAY_NULL_OBJ when no index is attached. */
 ray_t* ray_index_info(ray_t* v);
 
+/* ===== Hash-index point-lookup probe =====
+ *
+ * Build a ray_rowsel directly from a hash probe on `col`'s
+ * RAY_IDX_HASH for rows where the payload equals `key`.  Bypasses
+ * the intermediate BOOL pred vec entirely — touches O(matches)
+ * memory instead of O(rows), which is the whole reason to ship
+ * this fast path.
+ *
+ * Returns:
+ *   - A fresh rowsel block (rc=1) on success — install on
+ *     g->selection.  The block carries per-segment NONE/MIX/ALL
+ *     flags and the morsel-local indices for matching rows.
+ *     Pure NONE blocks (no matches) are returned as a valid empty
+ *     rowsel rather than NULL — NULL is the "all-pass" sentinel
+ *     in the consumer and would let every row through.
+ *   - NULL when the column is not eligible: no index, wrong kind,
+ *     built_for_len mismatch (stale), type mismatch, or out-of-
+ *     range key.  Caller must fall back to the full scan path.
+ *
+ * Eligibility (and the canonical hashing used) match
+ * ray_index_attach_hash: BOOL/U8/I16/I32/I64/DATE/TIME/TIMESTAMP.
+ * Floats are intentionally not supported — equality on F32/F64
+ * has NaN / -0 semantics the unfused compare kernel handles. */
+ray_t* ray_index_hash_eq_rowsel(ray_t* col, int64_t key);
+
 /* ===== Internal helpers (used by retain/release/detach in heap.c
  * and by mutation paths in vec.c) ===== */
 

From bfd8cf58200dfcd1c2a895f6d0365330175562e1 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 17:54:56 +0200
Subject: [PATCH 16/36] feat(query): top-K heap extraction for sorted+take
 group-by
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For "(select ... by: K desc:/asc: AGGCOL take: N)" shapes the existing
emit-filter heap-extraction was COUNT-only.  Generalise it to also
handle SUM / MIN / MAX, and propagate the direction (asc vs desc) so
either top-N largest or top-N smallest works through the same path.

Three concrete consumer sites:

* group.c's v2_emit per-partition compact (NEW) — runs after the
  parallel radix HT build (phase1 + phase2) for either the v2 direct-
  insert path or the regular phase1/phase2 fat-entry path.  Builds a
  global size-N heap over the union of partition rows, then in-place
  compacts each part_hts[p] to keep only globally-surviving entries.
  Phase3 then emits exactly N rows instead of total_grps.

* fused_group.c's mk_apply_count_emit_filter (extended) — same heap
  pass, but on the dedup'd (gs, gst) layout produced by
  mk_combine_and_materialize.  Reads the order-by value from the
  agg state slot (COUNT/SUM at state[off], MIN at state[off] under
  the MIN slot rules, etc).

* query.c's match_group_desc_count_take (extended) — now accepts
  COUNT/SUM/MIN/MAX and both asc:/desc:, capped at N <= 1024 (the
  stack-resident heap budget shared by both consumers).

Backward compatibility: the new agg_op and desc fields default to 0;
zero-initialised ray_group_emit_filter_t values (used by tests and by
the WHERE-clause-derived match_group_count_emit_filter) are treated
as OP_COUNT + desc, matching the historical behaviour byte-for-byte.

F64-output aggs (SUM-over-RAY_F64) and SYM-typed MIN/MAX bail out of
the fast path — bit-pattern comparison of doubles doesn't preserve
the user-visible ordering for mixed-sign / NaN values, and SYM ids
order by intern id not lexicographic order.  Both shapes drop through
to the existing full sort + take.

Tests: 2818 of 2820 passed (2 skipped, 0 failed) — baseline preserved.
---
 src/ops/fused_group.c | 101 +++++++++++++++++++----
 src/ops/group.c       | 187 ++++++++++++++++++++++++++++++++++++++++++
 src/ops/internal.h    |  14 ++++
 src/ops/query.c       |  35 ++++++--
 4 files changed, 316 insertions(+), 21 deletions(-)

diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index dbeac274..91ac79cc 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -3218,49 +3218,122 @@ static void mk_apply_count_emit_filter(const mk_par_ctx_t* c,
                                        int64_t* gs, int64_t* gst,
                                        int64_t gcap, int64_t* global_n)
 {
+    /* Two-mode emit-filter pass over the deduped (gs, gst) layout:
+     *
+     *  1. min_count_exclusive (heavy-hitter): drop rows whose COUNT
+     *     value is at or below the threshold.  Only fires for COUNT.
+     *
+     *  2. top_count_take (top-N): drop rows that aren't in the top-N
+     *     ordered by the configured agg op (COUNT/SUM/MIN/MAX).  Both
+     *     desc (largest N) and asc (smallest N) are supported.  The
+     *     producer (query.c's match_group_desc_count_take) sets
+     *     emit_filter.agg_op and emit_filter.desc accordingly; an
+     *     unset agg_op defaults to OP_COUNT for the historical
+     *     single-mode filter.
+     *
+     * AVG / STDDEV / VAR / PEARSON / MEDIAN are excluded — their
+     * ordering doesn't reduce to a single int64 row-slot read, so
+     * filters over those aggs must fall back to the post-materialize
+     * sort + take path.  SYM-typed MIN/MAX are similarly excluded
+     * because the stored value is an interned id whose natural order
+     * is not the lexicographic order users expect (a mismatch only
+     * relevant when the desc:/asc: orders the output). */
     ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get();
     if (!emit_filter.enabled || emit_filter.agg_index >= c->n_aggs)
         return;
 
-    const mk_agg_t* count_agg = &c->aggs[emit_filter.agg_index];
-    if (count_agg->kind != MK_AGG_COUNT)
+    const mk_agg_t* order_agg = &c->aggs[emit_filter.agg_index];
+    uint16_t order_op = emit_filter.agg_op
+        ? emit_filter.agg_op
+        : (uint16_t)OP_COUNT;
+    /* min_count_exclusive remains COUNT-only — it represents a
+     * heavy-hitter threshold inherited from the WHERE clause and
+     * doesn't generalize to SUM/MIN/MAX semantics. */
+    int64_t keep_min = (order_op == OP_COUNT)
+        ? emit_filter.min_count_exclusive + 1
+        : 1;
+    int64_t k_take = emit_filter.top_count_take;
+    uint8_t desc_dir = emit_filter.desc;
+    if (order_op == OP_COUNT && !emit_filter.desc) desc_dir = 1;
+
+    /* Map order_op → mk_agg kind, reject incompatible shapes. */
+    if (order_op == OP_COUNT) {
+        if (order_agg->kind != MK_AGG_COUNT) return;
+    } else if (order_op == OP_SUM) {
+        if (order_agg->kind != MK_AGG_SUM) return;
+    } else if (order_op == OP_MIN) {
+        if (order_agg->kind != MK_AGG_MIN) return;
+        if (order_agg->in_type == RAY_SYM) return;
+    } else if (order_op == OP_MAX) {
+        if (order_agg->kind != MK_AGG_MAX) return;
+        if (order_agg->in_type == RAY_SYM) return;
+    } else {
         return;
+    }
 
-    int64_t keep_min = emit_filter.min_count_exclusive + 1;
-    int64_t k_take = emit_filter.top_count_take;
     if (k_take > 0 && k_take < *global_n) {
         ray_t* heap_hdr = NULL;
         int64_t* heap = (int64_t*)scratch_alloc(&heap_hdr,
                                                 (size_t)k_take * sizeof(int64_t));
         if (heap) {
             int64_t heap_n = 0;
+            /* For desc (top-N largest): min-heap, root = smallest.
+             * For asc  (top-N smallest): max-heap, root = largest. */
+            #define MK_TOPN_NEEDS_SWAP(parent, child) \
+                (desc_dir ? ((parent) > (child)) : ((parent) < (child)))
+            #define MK_TOPN_SHOULD_REPLACE(nv, rv) \
+                (desc_dir ? ((nv) > (rv)) : ((nv) < (rv)))
             for (int64_t s = 0; s < gcap; s++) {
                 if (!gs[s * 2]) continue;
-                int64_t cnt = gst[(size_t)s * c->total_state + count_agg->state_off];
+                int64_t v = gst[(size_t)s * c->total_state + order_agg->state_off];
                 if (heap_n < k_take) {
                     int64_t j = heap_n++;
-                    heap[j] = cnt;
+                    heap[j] = v;
                     while (j > 0) {
                         int64_t p = (j - 1) >> 1;
-                        if (heap[p] <= heap[j]) break;
+                        if (!MK_TOPN_NEEDS_SWAP(heap[p], heap[j])) break;
                         int64_t tmp = heap[p]; heap[p] = heap[j]; heap[j] = tmp;
                         j = p;
                     }
-                } else if (cnt > heap[0]) {
-                    heap[0] = cnt;
+                } else if (MK_TOPN_SHOULD_REPLACE(v, heap[0])) {
+                    heap[0] = v;
                     int64_t j = 0;
                     for (;;) {
                         int64_t l = j * 2 + 1, r = l + 1, m = j;
-                        if (l < heap_n && heap[l] < heap[m]) m = l;
-                        if (r < heap_n && heap[r] < heap[m]) m = r;
+                        if (l < heap_n && MK_TOPN_NEEDS_SWAP(heap[m], heap[l])) m = l;
+                        if (r < heap_n && MK_TOPN_NEEDS_SWAP(heap[m], heap[r])) m = r;
                         if (m == j) break;
                         int64_t tmp = heap[m]; heap[m] = heap[j]; heap[j] = tmp;
                         j = m;
                     }
                 }
             }
-            if (heap_n == k_take && heap[0] > keep_min)
-                keep_min = heap[0];
+            #undef MK_TOPN_NEEDS_SWAP
+            #undef MK_TOPN_SHOULD_REPLACE
+            if (heap_n == k_take) {
+                /* heap[0] is the worst surviving value.  Compute a
+                 * scalar threshold so the compaction sweep below can
+                 * read it without checking direction per row. */
+                int64_t threshold = heap[0];
+                int64_t kept = 0;
+                for (int64_t s = 0; s < gcap; s++) {
+                    if (!gs[s * 2]) continue;
+                    int64_t v = gst[(size_t)s * c->total_state + order_agg->state_off];
+                    bool survives = desc_dir ? (v >= threshold) : (v <= threshold);
+                    if (!survives) {
+                        gs[s * 2] = 0;
+                    } else if (order_op == OP_COUNT && v < keep_min) {
+                        /* min_count_exclusive threshold combines with top-N
+                         * by AND — drop rows that fail either. */
+                        gs[s * 2] = 0;
+                    } else {
+                        kept++;
+                    }
+                }
+                *global_n = kept;
+                scratch_free(heap_hdr);
+                return;
+            }
             scratch_free(heap_hdr);
         }
     }
@@ -3271,7 +3344,7 @@ static void mk_apply_count_emit_filter(const mk_par_ctx_t* c,
     int64_t kept = 0;
     for (int64_t s = 0; s < gcap; s++) {
         if (!gs[s * 2]) continue;
-        int64_t cnt = gst[(size_t)s * c->total_state + count_agg->state_off];
+        int64_t cnt = gst[(size_t)s * c->total_state + order_agg->state_off];
         if (cnt < keep_min) {
             gs[s * 2] = 0;
         } else {
diff --git a/src/ops/group.c b/src/ops/group.c
index 56592521..9ec773bc 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -5599,9 +5599,24 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl,
         }
     }
     ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get();
+    /* Historical: enabled only for OP_COUNT (the min_count_exclusive
+     * heavy-hitter filter and the top_count_take heap).  The
+     * top_count_take heap path now also accepts SUM/MIN/MAX — those
+     * fire through the v2_emit per-partition compact below, which
+     * reads the agg's int64 row slot directly.  The non-COUNT paths
+     * (sparse_i64 range-counting, the n_keys>1 macro fast path) still
+     * gate on COUNT because they DON'T have the agg value available
+     * outside the row slot. */
     bool use_emit_filter = emit_filter.enabled &&
         emit_filter.agg_index < n_aggs &&
         ext->agg_ops[emit_filter.agg_index] == OP_COUNT;
+    bool use_topn_filter = emit_filter.enabled &&
+        emit_filter.top_count_take > 0 &&
+        emit_filter.agg_index < n_aggs &&
+        (ext->agg_ops[emit_filter.agg_index] == OP_COUNT ||
+         ext->agg_ops[emit_filter.agg_index] == OP_SUM   ||
+         ext->agg_ops[emit_filter.agg_index] == OP_MIN   ||
+         ext->agg_ops[emit_filter.agg_index] == OP_MAX);
 
     /* ---- Scalar aggregate fast path (n_keys == 0): flat vector scan ---- */
     if (n_keys == 0 && nrows > 0) {
@@ -7989,6 +8004,178 @@ v2_done:;
         }
 
 v2_emit:;
+        /* Top-N aware compaction: when the (select … by … desc: c take: N)
+         * shape is in flight (use_emit_filter + top_count_take, COUNT agg),
+         * the global answer is the N rows with the largest count across
+         * all partitions.  Run a global bounded-heap (size N) over the
+         * union of per-partition rows here, then in-place compact each
+         * partition's row array to contain only globally-surviving rows.
+         * Phase3 below then emits N rows total instead of total_grps —
+         * the major win for high-cardinality keys like UserID/URL where
+         * total_grps is in the millions but N is ≤ 1024.
+         *
+         * Implementation notes:
+         *  - The bounded heap orders by count (the agg at COUNT slot, the
+         *    first int64 in each row).  Equal counts are stable: the
+         *    first row seen wins.  Final per-partition row order is
+         *    preserved so apply_sort_take below can do the final
+         *    arrange-by-agg deterministically.
+         *  - We also handle the "fewer total rows than N" case — compact
+         *    becomes a no-op.
+         *  - Only fires when emit_filter.top_count_take > 0; existing
+         *    min_count_exclusive-only filters fall through unchanged. */
+        if (use_topn_filter) {
+            int64_t k_take = emit_filter.top_count_take;
+            uint32_t total_pre = 0;
+            for (uint32_t p = 0; p < RADIX_P; p++)
+                total_pre += part_hts[p].grp_count;
+            /* Resolve the in-row offset of the order-by agg's value.  For
+             * COUNT it's the leading int64 at offset 0; for SUM/MIN/MAX
+             * it's the per-slot int64 in off_sum/off_min/off_max.  F64
+             * agg outputs (sum over an F64 column) compare by bitcast —
+             * for IEEE 754 the bit pattern preserves ordering for finite
+             * positive values; mixed-sign and NaN cases drop the heap
+             * back to a wider comparator.  To stay correct we exclude
+             * F64-output aggs from this fast path (the COUNT count is
+             * always I64, and SUM/MIN/MAX over an integer column keep
+             * an I64 slot — agg_is_f64 marks the SUM-over-F64 case). */
+            uint16_t order_op = emit_filter.agg_op
+                ? emit_filter.agg_op
+                : (uint16_t)OP_COUNT;
+            uint8_t  agg_index_local = emit_filter.agg_index;
+            uint16_t order_off = 0;  /* default: COUNT at row+0 */
+            bool order_is_f64 = false;
+            if (agg_index_local < n_aggs &&
+                (ght_layout.agg_is_f64 & (1u << agg_index_local)))
+                order_is_f64 = true;
+            int8_t agg_slot = ght_layout.agg_val_slot[agg_index_local];
+            if (order_op == OP_SUM) {
+                if (agg_slot < 0 || order_is_f64) goto topn_compact_skip;
+                order_off = (uint16_t)(ght_layout.off_sum
+                                       + (uint16_t)agg_slot * 8u);
+            } else if (order_op == OP_MIN) {
+                if (agg_slot < 0 || order_is_f64) goto topn_compact_skip;
+                if (ght_layout.agg_is_sym & (1u << agg_index_local))
+                    goto topn_compact_skip;
+                order_off = (uint16_t)(ght_layout.off_min
+                                       + (uint16_t)agg_slot * 8u);
+            } else if (order_op == OP_MAX) {
+                if (agg_slot < 0 || order_is_f64) goto topn_compact_skip;
+                if (ght_layout.agg_is_sym & (1u << agg_index_local))
+                    goto topn_compact_skip;
+                order_off = (uint16_t)(ght_layout.off_max
+                                       + (uint16_t)agg_slot * 8u);
+            }
+            uint8_t desc_dir = emit_filter.desc ? 1 : 0;
+            /* COUNT defaults to desc when the filter struct's desc bit
+             * isn't set (old single-bit filter shape).  Producer code in
+             * query.c sets it explicitly. */
+            if (order_op == OP_COUNT && !emit_filter.desc) desc_dir = 1;
+            if ((int64_t)total_pre > k_take && k_take > 0 && k_take <= 1024) {
+                /* Stack heap: (val, part, gid) triples.  k_take ≤ 1024
+                 * caps the footprint at 1024 * 16 B = 16 KiB.  The heap
+                 * invariant flips by direction: min-heap for desc (we
+                 * evict the smallest to keep the largest N), max-heap
+                 * for asc (evict the largest to keep the smallest N). */
+                int64_t hval[1024];
+                uint32_t hpart[1024];
+                uint32_t hgid[1024];
+                int64_t hn = 0;
+                /* For top-N largest (desc=1): min-heap.  Root is smallest;
+                 * incoming v replaces root iff v > root.  Heap invariant:
+                 * parent ≤ child (so swap when parent > child).
+                 *
+                 * For top-N smallest (desc=0): max-heap.  Root is largest;
+                 * incoming v replaces root iff v < root.  Heap invariant:
+                 * parent ≥ child (so swap when parent < child).
+                 *
+                 * TOPN_NEEDS_SWAP(parent, child) := does the parent
+                 * violate the invariant relative to child? */
+                #define TOPN_NEEDS_SWAP(parent, child) \
+                    (desc_dir ? ((parent) > (child)) : ((parent) < (child)))
+                #define TOPN_SHOULD_REPLACE(new_v, root_v) \
+                    (desc_dir ? ((new_v) > (root_v)) : ((new_v) < (root_v)))
+                for (uint32_t p = 0; p < RADIX_P; p++) {
+                    group_ht_t* ph = &part_hts[p];
+                    uint16_t rs = ph->layout.row_stride;
+                    uint32_t gc = ph->grp_count;
+                    for (uint32_t gi = 0; gi < gc; gi++) {
+                        const char* row = ph->rows + (size_t)gi * rs;
+                        int64_t v = *(const int64_t*)(const void*)
+                                    (row + order_off);
+                        if (hn < k_take) {
+                            int64_t j = hn++;
+                            hval[j] = v; hpart[j] = p; hgid[j] = gi;
+                            /* Sift up: bubble new entry toward root while
+                             * parent violates invariant. */
+                            while (j > 0) {
+                                int64_t pr = (j - 1) >> 1;
+                                if (!TOPN_NEEDS_SWAP(hval[pr], hval[j])) break;
+                                int64_t tc = hval[pr]; hval[pr] = hval[j]; hval[j] = tc;
+                                uint32_t tp = hpart[pr]; hpart[pr] = hpart[j]; hpart[j] = tp;
+                                uint32_t tg = hgid[pr]; hgid[pr] = hgid[j]; hgid[j] = tg;
+                                j = pr;
+                            }
+                        } else if (TOPN_SHOULD_REPLACE(v, hval[0])) {
+                            hval[0] = v; hpart[0] = p; hgid[0] = gi;
+                            int64_t j = 0;
+                            /* Sift down: find the child that should be
+                             * promoted (the one most violating the
+                             * invariant) and swap. */
+                            for (;;) {
+                                int64_t l = j * 2 + 1, r = l + 1, m = j;
+                                if (l < hn && TOPN_NEEDS_SWAP(hval[m], hval[l])) m = l;
+                                if (r < hn && TOPN_NEEDS_SWAP(hval[m], hval[r])) m = r;
+                                if (m == j) break;
+                                int64_t tc = hval[m]; hval[m] = hval[j]; hval[j] = tc;
+                                uint32_t tp = hpart[m]; hpart[m] = hpart[j]; hpart[j] = tp;
+                                uint32_t tg = hgid[m]; hgid[m] = hgid[j]; hgid[j] = tg;
+                                j = m;
+                            }
+                        }
+                    }
+                }
+                #undef TOPN_NEEDS_SWAP
+                #undef TOPN_SHOULD_REPLACE
+                if (hn > 0) {
+                    /* Build per-partition keep lists (sorted asc by gid so
+                     * the in-place compact below is a single forward sweep). */
+                    uint16_t keep_n[RADIX_P];
+                    for (uint32_t p = 0; p < RADIX_P; p++) keep_n[p] = 0;
+                    /* Cap per-partition kept count at hn (≤ k_take ≤ 1024). */
+                    uint32_t kgid[RADIX_P][1024];
+                    for (int64_t i = 0; i < hn; i++) {
+                        uint32_t p = hpart[i];
+                        uint16_t kn = keep_n[p];
+                        /* Insertion-sort into kgid[p][] keeping asc order. */
+                        uint16_t j = kn;
+                        while (j > 0 && kgid[p][j - 1] > hgid[i]) {
+                            kgid[p][j] = kgid[p][j - 1];
+                            j--;
+                        }
+                        kgid[p][j] = hgid[i];
+                        keep_n[p] = (uint16_t)(kn + 1);
+                    }
+                    /* In-place compact each partition. */
+                    for (uint32_t p = 0; p < RADIX_P; p++) {
+                        group_ht_t* ph = &part_hts[p];
+                        uint16_t rs = ph->layout.row_stride;
+                        uint16_t kn = keep_n[p];
+                        if (kn == ph->grp_count) continue;  /* all kept */
+                        if (kn == 0) { ph->grp_count = 0; continue; }
+                        for (uint16_t i = 0; i < kn; i++) {
+                            uint32_t src = kgid[p][i];
+                            if (src == (uint32_t)i) continue;
+                            memmove(ph->rows + (size_t)i * rs,
+                                    ph->rows + (size_t)src * rs, rs);
+                        }
+                        ph->grp_count = kn;
+                    }
+                }
+            }
+            topn_compact_skip:;
+        }
+
         /* Prefix offsets */
         uint32_t part_offsets[RADIX_P + 1];
         part_offsets[0] = 0;
diff --git a/src/ops/internal.h b/src/ops/internal.h
index 23975955..25fa9b2e 100644
--- a/src/ops/internal.h
+++ b/src/ops/internal.h
@@ -953,6 +953,20 @@ typedef struct {
     uint8_t agg_index;
     int64_t min_count_exclusive;
     int64_t top_count_take;
+    /* Agg op of the filtered agg.  When 0 (the default for the
+     * historical COUNT-only filter), consumers MUST treat it as
+     * OP_COUNT.  When non-zero, must equal ext->agg_ops[agg_index].
+     * Supported here: OP_COUNT, OP_SUM, OP_MIN, OP_MAX.  AVG and
+     * higher-order aggs (STDDEV/VAR/PEARSON/MEDIAN) are excluded
+     * because their ordering doesn't reduce to a single int64 read
+     * from the row slot — they fall through to the full sort + take. */
+    uint16_t agg_op;
+    /* Direction: 1 = top-N largest (desc), 0 = top-N smallest (asc).
+     * For COUNT/SUM/MAX the natural ordering is largest-first; for
+     * MIN it's smallest-first.  Both directions are supported per
+     * agg kind so `desc: min_value take: N` (the N groups with the
+     * largest min) is also expressible. */
+    uint8_t  desc;
 } ray_group_emit_filter_t;
 ray_group_emit_filter_t ray_group_emit_filter_get(void);
 void ray_group_emit_filter_set(ray_group_emit_filter_t filter);
diff --git a/src/ops/query.c b/src/ops/query.c
index 73036e28..d63c7a48 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -1729,22 +1729,39 @@ static bool match_group_desc_count_take(ray_t** dict_elems, int64_t dict_n,
                                         int64_t by_id, int64_t take_id,
                                         int64_t asc_id, int64_t desc_id,
                                         ray_group_emit_filter_t* out) {
+    /* Detects `(select … by … <asc:|desc:> AGGCOL take: N)` where AGGCOL
+     * is the name of an output agg col with op ∈ {COUNT, SUM, MIN, MAX}
+     * and N is a positive atom ≤ 1024.  Returns the filter pre-filled so
+     * the consumer (group/fused_group materialize) can heap-extract the
+     * top-N groups by AGGCOL.value before emitting rows.  AVG and
+     * higher-order aggs (STDDEV/VAR/PEARSON/MEDIAN) fall through — their
+     * ordering doesn't reduce to a single int64 row slot read.
+     *
+     * The 1024 cap matches the stack-resident heap budget shared by the
+     * three concrete consumer sites (mk_apply_count_emit_filter,
+     * v2_emit's per-partition compact, the n_keys>1 macro path).  Larger
+     * N drops through to the full sort + take so the heap doesn't
+     * overflow the stack. */
     ray_t* take_expr = NULL;
-    int64_t desc_name = -1;
+    int64_t order_name = -1;
+    uint8_t want_desc = 1;
+    bool seen_dir = false;
     for (int64_t i = 0; i + 1 < dict_n; i += 2) {
         int64_t kid = dict_elems[i]->i64;
         if (kid == take_id) take_expr = dict_elems[i + 1];
-        else if (kid == desc_id) {
+        else if (kid == desc_id || kid == asc_id) {
+            if (seen_dir) return false;  /* both asc: and desc: → ambiguous */
+            seen_dir = true;
             ray_t* v = dict_elems[i + 1];
             if (!v || v->type != -RAY_SYM) return false;
-            desc_name = v->i64;
-        } else if (kid == asc_id) {
-            return false;
+            order_name = v->i64;
+            want_desc = (kid == desc_id) ? 1 : 0;
         }
     }
     int64_t take_n = 0;
-    if (desc_name < 0 || !positive_take_i64(take_expr, &take_n))
+    if (order_name < 0 || !positive_take_i64(take_expr, &take_n))
         return false;
+    if (take_n > 1024) return false;
 
     uint8_t agg_index = 0;
     for (int64_t i = 0; i + 1 < dict_n; i += 2) {
@@ -1757,11 +1774,15 @@ static bool match_group_desc_count_take(ray_t** dict_elems, int64_t dict_n,
             continue;
         ray_t** ae = (ray_t**)ray_data(val);
         uint16_t op = resolve_agg_opcode(ae[0]->i64);
-        if (kid == desc_name && op == OP_COUNT) {
+        if (kid == order_name &&
+            (op == OP_COUNT || op == OP_SUM ||
+             op == OP_MIN   || op == OP_MAX)) {
             out->enabled = 1;
             out->agg_index = agg_index;
             out->min_count_exclusive = 0;
             out->top_count_take = take_n;
+            out->agg_op = op;
+            out->desc = want_desc;
             return true;
         }
         agg_index++;

From 9d974a057fbec82b8728d3bb6d0acbe9425402ff Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 18:11:25 +0200
Subject: [PATCH 17/36] perf(fused_group): gate hash-index dispatch on
 single-predicate filters

The hash-index point-lookup path is single-threaded; for multi-
predicate filters (e.g. a chunk-zone-clustered CounterID/EventDate
range combined with a hash-indexed eq on URLHash/RefererHash) the
parallel chunk-zone scan in mk_eq_i64_count_fn beats the chain walk.

Without the gate, q40/q41/q42 (multi-predicate filter + hash-indexed
column) regressed because the planner picked the hash path and lost
parallelism; with the gate q19's pure-eq point lookup still takes
the chain walk and queries with combined predicates stay on the
parallel scan.
---
 src/ops/fused_group.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index 91ac79cc..1dec5532 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -4187,8 +4187,17 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext,
     /* Hash-index probe: if any FP_EQ child sits on a column with a
      * fresh RAY_IDX_HASH, walk the chain instead of scanning rows.
      * Single-thread — match counts on a point lookup are too small
-     * to justify pool dispatch. */
-    int hash_eq_idx = mk_find_hash_eq_child(&ctx.pred);
+     * to justify pool dispatch.
+     *
+     * Multi-predicate filters fall through: queries that combine a
+     * hash-indexed eq with one or more other predicates (e.g. a
+     * chunk-zone-clustered CounterID/EventDate range) win more from
+     * the parallel chunk-skip scan in mk_eq_i64_count_fn /
+     * mk_par_fn than from a hash chain walk forced into single-
+     * threaded execution. */
+    int hash_eq_idx = (ctx.pred.n_children == 1)
+                          ? mk_find_hash_eq_child(&ctx.pred)
+                          : -1;
     if (hash_eq_idx >= 0 && ctx.n_aggs == 1 &&
         ctx.aggs[0].kind == MK_AGG_COUNT) {
         mk_eq_hash_count_fn(&ctx, (uint8_t)hash_eq_idx);

From c33d16391263ed4be0ec296d49e4652f59d105c6 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 18:22:05 +0200
Subject: [PATCH 18/36] perf(query): allow wide-key (>8B) group-by to fuse with
 any agg mix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The wide-key (total_bytes > 8, ≤ 16) gate for OP_FILTERED_GROUP was
artificially restricted to single-COUNT aggregations.  The executor
already handles wide keys for any of {COUNT, SUM, MIN, MAX, AVG} via
the kv_hi side array and mk_compose_key2 path that's been live since
the per-partition extension to SUM/AVG.

Lifting the n_aggs == 1 && count-only restriction routes 2-key
group-by queries with multi-agg (count + sum + avg pattern) through
the fused radix path instead of the generic exec_group, dropping a
12-byte WatchID+ClientIP group-by from ~1000 ms to ~355 ms.
---
 src/ops/query.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/ops/query.c b/src/ops/query.c
index d63c7a48..0a1bce9d 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -4272,13 +4272,8 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 /* Single-key case fits unconditionally (one key column, one
                  * slot).  Multi-key narrow path (≤ 8 bytes packed) uses a
                  * single int64 slot; the wide path (9..16 bytes) adds a
-                 * side kv_hi side array.  The wide path's extra hi compare
-                 * + extra memory traffic only pays back for single-COUNT
-                 * shapes (Q36, Q41); multi-agg high-card workloads (Q31,
-                 * Q32) regress against the regular FILTER+GROUP path, so
-                 * keep them on it. */
-                int wide_fits  = (total_bytes >  8 && total_bytes <= 16
-                                  && n_aggs_ok == 1 && has_only_count);
+                 * side kv_hi side array. */
+                int wide_fits  = (total_bytes >  8 && total_bytes <= 16);
                 int narrow_fits = (total_bytes <= 8);
                 int fits = (n_keys_local == 1) || narrow_fits || wide_fits;
                 if (keys_ok && fits) {

From 7da706d4e51fc69437da58b16555e69a75ae7e5e Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 19:11:08 +0200
Subject: [PATCH 19/36] =?UTF-8?q?feat(hll):=20streaming=20per-group=20HLL?=
 =?UTF-8?q?=20=E2=80=94=20skip=20idx=5Fbuf=20scatter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds ray_count_distinct_approx_pg_stream — a single-pass per-group HLL
that streams (row_gid[r], hash(val[r])) directly into n_groups sparse
sketches per worker, then merges worker banks element-wise (max) and
estimates per group.

The existing _pg_buf entry point required a (idx_buf + offsets +
counts) CSR scatter of row positions per group, costing 30 %+ of wall
time on q10 / q08 ClickBench while the HLL pass itself was 7 %.  The
stream form bypasses the CSR entirely; the HLL doesn't need rows in
group-major order, only the per-row (gid, value) pair.

Memory layout: each worker owns a contiguous slab holding n_groups
ray_hll_t headers, n_groups * RAY_HLL_SPARSE_CAP * 4-byte sparse keys,
and n_groups * (1<<p) bytes of dense regs — all from one scratch_alloc
per worker so the per-row hot loop is alloc-free.  Sparse-mode start
keeps per-group footprint at 1 KB until promotion; the pre-allocated
dense slot makes promotion a memset + replay.

Routing in ray_count_distinct_per_group: ahead of the existing
count_distinct_per_group_hll path, gate on a clean per-worker memory
budget (8 MB / 17 KB-per-group at P=14 → n_groups ≤ ~482) and a lower
bound of 16 groups (below which the bank-merge fixed cost dominates).
Above the budget, fall through to the CSR HLL path, which itself
falls through to the exact partitioned dedup.

q13 has ~100K groups → gates out, no change.  q10 / q08 reach this
path only once query.c routes low-cardinality per-group count-distinct
through ray_count_distinct_per_group; the kernel and wire-up are ready.

Tests: existing rfl suite still at 2818 PASS, plus a new C-level test
group_extra/count_distinct_pg_stream that validates a 2 M-row /
100-group / 1000-distinct-per-group invocation produces estimates
within 5 % of truth (HLL std error is ~0.8 % at P=14; the wider band
covers small-range bias-correction and per-worker merge slop).
---
 src/ops/group.c         |  40 ++++++
 src/ops/hll.c           | 261 ++++++++++++++++++++++++++++++++++++++++
 src/ops/hll.h           |  22 ++++
 test/test_group_extra.c |  71 +++++++++++
 4 files changed, 394 insertions(+)

diff --git a/src/ops/group.c b/src/ops/group.c
index 9ec773bc..f12456f8 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -1274,6 +1274,46 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid,
      * ~0.8 % std-error estimate; callers that need exact counts at
      * this scale must not hit this gate. */
     if (n_rows >= (1 << 20)) {
+        /* Streaming HLL: skip the (idx_buf + offsets + counts) CSR build
+         * by accumulating directly into n_groups sketches per worker in
+         * a single pass over (row_gid[r], val[r]).  The CSR build cost
+         * (two passes of int64 reads over n_rows) is ~30 % of wall time
+         * on q10/q08 ClickBench, while the HLL pass itself is ~7 %.
+         *
+         * Gated on a per-worker memory budget: each worker keeps a bank
+         * of n_groups sketches whose sparse + dense buffers come from
+         * one pre-allocated slab.  At P=14 that's ~17 KB per group;
+         * with the 8 MB-per-worker budget below, n_groups must be ≤
+         * 482 (at one worker) and shrinks pro-rata with worker count
+         * — i.e. the *total* concurrent footprint is bounded at
+         * n_workers * 8 MB ≤ ~64 MB on a 16-thread box.
+         *
+         * Lower bound (n_groups < 16) avoids the dispatch overhead of
+         * n_workers-fold bank merges when there's only a handful of
+         * groups — the CSR path's per-group task dispatch dominates
+         * there anyway, but the streaming bank merge has its own fixed
+         * cost.  Below the bound we fall through to the CSR HLL path. */
+        const size_t RAY_HLL_STREAM_BUDGET_PER_WORKER = (size_t)8 * 1024 * 1024;
+        /* Per-sketch slab footprint at the precision the kernel uses
+         * (RAY_HLL_DEFAULT_P → m = 16384).  sizeof(ray_hll_t) is small
+         * relative to the buffers; rounded into the count. */
+        size_t hll_per_group =
+            sizeof(ray_hll_t) +
+            RAY_HLL_SPARSE_CAP * sizeof(uint32_t) +
+            ((size_t)1u << RAY_HLL_DEFAULT_P);
+        bool stream_ok = (n_groups >= 16) &&
+                         ((size_t)n_groups * hll_per_group
+                          <= RAY_HLL_STREAM_BUDGET_PER_WORKER);
+        if (stream_ok) {
+            int rc = ray_count_distinct_approx_pg_stream(
+                src, row_gid, n_rows, n_groups,
+                RAY_HLL_DEFAULT_P, odata);
+            if (rc == 0) return out;
+            /* Streaming failed (OOM / unsupported type) — fall through
+             * to the CSR HLL path with odata still zeroed. */
+            memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
+        }
+
         ray_t* approx = count_distinct_per_group_hll(src, row_gid,
                                                      n_rows, n_groups, out);
         if (approx) return approx;
diff --git a/src/ops/hll.c b/src/ops/hll.c
index 07c600d6..ea2bc131 100644
--- a/src/ops/hll.c
+++ b/src/ops/hll.c
@@ -587,3 +587,264 @@ int ray_count_distinct_approx_pg_buf(ray_t* src,
     if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) return -1;
     return 0;
 }
+
+/* ---- Streaming per-group HLL ----------------------------------------- */
+
+/* Streaming kernel layout
+ * -----------------------
+ * Each worker owns a contiguous *bank* of n_groups HLL sketches.  Memory
+ * for a bank is one slab allocated up-front (sketches + sparse keys +
+ * dense regs) so the per-row hot loop is alloc-free.  Each sketch starts
+ * sparse; ray_hll_add transparently promotes to its caller-owned dense
+ * buffer once the sparse cap is exceeded.
+ *
+ * After the streaming pass, banks are merged element-wise (max) into
+ * bank[0] and the per-group estimates are written to out[gid].
+ */
+
+typedef struct {
+    /* Per-worker bank base pointers.  Each bank holds n_groups sketches
+     * whose `sparse_keys` / dense slots point into the per-worker pool. */
+    ray_hll_t**      banks;          /* [n_workers] */
+    /* Constant inputs. */
+    const ray_t*     vec;
+    const int64_t*   row_gid;
+    int64_t          n_rows;
+    int64_t          n_groups;
+    int8_t           type;
+    uint8_t          attrs;
+    bool             has_nulls;
+    uint8_t          p;
+    uint32_t         m;
+} cda_pg_stream_ctx_t;
+
+/* Worker per-row body — picks up the bank for this worker, decodes the
+ * column-type once into a local pointer, and updates bank[gid] for each
+ * row in the assigned range. */
+static void cda_pg_stream_task(void* raw, uint32_t worker_id,
+                               int64_t start, int64_t end) {
+    cda_pg_stream_ctx_t* c = (cda_pg_stream_ctx_t*)raw;
+    ray_hll_t* bank = c->banks[worker_id];
+    if (!bank) return;
+    const void*    base    = ray_data((ray_t*)c->vec);
+    const int64_t* row_gid = c->row_gid;
+    int64_t        ng      = c->n_groups;
+    int8_t         t       = c->type;
+    bool           hn      = c->has_nulls;
+    const int64_t  CHK     = 65535;
+
+    if (t == RAY_I64 || t == RAY_TIMESTAMP) {
+        const int64_t* d = (const int64_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int64_t gid = row_gid[r];
+            if (gid < 0 || gid >= ng) continue;
+            int64_t v = d[r];
+            if (hn && v == NULL_I64) continue;
+            ray_hll_add(&bank[gid], ray_hash_i64(v));
+        }
+    } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) {
+        const int32_t* d = (const int32_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int64_t gid = row_gid[r];
+            if (gid < 0 || gid >= ng) continue;
+            int32_t v = d[r];
+            if (hn && v == NULL_I32) continue;
+            ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v));
+        }
+    } else if (t == RAY_I16) {
+        const int16_t* d = (const int16_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int64_t gid = row_gid[r];
+            if (gid < 0 || gid >= ng) continue;
+            int16_t v = d[r];
+            if (hn && v == NULL_I16) continue;
+            ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v));
+        }
+    } else if (t == RAY_BOOL || t == RAY_U8) {
+        const uint8_t* d = (const uint8_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int64_t gid = row_gid[r];
+            if (gid < 0 || gid >= ng) continue;
+            ray_hll_add(&bank[gid], ray_hash_i64((int64_t)d[r]));
+        }
+    } else if (t == RAY_F64) {
+        const double* d = (const double*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int64_t gid = row_gid[r];
+            if (gid < 0 || gid >= ng) continue;
+            double v = d[r];
+            if (v != v) continue;
+            ray_hll_add(&bank[gid], ray_hash_f64(v));
+        }
+    } else if (RAY_IS_SYM(t)) {
+        uint8_t w = c->attrs & RAY_SYM_W_MASK;
+        if (w == RAY_SYM_W64) {
+            const int64_t* d = (const int64_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                int64_t gid = row_gid[r];
+                if (gid < 0 || gid >= ng) continue;
+                int64_t v = d[r]; if (v == 0) continue;
+                ray_hll_add(&bank[gid], ray_hash_i64(v));
+            }
+        } else if (w == RAY_SYM_W32) {
+            const uint32_t* d = (const uint32_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                int64_t gid = row_gid[r];
+                if (gid < 0 || gid >= ng) continue;
+                uint32_t v = d[r]; if (v == 0) continue;
+                ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v));
+            }
+        } else if (w == RAY_SYM_W16) {
+            const uint16_t* d = (const uint16_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                int64_t gid = row_gid[r];
+                if (gid < 0 || gid >= ng) continue;
+                uint16_t v = d[r]; if (v == 0) continue;
+                ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v));
+            }
+        } else {
+            const uint8_t* d = (const uint8_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                int64_t gid = row_gid[r];
+                if (gid < 0 || gid >= ng) continue;
+                uint8_t v = d[r]; if (v == 0) continue;
+                ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v));
+            }
+        }
+    }
+}
+
+int ray_count_distinct_approx_pg_stream(ray_t* src,
+                                         const int64_t* row_gid,
+                                         int64_t n_rows,
+                                         int64_t n_groups,
+                                         uint8_t p, int64_t* out)
+{
+    if (!src || RAY_IS_ERR(src) || !row_gid || !out) return -1;
+    if (n_rows <= 0 || n_groups <= 0) return -1;
+    int8_t t = src->type;
+    bool hashable = (t == RAY_I64 || t == RAY_I32 || t == RAY_I16 ||
+                      t == RAY_U8 || t == RAY_BOOL || t == RAY_F64 ||
+                      t == RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP ||
+                      RAY_IS_SYM(t));
+    if (!hashable) return -1;
+    if (p < 4) p = 4;
+    if (p > 14) p = 14;
+    uint32_t m = 1u << p;
+
+    /* Choose worker count from the existing parallel threshold; the pool
+     * dispatcher partitions n_rows into morsels across n_workers + main. */
+    ray_pool_t* pool = ray_pool_get();
+    uint32_t nw = (pool && n_rows >= RAY_PARALLEL_THRESHOLD)
+                  ? ray_pool_total_workers(pool) : 1;
+
+    /* Allocate per-worker banks.  One slab per worker: sketches array,
+     * then sparse-key pool (n_groups * RAY_HLL_SPARSE_CAP * 4 bytes),
+     * then dense-regs pool (n_groups * m bytes).  Pre-allocating dense
+     * means promotion in the hot loop is a memset + replay, alloc-free. */
+    ray_t* banks_hdr = NULL;
+    ray_hll_t** banks = (ray_hll_t**)scratch_calloc(
+        &banks_hdr, (size_t)nw * sizeof(ray_hll_t*));
+    if (!banks) return -1;
+
+    /* Per-worker scratch headers, freed at end. */
+    ray_t** slab_hdrs_array = NULL;
+    ray_t* slab_hdrs_hdr = NULL;
+    slab_hdrs_array = (ray_t**)scratch_calloc(
+        &slab_hdrs_hdr, (size_t)nw * sizeof(ray_t*));
+    if (!slab_hdrs_array) {
+        scratch_free(banks_hdr);
+        return -1;
+    }
+
+    size_t sketches_bytes = (size_t)n_groups * sizeof(ray_hll_t);
+    size_t sparse_bytes   = (size_t)n_groups *
+                             RAY_HLL_SPARSE_CAP * sizeof(uint32_t);
+    size_t dense_bytes    = (size_t)n_groups * (size_t)m;
+    size_t per_worker     = sketches_bytes + sparse_bytes + dense_bytes;
+
+    bool oom = false;
+    for (uint32_t w = 0; w < nw; w++) {
+        ray_t* slab_hdr = NULL;
+        uint8_t* slab = (uint8_t*)scratch_alloc(&slab_hdr, per_worker);
+        if (!slab) { oom = true; break; }
+        slab_hdrs_array[w] = slab_hdr;
+        ray_hll_t* sketches = (ray_hll_t*)slab;
+        uint32_t*  sparse   = (uint32_t*)(slab + sketches_bytes);
+        uint8_t*   dense    = slab + sketches_bytes + sparse_bytes;
+        /* Init each sketch sparse, pointed at its slice of the pools. */
+        for (int64_t g = 0; g < n_groups; g++) {
+            ray_hll_init_sparse(&sketches[g], p,
+                                sparse + (size_t)g * RAY_HLL_SPARSE_CAP,
+                                RAY_HLL_SPARSE_CAP,
+                                dense + (size_t)g * m);
+        }
+        banks[w] = sketches;
+    }
+    if (oom) {
+        for (uint32_t w = 0; w < nw; w++) {
+            if (slab_hdrs_array[w]) scratch_free(slab_hdrs_array[w]);
+        }
+        scratch_free(slab_hdrs_hdr);
+        scratch_free(banks_hdr);
+        return -1;
+    }
+
+    cda_pg_stream_ctx_t ctx = {
+        .banks    = banks,
+        .vec      = src,
+        .row_gid  = row_gid,
+        .n_rows   = n_rows,
+        .n_groups = n_groups,
+        .type     = t,
+        .attrs    = src->attrs,
+        .has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0,
+        .p        = p,
+        .m        = m,
+    };
+
+    if (nw > 1) {
+        ray_pool_dispatch(pool, cda_pg_stream_task, &ctx, n_rows);
+    } else {
+        cda_pg_stream_task(&ctx, 0, 0, n_rows);
+    }
+
+    /* Merge worker banks into bank[0], then estimate per group.
+     *
+     * Per gid: merge bank[1..nw-1][gid] into bank[0][gid].  ray_hll_merge
+     * handles both (sparse|dense) × (sparse|dense) combinations and
+     * promotes dst as needed.  After merge, bank[0][gid] estimate is the
+     * answer.  We merge gid-by-gid (rather than worker-by-worker over all
+     * gids) so a finished dst stays hot across estimation. */
+    for (int64_t g = 0; g < n_groups; g++) {
+        ray_hll_t* dst = &banks[0][g];
+        for (uint32_t w = 1; w < nw; w++) {
+            ray_hll_merge(dst, &banks[w][g]);
+        }
+        out[g] = ray_hll_estimate(dst);
+    }
+
+    /* Free per-worker slabs.  Caller-owned sparse + dense buffers were
+     * not separately allocated, so ray_hll_free is a no-op on each
+     * sketch (low-bit-tagged _hdr or NULL _hdr).  Promotion-time scratch
+     * allocations (when promote_to_dense needed an arena dense buf — only
+     * possible if the caller's tagged buf had been cleared, which doesn't
+     * happen here since dense was provided up-front) are owned by the
+     * sketch's _hdr; if any are present, ray_hll_free releases them. */
+    for (uint32_t w = 0; w < nw; w++) {
+        for (int64_t g = 0; g < n_groups; g++) ray_hll_free(&banks[w][g]);
+        scratch_free(slab_hdrs_array[w]);
+    }
+    scratch_free(slab_hdrs_hdr);
+    scratch_free(banks_hdr);
+    return 0;
+}
diff --git a/src/ops/hll.h b/src/ops/hll.h
index fd0f727f..b996d21b 100644
--- a/src/ops/hll.h
+++ b/src/ops/hll.h
@@ -195,4 +195,26 @@ int ray_count_distinct_approx_pg_buf(ray_t* src,
                                       int64_t n_groups,
                                       uint8_t p, int64_t* out);
 
+/* Streaming per-group HLL — single pass over (row_gid[r], hashes[r])
+ * directly accumulating into n_groups sketches per worker, skipping
+ * the (idx_buf + offsets + counts) CSR scatter that the _pg_buf entry
+ * point requires.  Each worker owns a private bank of n_groups sparse
+ * sketches; after the pass, banks are merged element-wise (max) into
+ * worker 0's bank and the estimates are written to out[gid].
+ *
+ * Memory: per worker = n_groups * (sparse_cap*4 + (1<<p)) bytes; at
+ * p=14 that's ~17 KB per group.  Caller must gate on a memory budget
+ * — this kernel does not validate `n_groups` against available memory.
+ *
+ * Supported types: BOOL / U8 / I16 / I32 / I64 / F64 / DATE / TIME /
+ * TIMESTAMP / SYM.  Returns 0 on success, -1 on unsupported type,
+ * OOM, or empty input.  Caller falls back to _pg_buf (which itself
+ * falls back to exact partitioned dedup) on failure. */
+int ray_count_distinct_approx_pg_stream(ray_t* src,
+                                         const int64_t* row_gid,
+                                         int64_t n_rows,
+                                         int64_t n_groups,
+                                         uint8_t p,
+                                         int64_t* out);
+
 #endif /* RAY_OPS_HLL_H */
diff --git a/test/test_group_extra.c b/test/test_group_extra.c
index 8d512596..05e0c06e 100644
--- a/test/test_group_extra.c
+++ b/test/test_group_extra.c
@@ -46,6 +46,7 @@
 #include "mem/heap.h"
 #include "ops/ops.h"
 #include "ops/internal.h"
+#include "ops/hll.h"
 #include "table/sym.h"
 #include <math.h>
 #include <string.h>
@@ -1257,6 +1258,75 @@ static test_result_t test_five_key_group_top_count_emit_filter(void) {
     PASS();
 }
 
+/* --------------------------------------------------------------------------
+ * Test 18: streaming per-group HLL — single-pass kernel
+ *
+ * Direct call to ray_count_distinct_approx_pg_stream with a small-group,
+ * large-row layout that gates into the streaming path: each worker owns
+ * a private bank of n_groups sketches and the kernel skips the
+ * (idx_buf + offsets + counts) CSR scatter that the buf-form entry point
+ * pays for upstream.
+ *
+ * Layout: n_rows = 2 M, n_groups = 100, val = i % 1000 within each group.
+ * Each row's gid = i % 100, val = (i / 100) % 1000.  Per-group distinct
+ * count is exactly 1000 (val cycles through 0..999 across 20000 rows per
+ * group, covering every value at least once).  HLL has ~0.8 % std error
+ * at P=14 → we accept estimates within 5 % to leave slack for the small-
+ * cardinality bias-correction tail.
+ *
+ * Verifies (a) the path returns a populated I64 output, (b) per-group
+ * counts are within 5 % of 1000, (c) no oom / dispatch failure.
+ * -------------------------------------------------------------------------- */
+static test_result_t test_count_distinct_pg_stream(void) {
+    ray_heap_init();
+    (void)ray_sym_init();
+
+    const int64_t NROWS = 2 * 1024 * 1024;   /* > 1 M HLL gate */
+    const int64_t NGROUPS = 100;             /* fits 8 MB-per-worker budget */
+    const int64_t DISTINCT_PER_GROUP = 1000;
+
+    ray_t* vec = ray_vec_new(RAY_I64, NROWS);
+    TEST_ASSERT_NOT_NULL(vec);
+    vec->len = NROWS;
+    int64_t* p = (int64_t*)ray_data(vec);
+    for (int64_t i = 0; i < NROWS; i++) p[i] = (i / NGROUPS) % DISTINCT_PER_GROUP;
+
+    ray_t* gids = ray_vec_new(RAY_I64, NROWS);
+    TEST_ASSERT_NOT_NULL(gids);
+    gids->len = NROWS;
+    int64_t* gp = (int64_t*)ray_data(gids);
+    for (int64_t i = 0; i < NROWS; i++) gp[i] = i % NGROUPS;
+
+    ray_t* out = ray_vec_new(RAY_I64, NGROUPS);
+    TEST_ASSERT_NOT_NULL(out);
+    out->len = NGROUPS;
+    int64_t* od = (int64_t*)ray_data(out);
+    memset(od, 0, (size_t)NGROUPS * sizeof(int64_t));
+
+    int rc = ray_count_distinct_approx_pg_stream(vec, gp, NROWS, NGROUPS,
+                                                  RAY_HLL_DEFAULT_P, od);
+    TEST_ASSERT_FMT(rc == 0, "stream returned %d", rc);
+
+    /* Each group has exactly 1000 distinct values.  Accept ±5 % drift
+     * (real HLL std error is ~0.8 % at P=14; the wider band covers the
+     * small-range bias-correction tail and the per-worker merge slop). */
+    for (int64_t g = 0; g < NGROUPS; g++) {
+        double err = fabs((double)od[g] - (double)DISTINCT_PER_GROUP) /
+                     (double)DISTINCT_PER_GROUP;
+        TEST_ASSERT_FMT(err <= 0.05,
+                        "group %lld: got %lld, expected ~%lld (err=%.3f)",
+                        (long long)g, (long long)od[g],
+                        (long long)DISTINCT_PER_GROUP, err);
+    }
+
+    ray_release(out);
+    ray_release(gids);
+    ray_release(vec);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
 /* --------------------------------------------------------------------------
  * Test registry
  * -------------------------------------------------------------------------- */
@@ -1279,5 +1349,6 @@ const test_entry_t group_extra_entries[] = {
     { "group_extra/i16_group_top_count_emit_filter", test_i16_group_top_count_emit_filter, NULL, NULL },
     { "group_extra/sym_group_top_count_emit_filter", test_sym_group_top_count_emit_filter, NULL, NULL },
     { "group_extra/five_key_group_top_count_emit_filter", test_five_key_group_top_count_emit_filter, NULL, NULL },
+    { "group_extra/count_distinct_pg_stream",      test_count_distinct_pg_stream,      NULL, NULL },
     { NULL, NULL, NULL, NULL },
 };

From 92ed6daf4309c8ad3f9b55ef2babceb6b4cc1c35 Mon Sep 17 00:00:00 2001
From: Hetoku <anton.kundenko@gmail.com>
Date: Tue, 26 May 2026 19:57:41 +0200
Subject: [PATCH 20/36] perf(query): evaluate by-expression keys under
 selection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the by-clause of a (select ...) query contains a non-trivial
computed expression (e.g. q42's '(xbar EventTime 60000000000)') and a
WHERE clause, the existing dict-by-eval path materialises that
expression over every input row before the WHERE has a chance to
shrink the working set.  For selective WHEREs that's mostly wasted
work — q42 evaluates xbar across 10M rows then discards 94% of them.

The existing prefilter (gated by match_group_desc_count_take's
top-N-by-agg shape) already addresses this for the common
'desc: count_col take: N' shape — but it materialises the entire
filtered table, including columns the rest of the (select ...) clause
will never touch.  On ClickBench's 100+ column 'hits' table that
full materialise dominated what was meant to be a cheap prefilter.

Two changes:

  1. Narrow projection: walk WHERE + all dict-val expressions to
     collect every source column the query actually references.
     Project the input table to just those columns (metadata-only,
     no row data copy) before feeding into the prefilter graph.
     For q39 this drops the prefilter's gather cost from ~100 cols
     × 600K rows down to ~5 cols × 600K rows.

  2. Skip ray_optimize on the prefilter sub-graph.  The optimizer's
     predicate_pushdown pass splits OP_AND into chained OP_FILTERs,
     each evaluating its conjunct as a separate parallel pass and
     materialising a per-conjunct bool vec.  For a 5-conjunct WHERE
     on 10M rows that's ~50MB of intermediate bool-vec writes.  The
     unsplit AND tree compiles into a single fused expression
     evaluator that runs all comparisons inline in one pass.

ClickBench 10M (REPS=7, min ms):
  q39 (5-key by-dict, computed by-val): 267 → 165  (-38%)
  q42, q40, q41, q36, q37, q38, q30, q35, q28: unchanged.

The matcher gate is kept as-is — q42's shape (asc on a by-key, not
an agg col) does not match, so q42 continues to flow through the
fused OP_FILTERED_GROUP path which is faster than any
prefilter-then-group split can match for that shape.
---
 src/ops/query.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 148 insertions(+), 3 deletions(-)

diff --git a/src/ops/query.c b/src/ops/query.c
index 0a1bce9d..eac6ed96 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -3625,6 +3625,89 @@ ray_t* ray_try_count_select_expr(ray_t* expr, int* handled) {
     return ray_i64(nrows);
 }
 
+/* Walk `expr` and collect column-name symbols (RAY_ATTR_NAME atoms that
+ * resolve to a real column in `tbl`).  Also follows the head of dotted
+ * names so a `Timestamp.date` reference contributes its base column.
+ * `out_syms` is treated as an append-only set (dedup against existing
+ * entries) up to `max_out`; returns the new count.  Used to determine
+ * the subset of input columns the rest of a (select …) clause actually
+ * touches, so a prefilter materialise can skip everything else. */
+static int collect_col_refs_set(ray_t* expr, ray_t* tbl,
+                                int64_t* out_syms, int max_out, int n) {
+    if (!expr || n >= max_out) return n;
+    if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME)) {
+        int64_t want = -1;
+        if (ray_table_get_col(tbl, expr->i64)) {
+            want = expr->i64;
+        } else if (ray_sym_is_dotted(expr->i64)) {
+            const int64_t* segs;
+            int nsegs = ray_sym_segs(expr->i64, &segs);
+            if (nsegs >= 1 && ray_table_get_col(tbl, segs[0])) want = segs[0];
+        }
+        if (want >= 0) {
+            for (int i = 0; i < n; i++) if (out_syms[i] == want) return n;
+            if (n < max_out) out_syms[n++] = want;
+        }
+        return n;
+    }
+    if (expr->type == RAY_LIST) {
+        ray_t** elems = (ray_t**)ray_data(expr);
+        int64_t cnt = ray_len(expr);
+        for (int64_t i = 0; i < cnt && n < max_out; i++)
+            n = collect_col_refs_set(elems[i], tbl, out_syms, max_out, n);
+        return n;
+    }
+    if (expr->type == RAY_DICT) {
+        DICT_VIEW_DECL(dv);
+        DICT_VIEW_OPEN(expr, dv);
+        if (DICT_VIEW_OVERFLOW(dv)) return n;
+        for (int64_t i = 0; i + 1 < dv_n && n < max_out; i += 2)
+            n = collect_col_refs_set(dv[i + 1], tbl, out_syms, max_out, n);
+        return n;
+    }
+    if (expr->type == RAY_SYM) {
+        /* Sym vector — each element is a column name (e.g. multi-col
+         * asc:/desc:/by: tuples).  Pull syms out at the storage width. */
+        const void* base = ray_data(expr);
+        int8_t  vt = expr->type;
+        uint8_t va = expr->attrs;
+        int64_t len = ray_len(expr);
+        for (int64_t i = 0; i < len && n < max_out; i++) {
+            int64_t s = ray_read_sym(base, i, vt, va);
+            if (ray_table_get_col(tbl, s)) {
+                int dup = 0;
+                for (int j = 0; j < n; j++) if (out_syms[j] == s) { dup = 1; break; }
+                if (!dup && n < max_out) out_syms[n++] = s;
+            }
+        }
+        return n;
+    }
+    return n;
+}
+
+/* Build a narrow projection of `src_tbl` containing only the columns in
+ * `keep_syms[0..n_keep)`, preserving the original column order.
+ * Schema/cols share the source vec/list headers (retain'd internally
+ * by ray_table_add_col); no row data is copied — projection is a
+ * metadata-only operation.  Returns an owned ray_t* or an error. */
+static ray_t* project_table_cols(ray_t* src_tbl, const int64_t* keep_syms,
+                                 int n_keep) {
+    ray_t* nt = ray_table_new(n_keep);
+    if (!nt || RAY_IS_ERR(nt)) return nt ? nt : ray_error("oom", NULL);
+    for (int i = 0; i < n_keep; i++) {
+        ray_t* col = ray_table_get_col(src_tbl, keep_syms[i]);
+        if (!col) { ray_release(nt); return ray_error("domain", NULL); }
+        ray_t* nt2 = ray_table_add_col(nt, keep_syms[i], col);
+        if (!nt2 || RAY_IS_ERR(nt2)) {
+            if (nt2 && nt2 != nt) ray_release(nt2);
+            else ray_release(nt);
+            return nt2 ? nt2 : ray_error("oom", NULL);
+        }
+        nt = nt2;
+    }
+    return nt;
+}
+
 ray_t* ray_select(ray_t** args, int64_t n) {
     if (n < 1) return ray_error("domain", NULL);
     ray_t* dict = args[0];
@@ -3972,23 +4055,85 @@ ray_t* ray_select(ray_t** args, int64_t n) {
             match_group_desc_count_take(dict_elems, dict_n, from_id, where_id,
                                         by_id, take_id, asc_id, desc_id,
                                         &prefilter_top_count);
+        /* Computed by-val + WHERE: eagerly evaluating a non-trivial
+         * group key (e.g. q42's `(xbar EventTime 60000000000)`) over
+         * every input row wastes work proportional to the WHERE's
+         * selectivity.  Project the input table down to just the
+         * columns the rest of the (select …) clause actually touches
+         * (WHERE refs, by-val refs, agg-input refs, sort-key refs),
+         * filter the narrow projection through WHERE once, then
+         * evaluate by-val expressions on the small dense result.  The
+         * downstream group/sort/take then sees a fully-filtered table
+         * — fewer rows, fewer columns, no per-row redundant work.
+         *
+         * Narrowing matters: for wide tables (ClickBench's `hits` has
+         * ~100 cols) materialising the full filtered table dominates
+         * what was meant to be a cheap prefilter (single-col filter
+         * is O(passing × esz), full filter is ~50× that).
+         *
+         * The matcher gate (top-N-by-agg) constrains where this fires
+         * to shapes where the prefilter's cost can be amortised — the
+         * downstream group materialisation and top-N extraction
+         * benefit from operating on a small filtered slice.  Broader
+         * shapes that already have an efficient fused-filter+group
+         * path (OP_FILTERED_GROUP) would lose more in the duplicated
+         * filter work than they'd save in the smaller by-val eval. */
         if (where_expr && prefilter_computed_by) {
-            ray_graph_t* fg = ray_graph_new(tbl);
+            int64_t keep_syms[256];
+            int n_keep = 0;
+            n_keep = collect_col_refs_set(where_expr, tbl,
+                                          keep_syms, 256, n_keep);
+            for (int64_t i = 0; i + 1 < dict_n && n_keep < 256; i += 2) {
+                int64_t kid = dict_elems[i]->i64;
+                if (kid == from_id || kid == where_id || kid == take_id ||
+                    kid == nearest_id) continue;
+                /* asc:/desc:/by: keep the value's referenced source cols
+                 * (the by-dict's dict val may be a computed expression
+                 * referencing other source cols, the asc/desc value is
+                 * a -RAY_SYM or RAY_SYM vec of source col names).  All
+                 * other entries are output cols — agg or non-agg
+                 * expressions whose refs we also need post-filter. */
+                n_keep = collect_col_refs_set(dict_elems[i + 1], tbl,
+                                              keep_syms, 256, n_keep);
+            }
+            int can_project = (n_keep > 0 && n_keep < 256 &&
+                               n_keep < ray_table_ncols(tbl));
+            ray_t* narrow_tbl = NULL;
+            if (can_project) {
+                narrow_tbl = project_table_cols(tbl, keep_syms, n_keep);
+                if (!narrow_tbl || RAY_IS_ERR(narrow_tbl)) {
+                    if (narrow_tbl) ray_release(narrow_tbl);
+                    narrow_tbl = NULL;
+                    can_project = 0;
+                }
+            }
+            ray_t* prefilter_input = can_project ? narrow_tbl : tbl;
+            ray_graph_t* fg = ray_graph_new(prefilter_input);
             if (!fg) {
+                if (narrow_tbl) ray_release(narrow_tbl);
                 ray_release(tbl);
                 return ray_error("oom", NULL);
             }
-            ray_op_t* froot = ray_const_table(fg, tbl);
+            ray_op_t* froot = ray_const_table(fg, prefilter_input);
             ray_op_t* pred = compile_expr_dag(fg, where_expr);
             if (!pred) {
                 ray_graph_free(fg);
+                if (narrow_tbl) ray_release(narrow_tbl);
                 ray_release(tbl);
                 return ray_error("domain", NULL);
             }
             froot = ray_filter(fg, froot, pred);
-            froot = ray_optimize(fg, froot);
+            /* Deliberately skip ray_optimize: its predicate pushdown
+             * pass splits OP_AND into chained OP_FILTERs, each
+             * materialising a per-conjunct bool vec and refining a
+             * rowsel.  For wide AND-of-comparison WHEREs that costs
+             * one parallel pass per conjunct (~50MB of intermediate
+             * bool-vec writes for q42's 5-clause WHERE on 10M rows).
+             * Single ray_filter with the unsplit AND-tree evaluates
+             * the whole predicate inline in one parallel pass. */
             ray_t* filtered = ray_execute(fg, froot);
             ray_graph_free(fg);
+            if (narrow_tbl) ray_release(narrow_tbl);
             if (!filtered || RAY_IS_ERR(filtered)) {
                 ray_release(tbl);
                 return filtered ? filtered : ray_error("domain", NULL);

From e11623a46b8aaf50a6cf00f30b857d7f24bef7ba Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 20:11:33 +0200
Subject: [PATCH 21/36] perf(query): route small-n_groups count-distinct
 through streaming HLL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

count_distinct_per_group_buf's caller has row_gid in scope but
historically only routed through it for the > 50 000 n_groups
branch.  The streaming HLL kernel (added in the previous commit)
runs one pass over (row_gid, hash(src[r])) into per-worker sparse
sketches — no idx_buf scatter required.

Gate: 16 ≤ n_groups ≤ 500 (memory budget keeps per-worker banks
roughly L2-resident), nrows ≥ 1<<20 (same threshold the global
HLL path uses), and src must be a flat (non-parted) hashable
type.  Falls through to the existing buf-form on type miss / OOM.
---
 src/ops/query.c | 39 +++++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/src/ops/query.c b/src/ops/query.c
index eac6ed96..ed2fc307 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -8097,6 +8097,31 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                             }
                         }
                         if (src_for_global) {
+                            /* Streaming per-group HLL: skips the idx_buf
+                             * scatter and re-walk by running one pass
+                             * over (row_gid, hash(src[r])).  Each worker
+                             * owns a private bank of n_groups sparse
+                             * sketches; gated by a memory budget so the
+                             * banks stay roughly L2-resident.  Falls
+                             * through to the buf-form on type miss / OOM. */
+                            if (n_groups >= 16 && n_groups <= 500
+                                && nrows >= (1 << 20)
+                                && !RAY_IS_PARTED(src_for_global->type)
+                                && src_for_global->type != RAY_MAPCOMMON)
+                            {
+                                ray_t* out_hll = ray_vec_new(RAY_I64, n_groups);
+                                if (out_hll && !RAY_IS_ERR(out_hll)) {
+                                    out_hll->len = n_groups;
+                                    int64_t* odata = (int64_t*)ray_data(out_hll);
+                                    if (ray_count_distinct_approx_pg_stream(
+                                            src_for_global, row_gid, nrows,
+                                            n_groups, 14, odata) == 0) {
+                                        col = out_hll;
+                                    } else {
+                                        ray_release(out_hll);
+                                    }
+                                }
+                            }
                             /* Path selection: global-hash kernel scales
                              * with n_rows (per-row probe of one shared
                              * hash table); per-group-slice scales with
@@ -8107,12 +8132,14 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                              * so keep them on the single-pass kernel and
                              * avoid slicing through the partition layout
                              * again. */
-                            if (n_groups <= 50000) {
-                                col = count_distinct_per_group_buf(
-                                    cd_inner, tbl, idx_buf, offsets, grp_cnt, n_groups);
-                            } else {
-                                col = ray_count_distinct_per_group(
-                                    src_for_global, row_gid, nrows, n_groups);
+                            if (!col) {
+                                if (n_groups <= 50000) {
+                                    col = count_distinct_per_group_buf(
+                                        cd_inner, tbl, idx_buf, offsets, grp_cnt, n_groups);
+                                } else {
+                                    col = ray_count_distinct_per_group(
+                                        src_for_global, row_gid, nrows, n_groups);
+                                }
                             }
                             /* col == NULL → unsupported type, fall through. */
                         }

From e178099541a1c34164b50f08e3268283ee134fd3 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 21:05:15 +0200
Subject: [PATCH 22/36] perf(fused_group): Misra-Gries top-K for I64 /
 TIMESTAMP key columns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The fp_try_direct_count1 fast path already handles top-K-by-count
shapes on BOOL/U8/I16/I32 (slot arrays) and on I32 with high
cardinality (Misra-Gries).  I64 (and TIMESTAMP) returned NULL —
the slot array would need 16 GB for the full domain.

Add an I64 mirror of fp_try_i32_mg_top_count.  Same algorithm:
8192-candidate Misra-Gries pass over the data, exact second pass
on survivors, top-N heap.  Safety bound (nrows / 8193) guards
against missing heavy hitters; on violation falls through to the
existing partition path.
---
 src/ops/fused_group.c | 183 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 183 insertions(+)

diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index 1dec5532..8ca1e0a9 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -1154,6 +1154,37 @@ static uint32_t fp_i32_hash_slot(int32_t key, uint32_t mask) {
     return (uint32_t)h & mask;
 }
 
+static uint32_t fp_i64_hash_slot(int64_t key, uint32_t mask) {
+    uint64_t h = (uint64_t)key * 0x9E3779B97F4A7C15ULL;
+    h ^= h >> 33;
+    h *= 0xC2B2AE3D27D4EB4FULL;
+    h ^= h >> 29;
+    return (uint32_t)h & mask;
+}
+
+static void fp_i64_mg_rebuild(const int64_t* keys, const uint32_t* counts,
+                              uint32_t n, uint32_t* ht, uint32_t hcap) {
+    memset(ht, 0, (size_t)hcap * sizeof(uint32_t));
+    uint32_t mask = hcap - 1;
+    for (uint32_t i = 0; i < n; i++) {
+        if (!counts[i]) continue;
+        uint32_t slot = fp_i64_hash_slot(keys[i], mask);
+        while (ht[slot]) slot = (slot + 1u) & mask;
+        ht[slot] = i + 1u;
+    }
+}
+
+static uint32_t fp_i64_mg_lookup(const int64_t* keys, const uint32_t* ht,
+                                 uint32_t hmask, int64_t key) {
+    uint32_t slot = fp_i64_hash_slot(key, hmask);
+    while (ht[slot]) {
+        uint32_t idx = ht[slot] - 1u;
+        if (keys[idx] == key) return idx + 1u;
+        slot = (slot + 1u) & hmask;
+    }
+    return 0;
+}
+
 static void fp_i32_mg_rebuild(const int32_t* keys, const uint32_t* counts,
                               uint32_t n, uint32_t* ht, uint32_t hcap) {
     memset(ht, 0, (size_t)hcap * sizeof(uint32_t));
@@ -1314,6 +1345,145 @@ static ray_t* fp_try_i32_mg_top_count(const fp_par_ctx_t* ctx, int64_t nrows,
     return result;
 }
 
+/* I64 mirror of fp_try_i32_mg_top_count for top-K-by-count over an
+ * I64 key column.  Misra-Gries with cap = 8192 candidates guarantees
+ * every key with count > nrows / 8193 survives the first pass; the
+ * second pass exact-counts the survivors and a min-heap picks the
+ * top K.  Falls back to NULL when the safety bound is violated, or
+ * when fewer than K candidates have non-zero exact counts. */
+static ray_t* fp_try_i64_mg_top_count(const fp_par_ctx_t* ctx, int64_t nrows,
+                                      int64_t key_sym,
+                                      ray_group_emit_filter_t emit_filter) {
+    if (ctx->kt != RAY_I64 || ctx->pred.n_children != 0 ||
+        emit_filter.top_count_take <= 0 || nrows <= 0)
+        return NULL;
+
+    const uint32_t cap = 8192;
+    const uint32_t hcap = cap * 2u;
+    const int64_t* data = (const int64_t*)ctx->kbase;
+    ray_t *keys_hdr = NULL, *cnt_hdr = NULL, *exact_hdr = NULL, *ht_hdr = NULL;
+    int64_t* keys = (int64_t*)scratch_alloc(&keys_hdr, cap * sizeof(int64_t));
+    uint32_t* counts = (uint32_t*)scratch_calloc(&cnt_hdr, cap * sizeof(uint32_t));
+    uint32_t* exact = (uint32_t*)scratch_calloc(&exact_hdr, cap * sizeof(uint32_t));
+    uint32_t* ht = (uint32_t*)scratch_calloc(&ht_hdr, hcap * sizeof(uint32_t));
+    if (!keys || !counts || !exact || !ht) {
+        if (keys_hdr) scratch_free(keys_hdr);
+        if (cnt_hdr) scratch_free(cnt_hdr);
+        if (exact_hdr) scratch_free(exact_hdr);
+        if (ht_hdr) scratch_free(ht_hdr);
+        return NULL;
+    }
+
+    uint32_t n = 0;
+    uint32_t decrements = 0;
+    uint32_t hmask = hcap - 1u;
+    for (int64_t r = 0; r < nrows; r++) {
+        int64_t key = data[r];
+        uint32_t found = fp_i64_mg_lookup(keys, ht, hmask, key);
+        if (found) {
+            counts[found - 1u]++;
+            continue;
+        }
+        if (n < cap) {
+            uint32_t idx = n++;
+            keys[idx] = key;
+            counts[idx] = 1;
+            uint32_t slot = fp_i64_hash_slot(key, hmask);
+            while (ht[slot]) slot = (slot + 1u) & hmask;
+            ht[slot] = idx + 1u;
+            continue;
+        }
+        uint32_t out = 0;
+        for (uint32_t i = 0; i < n; i++) {
+            uint32_t c = counts[i];
+            if (c > 1) {
+                counts[out] = c - 1u;
+                keys[out] = keys[i];
+                out++;
+            }
+        }
+        n = out;
+        decrements++;
+        fp_i64_mg_rebuild(keys, counts, n, ht, hcap);
+    }
+
+    memset(exact, 0, cap * sizeof(uint32_t));
+    for (int64_t r = 0; r < nrows; r++) {
+        uint32_t found = fp_i64_mg_lookup(keys, ht, hmask, data[r]);
+        if (found) exact[found - 1u]++;
+    }
+
+    int64_t k_take = emit_filter.top_count_take;
+    if (k_take > 1024) k_take = 1024;
+    int64_t heap[1024];
+    int64_t heap_n = 0;
+    uint32_t nonzero = 0;
+    for (uint32_t i = 0; i < n; i++) {
+        if (!exact[i]) continue;
+        nonzero++;
+        fp_count_heap_consider(heap, &heap_n, k_take, (int64_t)exact[i]);
+    }
+    if (heap_n == 0) {
+        scratch_free(keys_hdr); scratch_free(cnt_hdr);
+        scratch_free(exact_hdr); scratch_free(ht_hdr);
+        return NULL;
+    }
+    int64_t keep_min = emit_filter.min_count_exclusive + 1;
+    if (heap_n == k_take && heap[0] > keep_min)
+        keep_min = heap[0];
+
+    if (decrements && keep_min <= nrows / (int64_t)(cap + 1u)) {
+        scratch_free(keys_hdr); scratch_free(cnt_hdr);
+        scratch_free(exact_hdr); scratch_free(ht_hdr);
+        return NULL;
+    }
+
+    uint32_t out_n = 0;
+    for (uint32_t i = 0; i < n; i++)
+        if ((int64_t)exact[i] >= keep_min) out_n++;
+    if (!out_n || (decrements && nonzero < (uint32_t)k_take)) {
+        scratch_free(keys_hdr); scratch_free(cnt_hdr);
+        scratch_free(exact_hdr); scratch_free(ht_hdr);
+        return NULL;
+    }
+
+    ray_t* k_out = ray_vec_new(ctx->kt, out_n);
+    ray_t* c_out = ray_vec_new(RAY_I64, out_n);
+    if (!k_out || !c_out || RAY_IS_ERR(k_out) || RAY_IS_ERR(c_out)) {
+        if (k_out && !RAY_IS_ERR(k_out)) ray_release(k_out);
+        if (c_out && !RAY_IS_ERR(c_out)) ray_release(c_out);
+        scratch_free(keys_hdr); scratch_free(cnt_hdr);
+        scratch_free(exact_hdr); scratch_free(ht_hdr);
+        return ray_error("oom", NULL);
+    }
+    k_out->len = out_n;
+    c_out->len = out_n;
+    int64_t* kd = (int64_t*)ray_data(k_out);
+    int64_t* cd = (int64_t*)ray_data(c_out);
+    uint32_t oi = 0;
+    for (uint32_t i = 0; i < n; i++) {
+        if ((int64_t)exact[i] < keep_min) continue;
+        kd[oi] = keys[i];
+        cd[oi] = exact[i];
+        oi++;
+    }
+    scratch_free(keys_hdr); scratch_free(cnt_hdr);
+    scratch_free(exact_hdr); scratch_free(ht_hdr);
+
+    ray_t* result = ray_table_new(2);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(k_out);
+        ray_release(c_out);
+        return ray_error("oom", NULL);
+    }
+    int64_t cnt_sym = ray_sym_intern("count", 5);
+    result = ray_table_add_col(result, key_sym, k_out);
+    result = ray_table_add_col(result, cnt_sym, c_out);
+    ray_release(k_out);
+    ray_release(c_out);
+    return result;
+}
+
 static void fp_direct_count_fn(void* raw, uint32_t worker_id,
                                int64_t start, int64_t end) {
     fp_direct_count_ctx_t* c = (fp_direct_count_ctx_t*)raw;
@@ -1375,6 +1545,19 @@ static ray_t* fp_try_direct_count1(const fp_par_ctx_t* ctx, int64_t nrows,
             if (mg) return mg;
         }
         return NULL;
+    } else if (ctx->kt == RAY_I64 || ctx->kt == RAY_TIMESTAMP) {
+        /* I64/TIMESTAMP top-K via Misra-Gries.  The slot-array path
+         * for I32/I16/U8/BOOL would need 16 GB for the full I64
+         * domain; MG with cap = 8 K candidates costs ~256 KB and
+         * exact-counts the survivors in a second pass.  Falls back
+         * to the partition path when the safety bound is violated. */
+        ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get();
+        if (emit_filter.enabled && emit_filter.agg_index == 0 &&
+            emit_filter.top_count_take > 0) {
+            ray_t* mg = fp_try_i64_mg_top_count(ctx, nrows, key_sym, emit_filter);
+            if (mg) return mg;
+        }
+        return NULL;
     } else if (ctx->kt == RAY_SYM) {
         uint64_t max_key = 0;
         for (int64_t i = 0; i < nrows; i++) {

From e5639a1dc89a5bdf47b059ef96db0fcb051c5cd8 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 21:20:26 +0200
Subject: [PATCH 23/36] fix(fused_group): properly accept TIMESTAMP for I64 MG
 top-K

The kt-type check used a combined OR that conflated the type-mismatch
case with other guard conditions, rejecting TIMESTAMP columns that
the dispatcher above this fast path explicitly routes here.  Split
the type check from the other guards.
---
 src/ops/fused_group.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index 8ca1e0a9..75993924 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -1354,7 +1354,8 @@ static ray_t* fp_try_i32_mg_top_count(const fp_par_ctx_t* ctx, int64_t nrows,
 static ray_t* fp_try_i64_mg_top_count(const fp_par_ctx_t* ctx, int64_t nrows,
                                       int64_t key_sym,
                                       ray_group_emit_filter_t emit_filter) {
-    if (ctx->kt != RAY_I64 || ctx->pred.n_children != 0 ||
+    if (ctx->kt != RAY_I64 && ctx->kt != RAY_TIMESTAMP) return NULL;
+    if (ctx->pred.n_children != 0 ||
         emit_filter.top_count_take <= 0 || nrows <= 0)
         return NULL;
 

From 2702cb756f93caed2207cc1683a54376f40c176b Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Wed, 27 May 2026 11:59:38 +0200
Subject: [PATCH 24/36] perf(query): skip idx_buf scatter when streaming HLL
 covers all non-aggs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

count_distinct_per_group_buf reads idx_buf+offsets+counts only because
that's the layout the buf-form HLL kernel needs.  The streaming HLL
kernel (added earlier) walks (row_gid, hash(src[r])) directly with
zero scatter, so when every per-group count(distinct ...) qualifies
for the streaming gate (16 ≤ n_groups ≤ 500, hashable type, nrows ≥
1<<20) the scatter step is pure overhead.

Extend the needs_slice_idx guard to accept count-distinct shapes that
the downstream router will pick up via streaming, alongside the
existing simple_cd_global (n_groups > 50 000) case.  q10 drops
174→153 ms (-14%); other queries unaffected.
---
 src/ops/query.c | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/src/ops/query.c b/src/ops/query.c
index ed2fc307..063b8dab 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -7945,6 +7945,23 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                  *
                  * If any non-agg falls outside that, we still need the
                  * index. */
+                /* Decide whether we need to materialise the per-group
+                 * idx_buf scatter.  Two routes avoid it entirely:
+                 *
+                 *   - simple_cd_global: count(distinct col_ref) with
+                 *     n_groups > 50 000 — the high-card path walks
+                 *     row_gid directly.
+                 *   - cd_streaming: count(distinct col_ref) with a
+                 *     hashable column and 16 ≤ n_groups ≤ 500 — the
+                 *     streaming HLL kernel walks (row_gid, hash(src[r]))
+                 *     into per-worker sparse-sketch banks; no scatter
+                 *     needed.  Saves the ~10 % of q08/q10-class
+                 *     queries that idxbuf_scat + idxbuf_hist eats
+                 *     when the downstream HLL path doesn't read it.
+                 *
+                 * Either skips the scatter only when EVERY non-agg
+                 * qualifies — if any non-agg needs idx_buf the
+                 * scatter still has to run. */
                 int needs_slice_idx = 0;
                 for (uint8_t ni = 0; ni < n_nonaggs && !needs_slice_idx; ni++) {
                     ray_t* cd_inner = match_count_distinct(nonagg_exprs[ni]);
@@ -7952,7 +7969,24 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                                             cd_inner->type == -RAY_SYM &&
                                             (cd_inner->attrs & RAY_ATTR_NAME) &&
                                             n_groups > 50000);
-                    if (!simple_cd_global) needs_slice_idx = 1;
+                    int cd_streaming = 0;
+                    if (cd_inner && cd_inner->type == -RAY_SYM &&
+                        (cd_inner->attrs & RAY_ATTR_NAME) &&
+                        n_groups >= 16 && n_groups <= 500 &&
+                        nrows >= (1 << 20)) {
+                        ray_t* sc = ray_table_get_col(tbl, cd_inner->i64);
+                        if (sc && !RAY_IS_PARTED(sc->type) &&
+                            sc->type != RAY_MAPCOMMON) {
+                            int8_t st = sc->type;
+                            cd_streaming = (st == RAY_I64 || st == RAY_I32 ||
+                                            st == RAY_I16 || st == RAY_U8 ||
+                                            st == RAY_BOOL || st == RAY_F64 ||
+                                            st == RAY_DATE || st == RAY_TIME ||
+                                            st == RAY_TIMESTAMP ||
+                                            RAY_IS_SYM(st));
+                        }
+                    }
+                    if (!simple_cd_global && !cd_streaming) needs_slice_idx = 1;
                 }
 
                 int64_t* idx_buf = NULL;

From c0bdb641f4fd804e4d7b61423baa6dcfe87a05ad Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Wed, 27 May 2026 12:20:09 +0200
Subject: [PATCH 25/36] perf(query): drop redundant ray_heap_gc inside
 apply_sort_take
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The top-level statement runner (run_piped / repl) already calls
ray_heap_gc() at end of each statement.  The pair of inner GCs
inside apply_sort_take's no-sort + take branches were running an
extra full GC pass per query in benchmark loops, costing ~2.5 ms
on every query that takes this code path.

Removing them defers cleanup by exactly one call site — the next
top-level GC catches the freed intermediates.  q40 drops 19 → 14.5
ms (-23 %); all other queries unchanged within noise.
---
 src/ops/query.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/ops/query.c b/src/ops/query.c
index 063b8dab..37598537 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -425,14 +425,17 @@ static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n,
             rng->len = 2;
             ray_t* sliced = ray_take_fn(result, rng);
             ray_release(result);
-            ray_heap_gc();
+            /* No explicit GC here — every top-level statement (run_piped
+             * / repl) finishes with a ray_heap_gc() that catches the
+             * freed intermediates anyway.  The inner call was double-
+             * counting on benchmark loops where the same query runs
+             * back-to-back. */
             ray_release(rng);
             return sliced;
         }
         if (ray_is_vec(tv) && (tv->type == RAY_I64 || tv->type == RAY_I32) && tv->len == 2) {
             ray_t* sliced = ray_take_fn(result, tv);
             ray_release(result);
-            ray_heap_gc();
             ray_release(tv);
             return sliced;
         }

From 531b1119a494000b1e220f4857fbdd2fc5270388 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Wed, 27 May 2026 12:24:57 +0200
Subject: [PATCH 26/36] perf(query): drop redundant ray_heap_gc inside
 apply_sort_take topk path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same pattern as the earlier no-sort+take path: the explicit GC
after ray_topk_table_multi was duplicating the top-level statement
runner's GC, costing ~2.5 ms per matching query.

q40 drops 14.5 → 6.2 ms (-57 %), q42 drops 41 → 27 ms (-34 %),
q38 drops 22 → 8 ms (-64 %, flips to win over duck 17 ms).
All other queries unchanged within noise.
---
 src/ops/query.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/ops/query.c b/src/ops/query.c
index 37598537..73aaaf48 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -534,7 +534,9 @@ static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n,
                         }
                         if (topk && !RAY_IS_ERR(topk)) {
                             ray_release(result);
-                            ray_heap_gc();
+                            /* No explicit GC — the top-level statement
+                             * runner's ray_heap_gc() reclaims the freed
+                             * intermediates one call later. */
                             return topk;
                         }
                         if (topk && RAY_IS_ERR(topk)) ray_release(topk);

From 2585aee791bd3d505f83c177764bda521c290d25 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Wed, 27 May 2026 12:33:11 +0200
Subject: [PATCH 27/36] perf: drop 3 more redundant ray_heap_gc calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same pattern: explicit GC calls inside exec_group / exec_sort+head /
exec_filter+head that the top-level statement runner's GC catches
one call later.

Major wall-time wins on the high-cardinality group-by family:
  q11: 301 → 231 ms (-23%)
  q17: 346 → 273 ms (-21%)
  q15: 146 → 129 ms (-12%)
  q39: 186 → 164 ms (-12%)
  q08: 188 → 179 ms (-5%)
Plus the closer flips into reach: q40 still 6 ms vs duck 4.
---
 src/ops/exec.c  | 4 ++--
 src/ops/group.c | 7 +++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/ops/exec.c b/src/ops/exec.c
index 1593aa0a..efa90cf8 100644
--- a/src/ops/exec.c
+++ b/src/ops/exec.c
@@ -1441,7 +1441,7 @@ static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) {
                 }
                 ray_t* result = exec_sort(g, child_op, tbl, n);
                 if (sort_input != g->table) ray_release(sort_input);
-                if (result && !RAY_IS_ERR(result)) ray_heap_gc();
+                /* Top-level statement GC catches intermediates. */
                 return result;
             }
 
@@ -1510,7 +1510,7 @@ static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) {
                 ray_release(pred);
                 if (filter_input != saved_table)
                     ray_release(filter_input);
-                if (result && !RAY_IS_ERR(result)) ray_heap_gc();
+                /* Top-level statement GC catches intermediates. */
                 return result;
             } else {
                 input = exec_node(g, op->inputs[0]);
diff --git a/src/ops/group.c b/src/ops/group.c
index f12456f8..d5253175 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -8040,7 +8040,7 @@ v2_done:;
             scratch_free(radix_bufs_hdr);
             radix_bufs = NULL;
             radix_bufs_hdr = NULL;
-            ray_heap_gc();
+            /* No explicit GC — top-level statement GC catches it. */
         }
 
 v2_emit:;
@@ -9019,7 +9019,10 @@ sequential_fallback:;
         if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]);
     if (match_idx_block) ray_release(match_idx_block);
 
-    ray_heap_gc();
+    /* No explicit GC — top-level statement runner (run_piped / repl)
+     * calls ray_heap_gc() once per statement, catching every
+     * intermediate freed above.  The duplicate inner call doubled the
+     * per-query GC cost on bench loops. */
 
     return result;
 }

From c6522d4b055252355b4c2ebb396ce15e92a8d7b0 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Wed, 27 May 2026 12:46:22 +0200
Subject: [PATCH 28/36] perf(heap): skip empty freelist orders in ray_heap_gc
 pass 5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use each heap's avail bitmap (set on freelist insert, cleared on
remove) to short-circuit the per-order walk in the page-release
pass.  When no order ≥ 13 has any free block (tiny-query, GC fires
before any large allocation has been freed) the entire pass exits
without entering the loop body.

State-based, not constant-tuned: when there are free large blocks,
all of them are still released; when there aren't, nothing changes.
---
 src/mem/heap.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/mem/heap.c b/src/mem/heap.c
index d8ee3f29..d73c5780 100644
--- a/src/mem/heap.c
+++ b/src/mem/heap.c
@@ -1473,11 +1473,21 @@ void ray_heap_gc(void) {
         /* Pass 5: Release physical pages from free blocks in every
          * idle heap.  Pass 2 may have returned blocks to worker-owned
          * freelists; releasing only the caller heap leaves those worker
-         * pages resident across large query repetitions. */
+         * pages resident across large query repetitions.
+         *
+         * Use each heap's avail bitmap (set on insert, cleared on
+         * remove) to skip the entire walk when no order >= 13 has any
+         * free block.  Tiny-query workloads — where the per-statement
+         * GC fires before any large allocation has been freed —
+         * complete pass 5 without entering the body. */
+        uint64_t large_orders_mask = ~((1ULL << 13) - 1);
         for (int hid = 0; hid < RAY_HEAP_REGISTRY_SIZE; hid++) {
             ray_heap_t* gh = ray_heap_registry[hid];
             if (!gh) continue;
+            uint64_t avail = gh->avail & large_orders_mask;
+            if (!avail) continue;
             for (int i = 13; i < RAY_HEAP_FL_SIZE; i++) {
+                if (!(avail & (1ULL << i))) continue;
                 ray_fl_head_t* head = &gh->freelist[i];
                 ray_t* blk = head->fl_next;
                 while (blk != (ray_t*)head) {

From b711b435161ceb4a94c8b6df69dbc4f282e0d585 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Wed, 27 May 2026 12:50:40 +0200
Subject: [PATCH 29/36] perf(heap): skip empty freelist orders in pass 2
 foreign-block return

Same avail-bitmap trick as pass 5: when no orders have entries,
short-circuit; per-order, skip empty freelists.  Pass 2's per-block
foreign-vs-local check still runs over occupied freelists, so the
saving is bounded but real on tiny queries where the freelist is
mostly empty.
---
 src/mem/heap.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/mem/heap.c b/src/mem/heap.c
index d73c5780..231f3751 100644
--- a/src/mem/heap.c
+++ b/src/mem/heap.c
@@ -1262,7 +1262,11 @@ void ray_heap_destroy(void) {
  * -------------------------------------------------------------------------- */
 
 static void heap_return_foreign_freelist(ray_heap_t* h) {
+    /* avail bit (set on insert, cleared on remove) tells us which
+     * freelist orders have any blocks at all — skip the empty ones. */
+    if (!h->avail) return;
     for (int order = RAY_ORDER_MIN; order < RAY_HEAP_FL_SIZE; order++) {
+        if (!(h->avail & (1ULL << order))) continue;
         ray_fl_head_t* head = &h->freelist[order];
         ray_t* blk = head->fl_next;
         while (blk != (ray_t*)head) {

From 0db6f8fe41a41c5d90a3ec087eccc36edeab31a7 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Wed, 27 May 2026 16:04:23 +0200
Subject: [PATCH 30/36] =?UTF-8?q?perf(fused=5Fgroup):=20WIP=20v2=20?=
 =?UTF-8?q?=E2=80=94=20per-(worker,=20partition)=20shards=20for=20multi-ke?=
 =?UTF-8?q?y?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirrors the radix_v2 design from group.c into exec_filtered_group_multi.
Each row is hashed once, routed by RADIX_PART(h) high-bits to one of
MK_RADIX_P=32 per-(worker, partition) shards instead of a single fat
per-worker shard.  Each small shard stays cache-resident; combine
re-uses mk_combine_parallel with nw_effective = nw * MK_RADIX_P.

Gate: COUNT/SUM/AVG aggs, non-nullable agg input, non-SYM keys, n_keys ≥ 2,
no eq_i64/hash-eq shortcut already firing.  Single-key and SYM-key
queries continue through the existing mk_par_fn path.

ClickBench 10M-row deltas (min-of-3, vs perf/clickbench-improvements head):
  q32  1028 → 960  ms  (−68, target cluster)
  q11   234 → 214  ms  (−20)
  q13   492 → 483  ms  (−9)
  q10   140 → 126  ms  (−14)
  q08   191 → 182  ms  (−9)
  q31   296 → 292  ms  (−4)

WIP because:
- Combine still uses mk_combine_parallel with nw_effective=256; a proper
  per-partition combine (mirror radix_v2_phase2_fn) would skip the
  histogram/scatter overhead since shards are already partitioned.
- MIN/MAX aggs untested — state-merge semantics for parallel combine
  not yet validated.
- Heuristics (gate, MK_RADIX_BITS, init_cap) tuned empirically on
  the bench; not necessarily optimal for all input shapes.
---
 src/ops/fused_group.c | 284 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 264 insertions(+), 20 deletions(-)

diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index 75993924..0bfa7f6f 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -2477,6 +2477,16 @@ static ray_t* exec_filtered_group_count1(ray_graph_t* g, ray_op_ext_t* ext,
 #define FP_MAX_AGGS 8
 #define FP_MAX_KEYS 16
 
+/* v2 path: per-(worker, partition) hash tables.  Each worker hashes its
+ * rows once and routes by RADIX_PART(h) to one of MK_RADIX_P small
+ * shards rather than a single fat per-worker shard.  Smaller shards stay
+ * cache-resident; the merge step is per-partition and trivially parallel.
+ * Mirrors the design in group.c (radix_v2_phase1_fn / _phase2_fn). */
+#define MK_RADIX_BITS 5
+#define MK_RADIX_P    (1u << MK_RADIX_BITS)
+#define MK_RADIX_MASK (MK_RADIX_P - 1u)
+#define MK_RADIX_PART(h) (((uint32_t)((h) >> 16)) & MK_RADIX_MASK)
+
 typedef enum {
     MK_AGG_COUNT = 0,
     MK_AGG_SUM   = 1,
@@ -2546,7 +2556,8 @@ typedef struct {
     uint8_t     total_state;
     uint8_t     wide;        /* 1 when total_bytes > 8 (uses kv_hi side array) */
     /* Cool fields (only touched once per dispatch or in cold paths). */
-    mk_shard_t* shards;
+    mk_shard_t* shards;       /* v1: [n_workers] single shard per worker */
+    mk_shard_t* wpart_shards; /* v2: [n_workers * MK_RADIX_P] partitioned */
     uint64_t    init_cap;
     _Atomic(uint32_t) oom;
     mk_key_t    keys[FP_MAX_KEYS];
@@ -3158,6 +3169,193 @@ static void mk_eq_i64_count_fn(void* raw, uint32_t worker_id,
     }
 }
 
+/* ─── v2 worker fn — per-(worker, partition) shards ─────────────────
+ *
+ * Like mk_par_fn but routes every passing row by RADIX_PART(hash) into
+ * one of MK_RADIX_P small per-(worker, partition) shards.  Each small
+ * shard stays cache-resident as it fills, so the probe never walks a
+ * 5–10 MB monolithic per-worker shard.  Pass-1 (probe) and pass-2
+ * (agg update) are fused per-row here: any partition may grow on any
+ * row, so a deferred pass-2 over recorded slot indexes would dereference
+ * stale slots after a rehash.  Combine merges per partition. */
+static inline void mk_v2_apply_agg_inline(mk_par_ctx_t* c, int64_t* state_slot,
+                                          int64_t source_row,
+                                          uint8_t n_aggs, uint8_t total_state)
+{
+    (void)total_state;
+    for (uint8_t a = 0; a < n_aggs; a++) {
+        const mk_agg_t* ag = &c->aggs[a];
+        uint8_t off = ag->state_off;
+        switch (ag->kind) {
+        case MK_AGG_COUNT:
+            state_slot[off]++;
+            break;
+        case MK_AGG_SUM: {
+            int64_t v = mk_read_agg_i64(ag, source_row);
+            state_slot[off] += v;
+            break;
+        }
+        case MK_AGG_MIN: {
+            int64_t v = mk_read_agg_i64(ag, source_row);
+            if (v < state_slot[off]) state_slot[off] = v;
+            break;
+        }
+        case MK_AGG_MAX: {
+            int64_t v = mk_read_agg_i64(ag, source_row);
+            if (v > state_slot[off]) state_slot[off] = v;
+            break;
+        }
+        case MK_AGG_AVG: {
+            int64_t v = mk_read_agg_i64(ag, source_row);
+            state_slot[off    ] += v;
+            state_slot[off + 1] += 1;
+            break;
+        }
+        }
+    }
+}
+
+static void mk_par_v2_fn(void* raw, uint32_t worker_id,
+                         int64_t start, int64_t end)
+{
+    mk_par_ctx_t* c = (mk_par_ctx_t*)raw;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    uint8_t wide        = c->wide;
+    uint8_t total_state = c->total_state;
+    uint8_t n_aggs      = c->n_aggs;
+    mk_shard_t* my_shards = &c->wpart_shards[(size_t)worker_id * MK_RADIX_P];
+
+    int64_t row = start;
+    while (row < end) {
+        int64_t mend = row + RAY_MORSEL_ELEMS;
+        if (mend > end) mend = end;
+        int64_t mlen = mend - row;
+        uint8_t bits[RAY_MORSEL_ELEMS];
+        fp_eval_pred(&c->pred, row, mend, bits);
+
+        int match_count = 0;
+        for (int64_t r = 0; r < mlen; r++) match_count += bits[r];
+        if (match_count == 0) { row = mend; continue; }
+        int64_t base_row = row;
+
+        if (!wide) {
+            for (int64_t r = 0; r < mlen; r++) {
+                if (!bits[r]) continue;
+                int64_t source_row = base_row + r;
+                int64_t kv = mk_compose_key(c, source_row);
+                uint64_t h = (uint64_t)kv * 0x9E3779B97F4A7C15ULL;
+                h ^= h >> 33;
+                uint32_t p = MK_RADIX_PART(h);
+                mk_shard_t* sh = &my_shards[p];
+                if (!sh->slots) {
+                    if (mk_shard_init(sh, c->init_cap, total_state, wide) != 0) {
+                        atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                        return;
+                    }
+                }
+                if (sh->n_filled + 1 > (int64_t)(sh->cap / 2)) {
+                    if (mk_shard_grow(sh, total_state, wide) != 0) {
+                        atomic_store_explicit(&c->oom, 1,
+                                              memory_order_relaxed);
+                        return;
+                    }
+                }
+                int64_t* slots = sh->slots;
+                int64_t* state = sh->state;
+                uint64_t mask  = sh->mask;
+                uint64_t s = h & mask;
+                for (;;) {
+                    if (!slots[s * 2]) {
+                        slots[s * 2]     = 1;
+                        slots[s * 2 + 1] = kv;
+                        int64_t* st = &state[s * total_state];
+                        for (uint8_t a = 0; a < n_aggs; a++) {
+                            const mk_agg_t* ag = &c->aggs[a];
+                            switch (ag->kind) {
+                            case MK_AGG_COUNT:
+                            case MK_AGG_SUM:
+                                st[ag->state_off] = 0; break;
+                            case MK_AGG_MIN:
+                                st[ag->state_off] = INT64_MAX; break;
+                            case MK_AGG_MAX:
+                                st[ag->state_off] = INT64_MIN; break;
+                            case MK_AGG_AVG:
+                                st[ag->state_off    ] = 0;
+                                st[ag->state_off + 1] = 0; break;
+                            }
+                        }
+                        sh->n_filled++;
+                        break;
+                    }
+                    if (slots[s * 2 + 1] == kv) break;
+                    s = (s + 1) & mask;
+                }
+                mk_v2_apply_agg_inline(c, &state[s * total_state],
+                                       source_row, n_aggs, total_state);
+            }
+        } else {
+            for (int64_t r = 0; r < mlen; r++) {
+                if (!bits[r]) continue;
+                int64_t source_row = base_row + r;
+                int64_t kv_lo, kv_hi;
+                mk_compose_key2(c, source_row, &kv_lo, &kv_hi);
+                uint64_t h = mk_hash_lo_hi(kv_lo, kv_hi);
+                uint32_t p = MK_RADIX_PART(h);
+                mk_shard_t* sh = &my_shards[p];
+                if (!sh->slots) {
+                    if (mk_shard_init(sh, c->init_cap, total_state, wide) != 0) {
+                        atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                        return;
+                    }
+                }
+                if (sh->n_filled + 1 > (int64_t)(sh->cap / 2)) {
+                    if (mk_shard_grow(sh, total_state, wide) != 0) {
+                        atomic_store_explicit(&c->oom, 1,
+                                              memory_order_relaxed);
+                        return;
+                    }
+                }
+                int64_t* slots = sh->slots;
+                int64_t* slots_hi = sh->slots_hi;
+                int64_t* state = sh->state;
+                uint64_t mask  = sh->mask;
+                uint64_t s = h & mask;
+                for (;;) {
+                    if (!slots[s * 2]) {
+                        slots[s * 2]     = 1;
+                        slots[s * 2 + 1] = kv_lo;
+                        slots_hi[s]      = kv_hi;
+                        int64_t* st = &state[s * total_state];
+                        for (uint8_t a = 0; a < n_aggs; a++) {
+                            const mk_agg_t* ag = &c->aggs[a];
+                            switch (ag->kind) {
+                            case MK_AGG_COUNT:
+                            case MK_AGG_SUM:
+                                st[ag->state_off] = 0; break;
+                            case MK_AGG_MIN:
+                                st[ag->state_off] = INT64_MAX; break;
+                            case MK_AGG_MAX:
+                                st[ag->state_off] = INT64_MIN; break;
+                            case MK_AGG_AVG:
+                                st[ag->state_off    ] = 0;
+                                st[ag->state_off + 1] = 0; break;
+                            }
+                        }
+                        sh->n_filled++;
+                        break;
+                    }
+                    if (slots[s * 2 + 1] == kv_lo && slots_hi[s] == kv_hi) break;
+                    s = (s + 1) & mask;
+                }
+                mk_v2_apply_agg_inline(c, &state[s * total_state],
+                                       source_row, n_aggs, total_state);
+            }
+        }
+
+        row = mend;
+    }
+}
+
 /* ─── Worker fn — chunked vectorised aggregate update ───────────────
  *
  * Per morsel we run two passes:
@@ -4346,22 +4544,6 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext,
     atomic_store_explicit(&ctx.oom, 0, memory_order_relaxed);
     ray_pool_t* pool = ray_pool_get();
     uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    /* Pre-size each worker shard a bit larger than the 1024-slot default
-     * so high-cardinality queries don't pay log2(target/1024) rehashes.
-     * The cap stays modest (16 K slots ≈ ~750 KB per shard with a 4-slot
-     * agg state) so very selective predicates that produce a handful of
-     * groups don't burn RAM up front.  Sparse keys still grow on-demand. */
-    {
-        uint64_t expected = (uint64_t)nrows / ((uint64_t)nw * 16u);
-        uint64_t init_cap = FP_SHARD_INIT_CAP;
-        while (init_cap < expected * 2u && init_cap < (1ULL << 14))
-            init_cap <<= 1;
-        ctx.init_cap = init_cap;
-    }
-    ray_t* shards_hdr = NULL;
-    ctx.shards = (mk_shard_t*)scratch_calloc(&shards_hdr,
-                                             (size_t)nw * sizeof(mk_shard_t));
-    if (!ctx.shards) return ray_error("oom", NULL);
 
     int eq_i64_idx = -1;
     if (ctx.n_aggs == 1 && ctx.aggs[0].kind == MK_AGG_COUNT &&
@@ -4382,6 +4564,61 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext,
     int hash_eq_idx = (ctx.pred.n_children == 1)
                           ? mk_find_hash_eq_child(&ctx.pred)
                           : -1;
+
+    /* v2 gate: pre-partitioned shards win on high-cardinality multi-key
+     * group-bys (q30/q31/q32 family) by keeping each per-(worker,
+     * partition) shard cache-resident.  Exclude shapes where v1's
+     * existing fast paths already win:
+     *   - hash-eq or eq_i64 chunk-skip scans (single-shard inserts)
+     *   - n_aggs == 0 (degenerate)
+     *   - n_keys == 1: v1's hot k0_base path is already L1-friendly
+     *   - SYM keys: existing tuned SYM path beats v2 (q33/q34)
+     *   - nullable agg input: v1's existing nullmask path; v2 does not
+     *     yet track per-agg null counts during merge
+     * Multi-key with COUNT/SUM/AVG aggs (no MIN/MAX): the v2 partition
+     * shards cleanly merge by summing state slots. */
+    bool v2_ok = (hash_eq_idx < 0 && eq_i64_idx < 0 &&
+                  ctx.n_aggs >= 1 && ctx.n_keys >= 2);
+    for (uint8_t k = 0; k < ctx.n_keys && v2_ok; k++) {
+        if (ctx.keys[k].type == RAY_SYM) v2_ok = false;
+    }
+    for (uint8_t a = 0; a < ctx.n_aggs && v2_ok; a++) {
+        mk_agg_kind_t kk = ctx.aggs[a].kind;
+        if (kk != MK_AGG_COUNT && kk != MK_AGG_SUM && kk != MK_AGG_AVG) {
+            v2_ok = false;
+        }
+        if (ctx.aggs[a].in_attrs & RAY_ATTR_HAS_NULLS) v2_ok = false;
+    }
+
+    /* Init capacity per shard.
+     * v1 (single shard per worker): pre-size to a fraction of nrows so
+     * high-cardinality scans pay fewer rehashes.
+     * v2 (MK_RADIX_P shards per worker): each partition holds ~1/256 of
+     * the worker's groups.  Start at 256 slots — matches group.c v2's
+     * design (~64 KB per partition with a 4-slot agg state) and keeps
+     * the upfront allocation total to a few MB instead of tens of MB.
+     * Sparse keys still grow on-demand. */
+    if (v2_ok) {
+        ctx.init_cap = 256;
+    } else {
+        uint64_t expected = (uint64_t)nrows / ((uint64_t)nw * 16u);
+        uint64_t init_cap = FP_SHARD_INIT_CAP;
+        while (init_cap < expected * 2u && init_cap < (1ULL << 14))
+            init_cap <<= 1;
+        ctx.init_cap = init_cap;
+    }
+
+    /* Allocate the shard array.  v2 uses nw * MK_RADIX_P slots, all
+     * stored in the same array — combine_and_materialize iterates
+     * `nw_effective` shards, which equals nw for v1 and nw * MK_RADIX_P
+     * for v2.  Both layouts use the same mk_shard_t per slot. */
+    uint32_t nw_effective = v2_ok ? (nw * MK_RADIX_P) : nw;
+    ray_t* shards_hdr = NULL;
+    ctx.shards = (mk_shard_t*)scratch_calloc(
+        &shards_hdr, (size_t)nw_effective * sizeof(mk_shard_t));
+    if (!ctx.shards) return ray_error("oom", NULL);
+    if (v2_ok) ctx.wpart_shards = ctx.shards;
+
     if (hash_eq_idx >= 0 && ctx.n_aggs == 1 &&
         ctx.aggs[0].kind == MK_AGG_COUNT) {
         mk_eq_hash_count_fn(&ctx, (uint8_t)hash_eq_idx);
@@ -4394,6 +4631,10 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext,
         };
         if (pool) ray_pool_dispatch(pool, mk_eq_i64_count_fn, &fctx, nrows);
         else      mk_eq_i64_count_fn(&fctx, 0, 0, nrows);
+    } else if (v2_ok && pool) {
+        ray_pool_dispatch(pool, mk_par_v2_fn, &ctx, nrows);
+    } else if (v2_ok) {
+        mk_par_v2_fn(&ctx, 0, 0, nrows);
     } else if (pool) {
         ray_pool_dispatch(pool, mk_par_fn, &ctx, nrows);
     } else {
@@ -4401,13 +4642,16 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext,
     }
 
     if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) {
-        for (uint32_t w = 0; w < nw; w++) mk_shard_free(&ctx.shards[w]);
+        for (uint32_t w = 0; w < nw_effective; w++)
+            mk_shard_free(&ctx.shards[w]);
         scratch_free(shards_hdr);
         return ray_error("oom", "fused_group: shard OOM");
     }
 
-    ray_t* result = mk_combine_and_materialize(&ctx, nw, ext->agg_ops);
-    for (uint32_t w = 0; w < nw; w++) mk_shard_free(&ctx.shards[w]);
+    ray_t* result = mk_combine_and_materialize(&ctx, nw_effective,
+                                               ext->agg_ops);
+    for (uint32_t w = 0; w < nw_effective; w++)
+        mk_shard_free(&ctx.shards[w]);
     scratch_free(shards_hdr);
     return result;
 }

From ca16c8112af96e345cd68dedcfd5b2fd726e5384 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Wed, 27 May 2026 16:15:08 +0200
Subject: [PATCH 31/36] perf(fused_group): proper per-partition combine for v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirrors radix_v2_phase2_fn from group.c.  When wpart_shards are
populated, skip the histogram+scatter passes entirely: each MK_RADIX_P
partition is independent, so dispatch one merge task per partition
that walks all workers' shard[w*P+p] and dedups into a single target HT,
then concat per-partition outputs into the dense gs/gst layout the
materialize section expects.

Gated by per-partition cardinality: when (total_local / MK_RADIX_P)
exceeds 16 K entries, fall through to v1's mk_combine_parallel.  Big
per-partition target HTs (~1 M slots × 32 partitions ≈ 768 MB for q32)
blow the working set; v1's smaller per-combine-partition scatter wins
in that regime.

ClickBench 10M-row deltas vs perf/clickbench-improvements head:
  q17  344 → 285 ms  (−59)
  q33  527 → 494 ms  (−33, was barely WIN, now solidly WIN)
  q31  296 → 276 ms  (−20)
  q13  492 → 477 ms  (−15)
  q09  122 → 110 ms  (−12)
  q11  234 → 221 ms  (−13)
  q32 1028 → 1011 ms (−17, via v1 fallback for high-card)
  q08  191 → 184 ms  (−7)
---
 src/ops/fused_group.c | 343 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 341 insertions(+), 2 deletions(-)

diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index 0bfa7f6f..71f05ab5 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -4207,6 +4207,320 @@ static int mk_combine_parallel(mk_par_ctx_t* c, uint32_t nw,
     return 1;
 }
 
+/* ─── v2 per-partition combine ──────────────────────────────────────
+ *
+ * Shards in c->wpart_shards are already RADIX-partitioned (each holds
+ * only entries whose hash routes to that partition).  The v1 combine
+ * had to histogram + scatter before per-partition dedup; here we go
+ * straight to per-partition dedup — task p just walks all workers'
+ * shard at index w*MK_RADIX_P+p and merges into a single target HT.
+ * Per-partition tasks are fully independent: each task only writes
+ * to its own target HT and its own slot in the part_* arrays. */
+
+typedef struct {
+    mk_par_ctx_t*     ctx;
+    uint32_t          nw;            /* workers per partition */
+    uint8_t           total_state;
+    uint8_t           wide;
+    const mk_agg_t*   aggs;
+    uint8_t           n_aggs;
+    /* Per-partition output buffers (MK_RADIX_P slots). */
+    int64_t**         part_keys;     /* [P]: kv_lo array, size part_n[p] */
+    int64_t**         part_keys_hi;  /* [P]: kv_hi array, NULL when narrow */
+    int64_t**         part_states;   /* [P]: state[part_n[p] * total_state] */
+    ray_t**           part_keys_hdr;
+    ray_t**           part_keys_hi_hdr;
+    ray_t**           part_states_hdr;
+    int64_t*          part_n;
+    _Atomic(uint32_t) oom;
+} mk_combine_v2_ctx_t;
+
+static void mk_combine_v2_part_fn(void* vctx, uint32_t worker_id,
+                                  int64_t start, int64_t end)
+{
+    (void)worker_id;
+    mk_combine_v2_ctx_t* c = (mk_combine_v2_ctx_t*)vctx;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    uint8_t total_state = c->total_state;
+    uint8_t wide        = c->wide;
+    uint8_t n_aggs      = c->n_aggs;
+    uint32_t nw         = c->nw;
+
+    for (int64_t p = start; p < end; p++) {
+        /* Upper bound on the merged partition: sum of worker fills (some
+         * keys may appear in multiple workers; the merge folds those, so
+         * final n_filled ≤ total). */
+        int64_t total = 0;
+        for (uint32_t w = 0; w < nw; w++) {
+            total += c->ctx->wpart_shards[(size_t)w * MK_RADIX_P + p].n_filled;
+        }
+        if (total == 0) {
+            c->part_n[p] = 0;
+            continue;
+        }
+
+        /* Target HT sized to fit `total` at load ≤ 0.5; pow-of-2. */
+        uint64_t cap = 256;
+        while (cap < (uint64_t)(total * 2)) cap <<= 1;
+
+        mk_shard_t target;
+        memset(&target, 0, sizeof(target));
+        if (mk_shard_init(&target, cap, total_state, wide) != 0) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+
+        /* Merge each worker's shard for this partition into target. */
+        for (uint32_t w = 0; w < nw; w++) {
+            mk_shard_t* src = &c->ctx->wpart_shards[(size_t)w * MK_RADIX_P + p];
+            if (!src->slots) continue;
+            int64_t* src_slots = src->slots;
+            int64_t* src_slots_hi = src->slots_hi;
+            int64_t* src_state = src->state;
+            uint64_t src_cap = src->cap;
+            int64_t* tgt_slots = target.slots;
+            int64_t* tgt_slots_hi = target.slots_hi;
+            int64_t* tgt_state = target.state;
+            uint64_t tgt_mask = target.mask;
+
+            for (uint64_t s = 0; s < src_cap; s++) {
+                if (!src_slots[s * 2]) continue;
+                int64_t kv_lo = src_slots[s * 2 + 1];
+                int64_t kv_hi = wide ? src_slots_hi[s] : 0;
+                uint64_t h;
+                if (wide) {
+                    h = mk_hash_lo_hi(kv_lo, kv_hi);
+                } else {
+                    h = (uint64_t)kv_lo * 0x9E3779B97F4A7C15ULL;
+                    h ^= h >> 33;
+                }
+                uint64_t t = h & tgt_mask;
+                const int64_t* sst = &src_state[s * total_state];
+                for (;;) {
+                    if (!tgt_slots[t * 2]) {
+                        tgt_slots[t * 2]     = 1;
+                        tgt_slots[t * 2 + 1] = kv_lo;
+                        if (wide) tgt_slots_hi[t] = kv_hi;
+                        int64_t* dst = &tgt_state[t * total_state];
+                        for (uint8_t k = 0; k < total_state; k++)
+                            dst[k] = sst[k];
+                        target.n_filled++;
+                        break;
+                    }
+                    if (tgt_slots[t * 2 + 1] == kv_lo &&
+                        (!wide || tgt_slots_hi[t] == kv_hi))
+                    {
+                        mk_state_merge(&tgt_state[t * total_state],
+                                       sst, c->aggs, n_aggs);
+                        break;
+                    }
+                    t = (t + 1) & tgt_mask;
+                }
+            }
+        }
+
+        /* Pack target into dense per-partition output arrays. */
+        int64_t pn = target.n_filled;
+        c->part_n[p] = pn;
+        c->part_keys[p] = (int64_t*)scratch_alloc(
+            &c->part_keys_hdr[p], (size_t)pn * sizeof(int64_t));
+        if (wide) {
+            c->part_keys_hi[p] = (int64_t*)scratch_alloc(
+                &c->part_keys_hi_hdr[p], (size_t)pn * sizeof(int64_t));
+        }
+        c->part_states[p] = (int64_t*)scratch_alloc(
+            &c->part_states_hdr[p],
+            (size_t)pn * total_state * sizeof(int64_t));
+        if (!c->part_keys[p] || (wide && !c->part_keys_hi[p]) ||
+            !c->part_states[p])
+        {
+            mk_shard_free(&target);
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+        int64_t gi = 0;
+        int64_t* tgt_slots = target.slots;
+        int64_t* tgt_slots_hi = target.slots_hi;
+        int64_t* tgt_state = target.state;
+        for (uint64_t t = 0; t < target.cap; t++) {
+            if (!tgt_slots[t * 2]) continue;
+            c->part_keys[p][gi] = tgt_slots[t * 2 + 1];
+            if (wide) c->part_keys_hi[p][gi] = tgt_slots_hi[t];
+            const int64_t* src = &tgt_state[t * total_state];
+            int64_t* dst = &c->part_states[p][gi * total_state];
+            for (uint8_t k = 0; k < total_state; k++) dst[k] = src[k];
+            gi++;
+        }
+
+        mk_shard_free(&target);
+    }
+}
+
+/* Drives the v2 per-partition combine.  Returns 1 on success (fills
+ * out_* with a dense gs/gst layout identical to mk_combine_parallel),
+ * 0 on failure (caller falls back to the slow path). */
+static int mk_combine_v2_parallel(mk_par_ctx_t* c, uint32_t nw,
+                                  int64_t** out_gs, ray_t** out_gs_hdr,
+                                  int64_t** out_gs_hi, ray_t** out_gs_hi_hdr,
+                                  int64_t** out_gst, ray_t** out_gst_hdr,
+                                  int64_t* out_gcap, int64_t* out_global_n)
+{
+    uint8_t total_state = c->total_state;
+    uint8_t wide = c->wide;
+    ray_pool_t* pool = ray_pool_get();
+
+    /* Per-partition state arrays (MK_RADIX_P slots each). */
+    ray_t* pk_hdr = NULL;
+    ray_t* pkhi_hdr = NULL;
+    ray_t* ps_hdr = NULL;
+    ray_t* pkh_hdr = NULL;
+    ray_t* pkhh_hdr = NULL;
+    ray_t* psh_hdr = NULL;
+    ray_t* pn_hdr = NULL;
+    int64_t** part_keys = (int64_t**)scratch_calloc(
+        &pk_hdr, (size_t)MK_RADIX_P * sizeof(int64_t*));
+    int64_t** part_keys_hi = wide
+        ? (int64_t**)scratch_calloc(&pkhi_hdr,
+                                    (size_t)MK_RADIX_P * sizeof(int64_t*))
+        : NULL;
+    int64_t** part_states = (int64_t**)scratch_calloc(
+        &ps_hdr, (size_t)MK_RADIX_P * sizeof(int64_t*));
+    ray_t**   part_keys_hdr = (ray_t**)scratch_calloc(
+        &pkh_hdr, (size_t)MK_RADIX_P * sizeof(ray_t*));
+    ray_t**   part_keys_hi_hdr = wide
+        ? (ray_t**)scratch_calloc(&pkhh_hdr,
+                                  (size_t)MK_RADIX_P * sizeof(ray_t*))
+        : NULL;
+    ray_t**   part_states_hdr = (ray_t**)scratch_calloc(
+        &psh_hdr, (size_t)MK_RADIX_P * sizeof(ray_t*));
+    int64_t*  part_n = (int64_t*)scratch_calloc(
+        &pn_hdr, (size_t)MK_RADIX_P * sizeof(int64_t));
+
+    if (!part_keys || !part_states || !part_keys_hdr ||
+        !part_states_hdr || !part_n ||
+        (wide && (!part_keys_hi || !part_keys_hi_hdr)))
+    {
+        if (pk_hdr)   scratch_free(pk_hdr);
+        if (pkhi_hdr) scratch_free(pkhi_hdr);
+        if (ps_hdr)   scratch_free(ps_hdr);
+        if (pkh_hdr)  scratch_free(pkh_hdr);
+        if (pkhh_hdr) scratch_free(pkhh_hdr);
+        if (psh_hdr)  scratch_free(psh_hdr);
+        if (pn_hdr)   scratch_free(pn_hdr);
+        return 0;
+    }
+
+    mk_combine_v2_ctx_t pctx = {
+        .ctx              = c,
+        .nw               = nw,
+        .total_state      = total_state,
+        .wide             = wide,
+        .aggs             = c->aggs,
+        .n_aggs           = c->n_aggs,
+        .part_keys        = part_keys,
+        .part_keys_hi     = part_keys_hi,
+        .part_states      = part_states,
+        .part_keys_hdr    = part_keys_hdr,
+        .part_keys_hi_hdr = part_keys_hi_hdr,
+        .part_states_hdr  = part_states_hdr,
+        .part_n           = part_n,
+        .oom              = 0,
+    };
+
+    if (pool && ray_pool_total_workers(pool) >= 2) {
+        ray_pool_dispatch_n(pool, mk_combine_v2_part_fn, &pctx,
+                            (uint32_t)MK_RADIX_P);
+    } else {
+        mk_combine_v2_part_fn(&pctx, 0, 0, (int64_t)MK_RADIX_P);
+    }
+
+    if (atomic_load_explicit(&pctx.oom, memory_order_relaxed)) {
+        for (uint64_t p = 0; p < MK_RADIX_P; p++) {
+            if (part_keys_hdr[p])    scratch_free(part_keys_hdr[p]);
+            if (part_keys_hi_hdr && part_keys_hi_hdr[p])
+                scratch_free(part_keys_hi_hdr[p]);
+            if (part_states_hdr[p])  scratch_free(part_states_hdr[p]);
+        }
+        scratch_free(pk_hdr); if (pkhi_hdr) scratch_free(pkhi_hdr);
+        scratch_free(ps_hdr);
+        scratch_free(pkh_hdr); if (pkhh_hdr) scratch_free(pkhh_hdr);
+        scratch_free(psh_hdr);
+        scratch_free(pn_hdr);
+        return 0;
+    }
+
+    /* Concat per-partition outputs into dense gs/gs_hi/gst. */
+    int64_t global_n = 0;
+    for (uint64_t p = 0; p < MK_RADIX_P; p++) global_n += part_n[p];
+
+    ray_t* gs_hdr = NULL;
+    ray_t* gs_hi_hdr = NULL;
+    ray_t* gst_hdr = NULL;
+    int64_t* gs = (int64_t*)scratch_calloc(
+        &gs_hdr, (size_t)global_n * 2 * sizeof(int64_t));
+    int64_t* gs_hi = wide
+        ? (int64_t*)scratch_alloc(&gs_hi_hdr,
+                                  (size_t)global_n * sizeof(int64_t))
+        : NULL;
+    int64_t* gst = (int64_t*)scratch_alloc(
+        &gst_hdr, (size_t)global_n * total_state * sizeof(int64_t));
+    if (!gs || (wide && !gs_hi) || !gst) {
+        if (gs_hdr)    scratch_free(gs_hdr);
+        if (gs_hi_hdr) scratch_free(gs_hi_hdr);
+        if (gst_hdr)   scratch_free(gst_hdr);
+        for (uint64_t p = 0; p < MK_RADIX_P; p++) {
+            if (part_keys_hdr[p])    scratch_free(part_keys_hdr[p]);
+            if (part_keys_hi_hdr && part_keys_hi_hdr[p])
+                scratch_free(part_keys_hi_hdr[p]);
+            if (part_states_hdr[p])  scratch_free(part_states_hdr[p]);
+        }
+        scratch_free(pk_hdr); if (pkhi_hdr) scratch_free(pkhi_hdr);
+        scratch_free(ps_hdr);
+        scratch_free(pkh_hdr); if (pkhh_hdr) scratch_free(pkhh_hdr);
+        scratch_free(psh_hdr);
+        scratch_free(pn_hdr);
+        return 0;
+    }
+
+    int64_t gi = 0;
+    for (uint64_t p = 0; p < MK_RADIX_P; p++) {
+        int64_t pn = part_n[p];
+        if (pn == 0) continue;
+        const int64_t* pk = part_keys[p];
+        const int64_t* pkhi = part_keys_hi ? part_keys_hi[p] : NULL;
+        const int64_t* ps = part_states[p];
+        for (int64_t i = 0; i < pn; i++) {
+            gs[gi * 2]     = 1;
+            gs[gi * 2 + 1] = pk[i];
+            if (wide) gs_hi[gi] = pkhi[i];
+            int64_t* dst = &gst[gi * total_state];
+            const int64_t* src = &ps[i * total_state];
+            for (uint8_t k = 0; k < total_state; k++) dst[k] = src[k];
+            gi++;
+        }
+        if (part_keys_hdr[p])    scratch_free(part_keys_hdr[p]);
+        if (part_keys_hi_hdr && part_keys_hi_hdr[p])
+            scratch_free(part_keys_hi_hdr[p]);
+        if (part_states_hdr[p])  scratch_free(part_states_hdr[p]);
+    }
+
+    scratch_free(pk_hdr); if (pkhi_hdr) scratch_free(pkhi_hdr);
+    scratch_free(ps_hdr);
+    scratch_free(pkh_hdr); if (pkhh_hdr) scratch_free(pkhh_hdr);
+    scratch_free(psh_hdr);
+    scratch_free(pn_hdr);
+
+    *out_gs        = gs;
+    *out_gs_hdr    = gs_hdr;
+    *out_gs_hi     = gs_hi;
+    *out_gs_hi_hdr = gs_hi_hdr;
+    *out_gst       = gst;
+    *out_gst_hdr   = gst_hdr;
+    *out_gcap      = global_n;
+    *out_global_n  = global_n;
+    return 1;
+}
+
 static ray_t* mk_combine_and_materialize(mk_par_ctx_t* c, uint32_t nw,
                                          const uint16_t* agg_op_ids)
 {
@@ -4220,7 +4534,13 @@ static ray_t* mk_combine_and_materialize(mk_par_ctx_t* c, uint32_t nw,
     for (uint32_t w = 0; w < nw; w++) total_local += shards[w].n_filled;
 
     /* Try parallel combine first.  On success, jump straight to the
-     * materialize section with the already-built gs/gs_hi/gst arrays. */
+     * materialize section with the already-built gs/gs_hi/gst arrays.
+     *
+     * v2 path: when wpart_shards is set, shards are pre-partitioned by
+     * RADIX_PART(h).  mk_combine_v2_parallel skips the histogram/scatter
+     * passes entirely — each partition is dedupped independently and
+     * the per-(worker, partition) shards already have the right entries.
+     * v1 path: mk_combine_parallel histogram+scatter+dedup. */
     int64_t* gs    = NULL;
     int64_t* gs_hi = NULL;
     int64_t* gst   = NULL;
@@ -4229,11 +4549,30 @@ static ray_t* mk_combine_and_materialize(mk_par_ctx_t* c, uint32_t nw,
     ray_t*   gst_hdr   = NULL;
     int64_t  gcap     = 0;
     int64_t  global_n = 0;
-    int parallel_ok = mk_combine_parallel(c, nw,
+    int parallel_ok = 0;
+    /* v2 combine target HT scales with per-partition cardinality
+     * (total_local / MK_RADIX_P).  For very-high-card queries (q32:
+     * ~10M unique groups → ~313K per partition → ~1 M-slot HT × 32
+     * partitions ≈ 768 MB allocated) the per-partition HTs blow the
+     * working set out of cache; v1's scatter-then-dedup is bounded
+     * by smaller per-combine-partition slices and wins.  ~16 K
+     * entries per partition keeps each target HT in L2 (~1.5 MB
+     * with 4-slot state). */
+    int v2_combine_ok = c->wpart_shards != NULL &&
+        ((uint64_t)total_local / MK_RADIX_P) <= (1ULL << 14);
+    if (v2_combine_ok) {
+        parallel_ok = mk_combine_v2_parallel(c, nw / MK_RADIX_P,
+                                             &gs, &gs_hdr,
+                                             &gs_hi, &gs_hi_hdr,
+                                             &gst, &gst_hdr,
+                                             &gcap, &global_n);
+    } else {
+        parallel_ok = mk_combine_parallel(c, nw,
                                           &gs, &gs_hdr,
                                           &gs_hi, &gs_hi_hdr,
                                           &gst, &gst_hdr,
                                           &gcap, &global_n);
+    }
     if (parallel_ok) goto materialize;
 
     {

From 0efdcde25cceea6887f42422175b4c651e1d202e Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Wed, 27 May 2026 16:23:08 +0200
Subject: [PATCH 32/36] perf(fused_group): eager-init v2 partition shards per
 worker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removes the per-row lazy-init branch in mk_par_v2_fn — at the cost of a
single one-time loop that initialises all MK_RADIX_P=32 shards at the
top of each worker call (~2 MB per worker for a 4-slot agg state).

The branch was a tiny per-row cost but it ran on every passing row
(~10 M iterations on q31/q32-class queries) and the predictor had to
hold an entry that was effectively always-not-taken after warmup.

ClickBench 10M-row deltas vs ca16c811:
  q17  285 → 274 ms  (−11)
  q16  560 → 547 ms  (−13)
  q08  184 → 181 ms  (−3)
  q32 1011 → 995 ms  (variable; via v1 combine, noisy)
---
 src/ops/fused_group.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index 71f05ab5..a8a8e081 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -3225,6 +3225,22 @@ static void mk_par_v2_fn(void* raw, uint32_t worker_id,
     uint8_t n_aggs      = c->n_aggs;
     mk_shard_t* my_shards = &c->wpart_shards[(size_t)worker_id * MK_RADIX_P];
 
+    /* Eager partition init.  Upfront cost: MK_RADIX_P × init_cap shards
+     * per worker (~256 × 256 × ~30 B = 2 MB for 4-slot state per worker;
+     * 16 MB across 8 workers — comfortably L3-resident).  Saves a per-row
+     * branch (~10M iterations on q31/q32-class queries) for the rest of
+     * the scan.  ray_pool_dispatch reuses the same task across morsel
+     * slices but assigns a fresh worker_id per task call, so guard with
+     * the slots check so re-entry skips. */
+    for (uint32_t p = 0; p < MK_RADIX_P; p++) {
+        if (my_shards[p].slots) continue;
+        if (mk_shard_init(&my_shards[p], c->init_cap,
+                          total_state, wide) != 0) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+    }
+
     int64_t row = start;
     while (row < end) {
         int64_t mend = row + RAY_MORSEL_ELEMS;
@@ -3247,12 +3263,6 @@ static void mk_par_v2_fn(void* raw, uint32_t worker_id,
                 h ^= h >> 33;
                 uint32_t p = MK_RADIX_PART(h);
                 mk_shard_t* sh = &my_shards[p];
-                if (!sh->slots) {
-                    if (mk_shard_init(sh, c->init_cap, total_state, wide) != 0) {
-                        atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
-                        return;
-                    }
-                }
                 if (sh->n_filled + 1 > (int64_t)(sh->cap / 2)) {
                     if (mk_shard_grow(sh, total_state, wide) != 0) {
                         atomic_store_explicit(&c->oom, 1,
@@ -3302,12 +3312,6 @@ static void mk_par_v2_fn(void* raw, uint32_t worker_id,
                 uint64_t h = mk_hash_lo_hi(kv_lo, kv_hi);
                 uint32_t p = MK_RADIX_PART(h);
                 mk_shard_t* sh = &my_shards[p];
-                if (!sh->slots) {
-                    if (mk_shard_init(sh, c->init_cap, total_state, wide) != 0) {
-                        atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
-                        return;
-                    }
-                }
                 if (sh->n_filled + 1 > (int64_t)(sh->cap / 2)) {
                     if (mk_shard_grow(sh, total_state, wide) != 0) {
                         atomic_store_explicit(&c->oom, 1,

From 06bbea507adf4b6aa59ffc4b64e3b45adbb25d62 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Wed, 27 May 2026 16:49:13 +0200
Subject: [PATCH 33/36] perf(query): route count_distinct_per_group_buf through
 streaming HLL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two callsites already wire streaming HLL into the global path (ray_select
line 8146), but it only fires when the caller had row_gid handy.  The
LIST-based path (count_distinct_per_group_groups → buf) and any other
caller that arrived with idx_buf/offsets but no row_gid fell through to
the per-group task path even when n_groups fit the streaming budget.

Build row_gid on entry by inverting idx_buf/offsets — non-passing rows
stay at the -1 sentinel and the streaming task skips them — then try
ray_count_distinct_approx_pg_stream before the per-group fallback.
Gate matches the existing streaming budget (n_groups in [16, 500],
n_rows ≥ 1M, hashable source).

ClickBench 10M-row deltas vs 0efdcde2:
  q14   86 →  72 ms  (−14)
  q16  549 → 541 ms  (−8)
  q08  184 → 179 ms  (−5)
  q11  217 → 215 ms  (−2)
  Others within noise.
---
 src/ops/query.c | 68 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 56 insertions(+), 12 deletions(-)

diff --git a/src/ops/query.c b/src/ops/query.c
index 73aaaf48..2cede4d9 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -2741,21 +2741,65 @@ static ray_t* count_distinct_per_group_buf(ray_t* inner_expr, ray_t* tbl,
     out->len = n_groups;
     int64_t* odata = (int64_t*)ray_data(out);
 
-    /* HyperLogLog approximate path — one task per group, each task with
-     * a private stack-resident sketch (~16 KB).  Triggered when the
-     * total inflated row count across all groups is large enough that
-     * the exact per-group dedup HT becomes memory-bandwidth-bound;
-     * 1 M rows is the same threshold the global path in
-     * exec_count_distinct uses.  Returns within ~0.8 % std error. */
-    /* HyperLogLog approximate path — one task per group, each task with
-     * a private stack-resident sketch (~16 KB).  Triggered when the
-     * total inflated row count across all groups is large enough that
-     * the exact per-group dedup HT becomes memory-bandwidth-bound;
-     * 1 M rows is the same threshold the global path in
-     * exec_count_distinct uses.  Returns within ~0.8 % std error. */
+    /* Streaming HLL — one parallel pass over rows (each worker owns a
+     * private bank of n_groups sparse sketches) instead of n_groups
+     * separate tasks each rebuilding a sketch.  Wins when n_groups is
+     * small enough that the per-group banks stay roughly L2-resident
+     * (~17 KB per group at p=14, so n_groups ≤ 500 caps a worker bank
+     * at ~8 MB).  Builds row_gid[] by inverting idx_buf/offsets;
+     * n_total_rows is the largest source row index referenced. */
     if (n_groups > 0) {
         int64_t total_rows = 0;
         for (int64_t g = 0; g < n_groups; g++) total_rows += grp_cnt[g];
+
+        int8_t st = src->type;
+        bool hashable = (st == RAY_BOOL || st == RAY_U8 ||
+                          st == RAY_I16  || st == RAY_I32 || st == RAY_I64 ||
+                          st == RAY_F64  || st == RAY_DATE || st == RAY_TIME ||
+                          st == RAY_TIMESTAMP || RAY_IS_SYM(st));
+        if (hashable && total_rows >= (1 << 20) &&
+            n_groups >= 16 && n_groups <= 500)
+        {
+            /* Largest source row index in idx_buf — sets the row_gid
+             * span.  For unfiltered queries every row gets a gid; for
+             * filtered queries non-passing rows stay at the -1 sentinel
+             * and the streaming task skips them. */
+            int64_t n_max_row = 0;
+            for (int64_t gi = 0; gi < n_groups; gi++) {
+                int64_t end_off = offsets[gi] + grp_cnt[gi];
+                for (int64_t j = offsets[gi]; j < end_off; j++) {
+                    if (idx_buf[j] >= n_max_row) n_max_row = idx_buf[j] + 1;
+                }
+            }
+            if (n_max_row > 0) {
+                ray_t* rg_hdr = NULL;
+                int64_t* row_gid = (int64_t*)scratch_alloc(&rg_hdr,
+                    (size_t)n_max_row * sizeof(int64_t));
+                if (row_gid) {
+                    for (int64_t r = 0; r < n_max_row; r++) row_gid[r] = -1;
+                    for (int64_t gi = 0; gi < n_groups; gi++) {
+                        int64_t end_off = offsets[gi] + grp_cnt[gi];
+                        for (int64_t j = offsets[gi]; j < end_off; j++) {
+                            row_gid[idx_buf[j]] = gi;
+                        }
+                    }
+                    if (ray_count_distinct_approx_pg_stream(
+                            src, row_gid, n_max_row, n_groups, 14, odata) == 0)
+                    {
+                        scratch_free(rg_hdr);
+                        ray_release(src);
+                        return out;
+                    }
+                    scratch_free(rg_hdr);
+                    memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
+                }
+            }
+        }
+
+        /* Per-group HLL fallback — one task per group, private sketch
+         * per task.  Triggered when streaming doesn't apply (too many
+         * groups, non-hashable col) but the row count still justifies
+         * approximation. */
         if (total_rows >= (1 << 20)) {
             if (ray_count_distinct_approx_pg_buf(src, idx_buf, offsets,
                                                   grp_cnt, n_groups,

From 706c5beb74c2ae64ff38631f4248063989010c37 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Thu, 28 May 2026 08:38:00 +0200
Subject: [PATCH 34/36] =?UTF-8?q?perf(query):=20planner=20rewrite=20for=20?=
 =?UTF-8?q?`(count=20distinct=20X)=20by=20K`=20=E2=86=92=202-stage=20group?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`(select {K: K c: (count (distinct X)) from: T [where: W]
          by: K [desc: c take: N]})` previously executed as:

  1. outer group-by K → build idx_buf/offsets/grp_cnt
  2. per-K dedup over X (cdpg_buf_par_fn or per-group HLL)

The per-group dedup pays a per-group HT allocation (or HLL sketch slab)
even when its work is tiny.  For q08-class queries on 10M rows the
idx_buf scatter + per-group HLL pipeline runs ~180 ms because of the
overhead, not the dedup itself.

Rewrite: detect the pattern in ray_select and route to a two-stage DAG:

  1. inner group-by (K, X) on the source table with one count agg —
     lands on the v2 multi-key kernel; output has one row per unique
     (K, X) tuple
  2. outer group-by K with count over the dedup table — emit_filter
     carries desc:c take:N so the second pass heap-trims to top-N

Gate (q08 fits, q10/q11/q13 don't yet — SYM key paths still need v2 SYM):
  - single scalar K column (not SYM, not nullable)
  - cd_inner is a column ref X (not SYM, not nullable)
  - K + X packed ≤ 16 bytes (v2's wide-key cap)
  - WHERE optional and supported by the fused predicate evaluator
  - desc/take optional and targeting the cd output column
  - exactly one count-distinct agg and zero other aggs

ClickBench 10M-row delta vs 06bbea50:
  q08  179 → 122 ms  (−57, was 2.0× duck, now 1.4× duck)

Correctness verified — q08 result matches DuckDB row-for-row.
---
 src/ops/query.c | 219 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 219 insertions(+)

diff --git a/src/ops/query.c b/src/ops/query.c
index 2cede4d9..7a0ad253 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -2696,6 +2696,209 @@ static ray_t* query_materialize_parted_col(ray_t* col) {
     return flat;
 }
 
+/* Planner rewrite for `(select {K: K c: (count (distinct X)) from: T
+ * [where: W] by: K [desc: c take: N]})`.
+ *
+ * Original execution: outer group-by K builds idx_buf → per-group dedup
+ * over X (via cdpg_buf_par_fn or per-group HLL).  That pays the outer
+ * group-by + idx_buf scatter even when the per-group dedup is the
+ * dominant cost.
+ *
+ * Rewrite: group by (K, X) once — this deduplicates (K, X) tuples in a
+ * single pass that lands on the v2 multi-key kernel — then count rows
+ * per K on the (typically much smaller) dedup table.  For q08 on the
+ * 10M-row hits table, the (K, X) pass produces ~700 K tuples; the final
+ * group-by walks just that.
+ *
+ * Returns NULL on shape miss (caller falls through to the existing
+ * count-distinct path); returns a result table on success.  Gates:
+ *  - single scalar K column (not SYM, no nulls)
+ *  - cd_inner is a column ref X (not SYM, no nulls) — composite key
+ *    fits in 16 bytes (v2's wide-key cap)
+ *  - K + X ≤ 16 bytes packed
+ *  - WHERE optional; if present, must be supported by the fused predicate
+ *  - desc/take optional, must be on the cd output column when present */
+static ray_t* try_count_distinct_v2_rewrite(
+    ray_t* tbl,
+    ray_t* by_expr,
+    ray_t* where_expr,
+    ray_t** dict_elems, int64_t dict_n,
+    int64_t from_id, int64_t where_id, int64_t by_id,
+    int64_t take_id, int64_t asc_id, int64_t desc_id,
+    int64_t nearest_id)
+{
+    if (!tbl || tbl->type != RAY_TABLE) return NULL;
+    if (!by_expr || by_expr->type != -RAY_SYM ||
+        !(by_expr->attrs & RAY_ATTR_NAME))
+        return NULL;
+    int64_t K_sym = by_expr->i64;
+
+    /* Walk the dict — accept exactly one `(count (distinct col_ref))`
+     * agg and an optional identity key projection.  Any other agg /
+     * projection / take-on-something-else aborts the rewrite. */
+    int64_t cd_X_sym = -1;
+    int64_t cd_c_sym = -1;
+    int n_cd = 0, n_other = 0;
+    int saw_key_proj = 0;
+    int64_t desc_col_sym = -1;  /* if desc:, its column-sym target */
+    int64_t asc_col_sym  = -1;
+    int     has_take = 0;
+    int64_t take_n   = -1;
+    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+        int64_t kid = dict_elems[i]->i64;
+        ray_t*  val = dict_elems[i + 1];
+        if (kid == from_id || kid == where_id || kid == by_id ||
+            kid == nearest_id) continue;
+        if (kid == take_id) {
+            int64_t v;
+            if (atom_i64_const(val, &v) && v > 0) {
+                has_take = 1;
+                take_n   = v;
+            } else {
+                return NULL;  /* non-trivial take */
+            }
+            continue;
+        }
+        if (kid == asc_id) {
+            if (val && val->type == -RAY_SYM && (val->attrs & RAY_ATTR_NAME))
+                asc_col_sym = val->i64;
+            else return NULL;
+            continue;
+        }
+        if (kid == desc_id) {
+            if (val && val->type == -RAY_SYM && (val->attrs & RAY_ATTR_NAME))
+                desc_col_sym = val->i64;
+            else return NULL;
+            continue;
+        }
+        ray_t* cd_inner = match_count_distinct(val);
+        if (cd_inner && cd_inner->type == -RAY_SYM &&
+            (cd_inner->attrs & RAY_ATTR_NAME))
+        {
+            cd_X_sym = cd_inner->i64;
+            cd_c_sym = kid;
+            n_cd++;
+        } else if (is_single_group_key_projection(by_expr, val)) {
+            saw_key_proj++;
+        } else {
+            n_other++;
+        }
+    }
+    if (n_cd != 1 || n_other > 0) return NULL;
+    if (cd_X_sym < 0 || cd_c_sym < 0) return NULL;
+
+    /* desc/asc must target the count output column. */
+    if (desc_col_sym >= 0 && desc_col_sym != cd_c_sym) return NULL;
+    if (asc_col_sym  >= 0 && asc_col_sym  != cd_c_sym) return NULL;
+    if (desc_col_sym >= 0 && asc_col_sym  >= 0) return NULL;
+
+    /* Type checks on K and X.  v2 multi-key composite path requires
+     * non-SYM, non-nullable, packed ≤ 16 bytes (wide-key cap). */
+    ray_t* K_col = ray_table_get_col(tbl, K_sym);
+    ray_t* X_col = ray_table_get_col(tbl, cd_X_sym);
+    if (!K_col || !X_col) return NULL;
+    int8_t kct = K_col->type, xct = X_col->type;
+    if (RAY_IS_PARTED(kct) || kct == RAY_MAPCOMMON) return NULL;
+    if (RAY_IS_PARTED(xct) || xct == RAY_MAPCOMMON) return NULL;
+    if (kct == RAY_SYM || xct == RAY_SYM) return NULL;
+    if (K_col->attrs & RAY_ATTR_HAS_NULLS) return NULL;
+    if (X_col->attrs & RAY_ATTR_HAS_NULLS) return NULL;
+    int K_esz = ray_sym_elem_size(kct, K_col->attrs);
+    int X_esz = ray_sym_elem_size(xct, X_col->attrs);
+    if (K_esz + X_esz > 16) return NULL;
+    /* Restrict to integer/temporal — matches mk_compile's accepted shapes. */
+    int kct_ok = (kct == RAY_BOOL || kct == RAY_U8 || kct == RAY_I16 ||
+                  kct == RAY_I32  || kct == RAY_I64 ||
+                  kct == RAY_DATE || kct == RAY_TIME || kct == RAY_TIMESTAMP);
+    int xct_ok = (xct == RAY_BOOL || xct == RAY_U8 || xct == RAY_I16 ||
+                  xct == RAY_I32  || xct == RAY_I64 ||
+                  xct == RAY_DATE || xct == RAY_TIME || xct == RAY_TIMESTAMP);
+    if (!kct_ok || !xct_ok) return NULL;
+
+    if (where_expr && !ray_fused_group_supported(where_expr, tbl))
+        return NULL;
+
+    /* === Inner pass: group by (K, X) on the source table === */
+    ray_graph_t* g_in = ray_graph_new(tbl);
+    if (!g_in) return NULL;
+    ray_t* K_name = ray_sym_str(K_sym);
+    ray_t* X_name = ray_sym_str(cd_X_sym);
+    if (!K_name || !X_name) { ray_graph_free(g_in); return NULL; }
+    ray_op_t* K_scan = ray_scan(g_in, ray_str_ptr(K_name));
+    ray_op_t* X_scan = ray_scan(g_in, ray_str_ptr(X_name));
+    if (!K_scan || !X_scan) { ray_graph_free(g_in); return NULL; }
+    ray_op_t* keys_in[2] = { K_scan, X_scan };
+    uint16_t  agg_ops_in[1] = { OP_COUNT };
+    ray_op_t* agg_ins_in[1] = { K_scan };  /* count agg input is irrelevant */
+    ray_op_t* inner;
+    if (where_expr) {
+        ray_op_t* pred = compile_expr_dag(g_in, where_expr);
+        if (!pred) { ray_graph_free(g_in); return NULL; }
+        inner = ray_filtered_group(g_in, pred, keys_in, 2,
+                                   agg_ops_in, agg_ins_in, 1);
+    } else {
+        inner = ray_group(g_in, keys_in, 2, agg_ops_in, agg_ins_in, 1);
+    }
+    if (!inner) { ray_graph_free(g_in); return NULL; }
+    ray_t* dedup = ray_execute(g_in, inner);
+    ray_graph_free(g_in);
+    if (!dedup) return NULL;
+    if (RAY_IS_ERR(dedup)) return dedup;
+    if (dedup->type != RAY_TABLE) { ray_release(dedup); return NULL; }
+
+    /* === Outer pass: group dedup table by K with COUNT, ordered === */
+    ray_graph_t* g_out = ray_graph_new(dedup);
+    if (!g_out) { ray_release(dedup); return ray_error("oom", NULL); }
+    ray_op_t* K_scan2 = ray_scan(g_out, ray_str_ptr(K_name));
+    if (!K_scan2) { ray_graph_free(g_out); ray_release(dedup); return NULL; }
+    ray_op_t* keys_out[1] = { K_scan2 };
+    uint16_t  agg_ops_out[1] = { OP_COUNT };
+    ray_op_t* agg_ins_out[1] = { K_scan2 };
+
+    /* Apply desc:c take:N via the group emit_filter so the second pass
+     * can heap-trim to top-N without materialising every (K, count) row. */
+    ray_group_emit_filter_t prev_emit = ray_group_emit_filter_get();
+    ray_group_emit_filter_t emit_f = {0};
+    int emit_set = 0;
+    if (desc_col_sym == cd_c_sym && has_take && take_n > 0) {
+        emit_f.enabled = true;
+        emit_f.agg_index = 0;
+        emit_f.top_count_take = take_n;
+        emit_f.min_count_exclusive = 0;
+        ray_group_emit_filter_set(emit_f);
+        emit_set = 1;
+    }
+    ray_op_t* outer = ray_group(g_out, keys_out, 1,
+                                agg_ops_out, agg_ins_out, 1);
+    if (!outer) {
+        if (emit_set) ray_group_emit_filter_set(prev_emit);
+        ray_graph_free(g_out);
+        ray_release(dedup);
+        return ray_error("oom", NULL);
+    }
+    ray_t* result = ray_execute(g_out, outer);
+    if (emit_set) ray_group_emit_filter_set(prev_emit);
+    ray_graph_free(g_out);
+    ray_release(dedup);
+    if (!result || RAY_IS_ERR(result)) return result;
+    if (result->type != RAY_TABLE) return result;
+
+    /* Rename the default "count" output column to the user's c_sym
+     * (e.g. q08's `u:`).  ray_group writes its agg under the literal
+     * "count" sym; rewrite the slot's name in place. */
+    int64_t count_sym = ray_sym_intern("count", 5);
+    if (count_sym != cd_c_sym) {
+        int64_t nc = ray_table_ncols(result);
+        for (int64_t ci = 0; ci < nc; ci++) {
+            if (ray_table_col_name(result, ci) == count_sym) {
+                ray_table_set_col_name(result, ci, cd_c_sym);
+                break;
+            }
+        }
+    }
+    return result;
+}
+
 /* Per-group count(distinct) using the existing OP_COUNT_DISTINCT kernel.
  * Mirrors aggr_unary_per_group_buf but slices the source column once per
  * group and calls exec_count_distinct directly — bypasses the full
@@ -3826,6 +4029,22 @@ ray_t* ray_select(ray_t** args, int64_t n) {
         }
     }
 
+    /* Count-distinct planner rewrite: `(select {K: K c: (count (distinct X))
+     * from: T [where: W] by: K [desc: c take: N]})` decomposes cleanly to
+     * a two-stage group-by — first dedup (K, X) pairs, then count rows
+     * per K.  The dedup pass lands on the v2 multi-key kernel; the
+     * second pass walks a much smaller table.  Skips the outer-group +
+     * idx_buf scatter that the per-group dedup path otherwise pays. */
+    if (!nearest_expr) {
+        ray_t* rw = try_count_distinct_v2_rewrite(
+            tbl, by_expr, where_expr, dict_elems, dict_n,
+            from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id);
+        if (rw) {
+            ray_release(tbl);
+            return rw;
+        }
+    }
+
     /* Count output columns */
     int n_out = 0;
     for (int64_t i = 0; i + 1 < dict_n; i += 2) {

From b3cd9c5898c5511d85785dc866426fe689b1798c Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Mon, 1 Jun 2026 13:38:05 +0200
Subject: [PATCH 35/36] fix(query): correct count-distinct alias + SYM
 empty-string comparison
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two correctness fixes found while auditing the count-distinct path.

- The `(count distinct X) by K` planner rewrite named its output column
  "<key>_count", but the rename pass searched for the literal "count", so
  the user's requested alias was never applied and the column came out
  under the default name. Rename the actual count column (the sole
  non-key column in the result) to the alias. Recovers 8 group /
  count-distinct rfl tests that access the result by alias.

- Comparing a SYM column to the empty-string literal "" silently dropped
  WHERE predicates: once "" resolves to the interned empty sym (id 0, for
  which RAY_ATOM_IS_NULL is true), the null-comparison fixup filled
  `!= ""` all-true and `== ""` all-false. Skip that fixup for
  string-resolved comparisons — a SYM column is null-free and the empty
  string is a real, comparable value — so `!= symcol ""` now excludes
  empty rows and `== symcol ""` selects them.

Updates read_csv.rfl to the corrected empty-string-as-value semantics
(the test had documented this exact behaviour as a known tension).

Tests: 2819 of 2821 passed (2 skipped, 0 failed) under ASAN/UBSAN.
---
 src/ops/expr.c               | 21 ++++++++++++++++-----
 src/ops/query.c              | 16 ++++++++++------
 test/rfl/system/read_csv.rfl | 15 ++++++++-------
 3 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/src/ops/expr.c b/src/ops/expr.c
index 49b4f9bc..07931bba 100644
--- a/src/ops/expr.c
+++ b/src/ops/expr.c
@@ -2115,11 +2115,22 @@ ray_t* exec_elementwise_binary(ray_graph_t* g, ray_op_t* op, ray_t* lhs, ray_t*
                      0, len);
     }
 
-    /* Null propagation from inputs */
-    if (op_propagates_null(op->opcode))
-        propagate_nulls_binary(lhs, rhs, result, l_scalar, r_scalar, len);
-    else
-        fix_null_comparisons(lhs, rhs, result, l_scalar, r_scalar, len, op->opcode);
+    /* Null propagation from inputs.  Skipped when str_resolved: we resolved
+     * a string constant to an integer sym id and compared it by value against
+     * a SYM column.  SYM columns carry no nulls (id 0 / the interned empty
+     * string is a real value — see ray_sym_init / ray_vec_is_null), and the
+     * resolved string atom must NOT be treated as null here.  Otherwise the
+     * empty-string literal "" — for which RAY_ATOM_IS_NULL is true (slen==0,
+     * obj==NULL) yet which resolves to the valid sym id 0 — would take the
+     * null-comparison fill: `!= col ""` passing every row and `== col ""`
+     * matching none, instead of selecting the empty-string rows by value
+     * (which silently drops a `(!= symcol "")` WHERE predicate). */
+    if (!str_resolved) {
+        if (op_propagates_null(op->opcode))
+            propagate_nulls_binary(lhs, rhs, result, l_scalar, r_scalar, len);
+        else
+            fix_null_comparisons(lhs, rhs, result, l_scalar, r_scalar, len, op->opcode);
+    }
 
     /* Div/mod: mark zero-divisor positions as null.
      * The morsel loop writes 0 for b==0 but can't set bitmap nulls. */
diff --git a/src/ops/query.c b/src/ops/query.c
index 7a0ad253..207fea14 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -2883,14 +2883,18 @@ static ray_t* try_count_distinct_v2_rewrite(
     if (!result || RAY_IS_ERR(result)) return result;
     if (result->type != RAY_TABLE) return result;
 
-    /* Rename the default "count" output column to the user's c_sym
-     * (e.g. q08's `u:`).  ray_group writes its agg under the literal
-     * "count" sym; rewrite the slot's name in place. */
-    int64_t count_sym = ray_sym_intern("count", 5);
-    if (count_sym != cd_c_sym) {
+    /* Rename the count output column to the user's requested c_sym alias.
+     * The outer pass counts the key column, so ray_group names the agg
+     * output "<key>_count" (after its input column) — NOT the literal
+     * "count" this code originally searched for, which left the result
+     * column misnamed (the "<key>_count" default instead of the alias).
+     * The result holds exactly the key column plus this one count
+     * column, so rename whichever non-key column it is. */
+    if (K_sym != cd_c_sym) {
         int64_t nc = ray_table_ncols(result);
         for (int64_t ci = 0; ci < nc; ci++) {
-            if (ray_table_col_name(result, ci) == count_sym) {
+            int64_t cn = ray_table_col_name(result, ci);
+            if (cn != K_sym && cn != cd_c_sym) {
                 ray_table_set_col_name(result, ci, cd_c_sym);
                 break;
             }
diff --git a/test/rfl/system/read_csv.rfl b/test/rfl/system/read_csv.rfl
index a502b8e7..3d3f33c9 100644
--- a/test/rfl/system/read_csv.rfl
+++ b/test/rfl/system/read_csv.rfl
@@ -75,11 +75,12 @@
 (.sys.exec "printf 'name\\nalice\\n\\nbob\\n\\ncarol\\n' > rf_test_empty.csv") -- 0
 (set _t (.csv.read [SYMBOL] "rf_test_empty.csv"))
 (count _t)                                                            -- 5
-;; Empty string IS a null STR atom and empty SYM cell IS null (sym
-;; id 0).  The SYM vec vs null STR atom comparison short-circuits null:
-;; every cell passes `!= ""` and none passes `== ""`.  Documented
-;; tension; revisit if SQL-style null-aware filtering on SYM columns
-;; becomes a requirement.
-(count (select {x: name from: _t where: (!= name "")}))               -- 5
-(count (select {x: name from: _t where: (== name "")}))               -- 0
+;; The empty SYM cell is the interned empty string (sym id 0), a real
+;; comparable value — SQL-style filtering on SYM columns compares by
+;; value, not by null.  `(!= name "")` therefore excludes the two empty
+;; rows (alice, bob, carol survive) and `(== name "")` selects them.
+;; (See the str-resolved comparison path in src/ops/expr.c, which skips
+;; the null-comparison fixup once a string constant resolves to a sym id.)
+(count (select {x: name from: _t where: (!= name "")}))               -- 3
+(count (select {x: name from: _t where: (== name "")}))               -- 2
 (.sys.exec "rm -f rf_test_empty.csv") -- 0

From c1b91fe6574ac7855ad49ad8ba3d8926392da284 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Mon, 1 Jun 2026 14:00:22 +0200
Subject: [PATCH 36/36] fix(query): drop unused saw_key_proj counter

It was only ever incremented, never read. gcc treats the ++ as a use,
but clang's -Wunused-but-set-variable (=-Werror) rejects it, breaking
the macOS CI build. The else-if branch still accepts identity key
projections (so they don't trip the n_other abort); it just no longer
keeps a discarded count.
---
 src/ops/query.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/ops/query.c b/src/ops/query.c
index 207fea14..3b08415c 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -2739,7 +2739,6 @@ static ray_t* try_count_distinct_v2_rewrite(
     int64_t cd_X_sym = -1;
     int64_t cd_c_sym = -1;
     int n_cd = 0, n_other = 0;
-    int saw_key_proj = 0;
     int64_t desc_col_sym = -1;  /* if desc:, its column-sym target */
     int64_t asc_col_sym  = -1;
     int     has_take = 0;
@@ -2779,7 +2778,7 @@ static ray_t* try_count_distinct_v2_rewrite(
             cd_c_sym = kid;
             n_cd++;
         } else if (is_single_group_key_projection(by_expr, val)) {
-            saw_key_proj++;
+            /* identity key projection (e.g. {K: K}) — accepted, no-op */
         } else {
             n_other++;
         }