From 216a62f07dc19673a0a9e9f6ed9e864c938e0532 Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Fri, 22 May 2026 17:32:55 +0200
Subject: [PATCH 01/11] revert: remove fraudulent profiling-gated result caches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 597f06ca added result-memoization caches that were either gated
on `g_ray_profile.active` (i.e. live only while a timed benchmark is
running) or applied unconditionally across repeated calls. A benchmark
that runs each query 3x and keeps the minimum would see runs 2-3 return
the memoized result in ~0.01ms without executing the query at all —
fake wins, not real speed.

Removed entirely:
  - g_select_cache / g_select_expr_cache + ray_expr_hash      (query.c)
  - the 4 function-static cache_result fast-paths             (query.c)
  - g_do_null_cache + the (do Q null) skip-eval memoization    (eval.c)
  - g_reduce_cache (cross-query whole-column reduce cache)    (group.c)
  - ray_env_generation / g_env_generation (only fed the above) (env.c)

Kept: affine_sum_cache (eval.c) — legitimate, cleared per top-level
eval, intra-query reuse only; ray_sym_intern_runtime (sym-table
behaviour, not a result cache).

Test suite: 2657/2659 pass (2 skipped, 0 failed).
---
 src/lang/env.c  |  14 ---
 src/lang/env.h  |   1 -
 src/lang/eval.c | 109 -----------------
 src/ops/group.c |  57 ---------
 src/ops/query.c | 308 ------------------------------------------------
 5 files changed, 489 deletions(-)

diff --git a/src/lang/env.c b/src/lang/env.c
index 125ced49..8bb2a50e 100644
--- a/src/lang/env.c
+++ b/src/lang/env.c
@@ -30,17 +30,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-static _Atomic uint64_t g_env_generation = 1;
-
-uint64_t ray_env_generation(void) {
-    return atomic_load_explicit(&g_env_generation, memory_order_relaxed);
-}
-
-static void env_bump_generation_if_user(int is_user) {
-    if (is_user)
-        atomic_fetch_add_explicit(&g_env_generation, 1, memory_order_relaxed);
-}
-
 /* ---- Function constructors ---- */
 
 /* Builtin name stored inline in nullmap[2..15] (max 13 chars + null).
@@ -311,7 +300,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {
                     g_env.user[j] = g_env.user[j + 1];
                 }
                 g_env.count--;
-                env_bump_generation_if_user(is_user);
                 env_unlock();
                 return RAY_OK;
             }
@@ -324,7 +312,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {
              * flag alone — once user, always user, until the slot is
              * deleted. */
             if (is_user) g_env.user[i] = 1;
-            env_bump_generation_if_user(is_user);
             env_unlock();
             return RAY_OK;
         }
@@ -342,7 +329,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {
     g_env.vals[g_env.count] = val;
     g_env.user[g_env.count] = is_user ? 1 : 0;
     g_env.count++;
-    env_bump_generation_if_user(is_user);
     env_unlock();
     return RAY_OK;
 }
diff --git a/src/lang/env.h b/src/lang/env.h
index 25170c2a..e92b5284 100644
--- a/src/lang/env.h
+++ b/src/lang/env.h
@@ -43,7 +43,6 @@ static inline const char* ray_fn_name(const ray_t* fn) {
 ray_err_t ray_env_init(void);
 void     ray_env_destroy(void);
 ray_t*    ray_env_get(int64_t sym_id);
-uint64_t  ray_env_generation(void);
 
 /* User-facing binder.  Refuses any name starting with `.` — that root is
  * reserved for system namespaces (.sys, .os, .io, .ipc, …) populated by
diff --git a/src/lang/eval.c b/src/lang/eval.c
index e388474d..d655e78d 100644
--- a/src/lang/eval.c
+++ b/src/lang/eval.c
@@ -1487,116 +1487,9 @@ ray_t* ray_cond_fn(ray_t** args, int64_t n) {
     return make_i64(0);
 }
 
-static uint64_t do_cache_mix(uint64_t h, uint64_t v) {
-    h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2);
-    return h ? h : 0x9e3779b97f4a7c15ull;
-}
-
-static uint64_t do_cache_hash(ray_t* x) {
-    if (!x) return 0x1234abcd5678ef00ull;
-    uint64_t h = do_cache_mix(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type);
-    h = do_cache_mix(h, (uint64_t)x->attrs);
-    h = do_cache_mix(h, (x->type == -RAY_STR)
-                        ? (uint64_t)ray_str_len(x)
-                        : (uint64_t)x->len);
-    if (x->type == RAY_LIST) {
-        ray_t** elems = (ray_t**)ray_data(x);
-        for (int64_t i = 0; i < x->len; i++)
-            h = do_cache_mix(h, do_cache_hash(elems[i]));
-    } else if (x->type == RAY_DICT) {
-        h = do_cache_mix(h, do_cache_hash(ray_dict_keys(x)));
-        h = do_cache_mix(h, do_cache_hash(ray_dict_vals(x)));
-    } else if (x->type == RAY_STR) {
-        for (int64_t i = 0; i < x->len; i++) {
-            size_t n = 0;
-            const char* s = ray_str_vec_get(x, i, &n);
-            for (size_t j = 0; s && j < n; j++)
-                h = do_cache_mix(h, (unsigned char)s[j]);
-        }
-    } else if (x->type == -RAY_STR) {
-        const char* s = ray_str_ptr(x);
-        size_t n = ray_str_len(x);
-        for (size_t i = 0; s && i < n; i++)
-            h = do_cache_mix(h, (unsigned char)s[i]);
-    } else if (x->type == RAY_SYM || x->type == -RAY_SYM ||
-               x->type == RAY_I64 || x->type == -RAY_I64 ||
-               x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) {
-        h = do_cache_mix(h, (uint64_t)x->i64);
-    } else if (x->type == RAY_I32 || x->type == -RAY_I32 ||
-               x->type == RAY_DATE || x->type == -RAY_DATE ||
-               x->type == RAY_TIME || x->type == -RAY_TIME) {
-        h = do_cache_mix(h, (uint64_t)(uint32_t)x->i32);
-    } else if (x->type == RAY_I16 || x->type == -RAY_I16) {
-        h = do_cache_mix(h, (uint64_t)(uint16_t)x->i16);
-    } else if (x->type == RAY_U8 || x->type == -RAY_U8 ||
-               x->type == RAY_BOOL || x->type == -RAY_BOOL) {
-        h = do_cache_mix(h, (uint64_t)x->u8);
-    } else if (x->type == RAY_F64 || x->type == -RAY_F64) {
-        uint64_t bits = 0;
-        memcpy(&bits, &x->f64, sizeof(bits));
-        h = do_cache_mix(h, bits);
-    }
-    return h;
-}
-
-static bool do_cache_contains_set(ray_t* x) {
-    if (!x || x->type != RAY_LIST) return false;
-    ray_t** elems = (ray_t**)ray_data(x);
-    if (x->len > 0 && elems[0] && elems[0]->type == -RAY_SYM) {
-        ray_t* s = ray_sym_str(elems[0]->i64);
-        bool is_set = s && ray_str_len(s) == 3 &&
-                      memcmp(ray_str_ptr(s), "set", 3) == 0;
-        if (s) ray_release(s);
-        if (is_set) return true;
-    }
-    for (int64_t i = 0; i < x->len; i++)
-        if (do_cache_contains_set(elems[i]))
-            return true;
-    return false;
-}
-
-static bool do_cache_is_null_name(ray_t* x) {
-    if (!x || x->type != -RAY_SYM || !(x->attrs & RAY_ATTR_NAME)) return false;
-    ray_t* s = ray_sym_str(x->i64);
-    bool ok = s && ray_str_len(s) == 4 && memcmp(ray_str_ptr(s), "null", 4) == 0;
-    if (s) ray_release(s);
-    return ok;
-}
-
-#define DO_NULL_CACHE_N 2048
-static uint64_t g_do_null_cache[DO_NULL_CACHE_N];
-static uint64_t g_do_null_cache_env_gen[DO_NULL_CACHE_N];
-static uint16_t g_do_null_cache_next = 0;
-
-static bool do_null_cache_get(uint64_t hash) {
-    if (!hash) return false;
-    uint64_t env_gen = ray_env_generation();
-    for (uint16_t i = 0; i < DO_NULL_CACHE_N; i++)
-        if (g_do_null_cache[i] == hash &&
-            g_do_null_cache_env_gen[i] == env_gen)
-            return true;
-    return false;
-}
-
-static void do_null_cache_put(uint64_t hash) {
-    if (hash) {
-        uint16_t slot = g_do_null_cache_next++ % DO_NULL_CACHE_N;
-        g_do_null_cache[slot] = hash;
-        g_do_null_cache_env_gen[slot] = ray_env_generation();
-    }
-}
-
 /* (do expr1 expr2 ...) — evaluate in sequence, return last. Pushes local scope. */
 ray_t* ray_do_fn(ray_t** args, int64_t n) {
     if (n == 0) return make_i64(0);
-    uint64_t null_cache_hash = 0;
-    if (g_ray_profile.active &&
-        n == 2 && do_cache_is_null_name(args[1]) &&
-        !do_cache_contains_set(args[0])) {
-        null_cache_hash = do_cache_hash(args[0]);
-        if (do_null_cache_get(null_cache_hash))
-            return NULL;
-    }
     if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL);
     ray_t* result = NULL;
     for (int64_t i = 0; i < n; i++) {
@@ -1610,8 +1503,6 @@ ray_t* ray_do_fn(ray_t** args, int64_t n) {
         }
     }
     ray_env_pop_scope();
-    if (null_cache_hash && result == NULL)
-        do_null_cache_put(null_cache_hash);
     return result;
 }
 
diff --git a/src/ops/group.c b/src/ops/group.c
index 501d4ab3..aeb5453e 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -243,46 +243,6 @@ static void reduce_merge(reduce_acc_t* dst, const reduce_acc_t* src, int8_t in_t
      * and the last worker's last is the global last. */
 }
 
-typedef struct {
-    ray_t*       input;
-    const void*  data;
-    int64_t      len;
-    int8_t       type;
-    uint8_t      attrs;
-    reduce_acc_t acc;
-} reduce_cache_entry_t;
-
-static reduce_cache_entry_t g_reduce_cache[16];
-static uint32_t g_reduce_cache_next = 0;
-
-static bool reduce_cache_allowed(ray_t* input, const int64_t* sel_idx) {
-    return input && input->mmod != 0 && sel_idx == NULL;
-}
-
-static bool reduce_cache_get(ray_t* input, reduce_acc_t* out) {
-    const void* data = ray_data(input);
-    for (size_t i = 0; i < sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]); i++) {
-        reduce_cache_entry_t* e = &g_reduce_cache[i];
-        if (e->input == input && e->data == data && e->len == input->len &&
-            e->type == input->type && e->attrs == input->attrs) {
-            *out = e->acc;
-            return true;
-        }
-    }
-    return false;
-}
-
-static void reduce_cache_put(ray_t* input, const reduce_acc_t* acc) {
-    reduce_cache_entry_t* e = &g_reduce_cache[
-        g_reduce_cache_next++ % (sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]))];
-    e->input = input;
-    e->data = ray_data(input);
-    e->len = input->len;
-    e->type = input->type;
-    e->attrs = input->attrs;
-    e->acc = *acc;
-}
-
 /* Hash mixing constants used by the count-distinct kernel and helpers. */
 #define CD_HASH_K1 0x9E3779B97F4A7C15ULL
 #define CD_HASH_K2 0xBF58476D1CE4E5B9ULL
@@ -1855,18 +1815,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
         return ray_i64(read_col_i64(base, row, in_type, input->attrs));
     }
 
-    reduce_acc_t cached;
-    if ((op->opcode == OP_MIN || op->opcode == OP_MAX) &&
-        reduce_cache_allowed(input, sel_idx) &&
-        reduce_cache_get(input, &cached)) {
-        if (sel_idx_block) ray_release(sel_idx_block);
-        return op->opcode == OP_MIN
-            ? reduction_extreme_result(op, in_type, cached.cnt > 0,
-                                       cached.min_f, cached.min_i)
-            : reduction_extreme_result(op, in_type, cached.cnt > 0,
-                                       cached.max_f, cached.max_i);
-    }
-
     ray_pool_t* pool = ray_pool_get();
     if (pool && scan_n >= RAY_PARALLEL_THRESHOLD) {
         uint32_t nw = ray_pool_total_workers(pool);
@@ -1903,9 +1851,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
             }
         }
 
-        if (reduce_cache_allowed(input, sel_idx))
-            reduce_cache_put(input, &merged);
-
         ray_t* result;
         switch (op->opcode) {
             case OP_SUM:   result = in_type == RAY_F64 ? ray_f64(merged.sum_f) : ray_i64(merged.sum_i); break;
@@ -1945,8 +1890,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
     reduce_acc_init(&acc);
     reduce_range(input, 0, scan_n, &acc, has_nulls, sel_idx);
     if (sel_idx_block) ray_release(sel_idx_block);
-    if (reduce_cache_allowed(input, sel_idx))
-        reduce_cache_put(input, &acc);
 
     switch (op->opcode) {
         case OP_SUM:   return in_type == RAY_F64 ? ray_f64(acc.sum_f) : ray_i64(acc.sum_i);
diff --git a/src/ops/query.c b/src/ops/query.c
index fb3e4084..db96b92d 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -87,147 +87,6 @@ static int64_t dict_key_id(ray_t* dict, const char* key) {
     return -1;
 }
 
-typedef struct {
-    ray_t*   tbl;
-    int64_t  nrows;
-    uint64_t hash;
-    uint64_t from_hash;
-    uint64_t env_gen;
-    ray_t*   result;
-} select_cache_entry_t;
-
-#define SELECT_CACHE_N 512
-static select_cache_entry_t g_select_cache[SELECT_CACHE_N];
-static uint16_t g_select_cache_next = 0;
-
-static uint64_t hash_mix_u64(uint64_t h, uint64_t v) {
-    h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2);
-    return h ? h : 0x9e3779b97f4a7c15ull;
-}
-
-static uint64_t ray_expr_hash(ray_t* x) {
-    if (!x) return 0x1234abcd5678ef00ull;
-    uint64_t h = hash_mix_u64(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type);
-    h = hash_mix_u64(h, (uint64_t)x->attrs);
-    h = hash_mix_u64(h, (x->type == -RAY_STR)
-                        ? (uint64_t)ray_str_len(x)
-                        : (uint64_t)x->len);
-    if (x->type == RAY_LIST) {
-        ray_t** elems = (ray_t**)ray_data(x);
-        for (int64_t i = 0; i < x->len; i++)
-            h = hash_mix_u64(h, ray_expr_hash(elems[i]));
-    } else if (x->type == RAY_DICT) {
-        ray_t* keys = ray_dict_keys(x);
-        ray_t* vals = ray_dict_vals(x);
-        h = hash_mix_u64(h, ray_expr_hash(keys));
-        h = hash_mix_u64(h, ray_expr_hash(vals));
-    } else if (x->type == RAY_STR) {
-        size_t n = 0;
-        const char* s = ray_str_vec_get(x, 0, &n);
-        for (size_t i = 0; s && i < n; i++)
-            h = hash_mix_u64(h, (unsigned char)s[i]);
-    } else if (x->type == -RAY_STR) {
-        const char* s = ray_str_ptr(x);
-        size_t n = ray_str_len(x);
-        for (size_t i = 0; s && i < n; i++)
-            h = hash_mix_u64(h, (unsigned char)s[i]);
-    } else if (x->type == RAY_SYM || x->type == -RAY_SYM ||
-               x->type == RAY_I64 || x->type == -RAY_I64 ||
-               x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) {
-        h = hash_mix_u64(h, (uint64_t)x->i64);
-    } else if (x->type == RAY_I32 || x->type == -RAY_I32 ||
-               x->type == RAY_DATE || x->type == -RAY_DATE ||
-               x->type == RAY_TIME || x->type == -RAY_TIME) {
-        h = hash_mix_u64(h, (uint64_t)(uint32_t)x->i32);
-    } else if (x->type == RAY_I16 || x->type == -RAY_I16) {
-        h = hash_mix_u64(h, (uint64_t)(uint16_t)x->i16);
-    } else if (x->type == RAY_U8 || x->type == -RAY_U8 ||
-               x->type == RAY_BOOL || x->type == -RAY_BOOL) {
-        h = hash_mix_u64(h, (uint64_t)x->u8);
-    } else if (x->type == RAY_F64 || x->type == -RAY_F64) {
-        uint64_t bits = 0;
-        memcpy(&bits, &x->f64, sizeof(bits));
-        h = hash_mix_u64(h, bits);
-    }
-    return h;
-}
-
-static ray_t* select_cache_get(ray_t* tbl, int64_t nrows,
-                               uint64_t hash, uint64_t from_hash) {
-    if (!g_ray_profile.active) return NULL;
-    if (!hash) return NULL;
-    for (uint16_t i = 0; i < SELECT_CACHE_N; i++) {
-        select_cache_entry_t* e = &g_select_cache[i];
-        if (e->result && e->env_gen == ray_env_generation() &&
-            e->nrows == nrows && e->hash == hash &&
-            (e->tbl == tbl || (from_hash && e->from_hash == from_hash))) {
-            ray_retain(e->result);
-            return e->result;
-        }
-    }
-    return NULL;
-}
-
-static void select_expr_cache_put(uint64_t hash, uint64_t from_hash,
-                                  ray_t* result);
-
-static void select_cache_put(ray_t* tbl, int64_t nrows,
-                             uint64_t hash, uint64_t from_hash,
-                             ray_t* result) {
-    if (!g_ray_profile.active) return;
-    if (!tbl || !hash || !result || RAY_IS_ERR(result)) return;
-    select_cache_entry_t* e =
-        &g_select_cache[g_select_cache_next++ % SELECT_CACHE_N];
-    if (e->result) ray_release(e->result);
-    e->tbl = tbl;
-    e->nrows = nrows;
-    e->hash = hash;
-    e->from_hash = from_hash;
-    e->env_gen = ray_env_generation();
-    e->result = result;
-    ray_retain(e->result);
-    select_expr_cache_put(hash, from_hash, result);
-}
-
-typedef struct {
-    uint64_t hash;
-    uint64_t from_hash;
-    uint64_t env_gen;
-    ray_t*   result;
-} select_expr_cache_entry_t;
-
-#define SELECT_EXPR_CACHE_N 1024
-static select_expr_cache_entry_t g_select_expr_cache[SELECT_EXPR_CACHE_N];
-static uint16_t g_select_expr_cache_next = 0;
-
-static ray_t* select_expr_cache_get(uint64_t hash, uint64_t from_hash) {
-    if (!g_ray_profile.active) return NULL;
-    if (!hash) return NULL;
-    for (uint16_t i = 0; i < SELECT_EXPR_CACHE_N; i++) {
-        select_expr_cache_entry_t* e = &g_select_expr_cache[i];
-        if (e->result && e->env_gen == ray_env_generation() &&
-            e->hash == hash && e->from_hash == from_hash) {
-            ray_retain(e->result);
-            return e->result;
-        }
-    }
-    return NULL;
-}
-
-static void select_expr_cache_put(uint64_t hash, uint64_t from_hash,
-                                  ray_t* result) {
-    if (!g_ray_profile.active) return;
-    if (!hash || !result || RAY_IS_ERR(result)) return;
-    select_expr_cache_entry_t* e =
-        &g_select_expr_cache[g_select_expr_cache_next++ % SELECT_EXPR_CACHE_N];
-    if (e->result) ray_release(e->result);
-    e->hash = hash;
-    e->from_hash = from_hash;
-    e->env_gen = ray_env_generation();
-    e->result = result;
-    ray_retain(e->result);
-}
-
 /* Flatten a RAY_DICT (keys SYM vec + vals LIST) into a transient
  * [k0,v0,k1,v1,...] array view so the existing dict-walking loops in
  * ray_select_fn et al. can iterate without rewriting every site.
@@ -1958,18 +1817,6 @@ static void order_count_clauses(xbar_count_clause_t* clauses, uint8_t n) {
     }
 }
 
-static int xbar_clause_cache_eq(const xbar_count_clause_t* a, uint8_t an,
-                                const xbar_count_clause_t* b, uint8_t bn) {
-    if (an != bn) return 0;
-    for (uint8_t i = 0; i < an; i++) {
-        if (a[i].base != b[i].base || a[i].type != b[i].type ||
-            a[i].attrs != b[i].attrs || a[i].op != b[i].op ||
-            a[i].rhs != b[i].rhs)
-            return 0;
-    }
-    return 1;
-}
-
 static int match_i16_key_ne_zero(ray_t* where_expr, int64_t key_sym) {
     if (!where_expr || where_expr->type != RAY_LIST || ray_len(where_expr) != 3)
         return 0;
@@ -2046,20 +1893,6 @@ static ray_t* try_i16_ne0_count_desc_select(ray_t* tbl, ray_t* where_expr,
         (col->attrs & RAY_ATTR_HAS_NULLS))
         return NULL;
 
-    static ray_t* cache_result = NULL;
-    static ray_t* cache_tbl = NULL;
-    static ray_t* cache_col = NULL;
-    static int64_t cache_len = -1;
-    static int64_t cache_key_sym = -1;
-    static int64_t cache_count_alias = -1;
-    static int64_t cache_take = -1;
-    if (cache_result && cache_tbl == tbl && cache_col == col &&
-        cache_len == col->len && cache_key_sym == key_sym &&
-        cache_count_alias == count_alias && cache_take == take_n) {
-        ray_retain(cache_result);
-        return cache_result;
-    }
-
     ray_pool_t* pool = ray_pool_get();
     uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
     if (nw == 0) nw = 1;
@@ -2133,16 +1966,6 @@ static ray_t* try_i16_ne0_count_desc_select(ray_t* tbl, ray_t* where_expr,
     out = ray_table_add_col(out, key_sym, key_out);
     out = ray_table_add_col(out, count_alias, cnt_out);
     ray_release(key_out); ray_release(cnt_out);
-    if (cache_result)
-        ray_release(cache_result);
-    cache_result = out;
-    cache_tbl = tbl;
-    cache_col = col;
-    cache_len = col->len;
-    cache_key_sym = key_sym;
-    cache_count_alias = count_alias;
-    cache_take = take_n;
-    ray_retain(cache_result);
     return out;
 }
 
@@ -2221,20 +2044,6 @@ static ray_t* try_i32_i64_count_distinct_select(ray_t* tbl, ray_t* where_expr,
         (dcol->attrs & RAY_ATTR_HAS_NULLS))
         return NULL;
 
-    static ray_t* cache_result = NULL;
-    static ray_t* cache_tbl = NULL;
-    static int64_t cache_len = -1;
-    static int64_t cache_group_sym = -1;
-    static int64_t cache_distinct_sym = -1;
-    static int64_t cache_count_alias = -1;
-    static int64_t cache_take = -1;
-    if (cache_result && cache_tbl == tbl && cache_len == gcol->len &&
-        cache_group_sym == group_sym && cache_distinct_sym == distinct_sym &&
-        cache_count_alias == count_alias && cache_take == take_n) {
-        ray_retain(cache_result);
-        return cache_result;
-    }
-
     int64_t nrows = ray_table_nrows(tbl);
     ray_pool_t* pool = ray_pool_get();
     uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
@@ -2379,16 +2188,6 @@ static ray_t* try_i32_i64_count_distinct_select(ray_t* tbl, ray_t* where_expr,
     out = ray_table_add_col(out, group_sym, key_out);
     out = ray_table_add_col(out, count_alias, cnt_out);
     ray_release(key_out); ray_release(cnt_out);
-    if (cache_result)
-        ray_release(cache_result);
-    cache_result = out;
-    cache_tbl = tbl;
-    cache_len = gcol->len;
-    cache_group_sym = group_sym;
-    cache_distinct_sym = distinct_sym;
-    cache_count_alias = count_alias;
-    cache_take = take_n;
-    ray_retain(cache_result);
     return out;
 }
 
@@ -2463,27 +2262,6 @@ static ray_t* try_i16x2_count_desc_select(ray_t* tbl, ray_t* where_expr,
         return NULL;
     order_count_clauses(clauses, n_clauses);
 
-    static ray_t* cache_result = NULL;
-    static ray_t* cache_tbl = NULL;
-    static ray_t* cache_col0 = NULL;
-    static ray_t* cache_col1 = NULL;
-    static int64_t cache_len = -1;
-    static int64_t cache_key0 = -1;
-    static int64_t cache_key1 = -1;
-    static int64_t cache_count_alias = -1;
-    static int64_t cache_take = -1;
-    static uint8_t cache_n_clauses = 0;
-    static xbar_count_clause_t cache_clauses[16];
-    if (cache_result && cache_tbl == tbl && cache_col0 == col0 &&
-        cache_col1 == col1 && cache_len == col0->len &&
-        cache_key0 == key0_atom->i64 && cache_key1 == key1_atom->i64 &&
-        cache_count_alias == count_alias && cache_take == take_n &&
-        xbar_clause_cache_eq(cache_clauses, cache_n_clauses,
-                             clauses, n_clauses)) {
-        ray_retain(cache_result);
-        return cache_result;
-    }
-
     int64_t nrows = ray_table_nrows(tbl);
     const uint32_t cap = 4096;
     const uint32_t mask = cap - 1u;
@@ -2619,20 +2397,6 @@ static ray_t* try_i16x2_count_desc_select(ray_t* tbl, ray_t* where_expr,
     out = ray_table_add_col(out, key1_atom->i64, key1_out);
     out = ray_table_add_col(out, count_alias, cnt_out);
     ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out);
-    if (cache_result)
-        ray_release(cache_result);
-    cache_result = out;
-    cache_tbl = tbl;
-    cache_col0 = col0;
-    cache_col1 = col1;
-    cache_len = col0->len;
-    cache_key0 = key0_atom->i64;
-    cache_key1 = key1_atom->i64;
-    cache_count_alias = count_alias;
-    cache_take = take_n;
-    cache_n_clauses = n_clauses;
-    memcpy(cache_clauses, clauses, sizeof(clauses));
-    ray_retain(cache_result);
     return out;
 }
 
@@ -2710,26 +2474,6 @@ static ray_t* try_xbar_count_select(ray_t* tbl, ray_t* where_expr,
 
     int64_t nrows = ray_table_nrows(tbl);
     const int64_t* key_data = (const int64_t*)ray_data(key_col);
-    static ray_t* cache_result = NULL;
-    static ray_t* cache_tbl = NULL;
-    static ray_t* cache_key_col = NULL;
-    static int64_t cache_len = -1;
-    static int64_t cache_key_sym = -1;
-    static int64_t cache_out_sym = -1;
-    static int64_t cache_count_alias = -1;
-    static int64_t cache_bucket = -1;
-    static int64_t cache_take = -1;
-    static uint8_t cache_n_clauses = 0;
-    static xbar_count_clause_t cache_clauses[16];
-    if (cache_result && cache_tbl == tbl && cache_key_col == key_col &&
-        cache_len == key_col->len && cache_key_sym == xe[1]->i64 &&
-        cache_out_sym == key_atom->i64 && cache_count_alias == count_alias &&
-        cache_bucket == bucket && cache_take == take_n &&
-        xbar_clause_cache_eq(cache_clauses, cache_n_clauses,
-                             clauses, n_clauses)) {
-        ray_retain(cache_result);
-        return cache_result;
-    }
     const uint32_t cap = 4096;
     const uint32_t mask = cap - 1u;
     ray_pool_t* pool = ray_pool_get();
@@ -2871,20 +2615,6 @@ static ray_t* try_xbar_count_select(ray_t* tbl, ray_t* where_expr,
     out = ray_table_add_col(out, count_alias, cnt_out);
     ray_release(key_out);
     ray_release(cnt_out);
-    if (cache_result)
-        ray_release(cache_result);
-    cache_result = out;
-    cache_tbl = tbl;
-    cache_key_col = key_col;
-    cache_len = key_col->len;
-    cache_key_sym = xe[1]->i64;
-    cache_out_sym = key_atom->i64;
-    cache_count_alias = count_alias;
-    cache_bucket = bucket;
-    cache_take = take_n;
-    cache_n_clauses = n_clauses;
-    memcpy(cache_clauses, clauses, sizeof(clauses));
-    ray_retain(cache_result);
     return out;
 }
 
@@ -4980,12 +4710,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
     /* Evaluate 'from:' to get the source table */
     ray_t* from_expr = dict_get(dict, "from");
     if (!from_expr) return ray_error("domain", NULL);
-    uint64_t select_cache_hash_value = ray_expr_hash(dict);
-    uint64_t select_cache_from_hash = ray_expr_hash(from_expr);
-    ray_t* expr_cached = select_expr_cache_get(select_cache_hash_value,
-                                               select_cache_from_hash);
-    if (expr_cached)
-        return expr_cached;
     ray_t* where_expr = dict_get(dict, "where");
     ray_group_emit_filter_t prev_emit_filter = ray_group_emit_filter_get();
     ray_group_emit_filter_t emit_filter = {0};
@@ -4998,15 +4722,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
         ray_group_emit_filter_set(prev_emit_filter);
     if (RAY_IS_ERR(tbl)) return tbl;
     if (tbl->type != RAY_TABLE) { ray_release(tbl); return ray_error("type", NULL); }
-    int64_t select_cache_nrows = ray_table_nrows(tbl);
-    ray_t* select_cached = select_cache_get(tbl, select_cache_nrows,
-                                            select_cache_hash_value,
-                                            select_cache_from_hash);
-    if (select_cached) {
-        ray_release(tbl);
-        return select_cached;
-    }
-
     ray_t* by_expr = dict_get(dict, "by");
     ray_t* take_expr = dict_get(dict, "take");
     ray_t* nearest_expr = dict_get(dict, "nearest");
@@ -6405,9 +6120,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                         ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr);
                         if (eval_tbl != tbl) ray_release(eval_tbl);
                         ray_release(tbl);
-                        select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
                         return result;
                     }
                 }
@@ -6668,16 +6380,10 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 if (eval_tbl != tbl) ray_release(eval_tbl);
                 ray_release(tbl);
                 if (take_preapplied) {
-                    select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
                     return result;
                 }
                 result = apply_sort_take(result, dict_elems, dict_n,
                                          asc_id, desc_id, take_id);
-                select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
                 return result;
             }
 
@@ -6868,9 +6574,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 }
                 res = apply_sort_take(res, dict_elems, dict_n,
                                       asc_id, desc_id, take_id);
-                select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, res);
                 return res;
             }
 
@@ -7282,9 +6985,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
             ray_release(tbl);
             result = apply_sort_take(result, dict_elems, dict_n,
                                      asc_id, desc_id, take_id);
-            select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
             return result;
         }
 
@@ -8423,9 +8123,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
             ray_release(tbl);
             result = apply_sort_take(result, dict_elems, dict_n,
                                      asc_id, desc_id, take_id);
-            select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
             return result;
         }
     } else if (n_out > 0) {
@@ -8573,9 +8270,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 ray_graph_free(g); ray_release(tbl);
                 result = apply_sort_take(result, dict_elems, dict_n,
                                          asc_id, desc_id, take_id);
-                select_cache_put(tbl, select_cache_nrows,
-                                 select_cache_hash_value,
-                                 select_cache_from_hash, result);
                 return result;
             } else {
                 root = ray_select_op(g, root, col_ops, nc);
@@ -9615,8 +9309,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
     if (by_sym_vec_owned) ray_release(by_sym_vec_owned);
     if (saved_selection) ray_release(saved_selection);
 
-    select_cache_put(tbl, select_cache_nrows, select_cache_hash_value,
-                     select_cache_from_hash, result);
     return result;
 }
 

From 9a992ab24c99cf2ab06a1a35bbf03e947c9b3d14 Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Fri, 22 May 2026 17:59:35 +0200
Subject: [PATCH 02/11] perf(group): early-abort the DA-path min/max probe on
 doomed key spans
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The direct-array group-by path probes each key column's min/max to
decide whether a dense slot array fits (≤ DA_MAX_COMPOSITE_SLOTS).
On high-cardinality keys (UserID, WatchID, ClientIP, …) the probe
always loses, but it still scanned the full 10M-row column first —
and multi-key queries paid it once per key.

minmax_scan_fn now carries a shared abort flag and a span budget.
Every 8192 rows each worker polls the flag and checks its own running
key span; once any worker sees a span wider than the budget it sets
the flag, the remaining workers stop at their next poll, and the query
falls through to the radix HT path. Correctness is unchanged — a
worker only raises the flag once the span already exceeds what the DA
path could ever accept, so the caller's da_fits rejection is identical
to a full scan's.

Impact is minor: the eliminated scan is memory-bandwidth-bound and
overlaps other work, so wall-time on the large group-by queries moves
only within run-to-run noise. The change removes provably wasted CPU
work; it is not a measured wall-time win. Test suite 2657/2659
(2 skipped, 0 failed).
---
 src/ops/group.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/ops/group.c b/src/ops/group.c
index aeb5453e..37f01670 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -3107,6 +3107,12 @@ typedef struct {
     uint32_t    n_workers;
     const int64_t* match_idx;    /* NULL = no selection */
     ray_t*      rowsel;
+    /* DA-path early-out: once any worker observes a key span wider than
+     * span_budget the direct-array path is provably infeasible (its slot
+     * count would exceed DA_MAX_COMPOSITE_SLOTS), so the whole scan can
+     * stop instead of reading the rest of a 10M-row column for nothing. */
+    int64_t          span_budget;
+    _Atomic(int)*    abort_flag;
 } minmax_ctx_t;
 
 static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
@@ -3115,11 +3121,25 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t
     const int64_t* match_idx = c->match_idx;
     int64_t kmin = INT64_MAX, kmax = INT64_MIN;
     int8_t t = c->key_type;
+    const int64_t span_budget = c->span_budget;
 
+    /* Span check and abort poll are batched (every 8192 rows) so the
+     * hot per-row loop body stays a branchless min/max with no atomics. */
     #define MINMAX_SEG_LOOP(TYPE, CAST) \
         do { \
             const TYPE* kd = (const TYPE*)c->key_data; \
             for (int64_t i = start; i < end; i++) { \
+                if (((i - start) & 8191) == 0) { \
+                    if (atomic_load_explicit(c->abort_flag, \
+                                             memory_order_relaxed)) \
+                        goto minmax_done; \
+                    if (kmax >= kmin && \
+                        (uint64_t)(kmax - kmin) > (uint64_t)span_budget) { \
+                        atomic_store_explicit(c->abort_flag, 1, \
+                                              memory_order_relaxed); \
+                        goto minmax_done; \
+                    } \
+                } \
                 int64_t r = match_idx ? match_idx[i] : i; \
                 if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, r)) continue; \
                 int64_t v = (int64_t)CAST kd[r]; \
@@ -3146,6 +3166,7 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t
 
     #undef MINMAX_SEG_LOOP
 
+minmax_done:
     /* Merge with existing per-worker values (a worker may process multiple morsels) */
     if (kmin < c->per_worker_min[wid]) c->per_worker_min[wid] = kmin;
     if (kmax > c->per_worker_max[wid]) c->per_worker_max[wid] = kmax;
@@ -5414,6 +5435,9 @@ da_path:;
                             ? ray_pool_total_workers(mm_pool) : 1;
             /* VLA bounded by worker count — max ~2KB per key even on 256-core systems. */
             int64_t mm_mins[mm_n], mm_maxs[mm_n];
+            /* Shared across keys: once any key proves the DA slot count
+             * infeasible the scan aborts instead of reading the rest. */
+            _Atomic(int) mm_abort = 0;
             for (uint8_t k = 0; k < n_keys && da_fits; k++) {
                 int64_t kmin, kmax;
                 for (uint32_t w = 0; w < mm_n; w++) {
@@ -5429,12 +5453,18 @@ da_path:;
                     .n_workers      = mm_n,
                     .match_idx      = match_idx,
                     .rowsel         = rowsel,
+                    .span_budget    = DA_MAX_COMPOSITE_SLOTS,
+                    .abort_flag     = &mm_abort,
                 };
                 if (mm_n > 1) {
                     ray_pool_dispatch(mm_pool, minmax_scan_fn, &mm_ctx, n_scan);
                 } else {
                     minmax_scan_fn(&mm_ctx, 0, 0, n_scan);
                 }
+                if (atomic_load_explicit(&mm_abort, memory_order_relaxed)) {
+                    da_fits = false;
+                    break;
+                }
                 kmin = INT64_MAX; kmax = INT64_MIN;
                 for (uint32_t w = 0; w < mm_n; w++) {
                     if (mm_mins[w] < kmin) kmin = mm_mins[w];

From 477990e3d5b17f3abc65b79da448905d46e0065e Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Fri, 22 May 2026 18:16:04 +0200
Subject: [PATCH 03/11] refactor(query): remove benchmark-shaped query
 fast-paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

try_xbar_count_select / try_i16_ne0_count_desc_select /
try_i32_i64_count_distinct_select / try_i16x2_count_desc_select
pattern-matched exact query shapes from a specific benchmark suite
(i16 "!= 0" filter + count + desc + take; two i16 keys; i32/i64
count-distinct; xbar time-bucket count) and ran hand-written kernels
for them, bypassing the general select/group-by planner.

These are benchmark-specific special cases, not general query
optimizations, so they are removed along with their exclusive helpers
(parse_xbar_count_clause, order_count_clauses, the per-shape worker
fns and comparators; ~1125 lines). Queries of these shapes now run
through the normal select path.

Test suite: 2657/2659 pass (2 skipped, 0 failed).
---
 src/ops/query.c | 1162 -----------------------------------------------
 1 file changed, 1162 deletions(-)

diff --git a/src/ops/query.c b/src/ops/query.c
index db96b92d..c738c844 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -1493,1131 +1493,6 @@ static int atom_i64_const(ray_t* v, int64_t* out) {
     }
 }
 
-typedef struct {
-    const void* base;
-    int8_t type;
-    uint8_t attrs;
-    int op;
-    int64_t rhs;
-} xbar_count_clause_t;
-
-typedef struct {
-    int64_t key;
-    int64_t count;
-} xbar_count_pair_t;
-
-typedef struct {
-    uint32_t key;
-    uint32_t count;
-} i16x2_count_pair_t;
-
-typedef struct {
-    int32_t key;
-    uint32_t count;
-} i32_count_pair_t;
-
-typedef struct {
-    int16_t key;
-    uint32_t count;
-} i16_count_pair_t;
-
-typedef struct {
-    const int64_t* key_data;
-    int64_t bucket;
-    xbar_count_clause_t clauses[16];
-    uint8_t n_clauses;
-    uint32_t cap;
-    int64_t* keys;
-    uint32_t* counts;
-    uint8_t* used;
-    _Atomic int overflow;
-} xbar_count_ctx_t;
-
-typedef struct {
-    const int16_t* key0;
-    const int16_t* key1;
-    xbar_count_clause_t clauses[16];
-    uint8_t n_clauses;
-    uint32_t cap;
-    uint32_t* keys;
-    uint32_t* counts;
-    uint8_t* used;
-    _Atomic int overflow;
-} i16x2_count_ctx_t;
-
-typedef struct {
-    const int16_t* key;
-    uint32_t* counts;
-} i16_ne0_count_ctx_t;
-
-typedef struct {
-    const int32_t* group;
-    const int64_t* distinct;
-    uint32_t cap;
-    int32_t* groups;
-    int64_t* values;
-    uint8_t* used;
-    _Atomic int overflow;
-} i32_i64_cd_ctx_t;
-
-static int xbar_count_pair_cmp(const void* a, const void* b) {
-    const xbar_count_pair_t* pa = (const xbar_count_pair_t*)a;
-    const xbar_count_pair_t* pb = (const xbar_count_pair_t*)b;
-    return (pa->key > pb->key) - (pa->key < pb->key);
-}
-
-static int i16x2_count_pair_desc_cmp(const void* a, const void* b) {
-    const i16x2_count_pair_t* pa = (const i16x2_count_pair_t*)a;
-    const i16x2_count_pair_t* pb = (const i16x2_count_pair_t*)b;
-    if (pa->count != pb->count)
-        return (pa->count < pb->count) - (pa->count > pb->count);
-    return (pa->key > pb->key) - (pa->key < pb->key);
-}
-
-static int i32_count_pair_desc_cmp(const void* a, const void* b) {
-    const i32_count_pair_t* pa = (const i32_count_pair_t*)a;
-    const i32_count_pair_t* pb = (const i32_count_pair_t*)b;
-    if (pa->count != pb->count)
-        return (pa->count < pb->count) - (pa->count > pb->count);
-    return (pa->key > pb->key) - (pa->key < pb->key);
-}
-
-static int i16_count_pair_desc_cmp(const void* a, const void* b) {
-    const i16_count_pair_t* pa = (const i16_count_pair_t*)a;
-    const i16_count_pair_t* pb = (const i16_count_pair_t*)b;
-    if (pa->count != pb->count)
-        return (pa->count < pb->count) - (pa->count > pb->count);
-    return (pa->key > pb->key) - (pa->key < pb->key);
-}
-
-static uint64_t xbar_count_hash_i64(int64_t v) {
-    uint64_t h = (uint64_t)v;
-    h ^= h >> 33;
-    h *= 0xff51afd7ed558ccdULL;
-    h ^= h >> 33;
-    h *= 0xc4ceb9fe1a85ec53ULL;
-    h ^= h >> 33;
-    return h;
-}
-
-static uint32_t count_hash_u32(uint32_t v) {
-    uint32_t h = v;
-    h ^= h >> 16;
-    h *= 0x7feb352dU;
-    h ^= h >> 15;
-    h *= 0x846ca68bU;
-    h ^= h >> 16;
-    return h;
-}
-
-static uint64_t count_hash_i32_i64(int32_t g, int64_t v) {
-    uint64_t h = (uint64_t)(uint32_t)g * 0x9E3779B97F4A7C15ULL;
-    uint64_t x = (uint64_t)v;
-    x ^= x >> 33;
-    x *= 0xff51afd7ed558ccdULL;
-    x ^= x >> 33;
-    h ^= x + 0xBF58476D1CE4E5B9ULL + (h << 6) + (h >> 2);
-    h ^= h >> 33;
-    return h;
-}
-
-static void xbar_count_worker_fn(void* raw, uint32_t worker_id,
-                                 int64_t start, int64_t end) {
-    xbar_count_ctx_t* ctx = (xbar_count_ctx_t*)raw;
-    uint32_t cap = ctx->cap;
-    uint32_t mask = cap - 1u;
-    int64_t* keys = ctx->keys + (size_t)worker_id * cap;
-    uint32_t* counts = ctx->counts + (size_t)worker_id * cap;
-    uint8_t* used = ctx->used + (size_t)worker_id * cap;
-    int64_t n_groups = 0;
-    int64_t bucket = ctx->bucket;
-
-    for (int64_t r = start; r < end; r++) {
-        uint8_t pass = 1;
-        for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) {
-            const xbar_count_clause_t* c = &ctx->clauses[ci];
-            int64_t v = read_col_i64(c->base, r, c->type, c->attrs);
-            if (c->op == 1) pass &= (uint8_t)(v == c->rhs);
-            else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs);
-            else pass &= (uint8_t)(v <= c->rhs);
-            if (!pass) break;
-        }
-        if (!pass) continue;
-        int64_t ts = ctx->key_data[r];
-        int64_t q = ts / bucket;
-        if ((ts ^ bucket) < 0 && q * bucket != ts) q--;
-        int64_t k = q * bucket;
-        uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask;
-        while (used[slot] && keys[slot] != k)
-            slot = (slot + 1u) & mask;
-        if (!used[slot]) {
-            if (n_groups >= (int64_t)(cap / 2)) {
-                atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed);
-                return;
-            }
-            used[slot] = 1;
-            keys[slot] = k;
-            n_groups++;
-        }
-        counts[slot]++;
-    }
-}
-
-static void i16x2_count_worker_fn(void* raw, uint32_t worker_id,
-                                  int64_t start, int64_t end) {
-    i16x2_count_ctx_t* ctx = (i16x2_count_ctx_t*)raw;
-    uint32_t cap = ctx->cap;
-    uint32_t mask = cap - 1u;
-    uint32_t* keys = ctx->keys + (size_t)worker_id * cap;
-    uint32_t* counts = ctx->counts + (size_t)worker_id * cap;
-    uint8_t* used = ctx->used + (size_t)worker_id * cap;
-    int64_t n_groups = 0;
-
-    for (int64_t r = start; r < end; r++) {
-        uint8_t pass = 1;
-        for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) {
-            const xbar_count_clause_t* c = &ctx->clauses[ci];
-            int64_t v = read_col_i64(c->base, r, c->type, c->attrs);
-            if (c->op == 1) pass &= (uint8_t)(v == c->rhs);
-            else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs);
-            else pass &= (uint8_t)(v <= c->rhs);
-            if (!pass) break;
-        }
-        if (!pass) continue;
-        uint32_t k = ((uint32_t)(uint16_t)ctx->key0[r] << 16) |
-                     (uint32_t)(uint16_t)ctx->key1[r];
-        uint32_t slot = count_hash_u32(k) & mask;
-        while (used[slot] && keys[slot] != k)
-            slot = (slot + 1u) & mask;
-        if (!used[slot]) {
-            if (n_groups >= (int64_t)(cap / 2)) {
-                atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed);
-                return;
-            }
-            used[slot] = 1;
-            keys[slot] = k;
-            n_groups++;
-        }
-        counts[slot]++;
-    }
-}
-
-static void i16_ne0_count_worker_fn(void* raw, uint32_t worker_id,
-                                    int64_t start, int64_t end) {
-    i16_ne0_count_ctx_t* ctx = (i16_ne0_count_ctx_t*)raw;
-    uint32_t* counts = ctx->counts + (size_t)worker_id * 65536u;
-    const int16_t* key = ctx->key;
-    for (int64_t r = start; r < end; r++) {
-        int16_t v = key[r];
-        if (v)
-            counts[(uint32_t)((int32_t)v + 32768)]++;
-    }
-}
-
-static void i32_i64_cd_worker_fn(void* raw, uint32_t worker_id,
-                                 int64_t start, int64_t end) {
-    i32_i64_cd_ctx_t* ctx = (i32_i64_cd_ctx_t*)raw;
-    uint32_t cap = ctx->cap;
-    uint32_t mask = cap - 1u;
-    int32_t* groups = ctx->groups + (size_t)worker_id * cap;
-    int64_t* values = ctx->values + (size_t)worker_id * cap;
-    uint8_t* used = ctx->used + (size_t)worker_id * cap;
-    int64_t n_filled = 0;
-
-    for (int64_t r = start; r < end; r++) {
-        int32_t g = ctx->group[r];
-        int64_t v = ctx->distinct[r];
-        uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & mask;
-        while (used[slot] && (groups[slot] != g || values[slot] != v))
-            slot = (slot + 1u) & mask;
-        if (!used[slot]) {
-            if (n_filled >= (int64_t)(cap * 7u / 10u)) {
-                atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed);
-                return;
-            }
-            used[slot] = 1;
-            groups[slot] = g;
-            values[slot] = v;
-            n_filled++;
-        }
-    }
-}
-
-static int sym_name_eq(int64_t sym, const char* name, size_t len) {
-    ray_t* s = ray_sym_str(sym);
-    return s && ray_str_len(s) == len &&
-           memcmp(ray_str_ptr(s), name, len) == 0;
-}
-
-static int parse_xbar_count_clause(ray_t* tbl, ray_t* expr,
-                                   xbar_count_clause_t* clauses,
-                                   uint8_t* n_clauses) {
-    if (!expr || expr->type != RAY_LIST || ray_len(expr) < 3) return 0;
-    ray_t** elems = (ray_t**)ray_data(expr);
-    if (!elems[0] || elems[0]->type != -RAY_SYM) return 0;
-    ray_t* head = ray_sym_str(elems[0]->i64);
-    if (!head) return 0;
-    const char* hn = ray_str_ptr(head);
-    size_t hl = ray_str_len(head);
-    if (hl == 3 && memcmp(hn, "and", 3) == 0) {
-        for (int64_t i = 1; i < ray_len(expr); i++)
-            if (!parse_xbar_count_clause(tbl, elems[i], clauses, n_clauses))
-                return 0;
-        return 1;
-    }
-    if (ray_len(expr) != 3 || *n_clauses >= 16) return 0;
-    int op = 0;
-    if (hl == 2 && memcmp(hn, "==", 2) == 0) op = 1;
-    else if (hl == 2 && memcmp(hn, ">=", 2) == 0) op = 2;
-    else if (hl == 2 && memcmp(hn, "<=", 2) == 0) op = 3;
-    else return 0;
-
-    ray_t* lhs = elems[1];
-    ray_t* rhs = elems[2];
-    int64_t rhs_i = 0;
-    if (!lhs || lhs->type != -RAY_SYM || !(lhs->attrs & RAY_ATTR_NAME) ||
-        !atom_i64_const(rhs, &rhs_i))
-        return 0;
-    ray_t* col = ray_table_get_col(tbl, lhs->i64);
-    if (!col || !ray_is_vec(col) || RAY_IS_PARTED(col->type) ||
-        col->type == RAY_MAPCOMMON || (col->attrs & RAY_ATTR_HAS_NULLS))
-        return 0;
-    int8_t ct = col->type;
-    if (ct != RAY_BOOL && ct != RAY_U8 && ct != RAY_I16 &&
-        ct != RAY_I32 && ct != RAY_I64 && ct != RAY_DATE &&
-        ct != RAY_TIME && ct != RAY_TIMESTAMP)
-        return 0;
-    clauses[*n_clauses] = (xbar_count_clause_t){
-        .base = ray_data(col),
-        .type = ct,
-        .attrs = col->attrs,
-        .op = op,
-        .rhs = rhs_i,
-    };
-    (*n_clauses)++;
-    return 1;
-}
-
-static int count_clause_score(const xbar_count_clause_t* c) {
-    if (c->op == 1 && ray_sym_elem_size(c->type, c->attrs) >= 8) return 0;
-    if (c->op == 1) return 1;
-    return 2;
-}
-
-static void order_count_clauses(xbar_count_clause_t* clauses, uint8_t n) {
-    for (uint8_t i = 1; i < n; i++) {
-        xbar_count_clause_t v = clauses[i];
-        int vs = count_clause_score(&v);
-        uint8_t j = i;
-        while (j > 0 && count_clause_score(&clauses[j - 1]) > vs) {
-            clauses[j] = clauses[j - 1];
-            j--;
-        }
-        clauses[j] = v;
-    }
-}
-
-static int match_i16_key_ne_zero(ray_t* where_expr, int64_t key_sym) {
-    if (!where_expr || where_expr->type != RAY_LIST || ray_len(where_expr) != 3)
-        return 0;
-    ray_t** e = (ray_t**)ray_data(where_expr);
-    if (!e[0] || e[0]->type != -RAY_SYM ||
-        !sym_name_eq(e[0]->i64, "!=", 2))
-        return 0;
-    ray_t* lhs = e[1];
-    int64_t rhs = 0;
-    return lhs && lhs->type == -RAY_SYM && (lhs->attrs & RAY_ATTR_NAME) &&
-           lhs->i64 == key_sym && atom_i64_const(e[2], &rhs) && rhs == 0;
-}
-
-static ray_t* try_i16_ne0_count_desc_select(ray_t* tbl, ray_t* where_expr,
-                                            ray_t* by_expr, ray_t* take_expr,
-                                            ray_t** dict_elems,
-                                            int64_t dict_n,
-                                            int64_t from_id,
-                                            int64_t where_id,
-                                            int64_t by_id,
-                                            int64_t take_id,
-                                            int64_t asc_id,
-                                            int64_t desc_id,
-                                            int64_t nearest_id) {
-    if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr ||
-        !take_expr || by_expr->type != -RAY_SYM ||
-        !(by_expr->attrs & RAY_ATTR_NAME))
-        return NULL;
-    int64_t key_sym = by_expr->i64;
-    int64_t take_n = 0;
-    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024)
-        return NULL;
-    if (!match_i16_key_ne_zero(where_expr, key_sym))
-        return NULL;
-
-    int64_t count_alias = -1;
-    int saw_desc = 0;
-    int saw_key_projection = 0;
-    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
-        int64_t kid = dict_elems[i]->i64;
-        ray_t* v = dict_elems[i + 1];
-        if (kid == from_id || kid == where_id || kid == by_id ||
-            kid == take_id || kid == nearest_id)
-            continue;
-        if (kid == desc_id) {
-            if (!v || v->type != -RAY_SYM)
-                return NULL;
-            saw_desc = 1;
-            continue;
-        }
-        if (kid == asc_id) return NULL;
-        if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) &&
-            kid == key_sym && v->i64 == key_sym) {
-            saw_key_projection = 1;
-            continue;
-        }
-        if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2)
-            return NULL;
-        ray_t** ae = (ray_t**)ray_data(v);
-        if (!ae[0] || ae[0]->type != -RAY_SYM ||
-            !sym_name_eq(ae[0]->i64, "count", 5))
-            return NULL;
-        ray_t* arg = ae[1];
-        if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME) ||
-            arg->i64 != key_sym)
-            return NULL;
-        count_alias = kid;
-    }
-    if (!saw_desc || !saw_key_projection || count_alias < 0)
-        return NULL;
-
-    ray_t* col = ray_table_get_col(tbl, key_sym);
-    if (!col || !ray_is_vec(col) || col->type != RAY_I16 ||
-        (col->attrs & RAY_ATTR_HAS_NULLS))
-        return NULL;
-
-    ray_pool_t* pool = ray_pool_get();
-    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    if (nw == 0) nw = 1;
-    ray_t* counts_hdr = NULL;
-    uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
-        (size_t)nw * 65536u * sizeof(uint32_t));
-    if (!counts)
-        return ray_error("oom", NULL);
-
-    i16_ne0_count_ctx_t ctx = {
-        .key = (const int16_t*)ray_data(col),
-        .counts = counts,
-    };
-    int64_t nrows = ray_table_nrows(tbl);
-    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
-        ray_pool_dispatch(pool, i16_ne0_count_worker_fn, &ctx, nrows);
-    else
-        i16_ne0_count_worker_fn(&ctx, 0, 0, nrows);
-
-    i16_count_pair_t top[1024];
-    int64_t top_n = 0;
-    for (uint32_t s = 0; s < 65536u; s++) {
-        uint32_t total = 0;
-        for (uint32_t w = 0; w < nw; w++)
-            total += counts[(size_t)w * 65536u + s];
-        if (!total) continue;
-        i16_count_pair_t cand = {
-            .key = (int16_t)((int32_t)s - 32768),
-            .count = total,
-        };
-        if (top_n < take_n) {
-            top[top_n++] = cand;
-            continue;
-        }
-        int64_t min_i = 0;
-        for (int64_t i = 1; i < top_n; i++) {
-            if (top[i].count < top[min_i].count ||
-                (top[i].count == top[min_i].count && top[i].key > top[min_i].key))
-                min_i = i;
-        }
-        if (cand.count > top[min_i].count ||
-            (cand.count == top[min_i].count && cand.key < top[min_i].key))
-            top[min_i] = cand;
-    }
-    scratch_free(counts_hdr);
-    qsort(top, (size_t)top_n, sizeof(i16_count_pair_t),
-          i16_count_pair_desc_cmp);
-
-    int64_t out_n = top_n;
-    ray_t* key_out = ray_vec_new(RAY_I16, out_n);
-    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
-    if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) {
-        if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out);
-        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
-        return ray_error("oom", NULL);
-    }
-    key_out->len = out_n;
-    cnt_out->len = out_n;
-    int16_t* ko = (int16_t*)ray_data(key_out);
-    int64_t* co = (int64_t*)ray_data(cnt_out);
-    for (int64_t i = 0; i < out_n; i++) {
-        ko[i] = top[i].key;
-        co[i] = (int64_t)top[i].count;
-    }
-
-    ray_t* out = ray_table_new(2);
-    if (!out || RAY_IS_ERR(out)) {
-        ray_release(key_out); ray_release(cnt_out);
-        return out ? out : ray_error("oom", NULL);
-    }
-    out = ray_table_add_col(out, key_sym, key_out);
-    out = ray_table_add_col(out, count_alias, cnt_out);
-    ray_release(key_out); ray_release(cnt_out);
-    return out;
-}
-
-static ray_t* try_i32_i64_count_distinct_select(ray_t* tbl, ray_t* where_expr,
-                                                ray_t* by_expr,
-                                                ray_t* take_expr,
-                                                ray_t** dict_elems,
-                                                int64_t dict_n,
-                                                int64_t from_id,
-                                                int64_t where_id,
-                                                int64_t by_id,
-                                                int64_t take_id,
-                                                int64_t asc_id,
-                                                int64_t desc_id,
-                                                int64_t nearest_id) {
-    if (!tbl || tbl->type != RAY_TABLE || where_expr || !by_expr ||
-        !take_expr || by_expr->type != -RAY_SYM ||
-        !(by_expr->attrs & RAY_ATTR_NAME))
-        return NULL;
-
-    int64_t take_n = 0;
-    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024)
-        return NULL;
-
-    int64_t group_sym = by_expr->i64;
-    int64_t distinct_sym = -1;
-    int64_t count_alias = -1;
-    int saw_desc = 0;
-    int saw_group_projection = 0;
-    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
-        int64_t kid = dict_elems[i]->i64;
-        ray_t* v = dict_elems[i + 1];
-        if (kid == from_id || kid == where_id || kid == by_id ||
-            kid == take_id || kid == nearest_id)
-            continue;
-        if (kid == desc_id) {
-            if (!v || v->type != -RAY_SYM)
-                return NULL;
-            saw_desc = 1;
-            continue;
-        }
-        if (kid == asc_id) return NULL;
-        if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) &&
-            kid == group_sym && v->i64 == group_sym) {
-            saw_group_projection = 1;
-            continue;
-        }
-        if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2)
-            return NULL;
-        ray_t** ae = (ray_t**)ray_data(v);
-        if (!ae[0] || ae[0]->type != -RAY_SYM ||
-            !sym_name_eq(ae[0]->i64, "count", 5))
-            return NULL;
-        ray_t* inner = ae[1];
-        if (!inner || inner->type != RAY_LIST || ray_len(inner) != 2)
-            return NULL;
-        ray_t** ie = (ray_t**)ray_data(inner);
-        if (!ie[0] || ie[0]->type != -RAY_SYM ||
-            !sym_name_eq(ie[0]->i64, "distinct", 8))
-            return NULL;
-        ray_t* arg = ie[1];
-        if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME))
-            return NULL;
-        distinct_sym = arg->i64;
-        count_alias = kid;
-    }
-    if (!saw_desc || !saw_group_projection || count_alias < 0 ||
-        distinct_sym < 0)
-        return NULL;
-
-    ray_t* gcol = ray_table_get_col(tbl, group_sym);
-    ray_t* dcol = ray_table_get_col(tbl, distinct_sym);
-    if (!gcol || !dcol || !ray_is_vec(gcol) || !ray_is_vec(dcol) ||
-        gcol->type != RAY_I32 || dcol->type != RAY_I64 ||
-        (gcol->attrs & RAY_ATTR_HAS_NULLS) ||
-        (dcol->attrs & RAY_ATTR_HAS_NULLS))
-        return NULL;
-
-    int64_t nrows = ray_table_nrows(tbl);
-    ray_pool_t* pool = ray_pool_get();
-    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    if (nw == 0) nw = 1;
-    const uint32_t local_cap = 1u << 20;
-    ray_t *lg_hdr = NULL, *lv_hdr = NULL, *lu_hdr = NULL;
-    int32_t* lg = (int32_t*)scratch_calloc(&lg_hdr,
-        (size_t)nw * local_cap * sizeof(int32_t));
-    int64_t* lv = (int64_t*)scratch_calloc(&lv_hdr,
-        (size_t)nw * local_cap * sizeof(int64_t));
-    uint8_t* lu = (uint8_t*)scratch_calloc(&lu_hdr, (size_t)nw * local_cap);
-    if (!lg || !lv || !lu) {
-        if (lg_hdr) scratch_free(lg_hdr);
-        if (lv_hdr) scratch_free(lv_hdr);
-        if (lu_hdr) scratch_free(lu_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    i32_i64_cd_ctx_t ctx = {
-        .group = (const int32_t*)ray_data(gcol),
-        .distinct = (const int64_t*)ray_data(dcol),
-        .cap = local_cap,
-        .groups = lg,
-        .values = lv,
-        .used = lu,
-    };
-    atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed);
-    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
-        ray_pool_dispatch(pool, i32_i64_cd_worker_fn, &ctx, nrows);
-    else
-        i32_i64_cd_worker_fn(&ctx, 0, 0, nrows);
-    if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) {
-        scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
-        return NULL;
-    }
-
-    const uint32_t gcap = 1u << 23;
-    const uint32_t gmask = gcap - 1u;
-    ray_t *gg_hdr = NULL, *gv_hdr = NULL, *gu_hdr = NULL;
-    int32_t* gg = (int32_t*)scratch_calloc(&gg_hdr, (size_t)gcap * sizeof(int32_t));
-    int64_t* gv = (int64_t*)scratch_calloc(&gv_hdr, (size_t)gcap * sizeof(int64_t));
-    uint8_t* gu = (uint8_t*)scratch_calloc(&gu_hdr, (size_t)gcap);
-    if (!gg || !gv || !gu) {
-        scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
-        if (gg_hdr) scratch_free(gg_hdr);
-        if (gv_hdr) scratch_free(gv_hdr);
-        if (gu_hdr) scratch_free(gu_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    int64_t global_n = 0;
-    for (uint32_t w = 0; w < nw; w++) {
-        int32_t* wg = lg + (size_t)w * local_cap;
-        int64_t* wv = lv + (size_t)w * local_cap;
-        uint8_t* wu = lu + (size_t)w * local_cap;
-        for (uint32_t s = 0; s < local_cap; s++) {
-            if (!wu[s]) continue;
-            int32_t g = wg[s];
-            int64_t v = wv[s];
-            uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & gmask;
-            while (gu[slot] && (gg[slot] != g || gv[slot] != v))
-                slot = (slot + 1u) & gmask;
-            if (!gu[slot]) {
-                if (global_n >= (int64_t)(gcap * 7u / 10u)) {
-                    scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr);
-                    scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
-                    return NULL;
-                }
-                gu[slot] = 1;
-                gg[slot] = g;
-                gv[slot] = v;
-                global_n++;
-            }
-        }
-    }
-    scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
-
-    const uint32_t rcap = 4096;
-    const uint32_t rmask = rcap - 1u;
-    int32_t rkeys[4096];
-    uint32_t rcounts[4096];
-    uint8_t rused[4096];
-    memset(rused, 0, sizeof(rused));
-    int64_t region_n = 0;
-    for (uint32_t s = 0; s < gcap; s++) {
-        if (!gu[s]) continue;
-        int32_t g = gg[s];
-        uint32_t slot = count_hash_u32((uint32_t)g) & rmask;
-        while (rused[slot] && rkeys[slot] != g)
-            slot = (slot + 1u) & rmask;
-        if (!rused[slot]) {
-            if (region_n >= (int64_t)(rcap / 2)) {
-                scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr);
-                return NULL;
-            }
-            rused[slot] = 1;
-            rkeys[slot] = g;
-            rcounts[slot] = 0;
-            region_n++;
-        }
-        rcounts[slot]++;
-    }
-    scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr);
-
-    ray_t* pairs_hdr = NULL;
-    i32_count_pair_t* pairs = (i32_count_pair_t*)scratch_alloc(
-        &pairs_hdr, (size_t)region_n * sizeof(i32_count_pair_t));
-    if (!pairs && region_n > 0)
-        return ray_error("oom", NULL);
-    int64_t pi = 0;
-    for (uint32_t s = 0; s < rcap; s++) {
-        if (!rused[s]) continue;
-        pairs[pi++] = (i32_count_pair_t){ .key = rkeys[s], .count = rcounts[s] };
-    }
-    qsort(pairs, (size_t)region_n, sizeof(i32_count_pair_t),
-          i32_count_pair_desc_cmp);
-
-    int64_t out_n = region_n < take_n ? region_n : take_n;
-    ray_t* key_out = ray_vec_new(RAY_I32, out_n);
-    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
-    if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) {
-        if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out);
-        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
-        scratch_free(pairs_hdr);
-        return ray_error("oom", NULL);
-    }
-    key_out->len = out_n;
-    cnt_out->len = out_n;
-    int32_t* ko = (int32_t*)ray_data(key_out);
-    int64_t* co = (int64_t*)ray_data(cnt_out);
-    for (int64_t i = 0; i < out_n; i++) {
-        ko[i] = pairs[i].key;
-        co[i] = (int64_t)pairs[i].count;
-    }
-    scratch_free(pairs_hdr);
-
-    ray_t* out = ray_table_new(2);
-    if (!out || RAY_IS_ERR(out)) {
-        ray_release(key_out); ray_release(cnt_out);
-        return out ? out : ray_error("oom", NULL);
-    }
-    out = ray_table_add_col(out, group_sym, key_out);
-    out = ray_table_add_col(out, count_alias, cnt_out);
-    ray_release(key_out); ray_release(cnt_out);
-    return out;
-}
-
-static ray_t* try_i16x2_count_desc_select(ray_t* tbl, ray_t* where_expr,
-                                          ray_t* by_expr, ray_t* take_expr,
-                                          ray_t** dict_elems, int64_t dict_n,
-                                          int64_t from_id, int64_t where_id,
-                                          int64_t by_id, int64_t take_id,
-                                          int64_t asc_id, int64_t desc_id,
-                                          int64_t nearest_id) {
-    if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr ||
-        !take_expr || by_expr->type != RAY_DICT)
-        return NULL;
-
-    int64_t take_n = 0;
-    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000)
-        return NULL;
-
-    DICT_VIEW_DECL(bv);
-    DICT_VIEW_OPEN(by_expr, bv);
-    if (DICT_VIEW_OVERFLOW(bv) || bv_n != 4) return NULL;
-    ray_t* key0_atom = bv[0];
-    ray_t* key0_val = bv[1];
-    ray_t* key1_atom = bv[2];
-    ray_t* key1_val = bv[3];
-    if (!key0_atom || key0_atom->type != -RAY_SYM ||
-        !key1_atom || key1_atom->type != -RAY_SYM ||
-        !key0_val || key0_val->type != -RAY_SYM ||
-        !key1_val || key1_val->type != -RAY_SYM ||
-        !(key0_val->attrs & RAY_ATTR_NAME) ||
-        !(key1_val->attrs & RAY_ATTR_NAME) ||
-        key0_atom->i64 != key0_val->i64 ||
-        key1_atom->i64 != key1_val->i64)
-        return NULL;
-
-    int64_t count_alias = -1;
-    int saw_desc = 0;
-    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
-        int64_t kid = dict_elems[i]->i64;
-        ray_t* v = dict_elems[i + 1];
-        if (kid == from_id || kid == where_id || kid == by_id ||
-            kid == take_id || kid == nearest_id)
-            continue;
-        if (kid == desc_id) {
-            if (!v || v->type != -RAY_SYM)
-                return NULL;
-            saw_desc = 1;
-            continue;
-        }
-        if (kid == asc_id) return NULL;
-        if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL;
-        ray_t** ae = (ray_t**)ray_data(v);
-        if (!ae[0] || ae[0]->type != -RAY_SYM ||
-            !sym_name_eq(ae[0]->i64, "count", 5))
-            return NULL;
-        count_alias = kid;
-    }
-    if (!saw_desc || count_alias < 0) return NULL;
-
-    ray_t* col0 = ray_table_get_col(tbl, key0_atom->i64);
-    ray_t* col1 = ray_table_get_col(tbl, key1_atom->i64);
-    if (!col0 || !col1 || !ray_is_vec(col0) || !ray_is_vec(col1) ||
-        col0->type != RAY_I16 || col1->type != RAY_I16 ||
-        (col0->attrs & RAY_ATTR_HAS_NULLS) ||
-        (col1->attrs & RAY_ATTR_HAS_NULLS))
-        return NULL;
-
-    xbar_count_clause_t clauses[16];
-    uint8_t n_clauses = 0;
-    if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) ||
-        n_clauses == 0)
-        return NULL;
-    order_count_clauses(clauses, n_clauses);
-
-    int64_t nrows = ray_table_nrows(tbl);
-    const uint32_t cap = 4096;
-    const uint32_t mask = cap - 1u;
-    ray_pool_t* pool = ray_pool_get();
-    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    if (nw == 0) nw = 1;
-
-    ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL;
-    uint32_t* keys = (uint32_t*)scratch_calloc(&keys_hdr,
-        (size_t)nw * cap * sizeof(uint32_t));
-    uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
-        (size_t)nw * cap * sizeof(uint32_t));
-    uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap);
-    if (!keys || !counts || !used) {
-        if (keys_hdr) scratch_free(keys_hdr);
-        if (counts_hdr) scratch_free(counts_hdr);
-        if (used_hdr) scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    i16x2_count_ctx_t ctx = {
-        .key0 = (const int16_t*)ray_data(col0),
-        .key1 = (const int16_t*)ray_data(col1),
-        .n_clauses = n_clauses,
-        .cap = cap,
-        .keys = keys,
-        .counts = counts,
-        .used = used,
-    };
-    memcpy(ctx.clauses, clauses, sizeof(clauses));
-    atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed);
-    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
-        ray_pool_dispatch(pool, i16x2_count_worker_fn, &ctx, nrows);
-    else
-        i16x2_count_worker_fn(&ctx, 0, 0, nrows);
-    if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) {
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        return NULL;
-    }
-
-    ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL;
-    uint32_t* mkeys = (uint32_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(uint32_t));
-    uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t));
-    uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap);
-    if (!mkeys || !mcounts || !mused) {
-        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
-        if (mkeys_hdr) scratch_free(mkeys_hdr);
-        if (mcounts_hdr) scratch_free(mcounts_hdr);
-        if (mused_hdr) scratch_free(mused_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    int64_t n_groups = 0;
-    for (uint32_t w = 0; w < nw; w++) {
-        uint32_t* wk = keys + (size_t)w * cap;
-        uint32_t* wc = counts + (size_t)w * cap;
-        uint8_t* wu = used + (size_t)w * cap;
-        for (uint32_t s = 0; s < cap; s++) {
-            if (!wu[s]) continue;
-            uint32_t k = wk[s];
-            uint32_t slot = count_hash_u32(k) & mask;
-            while (mused[slot] && mkeys[slot] != k)
-                slot = (slot + 1u) & mask;
-            if (!mused[slot]) {
-                if (n_groups >= (int64_t)(cap / 2)) {
-                    scratch_free(mkeys_hdr); scratch_free(mcounts_hdr);
-                    scratch_free(mused_hdr); scratch_free(keys_hdr);
-                    scratch_free(counts_hdr); scratch_free(used_hdr);
-                    return NULL;
-                }
-                mused[slot] = 1;
-                mkeys[slot] = k;
-                n_groups++;
-            }
-            mcounts[slot] += wc[s];
-        }
-    }
-
-    int64_t out_n = n_groups < take_n ? n_groups : take_n;
-    ray_t* pairs_hdr = NULL;
-    i16x2_count_pair_t* pairs = (i16x2_count_pair_t*)scratch_alloc(
-        &pairs_hdr, (size_t)n_groups * sizeof(i16x2_count_pair_t));
-    if (!pairs && n_groups > 0) {
-        scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
-        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-    int64_t pi = 0;
-    for (uint32_t s = 0; s < cap; s++) {
-        if (!mused[s]) continue;
-        pairs[pi++] = (i16x2_count_pair_t){ .key = mkeys[s], .count = mcounts[s] };
-    }
-    qsort(pairs, (size_t)n_groups, sizeof(i16x2_count_pair_t),
-          i16x2_count_pair_desc_cmp);
-
-    ray_t* key0_out = ray_vec_new(RAY_I16, out_n);
-    ray_t* key1_out = ray_vec_new(RAY_I16, out_n);
-    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
-    if (!key0_out || !key1_out || !cnt_out ||
-        RAY_IS_ERR(key0_out) || RAY_IS_ERR(key1_out) || RAY_IS_ERR(cnt_out)) {
-        if (key0_out && !RAY_IS_ERR(key0_out)) ray_release(key0_out);
-        if (key1_out && !RAY_IS_ERR(key1_out)) ray_release(key1_out);
-        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
-        scratch_free(pairs_hdr);
-        scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
-        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-    key0_out->len = out_n;
-    key1_out->len = out_n;
-    cnt_out->len = out_n;
-    int16_t* k0o = (int16_t*)ray_data(key0_out);
-    int16_t* k1o = (int16_t*)ray_data(key1_out);
-    int64_t* co = (int64_t*)ray_data(cnt_out);
-    for (int64_t i = 0; i < out_n; i++) {
-        uint32_t k = pairs[i].key;
-        k0o[i] = (int16_t)(uint16_t)(k >> 16);
-        k1o[i] = (int16_t)(uint16_t)k;
-        co[i] = (int64_t)pairs[i].count;
-    }
-    scratch_free(pairs_hdr);
-    scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
-    scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
-
-    ray_t* out = ray_table_new(3);
-    if (!out || RAY_IS_ERR(out)) {
-        ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out);
-        return out ? out : ray_error("oom", NULL);
-    }
-    out = ray_table_add_col(out, key0_atom->i64, key0_out);
-    out = ray_table_add_col(out, key1_atom->i64, key1_out);
-    out = ray_table_add_col(out, count_alias, cnt_out);
-    ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out);
-    return out;
-}
-
-static ray_t* try_xbar_count_select(ray_t* tbl, ray_t* where_expr,
-                                    ray_t* by_expr, ray_t* take_expr,
-                                    ray_t** dict_elems, int64_t dict_n,
-                                    int64_t from_id, int64_t where_id,
-                                    int64_t by_id, int64_t take_id,
-                                    int64_t asc_id, int64_t desc_id,
-                                    int64_t nearest_id) {
-    if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr ||
-        !take_expr)
-        return NULL;
-
-    int64_t take_n = 0;
-    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000)
-        return NULL;
-
-    if (!by_expr || by_expr->type != RAY_DICT) return NULL;
-    DICT_VIEW_DECL(bv);
-    DICT_VIEW_OPEN(by_expr, bv);
-    if (DICT_VIEW_OVERFLOW(bv) || bv_n != 2) return NULL;
-    ray_t* key_atom = bv[0];
-    ray_t* xbar_expr = bv[1];
-    if (!key_atom || key_atom->type != -RAY_SYM ||
-        !xbar_expr || xbar_expr->type != RAY_LIST ||
-        ray_len(xbar_expr) != 3)
-        return NULL;
-    ray_t** xe = (ray_t**)ray_data(xbar_expr);
-    if (!xe[0] || xe[0]->type != -RAY_SYM ||
-        !sym_name_eq(xe[0]->i64, "xbar", 4))
-        return NULL;
-    if (!xe[1] || xe[1]->type != -RAY_SYM ||
-        !(xe[1]->attrs & RAY_ATTR_NAME))
-        return NULL;
-    int64_t bucket = 0;
-    if (!atom_i64_const(xe[2], &bucket) || bucket <= 0) return NULL;
-
-    int64_t count_alias = -1;
-    int saw_asc = 0;
-    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
-        int64_t kid = dict_elems[i]->i64;
-        ray_t* v = dict_elems[i + 1];
-        if (kid == from_id || kid == where_id || kid == by_id ||
-            kid == take_id || kid == nearest_id)
-            continue;
-        if (kid == asc_id) {
-            if (!v || v->type != -RAY_SYM || v->i64 != key_atom->i64)
-                return NULL;
-            saw_asc = 1;
-            continue;
-        }
-        if (kid == desc_id) return NULL;
-        if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL;
-        ray_t** ae = (ray_t**)ray_data(v);
-        if (!ae[0] || ae[0]->type != -RAY_SYM ||
-            !sym_name_eq(ae[0]->i64, "count", 5))
-            return NULL;
-        count_alias = kid;
-    }
-    if (!saw_asc || count_alias < 0) return NULL;
-
-    ray_t* key_col = ray_table_get_col(tbl, xe[1]->i64);
-    if (!key_col || !ray_is_vec(key_col) || key_col->type != RAY_TIMESTAMP ||
-        RAY_IS_PARTED(key_col->type) || key_col->type == RAY_MAPCOMMON ||
-        (key_col->attrs & RAY_ATTR_HAS_NULLS))
-        return NULL;
-
-    xbar_count_clause_t clauses[16];
-    uint8_t n_clauses = 0;
-    if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) ||
-        n_clauses == 0)
-        return NULL;
-    order_count_clauses(clauses, n_clauses);
-
-    int64_t nrows = ray_table_nrows(tbl);
-    const int64_t* key_data = (const int64_t*)ray_data(key_col);
-    const uint32_t cap = 4096;
-    const uint32_t mask = cap - 1u;
-    ray_pool_t* pool = ray_pool_get();
-    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
-    if (nw == 0) nw = 1;
-    ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL;
-    int64_t* keys = (int64_t*)scratch_calloc(&keys_hdr,
-        (size_t)nw * cap * sizeof(int64_t));
-    uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
-        (size_t)nw * cap * sizeof(uint32_t));
-    uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap);
-    if (!keys || !counts || !used) {
-        if (keys_hdr) scratch_free(keys_hdr);
-        if (counts_hdr) scratch_free(counts_hdr);
-        if (used_hdr) scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    xbar_count_ctx_t ctx = {
-        .key_data = key_data,
-        .bucket = bucket,
-        .n_clauses = n_clauses,
-        .cap = cap,
-        .keys = keys,
-        .counts = counts,
-        .used = used,
-    };
-    memcpy(ctx.clauses, clauses, sizeof(clauses));
-    atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed);
-    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
-        ray_pool_dispatch(pool, xbar_count_worker_fn, &ctx, nrows);
-    else
-        xbar_count_worker_fn(&ctx, 0, 0, nrows);
-    if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) {
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        return NULL;
-    }
-
-    ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL;
-    int64_t* mkeys = (int64_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(int64_t));
-    uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t));
-    uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap);
-    if (!mkeys || !mcounts || !mused) {
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        if (mkeys_hdr) scratch_free(mkeys_hdr);
-        if (mcounts_hdr) scratch_free(mcounts_hdr);
-        if (mused_hdr) scratch_free(mused_hdr);
-        return ray_error("oom", NULL);
-    }
-
-    int64_t n_groups = 0;
-    for (uint32_t w = 0; w < nw; w++) {
-        int64_t* wk = keys + (size_t)w * cap;
-        uint32_t* wc = counts + (size_t)w * cap;
-        uint8_t* wu = used + (size_t)w * cap;
-        for (uint32_t s = 0; s < cap; s++) {
-            if (!wu[s]) continue;
-            int64_t k = wk[s];
-            uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask;
-            while (mused[slot] && mkeys[slot] != k)
-                slot = (slot + 1u) & mask;
-            if (!mused[slot]) {
-                if (n_groups >= (int64_t)(cap / 2)) {
-                    scratch_free(mkeys_hdr);
-                    scratch_free(mcounts_hdr);
-                    scratch_free(mused_hdr);
-                    scratch_free(keys_hdr);
-                    scratch_free(counts_hdr);
-                    scratch_free(used_hdr);
-                    return NULL;
-                }
-                mused[slot] = 1;
-                mkeys[slot] = k;
-                n_groups++;
-            }
-            mcounts[slot] += wc[s];
-        }
-    }
-
-    int64_t out_n = n_groups < take_n ? n_groups : take_n;
-    ray_t* pairs_hdr = NULL;
-    xbar_count_pair_t* pairs = (xbar_count_pair_t*)scratch_alloc(
-        &pairs_hdr, (size_t)n_groups * sizeof(xbar_count_pair_t));
-    if (!pairs && n_groups > 0) {
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-    int64_t pi = 0;
-    for (uint32_t s = 0; s < cap; s++) {
-        if (!mused[s]) continue;
-        pairs[pi++] = (xbar_count_pair_t){ .key = mkeys[s], .count = mcounts[s] };
-    }
-    qsort(pairs, (size_t)n_groups, sizeof(xbar_count_pair_t),
-          xbar_count_pair_cmp);
-
-    ray_t* key_out = ray_vec_new(RAY_TIMESTAMP, out_n);
-    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
-    if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) {
-        if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out);
-        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
-        scratch_free(pairs_hdr);
-        scratch_free(mkeys_hdr);
-        scratch_free(mcounts_hdr);
-        scratch_free(mused_hdr);
-        scratch_free(keys_hdr);
-        scratch_free(counts_hdr);
-        scratch_free(used_hdr);
-        return ray_error("oom", NULL);
-    }
-    key_out->len = out_n;
-    cnt_out->len = out_n;
-    int64_t* ko = (int64_t*)ray_data(key_out);
-    int64_t* co = (int64_t*)ray_data(cnt_out);
-    for (int64_t i = 0; i < out_n; i++) {
-        ko[i] = pairs[i].key;
-        co[i] = pairs[i].count;
-    }
-    scratch_free(pairs_hdr);
-    scratch_free(mkeys_hdr);
-    scratch_free(mcounts_hdr);
-    scratch_free(mused_hdr);
-    scratch_free(keys_hdr);
-    scratch_free(counts_hdr);
-    scratch_free(used_hdr);
-
-    ray_t* out = ray_table_new(2);
-    if (!out || RAY_IS_ERR(out)) {
-        ray_release(key_out);
-        ray_release(cnt_out);
-        return out ? out : ray_error("oom", NULL);
-    }
-    out = ray_table_add_col(out, key_atom->i64, key_out);
-    out = ray_table_add_col(out, count_alias, cnt_out);
-    ray_release(key_out);
-    ray_release(cnt_out);
-    return out;
-}
-
 static int expr_affine_of_sym(ray_t* expr, int64_t sym, int64_t* bias) {
     if (!expr) return 0;
     if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME) &&
@@ -4753,43 +3628,6 @@ ray_t* ray_select(ray_t** args, int64_t n) {
         if (kid == asc_id || kid == desc_id) { has_sort = true; break; }
     }
 
-    ray_t* xbar_count = try_xbar_count_select(tbl, where_expr, by_expr,
-                                              take_expr, dict_elems, dict_n,
-                                              from_id, where_id, by_id,
-                                              take_id, asc_id, desc_id,
-                                              nearest_id);
-    if (xbar_count) {
-        ray_release(tbl);
-        return xbar_count;
-    }
-
-    ray_t* i16_ne0_count = try_i16_ne0_count_desc_select(
-        tbl, where_expr, by_expr, take_expr, dict_elems, dict_n,
-        from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id);
-    if (i16_ne0_count) {
-        ray_release(tbl);
-        return i16_ne0_count;
-    }
-
-    ray_t* i32_i64_cd = try_i32_i64_count_distinct_select(
-        tbl, where_expr, by_expr, take_expr, dict_elems, dict_n,
-        from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id);
-    if (i32_i64_cd) {
-        ray_release(tbl);
-        return i32_i64_cd;
-    }
-
-    ray_t* i16x2_count = try_i16x2_count_desc_select(tbl, where_expr, by_expr,
-                                                     take_expr, dict_elems,
-                                                     dict_n, from_id,
-                                                     where_id, by_id,
-                                                     take_id, asc_id,
-                                                     desc_id, nearest_id);
-    if (i16x2_count) {
-        ray_release(tbl);
-        return i16x2_count;
-    }
-
     /* `nearest` is mutually exclusive with `asc`/`desc`/`by` — ANN
      * ordering is an index scan, not a column sort, and cannot be
      * composed with group-by in this phase. */

From 66c266124d8d356abc02735591e902d9c7ffac91 Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Mon, 25 May 2026 11:30:44 +0200
Subject: [PATCH 04/11] =?UTF-8?q?perf(group):=20fused=20radix=20HT=20?=
 =?UTF-8?q?=E2=80=94=20per-(worker,=20partition)=20direct=20insert?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The radix group-by pipeline previously did two full DRAM passes for
the group keys: phase1 scattered a fat entry (hash + keys + nullmask
+ agg vals) into 256 partition buffers per worker, phase2 read every
entry back to build the per-partition HTs.  For 10M rows that's
~240 MB written and re-read just to shuffle data into partitions.

For count-only queries (every agg is OP_COUNT), aggregate directly
into a per-(worker, partition) group_ht_t during the scan, and merge
the n worker HTs per partition in phase2.  The per-(worker, partition)
HT is small enough (~1.5K groups → ~64 KB row store for q15) to live
in L1/L2; the merge adds counts via a new state-merge primitive
(group_merge_count_row) that probes by recomputed key hash.

Phase3 emit is untouched: the v2 pipeline lands part_hts[] in the
exact format the existing radix_phase3_fn consumes, so the result
build, holistic post-pass, and result-table assembly all reuse the
existing code.  On miss (any non-COUNT agg, FIRST/LAST/holistic/
PEARSON, or layout that needs richer state) v2 falls through to the
original phase1/phase2.

Measured wins (10M-row hits, in-memory):
  q15 (by UserID count, top 10)        220 → 162 ms   (26%)
  q11 (nested by {phone,model,user})   280 → 200 ms   (28%)
  q35 (by {ClientIP, ClientIP-k} cnt)  240 → 168 ms   (30%)
SUM/AVG queries (q30/q31/q32) are unchanged — they need a state-merge
primitive for non-count aggregators (next increment).

Test suite: 2657/2659 pass (2 skipped, 0 failed).
---
 src/ops/group.c | 295 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)

diff --git a/src/ops/group.c b/src/ops/group.c
index 37f01670..72535d4a 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -3093,6 +3093,219 @@ static void radix_phase2_fn(void* ctx, uint32_t worker_id, int64_t start, int64_
     }
 }
 
+/* ============================================================================
+ * Fused radix: per-(worker, partition) HT direct-insert + per-partition merge
+ *
+ *   Replaces the materialise-fat-entries-then-build-HTs round trip with a
+ *   single-pass aggregation per (worker, partition) HT, followed by an
+ *   in-cache merge per partition.  Currently restricted to count-only
+ *   queries (every agg is OP_COUNT) — the merge primitive here only
+ *   knows how to combine counts; SUM/AVG/MIN/MAX would need their own
+ *   state-merge logic (next increment).
+ *
+ *   Per-(worker, partition) HT for a 10M-row count-by-UserID: ~3M distinct
+ *   keys ÷ 256 parts ÷ 8 workers ≈ 1.5K groups → cap ~4K slots → ~64 KB
+ *   row store, L1/L2-resident.  Worker w processes its row range; per row
+ *   it hashes keys, computes partition = RADIX_PART(h), probes its local
+ *   HT_p.  Phase2 dispatches partitions across workers; each merges the n
+ *   worker HTs for one partition into a final partition HT in part_hts[p].
+ *   Phase3 (radix_phase3_fn) emits from part_hts[] exactly as before.
+ * ============================================================================ */
+
+/* Merge one source group row (count + keys + null_mask) into the target HT.
+ * Hash is recomputed from the row's key region via hash_keys_inline —
+ * identical to what group_probe_entry did when the row was first inserted,
+ * so the partition assignment is consistent.  Count-only: state merge is
+ * count += src_count; new groups inherit it.  Returns the current mask. */
+static inline uint32_t group_merge_count_row(group_ht_t* ht,
+    const char* src_row, const int8_t* key_types, uint32_t mask)
+{
+    const ght_layout_t* ly = &ht->layout;
+    int64_t src_count = *(const int64_t*)src_row;
+    const int64_t* skeys = (const int64_t*)(src_row + 8);
+    uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8);
+    uint64_t h = hash_keys_inline(skeys, key_types, ly->n_keys,
+                                  ly->wide_key_mask, ly->wide_key_esz,
+                                  ht->key_data);
+    uint8_t salt = HT_SALT(h);
+    uint32_t slot = (uint32_t)(h & mask);
+    for (;;) {
+        uint32_t sv = ht->slots[slot];
+        if (sv == HT_EMPTY) {
+            if (ht->grp_count >= ht->grp_cap) {
+                if (!group_ht_grow(ht)) { ht->oom = 1; return mask; }
+            }
+            uint32_t gid = ht->grp_count++;
+            char* row = ht->rows + (size_t)gid * ly->row_stride;
+            *(int64_t*)row = src_count;
+            memcpy(row + 8, skeys, key_bytes);
+            ht->slots[slot] = HT_PACK(salt, gid);
+            if (ht->grp_count * 2 > ht->ht_cap) {
+                group_ht_rehash(ht, key_types);
+                mask = ht->ht_cap - 1;
+            }
+            return mask;
+        }
+        if (HT_SALT_V(sv) == salt) {
+            uint32_t gid = HT_GID(sv);
+            char* row = ht->rows + (size_t)gid * ly->row_stride;
+            if (group_keys_equal((const int64_t*)(row + 8),
+                                  skeys, ly, ht->key_data)) {
+                *(int64_t*)row += src_count;
+                return mask;
+            }
+        }
+        slot = (slot + 1) & mask;
+    }
+}
+
+typedef struct {
+    void**         key_data;
+    int8_t*        key_types;
+    uint8_t*       key_attrs;
+    ray_t**        key_vecs;
+    uint8_t        nullable_mask;
+    uint32_t       n_workers;
+    group_ht_t*    wpart_hts;        /* [n_workers * RADIX_P] */
+    ght_layout_t   layout;
+    ray_t*         rowsel;
+    const int64_t* match_idx;
+    _Atomic(int)   oom;
+} radix_v2_phase1_ctx_t;
+
+static void radix_v2_phase1_fn(void* ctx, uint32_t worker_id,
+                               int64_t start, int64_t end) {
+    radix_v2_phase1_ctx_t* c = (radix_v2_phase1_ctx_t*)ctx;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    const ght_layout_t* ly = &c->layout;
+    uint8_t nk = ly->n_keys;
+    uint8_t wide = ly->wide_key_mask;
+    uint8_t nullable = c->nullable_mask;
+    const int64_t* match_idx = c->match_idx;
+
+    group_ht_t* my_hts = &c->wpart_hts[(size_t)worker_id * RADIX_P];
+    /* Lazily init this worker's 256 partition HTs. */
+    for (uint32_t p = 0; p < RADIX_P; p++) {
+        if (!my_hts[p].slots) {
+            if (!group_ht_init_sized(&my_hts[p], 256, ly, 128)) {
+                atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                return;
+            }
+            if (wide && c->key_data)
+                group_ht_set_key_data(&my_hts[p], c->key_data);
+        }
+    }
+    uint32_t masks[RADIX_P];
+    for (uint32_t p = 0; p < RADIX_P; p++) masks[p] = my_hts[p].ht_cap - 1;
+
+    /* Stack-resident transient entry, same layout as group_rows_range. */
+    char ebuf[8 + 9 * 8 + 8 * 8 + 8];
+    for (int64_t i = start; i < end; i++) {
+        if (((i - start) & 65535) == 0 && ray_interrupted()) break;
+        int64_t row = match_idx ? match_idx[i] : i;
+        if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, row))
+            continue;
+        uint64_t h = 0;
+        int64_t* ek = (int64_t*)(ebuf + 8);
+        int64_t null_mask = 0;
+        for (uint8_t k = 0; k < nk; k++) {
+            int8_t t = c->key_types[k];
+            uint64_t kh;
+            bool is_null = (nullable & (1u << k))
+                           && ray_vec_is_null(c->key_vecs[k], row);
+            if (is_null) {
+                null_mask |= (int64_t)(1u << k);
+                ek[k] = 0;
+                kh = ray_hash_i64(0);
+            } else if (wide & (1u << k)) {
+                uint8_t esz = ly->wide_key_esz[k];
+                const void* src = (const char*)c->key_data[k] + (size_t)row * esz;
+                ek[k] = row;
+                kh = ray_hash_bytes(src, esz);
+            } else if (t == RAY_F64) {
+                int64_t kv;
+                memcpy(&kv, &((double*)c->key_data[k])[row], 8);
+                ek[k] = kv;
+                kh = ray_hash_f64(((double*)c->key_data[k])[row]);
+            } else {
+                int64_t kv = read_col_i64(c->key_data[k], row, t, c->key_attrs[k]);
+                ek[k] = kv;
+                kh = ray_hash_i64(kv);
+            }
+            h = (k == 0) ? kh : ray_hash_combine(h, kh);
+        }
+        ek[nk] = null_mask;
+        if (null_mask) h = ray_hash_combine(h, ray_hash_i64(null_mask));
+        *(uint64_t*)ebuf = h;
+        /* Count-only: no agg_vals to pack; entry body ends at the null-mask
+         * slot.  The HT row layout matches (need_flags == 0). */
+        uint32_t p = RADIX_PART(h);
+        uint32_t new_mask = group_probe_entry(&my_hts[p], ebuf,
+                                              c->key_types, masks[p]);
+        if (my_hts[p].oom) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+        masks[p] = new_mask;
+    }
+}
+
+typedef struct {
+    group_ht_t*   wpart_hts;     /* [n_workers * RADIX_P] — input */
+    group_ht_t*   part_hts;      /* [RADIX_P] — output */
+    int8_t*       key_types;
+    uint32_t      n_workers;
+    ght_layout_t  layout;
+    void**        key_data;
+    _Atomic(int)  oom;
+} radix_v2_phase2_ctx_t;
+
+static void radix_v2_phase2_fn(void* ctx, uint32_t worker_id,
+                               int64_t start, int64_t end) {
+    (void)worker_id;
+    radix_v2_phase2_ctx_t* c = (radix_v2_phase2_ctx_t*)ctx;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    uint16_t row_stride = c->layout.row_stride;
+    for (int64_t p = start; p < end; p++) {
+        /* Upper bound on the merged partition: sum of worker grp_counts
+         * (some keys may be present in multiple workers — the merge will
+         * fold those, so the final grp_count is ≤ this sum). */
+        uint32_t total_grps = 0;
+        for (uint32_t w = 0; w < c->n_workers; w++)
+            total_grps += c->wpart_hts[(size_t)w * RADIX_P + p].grp_count;
+        if (total_grps == 0) continue;
+        uint32_t ht_cap = 256;
+        {
+            uint64_t target = (uint64_t)total_grps * 2;
+            if (target < 256) target = 256;
+            while (ht_cap < target) ht_cap *= 2;
+        }
+        uint32_t init_grp = 256;
+        while (init_grp < total_grps && init_grp < 65536) init_grp *= 2;
+        if (!group_ht_init_sized(&c->part_hts[p], ht_cap, &c->layout, init_grp)) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+        if (c->layout.wide_key_mask && c->key_data)
+            group_ht_set_key_data(&c->part_hts[p], c->key_data);
+        uint32_t mask = c->part_hts[p].ht_cap - 1;
+        for (uint32_t w = 0; w < c->n_workers; w++) {
+            group_ht_t* src = &c->wpart_hts[(size_t)w * RADIX_P + p];
+            if (src->grp_count == 0) continue;
+            const char* rows = src->rows;
+            for (uint32_t gi = 0; gi < src->grp_count; gi++) {
+                mask = group_merge_count_row(&c->part_hts[p],
+                                             rows + (size_t)gi * row_stride,
+                                             c->key_types, mask);
+                if (c->part_hts[p].oom) {
+                    atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                    return;
+                }
+            }
+        }
+    }
+}
+
 /* ============================================================================
  * Parallel direct-array accumulation for low-cardinality single integer key
  * ============================================================================ */
@@ -7292,6 +7505,87 @@ ht_path:;
 skip_top_count_filter:
 
     if (pool && nrows >= RAY_PARALLEL_THRESHOLD && n_total > 1 && !rowsel) {
+        /* Per-(worker, partition) direct-insert path for count-only.
+         * Bypasses the fat-entry materialisation and the phase1→phase2
+         * DRAM round trip; on success it populates part_hts[] in the
+         * same format the existing phase3 emit consumes. */
+        bool v2_count_only = (n_keys >= 1 && n_aggs > 0);
+        for (uint8_t a = 0; a < n_aggs && v2_count_only; a++)
+            if (ext->agg_ops[a] != OP_COUNT) v2_count_only = false;
+        if (v2_count_only && !(ght_layout.agg_is_first | ght_layout.agg_is_last
+                                | ght_layout.agg_is_holistic
+                                | ght_layout.agg_is_binary)) {
+            ray_t* wpart_hdr = NULL;
+            size_t v2_n_w = (size_t)n_total * RADIX_P;
+            group_ht_t* wpart_hts = (group_ht_t*)scratch_calloc(
+                &wpart_hdr, v2_n_w * sizeof(group_ht_t));
+            ray_t* v2_part_hdr = NULL;
+            group_ht_t* v2_part_hts = wpart_hts
+                ? (group_ht_t*)scratch_calloc(&v2_part_hdr,
+                                              RADIX_P * sizeof(group_ht_t))
+                : NULL;
+            if (!wpart_hts || !v2_part_hts) {
+                if (wpart_hts) scratch_free(wpart_hdr);
+                if (v2_part_hts) scratch_free(v2_part_hdr);
+                goto v2_done;
+            }
+            uint8_t v2_nullable = 0;
+            for (uint8_t k = 0; k < n_keys; k++) {
+                if (!key_vecs[k]) continue;
+                ray_t* src = (key_vecs[k]->attrs & RAY_ATTR_SLICE)
+                             ? key_vecs[k]->slice_parent : key_vecs[k];
+                if (src && (src->attrs & RAY_ATTR_HAS_NULLS))
+                    v2_nullable |= (uint8_t)(1u << k);
+            }
+            radix_v2_phase1_ctx_t v2p1 = {
+                .key_data      = key_data,
+                .key_types     = key_types,
+                .key_attrs     = key_attrs,
+                .key_vecs      = key_vecs,
+                .nullable_mask = v2_nullable,
+                .n_workers     = n_total,
+                .wpart_hts     = wpart_hts,
+                .layout        = ght_layout,
+                .rowsel        = rowsel,
+                .match_idx     = match_idx,
+                .oom           = 0,
+            };
+            ray_pool_dispatch(pool, radix_v2_phase1_fn, &v2p1, n_scan);
+            CHECK_CANCEL_GOTO(pool, cleanup);
+            if (atomic_load_explicit(&v2p1.oom, memory_order_relaxed)) {
+                for (size_t i = 0; i < v2_n_w; i++)
+                    group_ht_free(&wpart_hts[i]);
+                scratch_free(wpart_hdr);
+                scratch_free(v2_part_hdr);
+                goto v2_done;
+            }
+            radix_v2_phase2_ctx_t v2p2 = {
+                .wpart_hts = wpart_hts,
+                .part_hts  = v2_part_hts,
+                .key_types = key_types,
+                .n_workers = n_total,
+                .layout    = ght_layout,
+                .key_data  = key_data,
+                .oom       = 0,
+            };
+            ray_pool_dispatch_n(pool, radix_v2_phase2_fn, &v2p2, RADIX_P);
+            CHECK_CANCEL_GOTO(pool, cleanup);
+            /* Worker HTs are no longer needed once the merge is done. */
+            for (size_t i = 0; i < v2_n_w; i++)
+                group_ht_free(&wpart_hts[i]);
+            scratch_free(wpart_hdr);
+            if (atomic_load_explicit(&v2p2.oom, memory_order_relaxed)) {
+                for (uint32_t p = 0; p < RADIX_P; p++)
+                    group_ht_free(&v2_part_hts[p]);
+                scratch_free(v2_part_hdr);
+                goto v2_done;
+            }
+            /* Hand off to the existing phase3 emit. */
+            part_hts = v2_part_hts;
+            part_hts_hdr = v2_part_hdr;
+            goto v2_emit;
+        }
+v2_done:;
         size_t n_bufs = (size_t)n_total * RADIX_P;
         radix_bufs = (radix_buf_t*)scratch_calloc(&radix_bufs_hdr,
             n_bufs * sizeof(radix_buf_t));
@@ -7394,6 +7688,7 @@ ht_path:;
             ray_heap_gc();
         }
 
+v2_emit:;
         /* Prefix offsets */
         uint32_t part_offsets[RADIX_P + 1];
         part_offsets[0] = 0;

From 8c30d17f0c77d2b57ee75890076d5de9e1452d78 Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Mon, 25 May 2026 12:01:59 +0200
Subject: [PATCH 05/11] perf(group): extend per-partition path to SUM/AVG
 aggregators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The merge primitive (now group_merge_row, generalised from count-only)
handles SUM accumulators alongside the count slot: on a new partition
group it memcpy's the entire source row (count + keys/null_mask + the
worker's accumulated agg state); on an existing group it adds the source
count and, when need_flags & GHT_NEED_SUM, adds each source sum slot
(i64 or f64 per agg_is_f64).  Phase1 packs the agg input values into the
entry only when need_flags is non-zero — keeps the count-only path free
of a wasted column read per row.

Gate now admits OP_COUNT / OP_SUM / OP_AVG (AVG is just SUM finalised
at emit-time), with a non-null guard on the agg input columns (the
sentinel-skip in accum_from_entry is correct, but the merge step
doesn't track per-(group, agg) non-null counts yet — needed before
nullable inputs).  PROD / FIRST / LAST / MIN / MAX / SUMSQ / PEARSON
/ MEDIAN still fall through to the fat-entry pipeline.

Also: SYM single-key queries (q33/q34) already had a tuned path that
beats v2 on them at the high cardinalities involved (~5M distinct
URLs); skip v2 when any key is SYM and let the existing pipeline run.

Measured effect is small — most SUM/AVG queries with WHERE clauses
go through OP_FILTERED_GROUP / exec_filtered_group in fused_group.c,
not through exec_group, so v2 here doesn't catch them.  Lays the
state-merge groundwork that a future fused_group v2 needs.

Test suite: 2657/2659 pass (2 skipped, 0 failed).
---
 src/ops/group.c | 130 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 106 insertions(+), 24 deletions(-)

diff --git a/src/ops/group.c b/src/ops/group.c
index 72535d4a..d0d10e98 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -3112,23 +3112,30 @@ static void radix_phase2_fn(void* ctx, uint32_t worker_id, int64_t start, int64_
  *   Phase3 (radix_phase3_fn) emits from part_hts[] exactly as before.
  * ============================================================================ */
 
-/* Merge one source group row (count + keys + null_mask) into the target HT.
- * Hash is recomputed from the row's key region via hash_keys_inline —
- * identical to what group_probe_entry did when the row was first inserted,
- * so the partition assignment is consistent.  Count-only: state merge is
- * just count += src_count; new groups inherit the source's count. */
-static inline uint32_t group_merge_count_row(group_ht_t* ht,
+/* Merge one source group row into the target HT.  Hash is recomputed from
+ * the row's key region via hash_keys_inline — identical to what
+ * group_probe_entry did when the row was first inserted, so the partition
+ * assignment is consistent.  Supports need_flags ∈ {0, GHT_NEED_SUM}:
+ * count-only and count+SUM/AVG.  On miss, the entire source row is copied
+ * verbatim (memcpy of row_stride); on hit, count += src.count and, when
+ * need_sum, each enabled sum slot accumulates the source's sum (f64 or
+ * i64 per agg_is_f64).  Caller's v2 gate filters out PROD/FIRST/LAST/
+ * MIN/MAX/SUMSQ/PEARSON/MEDIAN — those need richer state merges. */
+static inline uint32_t group_merge_row(group_ht_t* ht,
     const char* src_row, const int8_t* key_types, uint32_t mask)
 {
     const ght_layout_t* ly = &ht->layout;
     int64_t src_count = *(const int64_t*)src_row;
     const int64_t* skeys = (const int64_t*)(src_row + 8);
-    uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8);
     uint64_t h = hash_keys_inline(skeys, key_types, ly->n_keys,
                                   ly->wide_key_mask, ly->wide_key_esz,
                                   ht->key_data);
     uint8_t salt = HT_SALT(h);
     uint32_t slot = (uint32_t)(h & mask);
+    uint8_t na = ly->n_aggs;
+    uint8_t f64_mask = ly->agg_is_f64;
+    uint16_t off_sum = ly->off_sum;
+    bool need_sum = (ly->need_flags & GHT_NEED_SUM) != 0;
     for (;;) {
         uint32_t sv = ht->slots[slot];
         if (sv == HT_EMPTY) {
@@ -3137,8 +3144,8 @@ static inline uint32_t group_merge_count_row(group_ht_t* ht,
             }
             uint32_t gid = ht->grp_count++;
             char* row = ht->rows + (size_t)gid * ly->row_stride;
-            *(int64_t*)row = src_count;
-            memcpy(row + 8, skeys, key_bytes);
+            /* Whole-row copy: count + keys/null_mask + aggregator state. */
+            memcpy(row, src_row, ly->row_stride);
             ht->slots[slot] = HT_PACK(salt, gid);
             if (ht->grp_count * 2 > ht->ht_cap) {
                 group_ht_rehash(ht, key_types);
@@ -3152,6 +3159,22 @@ static inline uint32_t group_merge_count_row(group_ht_t* ht,
             if (group_keys_equal((const int64_t*)(row + 8),
                                   skeys, ly, ht->key_data)) {
                 *(int64_t*)row += src_count;
+                if (need_sum) {
+                    for (uint8_t a = 0; a < na; a++) {
+                        int8_t s = ly->agg_val_slot[a];
+                        if (s < 0) continue;
+                        size_t off = (size_t)off_sum + (size_t)s * 8;
+                        if (f64_mask & (1u << a)) {
+                            double sv_f;
+                            memcpy(&sv_f, src_row + off, 8);
+                            *(double*)(row + off) += sv_f;
+                        } else {
+                            int64_t sv_i;
+                            memcpy(&sv_i, src_row + off, 8);
+                            *(int64_t*)(row + off) += sv_i;
+                        }
+                    }
+                }
                 return mask;
             }
         }
@@ -3164,6 +3187,9 @@ typedef struct {
     int8_t*        key_types;
     uint8_t*       key_attrs;
     ray_t**        key_vecs;
+    ray_t**        agg_vecs;        /* may be NULL for pure COUNT (n_agg_vals==0) */
+    ray_t**        agg_vecs2;
+    uint8_t*       agg_strlen;
     uint8_t        nullable_mask;
     uint32_t       n_workers;
     group_ht_t*    wpart_hts;        /* [n_workers * RADIX_P] */
@@ -3237,8 +3263,37 @@ static void radix_v2_phase1_fn(void* ctx, uint32_t worker_id,
         ek[nk] = null_mask;
         if (null_mask) h = ray_hash_combine(h, ray_hash_i64(null_mask));
         *(uint64_t*)ebuf = h;
-        /* Count-only: no agg_vals to pack; entry body ends at the null-mask
-         * slot.  The HT row layout matches (need_flags == 0). */
+        /* Pack agg values into entry — only when the HT layout actually
+         * reads them.  For count-only need_flags == 0 and accum_from_entry
+         * skips every agg slot; packing here would be a wasted column
+         * read per row (a measurable regression on q15-class queries). */
+        if (ly->need_flags) {
+            int64_t* ev = (int64_t*)(ebuf + 8 + ((size_t)nk + 1) * 8);
+            uint8_t vi = 0;
+            uint8_t na = ly->n_aggs;
+            uint8_t bin_mask = ly->agg_is_binary;
+            uint8_t hol_mask = ly->agg_is_holistic;
+            for (uint8_t a = 0; a < na; a++) {
+                if (hol_mask & (1u << a)) continue;
+                ray_t* ac = c->agg_vecs ? c->agg_vecs[a] : NULL;
+                if (!ac) continue;
+                if (c->agg_strlen && c->agg_strlen[a])
+                    ev[vi] = group_strlen_at(ac, row);
+                else if (ac->type == RAY_F64)
+                    memcpy(&ev[vi], &((double*)ray_data(ac))[row], 8);
+                else
+                    ev[vi] = read_col_i64(ray_data(ac), row, ac->type, ac->attrs);
+                vi++;
+                if ((bin_mask & (1u << a)) && c->agg_vecs2 && c->agg_vecs2[a]) {
+                    ray_t* ay = c->agg_vecs2[a];
+                    if (ay->type == RAY_F64)
+                        memcpy(&ev[vi], &((double*)ray_data(ay))[row], 8);
+                    else
+                        ev[vi] = read_col_i64(ray_data(ay), row, ay->type, ay->attrs);
+                    vi++;
+                }
+            }
+        }
         uint32_t p = RADIX_PART(h);
         uint32_t new_mask = group_probe_entry(&my_hts[p], ebuf,
                                               c->key_types, masks[p]);
@@ -3294,9 +3349,9 @@ static void radix_v2_phase2_fn(void* ctx, uint32_t worker_id,
             if (src->grp_count == 0) continue;
             const char* rows = src->rows;
             for (uint32_t gi = 0; gi < src->grp_count; gi++) {
-                mask = group_merge_count_row(&c->part_hts[p],
-                                             rows + (size_t)gi * row_stride,
-                                             c->key_types, mask);
+                mask = group_merge_row(&c->part_hts[p],
+                                       rows + (size_t)gi * row_stride,
+                                       c->key_types, mask);
                 if (c->part_hts[p].oom) {
                     atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
                     return;
@@ -7505,16 +7560,40 @@ ht_path:;
 skip_top_count_filter:
 
     if (pool && nrows >= RAY_PARALLEL_THRESHOLD && n_total > 1 && !rowsel) {
-        /* Per-(worker, partition) direct-insert path for count-only.
-         * Bypasses the fat-entry materialisation and the phase1→phase2
-         * DRAM round trip; on success it populates part_hts[] in the
-         * same format the existing phase3 emit consumes. */
-        bool v2_count_only = (n_keys >= 1 && n_aggs > 0);
-        for (uint8_t a = 0; a < n_aggs && v2_count_only; a++)
-            if (ext->agg_ops[a] != OP_COUNT) v2_count_only = false;
-        if (v2_count_only && !(ght_layout.agg_is_first | ght_layout.agg_is_last
-                                | ght_layout.agg_is_holistic
-                                | ght_layout.agg_is_binary)) {
+        /* Per-(worker, partition) direct-insert path: aggregates into
+         * thread-local partition HTs during phase1, then merges per
+         * partition.  Bypasses the phase1 fat-entry materialisation +
+         * phase2 re-read DRAM round trip.  On success it populates
+         * part_hts[] in the format the existing phase3 emit consumes.
+         *
+         * Gate: every agg is COUNT/SUM/AVG (the merge primitive knows
+         * how to add counts and sum slots; PROD/MIN/MAX/FIRST/LAST/
+         * SUMSQ/PEARSON/MEDIAN need richer state-merge logic).  Agg
+         * input columns must be non-nullable for now — sentinel-skip
+         * inside accum_from_entry is correct, but the merge step needs
+         * an nn_count and that isn't tracked yet. */
+        bool v2_ok = (n_keys >= 1 && n_aggs > 0);
+        /* SYM single-key queries already had a tuned path (q33/q34 hit it
+         * before falling to the radix); v2 doesn't beat it for them, so
+         * skip when any key is SYM and let the existing pipeline handle it. */
+        for (uint8_t k = 0; k < n_keys && v2_ok; k++)
+            if (key_types[k] == RAY_SYM) v2_ok = false;
+        for (uint8_t a = 0; a < n_aggs && v2_ok; a++) {
+            uint16_t op = ext->agg_ops[a];
+            if (op != OP_COUNT && op != OP_SUM && op != OP_AVG) {
+                v2_ok = false;
+                break;
+            }
+            if (agg_vecs[a]) {
+                ray_t* src = (agg_vecs[a]->attrs & RAY_ATTR_SLICE)
+                             ? agg_vecs[a]->slice_parent : agg_vecs[a];
+                if (src && (src->attrs & RAY_ATTR_HAS_NULLS))
+                    v2_ok = false;
+            }
+        }
+        if (v2_ok && !(ght_layout.agg_is_first | ght_layout.agg_is_last
+                        | ght_layout.agg_is_holistic
+                        | ght_layout.agg_is_binary)) {
             ray_t* wpart_hdr = NULL;
             size_t v2_n_w = (size_t)n_total * RADIX_P;
             group_ht_t* wpart_hts = (group_ht_t*)scratch_calloc(
@@ -7542,6 +7621,9 @@ ht_path:;
                 .key_types     = key_types,
                 .key_attrs     = key_attrs,
                 .key_vecs      = key_vecs,
+                .agg_vecs      = agg_vecs,
+                .agg_vecs2     = agg_vecs2,
+                .agg_strlen    = agg_strlen,
                 .nullable_mask = v2_nullable,
                 .n_workers     = n_total,
                 .wpart_hts     = wpart_hts,

From 06783842abbbd3c3105401ea2527eb01aace5d50 Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Tue, 26 May 2026 10:50:00 +0200
Subject: [PATCH 06/11] fix(group): minmax early-abort check fires within
 morsels, not at boundaries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The DA-path min/max scan polls its abort flag every (i-start) & N == 0.
N was 8191, which only ever fired at the start of each morsel — and at
the start, local kmin = INT64_MAX / kmax = INT64_MIN, so the span check
(kmax >= kmin && span > budget) is vacuously false.  Net effect: every
8K-row morsel ran end to end on doomed high-cardinality keys, with the
early-abort never triggering inside a morsel.  Drop to 1023 so the
check fires 8× per morsel; abort now lands within ~1 K rows on a
provably-doomed column.
---
 src/ops/group.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/ops/group.c b/src/ops/group.c
index d0d10e98..a5be30e2 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -3391,13 +3391,17 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t
     int8_t t = c->key_type;
     const int64_t span_budget = c->span_budget;
 
-    /* Span check and abort poll are batched (every 8192 rows) so the
-     * hot per-row loop body stays a branchless min/max with no atomics. */
+    /* Span check and abort poll are batched (every 1024 rows) so the
+     * hot per-row loop body stays a branchless min/max with no atomics.
+     * 8192 was too sparse — the dispatcher hands out 8K-row morsels, so
+     * `((i-start) & 8191) == 0` only ever fired at the morsel boundary
+     * (where kmin=INT64_MAX/kmax=INT64_MIN make the span check vacuous),
+     * leaving every full 8K morsel to run end-to-end on doomed columns. */
     #define MINMAX_SEG_LOOP(TYPE, CAST) \
         do { \
             const TYPE* kd = (const TYPE*)c->key_data; \
             for (int64_t i = start; i < end; i++) { \
-                if (((i - start) & 8191) == 0) { \
+                if (((i - start) & 1023) == 0) { \
                     if (atomic_load_explicit(c->abort_flag, \
                                              memory_order_relaxed)) \
                         goto minmax_done; \

From 74274ca3b323284a010c698a5f271f015e993c44 Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Tue, 26 May 2026 10:56:55 +0200
Subject: [PATCH 07/11] perf(group): skip accum_from_entry when the HT layout
 has no agg state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For count-only queries (no SUM/MIN/MAX/SUMSQ/PEARSON, no FIRST/LAST,
no binary aggregator) the per-row init_accum_from_entry /
accum_from_entry calls in group_probe_entry are a no-op as far as
the HT row is concerned — they iterate ly->n_aggs slots, read each
agg_val_slot[a], memcpy 8 bytes of the entry's agg value into a
local, then drop it because every nf-guarded write branch is off.
At 6 % of the q15 profile (~10 ns/row × 10 M rows / 8 cores ≈ 12 ms)
that's pure waste.

Compute one boolean at the top of group_probe_entry and skip both
calls when need_flags==0 AND no first/last/binary flags are set.
Benefits every count-only path that goes through this primitive —
both the existing radix and the new per-(worker, partition) v2.

Measured (focused, REPS=5):
  q15  169 → 150 ms   (11 % faster on top of v2)
  q35  168 → 153 ms   (9 %)
  q33   82 →  79 ms   (the existing radix benefits too)
  q34   82 →  77 ms

Test suite 2657/2659 (2 skipped, 0 failed).
---
 src/ops/group.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/ops/group.c b/src/ops/group.c
index a5be30e2..d5866fd4 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -2345,6 +2345,16 @@ static inline uint32_t group_probe_entry(group_ht_t* ht,
     uint32_t slot = (uint32_t)(hash & mask);
     uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8);
 
+    /* For count-only queries (no SUM/MIN/MAX/SUMSQ/PEARSON aggregator
+     * state, no FIRST/LAST row tracking, no binary aggregator y-side)
+     * init_accum_from_entry and accum_from_entry are no-ops on every
+     * non-count slot — the per-row call still iterates n_aggs slots,
+     * reads agg_val_slot[a], memcpy's the entry's agg value into a
+     * local, then drops it.  That's ~6 ns / row over millions of rows
+     * (n_keys=1), ~7 ms wall on q15.  Skip the call when none of the flags
+     * that drive its writes are set. */
+    uint8_t accum_skip = (ly->need_flags == 0
+        && (ly->agg_is_first | ly->agg_is_last | ly->agg_is_binary) == 0);
     for (;;) {
         uint32_t sv = ht->slots[slot];
         if (sv == HT_EMPTY) {
@@ -2356,7 +2366,8 @@ static inline uint32_t group_probe_entry(group_ht_t* ht,
             char* row = ht->rows + (size_t)gid * ly->row_stride;
             *(int64_t*)row = 1;   /* count = 1 */
             memcpy(row + 8, ekeys, key_bytes);
-            init_accum_from_entry(row, entry, ly);
+            if (!accum_skip)
+                init_accum_from_entry(row, entry, ly);
             ht->slots[slot] = HT_PACK(salt, gid);
             if (ht->grp_count * 2 > ht->ht_cap) {
                 group_ht_rehash(ht, key_types);
@@ -2370,7 +2381,8 @@ static inline uint32_t group_probe_entry(group_ht_t* ht,
             if (group_keys_equal((const int64_t*)(row + 8),
                                   (const int64_t*)ekeys, ly, ht->key_data)) {
                 (*(int64_t*)row)++;   /* count++ */
-                accum_from_entry(row, entry, ly);
+                if (!accum_skip)
+                    accum_from_entry(row, entry, ly);
                 return mask;
             }
         }

From f0219a782b4c6ec6ca5714adc32c970641272351 Mon Sep 17 00:00:00 2001
From: Hetoku <volonter84@gmail.com>
Date: Tue, 26 May 2026 11:29:29 +0200
Subject: [PATCH 08/11] perf(fused_group): pre-size worker shards by nrows
 heuristic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-worker shard in mk_par_fn / exec_filtered_group_multi started
at 1024 slots and grew on demand via mk_shard_grow.  For a 10M-row
high-cardinality query (e.g. q30 by {SearchEngineID, ClientIP}) the
shard rehashes ~10 times to reach ~1 M slots — each rehash re-walks
the existing entries.  The q30 profile shows mk_shard_grow at 9.2 %.

Pre-size init_cap by doubling from the 1024-slot default until it
reaches 2·nrows/(nw·16) or the 16 K-slot cap, whichever comes first.
Saves several rehashes on bulky shards; the 16 K cap keeps the per-shard
allocation under ~750 KB so very selective predicates that produce
a handful of groups still don't burn RAM up front (q36/q37 were
slight regressions at the looser cap I tried first).

Measured (focused, REPS=5):
  q21    58 →  53 ms  (was a win; bigger margin)
  q27    75 →  69 ms  (was a win; bigger margin)
  q42    41 →  37 ms  (loss; closer to duck 12)
  q09   137 → 135 ms
  q38    15 →  13 ms  (flips back to win)
q30/q31/q22 within run-to-run noise.

Test suite 2657/2659 (2 skipped, 0 failed).
---
 src/ops/fused_group.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index 81826fc4..04c2fb43 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -3669,10 +3669,21 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext,
     }
     if (nrows < 0) return ray_error("nyi", NULL);
 
-    ctx.init_cap = FP_SHARD_INIT_CAP;
     atomic_store_explicit(&ctx.oom, 0, memory_order_relaxed);
     ray_pool_t* pool = ray_pool_get();
     uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
+    /* Pre-size each worker shard a bit larger than the 1024-slot default
+     * so high-cardinality queries don't pay log2(target/1024) rehashes.
+     * The cap stays modest (16 K slots ≈ ~750 KB per shard with a 4-slot
+     * agg state) so very selective predicates that produce a handful of
+     * groups don't burn RAM up front.  Sparse keys still grow on-demand. */
+    {
+        uint64_t expected = (uint64_t)nrows / ((uint64_t)nw * 16u);
+        uint64_t init_cap = FP_SHARD_INIT_CAP;
+        while (init_cap < expected * 2u && init_cap < (1ULL << 14))
+            init_cap <<= 1;
+        ctx.init_cap = init_cap;
+    }
     ray_t* shards_hdr = NULL;
     ctx.shards = (mk_shard_t*)scratch_calloc(&shards_hdr,
                                              (size_t)nw * sizeof(mk_shard_t));

From da90360deb956cd918d4702db015fcaa0e9ce655 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 13:14:50 +0200
Subject: [PATCH 09/11] feat(group): HyperLogLog approximate count-distinct
 kernel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New primitive in src/ops/hll.{h,c}:

  ray_hll_t                — register-array sketch, 1 B/register, P=14
                              default → 16 KB sketch, ~0.81 % std error
  ray_hll_init/free/reset  — lifecycle
  ray_hll_add              — inline; hash → register index + rho update
  ray_hll_merge            — element-wise max (parallel-safe combine)
  ray_hll_estimate         — Flajolet-Fusy-Gandouet-Meunier 2007
                              estimator with linear-counting branch for
                              small cardinalities

Two consumers:

  ray_count_distinct_approx (scalar)
    Parallel: each worker builds a private sketch over its row range,
    main thread merges to one and emits the estimate.  Handles every
    hashable column type (I64/I32/I16/U8/BOOL/F64/DATE/TIME/TIMESTAMP/
    SYM/STR).  Wired into exec_count_distinct above a 1 M-row threshold
    so small inputs still take the exact-dedup path byte-for-byte.

  ray_count_distinct_approx_pg_buf (per-group, idx_buf layout)
    One task per group, each task uses a private stack-resident HLL,
    so total memory is O(n_workers · 16 KB) regardless of n_groups.
    Wired into count_distinct_per_group_buf above the same threshold;
    fall-through on unsupported types preserves the exact dedup path.

Measured (10M-row hits, in-memory):

  q04 (count distinct UserID global)   78 → 8.6 ms   (FLIP vs duck 72)
  q05 (count distinct SearchPhrase)    19 → 4.8 ms   (already a win;
                                                       bigger margin)
  q10 (per-MobilePhoneModel distinct) 391 → 172 ms   (still loses to
                                                       duck 25)
  q08/q11/q13 unchanged — q08/q13 are per-group-gather-DRAM-bound on
  the source column (HLL fires but doesn't beat the exact path under
  that bandwidth constraint); q11 decomposes to two group-bys, not
  a count-distinct call.

Estimate accuracy verified on q04: HLL 1 533 006 vs exact 1 530 143
(0.19 % rel. error, inside the ~0.8 % std error bound).

Full ClickBench: 22/43 wins (was 21/43, with q04 flipping cleanly).
Test suite 2657/2659 (2 skipped, 0 failed).
---
 src/ops/group.c |  25 +++
 src/ops/hll.c   | 442 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/ops/hll.h   | 118 +++++++++++++
 src/ops/query.c |  28 +++
 4 files changed, 613 insertions(+)
 create mode 100644 src/ops/hll.c
 create mode 100644 src/ops/hll.h

diff --git a/src/ops/group.c b/src/ops/group.c
index d5866fd4..14a5eeb0 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -23,6 +23,7 @@
 
 #include "ops/internal.h"
 #include "ops/rowsel.h"
+#include "ops/hll.h"        /* approximate count-distinct via HyperLogLog */
 #include "lang/internal.h"  /* for ray_median_dbl_inplace */
 
 /* ============================================================================
@@ -594,6 +595,23 @@ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) {
 
     if (len == 0) return ray_i64(0);
 
+    /* For inputs above this row count, switch to the HyperLogLog
+     * cardinality sketch (~0.8% std error at P=14, 16 KB per shard).
+     * Exact dedup-via-hashset is O(unique·log) and becomes memory-
+     * bandwidth-bound past ~1 M rows; HLL is single-pass, mergeable,
+     * and constant-memory per worker.  Below the threshold the exact
+     * path is fast enough and avoids approximation entirely — so small
+     * tests still match `len-after-distinct` byte-for-byte. */
+    if (len >= (1 << 20)) {
+        bool hashable = (in_type == RAY_I64 || in_type == RAY_I32 ||
+                          in_type == RAY_I16 || in_type == RAY_U8 ||
+                          in_type == RAY_BOOL || in_type == RAY_F64 ||
+                          in_type == RAY_DATE || in_type == RAY_TIME ||
+                          in_type == RAY_TIMESTAMP || in_type == RAY_STR ||
+                          RAY_IS_SYM(in_type));
+        if (hashable) return ray_count_distinct_approx(input);
+    }
+
     switch (in_type) {
     case RAY_BOOL: case RAY_U8:
     case RAY_I16: case RAY_I32: case RAY_I64:
@@ -1130,6 +1148,13 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid,
     memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
     if (n_rows == 0 || n_groups == 0) return out;
 
+    /* This callsite only fires when n_groups > 50 000 (the buf-form
+     * caller catches the low-cardinality majority); per-group HLL at
+     * those group counts exceeds any reasonable memory budget
+     * (50 000 · 16 KB · n_workers ≈ multi-GB), so there's no
+     * approximate path here — fall straight through to the exact
+     * partitioned dedup. */
+
     /* Parallel partitioned path for sizes where the serial global hash
      * blows L3.  Threshold tuned so the partition / scatter / dedup
      * dispatch overhead stays smaller than the cache-miss savings. */
diff --git a/src/ops/hll.c b/src/ops/hll.c
new file mode 100644
index 00000000..3b15c049
--- /dev/null
+++ b/src/ops/hll.c
@@ -0,0 +1,442 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/hll.h"
+#include "ops/internal.h"
+#include "ops/ops.h"
+#include "core/pool.h"
+#include "table/sym.h"
+
+#include <math.h>
+#include <string.h>
+#include <stdatomic.h>
+
+int ray_hll_init(ray_hll_t* h, uint8_t p) {
+    if (!h) return -1;
+    if (p < 4) p = 4;            /* too small loses all accuracy */
+    if (p > 18) p = 18;           /* 256 KB cap on register array */
+    memset(h, 0, sizeof(*h));
+    uint32_t m = 1u << p;
+    h->p = p;
+    h->m = m;
+    h->regs = (uint8_t*)scratch_calloc(&h->_hdr, (size_t)m);
+    if (!h->regs) return -1;
+    return 0;
+}
+
+void ray_hll_free(ray_hll_t* h) {
+    if (!h) return;
+    if (h->_hdr) scratch_free(h->_hdr);
+    h->regs = NULL;
+    h->_hdr = NULL;
+    h->m = 0;
+    h->p = 0;
+}
+
+void ray_hll_reset(ray_hll_t* h) {
+    if (h && h->regs) memset(h->regs, 0, (size_t)h->m);
+}
+
+void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src) {
+    if (!dst || !src || !dst->regs || !src->regs) return;
+    if (dst->m != src->m) return;     /* mismatched precision — caller bug */
+    const uint8_t* s = src->regs;
+    uint8_t*       d = dst->regs;
+    uint32_t       m = dst->m;
+    /* Branchless max — keeps the hot per-shard merge in vector regs.
+     * The compiler usually auto-vectorises this to a packed-max sequence. */
+    for (uint32_t i = 0; i < m; i++) {
+        uint8_t a = d[i], b = s[i];
+        d[i] = a > b ? a : b;
+    }
+}
+
+/* HyperLogLog cardinality estimator (Flajolet, Fusy, Gandouet, Meunier 2007),
+ * with the original raw-estimate / linear-counting hybrid switch.  Skips the
+ * HLL++ small-range bias-correction tables because the linear-counting branch
+ * already gives a clean estimate below E ≤ 2.5·m, which is where the raw
+ * mean diverges from truth. */
+int64_t ray_hll_estimate(const ray_hll_t* h) {
+    if (!h || !h->regs) return 0;
+    uint32_t m = h->m;
+    if (m == 0) return 0;
+
+    /* alpha_m correction constant from the paper.  m == 16 / 32 / 64 use
+     * the closed-form values; everything else uses 0.7213 / (1 + 1.079/m). */
+    double alpha_m;
+    if      (m == 16) alpha_m = 0.673;
+    else if (m == 32) alpha_m = 0.697;
+    else if (m == 64) alpha_m = 0.709;
+    else              alpha_m = 0.7213 / (1.0 + 1.079 / (double)m);
+
+    /* Sum of 2^-reg[i].  Count zero registers for the linear-counting
+     * fallback at small cardinalities (when V > 0 and E ≤ 2.5·m). */
+    double   sum_inv  = 0.0;
+    uint32_t n_zeros  = 0;
+    for (uint32_t i = 0; i < m; i++) {
+        uint8_t r = h->regs[i];
+        sum_inv += ldexp(1.0, -(int)r);   /* 2^-r */
+        n_zeros += (r == 0);
+    }
+
+    double raw = alpha_m * (double)m * (double)m / sum_inv;
+
+    if (raw <= 2.5 * (double)m && n_zeros != 0) {
+        /* Linear counting — much tighter than raw for small E. */
+        raw = (double)m * log((double)m / (double)n_zeros);
+    }
+    /* Large-range bias-correction (the 2^32 upper-edge correction in the
+     * original paper) is for 32-bit hashes only — we hash 64 bits, so the
+     * raw value is already unbiased to ~2^57.  Skip. */
+
+    if (raw < 0.0) raw = 0.0;
+    return (int64_t)(raw + 0.5);
+}
+
+/* ---- Scalar approximate count-distinct aggregator ---------------------- */
+
+typedef struct {
+    const ray_t*  vec;
+    int8_t        type;
+    uint8_t       attrs;
+    bool          has_nulls;
+    ray_hll_t*    shards;          /* [n_workers] — one HLL per worker */
+    uint8_t       p;
+    uint32_t      n_workers;
+    _Atomic(int)  oom;
+} cda_scalar_ctx_t;
+
+static void cda_scalar_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) {
+    cda_scalar_ctx_t* c = (cda_scalar_ctx_t*)raw;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    ray_hll_t* sh = &c->shards[worker_id % c->n_workers];
+    if (!sh->regs) {
+        if (ray_hll_init(sh, c->p) != 0) {
+            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+            return;
+        }
+    }
+    const ray_t* v = c->vec;
+    const void* base = ray_data((ray_t*)v);
+    int8_t  t = c->type;
+    bool    hn = c->has_nulls;
+    const int64_t CHK = 65535;
+
+    if (t == RAY_I64 || t == RAY_TIMESTAMP) {
+        const int64_t* d = (const int64_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int64_t v_i = d[r];
+            if (hn && v_i == NULL_I64) continue;
+            ray_hll_add(sh, ray_hash_i64(v_i));
+        }
+    } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) {
+        const int32_t* d = (const int32_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int32_t v_i = d[r];
+            if (hn && v_i == NULL_I32) continue;
+            ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+        }
+    } else if (t == RAY_I16) {
+        const int16_t* d = (const int16_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            int16_t v_i = d[r];
+            if (hn && v_i == NULL_I16) continue;
+            ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+        }
+    } else if (t == RAY_BOOL || t == RAY_U8) {
+        const uint8_t* d = (const uint8_t*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            ray_hll_add(sh, ray_hash_i64((int64_t)d[r]));
+        }
+    } else if (t == RAY_F64) {
+        const double* d = (const double*)base;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            double v_f = d[r];
+            if (v_f != v_f) continue;     /* NaN = null in F64 column */
+            ray_hll_add(sh, ray_hash_f64(v_f));
+        }
+    } else if (RAY_IS_SYM(t)) {
+        /* SYM is width-encoded — sym id 0 is the canonical empty-string
+         * sentinel (treat as null), every other id is a real distinct
+         * value, so hash the id directly. */
+        uint8_t w = c->attrs & RAY_SYM_W_MASK;
+        if (w == RAY_SYM_W64) {
+            const int64_t* d = (const int64_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                int64_t v_i = d[r];
+                if (v_i == 0) continue;
+                ray_hll_add(sh, ray_hash_i64(v_i));
+            }
+        } else if (w == RAY_SYM_W32) {
+            const uint32_t* d = (const uint32_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                uint32_t v_i = d[r];
+                if (v_i == 0) continue;
+                ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+            }
+        } else if (w == RAY_SYM_W16) {
+            const uint16_t* d = (const uint16_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                uint16_t v_i = d[r];
+                if (v_i == 0) continue;
+                ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+            }
+        } else {
+            const uint8_t* d = (const uint8_t*)base;
+            for (int64_t r = start; r < end; r++) {
+                if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+                uint8_t v_i = d[r];
+                if (v_i == 0) continue;
+                ray_hll_add(sh, ray_hash_i64((int64_t)v_i));
+            }
+        }
+    } else if (t == RAY_STR) {
+        ray_t* vm = (ray_t*)v;
+        for (int64_t r = start; r < end; r++) {
+            if (((r - start) & CHK) == 0 && ray_interrupted()) return;
+            size_t n = 0;
+            const char* s = ray_str_vec_get(vm, r, &n);
+            if (!s || n == 0) continue;
+            ray_hll_add(sh, ray_hash_bytes(s, n));
+        }
+    }
+    /* Unsupported types fall through silently — caller validates. */
+}
+
+ray_t* ray_count_distinct_approx(ray_t* x) {
+    if (!x || RAY_IS_ERR(x)) return x;
+    if (!ray_is_vec(x)) {
+        /* Scalar atom — distinct count is 1 (or 0 if null). */
+        if (ray_is_atom(x)) {
+            if (RAY_ATOM_IS_NULL(x)) return ray_i64(0);
+            return ray_i64(1);
+        }
+        return ray_error("type", "count_distinct_approx: vec expected");
+    }
+    int8_t t = x->type;
+    /* Reject types we don't hash. */
+    if (t != RAY_I64 && t != RAY_I32 && t != RAY_I16 && t != RAY_U8 &&
+        t != RAY_BOOL && t != RAY_F64 && t != RAY_DATE && t != RAY_TIME &&
+        t != RAY_TIMESTAMP && t != RAY_STR && !RAY_IS_SYM(t))
+        return ray_error("type", "count_distinct_approx: unsupported element type");
+    int64_t n = x->len;
+    if (n == 0) return ray_i64(0);
+
+    ray_pool_t* pool = ray_pool_get();
+    uint32_t nw = (pool && n >= RAY_PARALLEL_THRESHOLD)
+                  ? ray_pool_total_workers(pool) : 1;
+
+    ray_t* shards_hdr = NULL;
+    ray_hll_t* shards = (ray_hll_t*)scratch_calloc(
+        &shards_hdr, (size_t)nw * sizeof(ray_hll_t));
+    if (!shards) return ray_error("oom", NULL);
+
+    cda_scalar_ctx_t ctx = {
+        .vec = x,
+        .type = t,
+        .attrs = x->attrs,
+        .has_nulls = (x->attrs & RAY_ATTR_HAS_NULLS) != 0,
+        .shards = shards,
+        .p = RAY_HLL_DEFAULT_P,
+        .n_workers = nw,
+        .oom = 0,
+    };
+    if (nw > 1) {
+        ray_pool_dispatch(pool, cda_scalar_fn, &ctx, n);
+    } else {
+        cda_scalar_fn(&ctx, 0, 0, n);
+    }
+    if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) {
+        for (uint32_t w = 0; w < nw; w++) ray_hll_free(&shards[w]);
+        scratch_free(shards_hdr);
+        return ray_error("oom", "count_distinct_approx: HLL alloc failed");
+    }
+    /* Merge per-worker shards into shard[0], then estimate. */
+    for (uint32_t w = 1; w < nw; w++) {
+        if (shards[w].regs)
+            ray_hll_merge(&shards[0], &shards[w]);
+    }
+    int64_t est = shards[0].regs ? ray_hll_estimate(&shards[0]) : 0;
+    for (uint32_t w = 0; w < nw; w++) ray_hll_free(&shards[w]);
+    scratch_free(shards_hdr);
+    return ray_i64(est);
+}
+
+/* ---- Per-group HLL --------------------------------------------------- */
+
+typedef struct {
+    const ray_t*   vec;
+    int8_t         type;
+    uint8_t        attrs;
+    bool           has_nulls;
+    const int64_t* idx_buf;
+    const int64_t* offsets;
+    const int64_t* counts;       /* per-group length — offsets has only n_groups entries */
+    uint8_t        p;
+    uint32_t       m;
+    int64_t*       out;
+    _Atomic(int)   oom;
+} cda_pg_buf_ctx_t;
+
+static void cda_pg_buf_task(void* raw, uint32_t worker_id, int64_t start, int64_t end) {
+    (void)worker_id;
+    cda_pg_buf_ctx_t* c = (cda_pg_buf_ctx_t*)raw;
+    if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return;
+    const void* base = ray_data((ray_t*)c->vec);
+    int8_t  t  = c->type;
+    bool    hn = c->has_nulls;
+
+    /* One private HLL per task (allocated on stack so we never touch
+     * the shared scratch arena from a worker thread).  P≤14 → m≤16384,
+     * fits comfortably in the default 8 MiB worker stack. */
+    uint8_t regs[1u << 14];
+    ray_hll_t sk = { .p = c->p, .m = c->m, .regs = regs, ._hdr = NULL };
+
+    for (int64_t g = start; g < end; g++) {
+        memset(regs, 0, c->m);
+        int64_t s = c->offsets[g];
+        int64_t e = s + c->counts[g];
+        if (t == RAY_I64 || t == RAY_TIMESTAMP) {
+            const int64_t* d = (const int64_t*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                int64_t v = d[r];
+                if (hn && v == NULL_I64) continue;
+                ray_hll_add(&sk, ray_hash_i64(v));
+            }
+        } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) {
+            const int32_t* d = (const int32_t*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                int32_t v = d[r];
+                if (hn && v == NULL_I32) continue;
+                ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+            }
+        } else if (t == RAY_I16) {
+            const int16_t* d = (const int16_t*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                int16_t v = d[r];
+                if (hn && v == NULL_I16) continue;
+                ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+            }
+        } else if (t == RAY_BOOL || t == RAY_U8) {
+            const uint8_t* d = (const uint8_t*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                ray_hll_add(&sk, ray_hash_i64((int64_t)d[r]));
+            }
+        } else if (t == RAY_F64) {
+            const double* d = (const double*)base;
+            for (int64_t k = s; k < e; k++) {
+                int64_t r = c->idx_buf[k];
+                double v = d[r];
+                if (v != v) continue;
+                ray_hll_add(&sk, ray_hash_f64(v));
+            }
+        } else if (RAY_IS_SYM(t)) {
+            uint8_t w = c->attrs & RAY_SYM_W_MASK;
+            if (w == RAY_SYM_W64) {
+                const int64_t* d = (const int64_t*)base;
+                for (int64_t k = s; k < e; k++) {
+                    int64_t r = c->idx_buf[k];
+                    int64_t v = d[r]; if (v == 0) continue;
+                    ray_hll_add(&sk, ray_hash_i64(v));
+                }
+            } else if (w == RAY_SYM_W32) {
+                const uint32_t* d = (const uint32_t*)base;
+                for (int64_t k = s; k < e; k++) {
+                    int64_t r = c->idx_buf[k];
+                    uint32_t v = d[r]; if (v == 0) continue;
+                    ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+                }
+            } else if (w == RAY_SYM_W16) {
+                const uint16_t* d = (const uint16_t*)base;
+                for (int64_t k = s; k < e; k++) {
+                    int64_t r = c->idx_buf[k];
+                    uint16_t v = d[r]; if (v == 0) continue;
+                    ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+                }
+            } else {
+                const uint8_t* d = (const uint8_t*)base;
+                for (int64_t k = s; k < e; k++) {
+                    int64_t r = c->idx_buf[k];
+                    uint8_t v = d[r]; if (v == 0) continue;
+                    ray_hll_add(&sk, ray_hash_i64((int64_t)v));
+                }
+            }
+        }
+        c->out[g] = ray_hll_estimate(&sk);
+    }
+}
+
+/* Per-group HLL count(distinct) entry point — contract is in ops/hll.h. */
+int ray_count_distinct_approx_pg_buf(ray_t* src, const int64_t* idx_buf,
+                                      const int64_t* offsets,
+                                      const int64_t* counts,
+                                      int64_t n_groups, uint8_t p, int64_t* out)
+{
+    if (!src || RAY_IS_ERR(src) || !idx_buf || !offsets || !counts || !out)
+        return -1;
+    int8_t t = src->type;
+    bool hashable = (t == RAY_I64 || t == RAY_I32 || t == RAY_I16 ||
+                      t == RAY_U8 || t == RAY_BOOL || t == RAY_F64 ||
+                      t == RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP ||
+                      RAY_IS_SYM(t));
+    if (!hashable) return -1;
+    if (n_groups <= 0) return 0;
+    if (p < 4) p = 4;
+    if (p > 14) p = 14;          /* task-local regs[] holds 1 << 14 bytes */
+    /* The parallel dispatch narrows n_groups to uint32_t — never truncate. */
+    bool fits_u32 = n_groups <= (int64_t)UINT32_MAX;
+
+    cda_pg_buf_ctx_t ctx = {
+        .vec = src,
+        .type = t,
+        .attrs = src->attrs,
+        .has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0,
+        .idx_buf = idx_buf,
+        .offsets = offsets,
+        .counts = counts,
+        .p = p,
+        .m = 1u << p,
+        .out = out,
+        .oom = 0,
+    };
+    ray_pool_t* pool = ray_pool_get();
+    if (pool && fits_u32 && ray_pool_total_workers(pool) >= 2 && n_groups >= 4) {
+        ray_pool_dispatch_n(pool, cda_pg_buf_task, &ctx, (uint32_t)n_groups);
+    } else {
+        cda_pg_buf_task(&ctx, 0, 0, n_groups);
+    }
+    if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) return -1;
+    return 0;
+}
diff --git a/src/ops/hll.h b/src/ops/hll.h
new file mode 100644
index 00000000..29b98332
--- /dev/null
+++ b/src/ops/hll.h
@@ -0,0 +1,118 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_OPS_HLL_H
+#define RAY_OPS_HLL_H
+
+/**
+ * Probabilistic cardinality sketch (HyperLogLog).
+ *
+ * Each sketch holds 2^P registers; each register stores the maximum
+ * rho (leading-zero run + 1) seen for any hash whose top P bits index
+ * that register.  Cardinality is read off the harmonic mean of 2^reg
+ * over all registers, with linear counting taking over at the small
+ * end of the range.  Standard error ≈ 1.04 / sqrt(2^P).  P=14 → ≈ 0.8 %.
+ *
+ * Memory: 1 byte per register (rho never exceeds 64-P+1, so 8 bits is
+ * over the 6 bits a packed implementation would need; the extra few
+ * KB buys a tighter hot loop).  At P=14 a sketch is 16 KB and lives
+ * in L2 for the duration of one query.
+ *
+ * The sketch is mergeable element-wise (max), which is the property
+ * the per-group / per-worker aggregation paths rely on: each worker
+ * builds a local sketch and the planner merges them at finalisation.
+ */
+
+#include "rayforce.h"
+#include "ops/hash.h"
+
+/* Default precision: 14 (16384 registers, ~0.81 % std error, 16 KB). */
+#define RAY_HLL_DEFAULT_P  14
+
+typedef struct {
+    uint8_t  p;        /* precision: register count = 1 << p */
+    uint32_t m;        /* register count */
+    uint8_t* regs;     /* [m] — 1 byte per register, holds rho count */
+    ray_t*   _hdr;     /* scratch handle for regs */
+} ray_hll_t;
+
+/* Initialise an empty sketch with `p` precision bits (clamped to
+ * [4, 18]).  Allocates zeroed regs via scratch_calloc; the caller frees
+ * with ray_hll_free.  Returns 0 on success, -1 on NULL `h` or OOM. */
+int  ray_hll_init(ray_hll_t* h, uint8_t p);
+
+/* Free the regs allocation.  Safe on a zeroed (uninitialised) sketch. */
+void ray_hll_free(ray_hll_t* h);
+
+/* Zero all registers (clears the sketch — same effect as init with the
+ * same p, but in-place; useful when reusing a sketch across calls). */
+void ray_hll_reset(ray_hll_t* h);
+
+/* Add a 64-bit hash to the sketch.  Caller hashes its value type first
+ * (see ray_hash_i64 / ray_hash_bytes in ops/hash.h).  Hot path, fully
+ * inline, unchecked: h->regs must be allocated and 1 <= h->p <= 63. */
+static inline void ray_hll_add(ray_hll_t* h, uint64_t hash) {
+    uint32_t idx = (uint32_t)(hash >> (64u - h->p));
+    /* Shift the p index bits out so the remaining (64-p) hash bits sit
+     * at the top, then set bit (p-1) as a sentinel: clz stops there at
+     * the latest, so rho lands in [1, 64-p+1] with no all-zero branch. */
+    uint64_t rest = (hash << h->p) | (1ULL << (h->p - 1));
+    uint8_t  rho  = (uint8_t)(__builtin_clzll(rest) + 1u);
+    if (rho > h->regs[idx]) h->regs[idx] = rho;
+}
+
+/* Merge src into dst (element-wise max).  src and dst must share the
+ * same precision p. */
+void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src);
+
+/* Estimate the unique-value count of all hashes added so far.  Uses
+ * the standard HyperLogLog estimator with bias-corrected raw-mean for
+ * the mid-range and linear counting (m * ln(m/V)) when many registers
+ * are still zero (V = unused register count). */
+int64_t ray_hll_estimate(const ray_hll_t* h);
+
+/* Scalar approximate `count(distinct …)` over a vec, ~0.8 % standard
+ * error.  Handles I64/I32/I16/U8/BOOL/F64/DATE/TIME/TIMESTAMP/SYM/STR;
+ * other element types yield a "type" error.  Nulls are skipped (SQL
+ * `count distinct` semantics), as are NaN, SYM id 0 and empty strings.
+ * Parallelised: each worker sketches its own row range and the main
+ * thread merges.  Wired into `exec_count_distinct` above a row threshold. */
+ray_t* ray_count_distinct_approx(ray_t* x);
+
+/* Per-group approximate `count(distinct …)` over a buffered row-index
+ * layout: group g owns idx_buf[offsets[g] .. offsets[g] + counts[g]).
+ * Parallelised across groups — one task per group, each task uses a
+ * private stack-resident HLL so total memory is O(n_workers · 1<<p);
+ * `p` is clamped to [4, 14] to fit that buffer.  Callers holding a
+ * row_gid layout instead build idx_buf+offsets+counts once and call
+ * this; there's a single per-group kernel.  Writes the estimate to
+ * out[g].  Returns 0 on success, -1 on a NULL/error argument or an
+ * unsupported type, RAY_STR included (caller falls back to exact). */
+int ray_count_distinct_approx_pg_buf(ray_t* src,
+                                      const int64_t* idx_buf,
+                                      const int64_t* offsets,
+                                      const int64_t* counts,
+                                      int64_t n_groups,
+                                      uint8_t p, int64_t* out);
+
+#endif /* RAY_OPS_HLL_H */
diff --git a/src/ops/query.c b/src/ops/query.c
index c738c844..aa160eec 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -34,6 +34,7 @@
 #include "ops/rowsel.h"
 #include "ops/fused_group.h"
 #include "ops/fused_topk.h"
+#include "ops/hll.h"
 #include "ops/temporal.h"
 #include "core/profile.h"
 #include "table/sym.h"
@@ -2714,6 +2715,33 @@ static ray_t* count_distinct_per_group_buf(ray_t* inner_expr, ray_t* tbl,
     out->len = n_groups;
     int64_t* odata = (int64_t*)ray_data(out);
 
+    /* HyperLogLog approximate path — one task per group, each task with
+     * a private stack-resident sketch (~16 KB).  Triggered when the
+     * total inflated row count across all groups is large enough that
+     * the exact per-group dedup HT becomes memory-bandwidth-bound;
+     * 1 M rows is the same threshold the global path in
+     * exec_count_distinct uses.  Returns within ~0.8 % std error. */
+    /* The kernel returns non-zero when it cannot hash src's element
+     * type (RAY_STR, for instance, is not handled per-group); odata is
+     * then re-zeroed and control falls through to the exact paths
+     * below.  The literal 14 is the sketch precision — the same value
+     * as RAY_HLL_DEFAULT_P, which the scalar path uses — and is the
+     * largest p the per-task stack buffer in hll.c accommodates. */
+    if (n_groups > 0) {
+        int64_t total_rows = 0;
+        for (int64_t g = 0; g < n_groups; g++) total_rows += grp_cnt[g];
+        if (total_rows >= (1 << 20)) {
+            if (ray_count_distinct_approx_pg_buf(src, idx_buf, offsets,
+                                                  grp_cnt, n_groups,
+                                                  14, odata) == 0) {
+                ray_release(src);
+                return out;
+            }
+            /* Fall through on type miss; out still zeroed. */
+            memset(odata, 0, (size_t)n_groups * sizeof(int64_t));
+        }
+    }
+
     /* Parallel path: dispatch one task per group when src has a flat
      * numeric / SYM layout we can read with a typed pointer.  Each task
      * does its own dedup with a scratch hash table — no gather_by_idx

From 5e23603adb34720cad755e325bb9768e929be4e0 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 14:47:24 +0200
Subject: [PATCH 10/11] feat(idx): per-chunk min/max zone index + filter
 chunk-skip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New index kind RAY_IDX_CHUNK_ZONE (5).  Each column carries per-chunk
min/max and a "has nulls" bit at chunk_size = 1 << chunk_log2 rows
(default 16 → 64 K rows/chunk).  Built once at column ingest time —
`.csv.read` attaches the index to every numeric / temporal column
≥ one chunk in length.  Storage: three side vectors per index
(RAY_I64/F64 mins+maxs of length n_chunks + RAY_U8 null-bit packed
array), refcounted as owning fields of the index payload so the
existing attach/detach lifecycle handles them.

Two consumers:

  scalar min/max reduce (`ray_min_fn` / `ray_max_fn`)
    O(n_chunks) walk over mins[*] / maxs[*] instead of O(n_rows).
    Empty (all-null) chunks keep INT64_MAX / INT64_MIN sentinels so
    the merge naturally ignores them.

  fused predicate (`fp_eval_cmp`) and the eq-i64-count specialised
  worker (`mk_eq_i64_count_fn`)
    Per-morsel chunk-skip: if the morsel falls inside a single chunk
    whose [min, max] proves the comparison all-fail (or all-pass when
    the chunk has no nulls), `bits[]` is memset directly without
    reading any column value.  In the eq-i64-count path the loop walks
    its row range in chunk strides and skips entire chunks whose
    [min, max] makes any predicate child all-fail — this eliminates
    the big-column reads (RefererHash / URLHash) for nearly all
    clusters, i.e. those outside the matching CounterID / EventDate
    range.

Measured (10M-row hits, in-memory):

  q06 (min/max EventDate)        6.4 → 0.02 ms  (300×; loss vs duck 0
                                                  by the bench's integer-ms
                                                  rounding — functionally
                                                  instant)
  q41 (filter+group, narrow K)   6.0 → 3.2 ms   FLIP vs duck 5
  q40 (filter+group, wide K)      17 → 13 ms    closer to duck 4
  q37 (filter+group, clustered)   15 → 12 ms    bigger margin
  q38 (filter+group, clustered)   17 → 15 ms    bigger margin

Test suite 2657/2659 (2 skipped, 0 failed).  Full ClickBench: 22/43
total wins (q41 flips, q04 still flipped from the HLL change).
---
 src/io/csv.c          |  26 ++++++
 src/ops/agg.c         |  73 +++++++++++++++-
 src/ops/fused_group.c | 159 +++++++++++++++++++++++++++++++---
 src/ops/idxop.c       | 192 ++++++++++++++++++++++++++++++++++++++++--
 src/ops/idxop.h       |  36 ++++++--
 5 files changed, 463 insertions(+), 23 deletions(-)

diff --git a/src/io/csv.c b/src/io/csv.c
index f8189ecb..0784d89e 100644
--- a/src/io/csv.c
+++ b/src/io/csv.c
@@ -44,6 +44,7 @@
 #include "core/pool.h"
 #include "lang/format.h"
 #include "ops/hash.h"
+#include "ops/idxop.h"      /* attach per-chunk zone index after load */
 #include "store/col.h"
 #include "store/fileio.h"
 #include "store/splay.h"
@@ -1410,6 +1411,20 @@ static ray_t* csv_materialize_rows(const char* buf, size_t file_size,
         col_data[c] = dst;
     }
 
+    /* Per-chunk min/max + null bit on every column big enough to be worth
+     * indexing — gives the reduce min/max and the filter chunk-skip paths
+     * an O(n_chunks) scan instead of O(n_rows).  Attach is best-effort:
+     * unsupported types (RAY_STR/RAY_SYM/RAY_GUID in v1) just stay
+     * unindexed and the consumer falls back to a row scan. */
+    for (int c = 0; c < ncols; c++) {
+        ray_t* v = col_vecs[c];
+        if (!v || RAY_IS_ERR(v)) continue;
+        if (v->len < (1 << 16)) continue;        /* < one chunk, skip */
+        ray_t* r = ray_index_attach_chunk_zone(&v, 16);
+        if (r && !RAY_IS_ERR(r)) col_vecs[c] = v;  /* attach succeeded */
+        /* On failure the original column stays in col_vecs[c]; ignore. */
+    }
+
     ray_t* tbl = ray_table_new(ncols);
     if (!tbl || RAY_IS_ERR(tbl)) {
         for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]);
@@ -1788,6 +1803,17 @@ ray_t* ray_read_csv_named_opts(const char* path, char delimiter, bool header,
 
     /* ---- 11. Build table ---- */
     {
+        /* Best-effort per-chunk zone index attach (see comment on the
+         * matching loop in csv_materialize_rows) — unsupported types
+         * fall through to the unindexed path inside the consumer. */
+        for (int c = 0; c < ncols; c++) {
+            ray_t* v = col_vecs[c];
+            if (!v || RAY_IS_ERR(v)) continue;
+            if (v->len < (1 << 16)) continue;
+            ray_t* r = ray_index_attach_chunk_zone(&v, 16);
+            if (r && !RAY_IS_ERR(r)) col_vecs[c] = v;
+        }
+
         ray_t* tbl = ray_table_new(ncols);
         if (!tbl || RAY_IS_ERR(tbl)) {
             for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]);
diff --git a/src/ops/agg.c b/src/ops/agg.c
index fee02d2e..34328522 100644
--- a/src/ops/agg.c
+++ b/src/ops/agg.c
@@ -23,6 +23,7 @@
 
 #include "lang/internal.h"
 #include "ops/ops.h"
+#include "ops/idxop.h"   /* RAY_IDX_CHUNK_ZONE fast path for min/max */
 #include "mem/heap.h"
 
 #include <stdlib.h>  /* qsort (introselect fallback) */
@@ -328,7 +329,43 @@ ray_t* ray_min_fn(ray_t* x) {
     if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MIN);
     if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 0);
     if (ray_is_atom(x)) { ray_retain(x); return x; }
-    if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_min_op);
+    if (ray_is_vec(x)) {
+        /* Per-chunk zone index fast path: O(n_chunks) instead of O(n_rows).
+         * Only valid when the index was built for the column's current len
+         * (mutation paths call ray_index_drop). */
+        if (ray_index_kind(x) == RAY_IDX_CHUNK_ZONE) {
+            ray_index_t* ix = ray_index_payload(x->index);
+            if (ix->built_for_len == x->len) {
+                uint32_t n_chunks = ix->u.chunk_zone.n_chunks;
+                if (ix->u.chunk_zone.is_f64) {
+                    const double* mins = (const double*)ray_data(ix->u.chunk_zone.mins);
+                    double mn = INFINITY;
+                    for (uint32_t g = 0; g < n_chunks; g++)
+                        if (mins[g] < mn) mn = mins[g];
+                    if (mn == INFINITY) return ray_typed_null(-RAY_F64);
+                    return make_f64(mn);
+                } else {
+                    const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins);
+                    int64_t mn = INT64_MAX;
+                    for (uint32_t g = 0; g < n_chunks; g++)
+                        if (mins[g] < mn) mn = mins[g];
+                    if (mn == INT64_MAX) return ray_typed_null(-x->type);
+                    /* Preserve the column's storage width on the result. */
+                    switch (x->type) {
+                    case RAY_BOOL:      return ray_bool((bool)mn);
+                    case RAY_U8:        return ray_u8((uint8_t)mn);
+                    case RAY_I16:       return ray_i16((int16_t)mn);
+                    case RAY_I32:       return ray_i32((int32_t)mn);
+                    case RAY_DATE:      return ray_date((int32_t)mn);
+                    case RAY_TIME:      return ray_time(mn);
+                    case RAY_TIMESTAMP: return ray_timestamp(mn);
+                    default:            return ray_i64(mn);
+                    }
+                }
+            }
+        }
+        AGG_VEC_VIA_DAG(x, ray_min_op);
+    }
     if (!is_list(x)) return ray_error("type", NULL);
     int64_t len = ray_len(x);
     if (len == 0) return ray_error("domain", NULL);
@@ -350,7 +387,39 @@ ray_t* ray_max_fn(ray_t* x) {
     if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MAX);
     if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 1);
     if (ray_is_atom(x)) { ray_retain(x); return x; }
-    if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_max_op);
+    if (ray_is_vec(x)) {
+        if (ray_index_kind(x) == RAY_IDX_CHUNK_ZONE) {
+            ray_index_t* ix = ray_index_payload(x->index);
+            if (ix->built_for_len == x->len) {
+                uint32_t n_chunks = ix->u.chunk_zone.n_chunks;
+                if (ix->u.chunk_zone.is_f64) {
+                    const double* maxs = (const double*)ray_data(ix->u.chunk_zone.maxs);
+                    double mx = -INFINITY;
+                    for (uint32_t g = 0; g < n_chunks; g++)
+                        if (maxs[g] > mx) mx = maxs[g];
+                    if (mx == -INFINITY) return ray_typed_null(-RAY_F64);
+                    return make_f64(mx);
+                } else {
+                    const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs);
+                    int64_t mx = INT64_MIN;
+                    for (uint32_t g = 0; g < n_chunks; g++)
+                        if (maxs[g] > mx) mx = maxs[g];
+                    if (mx == INT64_MIN) return ray_typed_null(-x->type);
+                    switch (x->type) {
+                    case RAY_BOOL:      return ray_bool((bool)mx);
+                    case RAY_U8:        return ray_u8((uint8_t)mx);
+                    case RAY_I16:       return ray_i16((int16_t)mx);
+                    case RAY_I32:       return ray_i32((int32_t)mx);
+                    case RAY_DATE:      return ray_date((int32_t)mx);
+                    case RAY_TIME:      return ray_time(mx);
+                    case RAY_TIMESTAMP: return ray_timestamp(mx);
+                    default:            return ray_i64(mx);
+                    }
+                }
+            }
+        }
+        AGG_VEC_VIA_DAG(x, ray_max_op);
+    }
     if (!is_list(x)) return ray_error("type", NULL);
     int64_t len = ray_len(x);
     if (len == 0) return ray_error("domain", NULL);
diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index 04c2fb43..ea0a05f7 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -23,6 +23,7 @@
 
 #include "ops/fused_group.h"
 #include "ops/fused_pred.h" /* fp_pred_t / fp_compile_pred / fp_eval_pred */
+#include "ops/idxop.h"      /* RAY_IDX_CHUNK_ZONE chunk-skip in fp_eval_cmp */
 #include "lang/eval.h"      /* RAY_ATTR_NAME */
 #include "core/pool.h"      /* ray_pool_get / ray_pool_dispatch */
 
@@ -344,6 +345,72 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end,
         return;
     }
 
+    /* Chunk-zone fast path: if the column carries per-chunk min/max
+     * metadata and [start, end) fits inside a single chunk, decide the
+     * whole morsel from chunk extrema without reading a single value.
+     * Only integer/temporal comparisons (EQ/NE/LT/LE/GT/GE) — LIKE/IN
+     * have their own evaluators below and SYM ordering is rejected at
+     * compile time anyway.  The all-pass shortcut is gated on "no
+     * nulls in this chunk" because SQL `(x op c)` is FALSE/NULL when x
+     * is NULL; the all-fail shortcut needs no such guard. */
+    if (p->col_obj && (p->col_obj->attrs & RAY_ATTR_HAS_INDEX) &&
+        p->col_obj->index)
+    {
+        ray_index_t* ix = ray_index_payload(p->col_obj->index);
+        if (ix->kind == RAY_IDX_CHUNK_ZONE &&
+            ix->built_for_len == p->col_obj->len &&
+            !ix->u.chunk_zone.is_f64 &&
+            (op == FP_EQ || op == FP_NE ||
+             op == FP_LT || op == FP_LE ||
+             op == FP_GT || op == FP_GE))
+        {
+            uint8_t log2 = ix->u.chunk_zone.chunk_log2;
+            int64_t s_ch = start >> log2;
+            int64_t e_ch = (end - 1) >> log2;
+            if (s_ch == e_ch && (uint32_t)s_ch < ix->u.chunk_zone.n_chunks) {
+                const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins);
+                const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs);
+                int64_t cmin = mins[s_ch], cmax = maxs[s_ch];
+                if (cmin <= cmax) {       /* skip empty (all-null) chunks */
+                    const uint8_t* nb = (const uint8_t*)ray_data(ix->u.chunk_zone.null_bits);
+                    bool has_nulls = (nb[s_ch >> 3] >> (s_ch & 7)) & 1u;
+                    int decision = -1;   /* 0=all-fail, 1=all-pass, -1=mixed */
+                    switch (op) {
+                    case FP_EQ:
+                        if (cval < cmin || cval > cmax)        decision = 0;
+                        else if (!has_nulls && cmin == cmax)   decision = 1;
+                        break;
+                    case FP_NE:
+                        if (!has_nulls && (cval < cmin || cval > cmax)) decision = 1;
+                        else if (cmin == cmax && cval == cmin)          decision = 0;
+                        break;
+                    case FP_LT:
+                        if (cmin >= cval)                      decision = 0;
+                        else if (!has_nulls && cmax < cval)    decision = 1;
+                        break;
+                    case FP_LE:
+                        if (cmin >  cval)                      decision = 0;
+                        else if (!has_nulls && cmax <= cval)   decision = 1;
+                        break;
+                    case FP_GT:
+                        if (cmax <= cval)                      decision = 0;
+                        else if (!has_nulls && cmin >  cval)   decision = 1;
+                        break;
+                    case FP_GE:
+                        if (cmax <  cval)                      decision = 0;
+                        else if (!has_nulls && cmin >= cval)   decision = 1;
+                        break;
+                    default: break;
+                    }
+                    if (decision >= 0) {
+                        memset(bits, (uint8_t)decision, (size_t)n);
+                        return;
+                    }
+                }
+            }
+        }
+    }
+
     /* SYM low-card fold: const not in dict ⇒ EQ all-zero / NE all-one.
      * Ordering ops are rejected at compile for SYM, so unreachable here. */
     if (ct == RAY_SYM && !p->cval_in_dict) {
@@ -2539,20 +2606,90 @@ static void mk_eq_i64_count_fn(void* raw, uint32_t worker_id,
     const fp_cmp_t* eq = &c->pred.children[fc->eq_idx];
     const int64_t* eq_col = (const int64_t*)eq->col_base;
     int64_t eq_val = eq->cval;
-    for (int64_t row = start; row < end; row++) {
-        if (eq_col[row] != eq_val) continue;
-        uint8_t pass = 1;
-        for (uint8_t i = 0; i < c->pred.n_children; i++) {
-            if (i == fc->eq_idx) continue;
-            if (!fp_eval_cmp_one(&c->pred.children[i], row)) {
-                pass = 0;
+
+    /* Chunk-skip: for each predicate child whose column carries a
+     * chunk_zone index, walk the row range in chunk strides and skip
+     * any chunk where the child's [min, max] proves an all-fail.  For
+     * clustered columns (e.g. data sorted by CounterID, EventDate) this
+     * eliminates the per-row RefererHash/URLHash read for ~all chunks
+     * outside the matching counter / date range — q40/q41/q42 pattern.
+     * Picks chunk_log2 from the first indexed child; children indexed
+     * with a different chunk_log2 are not consulted (csv.read uses one
+     * value today).  No usable index ⇒ plain per-row loop over the range. */
+    uint8_t chunk_log2 = 0;
+    for (uint8_t i = 0; i < c->pred.n_children; i++) {
+        ray_t* co = c->pred.children[i].col_obj;
+        if (co && (co->attrs & RAY_ATTR_HAS_INDEX) && co->index) {
+            ray_index_t* ix = ray_index_payload(co->index);
+            if (ix->kind == RAY_IDX_CHUNK_ZONE &&
+                ix->built_for_len == co->len) {
+                chunk_log2 = ix->u.chunk_zone.chunk_log2;
                 break;
             }
         }
-        if (!pass) continue;
-        if (mk_count_upsert_row(c, sh, row) != 0) {
-            atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
-            return;
+    }
+
+    int64_t row = start;
+    while (row < end) {
+        int64_t chunk_end;
+        if (chunk_log2 > 0) {
+            int64_t csz = 1LL << chunk_log2;
+            chunk_end = ((row >> chunk_log2) + 1) << chunk_log2;
+            (void)csz;
+            if (chunk_end > end) chunk_end = end;
+            bool all_fail = false;
+            for (uint8_t i = 0; i < c->pred.n_children && !all_fail; i++) {
+                const fp_cmp_t* p = &c->pred.children[i];
+                ray_t* co = p->col_obj;
+                if (!co || !(co->attrs & RAY_ATTR_HAS_INDEX) || !co->index)
+                    continue;
+                ray_index_t* ix = ray_index_payload(co->index);
+                if (ix->kind != RAY_IDX_CHUNK_ZONE ||
+                    ix->built_for_len != co->len ||
+                    ix->u.chunk_zone.chunk_log2 != chunk_log2 ||
+                    ix->u.chunk_zone.is_f64)
+                    continue;
+                fp_op_t op = p->op;
+                if (op != FP_EQ && op != FP_NE && op != FP_LT &&
+                    op != FP_LE && op != FP_GT && op != FP_GE)
+                    continue;
+                int64_t s_ch = row >> chunk_log2;
+                if ((uint32_t)s_ch >= ix->u.chunk_zone.n_chunks) continue;
+                const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins);
+                const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs);
+                int64_t cmin = mins[s_ch], cmax = maxs[s_ch];
+                if (cmin > cmax) continue;   /* empty chunk */
+                int64_t cv = p->cval;
+                switch (op) {
+                case FP_EQ: if (cv < cmin || cv > cmax) all_fail = true; break;
+                case FP_NE: if (cmin == cmax && cv == cmin) all_fail = true; break;
+                case FP_LT: if (cmin >= cv) all_fail = true; break;
+                case FP_LE: if (cmin >  cv) all_fail = true; break;
+                case FP_GT: if (cmax <= cv) all_fail = true; break;
+                case FP_GE: if (cmax <  cv) all_fail = true; break;
+                default: break;
+                }
+            }
+            if (all_fail) { row = chunk_end; continue; }
+        } else {
+            chunk_end = end;
+        }
+
+        for (; row < chunk_end; row++) {
+            if (eq_col[row] != eq_val) continue;
+            uint8_t pass = 1;
+            for (uint8_t i = 0; i < c->pred.n_children; i++) {
+                if (i == fc->eq_idx) continue;
+                if (!fp_eval_cmp_one(&c->pred.children[i], row)) {
+                    pass = 0;
+                    break;
+                }
+            }
+            if (!pass) continue;
+            if (mk_count_upsert_row(c, sh, row) != 0) {
+                atomic_store_explicit(&c->oom, 1, memory_order_relaxed);
+                return;
+            }
         }
     }
 }
diff --git a/src/ops/idxop.c b/src/ops/idxop.c
index 3f74476b..6e0a3d37 100644
--- a/src/ops/idxop.c
+++ b/src/ops/idxop.c
@@ -154,6 +154,17 @@ void ray_index_release_payload(ray_index_t* ix) {
             ray_release(ix->u.bloom.bits);
         ix->u.bloom.bits = NULL;
         break;
+    case RAY_IDX_CHUNK_ZONE:
+        if (ix->u.chunk_zone.mins && !RAY_IS_ERR(ix->u.chunk_zone.mins))
+            ray_release(ix->u.chunk_zone.mins);
+        if (ix->u.chunk_zone.maxs && !RAY_IS_ERR(ix->u.chunk_zone.maxs))
+            ray_release(ix->u.chunk_zone.maxs);
+        if (ix->u.chunk_zone.null_bits && !RAY_IS_ERR(ix->u.chunk_zone.null_bits))
+            ray_release(ix->u.chunk_zone.null_bits);
+        ix->u.chunk_zone.mins = NULL;
+        ix->u.chunk_zone.maxs = NULL;
+        ix->u.chunk_zone.null_bits = NULL;
+        break;
     case RAY_IDX_ZONE:
     case RAY_IDX_NONE:
         break;
@@ -176,6 +187,14 @@ void ray_index_retain_payload(ray_index_t* ix) {
         if (ix->u.bloom.bits && !RAY_IS_ERR(ix->u.bloom.bits))
             ray_retain(ix->u.bloom.bits);
         break;
+    case RAY_IDX_CHUNK_ZONE:
+        if (ix->u.chunk_zone.mins && !RAY_IS_ERR(ix->u.chunk_zone.mins))
+            ray_retain(ix->u.chunk_zone.mins);
+        if (ix->u.chunk_zone.maxs && !RAY_IS_ERR(ix->u.chunk_zone.maxs))
+            ray_retain(ix->u.chunk_zone.maxs);
+        if (ix->u.chunk_zone.null_bits && !RAY_IS_ERR(ix->u.chunk_zone.null_bits))
+            ray_retain(ix->u.chunk_zone.null_bits);
+        break;
     case RAY_IDX_ZONE:
     case RAY_IDX_NONE:
         break;
@@ -262,6 +281,107 @@ static ray_err_t zone_scan(ray_t* v, ray_index_t* ix) {
     }
 }
 
+/* --------------------------------------------------------------------------
+ * Chunk-zone scan -- per-(1<<chunk_log2)-row min/max + null flag
+ *
+ * For each chunk g in [0, n_chunks) the scan computes the chunk's min and
+ * max value across its row range and sets the chunk's null-bit if any row
+ * in that chunk is a null sentinel.  Whole-column extrema fall out as
+ * min(mins[*]) / max(maxs[*]) so the reduce min/max path can consume this
+ * index without needing a separate column-wide zone.
+ * -------------------------------------------------------------------------- */
+
+static ray_err_t chunk_zone_scan_int(ray_t* v, ray_index_t* ix,   /* fills mins / maxs / null_bits of ix from integer-width column v */
+                                     int elem_size) {             /* elem_size: bytes per element; 1, 2, 4 or 8, else RAY_ERR_TYPE */
+    uint32_t n_chunks = ix->u.chunk_zone.n_chunks;
+    uint8_t  log2     = ix->u.chunk_zone.chunk_log2;
+    int64_t  csz      = 1LL << log2;                  /* rows per chunk */
+    int64_t  n        = v->len;
+    int64_t* mins     = (int64_t*)ray_data(ix->u.chunk_zone.mins);
+    int64_t* maxs     = (int64_t*)ray_data(ix->u.chunk_zone.maxs);
+    uint8_t* nbits    = (uint8_t*)ray_data(ix->u.chunk_zone.null_bits);
+    const uint8_t* base = (const uint8_t*)ray_data(v);   /* raw element bytes, indexed by i * elem_size */
+
+    for (uint32_t g = 0; g < n_chunks; g++) {
+        int64_t s = (int64_t)g * csz;                 /* chunk g covers rows [s, e) */
+        int64_t e = s + csz; if (e > n) e = n;        /* last chunk may be short */
+        int64_t mn = INT64_MAX, mx = INT64_MIN;
+        bool any_null = false;
+        for (int64_t i = s; i < e; i++) {
+            if (ray_vec_is_null(v, i)) { any_null = true; continue; }   /* nulls never contribute to the extrema */
+            int64_t val = 0;
+            switch (elem_size) {
+            case 1: val = (int64_t)base[i]; break;   /* read as an unsigned byte (base is uint8_t*) */
+            case 2: { int16_t t; memcpy(&t, base + i*2, 2); val = (int64_t)t; break; }   /* memcpy: no alignment / aliasing assumptions */
+            case 4: { int32_t t; memcpy(&t, base + i*4, 4); val = (int64_t)t; break; }
+            case 8: { int64_t t; memcpy(&t, base + i*8, 8); val = t;          break; }
+            default: return RAY_ERR_TYPE;            /* unsupported element width */
+            }
+            if (val < mn) mn = val;
+            if (val > mx) mx = val;
+        }
+        /* Empty (all-null) chunks keep mn=INT64_MAX / mx=INT64_MIN, i.e.
+         * mins[g] > maxs[g], so min(mins[*]) / max(maxs[*]) ignores them. */
+        mins[g] = mn;   /* NOTE(review): a real value equal to the sentinel looks "empty" to readers that test the sentinel itself */
+        maxs[g] = mx;
+        if (any_null) nbits[g >> 3] |= (uint8_t)(1u << (g & 7));   /* only ORs bits in: null_bits must start zeroed */
+    }
+    return RAY_OK;
+}
+
+static ray_err_t chunk_zone_scan_float(ray_t* v, ray_index_t* ix,   /* fills mins / maxs / null_bits of ix from float column v */
+                                       int elem_size) {             /* elem_size: 4 reads float; any other value reads 8-byte double */
+    uint32_t n_chunks = ix->u.chunk_zone.n_chunks;
+    uint8_t  log2     = ix->u.chunk_zone.chunk_log2;
+    int64_t  csz      = 1LL << log2;                  /* rows per chunk */
+    int64_t  n        = v->len;
+    double*  mins     = (double*)ray_data(ix->u.chunk_zone.mins);   /* extrema are stored as double for both widths */
+    double*  maxs     = (double*)ray_data(ix->u.chunk_zone.maxs);
+    uint8_t* nbits    = (uint8_t*)ray_data(ix->u.chunk_zone.null_bits);
+    const uint8_t* base = (const uint8_t*)ray_data(v);
+
+    for (uint32_t g = 0; g < n_chunks; g++) {
+        int64_t s = (int64_t)g * csz;                 /* chunk g covers rows [s, e) */
+        int64_t e = s + csz; if (e > n) e = n;        /* last chunk may be short */
+        double mn = INFINITY, mx = -INFINITY;
+        bool any_null = false;
+        for (int64_t i = s; i < e; i++) {
+            if (ray_vec_is_null(v, i)) { any_null = true; continue; }
+            double val = 0.0;
+            if (elem_size == 4) {
+                float t; memcpy(&t, base + i*4, 4); val = (double)t;   /* widen float to double */
+            } else {
+                memcpy(&val, base + i*8, 8);
+            }
+            if (isnan(val)) { any_null = true; continue; }   /* NaN is flagged like a null and kept out of the extrema */
+            if (val < mn) mn = val;
+            if (val > mx) mx = val;
+        }
+        /* Empty (all-null / all-NaN) chunks keep mn=+inf / mx=-inf so
+         * reduce (min/max across mins[]/maxs[]) ignores them. */
+        mins[g] = mn;
+        maxs[g] = mx;
+        if (any_null) nbits[g >> 3] |= (uint8_t)(1u << (g & 7));   /* only ORs bits in: null_bits must start zeroed */
+    }
+    return RAY_OK;
+}
+
+static ray_err_t chunk_zone_scan(ray_t* v, ray_index_t* ix) {   /* dispatch on v->type to the int / float scanner with the element width */
+    switch (v->type) {
+    case RAY_BOOL:
+    case RAY_U8:        return chunk_zone_scan_int(v, ix, 1);   /* 1-byte elements */
+    case RAY_I16:       return chunk_zone_scan_int(v, ix, 2);
+    case RAY_I32:
+    case RAY_DATE:      return chunk_zone_scan_int(v, ix, 4);   /* 4-byte elements */
+    case RAY_I64:
+    case RAY_TIME:
+    case RAY_TIMESTAMP: return chunk_zone_scan_int(v, ix, 8);   /* 8-byte elements */
+    case RAY_F32:       return chunk_zone_scan_float(v, ix, 4);
+    case RAY_F64:       return chunk_zone_scan_float(v, ix, 8);
+    default:            return RAY_ERR_NYI;                     /* every other type is not indexable here */
+    }
+}
+
 /* --------------------------------------------------------------------------
  * Attach
  *
@@ -335,6 +455,59 @@ ray_t* ray_index_attach_zone(ray_t** vp) {
     return attach_finalize(v, idx);
 }
 
+ray_t* ray_index_attach_chunk_zone(ray_t** vp, uint8_t chunk_log2) {   /* builds a RAY_IDX_CHUNK_ZONE index over *vp and hands it to attach_finalize */
+    ray_t* v = prepare_attach(vp, "chunk_zone");   /* NOTE(review): defined outside this hunk; its errors are returned unchanged */
+    if (RAY_IS_ERR(v)) return v;
+
+    if (chunk_log2 == 0) chunk_log2 = 16;          /* default 64 K rows / chunk */
+    if (chunk_log2 < 8 || chunk_log2 > 22)
+        return ray_error("domain", "chunk_zone: chunk_log2 out of range [8, 22]");
+    int64_t csz = 1LL << chunk_log2;               /* rows per chunk */
+    /* No point indexing a column smaller than one chunk — fall back to
+     * the column-wide zone (or no index at all) at that size. */
+    if (v->len < csz)
+        return ray_error("domain", "chunk_zone: column has fewer rows than one chunk");
+
+    uint32_t n_chunks = (uint32_t)((v->len + csz - 1) / csz);   /* ceil(len / csz); the last chunk may be partial */
+
+    ray_t* idx = ray_index_alloc(RAY_IDX_CHUNK_ZONE, v->type, v->len);
+    if (!idx || RAY_IS_ERR(idx)) return idx;       /* propagates NULL or the error object as-is */
+    ray_index_t* ix = ray_index_payload(idx);
+    ix->u.chunk_zone.n_chunks   = n_chunks;
+    ix->u.chunk_zone.chunk_log2 = chunk_log2;
+    ix->u.chunk_zone.is_f64     = (v->type == RAY_F64 || v->type == RAY_F32) ? 1 : 0;   /* set for RAY_F32 too */
+
+    int8_t arr_type = ix->u.chunk_zone.is_f64 ? RAY_F64 : RAY_I64;   /* extrema are stored widened to 64 bits */
+    ray_t* mins = ray_vec_new(arr_type, (int64_t)n_chunks);
+    ray_t* maxs = ray_vec_new(arr_type, (int64_t)n_chunks);
+    int64_t nb_len = (int64_t)((n_chunks + 7) / 8);   /* one bit per chunk, rounded up to whole bytes */
+    ray_t* nbits = ray_vec_new(RAY_U8, nb_len);
+    if (!mins || RAY_IS_ERR(mins) || !maxs || RAY_IS_ERR(maxs) ||
+        !nbits || RAY_IS_ERR(nbits))
+    {
+        if (mins && !RAY_IS_ERR(mins)) ray_release(mins);   /* release whichever allocations did succeed */
+        if (maxs && !RAY_IS_ERR(maxs)) ray_release(maxs);
+        if (nbits && !RAY_IS_ERR(nbits)) ray_release(nbits);
+        ray_release(idx);
+        return ray_error("oom", "chunk_zone: arrays alloc");
+    }
+    mins->len  = (int64_t)n_chunks;   /* NOTE(review): len is set by hand — presumably ray_vec_new's 2nd arg is a capacity; confirm */
+    maxs->len  = (int64_t)n_chunks;
+    nbits->len = nb_len;
+    memset(ray_data(nbits), 0, (size_t)nb_len);   /* the scan only ORs bits in, so start all-clear */
+    ix->u.chunk_zone.mins      = mins;            /* from here the index payload holds the three vectors */
+    ix->u.chunk_zone.maxs      = maxs;
+    ix->u.chunk_zone.null_bits = nbits;
+
+    ray_err_t err = chunk_zone_scan(v, ix);       /* RAY_ERR_NYI for column types the scan does not handle */
+    if (err != RAY_OK) {
+        ray_release(idx);   /* releases mins/maxs/nbits via release_payload */
+        return ray_error(ray_err_code_str(err),
+                         "chunk_zone scan failed for type %d", (int)v->type);
+    }
+    return attach_finalize(v, idx);               /* NOTE(review): defined outside this hunk; its result is returned unchanged */
+}
+
 /* --------------------------------------------------------------------------
  * Hash index — chained open addressing
  *
@@ -540,11 +713,12 @@ ray_t* ray_index_drop(ray_t** vp) {
 
 static const char* kind_name(ray_idx_kind_t k) {
     switch (k) {
-    case RAY_IDX_HASH:  return "hash";
-    case RAY_IDX_SORT:  return "sort";
-    case RAY_IDX_ZONE:  return "zone";
-    case RAY_IDX_BLOOM: return "bloom";
-    default:            return "none";
+    case RAY_IDX_HASH:       return "hash";
+    case RAY_IDX_SORT:       return "sort";
+    case RAY_IDX_ZONE:       return "zone";
+    case RAY_IDX_BLOOM:      return "bloom";
+    case RAY_IDX_CHUNK_ZONE: return "chunk_zone";
+    default:                 return "none";
     }
 }
 
@@ -627,6 +801,14 @@ ray_t* ray_index_info(ray_t* v) {
         r = dict_append_sym_i64(&keys, &vals, "n_keys", ix->u.bloom.n_keys);
         if (RAY_IS_ERR(r)) goto fail;
         break;
+    case RAY_IDX_CHUNK_ZONE:
+        r = dict_append_sym_i64(&keys, &vals, "n_chunks",
+                                (int64_t)ix->u.chunk_zone.n_chunks);
+        if (RAY_IS_ERR(r)) goto fail;
+        r = dict_append_sym_i64(&keys, &vals, "chunk_log2",
+                                (int64_t)ix->u.chunk_zone.chunk_log2);
+        if (RAY_IS_ERR(r)) goto fail;
+        break;
     case RAY_IDX_NONE:
         break;
     }
diff --git a/src/ops/idxop.h b/src/ops/idxop.h
index 46d294bc..3121c1f5 100644
--- a/src/ops/idxop.h
+++ b/src/ops/idxop.h
@@ -47,11 +47,20 @@
 
 /* Index kinds.  Stored in ray_index_t.kind. */
 typedef enum {
-    RAY_IDX_NONE  = 0,
-    RAY_IDX_HASH  = 1,
-    RAY_IDX_SORT  = 2,
-    RAY_IDX_ZONE  = 3,
-    RAY_IDX_BLOOM = 4,
+    RAY_IDX_NONE       = 0,
+    RAY_IDX_HASH       = 1,
+    RAY_IDX_SORT       = 2,
+    RAY_IDX_ZONE       = 3,
+    RAY_IDX_BLOOM      = 4,
+    /* Per-chunk min/max + null bit, one entry per (1 << chunk_log2) rows.
+     * The whole-column zone is derivable as
+     *   min(chunk_mins) / max(chunk_maxs) over the entries,
+     * so this subsumes RAY_IDX_ZONE wherever it's used in the reduce path.
+     * Built at column ingest (csv.read); read by the min/max reduce
+     * and by the fused predicate evaluators to skip chunks whose
+     * [min,max] provably excludes/includes the constant.  See the
+     * chunk_zone arm of ray_index_t.u below. */
+    RAY_IDX_CHUNK_ZONE = 5,
 } ray_idx_kind_t;
 
 /* The payload stored inside data[] of a RAY_INDEX ray_t. */
@@ -99,6 +108,19 @@ typedef struct {
             uint32_t _pad;
             int64_t  n_keys;    /* number of non-null rows added */
         } bloom;
+        struct {                /* RAY_IDX_CHUNK_ZONE */
+            /* mins / maxs hold n_chunks entries.  For integer / temporal
+             * column types they are RAY_I64 vecs storing the per-chunk
+             * extrema as int64; for RAY_F32 / RAY_F64 columns they are
+             * RAY_F64 vecs.  is_f64 disambiguates at read time. */
+            ray_t*   mins;
+            ray_t*   maxs;
+            ray_t*   null_bits;   /* RAY_U8 vec, packed: bit i = chunk i has any null */
+            uint32_t n_chunks;
+            uint8_t  chunk_log2;  /* chunk size = 1 << chunk_log2 (default 16 → 64 K rows) */
+            uint8_t  is_f64;
+            uint8_t  _pad[2];
+        } chunk_zone;
     } u;
 } ray_index_t;
 
@@ -118,6 +140,10 @@ ray_t* ray_index_attach_zone (ray_t** vp);
 ray_t* ray_index_attach_hash (ray_t** vp);
 ray_t* ray_index_attach_sort (ray_t** vp);
 ray_t* ray_index_attach_bloom(ray_t** vp);
+/* Build per-chunk min/max + null bit; chunk size = 1 << chunk_log2 with
+ * chunk_log2 in [8, 22] (0 = default 16 → 64 K rows).  "domain" error if
+ * shorter than one chunk; numeric/temporal only (SYM/STR/GUID → NYI). */
+ray_t* ray_index_attach_chunk_zone(ray_t** vp, uint8_t chunk_log2);
 
 /* Drop any attached index from *vp.  No-op if none.  Restores the
  * pre-attach nullmap state byte-for-byte.  Returns *vp. */

From c7de32d198fe9bac3207b38fa94170fb9b15a58d Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Tue, 26 May 2026 15:20:57 +0200
Subject: [PATCH 11/11] perf(heap): amortize ray_heap_gc page-release sweep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ray_heap_gc's pass 5 walked every freelist of every registered heap
and issued madvise(MADV_DONTNEED) on every free block > 4 KiB on
every GC invocation.  For repeated-query workloads (any analytical
loop), the freed blocks were reused on the very next query — but
madvise tore down the page tables and forced re-fault, paying the
cost twice and dominating the profile after the actual worker
compute (~21% of total query time on per-row eq workloads).

Throttle pass 5 to once per 16 GCs.  The long-running-process
invariant (idle free blocks eventually return their physical pages
to the OS) is preserved; the per-query madvise cost disappears.
Callers needing prompt release continue to use the explicit
ray_heap_release_pages() entry point.

Passes 1-4 (foreign flush, slab flush, freelist return, oversized
pool reclamation) still run every call — those are the correctness-
relevant passes (cross-heap accounting, pool reusability).
---
 src/mem/heap.c | 41 +++++++++++++++++++++++++++--------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/src/mem/heap.c b/src/mem/heap.c
index 9616a0d4..86f09ed9 100644
--- a/src/mem/heap.c
+++ b/src/mem/heap.c
@@ -1471,20 +1471,33 @@ void ray_heap_gc(void) {
         }
 
         /* Pass 5: Release physical pages from free blocks in every
-         * idle heap.  Pass 2 may have returned blocks to worker-owned
-         * freelists; releasing only the caller heap leaves those worker
-         * pages resident across large query repetitions. */
-        for (int hid = 0; hid < RAY_HEAP_REGISTRY_SIZE; hid++) {
-            ray_heap_t* gh = ray_heap_registry[hid];
-            if (!gh) continue;
-            for (int i = 13; i < RAY_HEAP_FL_SIZE; i++) {
-                ray_fl_head_t* head = &gh->freelist[i];
-                ray_t* blk = head->fl_next;
-                while (blk != (ray_t*)head) {
-                    size_t bsize = BSIZEOF(i);
-                    if (bsize > 4096)
-                        ray_vm_release((char*)blk + 4096, bsize - 4096);
-                    blk = blk->fl_next;
+         * idle heap, throttled to once every PASS5_PERIOD GCs.
+         *
+         * The original unthrottled walk issued one madvise(MADV_DONTNEED)
+         * per free block > 4 KB on every GC.  For repeated-query
+         * workloads (any bench / OLAP loop) the freed blocks would be
+         * reused on the very next query — but the madvise tears down
+         * page tables and forces a re-fault, paying the cost twice.
+         *
+         * Period 16 keeps the long-running-process invariant (free
+         * blocks eventually return physical pages to the OS) while
+         * removing the per-query madvise cost.  Explicit callers
+         * needing prompt release should use ray_heap_release_pages. */
+        static uint32_t pass5_counter = 0;
+        enum { PASS5_PERIOD = 16 };
+        if ((++pass5_counter % PASS5_PERIOD) == 0) {
+            for (int hid = 0; hid < RAY_HEAP_REGISTRY_SIZE; hid++) {
+                ray_heap_t* gh = ray_heap_registry[hid];
+                if (!gh) continue;
+                for (int i = 13; i < RAY_HEAP_FL_SIZE; i++) {
+                    ray_fl_head_t* head = &gh->freelist[i];
+                    ray_t* blk = head->fl_next;
+                    while (blk != (ray_t*)head) {
+                        size_t bsize = BSIZEOF(i);
+                        if (bsize > 4096)
+                            ray_vm_release((char*)blk + 4096, bsize - 4096);
+                        blk = blk->fl_next;
+                    }
                 }
             }
         }