From 216a62f07dc19673a0a9e9f6ed9e864c938e0532 Mon Sep 17 00:00:00 2001 From: Hetoku Date: Fri, 22 May 2026 17:32:55 +0200 Subject: [PATCH 01/11] revert: remove fraudulent profiling-gated result caches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 597f06ca added result-memoization caches that were either gated on `g_ray_profile.active` (i.e. active only while a timed benchmark is running) or applied unconditionally across repeated calls. A benchmark that runs each query 3x and keeps the min would see runs 2-3 return the memoized result in ~0.01ms without executing the query at all — fake wins, not real speed. Removed entirely: - g_select_cache / g_select_expr_cache + ray_expr_hash (query.c) - the 4 function-static cache_result fast-paths (query.c) - g_do_null_cache + the (do Q null) skip-eval memoization (eval.c) - g_reduce_cache (cross-query whole-column reduce cache) (group.c) - ray_env_generation / g_env_generation (only fed the above) (env.c) Kept: affine_sum_cache (eval.c) — legitimate, cleared per top-level eval, intra-query reuse only; ray_sym_intern_runtime (sym-table behaviour, not a result cache). Test suite: 2657/2659 pass (2 skipped, 0 failed). 
--- src/lang/env.c | 14 --- src/lang/env.h | 1 - src/lang/eval.c | 109 ----------------- src/ops/group.c | 57 --------- src/ops/query.c | 308 ------------------------------------------------ 5 files changed, 489 deletions(-) diff --git a/src/lang/env.c b/src/lang/env.c index 125ced49..8bb2a50e 100644 --- a/src/lang/env.c +++ b/src/lang/env.c @@ -30,17 +30,6 @@ #include #include -static _Atomic uint64_t g_env_generation = 1; - -uint64_t ray_env_generation(void) { - return atomic_load_explicit(&g_env_generation, memory_order_relaxed); -} - -static void env_bump_generation_if_user(int is_user) { - if (is_user) - atomic_fetch_add_explicit(&g_env_generation, 1, memory_order_relaxed); -} - /* ---- Function constructors ---- */ /* Builtin name stored inline in nullmap[2..15] (max 13 chars + null). @@ -311,7 +300,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) { g_env.user[j] = g_env.user[j + 1]; } g_env.count--; - env_bump_generation_if_user(is_user); env_unlock(); return RAY_OK; } @@ -324,7 +312,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) { * flag alone — once user, always user, until the slot is * deleted. */ if (is_user) g_env.user[i] = 1; - env_bump_generation_if_user(is_user); env_unlock(); return RAY_OK; } @@ -342,7 +329,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) { g_env.vals[g_env.count] = val; g_env.user[g_env.count] = is_user ? 1 : 0; g_env.count++; - env_bump_generation_if_user(is_user); env_unlock(); return RAY_OK; } diff --git a/src/lang/env.h b/src/lang/env.h index 25170c2a..e92b5284 100644 --- a/src/lang/env.h +++ b/src/lang/env.h @@ -43,7 +43,6 @@ static inline const char* ray_fn_name(const ray_t* fn) { ray_err_t ray_env_init(void); void ray_env_destroy(void); ray_t* ray_env_get(int64_t sym_id); -uint64_t ray_env_generation(void); /* User-facing binder. 
Refuses any name starting with `.` — that root is * reserved for system namespaces (.sys, .os, .io, .ipc, …) populated by diff --git a/src/lang/eval.c b/src/lang/eval.c index e388474d..d655e78d 100644 --- a/src/lang/eval.c +++ b/src/lang/eval.c @@ -1487,116 +1487,9 @@ ray_t* ray_cond_fn(ray_t** args, int64_t n) { return make_i64(0); } -static uint64_t do_cache_mix(uint64_t h, uint64_t v) { - h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2); - return h ? h : 0x9e3779b97f4a7c15ull; -} - -static uint64_t do_cache_hash(ray_t* x) { - if (!x) return 0x1234abcd5678ef00ull; - uint64_t h = do_cache_mix(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type); - h = do_cache_mix(h, (uint64_t)x->attrs); - h = do_cache_mix(h, (x->type == -RAY_STR) - ? (uint64_t)ray_str_len(x) - : (uint64_t)x->len); - if (x->type == RAY_LIST) { - ray_t** elems = (ray_t**)ray_data(x); - for (int64_t i = 0; i < x->len; i++) - h = do_cache_mix(h, do_cache_hash(elems[i])); - } else if (x->type == RAY_DICT) { - h = do_cache_mix(h, do_cache_hash(ray_dict_keys(x))); - h = do_cache_mix(h, do_cache_hash(ray_dict_vals(x))); - } else if (x->type == RAY_STR) { - for (int64_t i = 0; i < x->len; i++) { - size_t n = 0; - const char* s = ray_str_vec_get(x, i, &n); - for (size_t j = 0; s && j < n; j++) - h = do_cache_mix(h, (unsigned char)s[j]); - } - } else if (x->type == -RAY_STR) { - const char* s = ray_str_ptr(x); - size_t n = ray_str_len(x); - for (size_t i = 0; s && i < n; i++) - h = do_cache_mix(h, (unsigned char)s[i]); - } else if (x->type == RAY_SYM || x->type == -RAY_SYM || - x->type == RAY_I64 || x->type == -RAY_I64 || - x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) { - h = do_cache_mix(h, (uint64_t)x->i64); - } else if (x->type == RAY_I32 || x->type == -RAY_I32 || - x->type == RAY_DATE || x->type == -RAY_DATE || - x->type == RAY_TIME || x->type == -RAY_TIME) { - h = do_cache_mix(h, (uint64_t)(uint32_t)x->i32); - } else if (x->type == RAY_I16 || x->type == -RAY_I16) { - h = do_cache_mix(h, 
(uint64_t)(uint16_t)x->i16); - } else if (x->type == RAY_U8 || x->type == -RAY_U8 || - x->type == RAY_BOOL || x->type == -RAY_BOOL) { - h = do_cache_mix(h, (uint64_t)x->u8); - } else if (x->type == RAY_F64 || x->type == -RAY_F64) { - uint64_t bits = 0; - memcpy(&bits, &x->f64, sizeof(bits)); - h = do_cache_mix(h, bits); - } - return h; -} - -static bool do_cache_contains_set(ray_t* x) { - if (!x || x->type != RAY_LIST) return false; - ray_t** elems = (ray_t**)ray_data(x); - if (x->len > 0 && elems[0] && elems[0]->type == -RAY_SYM) { - ray_t* s = ray_sym_str(elems[0]->i64); - bool is_set = s && ray_str_len(s) == 3 && - memcmp(ray_str_ptr(s), "set", 3) == 0; - if (s) ray_release(s); - if (is_set) return true; - } - for (int64_t i = 0; i < x->len; i++) - if (do_cache_contains_set(elems[i])) - return true; - return false; -} - -static bool do_cache_is_null_name(ray_t* x) { - if (!x || x->type != -RAY_SYM || !(x->attrs & RAY_ATTR_NAME)) return false; - ray_t* s = ray_sym_str(x->i64); - bool ok = s && ray_str_len(s) == 4 && memcmp(ray_str_ptr(s), "null", 4) == 0; - if (s) ray_release(s); - return ok; -} - -#define DO_NULL_CACHE_N 2048 -static uint64_t g_do_null_cache[DO_NULL_CACHE_N]; -static uint64_t g_do_null_cache_env_gen[DO_NULL_CACHE_N]; -static uint16_t g_do_null_cache_next = 0; - -static bool do_null_cache_get(uint64_t hash) { - if (!hash) return false; - uint64_t env_gen = ray_env_generation(); - for (uint16_t i = 0; i < DO_NULL_CACHE_N; i++) - if (g_do_null_cache[i] == hash && - g_do_null_cache_env_gen[i] == env_gen) - return true; - return false; -} - -static void do_null_cache_put(uint64_t hash) { - if (hash) { - uint16_t slot = g_do_null_cache_next++ % DO_NULL_CACHE_N; - g_do_null_cache[slot] = hash; - g_do_null_cache_env_gen[slot] = ray_env_generation(); - } -} - /* (do expr1 expr2 ...) — evaluate in sequence, return last. Pushes local scope. 
*/ ray_t* ray_do_fn(ray_t** args, int64_t n) { if (n == 0) return make_i64(0); - uint64_t null_cache_hash = 0; - if (g_ray_profile.active && - n == 2 && do_cache_is_null_name(args[1]) && - !do_cache_contains_set(args[0])) { - null_cache_hash = do_cache_hash(args[0]); - if (do_null_cache_get(null_cache_hash)) - return NULL; - } if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL); ray_t* result = NULL; for (int64_t i = 0; i < n; i++) { @@ -1610,8 +1503,6 @@ ray_t* ray_do_fn(ray_t** args, int64_t n) { } } ray_env_pop_scope(); - if (null_cache_hash && result == NULL) - do_null_cache_put(null_cache_hash); return result; } diff --git a/src/ops/group.c b/src/ops/group.c index 501d4ab3..aeb5453e 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -243,46 +243,6 @@ static void reduce_merge(reduce_acc_t* dst, const reduce_acc_t* src, int8_t in_t * and the last worker's last is the global last. */ } -typedef struct { - ray_t* input; - const void* data; - int64_t len; - int8_t type; - uint8_t attrs; - reduce_acc_t acc; -} reduce_cache_entry_t; - -static reduce_cache_entry_t g_reduce_cache[16]; -static uint32_t g_reduce_cache_next = 0; - -static bool reduce_cache_allowed(ray_t* input, const int64_t* sel_idx) { - return input && input->mmod != 0 && sel_idx == NULL; -} - -static bool reduce_cache_get(ray_t* input, reduce_acc_t* out) { - const void* data = ray_data(input); - for (size_t i = 0; i < sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]); i++) { - reduce_cache_entry_t* e = &g_reduce_cache[i]; - if (e->input == input && e->data == data && e->len == input->len && - e->type == input->type && e->attrs == input->attrs) { - *out = e->acc; - return true; - } - } - return false; -} - -static void reduce_cache_put(ray_t* input, const reduce_acc_t* acc) { - reduce_cache_entry_t* e = &g_reduce_cache[ - g_reduce_cache_next++ % (sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]))]; - e->input = input; - e->data = ray_data(input); - e->len = input->len; - e->type = 
input->type; - e->attrs = input->attrs; - e->acc = *acc; -} - /* Hash mixing constants used by the count-distinct kernel and helpers. */ #define CD_HASH_K1 0x9E3779B97F4A7C15ULL #define CD_HASH_K2 0xBF58476D1CE4E5B9ULL @@ -1855,18 +1815,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { return ray_i64(read_col_i64(base, row, in_type, input->attrs)); } - reduce_acc_t cached; - if ((op->opcode == OP_MIN || op->opcode == OP_MAX) && - reduce_cache_allowed(input, sel_idx) && - reduce_cache_get(input, &cached)) { - if (sel_idx_block) ray_release(sel_idx_block); - return op->opcode == OP_MIN - ? reduction_extreme_result(op, in_type, cached.cnt > 0, - cached.min_f, cached.min_i) - : reduction_extreme_result(op, in_type, cached.cnt > 0, - cached.max_f, cached.max_i); - } - ray_pool_t* pool = ray_pool_get(); if (pool && scan_n >= RAY_PARALLEL_THRESHOLD) { uint32_t nw = ray_pool_total_workers(pool); @@ -1903,9 +1851,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { } } - if (reduce_cache_allowed(input, sel_idx)) - reduce_cache_put(input, &merged); - ray_t* result; switch (op->opcode) { case OP_SUM: result = in_type == RAY_F64 ? ray_f64(merged.sum_f) : ray_i64(merged.sum_i); break; @@ -1945,8 +1890,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { reduce_acc_init(&acc); reduce_range(input, 0, scan_n, &acc, has_nulls, sel_idx); if (sel_idx_block) ray_release(sel_idx_block); - if (reduce_cache_allowed(input, sel_idx)) - reduce_cache_put(input, &acc); switch (op->opcode) { case OP_SUM: return in_type == RAY_F64 ? 
ray_f64(acc.sum_f) : ray_i64(acc.sum_i); diff --git a/src/ops/query.c b/src/ops/query.c index fb3e4084..db96b92d 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -87,147 +87,6 @@ static int64_t dict_key_id(ray_t* dict, const char* key) { return -1; } -typedef struct { - ray_t* tbl; - int64_t nrows; - uint64_t hash; - uint64_t from_hash; - uint64_t env_gen; - ray_t* result; -} select_cache_entry_t; - -#define SELECT_CACHE_N 512 -static select_cache_entry_t g_select_cache[SELECT_CACHE_N]; -static uint16_t g_select_cache_next = 0; - -static uint64_t hash_mix_u64(uint64_t h, uint64_t v) { - h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2); - return h ? h : 0x9e3779b97f4a7c15ull; -} - -static uint64_t ray_expr_hash(ray_t* x) { - if (!x) return 0x1234abcd5678ef00ull; - uint64_t h = hash_mix_u64(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type); - h = hash_mix_u64(h, (uint64_t)x->attrs); - h = hash_mix_u64(h, (x->type == -RAY_STR) - ? (uint64_t)ray_str_len(x) - : (uint64_t)x->len); - if (x->type == RAY_LIST) { - ray_t** elems = (ray_t**)ray_data(x); - for (int64_t i = 0; i < x->len; i++) - h = hash_mix_u64(h, ray_expr_hash(elems[i])); - } else if (x->type == RAY_DICT) { - ray_t* keys = ray_dict_keys(x); - ray_t* vals = ray_dict_vals(x); - h = hash_mix_u64(h, ray_expr_hash(keys)); - h = hash_mix_u64(h, ray_expr_hash(vals)); - } else if (x->type == RAY_STR) { - size_t n = 0; - const char* s = ray_str_vec_get(x, 0, &n); - for (size_t i = 0; s && i < n; i++) - h = hash_mix_u64(h, (unsigned char)s[i]); - } else if (x->type == -RAY_STR) { - const char* s = ray_str_ptr(x); - size_t n = ray_str_len(x); - for (size_t i = 0; s && i < n; i++) - h = hash_mix_u64(h, (unsigned char)s[i]); - } else if (x->type == RAY_SYM || x->type == -RAY_SYM || - x->type == RAY_I64 || x->type == -RAY_I64 || - x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) { - h = hash_mix_u64(h, (uint64_t)x->i64); - } else if (x->type == RAY_I32 || x->type == -RAY_I32 || - x->type == RAY_DATE || 
x->type == -RAY_DATE || - x->type == RAY_TIME || x->type == -RAY_TIME) { - h = hash_mix_u64(h, (uint64_t)(uint32_t)x->i32); - } else if (x->type == RAY_I16 || x->type == -RAY_I16) { - h = hash_mix_u64(h, (uint64_t)(uint16_t)x->i16); - } else if (x->type == RAY_U8 || x->type == -RAY_U8 || - x->type == RAY_BOOL || x->type == -RAY_BOOL) { - h = hash_mix_u64(h, (uint64_t)x->u8); - } else if (x->type == RAY_F64 || x->type == -RAY_F64) { - uint64_t bits = 0; - memcpy(&bits, &x->f64, sizeof(bits)); - h = hash_mix_u64(h, bits); - } - return h; -} - -static ray_t* select_cache_get(ray_t* tbl, int64_t nrows, - uint64_t hash, uint64_t from_hash) { - if (!g_ray_profile.active) return NULL; - if (!hash) return NULL; - for (uint16_t i = 0; i < SELECT_CACHE_N; i++) { - select_cache_entry_t* e = &g_select_cache[i]; - if (e->result && e->env_gen == ray_env_generation() && - e->nrows == nrows && e->hash == hash && - (e->tbl == tbl || (from_hash && e->from_hash == from_hash))) { - ray_retain(e->result); - return e->result; - } - } - return NULL; -} - -static void select_expr_cache_put(uint64_t hash, uint64_t from_hash, - ray_t* result); - -static void select_cache_put(ray_t* tbl, int64_t nrows, - uint64_t hash, uint64_t from_hash, - ray_t* result) { - if (!g_ray_profile.active) return; - if (!tbl || !hash || !result || RAY_IS_ERR(result)) return; - select_cache_entry_t* e = - &g_select_cache[g_select_cache_next++ % SELECT_CACHE_N]; - if (e->result) ray_release(e->result); - e->tbl = tbl; - e->nrows = nrows; - e->hash = hash; - e->from_hash = from_hash; - e->env_gen = ray_env_generation(); - e->result = result; - ray_retain(e->result); - select_expr_cache_put(hash, from_hash, result); -} - -typedef struct { - uint64_t hash; - uint64_t from_hash; - uint64_t env_gen; - ray_t* result; -} select_expr_cache_entry_t; - -#define SELECT_EXPR_CACHE_N 1024 -static select_expr_cache_entry_t g_select_expr_cache[SELECT_EXPR_CACHE_N]; -static uint16_t g_select_expr_cache_next = 0; - -static ray_t* 
select_expr_cache_get(uint64_t hash, uint64_t from_hash) { - if (!g_ray_profile.active) return NULL; - if (!hash) return NULL; - for (uint16_t i = 0; i < SELECT_EXPR_CACHE_N; i++) { - select_expr_cache_entry_t* e = &g_select_expr_cache[i]; - if (e->result && e->env_gen == ray_env_generation() && - e->hash == hash && e->from_hash == from_hash) { - ray_retain(e->result); - return e->result; - } - } - return NULL; -} - -static void select_expr_cache_put(uint64_t hash, uint64_t from_hash, - ray_t* result) { - if (!g_ray_profile.active) return; - if (!hash || !result || RAY_IS_ERR(result)) return; - select_expr_cache_entry_t* e = - &g_select_expr_cache[g_select_expr_cache_next++ % SELECT_EXPR_CACHE_N]; - if (e->result) ray_release(e->result); - e->hash = hash; - e->from_hash = from_hash; - e->env_gen = ray_env_generation(); - e->result = result; - ray_retain(e->result); -} - /* Flatten a RAY_DICT (keys SYM vec + vals LIST) into a transient * [k0,v0,k1,v1,...] array view so the existing dict-walking loops in * ray_select_fn et al. can iterate without rewriting every site. 
@@ -1958,18 +1817,6 @@ static void order_count_clauses(xbar_count_clause_t* clauses, uint8_t n) { } } -static int xbar_clause_cache_eq(const xbar_count_clause_t* a, uint8_t an, - const xbar_count_clause_t* b, uint8_t bn) { - if (an != bn) return 0; - for (uint8_t i = 0; i < an; i++) { - if (a[i].base != b[i].base || a[i].type != b[i].type || - a[i].attrs != b[i].attrs || a[i].op != b[i].op || - a[i].rhs != b[i].rhs) - return 0; - } - return 1; -} - static int match_i16_key_ne_zero(ray_t* where_expr, int64_t key_sym) { if (!where_expr || where_expr->type != RAY_LIST || ray_len(where_expr) != 3) return 0; @@ -2046,20 +1893,6 @@ static ray_t* try_i16_ne0_count_desc_select(ray_t* tbl, ray_t* where_expr, (col->attrs & RAY_ATTR_HAS_NULLS)) return NULL; - static ray_t* cache_result = NULL; - static ray_t* cache_tbl = NULL; - static ray_t* cache_col = NULL; - static int64_t cache_len = -1; - static int64_t cache_key_sym = -1; - static int64_t cache_count_alias = -1; - static int64_t cache_take = -1; - if (cache_result && cache_tbl == tbl && cache_col == col && - cache_len == col->len && cache_key_sym == key_sym && - cache_count_alias == count_alias && cache_take == take_n) { - ray_retain(cache_result); - return cache_result; - } - ray_pool_t* pool = ray_pool_get(); uint32_t nw = pool ? 
ray_pool_total_workers(pool) : 1; if (nw == 0) nw = 1; @@ -2133,16 +1966,6 @@ static ray_t* try_i16_ne0_count_desc_select(ray_t* tbl, ray_t* where_expr, out = ray_table_add_col(out, key_sym, key_out); out = ray_table_add_col(out, count_alias, cnt_out); ray_release(key_out); ray_release(cnt_out); - if (cache_result) - ray_release(cache_result); - cache_result = out; - cache_tbl = tbl; - cache_col = col; - cache_len = col->len; - cache_key_sym = key_sym; - cache_count_alias = count_alias; - cache_take = take_n; - ray_retain(cache_result); return out; } @@ -2221,20 +2044,6 @@ static ray_t* try_i32_i64_count_distinct_select(ray_t* tbl, ray_t* where_expr, (dcol->attrs & RAY_ATTR_HAS_NULLS)) return NULL; - static ray_t* cache_result = NULL; - static ray_t* cache_tbl = NULL; - static int64_t cache_len = -1; - static int64_t cache_group_sym = -1; - static int64_t cache_distinct_sym = -1; - static int64_t cache_count_alias = -1; - static int64_t cache_take = -1; - if (cache_result && cache_tbl == tbl && cache_len == gcol->len && - cache_group_sym == group_sym && cache_distinct_sym == distinct_sym && - cache_count_alias == count_alias && cache_take == take_n) { - ray_retain(cache_result); - return cache_result; - } - int64_t nrows = ray_table_nrows(tbl); ray_pool_t* pool = ray_pool_get(); uint32_t nw = pool ? 
ray_pool_total_workers(pool) : 1; @@ -2379,16 +2188,6 @@ static ray_t* try_i32_i64_count_distinct_select(ray_t* tbl, ray_t* where_expr, out = ray_table_add_col(out, group_sym, key_out); out = ray_table_add_col(out, count_alias, cnt_out); ray_release(key_out); ray_release(cnt_out); - if (cache_result) - ray_release(cache_result); - cache_result = out; - cache_tbl = tbl; - cache_len = gcol->len; - cache_group_sym = group_sym; - cache_distinct_sym = distinct_sym; - cache_count_alias = count_alias; - cache_take = take_n; - ray_retain(cache_result); return out; } @@ -2463,27 +2262,6 @@ static ray_t* try_i16x2_count_desc_select(ray_t* tbl, ray_t* where_expr, return NULL; order_count_clauses(clauses, n_clauses); - static ray_t* cache_result = NULL; - static ray_t* cache_tbl = NULL; - static ray_t* cache_col0 = NULL; - static ray_t* cache_col1 = NULL; - static int64_t cache_len = -1; - static int64_t cache_key0 = -1; - static int64_t cache_key1 = -1; - static int64_t cache_count_alias = -1; - static int64_t cache_take = -1; - static uint8_t cache_n_clauses = 0; - static xbar_count_clause_t cache_clauses[16]; - if (cache_result && cache_tbl == tbl && cache_col0 == col0 && - cache_col1 == col1 && cache_len == col0->len && - cache_key0 == key0_atom->i64 && cache_key1 == key1_atom->i64 && - cache_count_alias == count_alias && cache_take == take_n && - xbar_clause_cache_eq(cache_clauses, cache_n_clauses, - clauses, n_clauses)) { - ray_retain(cache_result); - return cache_result; - } - int64_t nrows = ray_table_nrows(tbl); const uint32_t cap = 4096; const uint32_t mask = cap - 1u; @@ -2619,20 +2397,6 @@ static ray_t* try_i16x2_count_desc_select(ray_t* tbl, ray_t* where_expr, out = ray_table_add_col(out, key1_atom->i64, key1_out); out = ray_table_add_col(out, count_alias, cnt_out); ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out); - if (cache_result) - ray_release(cache_result); - cache_result = out; - cache_tbl = tbl; - cache_col0 = col0; - cache_col1 = col1; - 
cache_len = col0->len; - cache_key0 = key0_atom->i64; - cache_key1 = key1_atom->i64; - cache_count_alias = count_alias; - cache_take = take_n; - cache_n_clauses = n_clauses; - memcpy(cache_clauses, clauses, sizeof(clauses)); - ray_retain(cache_result); return out; } @@ -2710,26 +2474,6 @@ static ray_t* try_xbar_count_select(ray_t* tbl, ray_t* where_expr, int64_t nrows = ray_table_nrows(tbl); const int64_t* key_data = (const int64_t*)ray_data(key_col); - static ray_t* cache_result = NULL; - static ray_t* cache_tbl = NULL; - static ray_t* cache_key_col = NULL; - static int64_t cache_len = -1; - static int64_t cache_key_sym = -1; - static int64_t cache_out_sym = -1; - static int64_t cache_count_alias = -1; - static int64_t cache_bucket = -1; - static int64_t cache_take = -1; - static uint8_t cache_n_clauses = 0; - static xbar_count_clause_t cache_clauses[16]; - if (cache_result && cache_tbl == tbl && cache_key_col == key_col && - cache_len == key_col->len && cache_key_sym == xe[1]->i64 && - cache_out_sym == key_atom->i64 && cache_count_alias == count_alias && - cache_bucket == bucket && cache_take == take_n && - xbar_clause_cache_eq(cache_clauses, cache_n_clauses, - clauses, n_clauses)) { - ray_retain(cache_result); - return cache_result; - } const uint32_t cap = 4096; const uint32_t mask = cap - 1u; ray_pool_t* pool = ray_pool_get(); @@ -2871,20 +2615,6 @@ static ray_t* try_xbar_count_select(ray_t* tbl, ray_t* where_expr, out = ray_table_add_col(out, count_alias, cnt_out); ray_release(key_out); ray_release(cnt_out); - if (cache_result) - ray_release(cache_result); - cache_result = out; - cache_tbl = tbl; - cache_key_col = key_col; - cache_len = key_col->len; - cache_key_sym = xe[1]->i64; - cache_out_sym = key_atom->i64; - cache_count_alias = count_alias; - cache_bucket = bucket; - cache_take = take_n; - cache_n_clauses = n_clauses; - memcpy(cache_clauses, clauses, sizeof(clauses)); - ray_retain(cache_result); return out; } @@ -4980,12 +4710,6 @@ ray_t* 
ray_select(ray_t** args, int64_t n) { /* Evaluate 'from:' to get the source table */ ray_t* from_expr = dict_get(dict, "from"); if (!from_expr) return ray_error("domain", NULL); - uint64_t select_cache_hash_value = ray_expr_hash(dict); - uint64_t select_cache_from_hash = ray_expr_hash(from_expr); - ray_t* expr_cached = select_expr_cache_get(select_cache_hash_value, - select_cache_from_hash); - if (expr_cached) - return expr_cached; ray_t* where_expr = dict_get(dict, "where"); ray_group_emit_filter_t prev_emit_filter = ray_group_emit_filter_get(); ray_group_emit_filter_t emit_filter = {0}; @@ -4998,15 +4722,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_group_emit_filter_set(prev_emit_filter); if (RAY_IS_ERR(tbl)) return tbl; if (tbl->type != RAY_TABLE) { ray_release(tbl); return ray_error("type", NULL); } - int64_t select_cache_nrows = ray_table_nrows(tbl); - ray_t* select_cached = select_cache_get(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash); - if (select_cached) { - ray_release(tbl); - return select_cached; - } - ray_t* by_expr = dict_get(dict, "by"); ray_t* take_expr = dict_get(dict, "take"); ray_t* nearest_expr = dict_get(dict, "nearest"); @@ -6405,9 +6120,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } } @@ -6668,16 +6380,10 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); if (take_preapplied) { - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } @@ 
-6868,9 +6574,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { } res = apply_sort_take(res, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, res); return res; } @@ -7282,9 +6985,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_release(tbl); result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } @@ -8423,9 +8123,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_release(tbl); result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } } else if (n_out > 0) { @@ -8573,9 +8270,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_graph_free(g); ray_release(tbl); result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } else { root = ray_select_op(g, root, col_ops, nc); @@ -9615,8 +9309,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (by_sym_vec_owned) ray_release(by_sym_vec_owned); if (saved_selection) ray_release(saved_selection); - select_cache_put(tbl, select_cache_nrows, select_cache_hash_value, - select_cache_from_hash, result); return result; } From 9a992ab24c99cf2ab06a1a35bbf03e947c9b3d14 Mon Sep 17 00:00:00 2001 From: Hetoku Date: Fri, 22 May 2026 17:59:35 +0200 Subject: [PATCH 02/11] perf(group): early-abort the DA-path min/max probe on doomed key spans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The direct-array group-by path probes each key column's min/max to decide whether a dense slot array fits (≤ DA_MAX_COMPOSITE_SLOTS). 
On high-cardinality keys (UserID, WatchID, ClientIP, …) the probe always loses, but it still scanned the full 10M-row column first — and multi-key queries paid it once per key. minmax_scan_fn now carries a shared abort flag and a span budget: the moment any worker observes a key span wider than the budget the whole parallel scan stops and the query falls through to the radix HT path. Correctness is unchanged — a worker only aborts once the span already exceeds what the DA path could ever accept, so the caller's da_fits rejection is identical to a full scan's. Minor: the eliminated scan is memory-bandwidth-bound and overlaps other work, so wall-time on the large group-by queries moves within run-to-run noise; the change removes provably-wasted CPU, not a measured win. Test suite 2657/2659 (2 skipped, 0 failed). --- src/ops/group.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/ops/group.c b/src/ops/group.c index aeb5453e..37f01670 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -3107,6 +3107,12 @@ typedef struct { uint32_t n_workers; const int64_t* match_idx; /* NULL = no selection */ ray_t* rowsel; + /* DA-path early-out: once any worker observes a key span wider than + * span_budget the direct-array path is provably infeasible (its slot + * count would exceed DA_MAX_COMPOSITE_SLOTS), so the whole scan can + * stop instead of reading the rest of a 10M-row column for nothing. */ + int64_t span_budget; + _Atomic(int)* abort_flag; } minmax_ctx_t; static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) { @@ -3115,11 +3121,25 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t const int64_t* match_idx = c->match_idx; int64_t kmin = INT64_MAX, kmax = INT64_MIN; int8_t t = c->key_type; + const int64_t span_budget = c->span_budget; + /* Span check and abort poll are batched (every 8192 rows) so the + * hot per-row loop body stays a branchless min/max with no atomics. 
*/ #define MINMAX_SEG_LOOP(TYPE, CAST) \ do { \ const TYPE* kd = (const TYPE*)c->key_data; \ for (int64_t i = start; i < end; i++) { \ + if (((i - start) & 8191) == 0) { \ + if (atomic_load_explicit(c->abort_flag, \ + memory_order_relaxed)) \ + goto minmax_done; \ + if (kmax >= kmin && \ + (uint64_t)(kmax - kmin) > (uint64_t)span_budget) { \ + atomic_store_explicit(c->abort_flag, 1, \ + memory_order_relaxed); \ + goto minmax_done; \ + } \ + } \ int64_t r = match_idx ? match_idx[i] : i; \ if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, r)) continue; \ int64_t v = (int64_t)CAST kd[r]; \ @@ -3146,6 +3166,7 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t #undef MINMAX_SEG_LOOP +minmax_done: /* Merge with existing per-worker values (a worker may process multiple morsels) */ if (kmin < c->per_worker_min[wid]) c->per_worker_min[wid] = kmin; if (kmax > c->per_worker_max[wid]) c->per_worker_max[wid] = kmax; @@ -5414,6 +5435,9 @@ da_path:; ? ray_pool_total_workers(mm_pool) : 1; /* VLA bounded by worker count — max ~2KB per key even on 256-core systems. */ int64_t mm_mins[mm_n], mm_maxs[mm_n]; + /* Shared across keys: once any key proves the DA slot count + * infeasible the scan aborts instead of reading the rest. 
*/ + _Atomic(int) mm_abort = 0; for (uint8_t k = 0; k < n_keys && da_fits; k++) { int64_t kmin, kmax; for (uint32_t w = 0; w < mm_n; w++) { @@ -5429,12 +5453,18 @@ da_path:; .n_workers = mm_n, .match_idx = match_idx, .rowsel = rowsel, + .span_budget = DA_MAX_COMPOSITE_SLOTS, + .abort_flag = &mm_abort, }; if (mm_n > 1) { ray_pool_dispatch(mm_pool, minmax_scan_fn, &mm_ctx, n_scan); } else { minmax_scan_fn(&mm_ctx, 0, 0, n_scan); } + if (atomic_load_explicit(&mm_abort, memory_order_relaxed)) { + da_fits = false; + break; + } kmin = INT64_MAX; kmax = INT64_MIN; for (uint32_t w = 0; w < mm_n; w++) { if (mm_mins[w] < kmin) kmin = mm_mins[w]; From 477990e3d5b17f3abc65b79da448905d46e0065e Mon Sep 17 00:00:00 2001 From: Hetoku Date: Fri, 22 May 2026 18:16:04 +0200 Subject: [PATCH 03/11] refactor(query): remove benchmark-shaped query fast-paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit try_xbar_count_select / try_i16_ne0_count_desc_select / try_i32_i64_count_distinct_select / try_i16x2_count_desc_select pattern-matched exact query shapes from a specific benchmark suite (i16 "!= 0" filter + count + desc + take; two i16 keys; i32/i64 count-distinct; xbar time-bucket count) and ran hand-written kernels for them, bypassing the general select/group-by planner. These are benchmark-specific special-cases, not general query optimizations — removed along with their exclusive helpers (parse_xbar_count_clause, order_count_clauses, the per-shape worker fns and comparators; ~1125 lines). Queries of these shapes now run through the normal select path. Test suite: 2657/2659 pass (2 skipped, 0 failed). 
--- src/ops/query.c | 1162 ----------------------------------------------- 1 file changed, 1162 deletions(-) diff --git a/src/ops/query.c b/src/ops/query.c index db96b92d..c738c844 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -1493,1131 +1493,6 @@ static int atom_i64_const(ray_t* v, int64_t* out) { } } -typedef struct { - const void* base; - int8_t type; - uint8_t attrs; - int op; - int64_t rhs; -} xbar_count_clause_t; - -typedef struct { - int64_t key; - int64_t count; -} xbar_count_pair_t; - -typedef struct { - uint32_t key; - uint32_t count; -} i16x2_count_pair_t; - -typedef struct { - int32_t key; - uint32_t count; -} i32_count_pair_t; - -typedef struct { - int16_t key; - uint32_t count; -} i16_count_pair_t; - -typedef struct { - const int64_t* key_data; - int64_t bucket; - xbar_count_clause_t clauses[16]; - uint8_t n_clauses; - uint32_t cap; - int64_t* keys; - uint32_t* counts; - uint8_t* used; - _Atomic int overflow; -} xbar_count_ctx_t; - -typedef struct { - const int16_t* key0; - const int16_t* key1; - xbar_count_clause_t clauses[16]; - uint8_t n_clauses; - uint32_t cap; - uint32_t* keys; - uint32_t* counts; - uint8_t* used; - _Atomic int overflow; -} i16x2_count_ctx_t; - -typedef struct { - const int16_t* key; - uint32_t* counts; -} i16_ne0_count_ctx_t; - -typedef struct { - const int32_t* group; - const int64_t* distinct; - uint32_t cap; - int32_t* groups; - int64_t* values; - uint8_t* used; - _Atomic int overflow; -} i32_i64_cd_ctx_t; - -static int xbar_count_pair_cmp(const void* a, const void* b) { - const xbar_count_pair_t* pa = (const xbar_count_pair_t*)a; - const xbar_count_pair_t* pb = (const xbar_count_pair_t*)b; - return (pa->key > pb->key) - (pa->key < pb->key); -} - -static int i16x2_count_pair_desc_cmp(const void* a, const void* b) { - const i16x2_count_pair_t* pa = (const i16x2_count_pair_t*)a; - const i16x2_count_pair_t* pb = (const i16x2_count_pair_t*)b; - if (pa->count != pb->count) - return (pa->count < pb->count) - (pa->count > 
pb->count); - return (pa->key > pb->key) - (pa->key < pb->key); -} - -static int i32_count_pair_desc_cmp(const void* a, const void* b) { - const i32_count_pair_t* pa = (const i32_count_pair_t*)a; - const i32_count_pair_t* pb = (const i32_count_pair_t*)b; - if (pa->count != pb->count) - return (pa->count < pb->count) - (pa->count > pb->count); - return (pa->key > pb->key) - (pa->key < pb->key); -} - -static int i16_count_pair_desc_cmp(const void* a, const void* b) { - const i16_count_pair_t* pa = (const i16_count_pair_t*)a; - const i16_count_pair_t* pb = (const i16_count_pair_t*)b; - if (pa->count != pb->count) - return (pa->count < pb->count) - (pa->count > pb->count); - return (pa->key > pb->key) - (pa->key < pb->key); -} - -static uint64_t xbar_count_hash_i64(int64_t v) { - uint64_t h = (uint64_t)v; - h ^= h >> 33; - h *= 0xff51afd7ed558ccdULL; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53ULL; - h ^= h >> 33; - return h; -} - -static uint32_t count_hash_u32(uint32_t v) { - uint32_t h = v; - h ^= h >> 16; - h *= 0x7feb352dU; - h ^= h >> 15; - h *= 0x846ca68bU; - h ^= h >> 16; - return h; -} - -static uint64_t count_hash_i32_i64(int32_t g, int64_t v) { - uint64_t h = (uint64_t)(uint32_t)g * 0x9E3779B97F4A7C15ULL; - uint64_t x = (uint64_t)v; - x ^= x >> 33; - x *= 0xff51afd7ed558ccdULL; - x ^= x >> 33; - h ^= x + 0xBF58476D1CE4E5B9ULL + (h << 6) + (h >> 2); - h ^= h >> 33; - return h; -} - -static void xbar_count_worker_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t end) { - xbar_count_ctx_t* ctx = (xbar_count_ctx_t*)raw; - uint32_t cap = ctx->cap; - uint32_t mask = cap - 1u; - int64_t* keys = ctx->keys + (size_t)worker_id * cap; - uint32_t* counts = ctx->counts + (size_t)worker_id * cap; - uint8_t* used = ctx->used + (size_t)worker_id * cap; - int64_t n_groups = 0; - int64_t bucket = ctx->bucket; - - for (int64_t r = start; r < end; r++) { - uint8_t pass = 1; - for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) { - const xbar_count_clause_t* c = 
&ctx->clauses[ci]; - int64_t v = read_col_i64(c->base, r, c->type, c->attrs); - if (c->op == 1) pass &= (uint8_t)(v == c->rhs); - else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs); - else pass &= (uint8_t)(v <= c->rhs); - if (!pass) break; - } - if (!pass) continue; - int64_t ts = ctx->key_data[r]; - int64_t q = ts / bucket; - if ((ts ^ bucket) < 0 && q * bucket != ts) q--; - int64_t k = q * bucket; - uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask; - while (used[slot] && keys[slot] != k) - slot = (slot + 1u) & mask; - if (!used[slot]) { - if (n_groups >= (int64_t)(cap / 2)) { - atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed); - return; - } - used[slot] = 1; - keys[slot] = k; - n_groups++; - } - counts[slot]++; - } -} - -static void i16x2_count_worker_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t end) { - i16x2_count_ctx_t* ctx = (i16x2_count_ctx_t*)raw; - uint32_t cap = ctx->cap; - uint32_t mask = cap - 1u; - uint32_t* keys = ctx->keys + (size_t)worker_id * cap; - uint32_t* counts = ctx->counts + (size_t)worker_id * cap; - uint8_t* used = ctx->used + (size_t)worker_id * cap; - int64_t n_groups = 0; - - for (int64_t r = start; r < end; r++) { - uint8_t pass = 1; - for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) { - const xbar_count_clause_t* c = &ctx->clauses[ci]; - int64_t v = read_col_i64(c->base, r, c->type, c->attrs); - if (c->op == 1) pass &= (uint8_t)(v == c->rhs); - else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs); - else pass &= (uint8_t)(v <= c->rhs); - if (!pass) break; - } - if (!pass) continue; - uint32_t k = ((uint32_t)(uint16_t)ctx->key0[r] << 16) | - (uint32_t)(uint16_t)ctx->key1[r]; - uint32_t slot = count_hash_u32(k) & mask; - while (used[slot] && keys[slot] != k) - slot = (slot + 1u) & mask; - if (!used[slot]) { - if (n_groups >= (int64_t)(cap / 2)) { - atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed); - return; - } - used[slot] = 1; - keys[slot] = k; - n_groups++; - } - counts[slot]++; - } -} 
- -static void i16_ne0_count_worker_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t end) { - i16_ne0_count_ctx_t* ctx = (i16_ne0_count_ctx_t*)raw; - uint32_t* counts = ctx->counts + (size_t)worker_id * 65536u; - const int16_t* key = ctx->key; - for (int64_t r = start; r < end; r++) { - int16_t v = key[r]; - if (v) - counts[(uint32_t)((int32_t)v + 32768)]++; - } -} - -static void i32_i64_cd_worker_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t end) { - i32_i64_cd_ctx_t* ctx = (i32_i64_cd_ctx_t*)raw; - uint32_t cap = ctx->cap; - uint32_t mask = cap - 1u; - int32_t* groups = ctx->groups + (size_t)worker_id * cap; - int64_t* values = ctx->values + (size_t)worker_id * cap; - uint8_t* used = ctx->used + (size_t)worker_id * cap; - int64_t n_filled = 0; - - for (int64_t r = start; r < end; r++) { - int32_t g = ctx->group[r]; - int64_t v = ctx->distinct[r]; - uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & mask; - while (used[slot] && (groups[slot] != g || values[slot] != v)) - slot = (slot + 1u) & mask; - if (!used[slot]) { - if (n_filled >= (int64_t)(cap * 7u / 10u)) { - atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed); - return; - } - used[slot] = 1; - groups[slot] = g; - values[slot] = v; - n_filled++; - } - } -} - -static int sym_name_eq(int64_t sym, const char* name, size_t len) { - ray_t* s = ray_sym_str(sym); - return s && ray_str_len(s) == len && - memcmp(ray_str_ptr(s), name, len) == 0; -} - -static int parse_xbar_count_clause(ray_t* tbl, ray_t* expr, - xbar_count_clause_t* clauses, - uint8_t* n_clauses) { - if (!expr || expr->type != RAY_LIST || ray_len(expr) < 3) return 0; - ray_t** elems = (ray_t**)ray_data(expr); - if (!elems[0] || elems[0]->type != -RAY_SYM) return 0; - ray_t* head = ray_sym_str(elems[0]->i64); - if (!head) return 0; - const char* hn = ray_str_ptr(head); - size_t hl = ray_str_len(head); - if (hl == 3 && memcmp(hn, "and", 3) == 0) { - for (int64_t i = 1; i < ray_len(expr); i++) - if 
(!parse_xbar_count_clause(tbl, elems[i], clauses, n_clauses)) - return 0; - return 1; - } - if (ray_len(expr) != 3 || *n_clauses >= 16) return 0; - int op = 0; - if (hl == 2 && memcmp(hn, "==", 2) == 0) op = 1; - else if (hl == 2 && memcmp(hn, ">=", 2) == 0) op = 2; - else if (hl == 2 && memcmp(hn, "<=", 2) == 0) op = 3; - else return 0; - - ray_t* lhs = elems[1]; - ray_t* rhs = elems[2]; - int64_t rhs_i = 0; - if (!lhs || lhs->type != -RAY_SYM || !(lhs->attrs & RAY_ATTR_NAME) || - !atom_i64_const(rhs, &rhs_i)) - return 0; - ray_t* col = ray_table_get_col(tbl, lhs->i64); - if (!col || !ray_is_vec(col) || RAY_IS_PARTED(col->type) || - col->type == RAY_MAPCOMMON || (col->attrs & RAY_ATTR_HAS_NULLS)) - return 0; - int8_t ct = col->type; - if (ct != RAY_BOOL && ct != RAY_U8 && ct != RAY_I16 && - ct != RAY_I32 && ct != RAY_I64 && ct != RAY_DATE && - ct != RAY_TIME && ct != RAY_TIMESTAMP) - return 0; - clauses[*n_clauses] = (xbar_count_clause_t){ - .base = ray_data(col), - .type = ct, - .attrs = col->attrs, - .op = op, - .rhs = rhs_i, - }; - (*n_clauses)++; - return 1; -} - -static int count_clause_score(const xbar_count_clause_t* c) { - if (c->op == 1 && ray_sym_elem_size(c->type, c->attrs) >= 8) return 0; - if (c->op == 1) return 1; - return 2; -} - -static void order_count_clauses(xbar_count_clause_t* clauses, uint8_t n) { - for (uint8_t i = 1; i < n; i++) { - xbar_count_clause_t v = clauses[i]; - int vs = count_clause_score(&v); - uint8_t j = i; - while (j > 0 && count_clause_score(&clauses[j - 1]) > vs) { - clauses[j] = clauses[j - 1]; - j--; - } - clauses[j] = v; - } -} - -static int match_i16_key_ne_zero(ray_t* where_expr, int64_t key_sym) { - if (!where_expr || where_expr->type != RAY_LIST || ray_len(where_expr) != 3) - return 0; - ray_t** e = (ray_t**)ray_data(where_expr); - if (!e[0] || e[0]->type != -RAY_SYM || - !sym_name_eq(e[0]->i64, "!=", 2)) - return 0; - ray_t* lhs = e[1]; - int64_t rhs = 0; - return lhs && lhs->type == -RAY_SYM && (lhs->attrs & 
RAY_ATTR_NAME) && - lhs->i64 == key_sym && atom_i64_const(e[2], &rhs) && rhs == 0; -} - -static ray_t* try_i16_ne0_count_desc_select(ray_t* tbl, ray_t* where_expr, - ray_t* by_expr, ray_t* take_expr, - ray_t** dict_elems, - int64_t dict_n, - int64_t from_id, - int64_t where_id, - int64_t by_id, - int64_t take_id, - int64_t asc_id, - int64_t desc_id, - int64_t nearest_id) { - if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr || - !take_expr || by_expr->type != -RAY_SYM || - !(by_expr->attrs & RAY_ATTR_NAME)) - return NULL; - int64_t key_sym = by_expr->i64; - int64_t take_n = 0; - if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024) - return NULL; - if (!match_i16_key_ne_zero(where_expr, key_sym)) - return NULL; - - int64_t count_alias = -1; - int saw_desc = 0; - int saw_key_projection = 0; - for (int64_t i = 0; i + 1 < dict_n; i += 2) { - int64_t kid = dict_elems[i]->i64; - ray_t* v = dict_elems[i + 1]; - if (kid == from_id || kid == where_id || kid == by_id || - kid == take_id || kid == nearest_id) - continue; - if (kid == desc_id) { - if (!v || v->type != -RAY_SYM) - return NULL; - saw_desc = 1; - continue; - } - if (kid == asc_id) return NULL; - if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) && - kid == key_sym && v->i64 == key_sym) { - saw_key_projection = 1; - continue; - } - if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2) - return NULL; - ray_t** ae = (ray_t**)ray_data(v); - if (!ae[0] || ae[0]->type != -RAY_SYM || - !sym_name_eq(ae[0]->i64, "count", 5)) - return NULL; - ray_t* arg = ae[1]; - if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME) || - arg->i64 != key_sym) - return NULL; - count_alias = kid; - } - if (!saw_desc || !saw_key_projection || count_alias < 0) - return NULL; - - ray_t* col = ray_table_get_col(tbl, key_sym); - if (!col || !ray_is_vec(col) || col->type != RAY_I16 || - (col->attrs & RAY_ATTR_HAS_NULLS)) - return NULL; - - ray_pool_t* pool = ray_pool_get(); - 
uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; - if (nw == 0) nw = 1; - ray_t* counts_hdr = NULL; - uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, - (size_t)nw * 65536u * sizeof(uint32_t)); - if (!counts) - return ray_error("oom", NULL); - - i16_ne0_count_ctx_t ctx = { - .key = (const int16_t*)ray_data(col), - .counts = counts, - }; - int64_t nrows = ray_table_nrows(tbl); - if (pool && nrows >= RAY_PARALLEL_THRESHOLD) - ray_pool_dispatch(pool, i16_ne0_count_worker_fn, &ctx, nrows); - else - i16_ne0_count_worker_fn(&ctx, 0, 0, nrows); - - i16_count_pair_t top[1024]; - int64_t top_n = 0; - for (uint32_t s = 0; s < 65536u; s++) { - uint32_t total = 0; - for (uint32_t w = 0; w < nw; w++) - total += counts[(size_t)w * 65536u + s]; - if (!total) continue; - i16_count_pair_t cand = { - .key = (int16_t)((int32_t)s - 32768), - .count = total, - }; - if (top_n < take_n) { - top[top_n++] = cand; - continue; - } - int64_t min_i = 0; - for (int64_t i = 1; i < top_n; i++) { - if (top[i].count < top[min_i].count || - (top[i].count == top[min_i].count && top[i].key > top[min_i].key)) - min_i = i; - } - if (cand.count > top[min_i].count || - (cand.count == top[min_i].count && cand.key < top[min_i].key)) - top[min_i] = cand; - } - scratch_free(counts_hdr); - qsort(top, (size_t)top_n, sizeof(i16_count_pair_t), - i16_count_pair_desc_cmp); - - int64_t out_n = top_n; - ray_t* key_out = ray_vec_new(RAY_I16, out_n); - ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); - if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) { - if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out); - if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); - return ray_error("oom", NULL); - } - key_out->len = out_n; - cnt_out->len = out_n; - int16_t* ko = (int16_t*)ray_data(key_out); - int64_t* co = (int64_t*)ray_data(cnt_out); - for (int64_t i = 0; i < out_n; i++) { - ko[i] = top[i].key; - co[i] = (int64_t)top[i].count; - } - - ray_t* out = ray_table_new(2); - if 
(!out || RAY_IS_ERR(out)) { - ray_release(key_out); ray_release(cnt_out); - return out ? out : ray_error("oom", NULL); - } - out = ray_table_add_col(out, key_sym, key_out); - out = ray_table_add_col(out, count_alias, cnt_out); - ray_release(key_out); ray_release(cnt_out); - return out; -} - -static ray_t* try_i32_i64_count_distinct_select(ray_t* tbl, ray_t* where_expr, - ray_t* by_expr, - ray_t* take_expr, - ray_t** dict_elems, - int64_t dict_n, - int64_t from_id, - int64_t where_id, - int64_t by_id, - int64_t take_id, - int64_t asc_id, - int64_t desc_id, - int64_t nearest_id) { - if (!tbl || tbl->type != RAY_TABLE || where_expr || !by_expr || - !take_expr || by_expr->type != -RAY_SYM || - !(by_expr->attrs & RAY_ATTR_NAME)) - return NULL; - - int64_t take_n = 0; - if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024) - return NULL; - - int64_t group_sym = by_expr->i64; - int64_t distinct_sym = -1; - int64_t count_alias = -1; - int saw_desc = 0; - int saw_group_projection = 0; - for (int64_t i = 0; i + 1 < dict_n; i += 2) { - int64_t kid = dict_elems[i]->i64; - ray_t* v = dict_elems[i + 1]; - if (kid == from_id || kid == where_id || kid == by_id || - kid == take_id || kid == nearest_id) - continue; - if (kid == desc_id) { - if (!v || v->type != -RAY_SYM) - return NULL; - saw_desc = 1; - continue; - } - if (kid == asc_id) return NULL; - if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) && - kid == group_sym && v->i64 == group_sym) { - saw_group_projection = 1; - continue; - } - if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2) - return NULL; - ray_t** ae = (ray_t**)ray_data(v); - if (!ae[0] || ae[0]->type != -RAY_SYM || - !sym_name_eq(ae[0]->i64, "count", 5)) - return NULL; - ray_t* inner = ae[1]; - if (!inner || inner->type != RAY_LIST || ray_len(inner) != 2) - return NULL; - ray_t** ie = (ray_t**)ray_data(inner); - if (!ie[0] || ie[0]->type != -RAY_SYM || - !sym_name_eq(ie[0]->i64, "distinct", 8)) - return NULL; - 
ray_t* arg = ie[1]; - if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME)) - return NULL; - distinct_sym = arg->i64; - count_alias = kid; - } - if (!saw_desc || !saw_group_projection || count_alias < 0 || - distinct_sym < 0) - return NULL; - - ray_t* gcol = ray_table_get_col(tbl, group_sym); - ray_t* dcol = ray_table_get_col(tbl, distinct_sym); - if (!gcol || !dcol || !ray_is_vec(gcol) || !ray_is_vec(dcol) || - gcol->type != RAY_I32 || dcol->type != RAY_I64 || - (gcol->attrs & RAY_ATTR_HAS_NULLS) || - (dcol->attrs & RAY_ATTR_HAS_NULLS)) - return NULL; - - int64_t nrows = ray_table_nrows(tbl); - ray_pool_t* pool = ray_pool_get(); - uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; - if (nw == 0) nw = 1; - const uint32_t local_cap = 1u << 20; - ray_t *lg_hdr = NULL, *lv_hdr = NULL, *lu_hdr = NULL; - int32_t* lg = (int32_t*)scratch_calloc(&lg_hdr, - (size_t)nw * local_cap * sizeof(int32_t)); - int64_t* lv = (int64_t*)scratch_calloc(&lv_hdr, - (size_t)nw * local_cap * sizeof(int64_t)); - uint8_t* lu = (uint8_t*)scratch_calloc(&lu_hdr, (size_t)nw * local_cap); - if (!lg || !lv || !lu) { - if (lg_hdr) scratch_free(lg_hdr); - if (lv_hdr) scratch_free(lv_hdr); - if (lu_hdr) scratch_free(lu_hdr); - return ray_error("oom", NULL); - } - - i32_i64_cd_ctx_t ctx = { - .group = (const int32_t*)ray_data(gcol), - .distinct = (const int64_t*)ray_data(dcol), - .cap = local_cap, - .groups = lg, - .values = lv, - .used = lu, - }; - atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed); - if (pool && nrows >= RAY_PARALLEL_THRESHOLD) - ray_pool_dispatch(pool, i32_i64_cd_worker_fn, &ctx, nrows); - else - i32_i64_cd_worker_fn(&ctx, 0, 0, nrows); - if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) { - scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); - return NULL; - } - - const uint32_t gcap = 1u << 23; - const uint32_t gmask = gcap - 1u; - ray_t *gg_hdr = NULL, *gv_hdr = NULL, *gu_hdr = NULL; - int32_t* gg = 
(int32_t*)scratch_calloc(&gg_hdr, (size_t)gcap * sizeof(int32_t)); - int64_t* gv = (int64_t*)scratch_calloc(&gv_hdr, (size_t)gcap * sizeof(int64_t)); - uint8_t* gu = (uint8_t*)scratch_calloc(&gu_hdr, (size_t)gcap); - if (!gg || !gv || !gu) { - scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); - if (gg_hdr) scratch_free(gg_hdr); - if (gv_hdr) scratch_free(gv_hdr); - if (gu_hdr) scratch_free(gu_hdr); - return ray_error("oom", NULL); - } - - int64_t global_n = 0; - for (uint32_t w = 0; w < nw; w++) { - int32_t* wg = lg + (size_t)w * local_cap; - int64_t* wv = lv + (size_t)w * local_cap; - uint8_t* wu = lu + (size_t)w * local_cap; - for (uint32_t s = 0; s < local_cap; s++) { - if (!wu[s]) continue; - int32_t g = wg[s]; - int64_t v = wv[s]; - uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & gmask; - while (gu[slot] && (gg[slot] != g || gv[slot] != v)) - slot = (slot + 1u) & gmask; - if (!gu[slot]) { - if (global_n >= (int64_t)(gcap * 7u / 10u)) { - scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr); - scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); - return NULL; - } - gu[slot] = 1; - gg[slot] = g; - gv[slot] = v; - global_n++; - } - } - } - scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); - - const uint32_t rcap = 4096; - const uint32_t rmask = rcap - 1u; - int32_t rkeys[4096]; - uint32_t rcounts[4096]; - uint8_t rused[4096]; - memset(rused, 0, sizeof(rused)); - int64_t region_n = 0; - for (uint32_t s = 0; s < gcap; s++) { - if (!gu[s]) continue; - int32_t g = gg[s]; - uint32_t slot = count_hash_u32((uint32_t)g) & rmask; - while (rused[slot] && rkeys[slot] != g) - slot = (slot + 1u) & rmask; - if (!rused[slot]) { - if (region_n >= (int64_t)(rcap / 2)) { - scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr); - return NULL; - } - rused[slot] = 1; - rkeys[slot] = g; - rcounts[slot] = 0; - region_n++; - } - rcounts[slot]++; - } - scratch_free(gg_hdr); scratch_free(gv_hdr); 
scratch_free(gu_hdr); - - ray_t* pairs_hdr = NULL; - i32_count_pair_t* pairs = (i32_count_pair_t*)scratch_alloc( - &pairs_hdr, (size_t)region_n * sizeof(i32_count_pair_t)); - if (!pairs && region_n > 0) - return ray_error("oom", NULL); - int64_t pi = 0; - for (uint32_t s = 0; s < rcap; s++) { - if (!rused[s]) continue; - pairs[pi++] = (i32_count_pair_t){ .key = rkeys[s], .count = rcounts[s] }; - } - qsort(pairs, (size_t)region_n, sizeof(i32_count_pair_t), - i32_count_pair_desc_cmp); - - int64_t out_n = region_n < take_n ? region_n : take_n; - ray_t* key_out = ray_vec_new(RAY_I32, out_n); - ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); - if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) { - if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out); - if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); - scratch_free(pairs_hdr); - return ray_error("oom", NULL); - } - key_out->len = out_n; - cnt_out->len = out_n; - int32_t* ko = (int32_t*)ray_data(key_out); - int64_t* co = (int64_t*)ray_data(cnt_out); - for (int64_t i = 0; i < out_n; i++) { - ko[i] = pairs[i].key; - co[i] = (int64_t)pairs[i].count; - } - scratch_free(pairs_hdr); - - ray_t* out = ray_table_new(2); - if (!out || RAY_IS_ERR(out)) { - ray_release(key_out); ray_release(cnt_out); - return out ? 
out : ray_error("oom", NULL); - } - out = ray_table_add_col(out, group_sym, key_out); - out = ray_table_add_col(out, count_alias, cnt_out); - ray_release(key_out); ray_release(cnt_out); - return out; -} - -static ray_t* try_i16x2_count_desc_select(ray_t* tbl, ray_t* where_expr, - ray_t* by_expr, ray_t* take_expr, - ray_t** dict_elems, int64_t dict_n, - int64_t from_id, int64_t where_id, - int64_t by_id, int64_t take_id, - int64_t asc_id, int64_t desc_id, - int64_t nearest_id) { - if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr || - !take_expr || by_expr->type != RAY_DICT) - return NULL; - - int64_t take_n = 0; - if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000) - return NULL; - - DICT_VIEW_DECL(bv); - DICT_VIEW_OPEN(by_expr, bv); - if (DICT_VIEW_OVERFLOW(bv) || bv_n != 4) return NULL; - ray_t* key0_atom = bv[0]; - ray_t* key0_val = bv[1]; - ray_t* key1_atom = bv[2]; - ray_t* key1_val = bv[3]; - if (!key0_atom || key0_atom->type != -RAY_SYM || - !key1_atom || key1_atom->type != -RAY_SYM || - !key0_val || key0_val->type != -RAY_SYM || - !key1_val || key1_val->type != -RAY_SYM || - !(key0_val->attrs & RAY_ATTR_NAME) || - !(key1_val->attrs & RAY_ATTR_NAME) || - key0_atom->i64 != key0_val->i64 || - key1_atom->i64 != key1_val->i64) - return NULL; - - int64_t count_alias = -1; - int saw_desc = 0; - for (int64_t i = 0; i + 1 < dict_n; i += 2) { - int64_t kid = dict_elems[i]->i64; - ray_t* v = dict_elems[i + 1]; - if (kid == from_id || kid == where_id || kid == by_id || - kid == take_id || kid == nearest_id) - continue; - if (kid == desc_id) { - if (!v || v->type != -RAY_SYM) - return NULL; - saw_desc = 1; - continue; - } - if (kid == asc_id) return NULL; - if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL; - ray_t** ae = (ray_t**)ray_data(v); - if (!ae[0] || ae[0]->type != -RAY_SYM || - !sym_name_eq(ae[0]->i64, "count", 5)) - return NULL; - count_alias = kid; - } - if (!saw_desc || count_alias < 0) return NULL; - - 
ray_t* col0 = ray_table_get_col(tbl, key0_atom->i64); - ray_t* col1 = ray_table_get_col(tbl, key1_atom->i64); - if (!col0 || !col1 || !ray_is_vec(col0) || !ray_is_vec(col1) || - col0->type != RAY_I16 || col1->type != RAY_I16 || - (col0->attrs & RAY_ATTR_HAS_NULLS) || - (col1->attrs & RAY_ATTR_HAS_NULLS)) - return NULL; - - xbar_count_clause_t clauses[16]; - uint8_t n_clauses = 0; - if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) || - n_clauses == 0) - return NULL; - order_count_clauses(clauses, n_clauses); - - int64_t nrows = ray_table_nrows(tbl); - const uint32_t cap = 4096; - const uint32_t mask = cap - 1u; - ray_pool_t* pool = ray_pool_get(); - uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; - if (nw == 0) nw = 1; - - ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL; - uint32_t* keys = (uint32_t*)scratch_calloc(&keys_hdr, - (size_t)nw * cap * sizeof(uint32_t)); - uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, - (size_t)nw * cap * sizeof(uint32_t)); - uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap); - if (!keys || !counts || !used) { - if (keys_hdr) scratch_free(keys_hdr); - if (counts_hdr) scratch_free(counts_hdr); - if (used_hdr) scratch_free(used_hdr); - return ray_error("oom", NULL); - } - - i16x2_count_ctx_t ctx = { - .key0 = (const int16_t*)ray_data(col0), - .key1 = (const int16_t*)ray_data(col1), - .n_clauses = n_clauses, - .cap = cap, - .keys = keys, - .counts = counts, - .used = used, - }; - memcpy(ctx.clauses, clauses, sizeof(clauses)); - atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed); - if (pool && nrows >= RAY_PARALLEL_THRESHOLD) - ray_pool_dispatch(pool, i16x2_count_worker_fn, &ctx, nrows); - else - i16x2_count_worker_fn(&ctx, 0, 0, nrows); - if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) { - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return NULL; - } - - ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = 
NULL; - uint32_t* mkeys = (uint32_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(uint32_t)); - uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t)); - uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap); - if (!mkeys || !mcounts || !mused) { - scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); - if (mkeys_hdr) scratch_free(mkeys_hdr); - if (mcounts_hdr) scratch_free(mcounts_hdr); - if (mused_hdr) scratch_free(mused_hdr); - return ray_error("oom", NULL); - } - - int64_t n_groups = 0; - for (uint32_t w = 0; w < nw; w++) { - uint32_t* wk = keys + (size_t)w * cap; - uint32_t* wc = counts + (size_t)w * cap; - uint8_t* wu = used + (size_t)w * cap; - for (uint32_t s = 0; s < cap; s++) { - if (!wu[s]) continue; - uint32_t k = wk[s]; - uint32_t slot = count_hash_u32(k) & mask; - while (mused[slot] && mkeys[slot] != k) - slot = (slot + 1u) & mask; - if (!mused[slot]) { - if (n_groups >= (int64_t)(cap / 2)) { - scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); - scratch_free(mused_hdr); scratch_free(keys_hdr); - scratch_free(counts_hdr); scratch_free(used_hdr); - return NULL; - } - mused[slot] = 1; - mkeys[slot] = k; - n_groups++; - } - mcounts[slot] += wc[s]; - } - } - - int64_t out_n = n_groups < take_n ? 
n_groups : take_n; - ray_t* pairs_hdr = NULL; - i16x2_count_pair_t* pairs = (i16x2_count_pair_t*)scratch_alloc( - &pairs_hdr, (size_t)n_groups * sizeof(i16x2_count_pair_t)); - if (!pairs && n_groups > 0) { - scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr); - scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); - return ray_error("oom", NULL); - } - int64_t pi = 0; - for (uint32_t s = 0; s < cap; s++) { - if (!mused[s]) continue; - pairs[pi++] = (i16x2_count_pair_t){ .key = mkeys[s], .count = mcounts[s] }; - } - qsort(pairs, (size_t)n_groups, sizeof(i16x2_count_pair_t), - i16x2_count_pair_desc_cmp); - - ray_t* key0_out = ray_vec_new(RAY_I16, out_n); - ray_t* key1_out = ray_vec_new(RAY_I16, out_n); - ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); - if (!key0_out || !key1_out || !cnt_out || - RAY_IS_ERR(key0_out) || RAY_IS_ERR(key1_out) || RAY_IS_ERR(cnt_out)) { - if (key0_out && !RAY_IS_ERR(key0_out)) ray_release(key0_out); - if (key1_out && !RAY_IS_ERR(key1_out)) ray_release(key1_out); - if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); - scratch_free(pairs_hdr); - scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr); - scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); - return ray_error("oom", NULL); - } - key0_out->len = out_n; - key1_out->len = out_n; - cnt_out->len = out_n; - int16_t* k0o = (int16_t*)ray_data(key0_out); - int16_t* k1o = (int16_t*)ray_data(key1_out); - int64_t* co = (int64_t*)ray_data(cnt_out); - for (int64_t i = 0; i < out_n; i++) { - uint32_t k = pairs[i].key; - k0o[i] = (int16_t)(uint16_t)(k >> 16); - k1o[i] = (int16_t)(uint16_t)k; - co[i] = (int64_t)pairs[i].count; - } - scratch_free(pairs_hdr); - scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr); - scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); - - ray_t* out = ray_table_new(3); - if (!out || RAY_IS_ERR(out)) { - ray_release(key0_out); 
ray_release(key1_out); ray_release(cnt_out); - return out ? out : ray_error("oom", NULL); - } - out = ray_table_add_col(out, key0_atom->i64, key0_out); - out = ray_table_add_col(out, key1_atom->i64, key1_out); - out = ray_table_add_col(out, count_alias, cnt_out); - ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out); - return out; -} - -static ray_t* try_xbar_count_select(ray_t* tbl, ray_t* where_expr, - ray_t* by_expr, ray_t* take_expr, - ray_t** dict_elems, int64_t dict_n, - int64_t from_id, int64_t where_id, - int64_t by_id, int64_t take_id, - int64_t asc_id, int64_t desc_id, - int64_t nearest_id) { - if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr || - !take_expr) - return NULL; - - int64_t take_n = 0; - if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000) - return NULL; - - if (!by_expr || by_expr->type != RAY_DICT) return NULL; - DICT_VIEW_DECL(bv); - DICT_VIEW_OPEN(by_expr, bv); - if (DICT_VIEW_OVERFLOW(bv) || bv_n != 2) return NULL; - ray_t* key_atom = bv[0]; - ray_t* xbar_expr = bv[1]; - if (!key_atom || key_atom->type != -RAY_SYM || - !xbar_expr || xbar_expr->type != RAY_LIST || - ray_len(xbar_expr) != 3) - return NULL; - ray_t** xe = (ray_t**)ray_data(xbar_expr); - if (!xe[0] || xe[0]->type != -RAY_SYM || - !sym_name_eq(xe[0]->i64, "xbar", 4)) - return NULL; - if (!xe[1] || xe[1]->type != -RAY_SYM || - !(xe[1]->attrs & RAY_ATTR_NAME)) - return NULL; - int64_t bucket = 0; - if (!atom_i64_const(xe[2], &bucket) || bucket <= 0) return NULL; - - int64_t count_alias = -1; - int saw_asc = 0; - for (int64_t i = 0; i + 1 < dict_n; i += 2) { - int64_t kid = dict_elems[i]->i64; - ray_t* v = dict_elems[i + 1]; - if (kid == from_id || kid == where_id || kid == by_id || - kid == take_id || kid == nearest_id) - continue; - if (kid == asc_id) { - if (!v || v->type != -RAY_SYM || v->i64 != key_atom->i64) - return NULL; - saw_asc = 1; - continue; - } - if (kid == desc_id) return NULL; - if (count_alias >= 0 || 
!is_group_dag_agg_expr(v)) return NULL; - ray_t** ae = (ray_t**)ray_data(v); - if (!ae[0] || ae[0]->type != -RAY_SYM || - !sym_name_eq(ae[0]->i64, "count", 5)) - return NULL; - count_alias = kid; - } - if (!saw_asc || count_alias < 0) return NULL; - - ray_t* key_col = ray_table_get_col(tbl, xe[1]->i64); - if (!key_col || !ray_is_vec(key_col) || key_col->type != RAY_TIMESTAMP || - RAY_IS_PARTED(key_col->type) || key_col->type == RAY_MAPCOMMON || - (key_col->attrs & RAY_ATTR_HAS_NULLS)) - return NULL; - - xbar_count_clause_t clauses[16]; - uint8_t n_clauses = 0; - if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) || - n_clauses == 0) - return NULL; - order_count_clauses(clauses, n_clauses); - - int64_t nrows = ray_table_nrows(tbl); - const int64_t* key_data = (const int64_t*)ray_data(key_col); - const uint32_t cap = 4096; - const uint32_t mask = cap - 1u; - ray_pool_t* pool = ray_pool_get(); - uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; - if (nw == 0) nw = 1; - ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL; - int64_t* keys = (int64_t*)scratch_calloc(&keys_hdr, - (size_t)nw * cap * sizeof(int64_t)); - uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, - (size_t)nw * cap * sizeof(uint32_t)); - uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap); - if (!keys || !counts || !used) { - if (keys_hdr) scratch_free(keys_hdr); - if (counts_hdr) scratch_free(counts_hdr); - if (used_hdr) scratch_free(used_hdr); - return ray_error("oom", NULL); - } - - xbar_count_ctx_t ctx = { - .key_data = key_data, - .bucket = bucket, - .n_clauses = n_clauses, - .cap = cap, - .keys = keys, - .counts = counts, - .used = used, - }; - memcpy(ctx.clauses, clauses, sizeof(clauses)); - atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed); - if (pool && nrows >= RAY_PARALLEL_THRESHOLD) - ray_pool_dispatch(pool, xbar_count_worker_fn, &ctx, nrows); - else - xbar_count_worker_fn(&ctx, 0, 0, nrows); - if 
(atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) { - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return NULL; - } - - ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL; - int64_t* mkeys = (int64_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(int64_t)); - uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t)); - uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap); - if (!mkeys || !mcounts || !mused) { - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - if (mkeys_hdr) scratch_free(mkeys_hdr); - if (mcounts_hdr) scratch_free(mcounts_hdr); - if (mused_hdr) scratch_free(mused_hdr); - return ray_error("oom", NULL); - } - - int64_t n_groups = 0; - for (uint32_t w = 0; w < nw; w++) { - int64_t* wk = keys + (size_t)w * cap; - uint32_t* wc = counts + (size_t)w * cap; - uint8_t* wu = used + (size_t)w * cap; - for (uint32_t s = 0; s < cap; s++) { - if (!wu[s]) continue; - int64_t k = wk[s]; - uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask; - while (mused[slot] && mkeys[slot] != k) - slot = (slot + 1u) & mask; - if (!mused[slot]) { - if (n_groups >= (int64_t)(cap / 2)) { - scratch_free(mkeys_hdr); - scratch_free(mcounts_hdr); - scratch_free(mused_hdr); - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return NULL; - } - mused[slot] = 1; - mkeys[slot] = k; - n_groups++; - } - mcounts[slot] += wc[s]; - } - } - - int64_t out_n = n_groups < take_n ? 
n_groups : take_n; - ray_t* pairs_hdr = NULL; - xbar_count_pair_t* pairs = (xbar_count_pair_t*)scratch_alloc( - &pairs_hdr, (size_t)n_groups * sizeof(xbar_count_pair_t)); - if (!pairs && n_groups > 0) { - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return ray_error("oom", NULL); - } - int64_t pi = 0; - for (uint32_t s = 0; s < cap; s++) { - if (!mused[s]) continue; - pairs[pi++] = (xbar_count_pair_t){ .key = mkeys[s], .count = mcounts[s] }; - } - qsort(pairs, (size_t)n_groups, sizeof(xbar_count_pair_t), - xbar_count_pair_cmp); - - ray_t* key_out = ray_vec_new(RAY_TIMESTAMP, out_n); - ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); - if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) { - if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out); - if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); - scratch_free(pairs_hdr); - scratch_free(mkeys_hdr); - scratch_free(mcounts_hdr); - scratch_free(mused_hdr); - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return ray_error("oom", NULL); - } - key_out->len = out_n; - cnt_out->len = out_n; - int64_t* ko = (int64_t*)ray_data(key_out); - int64_t* co = (int64_t*)ray_data(cnt_out); - for (int64_t i = 0; i < out_n; i++) { - ko[i] = pairs[i].key; - co[i] = pairs[i].count; - } - scratch_free(pairs_hdr); - scratch_free(mkeys_hdr); - scratch_free(mcounts_hdr); - scratch_free(mused_hdr); - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - - ray_t* out = ray_table_new(2); - if (!out || RAY_IS_ERR(out)) { - ray_release(key_out); - ray_release(cnt_out); - return out ? 
out : ray_error("oom", NULL); - } - out = ray_table_add_col(out, key_atom->i64, key_out); - out = ray_table_add_col(out, count_alias, cnt_out); - ray_release(key_out); - ray_release(cnt_out); - return out; -} - static int expr_affine_of_sym(ray_t* expr, int64_t sym, int64_t* bias) { if (!expr) return 0; if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME) && @@ -4753,43 +3628,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (kid == asc_id || kid == desc_id) { has_sort = true; break; } } - ray_t* xbar_count = try_xbar_count_select(tbl, where_expr, by_expr, - take_expr, dict_elems, dict_n, - from_id, where_id, by_id, - take_id, asc_id, desc_id, - nearest_id); - if (xbar_count) { - ray_release(tbl); - return xbar_count; - } - - ray_t* i16_ne0_count = try_i16_ne0_count_desc_select( - tbl, where_expr, by_expr, take_expr, dict_elems, dict_n, - from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id); - if (i16_ne0_count) { - ray_release(tbl); - return i16_ne0_count; - } - - ray_t* i32_i64_cd = try_i32_i64_count_distinct_select( - tbl, where_expr, by_expr, take_expr, dict_elems, dict_n, - from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id); - if (i32_i64_cd) { - ray_release(tbl); - return i32_i64_cd; - } - - ray_t* i16x2_count = try_i16x2_count_desc_select(tbl, where_expr, by_expr, - take_expr, dict_elems, - dict_n, from_id, - where_id, by_id, - take_id, asc_id, - desc_id, nearest_id); - if (i16x2_count) { - ray_release(tbl); - return i16x2_count; - } - /* `nearest` is mutually exclusive with `asc`/`desc`/`by` — ANN * ordering is an index scan, not a column sort, and cannot be * composed with group-by in this phase. 
*/ From 66c266124d8d356abc02735591e902d9c7ffac91 Mon Sep 17 00:00:00 2001 From: Hetoku Date: Mon, 25 May 2026 11:30:44 +0200 Subject: [PATCH 04/11] =?UTF-8?q?perf(group):=20fused=20radix=20HT=20?= =?UTF-8?q?=E2=80=94=20per-(worker,=20partition)=20direct=20insert?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The radix group-by pipeline previously did two full DRAM passes for the group keys: phase1 scattered a fat entry (hash + keys + nullmask + agg vals) into 256 partition buffers per worker, phase2 read every entry back to build the per-partition HTs. For 10M rows that's ~240 MB written and re-read just to shuffle data into partitions. For count-only queries (every agg is OP_COUNT), aggregate directly into a per-(worker, partition) group_ht_t during the scan, and merge the n worker HTs per partition in phase2. The per-(worker, partition) HT is small enough (~1.5K groups → ~64 KB row store for q15) to live in L1/L2; the merge adds counts via a new state-merge primitive (group_merge_count_row) that probes by recomputed key hash. Phase3 emit is untouched: the v2 pipeline lands part_hts[] in the exact format the existing radix_phase3_fn consumes, so the result build, holistic post-pass, and result-table assembly all reuse the existing code. On miss (any non-COUNT agg, FIRST/LAST/holistic/ PEARSON, or layout that needs richer state) v2 falls through to the original phase1/phase2. Measured wins (10M-row hits, in-memory): q15 (by UserID count, top 10) 220 → 162 ms (26%) q11 (nested by {phone,model,user}) 280 → 200 ms (28%) q35 (by {ClientIP, ClientIP-k} cnt) 240 → 168 ms (30%) SUM/AVG queries (q30/q31/q32) unchanged — needs a state-merge primitive for non-count aggregators (next increment). Test suite: 2657/2659 pass (2 skipped, 0 failed). 
--- src/ops/group.c | 295 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 295 insertions(+) diff --git a/src/ops/group.c b/src/ops/group.c index 37f01670..72535d4a 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -3093,6 +3093,219 @@ static void radix_phase2_fn(void* ctx, uint32_t worker_id, int64_t start, int64_ } } +/* ============================================================================ + * Fused radix: per-(worker, partition) HT direct-insert + per-partition merge + * + * Replaces the materialise-fat-entries-then-build-HTs round trip with a + * single-pass aggregation per (worker, partition) HT, followed by an + * in-cache merge per partition. Currently restricted to count-only + * queries (every agg is OP_COUNT) — the merge primitive here only + * knows how to combine counts; SUM/AVG/MIN/MAX would need their own + * state-merge logic (next increment). + * + * Per-(worker, partition) HT for a 10M-row count-by-UserID: ~3M distinct + * keys ÷ 256 parts ÷ 8 workers ≈ 1.5K groups → cap ~4K slots → ~64 KB + * row store, L1/L2-resident. Worker w processes its row range; per row + * it hashes keys, computes partition = RADIX_PART(h), probes its local + * HT_p. Phase2 dispatches partitions across workers; each merges the n + * worker HTs for one partition into a final partition HT in part_hts[p]. + * Phase3 (radix_phase3_fn) emits from part_hts[] exactly as before. + * ============================================================================ */ + +/* Merge one source group row (count + keys + null_mask) into the target HT. + * Hash is recomputed from the row's key region via hash_keys_inline — + * identical to what group_probe_entry did when the row was first inserted, + * so the partition assignment is consistent. Count-only: state merge is + * just count += src_count; new groups inherit the source's count. 
*/ +static inline uint32_t group_merge_count_row(group_ht_t* ht, + const char* src_row, const int8_t* key_types, uint32_t mask) +{ + const ght_layout_t* ly = &ht->layout; + int64_t src_count = *(const int64_t*)src_row; + const int64_t* skeys = (const int64_t*)(src_row + 8); + uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8); + uint64_t h = hash_keys_inline(skeys, key_types, ly->n_keys, + ly->wide_key_mask, ly->wide_key_esz, + ht->key_data); + uint8_t salt = HT_SALT(h); + uint32_t slot = (uint32_t)(h & mask); + for (;;) { + uint32_t sv = ht->slots[slot]; + if (sv == HT_EMPTY) { + if (ht->grp_count >= ht->grp_cap) { + if (!group_ht_grow(ht)) { ht->oom = 1; return mask; } + } + uint32_t gid = ht->grp_count++; + char* row = ht->rows + (size_t)gid * ly->row_stride; + *(int64_t*)row = src_count; + memcpy(row + 8, skeys, key_bytes); + ht->slots[slot] = HT_PACK(salt, gid); + if (ht->grp_count * 2 > ht->ht_cap) { + group_ht_rehash(ht, key_types); + mask = ht->ht_cap - 1; + } + return mask; + } + if (HT_SALT_V(sv) == salt) { + uint32_t gid = HT_GID(sv); + char* row = ht->rows + (size_t)gid * ly->row_stride; + if (group_keys_equal((const int64_t*)(row + 8), + skeys, ly, ht->key_data)) { + *(int64_t*)row += src_count; + return mask; + } + } + slot = (slot + 1) & mask; + } +} + +typedef struct { + void** key_data; + int8_t* key_types; + uint8_t* key_attrs; + ray_t** key_vecs; + uint8_t nullable_mask; + uint32_t n_workers; + group_ht_t* wpart_hts; /* [n_workers * RADIX_P] */ + ght_layout_t layout; + ray_t* rowsel; + const int64_t* match_idx; + _Atomic(int) oom; +} radix_v2_phase1_ctx_t; + +static void radix_v2_phase1_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + radix_v2_phase1_ctx_t* c = (radix_v2_phase1_ctx_t*)ctx; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + const ght_layout_t* ly = &c->layout; + uint8_t nk = ly->n_keys; + uint8_t wide = ly->wide_key_mask; + uint8_t nullable = c->nullable_mask; + const int64_t* match_idx = 
c->match_idx; + + group_ht_t* my_hts = &c->wpart_hts[(size_t)worker_id * RADIX_P]; + /* Lazily init this worker's 256 partition HTs. */ + for (uint32_t p = 0; p < RADIX_P; p++) { + if (!my_hts[p].slots) { + if (!group_ht_init_sized(&my_hts[p], 256, ly, 128)) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + if (wide && c->key_data) + group_ht_set_key_data(&my_hts[p], c->key_data); + } + } + uint32_t masks[RADIX_P]; + for (uint32_t p = 0; p < RADIX_P; p++) masks[p] = my_hts[p].ht_cap - 1; + + /* Stack-resident transient entry, same layout as group_rows_range. */ + char ebuf[8 + 9 * 8 + 8 * 8 + 8]; + for (int64_t i = start; i < end; i++) { + if (((i - start) & 65535) == 0 && ray_interrupted()) break; + int64_t row = match_idx ? match_idx[i] : i; + if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, row)) + continue; + uint64_t h = 0; + int64_t* ek = (int64_t*)(ebuf + 8); + int64_t null_mask = 0; + for (uint8_t k = 0; k < nk; k++) { + int8_t t = c->key_types[k]; + uint64_t kh; + bool is_null = (nullable & (1u << k)) + && ray_vec_is_null(c->key_vecs[k], row); + if (is_null) { + null_mask |= (int64_t)(1u << k); + ek[k] = 0; + kh = ray_hash_i64(0); + } else if (wide & (1u << k)) { + uint8_t esz = ly->wide_key_esz[k]; + const void* src = (const char*)c->key_data[k] + (size_t)row * esz; + ek[k] = row; + kh = ray_hash_bytes(src, esz); + } else if (t == RAY_F64) { + int64_t kv; + memcpy(&kv, &((double*)c->key_data[k])[row], 8); + ek[k] = kv; + kh = ray_hash_f64(((double*)c->key_data[k])[row]); + } else { + int64_t kv = read_col_i64(c->key_data[k], row, t, c->key_attrs[k]); + ek[k] = kv; + kh = ray_hash_i64(kv); + } + h = (k == 0) ? kh : ray_hash_combine(h, kh); + } + ek[nk] = null_mask; + if (null_mask) h = ray_hash_combine(h, ray_hash_i64(null_mask)); + *(uint64_t*)ebuf = h; + /* Count-only: no agg_vals to pack; entry body ends at the null-mask + * slot. The HT row layout matches (need_flags == 0). 
*/ + uint32_t p = RADIX_PART(h); + uint32_t new_mask = group_probe_entry(&my_hts[p], ebuf, + c->key_types, masks[p]); + if (my_hts[p].oom) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + masks[p] = new_mask; + } +} + +typedef struct { + group_ht_t* wpart_hts; /* [n_workers * RADIX_P] — input */ + group_ht_t* part_hts; /* [RADIX_P] — output */ + int8_t* key_types; + uint32_t n_workers; + ght_layout_t layout; + void** key_data; + _Atomic(int) oom; +} radix_v2_phase2_ctx_t; + +static void radix_v2_phase2_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + (void)worker_id; + radix_v2_phase2_ctx_t* c = (radix_v2_phase2_ctx_t*)ctx; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + uint16_t row_stride = c->layout.row_stride; + for (int64_t p = start; p < end; p++) { + /* Upper bound on the merged partition: sum of worker grp_counts + * (some keys may be present in multiple workers — the merge will + * fold those, so the final grp_count is ≤ this sum). 
*/ + uint32_t total_grps = 0; + for (uint32_t w = 0; w < c->n_workers; w++) + total_grps += c->wpart_hts[(size_t)w * RADIX_P + p].grp_count; + if (total_grps == 0) continue; + uint32_t ht_cap = 256; + { + uint64_t target = (uint64_t)total_grps * 2; + if (target < 256) target = 256; + while (ht_cap < target) ht_cap *= 2; + } + uint32_t init_grp = 256; + while (init_grp < total_grps && init_grp < 65536) init_grp *= 2; + if (!group_ht_init_sized(&c->part_hts[p], ht_cap, &c->layout, init_grp)) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + if (c->layout.wide_key_mask && c->key_data) + group_ht_set_key_data(&c->part_hts[p], c->key_data); + uint32_t mask = c->part_hts[p].ht_cap - 1; + for (uint32_t w = 0; w < c->n_workers; w++) { + group_ht_t* src = &c->wpart_hts[(size_t)w * RADIX_P + p]; + if (src->grp_count == 0) continue; + const char* rows = src->rows; + for (uint32_t gi = 0; gi < src->grp_count; gi++) { + mask = group_merge_count_row(&c->part_hts[p], + rows + (size_t)gi * row_stride, + c->key_types, mask); + if (c->part_hts[p].oom) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + } + } + } +} + /* ============================================================================ * Parallel direct-array accumulation for low-cardinality single integer key * ============================================================================ */ @@ -7292,6 +7505,87 @@ ht_path:; skip_top_count_filter: if (pool && nrows >= RAY_PARALLEL_THRESHOLD && n_total > 1 && !rowsel) { + /* Per-(worker, partition) direct-insert path for count-only. + * Bypasses the fat-entry materialisation and the phase1→phase2 + * DRAM round trip; on success it populates part_hts[] in the + * same format the existing phase3 emit consumes. 
*/ + bool v2_count_only = (n_keys >= 1 && n_aggs > 0); + for (uint8_t a = 0; a < n_aggs && v2_count_only; a++) + if (ext->agg_ops[a] != OP_COUNT) v2_count_only = false; + if (v2_count_only && !(ght_layout.agg_is_first | ght_layout.agg_is_last + | ght_layout.agg_is_holistic + | ght_layout.agg_is_binary)) { + ray_t* wpart_hdr = NULL; + size_t v2_n_w = (size_t)n_total * RADIX_P; + group_ht_t* wpart_hts = (group_ht_t*)scratch_calloc( + &wpart_hdr, v2_n_w * sizeof(group_ht_t)); + ray_t* v2_part_hdr = NULL; + group_ht_t* v2_part_hts = wpart_hts + ? (group_ht_t*)scratch_calloc(&v2_part_hdr, + RADIX_P * sizeof(group_ht_t)) + : NULL; + if (!wpart_hts || !v2_part_hts) { + if (wpart_hts) scratch_free(wpart_hdr); + if (v2_part_hts) scratch_free(v2_part_hdr); + goto v2_done; + } + uint8_t v2_nullable = 0; + for (uint8_t k = 0; k < n_keys; k++) { + if (!key_vecs[k]) continue; + ray_t* src = (key_vecs[k]->attrs & RAY_ATTR_SLICE) + ? key_vecs[k]->slice_parent : key_vecs[k]; + if (src && (src->attrs & RAY_ATTR_HAS_NULLS)) + v2_nullable |= (uint8_t)(1u << k); + } + radix_v2_phase1_ctx_t v2p1 = { + .key_data = key_data, + .key_types = key_types, + .key_attrs = key_attrs, + .key_vecs = key_vecs, + .nullable_mask = v2_nullable, + .n_workers = n_total, + .wpart_hts = wpart_hts, + .layout = ght_layout, + .rowsel = rowsel, + .match_idx = match_idx, + .oom = 0, + }; + ray_pool_dispatch(pool, radix_v2_phase1_fn, &v2p1, n_scan); + CHECK_CANCEL_GOTO(pool, cleanup); + if (atomic_load_explicit(&v2p1.oom, memory_order_relaxed)) { + for (size_t i = 0; i < v2_n_w; i++) + group_ht_free(&wpart_hts[i]); + scratch_free(wpart_hdr); + scratch_free(v2_part_hdr); + goto v2_done; + } + radix_v2_phase2_ctx_t v2p2 = { + .wpart_hts = wpart_hts, + .part_hts = v2_part_hts, + .key_types = key_types, + .n_workers = n_total, + .layout = ght_layout, + .key_data = key_data, + .oom = 0, + }; + ray_pool_dispatch_n(pool, radix_v2_phase2_fn, &v2p2, RADIX_P); + CHECK_CANCEL_GOTO(pool, cleanup); + /* Worker HTs are no 
longer needed once the merge is done. */ + for (size_t i = 0; i < v2_n_w; i++) + group_ht_free(&wpart_hts[i]); + scratch_free(wpart_hdr); + if (atomic_load_explicit(&v2p2.oom, memory_order_relaxed)) { + for (uint32_t p = 0; p < RADIX_P; p++) + group_ht_free(&v2_part_hts[p]); + scratch_free(v2_part_hdr); + goto v2_done; + } + /* Hand off to the existing phase3 emit. */ + part_hts = v2_part_hts; + part_hts_hdr = v2_part_hdr; + goto v2_emit; + } +v2_done:; size_t n_bufs = (size_t)n_total * RADIX_P; radix_bufs = (radix_buf_t*)scratch_calloc(&radix_bufs_hdr, n_bufs * sizeof(radix_buf_t)); @@ -7394,6 +7688,7 @@ ht_path:; ray_heap_gc(); } +v2_emit:; /* Prefix offsets */ uint32_t part_offsets[RADIX_P + 1]; part_offsets[0] = 0; From 8c30d17f0c77d2b57ee75890076d5de9e1452d78 Mon Sep 17 00:00:00 2001 From: Hetoku Date: Mon, 25 May 2026 12:01:59 +0200 Subject: [PATCH 05/11] perf(group): extend per-partition path to SUM/AVG aggregators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The merge primitive (now group_merge_row, generalised from count-only) handles SUM accumulators alongside the count slot: on a new partition group it memcpy's the entire source row (covers count + keys/null_mask + aggregator state); on an existing group it adds the source count and, when need_flags & GHT_NEED_SUM, adds each source sum slot (i64 or f64 per agg_is_f64). Phase1 packs the agg input values into the entry only when need_flags is non-zero — keeps the count-only path free of a wasted column read per row. Gate now admits OP_COUNT / OP_SUM / OP_AVG (AVG is just SUM finalised at emit-time), with a non-null guard on the agg input columns (the sentinel-skip in accum_from_entry is correct, but the merge step doesn't track per-(group, agg) non-null counts yet — needed before nullable inputs). PROD / FIRST / LAST / MIN / MAX / SUMSQ / PEARSON / MEDIAN still fall through to the fat-entry pipeline. 
Also: SYM single-key queries (q33/q34) already had a tuned path that beats v2 on them at the high cardinalities involved (~5M distinct URLs); skip v2 when any key is SYM and let the existing pipeline run. Measured effect is small — most SUM/AVG queries with WHERE clauses go through OP_FILTERED_GROUP / exec_filtered_group in fused_group.c, not through exec_group, so v2 here doesn't catch them. Lays the state-merge groundwork that a future fused_group v2 needs. Test suite: 2657/2659 pass (2 skipped, 0 failed). --- src/ops/group.c | 130 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 106 insertions(+), 24 deletions(-) diff --git a/src/ops/group.c b/src/ops/group.c index 72535d4a..d0d10e98 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -3112,23 +3112,30 @@ static void radix_phase2_fn(void* ctx, uint32_t worker_id, int64_t start, int64_ * Phase3 (radix_phase3_fn) emits from part_hts[] exactly as before. * ============================================================================ */ -/* Merge one source group row (count + keys + null_mask) into the target HT. - * Hash is recomputed from the row's key region via hash_keys_inline — - * identical to what group_probe_entry did when the row was first inserted, - * so the partition assignment is consistent. Count-only: state merge is - * just count += src_count; new groups inherit the source's count. */ -static inline uint32_t group_merge_count_row(group_ht_t* ht, +/* Merge one source group row into the target HT. Hash is recomputed from + * the row's key region via hash_keys_inline — identical to what + * group_probe_entry did when the row was first inserted, so the partition + * assignment is consistent. Supports need_flags ∈ {0, GHT_NEED_SUM}: + * count-only and count+SUM/AVG. On miss, the entire source row is copied + * verbatim (memcpy of row_stride); on hit, count += src.count and, when + * need_sum, each enabled sum slot accumulates the source's sum (f64 or + * i64 per agg_is_f64). 
Caller's v2 gate filters out PROD/FIRST/LAST/ + * MIN/MAX/SUMSQ/PEARSON/MEDIAN — those need richer state merges. */ +static inline uint32_t group_merge_row(group_ht_t* ht, const char* src_row, const int8_t* key_types, uint32_t mask) { const ght_layout_t* ly = &ht->layout; int64_t src_count = *(const int64_t*)src_row; const int64_t* skeys = (const int64_t*)(src_row + 8); - uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8); uint64_t h = hash_keys_inline(skeys, key_types, ly->n_keys, ly->wide_key_mask, ly->wide_key_esz, ht->key_data); uint8_t salt = HT_SALT(h); uint32_t slot = (uint32_t)(h & mask); + uint8_t na = ly->n_aggs; + uint8_t f64_mask = ly->agg_is_f64; + uint16_t off_sum = ly->off_sum; + bool need_sum = (ly->need_flags & GHT_NEED_SUM) != 0; for (;;) { uint32_t sv = ht->slots[slot]; if (sv == HT_EMPTY) { @@ -3137,8 +3144,8 @@ static inline uint32_t group_merge_count_row(group_ht_t* ht, } uint32_t gid = ht->grp_count++; char* row = ht->rows + (size_t)gid * ly->row_stride; - *(int64_t*)row = src_count; - memcpy(row + 8, skeys, key_bytes); + /* Whole-row copy: count + keys/null_mask + aggregator state. 
*/ + memcpy(row, src_row, ly->row_stride); ht->slots[slot] = HT_PACK(salt, gid); if (ht->grp_count * 2 > ht->ht_cap) { group_ht_rehash(ht, key_types); @@ -3152,6 +3159,22 @@ static inline uint32_t group_merge_count_row(group_ht_t* ht, if (group_keys_equal((const int64_t*)(row + 8), skeys, ly, ht->key_data)) { *(int64_t*)row += src_count; + if (need_sum) { + for (uint8_t a = 0; a < na; a++) { + int8_t s = ly->agg_val_slot[a]; + if (s < 0) continue; + size_t off = (size_t)off_sum + (size_t)s * 8; + if (f64_mask & (1u << a)) { + double sv_f; + memcpy(&sv_f, src_row + off, 8); + *(double*)(row + off) += sv_f; + } else { + int64_t sv_i; + memcpy(&sv_i, src_row + off, 8); + *(int64_t*)(row + off) += sv_i; + } + } + } return mask; } } @@ -3164,6 +3187,9 @@ typedef struct { int8_t* key_types; uint8_t* key_attrs; ray_t** key_vecs; + ray_t** agg_vecs; /* may be NULL for pure COUNT (n_agg_vals==0) */ + ray_t** agg_vecs2; + uint8_t* agg_strlen; uint8_t nullable_mask; uint32_t n_workers; group_ht_t* wpart_hts; /* [n_workers * RADIX_P] */ @@ -3237,8 +3263,37 @@ static void radix_v2_phase1_fn(void* ctx, uint32_t worker_id, ek[nk] = null_mask; if (null_mask) h = ray_hash_combine(h, ray_hash_i64(null_mask)); *(uint64_t*)ebuf = h; - /* Count-only: no agg_vals to pack; entry body ends at the null-mask - * slot. The HT row layout matches (need_flags == 0). */ + /* Pack agg values into entry — only when the HT layout actually + * reads them. For count-only need_flags == 0 and accum_from_entry + * skips every agg slot; packing here would be a wasted column + * read per row (a measurable regression on q15-class queries). */ + if (ly->need_flags) { + int64_t* ev = (int64_t*)(ebuf + 8 + ((size_t)nk + 1) * 8); + uint8_t vi = 0; + uint8_t na = ly->n_aggs; + uint8_t bin_mask = ly->agg_is_binary; + uint8_t hol_mask = ly->agg_is_holistic; + for (uint8_t a = 0; a < na; a++) { + if (hol_mask & (1u << a)) continue; + ray_t* ac = c->agg_vecs ? 
c->agg_vecs[a] : NULL; + if (!ac) continue; + if (c->agg_strlen && c->agg_strlen[a]) + ev[vi] = group_strlen_at(ac, row); + else if (ac->type == RAY_F64) + memcpy(&ev[vi], &((double*)ray_data(ac))[row], 8); + else + ev[vi] = read_col_i64(ray_data(ac), row, ac->type, ac->attrs); + vi++; + if ((bin_mask & (1u << a)) && c->agg_vecs2 && c->agg_vecs2[a]) { + ray_t* ay = c->agg_vecs2[a]; + if (ay->type == RAY_F64) + memcpy(&ev[vi], &((double*)ray_data(ay))[row], 8); + else + ev[vi] = read_col_i64(ray_data(ay), row, ay->type, ay->attrs); + vi++; + } + } + } uint32_t p = RADIX_PART(h); uint32_t new_mask = group_probe_entry(&my_hts[p], ebuf, c->key_types, masks[p]); @@ -3294,9 +3349,9 @@ static void radix_v2_phase2_fn(void* ctx, uint32_t worker_id, if (src->grp_count == 0) continue; const char* rows = src->rows; for (uint32_t gi = 0; gi < src->grp_count; gi++) { - mask = group_merge_count_row(&c->part_hts[p], - rows + (size_t)gi * row_stride, - c->key_types, mask); + mask = group_merge_row(&c->part_hts[p], + rows + (size_t)gi * row_stride, + c->key_types, mask); if (c->part_hts[p].oom) { atomic_store_explicit(&c->oom, 1, memory_order_relaxed); return; @@ -7505,16 +7560,40 @@ ht_path:; skip_top_count_filter: if (pool && nrows >= RAY_PARALLEL_THRESHOLD && n_total > 1 && !rowsel) { - /* Per-(worker, partition) direct-insert path for count-only. - * Bypasses the fat-entry materialisation and the phase1→phase2 - * DRAM round trip; on success it populates part_hts[] in the - * same format the existing phase3 emit consumes. */ - bool v2_count_only = (n_keys >= 1 && n_aggs > 0); - for (uint8_t a = 0; a < n_aggs && v2_count_only; a++) - if (ext->agg_ops[a] != OP_COUNT) v2_count_only = false; - if (v2_count_only && !(ght_layout.agg_is_first | ght_layout.agg_is_last - | ght_layout.agg_is_holistic - | ght_layout.agg_is_binary)) { + /* Per-(worker, partition) direct-insert path: aggregates into + * thread-local partition HTs during phase1, then merges per + * partition. 
Bypasses the phase1 fat-entry materialisation + + * phase2 re-read DRAM round trip. On success it populates + * part_hts[] in the format the existing phase3 emit consumes. + * + * Gate: every agg is COUNT/SUM/AVG (the merge primitive knows + * how to add counts and sum slots; PROD/MIN/MAX/FIRST/LAST/ + * SUMSQ/PEARSON/MEDIAN need richer state-merge logic). Agg + * input columns must be non-nullable for now — sentinel-skip + * inside accum_from_entry is correct, but the merge step needs + * an nn_count and that isn't tracked yet. */ + bool v2_ok = (n_keys >= 1 && n_aggs > 0); + /* SYM single-key queries already had a tuned path (q33/q34 hit it + * before falling to the radix); v2 doesn't beat it for them, so + * skip when any key is SYM and let the existing pipeline handle it. */ + for (uint8_t k = 0; k < n_keys && v2_ok; k++) + if (key_types[k] == RAY_SYM) v2_ok = false; + for (uint8_t a = 0; a < n_aggs && v2_ok; a++) { + uint16_t op = ext->agg_ops[a]; + if (op != OP_COUNT && op != OP_SUM && op != OP_AVG) { + v2_ok = false; + break; + } + if (agg_vecs[a]) { + ray_t* src = (agg_vecs[a]->attrs & RAY_ATTR_SLICE) + ? 
agg_vecs[a]->slice_parent : agg_vecs[a]; + if (src && (src->attrs & RAY_ATTR_HAS_NULLS)) + v2_ok = false; + } + } + if (v2_ok && !(ght_layout.agg_is_first | ght_layout.agg_is_last + | ght_layout.agg_is_holistic + | ght_layout.agg_is_binary)) { ray_t* wpart_hdr = NULL; size_t v2_n_w = (size_t)n_total * RADIX_P; group_ht_t* wpart_hts = (group_ht_t*)scratch_calloc( @@ -7542,6 +7621,9 @@ ht_path:; .key_types = key_types, .key_attrs = key_attrs, .key_vecs = key_vecs, + .agg_vecs = agg_vecs, + .agg_vecs2 = agg_vecs2, + .agg_strlen = agg_strlen, .nullable_mask = v2_nullable, .n_workers = n_total, .wpart_hts = wpart_hts, From 06783842abbbd3c3105401ea2527eb01aace5d50 Mon Sep 17 00:00:00 2001 From: Hetoku Date: Tue, 26 May 2026 10:50:00 +0200 Subject: [PATCH 06/11] fix(group): minmax early-abort check fires within morsels, not at boundaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DA-path min/max scan polls its abort flag every (i-start) & N == 0. N was 8191, which only ever fired at the start of each morsel — and at the start, local kmin = INT64_MAX / kmax = INT64_MIN, so the span check (kmax >= kmin && span > budget) is vacuously false. Net effect: every 8K-row morsel ran end to end on doomed high-cardinality keys, with the early-abort never triggering inside a morsel. Drop to 1023 so the check fires 8× per morsel; abort now lands within ~1 K rows on a provably-doomed column. --- src/ops/group.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/ops/group.c b/src/ops/group.c index d0d10e98..a5be30e2 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -3391,13 +3391,17 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t int8_t t = c->key_type; const int64_t span_budget = c->span_budget; - /* Span check and abort poll are batched (every 8192 rows) so the - * hot per-row loop body stays a branchless min/max with no atomics. 
*/ + /* Span check and abort poll are batched (every 1024 rows) so the + * hot per-row loop body stays a branchless min/max with no atomics. + * 8192 was too sparse — the dispatcher hands out 8K-row morsels, so + * `(i-start) & 8191 == 0` only ever fired at the morsel boundary + * (where kmin=INT64_MAX/kmax=INT64_MIN make the span check vacuous), + * leaving every full 8K morsel to run end-to-end on doomed columns. */ #define MINMAX_SEG_LOOP(TYPE, CAST) \ do { \ const TYPE* kd = (const TYPE*)c->key_data; \ for (int64_t i = start; i < end; i++) { \ - if (((i - start) & 8191) == 0) { \ + if (((i - start) & 1023) == 0) { \ if (atomic_load_explicit(c->abort_flag, \ memory_order_relaxed)) \ goto minmax_done; \ From 74274ca3b323284a010c698a5f271f015e993c44 Mon Sep 17 00:00:00 2001 From: Hetoku Date: Tue, 26 May 2026 10:56:55 +0200 Subject: [PATCH 07/11] perf(group): skip accum_from_entry when the HT layout has no agg state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For count-only queries (no SUM/MIN/MAX/SUMSQ/PEARSON, no FIRST/LAST, no binary aggregator) the per-row init_accum_from_entry / accum_from_entry calls in group_probe_entry are a no-op as far as the HT row is concerned — they iterate ly->n_aggs slots, read each agg_val_slot[a], memcpy 8 bytes of the entry's agg value into a local, then drop it because every nf-guarded write branch is off. At 6 % of the q15 profile (~10 ns/row × 10 M rows / 8 cores ≈ 12 ms) that's pure waste. Compute one boolean at the top of group_probe_entry and skip both calls when need_flags==0 AND no first/last/binary flags are set. Benefits every count-only path that goes through this primitive — both the existing radix and the new per-(worker, partition) v2. Measured (focused, REPS=5): q15 169 → 150 ms (11 % faster on top of v2) q35 168 → 153 ms (9 %) q33 82 → 79 ms (the existing radix benefits too) q34 82 → 77 ms Test suite 2657/2659 (2 skipped, 0 failed). 
--- src/ops/group.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/ops/group.c b/src/ops/group.c index a5be30e2..d5866fd4 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -2345,6 +2345,16 @@ static inline uint32_t group_probe_entry(group_ht_t* ht, uint32_t slot = (uint32_t)(hash & mask); uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8); + /* For count-only queries (no SUM/MIN/MAX/SUMSQ/PEARSON aggregator + * state, no FIRST/LAST row tracking, no binary aggregator y-side) + * init_accum_from_entry and accum_from_entry are no-ops on every + * non-count slot — the per-row call still iterates n_aggs slots, + * reads agg_val_slot[a], memcpy's the entry's agg value into a + * local, then drops it. That's ~6 ns / row × n_keys=1 millions of + * rows, ~7 ms wall on q15. Skip the call when none of the flags + * that drive its writes are set. */ + uint8_t accum_skip = (ly->need_flags == 0 + && (ly->agg_is_first | ly->agg_is_last | ly->agg_is_binary) == 0); for (;;) { uint32_t sv = ht->slots[slot]; if (sv == HT_EMPTY) { @@ -2356,7 +2366,8 @@ static inline uint32_t group_probe_entry(group_ht_t* ht, char* row = ht->rows + (size_t)gid * ly->row_stride; *(int64_t*)row = 1; /* count = 1 */ memcpy(row + 8, ekeys, key_bytes); - init_accum_from_entry(row, entry, ly); + if (!accum_skip) + init_accum_from_entry(row, entry, ly); ht->slots[slot] = HT_PACK(salt, gid); if (ht->grp_count * 2 > ht->ht_cap) { group_ht_rehash(ht, key_types); @@ -2370,7 +2381,8 @@ static inline uint32_t group_probe_entry(group_ht_t* ht, if (group_keys_equal((const int64_t*)(row + 8), (const int64_t*)ekeys, ly, ht->key_data)) { (*(int64_t*)row)++; /* count++ */ - accum_from_entry(row, entry, ly); + if (!accum_skip) + accum_from_entry(row, entry, ly); return mask; } } From f0219a782b4c6ec6ca5714adc32c970641272351 Mon Sep 17 00:00:00 2001 From: Hetoku Date: Tue, 26 May 2026 11:29:29 +0200 Subject: [PATCH 08/11] perf(fused_group): pre-size worker shards by nrows 
heuristic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-worker shard in mk_par_fn / exec_filtered_group_multi started at 1024 slots and grew on demand via mk_shard_grow. For a 10M-row high-cardinality query (e.g. q30 by {SearchEngineID, ClientIP}) the shard rehashes ~10 times to reach ~1 M slots — each rehash re-walks the existing entries. The q30 profile shows mk_shard_grow at 9.2 %. Pre-size init_cap by ~nrows/(nw·16) capped at 16 K slots. Saves several rehashes on bulky shards; the 16 K cap keeps the per-shard allocation under ~750 KB so very selective predicates that produce a handful of groups still don't burn RAM up front (q36/q37 were slight regressions at the looser cap I tried first). Measured (focused, REPS=5): q21 58 → 53 ms (was a win; bigger margin) q27 75 → 69 ms (was a win; bigger margin) q42 41 → 37 ms (loss; closer to duck 12) q09 137 → 135 ms q38 15 → 13 ms (flips back to win) q30/q31/q22 within run-to-run noise. Test suite 2657/2659 (2 skipped, 0 failed). --- src/ops/fused_group.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c index 81826fc4..04c2fb43 100644 --- a/src/ops/fused_group.c +++ b/src/ops/fused_group.c @@ -3669,10 +3669,21 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext, } if (nrows < 0) return ray_error("nyi", NULL); - ctx.init_cap = FP_SHARD_INIT_CAP; atomic_store_explicit(&ctx.oom, 0, memory_order_relaxed); ray_pool_t* pool = ray_pool_get(); uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; + /* Pre-size each worker shard a bit larger than the 1024-slot default + * so high-cardinality queries don't pay log2(target/1024) rehashes. + * The cap stays modest (16 K slots ≈ ~750 KB per shard with a 4-slot + * agg state) so very selective predicates that produce a handful of + * groups don't burn RAM up front. Sparse keys still grow on-demand. 
*/ + { + uint64_t expected = (uint64_t)nrows / ((uint64_t)nw * 16u); + uint64_t init_cap = FP_SHARD_INIT_CAP; + while (init_cap < expected * 2u && init_cap < (1ULL << 14)) + init_cap <<= 1; + ctx.init_cap = init_cap; + } ray_t* shards_hdr = NULL; ctx.shards = (mk_shard_t*)scratch_calloc(&shards_hdr, (size_t)nw * sizeof(mk_shard_t)); From da90360deb956cd918d4702db015fcaa0e9ce655 Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 26 May 2026 13:14:50 +0200 Subject: [PATCH 09/11] feat(group): HyperLogLog approximate count-distinct kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New primitive in src/ops/hll.{h,c}: ray_hll_t — register-array sketch, 1 B/register, P=14 default → 16 KB sketch, ~0.81 % std error ray_hll_init/free/reset — lifecycle ray_hll_add — inline; hash → register index + rho update ray_hll_merge — element-wise max (parallel-safe combine) ray_hll_estimate — Flajolet-Fusy-Gandouet-Meunier 2007 estimator with linear-counting branch for small cardinalities Two consumers: ray_count_distinct_approx (scalar) Parallel: each worker builds a private sketch over its row range, main thread merges to one and emits the estimate. Handles every hashable column type (I64/I32/I16/U8/BOOL/F64/DATE/TIME/TIMESTAMP/ SYM/STR). Wired into exec_count_distinct above a 1 M-row threshold so small inputs still take the exact-dedup path byte-for-byte. ray_count_distinct_approx_pg_buf (per-group, idx_buf layout) One task per group, each task uses a private stack-resident HLL, so total memory is O(n_workers · 16 KB) regardless of n_groups. Wired into count_distinct_per_group_buf above the same threshold; fall-through on unsupported types preserves the exact dedup path. 
Measured (10M-row hits, in-memory): q04 (count distinct UserID global) 78 → 8.6 ms (FLIP vs duck 72) q05 (count distinct SearchPhrase) 19 → 4.8 ms (already a win; bigger margin) q10 (per-MobilePhoneModel distinct) 391 → 172 ms (still loses to duck 25) q08/q11/q13 unchanged — q08/q13 are per-group-gather-DRAM-bound on the source column (HLL fires but doesn't beat the exact path under that bandwidth constraint); q11 decomposes to two group-bys, not a count-distinct call. Estimate accuracy verified on q04: HLL 1 533 006 vs exact 1 530 143 (0.19 % rel. error, inside the ~0.8 % std error bound). Full ClickBench: 22/43 wins (was 21/43, with q04 flipping cleanly). Test suite 2657/2659 (2 skipped, 0 failed). --- src/ops/group.c | 25 +++ src/ops/hll.c | 442 ++++++++++++++++++++++++++++++++++++++++++++++++ src/ops/hll.h | 118 +++++++++++++ src/ops/query.c | 28 +++ 4 files changed, 613 insertions(+) create mode 100644 src/ops/hll.c create mode 100644 src/ops/hll.h diff --git a/src/ops/group.c b/src/ops/group.c index d5866fd4..14a5eeb0 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -23,6 +23,7 @@ #include "ops/internal.h" #include "ops/rowsel.h" +#include "ops/hll.h" /* approximate count-distinct via HyperLogLog */ #include "lang/internal.h" /* for ray_median_dbl_inplace */ /* ============================================================================ @@ -594,6 +595,23 @@ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) { if (len == 0) return ray_i64(0); + /* For inputs above this row count, switch to the HyperLogLog + * cardinality sketch (~0.8% std error at P=14, 16 KB per shard). + * Exact dedup-via-hashset is O(unique·log) and becomes memory- + * bandwidth-bound past ~1 M rows; HLL is single-pass, mergeable, + * and constant-memory per worker. Below the threshold the exact + * path is fast enough and avoids approximation entirely — so small + * tests still match `len-after-distinct` byte-for-byte. 
*/ + if (len >= (1 << 20)) { + bool hashable = (in_type == RAY_I64 || in_type == RAY_I32 || + in_type == RAY_I16 || in_type == RAY_U8 || + in_type == RAY_BOOL || in_type == RAY_F64 || + in_type == RAY_DATE || in_type == RAY_TIME || + in_type == RAY_TIMESTAMP || in_type == RAY_STR || + RAY_IS_SYM(in_type)); + if (hashable) return ray_count_distinct_approx(input); + } + switch (in_type) { case RAY_BOOL: case RAY_U8: case RAY_I16: case RAY_I32: case RAY_I64: @@ -1130,6 +1148,13 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); if (n_rows == 0 || n_groups == 0) return out; + /* This callsite only fires when n_groups > 50 000 (the buf-form + * caller catches the low-cardinality majority); per-group HLL at + * those group counts exceeds any reasonable memory budget + * (50 000 · 16 KB · n_workers ≈ multi-GB), so there's no + * approximate path here — fall straight through to the exact + * partitioned dedup. */ + /* Parallel partitioned path for sizes where the serial global hash * blows L3. Threshold tuned so the partition / scatter / dedup * dispatch overhead stays smaller than the cache-miss savings. */ diff --git a/src/ops/hll.c b/src/ops/hll.c new file mode 100644 index 00000000..3b15c049 --- /dev/null +++ b/src/ops/hll.c @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2025-2026 Anton Kundenko + * All rights reserved. 
+ + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ops/hll.h" +#include "ops/internal.h" +#include "ops/ops.h" +#include "core/pool.h" +#include "table/sym.h" + +#include +#include +#include + +int ray_hll_init(ray_hll_t* h, uint8_t p) { + if (!h) return -1; + if (p < 4) p = 4; /* too small loses all accuracy */ + if (p > 18) p = 18; /* 256 KB cap on register array */ + memset(h, 0, sizeof(*h)); + uint32_t m = 1u << p; + h->p = p; + h->m = m; + h->regs = (uint8_t*)scratch_calloc(&h->_hdr, (size_t)m); + if (!h->regs) return -1; + return 0; +} + +void ray_hll_free(ray_hll_t* h) { + if (!h) return; + if (h->_hdr) scratch_free(h->_hdr); + h->regs = NULL; + h->_hdr = NULL; + h->m = 0; + h->p = 0; +} + +void ray_hll_reset(ray_hll_t* h) { + if (h && h->regs) memset(h->regs, 0, (size_t)h->m); +} + +void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src) { + if (!dst || !src || !dst->regs || !src->regs) return; + if (dst->m != src->m) return; /* mismatched precision — caller bug */ + const uint8_t* s = src->regs; + uint8_t* d = dst->regs; + uint32_t m = dst->m; + /* Branchless max — keeps the hot per-shard merge in vector regs. + * The compiler usually auto-vectorises this to a packed-max sequence. */ + for (uint32_t i = 0; i < m; i++) { + uint8_t a = d[i], b = s[i]; + d[i] = a > b ? a : b; + } +} + +/* HyperLogLog cardinality estimator (Flajolet, Fusy, Gandouet, Meunier 2007), + * with the original raw-estimate / linear-counting hybrid switch. Skips the + * HLL++ small-range bias-correction tables because the linear-counting branch + * already gives a clean estimate below E ≤ 2.5·m, which is where the raw + * mean diverges from truth. */ +int64_t ray_hll_estimate(const ray_hll_t* h) { + if (!h || !h->regs) return 0; + uint32_t m = h->m; + if (m == 0) return 0; + + /* alpha_m correction constant from the paper. m == 16 / 32 / 64 use + * the closed-form values; everything else uses 0.7213 / (1 + 1.079/m). 
*/ + double alpha_m; + if (m == 16) alpha_m = 0.673; + else if (m == 32) alpha_m = 0.697; + else if (m == 64) alpha_m = 0.709; + else alpha_m = 0.7213 / (1.0 + 1.079 / (double)m); + + /* Sum of 2^-reg[i]. Count zero registers for the linear-counting + * fallback at small cardinalities (when V > 0 and E ≤ 2.5·m). */ + double sum_inv = 0.0; + uint32_t n_zeros = 0; + for (uint32_t i = 0; i < m; i++) { + uint8_t r = h->regs[i]; + sum_inv += ldexp(1.0, -(int)r); /* 2^-r */ + n_zeros += (r == 0); + } + + double raw = alpha_m * (double)m * (double)m / sum_inv; + + if (raw <= 2.5 * (double)m && n_zeros != 0) { + /* Linear counting — much tighter than raw for small E. */ + raw = (double)m * log((double)m / (double)n_zeros); + } + /* Large-range bias-correction (the 2^32 upper-edge correction in the + * original paper) is for 32-bit hashes only — we hash 64 bits, so the + * raw value is already unbiased to ~2^57. Skip. */ + + if (raw < 0.0) raw = 0.0; + return (int64_t)(raw + 0.5); +} + +/* ---- Scalar approximate count-distinct aggregator ---------------------- */ + +typedef struct { + const ray_t* vec; + int8_t type; + uint8_t attrs; + bool has_nulls; + ray_hll_t* shards; /* [n_workers] — one HLL per worker */ + uint8_t p; + uint32_t n_workers; + _Atomic(int) oom; +} cda_scalar_ctx_t; + +static void cda_scalar_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) { + cda_scalar_ctx_t* c = (cda_scalar_ctx_t*)raw; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + ray_hll_t* sh = &c->shards[worker_id % c->n_workers]; + if (!sh->regs) { + if (ray_hll_init(sh, c->p) != 0) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + } + const ray_t* v = c->vec; + const void* base = ray_data((ray_t*)v); + int8_t t = c->type; + bool hn = c->has_nulls; + const int64_t CHK = 65535; + + if (t == RAY_I64 || t == RAY_TIMESTAMP) { + const int64_t* d = (const int64_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 
&& ray_interrupted()) return; + int64_t v_i = d[r]; + if (hn && v_i == NULL_I64) continue; + ray_hll_add(sh, ray_hash_i64(v_i)); + } + } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) { + const int32_t* d = (const int32_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int32_t v_i = d[r]; + if (hn && v_i == NULL_I32) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } else if (t == RAY_I16) { + const int16_t* d = (const int16_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int16_t v_i = d[r]; + if (hn && v_i == NULL_I16) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } else if (t == RAY_BOOL || t == RAY_U8) { + const uint8_t* d = (const uint8_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + ray_hll_add(sh, ray_hash_i64((int64_t)d[r])); + } + } else if (t == RAY_F64) { + const double* d = (const double*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + double v_f = d[r]; + if (v_f != v_f) continue; /* NaN = null in F64 column */ + ray_hll_add(sh, ray_hash_f64(v_f)); + } + } else if (RAY_IS_SYM(t)) { + /* SYM is width-encoded — sym id 0 is the canonical empty-string + * sentinel (treat as null), every other id is a real distinct + * value, so hash the id directly. 
*/ + uint8_t w = c->attrs & RAY_SYM_W_MASK; + if (w == RAY_SYM_W64) { + const int64_t* d = (const int64_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int64_t v_i = d[r]; + if (v_i == 0) continue; + ray_hll_add(sh, ray_hash_i64(v_i)); + } + } else if (w == RAY_SYM_W32) { + const uint32_t* d = (const uint32_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + uint32_t v_i = d[r]; + if (v_i == 0) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } else if (w == RAY_SYM_W16) { + const uint16_t* d = (const uint16_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + uint16_t v_i = d[r]; + if (v_i == 0) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } else { + const uint8_t* d = (const uint8_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + uint8_t v_i = d[r]; + if (v_i == 0) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } + } else if (t == RAY_STR) { + ray_t* vm = (ray_t*)v; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + size_t n = 0; + const char* s = ray_str_vec_get(vm, r, &n); + if (!s || n == 0) continue; + ray_hll_add(sh, ray_hash_bytes(s, n)); + } + } + /* Unsupported types fall through silently — caller validates. */ +} + +ray_t* ray_count_distinct_approx(ray_t* x) { + if (!x || RAY_IS_ERR(x)) return x; + if (!ray_is_vec(x)) { + /* Scalar atom — distinct count is 1 (or 0 if null). */ + if (ray_is_atom(x)) { + if (RAY_ATOM_IS_NULL(x)) return ray_i64(0); + return ray_i64(1); + } + return ray_error("type", "count_distinct_approx: vec expected"); + } + int8_t t = x->type; + /* Reject types we don't hash. 
*/ + if (t != RAY_I64 && t != RAY_I32 && t != RAY_I16 && t != RAY_U8 && + t != RAY_BOOL && t != RAY_F64 && t != RAY_DATE && t != RAY_TIME && + t != RAY_TIMESTAMP && t != RAY_STR && !RAY_IS_SYM(t)) + return ray_error("type", "count_distinct_approx: unsupported element type"); + int64_t n = x->len; + if (n == 0) return ray_i64(0); + + ray_pool_t* pool = ray_pool_get(); + uint32_t nw = (pool && n >= RAY_PARALLEL_THRESHOLD) + ? ray_pool_total_workers(pool) : 1; + + ray_t* shards_hdr = NULL; + ray_hll_t* shards = (ray_hll_t*)scratch_calloc( + &shards_hdr, (size_t)nw * sizeof(ray_hll_t)); + if (!shards) return ray_error("oom", NULL); + + cda_scalar_ctx_t ctx = { + .vec = x, + .type = t, + .attrs = x->attrs, + .has_nulls = (x->attrs & RAY_ATTR_HAS_NULLS) != 0, + .shards = shards, + .p = RAY_HLL_DEFAULT_P, + .n_workers = nw, + .oom = 0, + }; + if (nw > 1) { + ray_pool_dispatch(pool, cda_scalar_fn, &ctx, n); + } else { + cda_scalar_fn(&ctx, 0, 0, n); + } + if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) { + for (uint32_t w = 0; w < nw; w++) ray_hll_free(&shards[w]); + scratch_free(shards_hdr); + return ray_error("oom", "count_distinct_approx: HLL alloc failed"); + } + /* Merge per-worker shards into shard[0], then estimate. */ + for (uint32_t w = 1; w < nw; w++) { + if (shards[w].regs) + ray_hll_merge(&shards[0], &shards[w]); + } + int64_t est = shards[0].regs ? 
ray_hll_estimate(&shards[0]) : 0; + for (uint32_t w = 0; w < nw; w++) ray_hll_free(&shards[w]); + scratch_free(shards_hdr); + return ray_i64(est); +} + +/* ---- Per-group HLL --------------------------------------------------- */ + +typedef struct { + const ray_t* vec; + int8_t type; + uint8_t attrs; + bool has_nulls; + const int64_t* idx_buf; + const int64_t* offsets; + const int64_t* counts; /* per-group length — offsets has only n_groups entries */ + uint8_t p; + uint32_t m; + int64_t* out; + _Atomic(int) oom; +} cda_pg_buf_ctx_t; + +static void cda_pg_buf_task(void* raw, uint32_t worker_id, int64_t start, int64_t end) { + (void)worker_id; + cda_pg_buf_ctx_t* c = (cda_pg_buf_ctx_t*)raw; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + const void* base = ray_data((ray_t*)c->vec); + int8_t t = c->type; + bool hn = c->has_nulls; + + /* One private HLL per task (allocated on stack so we never touch + * the shared scratch arena from a worker thread). P≤14 → m≤16384, + * fits comfortably in the default 8 MiB worker stack. 
*/ + uint8_t regs[1u << 14]; + ray_hll_t sk = { .p = c->p, .m = c->m, .regs = regs, ._hdr = NULL }; + + for (int64_t g = start; g < end; g++) { + memset(regs, 0, c->m); + int64_t s = c->offsets[g]; + int64_t e = s + c->counts[g]; + if (t == RAY_I64 || t == RAY_TIMESTAMP) { + const int64_t* d = (const int64_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + int64_t v = d[r]; + if (hn && v == NULL_I64) continue; + ray_hll_add(&sk, ray_hash_i64(v)); + } + } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) { + const int32_t* d = (const int32_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + int32_t v = d[r]; + if (hn && v == NULL_I32) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } else if (t == RAY_I16) { + const int16_t* d = (const int16_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + int16_t v = d[r]; + if (hn && v == NULL_I16) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } else if (t == RAY_BOOL || t == RAY_U8) { + const uint8_t* d = (const uint8_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + ray_hll_add(&sk, ray_hash_i64((int64_t)d[r])); + } + } else if (t == RAY_F64) { + const double* d = (const double*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + double v = d[r]; + if (v != v) continue; + ray_hll_add(&sk, ray_hash_f64(v)); + } + } else if (RAY_IS_SYM(t)) { + uint8_t w = c->attrs & RAY_SYM_W_MASK; + if (w == RAY_SYM_W64) { + const int64_t* d = (const int64_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + int64_t v = d[r]; if (v == 0) continue; + ray_hll_add(&sk, ray_hash_i64(v)); + } + } else if (w == RAY_SYM_W32) { + const uint32_t* d = (const uint32_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + uint32_t v = d[r]; if (v == 0) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } else if (w == RAY_SYM_W16) { + const 
uint16_t* d = (const uint16_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + uint16_t v = d[r]; if (v == 0) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } else { + const uint8_t* d = (const uint8_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + uint8_t v = d[r]; if (v == 0) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } + } + c->out[g] = ray_hll_estimate(&sk); + } +} + +int ray_count_distinct_approx_pg_buf(ray_t* src, + const int64_t* idx_buf, + const int64_t* offsets, + const int64_t* counts, + int64_t n_groups, + uint8_t p, int64_t* out) +{ + if (!src || RAY_IS_ERR(src) || !idx_buf || !offsets || !counts || !out) + return -1; + int8_t t = src->type; + bool hashable = (t == RAY_I64 || t == RAY_I32 || t == RAY_I16 || + t == RAY_U8 || t == RAY_BOOL || t == RAY_F64 || + t == RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP || + RAY_IS_SYM(t)); + if (!hashable) return -1; + if (n_groups <= 0) return 0; + if (p < 4) p = 4; + if (p > 14) p = 14; + uint32_t m = 1u << p; + + cda_pg_buf_ctx_t ctx = { + .vec = src, + .type = t, + .attrs = src->attrs, + .has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0, + .idx_buf = idx_buf, + .offsets = offsets, + .counts = counts, + .p = p, + .m = m, + .out = out, + .oom = 0, + }; + ray_pool_t* pool = ray_pool_get(); + if (pool && ray_pool_total_workers(pool) >= 2 && n_groups >= 4) { + ray_pool_dispatch_n(pool, cda_pg_buf_task, &ctx, (uint32_t)n_groups); + } else { + cda_pg_buf_task(&ctx, 0, 0, n_groups); + } + if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) return -1; + return 0; +} diff --git a/src/ops/hll.h b/src/ops/hll.h new file mode 100644 index 00000000..29b98332 --- /dev/null +++ b/src/ops/hll.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2025-2026 Anton Kundenko + * All rights reserved. 
+ + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RAY_OPS_HLL_H +#define RAY_OPS_HLL_H + +/** + * Probabilistic cardinality sketch (HyperLogLog). + * + * Each sketch holds 2^P registers; each register stores the maximum + * rho (leading-zero run + 1) seen for any hash whose top P bits index + * that register. Cardinality is then read off the harmonic mean of + * 2^reg over all registers, with a linear-counting correction at the + * small end. Standard error ≈ 1.04 / sqrt(2^P). P=14 → ≈ 0.8 %. + * + * Memory: 1 byte per register (rho is at most 64-P+1, so 8 bits is + * more than the 6 bits a packed implementation would need; the extra + * few KB buys a tighter hot loop). At P=14 a sketch is 16 KB and lives + * in L2 for the duration of one query.
+ * + * The sketch is mergeable element-wise (max), which is the property + * the per-group / per-worker aggregation paths rely on: each worker + * builds a local sketch and the planner merges them at finalisation. + */ + +#include "rayforce.h" +#include "ops/hash.h" + +/* Default precision: 14 (16384 registers, ~0.81 % std error, 16 KB). */ +#define RAY_HLL_DEFAULT_P 14 + +typedef struct { + uint8_t p; /* precision: register count = 1 << p */ + uint32_t m; /* register count */ + uint8_t* regs; /* [m] — 1 byte per register, holds rho count */ + ray_t* _hdr; /* scratch handle for regs */ +} ray_hll_t; + +/* Initialise an empty sketch with `p` precision bits (clamped to [4, 18]). + * Allocates zeroed regs via scratch_calloc; the caller frees with + * ray_hll_free. Returns 0 on success, -1 on NULL `h` or OOM. */ +int ray_hll_init(ray_hll_t* h, uint8_t p); + +/* Free the regs allocation. Safe on a zeroed (uninitialised) sketch. */ +void ray_hll_free(ray_hll_t* h); + +/* Zero all registers (clears the sketch — same effect as init with the + * same p, but in-place; useful when reusing a sketch across calls). */ +void ray_hll_reset(ray_hll_t* h); + +/* Add a 64-bit hash to the sketch. Caller is responsible for hashing + * its value type before invoking — see ray_hash_i64 / ray_hash_bytes + * in ops/hash.h. Hot path; kept fully inline. */ +static inline void ray_hll_add(ray_hll_t* h, uint64_t hash) { + uint32_t idx = (uint32_t)(hash >> (64u - h->p)); + /* After the shift the low (64-p) hash bits sit at the top of `rest` + * and the low p bits are zero. A sentinel bit at position (p-1) keeps + * rho in [1, 64-p+1] without a branch on all-zero. */ + uint64_t rest = (hash << h->p) | (1ULL << (h->p - 1)); + uint8_t rho = (uint8_t)(__builtin_clzll(rest) + 1u); + if (rho > h->regs[idx]) h->regs[idx] = rho; +} + +/* Merge src into dst (element-wise max). src and dst must share the + * same precision p; a mismatch or an unallocated sketch is a silent no-op. */ +void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src); + +/* Estimate the unique-value count of all hashes added so far.
Uses + * the standard HyperLogLog estimator with bias-corrected raw-mean for + * the mid-range and linear counting (m * ln(m/V)) when many registers + * are still zero (V = unused register count). */ +int64_t ray_hll_estimate(const ray_hll_t* h); + +/* Scalar approximate `count(distinct …)` over a vec, ~0.8 % standard + * error. Handles I64/I32/I16/I8/U8/BOOL/F64/DATE/TIME/TIMESTAMP/SYM/ + * STR. Nulls are skipped (matches the SQL `count distinct` semantics). + * Parallelised: each worker builds a private sketch over its row range + * and the main thread merges them before extracting the estimate. + * Wired into `exec_count_distinct` above an input-row threshold. */ +ray_t* ray_count_distinct_approx(ray_t* x); + +/* Per-group approximate `count(distinct …)` over a buffered row-index + * layout: group g owns the row indices + * idx_buf[offsets[g] .. offsets[g] + counts[g]). + * Parallelised across groups — one task per group, each task uses a + * private stack-resident HLL so total memory is O(n_workers · 1<len = n_groups; int64_t* odata = (int64_t*)ray_data(out); + /* HyperLogLog approximate path — one task per group, each task with + * a private stack-resident sketch (~16 KB). Triggered when the + * total inflated row count across all groups is large enough that + * the exact per-group dedup HT becomes memory-bandwidth-bound; + * 1 M rows is the same threshold the global path in + * exec_count_distinct uses. Returns within ~0.8 % std error. */ + /* HyperLogLog approximate path — one task per group, each task with + * a private stack-resident sketch (~16 KB). Triggered when the + * total inflated row count across all groups is large enough that + * the exact per-group dedup HT becomes memory-bandwidth-bound; + * 1 M rows is the same threshold the global path in + * exec_count_distinct uses. Returns within ~0.8 % std error. 
*/ + if (n_groups > 0) { + int64_t total_rows = 0; + for (int64_t g = 0; g < n_groups; g++) total_rows += grp_cnt[g]; + if (total_rows >= (1 << 20)) { + if (ray_count_distinct_approx_pg_buf(src, idx_buf, offsets, + grp_cnt, n_groups, + 14, odata) == 0) { + ray_release(src); + return out; + } + /* Fall through on type miss; out still zeroed. */ + memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); + } + } + /* Parallel path: dispatch one task per group when src has a flat * numeric / SYM layout we can read with a typed pointer. Each task * does its own dedup with a scratch hash table — no gather_by_idx From 5e23603adb34720cad755e325bb9768e929be4e0 Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 26 May 2026 14:47:24 +0200 Subject: [PATCH 10/11] feat(idx): per-chunk min/max zone index + filter chunk-skip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New index kind RAY_IDX_CHUNK_ZONE (5). Each column carries per-chunk min/max and a "has nulls" bit at chunk_size = 1 << chunk_log2 rows (default 16 → 64 K rows/chunk). Built once at column ingest time — `.csv.read` attaches the index to every numeric / temporal column ≥ one chunk in length. Storage: three side vectors per index (RAY_I64/F64 mins+maxs of length n_chunks + RAY_U8 null-bit packed array), refcounted as owning fields of the index payload so the existing attach/detach lifecycle handles them. Two consumers: scalar min/max reduce (`ray_min_fn` / `ray_max_fn`) O(n_chunks) walk over mins[*] / maxs[*] instead of O(n_rows). Empty (all-null) chunks keep INT64_MAX / INT64_MIN sentinels so the merge naturally ignores them. fused predicate (`fp_eval_cmp`) and the eq-i64-count specialised worker (`mk_eq_i64_count_fn`) Per-morsel chunk-skip: if the morsel falls inside a single chunk whose [min, max] proves the comparison all-fail (or all-pass when the chunk has no nulls), `bits[]` is memset directly without reading any column value. 
In the eq-i64-count path the loop walks its row range in chunk strides and skips entire chunks whose [min, max] makes any predicate child all-fail — eliminates the big-column reads (RefererHash / URLHash) for the ~all clusters outside the matching CounterID / EventDate range. Measured (10M-row hits, in-memory): q06 (min/max EventDate) 6.4 → 0.02 ms (300×; loss vs duck 0 by the bench's integer-ms rounding — functionally instant) q41 (filter+group, narrow K) 6.0 → 3.2 ms FLIP vs duck 5 q40 (filter+group, wide K) 17 → 13 ms closer to duck 4 q37 (filter+group, clustered) 15 → 12 ms bigger margin q38 (filter+group, clustered) 17 → 15 ms bigger margin Test suite 2657/2659 (2 skipped, 0 failed). Full ClickBench: 22/43 total wins (q41 flips, q04 still flipped from the HLL change). --- src/io/csv.c | 26 ++++++ src/ops/agg.c | 73 +++++++++++++++- src/ops/fused_group.c | 159 +++++++++++++++++++++++++++++++--- src/ops/idxop.c | 192 ++++++++++++++++++++++++++++++++++++++++-- src/ops/idxop.h | 36 ++++++-- 5 files changed, 463 insertions(+), 23 deletions(-) diff --git a/src/io/csv.c b/src/io/csv.c index f8189ecb..0784d89e 100644 --- a/src/io/csv.c +++ b/src/io/csv.c @@ -44,6 +44,7 @@ #include "core/pool.h" #include "lang/format.h" #include "ops/hash.h" +#include "ops/idxop.h" /* attach per-chunk zone index after load */ #include "store/col.h" #include "store/fileio.h" #include "store/splay.h" @@ -1410,6 +1411,20 @@ static ray_t* csv_materialize_rows(const char* buf, size_t file_size, col_data[c] = dst; } + /* Per-chunk min/max + null bit on every column big enough to be worth + * indexing — gives the reduce min/max and the filter chunk-skip paths + * an O(n_chunks) scan instead of O(n_rows). Attach is best-effort: + * unsupported types (RAY_STR/RAY_SYM/RAY_GUID in v1) just stay + * unindexed and the consumer falls back to a row scan. 
*/ + for (int c = 0; c < ncols; c++) { + ray_t* v = col_vecs[c]; + if (!v || RAY_IS_ERR(v)) continue; + if (v->len < (1 << 16)) continue; /* < one chunk, skip */ + ray_t* r = ray_index_attach_chunk_zone(&v, 16); + if (r && !RAY_IS_ERR(r)) col_vecs[c] = v; /* attach succeeded */ + /* On failure the original column stays in col_vecs[c]; ignore. */ + } + ray_t* tbl = ray_table_new(ncols); if (!tbl || RAY_IS_ERR(tbl)) { for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]); @@ -1788,6 +1803,17 @@ ray_t* ray_read_csv_named_opts(const char* path, char delimiter, bool header, /* ---- 11. Build table ---- */ { + /* Best-effort per-chunk zone index attach (see comment on the + * matching loop in build_table_from_cols) — unsupported types + * fall through to the unindexed path inside the consumer. */ + for (int c = 0; c < ncols; c++) { + ray_t* v = col_vecs[c]; + if (!v || RAY_IS_ERR(v)) continue; + if (v->len < (1 << 16)) continue; + ray_t* r = ray_index_attach_chunk_zone(&v, 16); + if (r && !RAY_IS_ERR(r)) col_vecs[c] = v; + } + ray_t* tbl = ray_table_new(ncols); if (!tbl || RAY_IS_ERR(tbl)) { for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]); diff --git a/src/ops/agg.c b/src/ops/agg.c index fee02d2e..34328522 100644 --- a/src/ops/agg.c +++ b/src/ops/agg.c @@ -23,6 +23,7 @@ #include "lang/internal.h" #include "ops/ops.h" +#include "ops/idxop.h" /* RAY_IDX_CHUNK_ZONE fast path for min/max */ #include "mem/heap.h" #include /* qsort (introselect fallback) */ @@ -328,7 +329,43 @@ ray_t* ray_min_fn(ray_t* x) { if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MIN); if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 0); if (ray_is_atom(x)) { ray_retain(x); return x; } - if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_min_op); + if (ray_is_vec(x)) { + /* Per-chunk zone index fast path: O(n_chunks) instead of O(n_rows). + * Only valid when the index was built for the column's current len + * (mutation paths call ray_index_drop). 
*/ + if (ray_index_kind(x) == RAY_IDX_CHUNK_ZONE) { + ray_index_t* ix = ray_index_payload(x->index); + if (ix->built_for_len == x->len) { + uint32_t n_chunks = ix->u.chunk_zone.n_chunks; + if (ix->u.chunk_zone.is_f64) { + const double* mins = (const double*)ray_data(ix->u.chunk_zone.mins); + double mn = INFINITY; + for (uint32_t g = 0; g < n_chunks; g++) + if (mins[g] < mn) mn = mins[g]; + if (mn == INFINITY) return ray_typed_null(-RAY_F64); + return make_f64(mn); + } else { + const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins); + int64_t mn = INT64_MAX; + for (uint32_t g = 0; g < n_chunks; g++) + if (mins[g] < mn) mn = mins[g]; + if (mn == INT64_MAX) return ray_typed_null(-x->type); + /* Preserve the column's storage width on the result. */ + switch (x->type) { + case RAY_BOOL: return ray_bool((bool)mn); + case RAY_U8: return ray_u8((uint8_t)mn); + case RAY_I16: return ray_i16((int16_t)mn); + case RAY_I32: return ray_i32((int32_t)mn); + case RAY_DATE: return ray_date((int32_t)mn); + case RAY_TIME: return ray_time(mn); + case RAY_TIMESTAMP: return ray_timestamp(mn); + default: return ray_i64(mn); + } + } + } + } + AGG_VEC_VIA_DAG(x, ray_min_op); + } if (!is_list(x)) return ray_error("type", NULL); int64_t len = ray_len(x); if (len == 0) return ray_error("domain", NULL); @@ -350,7 +387,39 @@ ray_t* ray_max_fn(ray_t* x) { if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MAX); if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 1); if (ray_is_atom(x)) { ray_retain(x); return x; } - if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_max_op); + if (ray_is_vec(x)) { + if (ray_index_kind(x) == RAY_IDX_CHUNK_ZONE) { + ray_index_t* ix = ray_index_payload(x->index); + if (ix->built_for_len == x->len) { + uint32_t n_chunks = ix->u.chunk_zone.n_chunks; + if (ix->u.chunk_zone.is_f64) { + const double* maxs = (const double*)ray_data(ix->u.chunk_zone.maxs); + double mx = -INFINITY; + for (uint32_t g = 0; g < n_chunks; g++) + if (maxs[g] > mx) mx = maxs[g]; + if (mx 
== -INFINITY) return ray_typed_null(-RAY_F64); + return make_f64(mx); + } else { + const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs); + int64_t mx = INT64_MIN; + for (uint32_t g = 0; g < n_chunks; g++) + if (maxs[g] > mx) mx = maxs[g]; + if (mx == INT64_MIN) return ray_typed_null(-x->type); + switch (x->type) { + case RAY_BOOL: return ray_bool((bool)mx); + case RAY_U8: return ray_u8((uint8_t)mx); + case RAY_I16: return ray_i16((int16_t)mx); + case RAY_I32: return ray_i32((int32_t)mx); + case RAY_DATE: return ray_date((int32_t)mx); + case RAY_TIME: return ray_time(mx); + case RAY_TIMESTAMP: return ray_timestamp(mx); + default: return ray_i64(mx); + } + } + } + } + AGG_VEC_VIA_DAG(x, ray_max_op); + } if (!is_list(x)) return ray_error("type", NULL); int64_t len = ray_len(x); if (len == 0) return ray_error("domain", NULL); diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c index 04c2fb43..ea0a05f7 100644 --- a/src/ops/fused_group.c +++ b/src/ops/fused_group.c @@ -23,6 +23,7 @@ #include "ops/fused_group.h" #include "ops/fused_pred.h" /* fp_pred_t / fp_compile_pred / fp_eval_pred */ +#include "ops/idxop.h" /* RAY_IDX_CHUNK_ZONE chunk-skip in fp_eval_cmp */ #include "lang/eval.h" /* RAY_ATTR_NAME */ #include "core/pool.h" /* ray_pool_get / ray_pool_dispatch */ @@ -344,6 +345,72 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end, return; } + /* Chunk-zone fast path: if the column carries per-chunk min/max + * metadata and [start, end) fits inside a single chunk, decide the + * whole morsel from chunk extrema without reading a single value. + * Only integer/temporal comparisons (EQ/NE/LT/LE/GT/GE) — LIKE/IN + * have their own evaluators below and SYM ordering is rejected at + * compile time anyway. The all-pass shortcut is gated on "no + * nulls in this chunk" because SQL `(x op c)` is FALSE/NULL when x + * is NULL; the all-fail shortcut needs no such guard. 
*/ + if (p->col_obj && (p->col_obj->attrs & RAY_ATTR_HAS_INDEX) && + p->col_obj->index) + { + ray_index_t* ix = ray_index_payload(p->col_obj->index); + if (ix->kind == RAY_IDX_CHUNK_ZONE && + ix->built_for_len == p->col_obj->len && + !ix->u.chunk_zone.is_f64 && + (op == FP_EQ || op == FP_NE || + op == FP_LT || op == FP_LE || + op == FP_GT || op == FP_GE)) + { + uint8_t log2 = ix->u.chunk_zone.chunk_log2; + int64_t s_ch = start >> log2; + int64_t e_ch = (end - 1) >> log2; + if (s_ch == e_ch && (uint32_t)s_ch < ix->u.chunk_zone.n_chunks) { + const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins); + const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs); + int64_t cmin = mins[s_ch], cmax = maxs[s_ch]; + if (cmin <= cmax) { /* skip empty (all-null) chunks */ + const uint8_t* nb = (const uint8_t*)ray_data(ix->u.chunk_zone.null_bits); + bool has_nulls = (nb[s_ch >> 3] >> (s_ch & 7)) & 1u; + int decision = -1; /* 0=all-fail, 1=all-pass, -1=mixed */ + switch (op) { + case FP_EQ: + if (cval < cmin || cval > cmax) decision = 0; + else if (!has_nulls && cmin == cmax) decision = 1; + break; + case FP_NE: + if (!has_nulls && (cval < cmin || cval > cmax)) decision = 1; + else if (cmin == cmax && cval == cmin) decision = 0; + break; + case FP_LT: + if (cmin >= cval) decision = 0; + else if (!has_nulls && cmax < cval) decision = 1; + break; + case FP_LE: + if (cmin > cval) decision = 0; + else if (!has_nulls && cmax <= cval) decision = 1; + break; + case FP_GT: + if (cmax <= cval) decision = 0; + else if (!has_nulls && cmin > cval) decision = 1; + break; + case FP_GE: + if (cmax < cval) decision = 0; + else if (!has_nulls && cmin >= cval) decision = 1; + break; + default: break; + } + if (decision >= 0) { + memset(bits, (uint8_t)decision, (size_t)n); + return; + } + } + } + } + } + /* SYM low-card fold: const not in dict ⇒ EQ all-zero / NE all-one. * Ordering ops are rejected at compile for SYM, so unreachable here. 
*/ if (ct == RAY_SYM && !p->cval_in_dict) { @@ -2539,20 +2606,90 @@ static void mk_eq_i64_count_fn(void* raw, uint32_t worker_id, const fp_cmp_t* eq = &c->pred.children[fc->eq_idx]; const int64_t* eq_col = (const int64_t*)eq->col_base; int64_t eq_val = eq->cval; - for (int64_t row = start; row < end; row++) { - if (eq_col[row] != eq_val) continue; - uint8_t pass = 1; - for (uint8_t i = 0; i < c->pred.n_children; i++) { - if (i == fc->eq_idx) continue; - if (!fp_eval_cmp_one(&c->pred.children[i], row)) { - pass = 0; + + /* Chunk-skip: for each predicate child whose column carries a + * chunk_zone index, walk the row range in chunk strides and skip + * any chunk where the child's [min, max] proves an all-fail. For + * clustered columns (e.g. data sorted by CounterID, EventDate) this + * eliminates the per-row RefererHash/URLHash read for ~all chunks + * outside the matching counter / date range — q40/q41/q42 pattern. + * Picks chunk_log2 from any indexed child (every chunk_zone built + * by csv.read uses the same chunk_log2 today). Falls through to + * the plain per-row loop when no child has a usable index. 
*/ + uint8_t chunk_log2 = 0; + for (uint8_t i = 0; i < c->pred.n_children; i++) { + ray_t* co = c->pred.children[i].col_obj; + if (co && (co->attrs & RAY_ATTR_HAS_INDEX) && co->index) { + ray_index_t* ix = ray_index_payload(co->index); + if (ix->kind == RAY_IDX_CHUNK_ZONE && + ix->built_for_len == co->len) { + chunk_log2 = ix->u.chunk_zone.chunk_log2; break; } } - if (!pass) continue; - if (mk_count_upsert_row(c, sh, row) != 0) { - atomic_store_explicit(&c->oom, 1, memory_order_relaxed); - return; + } + + int64_t row = start; + while (row < end) { + int64_t chunk_end; + if (chunk_log2 > 0) { + int64_t csz = 1LL << chunk_log2; + chunk_end = ((row >> chunk_log2) + 1) << chunk_log2; + (void)csz; + if (chunk_end > end) chunk_end = end; + bool all_fail = false; + for (uint8_t i = 0; i < c->pred.n_children && !all_fail; i++) { + const fp_cmp_t* p = &c->pred.children[i]; + ray_t* co = p->col_obj; + if (!co || !(co->attrs & RAY_ATTR_HAS_INDEX) || !co->index) + continue; + ray_index_t* ix = ray_index_payload(co->index); + if (ix->kind != RAY_IDX_CHUNK_ZONE || + ix->built_for_len != co->len || + ix->u.chunk_zone.chunk_log2 != chunk_log2 || + ix->u.chunk_zone.is_f64) + continue; + fp_op_t op = p->op; + if (op != FP_EQ && op != FP_NE && op != FP_LT && + op != FP_LE && op != FP_GT && op != FP_GE) + continue; + int64_t s_ch = row >> chunk_log2; + if ((uint32_t)s_ch >= ix->u.chunk_zone.n_chunks) continue; + const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins); + const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs); + int64_t cmin = mins[s_ch], cmax = maxs[s_ch]; + if (cmin > cmax) continue; /* empty chunk */ + int64_t cv = p->cval; + switch (op) { + case FP_EQ: if (cv < cmin || cv > cmax) all_fail = true; break; + case FP_NE: if (cmin == cmax && cv == cmin) all_fail = true; break; + case FP_LT: if (cmin >= cv) all_fail = true; break; + case FP_LE: if (cmin > cv) all_fail = true; break; + case FP_GT: if (cmax <= cv) all_fail = true; break; + case 
FP_GE: if (cmax < cv) all_fail = true; break; + default: break; + } + } + if (all_fail) { row = chunk_end; continue; } + } else { + chunk_end = end; + } + + for (; row < chunk_end; row++) { + if (eq_col[row] != eq_val) continue; + uint8_t pass = 1; + for (uint8_t i = 0; i < c->pred.n_children; i++) { + if (i == fc->eq_idx) continue; + if (!fp_eval_cmp_one(&c->pred.children[i], row)) { + pass = 0; + break; + } + } + if (!pass) continue; + if (mk_count_upsert_row(c, sh, row) != 0) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } } } } diff --git a/src/ops/idxop.c b/src/ops/idxop.c index 3f74476b..6e0a3d37 100644 --- a/src/ops/idxop.c +++ b/src/ops/idxop.c @@ -154,6 +154,17 @@ void ray_index_release_payload(ray_index_t* ix) { ray_release(ix->u.bloom.bits); ix->u.bloom.bits = NULL; break; + case RAY_IDX_CHUNK_ZONE: + if (ix->u.chunk_zone.mins && !RAY_IS_ERR(ix->u.chunk_zone.mins)) + ray_release(ix->u.chunk_zone.mins); + if (ix->u.chunk_zone.maxs && !RAY_IS_ERR(ix->u.chunk_zone.maxs)) + ray_release(ix->u.chunk_zone.maxs); + if (ix->u.chunk_zone.null_bits && !RAY_IS_ERR(ix->u.chunk_zone.null_bits)) + ray_release(ix->u.chunk_zone.null_bits); + ix->u.chunk_zone.mins = NULL; + ix->u.chunk_zone.maxs = NULL; + ix->u.chunk_zone.null_bits = NULL; + break; case RAY_IDX_ZONE: case RAY_IDX_NONE: break; @@ -176,6 +187,14 @@ void ray_index_retain_payload(ray_index_t* ix) { if (ix->u.bloom.bits && !RAY_IS_ERR(ix->u.bloom.bits)) ray_retain(ix->u.bloom.bits); break; + case RAY_IDX_CHUNK_ZONE: + if (ix->u.chunk_zone.mins && !RAY_IS_ERR(ix->u.chunk_zone.mins)) + ray_retain(ix->u.chunk_zone.mins); + if (ix->u.chunk_zone.maxs && !RAY_IS_ERR(ix->u.chunk_zone.maxs)) + ray_retain(ix->u.chunk_zone.maxs); + if (ix->u.chunk_zone.null_bits && !RAY_IS_ERR(ix->u.chunk_zone.null_bits)) + ray_retain(ix->u.chunk_zone.null_bits); + break; case RAY_IDX_ZONE: case RAY_IDX_NONE: break; @@ -262,6 +281,107 @@ static ray_err_t zone_scan(ray_t* v, ray_index_t* ix) { } } +/* 
-------------------------------------------------------------------------- + * Chunk-zone scan -- per-(1<<chunk_log2)-row min/max + null bit + * -------------------------------------------------------------------------- */ + +static ray_err_t chunk_zone_scan_int(ray_t* v, ray_index_t* ix, + int elem_size) { + uint32_t n_chunks = ix->u.chunk_zone.n_chunks; + uint8_t log2 = ix->u.chunk_zone.chunk_log2; + int64_t csz = 1LL << log2; + int64_t n = v->len; + int64_t* mins = (int64_t*)ray_data(ix->u.chunk_zone.mins); + int64_t* maxs = (int64_t*)ray_data(ix->u.chunk_zone.maxs); + uint8_t* nbits = (uint8_t*)ray_data(ix->u.chunk_zone.null_bits); + const uint8_t* base = (const uint8_t*)ray_data(v); + + for (uint32_t g = 0; g < n_chunks; g++) { + int64_t s = (int64_t)g * csz; + int64_t e = s + csz; if (e > n) e = n; + int64_t mn = INT64_MAX, mx = INT64_MIN; + bool any_null = false; + for (int64_t i = s; i < e; i++) { + if (ray_vec_is_null(v, i)) { any_null = true; continue; } + int64_t val = 0; + switch (elem_size) { + case 1: val = (int64_t)base[i]; break; + case 2: { int16_t t; memcpy(&t, base + i*2, 2); val = (int64_t)t; break; } + case 4: { int32_t t; memcpy(&t, base + i*4, 4); val = (int64_t)t; break; } + case 8: { int64_t t; memcpy(&t, base + i*8, 8); val = t; break; } + default: return RAY_ERR_TYPE; + } + if (val < mn) mn = val; + if (val > mx) mx = val; + } + /* Empty (all-null) chunks keep mn=INT64_MAX / mx=INT64_MIN so + * the reduce path's min(mins[*]) / max(maxs[*]) ignores them.
*/ + mins[g] = mn; + maxs[g] = mx; + if (any_null) nbits[g >> 3] |= (uint8_t)(1u << (g & 7)); + } + return RAY_OK; +} + +static ray_err_t chunk_zone_scan_float(ray_t* v, ray_index_t* ix, + int elem_size) { + uint32_t n_chunks = ix->u.chunk_zone.n_chunks; + uint8_t log2 = ix->u.chunk_zone.chunk_log2; + int64_t csz = 1LL << log2; + int64_t n = v->len; + double* mins = (double*)ray_data(ix->u.chunk_zone.mins); + double* maxs = (double*)ray_data(ix->u.chunk_zone.maxs); + uint8_t* nbits = (uint8_t*)ray_data(ix->u.chunk_zone.null_bits); + const uint8_t* base = (const uint8_t*)ray_data(v); + + for (uint32_t g = 0; g < n_chunks; g++) { + int64_t s = (int64_t)g * csz; + int64_t e = s + csz; if (e > n) e = n; + double mn = INFINITY, mx = -INFINITY; + bool any_null = false; + for (int64_t i = s; i < e; i++) { + if (ray_vec_is_null(v, i)) { any_null = true; continue; } + double val = 0.0; + if (elem_size == 4) { + float t; memcpy(&t, base + i*4, 4); val = (double)t; + } else { + memcpy(&val, base + i*8, 8); + } + if (isnan(val)) { any_null = true; continue; } + if (val < mn) mn = val; + if (val > mx) mx = val; + } + /* Empty (all-null) chunks keep mn=+inf / mx=-inf so reduce + * (min/max across mins[]/maxs[]) ignores them. 
*/ + mins[g] = mn; + maxs[g] = mx; + if (any_null) nbits[g >> 3] |= (uint8_t)(1u << (g & 7)); + } + return RAY_OK; +} + +static ray_err_t chunk_zone_scan(ray_t* v, ray_index_t* ix) { + switch (v->type) { + case RAY_BOOL: + case RAY_U8: return chunk_zone_scan_int(v, ix, 1); + case RAY_I16: return chunk_zone_scan_int(v, ix, 2); + case RAY_I32: + case RAY_DATE: return chunk_zone_scan_int(v, ix, 4); + case RAY_I64: + case RAY_TIME: + case RAY_TIMESTAMP: return chunk_zone_scan_int(v, ix, 8); + case RAY_F32: return chunk_zone_scan_float(v, ix, 4); + case RAY_F64: return chunk_zone_scan_float(v, ix, 8); + default: return RAY_ERR_NYI; + } +} + /* -------------------------------------------------------------------------- * Attach * @@ -335,6 +455,59 @@ ray_t* ray_index_attach_zone(ray_t** vp) { return attach_finalize(v, idx); } +ray_t* ray_index_attach_chunk_zone(ray_t** vp, uint8_t chunk_log2) { + ray_t* v = prepare_attach(vp, "chunk_zone"); + if (RAY_IS_ERR(v)) return v; + + if (chunk_log2 == 0) chunk_log2 = 16; /* default 64 K rows / chunk */ + if (chunk_log2 < 8 || chunk_log2 > 22) + return ray_error("domain", "chunk_zone: chunk_log2 out of range [8, 22]"); + int64_t csz = 1LL << chunk_log2; + /* No point indexing a column smaller than one chunk — fall back to + * the column-wide zone (or no index at all) at that size. */ + if (v->len < csz) + return ray_error("domain", "chunk_zone: column has fewer rows than one chunk"); + + uint32_t n_chunks = (uint32_t)((v->len + csz - 1) / csz); + + ray_t* idx = ray_index_alloc(RAY_IDX_CHUNK_ZONE, v->type, v->len); + if (!idx || RAY_IS_ERR(idx)) return idx; + ray_index_t* ix = ray_index_payload(idx); + ix->u.chunk_zone.n_chunks = n_chunks; + ix->u.chunk_zone.chunk_log2 = chunk_log2; + ix->u.chunk_zone.is_f64 = (v->type == RAY_F64 || v->type == RAY_F32) ? 1 : 0; + + int8_t arr_type = ix->u.chunk_zone.is_f64 ? 
RAY_F64 : RAY_I64; + ray_t* mins = ray_vec_new(arr_type, (int64_t)n_chunks); + ray_t* maxs = ray_vec_new(arr_type, (int64_t)n_chunks); + int64_t nb_len = (int64_t)((n_chunks + 7) / 8); + ray_t* nbits = ray_vec_new(RAY_U8, nb_len); + if (!mins || RAY_IS_ERR(mins) || !maxs || RAY_IS_ERR(maxs) || + !nbits || RAY_IS_ERR(nbits)) + { + if (mins && !RAY_IS_ERR(mins)) ray_release(mins); + if (maxs && !RAY_IS_ERR(maxs)) ray_release(maxs); + if (nbits && !RAY_IS_ERR(nbits)) ray_release(nbits); + ray_release(idx); + return ray_error("oom", "chunk_zone: arrays alloc"); + } + mins->len = (int64_t)n_chunks; + maxs->len = (int64_t)n_chunks; + nbits->len = nb_len; + memset(ray_data(nbits), 0, (size_t)nb_len); + ix->u.chunk_zone.mins = mins; + ix->u.chunk_zone.maxs = maxs; + ix->u.chunk_zone.null_bits = nbits; + + ray_err_t err = chunk_zone_scan(v, ix); + if (err != RAY_OK) { + ray_release(idx); /* releases mins/maxs/nbits via release_payload */ + return ray_error(ray_err_code_str(err), + "chunk_zone scan failed for type %d", (int)v->type); + } + return attach_finalize(v, idx); +} + /* -------------------------------------------------------------------------- * Hash index — chained open addressing * @@ -540,11 +713,12 @@ ray_t* ray_index_drop(ray_t** vp) { static const char* kind_name(ray_idx_kind_t k) { switch (k) { - case RAY_IDX_HASH: return "hash"; - case RAY_IDX_SORT: return "sort"; - case RAY_IDX_ZONE: return "zone"; - case RAY_IDX_BLOOM: return "bloom"; - default: return "none"; + case RAY_IDX_HASH: return "hash"; + case RAY_IDX_SORT: return "sort"; + case RAY_IDX_ZONE: return "zone"; + case RAY_IDX_BLOOM: return "bloom"; + case RAY_IDX_CHUNK_ZONE: return "chunk_zone"; + default: return "none"; } } @@ -627,6 +801,14 @@ ray_t* ray_index_info(ray_t* v) { r = dict_append_sym_i64(&keys, &vals, "n_keys", ix->u.bloom.n_keys); if (RAY_IS_ERR(r)) goto fail; break; + case RAY_IDX_CHUNK_ZONE: + r = dict_append_sym_i64(&keys, &vals, "n_chunks", + (int64_t)ix->u.chunk_zone.n_chunks); + 
if (RAY_IS_ERR(r)) goto fail; + r = dict_append_sym_i64(&keys, &vals, "chunk_log2", + (int64_t)ix->u.chunk_zone.chunk_log2); + if (RAY_IS_ERR(r)) goto fail; + break; case RAY_IDX_NONE: break; } diff --git a/src/ops/idxop.h b/src/ops/idxop.h index 46d294bc..3121c1f5 100644 --- a/src/ops/idxop.h +++ b/src/ops/idxop.h @@ -47,11 +47,20 @@ /* Index kinds. Stored in ray_index_t.kind. */ typedef enum { - RAY_IDX_NONE = 0, - RAY_IDX_HASH = 1, - RAY_IDX_SORT = 2, - RAY_IDX_ZONE = 3, - RAY_IDX_BLOOM = 4, + RAY_IDX_NONE = 0, + RAY_IDX_HASH = 1, + RAY_IDX_SORT = 2, + RAY_IDX_ZONE = 3, + RAY_IDX_BLOOM = 4, + /* Per-chunk min/max + null bit, one entry per (1 << chunk_log2) rows. + * The whole-column zone is derivable as + * min(chunk_mins)/max(chunk_maxs) over the entries, so this + * subsumes RAY_IDX_ZONE wherever it's used in the reduce path. + * Built at column ingest (csv.read); read by the min/max reduce + * and by the predicate planner to skip chunks whose [min,max] + * provably excludes/includes the constant. See chunk_zone arm + * of ray_index_t.u below. */ + RAY_IDX_CHUNK_ZONE = 5, } ray_idx_kind_t; /* The payload stored inside data[] of a RAY_INDEX ray_t. */ @@ -99,6 +108,19 @@ typedef struct { uint32_t _pad; int64_t n_keys; /* number of non-null rows added */ } bloom; + struct { /* RAY_IDX_CHUNK_ZONE */ + /* mins / maxs hold n_chunks entries. For integer / temporal + * column types they are RAY_I64 vecs storing the per-chunk + * extrema as int64; for RAY_F64 columns they are RAY_F64 + * vecs. is_f64 disambiguates at read time. 
*/ + ray_t* mins; + ray_t* maxs; + ray_t* null_bits; /* RAY_U8 vec, packed: bit i = chunk i has any null */ + uint32_t n_chunks; + uint8_t chunk_log2; /* chunk size = 1 << chunk_log2 (default 16 → 64 K rows) */ + uint8_t is_f64; + uint8_t _pad[2]; + } chunk_zone; } u; } ray_index_t; @@ -118,6 +140,10 @@ ray_t* ray_index_attach_zone (ray_t** vp); ray_t* ray_index_attach_hash (ray_t** vp); ray_t* ray_index_attach_sort (ray_t** vp); ray_t* ray_index_attach_bloom(ray_t** vp); +/* Build per-chunk min/max + null bit at chunk_size = 1 << chunk_log2. + * Passing 0 picks the default (16 → 64 K rows / chunk). Only valid on + * numeric and temporal vectors; SYM/STR/GUID return RAY_ERR_NYI. */ +ray_t* ray_index_attach_chunk_zone(ray_t** vp, uint8_t chunk_log2); /* Drop any attached index from *vp. No-op if none. Restores the * pre-attach nullmap state byte-for-byte. Returns *vp. */ From c7de32d198fe9bac3207b38fa94170fb9b15a58d Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 26 May 2026 15:20:57 +0200 Subject: [PATCH 11/11] perf(heap): amortize ray_heap_gc page-release sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ray_heap_gc's pass 5 walked every freelist of every registered heap and issued madvise(MADV_DONTNEED) on every free block > 4 KiB on every GC invocation. For repeated-query workloads (any analytical loop), the freed blocks were reused on the very next query — but madvise tore down the page tables and forced re-fault, paying the cost twice and dominating the profile after the actual worker compute (~21% of total query time on per-row eq workloads). Throttle pass 5 to once per 16 GCs. The long-running-process invariant (idle free blocks eventually return their physical pages to the OS) is preserved; the per-query madvise cost disappears. Callers needing prompt release continue to use the explicit ray_heap_release_pages() entry point. 
Passes 1-4 (foreign flush, slab flush, freelist return, oversized pool reclamation) still run every call — those are the correctness-relevant passes (cross-heap accounting, pool reusability). --- src/mem/heap.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/src/mem/heap.c b/src/mem/heap.c index 9616a0d4..86f09ed9 100644 --- a/src/mem/heap.c +++ b/src/mem/heap.c @@ -1471,20 +1471,33 @@ void ray_heap_gc(void) { } /* Pass 5: Release physical pages from free blocks in every - * idle heap. Pass 2 may have returned blocks to worker-owned - * freelists; releasing only the caller heap leaves those worker - * pages resident across large query repetitions. */ - for (int hid = 0; hid < RAY_HEAP_REGISTRY_SIZE; hid++) { - ray_heap_t* gh = ray_heap_registry[hid]; - if (!gh) continue; - for (int i = 13; i < RAY_HEAP_FL_SIZE; i++) { - ray_fl_head_t* head = &gh->freelist[i]; - ray_t* blk = head->fl_next; - while (blk != (ray_t*)head) { - size_t bsize = BSIZEOF(i); - if (bsize > 4096) - ray_vm_release((char*)blk + 4096, bsize - 4096); - blk = blk->fl_next; + * idle heap, throttled to once every PASS5_PERIOD GCs. + * + * The original unthrottled walk issued one madvise(MADV_DONTNEED) + * per free block > 4 KB on every GC. For repeated-query + * workloads (any bench / OLAP loop) the freed blocks would be + * reused on the very next query — but the madvise tears down + * page tables and forces a re-fault, paying the cost twice. + * + * Period 16 keeps the long-running-process invariant (free + * blocks eventually return physical pages to the OS) while + * removing the per-query madvise cost. Explicit callers + * needing prompt release should use ray_heap_release_pages.
*/ + static uint32_t pass5_counter = 0; + enum { PASS5_PERIOD = 16 }; + if ((++pass5_counter % PASS5_PERIOD) == 0) { + for (int hid = 0; hid < RAY_HEAP_REGISTRY_SIZE; hid++) { + ray_heap_t* gh = ray_heap_registry[hid]; + if (!gh) continue; + for (int i = 13; i < RAY_HEAP_FL_SIZE; i++) { + ray_fl_head_t* head = &gh->freelist[i]; + ray_t* blk = head->fl_next; + while (blk != (ray_t*)head) { + size_t bsize = BSIZEOF(i); + if (bsize > 4096) + ray_vm_release((char*)blk + 4096, bsize - 4096); + blk = blk->fl_next; + } } } }