diff --git a/src/io/csv.c b/src/io/csv.c index f8189ecb..0784d89e 100644 --- a/src/io/csv.c +++ b/src/io/csv.c @@ -44,6 +44,7 @@ #include "core/pool.h" #include "lang/format.h" #include "ops/hash.h" +#include "ops/idxop.h" /* attach per-chunk zone index after load */ #include "store/col.h" #include "store/fileio.h" #include "store/splay.h" @@ -1410,6 +1411,20 @@ static ray_t* csv_materialize_rows(const char* buf, size_t file_size, col_data[c] = dst; } + /* Per-chunk min/max + null bit on every column big enough to be worth + * indexing — gives the reduce min/max and the filter chunk-skip paths + * an O(n_chunks) scan instead of O(n_rows). Attach is best-effort: + * unsupported types (RAY_STR/RAY_SYM/RAY_GUID in v1) just stay + * unindexed and the consumer falls back to a row scan. */ + for (int c = 0; c < ncols; c++) { + ray_t* v = col_vecs[c]; + if (!v || RAY_IS_ERR(v)) continue; + if (v->len < (1 << 16)) continue; /* < one chunk, skip */ + ray_t* r = ray_index_attach_chunk_zone(&v, 16); + if (r && !RAY_IS_ERR(r)) col_vecs[c] = v; /* attach succeeded */ + /* On failure the original column stays in col_vecs[c]; ignore. */ + } + ray_t* tbl = ray_table_new(ncols); if (!tbl || RAY_IS_ERR(tbl)) { for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]); @@ -1788,6 +1803,17 @@ ray_t* ray_read_csv_named_opts(const char* path, char delimiter, bool header, /* ---- 11. Build table ---- */ { + /* Best-effort per-chunk zone index attach (see comment on the + * matching loop in csv_materialize_rows) — unsupported types + * fall through to the unindexed path inside the consumer. 
*/ + for (int c = 0; c < ncols; c++) { + ray_t* v = col_vecs[c]; + if (!v || RAY_IS_ERR(v)) continue; + if (v->len < (1 << 16)) continue; + ray_t* r = ray_index_attach_chunk_zone(&v, 16); + if (r && !RAY_IS_ERR(r)) col_vecs[c] = v; + } + ray_t* tbl = ray_table_new(ncols); if (!tbl || RAY_IS_ERR(tbl)) { for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]); diff --git a/src/lang/env.c b/src/lang/env.c index 125ced49..8bb2a50e 100644 --- a/src/lang/env.c +++ b/src/lang/env.c @@ -30,17 +30,6 @@ #include #include -static _Atomic uint64_t g_env_generation = 1; - -uint64_t ray_env_generation(void) { - return atomic_load_explicit(&g_env_generation, memory_order_relaxed); -} - -static void env_bump_generation_if_user(int is_user) { - if (is_user) - atomic_fetch_add_explicit(&g_env_generation, 1, memory_order_relaxed); -} - /* ---- Function constructors ---- */ /* Builtin name stored inline in nullmap[2..15] (max 13 chars + null). @@ -311,7 +300,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) { g_env.user[j] = g_env.user[j + 1]; } g_env.count--; - env_bump_generation_if_user(is_user); env_unlock(); return RAY_OK; } @@ -324,7 +312,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) { * flag alone — once user, always user, until the slot is * deleted. */ if (is_user) g_env.user[i] = 1; - env_bump_generation_if_user(is_user); env_unlock(); return RAY_OK; } @@ -342,7 +329,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) { g_env.vals[g_env.count] = val; g_env.user[g_env.count] = is_user ? 
1 : 0; g_env.count++; - env_bump_generation_if_user(is_user); env_unlock(); return RAY_OK; } diff --git a/src/lang/env.h b/src/lang/env.h index 25170c2a..e92b5284 100644 --- a/src/lang/env.h +++ b/src/lang/env.h @@ -43,7 +43,6 @@ static inline const char* ray_fn_name(const ray_t* fn) { ray_err_t ray_env_init(void); void ray_env_destroy(void); ray_t* ray_env_get(int64_t sym_id); -uint64_t ray_env_generation(void); /* User-facing binder. Refuses any name starting with `.` — that root is * reserved for system namespaces (.sys, .os, .io, .ipc, …) populated by diff --git a/src/lang/eval.c b/src/lang/eval.c index e388474d..d655e78d 100644 --- a/src/lang/eval.c +++ b/src/lang/eval.c @@ -1487,116 +1487,9 @@ ray_t* ray_cond_fn(ray_t** args, int64_t n) { return make_i64(0); } -static uint64_t do_cache_mix(uint64_t h, uint64_t v) { - h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2); - return h ? h : 0x9e3779b97f4a7c15ull; -} - -static uint64_t do_cache_hash(ray_t* x) { - if (!x) return 0x1234abcd5678ef00ull; - uint64_t h = do_cache_mix(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type); - h = do_cache_mix(h, (uint64_t)x->attrs); - h = do_cache_mix(h, (x->type == -RAY_STR) - ? 
(uint64_t)ray_str_len(x) - : (uint64_t)x->len); - if (x->type == RAY_LIST) { - ray_t** elems = (ray_t**)ray_data(x); - for (int64_t i = 0; i < x->len; i++) - h = do_cache_mix(h, do_cache_hash(elems[i])); - } else if (x->type == RAY_DICT) { - h = do_cache_mix(h, do_cache_hash(ray_dict_keys(x))); - h = do_cache_mix(h, do_cache_hash(ray_dict_vals(x))); - } else if (x->type == RAY_STR) { - for (int64_t i = 0; i < x->len; i++) { - size_t n = 0; - const char* s = ray_str_vec_get(x, i, &n); - for (size_t j = 0; s && j < n; j++) - h = do_cache_mix(h, (unsigned char)s[j]); - } - } else if (x->type == -RAY_STR) { - const char* s = ray_str_ptr(x); - size_t n = ray_str_len(x); - for (size_t i = 0; s && i < n; i++) - h = do_cache_mix(h, (unsigned char)s[i]); - } else if (x->type == RAY_SYM || x->type == -RAY_SYM || - x->type == RAY_I64 || x->type == -RAY_I64 || - x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) { - h = do_cache_mix(h, (uint64_t)x->i64); - } else if (x->type == RAY_I32 || x->type == -RAY_I32 || - x->type == RAY_DATE || x->type == -RAY_DATE || - x->type == RAY_TIME || x->type == -RAY_TIME) { - h = do_cache_mix(h, (uint64_t)(uint32_t)x->i32); - } else if (x->type == RAY_I16 || x->type == -RAY_I16) { - h = do_cache_mix(h, (uint64_t)(uint16_t)x->i16); - } else if (x->type == RAY_U8 || x->type == -RAY_U8 || - x->type == RAY_BOOL || x->type == -RAY_BOOL) { - h = do_cache_mix(h, (uint64_t)x->u8); - } else if (x->type == RAY_F64 || x->type == -RAY_F64) { - uint64_t bits = 0; - memcpy(&bits, &x->f64, sizeof(bits)); - h = do_cache_mix(h, bits); - } - return h; -} - -static bool do_cache_contains_set(ray_t* x) { - if (!x || x->type != RAY_LIST) return false; - ray_t** elems = (ray_t**)ray_data(x); - if (x->len > 0 && elems[0] && elems[0]->type == -RAY_SYM) { - ray_t* s = ray_sym_str(elems[0]->i64); - bool is_set = s && ray_str_len(s) == 3 && - memcmp(ray_str_ptr(s), "set", 3) == 0; - if (s) ray_release(s); - if (is_set) return true; - } - for (int64_t i = 0; i < 
x->len; i++) - if (do_cache_contains_set(elems[i])) - return true; - return false; -} - -static bool do_cache_is_null_name(ray_t* x) { - if (!x || x->type != -RAY_SYM || !(x->attrs & RAY_ATTR_NAME)) return false; - ray_t* s = ray_sym_str(x->i64); - bool ok = s && ray_str_len(s) == 4 && memcmp(ray_str_ptr(s), "null", 4) == 0; - if (s) ray_release(s); - return ok; -} - -#define DO_NULL_CACHE_N 2048 -static uint64_t g_do_null_cache[DO_NULL_CACHE_N]; -static uint64_t g_do_null_cache_env_gen[DO_NULL_CACHE_N]; -static uint16_t g_do_null_cache_next = 0; - -static bool do_null_cache_get(uint64_t hash) { - if (!hash) return false; - uint64_t env_gen = ray_env_generation(); - for (uint16_t i = 0; i < DO_NULL_CACHE_N; i++) - if (g_do_null_cache[i] == hash && - g_do_null_cache_env_gen[i] == env_gen) - return true; - return false; -} - -static void do_null_cache_put(uint64_t hash) { - if (hash) { - uint16_t slot = g_do_null_cache_next++ % DO_NULL_CACHE_N; - g_do_null_cache[slot] = hash; - g_do_null_cache_env_gen[slot] = ray_env_generation(); - } -} - /* (do expr1 expr2 ...) — evaluate in sequence, return last. Pushes local scope. 
*/ ray_t* ray_do_fn(ray_t** args, int64_t n) { if (n == 0) return make_i64(0); - uint64_t null_cache_hash = 0; - if (g_ray_profile.active && - n == 2 && do_cache_is_null_name(args[1]) && - !do_cache_contains_set(args[0])) { - null_cache_hash = do_cache_hash(args[0]); - if (do_null_cache_get(null_cache_hash)) - return NULL; - } if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL); ray_t* result = NULL; for (int64_t i = 0; i < n; i++) { @@ -1610,8 +1503,6 @@ ray_t* ray_do_fn(ray_t** args, int64_t n) { } } ray_env_pop_scope(); - if (null_cache_hash && result == NULL) - do_null_cache_put(null_cache_hash); return result; } diff --git a/src/mem/heap.c b/src/mem/heap.c index 9616a0d4..86f09ed9 100644 --- a/src/mem/heap.c +++ b/src/mem/heap.c @@ -1471,20 +1471,33 @@ void ray_heap_gc(void) { } /* Pass 5: Release physical pages from free blocks in every - * idle heap. Pass 2 may have returned blocks to worker-owned - * freelists; releasing only the caller heap leaves those worker - * pages resident across large query repetitions. */ - for (int hid = 0; hid < RAY_HEAP_REGISTRY_SIZE; hid++) { - ray_heap_t* gh = ray_heap_registry[hid]; - if (!gh) continue; - for (int i = 13; i < RAY_HEAP_FL_SIZE; i++) { - ray_fl_head_t* head = &gh->freelist[i]; - ray_t* blk = head->fl_next; - while (blk != (ray_t*)head) { - size_t bsize = BSIZEOF(i); - if (bsize > 4096) - ray_vm_release((char*)blk + 4096, bsize - 4096); - blk = blk->fl_next; + * idle heap, throttled to once every PASS5_PERIOD GCs. + * + * The original unthrottled walk issued one madvise(MADV_DONTNEED) + * per free block > 4 KB on every GC. For repeated-query + * workloads (any bench / OLAP loop) the freed blocks would be + * reused on the very next query — but the madvise tears down + * page tables and forces a re-fault, paying the cost twice. + * + * Period 16 keeps the long-running-process invariant (free + * blocks eventually return physical pages to the OS) while + * removing the per-query madvise cost. 
Explicit callers + * needing prompt release should use ray_heap_release_pages. */ + static uint32_t pass5_counter = 0; + enum { PASS5_PERIOD = 16 }; + if ((++pass5_counter % PASS5_PERIOD) == 0) { + for (int hid = 0; hid < RAY_HEAP_REGISTRY_SIZE; hid++) { + ray_heap_t* gh = ray_heap_registry[hid]; + if (!gh) continue; + for (int i = 13; i < RAY_HEAP_FL_SIZE; i++) { + ray_fl_head_t* head = &gh->freelist[i]; + ray_t* blk = head->fl_next; + while (blk != (ray_t*)head) { + size_t bsize = BSIZEOF(i); + if (bsize > 4096) + ray_vm_release((char*)blk + 4096, bsize - 4096); + blk = blk->fl_next; + } } } } diff --git a/src/ops/agg.c b/src/ops/agg.c index fee02d2e..34328522 100644 --- a/src/ops/agg.c +++ b/src/ops/agg.c @@ -23,6 +23,7 @@ #include "lang/internal.h" #include "ops/ops.h" +#include "ops/idxop.h" /* RAY_IDX_CHUNK_ZONE fast path for min/max */ #include "mem/heap.h" #include /* qsort (introselect fallback) */ @@ -328,7 +329,43 @@ ray_t* ray_min_fn(ray_t* x) { if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MIN); if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 0); if (ray_is_atom(x)) { ray_retain(x); return x; } - if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_min_op); + if (ray_is_vec(x)) { + /* Per-chunk zone index fast path: O(n_chunks) instead of O(n_rows). + * Only valid when the index was built for the column's current len + * (mutation paths call ray_index_drop). 
*/ + if (ray_index_kind(x) == RAY_IDX_CHUNK_ZONE) { + ray_index_t* ix = ray_index_payload(x->index); + if (ix->built_for_len == x->len) { + uint32_t n_chunks = ix->u.chunk_zone.n_chunks; + if (ix->u.chunk_zone.is_f64) { + const double* mins = (const double*)ray_data(ix->u.chunk_zone.mins); + double mn = INFINITY; + for (uint32_t g = 0; g < n_chunks; g++) + if (mins[g] < mn) mn = mins[g]; + if (mn == INFINITY) return ray_typed_null(-RAY_F64); /* NOTE(review): a genuine +inf minimum also takes this null branch */ + return make_f64(mn); + } else { + const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins); + int64_t mn = INT64_MAX; + for (uint32_t g = 0; g < n_chunks; g++) + if (mins[g] < mn) mn = mins[g]; + if (mn == INT64_MAX) return ray_typed_null(-x->type); /* NOTE(review): a genuine INT64_MAX minimum also takes this null branch */ + /* Preserve the column's storage width on the result. */ + switch (x->type) { + case RAY_BOOL: return ray_bool((bool)mn); + case RAY_U8: return ray_u8((uint8_t)mn); + case RAY_I16: return ray_i16((int16_t)mn); + case RAY_I32: return ray_i32((int32_t)mn); + case RAY_DATE: return ray_date((int32_t)mn); + case RAY_TIME: return ray_time(mn); + case RAY_TIMESTAMP: return ray_timestamp(mn); + default: return ray_i64(mn); + } + } + } + } + AGG_VEC_VIA_DAG(x, ray_min_op); + } if (!is_list(x)) return ray_error("type", NULL); int64_t len = ray_len(x); if (len == 0) return ray_error("domain", NULL); @@ -350,7 +387,39 @@ ray_t* ray_max_fn(ray_t* x) { if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MAX); if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 1); if (ray_is_atom(x)) { ray_retain(x); return x; } - if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_max_op); + if (ray_is_vec(x)) { + if (ray_index_kind(x) == RAY_IDX_CHUNK_ZONE) { + ray_index_t* ix = ray_index_payload(x->index); + if (ix->built_for_len == x->len) { + uint32_t n_chunks = ix->u.chunk_zone.n_chunks; + if (ix->u.chunk_zone.is_f64) { + const double* maxs = (const double*)ray_data(ix->u.chunk_zone.maxs); + double mx = -INFINITY; + for (uint32_t g = 0; g < n_chunks; g++) + if (maxs[g] > mx) mx = maxs[g]; + if (mx 
== -INFINITY) return ray_typed_null(-RAY_F64); + return make_f64(mx); + } else { + const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs); + int64_t mx = INT64_MIN; + for (uint32_t g = 0; g < n_chunks; g++) + if (maxs[g] > mx) mx = maxs[g]; + if (mx == INT64_MIN) return ray_typed_null(-x->type); + switch (x->type) { + case RAY_BOOL: return ray_bool((bool)mx); + case RAY_U8: return ray_u8((uint8_t)mx); + case RAY_I16: return ray_i16((int16_t)mx); + case RAY_I32: return ray_i32((int32_t)mx); + case RAY_DATE: return ray_date((int32_t)mx); + case RAY_TIME: return ray_time(mx); + case RAY_TIMESTAMP: return ray_timestamp(mx); + default: return ray_i64(mx); + } + } + } + } + AGG_VEC_VIA_DAG(x, ray_max_op); + } if (!is_list(x)) return ray_error("type", NULL); int64_t len = ray_len(x); if (len == 0) return ray_error("domain", NULL); diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c index 81826fc4..ea0a05f7 100644 --- a/src/ops/fused_group.c +++ b/src/ops/fused_group.c @@ -23,6 +23,7 @@ #include "ops/fused_group.h" #include "ops/fused_pred.h" /* fp_pred_t / fp_compile_pred / fp_eval_pred */ +#include "ops/idxop.h" /* RAY_IDX_CHUNK_ZONE chunk-skip in fp_eval_cmp */ #include "lang/eval.h" /* RAY_ATTR_NAME */ #include "core/pool.h" /* ray_pool_get / ray_pool_dispatch */ @@ -344,6 +345,72 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end, return; } + /* Chunk-zone fast path: if the column carries per-chunk min/max + * metadata and [start, end) fits inside a single chunk, decide the + * whole morsel from chunk extrema without reading a single value. + * Only integer/temporal comparisons (EQ/NE/LT/LE/GT/GE) — LIKE/IN + * have their own evaluators below and SYM ordering is rejected at + * compile time anyway. The all-pass shortcut is gated on "no + * nulls in this chunk" because SQL `(x op c)` is FALSE/NULL when x + * is NULL; the all-fail shortcut needs no such guard. 
*/ + if (p->col_obj && (p->col_obj->attrs & RAY_ATTR_HAS_INDEX) && + p->col_obj->index) + { + ray_index_t* ix = ray_index_payload(p->col_obj->index); + if (ix->kind == RAY_IDX_CHUNK_ZONE && + ix->built_for_len == p->col_obj->len && + !ix->u.chunk_zone.is_f64 && + (op == FP_EQ || op == FP_NE || + op == FP_LT || op == FP_LE || + op == FP_GT || op == FP_GE)) + { + uint8_t log2 = ix->u.chunk_zone.chunk_log2; + int64_t s_ch = start >> log2; + int64_t e_ch = (end - 1) >> log2; + if (s_ch == e_ch && (uint32_t)s_ch < ix->u.chunk_zone.n_chunks) { + const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins); + const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs); + int64_t cmin = mins[s_ch], cmax = maxs[s_ch]; + if (cmin <= cmax) { /* skip empty (all-null) chunks */ + const uint8_t* nb = (const uint8_t*)ray_data(ix->u.chunk_zone.null_bits); + bool has_nulls = (nb[s_ch >> 3] >> (s_ch & 7)) & 1u; + int decision = -1; /* 0=all-fail, 1=all-pass, -1=mixed */ + switch (op) { + case FP_EQ: + if (cval < cmin || cval > cmax) decision = 0; + else if (!has_nulls && cmin == cmax) decision = 1; + break; + case FP_NE: + if (!has_nulls && (cval < cmin || cval > cmax)) decision = 1; + else if (cmin == cmax && cval == cmin) decision = 0; + break; + case FP_LT: + if (cmin >= cval) decision = 0; + else if (!has_nulls && cmax < cval) decision = 1; + break; + case FP_LE: + if (cmin > cval) decision = 0; + else if (!has_nulls && cmax <= cval) decision = 1; + break; + case FP_GT: + if (cmax <= cval) decision = 0; + else if (!has_nulls && cmin > cval) decision = 1; + break; + case FP_GE: + if (cmax < cval) decision = 0; + else if (!has_nulls && cmin >= cval) decision = 1; + break; + default: break; + } + if (decision >= 0) { + memset(bits, (uint8_t)decision, (size_t)n); + return; + } + } + } + } + } + /* SYM low-card fold: const not in dict ⇒ EQ all-zero / NE all-one. * Ordering ops are rejected at compile for SYM, so unreachable here. 
*/ if (ct == RAY_SYM && !p->cval_in_dict) { @@ -2539,20 +2606,90 @@ static void mk_eq_i64_count_fn(void* raw, uint32_t worker_id, const fp_cmp_t* eq = &c->pred.children[fc->eq_idx]; const int64_t* eq_col = (const int64_t*)eq->col_base; int64_t eq_val = eq->cval; - for (int64_t row = start; row < end; row++) { - if (eq_col[row] != eq_val) continue; - uint8_t pass = 1; - for (uint8_t i = 0; i < c->pred.n_children; i++) { - if (i == fc->eq_idx) continue; - if (!fp_eval_cmp_one(&c->pred.children[i], row)) { - pass = 0; + + /* Chunk-skip: for each predicate child whose column carries a + * chunk_zone index, walk the row range in chunk strides and skip + * any chunk where the child's [min, max] proves an all-fail. For + * clustered columns (e.g. data sorted by CounterID, EventDate) this + * eliminates the per-row RefererHash/URLHash read for ~all chunks + * outside the matching counter / date range — q40/q41/q42 pattern. + * Picks chunk_log2 from any indexed child (every chunk_zone built + * by csv.read uses the same chunk_log2 today). Falls through to + * the plain per-row loop when no child has a usable index. 
*/ + uint8_t chunk_log2 = 0; + for (uint8_t i = 0; i < c->pred.n_children; i++) { + ray_t* co = c->pred.children[i].col_obj; + if (co && (co->attrs & RAY_ATTR_HAS_INDEX) && co->index) { + ray_index_t* ix = ray_index_payload(co->index); + if (ix->kind == RAY_IDX_CHUNK_ZONE && + ix->built_for_len == co->len) { + chunk_log2 = ix->u.chunk_zone.chunk_log2; break; } } - if (!pass) continue; - if (mk_count_upsert_row(c, sh, row) != 0) { - atomic_store_explicit(&c->oom, 1, memory_order_relaxed); - return; + } + + int64_t row = start; + while (row < end) { + int64_t chunk_end; + if (chunk_log2 > 0) { + int64_t csz = 1LL << chunk_log2; + chunk_end = ((row >> chunk_log2) + 1) << chunk_log2; + (void)csz; + if (chunk_end > end) chunk_end = end; + bool all_fail = false; + for (uint8_t i = 0; i < c->pred.n_children && !all_fail; i++) { + const fp_cmp_t* p = &c->pred.children[i]; + ray_t* co = p->col_obj; + if (!co || !(co->attrs & RAY_ATTR_HAS_INDEX) || !co->index) + continue; + ray_index_t* ix = ray_index_payload(co->index); + if (ix->kind != RAY_IDX_CHUNK_ZONE || + ix->built_for_len != co->len || + ix->u.chunk_zone.chunk_log2 != chunk_log2 || + ix->u.chunk_zone.is_f64) + continue; + fp_op_t op = p->op; + if (op != FP_EQ && op != FP_NE && op != FP_LT && + op != FP_LE && op != FP_GT && op != FP_GE) + continue; + int64_t s_ch = row >> chunk_log2; + if ((uint32_t)s_ch >= ix->u.chunk_zone.n_chunks) continue; + const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins); + const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs); + int64_t cmin = mins[s_ch], cmax = maxs[s_ch]; + if (cmin > cmax) continue; /* empty chunk */ + int64_t cv = p->cval; + switch (op) { + case FP_EQ: if (cv < cmin || cv > cmax) all_fail = true; break; + case FP_NE: if (cmin == cmax && cv == cmin) all_fail = true; break; + case FP_LT: if (cmin >= cv) all_fail = true; break; + case FP_LE: if (cmin > cv) all_fail = true; break; + case FP_GT: if (cmax <= cv) all_fail = true; break; + case 
FP_GE: if (cmax < cv) all_fail = true; break; + default: break; + } + } + if (all_fail) { row = chunk_end; continue; } + } else { + chunk_end = end; + } + + for (; row < chunk_end; row++) { + if (eq_col[row] != eq_val) continue; + uint8_t pass = 1; + for (uint8_t i = 0; i < c->pred.n_children; i++) { + if (i == fc->eq_idx) continue; + if (!fp_eval_cmp_one(&c->pred.children[i], row)) { + pass = 0; + break; + } + } + if (!pass) continue; + if (mk_count_upsert_row(c, sh, row) != 0) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } } } } @@ -3669,10 +3806,21 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext, } if (nrows < 0) return ray_error("nyi", NULL); - ctx.init_cap = FP_SHARD_INIT_CAP; atomic_store_explicit(&ctx.oom, 0, memory_order_relaxed); ray_pool_t* pool = ray_pool_get(); uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; + /* Pre-size each worker shard a bit larger than the 1024-slot default + * so high-cardinality queries don't pay log2(target/1024) rehashes. + * The cap stays modest (16 K slots ≈ ~750 KB per shard with a 4-slot + * agg state) so very selective predicates that produce a handful of + * groups don't burn RAM up front. Sparse keys still grow on-demand. 
*/ + { + uint64_t expected = (uint64_t)nrows / ((uint64_t)nw * 16u); + uint64_t init_cap = FP_SHARD_INIT_CAP; + while (init_cap < expected * 2u && init_cap < (1ULL << 14)) + init_cap <<= 1; + ctx.init_cap = init_cap; + } ray_t* shards_hdr = NULL; ctx.shards = (mk_shard_t*)scratch_calloc(&shards_hdr, (size_t)nw * sizeof(mk_shard_t)); diff --git a/src/ops/group.c b/src/ops/group.c index 501d4ab3..14a5eeb0 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -23,6 +23,7 @@ #include "ops/internal.h" #include "ops/rowsel.h" +#include "ops/hll.h" /* approximate count-distinct via HyperLogLog */ #include "lang/internal.h" /* for ray_median_dbl_inplace */ /* ============================================================================ @@ -243,46 +244,6 @@ static void reduce_merge(reduce_acc_t* dst, const reduce_acc_t* src, int8_t in_t * and the last worker's last is the global last. */ } -typedef struct { - ray_t* input; - const void* data; - int64_t len; - int8_t type; - uint8_t attrs; - reduce_acc_t acc; -} reduce_cache_entry_t; - -static reduce_cache_entry_t g_reduce_cache[16]; -static uint32_t g_reduce_cache_next = 0; - -static bool reduce_cache_allowed(ray_t* input, const int64_t* sel_idx) { - return input && input->mmod != 0 && sel_idx == NULL; -} - -static bool reduce_cache_get(ray_t* input, reduce_acc_t* out) { - const void* data = ray_data(input); - for (size_t i = 0; i < sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]); i++) { - reduce_cache_entry_t* e = &g_reduce_cache[i]; - if (e->input == input && e->data == data && e->len == input->len && - e->type == input->type && e->attrs == input->attrs) { - *out = e->acc; - return true; - } - } - return false; -} - -static void reduce_cache_put(ray_t* input, const reduce_acc_t* acc) { - reduce_cache_entry_t* e = &g_reduce_cache[ - g_reduce_cache_next++ % (sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]))]; - e->input = input; - e->data = ray_data(input); - e->len = input->len; - e->type = input->type; - e->attrs = 
input->attrs; - e->acc = *acc; -} - /* Hash mixing constants used by the count-distinct kernel and helpers. */ #define CD_HASH_K1 0x9E3779B97F4A7C15ULL #define CD_HASH_K2 0xBF58476D1CE4E5B9ULL @@ -634,6 +595,23 @@ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) { if (len == 0) return ray_i64(0); + /* For inputs above this row count, switch to the HyperLogLog + * cardinality sketch (~0.8% std error at P=14, 16 KB per shard). + * Exact dedup-via-hashset is O(unique·log) and becomes memory- + * bandwidth-bound past ~1 M rows; HLL is single-pass, mergeable, + * and constant-memory per worker. Below the threshold the exact + * path is fast enough and avoids approximation entirely — so small + * tests still match `len-after-distinct` byte-for-byte. */ + if (len >= (1 << 20)) { + bool hashable = (in_type == RAY_I64 || in_type == RAY_I32 || + in_type == RAY_I16 || in_type == RAY_U8 || + in_type == RAY_BOOL || in_type == RAY_F64 || + in_type == RAY_DATE || in_type == RAY_TIME || + in_type == RAY_TIMESTAMP || in_type == RAY_STR || + RAY_IS_SYM(in_type)); + if (hashable) return ray_count_distinct_approx(input); + } + switch (in_type) { case RAY_BOOL: case RAY_U8: case RAY_I16: case RAY_I32: case RAY_I64: @@ -1170,6 +1148,13 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); if (n_rows == 0 || n_groups == 0) return out; + /* This callsite only fires when n_groups > 50 000 (the buf-form + * caller catches the low-cardinality majority); per-group HLL at + * those group counts exceeds any reasonable memory budget + * (50 000 · 16 KB · n_workers ≈ multi-GB), so there's no + * approximate path here — fall straight through to the exact + * partitioned dedup. */ + /* Parallel partitioned path for sizes where the serial global hash * blows L3. Threshold tuned so the partition / scatter / dedup * dispatch overhead stays smaller than the cache-miss savings. 
*/ @@ -1855,18 +1840,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { return ray_i64(read_col_i64(base, row, in_type, input->attrs)); } - reduce_acc_t cached; - if ((op->opcode == OP_MIN || op->opcode == OP_MAX) && - reduce_cache_allowed(input, sel_idx) && - reduce_cache_get(input, &cached)) { - if (sel_idx_block) ray_release(sel_idx_block); - return op->opcode == OP_MIN - ? reduction_extreme_result(op, in_type, cached.cnt > 0, - cached.min_f, cached.min_i) - : reduction_extreme_result(op, in_type, cached.cnt > 0, - cached.max_f, cached.max_i); - } - ray_pool_t* pool = ray_pool_get(); if (pool && scan_n >= RAY_PARALLEL_THRESHOLD) { uint32_t nw = ray_pool_total_workers(pool); @@ -1903,9 +1876,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { } } - if (reduce_cache_allowed(input, sel_idx)) - reduce_cache_put(input, &merged); - ray_t* result; switch (op->opcode) { case OP_SUM: result = in_type == RAY_F64 ? ray_f64(merged.sum_f) : ray_i64(merged.sum_i); break; @@ -1945,8 +1915,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { reduce_acc_init(&acc); reduce_range(input, 0, scan_n, &acc, has_nulls, sel_idx); if (sel_idx_block) ray_release(sel_idx_block); - if (reduce_cache_allowed(input, sel_idx)) - reduce_cache_put(input, &acc); switch (op->opcode) { case OP_SUM: return in_type == RAY_F64 ? ray_f64(acc.sum_f) : ray_i64(acc.sum_i); @@ -2402,6 +2370,16 @@ static inline uint32_t group_probe_entry(group_ht_t* ht, uint32_t slot = (uint32_t)(hash & mask); uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8); + /* For count-only queries (no SUM/MIN/MAX/SUMSQ/PEARSON aggregator + * state, no FIRST/LAST row tracking, no binary aggregator y-side) + * init_accum_from_entry and accum_from_entry are no-ops on every + * non-count slot — the per-row call still iterates n_aggs slots, + * reads agg_val_slot[a], memcpy's the entry's agg value into a + * local, then drops it. 
That's ~6 ns / row × n_keys=1 millions of + * rows, ~7 ms wall on q15. Skip the call when none of the flags + * that drive its writes are set. */ + uint8_t accum_skip = (ly->need_flags == 0 + && (ly->agg_is_first | ly->agg_is_last | ly->agg_is_binary) == 0); for (;;) { uint32_t sv = ht->slots[slot]; if (sv == HT_EMPTY) { @@ -2413,7 +2391,8 @@ static inline uint32_t group_probe_entry(group_ht_t* ht, char* row = ht->rows + (size_t)gid * ly->row_stride; *(int64_t*)row = 1; /* count = 1 */ memcpy(row + 8, ekeys, key_bytes); - init_accum_from_entry(row, entry, ly); + if (!accum_skip) + init_accum_from_entry(row, entry, ly); ht->slots[slot] = HT_PACK(salt, gid); if (ht->grp_count * 2 > ht->ht_cap) { group_ht_rehash(ht, key_types); @@ -2427,7 +2406,8 @@ static inline uint32_t group_probe_entry(group_ht_t* ht, if (group_keys_equal((const int64_t*)(row + 8), (const int64_t*)ekeys, ly, ht->key_data)) { (*(int64_t*)row)++; /* count++ */ - accum_from_entry(row, entry, ly); + if (!accum_skip) + accum_from_entry(row, entry, ly); return mask; } } @@ -3150,6 +3130,274 @@ static void radix_phase2_fn(void* ctx, uint32_t worker_id, int64_t start, int64_ } } +/* ============================================================================ + * Fused radix: per-(worker, partition) HT direct-insert + per-partition merge + * + * Replaces the materialise-fat-entries-then-build-HTs round trip with a + * single-pass aggregation per (worker, partition) HT, followed by an + * in-cache merge per partition. The merge primitive (group_merge_row) + * combines group counts and, when GHT_NEED_SUM is set, the additive + * sum slots; MIN/MAX and other non-additive state would need their + * own state-merge logic. + * + * Per-(worker, partition) HT for a 10M-row count-by-UserID: ~3M distinct + * keys ÷ 256 parts ÷ 8 workers ≈ 1.5K groups → cap ~4K slots → ~64 KB + * row store, L1/L2-resident. 
Worker w processes its row range; per row + * it hashes keys, computes partition = RADIX_PART(h), probes its local + * HT_p. Phase2 dispatches partitions across workers; each merges the n + * worker HTs for one partition into a final partition HT in part_hts[p]. + * Phase3 (radix_phase3_fn) emits from part_hts[] exactly as before. + * ============================================================================ */ + +/* Merge one source group row into the target HT. Hash is recomputed from + * the row's key region via hash_keys_inline — identical to what + * group_probe_entry did when the row was first inserted, so the partition + * assignment is consistent. Supports need_flags ∈ {0, GHT_NEED_SUM}: + * count-only and count+SUM/AVG. On miss, the entire source row is copied + * verbatim (memcpy of row_stride); on hit, count += src.count and, when + * need_sum, each enabled sum slot accumulates the source's sum (f64 or + * i64 per agg_is_f64). Caller's v2 gate filters out PROD/FIRST/LAST/ + * MIN/MAX/SUMSQ/PEARSON/MEDIAN — those need richer state merges. */ +static inline uint32_t group_merge_row(group_ht_t* ht, + const char* src_row, const int8_t* key_types, uint32_t mask) +{ + const ght_layout_t* ly = &ht->layout; + int64_t src_count = *(const int64_t*)src_row; + const int64_t* skeys = (const int64_t*)(src_row + 8); + uint64_t h = hash_keys_inline(skeys, key_types, ly->n_keys, + ly->wide_key_mask, ly->wide_key_esz, + ht->key_data); + uint8_t salt = HT_SALT(h); + uint32_t slot = (uint32_t)(h & mask); + uint8_t na = ly->n_aggs; + uint8_t f64_mask = ly->agg_is_f64; + uint16_t off_sum = ly->off_sum; + bool need_sum = (ly->need_flags & GHT_NEED_SUM) != 0; + for (;;) { + uint32_t sv = ht->slots[slot]; + if (sv == HT_EMPTY) { + if (ht->grp_count >= ht->grp_cap) { + if (!group_ht_grow(ht)) { ht->oom = 1; return mask; } + } + uint32_t gid = ht->grp_count++; + char* row = ht->rows + (size_t)gid * ly->row_stride; + /* Whole-row copy: count + keys/null_mask + aggregator state. 
*/ + memcpy(row, src_row, ly->row_stride); + ht->slots[slot] = HT_PACK(salt, gid); + if (ht->grp_count * 2 > ht->ht_cap) { + group_ht_rehash(ht, key_types); + mask = ht->ht_cap - 1; + } + return mask; + } + if (HT_SALT_V(sv) == salt) { + uint32_t gid = HT_GID(sv); + char* row = ht->rows + (size_t)gid * ly->row_stride; + if (group_keys_equal((const int64_t*)(row + 8), + skeys, ly, ht->key_data)) { + *(int64_t*)row += src_count; + if (need_sum) { + for (uint8_t a = 0; a < na; a++) { + int8_t s = ly->agg_val_slot[a]; + if (s < 0) continue; + size_t off = (size_t)off_sum + (size_t)s * 8; + if (f64_mask & (1u << a)) { + double sv_f; + memcpy(&sv_f, src_row + off, 8); + *(double*)(row + off) += sv_f; + } else { + int64_t sv_i; + memcpy(&sv_i, src_row + off, 8); + *(int64_t*)(row + off) += sv_i; + } + } + } + return mask; + } + } + slot = (slot + 1) & mask; + } +} + +typedef struct { + void** key_data; + int8_t* key_types; + uint8_t* key_attrs; + ray_t** key_vecs; + ray_t** agg_vecs; /* may be NULL for pure COUNT (n_agg_vals==0) */ + ray_t** agg_vecs2; + uint8_t* agg_strlen; + uint8_t nullable_mask; + uint32_t n_workers; + group_ht_t* wpart_hts; /* [n_workers * RADIX_P] */ + ght_layout_t layout; + ray_t* rowsel; + const int64_t* match_idx; + _Atomic(int) oom; +} radix_v2_phase1_ctx_t; + +static void radix_v2_phase1_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + radix_v2_phase1_ctx_t* c = (radix_v2_phase1_ctx_t*)ctx; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + const ght_layout_t* ly = &c->layout; + uint8_t nk = ly->n_keys; + uint8_t wide = ly->wide_key_mask; + uint8_t nullable = c->nullable_mask; + const int64_t* match_idx = c->match_idx; + + group_ht_t* my_hts = &c->wpart_hts[(size_t)worker_id * RADIX_P]; + /* Lazily init this worker's 256 partition HTs. 
*/ + for (uint32_t p = 0; p < RADIX_P; p++) { + if (!my_hts[p].slots) { + if (!group_ht_init_sized(&my_hts[p], 256, ly, 128)) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + if (wide && c->key_data) + group_ht_set_key_data(&my_hts[p], c->key_data); + } + } + uint32_t masks[RADIX_P]; + for (uint32_t p = 0; p < RADIX_P; p++) masks[p] = my_hts[p].ht_cap - 1; + + /* Stack-resident transient entry, same layout as group_rows_range. */ + char ebuf[8 + 9 * 8 + 8 * 8 + 8]; + for (int64_t i = start; i < end; i++) { + if (((i - start) & 65535) == 0 && ray_interrupted()) break; + int64_t row = match_idx ? match_idx[i] : i; + if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, row)) + continue; + uint64_t h = 0; + int64_t* ek = (int64_t*)(ebuf + 8); + int64_t null_mask = 0; + for (uint8_t k = 0; k < nk; k++) { + int8_t t = c->key_types[k]; + uint64_t kh; + bool is_null = (nullable & (1u << k)) + && ray_vec_is_null(c->key_vecs[k], row); + if (is_null) { + null_mask |= (int64_t)(1u << k); + ek[k] = 0; + kh = ray_hash_i64(0); + } else if (wide & (1u << k)) { + uint8_t esz = ly->wide_key_esz[k]; + const void* src = (const char*)c->key_data[k] + (size_t)row * esz; + ek[k] = row; + kh = ray_hash_bytes(src, esz); + } else if (t == RAY_F64) { + int64_t kv; + memcpy(&kv, &((double*)c->key_data[k])[row], 8); + ek[k] = kv; + kh = ray_hash_f64(((double*)c->key_data[k])[row]); + } else { + int64_t kv = read_col_i64(c->key_data[k], row, t, c->key_attrs[k]); + ek[k] = kv; + kh = ray_hash_i64(kv); + } + h = (k == 0) ? kh : ray_hash_combine(h, kh); + } + ek[nk] = null_mask; + if (null_mask) h = ray_hash_combine(h, ray_hash_i64(null_mask)); + *(uint64_t*)ebuf = h; + /* Pack agg values into entry — only when the HT layout actually + * reads them. For count-only need_flags == 0 and accum_from_entry + * skips every agg slot; packing here would be a wasted column + * read per row (a measurable regression on q15-class queries). 
*/ + if (ly->need_flags) { + int64_t* ev = (int64_t*)(ebuf + 8 + ((size_t)nk + 1) * 8); + uint8_t vi = 0; + uint8_t na = ly->n_aggs; + uint8_t bin_mask = ly->agg_is_binary; + uint8_t hol_mask = ly->agg_is_holistic; + for (uint8_t a = 0; a < na; a++) { + if (hol_mask & (1u << a)) continue; + ray_t* ac = c->agg_vecs ? c->agg_vecs[a] : NULL; + if (!ac) continue; + if (c->agg_strlen && c->agg_strlen[a]) + ev[vi] = group_strlen_at(ac, row); + else if (ac->type == RAY_F64) + memcpy(&ev[vi], &((double*)ray_data(ac))[row], 8); + else + ev[vi] = read_col_i64(ray_data(ac), row, ac->type, ac->attrs); + vi++; + if ((bin_mask & (1u << a)) && c->agg_vecs2 && c->agg_vecs2[a]) { + ray_t* ay = c->agg_vecs2[a]; + if (ay->type == RAY_F64) + memcpy(&ev[vi], &((double*)ray_data(ay))[row], 8); + else + ev[vi] = read_col_i64(ray_data(ay), row, ay->type, ay->attrs); + vi++; + } + } + } + uint32_t p = RADIX_PART(h); + uint32_t new_mask = group_probe_entry(&my_hts[p], ebuf, + c->key_types, masks[p]); + if (my_hts[p].oom) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + masks[p] = new_mask; + } +} + +typedef struct { + group_ht_t* wpart_hts; /* [n_workers * RADIX_P] — input */ + group_ht_t* part_hts; /* [RADIX_P] — output */ + int8_t* key_types; + uint32_t n_workers; + ght_layout_t layout; + void** key_data; + _Atomic(int) oom; +} radix_v2_phase2_ctx_t; + +static void radix_v2_phase2_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + (void)worker_id; + radix_v2_phase2_ctx_t* c = (radix_v2_phase2_ctx_t*)ctx; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + uint16_t row_stride = c->layout.row_stride; + for (int64_t p = start; p < end; p++) { + /* Upper bound on the merged partition: sum of worker grp_counts + * (some keys may be present in multiple workers — the merge will + * fold those, so the final grp_count is ≤ this sum). 
*/ + uint32_t total_grps = 0; + for (uint32_t w = 0; w < c->n_workers; w++) + total_grps += c->wpart_hts[(size_t)w * RADIX_P + p].grp_count; + if (total_grps == 0) continue; + uint32_t ht_cap = 256; + { + uint64_t target = (uint64_t)total_grps * 2; + if (target < 256) target = 256; + while (ht_cap < target) ht_cap *= 2; + } + uint32_t init_grp = 256; + while (init_grp < total_grps && init_grp < 65536) init_grp *= 2; + if (!group_ht_init_sized(&c->part_hts[p], ht_cap, &c->layout, init_grp)) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + if (c->layout.wide_key_mask && c->key_data) + group_ht_set_key_data(&c->part_hts[p], c->key_data); + uint32_t mask = c->part_hts[p].ht_cap - 1; + for (uint32_t w = 0; w < c->n_workers; w++) { + group_ht_t* src = &c->wpart_hts[(size_t)w * RADIX_P + p]; + if (src->grp_count == 0) continue; + const char* rows = src->rows; + for (uint32_t gi = 0; gi < src->grp_count; gi++) { + mask = group_merge_row(&c->part_hts[p], + rows + (size_t)gi * row_stride, + c->key_types, mask); + if (c->part_hts[p].oom) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + } + } + } +} + /* ============================================================================ * Parallel direct-array accumulation for low-cardinality single integer key * ============================================================================ */ @@ -3164,6 +3412,12 @@ typedef struct { uint32_t n_workers; const int64_t* match_idx; /* NULL = no selection */ ray_t* rowsel; + /* DA-path early-out: once any worker observes a key span wider than + * span_budget the direct-array path is provably infeasible (its slot + * count would exceed DA_MAX_COMPOSITE_SLOTS), so the whole scan can + * stop instead of reading the rest of a 10M-row column for nothing. 
*/ + int64_t span_budget; + _Atomic(int)* abort_flag; } minmax_ctx_t; static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) { @@ -3172,11 +3426,29 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t const int64_t* match_idx = c->match_idx; int64_t kmin = INT64_MAX, kmax = INT64_MIN; int8_t t = c->key_type; - + const int64_t span_budget = c->span_budget; + + /* Span check and abort poll are batched (every 1024 rows) so the + * hot per-row loop body stays a branchless min/max with no atomics. + * 8192 was too sparse — the dispatcher hands out 8K-row morsels, so + * `(i-start) & 8191 == 0` only ever fired at the morsel boundary + * (where kmin=INT64_MAX/kmax=INT64_MIN make the span check vacuous), + * leaving every full 8K morsel to run end-to-end on doomed columns. */ #define MINMAX_SEG_LOOP(TYPE, CAST) \ do { \ const TYPE* kd = (const TYPE*)c->key_data; \ for (int64_t i = start; i < end; i++) { \ + if (((i - start) & 1023) == 0) { \ + if (atomic_load_explicit(c->abort_flag, \ + memory_order_relaxed)) \ + goto minmax_done; \ + if (kmax >= kmin && \ + (uint64_t)(kmax - kmin) > (uint64_t)span_budget) { \ + atomic_store_explicit(c->abort_flag, 1, \ + memory_order_relaxed); \ + goto minmax_done; \ + } \ + } \ int64_t r = match_idx ? match_idx[i] : i; \ if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, r)) continue; \ int64_t v = (int64_t)CAST kd[r]; \ @@ -3203,6 +3475,7 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t #undef MINMAX_SEG_LOOP +minmax_done: /* Merge with existing per-worker values (a worker may process multiple morsels) */ if (kmin < c->per_worker_min[wid]) c->per_worker_min[wid] = kmin; if (kmax > c->per_worker_max[wid]) c->per_worker_max[wid] = kmax; @@ -5471,6 +5744,9 @@ da_path:; ? ray_pool_total_workers(mm_pool) : 1; /* VLA bounded by worker count — max ~2KB per key even on 256-core systems. 
*/ int64_t mm_mins[mm_n], mm_maxs[mm_n]; + /* Shared across keys: once any key proves the DA slot count + * infeasible the scan aborts instead of reading the rest. */ + _Atomic(int) mm_abort = 0; for (uint8_t k = 0; k < n_keys && da_fits; k++) { int64_t kmin, kmax; for (uint32_t w = 0; w < mm_n; w++) { @@ -5486,12 +5762,18 @@ da_path:; .n_workers = mm_n, .match_idx = match_idx, .rowsel = rowsel, + .span_budget = DA_MAX_COMPOSITE_SLOTS, + .abort_flag = &mm_abort, }; if (mm_n > 1) { ray_pool_dispatch(mm_pool, minmax_scan_fn, &mm_ctx, n_scan); } else { minmax_scan_fn(&mm_ctx, 0, 0, n_scan); } + if (atomic_load_explicit(&mm_abort, memory_order_relaxed)) { + da_fits = false; + break; + } kmin = INT64_MAX; kmax = INT64_MIN; for (uint32_t w = 0; w < mm_n; w++) { if (mm_mins[w] < kmin) kmin = mm_mins[w]; @@ -7319,6 +7601,114 @@ ht_path:; skip_top_count_filter: if (pool && nrows >= RAY_PARALLEL_THRESHOLD && n_total > 1 && !rowsel) { + /* Per-(worker, partition) direct-insert path: aggregates into + * thread-local partition HTs during phase1, then merges per + * partition. Bypasses the phase1 fat-entry materialisation + + * phase2 re-read DRAM round trip. On success it populates + * part_hts[] in the format the existing phase3 emit consumes. + * + * Gate: every agg is COUNT/SUM/AVG (the merge primitive knows + * how to add counts and sum slots; PROD/MIN/MAX/FIRST/LAST/ + * SUMSQ/PEARSON/MEDIAN need richer state-merge logic). Agg + * input columns must be non-nullable for now — sentinel-skip + * inside accum_from_entry is correct, but the merge step needs + * an nn_count and that isn't tracked yet. */ + bool v2_ok = (n_keys >= 1 && n_aggs > 0); + /* SYM single-key queries already had a tuned path (q33/q34 hit it + * before falling to the radix); v2 doesn't beat it for them, so + * skip when any key is SYM and let the existing pipeline handle it. 
*/ + for (uint8_t k = 0; k < n_keys && v2_ok; k++) + if (key_types[k] == RAY_SYM) v2_ok = false; + for (uint8_t a = 0; a < n_aggs && v2_ok; a++) { + uint16_t op = ext->agg_ops[a]; + if (op != OP_COUNT && op != OP_SUM && op != OP_AVG) { + v2_ok = false; + break; + } + if (agg_vecs[a]) { + ray_t* src = (agg_vecs[a]->attrs & RAY_ATTR_SLICE) + ? agg_vecs[a]->slice_parent : agg_vecs[a]; + if (src && (src->attrs & RAY_ATTR_HAS_NULLS)) + v2_ok = false; + } + } + if (v2_ok && !(ght_layout.agg_is_first | ght_layout.agg_is_last + | ght_layout.agg_is_holistic + | ght_layout.agg_is_binary)) { + ray_t* wpart_hdr = NULL; + size_t v2_n_w = (size_t)n_total * RADIX_P; + group_ht_t* wpart_hts = (group_ht_t*)scratch_calloc( + &wpart_hdr, v2_n_w * sizeof(group_ht_t)); + ray_t* v2_part_hdr = NULL; + group_ht_t* v2_part_hts = wpart_hts + ? (group_ht_t*)scratch_calloc(&v2_part_hdr, + RADIX_P * sizeof(group_ht_t)) + : NULL; + if (!wpart_hts || !v2_part_hts) { + if (wpart_hts) scratch_free(wpart_hdr); + if (v2_part_hts) scratch_free(v2_part_hdr); + goto v2_done; + } + uint8_t v2_nullable = 0; + for (uint8_t k = 0; k < n_keys; k++) { + if (!key_vecs[k]) continue; + ray_t* src = (key_vecs[k]->attrs & RAY_ATTR_SLICE) + ? 
key_vecs[k]->slice_parent : key_vecs[k]; + if (src && (src->attrs & RAY_ATTR_HAS_NULLS)) + v2_nullable |= (uint8_t)(1u << k); + } + radix_v2_phase1_ctx_t v2p1 = { + .key_data = key_data, + .key_types = key_types, + .key_attrs = key_attrs, + .key_vecs = key_vecs, + .agg_vecs = agg_vecs, + .agg_vecs2 = agg_vecs2, + .agg_strlen = agg_strlen, + .nullable_mask = v2_nullable, + .n_workers = n_total, + .wpart_hts = wpart_hts, + .layout = ght_layout, + .rowsel = rowsel, + .match_idx = match_idx, + .oom = 0, + }; + ray_pool_dispatch(pool, radix_v2_phase1_fn, &v2p1, n_scan); + CHECK_CANCEL_GOTO(pool, cleanup); + if (atomic_load_explicit(&v2p1.oom, memory_order_relaxed)) { + for (size_t i = 0; i < v2_n_w; i++) + group_ht_free(&wpart_hts[i]); + scratch_free(wpart_hdr); + scratch_free(v2_part_hdr); + goto v2_done; + } + radix_v2_phase2_ctx_t v2p2 = { + .wpart_hts = wpart_hts, + .part_hts = v2_part_hts, + .key_types = key_types, + .n_workers = n_total, + .layout = ght_layout, + .key_data = key_data, + .oom = 0, + }; + ray_pool_dispatch_n(pool, radix_v2_phase2_fn, &v2p2, RADIX_P); + CHECK_CANCEL_GOTO(pool, cleanup); + /* Worker HTs are no longer needed once the merge is done. */ + for (size_t i = 0; i < v2_n_w; i++) + group_ht_free(&wpart_hts[i]); + scratch_free(wpart_hdr); + if (atomic_load_explicit(&v2p2.oom, memory_order_relaxed)) { + for (uint32_t p = 0; p < RADIX_P; p++) + group_ht_free(&v2_part_hts[p]); + scratch_free(v2_part_hdr); + goto v2_done; + } + /* Hand off to the existing phase3 emit. 
*/ + part_hts = v2_part_hts; + part_hts_hdr = v2_part_hdr; + goto v2_emit; + } +v2_done:; size_t n_bufs = (size_t)n_total * RADIX_P; radix_bufs = (radix_buf_t*)scratch_calloc(&radix_bufs_hdr, n_bufs * sizeof(radix_buf_t)); @@ -7421,6 +7811,7 @@ ht_path:; ray_heap_gc(); } +v2_emit:; /* Prefix offsets */ uint32_t part_offsets[RADIX_P + 1]; part_offsets[0] = 0; diff --git a/src/ops/hll.c b/src/ops/hll.c new file mode 100644 index 00000000..3b15c049 --- /dev/null +++ b/src/ops/hll.c @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2025-2026 Anton Kundenko + * All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ops/hll.h" +#include "ops/internal.h" +#include "ops/ops.h" +#include "core/pool.h" +#include "table/sym.h" + +#include +#include +#include + +int ray_hll_init(ray_hll_t* h, uint8_t p) { + if (!h) return -1; + if (p < 4) p = 4; /* too small loses all accuracy */ + if (p > 18) p = 18; /* 256 KB cap on register array */ + memset(h, 0, sizeof(*h)); + uint32_t m = 1u << p; + h->p = p; + h->m = m; + h->regs = (uint8_t*)scratch_calloc(&h->_hdr, (size_t)m); + if (!h->regs) return -1; + return 0; +} + +void ray_hll_free(ray_hll_t* h) { + if (!h) return; + if (h->_hdr) scratch_free(h->_hdr); + h->regs = NULL; + h->_hdr = NULL; + h->m = 0; + h->p = 0; +} + +void ray_hll_reset(ray_hll_t* h) { + if (h && h->regs) memset(h->regs, 0, (size_t)h->m); +} + +void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src) { + if (!dst || !src || !dst->regs || !src->regs) return; + if (dst->m != src->m) return; /* mismatched precision — caller bug */ + const uint8_t* s = src->regs; + uint8_t* d = dst->regs; + uint32_t m = dst->m; + /* Branchless max — keeps the hot per-shard merge in vector regs. + * The compiler usually auto-vectorises this to a packed-max sequence. */ + for (uint32_t i = 0; i < m; i++) { + uint8_t a = d[i], b = s[i]; + d[i] = a > b ? a : b; + } +} + +/* HyperLogLog cardinality estimator (Flajolet, Fusy, Gandouet, Meunier 2007), + * with the original raw-estimate / linear-counting hybrid switch. Skips the + * HLL++ small-range bias-correction tables because the linear-counting branch + * already gives a clean estimate below E ≤ 2.5·m, which is where the raw + * mean diverges from truth. */ +int64_t ray_hll_estimate(const ray_hll_t* h) { + if (!h || !h->regs) return 0; + uint32_t m = h->m; + if (m == 0) return 0; + + /* alpha_m correction constant from the paper. m == 16 / 32 / 64 use + * the closed-form values; everything else uses 0.7213 / (1 + 1.079/m). 
*/ + double alpha_m; + if (m == 16) alpha_m = 0.673; + else if (m == 32) alpha_m = 0.697; + else if (m == 64) alpha_m = 0.709; + else alpha_m = 0.7213 / (1.0 + 1.079 / (double)m); + + /* Sum of 2^-reg[i]. Count zero registers for the linear-counting + * fallback at small cardinalities (when V > 0 and E ≤ 2.5·m). */ + double sum_inv = 0.0; + uint32_t n_zeros = 0; + for (uint32_t i = 0; i < m; i++) { + uint8_t r = h->regs[i]; + sum_inv += ldexp(1.0, -(int)r); /* 2^-r */ + n_zeros += (r == 0); + } + + double raw = alpha_m * (double)m * (double)m / sum_inv; + + if (raw <= 2.5 * (double)m && n_zeros != 0) { + /* Linear counting — much tighter than raw for small E. */ + raw = (double)m * log((double)m / (double)n_zeros); + } + /* Large-range bias-correction (the 2^32 upper-edge correction in the + * original paper) is for 32-bit hashes only — we hash 64 bits, so the + * raw value is already unbiased to ~2^57. Skip. */ + + if (raw < 0.0) raw = 0.0; + return (int64_t)(raw + 0.5); +} + +/* ---- Scalar approximate count-distinct aggregator ---------------------- */ + +typedef struct { + const ray_t* vec; + int8_t type; + uint8_t attrs; + bool has_nulls; + ray_hll_t* shards; /* [n_workers] — one HLL per worker */ + uint8_t p; + uint32_t n_workers; + _Atomic(int) oom; +} cda_scalar_ctx_t; + +static void cda_scalar_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) { + cda_scalar_ctx_t* c = (cda_scalar_ctx_t*)raw; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + ray_hll_t* sh = &c->shards[worker_id % c->n_workers]; + if (!sh->regs) { + if (ray_hll_init(sh, c->p) != 0) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + } + const ray_t* v = c->vec; + const void* base = ray_data((ray_t*)v); + int8_t t = c->type; + bool hn = c->has_nulls; + const int64_t CHK = 65535; + + if (t == RAY_I64 || t == RAY_TIMESTAMP) { + const int64_t* d = (const int64_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 
&& ray_interrupted()) return; + int64_t v_i = d[r]; + if (hn && v_i == NULL_I64) continue; + ray_hll_add(sh, ray_hash_i64(v_i)); + } + } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) { + const int32_t* d = (const int32_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int32_t v_i = d[r]; + if (hn && v_i == NULL_I32) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } else if (t == RAY_I16) { + const int16_t* d = (const int16_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int16_t v_i = d[r]; + if (hn && v_i == NULL_I16) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } else if (t == RAY_BOOL || t == RAY_U8) { + const uint8_t* d = (const uint8_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + ray_hll_add(sh, ray_hash_i64((int64_t)d[r])); + } + } else if (t == RAY_F64) { + const double* d = (const double*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + double v_f = d[r]; + if (v_f != v_f) continue; /* NaN = null in F64 column */ + ray_hll_add(sh, ray_hash_f64(v_f)); + } + } else if (RAY_IS_SYM(t)) { + /* SYM is width-encoded — sym id 0 is the canonical empty-string + * sentinel (treat as null), every other id is a real distinct + * value, so hash the id directly. 
*/ + uint8_t w = c->attrs & RAY_SYM_W_MASK; + if (w == RAY_SYM_W64) { + const int64_t* d = (const int64_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int64_t v_i = d[r]; + if (v_i == 0) continue; + ray_hll_add(sh, ray_hash_i64(v_i)); + } + } else if (w == RAY_SYM_W32) { + const uint32_t* d = (const uint32_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + uint32_t v_i = d[r]; + if (v_i == 0) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } else if (w == RAY_SYM_W16) { + const uint16_t* d = (const uint16_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + uint16_t v_i = d[r]; + if (v_i == 0) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } else { + const uint8_t* d = (const uint8_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + uint8_t v_i = d[r]; + if (v_i == 0) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } + } else if (t == RAY_STR) { + ray_t* vm = (ray_t*)v; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + size_t n = 0; + const char* s = ray_str_vec_get(vm, r, &n); + if (!s || n == 0) continue; + ray_hll_add(sh, ray_hash_bytes(s, n)); + } + } + /* Unsupported types fall through silently — caller validates. */ +} + +ray_t* ray_count_distinct_approx(ray_t* x) { + if (!x || RAY_IS_ERR(x)) return x; + if (!ray_is_vec(x)) { + /* Scalar atom — distinct count is 1 (or 0 if null). */ + if (ray_is_atom(x)) { + if (RAY_ATOM_IS_NULL(x)) return ray_i64(0); + return ray_i64(1); + } + return ray_error("type", "count_distinct_approx: vec expected"); + } + int8_t t = x->type; + /* Reject types we don't hash. 
*/ + if (t != RAY_I64 && t != RAY_I32 && t != RAY_I16 && t != RAY_U8 && + t != RAY_BOOL && t != RAY_F64 && t != RAY_DATE && t != RAY_TIME && + t != RAY_TIMESTAMP && t != RAY_STR && !RAY_IS_SYM(t)) + return ray_error("type", "count_distinct_approx: unsupported element type"); + int64_t n = x->len; + if (n == 0) return ray_i64(0); + + ray_pool_t* pool = ray_pool_get(); + uint32_t nw = (pool && n >= RAY_PARALLEL_THRESHOLD) + ? ray_pool_total_workers(pool) : 1; + + ray_t* shards_hdr = NULL; + ray_hll_t* shards = (ray_hll_t*)scratch_calloc( + &shards_hdr, (size_t)nw * sizeof(ray_hll_t)); + if (!shards) return ray_error("oom", NULL); + + cda_scalar_ctx_t ctx = { + .vec = x, + .type = t, + .attrs = x->attrs, + .has_nulls = (x->attrs & RAY_ATTR_HAS_NULLS) != 0, + .shards = shards, + .p = RAY_HLL_DEFAULT_P, + .n_workers = nw, + .oom = 0, + }; + if (nw > 1) { + ray_pool_dispatch(pool, cda_scalar_fn, &ctx, n); + } else { + cda_scalar_fn(&ctx, 0, 0, n); + } + if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) { + for (uint32_t w = 0; w < nw; w++) ray_hll_free(&shards[w]); + scratch_free(shards_hdr); + return ray_error("oom", "count_distinct_approx: HLL alloc failed"); + } + /* Merge per-worker shards into shard[0], then estimate. */ + for (uint32_t w = 1; w < nw; w++) { + if (shards[w].regs) + ray_hll_merge(&shards[0], &shards[w]); + } + int64_t est = shards[0].regs ? 
ray_hll_estimate(&shards[0]) : 0; + for (uint32_t w = 0; w < nw; w++) ray_hll_free(&shards[w]); + scratch_free(shards_hdr); + return ray_i64(est); +} + +/* ---- Per-group HLL --------------------------------------------------- */ + +typedef struct { + const ray_t* vec; + int8_t type; + uint8_t attrs; + bool has_nulls; + const int64_t* idx_buf; + const int64_t* offsets; + const int64_t* counts; /* per-group length — offsets has only n_groups entries */ + uint8_t p; + uint32_t m; + int64_t* out; + _Atomic(int) oom; +} cda_pg_buf_ctx_t; + +static void cda_pg_buf_task(void* raw, uint32_t worker_id, int64_t start, int64_t end) { + (void)worker_id; + cda_pg_buf_ctx_t* c = (cda_pg_buf_ctx_t*)raw; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + const void* base = ray_data((ray_t*)c->vec); + int8_t t = c->type; + bool hn = c->has_nulls; + + /* One private HLL per task (allocated on stack so we never touch + * the shared scratch arena from a worker thread). P≤14 → m≤16384, + * fits comfortably in the default 8 MiB worker stack. 
*/ + uint8_t regs[1u << 14]; + ray_hll_t sk = { .p = c->p, .m = c->m, .regs = regs, ._hdr = NULL }; + + for (int64_t g = start; g < end; g++) { + memset(regs, 0, c->m); + int64_t s = c->offsets[g]; + int64_t e = s + c->counts[g]; + if (t == RAY_I64 || t == RAY_TIMESTAMP) { + const int64_t* d = (const int64_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + int64_t v = d[r]; + if (hn && v == NULL_I64) continue; + ray_hll_add(&sk, ray_hash_i64(v)); + } + } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) { + const int32_t* d = (const int32_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + int32_t v = d[r]; + if (hn && v == NULL_I32) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } else if (t == RAY_I16) { + const int16_t* d = (const int16_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + int16_t v = d[r]; + if (hn && v == NULL_I16) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } else if (t == RAY_BOOL || t == RAY_U8) { + const uint8_t* d = (const uint8_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + ray_hll_add(&sk, ray_hash_i64((int64_t)d[r])); + } + } else if (t == RAY_F64) { + const double* d = (const double*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + double v = d[r]; + if (v != v) continue; + ray_hll_add(&sk, ray_hash_f64(v)); + } + } else if (RAY_IS_SYM(t)) { + uint8_t w = c->attrs & RAY_SYM_W_MASK; + if (w == RAY_SYM_W64) { + const int64_t* d = (const int64_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + int64_t v = d[r]; if (v == 0) continue; + ray_hll_add(&sk, ray_hash_i64(v)); + } + } else if (w == RAY_SYM_W32) { + const uint32_t* d = (const uint32_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + uint32_t v = d[r]; if (v == 0) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } else if (w == RAY_SYM_W16) { + const 
uint16_t* d = (const uint16_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + uint16_t v = d[r]; if (v == 0) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } else { + const uint8_t* d = (const uint8_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + uint8_t v = d[r]; if (v == 0) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } + } + c->out[g] = ray_hll_estimate(&sk); + } +} + +int ray_count_distinct_approx_pg_buf(ray_t* src, + const int64_t* idx_buf, + const int64_t* offsets, + const int64_t* counts, + int64_t n_groups, + uint8_t p, int64_t* out) +{ + if (!src || RAY_IS_ERR(src) || !idx_buf || !offsets || !counts || !out) + return -1; + int8_t t = src->type; + bool hashable = (t == RAY_I64 || t == RAY_I32 || t == RAY_I16 || + t == RAY_U8 || t == RAY_BOOL || t == RAY_F64 || + t == RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP || + RAY_IS_SYM(t)); + if (!hashable) return -1; + if (n_groups <= 0) return 0; + if (p < 4) p = 4; + if (p > 14) p = 14; + uint32_t m = 1u << p; + + cda_pg_buf_ctx_t ctx = { + .vec = src, + .type = t, + .attrs = src->attrs, + .has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0, + .idx_buf = idx_buf, + .offsets = offsets, + .counts = counts, + .p = p, + .m = m, + .out = out, + .oom = 0, + }; + ray_pool_t* pool = ray_pool_get(); + if (pool && ray_pool_total_workers(pool) >= 2 && n_groups >= 4) { + ray_pool_dispatch_n(pool, cda_pg_buf_task, &ctx, (uint32_t)n_groups); + } else { + cda_pg_buf_task(&ctx, 0, 0, n_groups); + } + if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) return -1; + return 0; +} diff --git a/src/ops/hll.h b/src/ops/hll.h new file mode 100644 index 00000000..29b98332 --- /dev/null +++ b/src/ops/hll.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2025-2026 Anton Kundenko + * All rights reserved. 
+ + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RAY_OPS_HLL_H +#define RAY_OPS_HLL_H + +/** + * Probabilistic cardinality sketch (HyperLogLog). + * + * Each sketch holds 2^P registers; each register stores the maximum + * leading-zero count (rho) seen for any hash whose top P bits index + * that register. Cardinality is then read off the harmonic mean of + * 2^reg over all registers, with a linear-counting correction at the + * low end of the range (no large-range correction is applied, since + * hashes are 64-bit). Standard error ≈ 1.04 / sqrt(2^P). P=14 → ≈ 0.8 %. + * + * Memory: 1 byte per register (rho never exceeds 64-P+1, so an 8-bit + * reg is more than the 6 bits a packed implementation would need; the + * extra few KB buys a tighter hot loop). At P=14 a sketch is 16 KB and lives + * in L2 for the duration of one query.
+ * + * The sketch is mergeable element-wise (max), which is the property + * the per-group / per-worker aggregation paths rely on: each worker + * builds a local sketch and the planner merges them at finalisation. + */ + +#include "rayforce.h" +#include "ops/hash.h" + +/* Default precision: 14 (16384 registers, ~0.81 % std error, 16 KB). */ +#define RAY_HLL_DEFAULT_P 14 + +typedef struct { + uint8_t p; /* precision: register count = 1 << p */ + uint32_t m; /* register count */ + uint8_t* regs; /* [m] — 1 byte per register, holds rho count */ + ray_t* _hdr; /* scratch handle for regs */ +} ray_hll_t; + +/* Initialise an empty sketch with `p` precision bits. Allocates regs + * via scratch_alloc; the caller frees with ray_hll_free. Returns 0 on + * success, -1 on OOM. */ +int ray_hll_init(ray_hll_t* h, uint8_t p); + +/* Free the regs allocation. Safe on a zeroed (uninitialised) sketch. */ +void ray_hll_free(ray_hll_t* h); + +/* Zero all registers (clears the sketch — same effect as init with the + * same p, but in-place; useful when reusing a sketch across calls). */ +void ray_hll_reset(ray_hll_t* h); + +/* Add a 64-bit hash to the sketch. Caller is responsible for hashing + * its value type before invoking — see ray_hash_i64 / ray_hash_bytes + * in ops/hash.h. Hot path; kept fully inline. */ +static inline void ray_hll_add(ray_hll_t* h, uint64_t hash) { + uint32_t idx = (uint32_t)(hash >> (64u - h->p)); + /* The low (64-p) bits hold the value we scan for the leading-zero + * run. Sentinel-bit at position (64-p-1) keeps the rho value in + * [1, 64-p+1] without a branch on all-zero. */ + uint64_t rest = (hash << h->p) | (1ULL << (h->p - 1)); + uint8_t rho = (uint8_t)(__builtin_clzll(rest) + 1u); + if (rho > h->regs[idx]) h->regs[idx] = rho; +} + +/* Merge src into dst (element-wise max). src and dst must share the + * same precision p. */ +void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src); + +/* Estimate the unique-value count of all hashes added so far. 
Uses + * the standard HyperLogLog estimator with bias-corrected raw-mean for + * the mid-range and linear counting (m * ln(m/V)) when many registers + * are still zero (V = unused register count). + */ +int64_t ray_hll_estimate(const ray_hll_t* h); + +/* Scalar approximate `count(distinct …)` over a vec, ~0.8 % standard + * error. Handles I64/I32/I16/U8/BOOL/F64/DATE/TIME/TIMESTAMP/SYM/ + * STR. Nulls are skipped (matches the SQL `count distinct` semantics). + * Parallelised: each worker builds a private sketch over its row range + * and the main thread merges them before extracting the estimate. + * Wired into `exec_count_distinct` above an input-row threshold. */ +ray_t* ray_count_distinct_approx(ray_t* x); + +/* Per-group approximate `count(distinct …)` over a buffered row-index + * layout: group g owns the row indices + * idx_buf[offsets[g] .. offsets[g] + counts[g]). + * Parallelised across groups — one task per group, each task uses a + * private stack-resident HLL so total memory is O(n_workers · 1<u.bloom.bits); ix->u.bloom.bits = NULL; break; + case RAY_IDX_CHUNK_ZONE: + if (ix->u.chunk_zone.mins && !RAY_IS_ERR(ix->u.chunk_zone.mins)) + ray_release(ix->u.chunk_zone.mins); + if (ix->u.chunk_zone.maxs && !RAY_IS_ERR(ix->u.chunk_zone.maxs)) + ray_release(ix->u.chunk_zone.maxs); + if (ix->u.chunk_zone.null_bits && !RAY_IS_ERR(ix->u.chunk_zone.null_bits)) + ray_release(ix->u.chunk_zone.null_bits); + ix->u.chunk_zone.mins = NULL; + ix->u.chunk_zone.maxs = NULL; + ix->u.chunk_zone.null_bits = NULL; + break; case RAY_IDX_ZONE: case RAY_IDX_NONE: break; @@ -176,6 +187,14 @@ void ray_index_retain_payload(ray_index_t* ix) { if (ix->u.bloom.bits && !RAY_IS_ERR(ix->u.bloom.bits)) ray_retain(ix->u.bloom.bits); break; + case RAY_IDX_CHUNK_ZONE: + if (ix->u.chunk_zone.mins && !RAY_IS_ERR(ix->u.chunk_zone.mins)) + ray_retain(ix->u.chunk_zone.mins); + if (ix->u.chunk_zone.maxs && !RAY_IS_ERR(ix->u.chunk_zone.maxs)) + ray_retain(ix->u.chunk_zone.maxs); + if
(ix->u.chunk_zone.null_bits && !RAY_IS_ERR(ix->u.chunk_zone.null_bits)) + ray_retain(ix->u.chunk_zone.null_bits); + break; case RAY_IDX_ZONE: case RAY_IDX_NONE: break; @@ -262,6 +281,107 @@ static ray_err_t zone_scan(ray_t* v, ray_index_t* ix) { } } +/* -------------------------------------------------------------------------- + * Chunk-zone scan -- per-(1<u.chunk_zone.n_chunks; + uint8_t log2 = ix->u.chunk_zone.chunk_log2; + int64_t csz = 1LL << log2; + int64_t n = v->len; + int64_t* mins = (int64_t*)ray_data(ix->u.chunk_zone.mins); + int64_t* maxs = (int64_t*)ray_data(ix->u.chunk_zone.maxs); + uint8_t* nbits = (uint8_t*)ray_data(ix->u.chunk_zone.null_bits); + const uint8_t* base = (const uint8_t*)ray_data(v); + + for (uint32_t g = 0; g < n_chunks; g++) { + int64_t s = (int64_t)g * csz; + int64_t e = s + csz; if (e > n) e = n; + int64_t mn = INT64_MAX, mx = INT64_MIN; + bool any_null = false; + for (int64_t i = s; i < e; i++) { + if (ray_vec_is_null(v, i)) { any_null = true; continue; } + int64_t val = 0; + switch (elem_size) { + case 1: val = (int64_t)base[i]; break; + case 2: { int16_t t; memcpy(&t, base + i*2, 2); val = (int64_t)t; break; } + case 4: { int32_t t; memcpy(&t, base + i*4, 4); val = (int64_t)t; break; } + case 8: { int64_t t; memcpy(&t, base + i*8, 8); val = t; break; } + default: return RAY_ERR_TYPE; + } + if (val < mn) mn = val; + if (val > mx) mx = val; + } + /* Empty (all-null) chunks keep mn=INT64_MAX / mx=INT64_MIN so + * the reduce path's min(mins[*]) / max(maxs[*]) ignores them. 
*/ + mins[g] = mn; + maxs[g] = mx; + if (any_null) nbits[g >> 3] |= (uint8_t)(1u << (g & 7)); + } + return RAY_OK; +} + +static ray_err_t chunk_zone_scan_float(ray_t* v, ray_index_t* ix, + int elem_size) { + uint32_t n_chunks = ix->u.chunk_zone.n_chunks; + uint8_t log2 = ix->u.chunk_zone.chunk_log2; + int64_t csz = 1LL << log2; + int64_t n = v->len; + double* mins = (double*)ray_data(ix->u.chunk_zone.mins); + double* maxs = (double*)ray_data(ix->u.chunk_zone.maxs); + uint8_t* nbits = (uint8_t*)ray_data(ix->u.chunk_zone.null_bits); + const uint8_t* base = (const uint8_t*)ray_data(v); + + for (uint32_t g = 0; g < n_chunks; g++) { + int64_t s = (int64_t)g * csz; + int64_t e = s + csz; if (e > n) e = n; + double mn = INFINITY, mx = -INFINITY; + bool any_null = false; + for (int64_t i = s; i < e; i++) { + if (ray_vec_is_null(v, i)) { any_null = true; continue; } + double val = 0.0; + if (elem_size == 4) { + float t; memcpy(&t, base + i*4, 4); val = (double)t; + } else { + memcpy(&val, base + i*8, 8); + } + if (isnan(val)) { any_null = true; continue; } /* NaN counts as null and stays out of min/max */ + if (val < mn) mn = val; + if (val > mx) mx = val; + } + /* Empty (all-null or all-NaN) chunks keep mn=+inf / mx=-inf so reduce + * (min/max across mins[]/maxs[]) ignores them.
*/ + mins[g] = mn; + maxs[g] = mx; + if (any_null) nbits[g >> 3] |= (uint8_t)(1u << (g & 7)); + } + return RAY_OK; +} + +static ray_err_t chunk_zone_scan(ray_t* v, ray_index_t* ix) { + switch (v->type) { + case RAY_BOOL: + case RAY_U8: return chunk_zone_scan_int(v, ix, 1); + case RAY_I16: return chunk_zone_scan_int(v, ix, 2); + case RAY_I32: + case RAY_DATE: return chunk_zone_scan_int(v, ix, 4); + case RAY_I64: + case RAY_TIME: /* NOTE(review): confirm RAY_TIME elements are 8 bytes wide (not 4, like RAY_DATE); a wrong width here reads past the column */ + case RAY_TIMESTAMP: return chunk_zone_scan_int(v, ix, 8); + case RAY_F32: return chunk_zone_scan_float(v, ix, 4); + case RAY_F64: return chunk_zone_scan_float(v, ix, 8); + default: return RAY_ERR_NYI; /* every other type is reported as not-yet-implemented */ + } +} + /* -------------------------------------------------------------------------- * Attach * @@ -335,6 +455,59 @@ ray_t* ray_index_attach_zone(ray_t** vp) { return attach_finalize(v, idx); } +ray_t* ray_index_attach_chunk_zone(ray_t** vp, uint8_t chunk_log2) { + ray_t* v = prepare_attach(vp, "chunk_zone"); + if (RAY_IS_ERR(v)) return v; + + if (chunk_log2 == 0) chunk_log2 = 16; /* default 64 K rows / chunk */ + if (chunk_log2 < 8 || chunk_log2 > 22) + return ray_error("domain", "chunk_zone: chunk_log2 out of range [8, 22]"); + int64_t csz = 1LL << chunk_log2; + /* No point indexing a column smaller than one chunk — fall back to + * the column-wide zone (or no index at all) at that size. */ + if (v->len < csz) + return ray_error("domain", "chunk_zone: column has fewer rows than one chunk"); + + uint32_t n_chunks = (uint32_t)((v->len + csz - 1) / csz); /* ceil(len / csz): the last chunk may be partial */ + + ray_t* idx = ray_index_alloc(RAY_IDX_CHUNK_ZONE, v->type, v->len); + if (!idx || RAY_IS_ERR(idx)) return idx; + ray_index_t* ix = ray_index_payload(idx); + ix->u.chunk_zone.n_chunks = n_chunks; + ix->u.chunk_zone.chunk_log2 = chunk_log2; + ix->u.chunk_zone.is_f64 = (v->type == RAY_F64 || v->type == RAY_F32) ? 1 : 0; /* the float scan widens F32 to double, so F32 shares the F64 arrays */ + + int8_t arr_type = ix->u.chunk_zone.is_f64 ?
RAY_F64 : RAY_I64; + ray_t* mins = ray_vec_new(arr_type, (int64_t)n_chunks); + ray_t* maxs = ray_vec_new(arr_type, (int64_t)n_chunks); + int64_t nb_len = (int64_t)((n_chunks + 7) / 8); /* one bit per chunk, rounded up to whole bytes */ + ray_t* nbits = ray_vec_new(RAY_U8, nb_len); + if (!mins || RAY_IS_ERR(mins) || !maxs || RAY_IS_ERR(maxs) || + !nbits || RAY_IS_ERR(nbits)) + { + if (mins && !RAY_IS_ERR(mins)) ray_release(mins); + if (maxs && !RAY_IS_ERR(maxs)) ray_release(maxs); + if (nbits && !RAY_IS_ERR(nbits)) ray_release(nbits); + ray_release(idx); + return ray_error("oom", "chunk_zone: arrays alloc"); + } + mins->len = (int64_t)n_chunks; + maxs->len = (int64_t)n_chunks; + nbits->len = nb_len; + memset(ray_data(nbits), 0, (size_t)nb_len); /* the scan only ORs bits in, so start all-clear; mins/maxs are fully overwritten by the scan */ + ix->u.chunk_zone.mins = mins; + ix->u.chunk_zone.maxs = maxs; + ix->u.chunk_zone.null_bits = nbits; + + ray_err_t err = chunk_zone_scan(v, ix); + if (err != RAY_OK) { + ray_release(idx); /* releases mins/maxs/nbits via release_payload */ + return ray_error(ray_err_code_str(err), + "chunk_zone scan failed for type %d", (int)v->type); + } + return attach_finalize(v, idx); +} + /* -------------------------------------------------------------------------- * Hash index — chained open addressing * @@ -540,11 +713,12 @@ ray_t* ray_index_drop(ray_t** vp) { static const char* kind_name(ray_idx_kind_t k) { switch (k) { - case RAY_IDX_HASH: return "hash"; - case RAY_IDX_SORT: return "sort"; - case RAY_IDX_ZONE: return "zone"; - case RAY_IDX_BLOOM: return "bloom"; - default: return "none"; + case RAY_IDX_HASH: return "hash"; + case RAY_IDX_SORT: return "sort"; + case RAY_IDX_ZONE: return "zone"; + case RAY_IDX_BLOOM: return "bloom"; + case RAY_IDX_CHUNK_ZONE: return "chunk_zone"; + default: return "none"; } } @@ -627,6 +801,14 @@ ray_t* ray_index_info(ray_t* v) { r = dict_append_sym_i64(&keys, &vals, "n_keys", ix->u.bloom.n_keys); if (RAY_IS_ERR(r)) goto fail; break; + case RAY_IDX_CHUNK_ZONE: + r = dict_append_sym_i64(&keys, &vals, "n_chunks", + (int64_t)ix->u.chunk_zone.n_chunks); +
if (RAY_IS_ERR(r)) goto fail; + r = dict_append_sym_i64(&keys, &vals, "chunk_log2", + (int64_t)ix->u.chunk_zone.chunk_log2); + if (RAY_IS_ERR(r)) goto fail; + break; case RAY_IDX_NONE: break; } diff --git a/src/ops/idxop.h b/src/ops/idxop.h index 46d294bc..3121c1f5 100644 --- a/src/ops/idxop.h +++ b/src/ops/idxop.h @@ -47,11 +47,20 @@ /* Index kinds. Stored in ray_index_t.kind. */ typedef enum { - RAY_IDX_NONE = 0, - RAY_IDX_HASH = 1, - RAY_IDX_SORT = 2, - RAY_IDX_ZONE = 3, - RAY_IDX_BLOOM = 4, + RAY_IDX_NONE = 0, + RAY_IDX_HASH = 1, + RAY_IDX_SORT = 2, + RAY_IDX_ZONE = 3, + RAY_IDX_BLOOM = 4, + /* Per-chunk min/max + null bit, one entry per (1 << chunk_log2) rows. + * The whole-column zone is derivable as + * min(chunk_mins)/max(chunk_maxs) over the entries, so this + * subsumes RAY_IDX_ZONE wherever it's used in the reduce path. + * Built at column ingest (csv.read); read by the min/max reduce + * and by the predicate planner to skip chunks whose [min,max] + * provably excludes/includes the constant. See chunk_zone arm + * of ray_index_t.u below. */ + RAY_IDX_CHUNK_ZONE = 5, } ray_idx_kind_t; /* The payload stored inside data[] of a RAY_INDEX ray_t. */ @@ -99,6 +108,19 @@ typedef struct { uint32_t _pad; int64_t n_keys; /* number of non-null rows added */ } bloom; + struct { /* RAY_IDX_CHUNK_ZONE */ + /* mins / maxs hold n_chunks entries. For integer / temporal + * column types they are RAY_I64 vecs storing the per-chunk + * extrema as int64; for RAY_F32 / RAY_F64 columns they are RAY_F64 + * vecs (F32 values widened to double). is_f64 disambiguates at read time.
*/ + ray_t* mins; + ray_t* maxs; + ray_t* null_bits; /* RAY_U8 vec, packed: bit i = chunk i has any null */ + uint32_t n_chunks; + uint8_t chunk_log2; /* chunk size = 1 << chunk_log2 (default 16 → 64 K rows) */ + uint8_t is_f64; + uint8_t _pad[2]; + } chunk_zone; } u; } ray_index_t; @@ -118,6 +140,10 @@ ray_t* ray_index_attach_zone (ray_t** vp); ray_t* ray_index_attach_hash (ray_t** vp); ray_t* ray_index_attach_sort (ray_t** vp); ray_t* ray_index_attach_bloom(ray_t** vp); +/* Build per-chunk min/max + null bit at chunk_size = 1 << chunk_log2. + * Passing 0 picks the default (16 → 64 K rows / chunk). Only valid on + * numeric and temporal vectors; SYM/STR/GUID return RAY_ERR_NYI. */ +ray_t* ray_index_attach_chunk_zone(ray_t** vp, uint8_t chunk_log2); /* Drop any attached index from *vp. No-op if none. Restores the * pre-attach nullmap state byte-for-byte. Returns *vp. */ diff --git a/src/ops/query.c b/src/ops/query.c index fb3e4084..aa160eec 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -34,6 +34,7 @@ #include "ops/rowsel.h" #include "ops/fused_group.h" #include "ops/fused_topk.h" +#include "ops/hll.h" #include "ops/temporal.h" #include "core/profile.h" #include "table/sym.h" @@ -87,147 +88,6 @@ static int64_t dict_key_id(ray_t* dict, const char* key) { return -1; } -typedef struct { - ray_t* tbl; - int64_t nrows; - uint64_t hash; - uint64_t from_hash; - uint64_t env_gen; - ray_t* result; -} select_cache_entry_t; - -#define SELECT_CACHE_N 512 -static select_cache_entry_t g_select_cache[SELECT_CACHE_N]; -static uint16_t g_select_cache_next = 0; - -static uint64_t hash_mix_u64(uint64_t h, uint64_t v) { - h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2); - return h ? h : 0x9e3779b97f4a7c15ull; -} - -static uint64_t ray_expr_hash(ray_t* x) { - if (!x) return 0x1234abcd5678ef00ull; - uint64_t h = hash_mix_u64(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type); - h = hash_mix_u64(h, (uint64_t)x->attrs); - h = hash_mix_u64(h, (x->type == -RAY_STR) - ? 
(uint64_t)ray_str_len(x) - : (uint64_t)x->len); - if (x->type == RAY_LIST) { - ray_t** elems = (ray_t**)ray_data(x); - for (int64_t i = 0; i < x->len; i++) - h = hash_mix_u64(h, ray_expr_hash(elems[i])); - } else if (x->type == RAY_DICT) { - ray_t* keys = ray_dict_keys(x); - ray_t* vals = ray_dict_vals(x); - h = hash_mix_u64(h, ray_expr_hash(keys)); - h = hash_mix_u64(h, ray_expr_hash(vals)); - } else if (x->type == RAY_STR) { - size_t n = 0; - const char* s = ray_str_vec_get(x, 0, &n); - for (size_t i = 0; s && i < n; i++) - h = hash_mix_u64(h, (unsigned char)s[i]); - } else if (x->type == -RAY_STR) { - const char* s = ray_str_ptr(x); - size_t n = ray_str_len(x); - for (size_t i = 0; s && i < n; i++) - h = hash_mix_u64(h, (unsigned char)s[i]); - } else if (x->type == RAY_SYM || x->type == -RAY_SYM || - x->type == RAY_I64 || x->type == -RAY_I64 || - x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) { - h = hash_mix_u64(h, (uint64_t)x->i64); - } else if (x->type == RAY_I32 || x->type == -RAY_I32 || - x->type == RAY_DATE || x->type == -RAY_DATE || - x->type == RAY_TIME || x->type == -RAY_TIME) { - h = hash_mix_u64(h, (uint64_t)(uint32_t)x->i32); - } else if (x->type == RAY_I16 || x->type == -RAY_I16) { - h = hash_mix_u64(h, (uint64_t)(uint16_t)x->i16); - } else if (x->type == RAY_U8 || x->type == -RAY_U8 || - x->type == RAY_BOOL || x->type == -RAY_BOOL) { - h = hash_mix_u64(h, (uint64_t)x->u8); - } else if (x->type == RAY_F64 || x->type == -RAY_F64) { - uint64_t bits = 0; - memcpy(&bits, &x->f64, sizeof(bits)); - h = hash_mix_u64(h, bits); - } - return h; -} - -static ray_t* select_cache_get(ray_t* tbl, int64_t nrows, - uint64_t hash, uint64_t from_hash) { - if (!g_ray_profile.active) return NULL; - if (!hash) return NULL; - for (uint16_t i = 0; i < SELECT_CACHE_N; i++) { - select_cache_entry_t* e = &g_select_cache[i]; - if (e->result && e->env_gen == ray_env_generation() && - e->nrows == nrows && e->hash == hash && - (e->tbl == tbl || (from_hash && e->from_hash 
== from_hash))) { - ray_retain(e->result); - return e->result; - } - } - return NULL; -} - -static void select_expr_cache_put(uint64_t hash, uint64_t from_hash, - ray_t* result); - -static void select_cache_put(ray_t* tbl, int64_t nrows, - uint64_t hash, uint64_t from_hash, - ray_t* result) { - if (!g_ray_profile.active) return; - if (!tbl || !hash || !result || RAY_IS_ERR(result)) return; - select_cache_entry_t* e = - &g_select_cache[g_select_cache_next++ % SELECT_CACHE_N]; - if (e->result) ray_release(e->result); - e->tbl = tbl; - e->nrows = nrows; - e->hash = hash; - e->from_hash = from_hash; - e->env_gen = ray_env_generation(); - e->result = result; - ray_retain(e->result); - select_expr_cache_put(hash, from_hash, result); -} - -typedef struct { - uint64_t hash; - uint64_t from_hash; - uint64_t env_gen; - ray_t* result; -} select_expr_cache_entry_t; - -#define SELECT_EXPR_CACHE_N 1024 -static select_expr_cache_entry_t g_select_expr_cache[SELECT_EXPR_CACHE_N]; -static uint16_t g_select_expr_cache_next = 0; - -static ray_t* select_expr_cache_get(uint64_t hash, uint64_t from_hash) { - if (!g_ray_profile.active) return NULL; - if (!hash) return NULL; - for (uint16_t i = 0; i < SELECT_EXPR_CACHE_N; i++) { - select_expr_cache_entry_t* e = &g_select_expr_cache[i]; - if (e->result && e->env_gen == ray_env_generation() && - e->hash == hash && e->from_hash == from_hash) { - ray_retain(e->result); - return e->result; - } - } - return NULL; -} - -static void select_expr_cache_put(uint64_t hash, uint64_t from_hash, - ray_t* result) { - if (!g_ray_profile.active) return; - if (!hash || !result || RAY_IS_ERR(result)) return; - select_expr_cache_entry_t* e = - &g_select_expr_cache[g_select_expr_cache_next++ % SELECT_EXPR_CACHE_N]; - if (e->result) ray_release(e->result); - e->hash = hash; - e->from_hash = from_hash; - e->env_gen = ray_env_generation(); - e->result = result; - ray_retain(e->result); -} - /* Flatten a RAY_DICT (keys SYM vec + vals LIST) into a transient * 
[k0,v0,k1,v1,...] array view so the existing dict-walking loops in * ray_select_fn et al. can iterate without rewriting every site. @@ -1634,1260 +1494,6 @@ static int atom_i64_const(ray_t* v, int64_t* out) { } } -typedef struct { - const void* base; - int8_t type; - uint8_t attrs; - int op; - int64_t rhs; -} xbar_count_clause_t; - -typedef struct { - int64_t key; - int64_t count; -} xbar_count_pair_t; - -typedef struct { - uint32_t key; - uint32_t count; -} i16x2_count_pair_t; - -typedef struct { - int32_t key; - uint32_t count; -} i32_count_pair_t; - -typedef struct { - int16_t key; - uint32_t count; -} i16_count_pair_t; - -typedef struct { - const int64_t* key_data; - int64_t bucket; - xbar_count_clause_t clauses[16]; - uint8_t n_clauses; - uint32_t cap; - int64_t* keys; - uint32_t* counts; - uint8_t* used; - _Atomic int overflow; -} xbar_count_ctx_t; - -typedef struct { - const int16_t* key0; - const int16_t* key1; - xbar_count_clause_t clauses[16]; - uint8_t n_clauses; - uint32_t cap; - uint32_t* keys; - uint32_t* counts; - uint8_t* used; - _Atomic int overflow; -} i16x2_count_ctx_t; - -typedef struct { - const int16_t* key; - uint32_t* counts; -} i16_ne0_count_ctx_t; - -typedef struct { - const int32_t* group; - const int64_t* distinct; - uint32_t cap; - int32_t* groups; - int64_t* values; - uint8_t* used; - _Atomic int overflow; -} i32_i64_cd_ctx_t; - -static int xbar_count_pair_cmp(const void* a, const void* b) { - const xbar_count_pair_t* pa = (const xbar_count_pair_t*)a; - const xbar_count_pair_t* pb = (const xbar_count_pair_t*)b; - return (pa->key > pb->key) - (pa->key < pb->key); -} - -static int i16x2_count_pair_desc_cmp(const void* a, const void* b) { - const i16x2_count_pair_t* pa = (const i16x2_count_pair_t*)a; - const i16x2_count_pair_t* pb = (const i16x2_count_pair_t*)b; - if (pa->count != pb->count) - return (pa->count < pb->count) - (pa->count > pb->count); - return (pa->key > pb->key) - (pa->key < pb->key); -} - -static int 
i32_count_pair_desc_cmp(const void* a, const void* b) { - const i32_count_pair_t* pa = (const i32_count_pair_t*)a; - const i32_count_pair_t* pb = (const i32_count_pair_t*)b; - if (pa->count != pb->count) - return (pa->count < pb->count) - (pa->count > pb->count); - return (pa->key > pb->key) - (pa->key < pb->key); -} - -static int i16_count_pair_desc_cmp(const void* a, const void* b) { - const i16_count_pair_t* pa = (const i16_count_pair_t*)a; - const i16_count_pair_t* pb = (const i16_count_pair_t*)b; - if (pa->count != pb->count) - return (pa->count < pb->count) - (pa->count > pb->count); - return (pa->key > pb->key) - (pa->key < pb->key); -} - -static uint64_t xbar_count_hash_i64(int64_t v) { - uint64_t h = (uint64_t)v; - h ^= h >> 33; - h *= 0xff51afd7ed558ccdULL; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53ULL; - h ^= h >> 33; - return h; -} - -static uint32_t count_hash_u32(uint32_t v) { - uint32_t h = v; - h ^= h >> 16; - h *= 0x7feb352dU; - h ^= h >> 15; - h *= 0x846ca68bU; - h ^= h >> 16; - return h; -} - -static uint64_t count_hash_i32_i64(int32_t g, int64_t v) { - uint64_t h = (uint64_t)(uint32_t)g * 0x9E3779B97F4A7C15ULL; - uint64_t x = (uint64_t)v; - x ^= x >> 33; - x *= 0xff51afd7ed558ccdULL; - x ^= x >> 33; - h ^= x + 0xBF58476D1CE4E5B9ULL + (h << 6) + (h >> 2); - h ^= h >> 33; - return h; -} - -static void xbar_count_worker_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t end) { - xbar_count_ctx_t* ctx = (xbar_count_ctx_t*)raw; - uint32_t cap = ctx->cap; - uint32_t mask = cap - 1u; - int64_t* keys = ctx->keys + (size_t)worker_id * cap; - uint32_t* counts = ctx->counts + (size_t)worker_id * cap; - uint8_t* used = ctx->used + (size_t)worker_id * cap; - int64_t n_groups = 0; - int64_t bucket = ctx->bucket; - - for (int64_t r = start; r < end; r++) { - uint8_t pass = 1; - for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) { - const xbar_count_clause_t* c = &ctx->clauses[ci]; - int64_t v = read_col_i64(c->base, r, c->type, c->attrs); - if (c->op == 1) 
pass &= (uint8_t)(v == c->rhs); - else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs); - else pass &= (uint8_t)(v <= c->rhs); - if (!pass) break; - } - if (!pass) continue; - int64_t ts = ctx->key_data[r]; - int64_t q = ts / bucket; - if ((ts ^ bucket) < 0 && q * bucket != ts) q--; - int64_t k = q * bucket; - uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask; - while (used[slot] && keys[slot] != k) - slot = (slot + 1u) & mask; - if (!used[slot]) { - if (n_groups >= (int64_t)(cap / 2)) { - atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed); - return; - } - used[slot] = 1; - keys[slot] = k; - n_groups++; - } - counts[slot]++; - } -} - -static void i16x2_count_worker_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t end) { - i16x2_count_ctx_t* ctx = (i16x2_count_ctx_t*)raw; - uint32_t cap = ctx->cap; - uint32_t mask = cap - 1u; - uint32_t* keys = ctx->keys + (size_t)worker_id * cap; - uint32_t* counts = ctx->counts + (size_t)worker_id * cap; - uint8_t* used = ctx->used + (size_t)worker_id * cap; - int64_t n_groups = 0; - - for (int64_t r = start; r < end; r++) { - uint8_t pass = 1; - for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) { - const xbar_count_clause_t* c = &ctx->clauses[ci]; - int64_t v = read_col_i64(c->base, r, c->type, c->attrs); - if (c->op == 1) pass &= (uint8_t)(v == c->rhs); - else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs); - else pass &= (uint8_t)(v <= c->rhs); - if (!pass) break; - } - if (!pass) continue; - uint32_t k = ((uint32_t)(uint16_t)ctx->key0[r] << 16) | - (uint32_t)(uint16_t)ctx->key1[r]; - uint32_t slot = count_hash_u32(k) & mask; - while (used[slot] && keys[slot] != k) - slot = (slot + 1u) & mask; - if (!used[slot]) { - if (n_groups >= (int64_t)(cap / 2)) { - atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed); - return; - } - used[slot] = 1; - keys[slot] = k; - n_groups++; - } - counts[slot]++; - } -} - -static void i16_ne0_count_worker_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t 
end) { - i16_ne0_count_ctx_t* ctx = (i16_ne0_count_ctx_t*)raw; - uint32_t* counts = ctx->counts + (size_t)worker_id * 65536u; - const int16_t* key = ctx->key; - for (int64_t r = start; r < end; r++) { - int16_t v = key[r]; - if (v) - counts[(uint32_t)((int32_t)v + 32768)]++; - } -} - -static void i32_i64_cd_worker_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t end) { - i32_i64_cd_ctx_t* ctx = (i32_i64_cd_ctx_t*)raw; - uint32_t cap = ctx->cap; - uint32_t mask = cap - 1u; - int32_t* groups = ctx->groups + (size_t)worker_id * cap; - int64_t* values = ctx->values + (size_t)worker_id * cap; - uint8_t* used = ctx->used + (size_t)worker_id * cap; - int64_t n_filled = 0; - - for (int64_t r = start; r < end; r++) { - int32_t g = ctx->group[r]; - int64_t v = ctx->distinct[r]; - uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & mask; - while (used[slot] && (groups[slot] != g || values[slot] != v)) - slot = (slot + 1u) & mask; - if (!used[slot]) { - if (n_filled >= (int64_t)(cap * 7u / 10u)) { - atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed); - return; - } - used[slot] = 1; - groups[slot] = g; - values[slot] = v; - n_filled++; - } - } -} - -static int sym_name_eq(int64_t sym, const char* name, size_t len) { - ray_t* s = ray_sym_str(sym); - return s && ray_str_len(s) == len && - memcmp(ray_str_ptr(s), name, len) == 0; -} - -static int parse_xbar_count_clause(ray_t* tbl, ray_t* expr, - xbar_count_clause_t* clauses, - uint8_t* n_clauses) { - if (!expr || expr->type != RAY_LIST || ray_len(expr) < 3) return 0; - ray_t** elems = (ray_t**)ray_data(expr); - if (!elems[0] || elems[0]->type != -RAY_SYM) return 0; - ray_t* head = ray_sym_str(elems[0]->i64); - if (!head) return 0; - const char* hn = ray_str_ptr(head); - size_t hl = ray_str_len(head); - if (hl == 3 && memcmp(hn, "and", 3) == 0) { - for (int64_t i = 1; i < ray_len(expr); i++) - if (!parse_xbar_count_clause(tbl, elems[i], clauses, n_clauses)) - return 0; - return 1; - } - if (ray_len(expr) != 3 
|| *n_clauses >= 16) return 0; - int op = 0; - if (hl == 2 && memcmp(hn, "==", 2) == 0) op = 1; - else if (hl == 2 && memcmp(hn, ">=", 2) == 0) op = 2; - else if (hl == 2 && memcmp(hn, "<=", 2) == 0) op = 3; - else return 0; - - ray_t* lhs = elems[1]; - ray_t* rhs = elems[2]; - int64_t rhs_i = 0; - if (!lhs || lhs->type != -RAY_SYM || !(lhs->attrs & RAY_ATTR_NAME) || - !atom_i64_const(rhs, &rhs_i)) - return 0; - ray_t* col = ray_table_get_col(tbl, lhs->i64); - if (!col || !ray_is_vec(col) || RAY_IS_PARTED(col->type) || - col->type == RAY_MAPCOMMON || (col->attrs & RAY_ATTR_HAS_NULLS)) - return 0; - int8_t ct = col->type; - if (ct != RAY_BOOL && ct != RAY_U8 && ct != RAY_I16 && - ct != RAY_I32 && ct != RAY_I64 && ct != RAY_DATE && - ct != RAY_TIME && ct != RAY_TIMESTAMP) - return 0; - clauses[*n_clauses] = (xbar_count_clause_t){ - .base = ray_data(col), - .type = ct, - .attrs = col->attrs, - .op = op, - .rhs = rhs_i, - }; - (*n_clauses)++; - return 1; -} - -static int count_clause_score(const xbar_count_clause_t* c) { - if (c->op == 1 && ray_sym_elem_size(c->type, c->attrs) >= 8) return 0; - if (c->op == 1) return 1; - return 2; -} - -static void order_count_clauses(xbar_count_clause_t* clauses, uint8_t n) { - for (uint8_t i = 1; i < n; i++) { - xbar_count_clause_t v = clauses[i]; - int vs = count_clause_score(&v); - uint8_t j = i; - while (j > 0 && count_clause_score(&clauses[j - 1]) > vs) { - clauses[j] = clauses[j - 1]; - j--; - } - clauses[j] = v; - } -} - -static int xbar_clause_cache_eq(const xbar_count_clause_t* a, uint8_t an, - const xbar_count_clause_t* b, uint8_t bn) { - if (an != bn) return 0; - for (uint8_t i = 0; i < an; i++) { - if (a[i].base != b[i].base || a[i].type != b[i].type || - a[i].attrs != b[i].attrs || a[i].op != b[i].op || - a[i].rhs != b[i].rhs) - return 0; - } - return 1; -} - -static int match_i16_key_ne_zero(ray_t* where_expr, int64_t key_sym) { - if (!where_expr || where_expr->type != RAY_LIST || ray_len(where_expr) != 3) - return 0; - 
ray_t** e = (ray_t**)ray_data(where_expr); - if (!e[0] || e[0]->type != -RAY_SYM || - !sym_name_eq(e[0]->i64, "!=", 2)) - return 0; - ray_t* lhs = e[1]; - int64_t rhs = 0; - return lhs && lhs->type == -RAY_SYM && (lhs->attrs & RAY_ATTR_NAME) && - lhs->i64 == key_sym && atom_i64_const(e[2], &rhs) && rhs == 0; -} - -static ray_t* try_i16_ne0_count_desc_select(ray_t* tbl, ray_t* where_expr, - ray_t* by_expr, ray_t* take_expr, - ray_t** dict_elems, - int64_t dict_n, - int64_t from_id, - int64_t where_id, - int64_t by_id, - int64_t take_id, - int64_t asc_id, - int64_t desc_id, - int64_t nearest_id) { - if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr || - !take_expr || by_expr->type != -RAY_SYM || - !(by_expr->attrs & RAY_ATTR_NAME)) - return NULL; - int64_t key_sym = by_expr->i64; - int64_t take_n = 0; - if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024) - return NULL; - if (!match_i16_key_ne_zero(where_expr, key_sym)) - return NULL; - - int64_t count_alias = -1; - int saw_desc = 0; - int saw_key_projection = 0; - for (int64_t i = 0; i + 1 < dict_n; i += 2) { - int64_t kid = dict_elems[i]->i64; - ray_t* v = dict_elems[i + 1]; - if (kid == from_id || kid == where_id || kid == by_id || - kid == take_id || kid == nearest_id) - continue; - if (kid == desc_id) { - if (!v || v->type != -RAY_SYM) - return NULL; - saw_desc = 1; - continue; - } - if (kid == asc_id) return NULL; - if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) && - kid == key_sym && v->i64 == key_sym) { - saw_key_projection = 1; - continue; - } - if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2) - return NULL; - ray_t** ae = (ray_t**)ray_data(v); - if (!ae[0] || ae[0]->type != -RAY_SYM || - !sym_name_eq(ae[0]->i64, "count", 5)) - return NULL; - ray_t* arg = ae[1]; - if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME) || - arg->i64 != key_sym) - return NULL; - count_alias = kid; - } - if (!saw_desc || !saw_key_projection || 
count_alias < 0) - return NULL; - - ray_t* col = ray_table_get_col(tbl, key_sym); - if (!col || !ray_is_vec(col) || col->type != RAY_I16 || - (col->attrs & RAY_ATTR_HAS_NULLS)) - return NULL; - - static ray_t* cache_result = NULL; - static ray_t* cache_tbl = NULL; - static ray_t* cache_col = NULL; - static int64_t cache_len = -1; - static int64_t cache_key_sym = -1; - static int64_t cache_count_alias = -1; - static int64_t cache_take = -1; - if (cache_result && cache_tbl == tbl && cache_col == col && - cache_len == col->len && cache_key_sym == key_sym && - cache_count_alias == count_alias && cache_take == take_n) { - ray_retain(cache_result); - return cache_result; - } - - ray_pool_t* pool = ray_pool_get(); - uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; - if (nw == 0) nw = 1; - ray_t* counts_hdr = NULL; - uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, - (size_t)nw * 65536u * sizeof(uint32_t)); - if (!counts) - return ray_error("oom", NULL); - - i16_ne0_count_ctx_t ctx = { - .key = (const int16_t*)ray_data(col), - .counts = counts, - }; - int64_t nrows = ray_table_nrows(tbl); - if (pool && nrows >= RAY_PARALLEL_THRESHOLD) - ray_pool_dispatch(pool, i16_ne0_count_worker_fn, &ctx, nrows); - else - i16_ne0_count_worker_fn(&ctx, 0, 0, nrows); - - i16_count_pair_t top[1024]; - int64_t top_n = 0; - for (uint32_t s = 0; s < 65536u; s++) { - uint32_t total = 0; - for (uint32_t w = 0; w < nw; w++) - total += counts[(size_t)w * 65536u + s]; - if (!total) continue; - i16_count_pair_t cand = { - .key = (int16_t)((int32_t)s - 32768), - .count = total, - }; - if (top_n < take_n) { - top[top_n++] = cand; - continue; - } - int64_t min_i = 0; - for (int64_t i = 1; i < top_n; i++) { - if (top[i].count < top[min_i].count || - (top[i].count == top[min_i].count && top[i].key > top[min_i].key)) - min_i = i; - } - if (cand.count > top[min_i].count || - (cand.count == top[min_i].count && cand.key < top[min_i].key)) - top[min_i] = cand; - } - scratch_free(counts_hdr); - 
qsort(top, (size_t)top_n, sizeof(i16_count_pair_t), - i16_count_pair_desc_cmp); - - int64_t out_n = top_n; - ray_t* key_out = ray_vec_new(RAY_I16, out_n); - ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); - if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) { - if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out); - if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); - return ray_error("oom", NULL); - } - key_out->len = out_n; - cnt_out->len = out_n; - int16_t* ko = (int16_t*)ray_data(key_out); - int64_t* co = (int64_t*)ray_data(cnt_out); - for (int64_t i = 0; i < out_n; i++) { - ko[i] = top[i].key; - co[i] = (int64_t)top[i].count; - } - - ray_t* out = ray_table_new(2); - if (!out || RAY_IS_ERR(out)) { - ray_release(key_out); ray_release(cnt_out); - return out ? out : ray_error("oom", NULL); - } - out = ray_table_add_col(out, key_sym, key_out); - out = ray_table_add_col(out, count_alias, cnt_out); - ray_release(key_out); ray_release(cnt_out); - if (cache_result) - ray_release(cache_result); - cache_result = out; - cache_tbl = tbl; - cache_col = col; - cache_len = col->len; - cache_key_sym = key_sym; - cache_count_alias = count_alias; - cache_take = take_n; - ray_retain(cache_result); - return out; -} - -static ray_t* try_i32_i64_count_distinct_select(ray_t* tbl, ray_t* where_expr, - ray_t* by_expr, - ray_t* take_expr, - ray_t** dict_elems, - int64_t dict_n, - int64_t from_id, - int64_t where_id, - int64_t by_id, - int64_t take_id, - int64_t asc_id, - int64_t desc_id, - int64_t nearest_id) { - if (!tbl || tbl->type != RAY_TABLE || where_expr || !by_expr || - !take_expr || by_expr->type != -RAY_SYM || - !(by_expr->attrs & RAY_ATTR_NAME)) - return NULL; - - int64_t take_n = 0; - if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024) - return NULL; - - int64_t group_sym = by_expr->i64; - int64_t distinct_sym = -1; - int64_t count_alias = -1; - int saw_desc = 0; - int saw_group_projection = 0; - for (int64_t i = 0; i + 1 
< dict_n; i += 2) { - int64_t kid = dict_elems[i]->i64; - ray_t* v = dict_elems[i + 1]; - if (kid == from_id || kid == where_id || kid == by_id || - kid == take_id || kid == nearest_id) - continue; - if (kid == desc_id) { - if (!v || v->type != -RAY_SYM) - return NULL; - saw_desc = 1; - continue; - } - if (kid == asc_id) return NULL; - if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) && - kid == group_sym && v->i64 == group_sym) { - saw_group_projection = 1; - continue; - } - if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2) - return NULL; - ray_t** ae = (ray_t**)ray_data(v); - if (!ae[0] || ae[0]->type != -RAY_SYM || - !sym_name_eq(ae[0]->i64, "count", 5)) - return NULL; - ray_t* inner = ae[1]; - if (!inner || inner->type != RAY_LIST || ray_len(inner) != 2) - return NULL; - ray_t** ie = (ray_t**)ray_data(inner); - if (!ie[0] || ie[0]->type != -RAY_SYM || - !sym_name_eq(ie[0]->i64, "distinct", 8)) - return NULL; - ray_t* arg = ie[1]; - if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME)) - return NULL; - distinct_sym = arg->i64; - count_alias = kid; - } - if (!saw_desc || !saw_group_projection || count_alias < 0 || - distinct_sym < 0) - return NULL; - - ray_t* gcol = ray_table_get_col(tbl, group_sym); - ray_t* dcol = ray_table_get_col(tbl, distinct_sym); - if (!gcol || !dcol || !ray_is_vec(gcol) || !ray_is_vec(dcol) || - gcol->type != RAY_I32 || dcol->type != RAY_I64 || - (gcol->attrs & RAY_ATTR_HAS_NULLS) || - (dcol->attrs & RAY_ATTR_HAS_NULLS)) - return NULL; - - static ray_t* cache_result = NULL; - static ray_t* cache_tbl = NULL; - static int64_t cache_len = -1; - static int64_t cache_group_sym = -1; - static int64_t cache_distinct_sym = -1; - static int64_t cache_count_alias = -1; - static int64_t cache_take = -1; - if (cache_result && cache_tbl == tbl && cache_len == gcol->len && - cache_group_sym == group_sym && cache_distinct_sym == distinct_sym && - cache_count_alias == count_alias && cache_take == take_n) { - 
ray_retain(cache_result); - return cache_result; - } - - int64_t nrows = ray_table_nrows(tbl); - ray_pool_t* pool = ray_pool_get(); - uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; - if (nw == 0) nw = 1; - const uint32_t local_cap = 1u << 20; - ray_t *lg_hdr = NULL, *lv_hdr = NULL, *lu_hdr = NULL; - int32_t* lg = (int32_t*)scratch_calloc(&lg_hdr, - (size_t)nw * local_cap * sizeof(int32_t)); - int64_t* lv = (int64_t*)scratch_calloc(&lv_hdr, - (size_t)nw * local_cap * sizeof(int64_t)); - uint8_t* lu = (uint8_t*)scratch_calloc(&lu_hdr, (size_t)nw * local_cap); - if (!lg || !lv || !lu) { - if (lg_hdr) scratch_free(lg_hdr); - if (lv_hdr) scratch_free(lv_hdr); - if (lu_hdr) scratch_free(lu_hdr); - return ray_error("oom", NULL); - } - - i32_i64_cd_ctx_t ctx = { - .group = (const int32_t*)ray_data(gcol), - .distinct = (const int64_t*)ray_data(dcol), - .cap = local_cap, - .groups = lg, - .values = lv, - .used = lu, - }; - atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed); - if (pool && nrows >= RAY_PARALLEL_THRESHOLD) - ray_pool_dispatch(pool, i32_i64_cd_worker_fn, &ctx, nrows); - else - i32_i64_cd_worker_fn(&ctx, 0, 0, nrows); - if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) { - scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); - return NULL; - } - - const uint32_t gcap = 1u << 23; - const uint32_t gmask = gcap - 1u; - ray_t *gg_hdr = NULL, *gv_hdr = NULL, *gu_hdr = NULL; - int32_t* gg = (int32_t*)scratch_calloc(&gg_hdr, (size_t)gcap * sizeof(int32_t)); - int64_t* gv = (int64_t*)scratch_calloc(&gv_hdr, (size_t)gcap * sizeof(int64_t)); - uint8_t* gu = (uint8_t*)scratch_calloc(&gu_hdr, (size_t)gcap); - if (!gg || !gv || !gu) { - scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); - if (gg_hdr) scratch_free(gg_hdr); - if (gv_hdr) scratch_free(gv_hdr); - if (gu_hdr) scratch_free(gu_hdr); - return ray_error("oom", NULL); - } - - int64_t global_n = 0; - for (uint32_t w = 0; w < nw; w++) { - int32_t* wg = lg + 
(size_t)w * local_cap; - int64_t* wv = lv + (size_t)w * local_cap; - uint8_t* wu = lu + (size_t)w * local_cap; - for (uint32_t s = 0; s < local_cap; s++) { - if (!wu[s]) continue; - int32_t g = wg[s]; - int64_t v = wv[s]; - uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & gmask; - while (gu[slot] && (gg[slot] != g || gv[slot] != v)) - slot = (slot + 1u) & gmask; - if (!gu[slot]) { - if (global_n >= (int64_t)(gcap * 7u / 10u)) { - scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr); - scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); - return NULL; - } - gu[slot] = 1; - gg[slot] = g; - gv[slot] = v; - global_n++; - } - } - } - scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); - - const uint32_t rcap = 4096; - const uint32_t rmask = rcap - 1u; - int32_t rkeys[4096]; - uint32_t rcounts[4096]; - uint8_t rused[4096]; - memset(rused, 0, sizeof(rused)); - int64_t region_n = 0; - for (uint32_t s = 0; s < gcap; s++) { - if (!gu[s]) continue; - int32_t g = gg[s]; - uint32_t slot = count_hash_u32((uint32_t)g) & rmask; - while (rused[slot] && rkeys[slot] != g) - slot = (slot + 1u) & rmask; - if (!rused[slot]) { - if (region_n >= (int64_t)(rcap / 2)) { - scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr); - return NULL; - } - rused[slot] = 1; - rkeys[slot] = g; - rcounts[slot] = 0; - region_n++; - } - rcounts[slot]++; - } - scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr); - - ray_t* pairs_hdr = NULL; - i32_count_pair_t* pairs = (i32_count_pair_t*)scratch_alloc( - &pairs_hdr, (size_t)region_n * sizeof(i32_count_pair_t)); - if (!pairs && region_n > 0) - return ray_error("oom", NULL); - int64_t pi = 0; - for (uint32_t s = 0; s < rcap; s++) { - if (!rused[s]) continue; - pairs[pi++] = (i32_count_pair_t){ .key = rkeys[s], .count = rcounts[s] }; - } - qsort(pairs, (size_t)region_n, sizeof(i32_count_pair_t), - i32_count_pair_desc_cmp); - - int64_t out_n = region_n < take_n ? 
region_n : take_n; - ray_t* key_out = ray_vec_new(RAY_I32, out_n); - ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); - if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) { - if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out); - if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); - scratch_free(pairs_hdr); - return ray_error("oom", NULL); - } - key_out->len = out_n; - cnt_out->len = out_n; - int32_t* ko = (int32_t*)ray_data(key_out); - int64_t* co = (int64_t*)ray_data(cnt_out); - for (int64_t i = 0; i < out_n; i++) { - ko[i] = pairs[i].key; - co[i] = (int64_t)pairs[i].count; - } - scratch_free(pairs_hdr); - - ray_t* out = ray_table_new(2); - if (!out || RAY_IS_ERR(out)) { - ray_release(key_out); ray_release(cnt_out); - return out ? out : ray_error("oom", NULL); - } - out = ray_table_add_col(out, group_sym, key_out); - out = ray_table_add_col(out, count_alias, cnt_out); - ray_release(key_out); ray_release(cnt_out); - if (cache_result) - ray_release(cache_result); - cache_result = out; - cache_tbl = tbl; - cache_len = gcol->len; - cache_group_sym = group_sym; - cache_distinct_sym = distinct_sym; - cache_count_alias = count_alias; - cache_take = take_n; - ray_retain(cache_result); - return out; -} - -static ray_t* try_i16x2_count_desc_select(ray_t* tbl, ray_t* where_expr, - ray_t* by_expr, ray_t* take_expr, - ray_t** dict_elems, int64_t dict_n, - int64_t from_id, int64_t where_id, - int64_t by_id, int64_t take_id, - int64_t asc_id, int64_t desc_id, - int64_t nearest_id) { - if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr || - !take_expr || by_expr->type != RAY_DICT) - return NULL; - - int64_t take_n = 0; - if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000) - return NULL; - - DICT_VIEW_DECL(bv); - DICT_VIEW_OPEN(by_expr, bv); - if (DICT_VIEW_OVERFLOW(bv) || bv_n != 4) return NULL; - ray_t* key0_atom = bv[0]; - ray_t* key0_val = bv[1]; - ray_t* key1_atom = bv[2]; - ray_t* key1_val = bv[3]; - if 
(!key0_atom || key0_atom->type != -RAY_SYM || - !key1_atom || key1_atom->type != -RAY_SYM || - !key0_val || key0_val->type != -RAY_SYM || - !key1_val || key1_val->type != -RAY_SYM || - !(key0_val->attrs & RAY_ATTR_NAME) || - !(key1_val->attrs & RAY_ATTR_NAME) || - key0_atom->i64 != key0_val->i64 || - key1_atom->i64 != key1_val->i64) - return NULL; - - int64_t count_alias = -1; - int saw_desc = 0; - for (int64_t i = 0; i + 1 < dict_n; i += 2) { - int64_t kid = dict_elems[i]->i64; - ray_t* v = dict_elems[i + 1]; - if (kid == from_id || kid == where_id || kid == by_id || - kid == take_id || kid == nearest_id) - continue; - if (kid == desc_id) { - if (!v || v->type != -RAY_SYM) - return NULL; - saw_desc = 1; - continue; - } - if (kid == asc_id) return NULL; - if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL; - ray_t** ae = (ray_t**)ray_data(v); - if (!ae[0] || ae[0]->type != -RAY_SYM || - !sym_name_eq(ae[0]->i64, "count", 5)) - return NULL; - count_alias = kid; - } - if (!saw_desc || count_alias < 0) return NULL; - - ray_t* col0 = ray_table_get_col(tbl, key0_atom->i64); - ray_t* col1 = ray_table_get_col(tbl, key1_atom->i64); - if (!col0 || !col1 || !ray_is_vec(col0) || !ray_is_vec(col1) || - col0->type != RAY_I16 || col1->type != RAY_I16 || - (col0->attrs & RAY_ATTR_HAS_NULLS) || - (col1->attrs & RAY_ATTR_HAS_NULLS)) - return NULL; - - xbar_count_clause_t clauses[16]; - uint8_t n_clauses = 0; - if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) || - n_clauses == 0) - return NULL; - order_count_clauses(clauses, n_clauses); - - static ray_t* cache_result = NULL; - static ray_t* cache_tbl = NULL; - static ray_t* cache_col0 = NULL; - static ray_t* cache_col1 = NULL; - static int64_t cache_len = -1; - static int64_t cache_key0 = -1; - static int64_t cache_key1 = -1; - static int64_t cache_count_alias = -1; - static int64_t cache_take = -1; - static uint8_t cache_n_clauses = 0; - static xbar_count_clause_t cache_clauses[16]; - if (cache_result 
&& cache_tbl == tbl && cache_col0 == col0 && - cache_col1 == col1 && cache_len == col0->len && - cache_key0 == key0_atom->i64 && cache_key1 == key1_atom->i64 && - cache_count_alias == count_alias && cache_take == take_n && - xbar_clause_cache_eq(cache_clauses, cache_n_clauses, - clauses, n_clauses)) { - ray_retain(cache_result); - return cache_result; - } - - int64_t nrows = ray_table_nrows(tbl); - const uint32_t cap = 4096; - const uint32_t mask = cap - 1u; - ray_pool_t* pool = ray_pool_get(); - uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; - if (nw == 0) nw = 1; - - ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL; - uint32_t* keys = (uint32_t*)scratch_calloc(&keys_hdr, - (size_t)nw * cap * sizeof(uint32_t)); - uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, - (size_t)nw * cap * sizeof(uint32_t)); - uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap); - if (!keys || !counts || !used) { - if (keys_hdr) scratch_free(keys_hdr); - if (counts_hdr) scratch_free(counts_hdr); - if (used_hdr) scratch_free(used_hdr); - return ray_error("oom", NULL); - } - - i16x2_count_ctx_t ctx = { - .key0 = (const int16_t*)ray_data(col0), - .key1 = (const int16_t*)ray_data(col1), - .n_clauses = n_clauses, - .cap = cap, - .keys = keys, - .counts = counts, - .used = used, - }; - memcpy(ctx.clauses, clauses, sizeof(clauses)); - atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed); - if (pool && nrows >= RAY_PARALLEL_THRESHOLD) - ray_pool_dispatch(pool, i16x2_count_worker_fn, &ctx, nrows); - else - i16x2_count_worker_fn(&ctx, 0, 0, nrows); - if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) { - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return NULL; - } - - ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL; - uint32_t* mkeys = (uint32_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(uint32_t)); - uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * 
sizeof(uint32_t)); - uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap); - if (!mkeys || !mcounts || !mused) { - scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); - if (mkeys_hdr) scratch_free(mkeys_hdr); - if (mcounts_hdr) scratch_free(mcounts_hdr); - if (mused_hdr) scratch_free(mused_hdr); - return ray_error("oom", NULL); - } - - int64_t n_groups = 0; - for (uint32_t w = 0; w < nw; w++) { - uint32_t* wk = keys + (size_t)w * cap; - uint32_t* wc = counts + (size_t)w * cap; - uint8_t* wu = used + (size_t)w * cap; - for (uint32_t s = 0; s < cap; s++) { - if (!wu[s]) continue; - uint32_t k = wk[s]; - uint32_t slot = count_hash_u32(k) & mask; - while (mused[slot] && mkeys[slot] != k) - slot = (slot + 1u) & mask; - if (!mused[slot]) { - if (n_groups >= (int64_t)(cap / 2)) { - scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); - scratch_free(mused_hdr); scratch_free(keys_hdr); - scratch_free(counts_hdr); scratch_free(used_hdr); - return NULL; - } - mused[slot] = 1; - mkeys[slot] = k; - n_groups++; - } - mcounts[slot] += wc[s]; - } - } - - int64_t out_n = n_groups < take_n ? 
n_groups : take_n; - ray_t* pairs_hdr = NULL; - i16x2_count_pair_t* pairs = (i16x2_count_pair_t*)scratch_alloc( - &pairs_hdr, (size_t)n_groups * sizeof(i16x2_count_pair_t)); - if (!pairs && n_groups > 0) { - scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr); - scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); - return ray_error("oom", NULL); - } - int64_t pi = 0; - for (uint32_t s = 0; s < cap; s++) { - if (!mused[s]) continue; - pairs[pi++] = (i16x2_count_pair_t){ .key = mkeys[s], .count = mcounts[s] }; - } - qsort(pairs, (size_t)n_groups, sizeof(i16x2_count_pair_t), - i16x2_count_pair_desc_cmp); - - ray_t* key0_out = ray_vec_new(RAY_I16, out_n); - ray_t* key1_out = ray_vec_new(RAY_I16, out_n); - ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); - if (!key0_out || !key1_out || !cnt_out || - RAY_IS_ERR(key0_out) || RAY_IS_ERR(key1_out) || RAY_IS_ERR(cnt_out)) { - if (key0_out && !RAY_IS_ERR(key0_out)) ray_release(key0_out); - if (key1_out && !RAY_IS_ERR(key1_out)) ray_release(key1_out); - if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); - scratch_free(pairs_hdr); - scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr); - scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); - return ray_error("oom", NULL); - } - key0_out->len = out_n; - key1_out->len = out_n; - cnt_out->len = out_n; - int16_t* k0o = (int16_t*)ray_data(key0_out); - int16_t* k1o = (int16_t*)ray_data(key1_out); - int64_t* co = (int64_t*)ray_data(cnt_out); - for (int64_t i = 0; i < out_n; i++) { - uint32_t k = pairs[i].key; - k0o[i] = (int16_t)(uint16_t)(k >> 16); - k1o[i] = (int16_t)(uint16_t)k; - co[i] = (int64_t)pairs[i].count; - } - scratch_free(pairs_hdr); - scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr); - scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); - - ray_t* out = ray_table_new(3); - if (!out || RAY_IS_ERR(out)) { - ray_release(key0_out); 
ray_release(key1_out); ray_release(cnt_out); - return out ? out : ray_error("oom", NULL); - } - out = ray_table_add_col(out, key0_atom->i64, key0_out); - out = ray_table_add_col(out, key1_atom->i64, key1_out); - out = ray_table_add_col(out, count_alias, cnt_out); - ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out); - if (cache_result) - ray_release(cache_result); - cache_result = out; - cache_tbl = tbl; - cache_col0 = col0; - cache_col1 = col1; - cache_len = col0->len; - cache_key0 = key0_atom->i64; - cache_key1 = key1_atom->i64; - cache_count_alias = count_alias; - cache_take = take_n; - cache_n_clauses = n_clauses; - memcpy(cache_clauses, clauses, sizeof(clauses)); - ray_retain(cache_result); - return out; -} - -static ray_t* try_xbar_count_select(ray_t* tbl, ray_t* where_expr, - ray_t* by_expr, ray_t* take_expr, - ray_t** dict_elems, int64_t dict_n, - int64_t from_id, int64_t where_id, - int64_t by_id, int64_t take_id, - int64_t asc_id, int64_t desc_id, - int64_t nearest_id) { - if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr || - !take_expr) - return NULL; - - int64_t take_n = 0; - if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000) - return NULL; - - if (!by_expr || by_expr->type != RAY_DICT) return NULL; - DICT_VIEW_DECL(bv); - DICT_VIEW_OPEN(by_expr, bv); - if (DICT_VIEW_OVERFLOW(bv) || bv_n != 2) return NULL; - ray_t* key_atom = bv[0]; - ray_t* xbar_expr = bv[1]; - if (!key_atom || key_atom->type != -RAY_SYM || - !xbar_expr || xbar_expr->type != RAY_LIST || - ray_len(xbar_expr) != 3) - return NULL; - ray_t** xe = (ray_t**)ray_data(xbar_expr); - if (!xe[0] || xe[0]->type != -RAY_SYM || - !sym_name_eq(xe[0]->i64, "xbar", 4)) - return NULL; - if (!xe[1] || xe[1]->type != -RAY_SYM || - !(xe[1]->attrs & RAY_ATTR_NAME)) - return NULL; - int64_t bucket = 0; - if (!atom_i64_const(xe[2], &bucket) || bucket <= 0) return NULL; - - int64_t count_alias = -1; - int saw_asc = 0; - for (int64_t i = 0; i + 1 < dict_n; i 
+= 2) { - int64_t kid = dict_elems[i]->i64; - ray_t* v = dict_elems[i + 1]; - if (kid == from_id || kid == where_id || kid == by_id || - kid == take_id || kid == nearest_id) - continue; - if (kid == asc_id) { - if (!v || v->type != -RAY_SYM || v->i64 != key_atom->i64) - return NULL; - saw_asc = 1; - continue; - } - if (kid == desc_id) return NULL; - if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL; - ray_t** ae = (ray_t**)ray_data(v); - if (!ae[0] || ae[0]->type != -RAY_SYM || - !sym_name_eq(ae[0]->i64, "count", 5)) - return NULL; - count_alias = kid; - } - if (!saw_asc || count_alias < 0) return NULL; - - ray_t* key_col = ray_table_get_col(tbl, xe[1]->i64); - if (!key_col || !ray_is_vec(key_col) || key_col->type != RAY_TIMESTAMP || - RAY_IS_PARTED(key_col->type) || key_col->type == RAY_MAPCOMMON || - (key_col->attrs & RAY_ATTR_HAS_NULLS)) - return NULL; - - xbar_count_clause_t clauses[16]; - uint8_t n_clauses = 0; - if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) || - n_clauses == 0) - return NULL; - order_count_clauses(clauses, n_clauses); - - int64_t nrows = ray_table_nrows(tbl); - const int64_t* key_data = (const int64_t*)ray_data(key_col); - static ray_t* cache_result = NULL; - static ray_t* cache_tbl = NULL; - static ray_t* cache_key_col = NULL; - static int64_t cache_len = -1; - static int64_t cache_key_sym = -1; - static int64_t cache_out_sym = -1; - static int64_t cache_count_alias = -1; - static int64_t cache_bucket = -1; - static int64_t cache_take = -1; - static uint8_t cache_n_clauses = 0; - static xbar_count_clause_t cache_clauses[16]; - if (cache_result && cache_tbl == tbl && cache_key_col == key_col && - cache_len == key_col->len && cache_key_sym == xe[1]->i64 && - cache_out_sym == key_atom->i64 && cache_count_alias == count_alias && - cache_bucket == bucket && cache_take == take_n && - xbar_clause_cache_eq(cache_clauses, cache_n_clauses, - clauses, n_clauses)) { - ray_retain(cache_result); - return cache_result; - } 
- const uint32_t cap = 4096; - const uint32_t mask = cap - 1u; - ray_pool_t* pool = ray_pool_get(); - uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; - if (nw == 0) nw = 1; - ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL; - int64_t* keys = (int64_t*)scratch_calloc(&keys_hdr, - (size_t)nw * cap * sizeof(int64_t)); - uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, - (size_t)nw * cap * sizeof(uint32_t)); - uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap); - if (!keys || !counts || !used) { - if (keys_hdr) scratch_free(keys_hdr); - if (counts_hdr) scratch_free(counts_hdr); - if (used_hdr) scratch_free(used_hdr); - return ray_error("oom", NULL); - } - - xbar_count_ctx_t ctx = { - .key_data = key_data, - .bucket = bucket, - .n_clauses = n_clauses, - .cap = cap, - .keys = keys, - .counts = counts, - .used = used, - }; - memcpy(ctx.clauses, clauses, sizeof(clauses)); - atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed); - if (pool && nrows >= RAY_PARALLEL_THRESHOLD) - ray_pool_dispatch(pool, xbar_count_worker_fn, &ctx, nrows); - else - xbar_count_worker_fn(&ctx, 0, 0, nrows); - if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) { - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return NULL; - } - - ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL; - int64_t* mkeys = (int64_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(int64_t)); - uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t)); - uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap); - if (!mkeys || !mcounts || !mused) { - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - if (mkeys_hdr) scratch_free(mkeys_hdr); - if (mcounts_hdr) scratch_free(mcounts_hdr); - if (mused_hdr) scratch_free(mused_hdr); - return ray_error("oom", NULL); - } - - int64_t n_groups = 0; - for (uint32_t w = 0; w < nw; w++) { - int64_t* wk = keys + (size_t)w * 
cap; - uint32_t* wc = counts + (size_t)w * cap; - uint8_t* wu = used + (size_t)w * cap; - for (uint32_t s = 0; s < cap; s++) { - if (!wu[s]) continue; - int64_t k = wk[s]; - uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask; - while (mused[slot] && mkeys[slot] != k) - slot = (slot + 1u) & mask; - if (!mused[slot]) { - if (n_groups >= (int64_t)(cap / 2)) { - scratch_free(mkeys_hdr); - scratch_free(mcounts_hdr); - scratch_free(mused_hdr); - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return NULL; - } - mused[slot] = 1; - mkeys[slot] = k; - n_groups++; - } - mcounts[slot] += wc[s]; - } - } - - int64_t out_n = n_groups < take_n ? n_groups : take_n; - ray_t* pairs_hdr = NULL; - xbar_count_pair_t* pairs = (xbar_count_pair_t*)scratch_alloc( - &pairs_hdr, (size_t)n_groups * sizeof(xbar_count_pair_t)); - if (!pairs && n_groups > 0) { - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return ray_error("oom", NULL); - } - int64_t pi = 0; - for (uint32_t s = 0; s < cap; s++) { - if (!mused[s]) continue; - pairs[pi++] = (xbar_count_pair_t){ .key = mkeys[s], .count = mcounts[s] }; - } - qsort(pairs, (size_t)n_groups, sizeof(xbar_count_pair_t), - xbar_count_pair_cmp); - - ray_t* key_out = ray_vec_new(RAY_TIMESTAMP, out_n); - ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); - if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) { - if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out); - if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); - scratch_free(pairs_hdr); - scratch_free(mkeys_hdr); - scratch_free(mcounts_hdr); - scratch_free(mused_hdr); - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return ray_error("oom", NULL); - } - key_out->len = out_n; - cnt_out->len = out_n; - int64_t* ko = (int64_t*)ray_data(key_out); - int64_t* co = (int64_t*)ray_data(cnt_out); - for (int64_t i = 0; i < out_n; i++) { - ko[i] = pairs[i].key; - co[i] = 
pairs[i].count; - } - scratch_free(pairs_hdr); - scratch_free(mkeys_hdr); - scratch_free(mcounts_hdr); - scratch_free(mused_hdr); - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - - ray_t* out = ray_table_new(2); - if (!out || RAY_IS_ERR(out)) { - ray_release(key_out); - ray_release(cnt_out); - return out ? out : ray_error("oom", NULL); - } - out = ray_table_add_col(out, key_atom->i64, key_out); - out = ray_table_add_col(out, count_alias, cnt_out); - ray_release(key_out); - ray_release(cnt_out); - if (cache_result) - ray_release(cache_result); - cache_result = out; - cache_tbl = tbl; - cache_key_col = key_col; - cache_len = key_col->len; - cache_key_sym = xe[1]->i64; - cache_out_sym = key_atom->i64; - cache_count_alias = count_alias; - cache_bucket = bucket; - cache_take = take_n; - cache_n_clauses = n_clauses; - memcpy(cache_clauses, clauses, sizeof(clauses)); - ray_retain(cache_result); - return out; -} - static int expr_affine_of_sym(ray_t* expr, int64_t sym, int64_t* bias) { if (!expr) return 0; if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME) && @@ -4109,6 +2715,33 @@ static ray_t* count_distinct_per_group_buf(ray_t* inner_expr, ray_t* tbl, out->len = n_groups; int64_t* odata = (int64_t*)ray_data(out); + /* HyperLogLog approximate path — one task per group, each task with + * a private stack-resident sketch (~16 KB). Triggered when the + * total inflated row count across all groups is large enough that + * the exact per-group dedup HT becomes memory-bandwidth-bound; + * 1 M rows is the same threshold the global path in + * exec_count_distinct uses. Returns within ~0.8 % std error. */ + /* HyperLogLog approximate path — one task per group, each task with + * a private stack-resident sketch (~16 KB). 
Triggered when the + * total inflated row count across all groups is large enough that + * the exact per-group dedup HT becomes memory-bandwidth-bound; + * 1 M rows is the same threshold the global path in + * exec_count_distinct uses. Returns within ~0.8 % std error. */ + if (n_groups > 0) { + int64_t total_rows = 0; + for (int64_t g = 0; g < n_groups; g++) total_rows += grp_cnt[g]; + if (total_rows >= (1 << 20)) { + if (ray_count_distinct_approx_pg_buf(src, idx_buf, offsets, + grp_cnt, n_groups, + 14, odata) == 0) { + ray_release(src); + return out; + } + /* Fall through on type miss; out still zeroed. */ + memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); + } + } + /* Parallel path: dispatch one task per group when src has a flat * numeric / SYM layout we can read with a typed pointer. Each task * does its own dedup with a scratch hash table — no gather_by_idx @@ -4980,12 +3613,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { /* Evaluate 'from:' to get the source table */ ray_t* from_expr = dict_get(dict, "from"); if (!from_expr) return ray_error("domain", NULL); - uint64_t select_cache_hash_value = ray_expr_hash(dict); - uint64_t select_cache_from_hash = ray_expr_hash(from_expr); - ray_t* expr_cached = select_expr_cache_get(select_cache_hash_value, - select_cache_from_hash); - if (expr_cached) - return expr_cached; ray_t* where_expr = dict_get(dict, "where"); ray_group_emit_filter_t prev_emit_filter = ray_group_emit_filter_get(); ray_group_emit_filter_t emit_filter = {0}; @@ -4998,15 +3625,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_group_emit_filter_set(prev_emit_filter); if (RAY_IS_ERR(tbl)) return tbl; if (tbl->type != RAY_TABLE) { ray_release(tbl); return ray_error("type", NULL); } - int64_t select_cache_nrows = ray_table_nrows(tbl); - ray_t* select_cached = select_cache_get(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash); - if (select_cached) { - ray_release(tbl); - return select_cached; - } - ray_t* by_expr = 
dict_get(dict, "by"); ray_t* take_expr = dict_get(dict, "take"); ray_t* nearest_expr = dict_get(dict, "nearest"); @@ -5038,43 +3656,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (kid == asc_id || kid == desc_id) { has_sort = true; break; } } - ray_t* xbar_count = try_xbar_count_select(tbl, where_expr, by_expr, - take_expr, dict_elems, dict_n, - from_id, where_id, by_id, - take_id, asc_id, desc_id, - nearest_id); - if (xbar_count) { - ray_release(tbl); - return xbar_count; - } - - ray_t* i16_ne0_count = try_i16_ne0_count_desc_select( - tbl, where_expr, by_expr, take_expr, dict_elems, dict_n, - from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id); - if (i16_ne0_count) { - ray_release(tbl); - return i16_ne0_count; - } - - ray_t* i32_i64_cd = try_i32_i64_count_distinct_select( - tbl, where_expr, by_expr, take_expr, dict_elems, dict_n, - from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id); - if (i32_i64_cd) { - ray_release(tbl); - return i32_i64_cd; - } - - ray_t* i16x2_count = try_i16x2_count_desc_select(tbl, where_expr, by_expr, - take_expr, dict_elems, - dict_n, from_id, - where_id, by_id, - take_id, asc_id, - desc_id, nearest_id); - if (i16x2_count) { - ray_release(tbl); - return i16x2_count; - } - /* `nearest` is mutually exclusive with `asc`/`desc`/`by` — ANN * ordering is an index scan, not a column sort, and cannot be * composed with group-by in this phase. 
*/ @@ -6405,9 +4986,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } } @@ -6668,16 +5246,10 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); if (take_preapplied) { - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } @@ -6868,9 +5440,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { } res = apply_sort_take(res, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, res); return res; } @@ -7282,9 +5851,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_release(tbl); result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } @@ -8423,9 +6989,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_release(tbl); result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } } else if (n_out > 0) { @@ -8573,9 +7136,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_graph_free(g); ray_release(tbl); result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } else { root = ray_select_op(g, root, 
col_ops, nc); @@ -9615,8 +8175,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (by_sym_vec_owned) ray_release(by_sym_vec_owned); if (saved_selection) ray_release(saved_selection); - select_cache_put(tbl, select_cache_nrows, select_cache_hash_value, - select_cache_from_hash, result); return result; }