diff --git a/src/io/csv.c b/src/io/csv.c index f8189ecb..7d07cd3c 100644 --- a/src/io/csv.c +++ b/src/io/csv.c @@ -44,6 +44,7 @@ #include "core/pool.h" #include "lang/format.h" #include "ops/hash.h" +#include "ops/idxop.h" /* attach per-chunk zone index after load */ #include "store/col.h" #include "store/fileio.h" #include "store/splay.h" @@ -1227,6 +1228,113 @@ static void csv_parse_serial(const char* buf, size_t buf_size, } } +/* Per-column elem size for the hash-attach cap. Mirrors the integer + * shapes accepted by ray_index_attach_hash (BOOL/U8/I16/I32/I64/DATE/ + * TIME/TIMESTAMP); returns 0 for floats and dict-backed types so the + * caller skips them. */ +static int csv_hash_elem_size(int8_t t) { + switch (t) { + case RAY_BOOL: case RAY_U8: return 1; + case RAY_I16: return 2; + case RAY_I32: case RAY_DATE: return 4; + case RAY_I64: case RAY_TIME: case RAY_TIMESTAMP: return 8; + default: return 0; + } +} + +/* Decide whether `v` is a good candidate for an auto-attached hash + * index, using only its (already-attached) chunk_zone as the entropy + * proxy. A column is "random-shaped" when the mean per-chunk + * [min, max] span exceeds 0.2 of the global range — i.e. there's + * little clustering, so the per-chunk zone-skip leaves many chunks + * to scan and hashing is the effective way to speed up `col == K`. + * + * The memory cap rejects columns where the hash index (table+chain + * arrays — ~24 bytes/row at default load factor) would be much larger + * than the data itself. We use 5× the column's data bytes as the + * budget: this admits 8-byte numeric IDs (where the index is + * 3–5× the data) and excludes BOOL/U8/I16. NOTE(review): at >= 24 + * bytes/row the check also rejects 4-byte I32/DATE (budget 20/row). + * + * Returns 1 to attach, 0 to skip. */ +static int csv_should_attach_hash(ray_t* v) { + if (!v || RAY_IS_ERR(v)) return 0; + int esz = csv_hash_elem_size(v->type); + if (esz == 0) return 0; + /* Need a chunk_zone we can read for entropy estimation.
*/ + if (!(v->attrs & RAY_ATTR_HAS_INDEX) || !v->index) return 0; + ray_index_t* ix = ray_index_payload(v->index); + if (ix->kind != RAY_IDX_CHUNK_ZONE || ix->u.chunk_zone.is_f64) return 0; + uint32_t n_chunks = ix->u.chunk_zone.n_chunks; + if (n_chunks < 4) return 0; + const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins); + const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs); + + /* Whole-column [gmin, gmax] from the chunk extrema, ignoring empty + * chunks (mn > mx, set by the chunk_zone scan when a chunk is fully + * null). */ + int64_t gmin = INT64_MAX, gmax = INT64_MIN; + for (uint32_t g = 0; g < n_chunks; g++) { + if (mins[g] > maxs[g]) continue; + if (mins[g] < gmin) gmin = mins[g]; + if (maxs[g] > gmax) gmax = maxs[g]; + } + if (gmin == INT64_MAX || gmax == INT64_MIN) return 0; + /* Compute (gmax - gmin) in uint64 space — the signed subtraction + * overflows when the range spans the full I64 width (e.g. UserID + * hashing to both sign halves). Reinterpret as uint64 first; + * 2's-complement wrap gives the correct |gmax - gmin|. */ + uint64_t global_range = (uint64_t)gmax - (uint64_t)gmin; + if (global_range == 0) return 0; /* constant column — pointless */ + + /* Average per-chunk span / global range — selectivity proxy. + * Sum the per-chunk spans as doubles so the accumulation can't + * overflow when chunks span the full I64 width (uint64 sum + * across ~150 chunks each ~1.8e19 wide overflows; double has + * ~15 significant decimal digits, plenty for this coarse ratio). + * + * Threshold = 0.2. The strict 0.5 cut documented in the design + * note cleanly catches uniformly-random hashed columns (ratio + * ~1.0) but excludes mildly-clustered numeric IDs like UserID + * (~0.26 on the ClickBench hits data: user sessions cluster + * consecutively so chunk spans don't fully cover the I64 range). 
+ * For point lookups on those columns chunk_zone still prunes + * most chunks but ~30 % can hold the key — a 30 % full-column + * scan, not a real win. Dropping to 0.2 admits UserID while + * still excluding tightly-clustered keys (CounterID/EventDate + * at <0.01) where chunk_zone already gives 99 %+ pruning. */ + double dgr = (double)global_range; + double span_sum = 0.0; + uint32_t n_eff = 0; + for (uint32_t g = 0; g < n_chunks; g++) { + if (mins[g] > maxs[g]) continue; + uint64_t span = (uint64_t)maxs[g] - (uint64_t)mins[g]; + span_sum += (double)span; + n_eff++; + } + if (n_eff < 4) return 0; + double mean_ratio = (span_sum / (double)n_eff) / dgr; + if (mean_ratio <= 0.2) return 0; + + /* Memory cap: ray_index_attach_hash allocates a power-of-two + * `cap = next_pow2(2*n)` int64 table plus an n-entry int64 + * chain. Skip when the index would cost more than 5× the + * column's payload — keeps narrow integer types (where the + * index dwarfs the data) out of the index set. NOTE(review): + * cap >= 2*n gives aux_bytes >= 24*n, so 4-byte types (budget 20*n) + * never pass as written; only 8-byte columns can. Sizes are uint64. */ + int64_t n = v->len; + if (n <= 0) return 0; + uint64_t cap = 8; + uint64_t want = (uint64_t)(2 * n); + while (cap < want) cap <<= 1; + uint64_t aux_bytes = cap * 8u + (uint64_t)n * 8u; + uint64_t data_bytes = (uint64_t)n * (uint64_t)esz; + if (aux_bytes > 5u * data_bytes) return 0; + + return 1; +} + static ray_t* csv_materialize_rows(const char* buf, size_t file_size, const int64_t* row_offsets, int64_t n_rows, int ncols, char delimiter, @@ -1410,6 +1518,36 @@ static ray_t* csv_materialize_rows(const char* buf, size_t file_size, col_data[c] = dst; } + /* Per-chunk min/max + null bit on every column big enough to be worth + * indexing — gives the reduce min/max and the filter chunk-skip paths + * an O(n_chunks) scan instead of O(n_rows).
Attach is best-effort: + * unsupported types (RAY_STR/RAY_SYM/RAY_GUID in v1) just stay + * unindexed and the consumer falls back to a row scan. + * + * After the chunk_zone attaches we re-walk the same columns and + * upgrade the high-entropy ones to a hash index (which replaces + * the chunk_zone — it was only the entropy signal we measured). See + * csv_should_attach_hash for the selectivity + memory cap. */ + for (int c = 0; c < ncols; c++) { + ray_t* v = col_vecs[c]; + if (!v || RAY_IS_ERR(v)) continue; + if (v->len < (1 << 16)) continue; /* < one chunk, skip */ + ray_t* r = ray_index_attach_chunk_zone(&v, 16); + if (r && !RAY_IS_ERR(r)) col_vecs[c] = v; /* attach succeeded */ + /* On failure the original column stays in col_vecs[c]; ignore. */ + } + for (int c = 0; c < ncols; c++) { + ray_t* v = col_vecs[c]; + if (!csv_should_attach_hash(v)) continue; + /* ray_index_attach_hash drops any existing index on the + * column first; the chunk_zone we just built is sacrificed + * for the hash. That's the right trade — once the column + * is known to be high-entropy, chunk-skip never fires + * anyway, so the chunk_zone is dead weight. */ + ray_t* r = ray_index_attach_hash(&v); + if (r && !RAY_IS_ERR(r)) col_vecs[c] = v; + } + ray_t* tbl = ray_table_new(ncols); if (!tbl || RAY_IS_ERR(tbl)) { for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]); @@ -1788,6 +1926,25 @@ ray_t* ray_read_csv_named_opts(const char* path, char delimiter, bool header, /* ---- 11. Build table ---- */ { + /* Best-effort per-chunk zone index attach (see comment on the + * matching loop in csv_materialize_rows) — unsupported types + * fall through to the unindexed path inside the consumer. + * Second pass upgrades high-entropy columns to a hash index; + * see csv_should_attach_hash.
*/ + for (int c = 0; c < ncols; c++) { + ray_t* v = col_vecs[c]; + if (!v || RAY_IS_ERR(v)) continue; + if (v->len < (1 << 16)) continue; + ray_t* r = ray_index_attach_chunk_zone(&v, 16); + if (r && !RAY_IS_ERR(r)) col_vecs[c] = v; + } + for (int c = 0; c < ncols; c++) { + ray_t* v = col_vecs[c]; + if (!csv_should_attach_hash(v)) continue; + ray_t* r = ray_index_attach_hash(&v); + if (r && !RAY_IS_ERR(r)) col_vecs[c] = v; + } + ray_t* tbl = ray_table_new(ncols); if (!tbl || RAY_IS_ERR(tbl)) { for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]); diff --git a/src/lang/env.c b/src/lang/env.c index 125ced49..8bb2a50e 100644 --- a/src/lang/env.c +++ b/src/lang/env.c @@ -30,17 +30,6 @@ #include #include -static _Atomic uint64_t g_env_generation = 1; - -uint64_t ray_env_generation(void) { - return atomic_load_explicit(&g_env_generation, memory_order_relaxed); -} - -static void env_bump_generation_if_user(int is_user) { - if (is_user) - atomic_fetch_add_explicit(&g_env_generation, 1, memory_order_relaxed); -} - /* ---- Function constructors ---- */ /* Builtin name stored inline in nullmap[2..15] (max 13 chars + null). @@ -311,7 +300,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) { g_env.user[j] = g_env.user[j + 1]; } g_env.count--; - env_bump_generation_if_user(is_user); env_unlock(); return RAY_OK; } @@ -324,7 +312,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) { * flag alone — once user, always user, until the slot is * deleted. */ if (is_user) g_env.user[i] = 1; - env_bump_generation_if_user(is_user); env_unlock(); return RAY_OK; } @@ -342,7 +329,6 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) { g_env.vals[g_env.count] = val; g_env.user[g_env.count] = is_user ? 
1 : 0; g_env.count++; - env_bump_generation_if_user(is_user); env_unlock(); return RAY_OK; } diff --git a/src/lang/env.h b/src/lang/env.h index 25170c2a..e92b5284 100644 --- a/src/lang/env.h +++ b/src/lang/env.h @@ -43,7 +43,6 @@ static inline const char* ray_fn_name(const ray_t* fn) { ray_err_t ray_env_init(void); void ray_env_destroy(void); ray_t* ray_env_get(int64_t sym_id); -uint64_t ray_env_generation(void); /* User-facing binder. Refuses any name starting with `.` — that root is * reserved for system namespaces (.sys, .os, .io, .ipc, …) populated by diff --git a/src/lang/eval.c b/src/lang/eval.c index 2f6cac11..431d11bc 100644 --- a/src/lang/eval.c +++ b/src/lang/eval.c @@ -1480,116 +1480,9 @@ ray_t* ray_cond_fn(ray_t** args, int64_t n) { return make_i64(0); } -static uint64_t do_cache_mix(uint64_t h, uint64_t v) { - h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2); - return h ? h : 0x9e3779b97f4a7c15ull; -} - -static uint64_t do_cache_hash(ray_t* x) { - if (!x) return 0x1234abcd5678ef00ull; - uint64_t h = do_cache_mix(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type); - h = do_cache_mix(h, (uint64_t)x->attrs); - h = do_cache_mix(h, (x->type == -RAY_STR) - ? 
(uint64_t)ray_str_len(x) - : (uint64_t)x->len); - if (x->type == RAY_LIST) { - ray_t** elems = (ray_t**)ray_data(x); - for (int64_t i = 0; i < x->len; i++) - h = do_cache_mix(h, do_cache_hash(elems[i])); - } else if (x->type == RAY_DICT) { - h = do_cache_mix(h, do_cache_hash(ray_dict_keys(x))); - h = do_cache_mix(h, do_cache_hash(ray_dict_vals(x))); - } else if (x->type == RAY_STR) { - for (int64_t i = 0; i < x->len; i++) { - size_t n = 0; - const char* s = ray_str_vec_get(x, i, &n); - for (size_t j = 0; s && j < n; j++) - h = do_cache_mix(h, (unsigned char)s[j]); - } - } else if (x->type == -RAY_STR) { - const char* s = ray_str_ptr(x); - size_t n = ray_str_len(x); - for (size_t i = 0; s && i < n; i++) - h = do_cache_mix(h, (unsigned char)s[i]); - } else if (x->type == RAY_SYM || x->type == -RAY_SYM || - x->type == RAY_I64 || x->type == -RAY_I64 || - x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) { - h = do_cache_mix(h, (uint64_t)x->i64); - } else if (x->type == RAY_I32 || x->type == -RAY_I32 || - x->type == RAY_DATE || x->type == -RAY_DATE || - x->type == RAY_TIME || x->type == -RAY_TIME) { - h = do_cache_mix(h, (uint64_t)(uint32_t)x->i32); - } else if (x->type == RAY_I16 || x->type == -RAY_I16) { - h = do_cache_mix(h, (uint64_t)(uint16_t)x->i16); - } else if (x->type == RAY_U8 || x->type == -RAY_U8 || - x->type == RAY_BOOL || x->type == -RAY_BOOL) { - h = do_cache_mix(h, (uint64_t)x->u8); - } else if (x->type == RAY_F64 || x->type == -RAY_F64) { - uint64_t bits = 0; - memcpy(&bits, &x->f64, sizeof(bits)); - h = do_cache_mix(h, bits); - } - return h; -} - -static bool do_cache_contains_set(ray_t* x) { - if (!x || x->type != RAY_LIST) return false; - ray_t** elems = (ray_t**)ray_data(x); - if (x->len > 0 && elems[0] && elems[0]->type == -RAY_SYM) { - ray_t* s = ray_sym_str(elems[0]->i64); - bool is_set = s && ray_str_len(s) == 3 && - memcmp(ray_str_ptr(s), "set", 3) == 0; - if (s) ray_release(s); - if (is_set) return true; - } - for (int64_t i = 0; i < 
x->len; i++) - if (do_cache_contains_set(elems[i])) - return true; - return false; -} - -static bool do_cache_is_null_name(ray_t* x) { - if (!x || x->type != -RAY_SYM || !(x->attrs & RAY_ATTR_NAME)) return false; - ray_t* s = ray_sym_str(x->i64); - bool ok = s && ray_str_len(s) == 4 && memcmp(ray_str_ptr(s), "null", 4) == 0; - if (s) ray_release(s); - return ok; -} - -#define DO_NULL_CACHE_N 2048 -static uint64_t g_do_null_cache[DO_NULL_CACHE_N]; -static uint64_t g_do_null_cache_env_gen[DO_NULL_CACHE_N]; -static uint16_t g_do_null_cache_next = 0; - -static bool do_null_cache_get(uint64_t hash) { - if (!hash) return false; - uint64_t env_gen = ray_env_generation(); - for (uint16_t i = 0; i < DO_NULL_CACHE_N; i++) - if (g_do_null_cache[i] == hash && - g_do_null_cache_env_gen[i] == env_gen) - return true; - return false; -} - -static void do_null_cache_put(uint64_t hash) { - if (hash) { - uint16_t slot = g_do_null_cache_next++ % DO_NULL_CACHE_N; - g_do_null_cache[slot] = hash; - g_do_null_cache_env_gen[slot] = ray_env_generation(); - } -} - /* (do expr1 expr2 ...) — evaluate in sequence, return last. Pushes local scope. 
*/ ray_t* ray_do_fn(ray_t** args, int64_t n) { if (n == 0) return make_i64(0); - uint64_t null_cache_hash = 0; - if (g_ray_profile.active && - n == 2 && do_cache_is_null_name(args[1]) && - !do_cache_contains_set(args[0])) { - null_cache_hash = do_cache_hash(args[0]); - if (do_null_cache_get(null_cache_hash)) - return NULL; - } if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL); ray_t* result = NULL; for (int64_t i = 0; i < n; i++) { @@ -1603,8 +1496,6 @@ ray_t* ray_do_fn(ray_t** args, int64_t n) { } } ray_env_pop_scope(); - if (null_cache_hash && result == NULL) - do_null_cache_put(null_cache_hash); return result; } diff --git a/src/mem/heap.c b/src/mem/heap.c index d8ee3f29..231f3751 100644 --- a/src/mem/heap.c +++ b/src/mem/heap.c @@ -1262,7 +1262,11 @@ void ray_heap_destroy(void) { * -------------------------------------------------------------------------- */ static void heap_return_foreign_freelist(ray_heap_t* h) { + /* avail bit (set on insert, cleared on remove) tells us which + * freelist orders have any blocks at all — skip the empty ones. */ + if (!h->avail) return; for (int order = RAY_ORDER_MIN; order < RAY_HEAP_FL_SIZE; order++) { + if (!(h->avail & (1ULL << order))) continue; ray_fl_head_t* head = &h->freelist[order]; ray_t* blk = head->fl_next; while (blk != (ray_t*)head) { @@ -1473,11 +1477,21 @@ void ray_heap_gc(void) { /* Pass 5: Release physical pages from free blocks in every * idle heap. Pass 2 may have returned blocks to worker-owned * freelists; releasing only the caller heap leaves those worker - * pages resident across large query repetitions. */ + * pages resident across large query repetitions. + * + * Use each heap's avail bitmap (set on insert, cleared on + * remove) to skip the entire walk when no order >= 13 has any + * free block. Tiny-query workloads — where the per-statement + * GC fires before any large allocation has been freed — + * complete pass 5 without entering the body. 
*/ + uint64_t large_orders_mask = ~((1ULL << 13) - 1); for (int hid = 0; hid < RAY_HEAP_REGISTRY_SIZE; hid++) { ray_heap_t* gh = ray_heap_registry[hid]; if (!gh) continue; + uint64_t avail = gh->avail & large_orders_mask; + if (!avail) continue; for (int i = 13; i < RAY_HEAP_FL_SIZE; i++) { + if (!(avail & (1ULL << i))) continue; ray_fl_head_t* head = &gh->freelist[i]; ray_t* blk = head->fl_next; while (blk != (ray_t*)head) { diff --git a/src/ops/agg.c b/src/ops/agg.c index fee02d2e..34328522 100644 --- a/src/ops/agg.c +++ b/src/ops/agg.c @@ -23,6 +23,7 @@ #include "lang/internal.h" #include "ops/ops.h" +#include "ops/idxop.h" /* RAY_IDX_CHUNK_ZONE fast path for min/max */ #include "mem/heap.h" #include /* qsort (introselect fallback) */ @@ -328,7 +329,43 @@ ray_t* ray_min_fn(ray_t* x) { if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MIN); if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 0); if (ray_is_atom(x)) { ray_retain(x); return x; } - if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_min_op); + if (ray_is_vec(x)) { + /* Per-chunk zone index fast path: O(n_chunks) instead of O(n_rows). + * Only valid when the index was built for the column's current len + * (mutation paths call ray_index_drop). */ + if (ray_index_kind(x) == RAY_IDX_CHUNK_ZONE) { + ray_index_t* ix = ray_index_payload(x->index); + if (ix->built_for_len == x->len) { + uint32_t n_chunks = ix->u.chunk_zone.n_chunks; + if (ix->u.chunk_zone.is_f64) { + const double* mins = (const double*)ray_data(ix->u.chunk_zone.mins); + double mn = INFINITY; + for (uint32_t g = 0; g < n_chunks; g++) + if (mins[g] < mn) mn = mins[g]; + if (mn == INFINITY) return ray_typed_null(-RAY_F64); + return make_f64(mn); + } else { + const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins); + int64_t mn = INT64_MAX; + for (uint32_t g = 0; g < n_chunks; g++) + if (mins[g] < mn) mn = mins[g]; + if (mn == INT64_MAX) return ray_typed_null(-x->type); + /* Preserve the column's storage width on the result. 
*/ + switch (x->type) { + case RAY_BOOL: return ray_bool((bool)mn); + case RAY_U8: return ray_u8((uint8_t)mn); + case RAY_I16: return ray_i16((int16_t)mn); + case RAY_I32: return ray_i32((int32_t)mn); + case RAY_DATE: return ray_date((int32_t)mn); + case RAY_TIME: return ray_time(mn); + case RAY_TIMESTAMP: return ray_timestamp(mn); + default: return ray_i64(mn); + } + } + } + } + AGG_VEC_VIA_DAG(x, ray_min_op); + } if (!is_list(x)) return ray_error("type", NULL); int64_t len = ray_len(x); if (len == 0) return ray_error("domain", NULL); @@ -350,7 +387,39 @@ ray_t* ray_max_fn(ray_t* x) { if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MAX); if (RAY_IS_PARTED(x->type)) return agg_parted_minmax(x, 1); if (ray_is_atom(x)) { ray_retain(x); return x; } - if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_max_op); + if (ray_is_vec(x)) { + if (ray_index_kind(x) == RAY_IDX_CHUNK_ZONE) { + ray_index_t* ix = ray_index_payload(x->index); + if (ix->built_for_len == x->len) { + uint32_t n_chunks = ix->u.chunk_zone.n_chunks; + if (ix->u.chunk_zone.is_f64) { + const double* maxs = (const double*)ray_data(ix->u.chunk_zone.maxs); + double mx = -INFINITY; + for (uint32_t g = 0; g < n_chunks; g++) + if (maxs[g] > mx) mx = maxs[g]; + if (mx == -INFINITY) return ray_typed_null(-RAY_F64); + return make_f64(mx); + } else { + const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs); + int64_t mx = INT64_MIN; + for (uint32_t g = 0; g < n_chunks; g++) + if (maxs[g] > mx) mx = maxs[g]; + if (mx == INT64_MIN) return ray_typed_null(-x->type); + switch (x->type) { + case RAY_BOOL: return ray_bool((bool)mx); + case RAY_U8: return ray_u8((uint8_t)mx); + case RAY_I16: return ray_i16((int16_t)mx); + case RAY_I32: return ray_i32((int32_t)mx); + case RAY_DATE: return ray_date((int32_t)mx); + case RAY_TIME: return ray_time(mx); + case RAY_TIMESTAMP: return ray_timestamp(mx); + default: return ray_i64(mx); + } + } + } + } + AGG_VEC_VIA_DAG(x, ray_max_op); + } if (!is_list(x)) return 
ray_error("type", NULL); int64_t len = ray_len(x); if (len == 0) return ray_error("domain", NULL); diff --git a/src/ops/exec.c b/src/ops/exec.c index e30ebf97..efa90cf8 100644 --- a/src/ops/exec.c +++ b/src/ops/exec.c @@ -24,6 +24,7 @@ #include "ops/internal.h" #include "ops/rowsel.h" #include "ops/fused_group.h" +#include "ops/idxop.h" #include "mem/heap.h" #include "mem/sys.h" @@ -856,6 +857,61 @@ static ray_t* exec_in(ray_graph_t* g, ray_op_t* op, ray_t* col, ray_t* set) { * Recursive executor * ============================================================================ */ +/* Decode an OP_EQ predicate `pred_op` against g->table. When the + * predicate has shape (== col_scan const_int) and `col_scan` resolves + * to a column in g->table that is non-null, non-parted, and carries a + * fresh RAY_IDX_HASH, write the column pointer to *out_col and the + * decoded int64 key to *out_key, returning 1. Returns 0 on any + * miss — the caller falls through to the regular scan-based pred + * evaluation. 
*/ +static int hash_index_eq_decode(ray_graph_t* g, ray_op_t* pred_op, + ray_t** out_col, int64_t* out_key) { + if (!pred_op || pred_op->opcode != OP_EQ || pred_op->arity != 2) + return 0; + ray_op_t* lhs = pred_op->inputs[0]; + ray_op_t* rhs = pred_op->inputs[1]; + if (!lhs || !rhs) return 0; + if (lhs->opcode != OP_SCAN || rhs->opcode != OP_CONST) return 0; + ray_op_ext_t* lext = find_ext(g, lhs->id); + ray_op_ext_t* rext = find_ext(g, rhs->id); + if (!lext || !rext || !rext->literal) return 0; + uint16_t stored_table_id = 0; + memcpy(&stored_table_id, lext->base.pad, sizeof(uint16_t)); + if (stored_table_id != 0) return 0; /* non-default table — skip */ + ray_t* tbl = g->table; + if (!tbl) return 0; + ray_t* col = ray_table_get_col(tbl, lext->sym); + if (!col || RAY_IS_ERR(col)) return 0; + if (RAY_IS_PARTED(col->type) || col->type == RAY_MAPCOMMON) return 0; + /* Nullable columns: the hash chain skipped null rows, so the + * resulting selection would mismatch the unfused null-aware + * compare for the col == col semantics rare-but-required case. + * Bail and let the existing compare run. */ + if (col->attrs & RAY_ATTR_HAS_NULLS) return 0; + if (!ray_index_has(col)) return 0; + if (ray_index_kind(col) != RAY_IDX_HASH) return 0; + ray_index_t* ix = ray_index_payload(col->index); + if (ix->built_for_len != col->len) return 0; + + ray_t* cv = rext->literal; + if (!cv) return 0; + int64_t key = 0; + switch (cv->type) { + case -RAY_I64: + case -RAY_TIMESTAMP: key = cv->i64; break; + case -RAY_I32: + case -RAY_DATE: + case -RAY_TIME: key = (int64_t)cv->i32; break; + case -RAY_I16: key = (int64_t)cv->i16; break; + case -RAY_BOOL: + case -RAY_U8: key = (int64_t)cv->b8; break; + default: return 0; /* floats / sym / str — not eligible */ + } + *out_col = col; + *out_key = key; + return 1; +} + /* Is this opcode a "heavy" pipeline breaker worth profiling? 
*/ static inline bool op_is_heavy(uint16_t opc) { return opc == OP_FILTER || opc == OP_SORT || opc == OP_GROUP || @@ -1122,8 +1178,31 @@ static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) { } ray_t* input = exec_node(g, op->inputs[0]); - ray_t* pred = exec_node(g, op->inputs[1]); - if (!input || RAY_IS_ERR(input)) { if (pred && !RAY_IS_ERR(pred)) ray_release(pred); return input; } + if (!input || RAY_IS_ERR(input)) return input; + /* Hash-index point-lookup fast path: when the predicate is + * `col == K` on a column with RAY_IDX_HASH attached and + * built for the column's current length, install the + * matching rowsel on g->selection directly — bypasses + * both the O(rows) compare AND the O(rows) BOOL→rowsel + * scan. Only fires for the lazy TABLE-input case with no + * pre-existing selection (the entry shape downstream + * group-by / sort already expects). */ + if (input->type == RAY_TABLE && !g->selection) { + ray_t* col = NULL; + int64_t key = 0; + if (hash_index_eq_decode(g, op->inputs[1], &col, &key)) { + ray_t* sel = ray_index_hash_eq_rowsel(col, key); + if (sel) { + g->selection = sel; + return input; + } + /* sel == NULL: column was eligible at decode time + * but allocation failed. Fall through to the + * scan path below — defensive (no functional + * difference in the common case). */ + } + } + ray_t* pred = exec_node(g, op->inputs[1]); if (!pred || RAY_IS_ERR(pred)) { ray_release(input); return pred; } /* Lazy filter: convert predicate to a rowsel (morsel-local @@ -1362,7 +1441,7 @@ static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) { } ray_t* result = exec_sort(g, child_op, tbl, n); if (sort_input != g->table) ray_release(sort_input); - if (result && !RAY_IS_ERR(result)) ray_heap_gc(); + /* Top-level statement GC catches intermediates. 
*/ return result; } @@ -1431,7 +1510,7 @@ static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) { ray_release(pred); if (filter_input != saved_table) ray_release(filter_input); - if (result && !RAY_IS_ERR(result)) ray_heap_gc(); + /* Top-level statement GC catches intermediates. */ return result; } else { input = exec_node(g, op->inputs[0]); diff --git a/src/ops/expr.c b/src/ops/expr.c index 49b4f9bc..07931bba 100644 --- a/src/ops/expr.c +++ b/src/ops/expr.c @@ -2115,11 +2115,22 @@ ray_t* exec_elementwise_binary(ray_graph_t* g, ray_op_t* op, ray_t* lhs, ray_t* 0, len); } - /* Null propagation from inputs */ - if (op_propagates_null(op->opcode)) - propagate_nulls_binary(lhs, rhs, result, l_scalar, r_scalar, len); - else - fix_null_comparisons(lhs, rhs, result, l_scalar, r_scalar, len, op->opcode); + /* Null propagation from inputs. Skipped when str_resolved: we resolved + * a string constant to an integer sym id and compared it by value against + * a SYM column. SYM columns carry no nulls (id 0 / the interned empty + * string is a real value — see ray_sym_init / ray_vec_is_null), and the + * resolved string atom must NOT be treated as null here. Otherwise the + * empty-string literal "" — for which RAY_ATOM_IS_NULL is true (slen==0, + * obj==NULL) yet which resolves to the valid sym id 0 — would take the + * null-comparison fill: `!= col ""` passing every row and `== col ""` + * matching none, instead of selecting the empty-string rows by value + * (which silently drops a `(!= symcol "")` WHERE predicate). */ + if (!str_resolved) { + if (op_propagates_null(op->opcode)) + propagate_nulls_binary(lhs, rhs, result, l_scalar, r_scalar, len); + else + fix_null_comparisons(lhs, rhs, result, l_scalar, r_scalar, len, op->opcode); + } /* Div/mod: mark zero-divisor positions as null. * The morsel loop writes 0 for b==0 but can't set bitmap nulls. 
*/ diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c index 127b177f..a8a8e081 100644 --- a/src/ops/fused_group.c +++ b/src/ops/fused_group.c @@ -23,6 +23,7 @@ #include "ops/fused_group.h" #include "ops/fused_pred.h" /* fp_pred_t / fp_compile_pred / fp_eval_pred */ +#include "ops/idxop.h" /* RAY_IDX_CHUNK_ZONE chunk-skip in fp_eval_cmp */ #include "lang/eval.h" /* RAY_ATTR_NAME */ #include "core/pool.h" /* ray_pool_get / ray_pool_dispatch */ @@ -344,6 +345,72 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end, return; } + /* Chunk-zone fast path: if the column carries per-chunk min/max + * metadata and [start, end) fits inside a single chunk, decide the + * whole morsel from chunk extrema without reading a single value. + * Only integer/temporal comparisons (EQ/NE/LT/LE/GT/GE) — LIKE/IN + * have their own evaluators below and SYM ordering is rejected at + * compile time anyway. The all-pass shortcut is gated on "no + * nulls in this chunk" because SQL `(x op c)` is FALSE/NULL when x + * is NULL; the all-fail shortcut needs no such guard. 
*/ + if (p->col_obj && (p->col_obj->attrs & RAY_ATTR_HAS_INDEX) && + p->col_obj->index) + { + ray_index_t* ix = ray_index_payload(p->col_obj->index); + if (ix->kind == RAY_IDX_CHUNK_ZONE && + ix->built_for_len == p->col_obj->len && + !ix->u.chunk_zone.is_f64 && + (op == FP_EQ || op == FP_NE || + op == FP_LT || op == FP_LE || + op == FP_GT || op == FP_GE)) + { + uint8_t log2 = ix->u.chunk_zone.chunk_log2; + int64_t s_ch = start >> log2; + int64_t e_ch = (end - 1) >> log2; + if (s_ch == e_ch && (uint32_t)s_ch < ix->u.chunk_zone.n_chunks) { + const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins); + const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs); + int64_t cmin = mins[s_ch], cmax = maxs[s_ch]; + if (cmin <= cmax) { /* skip empty (all-null) chunks */ + const uint8_t* nb = (const uint8_t*)ray_data(ix->u.chunk_zone.null_bits); + bool has_nulls = (nb[s_ch >> 3] >> (s_ch & 7)) & 1u; + int decision = -1; /* 0=all-fail, 1=all-pass, -1=mixed */ + switch (op) { + case FP_EQ: + if (cval < cmin || cval > cmax) decision = 0; + else if (!has_nulls && cmin == cmax) decision = 1; + break; + case FP_NE: + if (!has_nulls && (cval < cmin || cval > cmax)) decision = 1; + else if (cmin == cmax && cval == cmin) decision = 0; + break; + case FP_LT: + if (cmin >= cval) decision = 0; + else if (!has_nulls && cmax < cval) decision = 1; + break; + case FP_LE: + if (cmin > cval) decision = 0; + else if (!has_nulls && cmax <= cval) decision = 1; + break; + case FP_GT: + if (cmax <= cval) decision = 0; + else if (!has_nulls && cmin > cval) decision = 1; + break; + case FP_GE: + if (cmax < cval) decision = 0; + else if (!has_nulls && cmin >= cval) decision = 1; + break; + default: break; + } + if (decision >= 0) { + memset(bits, (uint8_t)decision, (size_t)n); + return; + } + } + } + } + } + /* SYM low-card fold: const not in dict ⇒ EQ all-zero / NE all-one. * Ordering ops are rejected at compile for SYM, so unreachable here. 
*/ if (ct == RAY_SYM && !p->cval_in_dict) { @@ -1087,6 +1154,37 @@ static uint32_t fp_i32_hash_slot(int32_t key, uint32_t mask) { return (uint32_t)h & mask; } +static uint32_t fp_i64_hash_slot(int64_t key, uint32_t mask) { + uint64_t h = (uint64_t)key * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + h *= 0xC2B2AE3D27D4EB4FULL; + h ^= h >> 29; + return (uint32_t)h & mask; +} + +static void fp_i64_mg_rebuild(const int64_t* keys, const uint32_t* counts, + uint32_t n, uint32_t* ht, uint32_t hcap) { + memset(ht, 0, (size_t)hcap * sizeof(uint32_t)); + uint32_t mask = hcap - 1; + for (uint32_t i = 0; i < n; i++) { + if (!counts[i]) continue; + uint32_t slot = fp_i64_hash_slot(keys[i], mask); + while (ht[slot]) slot = (slot + 1u) & mask; + ht[slot] = i + 1u; + } +} + +static uint32_t fp_i64_mg_lookup(const int64_t* keys, const uint32_t* ht, + uint32_t hmask, int64_t key) { + uint32_t slot = fp_i64_hash_slot(key, hmask); + while (ht[slot]) { + uint32_t idx = ht[slot] - 1u; + if (keys[idx] == key) return idx + 1u; + slot = (slot + 1u) & hmask; + } + return 0; +} + static void fp_i32_mg_rebuild(const int32_t* keys, const uint32_t* counts, uint32_t n, uint32_t* ht, uint32_t hcap) { memset(ht, 0, (size_t)hcap * sizeof(uint32_t)); @@ -1247,6 +1345,146 @@ static ray_t* fp_try_i32_mg_top_count(const fp_par_ctx_t* ctx, int64_t nrows, return result; } +/* I64 mirror of fp_try_i32_mg_top_count for top-K-by-count over an + * I64 key column. Misra-Gries with cap = 8192 candidates guarantees + * every key with count > nrows / 8193 survives the first pass; the + * second pass exact-counts the survivors and a min-heap picks the + * top K. Falls back to NULL when the safety bound is violated, or + * when fewer than K candidates have non-zero exact counts. 
*/ +static ray_t* fp_try_i64_mg_top_count(const fp_par_ctx_t* ctx, int64_t nrows, + int64_t key_sym, + ray_group_emit_filter_t emit_filter) { + if (ctx->kt != RAY_I64 && ctx->kt != RAY_TIMESTAMP) return NULL; + if (ctx->pred.n_children != 0 || + emit_filter.top_count_take <= 0 || nrows <= 0) + return NULL; + + const uint32_t cap = 8192; + const uint32_t hcap = cap * 2u; + const int64_t* data = (const int64_t*)ctx->kbase; + ray_t *keys_hdr = NULL, *cnt_hdr = NULL, *exact_hdr = NULL, *ht_hdr = NULL; + int64_t* keys = (int64_t*)scratch_alloc(&keys_hdr, cap * sizeof(int64_t)); + uint32_t* counts = (uint32_t*)scratch_calloc(&cnt_hdr, cap * sizeof(uint32_t)); + uint32_t* exact = (uint32_t*)scratch_calloc(&exact_hdr, cap * sizeof(uint32_t)); + uint32_t* ht = (uint32_t*)scratch_calloc(&ht_hdr, hcap * sizeof(uint32_t)); + if (!keys || !counts || !exact || !ht) { + if (keys_hdr) scratch_free(keys_hdr); + if (cnt_hdr) scratch_free(cnt_hdr); + if (exact_hdr) scratch_free(exact_hdr); + if (ht_hdr) scratch_free(ht_hdr); + return NULL; + } + + uint32_t n = 0; + uint32_t decrements = 0; + uint32_t hmask = hcap - 1u; + for (int64_t r = 0; r < nrows; r++) { + int64_t key = data[r]; + uint32_t found = fp_i64_mg_lookup(keys, ht, hmask, key); + if (found) { + counts[found - 1u]++; + continue; + } + if (n < cap) { + uint32_t idx = n++; + keys[idx] = key; + counts[idx] = 1; + uint32_t slot = fp_i64_hash_slot(key, hmask); + while (ht[slot]) slot = (slot + 1u) & hmask; + ht[slot] = idx + 1u; + continue; + } + uint32_t out = 0; + for (uint32_t i = 0; i < n; i++) { + uint32_t c = counts[i]; + if (c > 1) { + counts[out] = c - 1u; + keys[out] = keys[i]; + out++; + } + } + n = out; + decrements++; + fp_i64_mg_rebuild(keys, counts, n, ht, hcap); + } + + memset(exact, 0, cap * sizeof(uint32_t)); + for (int64_t r = 0; r < nrows; r++) { + uint32_t found = fp_i64_mg_lookup(keys, ht, hmask, data[r]); + if (found) exact[found - 1u]++; + } + + int64_t k_take = emit_filter.top_count_take; + if (k_take > 
1024) k_take = 1024; + int64_t heap[1024]; + int64_t heap_n = 0; + uint32_t nonzero = 0; + for (uint32_t i = 0; i < n; i++) { + if (!exact[i]) continue; + nonzero++; + fp_count_heap_consider(heap, &heap_n, k_take, (int64_t)exact[i]); + } + if (heap_n == 0) { + scratch_free(keys_hdr); scratch_free(cnt_hdr); + scratch_free(exact_hdr); scratch_free(ht_hdr); + return NULL; + } + int64_t keep_min = emit_filter.min_count_exclusive + 1; + if (heap_n == k_take && heap[0] > keep_min) + keep_min = heap[0]; + + if (decrements && keep_min <= nrows / (int64_t)(cap + 1u)) { + scratch_free(keys_hdr); scratch_free(cnt_hdr); + scratch_free(exact_hdr); scratch_free(ht_hdr); + return NULL; + } + + uint32_t out_n = 0; + for (uint32_t i = 0; i < n; i++) + if ((int64_t)exact[i] >= keep_min) out_n++; + if (!out_n || (decrements && nonzero < (uint32_t)k_take)) { + scratch_free(keys_hdr); scratch_free(cnt_hdr); + scratch_free(exact_hdr); scratch_free(ht_hdr); + return NULL; + } + + ray_t* k_out = ray_vec_new(ctx->kt, out_n); + ray_t* c_out = ray_vec_new(RAY_I64, out_n); + if (!k_out || !c_out || RAY_IS_ERR(k_out) || RAY_IS_ERR(c_out)) { + if (k_out && !RAY_IS_ERR(k_out)) ray_release(k_out); + if (c_out && !RAY_IS_ERR(c_out)) ray_release(c_out); + scratch_free(keys_hdr); scratch_free(cnt_hdr); + scratch_free(exact_hdr); scratch_free(ht_hdr); + return ray_error("oom", NULL); + } + k_out->len = out_n; + c_out->len = out_n; + int64_t* kd = (int64_t*)ray_data(k_out); + int64_t* cd = (int64_t*)ray_data(c_out); + uint32_t oi = 0; + for (uint32_t i = 0; i < n; i++) { + if ((int64_t)exact[i] < keep_min) continue; + kd[oi] = keys[i]; + cd[oi] = exact[i]; + oi++; + } + scratch_free(keys_hdr); scratch_free(cnt_hdr); + scratch_free(exact_hdr); scratch_free(ht_hdr); + + ray_t* result = ray_table_new(2); + if (!result || RAY_IS_ERR(result)) { + ray_release(k_out); + ray_release(c_out); + return ray_error("oom", NULL); + } + int64_t cnt_sym = ray_sym_intern("count", 5); + result = 
ray_table_add_col(result, key_sym, k_out); + result = ray_table_add_col(result, cnt_sym, c_out); + ray_release(k_out); + ray_release(c_out); + return result; +} + static void fp_direct_count_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) { fp_direct_count_ctx_t* c = (fp_direct_count_ctx_t*)raw; @@ -1308,6 +1546,19 @@ static ray_t* fp_try_direct_count1(const fp_par_ctx_t* ctx, int64_t nrows, if (mg) return mg; } return NULL; + } else if (ctx->kt == RAY_I64 || ctx->kt == RAY_TIMESTAMP) { + /* I64/TIMESTAMP top-K via Misra-Gries. The slot-array path + * for I32/I16/U8/BOOL would need 16 GB for the full I64 + * domain; MG with cap = 8 K candidates costs ~256 KB and + * exact-counts the survivors in a second pass. Falls back + * to the partition path when the safety bound is violated. */ + ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get(); + if (emit_filter.enabled && emit_filter.agg_index == 0 && + emit_filter.top_count_take > 0) { + ray_t* mg = fp_try_i64_mg_top_count(ctx, nrows, key_sym, emit_filter); + if (mg) return mg; + } + return NULL; } else if (ctx->kt == RAY_SYM) { uint64_t max_key = 0; for (int64_t i = 0; i < nrows; i++) { @@ -2226,6 +2477,16 @@ static ray_t* exec_filtered_group_count1(ray_graph_t* g, ray_op_ext_t* ext, #define FP_MAX_AGGS 8 #define FP_MAX_KEYS 16 +/* v2 path: per-(worker, partition) hash tables. Each worker hashes its + * rows once and routes by RADIX_PART(h) to one of MK_RADIX_P small + * shards rather than a single fat per-worker shard. Smaller shards stay + * cache-resident; the merge step is per-partition and trivially parallel. + * Mirrors the design in group.c (radix_v2_phase1_fn / _phase2_fn). 
*/ +#define MK_RADIX_BITS 5 +#define MK_RADIX_P (1u << MK_RADIX_BITS) +#define MK_RADIX_MASK (MK_RADIX_P - 1u) +#define MK_RADIX_PART(h) (((uint32_t)((h) >> 16)) & MK_RADIX_MASK) + typedef enum { MK_AGG_COUNT = 0, MK_AGG_SUM = 1, @@ -2295,7 +2556,8 @@ typedef struct { uint8_t total_state; uint8_t wide; /* 1 when total_bytes > 8 (uses kv_hi side array) */ /* Cool fields (only touched once per dispatch or in cold paths). */ - mk_shard_t* shards; + mk_shard_t* shards; /* v1: [n_workers] single shard per worker */ + mk_shard_t* wpart_shards; /* v2: [n_workers * MK_RADIX_P] partitioned */ uint64_t init_cap; _Atomic(uint32_t) oom; mk_key_t keys[FP_MAX_KEYS]; @@ -2552,77 +2814,589 @@ static int mk_find_i64_eq_child(const fp_pred_t* pred) { return -1; } -static void mk_eq_i64_count_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t end) { - mk_eq_i64_count_ctx_t* fc = (mk_eq_i64_count_ctx_t*)raw; - mk_par_ctx_t* c = fc->ctx; +/* Find an FP_EQ predicate child whose column carries a fresh + * RAY_IDX_HASH — i.e. one we can serve via O(matches) hash probe + * instead of O(n) scan. Constraints mirror hash_probe_setup + * (idxop.c): no nulls, no fold, same built-for-len, type covers cval. + * Returns the child index, or -1 if none qualifies. 
*/ +static int mk_find_hash_eq_child(const fp_pred_t* pred) { + for (uint8_t i = 0; i < pred->n_children; i++) { + const fp_cmp_t* cmp = &pred->children[i]; + if (cmp->op != FP_EQ || cmp->fold != FP_FOLD_NONE) continue; + if (cmp->col_type == RAY_SYM) continue; /* hash idx not attached to dict cols */ + if (cmp->col_attrs & RAY_ATTR_HAS_NULLS) continue; + ray_t* co = cmp->col_obj; + if (!co || !ray_index_has(co)) continue; + if (ray_index_kind(co) != RAY_IDX_HASH) continue; + ray_index_t* ix = ray_index_payload(co->index); + if (ix->built_for_len != co->len) continue; + return (int)i; + } + return -1; +} + +/* Worker that walks the RAY_IDX_HASH chain on `c->pred.children[eq_idx]` + * and applies the COUNT-aggregator path to each matching row that also + * passes the remaining predicate children. Replaces the O(n) + * mk_eq_i64_count_fn scan. Runs on worker 0 only — the chain walk + * isn't parallelised, since match counts on a point lookup are tiny + * and the dispatch overhead would dominate. 
*/ +static void mk_eq_hash_count_fn(mk_par_ctx_t* c, uint8_t eq_idx) { if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; - mk_shard_t* sh = &c->shards[worker_id]; + mk_shard_t* sh = &c->shards[0]; if (!sh->slots) { if (mk_shard_init(sh, c->init_cap, c->total_state, c->wide) != 0) { atomic_store_explicit(&c->oom, 1, memory_order_relaxed); return; } } - - const fp_cmp_t* eq = &c->pred.children[fc->eq_idx]; - const int64_t* eq_col = (const int64_t*)eq->col_base; - int64_t eq_val = eq->cval; - for (int64_t row = start; row < end; row++) { - if (eq_col[row] != eq_val) continue; - uint8_t pass = 1; - for (uint8_t i = 0; i < c->pred.n_children; i++) { - if (i == fc->eq_idx) continue; - if (!fp_eval_cmp_one(&c->pred.children[i], row)) { - pass = 0; - break; + const fp_cmp_t* eq = &c->pred.children[eq_idx]; + ray_t* col = eq->col_obj; + ray_index_t* ix = ray_index_payload(col->index); + const uint64_t mask = ix->u.hash.mask; + const int64_t* tbl = (const int64_t*)ray_data(ix->u.hash.table); + const int64_t* chn = (const int64_t*)ray_data(ix->u.hash.chain); + int64_t key = eq->cval; + + /* Recompute the same hash the builder used. numeric_key_word for + * an int* column zero/sign-extends to int64 then runs mix64 over + * the bit pattern. We match by width here. */ + uint64_t kbits; + switch (eq->col_esz) { + case 1: kbits = (uint64_t)(uint8_t)key; break; + case 2: kbits = (uint64_t)(int64_t)(int16_t)key; break; + case 4: kbits = (uint64_t)(int64_t)(int32_t)key; break; + default: kbits = (uint64_t)key; break; + } + /* mix64 inline — match idxop.c:mix64 byte-for-byte. 
*/ + uint64_t h = kbits; + h ^= h >> 30; h *= 0xbf58476d1ce4e5b9ULL; + h ^= h >> 27; h *= 0x94d049bb133111ebULL; + h ^= h >> 31; + int64_t rid = tbl[h & mask] - 1; + + while (rid >= 0) { + if (fp_cmp_read_i64_at(eq, rid) == key) { + uint8_t pass = 1; + for (uint8_t i = 0; i < c->pred.n_children; i++) { + if (i == eq_idx) continue; + if (!fp_eval_cmp_one(&c->pred.children[i], rid)) { + pass = 0; + break; + } + } + if (pass) { + if (mk_count_upsert_row(c, sh, rid) != 0) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } } } - if (!pass) continue; - if (mk_count_upsert_row(c, sh, row) != 0) { + rid = chn[rid] - 1; + } +} + +/* mk_par worker analog: walk the hash chain instead of scanning rows. + * For each matching row that passes the remaining predicate children, + * upsert into shard 0 and run the per-agg accumulate inline. This + * mirrors mk_par_fn's PASS-1 / PASS-2 split but per-row (matches are + * sparse, so a morsel-shaped batch is overkill — match count is + * usually < 10). Runs on a single thread for the same reason. 
*/ +static void mk_par_hash_fn(mk_par_ctx_t* c, uint8_t eq_idx) { + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + mk_shard_t* sh = &c->shards[0]; + uint8_t wide = c->wide; + uint8_t total_state = c->total_state; + uint8_t n_aggs = c->n_aggs; + if (!sh->slots) { + if (mk_shard_init(sh, c->init_cap, total_state, wide) != 0) { atomic_store_explicit(&c->oom, 1, memory_order_relaxed); return; } } + const fp_cmp_t* eq = &c->pred.children[eq_idx]; + ray_t* col = eq->col_obj; + ray_index_t* ix = ray_index_payload(col->index); + const uint64_t mask = ix->u.hash.mask; + const int64_t* tbl = (const int64_t*)ray_data(ix->u.hash.table); + const int64_t* chn = (const int64_t*)ray_data(ix->u.hash.chain); + int64_t key = eq->cval; + + uint64_t kbits; + switch (eq->col_esz) { + case 1: kbits = (uint64_t)(uint8_t)key; break; + case 2: kbits = (uint64_t)(int64_t)(int16_t)key; break; + case 4: kbits = (uint64_t)(int64_t)(int32_t)key; break; + default: kbits = (uint64_t)key; break; + } + uint64_t h = kbits; + h ^= h >> 30; h *= 0xbf58476d1ce4e5b9ULL; + h ^= h >> 27; h *= 0x94d049bb133111ebULL; + h ^= h >> 31; + int64_t rid = tbl[h & mask] - 1; + + while (rid >= 0) { + if (fp_cmp_read_i64_at(eq, rid) == key) { + uint8_t pass = 1; + for (uint8_t i = 0; i < c->pred.n_children; i++) { + if (i == eq_idx) continue; + if (!fp_eval_cmp_one(&c->pred.children[i], rid)) { + pass = 0; + break; + } + } + if (pass) { + /* Grow check + HT probe + per-agg accumulate. Single + * row at a time (no morsel batching) — matches are + * sparse, and the existing batched path's per-batch + * shard-grow loop would still re-fire here. 
*/ + if (sh->n_filled + 1 > (int64_t)(sh->cap / 2)) { + if (mk_shard_grow(sh, total_state, wide) != 0) { + atomic_store_explicit(&c->oom, 1, + memory_order_relaxed); + return; + } + } + int64_t* slots = sh->slots; + int64_t* state = sh->state; + uint64_t shm = sh->mask; + uint64_t s; + if (!wide) { + int64_t kv = mk_compose_key(c, rid); + uint64_t hk = (uint64_t)kv * 0x9E3779B97F4A7C15ULL; + hk ^= hk >> 33; + s = hk & shm; + for (;;) { + if (!slots[s * 2]) { + slots[s * 2] = 1; + slots[s * 2 + 1] = kv; + int64_t* st = &state[s * total_state]; + for (uint8_t a = 0; a < n_aggs; a++) { + const mk_agg_t* ag = &c->aggs[a]; + switch (ag->kind) { + case MK_AGG_COUNT: + case MK_AGG_SUM: + st[ag->state_off] = 0; break; + case MK_AGG_MIN: + st[ag->state_off] = INT64_MAX; break; + case MK_AGG_MAX: + st[ag->state_off] = INT64_MIN; break; + case MK_AGG_AVG: + st[ag->state_off ] = 0; + st[ag->state_off + 1] = 0; break; + } + } + sh->n_filled++; + break; + } + if (slots[s * 2 + 1] == kv) break; + s = (s + 1) & shm; + } + } else { + int64_t kv_lo, kv_hi; + mk_compose_key2(c, rid, &kv_lo, &kv_hi); + uint64_t hk = mk_hash_lo_hi(kv_lo, kv_hi); + s = hk & shm; + int64_t* slots_hi = sh->slots_hi; + for (;;) { + if (!slots[s * 2]) { + slots[s * 2] = 1; + slots[s * 2 + 1] = kv_lo; + slots_hi[s] = kv_hi; + int64_t* st = &state[s * total_state]; + for (uint8_t a = 0; a < n_aggs; a++) { + const mk_agg_t* ag = &c->aggs[a]; + switch (ag->kind) { + case MK_AGG_COUNT: + case MK_AGG_SUM: + st[ag->state_off] = 0; break; + case MK_AGG_MIN: + st[ag->state_off] = INT64_MAX; break; + case MK_AGG_MAX: + st[ag->state_off] = INT64_MIN; break; + case MK_AGG_AVG: + st[ag->state_off ] = 0; + st[ag->state_off + 1] = 0; break; + } + } + sh->n_filled++; + break; + } + if (slots[s * 2 + 1] == kv_lo && + slots_hi[s] == kv_hi) break; + s = (s + 1) & shm; + } + } + /* Per-agg accumulate for this row. 
*/ + int64_t* st = &state[s * total_state]; + for (uint8_t a = 0; a < n_aggs; a++) { + const mk_agg_t* ag = &c->aggs[a]; + uint8_t off = ag->state_off; + switch (ag->kind) { + case MK_AGG_COUNT: + st[off]++; + break; + case MK_AGG_SUM: { + int64_t v = mk_read_agg_i64(ag, rid); + st[off] += v; + break; + } + case MK_AGG_MIN: { + int64_t v = mk_read_agg_i64(ag, rid); + if (v < st[off]) st[off] = v; + break; + } + case MK_AGG_MAX: { + int64_t v = mk_read_agg_i64(ag, rid); + if (v > st[off]) st[off] = v; + break; + } + case MK_AGG_AVG: { + int64_t v = mk_read_agg_i64(ag, rid); + st[off ] += v; + st[off + 1] += 1; + break; + } + } + } + } + } + rid = chn[rid] - 1; + } } -/* ─── Worker fn — chunked vectorised aggregate update ─────────────── - * - * Per morsel we run two passes: - * - * PASS 1 (probe): linear-probe the HT for every passing row. On a - * new slot we initialize the per-agg state to a per-kind sentinel - * (0 for COUNT/SUM/AVG-sum, 0 for AVG-count, INT64_MAX for MIN, - * INT64_MIN for MAX) so the accumulate-only update logic in pass 2 - * produces the correct first value without a separate "first row" - * branch. Pass 1 fills slot_idx[i] (HT slot for the i-th passing row) - * and src_rows[i] (source row index) into stack-resident arrays. - * - * PASS 2 (update): for each aggregate, run a tight per-agg loop over - * match_count entries. No per-row switch dispatch — the kind switch - * is hoisted out of the loop, so each loop body is a single - * accumulate operation against state[slot_idx[i] * total + off]. - * - * Probe-then-update-per-aggregate eliminates the O(rows × aggs) branch - * dispatch the prior per-row update did. 
*/ -static void mk_par_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) { - mk_par_ctx_t* c = (mk_par_ctx_t*)raw; +static void mk_eq_i64_count_fn(void* raw, uint32_t worker_id, + int64_t start, int64_t end) { + mk_eq_i64_count_ctx_t* fc = (mk_eq_i64_count_ctx_t*)raw; + mk_par_ctx_t* c = fc->ctx; if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; mk_shard_t* sh = &c->shards[worker_id]; - uint8_t wide = c->wide; if (!sh->slots) { - if (mk_shard_init(sh, c->init_cap, c->total_state, wide) != 0) { + if (mk_shard_init(sh, c->init_cap, c->total_state, c->wide) != 0) { atomic_store_explicit(&c->oom, 1, memory_order_relaxed); return; } } - uint8_t total_state = c->total_state; - uint8_t n_aggs = c->n_aggs; + const fp_cmp_t* eq = &c->pred.children[fc->eq_idx]; + const int64_t* eq_col = (const int64_t*)eq->col_base; + int64_t eq_val = eq->cval; - int64_t row = start; - while (row < end) { - int64_t mend = row + RAY_MORSEL_ELEMS; + /* Chunk-skip: for each predicate child whose column carries a + * chunk_zone index, walk the row range in chunk strides and skip + * any chunk where the child's [min, max] proves an all-fail. For + * clustered columns (e.g. data sorted by CounterID, EventDate) this + * eliminates the per-row RefererHash/URLHash read for ~all chunks + * outside the matching counter / date range — q40/q41/q42 pattern. + * Picks chunk_log2 from any indexed child (every chunk_zone built + * by csv.read uses the same chunk_log2 today). Falls through to + * the plain per-row loop when no child has a usable index. 
*/ + uint8_t chunk_log2 = 0; + for (uint8_t i = 0; i < c->pred.n_children; i++) { + ray_t* co = c->pred.children[i].col_obj; + if (co && (co->attrs & RAY_ATTR_HAS_INDEX) && co->index) { + ray_index_t* ix = ray_index_payload(co->index); + if (ix->kind == RAY_IDX_CHUNK_ZONE && + ix->built_for_len == co->len) { + chunk_log2 = ix->u.chunk_zone.chunk_log2; + break; + } + } + } + + int64_t row = start; + while (row < end) { + int64_t chunk_end; + if (chunk_log2 > 0) { + int64_t csz = 1LL << chunk_log2; + chunk_end = ((row >> chunk_log2) + 1) << chunk_log2; + (void)csz; + if (chunk_end > end) chunk_end = end; + bool all_fail = false; + for (uint8_t i = 0; i < c->pred.n_children && !all_fail; i++) { + const fp_cmp_t* p = &c->pred.children[i]; + ray_t* co = p->col_obj; + if (!co || !(co->attrs & RAY_ATTR_HAS_INDEX) || !co->index) + continue; + ray_index_t* ix = ray_index_payload(co->index); + if (ix->kind != RAY_IDX_CHUNK_ZONE || + ix->built_for_len != co->len || + ix->u.chunk_zone.chunk_log2 != chunk_log2 || + ix->u.chunk_zone.is_f64) + continue; + fp_op_t op = p->op; + if (op != FP_EQ && op != FP_NE && op != FP_LT && + op != FP_LE && op != FP_GT && op != FP_GE) + continue; + int64_t s_ch = row >> chunk_log2; + if ((uint32_t)s_ch >= ix->u.chunk_zone.n_chunks) continue; + const int64_t* mins = (const int64_t*)ray_data(ix->u.chunk_zone.mins); + const int64_t* maxs = (const int64_t*)ray_data(ix->u.chunk_zone.maxs); + int64_t cmin = mins[s_ch], cmax = maxs[s_ch]; + if (cmin > cmax) continue; /* empty chunk */ + int64_t cv = p->cval; + switch (op) { + case FP_EQ: if (cv < cmin || cv > cmax) all_fail = true; break; + case FP_NE: if (cmin == cmax && cv == cmin) all_fail = true; break; + case FP_LT: if (cmin >= cv) all_fail = true; break; + case FP_LE: if (cmin > cv) all_fail = true; break; + case FP_GT: if (cmax <= cv) all_fail = true; break; + case FP_GE: if (cmax < cv) all_fail = true; break; + default: break; + } + } + if (all_fail) { row = chunk_end; continue; } + } else { + 
chunk_end = end; + } + + for (; row < chunk_end; row++) { + if (eq_col[row] != eq_val) continue; + uint8_t pass = 1; + for (uint8_t i = 0; i < c->pred.n_children; i++) { + if (i == fc->eq_idx) continue; + if (!fp_eval_cmp_one(&c->pred.children[i], row)) { + pass = 0; + break; + } + } + if (!pass) continue; + if (mk_count_upsert_row(c, sh, row) != 0) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + } + } +} + +/* ─── v2 worker fn — per-(worker, partition) shards ───────────────── + * + * Like mk_par_fn but routes every passing row by RADIX_PART(hash) into + * one of MK_RADIX_P small per-(worker, partition) shards. Each small + * shard stays cache-resident as it fills, so the probe never walks a + * 5–10 MB monolithic per-worker shard. Pass-1 (probe) and pass-2 + * (agg update) are fused per-row here: any partition may grow on any + * row, so a deferred pass-2 over recorded slot indexes would dereference + * stale slots after a rehash. Combine merges per partition. 
*/ +static inline void mk_v2_apply_agg_inline(mk_par_ctx_t* c, int64_t* state_slot, + int64_t source_row, + uint8_t n_aggs, uint8_t total_state) +{ + (void)total_state; + for (uint8_t a = 0; a < n_aggs; a++) { + const mk_agg_t* ag = &c->aggs[a]; + uint8_t off = ag->state_off; + switch (ag->kind) { + case MK_AGG_COUNT: + state_slot[off]++; + break; + case MK_AGG_SUM: { + int64_t v = mk_read_agg_i64(ag, source_row); + state_slot[off] += v; + break; + } + case MK_AGG_MIN: { + int64_t v = mk_read_agg_i64(ag, source_row); + if (v < state_slot[off]) state_slot[off] = v; + break; + } + case MK_AGG_MAX: { + int64_t v = mk_read_agg_i64(ag, source_row); + if (v > state_slot[off]) state_slot[off] = v; + break; + } + case MK_AGG_AVG: { + int64_t v = mk_read_agg_i64(ag, source_row); + state_slot[off ] += v; + state_slot[off + 1] += 1; + break; + } + } + } +} + +static void mk_par_v2_fn(void* raw, uint32_t worker_id, + int64_t start, int64_t end) +{ + mk_par_ctx_t* c = (mk_par_ctx_t*)raw; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + uint8_t wide = c->wide; + uint8_t total_state = c->total_state; + uint8_t n_aggs = c->n_aggs; + mk_shard_t* my_shards = &c->wpart_shards[(size_t)worker_id * MK_RADIX_P]; + + /* Eager partition init. Upfront cost: MK_RADIX_P × init_cap shards + * per worker (~256 × 256 × ~30 B = 2 MB for 4-slot state per worker; + * 16 MB across 8 workers — comfortably L3-resident). Saves a per-row + * branch (~10M iterations on q31/q32-class queries) for the rest of + * the scan. ray_pool_dispatch reuses the same task across morsel + * slices but assigns a fresh worker_id per task call, so guard with + * the slots check so re-entry skips. 
*/ + for (uint32_t p = 0; p < MK_RADIX_P; p++) { + if (my_shards[p].slots) continue; + if (mk_shard_init(&my_shards[p], c->init_cap, + total_state, wide) != 0) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + } + + int64_t row = start; + while (row < end) { + int64_t mend = row + RAY_MORSEL_ELEMS; + if (mend > end) mend = end; + int64_t mlen = mend - row; + uint8_t bits[RAY_MORSEL_ELEMS]; + fp_eval_pred(&c->pred, row, mend, bits); + + int match_count = 0; + for (int64_t r = 0; r < mlen; r++) match_count += bits[r]; + if (match_count == 0) { row = mend; continue; } + int64_t base_row = row; + + if (!wide) { + for (int64_t r = 0; r < mlen; r++) { + if (!bits[r]) continue; + int64_t source_row = base_row + r; + int64_t kv = mk_compose_key(c, source_row); + uint64_t h = (uint64_t)kv * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + uint32_t p = MK_RADIX_PART(h); + mk_shard_t* sh = &my_shards[p]; + if (sh->n_filled + 1 > (int64_t)(sh->cap / 2)) { + if (mk_shard_grow(sh, total_state, wide) != 0) { + atomic_store_explicit(&c->oom, 1, + memory_order_relaxed); + return; + } + } + int64_t* slots = sh->slots; + int64_t* state = sh->state; + uint64_t mask = sh->mask; + uint64_t s = h & mask; + for (;;) { + if (!slots[s * 2]) { + slots[s * 2] = 1; + slots[s * 2 + 1] = kv; + int64_t* st = &state[s * total_state]; + for (uint8_t a = 0; a < n_aggs; a++) { + const mk_agg_t* ag = &c->aggs[a]; + switch (ag->kind) { + case MK_AGG_COUNT: + case MK_AGG_SUM: + st[ag->state_off] = 0; break; + case MK_AGG_MIN: + st[ag->state_off] = INT64_MAX; break; + case MK_AGG_MAX: + st[ag->state_off] = INT64_MIN; break; + case MK_AGG_AVG: + st[ag->state_off ] = 0; + st[ag->state_off + 1] = 0; break; + } + } + sh->n_filled++; + break; + } + if (slots[s * 2 + 1] == kv) break; + s = (s + 1) & mask; + } + mk_v2_apply_agg_inline(c, &state[s * total_state], + source_row, n_aggs, total_state); + } + } else { + for (int64_t r = 0; r < mlen; r++) { + if (!bits[r]) continue; + int64_t 
source_row = base_row + r; + int64_t kv_lo, kv_hi; + mk_compose_key2(c, source_row, &kv_lo, &kv_hi); + uint64_t h = mk_hash_lo_hi(kv_lo, kv_hi); + uint32_t p = MK_RADIX_PART(h); + mk_shard_t* sh = &my_shards[p]; + if (sh->n_filled + 1 > (int64_t)(sh->cap / 2)) { + if (mk_shard_grow(sh, total_state, wide) != 0) { + atomic_store_explicit(&c->oom, 1, + memory_order_relaxed); + return; + } + } + int64_t* slots = sh->slots; + int64_t* slots_hi = sh->slots_hi; + int64_t* state = sh->state; + uint64_t mask = sh->mask; + uint64_t s = h & mask; + for (;;) { + if (!slots[s * 2]) { + slots[s * 2] = 1; + slots[s * 2 + 1] = kv_lo; + slots_hi[s] = kv_hi; + int64_t* st = &state[s * total_state]; + for (uint8_t a = 0; a < n_aggs; a++) { + const mk_agg_t* ag = &c->aggs[a]; + switch (ag->kind) { + case MK_AGG_COUNT: + case MK_AGG_SUM: + st[ag->state_off] = 0; break; + case MK_AGG_MIN: + st[ag->state_off] = INT64_MAX; break; + case MK_AGG_MAX: + st[ag->state_off] = INT64_MIN; break; + case MK_AGG_AVG: + st[ag->state_off ] = 0; + st[ag->state_off + 1] = 0; break; + } + } + sh->n_filled++; + break; + } + if (slots[s * 2 + 1] == kv_lo && slots_hi[s] == kv_hi) break; + s = (s + 1) & mask; + } + mk_v2_apply_agg_inline(c, &state[s * total_state], + source_row, n_aggs, total_state); + } + } + + row = mend; + } +} + +/* ─── Worker fn — chunked vectorised aggregate update ─────────────── + * + * Per morsel we run two passes: + * + * PASS 1 (probe): linear-probe the HT for every passing row. On a + * new slot we initialize the per-agg state to a per-kind sentinel + * (0 for COUNT/SUM/AVG-sum, 0 for AVG-count, INT64_MAX for MIN, + * INT64_MIN for MAX) so the accumulate-only update logic in pass 2 + * produces the correct first value without a separate "first row" + * branch. Pass 1 fills slot_idx[i] (HT slot for the i-th passing row) + * and src_rows[i] (source row index) into stack-resident arrays. 
+ * + * PASS 2 (update): for each aggregate, run a tight per-agg loop over + * match_count entries. No per-row switch dispatch — the kind switch + * is hoisted out of the loop, so each loop body is a single + * accumulate operation against state[slot_idx[i] * total + off]. + * + * Probe-then-update-per-aggregate eliminates the O(rows × aggs) branch + * dispatch the prior per-row update did. */ +static void mk_par_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) { + mk_par_ctx_t* c = (mk_par_ctx_t*)raw; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + mk_shard_t* sh = &c->shards[worker_id]; + uint8_t wide = c->wide; + if (!sh->slots) { + if (mk_shard_init(sh, c->init_cap, c->total_state, wide) != 0) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + } + + uint8_t total_state = c->total_state; + uint8_t n_aggs = c->n_aggs; + + int64_t row = start; + while (row < end) { + int64_t mend = row + RAY_MORSEL_ELEMS; if (mend > end) mend = end; int64_t mlen = mend - row; uint8_t bits[RAY_MORSEL_ELEMS]; @@ -2830,49 +3604,122 @@ static void mk_apply_count_emit_filter(const mk_par_ctx_t* c, int64_t* gs, int64_t* gst, int64_t gcap, int64_t* global_n) { + /* Two-mode emit-filter pass over the deduped (gs, gst) layout: + * + * 1. min_count_exclusive (heavy-hitter): drop rows whose COUNT + * value is at or below the threshold. Only fires for COUNT. + * + * 2. top_count_take (top-N): drop rows that aren't in the top-N + * ordered by the configured agg op (COUNT/SUM/MIN/MAX). Both + * desc (largest N) and asc (smallest N) are supported. The + * producer (query.c's match_group_desc_count_take) sets + * emit_filter.agg_op and emit_filter.desc accordingly; an + * unset agg_op defaults to OP_COUNT for the historical + * single-mode filter. 
+ * + * AVG / STDDEV / VAR / PEARSON / MEDIAN are excluded — their + * ordering doesn't reduce to a single int64 row-slot read, so + * filters over those aggs must fall back to the post-materialize + * sort + take path. SYM-typed MIN/MAX are similarly excluded + * because the stored value is an interned id whose natural order + * is not the lexicographic order users expect (a mismatch only + * relevant when the desc:/asc: orders the output). */ ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get(); if (!emit_filter.enabled || emit_filter.agg_index >= c->n_aggs) return; - const mk_agg_t* count_agg = &c->aggs[emit_filter.agg_index]; - if (count_agg->kind != MK_AGG_COUNT) + const mk_agg_t* order_agg = &c->aggs[emit_filter.agg_index]; + uint16_t order_op = emit_filter.agg_op + ? emit_filter.agg_op + : (uint16_t)OP_COUNT; + /* min_count_exclusive remains COUNT-only — it represents a + * heavy-hitter threshold inherited from the WHERE clause and + * doesn't generalize to SUM/MIN/MAX semantics. */ + int64_t keep_min = (order_op == OP_COUNT) + ? emit_filter.min_count_exclusive + 1 + : 1; + int64_t k_take = emit_filter.top_count_take; + uint8_t desc_dir = emit_filter.desc; + if (order_op == OP_COUNT && !emit_filter.desc) desc_dir = 1; + + /* Map order_op → mk_agg kind, reject incompatible shapes. 
*/ + if (order_op == OP_COUNT) { + if (order_agg->kind != MK_AGG_COUNT) return; + } else if (order_op == OP_SUM) { + if (order_agg->kind != MK_AGG_SUM) return; + } else if (order_op == OP_MIN) { + if (order_agg->kind != MK_AGG_MIN) return; + if (order_agg->in_type == RAY_SYM) return; + } else if (order_op == OP_MAX) { + if (order_agg->kind != MK_AGG_MAX) return; + if (order_agg->in_type == RAY_SYM) return; + } else { return; + } - int64_t keep_min = emit_filter.min_count_exclusive + 1; - int64_t k_take = emit_filter.top_count_take; if (k_take > 0 && k_take < *global_n) { ray_t* heap_hdr = NULL; int64_t* heap = (int64_t*)scratch_alloc(&heap_hdr, (size_t)k_take * sizeof(int64_t)); if (heap) { int64_t heap_n = 0; + /* For desc (top-N largest): min-heap, root = smallest. + * For asc (top-N smallest): max-heap, root = largest. */ + #define MK_TOPN_NEEDS_SWAP(parent, child) \ + (desc_dir ? ((parent) > (child)) : ((parent) < (child))) + #define MK_TOPN_SHOULD_REPLACE(nv, rv) \ + (desc_dir ? ((nv) > (rv)) : ((nv) < (rv))) for (int64_t s = 0; s < gcap; s++) { if (!gs[s * 2]) continue; - int64_t cnt = gst[(size_t)s * c->total_state + count_agg->state_off]; + int64_t v = gst[(size_t)s * c->total_state + order_agg->state_off]; if (heap_n < k_take) { int64_t j = heap_n++; - heap[j] = cnt; + heap[j] = v; while (j > 0) { int64_t p = (j - 1) >> 1; - if (heap[p] <= heap[j]) break; + if (!MK_TOPN_NEEDS_SWAP(heap[p], heap[j])) break; int64_t tmp = heap[p]; heap[p] = heap[j]; heap[j] = tmp; j = p; } - } else if (cnt > heap[0]) { - heap[0] = cnt; + } else if (MK_TOPN_SHOULD_REPLACE(v, heap[0])) { + heap[0] = v; int64_t j = 0; for (;;) { int64_t l = j * 2 + 1, r = l + 1, m = j; - if (l < heap_n && heap[l] < heap[m]) m = l; - if (r < heap_n && heap[r] < heap[m]) m = r; + if (l < heap_n && MK_TOPN_NEEDS_SWAP(heap[m], heap[l])) m = l; + if (r < heap_n && MK_TOPN_NEEDS_SWAP(heap[m], heap[r])) m = r; if (m == j) break; int64_t tmp = heap[m]; heap[m] = heap[j]; heap[j] = tmp; j = m; } } } - 
if (heap_n == k_take && heap[0] > keep_min) - keep_min = heap[0]; + #undef MK_TOPN_NEEDS_SWAP + #undef MK_TOPN_SHOULD_REPLACE + if (heap_n == k_take) { + /* heap[0] is the worst surviving value. Compute a + * scalar threshold so the compaction sweep below can + * read it without checking direction per row. */ + int64_t threshold = heap[0]; + int64_t kept = 0; + for (int64_t s = 0; s < gcap; s++) { + if (!gs[s * 2]) continue; + int64_t v = gst[(size_t)s * c->total_state + order_agg->state_off]; + bool survives = desc_dir ? (v >= threshold) : (v <= threshold); + if (!survives) { + gs[s * 2] = 0; + } else if (order_op == OP_COUNT && v < keep_min) { + /* min_count_exclusive threshold combines with top-N + * by AND — drop rows that fail either. */ + gs[s * 2] = 0; + } else { + kept++; + } + } + *global_n = kept; + scratch_free(heap_hdr); + return; + } scratch_free(heap_hdr); } } @@ -2883,7 +3730,7 @@ static void mk_apply_count_emit_filter(const mk_par_ctx_t* c, int64_t kept = 0; for (int64_t s = 0; s < gcap; s++) { if (!gs[s * 2]) continue; - int64_t cnt = gst[(size_t)s * c->total_state + count_agg->state_off]; + int64_t cnt = gst[(size_t)s * c->total_state + order_agg->state_off]; if (cnt < keep_min) { gs[s * 2] = 0; } else { @@ -3364,6 +4211,320 @@ static int mk_combine_parallel(mk_par_ctx_t* c, uint32_t nw, return 1; } +/* ─── v2 per-partition combine ────────────────────────────────────── + * + * Shards in c->wpart_shards are already RADIX-partitioned (each holds + * only entries whose hash routes to that partition). The v1 combine + * had to histogram + scatter before per-partition dedup; here we go + * straight to per-partition dedup — task p just walks all workers' + * shard at index w*MK_RADIX_P+p and merges into a single target HT. + * Per-partition tasks are fully independent: each task only writes + * to its own target HT and its own slot in the part_* arrays. 
*/ + +typedef struct { + mk_par_ctx_t* ctx; + uint32_t nw; /* workers per partition */ + uint8_t total_state; + uint8_t wide; + const mk_agg_t* aggs; + uint8_t n_aggs; + /* Per-partition output buffers (MK_RADIX_P slots). */ + int64_t** part_keys; /* [P]: kv_lo array, size part_n[p] */ + int64_t** part_keys_hi; /* [P]: kv_hi array, NULL when narrow */ + int64_t** part_states; /* [P]: state[part_n[p] * total_state] */ + ray_t** part_keys_hdr; + ray_t** part_keys_hi_hdr; + ray_t** part_states_hdr; + int64_t* part_n; + _Atomic(uint32_t) oom; +} mk_combine_v2_ctx_t; + +static void mk_combine_v2_part_fn(void* vctx, uint32_t worker_id, + int64_t start, int64_t end) +{ + (void)worker_id; + mk_combine_v2_ctx_t* c = (mk_combine_v2_ctx_t*)vctx; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + uint8_t total_state = c->total_state; + uint8_t wide = c->wide; + uint8_t n_aggs = c->n_aggs; + uint32_t nw = c->nw; + + for (int64_t p = start; p < end; p++) { + /* Upper bound on the merged partition: sum of worker fills (some + * keys may appear in multiple workers; the merge folds those, so + * final n_filled ≤ total). */ + int64_t total = 0; + for (uint32_t w = 0; w < nw; w++) { + total += c->ctx->wpart_shards[(size_t)w * MK_RADIX_P + p].n_filled; + } + if (total == 0) { + c->part_n[p] = 0; + continue; + } + + /* Target HT sized to fit `total` at load ≤ 0.5; pow-of-2. */ + uint64_t cap = 256; + while (cap < (uint64_t)(total * 2)) cap <<= 1; + + mk_shard_t target; + memset(&target, 0, sizeof(target)); + if (mk_shard_init(&target, cap, total_state, wide) != 0) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + + /* Merge each worker's shard for this partition into target. 
*/ + for (uint32_t w = 0; w < nw; w++) { + mk_shard_t* src = &c->ctx->wpart_shards[(size_t)w * MK_RADIX_P + p]; + if (!src->slots) continue; + int64_t* src_slots = src->slots; + int64_t* src_slots_hi = src->slots_hi; + int64_t* src_state = src->state; + uint64_t src_cap = src->cap; + int64_t* tgt_slots = target.slots; + int64_t* tgt_slots_hi = target.slots_hi; + int64_t* tgt_state = target.state; + uint64_t tgt_mask = target.mask; + + for (uint64_t s = 0; s < src_cap; s++) { + if (!src_slots[s * 2]) continue; + int64_t kv_lo = src_slots[s * 2 + 1]; + int64_t kv_hi = wide ? src_slots_hi[s] : 0; + uint64_t h; + if (wide) { + h = mk_hash_lo_hi(kv_lo, kv_hi); + } else { + h = (uint64_t)kv_lo * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + } + uint64_t t = h & tgt_mask; + const int64_t* sst = &src_state[s * total_state]; + for (;;) { + if (!tgt_slots[t * 2]) { + tgt_slots[t * 2] = 1; + tgt_slots[t * 2 + 1] = kv_lo; + if (wide) tgt_slots_hi[t] = kv_hi; + int64_t* dst = &tgt_state[t * total_state]; + for (uint8_t k = 0; k < total_state; k++) + dst[k] = sst[k]; + target.n_filled++; + break; + } + if (tgt_slots[t * 2 + 1] == kv_lo && + (!wide || tgt_slots_hi[t] == kv_hi)) + { + mk_state_merge(&tgt_state[t * total_state], + sst, c->aggs, n_aggs); + break; + } + t = (t + 1) & tgt_mask; + } + } + } + + /* Pack target into dense per-partition output arrays. 
*/ + int64_t pn = target.n_filled; + c->part_n[p] = pn; + c->part_keys[p] = (int64_t*)scratch_alloc( + &c->part_keys_hdr[p], (size_t)pn * sizeof(int64_t)); + if (wide) { + c->part_keys_hi[p] = (int64_t*)scratch_alloc( + &c->part_keys_hi_hdr[p], (size_t)pn * sizeof(int64_t)); + } + c->part_states[p] = (int64_t*)scratch_alloc( + &c->part_states_hdr[p], + (size_t)pn * total_state * sizeof(int64_t)); + if (!c->part_keys[p] || (wide && !c->part_keys_hi[p]) || + !c->part_states[p]) + { + mk_shard_free(&target); + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + int64_t gi = 0; + int64_t* tgt_slots = target.slots; + int64_t* tgt_slots_hi = target.slots_hi; + int64_t* tgt_state = target.state; + for (uint64_t t = 0; t < target.cap; t++) { + if (!tgt_slots[t * 2]) continue; + c->part_keys[p][gi] = tgt_slots[t * 2 + 1]; + if (wide) c->part_keys_hi[p][gi] = tgt_slots_hi[t]; + const int64_t* src = &tgt_state[t * total_state]; + int64_t* dst = &c->part_states[p][gi * total_state]; + for (uint8_t k = 0; k < total_state; k++) dst[k] = src[k]; + gi++; + } + + mk_shard_free(&target); + } +} + +/* Drives the v2 per-partition combine. Returns 1 on success (fills + * out_* with a dense gs/gst layout identical to mk_combine_parallel), + * 0 on failure (caller falls back to the slow path). */ +static int mk_combine_v2_parallel(mk_par_ctx_t* c, uint32_t nw, + int64_t** out_gs, ray_t** out_gs_hdr, + int64_t** out_gs_hi, ray_t** out_gs_hi_hdr, + int64_t** out_gst, ray_t** out_gst_hdr, + int64_t* out_gcap, int64_t* out_global_n) +{ + uint8_t total_state = c->total_state; + uint8_t wide = c->wide; + ray_pool_t* pool = ray_pool_get(); + + /* Per-partition state arrays (MK_RADIX_P slots each). 
*/ + ray_t* pk_hdr = NULL; + ray_t* pkhi_hdr = NULL; + ray_t* ps_hdr = NULL; + ray_t* pkh_hdr = NULL; + ray_t* pkhh_hdr = NULL; + ray_t* psh_hdr = NULL; + ray_t* pn_hdr = NULL; + int64_t** part_keys = (int64_t**)scratch_calloc( + &pk_hdr, (size_t)MK_RADIX_P * sizeof(int64_t*)); + int64_t** part_keys_hi = wide + ? (int64_t**)scratch_calloc(&pkhi_hdr, + (size_t)MK_RADIX_P * sizeof(int64_t*)) + : NULL; + int64_t** part_states = (int64_t**)scratch_calloc( + &ps_hdr, (size_t)MK_RADIX_P * sizeof(int64_t*)); + ray_t** part_keys_hdr = (ray_t**)scratch_calloc( + &pkh_hdr, (size_t)MK_RADIX_P * sizeof(ray_t*)); + ray_t** part_keys_hi_hdr = wide + ? (ray_t**)scratch_calloc(&pkhh_hdr, + (size_t)MK_RADIX_P * sizeof(ray_t*)) + : NULL; + ray_t** part_states_hdr = (ray_t**)scratch_calloc( + &psh_hdr, (size_t)MK_RADIX_P * sizeof(ray_t*)); + int64_t* part_n = (int64_t*)scratch_calloc( + &pn_hdr, (size_t)MK_RADIX_P * sizeof(int64_t)); + + if (!part_keys || !part_states || !part_keys_hdr || + !part_states_hdr || !part_n || + (wide && (!part_keys_hi || !part_keys_hi_hdr))) + { + if (pk_hdr) scratch_free(pk_hdr); + if (pkhi_hdr) scratch_free(pkhi_hdr); + if (ps_hdr) scratch_free(ps_hdr); + if (pkh_hdr) scratch_free(pkh_hdr); + if (pkhh_hdr) scratch_free(pkhh_hdr); + if (psh_hdr) scratch_free(psh_hdr); + if (pn_hdr) scratch_free(pn_hdr); + return 0; + } + + mk_combine_v2_ctx_t pctx = { + .ctx = c, + .nw = nw, + .total_state = total_state, + .wide = wide, + .aggs = c->aggs, + .n_aggs = c->n_aggs, + .part_keys = part_keys, + .part_keys_hi = part_keys_hi, + .part_states = part_states, + .part_keys_hdr = part_keys_hdr, + .part_keys_hi_hdr = part_keys_hi_hdr, + .part_states_hdr = part_states_hdr, + .part_n = part_n, + .oom = 0, + }; + + if (pool && ray_pool_total_workers(pool) >= 2) { + ray_pool_dispatch_n(pool, mk_combine_v2_part_fn, &pctx, + (uint32_t)MK_RADIX_P); + } else { + mk_combine_v2_part_fn(&pctx, 0, 0, (int64_t)MK_RADIX_P); + } + + if (atomic_load_explicit(&pctx.oom, 
memory_order_relaxed)) { + for (uint64_t p = 0; p < MK_RADIX_P; p++) { + if (part_keys_hdr[p]) scratch_free(part_keys_hdr[p]); + if (part_keys_hi_hdr && part_keys_hi_hdr[p]) + scratch_free(part_keys_hi_hdr[p]); + if (part_states_hdr[p]) scratch_free(part_states_hdr[p]); + } + scratch_free(pk_hdr); if (pkhi_hdr) scratch_free(pkhi_hdr); + scratch_free(ps_hdr); + scratch_free(pkh_hdr); if (pkhh_hdr) scratch_free(pkhh_hdr); + scratch_free(psh_hdr); + scratch_free(pn_hdr); + return 0; + } + + /* Concat per-partition outputs into dense gs/gs_hi/gst. */ + int64_t global_n = 0; + for (uint64_t p = 0; p < MK_RADIX_P; p++) global_n += part_n[p]; + + ray_t* gs_hdr = NULL; + ray_t* gs_hi_hdr = NULL; + ray_t* gst_hdr = NULL; + int64_t* gs = (int64_t*)scratch_calloc( + &gs_hdr, (size_t)global_n * 2 * sizeof(int64_t)); + int64_t* gs_hi = wide + ? (int64_t*)scratch_alloc(&gs_hi_hdr, + (size_t)global_n * sizeof(int64_t)) + : NULL; + int64_t* gst = (int64_t*)scratch_alloc( + &gst_hdr, (size_t)global_n * total_state * sizeof(int64_t)); + if (!gs || (wide && !gs_hi) || !gst) { + if (gs_hdr) scratch_free(gs_hdr); + if (gs_hi_hdr) scratch_free(gs_hi_hdr); + if (gst_hdr) scratch_free(gst_hdr); + for (uint64_t p = 0; p < MK_RADIX_P; p++) { + if (part_keys_hdr[p]) scratch_free(part_keys_hdr[p]); + if (part_keys_hi_hdr && part_keys_hi_hdr[p]) + scratch_free(part_keys_hi_hdr[p]); + if (part_states_hdr[p]) scratch_free(part_states_hdr[p]); + } + scratch_free(pk_hdr); if (pkhi_hdr) scratch_free(pkhi_hdr); + scratch_free(ps_hdr); + scratch_free(pkh_hdr); if (pkhh_hdr) scratch_free(pkhh_hdr); + scratch_free(psh_hdr); + scratch_free(pn_hdr); + return 0; + } + + int64_t gi = 0; + for (uint64_t p = 0; p < MK_RADIX_P; p++) { + int64_t pn = part_n[p]; + if (pn == 0) continue; + const int64_t* pk = part_keys[p]; + const int64_t* pkhi = part_keys_hi ? 
part_keys_hi[p] : NULL; + const int64_t* ps = part_states[p]; + for (int64_t i = 0; i < pn; i++) { + gs[gi * 2] = 1; + gs[gi * 2 + 1] = pk[i]; + if (wide) gs_hi[gi] = pkhi[i]; + int64_t* dst = &gst[gi * total_state]; + const int64_t* src = &ps[i * total_state]; + for (uint8_t k = 0; k < total_state; k++) dst[k] = src[k]; + gi++; + } + if (part_keys_hdr[p]) scratch_free(part_keys_hdr[p]); + if (part_keys_hi_hdr && part_keys_hi_hdr[p]) + scratch_free(part_keys_hi_hdr[p]); + if (part_states_hdr[p]) scratch_free(part_states_hdr[p]); + } + + scratch_free(pk_hdr); if (pkhi_hdr) scratch_free(pkhi_hdr); + scratch_free(ps_hdr); + scratch_free(pkh_hdr); if (pkhh_hdr) scratch_free(pkhh_hdr); + scratch_free(psh_hdr); + scratch_free(pn_hdr); + + *out_gs = gs; + *out_gs_hdr = gs_hdr; + *out_gs_hi = gs_hi; + *out_gs_hi_hdr = gs_hi_hdr; + *out_gst = gst; + *out_gst_hdr = gst_hdr; + *out_gcap = global_n; + *out_global_n = global_n; + return 1; +} + static ray_t* mk_combine_and_materialize(mk_par_ctx_t* c, uint32_t nw, const uint16_t* agg_op_ids) { @@ -3377,7 +4538,13 @@ static ray_t* mk_combine_and_materialize(mk_par_ctx_t* c, uint32_t nw, for (uint32_t w = 0; w < nw; w++) total_local += shards[w].n_filled; /* Try parallel combine first. On success, jump straight to the - * materialize section with the already-built gs/gs_hi/gst arrays. */ + * materialize section with the already-built gs/gs_hi/gst arrays. + * + * v2 path: when wpart_shards is set, shards are pre-partitioned by + * RADIX_PART(h). mk_combine_v2_parallel skips the histogram/scatter + * passes entirely — each partition is dedupped independently and + * the per-(worker, partition) shards already have the right entries. + * v1 path: mk_combine_parallel histogram+scatter+dedup. 
*/ int64_t* gs = NULL; int64_t* gs_hi = NULL; int64_t* gst = NULL; @@ -3386,11 +4553,30 @@ static ray_t* mk_combine_and_materialize(mk_par_ctx_t* c, uint32_t nw, ray_t* gst_hdr = NULL; int64_t gcap = 0; int64_t global_n = 0; - int parallel_ok = mk_combine_parallel(c, nw, + int parallel_ok = 0; + /* v2 combine target HT scales with per-partition cardinality + * (total_local / MK_RADIX_P). For very-high-card queries (q32: + * ~10M unique groups → ~313K per partition → ~1 M-slot HT × 32 + * partitions ≈ 768 MB allocated) the per-partition HTs blow the + * working set out of cache; v1's scatter-then-dedup is bounded + * by smaller per-combine-partition slices and wins. ~16 K + * entries per partition keeps each target HT in L2 (~1.5 MB + * with 4-slot state). */ + int v2_combine_ok = c->wpart_shards != NULL && + ((uint64_t)total_local / MK_RADIX_P) <= (1ULL << 14); + if (v2_combine_ok) { + parallel_ok = mk_combine_v2_parallel(c, nw / MK_RADIX_P, + &gs, &gs_hdr, + &gs_hi, &gs_hi_hdr, + &gst, &gst_hdr, + &gcap, &global_n); + } else { + parallel_ok = mk_combine_parallel(c, nw, &gs, &gs_hdr, &gs_hi, &gs_hi_hdr, &gst, &gst_hdr, &gcap, &global_n); + } if (parallel_ok) goto materialize; { @@ -3698,27 +4884,100 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext, } if (nrows < 0) return ray_error("nyi", NULL); - ctx.init_cap = FP_SHARD_INIT_CAP; atomic_store_explicit(&ctx.oom, 0, memory_order_relaxed); ray_pool_t* pool = ray_pool_get(); uint32_t nw = pool ? 
ray_pool_total_workers(pool) : 1; - ray_t* shards_hdr = NULL; - ctx.shards = (mk_shard_t*)scratch_calloc(&shards_hdr, - (size_t)nw * sizeof(mk_shard_t)); - if (!ctx.shards) return ray_error("oom", NULL); int eq_i64_idx = -1; if (ctx.n_aggs == 1 && ctx.aggs[0].kind == MK_AGG_COUNT && ctx.pred.n_children > 1) { eq_i64_idx = mk_find_i64_eq_child(&ctx.pred); } - if (eq_i64_idx >= 0) { + /* Hash-index probe: if any FP_EQ child sits on a column with a + * fresh RAY_IDX_HASH, walk the chain instead of scanning rows. + * Single-thread — match counts on a point lookup are too small + * to justify pool dispatch. + * + * Multi-predicate filters fall through: queries that combine a + * hash-indexed eq with one or more other predicates (e.g. a + * chunk-zone-clustered CounterID/EventDate range) win more from + * the parallel chunk-skip scan in mk_eq_i64_count_fn / + * mk_par_fn than from a hash chain walk forced into single- + * threaded execution. */ + int hash_eq_idx = (ctx.pred.n_children == 1) + ? mk_find_hash_eq_child(&ctx.pred) + : -1; + + /* v2 gate: pre-partitioned shards win on high-cardinality multi-key + * group-bys (q30/q31/q32 family) by keeping each per-(worker, + * partition) shard cache-resident. Exclude shapes where v1's + * existing fast paths already win: + * - hash-eq or eq_i64 chunk-skip scans (single-shard inserts) + * - n_aggs == 0 (degenerate) + * - n_keys == 1: v1's hot k0_base path is already L1-friendly + * - SYM keys: existing tuned SYM path beats v2 (q33/q34) + * - nullable agg input: v1's existing nullmask path; v2 does not + * yet track per-agg null counts during merge + * Multi-key with COUNT/SUM/AVG aggs (no MIN/MAX): the v2 partition + * shards cleanly merge by summing state slots. 
*/ + bool v2_ok = (hash_eq_idx < 0 && eq_i64_idx < 0 && + ctx.n_aggs >= 1 && ctx.n_keys >= 2); + for (uint8_t k = 0; k < ctx.n_keys && v2_ok; k++) { + if (ctx.keys[k].type == RAY_SYM) v2_ok = false; + } + for (uint8_t a = 0; a < ctx.n_aggs && v2_ok; a++) { + mk_agg_kind_t kk = ctx.aggs[a].kind; + if (kk != MK_AGG_COUNT && kk != MK_AGG_SUM && kk != MK_AGG_AVG) { + v2_ok = false; + } + if (ctx.aggs[a].in_attrs & RAY_ATTR_HAS_NULLS) v2_ok = false; + } + + /* Init capacity per shard. + * v1 (single shard per worker): pre-size to a fraction of nrows so + * high-cardinality scans pay fewer rehashes. + * v2 (MK_RADIX_P shards per worker): each partition holds ~1/MK_RADIX_P of + * the worker's groups. Start at 256 slots — matches group.c v2's + * design (~64 KB per partition with a 4-slot agg state) and keeps + * the upfront allocation total to a few MB instead of tens of MB. + * Sparse keys still grow on-demand. */ + if (v2_ok) { + ctx.init_cap = 256; + } else { + uint64_t expected = (uint64_t)nrows / ((uint64_t)nw * 16u); + uint64_t init_cap = FP_SHARD_INIT_CAP; + while (init_cap < expected * 2u && init_cap < (1ULL << 14)) + init_cap <<= 1; + ctx.init_cap = init_cap; + } + + /* Allocate the shard array. v2 uses nw * MK_RADIX_P slots, all + * stored in the same array — combine_and_materialize iterates + * `nw_effective` shards, which equals nw for v1 and nw * MK_RADIX_P + * for v2. Both layouts use the same mk_shard_t per slot. */ + uint32_t nw_effective = v2_ok ?
(nw * MK_RADIX_P) : nw; + ray_t* shards_hdr = NULL; + ctx.shards = (mk_shard_t*)scratch_calloc( + &shards_hdr, (size_t)nw_effective * sizeof(mk_shard_t)); + if (!ctx.shards) return ray_error("oom", NULL); + if (v2_ok) ctx.wpart_shards = ctx.shards; + + if (hash_eq_idx >= 0 && ctx.n_aggs == 1 && + ctx.aggs[0].kind == MK_AGG_COUNT) { + mk_eq_hash_count_fn(&ctx, (uint8_t)hash_eq_idx); + } else if (hash_eq_idx >= 0) { + mk_par_hash_fn(&ctx, (uint8_t)hash_eq_idx); + } else if (eq_i64_idx >= 0) { mk_eq_i64_count_ctx_t fctx = { .ctx = &ctx, .eq_idx = (uint8_t)eq_i64_idx, }; if (pool) ray_pool_dispatch(pool, mk_eq_i64_count_fn, &fctx, nrows); else mk_eq_i64_count_fn(&fctx, 0, 0, nrows); + } else if (v2_ok && pool) { + ray_pool_dispatch(pool, mk_par_v2_fn, &ctx, nrows); + } else if (v2_ok) { + mk_par_v2_fn(&ctx, 0, 0, nrows); } else if (pool) { ray_pool_dispatch(pool, mk_par_fn, &ctx, nrows); } else { @@ -3726,13 +4985,16 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext, } if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) { - for (uint32_t w = 0; w < nw; w++) mk_shard_free(&ctx.shards[w]); + for (uint32_t w = 0; w < nw_effective; w++) + mk_shard_free(&ctx.shards[w]); scratch_free(shards_hdr); return ray_error("oom", "fused_group: shard OOM"); } - ray_t* result = mk_combine_and_materialize(&ctx, nw, ext->agg_ops); - for (uint32_t w = 0; w < nw; w++) mk_shard_free(&ctx.shards[w]); + ray_t* result = mk_combine_and_materialize(&ctx, nw_effective, + ext->agg_ops); + for (uint32_t w = 0; w < nw_effective; w++) + mk_shard_free(&ctx.shards[w]); scratch_free(shards_hdr); return result; } diff --git a/src/ops/group.c b/src/ops/group.c index 2473b3a8..d5253175 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -24,6 +24,7 @@ #include "ops/internal.h" #include "ops/hash.h" #include "ops/rowsel.h" +#include "ops/hll.h" /* approximate count-distinct via HyperLogLog */ #include "lang/internal.h" /* for ray_median_dbl_inplace */ /* 
============================================================================ @@ -280,46 +281,6 @@ static void reduce_merge(reduce_acc_t* dst, const reduce_acc_t* src, int8_t in_t * and the last worker's last is the global last. */ } -typedef struct { - ray_t* input; - const void* data; - int64_t len; - int8_t type; - uint8_t attrs; - reduce_acc_t acc; -} reduce_cache_entry_t; - -static reduce_cache_entry_t g_reduce_cache[16]; -static uint32_t g_reduce_cache_next = 0; - -static bool reduce_cache_allowed(ray_t* input, const int64_t* sel_idx) { - return input && input->mmod != 0 && sel_idx == NULL; -} - -static bool reduce_cache_get(ray_t* input, reduce_acc_t* out) { - const void* data = ray_data(input); - for (size_t i = 0; i < sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]); i++) { - reduce_cache_entry_t* e = &g_reduce_cache[i]; - if (e->input == input && e->data == data && e->len == input->len && - e->type == input->type && e->attrs == input->attrs) { - *out = e->acc; - return true; - } - } - return false; -} - -static void reduce_cache_put(ray_t* input, const reduce_acc_t* acc) { - reduce_cache_entry_t* e = &g_reduce_cache[ - g_reduce_cache_next++ % (sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]))]; - e->input = input; - e->data = ray_data(input); - e->len = input->len; - e->type = input->type; - e->attrs = input->attrs; - e->acc = *acc; -} - /* Hash mixing constants used by the count-distinct kernel and helpers. */ #define CD_HASH_K1 0x9E3779B97F4A7C15ULL #define CD_HASH_K2 0xBF58476D1CE4E5B9ULL @@ -671,6 +632,23 @@ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) { if (len == 0) return ray_i64(0); + /* For inputs above this row count, switch to the HyperLogLog + * cardinality sketch (~0.8% std error at P=14, 16 KB per shard). + * Exact dedup-via-hashset is O(unique·log) and becomes memory- + * bandwidth-bound past ~1 M rows; HLL is single-pass, mergeable, + * and constant-memory per worker. 
Below the threshold the exact + * path is fast enough and avoids approximation entirely — so small + * tests still match `len-after-distinct` byte-for-byte. */ + if (len >= (1 << 20)) { + bool hashable = (in_type == RAY_I64 || in_type == RAY_I32 || + in_type == RAY_I16 || in_type == RAY_U8 || + in_type == RAY_BOOL || in_type == RAY_F64 || + in_type == RAY_DATE || in_type == RAY_TIME || + in_type == RAY_TIMESTAMP || in_type == RAY_STR || + RAY_IS_SYM(in_type)); + if (hashable) return ray_count_distinct_approx(input); + } + switch (in_type) { case RAY_BOOL: case RAY_U8: case RAY_I16: case RAY_I32: case RAY_I64: @@ -1171,6 +1149,85 @@ static ray_t* count_distinct_per_group_parallel( return out; } +/* Approximate per-group count(distinct) via HyperLogLog with sparse + * representation. Builds (idx_buf, offsets, counts) from row_gid on the + * fly and delegates to ray_count_distinct_approx_pg_buf. + * + * Memory: each task sketch starts sparse (1 KB) and converts to dense + * (16 KB) only for groups that exceed RAY_HLL_SPARSE_CAP unique values. + * Total concurrent memory is bounded by n_workers × 17 KB regardless of + * n_groups — that's the property that lets us run HLL at n_groups > 50K + * where the dense-only sketch would have needed multi-GB. + * + * Returns the populated `out` vector on success, NULL on type miss / + * dispatch failure. Caller (ray_count_distinct_per_group) falls back + * to the exact partitioned dedup. */ +static ray_t* count_distinct_per_group_hll(ray_t* src, const int64_t* row_gid, + int64_t n_rows, int64_t n_groups, + ray_t* out) { + if (!src || n_rows <= 0 || n_groups <= 0) return NULL; + /* Build group-major idx_buf: for each group g, idx_buf[offsets[g] .. + * offsets[g] + counts[g]) lists the source row indices in that group. + * Serial two-pass; for n_rows = 10 M this is ~80 MB of int64 reads + * twice ≈ 25 ms on the bench box. The HLL pass itself dominates. 
*/ + ray_t* cnt_hdr = NULL; + ray_t* off_hdr = NULL; + int64_t* counts = (int64_t*)scratch_calloc(&cnt_hdr, + (size_t)n_groups * sizeof(int64_t)); + int64_t* offsets = (int64_t*)scratch_alloc(&off_hdr, + (size_t)n_groups * sizeof(int64_t)); + if (!counts || !offsets) { + if (cnt_hdr) scratch_free(cnt_hdr); + if (off_hdr) scratch_free(off_hdr); + return NULL; + } + /* Pass 1: histogram. */ + int64_t total = 0; + for (int64_t r = 0; r < n_rows; r++) { + int64_t g = row_gid[r]; + if (g >= 0 && g < n_groups) counts[g]++; + } + /* Prefix sums → offsets. */ + for (int64_t g = 0; g < n_groups; g++) { + offsets[g] = total; + total += counts[g]; + } + if (total == 0) { + scratch_free(cnt_hdr); scratch_free(off_hdr); + return out; + } + ray_t* idx_hdr = NULL; + int64_t* idx_buf = (int64_t*)scratch_alloc(&idx_hdr, + (size_t)total * sizeof(int64_t)); + if (!idx_buf) { + scratch_free(cnt_hdr); scratch_free(off_hdr); + return NULL; + } + /* Pass 2: scatter into group-major buf using a cursor copy of offsets. */ + ray_t* pos_hdr = NULL; + int64_t* pos = (int64_t*)scratch_alloc(&pos_hdr, + (size_t)n_groups * sizeof(int64_t)); + if (!pos) { + scratch_free(idx_hdr); scratch_free(cnt_hdr); scratch_free(off_hdr); + return NULL; + } + memcpy(pos, offsets, (size_t)n_groups * sizeof(int64_t)); + for (int64_t r = 0; r < n_rows; r++) { + int64_t g = row_gid[r]; + if (g >= 0 && g < n_groups) idx_buf[pos[g]++] = r; + } + scratch_free(pos_hdr); + + int64_t* odata = (int64_t*)ray_data(out); + int rc = ray_count_distinct_approx_pg_buf(src, idx_buf, offsets, counts, + n_groups, RAY_HLL_DEFAULT_P, odata); + scratch_free(idx_hdr); + scratch_free(cnt_hdr); + scratch_free(off_hdr); + if (rc != 0) return NULL; + return out; +} + /* Grouped count(distinct): single global hash keyed by (group_id, value). * One linear pass over all rows, O(n) total instead of O(per-group setup * * n_groups). 
Returns an I64 vector of length n_groups with the per-group @@ -1207,6 +1264,63 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); if (n_rows == 0 || n_groups == 0) return out; + /* Approximate path: when n_rows clears the HLL threshold (same as + * the buf-form caller — 1 M rows), build a group-major idx layout + * and run the sparse-HLL per-group kernel. Sparse-representation + * HLL makes this memory-bounded regardless of n_groups: each task + * holds one sketch that's ≤ 17 KB total (1 KB sparse + 16 KB + * dense, allocated together on the stack), so concurrent footprint + * is n_workers × 17 KB instead of n_groups × 16 KB. Returns a + * ~0.8 % std-error estimate; callers that need exact counts at + * this scale must not hit this gate. */ + if (n_rows >= (1 << 20)) { + /* Streaming HLL: skip the (idx_buf + offsets + counts) CSR build + * by accumulating directly into n_groups sketches per worker in + * a single pass over (row_gid[r], val[r]). The CSR build cost + * (two passes of int64 reads over n_rows) is ~30 % of wall time + * on q10/q08 ClickBench, while the HLL pass itself is ~7 %. + * + * Gated on a per-worker memory budget: each worker keeps a bank + * of n_groups sketches whose sparse + dense buffers come from + * one pre-allocated slab. At P=14 that's ~17 KB per group; + * with the 8 MB-per-worker budget below, n_groups must be ≤ + * ~480. The gate does not depend on the worker count, so + * the *total* concurrent footprint is bounded at + * n_workers * 8 MB (~128 MB on a 16-thread box). + * + * Lower bound (n_groups >= 16) avoids the dispatch overhead of + * n_workers-fold bank merges when there's only a handful of + * groups — the CSR path's per-group task dispatch dominates + * there anyway, but the streaming bank merge has its own fixed + * cost. Below the bound we fall through to the CSR HLL path.
*/ + const size_t RAY_HLL_STREAM_BUDGET_PER_WORKER = (size_t)8 * 1024 * 1024; + /* Per-sketch slab footprint at the precision the kernel uses + * (RAY_HLL_DEFAULT_P → m = 16384). sizeof(ray_hll_t) is small + * relative to the buffers; rounded into the count. */ + size_t hll_per_group = + sizeof(ray_hll_t) + + RAY_HLL_SPARSE_CAP * sizeof(uint32_t) + + ((size_t)1u << RAY_HLL_DEFAULT_P); + bool stream_ok = (n_groups >= 16) && + ((size_t)n_groups * hll_per_group + <= RAY_HLL_STREAM_BUDGET_PER_WORKER); + if (stream_ok) { + int rc = ray_count_distinct_approx_pg_stream( + src, row_gid, n_rows, n_groups, + RAY_HLL_DEFAULT_P, odata); + if (rc == 0) return out; + /* Streaming failed (OOM / unsupported type) — fall through + * to the CSR HLL path with odata still zeroed. */ + memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); + } + + ray_t* approx = count_distinct_per_group_hll(src, row_gid, + n_rows, n_groups, out); + if (approx) return approx; + /* Fall through on dispatch failure — counts not yet written. */ + memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); + } + /* Parallel partitioned path for sizes where the serial global hash * blows L3. Threshold tuned so the partition / scatter / dedup * dispatch overhead stays smaller than the cache-miss savings. */ @@ -1892,18 +2006,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { return reduction_i64_result(read_col_i64(base, row, in_type, input->attrs), in_type); } - reduce_acc_t cached; - if ((op->opcode == OP_MIN || op->opcode == OP_MAX) && - reduce_cache_allowed(input, sel_idx) && - reduce_cache_get(input, &cached)) { - if (sel_idx_block) ray_release(sel_idx_block); - return op->opcode == OP_MIN - ? 
reduction_extreme_result(op, in_type, cached.cnt > 0, - cached.min_f, cached.min_i) - : reduction_extreme_result(op, in_type, cached.cnt > 0, - cached.max_f, cached.max_i); - } - ray_pool_t* pool = ray_pool_get(); if (pool && scan_n >= RAY_PARALLEL_THRESHOLD) { uint32_t nw = ray_pool_total_workers(pool); @@ -1940,9 +2042,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { } } - if (reduce_cache_allowed(input, sel_idx)) - reduce_cache_put(input, &merged); - ray_t* result; switch (op->opcode) { case OP_SUM: result = in_type == RAY_F64 ? ray_f64(merged.sum_f) : ray_i64(merged.sum_i); break; @@ -1982,8 +2081,6 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { reduce_acc_init(&acc); reduce_range(input, 0, scan_n, &acc, has_nulls, sel_idx); if (sel_idx_block) ray_release(sel_idx_block); - if (reduce_cache_allowed(input, sel_idx)) - reduce_cache_put(input, &acc); switch (op->opcode) { case OP_SUM: return in_type == RAY_F64 ? ray_f64(acc.sum_f) : ray_i64(acc.sum_i); @@ -2451,6 +2548,16 @@ static inline uint32_t group_probe_entry(group_ht_t* ht, uint32_t slot = (uint32_t)(hash & mask); uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8); + /* For count-only queries (no SUM/MIN/MAX/SUMSQ/PEARSON aggregator + * state, no FIRST/LAST row tracking, no binary aggregator y-side) + * init_accum_from_entry and accum_from_entry are no-ops on every + * non-count slot — the per-row call still iterates n_aggs slots, + * reads agg_val_slot[a], memcpy's the entry's agg value into a + * local, then drops it. That's ~6 ns / row × n_keys=1 millions of + * rows, ~7 ms wall on q15. Skip the call when none of the flags + * that drive its writes are set. 
*/ + uint8_t accum_skip = (ly->need_flags == 0 + && (ly->agg_is_first | ly->agg_is_last | ly->agg_is_binary) == 0); for (;;) { uint32_t sv = ht->slots[slot]; if (sv == HT_EMPTY) { @@ -2462,7 +2569,8 @@ static inline uint32_t group_probe_entry(group_ht_t* ht, char* row = ht->rows + (size_t)gid * ly->row_stride; *(int64_t*)row = 1; /* count = 1 */ memcpy(row + 8, ekeys, key_bytes); - init_accum_from_entry(row, entry, ly); + if (!accum_skip) + init_accum_from_entry(row, entry, ly); ht->slots[slot] = HT_PACK(salt, gid); if (ht->grp_count * 2 > ht->ht_cap) { group_ht_rehash(ht, key_types); @@ -2476,7 +2584,8 @@ static inline uint32_t group_probe_entry(group_ht_t* ht, if (group_keys_equal((const int64_t*)(row + 8), (const int64_t*)ekeys, ly, ht->key_data)) { (*(int64_t*)row)++; /* count++ */ - accum_from_entry(row, entry, ly); + if (!accum_skip) + accum_from_entry(row, entry, ly); return mask; } } @@ -3199,6 +3308,274 @@ static void radix_phase2_fn(void* ctx, uint32_t worker_id, int64_t start, int64_ } } +/* ============================================================================ + * Fused radix: per-(worker, partition) HT direct-insert + per-partition merge + * + * Replaces the materialise-fat-entries-then-build-HTs round trip with a + * single-pass aggregation per (worker, partition) HT, followed by an + * in-cache merge per partition. Currently restricted to COUNT and + * SUM/AVG queries — the merge primitive (group_merge_row) only + * knows how to add counts and sum slots; MIN/MAX/FIRST/LAST etc. + * would need their own state-merge logic (next increment). + * + * Per-(worker, partition) HT for a 10M-row count-by-UserID: ~3M distinct + * keys ÷ 256 parts ÷ 8 workers ≈ 1.5K groups → cap ~4K slots → ~64 KB + * row store, L1/L2-resident. Worker w processes its row range; per row + * it hashes keys, computes partition = RADIX_PART(h), probes its local + * HT_p.
Phase2 dispatches partitions across workers; each merges the n + * worker HTs for one partition into a final partition HT in part_hts[p]. + * Phase3 (radix_phase3_fn) emits from part_hts[] exactly as before. + * ============================================================================ */ + +/* Merge one source group row into the target HT. Hash is recomputed from + * the row's key region via hash_keys_inline — identical to what + * group_probe_entry did when the row was first inserted, so the partition + * assignment is consistent. Supports need_flags ∈ {0, GHT_NEED_SUM}: + * count-only and count+SUM/AVG. On miss, the entire source row is copied + * verbatim (memcpy of row_stride); on hit, count += src.count and, when + * need_sum, each enabled sum slot accumulates the source's sum (f64 or + * i64 per agg_is_f64). Caller's v2 gate filters out PROD/FIRST/LAST/ + * MIN/MAX/SUMSQ/PEARSON/MEDIAN — those need richer state merges. */ +static inline uint32_t group_merge_row(group_ht_t* ht, + const char* src_row, const int8_t* key_types, uint32_t mask) +{ + const ght_layout_t* ly = &ht->layout; + int64_t src_count = *(const int64_t*)src_row; + const int64_t* skeys = (const int64_t*)(src_row + 8); + uint64_t h = hash_keys_inline(skeys, key_types, ly->n_keys, + ly->wide_key_mask, ly->wide_key_esz, + ht->key_data); + uint8_t salt = HT_SALT(h); + uint32_t slot = (uint32_t)(h & mask); + uint8_t na = ly->n_aggs; + uint8_t f64_mask = ly->agg_is_f64; + uint16_t off_sum = ly->off_sum; + bool need_sum = (ly->need_flags & GHT_NEED_SUM) != 0; + for (;;) { + uint32_t sv = ht->slots[slot]; + if (sv == HT_EMPTY) { + if (ht->grp_count >= ht->grp_cap) { + if (!group_ht_grow(ht)) { ht->oom = 1; return mask; } + } + uint32_t gid = ht->grp_count++; + char* row = ht->rows + (size_t)gid * ly->row_stride; + /* Whole-row copy: count + keys/null_mask + aggregator state. 
*/ + memcpy(row, src_row, ly->row_stride); + ht->slots[slot] = HT_PACK(salt, gid); + if (ht->grp_count * 2 > ht->ht_cap) { + group_ht_rehash(ht, key_types); + mask = ht->ht_cap - 1; + } + return mask; + } + if (HT_SALT_V(sv) == salt) { + uint32_t gid = HT_GID(sv); + char* row = ht->rows + (size_t)gid * ly->row_stride; + if (group_keys_equal((const int64_t*)(row + 8), + skeys, ly, ht->key_data)) { + *(int64_t*)row += src_count; + if (need_sum) { + for (uint8_t a = 0; a < na; a++) { + int8_t s = ly->agg_val_slot[a]; + if (s < 0) continue; + size_t off = (size_t)off_sum + (size_t)s * 8; + if (f64_mask & (1u << a)) { + double sv_f; + memcpy(&sv_f, src_row + off, 8); + *(double*)(row + off) += sv_f; + } else { + int64_t sv_i; + memcpy(&sv_i, src_row + off, 8); + *(int64_t*)(row + off) += sv_i; + } + } + } + return mask; + } + } + slot = (slot + 1) & mask; + } +} + +typedef struct { + void** key_data; + int8_t* key_types; + uint8_t* key_attrs; + ray_t** key_vecs; + ray_t** agg_vecs; /* may be NULL for pure COUNT (n_agg_vals==0) */ + ray_t** agg_vecs2; + uint8_t* agg_strlen; + uint8_t nullable_mask; + uint32_t n_workers; + group_ht_t* wpart_hts; /* [n_workers * RADIX_P] */ + ght_layout_t layout; + ray_t* rowsel; + const int64_t* match_idx; + _Atomic(int) oom; +} radix_v2_phase1_ctx_t; + +static void radix_v2_phase1_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + radix_v2_phase1_ctx_t* c = (radix_v2_phase1_ctx_t*)ctx; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + const ght_layout_t* ly = &c->layout; + uint8_t nk = ly->n_keys; + uint8_t wide = ly->wide_key_mask; + uint8_t nullable = c->nullable_mask; + const int64_t* match_idx = c->match_idx; + + group_ht_t* my_hts = &c->wpart_hts[(size_t)worker_id * RADIX_P]; + /* Lazily init this worker's 256 partition HTs. 
*/ + for (uint32_t p = 0; p < RADIX_P; p++) { + if (!my_hts[p].slots) { + if (!group_ht_init_sized(&my_hts[p], 256, ly, 128)) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + if (wide && c->key_data) + group_ht_set_key_data(&my_hts[p], c->key_data); + } + } + uint32_t masks[RADIX_P]; + for (uint32_t p = 0; p < RADIX_P; p++) masks[p] = my_hts[p].ht_cap - 1; + + /* Stack-resident transient entry, same layout as group_rows_range. */ + char ebuf[8 + 9 * 8 + 8 * 8 + 8]; + for (int64_t i = start; i < end; i++) { + if (((i - start) & 65535) == 0 && ray_interrupted()) break; + int64_t row = match_idx ? match_idx[i] : i; + if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, row)) + continue; + uint64_t h = 0; + int64_t* ek = (int64_t*)(ebuf + 8); + int64_t null_mask = 0; + for (uint8_t k = 0; k < nk; k++) { + int8_t t = c->key_types[k]; + uint64_t kh; + bool is_null = (nullable & (1u << k)) + && ray_vec_is_null(c->key_vecs[k], row); + if (is_null) { + null_mask |= (int64_t)(1u << k); + ek[k] = 0; + kh = ray_hash_i64(0); + } else if (wide & (1u << k)) { + uint8_t esz = ly->wide_key_esz[k]; + const void* src = (const char*)c->key_data[k] + (size_t)row * esz; + ek[k] = row; + kh = ray_hash_bytes(src, esz); + } else if (t == RAY_F64) { + int64_t kv; + memcpy(&kv, &((double*)c->key_data[k])[row], 8); + ek[k] = kv; + kh = ray_hash_f64(((double*)c->key_data[k])[row]); + } else { + int64_t kv = read_col_i64(c->key_data[k], row, t, c->key_attrs[k]); + ek[k] = kv; + kh = ray_hash_i64(kv); + } + h = (k == 0) ? kh : ray_hash_combine(h, kh); + } + ek[nk] = null_mask; + if (null_mask) h = ray_hash_combine(h, ray_hash_i64(null_mask)); + *(uint64_t*)ebuf = h; + /* Pack agg values into entry — only when the HT layout actually + * reads them. For count-only need_flags == 0 and accum_from_entry + * skips every agg slot; packing here would be a wasted column + * read per row (a measurable regression on q15-class queries). 
*/ + if (ly->need_flags) { + int64_t* ev = (int64_t*)(ebuf + 8 + ((size_t)nk + 1) * 8); + uint8_t vi = 0; + uint8_t na = ly->n_aggs; + uint8_t bin_mask = ly->agg_is_binary; + uint8_t hol_mask = ly->agg_is_holistic; + for (uint8_t a = 0; a < na; a++) { + if (hol_mask & (1u << a)) continue; + ray_t* ac = c->agg_vecs ? c->agg_vecs[a] : NULL; + if (!ac) continue; + if (c->agg_strlen && c->agg_strlen[a]) + ev[vi] = group_strlen_at(ac, row); + else if (ac->type == RAY_F64) + memcpy(&ev[vi], &((double*)ray_data(ac))[row], 8); + else + ev[vi] = read_col_i64(ray_data(ac), row, ac->type, ac->attrs); + vi++; + if ((bin_mask & (1u << a)) && c->agg_vecs2 && c->agg_vecs2[a]) { + ray_t* ay = c->agg_vecs2[a]; + if (ay->type == RAY_F64) + memcpy(&ev[vi], &((double*)ray_data(ay))[row], 8); + else + ev[vi] = read_col_i64(ray_data(ay), row, ay->type, ay->attrs); + vi++; + } + } + } + uint32_t p = RADIX_PART(h); + uint32_t new_mask = group_probe_entry(&my_hts[p], ebuf, + c->key_types, masks[p]); + if (my_hts[p].oom) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + masks[p] = new_mask; + } +} + +typedef struct { + group_ht_t* wpart_hts; /* [n_workers * RADIX_P] — input */ + group_ht_t* part_hts; /* [RADIX_P] — output */ + int8_t* key_types; + uint32_t n_workers; + ght_layout_t layout; + void** key_data; + _Atomic(int) oom; +} radix_v2_phase2_ctx_t; + +static void radix_v2_phase2_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + (void)worker_id; + radix_v2_phase2_ctx_t* c = (radix_v2_phase2_ctx_t*)ctx; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + uint16_t row_stride = c->layout.row_stride; + for (int64_t p = start; p < end; p++) { + /* Upper bound on the merged partition: sum of worker grp_counts + * (some keys may be present in multiple workers — the merge will + * fold those, so the final grp_count is ≤ this sum). 
*/ + uint32_t total_grps = 0; + for (uint32_t w = 0; w < c->n_workers; w++) + total_grps += c->wpart_hts[(size_t)w * RADIX_P + p].grp_count; + if (total_grps == 0) continue; + uint32_t ht_cap = 256; + { + uint64_t target = (uint64_t)total_grps * 2; + if (target < 256) target = 256; + while (ht_cap < target) ht_cap *= 2; + } + uint32_t init_grp = 256; + while (init_grp < total_grps && init_grp < 65536) init_grp *= 2; + if (!group_ht_init_sized(&c->part_hts[p], ht_cap, &c->layout, init_grp)) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + if (c->layout.wide_key_mask && c->key_data) + group_ht_set_key_data(&c->part_hts[p], c->key_data); + uint32_t mask = c->part_hts[p].ht_cap - 1; + for (uint32_t w = 0; w < c->n_workers; w++) { + group_ht_t* src = &c->wpart_hts[(size_t)w * RADIX_P + p]; + if (src->grp_count == 0) continue; + const char* rows = src->rows; + for (uint32_t gi = 0; gi < src->grp_count; gi++) { + mask = group_merge_row(&c->part_hts[p], + rows + (size_t)gi * row_stride, + c->key_types, mask); + if (c->part_hts[p].oom) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + } + } + } +} + /* ============================================================================ * Parallel direct-array accumulation for low-cardinality single integer key * ============================================================================ */ @@ -3213,6 +3590,12 @@ typedef struct { uint32_t n_workers; const int64_t* match_idx; /* NULL = no selection */ ray_t* rowsel; + /* DA-path early-out: once any worker observes a key span wider than + * span_budget the direct-array path is provably infeasible (its slot + * count would exceed DA_MAX_COMPOSITE_SLOTS), so the whole scan can + * stop instead of reading the rest of a 10M-row column for nothing. 
*/ + int64_t span_budget; + _Atomic(int)* abort_flag; } minmax_ctx_t; static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) { @@ -3221,11 +3604,29 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t const int64_t* match_idx = c->match_idx; int64_t kmin = INT64_MAX, kmax = INT64_MIN; int8_t t = c->key_type; - + const int64_t span_budget = c->span_budget; + + /* Span check and abort poll are batched (every 1024 rows) so the + * hot per-row loop body stays a branchless min/max with no atomics. + * 8192 was too sparse — the dispatcher hands out 8K-row morsels, so + * `(i-start) & 8191 == 0` only ever fired at the morsel boundary + * (where kmin=INT64_MAX/kmax=INT64_MIN make the span check vacuous), + * leaving every full 8K morsel to run end-to-end on doomed columns. + * + * The span is computed in uint64 space: a signed `kmax - kmin` + * overflows (undefined behaviour) as soon as the keys straddle both + * halves of the I64 range, which is exactly the wide-span case this + * early-out exists for. With kmax >= kmin already established, the + * unsigned difference is the exact span. */ #define MINMAX_SEG_LOOP(TYPE, CAST) \ do { \ const TYPE* kd = (const TYPE*)c->key_data; \ for (int64_t i = start; i < end; i++) { \ + if (((i - start) & 1023) == 0) { \ + if (atomic_load_explicit(c->abort_flag, \ + memory_order_relaxed)) \ + goto minmax_done; \ + if (kmax >= kmin && \ + ((uint64_t)kmax - (uint64_t)kmin) > (uint64_t)span_budget) { \ + atomic_store_explicit(c->abort_flag, 1, \ + memory_order_relaxed); \ + goto minmax_done; \ + } \ + } \ int64_t r = match_idx ? 
match_idx[i] : i; \ if (!match_idx && c->rowsel && !group_rowsel_pass(c->rowsel, r)) continue; \ int64_t v = (int64_t)CAST kd[r]; \ @@ -3252,6 +3653,7 @@ static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t #undef MINMAX_SEG_LOOP +minmax_done: /* Merge with existing per-worker values (a worker may process multiple morsels) */ if (kmin < c->per_worker_min[wid]) c->per_worker_min[wid] = kmin; if (kmax > c->per_worker_max[wid]) c->per_worker_max[wid] = kmax; @@ -5237,9 +5639,24 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, } } ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get(); + /* Historical: enabled only for OP_COUNT (the min_count_exclusive + * heavy-hitter filter and the top_count_take heap). The + * top_count_take heap path now also accepts SUM/MIN/MAX — those + * fire through the v2_emit per-partition compact below, which + * reads the agg's int64 row slot directly. The non-COUNT paths + * (sparse_i64 range-counting, the n_keys>1 macro fast path) still + * gate on COUNT because they DON'T have the agg value available + * outside the row slot. */ bool use_emit_filter = emit_filter.enabled && emit_filter.agg_index < n_aggs && ext->agg_ops[emit_filter.agg_index] == OP_COUNT; + bool use_topn_filter = emit_filter.enabled && + emit_filter.top_count_take > 0 && + emit_filter.agg_index < n_aggs && + (ext->agg_ops[emit_filter.agg_index] == OP_COUNT || + ext->agg_ops[emit_filter.agg_index] == OP_SUM || + ext->agg_ops[emit_filter.agg_index] == OP_MIN || + ext->agg_ops[emit_filter.agg_index] == OP_MAX); /* ---- Scalar aggregate fast path (n_keys == 0): flat vector scan ---- */ if (n_keys == 0 && nrows > 0) { @@ -5559,6 +5976,9 @@ da_path:; ? ray_pool_total_workers(mm_pool) : 1; /* VLA bounded by worker count — max ~2KB per key even on 256-core systems. */ int64_t mm_mins[mm_n], mm_maxs[mm_n]; + /* Shared across keys: once any key proves the DA slot count + * infeasible the scan aborts instead of reading the rest. 
*/ + _Atomic(int) mm_abort = 0; for (uint8_t k = 0; k < n_keys && da_fits; k++) { int64_t kmin, kmax; for (uint32_t w = 0; w < mm_n; w++) { @@ -5574,12 +5994,18 @@ da_path:; .n_workers = mm_n, .match_idx = match_idx, .rowsel = rowsel, + .span_budget = DA_MAX_COMPOSITE_SLOTS, + .abort_flag = &mm_abort, }; if (mm_n > 1) { ray_pool_dispatch(mm_pool, minmax_scan_fn, &mm_ctx, n_scan); } else { minmax_scan_fn(&mm_ctx, 0, 0, n_scan); } + if (atomic_load_explicit(&mm_abort, memory_order_relaxed)) { + da_fits = false; + break; + } kmin = INT64_MAX; kmax = INT64_MIN; for (uint32_t w = 0; w < mm_n; w++) { if (mm_mins[w] < kmin) kmin = mm_mins[w]; @@ -7407,6 +7833,114 @@ ht_path:; skip_top_count_filter: if (pool && nrows >= RAY_PARALLEL_THRESHOLD && n_total > 1 && !rowsel) { + /* Per-(worker, partition) direct-insert path: aggregates into + * thread-local partition HTs during phase1, then merges per + * partition. Bypasses the phase1 fat-entry materialisation + + * phase2 re-read DRAM round trip. On success it populates + * part_hts[] in the format the existing phase3 emit consumes. + * + * Gate: every agg is COUNT/SUM/AVG (the merge primitive knows + * how to add counts and sum slots; PROD/MIN/MAX/FIRST/LAST/ + * SUMSQ/PEARSON/MEDIAN need richer state-merge logic). Agg + * input columns must be non-nullable for now — sentinel-skip + * inside accum_from_entry is correct, but the merge step needs + * an nn_count and that isn't tracked yet. */ + bool v2_ok = (n_keys >= 1 && n_aggs > 0); + /* SYM single-key queries already had a tuned path (q33/q34 hit it + * before falling to the radix); v2 doesn't beat it for them, so + * skip when any key is SYM and let the existing pipeline handle it. 
*/ + for (uint8_t k = 0; k < n_keys && v2_ok; k++) + if (key_types[k] == RAY_SYM) v2_ok = false; + for (uint8_t a = 0; a < n_aggs && v2_ok; a++) { + uint16_t op = ext->agg_ops[a]; + if (op != OP_COUNT && op != OP_SUM && op != OP_AVG) { + v2_ok = false; + break; + } + if (agg_vecs[a]) { + ray_t* src = (agg_vecs[a]->attrs & RAY_ATTR_SLICE) + ? agg_vecs[a]->slice_parent : agg_vecs[a]; + if (src && (src->attrs & RAY_ATTR_HAS_NULLS)) + v2_ok = false; + } + } + if (v2_ok && !(ght_layout.agg_is_first | ght_layout.agg_is_last + | ght_layout.agg_is_holistic + | ght_layout.agg_is_binary)) { + ray_t* wpart_hdr = NULL; + size_t v2_n_w = (size_t)n_total * RADIX_P; + group_ht_t* wpart_hts = (group_ht_t*)scratch_calloc( + &wpart_hdr, v2_n_w * sizeof(group_ht_t)); + ray_t* v2_part_hdr = NULL; + group_ht_t* v2_part_hts = wpart_hts + ? (group_ht_t*)scratch_calloc(&v2_part_hdr, + RADIX_P * sizeof(group_ht_t)) + : NULL; + if (!wpart_hts || !v2_part_hts) { + if (wpart_hts) scratch_free(wpart_hdr); + if (v2_part_hts) scratch_free(v2_part_hdr); + goto v2_done; + } + uint8_t v2_nullable = 0; + for (uint8_t k = 0; k < n_keys; k++) { + if (!key_vecs[k]) continue; + ray_t* src = (key_vecs[k]->attrs & RAY_ATTR_SLICE) + ? 
key_vecs[k]->slice_parent : key_vecs[k]; + if (src && (src->attrs & RAY_ATTR_HAS_NULLS)) + v2_nullable |= (uint8_t)(1u << k); + } + radix_v2_phase1_ctx_t v2p1 = { + .key_data = key_data, + .key_types = key_types, + .key_attrs = key_attrs, + .key_vecs = key_vecs, + .agg_vecs = agg_vecs, + .agg_vecs2 = agg_vecs2, + .agg_strlen = agg_strlen, + .nullable_mask = v2_nullable, + .n_workers = n_total, + .wpart_hts = wpart_hts, + .layout = ght_layout, + .rowsel = rowsel, + .match_idx = match_idx, + .oom = 0, + }; + ray_pool_dispatch(pool, radix_v2_phase1_fn, &v2p1, n_scan); + /* NOTE(review): at this point wpart_hts[] (plus whatever each worker HT allocated in phase1), wpart_hdr and v2_part_hdr are all live and are only released on the OOM path below. If CHECK_CANCEL_GOTO jumps to `cleanup`, nothing visible here frees them — confirm that `cleanup` handles these block-local tables, otherwise a cancelled query leaks them. */ CHECK_CANCEL_GOTO(pool, cleanup); + if (atomic_load_explicit(&v2p1.oom, memory_order_relaxed)) { + for (size_t i = 0; i < v2_n_w; i++) + group_ht_free(&wpart_hts[i]); + scratch_free(wpart_hdr); + scratch_free(v2_part_hdr); + goto v2_done; + } + radix_v2_phase2_ctx_t v2p2 = { + .wpart_hts = wpart_hts, + .part_hts = v2_part_hts, + .key_types = key_types, + .n_workers = n_total, + .layout = ght_layout, + .key_data = key_data, + .oom = 0, + }; + ray_pool_dispatch_n(pool, radix_v2_phase2_fn, &v2p2, RADIX_P); + /* NOTE(review): same concern as after phase1 — the worker HTs are freed only after this check, and v2_part_hts is not yet assigned to part_hts/part_hts_hdr, so a cancel taken here appears to skip every release below. Verify against the `cleanup` label. */ CHECK_CANCEL_GOTO(pool, cleanup); + /* Worker HTs are no longer needed once the merge is done. */ + for (size_t i = 0; i < v2_n_w; i++) + group_ht_free(&wpart_hts[i]); + scratch_free(wpart_hdr); + if (atomic_load_explicit(&v2p2.oom, memory_order_relaxed)) { + for (uint32_t p = 0; p < RADIX_P; p++) + group_ht_free(&v2_part_hts[p]); + scratch_free(v2_part_hdr); + goto v2_done; + } + /* Hand off to the existing phase3 emit. */ + part_hts = v2_part_hts; + part_hts_hdr = v2_part_hdr; + goto v2_emit; + } +v2_done:; size_t n_bufs = (size_t)n_total * RADIX_P; radix_bufs = (radix_buf_t*)scratch_calloc(&radix_bufs_hdr, n_bufs * sizeof(radix_buf_t)); @@ -7506,7 +8040,180 @@ ht_path:; scratch_free(radix_bufs_hdr); radix_bufs = NULL; radix_bufs_hdr = NULL; - ray_heap_gc(); + /* No explicit GC — top-level statement GC catches it. 
*/ + } + +v2_emit:; + /* Top-N aware compaction: when the (select … by … desc: c take: N) + * shape is in flight (use_emit_filter + top_count_take, COUNT agg), + * the global answer is the N rows with the largest count across + * all partitions. Run a global bounded-heap (size N) over the + * union of per-partition rows here, then in-place compact each + * partition's row array to contain only globally-surviving rows. + * Phase3 below then emits N rows total instead of total_grps — + * the major win for high-cardinality keys like UserID/URL where + * total_grps is in the millions but N is ≤ 1024. + * + * Implementation notes: + * - The bounded heap orders by count (the agg at COUNT slot, the + * first int64 in each row). Equal counts are stable: the + * first row seen wins. Final per-partition row order is + * preserved so apply_sort_take below can do the final + * arrange-by-agg deterministically. + * - We also handle the "fewer total rows than N" case — compact + * becomes a no-op. + * - Only fires when emit_filter.top_count_take > 0; existing + * min_count_exclusive-only filters fall through unchanged. */ + if (use_topn_filter) { + int64_t k_take = emit_filter.top_count_take; + uint32_t total_pre = 0; + for (uint32_t p = 0; p < RADIX_P; p++) + total_pre += part_hts[p].grp_count; + /* Resolve the in-row offset of the order-by agg's value. For + * COUNT it's the leading int64 at offset 0; for SUM/MIN/MAX + * it's the per-slot int64 in off_sum/off_min/off_max. F64 + * agg outputs (sum over an F64 column) compare by bitcast — + * for IEEE 754 the bit pattern preserves ordering for finite + * positive values; mixed-sign and NaN cases drop the heap + * back to a wider comparator. To stay correct we exclude + * F64-output aggs from this fast path (the COUNT count is + * always I64, and SUM/MIN/MAX over an integer column keep + * an I64 slot — agg_is_f64 marks the SUM-over-F64 case). */ + uint16_t order_op = emit_filter.agg_op + ? 
emit_filter.agg_op + : (uint16_t)OP_COUNT; + uint8_t agg_index_local = emit_filter.agg_index; + uint16_t order_off = 0; /* default: COUNT at row+0 */ + bool order_is_f64 = false; + if (agg_index_local < n_aggs && + (ght_layout.agg_is_f64 & (1u << agg_index_local))) + order_is_f64 = true; + int8_t agg_slot = ght_layout.agg_val_slot[agg_index_local]; + if (order_op == OP_SUM) { + if (agg_slot < 0 || order_is_f64) goto topn_compact_skip; + order_off = (uint16_t)(ght_layout.off_sum + + (uint16_t)agg_slot * 8u); + } else if (order_op == OP_MIN) { + if (agg_slot < 0 || order_is_f64) goto topn_compact_skip; + if (ght_layout.agg_is_sym & (1u << agg_index_local)) + goto topn_compact_skip; + order_off = (uint16_t)(ght_layout.off_min + + (uint16_t)agg_slot * 8u); + } else if (order_op == OP_MAX) { + if (agg_slot < 0 || order_is_f64) goto topn_compact_skip; + if (ght_layout.agg_is_sym & (1u << agg_index_local)) + goto topn_compact_skip; + order_off = (uint16_t)(ght_layout.off_max + + (uint16_t)agg_slot * 8u); + } + uint8_t desc_dir = emit_filter.desc ? 1 : 0; + /* COUNT defaults to desc when the filter struct's desc bit + * isn't set (old single-bit filter shape). Producer code in + * query.c sets it explicitly. */ + if (order_op == OP_COUNT && !emit_filter.desc) desc_dir = 1; + if ((int64_t)total_pre > k_take && k_take > 0 && k_take <= 1024) { + /* Stack heap: (val, part, gid) triples. k_take ≤ 1024 + * caps the footprint at 1024 * 16 B = 16 KiB. The heap + * invariant flips by direction: min-heap for desc (we + * evict the smallest to keep the largest N), max-heap + * for asc (evict the largest to keep the smallest N). */ + int64_t hval[1024]; + uint32_t hpart[1024]; + uint32_t hgid[1024]; + int64_t hn = 0; + /* For top-N largest (desc=1): min-heap. Root is smallest; + * incoming v replaces root iff v > root. Heap invariant: + * parent ≤ child (so swap when parent > child). + * + * For top-N smallest (desc=0): max-heap. 
Root is largest; + * incoming v replaces root iff v < root. Heap invariant: + * parent ≥ child (so swap when parent < child). + * + * TOPN_NEEDS_SWAP(parent, child) := does the parent + * violate the invariant relative to child? */ + #define TOPN_NEEDS_SWAP(parent, child) \ + (desc_dir ? ((parent) > (child)) : ((parent) < (child))) + #define TOPN_SHOULD_REPLACE(new_v, root_v) \ + (desc_dir ? ((new_v) > (root_v)) : ((new_v) < (root_v))) + for (uint32_t p = 0; p < RADIX_P; p++) { + group_ht_t* ph = &part_hts[p]; + uint16_t rs = ph->layout.row_stride; + uint32_t gc = ph->grp_count; + for (uint32_t gi = 0; gi < gc; gi++) { + const char* row = ph->rows + (size_t)gi * rs; + int64_t v = *(const int64_t*)(const void*) + (row + order_off); + if (hn < k_take) { + int64_t j = hn++; + hval[j] = v; hpart[j] = p; hgid[j] = gi; + /* Sift up: bubble new entry toward root while + * parent violates invariant. */ + while (j > 0) { + int64_t pr = (j - 1) >> 1; + if (!TOPN_NEEDS_SWAP(hval[pr], hval[j])) break; + int64_t tc = hval[pr]; hval[pr] = hval[j]; hval[j] = tc; + uint32_t tp = hpart[pr]; hpart[pr] = hpart[j]; hpart[j] = tp; + uint32_t tg = hgid[pr]; hgid[pr] = hgid[j]; hgid[j] = tg; + j = pr; + } + } else if (TOPN_SHOULD_REPLACE(v, hval[0])) { + hval[0] = v; hpart[0] = p; hgid[0] = gi; + int64_t j = 0; + /* Sift down: find the child that should be + * promoted (the one most violating the + * invariant) and swap. 
*/ + for (;;) { + int64_t l = j * 2 + 1, r = l + 1, m = j; + if (l < hn && TOPN_NEEDS_SWAP(hval[m], hval[l])) m = l; + if (r < hn && TOPN_NEEDS_SWAP(hval[m], hval[r])) m = r; + if (m == j) break; + int64_t tc = hval[m]; hval[m] = hval[j]; hval[j] = tc; + uint32_t tp = hpart[m]; hpart[m] = hpart[j]; hpart[j] = tp; + uint32_t tg = hgid[m]; hgid[m] = hgid[j]; hgid[j] = tg; + j = m; + } + } + } + } + #undef TOPN_NEEDS_SWAP + #undef TOPN_SHOULD_REPLACE + if (hn > 0) { + /* Order the surviving (part, gid) pairs ascending in place. + * hval[] is not consulted after selection, so only the two + * index arrays are permuted. This replaces a per-partition + * uint32_t[RADIX_P][1024] keep table (1 MiB of stack if + * RADIX_P is 256) with no extra storage; hn ≤ k_take ≤ 1024 + * keeps the insertion sort cheap. */ + for (int64_t i = 1; i < hn; i++) { + uint32_t kp = hpart[i], kg = hgid[i]; + int64_t j = i; + while (j > 0 && (hpart[j - 1] > kp || + (hpart[j - 1] == kp && hgid[j - 1] > kg))) { + hpart[j] = hpart[j - 1]; + hgid[j] = hgid[j - 1]; + j--; + } + hpart[j] = kp; + hgid[j] = kg; + } + /* In-place compact each partition. After the sort a + * partition's kept gids form one contiguous ascending run, + * so a single forward sweep moves row `src` down to slot + * `kn` (src ≥ kn always holds, so no kept row is clobbered + * before it is read). Partitions with no survivors end up + * with grp_count == 0; fully-kept partitions see no moves. */ + int64_t at = 0; + for (uint32_t p = 0; p < RADIX_P; p++) { + group_ht_t* ph = &part_hts[p]; + uint16_t rs = ph->layout.row_stride; + uint32_t kn = 0; + while (at < hn && hpart[at] == p) { + uint32_t src = hgid[at++]; + if (src != kn) + memmove(ph->rows + (size_t)kn * rs, + ph->rows + (size_t)src * rs, rs); + kn++; + } + ph->grp_count = kn; + } + } + } + topn_compact_skip:; } /* Prefix offsets */ @@ -8312,7 +9019,10 @@ sequential_fallback:; if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]); if (match_idx_block) ray_release(match_idx_block); - ray_heap_gc(); + /* No explicit GC — top-level statement runner (run_piped / repl) + * calls ray_heap_gc() once per statement, catching every + * intermediate freed above. 
The duplicate inner call doubled the + * per-query GC cost on bench loops. */ return result; } diff --git a/src/ops/hll.c b/src/ops/hll.c new file mode 100644 index 00000000..ea2bc131 --- /dev/null +++ b/src/ops/hll.c @@ -0,0 +1,850 @@ +/* + * Copyright (c) 2025-2026 Anton Kundenko + * All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ops/hll.h" +#include "ops/internal.h" +#include "ops/ops.h" +#include "core/pool.h" +#include "table/sym.h" + +#include +#include +#include + +int ray_hll_init(ray_hll_t* h, uint8_t p) { + if (!h) return -1; + if (p < 4) p = 4; /* too small loses all accuracy */ + if (p > 18) p = 18; /* 256 KB cap on register array */ + memset(h, 0, sizeof(*h)); + uint32_t m = 1u << p; + h->p = p; + h->m = m; + h->regs = (uint8_t*)scratch_calloc(&h->_hdr, (size_t)m); + if (!h->regs) return -1; + return 0; +} + +void ray_hll_init_sparse(ray_hll_t* h, uint8_t p, + uint32_t* sparse_buf, uint32_t sparse_cap, + uint8_t* dense_buf) { + if (!h) return; + if (p < 4) p = 4; + if (p > 18) p = 18; + memset(h, 0, sizeof(*h)); + h->p = p; + h->m = 1u << p; + /* Encode caller-owned dense buffer as a tagged pointer in _hdr — + * low bit set ⇒ caller-owned (skip free), clear ⇒ scratch ray_t*. + * promote_to_dense recovers it; ray_hll_free skips the scratch_free. + * Stack allocations on x86-64 are at least 8-byte aligned for arrays + * of this size, so the low bit is always free for tagging. */ + assert(((uintptr_t)dense_buf & 1u) == 0); + uintptr_t tagged = (uintptr_t)dense_buf | (uintptr_t)1; + h->_hdr = (ray_t*)tagged; + h->sparse_keys = sparse_buf; + h->sparse_count = 0; + h->sparse_cap = sparse_cap; +} + +/* Recover the caller-owned dense buffer (NULL if none). Used by + * promote_to_dense to install regs without a scratch alloc. */ +static inline uint8_t* hll_caller_dense_buf(const ray_hll_t* h) { + uintptr_t tagged = (uintptr_t)h->_hdr; + if (!(tagged & 1)) return NULL; + return (uint8_t*)(tagged & ~(uintptr_t)1); +} + +void ray_hll_promote_to_dense(ray_hll_t* h) { + if (!h || h->regs) return; /* already dense */ + uint8_t* dense = hll_caller_dense_buf(h); + if (!dense) { + /* No caller buffer — fall back to scratch alloc. 
Used by + * merge paths that promote a sparse src whose owner is the + * caller's stack but dst is heap-resident; we materialise a + * fresh dense buffer through the scratch arena. */ + ray_t* hdr = NULL; + dense = (uint8_t*)scratch_calloc(&hdr, (size_t)h->m); + if (!dense) { + /* OOM during promote. Leave sparse; caller's estimate + * will overflow into a small under-count. This branch is + * extremely rare (the dense buffer is 16 KB at P=14). */ + return; + } + h->_hdr = hdr; + } else { + /* Caller-owned: clear and install. */ + memset(dense, 0, (size_t)h->m); + h->_hdr = NULL; /* drop tagged pointer; no longer needed */ + } + h->regs = dense; + /* Replay sparse entries into dense (max). */ + uint32_t* sk = h->sparse_keys; + uint32_t n = h->sparse_count; + for (uint32_t i = 0; i < n; i++) { + uint32_t v = sk[i]; + uint32_t idx = v >> 8; + uint8_t rho = (uint8_t)(v & 0xFF); + if (rho > dense[idx]) dense[idx] = rho; + } + h->sparse_keys = NULL; + h->sparse_count = 0; + h->sparse_cap = 0; +} + +void ray_hll_free(ray_hll_t* h) { + if (!h) return; + /* Only free if _hdr is a real scratch handle (low bit clear, non-NULL). + * Tagged caller-owned buffers and NULL _hdr are both no-ops. */ + uintptr_t tagged = (uintptr_t)h->_hdr; + if (h->_hdr && !(tagged & 1)) scratch_free(h->_hdr); + h->regs = NULL; + h->_hdr = NULL; + h->sparse_keys = NULL; + h->sparse_count = 0; + h->sparse_cap = 0; + h->m = 0; + h->p = 0; +} + +void ray_hll_reset(ray_hll_t* h) { + if (!h) return; + if (h->regs) { + memset(h->regs, 0, (size_t)h->m); + return; + } + if (h->sparse_keys) { + /* Don't memset the sparse buffer — entries are only read up to + * sparse_count, so clearing the count is enough. */ + h->sparse_count = 0; + } +} + +/* Merge a sparse src into a dense dst. Each src entry contributes a + * rho-update at its idx slot. 
*/ +static inline void hll_merge_sparse_into_dense(uint8_t* d, + const uint32_t* sk, + uint32_t n) { + for (uint32_t i = 0; i < n; i++) { + uint32_t v = sk[i]; + uint32_t idx = v >> 8; + uint8_t rho = (uint8_t)(v & 0xFF); + if (rho > d[idx]) d[idx] = rho; + } +} + +void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src) { + if (!dst || !src) return; + if (dst->m != src->m) return; /* mismatched precision — caller bug */ + /* Promote dst to dense first if needed (cheap: at most 256 entries). + * dst's caller-owned dense buffer (if any) gets used; otherwise + * promote_to_dense scratch-allocates. */ + if (!dst->regs) { + ray_hll_promote_to_dense(dst); + if (!dst->regs) return; /* promote OOM — best-effort skip */ + } + if (src->regs) { + const uint8_t* s = src->regs; + uint8_t* d = dst->regs; + uint32_t m = dst->m; + /* Branchless max — keeps the hot per-shard merge in vector regs. + * The compiler usually auto-vectorises this to a packed-max sequence. */ + for (uint32_t i = 0; i < m; i++) { + uint8_t a = d[i], b = s[i]; + d[i] = a > b ? a : b; + } + } else if (src->sparse_keys) { + hll_merge_sparse_into_dense(dst->regs, src->sparse_keys, + src->sparse_count); + } +} + +/* HyperLogLog cardinality estimator (Flajolet, Fusy, Gandouet, Meunier 2007), + * with the original raw-estimate / linear-counting hybrid switch. Skips the + * HLL++ small-range bias-correction tables because the linear-counting branch + * already gives a clean estimate below E ≤ 2.5·m, which is where the raw + * mean diverges from truth. */ +int64_t ray_hll_estimate(const ray_hll_t* h) { + if (!h) return 0; + uint32_t m = h->m; + if (m == 0) return 0; + + /* alpha_m correction constant from the paper. m == 16 / 32 / 64 use + * the closed-form values; everything else uses 0.7213 / (1 + 1.079/m). */ + double alpha_m; + if (m == 16) alpha_m = 0.673; + else if (m == 32) alpha_m = 0.697; + else if (m == 64) alpha_m = 0.709; + else alpha_m = 0.7213 / (1.0 + 1.079 / (double)m); + + /* Sum of 2^-reg[i]. 
Count zero registers for the linear-counting + * fallback at small cardinalities (when V > 0 and E ≤ 2.5·m). + * Sparse mode: only iterate the entries (each rho>=1 by construction); + * the remaining (m - sparse_count) registers contribute 2^0 = 1 each + * and count as zero registers. */ + double sum_inv = 0.0; + uint32_t n_zeros = 0; + if (h->regs) { + for (uint32_t i = 0; i < m; i++) { + uint8_t r = h->regs[i]; + sum_inv += ldexp(1.0, -(int)r); /* 2^-r */ + n_zeros += (r == 0); + } + } else if (h->sparse_keys) { + uint32_t n = h->sparse_count; + const uint32_t* sk = h->sparse_keys; + /* Each entry stores a unique register idx (linear-probe dedup + * guarantees this). Unset registers contribute 2^0 = 1.0 each + * and count as zeros. */ + sum_inv = (double)(m - n); + n_zeros = m - n; + for (uint32_t i = 0; i < n; i++) { + uint8_t r = (uint8_t)(sk[i] & 0xFF); + sum_inv += ldexp(1.0, -(int)r); + } + } else { + /* Uninitialised — all m registers are conceptually zero. */ + sum_inv = (double)m; + n_zeros = m; + } + + double raw = alpha_m * (double)m * (double)m / sum_inv; + + if (raw <= 2.5 * (double)m && n_zeros != 0) { + /* Linear counting — much tighter than raw for small E. */ + raw = (double)m * log((double)m / (double)n_zeros); + } + /* Large-range bias-correction (the 2^32 upper-edge correction in the + * original paper) is for 32-bit hashes only — we hash 64 bits, so the + * raw value is already unbiased to ~2^57. Skip. 
*/ + + if (raw < 0.0) raw = 0.0; + return (int64_t)(raw + 0.5); +} + +/* ---- Scalar approximate count-distinct aggregator ---------------------- */ + +typedef struct { + const ray_t* vec; + int8_t type; + uint8_t attrs; + bool has_nulls; + ray_hll_t* shards; /* [n_workers] — one HLL per worker */ + uint8_t p; + uint32_t n_workers; + _Atomic(int) oom; +} cda_scalar_ctx_t; + +static void cda_scalar_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) { + cda_scalar_ctx_t* c = (cda_scalar_ctx_t*)raw; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + ray_hll_t* sh = &c->shards[worker_id % c->n_workers]; + if (!sh->regs) { + if (ray_hll_init(sh, c->p) != 0) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + } + const ray_t* v = c->vec; + const void* base = ray_data((ray_t*)v); + int8_t t = c->type; + bool hn = c->has_nulls; + const int64_t CHK = 65535; + + if (t == RAY_I64 || t == RAY_TIMESTAMP) { + const int64_t* d = (const int64_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int64_t v_i = d[r]; + if (hn && v_i == NULL_I64) continue; + ray_hll_add(sh, ray_hash_i64(v_i)); + } + } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) { + const int32_t* d = (const int32_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int32_t v_i = d[r]; + if (hn && v_i == NULL_I32) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } else if (t == RAY_I16) { + const int16_t* d = (const int16_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int16_t v_i = d[r]; + if (hn && v_i == NULL_I16) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } else if (t == RAY_BOOL || t == RAY_U8) { + const uint8_t* d = (const uint8_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && 
ray_interrupted()) return; + ray_hll_add(sh, ray_hash_i64((int64_t)d[r])); + } + } else if (t == RAY_F64) { + const double* d = (const double*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + double v_f = d[r]; + if (v_f != v_f) continue; /* NaN = null in F64 column */ + ray_hll_add(sh, ray_hash_f64(v_f)); + } + } else if (RAY_IS_SYM(t)) { + /* SYM is width-encoded — sym id 0 is the canonical empty-string + * sentinel (treat as null), every other id is a real distinct + * value, so hash the id directly. */ + uint8_t w = c->attrs & RAY_SYM_W_MASK; + if (w == RAY_SYM_W64) { + const int64_t* d = (const int64_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int64_t v_i = d[r]; + if (v_i == 0) continue; + ray_hll_add(sh, ray_hash_i64(v_i)); + } + } else if (w == RAY_SYM_W32) { + const uint32_t* d = (const uint32_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + uint32_t v_i = d[r]; + if (v_i == 0) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } else if (w == RAY_SYM_W16) { + const uint16_t* d = (const uint16_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + uint16_t v_i = d[r]; + if (v_i == 0) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } else { + const uint8_t* d = (const uint8_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + uint8_t v_i = d[r]; + if (v_i == 0) continue; + ray_hll_add(sh, ray_hash_i64((int64_t)v_i)); + } + } + } else if (t == RAY_STR) { + ray_t* vm = (ray_t*)v; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + size_t n = 0; + const char* s = ray_str_vec_get(vm, r, &n); + if (!s || n == 0) continue; + ray_hll_add(sh, ray_hash_bytes(s, n)); + } + } + 
/* Unsupported types fall through silently — caller validates. */ +} + +ray_t* ray_count_distinct_approx(ray_t* x) { + if (!x || RAY_IS_ERR(x)) return x; + if (!ray_is_vec(x)) { + /* Scalar atom — distinct count is 1 (or 0 if null). */ + if (ray_is_atom(x)) { + if (RAY_ATOM_IS_NULL(x)) return ray_i64(0); + return ray_i64(1); + } + return ray_error("type", "count_distinct_approx: vec expected"); + } + int8_t t = x->type; + /* Reject types we don't hash. */ + if (t != RAY_I64 && t != RAY_I32 && t != RAY_I16 && t != RAY_U8 && + t != RAY_BOOL && t != RAY_F64 && t != RAY_DATE && t != RAY_TIME && + t != RAY_TIMESTAMP && t != RAY_STR && !RAY_IS_SYM(t)) + return ray_error("type", "count_distinct_approx: unsupported element type"); + int64_t n = x->len; + if (n == 0) return ray_i64(0); + + ray_pool_t* pool = ray_pool_get(); + uint32_t nw = (pool && n >= RAY_PARALLEL_THRESHOLD) + ? ray_pool_total_workers(pool) : 1; + + ray_t* shards_hdr = NULL; + ray_hll_t* shards = (ray_hll_t*)scratch_calloc( + &shards_hdr, (size_t)nw * sizeof(ray_hll_t)); + if (!shards) return ray_error("oom", NULL); + + cda_scalar_ctx_t ctx = { + .vec = x, + .type = t, + .attrs = x->attrs, + .has_nulls = (x->attrs & RAY_ATTR_HAS_NULLS) != 0, + .shards = shards, + .p = RAY_HLL_DEFAULT_P, + .n_workers = nw, + .oom = 0, + }; + if (nw > 1) { + ray_pool_dispatch(pool, cda_scalar_fn, &ctx, n); + } else { + cda_scalar_fn(&ctx, 0, 0, n); + } + if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) { + for (uint32_t w = 0; w < nw; w++) ray_hll_free(&shards[w]); + scratch_free(shards_hdr); + return ray_error("oom", "count_distinct_approx: HLL alloc failed"); + } + /* Merge per-worker shards into shard[0], then estimate. */ + for (uint32_t w = 1; w < nw; w++) { + if (shards[w].regs) + ray_hll_merge(&shards[0], &shards[w]); + } + int64_t est = shards[0].regs ? 
ray_hll_estimate(&shards[0]) : 0; + for (uint32_t w = 0; w < nw; w++) ray_hll_free(&shards[w]); + scratch_free(shards_hdr); + return ray_i64(est); +} + +/* ---- Per-group HLL --------------------------------------------------- */ + +typedef struct { + const ray_t* vec; + int8_t type; + uint8_t attrs; + bool has_nulls; + const int64_t* idx_buf; + const int64_t* offsets; + const int64_t* counts; /* per-group length — offsets has only n_groups entries */ + uint8_t p; + uint32_t m; + int64_t* out; + _Atomic(int) oom; +} cda_pg_buf_ctx_t; + +static void cda_pg_buf_task(void* raw, uint32_t worker_id, int64_t start, int64_t end) { + (void)worker_id; + cda_pg_buf_ctx_t* c = (cda_pg_buf_ctx_t*)raw; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + const void* base = ray_data((ray_t*)c->vec); + int8_t t = c->type; + bool hn = c->has_nulls; + + /* One private HLL per task (allocated on stack so we never touch + * the shared scratch arena from a worker thread). P≤14 → m≤16384, + * fits comfortably in the default 8 MiB worker stack. + * + * Sparse start: the sketch begins in sparse mode using sparse_buf + * (256 entries, 1 KB). Groups with few distinct values never touch + * the dense register array; once the sparse cap is hit on a group, + * promote_to_dense moves it into the stack regs[] buffer. The + * dense buffer is unconditionally allocated on the stack so the + * promotion path is alloc-free. 
*/ + uint8_t regs[1u << 14]; + uint32_t sparse_buf[RAY_HLL_SPARSE_CAP]; + ray_hll_t sk; + + for (int64_t g = start; g < end; g++) { + ray_hll_init_sparse(&sk, c->p, sparse_buf, + RAY_HLL_SPARSE_CAP, regs); + int64_t s = c->offsets[g]; + int64_t e = s + c->counts[g]; + if (t == RAY_I64 || t == RAY_TIMESTAMP) { + const int64_t* d = (const int64_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + int64_t v = d[r]; + if (hn && v == NULL_I64) continue; + ray_hll_add(&sk, ray_hash_i64(v)); + } + } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) { + const int32_t* d = (const int32_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + int32_t v = d[r]; + if (hn && v == NULL_I32) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } else if (t == RAY_I16) { + const int16_t* d = (const int16_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + int16_t v = d[r]; + if (hn && v == NULL_I16) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } else if (t == RAY_BOOL || t == RAY_U8) { + const uint8_t* d = (const uint8_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + ray_hll_add(&sk, ray_hash_i64((int64_t)d[r])); + } + } else if (t == RAY_F64) { + const double* d = (const double*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + double v = d[r]; + if (v != v) continue; + ray_hll_add(&sk, ray_hash_f64(v)); + } + } else if (RAY_IS_SYM(t)) { + uint8_t w = c->attrs & RAY_SYM_W_MASK; + if (w == RAY_SYM_W64) { + const int64_t* d = (const int64_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + int64_t v = d[r]; if (v == 0) continue; + ray_hll_add(&sk, ray_hash_i64(v)); + } + } else if (w == RAY_SYM_W32) { + const uint32_t* d = (const uint32_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + uint32_t v = d[r]; if (v == 0) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } else if 
(w == RAY_SYM_W16) { + const uint16_t* d = (const uint16_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + uint16_t v = d[r]; if (v == 0) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } else { + const uint8_t* d = (const uint8_t*)base; + for (int64_t k = s; k < e; k++) { + int64_t r = c->idx_buf[k]; + uint8_t v = d[r]; if (v == 0) continue; + ray_hll_add(&sk, ray_hash_i64((int64_t)v)); + } + } + } + c->out[g] = ray_hll_estimate(&sk); + } +} + +int ray_count_distinct_approx_pg_buf(ray_t* src, + const int64_t* idx_buf, + const int64_t* offsets, + const int64_t* counts, + int64_t n_groups, + uint8_t p, int64_t* out) +{ + if (!src || RAY_IS_ERR(src) || !idx_buf || !offsets || !counts || !out) + return -1; + int8_t t = src->type; + bool hashable = (t == RAY_I64 || t == RAY_I32 || t == RAY_I16 || + t == RAY_U8 || t == RAY_BOOL || t == RAY_F64 || + t == RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP || + RAY_IS_SYM(t)); + if (!hashable) return -1; + if (n_groups <= 0) return 0; + if (p < 4) p = 4; + if (p > 14) p = 14; + uint32_t m = 1u << p; + + cda_pg_buf_ctx_t ctx = { + .vec = src, + .type = t, + .attrs = src->attrs, + .has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0, + .idx_buf = idx_buf, + .offsets = offsets, + .counts = counts, + .p = p, + .m = m, + .out = out, + .oom = 0, + }; + ray_pool_t* pool = ray_pool_get(); + if (pool && ray_pool_total_workers(pool) >= 2 && n_groups >= 4) { + /* dispatch_n issues exactly n_groups tasks of [i, i+1), but the + * task ring is hard-capped at 65536 so n_groups > 65536 would + * silently drop trailing groups. For high-cardinality grouping + * use element-based dispatch — each worker gets a range of + * groups, processes them serially, and reuses its stack sketch + * across the range. 
*/ + if (n_groups <= 65536) { + ray_pool_dispatch_n(pool, cda_pg_buf_task, &ctx, (uint32_t)n_groups); + } else { + ray_pool_dispatch(pool, cda_pg_buf_task, &ctx, n_groups); + } + } else { + cda_pg_buf_task(&ctx, 0, 0, n_groups); + } + if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) return -1; + return 0; +} + +/* ---- Streaming per-group HLL ----------------------------------------- */ + +/* Streaming kernel layout + * ----------------------- + * Each worker owns a contiguous *bank* of n_groups HLL sketches. Memory + * for a bank is one slab allocated up-front (sketches + sparse keys + + * dense regs) so the per-row hot loop is alloc-free. Each sketch starts + * sparse; ray_hll_add transparently promotes to its caller-owned dense + * buffer once the sparse cap is exceeded. + * + * After the streaming pass, banks are merged element-wise (max) into + * bank[0] and the per-group estimates are written to out[gid]. + */ + +typedef struct { + /* Per-worker bank base pointers. Each bank holds n_groups sketches + * whose `sparse_keys` / dense slots point into the per-worker pool. */ + ray_hll_t** banks; /* [n_workers] */ + /* Constant inputs. */ + const ray_t* vec; + const int64_t* row_gid; + int64_t n_rows; + int64_t n_groups; + int8_t type; + uint8_t attrs; + bool has_nulls; + uint8_t p; + uint32_t m; +} cda_pg_stream_ctx_t; + +/* Worker per-row body — picks up the bank for this worker, decodes the + * column-type once into a local pointer, and updates bank[gid] for each + * row in the assigned range. 
*/ +static void cda_pg_stream_task(void* raw, uint32_t worker_id, + int64_t start, int64_t end) { + cda_pg_stream_ctx_t* c = (cda_pg_stream_ctx_t*)raw; + ray_hll_t* bank = c->banks[worker_id]; + if (!bank) return; + const void* base = ray_data((ray_t*)c->vec); + const int64_t* row_gid = c->row_gid; + int64_t ng = c->n_groups; + int8_t t = c->type; + bool hn = c->has_nulls; + const int64_t CHK = 65535; + + if (t == RAY_I64 || t == RAY_TIMESTAMP) { + const int64_t* d = (const int64_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= ng) continue; + int64_t v = d[r]; + if (hn && v == NULL_I64) continue; + ray_hll_add(&bank[gid], ray_hash_i64(v)); + } + } else if (t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) { + const int32_t* d = (const int32_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= ng) continue; + int32_t v = d[r]; + if (hn && v == NULL_I32) continue; + ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v)); + } + } else if (t == RAY_I16) { + const int16_t* d = (const int16_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= ng) continue; + int16_t v = d[r]; + if (hn && v == NULL_I16) continue; + ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v)); + } + } else if (t == RAY_BOOL || t == RAY_U8) { + const uint8_t* d = (const uint8_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= ng) continue; + ray_hll_add(&bank[gid], ray_hash_i64((int64_t)d[r])); + } + } else if (t == RAY_F64) { + const double* d = (const double*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) 
return; + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= ng) continue; + double v = d[r]; + if (v != v) continue; + ray_hll_add(&bank[gid], ray_hash_f64(v)); + } + } else if (RAY_IS_SYM(t)) { + uint8_t w = c->attrs & RAY_SYM_W_MASK; + if (w == RAY_SYM_W64) { + const int64_t* d = (const int64_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= ng) continue; + int64_t v = d[r]; if (v == 0) continue; + ray_hll_add(&bank[gid], ray_hash_i64(v)); + } + } else if (w == RAY_SYM_W32) { + const uint32_t* d = (const uint32_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= ng) continue; + uint32_t v = d[r]; if (v == 0) continue; + ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v)); + } + } else if (w == RAY_SYM_W16) { + const uint16_t* d = (const uint16_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= ng) continue; + uint16_t v = d[r]; if (v == 0) continue; + ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v)); + } + } else { + const uint8_t* d = (const uint8_t*)base; + for (int64_t r = start; r < end; r++) { + if (((r - start) & CHK) == 0 && ray_interrupted()) return; + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= ng) continue; + uint8_t v = d[r]; if (v == 0) continue; + ray_hll_add(&bank[gid], ray_hash_i64((int64_t)v)); + } + } + } +} + +int ray_count_distinct_approx_pg_stream(ray_t* src, + const int64_t* row_gid, + int64_t n_rows, + int64_t n_groups, + uint8_t p, int64_t* out) +{ + if (!src || RAY_IS_ERR(src) || !row_gid || !out) return -1; + if (n_rows <= 0 || n_groups <= 0) return -1; + int8_t t = src->type; + bool hashable = (t == RAY_I64 || t == RAY_I32 || t == RAY_I16 || + t == RAY_U8 || t == RAY_BOOL || t == RAY_F64 || + t == 
RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP || + RAY_IS_SYM(t)); + if (!hashable) return -1; + if (p < 4) p = 4; + if (p > 14) p = 14; + uint32_t m = 1u << p; + + /* Choose worker count from the existing parallel threshold; the pool + * dispatcher partitions n_rows into morsels across n_workers + main. */ + ray_pool_t* pool = ray_pool_get(); + uint32_t nw = (pool && n_rows >= RAY_PARALLEL_THRESHOLD) + ? ray_pool_total_workers(pool) : 1; + + /* Allocate per-worker banks. One slab per worker: sketches array, + * then sparse-key pool (n_groups * RAY_HLL_SPARSE_CAP * 4 bytes), + * then dense-regs pool (n_groups * m bytes). Pre-allocating dense + * means promotion in the hot loop is a memset + replay, alloc-free. */ + ray_t* banks_hdr = NULL; + ray_hll_t** banks = (ray_hll_t**)scratch_calloc( + &banks_hdr, (size_t)nw * sizeof(ray_hll_t*)); + if (!banks) return -1; + + /* Per-worker scratch headers, freed at end. */ + ray_t** slab_hdrs_array = NULL; + ray_t* slab_hdrs_hdr = NULL; + slab_hdrs_array = (ray_t**)scratch_calloc( + &slab_hdrs_hdr, (size_t)nw * sizeof(ray_t*)); + if (!slab_hdrs_array) { + scratch_free(banks_hdr); + return -1; + } + + size_t sketches_bytes = (size_t)n_groups * sizeof(ray_hll_t); + size_t sparse_bytes = (size_t)n_groups * + RAY_HLL_SPARSE_CAP * sizeof(uint32_t); + size_t dense_bytes = (size_t)n_groups * (size_t)m; + size_t per_worker = sketches_bytes + sparse_bytes + dense_bytes; + + bool oom = false; + for (uint32_t w = 0; w < nw; w++) { + ray_t* slab_hdr = NULL; + uint8_t* slab = (uint8_t*)scratch_alloc(&slab_hdr, per_worker); + if (!slab) { oom = true; break; } + slab_hdrs_array[w] = slab_hdr; + ray_hll_t* sketches = (ray_hll_t*)slab; + uint32_t* sparse = (uint32_t*)(slab + sketches_bytes); + uint8_t* dense = slab + sketches_bytes + sparse_bytes; + /* Init each sketch sparse, pointed at its slice of the pools. 
*/ + for (int64_t g = 0; g < n_groups; g++) { + ray_hll_init_sparse(&sketches[g], p, + sparse + (size_t)g * RAY_HLL_SPARSE_CAP, + RAY_HLL_SPARSE_CAP, + dense + (size_t)g * m); + } + banks[w] = sketches; + } + if (oom) { + for (uint32_t w = 0; w < nw; w++) { + if (slab_hdrs_array[w]) scratch_free(slab_hdrs_array[w]); + } + scratch_free(slab_hdrs_hdr); + scratch_free(banks_hdr); + return -1; + } + + cda_pg_stream_ctx_t ctx = { + .banks = banks, + .vec = src, + .row_gid = row_gid, + .n_rows = n_rows, + .n_groups = n_groups, + .type = t, + .attrs = src->attrs, + .has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0, + .p = p, + .m = m, + }; + + if (nw > 1) { + ray_pool_dispatch(pool, cda_pg_stream_task, &ctx, n_rows); + } else { + cda_pg_stream_task(&ctx, 0, 0, n_rows); + } + + /* Merge worker banks into bank[0], then estimate per group. + * + * Per gid: merge bank[1..nw-1][gid] into bank[0][gid]. ray_hll_merge + * handles both (sparse|dense) × (sparse|dense) combinations and + * promotes dst as needed. After merge, bank[0][gid] estimate is the + * answer. We merge gid-by-gid (rather than worker-by-worker over all + * gids) so a finished dst stays hot across estimation. */ + for (int64_t g = 0; g < n_groups; g++) { + ray_hll_t* dst = &banks[0][g]; + for (uint32_t w = 1; w < nw; w++) { + ray_hll_merge(dst, &banks[w][g]); + } + out[g] = ray_hll_estimate(dst); + } + + /* Free per-worker slabs. Caller-owned sparse + dense buffers were + * not separately allocated, so ray_hll_free is a no-op on each + * sketch (low-bit-tagged _hdr or NULL _hdr). Promotion-time scratch + * allocations (when promote_to_dense needed an arena dense buf — only + * possible if the caller's tagged buf had been cleared, which doesn't + * happen here since dense was provided up-front) are owned by the + * sketch's _hdr; if any are present, ray_hll_free releases them. 
*/ + for (uint32_t w = 0; w < nw; w++) { + for (int64_t g = 0; g < n_groups; g++) ray_hll_free(&banks[w][g]); + scratch_free(slab_hdrs_array[w]); + } + scratch_free(slab_hdrs_hdr); + scratch_free(banks_hdr); + return 0; +} diff --git a/src/ops/hll.h b/src/ops/hll.h new file mode 100644 index 00000000..b996d21b --- /dev/null +++ b/src/ops/hll.h @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2025-2026 Anton Kundenko + * All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RAY_OPS_HLL_H +#define RAY_OPS_HLL_H + +/** + * Probabilistic cardinality sketch (HyperLogLog). + * + * Each sketch holds 2^P registers; each register stores the maximum + * leading-zero count (rho) seen for any hash whose top P bits index + * that register. Cardinality is then read off the harmonic mean of + * 2^reg over all registers, with bias correction for both ends of + * the range. Standard error ≈ 1.04 / sqrt(2^P). P=14 → ≈ 0.8 %. 
+ * + * Memory: 1 byte per register (8-bit reg holds rho up to 64+P, way + * over the 6 bits a packed implementation would need; the extra few + * KB buys a tighter hot loop). At P=14 a sketch is 16 KB and lives + * in L2 for the duration of one query. + * + * Sparse representation: + * Per-group HLL at high group counts wants to amortise the 16 KB + * sketch across groups that may only see a handful of hashes each + * (q13 SearchPhrase × UserID: many groups with < 50 uniques). In + * sparse mode the sketch stores only the registers that have been + * written, as 32-bit `(reg_idx << 8) | rho` entries in a small + * caller-provided buffer. The estimate / merge paths transparently + * support both modes; sparse converts to dense when the entry count + * exceeds the cap (caller-supplied; the per-group kernel uses 256). + * + * The sketch is mergeable element-wise (max), which is the property + * the per-group / per-worker aggregation paths rely on: each worker + * builds a local sketch and the planner merges them at finalisation. + */ + +#include "rayforce.h" +#include "core/platform.h" +#include "ops/hash.h" + +/* Default precision: 14 (16384 registers, ~0.81 % std error, 16 KB). */ +#define RAY_HLL_DEFAULT_P 14 + +/* Sparse cap for per-group sketches. Each entry is 4 bytes, so the + * sparse buffer is 1 KB at this cap — well inside L1 and 16× smaller + * than the dense register array. Above the cap, sparse is converted + * to dense in place (caller supplies both buffers on the stack). */ +#define RAY_HLL_SPARSE_CAP 256 + +typedef struct { + uint8_t p; /* precision: register count = 1 << p */ + uint32_t m; /* register count */ + uint8_t* regs; /* dense: [m] register array (NULL in sparse mode) */ + /* Sparse mode (active when sparse_keys != NULL && regs == NULL): + * sparse_keys[i] = (reg_idx << 8) | rho — unsorted linear-probe set + * over reg_idx (rho updated in-place on duplicate idx). 
*/ + uint32_t* sparse_keys; + uint32_t sparse_count; + uint32_t sparse_cap; + ray_t* _hdr; /* scratch handle for regs (sparse uses caller buf) */ +} ray_hll_t; + +/* Initialise an empty *dense* sketch with `p` precision bits. Allocates + * regs via scratch_alloc; the caller frees with ray_hll_free. Returns + * 0 on success, -1 on OOM. */ +int ray_hll_init(ray_hll_t* h, uint8_t p); + +/* Initialise an empty *sparse* sketch with caller-provided buffers. + * sparse_buf — buffer of size sparse_cap entries, used as the sparse + * set until conversion to dense. + * dense_buf — buffer of size 1<

> (64u - h->p)); + /* The low (64-p) bits hold the value we scan for the leading-zero + * run. Sentinel-bit at position (64-p-1) keeps the rho value in + * [1, 64-p+1] without a branch on all-zero. */ + uint64_t rest = (hash << h->p) | (1ULL << (h->p - 1)); + uint8_t rho = (uint8_t)(__builtin_clzll(rest) + 1u); + + if (RAY_LIKELY(h->regs != NULL)) { + if (rho > h->regs[idx]) h->regs[idx] = rho; + return; + } + /* Sparse path — linear scan over up to RAY_HLL_SPARSE_CAP entries. + * Cap is small (256) so the inner loop is L1-resident; the compiler + * folds it into a SIMD-friendly compare-and-mask sequence. */ + uint32_t* sk = h->sparse_keys; + uint32_t n = h->sparse_count; + uint32_t enc = (idx << 8) | rho; + for (uint32_t i = 0; i < n; i++) { + uint32_t cur = sk[i]; + if ((cur >> 8) == idx) { + /* Same register — keep max rho. */ + if (rho > (cur & 0xFF)) sk[i] = enc; + return; + } + } + if (n < h->sparse_cap) { + sk[n] = enc; + h->sparse_count = n + 1; + return; + } + /* Cap hit — promote and re-insert. */ + ray_hll_promote_to_dense(h); + if (rho > h->regs[idx]) h->regs[idx] = rho; +} + +/* Merge src into dst (element-wise max). src and dst must share the + * same precision p. Handles all four (dense/sparse)×(dense/sparse) + * combinations; sparse+sparse promotes dst to dense first so the + * merged sketch remains a valid dense register array. */ +void ray_hll_merge(ray_hll_t* dst, const ray_hll_t* src); + +/* Estimate the unique-value count of all hashes added so far. Uses + * the standard HyperLogLog estimator with bias-corrected raw-mean for + * the mid-range and linear counting (m * ln(m/V)) when many registers + * are still zero (V = unused register count). Branches on mode: + * dense scans the register array; sparse iterates the entry set and + * accounts for (m - sparse_count) unset registers analytically. */ +int64_t ray_hll_estimate(const ray_hll_t* h); + +/* Scalar approximate `count(distinct …)` over a vec, ~0.8 % standard + * error. 
Handles I64/I32/I16/I8/U8/BOOL/F64/DATE/TIME/TIMESTAMP/SYM/ + * STR. Nulls are skipped (matches the SQL `count distinct` semantics). + * Parallelised: each worker builds a private sketch over its row range + * and the main thread merges them before extracting the estimate. + * Wired into `exec_count_distinct` above an input-row threshold. */ +ray_t* ray_count_distinct_approx(ray_t* x); + +/* Per-group approximate `count(distinct …)` over a buffered row-index + * layout: group g owns the row indices + * idx_buf[offsets[g] .. offsets[g] + counts[g]). + * Parallelised across groups — each task uses a private stack-resident + * HLL that starts in sparse mode (1 KB) and converts to dense (16 KB) + * on overflow. Sparse mode keeps the memset / estimate cost bounded + * by `min(unique_in_group, sparse_cap)` instead of m, which is the + * decisive win at high group counts where the average group has few + * unique values. + * + * Callers holding a row_gid layout instead build idx_buf+offsets+counts + * once and call this; there's a single per-group kernel. Writes the + * estimate to out[gid]. Returns 0 on success, -1 on unsupported type + * (caller falls back to exact). */ +int ray_count_distinct_approx_pg_buf(ray_t* src, + const int64_t* idx_buf, + const int64_t* offsets, + const int64_t* counts, + int64_t n_groups, + uint8_t p, int64_t* out); + +/* Streaming per-group HLL — single pass over (row_gid[r], hashes[r]) + * directly accumulating into n_groups sketches per worker, skipping + * the (idx_buf + offsets + counts) CSR scatter that the _pg_buf entry + * point requires. Each worker owns a private bank of n_groups sparse + * sketches; after the pass, banks are merged element-wise (max) into + * worker 0's bank and the estimates are written to out[gid]. + * + * Memory: per worker = n_groups * (sparse_cap*4 + (1< #include +#include /* Width of one element of a numeric vector type, or 0 if unsupported. 
*/ static int numeric_elem_size(int8_t t) { @@ -154,6 +156,17 @@ void ray_index_release_payload(ray_index_t* ix) { ray_release(ix->u.bloom.bits); ix->u.bloom.bits = NULL; break; + case RAY_IDX_CHUNK_ZONE: + if (ix->u.chunk_zone.mins && !RAY_IS_ERR(ix->u.chunk_zone.mins)) + ray_release(ix->u.chunk_zone.mins); + if (ix->u.chunk_zone.maxs && !RAY_IS_ERR(ix->u.chunk_zone.maxs)) + ray_release(ix->u.chunk_zone.maxs); + if (ix->u.chunk_zone.null_bits && !RAY_IS_ERR(ix->u.chunk_zone.null_bits)) + ray_release(ix->u.chunk_zone.null_bits); + ix->u.chunk_zone.mins = NULL; + ix->u.chunk_zone.maxs = NULL; + ix->u.chunk_zone.null_bits = NULL; + break; case RAY_IDX_ZONE: case RAY_IDX_NONE: break; @@ -176,6 +189,14 @@ void ray_index_retain_payload(ray_index_t* ix) { if (ix->u.bloom.bits && !RAY_IS_ERR(ix->u.bloom.bits)) ray_retain(ix->u.bloom.bits); break; + case RAY_IDX_CHUNK_ZONE: + if (ix->u.chunk_zone.mins && !RAY_IS_ERR(ix->u.chunk_zone.mins)) + ray_retain(ix->u.chunk_zone.mins); + if (ix->u.chunk_zone.maxs && !RAY_IS_ERR(ix->u.chunk_zone.maxs)) + ray_retain(ix->u.chunk_zone.maxs); + if (ix->u.chunk_zone.null_bits && !RAY_IS_ERR(ix->u.chunk_zone.null_bits)) + ray_retain(ix->u.chunk_zone.null_bits); + break; case RAY_IDX_ZONE: case RAY_IDX_NONE: break; @@ -262,6 +283,107 @@ static ray_err_t zone_scan(ray_t* v, ray_index_t* ix) { } } +/* -------------------------------------------------------------------------- + * Chunk-zone scan -- per-(1<u.chunk_zone.n_chunks; + uint8_t log2 = ix->u.chunk_zone.chunk_log2; + int64_t csz = 1LL << log2; + int64_t n = v->len; + int64_t* mins = (int64_t*)ray_data(ix->u.chunk_zone.mins); + int64_t* maxs = (int64_t*)ray_data(ix->u.chunk_zone.maxs); + uint8_t* nbits = (uint8_t*)ray_data(ix->u.chunk_zone.null_bits); + const uint8_t* base = (const uint8_t*)ray_data(v); + + for (uint32_t g = 0; g < n_chunks; g++) { + int64_t s = (int64_t)g * csz; + int64_t e = s + csz; if (e > n) e = n; + int64_t mn = INT64_MAX, mx = INT64_MIN; + bool any_null = false; 
+ for (int64_t i = s; i < e; i++) { + if (ray_vec_is_null(v, i)) { any_null = true; continue; } + int64_t val = 0; + switch (elem_size) { + case 1: val = (int64_t)base[i]; break; + case 2: { int16_t t; memcpy(&t, base + i*2, 2); val = (int64_t)t; break; } + case 4: { int32_t t; memcpy(&t, base + i*4, 4); val = (int64_t)t; break; } + case 8: { int64_t t; memcpy(&t, base + i*8, 8); val = t; break; } + default: return RAY_ERR_TYPE; + } + if (val < mn) mn = val; + if (val > mx) mx = val; + } + /* Empty (all-null) chunks keep mn=INT64_MAX / mx=INT64_MIN so + * the reduce path's min(mins[*]) / max(maxs[*]) ignores them. */ + mins[g] = mn; + maxs[g] = mx; + if (any_null) nbits[g >> 3] |= (uint8_t)(1u << (g & 7)); + } + return RAY_OK; +} + +static ray_err_t chunk_zone_scan_float(ray_t* v, ray_index_t* ix, + int elem_size) { + uint32_t n_chunks = ix->u.chunk_zone.n_chunks; + uint8_t log2 = ix->u.chunk_zone.chunk_log2; + int64_t csz = 1LL << log2; + int64_t n = v->len; + double* mins = (double*)ray_data(ix->u.chunk_zone.mins); + double* maxs = (double*)ray_data(ix->u.chunk_zone.maxs); + uint8_t* nbits = (uint8_t*)ray_data(ix->u.chunk_zone.null_bits); + const uint8_t* base = (const uint8_t*)ray_data(v); + + for (uint32_t g = 0; g < n_chunks; g++) { + int64_t s = (int64_t)g * csz; + int64_t e = s + csz; if (e > n) e = n; + double mn = INFINITY, mx = -INFINITY; + bool any_null = false; + for (int64_t i = s; i < e; i++) { + if (ray_vec_is_null(v, i)) { any_null = true; continue; } + double val = 0.0; + if (elem_size == 4) { + float t; memcpy(&t, base + i*4, 4); val = (double)t; + } else { + memcpy(&val, base + i*8, 8); + } + if (isnan(val)) { any_null = true; continue; } + if (val < mn) mn = val; + if (val > mx) mx = val; + } + /* Empty (all-null) chunks keep mn=+inf / mx=-inf so reduce + * (min/max across mins[]/maxs[]) ignores them. 
*/ + mins[g] = mn; + maxs[g] = mx; + if (any_null) nbits[g >> 3] |= (uint8_t)(1u << (g & 7)); + } + return RAY_OK; +} + +static ray_err_t chunk_zone_scan(ray_t* v, ray_index_t* ix) { + switch (v->type) { + case RAY_BOOL: + case RAY_U8: return chunk_zone_scan_int(v, ix, 1); + case RAY_I16: return chunk_zone_scan_int(v, ix, 2); + case RAY_I32: + case RAY_DATE: return chunk_zone_scan_int(v, ix, 4); + case RAY_I64: + case RAY_TIME: + case RAY_TIMESTAMP: return chunk_zone_scan_int(v, ix, 8); + case RAY_F32: return chunk_zone_scan_float(v, ix, 4); + case RAY_F64: return chunk_zone_scan_float(v, ix, 8); + default: return RAY_ERR_NYI; + } +} + /* -------------------------------------------------------------------------- * Attach * @@ -335,6 +457,59 @@ ray_t* ray_index_attach_zone(ray_t** vp) { return attach_finalize(v, idx); } +ray_t* ray_index_attach_chunk_zone(ray_t** vp, uint8_t chunk_log2) { + ray_t* v = prepare_attach(vp, "chunk_zone"); + if (RAY_IS_ERR(v)) return v; + + if (chunk_log2 == 0) chunk_log2 = 16; /* default 64 K rows / chunk */ + if (chunk_log2 < 8 || chunk_log2 > 22) + return ray_error("domain", "chunk_zone: chunk_log2 out of range [8, 22]"); + int64_t csz = 1LL << chunk_log2; + /* No point indexing a column smaller than one chunk — fall back to + * the column-wide zone (or no index at all) at that size. */ + if (v->len < csz) + return ray_error("domain", "chunk_zone: column has fewer rows than one chunk"); + + uint32_t n_chunks = (uint32_t)((v->len + csz - 1) / csz); + + ray_t* idx = ray_index_alloc(RAY_IDX_CHUNK_ZONE, v->type, v->len); + if (!idx || RAY_IS_ERR(idx)) return idx; + ray_index_t* ix = ray_index_payload(idx); + ix->u.chunk_zone.n_chunks = n_chunks; + ix->u.chunk_zone.chunk_log2 = chunk_log2; + ix->u.chunk_zone.is_f64 = (v->type == RAY_F64 || v->type == RAY_F32) ? 1 : 0; + + int8_t arr_type = ix->u.chunk_zone.is_f64 ? 
RAY_F64 : RAY_I64; + ray_t* mins = ray_vec_new(arr_type, (int64_t)n_chunks); + ray_t* maxs = ray_vec_new(arr_type, (int64_t)n_chunks); + int64_t nb_len = (int64_t)((n_chunks + 7) / 8); + ray_t* nbits = ray_vec_new(RAY_U8, nb_len); + if (!mins || RAY_IS_ERR(mins) || !maxs || RAY_IS_ERR(maxs) || + !nbits || RAY_IS_ERR(nbits)) + { + if (mins && !RAY_IS_ERR(mins)) ray_release(mins); + if (maxs && !RAY_IS_ERR(maxs)) ray_release(maxs); + if (nbits && !RAY_IS_ERR(nbits)) ray_release(nbits); + ray_release(idx); + return ray_error("oom", "chunk_zone: arrays alloc"); + } + mins->len = (int64_t)n_chunks; + maxs->len = (int64_t)n_chunks; + nbits->len = nb_len; + memset(ray_data(nbits), 0, (size_t)nb_len); + ix->u.chunk_zone.mins = mins; + ix->u.chunk_zone.maxs = maxs; + ix->u.chunk_zone.null_bits = nbits; + + ray_err_t err = chunk_zone_scan(v, ix); + if (err != RAY_OK) { + ray_release(idx); /* releases mins/maxs/nbits via release_payload */ + return ray_error(ray_err_code_str(err), + "chunk_zone scan failed for type %d", (int)v->type); + } + return attach_finalize(v, idx); +} + /* -------------------------------------------------------------------------- * Hash index — chained open addressing * @@ -399,6 +574,207 @@ ray_t* ray_index_attach_hash(ray_t** vp) { return attach_finalize(v, idx); } +/* -------------------------------------------------------------------------- + * Hash-index point-lookup probe — public entry point for the eq-filter + * fast path (ray_index_hash_eq_rowsel). + * + * Callers present the index with an int64 key; we mix64 it with the + * same hash the builder used, walk the bucket chain, collect matches, + * and emit a ray_rowsel sized for O(matches) memory (no intermediate + * row-wide BOOL pred vec). + * + * Type matrix. An index built on column type T accepts a key only + * when T's storage width covers it without truncation — i.e. 
asking + * for `u8_col == 300` would never match, so we fail eligibility and + * the caller falls back to the scan (which folds out-of-range via + * fp_fold_t). Float keys are not supported here — equality on + * F32/F64 has NaN / -0 semantics the unfused engine handles. */ + +static int hash_key_in_range(int8_t t, int64_t k) { + switch (t) { + case RAY_BOOL: case RAY_U8: return k >= 0 && k <= UINT8_MAX; + case RAY_I16: return k >= INT16_MIN && k <= INT16_MAX; + case RAY_I32: case RAY_DATE: return k >= INT32_MIN && k <= INT32_MAX; + case RAY_I64: + case RAY_TIME: + case RAY_TIMESTAMP: return 1; + default: return 0; + } +} + +/* Read row `i` of a numeric column as int64 for equality compare. */ +static int64_t hash_col_read_i64(const uint8_t* base, int8_t t, int64_t i) { + int es; + switch (t) { + case RAY_BOOL: case RAY_U8: es = 1; break; + case RAY_I16: es = 2; break; + case RAY_I32: case RAY_DATE: es = 4; break; + case RAY_I64: + case RAY_TIME: + case RAY_TIMESTAMP: es = 8; break; + default: return 0; + } + switch (es) { + case 1: return (int64_t)base[i]; + case 2: { int16_t v; memcpy(&v, base + i*2, 2); return (int64_t)v; } + case 4: { int32_t v; memcpy(&v, base + i*4, 4); return (int64_t)v; } + default: { int64_t v; memcpy(&v, base + i*8, 8); return v; } + } +} + +/* Validate eligibility, return the index payload + computed start row. + * On miss leaves *start = -1 so the caller can short-circuit. 
*/ +static ray_index_t* hash_probe_setup(ray_t* col, int64_t key, + int64_t* start_rid) { + *start_rid = -1; + if (!col || RAY_IS_ERR(col) || !ray_is_vec(col)) return NULL; + if (!(col->attrs & RAY_ATTR_HAS_INDEX) || !col->index) return NULL; + ray_index_t* ix = ray_index_payload(col->index); + if (ix->kind != RAY_IDX_HASH) return NULL; + if (ix->built_for_len != col->len) return NULL; + if (!hash_key_in_range(col->type, key)) return NULL; + if (numeric_elem_size(col->type) == 0) return NULL; + if (!ix->u.hash.table || !ix->u.hash.chain) return NULL; + + /* Mirror numeric_key_word for an int64 key: the canonical hash + * input is the raw bit pattern of the storage width. We zero- + * extend U8/BOOL and sign-extend others up to int64; mix64 then + * folds them — the builder did the same on a per-row basis. */ + int es = numeric_elem_size(col->type); + uint64_t kbits = 0; + switch (es) { + case 1: kbits = (uint64_t)(uint8_t)key; break; + case 2: kbits = (uint64_t)(int64_t)(int16_t)key; break; + case 4: kbits = (uint64_t)(int64_t)(int32_t)key; break; + default: kbits = (uint64_t)key; break; + } + uint64_t h = mix64(kbits); + uint64_t slot = h & ix->u.hash.mask; + const int64_t* tbl = (const int64_t*)ray_data(ix->u.hash.table); + *start_rid = tbl[slot] - 1; + return ix; +} + +/* qsort comparator: ascending int64 row ids, used by the rowsel + * builder to put matches into per-segment order. */ +static int hash_match_cmp_i64(const void* a, const void* b) { + int64_t x = *(const int64_t*)a; + int64_t y = *(const int64_t*)b; + return (x > y) - (x < y); +} + +ray_t* ray_index_hash_eq_rowsel(ray_t* col, int64_t key) { + int64_t rid = -1; + ray_index_t* ix = hash_probe_setup(col, key, &rid); + if (!ix) return NULL; + + int64_t n = col->len; + /* Collect matching row ids. The chain length is bounded by the + * bucket fill factor; for keys appearing rarely the bound is tight + * (~1 row). 
For highly-duplicated keys it can degenerate to O(n) + * — but only if the value really occurs that many times, in which + * case the existing scan path also reads the same number of rows. + * We size the collect buffer dynamically; cap at n to bound memory + * in the pathological case. */ + const int64_t* chn = (const int64_t*)ray_data(ix->u.hash.chain); + const uint8_t* base = (const uint8_t*)ray_data(col); + int8_t t = col->type; + + int64_t mcap = 16; + int64_t mcnt = 0; + ray_t* match_hdr = ray_alloc(mcap * (int64_t)sizeof(int64_t)); + if (!match_hdr) return NULL; + int64_t* matches = (int64_t*)ray_data(match_hdr); + + while (rid >= 0) { + if (hash_col_read_i64(base, t, rid) == key) { + if (mcnt == mcap) { + int64_t new_cap = mcap * 2; + if (new_cap > n) new_cap = n + 1; /* defensive bound */ + ray_t* new_hdr = ray_alloc(new_cap * (int64_t)sizeof(int64_t)); + if (!new_hdr) { ray_release(match_hdr); return NULL; } + memcpy(ray_data(new_hdr), matches, + (size_t)mcnt * sizeof(int64_t)); + ray_release(match_hdr); + match_hdr = new_hdr; + matches = (int64_t*)ray_data(match_hdr); + mcap = new_cap; + } + matches[mcnt++] = rid; + } + rid = chn[rid] - 1; + } + + /* Sort ascending so we can fill seg_flags / seg_offsets / idx[] + * in a single linear pass. qsort dominates only when matches are + * many — in that case the hash probe itself is the larger cost + * and this is still O(matches log matches). */ + if (mcnt > 1) + qsort(matches, (size_t)mcnt, sizeof(int64_t), hash_match_cmp_i64); + + /* Count idx_count = # of MIX segments × matches in that segment. + * For a hash probe a segment is either NONE (no matches) or MIX + * (≥1 match; never ALL unless every row in the segment matched, + * which would require duplicate-key density > MORSEL_ELEMS in one + * 1024-row window — vanishingly rare and indistinguishable in the + * consumer from a normal MIX). 
*/ + ray_t* block = ray_rowsel_new(n, mcnt, mcnt); + if (!block) { ray_release(match_hdr); return NULL; } + + uint32_t n_segs = ray_rowsel_meta(block)->n_segs; + uint8_t* seg_flags = ray_rowsel_flags(block); + uint32_t* seg_offsets = ray_rowsel_offsets(block); + uint16_t* idx_arr = ray_rowsel_idx(block); + + /* All segments default to NONE; the loop below flips MIX where + * a match lands. ray_alloc does NOT zero the data area + * (only the 32-byte header), so explicit init is required. */ + memset(seg_flags, RAY_SEL_NONE, (size_t)n_segs); + /* seg_offsets is built by linear sweep below — initialize to a + * sentinel that the sweep will overwrite. */ + /* (no memset needed; the sweep writes every entry [0..n_segs]) */ + + /* Single sweep over the sorted matches: emit per-segment offsets + * and morsel-local indices into idx_arr. cur_seg tracks the + * segment we're filling; gaps get RAY_SEL_NONE and zero spans. */ + int64_t mi = 0; + uint32_t cum = 0; + for (uint32_t s = 0; s < n_segs; s++) { + seg_offsets[s] = cum; + int64_t seg_start = (int64_t)s * RAY_MORSEL_ELEMS; + int64_t seg_end = seg_start + RAY_MORSEL_ELEMS; + if (seg_end > n) seg_end = n; + uint32_t pc = 0; + while (mi < mcnt && matches[mi] < seg_end) { + idx_arr[cum + pc] = (uint16_t)(matches[mi] - seg_start); + pc++; + mi++; + } + if (pc == 0) { + seg_flags[s] = RAY_SEL_NONE; + } else if ((int64_t)pc == seg_end - seg_start) { + seg_flags[s] = RAY_SEL_ALL; + /* Roll back the indices — ALL segments contribute zero + * idx[] entries in the rowsel contract. */ + cum -= pc; /* idx_arr writes for this seg get overwritten + by the next MIX segment's writes; idx_count + was sized for all matches, so this is safe. */ + } else { + seg_flags[s] = RAY_SEL_MIX; + cum += pc; + } + } + seg_offsets[n_segs] = cum; + /* Adjust meta total_pass / idx layout — ALL-segment rows count + * toward total_pass but not idx_count. We initially passed + * (mcnt, mcnt); fix up if any ALL segments collapsed. 
*/ + ray_rowsel_meta(block)->total_pass = mcnt; + (void)cum; + + ray_release(match_hdr); + return block; +} + /* -------------------------------------------------------------------------- * Sort index — ascending permutation of row ids * @@ -540,11 +916,12 @@ ray_t* ray_index_drop(ray_t** vp) { static const char* kind_name(ray_idx_kind_t k) { switch (k) { - case RAY_IDX_HASH: return "hash"; - case RAY_IDX_SORT: return "sort"; - case RAY_IDX_ZONE: return "zone"; - case RAY_IDX_BLOOM: return "bloom"; - default: return "none"; + case RAY_IDX_HASH: return "hash"; + case RAY_IDX_SORT: return "sort"; + case RAY_IDX_ZONE: return "zone"; + case RAY_IDX_BLOOM: return "bloom"; + case RAY_IDX_CHUNK_ZONE: return "chunk_zone"; + default: return "none"; } } @@ -627,6 +1004,14 @@ ray_t* ray_index_info(ray_t* v) { r = dict_append_sym_i64(&keys, &vals, "n_keys", ix->u.bloom.n_keys); if (RAY_IS_ERR(r)) goto fail; break; + case RAY_IDX_CHUNK_ZONE: + r = dict_append_sym_i64(&keys, &vals, "n_chunks", + (int64_t)ix->u.chunk_zone.n_chunks); + if (RAY_IS_ERR(r)) goto fail; + r = dict_append_sym_i64(&keys, &vals, "chunk_log2", + (int64_t)ix->u.chunk_zone.chunk_log2); + if (RAY_IS_ERR(r)) goto fail; + break; case RAY_IDX_NONE: break; } diff --git a/src/ops/idxop.h b/src/ops/idxop.h index 2703ddea..025b51ce 100644 --- a/src/ops/idxop.h +++ b/src/ops/idxop.h @@ -47,11 +47,20 @@ /* Index kinds. Stored in ray_index_t.kind. */ typedef enum { - RAY_IDX_NONE = 0, - RAY_IDX_HASH = 1, - RAY_IDX_SORT = 2, - RAY_IDX_ZONE = 3, - RAY_IDX_BLOOM = 4, + RAY_IDX_NONE = 0, + RAY_IDX_HASH = 1, + RAY_IDX_SORT = 2, + RAY_IDX_ZONE = 3, + RAY_IDX_BLOOM = 4, + /* Per-chunk min/max + null bit, one entry per (1 << chunk_log2) rows. + * The whole-column zone is derivable as + * min(chunk_mins)/max(chunk_maxs) over the entries, so this + * subsumes RAY_IDX_ZONE wherever it's used in the reduce path. 
+ * Built at column ingest (csv.read); read by the min/max reduce + * and by the predicate planner to skip chunks whose [min,max] + * provably excludes/includes the constant. See chunk_zone arm + * of ray_index_t.u below. */ + RAY_IDX_CHUNK_ZONE = 5, } ray_idx_kind_t; /* The payload stored inside data[] of a RAY_INDEX ray_t. */ @@ -99,6 +108,19 @@ typedef struct { uint32_t _pad; int64_t n_keys; /* number of non-null rows added */ } bloom; + struct { /* RAY_IDX_CHUNK_ZONE */ + /* mins / maxs hold n_chunks entries. For integer / temporal + * column types they are RAY_I64 vecs storing the per-chunk + * extrema as int64; for RAY_F64 columns they are RAY_F64 + * vecs. is_f64 disambiguates at read time. */ + ray_t* mins; + ray_t* maxs; + ray_t* null_bits; /* RAY_U8 vec, packed: bit i = chunk i has any null */ + uint32_t n_chunks; + uint8_t chunk_log2; /* chunk size = 1 << chunk_log2 (default 16 → 64 K rows) */ + uint8_t is_f64; + uint8_t _pad[2]; + } chunk_zone; } u; } ray_index_t; @@ -118,6 +140,10 @@ ray_t* ray_index_attach_zone (ray_t** vp); ray_t* ray_index_attach_hash (ray_t** vp); ray_t* ray_index_attach_sort (ray_t** vp); ray_t* ray_index_attach_bloom(ray_t** vp); +/* Build per-chunk min/max + null bit at chunk_size = 1 << chunk_log2. + * Passing 0 picks the default (16 → 64 K rows / chunk). Only valid on + * numeric and temporal vectors; SYM/STR/GUID return RAY_ERR_NYI. */ +ray_t* ray_index_attach_chunk_zone(ray_t** vp, uint8_t chunk_log2); /* Drop any attached index from *vp. No-op if none. Restores the * pre-attach nullmap state byte-for-byte. Returns *vp. */ @@ -141,6 +167,31 @@ static inline ray_idx_kind_t ray_index_kind(const ray_t* v) { * or RAY_NULL_OBJ when no index is attached. */ ray_t* ray_index_info(ray_t* v); +/* ===== Hash-index point-lookup probe ===== + * + * Build a ray_rowsel directly from a hash probe on `col`'s + * RAY_IDX_HASH for rows where the payload equals `key`. 
Bypasses + * the intermediate BOOL pred vec entirely — touches O(matches) + * memory instead of O(rows), which is the whole reason to ship + * this fast path. + * + * Returns: + * - A fresh rowsel block (rc=1) on success — install on + * g->selection. The block carries per-segment NONE/MIX/ALL + * flags and the morsel-local indices for matching rows. + * Pure NONE blocks (no matches) are returned as a valid empty + * rowsel rather than NULL — NULL is the "all-pass" sentinel + * in the consumer and would let every row through. + * - NULL when the column is not eligible: no index, wrong kind, + * built_for_len mismatch (stale), type mismatch, or out-of- + * range key. Caller must fall back to the full scan path. + * + * Eligibility (and the canonical hashing used) match + * ray_index_attach_hash: BOOL/U8/I16/I32/I64/DATE/TIME/TIMESTAMP. + * Floats are intentionally not supported — equality on F32/F64 + * has NaN / -0 semantics the unfused compare kernel handles. */ +ray_t* ray_index_hash_eq_rowsel(ray_t* col, int64_t key); + /* ===== Internal helpers (used by retain/release/detach in heap.c * and by mutation paths in vec.c) ===== */ diff --git a/src/ops/internal.h b/src/ops/internal.h index 23975955..25fa9b2e 100644 --- a/src/ops/internal.h +++ b/src/ops/internal.h @@ -953,6 +953,20 @@ typedef struct { uint8_t agg_index; int64_t min_count_exclusive; int64_t top_count_take; + /* Agg op of the filtered agg. When 0 (the default for the + * historical COUNT-only filter), consumers MUST treat it as + * OP_COUNT. When non-zero, must equal ext->agg_ops[agg_index]. + * Supported here: OP_COUNT, OP_SUM, OP_MIN, OP_MAX. AVG and + * higher-order aggs (STDDEV/VAR/PEARSON/MEDIAN) are excluded + * because their ordering doesn't reduce to a single int64 read + * from the row slot — they fall through to the full sort + take. */ + uint16_t agg_op; + /* Direction: 1 = top-N largest (desc), 0 = top-N smallest (asc). 
+ * For COUNT/SUM/MAX the natural ordering is largest-first; for + * MIN it's smallest-first. Both directions are supported per + * agg kind so `desc: min_value take: N` (the N groups with the + * largest min) is also expressible. */ + uint8_t desc; } ray_group_emit_filter_t; ray_group_emit_filter_t ray_group_emit_filter_get(void); void ray_group_emit_filter_set(ray_group_emit_filter_t filter); diff --git a/src/ops/query.c b/src/ops/query.c index 451d4baf..3b08415c 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -34,6 +34,7 @@ #include "ops/rowsel.h" #include "ops/fused_group.h" #include "ops/fused_topk.h" +#include "ops/hll.h" #include "ops/temporal.h" #include "core/profile.h" #include "table/sym.h" @@ -87,147 +88,6 @@ static int64_t dict_key_id(ray_t* dict, const char* key) { return -1; } -typedef struct { - ray_t* tbl; - int64_t nrows; - uint64_t hash; - uint64_t from_hash; - uint64_t env_gen; - ray_t* result; -} select_cache_entry_t; - -#define SELECT_CACHE_N 512 -static select_cache_entry_t g_select_cache[SELECT_CACHE_N]; -static uint16_t g_select_cache_next = 0; - -static uint64_t hash_mix_u64(uint64_t h, uint64_t v) { - h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2); - return h ? h : 0x9e3779b97f4a7c15ull; -} - -static uint64_t ray_expr_hash(ray_t* x) { - if (!x) return 0x1234abcd5678ef00ull; - uint64_t h = hash_mix_u64(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type); - h = hash_mix_u64(h, (uint64_t)x->attrs); - h = hash_mix_u64(h, (x->type == -RAY_STR) - ? 
(uint64_t)ray_str_len(x) - : (uint64_t)x->len); - if (x->type == RAY_LIST) { - ray_t** elems = (ray_t**)ray_data(x); - for (int64_t i = 0; i < x->len; i++) - h = hash_mix_u64(h, ray_expr_hash(elems[i])); - } else if (x->type == RAY_DICT) { - ray_t* keys = ray_dict_keys(x); - ray_t* vals = ray_dict_vals(x); - h = hash_mix_u64(h, ray_expr_hash(keys)); - h = hash_mix_u64(h, ray_expr_hash(vals)); - } else if (x->type == RAY_STR) { - size_t n = 0; - const char* s = ray_str_vec_get(x, 0, &n); - for (size_t i = 0; s && i < n; i++) - h = hash_mix_u64(h, (unsigned char)s[i]); - } else if (x->type == -RAY_STR) { - const char* s = ray_str_ptr(x); - size_t n = ray_str_len(x); - for (size_t i = 0; s && i < n; i++) - h = hash_mix_u64(h, (unsigned char)s[i]); - } else if (x->type == RAY_SYM || x->type == -RAY_SYM || - x->type == RAY_I64 || x->type == -RAY_I64 || - x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) { - h = hash_mix_u64(h, (uint64_t)x->i64); - } else if (x->type == RAY_I32 || x->type == -RAY_I32 || - x->type == RAY_DATE || x->type == -RAY_DATE || - x->type == RAY_TIME || x->type == -RAY_TIME) { - h = hash_mix_u64(h, (uint64_t)(uint32_t)x->i32); - } else if (x->type == RAY_I16 || x->type == -RAY_I16) { - h = hash_mix_u64(h, (uint64_t)(uint16_t)x->i16); - } else if (x->type == RAY_U8 || x->type == -RAY_U8 || - x->type == RAY_BOOL || x->type == -RAY_BOOL) { - h = hash_mix_u64(h, (uint64_t)x->u8); - } else if (x->type == RAY_F64 || x->type == -RAY_F64) { - uint64_t bits = 0; - memcpy(&bits, &x->f64, sizeof(bits)); - h = hash_mix_u64(h, bits); - } - return h; -} - -static ray_t* select_cache_get(ray_t* tbl, int64_t nrows, - uint64_t hash, uint64_t from_hash) { - if (!g_ray_profile.active) return NULL; - if (!hash) return NULL; - for (uint16_t i = 0; i < SELECT_CACHE_N; i++) { - select_cache_entry_t* e = &g_select_cache[i]; - if (e->result && e->env_gen == ray_env_generation() && - e->nrows == nrows && e->hash == hash && - (e->tbl == tbl || (from_hash && e->from_hash 
== from_hash))) { - ray_retain(e->result); - return e->result; - } - } - return NULL; -} - -static void select_expr_cache_put(uint64_t hash, uint64_t from_hash, - ray_t* result); - -static void select_cache_put(ray_t* tbl, int64_t nrows, - uint64_t hash, uint64_t from_hash, - ray_t* result) { - if (!g_ray_profile.active) return; - if (!tbl || !hash || !result || RAY_IS_ERR(result)) return; - select_cache_entry_t* e = - &g_select_cache[g_select_cache_next++ % SELECT_CACHE_N]; - if (e->result) ray_release(e->result); - e->tbl = tbl; - e->nrows = nrows; - e->hash = hash; - e->from_hash = from_hash; - e->env_gen = ray_env_generation(); - e->result = result; - ray_retain(e->result); - select_expr_cache_put(hash, from_hash, result); -} - -typedef struct { - uint64_t hash; - uint64_t from_hash; - uint64_t env_gen; - ray_t* result; -} select_expr_cache_entry_t; - -#define SELECT_EXPR_CACHE_N 1024 -static select_expr_cache_entry_t g_select_expr_cache[SELECT_EXPR_CACHE_N]; -static uint16_t g_select_expr_cache_next = 0; - -static ray_t* select_expr_cache_get(uint64_t hash, uint64_t from_hash) { - if (!g_ray_profile.active) return NULL; - if (!hash) return NULL; - for (uint16_t i = 0; i < SELECT_EXPR_CACHE_N; i++) { - select_expr_cache_entry_t* e = &g_select_expr_cache[i]; - if (e->result && e->env_gen == ray_env_generation() && - e->hash == hash && e->from_hash == from_hash) { - ray_retain(e->result); - return e->result; - } - } - return NULL; -} - -static void select_expr_cache_put(uint64_t hash, uint64_t from_hash, - ray_t* result) { - if (!g_ray_profile.active) return; - if (!hash || !result || RAY_IS_ERR(result)) return; - select_expr_cache_entry_t* e = - &g_select_expr_cache[g_select_expr_cache_next++ % SELECT_EXPR_CACHE_N]; - if (e->result) ray_release(e->result); - e->hash = hash; - e->from_hash = from_hash; - e->env_gen = ray_env_generation(); - e->result = result; - ray_retain(e->result); -} - /* Flatten a RAY_DICT (keys SYM vec + vals LIST) into a transient * 
[k0,v0,k1,v1,...] array view so the existing dict-walking loops in * ray_select_fn et al. can iterate without rewriting every site. @@ -565,14 +425,17 @@ static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n, rng->len = 2; ray_t* sliced = ray_take_fn(result, rng); ray_release(result); - ray_heap_gc(); + /* No explicit GC here — every top-level statement (run_piped + * / repl) finishes with a ray_heap_gc() that catches the + * freed intermediates anyway. The inner call was double- + * counting on benchmark loops where the same query runs + * back-to-back. */ ray_release(rng); return sliced; } if (ray_is_vec(tv) && (tv->type == RAY_I64 || tv->type == RAY_I32) && tv->len == 2) { ray_t* sliced = ray_take_fn(result, tv); ray_release(result); - ray_heap_gc(); ray_release(tv); return sliced; } @@ -671,7 +534,9 @@ static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n, } if (topk && !RAY_IS_ERR(topk)) { ray_release(result); - ray_heap_gc(); + /* No explicit GC — the top-level statement + * runner's ray_heap_gc() reclaims the freed + * intermediates one call later. 
*/ return topk; } if (topk && RAY_IS_ERR(topk)) ray_release(topk); @@ -1634,1260 +1499,6 @@ static int atom_i64_const(ray_t* v, int64_t* out) { } } -typedef struct { - const void* base; - int8_t type; - uint8_t attrs; - int op; - int64_t rhs; -} xbar_count_clause_t; - -typedef struct { - int64_t key; - int64_t count; -} xbar_count_pair_t; - -typedef struct { - uint32_t key; - uint32_t count; -} i16x2_count_pair_t; - -typedef struct { - int32_t key; - uint32_t count; -} i32_count_pair_t; - -typedef struct { - int16_t key; - uint32_t count; -} i16_count_pair_t; - -typedef struct { - const int64_t* key_data; - int64_t bucket; - xbar_count_clause_t clauses[16]; - uint8_t n_clauses; - uint32_t cap; - int64_t* keys; - uint32_t* counts; - uint8_t* used; - _Atomic int overflow; -} xbar_count_ctx_t; - -typedef struct { - const int16_t* key0; - const int16_t* key1; - xbar_count_clause_t clauses[16]; - uint8_t n_clauses; - uint32_t cap; - uint32_t* keys; - uint32_t* counts; - uint8_t* used; - _Atomic int overflow; -} i16x2_count_ctx_t; - -typedef struct { - const int16_t* key; - uint32_t* counts; -} i16_ne0_count_ctx_t; - -typedef struct { - const int32_t* group; - const int64_t* distinct; - uint32_t cap; - int32_t* groups; - int64_t* values; - uint8_t* used; - _Atomic int overflow; -} i32_i64_cd_ctx_t; - -static int xbar_count_pair_cmp(const void* a, const void* b) { - const xbar_count_pair_t* pa = (const xbar_count_pair_t*)a; - const xbar_count_pair_t* pb = (const xbar_count_pair_t*)b; - return (pa->key > pb->key) - (pa->key < pb->key); -} - -static int i16x2_count_pair_desc_cmp(const void* a, const void* b) { - const i16x2_count_pair_t* pa = (const i16x2_count_pair_t*)a; - const i16x2_count_pair_t* pb = (const i16x2_count_pair_t*)b; - if (pa->count != pb->count) - return (pa->count < pb->count) - (pa->count > pb->count); - return (pa->key > pb->key) - (pa->key < pb->key); -} - -static int i32_count_pair_desc_cmp(const void* a, const void* b) { - const i32_count_pair_t* pa 
= (const i32_count_pair_t*)a; - const i32_count_pair_t* pb = (const i32_count_pair_t*)b; - if (pa->count != pb->count) - return (pa->count < pb->count) - (pa->count > pb->count); - return (pa->key > pb->key) - (pa->key < pb->key); -} - -static int i16_count_pair_desc_cmp(const void* a, const void* b) { - const i16_count_pair_t* pa = (const i16_count_pair_t*)a; - const i16_count_pair_t* pb = (const i16_count_pair_t*)b; - if (pa->count != pb->count) - return (pa->count < pb->count) - (pa->count > pb->count); - return (pa->key > pb->key) - (pa->key < pb->key); -} - -static uint64_t xbar_count_hash_i64(int64_t v) { - uint64_t h = (uint64_t)v; - h ^= h >> 33; - h *= 0xff51afd7ed558ccdULL; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53ULL; - h ^= h >> 33; - return h; -} - -static uint32_t count_hash_u32(uint32_t v) { - uint32_t h = v; - h ^= h >> 16; - h *= 0x7feb352dU; - h ^= h >> 15; - h *= 0x846ca68bU; - h ^= h >> 16; - return h; -} - -static uint64_t count_hash_i32_i64(int32_t g, int64_t v) { - uint64_t h = (uint64_t)(uint32_t)g * 0x9E3779B97F4A7C15ULL; - uint64_t x = (uint64_t)v; - x ^= x >> 33; - x *= 0xff51afd7ed558ccdULL; - x ^= x >> 33; - h ^= x + 0xBF58476D1CE4E5B9ULL + (h << 6) + (h >> 2); - h ^= h >> 33; - return h; -} - -static void xbar_count_worker_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t end) { - xbar_count_ctx_t* ctx = (xbar_count_ctx_t*)raw; - uint32_t cap = ctx->cap; - uint32_t mask = cap - 1u; - int64_t* keys = ctx->keys + (size_t)worker_id * cap; - uint32_t* counts = ctx->counts + (size_t)worker_id * cap; - uint8_t* used = ctx->used + (size_t)worker_id * cap; - int64_t n_groups = 0; - int64_t bucket = ctx->bucket; - - for (int64_t r = start; r < end; r++) { - uint8_t pass = 1; - for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) { - const xbar_count_clause_t* c = &ctx->clauses[ci]; - int64_t v = read_col_i64(c->base, r, c->type, c->attrs); - if (c->op == 1) pass &= (uint8_t)(v == c->rhs); - else if (c->op == 2) pass &= (uint8_t)(v >= 
c->rhs); - else pass &= (uint8_t)(v <= c->rhs); - if (!pass) break; - } - if (!pass) continue; - int64_t ts = ctx->key_data[r]; - int64_t q = ts / bucket; - if ((ts ^ bucket) < 0 && q * bucket != ts) q--; - int64_t k = q * bucket; - uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask; - while (used[slot] && keys[slot] != k) - slot = (slot + 1u) & mask; - if (!used[slot]) { - if (n_groups >= (int64_t)(cap / 2)) { - atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed); - return; - } - used[slot] = 1; - keys[slot] = k; - n_groups++; - } - counts[slot]++; - } -} - -static void i16x2_count_worker_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t end) { - i16x2_count_ctx_t* ctx = (i16x2_count_ctx_t*)raw; - uint32_t cap = ctx->cap; - uint32_t mask = cap - 1u; - uint32_t* keys = ctx->keys + (size_t)worker_id * cap; - uint32_t* counts = ctx->counts + (size_t)worker_id * cap; - uint8_t* used = ctx->used + (size_t)worker_id * cap; - int64_t n_groups = 0; - - for (int64_t r = start; r < end; r++) { - uint8_t pass = 1; - for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) { - const xbar_count_clause_t* c = &ctx->clauses[ci]; - int64_t v = read_col_i64(c->base, r, c->type, c->attrs); - if (c->op == 1) pass &= (uint8_t)(v == c->rhs); - else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs); - else pass &= (uint8_t)(v <= c->rhs); - if (!pass) break; - } - if (!pass) continue; - uint32_t k = ((uint32_t)(uint16_t)ctx->key0[r] << 16) | - (uint32_t)(uint16_t)ctx->key1[r]; - uint32_t slot = count_hash_u32(k) & mask; - while (used[slot] && keys[slot] != k) - slot = (slot + 1u) & mask; - if (!used[slot]) { - if (n_groups >= (int64_t)(cap / 2)) { - atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed); - return; - } - used[slot] = 1; - keys[slot] = k; - n_groups++; - } - counts[slot]++; - } -} - -static void i16_ne0_count_worker_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t end) { - i16_ne0_count_ctx_t* ctx = (i16_ne0_count_ctx_t*)raw; - uint32_t* 
counts = ctx->counts + (size_t)worker_id * 65536u; - const int16_t* key = ctx->key; - for (int64_t r = start; r < end; r++) { - int16_t v = key[r]; - if (v) - counts[(uint32_t)((int32_t)v + 32768)]++; - } -} - -static void i32_i64_cd_worker_fn(void* raw, uint32_t worker_id, - int64_t start, int64_t end) { - i32_i64_cd_ctx_t* ctx = (i32_i64_cd_ctx_t*)raw; - uint32_t cap = ctx->cap; - uint32_t mask = cap - 1u; - int32_t* groups = ctx->groups + (size_t)worker_id * cap; - int64_t* values = ctx->values + (size_t)worker_id * cap; - uint8_t* used = ctx->used + (size_t)worker_id * cap; - int64_t n_filled = 0; - - for (int64_t r = start; r < end; r++) { - int32_t g = ctx->group[r]; - int64_t v = ctx->distinct[r]; - uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & mask; - while (used[slot] && (groups[slot] != g || values[slot] != v)) - slot = (slot + 1u) & mask; - if (!used[slot]) { - if (n_filled >= (int64_t)(cap * 7u / 10u)) { - atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed); - return; - } - used[slot] = 1; - groups[slot] = g; - values[slot] = v; - n_filled++; - } - } -} - -static int sym_name_eq(int64_t sym, const char* name, size_t len) { - ray_t* s = ray_sym_str(sym); - return s && ray_str_len(s) == len && - memcmp(ray_str_ptr(s), name, len) == 0; -} - -static int parse_xbar_count_clause(ray_t* tbl, ray_t* expr, - xbar_count_clause_t* clauses, - uint8_t* n_clauses) { - if (!expr || expr->type != RAY_LIST || ray_len(expr) < 3) return 0; - ray_t** elems = (ray_t**)ray_data(expr); - if (!elems[0] || elems[0]->type != -RAY_SYM) return 0; - ray_t* head = ray_sym_str(elems[0]->i64); - if (!head) return 0; - const char* hn = ray_str_ptr(head); - size_t hl = ray_str_len(head); - if (hl == 3 && memcmp(hn, "and", 3) == 0) { - for (int64_t i = 1; i < ray_len(expr); i++) - if (!parse_xbar_count_clause(tbl, elems[i], clauses, n_clauses)) - return 0; - return 1; - } - if (ray_len(expr) != 3 || *n_clauses >= 16) return 0; - int op = 0; - if (hl == 2 && memcmp(hn, 
"==", 2) == 0) op = 1; - else if (hl == 2 && memcmp(hn, ">=", 2) == 0) op = 2; - else if (hl == 2 && memcmp(hn, "<=", 2) == 0) op = 3; - else return 0; - - ray_t* lhs = elems[1]; - ray_t* rhs = elems[2]; - int64_t rhs_i = 0; - if (!lhs || lhs->type != -RAY_SYM || !(lhs->attrs & RAY_ATTR_NAME) || - !atom_i64_const(rhs, &rhs_i)) - return 0; - ray_t* col = ray_table_get_col(tbl, lhs->i64); - if (!col || !ray_is_vec(col) || RAY_IS_PARTED(col->type) || - col->type == RAY_MAPCOMMON || (col->attrs & RAY_ATTR_HAS_NULLS)) - return 0; - int8_t ct = col->type; - if (ct != RAY_BOOL && ct != RAY_U8 && ct != RAY_I16 && - ct != RAY_I32 && ct != RAY_I64 && ct != RAY_DATE && - ct != RAY_TIME && ct != RAY_TIMESTAMP) - return 0; - clauses[*n_clauses] = (xbar_count_clause_t){ - .base = ray_data(col), - .type = ct, - .attrs = col->attrs, - .op = op, - .rhs = rhs_i, - }; - (*n_clauses)++; - return 1; -} - -static int count_clause_score(const xbar_count_clause_t* c) { - if (c->op == 1 && ray_sym_elem_size(c->type, c->attrs) >= 8) return 0; - if (c->op == 1) return 1; - return 2; -} - -static void order_count_clauses(xbar_count_clause_t* clauses, uint8_t n) { - for (uint8_t i = 1; i < n; i++) { - xbar_count_clause_t v = clauses[i]; - int vs = count_clause_score(&v); - uint8_t j = i; - while (j > 0 && count_clause_score(&clauses[j - 1]) > vs) { - clauses[j] = clauses[j - 1]; - j--; - } - clauses[j] = v; - } -} - -static int xbar_clause_cache_eq(const xbar_count_clause_t* a, uint8_t an, - const xbar_count_clause_t* b, uint8_t bn) { - if (an != bn) return 0; - for (uint8_t i = 0; i < an; i++) { - if (a[i].base != b[i].base || a[i].type != b[i].type || - a[i].attrs != b[i].attrs || a[i].op != b[i].op || - a[i].rhs != b[i].rhs) - return 0; - } - return 1; -} - -static int match_i16_key_ne_zero(ray_t* where_expr, int64_t key_sym) { - if (!where_expr || where_expr->type != RAY_LIST || ray_len(where_expr) != 3) - return 0; - ray_t** e = (ray_t**)ray_data(where_expr); - if (!e[0] || e[0]->type != 
-RAY_SYM || - !sym_name_eq(e[0]->i64, "!=", 2)) - return 0; - ray_t* lhs = e[1]; - int64_t rhs = 0; - return lhs && lhs->type == -RAY_SYM && (lhs->attrs & RAY_ATTR_NAME) && - lhs->i64 == key_sym && atom_i64_const(e[2], &rhs) && rhs == 0; -} - -static ray_t* try_i16_ne0_count_desc_select(ray_t* tbl, ray_t* where_expr, - ray_t* by_expr, ray_t* take_expr, - ray_t** dict_elems, - int64_t dict_n, - int64_t from_id, - int64_t where_id, - int64_t by_id, - int64_t take_id, - int64_t asc_id, - int64_t desc_id, - int64_t nearest_id) { - if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr || - !take_expr || by_expr->type != -RAY_SYM || - !(by_expr->attrs & RAY_ATTR_NAME)) - return NULL; - int64_t key_sym = by_expr->i64; - int64_t take_n = 0; - if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024) - return NULL; - if (!match_i16_key_ne_zero(where_expr, key_sym)) - return NULL; - - int64_t count_alias = -1; - int saw_desc = 0; - int saw_key_projection = 0; - for (int64_t i = 0; i + 1 < dict_n; i += 2) { - int64_t kid = dict_elems[i]->i64; - ray_t* v = dict_elems[i + 1]; - if (kid == from_id || kid == where_id || kid == by_id || - kid == take_id || kid == nearest_id) - continue; - if (kid == desc_id) { - if (!v || v->type != -RAY_SYM) - return NULL; - saw_desc = 1; - continue; - } - if (kid == asc_id) return NULL; - if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) && - kid == key_sym && v->i64 == key_sym) { - saw_key_projection = 1; - continue; - } - if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2) - return NULL; - ray_t** ae = (ray_t**)ray_data(v); - if (!ae[0] || ae[0]->type != -RAY_SYM || - !sym_name_eq(ae[0]->i64, "count", 5)) - return NULL; - ray_t* arg = ae[1]; - if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME) || - arg->i64 != key_sym) - return NULL; - count_alias = kid; - } - if (!saw_desc || !saw_key_projection || count_alias < 0) - return NULL; - - ray_t* col = ray_table_get_col(tbl, 
key_sym); - if (!col || !ray_is_vec(col) || col->type != RAY_I16 || - (col->attrs & RAY_ATTR_HAS_NULLS)) - return NULL; - - static ray_t* cache_result = NULL; - static ray_t* cache_tbl = NULL; - static ray_t* cache_col = NULL; - static int64_t cache_len = -1; - static int64_t cache_key_sym = -1; - static int64_t cache_count_alias = -1; - static int64_t cache_take = -1; - if (cache_result && cache_tbl == tbl && cache_col == col && - cache_len == col->len && cache_key_sym == key_sym && - cache_count_alias == count_alias && cache_take == take_n) { - ray_retain(cache_result); - return cache_result; - } - - ray_pool_t* pool = ray_pool_get(); - uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; - if (nw == 0) nw = 1; - ray_t* counts_hdr = NULL; - uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, - (size_t)nw * 65536u * sizeof(uint32_t)); - if (!counts) - return ray_error("oom", NULL); - - i16_ne0_count_ctx_t ctx = { - .key = (const int16_t*)ray_data(col), - .counts = counts, - }; - int64_t nrows = ray_table_nrows(tbl); - if (pool && nrows >= RAY_PARALLEL_THRESHOLD) - ray_pool_dispatch(pool, i16_ne0_count_worker_fn, &ctx, nrows); - else - i16_ne0_count_worker_fn(&ctx, 0, 0, nrows); - - i16_count_pair_t top[1024]; - int64_t top_n = 0; - for (uint32_t s = 0; s < 65536u; s++) { - uint32_t total = 0; - for (uint32_t w = 0; w < nw; w++) - total += counts[(size_t)w * 65536u + s]; - if (!total) continue; - i16_count_pair_t cand = { - .key = (int16_t)((int32_t)s - 32768), - .count = total, - }; - if (top_n < take_n) { - top[top_n++] = cand; - continue; - } - int64_t min_i = 0; - for (int64_t i = 1; i < top_n; i++) { - if (top[i].count < top[min_i].count || - (top[i].count == top[min_i].count && top[i].key > top[min_i].key)) - min_i = i; - } - if (cand.count > top[min_i].count || - (cand.count == top[min_i].count && cand.key < top[min_i].key)) - top[min_i] = cand; - } - scratch_free(counts_hdr); - qsort(top, (size_t)top_n, sizeof(i16_count_pair_t), - 
i16_count_pair_desc_cmp); - - int64_t out_n = top_n; - ray_t* key_out = ray_vec_new(RAY_I16, out_n); - ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); - if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) { - if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out); - if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); - return ray_error("oom", NULL); - } - key_out->len = out_n; - cnt_out->len = out_n; - int16_t* ko = (int16_t*)ray_data(key_out); - int64_t* co = (int64_t*)ray_data(cnt_out); - for (int64_t i = 0; i < out_n; i++) { - ko[i] = top[i].key; - co[i] = (int64_t)top[i].count; - } - - ray_t* out = ray_table_new(2); - if (!out || RAY_IS_ERR(out)) { - ray_release(key_out); ray_release(cnt_out); - return out ? out : ray_error("oom", NULL); - } - out = ray_table_add_col(out, key_sym, key_out); - out = ray_table_add_col(out, count_alias, cnt_out); - ray_release(key_out); ray_release(cnt_out); - if (cache_result) - ray_release(cache_result); - cache_result = out; - cache_tbl = tbl; - cache_col = col; - cache_len = col->len; - cache_key_sym = key_sym; - cache_count_alias = count_alias; - cache_take = take_n; - ray_retain(cache_result); - return out; -} - -static ray_t* try_i32_i64_count_distinct_select(ray_t* tbl, ray_t* where_expr, - ray_t* by_expr, - ray_t* take_expr, - ray_t** dict_elems, - int64_t dict_n, - int64_t from_id, - int64_t where_id, - int64_t by_id, - int64_t take_id, - int64_t asc_id, - int64_t desc_id, - int64_t nearest_id) { - if (!tbl || tbl->type != RAY_TABLE || where_expr || !by_expr || - !take_expr || by_expr->type != -RAY_SYM || - !(by_expr->attrs & RAY_ATTR_NAME)) - return NULL; - - int64_t take_n = 0; - if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024) - return NULL; - - int64_t group_sym = by_expr->i64; - int64_t distinct_sym = -1; - int64_t count_alias = -1; - int saw_desc = 0; - int saw_group_projection = 0; - for (int64_t i = 0; i + 1 < dict_n; i += 2) { - int64_t kid = 
dict_elems[i]->i64; - ray_t* v = dict_elems[i + 1]; - if (kid == from_id || kid == where_id || kid == by_id || - kid == take_id || kid == nearest_id) - continue; - if (kid == desc_id) { - if (!v || v->type != -RAY_SYM) - return NULL; - saw_desc = 1; - continue; - } - if (kid == asc_id) return NULL; - if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) && - kid == group_sym && v->i64 == group_sym) { - saw_group_projection = 1; - continue; - } - if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2) - return NULL; - ray_t** ae = (ray_t**)ray_data(v); - if (!ae[0] || ae[0]->type != -RAY_SYM || - !sym_name_eq(ae[0]->i64, "count", 5)) - return NULL; - ray_t* inner = ae[1]; - if (!inner || inner->type != RAY_LIST || ray_len(inner) != 2) - return NULL; - ray_t** ie = (ray_t**)ray_data(inner); - if (!ie[0] || ie[0]->type != -RAY_SYM || - !sym_name_eq(ie[0]->i64, "distinct", 8)) - return NULL; - ray_t* arg = ie[1]; - if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME)) - return NULL; - distinct_sym = arg->i64; - count_alias = kid; - } - if (!saw_desc || !saw_group_projection || count_alias < 0 || - distinct_sym < 0) - return NULL; - - ray_t* gcol = ray_table_get_col(tbl, group_sym); - ray_t* dcol = ray_table_get_col(tbl, distinct_sym); - if (!gcol || !dcol || !ray_is_vec(gcol) || !ray_is_vec(dcol) || - gcol->type != RAY_I32 || dcol->type != RAY_I64 || - (gcol->attrs & RAY_ATTR_HAS_NULLS) || - (dcol->attrs & RAY_ATTR_HAS_NULLS)) - return NULL; - - static ray_t* cache_result = NULL; - static ray_t* cache_tbl = NULL; - static int64_t cache_len = -1; - static int64_t cache_group_sym = -1; - static int64_t cache_distinct_sym = -1; - static int64_t cache_count_alias = -1; - static int64_t cache_take = -1; - if (cache_result && cache_tbl == tbl && cache_len == gcol->len && - cache_group_sym == group_sym && cache_distinct_sym == distinct_sym && - cache_count_alias == count_alias && cache_take == take_n) { - ray_retain(cache_result); - return 
cache_result; - } - - int64_t nrows = ray_table_nrows(tbl); - ray_pool_t* pool = ray_pool_get(); - uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; - if (nw == 0) nw = 1; - const uint32_t local_cap = 1u << 20; - ray_t *lg_hdr = NULL, *lv_hdr = NULL, *lu_hdr = NULL; - int32_t* lg = (int32_t*)scratch_calloc(&lg_hdr, - (size_t)nw * local_cap * sizeof(int32_t)); - int64_t* lv = (int64_t*)scratch_calloc(&lv_hdr, - (size_t)nw * local_cap * sizeof(int64_t)); - uint8_t* lu = (uint8_t*)scratch_calloc(&lu_hdr, (size_t)nw * local_cap); - if (!lg || !lv || !lu) { - if (lg_hdr) scratch_free(lg_hdr); - if (lv_hdr) scratch_free(lv_hdr); - if (lu_hdr) scratch_free(lu_hdr); - return ray_error("oom", NULL); - } - - i32_i64_cd_ctx_t ctx = { - .group = (const int32_t*)ray_data(gcol), - .distinct = (const int64_t*)ray_data(dcol), - .cap = local_cap, - .groups = lg, - .values = lv, - .used = lu, - }; - atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed); - if (pool && nrows >= RAY_PARALLEL_THRESHOLD) - ray_pool_dispatch(pool, i32_i64_cd_worker_fn, &ctx, nrows); - else - i32_i64_cd_worker_fn(&ctx, 0, 0, nrows); - if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) { - scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); - return NULL; - } - - const uint32_t gcap = 1u << 23; - const uint32_t gmask = gcap - 1u; - ray_t *gg_hdr = NULL, *gv_hdr = NULL, *gu_hdr = NULL; - int32_t* gg = (int32_t*)scratch_calloc(&gg_hdr, (size_t)gcap * sizeof(int32_t)); - int64_t* gv = (int64_t*)scratch_calloc(&gv_hdr, (size_t)gcap * sizeof(int64_t)); - uint8_t* gu = (uint8_t*)scratch_calloc(&gu_hdr, (size_t)gcap); - if (!gg || !gv || !gu) { - scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); - if (gg_hdr) scratch_free(gg_hdr); - if (gv_hdr) scratch_free(gv_hdr); - if (gu_hdr) scratch_free(gu_hdr); - return ray_error("oom", NULL); - } - - int64_t global_n = 0; - for (uint32_t w = 0; w < nw; w++) { - int32_t* wg = lg + (size_t)w * local_cap; - int64_t* wv = 
lv + (size_t)w * local_cap; - uint8_t* wu = lu + (size_t)w * local_cap; - for (uint32_t s = 0; s < local_cap; s++) { - if (!wu[s]) continue; - int32_t g = wg[s]; - int64_t v = wv[s]; - uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & gmask; - while (gu[slot] && (gg[slot] != g || gv[slot] != v)) - slot = (slot + 1u) & gmask; - if (!gu[slot]) { - if (global_n >= (int64_t)(gcap * 7u / 10u)) { - scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr); - scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); - return NULL; - } - gu[slot] = 1; - gg[slot] = g; - gv[slot] = v; - global_n++; - } - } - } - scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); - - const uint32_t rcap = 4096; - const uint32_t rmask = rcap - 1u; - int32_t rkeys[4096]; - uint32_t rcounts[4096]; - uint8_t rused[4096]; - memset(rused, 0, sizeof(rused)); - int64_t region_n = 0; - for (uint32_t s = 0; s < gcap; s++) { - if (!gu[s]) continue; - int32_t g = gg[s]; - uint32_t slot = count_hash_u32((uint32_t)g) & rmask; - while (rused[slot] && rkeys[slot] != g) - slot = (slot + 1u) & rmask; - if (!rused[slot]) { - if (region_n >= (int64_t)(rcap / 2)) { - scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr); - return NULL; - } - rused[slot] = 1; - rkeys[slot] = g; - rcounts[slot] = 0; - region_n++; - } - rcounts[slot]++; - } - scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr); - - ray_t* pairs_hdr = NULL; - i32_count_pair_t* pairs = (i32_count_pair_t*)scratch_alloc( - &pairs_hdr, (size_t)region_n * sizeof(i32_count_pair_t)); - if (!pairs && region_n > 0) - return ray_error("oom", NULL); - int64_t pi = 0; - for (uint32_t s = 0; s < rcap; s++) { - if (!rused[s]) continue; - pairs[pi++] = (i32_count_pair_t){ .key = rkeys[s], .count = rcounts[s] }; - } - qsort(pairs, (size_t)region_n, sizeof(i32_count_pair_t), - i32_count_pair_desc_cmp); - - int64_t out_n = region_n < take_n ? 
region_n : take_n; - ray_t* key_out = ray_vec_new(RAY_I32, out_n); - ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); - if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) { - if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out); - if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); - scratch_free(pairs_hdr); - return ray_error("oom", NULL); - } - key_out->len = out_n; - cnt_out->len = out_n; - int32_t* ko = (int32_t*)ray_data(key_out); - int64_t* co = (int64_t*)ray_data(cnt_out); - for (int64_t i = 0; i < out_n; i++) { - ko[i] = pairs[i].key; - co[i] = (int64_t)pairs[i].count; - } - scratch_free(pairs_hdr); - - ray_t* out = ray_table_new(2); - if (!out || RAY_IS_ERR(out)) { - ray_release(key_out); ray_release(cnt_out); - return out ? out : ray_error("oom", NULL); - } - out = ray_table_add_col(out, group_sym, key_out); - out = ray_table_add_col(out, count_alias, cnt_out); - ray_release(key_out); ray_release(cnt_out); - if (cache_result) - ray_release(cache_result); - cache_result = out; - cache_tbl = tbl; - cache_len = gcol->len; - cache_group_sym = group_sym; - cache_distinct_sym = distinct_sym; - cache_count_alias = count_alias; - cache_take = take_n; - ray_retain(cache_result); - return out; -} - -static ray_t* try_i16x2_count_desc_select(ray_t* tbl, ray_t* where_expr, - ray_t* by_expr, ray_t* take_expr, - ray_t** dict_elems, int64_t dict_n, - int64_t from_id, int64_t where_id, - int64_t by_id, int64_t take_id, - int64_t asc_id, int64_t desc_id, - int64_t nearest_id) { - if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr || - !take_expr || by_expr->type != RAY_DICT) - return NULL; - - int64_t take_n = 0; - if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000) - return NULL; - - DICT_VIEW_DECL(bv); - DICT_VIEW_OPEN(by_expr, bv); - if (DICT_VIEW_OVERFLOW(bv) || bv_n != 4) return NULL; - ray_t* key0_atom = bv[0]; - ray_t* key0_val = bv[1]; - ray_t* key1_atom = bv[2]; - ray_t* key1_val = bv[3]; - if 
(!key0_atom || key0_atom->type != -RAY_SYM || - !key1_atom || key1_atom->type != -RAY_SYM || - !key0_val || key0_val->type != -RAY_SYM || - !key1_val || key1_val->type != -RAY_SYM || - !(key0_val->attrs & RAY_ATTR_NAME) || - !(key1_val->attrs & RAY_ATTR_NAME) || - key0_atom->i64 != key0_val->i64 || - key1_atom->i64 != key1_val->i64) - return NULL; - - int64_t count_alias = -1; - int saw_desc = 0; - for (int64_t i = 0; i + 1 < dict_n; i += 2) { - int64_t kid = dict_elems[i]->i64; - ray_t* v = dict_elems[i + 1]; - if (kid == from_id || kid == where_id || kid == by_id || - kid == take_id || kid == nearest_id) - continue; - if (kid == desc_id) { - if (!v || v->type != -RAY_SYM) - return NULL; - saw_desc = 1; - continue; - } - if (kid == asc_id) return NULL; - if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL; - ray_t** ae = (ray_t**)ray_data(v); - if (!ae[0] || ae[0]->type != -RAY_SYM || - !sym_name_eq(ae[0]->i64, "count", 5)) - return NULL; - count_alias = kid; - } - if (!saw_desc || count_alias < 0) return NULL; - - ray_t* col0 = ray_table_get_col(tbl, key0_atom->i64); - ray_t* col1 = ray_table_get_col(tbl, key1_atom->i64); - if (!col0 || !col1 || !ray_is_vec(col0) || !ray_is_vec(col1) || - col0->type != RAY_I16 || col1->type != RAY_I16 || - (col0->attrs & RAY_ATTR_HAS_NULLS) || - (col1->attrs & RAY_ATTR_HAS_NULLS)) - return NULL; - - xbar_count_clause_t clauses[16]; - uint8_t n_clauses = 0; - if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) || - n_clauses == 0) - return NULL; - order_count_clauses(clauses, n_clauses); - - static ray_t* cache_result = NULL; - static ray_t* cache_tbl = NULL; - static ray_t* cache_col0 = NULL; - static ray_t* cache_col1 = NULL; - static int64_t cache_len = -1; - static int64_t cache_key0 = -1; - static int64_t cache_key1 = -1; - static int64_t cache_count_alias = -1; - static int64_t cache_take = -1; - static uint8_t cache_n_clauses = 0; - static xbar_count_clause_t cache_clauses[16]; - if (cache_result 
&& cache_tbl == tbl && cache_col0 == col0 && - cache_col1 == col1 && cache_len == col0->len && - cache_key0 == key0_atom->i64 && cache_key1 == key1_atom->i64 && - cache_count_alias == count_alias && cache_take == take_n && - xbar_clause_cache_eq(cache_clauses, cache_n_clauses, - clauses, n_clauses)) { - ray_retain(cache_result); - return cache_result; - } - - int64_t nrows = ray_table_nrows(tbl); - const uint32_t cap = 4096; - const uint32_t mask = cap - 1u; - ray_pool_t* pool = ray_pool_get(); - uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; - if (nw == 0) nw = 1; - - ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL; - uint32_t* keys = (uint32_t*)scratch_calloc(&keys_hdr, - (size_t)nw * cap * sizeof(uint32_t)); - uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, - (size_t)nw * cap * sizeof(uint32_t)); - uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap); - if (!keys || !counts || !used) { - if (keys_hdr) scratch_free(keys_hdr); - if (counts_hdr) scratch_free(counts_hdr); - if (used_hdr) scratch_free(used_hdr); - return ray_error("oom", NULL); - } - - i16x2_count_ctx_t ctx = { - .key0 = (const int16_t*)ray_data(col0), - .key1 = (const int16_t*)ray_data(col1), - .n_clauses = n_clauses, - .cap = cap, - .keys = keys, - .counts = counts, - .used = used, - }; - memcpy(ctx.clauses, clauses, sizeof(clauses)); - atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed); - if (pool && nrows >= RAY_PARALLEL_THRESHOLD) - ray_pool_dispatch(pool, i16x2_count_worker_fn, &ctx, nrows); - else - i16x2_count_worker_fn(&ctx, 0, 0, nrows); - if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) { - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return NULL; - } - - ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL; - uint32_t* mkeys = (uint32_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(uint32_t)); - uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * 
sizeof(uint32_t)); - uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap); - if (!mkeys || !mcounts || !mused) { - scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); - if (mkeys_hdr) scratch_free(mkeys_hdr); - if (mcounts_hdr) scratch_free(mcounts_hdr); - if (mused_hdr) scratch_free(mused_hdr); - return ray_error("oom", NULL); - } - - int64_t n_groups = 0; - for (uint32_t w = 0; w < nw; w++) { - uint32_t* wk = keys + (size_t)w * cap; - uint32_t* wc = counts + (size_t)w * cap; - uint8_t* wu = used + (size_t)w * cap; - for (uint32_t s = 0; s < cap; s++) { - if (!wu[s]) continue; - uint32_t k = wk[s]; - uint32_t slot = count_hash_u32(k) & mask; - while (mused[slot] && mkeys[slot] != k) - slot = (slot + 1u) & mask; - if (!mused[slot]) { - if (n_groups >= (int64_t)(cap / 2)) { - scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); - scratch_free(mused_hdr); scratch_free(keys_hdr); - scratch_free(counts_hdr); scratch_free(used_hdr); - return NULL; - } - mused[slot] = 1; - mkeys[slot] = k; - n_groups++; - } - mcounts[slot] += wc[s]; - } - } - - int64_t out_n = n_groups < take_n ? 
n_groups : take_n; - ray_t* pairs_hdr = NULL; - i16x2_count_pair_t* pairs = (i16x2_count_pair_t*)scratch_alloc( - &pairs_hdr, (size_t)n_groups * sizeof(i16x2_count_pair_t)); - if (!pairs && n_groups > 0) { - scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr); - scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); - return ray_error("oom", NULL); - } - int64_t pi = 0; - for (uint32_t s = 0; s < cap; s++) { - if (!mused[s]) continue; - pairs[pi++] = (i16x2_count_pair_t){ .key = mkeys[s], .count = mcounts[s] }; - } - qsort(pairs, (size_t)n_groups, sizeof(i16x2_count_pair_t), - i16x2_count_pair_desc_cmp); - - ray_t* key0_out = ray_vec_new(RAY_I16, out_n); - ray_t* key1_out = ray_vec_new(RAY_I16, out_n); - ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); - if (!key0_out || !key1_out || !cnt_out || - RAY_IS_ERR(key0_out) || RAY_IS_ERR(key1_out) || RAY_IS_ERR(cnt_out)) { - if (key0_out && !RAY_IS_ERR(key0_out)) ray_release(key0_out); - if (key1_out && !RAY_IS_ERR(key1_out)) ray_release(key1_out); - if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); - scratch_free(pairs_hdr); - scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr); - scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); - return ray_error("oom", NULL); - } - key0_out->len = out_n; - key1_out->len = out_n; - cnt_out->len = out_n; - int16_t* k0o = (int16_t*)ray_data(key0_out); - int16_t* k1o = (int16_t*)ray_data(key1_out); - int64_t* co = (int64_t*)ray_data(cnt_out); - for (int64_t i = 0; i < out_n; i++) { - uint32_t k = pairs[i].key; - k0o[i] = (int16_t)(uint16_t)(k >> 16); - k1o[i] = (int16_t)(uint16_t)k; - co[i] = (int64_t)pairs[i].count; - } - scratch_free(pairs_hdr); - scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr); - scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); - - ray_t* out = ray_table_new(3); - if (!out || RAY_IS_ERR(out)) { - ray_release(key0_out); 
ray_release(key1_out); ray_release(cnt_out); - return out ? out : ray_error("oom", NULL); - } - out = ray_table_add_col(out, key0_atom->i64, key0_out); - out = ray_table_add_col(out, key1_atom->i64, key1_out); - out = ray_table_add_col(out, count_alias, cnt_out); - ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out); - if (cache_result) - ray_release(cache_result); - cache_result = out; - cache_tbl = tbl; - cache_col0 = col0; - cache_col1 = col1; - cache_len = col0->len; - cache_key0 = key0_atom->i64; - cache_key1 = key1_atom->i64; - cache_count_alias = count_alias; - cache_take = take_n; - cache_n_clauses = n_clauses; - memcpy(cache_clauses, clauses, sizeof(clauses)); - ray_retain(cache_result); - return out; -} - -static ray_t* try_xbar_count_select(ray_t* tbl, ray_t* where_expr, - ray_t* by_expr, ray_t* take_expr, - ray_t** dict_elems, int64_t dict_n, - int64_t from_id, int64_t where_id, - int64_t by_id, int64_t take_id, - int64_t asc_id, int64_t desc_id, - int64_t nearest_id) { - if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr || - !take_expr) - return NULL; - - int64_t take_n = 0; - if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000) - return NULL; - - if (!by_expr || by_expr->type != RAY_DICT) return NULL; - DICT_VIEW_DECL(bv); - DICT_VIEW_OPEN(by_expr, bv); - if (DICT_VIEW_OVERFLOW(bv) || bv_n != 2) return NULL; - ray_t* key_atom = bv[0]; - ray_t* xbar_expr = bv[1]; - if (!key_atom || key_atom->type != -RAY_SYM || - !xbar_expr || xbar_expr->type != RAY_LIST || - ray_len(xbar_expr) != 3) - return NULL; - ray_t** xe = (ray_t**)ray_data(xbar_expr); - if (!xe[0] || xe[0]->type != -RAY_SYM || - !sym_name_eq(xe[0]->i64, "xbar", 4)) - return NULL; - if (!xe[1] || xe[1]->type != -RAY_SYM || - !(xe[1]->attrs & RAY_ATTR_NAME)) - return NULL; - int64_t bucket = 0; - if (!atom_i64_const(xe[2], &bucket) || bucket <= 0) return NULL; - - int64_t count_alias = -1; - int saw_asc = 0; - for (int64_t i = 0; i + 1 < dict_n; i 
+= 2) { - int64_t kid = dict_elems[i]->i64; - ray_t* v = dict_elems[i + 1]; - if (kid == from_id || kid == where_id || kid == by_id || - kid == take_id || kid == nearest_id) - continue; - if (kid == asc_id) { - if (!v || v->type != -RAY_SYM || v->i64 != key_atom->i64) - return NULL; - saw_asc = 1; - continue; - } - if (kid == desc_id) return NULL; - if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL; - ray_t** ae = (ray_t**)ray_data(v); - if (!ae[0] || ae[0]->type != -RAY_SYM || - !sym_name_eq(ae[0]->i64, "count", 5)) - return NULL; - count_alias = kid; - } - if (!saw_asc || count_alias < 0) return NULL; - - ray_t* key_col = ray_table_get_col(tbl, xe[1]->i64); - if (!key_col || !ray_is_vec(key_col) || key_col->type != RAY_TIMESTAMP || - RAY_IS_PARTED(key_col->type) || key_col->type == RAY_MAPCOMMON || - (key_col->attrs & RAY_ATTR_HAS_NULLS)) - return NULL; - - xbar_count_clause_t clauses[16]; - uint8_t n_clauses = 0; - if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) || - n_clauses == 0) - return NULL; - order_count_clauses(clauses, n_clauses); - - int64_t nrows = ray_table_nrows(tbl); - const int64_t* key_data = (const int64_t*)ray_data(key_col); - static ray_t* cache_result = NULL; - static ray_t* cache_tbl = NULL; - static ray_t* cache_key_col = NULL; - static int64_t cache_len = -1; - static int64_t cache_key_sym = -1; - static int64_t cache_out_sym = -1; - static int64_t cache_count_alias = -1; - static int64_t cache_bucket = -1; - static int64_t cache_take = -1; - static uint8_t cache_n_clauses = 0; - static xbar_count_clause_t cache_clauses[16]; - if (cache_result && cache_tbl == tbl && cache_key_col == key_col && - cache_len == key_col->len && cache_key_sym == xe[1]->i64 && - cache_out_sym == key_atom->i64 && cache_count_alias == count_alias && - cache_bucket == bucket && cache_take == take_n && - xbar_clause_cache_eq(cache_clauses, cache_n_clauses, - clauses, n_clauses)) { - ray_retain(cache_result); - return cache_result; - } 
- const uint32_t cap = 4096; - const uint32_t mask = cap - 1u; - ray_pool_t* pool = ray_pool_get(); - uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; - if (nw == 0) nw = 1; - ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL; - int64_t* keys = (int64_t*)scratch_calloc(&keys_hdr, - (size_t)nw * cap * sizeof(int64_t)); - uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, - (size_t)nw * cap * sizeof(uint32_t)); - uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap); - if (!keys || !counts || !used) { - if (keys_hdr) scratch_free(keys_hdr); - if (counts_hdr) scratch_free(counts_hdr); - if (used_hdr) scratch_free(used_hdr); - return ray_error("oom", NULL); - } - - xbar_count_ctx_t ctx = { - .key_data = key_data, - .bucket = bucket, - .n_clauses = n_clauses, - .cap = cap, - .keys = keys, - .counts = counts, - .used = used, - }; - memcpy(ctx.clauses, clauses, sizeof(clauses)); - atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed); - if (pool && nrows >= RAY_PARALLEL_THRESHOLD) - ray_pool_dispatch(pool, xbar_count_worker_fn, &ctx, nrows); - else - xbar_count_worker_fn(&ctx, 0, 0, nrows); - if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) { - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return NULL; - } - - ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL; - int64_t* mkeys = (int64_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(int64_t)); - uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t)); - uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap); - if (!mkeys || !mcounts || !mused) { - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - if (mkeys_hdr) scratch_free(mkeys_hdr); - if (mcounts_hdr) scratch_free(mcounts_hdr); - if (mused_hdr) scratch_free(mused_hdr); - return ray_error("oom", NULL); - } - - int64_t n_groups = 0; - for (uint32_t w = 0; w < nw; w++) { - int64_t* wk = keys + (size_t)w * 
cap; - uint32_t* wc = counts + (size_t)w * cap; - uint8_t* wu = used + (size_t)w * cap; - for (uint32_t s = 0; s < cap; s++) { - if (!wu[s]) continue; - int64_t k = wk[s]; - uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask; - while (mused[slot] && mkeys[slot] != k) - slot = (slot + 1u) & mask; - if (!mused[slot]) { - if (n_groups >= (int64_t)(cap / 2)) { - scratch_free(mkeys_hdr); - scratch_free(mcounts_hdr); - scratch_free(mused_hdr); - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return NULL; - } - mused[slot] = 1; - mkeys[slot] = k; - n_groups++; - } - mcounts[slot] += wc[s]; - } - } - - int64_t out_n = n_groups < take_n ? n_groups : take_n; - ray_t* pairs_hdr = NULL; - xbar_count_pair_t* pairs = (xbar_count_pair_t*)scratch_alloc( - &pairs_hdr, (size_t)n_groups * sizeof(xbar_count_pair_t)); - if (!pairs && n_groups > 0) { - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return ray_error("oom", NULL); - } - int64_t pi = 0; - for (uint32_t s = 0; s < cap; s++) { - if (!mused[s]) continue; - pairs[pi++] = (xbar_count_pair_t){ .key = mkeys[s], .count = mcounts[s] }; - } - qsort(pairs, (size_t)n_groups, sizeof(xbar_count_pair_t), - xbar_count_pair_cmp); - - ray_t* key_out = ray_vec_new(RAY_TIMESTAMP, out_n); - ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); - if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) { - if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out); - if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); - scratch_free(pairs_hdr); - scratch_free(mkeys_hdr); - scratch_free(mcounts_hdr); - scratch_free(mused_hdr); - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - return ray_error("oom", NULL); - } - key_out->len = out_n; - cnt_out->len = out_n; - int64_t* ko = (int64_t*)ray_data(key_out); - int64_t* co = (int64_t*)ray_data(cnt_out); - for (int64_t i = 0; i < out_n; i++) { - ko[i] = pairs[i].key; - co[i] = 
pairs[i].count; - } - scratch_free(pairs_hdr); - scratch_free(mkeys_hdr); - scratch_free(mcounts_hdr); - scratch_free(mused_hdr); - scratch_free(keys_hdr); - scratch_free(counts_hdr); - scratch_free(used_hdr); - - ray_t* out = ray_table_new(2); - if (!out || RAY_IS_ERR(out)) { - ray_release(key_out); - ray_release(cnt_out); - return out ? out : ray_error("oom", NULL); - } - out = ray_table_add_col(out, key_atom->i64, key_out); - out = ray_table_add_col(out, count_alias, cnt_out); - ray_release(key_out); - ray_release(cnt_out); - if (cache_result) - ray_release(cache_result); - cache_result = out; - cache_tbl = tbl; - cache_key_col = key_col; - cache_len = key_col->len; - cache_key_sym = xe[1]->i64; - cache_out_sym = key_atom->i64; - cache_count_alias = count_alias; - cache_bucket = bucket; - cache_take = take_n; - cache_n_clauses = n_clauses; - memcpy(cache_clauses, clauses, sizeof(clauses)); - ray_retain(cache_result); - return out; -} - static int expr_affine_of_sym(ray_t* expr, int64_t sym, int64_t* bias) { if (!expr) return 0; if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME) && @@ -3123,22 +1734,39 @@ static bool match_group_desc_count_take(ray_t** dict_elems, int64_t dict_n, int64_t by_id, int64_t take_id, int64_t asc_id, int64_t desc_id, ray_group_emit_filter_t* out) { + /* Detects `(select … by … {asc|desc}: AGGCOL take: N)` where AGGCOL + * is the name of an output agg col with op ∈ {COUNT, SUM, MIN, MAX} + * and N is a positive atom ≤ 1024. Returns the filter pre-filled so + * the consumer (group/fused_group materialize) can heap-extract the + * top-N groups by AGGCOL.value before emitting rows. AVG and + * higher-order aggs (STDDEV/VAR/PEARSON/MEDIAN) fall through — their + * ordering doesn't reduce to a single int64 row slot read. + * + * The 1024 cap matches the stack-resident heap budget shared by the + * three concrete consumer sites (mk_apply_count_emit_filter, + * v2_emit's per-partition compact, the n_keys>1 macro path).
Larger + * N drops through to the full sort + take so the heap doesn't + * overflow the stack. */ ray_t* take_expr = NULL; - int64_t desc_name = -1; + int64_t order_name = -1; + uint8_t want_desc = 1; + bool seen_dir = false; for (int64_t i = 0; i + 1 < dict_n; i += 2) { int64_t kid = dict_elems[i]->i64; if (kid == take_id) take_expr = dict_elems[i + 1]; - else if (kid == desc_id) { + else if (kid == desc_id || kid == asc_id) { + if (seen_dir) return false; /* both asc: and desc: → ambiguous */ + seen_dir = true; ray_t* v = dict_elems[i + 1]; if (!v || v->type != -RAY_SYM) return false; - desc_name = v->i64; - } else if (kid == asc_id) { - return false; + order_name = v->i64; + want_desc = (kid == desc_id) ? 1 : 0; } } int64_t take_n = 0; - if (desc_name < 0 || !positive_take_i64(take_expr, &take_n)) + if (order_name < 0 || !positive_take_i64(take_expr, &take_n)) return false; + if (take_n > 1024) return false; uint8_t agg_index = 0; for (int64_t i = 0; i + 1 < dict_n; i += 2) { @@ -3151,11 +1779,15 @@ static bool match_group_desc_count_take(ray_t** dict_elems, int64_t dict_n, continue; ray_t** ae = (ray_t**)ray_data(val); uint16_t op = resolve_agg_opcode(ae[0]->i64); - if (kid == desc_name && op == OP_COUNT) { + if (kid == order_name && + (op == OP_COUNT || op == OP_SUM || + op == OP_MIN || op == OP_MAX)) { out->enabled = 1; out->agg_index = agg_index; out->min_count_exclusive = 0; out->top_count_take = take_n; + out->agg_op = op; + out->desc = want_desc; return true; } agg_index++; @@ -4064,6 +2696,212 @@ static ray_t* query_materialize_parted_col(ray_t* col) { return flat; } +/* Planner rewrite for `(select {K: K c: (count (distinct X)) from: T + * [where: W] by: K [desc: c take: N]})`. + * + * Original execution: outer group-by K builds idx_buf → per-group dedup + * over X (via cdpg_buf_par_fn or per-group HLL). That pays the outer + * group-by + idx_buf scatter even when the per-group dedup is the + * dominant cost. 
+ * + * Rewrite: group by (K, X) once — this deduplicates (K, X) tuples in a + * single pass that lands on the v2 multi-key kernel — then count rows + * per K on the (typically much smaller) dedup table. For q08 on the + * 10M-row hits table, the (K, X) pass produces ~700 K tuples; the final + * group-by walks just that. + * + * Returns NULL on shape miss (caller falls through to the existing + * count-distinct path); returns a result table on success, or an error value if a group pass fails. Gates: + * - single scalar K column (not SYM, no nulls) + * - cd_inner is a column ref X (not SYM, no nulls) — composite key + * fits in 16 bytes (v2's wide-key cap) + * - K + X ≤ 16 bytes packed + * - WHERE optional; if present, must be supported by the fused predicate + * - asc/desc/take optional; asc/desc must target the cd output column when present */ +static ray_t* try_count_distinct_v2_rewrite( + ray_t* tbl, + ray_t* by_expr, + ray_t* where_expr, + ray_t** dict_elems, int64_t dict_n, + int64_t from_id, int64_t where_id, int64_t by_id, + int64_t take_id, int64_t asc_id, int64_t desc_id, + int64_t nearest_id) +{ + if (!tbl || tbl->type != RAY_TABLE) return NULL; + if (!by_expr || by_expr->type != -RAY_SYM || + !(by_expr->attrs & RAY_ATTR_NAME)) + return NULL; + int64_t K_sym = by_expr->i64; + + /* Walk the dict — accept exactly one `(count (distinct col_ref))` + * agg and an optional identity key projection. Any other agg / + * projection / take-on-something-else aborts the rewrite.
*/ + int64_t cd_X_sym = -1; + int64_t cd_c_sym = -1; + int n_cd = 0, n_other = 0; + int64_t desc_col_sym = -1; /* if desc:, its column-sym target */ + int64_t asc_col_sym = -1; + int has_take = 0; + int64_t take_n = -1; + for (int64_t i = 0; i + 1 < dict_n; i += 2) { + int64_t kid = dict_elems[i]->i64; + ray_t* val = dict_elems[i + 1]; + if (kid == from_id || kid == where_id || kid == by_id || + kid == nearest_id) continue; + if (kid == take_id) { + int64_t v; + if (atom_i64_const(val, &v) && v > 0) { + has_take = 1; + take_n = v; + } else { + return NULL; /* non-trivial take */ + } + continue; + } + if (kid == asc_id) { + if (val && val->type == -RAY_SYM && (val->attrs & RAY_ATTR_NAME)) + asc_col_sym = val->i64; + else return NULL; + continue; + } + if (kid == desc_id) { + if (val && val->type == -RAY_SYM && (val->attrs & RAY_ATTR_NAME)) + desc_col_sym = val->i64; + else return NULL; + continue; + } + ray_t* cd_inner = match_count_distinct(val); + if (cd_inner && cd_inner->type == -RAY_SYM && + (cd_inner->attrs & RAY_ATTR_NAME)) + { + cd_X_sym = cd_inner->i64; + cd_c_sym = kid; + n_cd++; + } else if (is_single_group_key_projection(by_expr, val)) { + /* identity key projection (e.g. {K: K}) — accepted, no-op */ + } else { + n_other++; + } + } + if (n_cd != 1 || n_other > 0) return NULL; + if (cd_X_sym < 0 || cd_c_sym < 0) return NULL; + + /* desc/asc must target the count output column. */ + if (desc_col_sym >= 0 && desc_col_sym != cd_c_sym) return NULL; + if (asc_col_sym >= 0 && asc_col_sym != cd_c_sym) return NULL; + if (desc_col_sym >= 0 && asc_col_sym >= 0) return NULL; + + /* Type checks on K and X. v2 multi-key composite path requires + * non-SYM, non-nullable, packed ≤ 16 bytes (wide-key cap). 
*/ + ray_t* K_col = ray_table_get_col(tbl, K_sym); + ray_t* X_col = ray_table_get_col(tbl, cd_X_sym); + if (!K_col || !X_col) return NULL; + int8_t kct = K_col->type, xct = X_col->type; + if (RAY_IS_PARTED(kct) || kct == RAY_MAPCOMMON) return NULL; + if (RAY_IS_PARTED(xct) || xct == RAY_MAPCOMMON) return NULL; + if (kct == RAY_SYM || xct == RAY_SYM) return NULL; + if (K_col->attrs & RAY_ATTR_HAS_NULLS) return NULL; + if (X_col->attrs & RAY_ATTR_HAS_NULLS) return NULL; + int K_esz = ray_sym_elem_size(kct, K_col->attrs); + int X_esz = ray_sym_elem_size(xct, X_col->attrs); + if (K_esz + X_esz > 16) return NULL; + /* Restrict to integer/temporal — matches mk_compile's accepted shapes. */ + int kct_ok = (kct == RAY_BOOL || kct == RAY_U8 || kct == RAY_I16 || + kct == RAY_I32 || kct == RAY_I64 || + kct == RAY_DATE || kct == RAY_TIME || kct == RAY_TIMESTAMP); + int xct_ok = (xct == RAY_BOOL || xct == RAY_U8 || xct == RAY_I16 || + xct == RAY_I32 || xct == RAY_I64 || + xct == RAY_DATE || xct == RAY_TIME || xct == RAY_TIMESTAMP); + if (!kct_ok || !xct_ok) return NULL; + + if (where_expr && !ray_fused_group_supported(where_expr, tbl)) + return NULL; + + /* === Inner pass: group by (K, X) on the source table === */ + ray_graph_t* g_in = ray_graph_new(tbl); + if (!g_in) return NULL; + ray_t* K_name = ray_sym_str(K_sym); + ray_t* X_name = ray_sym_str(cd_X_sym); + if (!K_name || !X_name) { ray_graph_free(g_in); return NULL; } + ray_op_t* K_scan = ray_scan(g_in, ray_str_ptr(K_name)); + ray_op_t* X_scan = ray_scan(g_in, ray_str_ptr(X_name)); + if (!K_scan || !X_scan) { ray_graph_free(g_in); return NULL; } + ray_op_t* keys_in[2] = { K_scan, X_scan }; + uint16_t agg_ops_in[1] = { OP_COUNT }; + ray_op_t* agg_ins_in[1] = { K_scan }; /* count agg input is irrelevant */ + ray_op_t* inner; + if (where_expr) { + ray_op_t* pred = compile_expr_dag(g_in, where_expr); + if (!pred) { ray_graph_free(g_in); return NULL; } + inner = ray_filtered_group(g_in, pred, keys_in, 2, + agg_ops_in, 
agg_ins_in, 1); + } else { + inner = ray_group(g_in, keys_in, 2, agg_ops_in, agg_ins_in, 1); + } + if (!inner) { ray_graph_free(g_in); return NULL; } + ray_t* dedup = ray_execute(g_in, inner); + ray_graph_free(g_in); + if (!dedup) return NULL; + if (RAY_IS_ERR(dedup)) return dedup; + if (dedup->type != RAY_TABLE) { ray_release(dedup); return NULL; } + + /* === Outer pass: group dedup table by K with COUNT, ordered === */ + ray_graph_t* g_out = ray_graph_new(dedup); + if (!g_out) { ray_release(dedup); return ray_error("oom", NULL); } + ray_op_t* K_scan2 = ray_scan(g_out, ray_str_ptr(K_name)); + if (!K_scan2) { ray_graph_free(g_out); ray_release(dedup); return NULL; } + ray_op_t* keys_out[1] = { K_scan2 }; + uint16_t agg_ops_out[1] = { OP_COUNT }; + ray_op_t* agg_ins_out[1] = { K_scan2 }; + + /* Apply desc:c take:N via the group emit_filter so the second pass + * can heap-trim to top-N without materialising every (K, count) row. */ + ray_group_emit_filter_t prev_emit = ray_group_emit_filter_get(); + ray_group_emit_filter_t emit_f = {0}; + int emit_set = 0; + if (desc_col_sym == cd_c_sym && has_take && take_n > 0) { + emit_f.enabled = true; + emit_f.agg_index = 0; + emit_f.top_count_take = take_n; + emit_f.min_count_exclusive = 0; + ray_group_emit_filter_set(emit_f); + emit_set = 1; + } + ray_op_t* outer = ray_group(g_out, keys_out, 1, + agg_ops_out, agg_ins_out, 1); + if (!outer) { + if (emit_set) ray_group_emit_filter_set(prev_emit); + ray_graph_free(g_out); + ray_release(dedup); + return ray_error("oom", NULL); + } + ray_t* result = ray_execute(g_out, outer); + if (emit_set) ray_group_emit_filter_set(prev_emit); + ray_graph_free(g_out); + ray_release(dedup); + if (!result || RAY_IS_ERR(result)) return result; + if (result->type != RAY_TABLE) return result; + + /* Rename the count output column to the user's requested c_sym alias. 
+ * The outer pass counts the key column, so ray_group names the agg + * output "_count" (after its input column) — NOT the literal + * "count" this code originally searched for, which left the result + * column misnamed (the "_count" default instead of the alias). + * The result holds exactly the key column plus this one count + * column, so rename whichever non-key column it is. */ + if (K_sym != cd_c_sym) { + int64_t nc = ray_table_ncols(result); + for (int64_t ci = 0; ci < nc; ci++) { + int64_t cn = ray_table_col_name(result, ci); + if (cn != K_sym && cn != cd_c_sym) { + ray_table_set_col_name(result, ci, cd_c_sym); + break; + } + } + } + return result; +} + /* Per-group count(distinct) using the existing OP_COUNT_DISTINCT kernel. * Mirrors aggr_unary_per_group_buf but slices the source column once per * group and calls exec_count_distinct directly — bypasses the full @@ -4109,6 +2947,77 @@ static ray_t* count_distinct_per_group_buf(ray_t* inner_expr, ray_t* tbl, out->len = n_groups; int64_t* odata = (int64_t*)ray_data(out); + /* Streaming HLL — one parallel pass over rows (each worker owns a + * private bank of n_groups sparse sketches) instead of n_groups + * separate tasks each rebuilding a sketch. Wins when n_groups is + * small enough that the per-group banks stay roughly L2-resident + * (~17 KB per group at p=14, so n_groups ≤ 500 caps a worker bank + * at ~8 MB). Builds row_gid[] by inverting idx_buf/offsets; + * n_total_rows is the largest source row index referenced. 
*/ + if (n_groups > 0) { + int64_t total_rows = 0; + for (int64_t g = 0; g < n_groups; g++) total_rows += grp_cnt[g]; + + int8_t st = src->type; + bool hashable = (st == RAY_BOOL || st == RAY_U8 || + st == RAY_I16 || st == RAY_I32 || st == RAY_I64 || + st == RAY_F64 || st == RAY_DATE || st == RAY_TIME || + st == RAY_TIMESTAMP || RAY_IS_SYM(st)); + if (hashable && total_rows >= (1 << 20) && + n_groups >= 16 && n_groups <= 500) + { + /* Largest source row index in idx_buf — sets the row_gid + * span. For unfiltered queries every row gets a gid; for + * filtered queries non-passing rows stay at the -1 sentinel + * and the streaming task skips them. */ + int64_t n_max_row = 0; + for (int64_t gi = 0; gi < n_groups; gi++) { + int64_t end_off = offsets[gi] + grp_cnt[gi]; + for (int64_t j = offsets[gi]; j < end_off; j++) { + if (idx_buf[j] >= n_max_row) n_max_row = idx_buf[j] + 1; + } + } + if (n_max_row > 0) { + ray_t* rg_hdr = NULL; + int64_t* row_gid = (int64_t*)scratch_alloc(&rg_hdr, + (size_t)n_max_row * sizeof(int64_t)); + if (row_gid) { + for (int64_t r = 0; r < n_max_row; r++) row_gid[r] = -1; + for (int64_t gi = 0; gi < n_groups; gi++) { + int64_t end_off = offsets[gi] + grp_cnt[gi]; + for (int64_t j = offsets[gi]; j < end_off; j++) { + row_gid[idx_buf[j]] = gi; + } + } + if (ray_count_distinct_approx_pg_stream( + src, row_gid, n_max_row, n_groups, 14, odata) == 0) + { + scratch_free(rg_hdr); + ray_release(src); + return out; + } + scratch_free(rg_hdr); + memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); + } + } + } + + /* Per-group HLL fallback — one task per group, private sketch + * per task. Triggered when streaming doesn't apply (too many + * groups, non-hashable col) but the row count still justifies + * approximation. */ + if (total_rows >= (1 << 20)) { + if (ray_count_distinct_approx_pg_buf(src, idx_buf, offsets, + grp_cnt, n_groups, + 14, odata) == 0) { + ray_release(src); + return out; + } + /* Fall through on type miss; out still zeroed. 
*/ + memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); + } + } + /* Parallel path: dispatch one task per group when src has a flat * numeric / SYM layout we can read with a typed pointer. Each task * does its own dedup with a scratch hash table — no gather_by_idx @@ -4971,6 +3880,89 @@ ray_t* ray_try_count_select_expr(ray_t* expr, int* handled) { return ray_i64(nrows); } +/* Walk `expr` and collect column-name symbols (RAY_ATTR_NAME atoms that + * resolve to a real column in `tbl`). Also follows the head of dotted + * names so a `Timestamp.date` reference contributes its base column. + * `out_syms` is treated as an append-only set (dedup against existing + * entries) up to `max_out`; returns the new count. Collection stops + * silently once `n` reaches `max_out`, so a return value equal to + * `max_out` means the set may be incomplete. Used to determine + * the subset of input columns the rest of a (select …) clause actually + * touches, so a prefilter materialise can skip everything else. */ +static int collect_col_refs_set(ray_t* expr, ray_t* tbl, + int64_t* out_syms, int max_out, int n) { + if (!expr || n >= max_out) return n; + if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME)) { + int64_t want = -1; + if (ray_table_get_col(tbl, expr->i64)) { + want = expr->i64; + } else if (ray_sym_is_dotted(expr->i64)) { + const int64_t* segs; + int nsegs = ray_sym_segs(expr->i64, &segs); + if (nsegs >= 1 && ray_table_get_col(tbl, segs[0])) want = segs[0]; + } + if (want >= 0) { + for (int i = 0; i < n; i++) if (out_syms[i] == want) return n; + if (n < max_out) out_syms[n++] = want; + } + return n; + } + if (expr->type == RAY_LIST) { + ray_t** elems = (ray_t**)ray_data(expr); + int64_t cnt = ray_len(expr); + for (int64_t i = 0; i < cnt && n < max_out; i++) + n = collect_col_refs_set(elems[i], tbl, out_syms, max_out, n); + return n; + } + if (expr->type == RAY_DICT) { + DICT_VIEW_DECL(dv); + DICT_VIEW_OPEN(expr, dv); + /* NOTE(review): an overflowing dict view contributes no refs and the + * caller cannot tell — confirm this cannot under-collect columns. */ + if (DICT_VIEW_OVERFLOW(dv)) return n; + for (int64_t i = 0; i + 1 < dv_n && n < max_out; i += 2) + n = collect_col_refs_set(dv[i + 1], tbl, out_syms, max_out, n); + return n; +
} + if (expr->type == RAY_SYM) { + /* Sym vector — each element is a column name (e.g. multi-col + * asc:/desc:/by: tuples). Pull syms out at the storage width. */ + const void* base = ray_data(expr); + int8_t vt = expr->type; + uint8_t va = expr->attrs; + int64_t len = ray_len(expr); + for (int64_t i = 0; i < len && n < max_out; i++) { + int64_t s = ray_read_sym(base, i, vt, va); + if (ray_table_get_col(tbl, s)) { + int dup = 0; + for (int j = 0; j < n; j++) if (out_syms[j] == s) { dup = 1; break; } + if (!dup && n < max_out) out_syms[n++] = s; + } + } + return n; + } + return n; +} + +/* Build a narrow projection of `src_tbl` containing only the columns in + * `keep_syms[0..n_keep)`, added in `keep_syms` order (which need not + * match the source table's column order). + * Schema/cols share the source vec/list headers (retain'd internally + * by ray_table_add_col); no row data is copied — projection is a + * metadata-only operation. Returns an owned ray_t* or an error + * ("domain" when a requested column is missing, "oom" when a table + * call returns NULL, or whatever error ray_table_add_col produced). */ +static ray_t* project_table_cols(ray_t* src_tbl, const int64_t* keep_syms, + int n_keep) { + ray_t* nt = ray_table_new(n_keep); + if (!nt || RAY_IS_ERR(nt)) return nt ? nt : ray_error("oom", NULL); + for (int i = 0; i < n_keep; i++) { + ray_t* col = ray_table_get_col(src_tbl, keep_syms[i]); + if (!col) { ray_release(nt); return ray_error("domain", NULL); } + ray_t* nt2 = ray_table_add_col(nt, keep_syms[i], col); + if (!nt2 || RAY_IS_ERR(nt2)) { + /* Drop the partially built table and hand the error back + * untouched: releasing `nt2` here and then returning it would + * give the caller a dead object (and the prefilter caller + * releases the error itself). Assumes ray_table_add_col + * leaves `nt` owned by us on failure, as the NULL-return + * branch already did — TODO confirm. */ + ray_release(nt); + return nt2 ? 
nt2 : ray_error("oom", NULL); + } + nt = nt2; + } + return nt; +} + ray_t* ray_select(ray_t** args, int64_t n) { if (n < 1) return ray_error("domain", NULL); ray_t* dict = args[0]; @@ -4980,12 +3972,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { /* Evaluate 'from:' to get the source table */ ray_t* from_expr = dict_get(dict, "from"); if (!from_expr) return ray_error("domain", NULL); - uint64_t select_cache_hash_value = ray_expr_hash(dict); - uint64_t select_cache_from_hash = ray_expr_hash(from_expr); - ray_t* expr_cached = select_expr_cache_get(select_cache_hash_value, - select_cache_from_hash); - if (expr_cached) - return expr_cached; ray_t* where_expr = dict_get(dict, "where"); ray_group_emit_filter_t prev_emit_filter = ray_group_emit_filter_get(); ray_group_emit_filter_t emit_filter = {0}; @@ -4998,14 +3984,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_group_emit_filter_set(prev_emit_filter); if (RAY_IS_ERR(tbl)) return tbl; if (tbl->type != RAY_TABLE) { ray_release(tbl); return ray_error("type", NULL); } - int64_t select_cache_nrows = ray_table_nrows(tbl); - ray_t* select_cached = select_cache_get(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash); - if (select_cached) { - ray_release(tbl); - return select_cached; - } ray_t* by_expr = dict_get(dict, "by"); ray_t* take_expr = dict_get(dict, "take"); @@ -5038,43 +4016,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (kid == asc_id || kid == desc_id) { has_sort = true; break; } } - ray_t* xbar_count = try_xbar_count_select(tbl, where_expr, by_expr, - take_expr, dict_elems, dict_n, - from_id, where_id, by_id, - take_id, asc_id, desc_id, - nearest_id); - if (xbar_count) { - ray_release(tbl); - return xbar_count; - } - - ray_t* i16_ne0_count = try_i16_ne0_count_desc_select( - tbl, where_expr, by_expr, take_expr, dict_elems, dict_n, - from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id); - if (i16_ne0_count) { - ray_release(tbl); - return i16_ne0_count; - } - - 
ray_t* i32_i64_cd = try_i32_i64_count_distinct_select( - tbl, where_expr, by_expr, take_expr, dict_elems, dict_n, - from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id); - if (i32_i64_cd) { - ray_release(tbl); - return i32_i64_cd; - } - - ray_t* i16x2_count = try_i16x2_count_desc_select(tbl, where_expr, by_expr, - take_expr, dict_elems, - dict_n, from_id, - where_id, by_id, - take_id, asc_id, - desc_id, nearest_id); - if (i16x2_count) { - ray_release(tbl); - return i16x2_count; - } - /* `nearest` is mutually exclusive with `asc`/`desc`/`by` — ANN * ordering is an index scan, not a column sort, and cannot be * composed with group-by in this phase. */ @@ -5091,6 +4032,22 @@ ray_t* ray_select(ray_t** args, int64_t n) { } } + /* Count-distinct planner rewrite: `(select {K: K c: (count (distinct X)) + * from: T [where: W] by: K [desc: c take: N]})` decomposes cleanly to + * a two-stage group-by — first dedup (K, X) pairs, then count rows + * per K. The dedup pass lands on the v2 multi-key kernel; the + * second pass walks a much smaller table. Skips the outer-group + + * idx_buf scatter that the per-group dedup path otherwise pays. */ + if (!nearest_expr) { + ray_t* rw = try_count_distinct_v2_rewrite( + tbl, by_expr, where_expr, dict_elems, dict_n, + from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id); + if (rw) { + ray_release(tbl); + return rw; + } + } + /* Count output columns */ int n_out = 0; for (int64_t i = 0; i + 1 < dict_n; i += 2) { @@ -5369,23 +4326,85 @@ ray_t* ray_select(ray_t** args, int64_t n) { match_group_desc_count_take(dict_elems, dict_n, from_id, where_id, by_id, take_id, asc_id, desc_id, &prefilter_top_count); + /* Computed by-val + WHERE: eagerly evaluating a non-trivial + * group key (e.g. q42's `(xbar EventTime 60000000000)`) over + * every input row wastes work proportional to the WHERE's + * selectivity. 
Project the input table down to just the + * columns the rest of the (select …) clause actually touches + * (WHERE refs, by-val refs, agg-input refs, sort-key refs), + * filter the narrow projection through WHERE once, then + * evaluate by-val expressions on the small dense result. The + * downstream group/sort/take then sees a fully-filtered table + * — fewer rows, fewer columns, no per-row redundant work. + * + * Narrowing matters: for wide tables (ClickBench's `hits` has + * ~100 cols) materialising the full filtered table dominates + * what was meant to be a cheap prefilter (single-col filter + * is O(passing × esz), full filter is ~50× that). + * + * The matcher gate (top-N-by-agg) constrains where this fires + * to shapes where the prefilter's cost can be amortised — the + * downstream group materialisation and top-N extraction + * benefit from operating on a small filtered slice. Broader + * shapes that already have an efficient fused-filter+group + * path (OP_FILTERED_GROUP) would lose more in the duplicated + * filter work than they'd save in the smaller by-val eval. */ if (where_expr && prefilter_computed_by) { - ray_graph_t* fg = ray_graph_new(tbl); + int64_t keep_syms[256]; + int n_keep = 0; + n_keep = collect_col_refs_set(where_expr, tbl, + keep_syms, 256, n_keep); + for (int64_t i = 0; i + 1 < dict_n && n_keep < 256; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == from_id || kid == where_id || kid == take_id || + kid == nearest_id) continue; + /* asc:/desc:/by: keep the value's referenced source cols + * (the by-dict's dict val may be a computed expression + * referencing other source cols, the asc/desc value is + * a -RAY_SYM or RAY_SYM vec of source col names). All + * other entries are output cols — agg or non-agg + * expressions whose refs we also need post-filter. 
*/ + n_keep = collect_col_refs_set(dict_elems[i + 1], tbl, + keep_syms, 256, n_keep); + } + int can_project = (n_keep > 0 && n_keep < 256 && + n_keep < ray_table_ncols(tbl)); + ray_t* narrow_tbl = NULL; + if (can_project) { + narrow_tbl = project_table_cols(tbl, keep_syms, n_keep); + if (!narrow_tbl || RAY_IS_ERR(narrow_tbl)) { + if (narrow_tbl) ray_release(narrow_tbl); + narrow_tbl = NULL; + can_project = 0; + } + } + ray_t* prefilter_input = can_project ? narrow_tbl : tbl; + ray_graph_t* fg = ray_graph_new(prefilter_input); if (!fg) { + if (narrow_tbl) ray_release(narrow_tbl); ray_release(tbl); return ray_error("oom", NULL); } - ray_op_t* froot = ray_const_table(fg, tbl); + ray_op_t* froot = ray_const_table(fg, prefilter_input); ray_op_t* pred = compile_expr_dag(fg, where_expr); if (!pred) { ray_graph_free(fg); + if (narrow_tbl) ray_release(narrow_tbl); ray_release(tbl); return ray_error("domain", NULL); } froot = ray_filter(fg, froot, pred); - froot = ray_optimize(fg, froot); + /* Deliberately skip ray_optimize: its predicate pushdown + * pass splits OP_AND into chained OP_FILTERs, each + * materialising a per-conjunct bool vec and refining a + * rowsel. For wide AND-of-comparison WHEREs that costs + * one parallel pass per conjunct (~50MB of intermediate + * bool-vec writes for q42's 5-clause WHERE on 10M rows). + * Single ray_filter with the unsplit AND-tree evaluates + * the whole predicate inline in one parallel pass. */ ray_t* filtered = ray_execute(fg, froot); ray_graph_free(fg); + if (narrow_tbl) ray_release(narrow_tbl); if (!filtered || RAY_IS_ERR(filtered)) { ray_release(tbl); return filtered ? filtered : ray_error("domain", NULL); @@ -5669,13 +4688,8 @@ ray_t* ray_select(ray_t** args, int64_t n) { /* Single-key case fits unconditionally (one key column, one * slot). Multi-key narrow path (≤ 8 bytes packed) uses a * single int64 slot; the wide path (9..16 bytes) adds a - * side kv_hi side array. 
The wide path's extra hi compare - * + extra memory traffic only pays back for single-COUNT - * shapes (Q36, Q41); multi-agg high-card workloads (Q31, - * Q32) regress against the regular FILTER+GROUP path, so - * keep them on it. */ - int wide_fits = (total_bytes > 8 && total_bytes <= 16 - && n_aggs_ok == 1 && has_only_count); + * side kv_hi side array. */ + int wide_fits = (total_bytes > 8 && total_bytes <= 16); int narrow_fits = (total_bytes <= 8); int fits = (n_keys_local == 1) || narrow_fits || wide_fits; if (keys_ok && fits) { @@ -6424,9 +5438,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } } @@ -6687,16 +5698,10 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); if (take_preapplied) { - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } @@ -6887,9 +5892,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { } res = apply_sort_take(res, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, res); return res; } @@ -7301,9 +6303,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_release(tbl); result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } @@ -8449,9 +7448,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_release(tbl); result = apply_sort_take(result, 
dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } } else if (n_out > 0) { @@ -8599,9 +7595,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_graph_free(g); ray_release(tbl); result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); - select_cache_put(tbl, select_cache_nrows, - select_cache_hash_value, - select_cache_from_hash, result); return result; } else { root = ray_select_op(g, root, col_ops, nc); @@ -9223,6 +8216,23 @@ ray_t* ray_select(ray_t** args, int64_t n) { * * If any non-agg falls outside that, we still need the * index. */ + /* Decide whether we need to materialise the per-group + * idx_buf scatter. Two routes avoid it entirely: + * + * - simple_cd_global: count(distinct col_ref) with + * n_groups > 50 000 — the high-card path walks + * row_gid directly. + * - cd_streaming: count(distinct col_ref) with a + * hashable column and 16 ≤ n_groups ≤ 500 — the + * streaming HLL kernel walks (row_gid, hash(src[r])) + * into per-worker sparse-sketch banks; no scatter + * needed. Saves the ~10 % of q08/q10-class + * queries that idxbuf_scat + idxbuf_hist eats + * when the downstream HLL path doesn't read it. + * + * Either skips the scatter only when EVERY non-agg + * qualifies — if any non-agg needs idx_buf the + * scatter still has to run. 
*/ int needs_slice_idx = 0; for (uint8_t ni = 0; ni < n_nonaggs && !needs_slice_idx; ni++) { ray_t* cd_inner = match_count_distinct(nonagg_exprs[ni]); @@ -9230,7 +8240,24 @@ ray_t* ray_select(ray_t** args, int64_t n) { cd_inner->type == -RAY_SYM && (cd_inner->attrs & RAY_ATTR_NAME) && n_groups > 50000); - if (!simple_cd_global) needs_slice_idx = 1; + int cd_streaming = 0; + if (cd_inner && cd_inner->type == -RAY_SYM && + (cd_inner->attrs & RAY_ATTR_NAME) && + n_groups >= 16 && n_groups <= 500 && + nrows >= (1 << 20)) { + ray_t* sc = ray_table_get_col(tbl, cd_inner->i64); + if (sc && !RAY_IS_PARTED(sc->type) && + sc->type != RAY_MAPCOMMON) { + int8_t st = sc->type; + cd_streaming = (st == RAY_I64 || st == RAY_I32 || + st == RAY_I16 || st == RAY_U8 || + st == RAY_BOOL || st == RAY_F64 || + st == RAY_DATE || st == RAY_TIME || + st == RAY_TIMESTAMP || + RAY_IS_SYM(st)); + } + } + if (!simple_cd_global && !cd_streaming) needs_slice_idx = 1; } int64_t* idx_buf = NULL; @@ -9375,6 +8402,31 @@ ray_t* ray_select(ray_t** args, int64_t n) { } } if (src_for_global) { + /* Streaming per-group HLL: skips the idx_buf + * scatter and re-walk by running one pass + * over (row_gid, hash(src[r])). Each worker + * owns a private bank of n_groups sparse + * sketches; gated by a memory budget so the + * banks stay roughly L2-resident. Falls + * through to the buf-form on type miss / OOM. 
*/ + if (n_groups >= 16 && n_groups <= 500 + && nrows >= (1 << 20) + && !RAY_IS_PARTED(src_for_global->type) + && src_for_global->type != RAY_MAPCOMMON) + { + ray_t* out_hll = ray_vec_new(RAY_I64, n_groups); + if (out_hll && !RAY_IS_ERR(out_hll)) { + out_hll->len = n_groups; + int64_t* odata = (int64_t*)ray_data(out_hll); + if (ray_count_distinct_approx_pg_stream( + src_for_global, row_gid, nrows, + n_groups, 14, odata) == 0) { + col = out_hll; + } else { + ray_release(out_hll); + } + } + } /* Path selection: global-hash kernel scales * with n_rows (per-row probe of one shared * hash table); per-group-slice scales with @@ -9385,12 +8437,14 @@ ray_t* ray_select(ray_t** args, int64_t n) { * so keep them on the single-pass kernel and * avoid slicing through the partition layout * again. */ - if (n_groups <= 50000) { - col = count_distinct_per_group_buf( - cd_inner, tbl, idx_buf, offsets, grp_cnt, n_groups); - } else { - col = ray_count_distinct_per_group( - src_for_global, row_gid, nrows, n_groups); + if (!col) { + if (n_groups <= 50000) { + col = count_distinct_per_group_buf( + cd_inner, tbl, idx_buf, offsets, grp_cnt, n_groups); + } else { + col = ray_count_distinct_per_group( + src_for_global, row_gid, nrows, n_groups); + } } /* col == NULL → unsupported type, fall through. 
*/ } @@ -9638,8 +8692,6 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (by_sym_vec_owned) ray_release(by_sym_vec_owned); if (saved_selection) ray_release(saved_selection); - select_cache_put(tbl, select_cache_nrows, select_cache_hash_value, - select_cache_from_hash, result); return result; } diff --git a/test/rfl/system/read_csv.rfl b/test/rfl/system/read_csv.rfl index a502b8e7..3d3f33c9 100644 --- a/test/rfl/system/read_csv.rfl +++ b/test/rfl/system/read_csv.rfl @@ -75,11 +75,12 @@ (.sys.exec "printf 'name\\nalice\\n\\nbob\\n\\ncarol\\n' > rf_test_empty.csv") -- 0 (set _t (.csv.read [SYMBOL] "rf_test_empty.csv")) (count _t) -- 5 -;; Empty string IS a null STR atom and empty SYM cell IS null (sym -;; id 0). The SYM vec vs null STR atom comparison short-circuits null: -;; every cell passes `!= ""` and none passes `== ""`. Documented -;; tension; revisit if SQL-style null-aware filtering on SYM columns -;; becomes a requirement. -(count (select {x: name from: _t where: (!= name "")})) -- 5 -(count (select {x: name from: _t where: (== name "")})) -- 0 +;; The empty SYM cell is the interned empty string (sym id 0), a real +;; comparable value — SQL-style filtering on SYM columns compares by +;; value, not by null. `(!= name "")` therefore excludes the two empty +;; rows (alice, bob, carol survive) and `(== name "")` selects them. +;; (See the str-resolved comparison path in src/ops/expr.c, which skips +;; the null-comparison fixup once a string constant resolves to a sym id.) 
+(count (select {x: name from: _t where: (!= name "")})) -- 3 +(count (select {x: name from: _t where: (== name "")})) -- 2 (.sys.exec "rm -f rf_test_empty.csv") -- 0 diff --git a/test/test_group_extra.c b/test/test_group_extra.c index 8d512596..05e0c06e 100644 --- a/test/test_group_extra.c +++ b/test/test_group_extra.c @@ -46,6 +46,7 @@ #include "mem/heap.h" #include "ops/ops.h" #include "ops/internal.h" +#include "ops/hll.h" #include "table/sym.h" #include #include @@ -1257,6 +1258,75 @@ static test_result_t test_five_key_group_top_count_emit_filter(void) { PASS(); } +/* -------------------------------------------------------------------------- + * Test 18: streaming per-group HLL — single-pass kernel + * + * Direct call to ray_count_distinct_approx_pg_stream with a small-group, + * large-row layout that gates into the streaming path: each worker owns + * a private bank of n_groups sketches and the kernel skips the + * (idx_buf + offsets + counts) CSR scatter that the buf-form entry point + * pays for upstream. + * + * Layout: n_rows = 2 * 1024 * 1024, n_groups = 100. + * Each row's gid = i % 100, val = (i / 100) % 1000. Per-group distinct + * count is exactly 1000 (each group holds ~20 971 rows, so val cycles + * through 0..999 about 21 times, covering every value at least once). + * HLL has ~0.8 % std error at P=14 (the call passes RAY_HLL_DEFAULT_P, + * presumably 14 — confirm in ops/hll.h) → we accept estimates within + * 5 % to leave slack for the small-cardinality bias-correction tail. + * + * Verifies (a) the kernel returns 0, (b) every per-group estimate is + * within 5 % of 1000. 
+ * -------------------------------------------------------------------------- */ +static test_result_t test_count_distinct_pg_stream(void) { + ray_heap_init(); + (void)ray_sym_init(); + + const int64_t NROWS = 2 * 1024 * 1024; /* > 1 M HLL gate */ + const int64_t NGROUPS = 100; /* fits 8 MB-per-worker budget */ + const int64_t DISTINCT_PER_GROUP = 1000; + + ray_t* vec = ray_vec_new(RAY_I64, NROWS); + TEST_ASSERT_NOT_NULL(vec); + vec->len = NROWS; + int64_t* p = (int64_t*)ray_data(vec); + for (int64_t i = 0; i < NROWS; i++) p[i] = (i / NGROUPS) % DISTINCT_PER_GROUP; + + ray_t* gids = ray_vec_new(RAY_I64, NROWS); + TEST_ASSERT_NOT_NULL(gids); + gids->len = NROWS; + int64_t* gp = (int64_t*)ray_data(gids); + for (int64_t i = 0; i < NROWS; i++) gp[i] = i % NGROUPS; + + ray_t* out = ray_vec_new(RAY_I64, NGROUPS); + TEST_ASSERT_NOT_NULL(out); + out->len = NGROUPS; + int64_t* od = (int64_t*)ray_data(out); + memset(od, 0, (size_t)NGROUPS * sizeof(int64_t)); + + int rc = ray_count_distinct_approx_pg_stream(vec, gp, NROWS, NGROUPS, + RAY_HLL_DEFAULT_P, od); + TEST_ASSERT_FMT(rc == 0, "stream returned %d", rc); + + /* Each group has exactly 1000 distinct values. Accept ±5 % drift + * (real HLL std error is ~0.8 % at P=14; the wider band covers the + * small-range bias-correction tail and the per-worker merge slop). 
*/ + for (int64_t g = 0; g < NGROUPS; g++) { + double err = fabs((double)od[g] - (double)DISTINCT_PER_GROUP) / + (double)DISTINCT_PER_GROUP; + TEST_ASSERT_FMT(err <= 0.05, + "group %lld: got %lld, expected ~%lld (err=%.3f)", + (long long)g, (long long)od[g], + (long long)DISTINCT_PER_GROUP, err); + } + + ray_release(out); + ray_release(gids); + ray_release(vec); + ray_sym_destroy(); + ray_heap_destroy(); + PASS(); +} + /* -------------------------------------------------------------------------- * Test registry * -------------------------------------------------------------------------- */ @@ -1279,5 +1349,6 @@ const test_entry_t group_extra_entries[] = { { "group_extra/i16_group_top_count_emit_filter", test_i16_group_top_count_emit_filter, NULL, NULL }, { "group_extra/sym_group_top_count_emit_filter", test_sym_group_top_count_emit_filter, NULL, NULL }, { "group_extra/five_key_group_top_count_emit_filter", test_five_key_group_top_count_emit_filter, NULL, NULL }, + { "group_extra/count_distinct_pg_stream", test_count_distinct_pg_stream, NULL, NULL }, { NULL, NULL, NULL, NULL }, };