From 37be26049fcf7f8b8f80a4ec72e7e3bde72ce980 Mon Sep 17 00:00:00 2001
From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>
Date: Fri, 12 Jun 2026 00:56:37 +0200
Subject: [PATCH] bench: add correlated-proxy case to the predicate_eval suite

The correlation subgroup's existing cases (q70-q72) use two predicates of
equal cost and equal selectivity, so the two orders cost the same and
correlation only affects the result cardinality - no ordering system can
win or lose on them. They measure overhead, not opportunity.

Add q73: a cheap integer predicate that is a perfect proxy for three
string regexes, plus one independent regex of the same ~30% selectivity
and similar cost. Marginal statistics cannot tell the four regexes apart
in any position; their joint distribution with the proxy is what matters.
Written in the natural-but-pessimal order (redundant regexes grouped with
their proxy), the query runs ~1.9x slower than the hand-optimal order
[c0, s2, s1...] on an M-series laptop, so a correlation-aware ordering
system has real, measurable headroom here while an independence-assuming
one is blind to it.

Co-Authored-By: Claude Fable 5
---
 .../benchmarks/correlation/q73.benchmark      |  7 +++++
 .../predicate_eval/load/corrproxy.sql         | 30 +++++++++++++++++++
 .../queries/correlation/q73.sql               | 14 +++++++++
 3 files changed, 51 insertions(+)
 create mode 100644 benchmarks/sql_benchmarks/predicate_eval/benchmarks/correlation/q73.benchmark
 create mode 100644 benchmarks/sql_benchmarks/predicate_eval/load/corrproxy.sql
 create mode 100644 benchmarks/sql_benchmarks/predicate_eval/queries/correlation/q73.sql

diff --git a/benchmarks/sql_benchmarks/predicate_eval/benchmarks/correlation/q73.benchmark b/benchmarks/sql_benchmarks/predicate_eval/benchmarks/correlation/q73.benchmark
new file mode 100644
index 0000000000000..cc3f7bcf54901
--- /dev/null
+++ b/benchmarks/sql_benchmarks/predicate_eval/benchmarks/correlation/q73.benchmark
@@ -0,0 +1,7 @@
+subgroup correlation
+
+template sql_benchmarks/predicate_eval/predicate_eval.benchmark.template
+SUBGROUP=correlation
+QPAD=73
+DATASET=corrproxy
+NAME=correlation_q73_redundant_proxy
diff --git a/benchmarks/sql_benchmarks/predicate_eval/load/corrproxy.sql b/benchmarks/sql_benchmarks/predicate_eval/load/corrproxy.sql
new file mode 100644
index 0000000000000..7b0b1859e1034
--- /dev/null
+++ b/benchmarks/sql_benchmarks/predicate_eval/load/corrproxy.sql
@@ -0,0 +1,30 @@
+-- Correlated-proxy dataset: a cheap integer predicate that is a perfect proxy
+-- for three string predicates, plus one independent string predicate.
+--
+--   c0 = 1 for ~30% of rows (cheap proxy)
+--   s1 contains 'aaa', 'ccc' and 'ddd' exactly where c0 = 1 (correlated)
+--   s2 contains 'bbb' for an independent ~30% of rows (independent)
+--
+-- Marginally, the four regex predicates are indistinguishable: similar cost,
+-- the same ~30% selectivity. Their *conditional* selectivities behind the
+-- proxy differ completely: after `c0 = 1`, the three s1 regexes keep every
+-- survivor (each re-tests the proxy's condition) while the s2 regex still
+-- discards ~70%. Only joint statistics can see that; an independence
+-- assumption prices all four regexes identically in every position.
+--
+-- PRED_FILL sets the filler width around each marker (a non-matching
+-- `regexp_like` must scan the whole value), and PRED_ROWS sizes the table.
+CREATE TABLE t AS
+SELECT
+  CASE WHEN (value * 7) % 100 < 30 THEN 1 ELSE 0 END AS c0,
+  repeat('q', ${PRED_FILL:-30})
+    || CASE WHEN (value * 7) % 100 < 30 THEN 'aaa' ELSE 'zzz' END
+    || repeat('q', ${PRED_FILL:-30})
+    || CASE WHEN (value * 7) % 100 < 30 THEN 'ccc' ELSE 'zzz' END
+    || repeat('q', ${PRED_FILL:-30})
+    || CASE WHEN (value * 7) % 100 < 30 THEN 'ddd' ELSE 'zzz' END
+    || repeat('q', ${PRED_FILL:-30}) AS s1,
+  repeat('q', ${PRED_FILL:-30})
+    || CASE WHEN (value * 13) % 100 < 30 THEN 'bbb' ELSE 'zzz' END
+    || repeat('q', ${PRED_FILL:-30}) AS s2
+FROM generate_series(1, ${PRED_ROWS:-1000000});
diff --git a/benchmarks/sql_benchmarks/predicate_eval/queries/correlation/q73.sql b/benchmarks/sql_benchmarks/predicate_eval/queries/correlation/q73.sql
new file mode 100644
index 0000000000000..1db05062b57f2
--- /dev/null
+++ b/benchmarks/sql_benchmarks/predicate_eval/queries/correlation/q73.sql
@@ -0,0 +1,14 @@
+-- Hidden: `c0 = 1` is a perfect proxy for all three s1 regexes -- after the
+-- cheap proxy, each s1 regex keeps every survivor while the equally selective
+-- (~30%) s2 regex still discards ~70%. The optimal order is [c0, s2, s1...]
+-- (one informative regex on 30% of rows, the three redundant ones on 9%),
+-- but marginal statistics cannot tell the four regexes apart in any position:
+-- ranking them takes their *joint* distribution with the proxy. Written with
+-- the redundant regexes first, grouped with their proxy, as an author
+-- naturally would.
+SELECT count(*) FROM t
+WHERE c0 = 1
+  AND regexp_like(s1, 'a.a')
+  AND regexp_like(s1, 'c.c')
+  AND regexp_like(s1, 'd.d')
+  AND regexp_like(s2, 'b.b');