From 37be26049fcf7f8b8f80a4ec72e7e3bde72ce980 Mon Sep 17 00:00:00 2001
From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>
Date: Fri, 12 Jun 2026 00:56:37 +0200
Subject: [PATCH] bench: add correlated-proxy case to the predicate_eval suite

The correlation subgroup's existing cases (q70-q72) use two predicates of
equal cost and equal selectivity, so the two orders cost the same and
correlation only affects the result cardinality - no ordering system can win
or lose on them. They measure overhead, not opportunity.

Add q73: a cheap integer predicate that is a perfect proxy for three string
regexes, plus one independent regex of the same ~30% selectivity and similar
cost. Marginal statistics cannot tell the four regexes apart in any position;
their joint distribution with the proxy is what matters. Written in the
natural-but-pessimal order (redundant regexes grouped with their proxy), the
query runs ~1.9x slower than the hand-optimal order [c0, s2, s1...] on an
M-series laptop, so a correlation-aware ordering system has real, measurable
headroom here while an independence-assuming one is blind to it.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../benchmarks/correlation/q73.benchmark      |  7 +++++
 .../predicate_eval/load/corrproxy.sql         | 30 +++++++++++++++++++
 .../queries/correlation/q73.sql               | 14 +++++++++
 3 files changed, 51 insertions(+)
 create mode 100644 benchmarks/sql_benchmarks/predicate_eval/benchmarks/correlation/q73.benchmark
 create mode 100644 benchmarks/sql_benchmarks/predicate_eval/load/corrproxy.sql
 create mode 100644 benchmarks/sql_benchmarks/predicate_eval/queries/correlation/q73.sql

diff --git a/benchmarks/sql_benchmarks/predicate_eval/benchmarks/correlation/q73.benchmark b/benchmarks/sql_benchmarks/predicate_eval/benchmarks/correlation/q73.benchmark
new file mode 100644
index 0000000000000..cc3f7bcf54901
--- /dev/null
+++ b/benchmarks/sql_benchmarks/predicate_eval/benchmarks/correlation/q73.benchmark
@@ -0,0 +1,7 @@
+subgroup correlation
+
+template sql_benchmarks/predicate_eval/predicate_eval.benchmark.template
+SUBGROUP=correlation
+QPAD=73
+DATASET=corrproxy
+NAME=correlation_q73_redundant_proxy
diff --git a/benchmarks/sql_benchmarks/predicate_eval/load/corrproxy.sql b/benchmarks/sql_benchmarks/predicate_eval/load/corrproxy.sql
new file mode 100644
index 0000000000000..7b0b1859e1034
--- /dev/null
+++ b/benchmarks/sql_benchmarks/predicate_eval/load/corrproxy.sql
@@ -0,0 +1,30 @@
+-- Correlated-proxy dataset. One cheap integer flag shadows three string
+-- markers; a fourth string marker is independent of the flag.
+--
+--   c0  1 on ~30% of rows, 0 elsewhere (the cheap proxy)
+--   s1  holds 'aaa', 'ccc' and 'ddd' on exactly the c0 = 1 rows (correlated)
+--   s2  holds 'bbb' on a separately chosen ~30% of rows (independent)
+--
+-- Looked at one by one, the four regex predicates are alike (~30% selective,
+-- similar cost). Behind `c0 = 1` they are not: every s1 regex keeps all
+-- survivors, while the s2 regex still drops ~70%. Only joint statistics
+-- reveal this; an independence assumption prices all four the same.
+--
+-- PRED_FILL (default 30) is the width of each 'q' filler run around the
+-- markers -- a non-matching `regexp_like` must scan the whole value.
+-- PRED_ROWS (default 1000000) is the number of rows generated.
+CREATE TABLE t AS
+SELECT
+  CASE WHEN in_proxy THEN 1 ELSE 0 END AS c0,
+  pad || CASE WHEN in_proxy THEN 'aaa' ELSE 'zzz' END
+    || pad || CASE WHEN in_proxy THEN 'ccc' ELSE 'zzz' END
+    || pad || CASE WHEN in_proxy THEN 'ddd' ELSE 'zzz' END
+    || pad AS s1,
+  pad || CASE WHEN in_indep THEN 'bbb' ELSE 'zzz' END || pad AS s2
+FROM (
+  SELECT
+    (value * 7) % 100 < 30 AS in_proxy,
+    (value * 13) % 100 < 30 AS in_indep,
+    repeat('q', ${PRED_FILL:-30}) AS pad
+  FROM generate_series(1, ${PRED_ROWS:-1000000})
+) AS src;
diff --git a/benchmarks/sql_benchmarks/predicate_eval/queries/correlation/q73.sql b/benchmarks/sql_benchmarks/predicate_eval/queries/correlation/q73.sql
new file mode 100644
index 0000000000000..1db05062b57f2
--- /dev/null
+++ b/benchmarks/sql_benchmarks/predicate_eval/queries/correlation/q73.sql
@@ -0,0 +1,14 @@
+-- Hidden: `c0 = 1` is a perfect proxy for all three s1 regexes -- after the
+-- cheap proxy, each s1 regex keeps every survivor while the equally selective
+-- (~30%) s2 regex still discards ~70%. The optimal order is [c0, s2, s1...]
+-- (one informative regex on 30% of rows, the three redundant ones on 9%);
+-- as written, all four regexes see 30% of rows if run in the written order.
+-- Marginal statistics cannot tell the four regexes apart in any position:
+-- ranking them takes their *joint* distribution with the proxy. The redundant
+-- regexes come first, grouped with their proxy, as an author would write it.
+SELECT count(*) FROM t
+WHERE c0 = 1
+  AND regexp_like(s1, 'a.a')
+  AND regexp_like(s1, 'c.c')
+  AND regexp_like(s1, 'd.d')
+  AND regexp_like(s2, 'b.b');