From 411ec599a8c39a73ab377e425a0dc07e625d800f Mon Sep 17 00:00:00 2001 From: Dan Draper Date: Mon, 11 May 2026 15:24:55 +1000 Subject: [PATCH 1/3] test(bench): cover GROUP BY / JOIN / DISTINCT plan and timing on encrypted columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds bench coverage for the three hash-strategy access patterns the Phase 1 hash operator class (#196) enabled but the Phase 2 chain inlining (#202) is yet to make perf-competitive. Plan assertions in bench_plan_tests.rs (gated on the bench feature, pass today): - group_by_encrypted_uses_hash_aggregate — confirms `GROUP BY encrypted_col` picks HashAggregate via the hash op class rather than degenerating to GroupAggregate-after-Sort or a Nested-Loop self-comparison. - join_on_encrypted_uses_hmac_index — confirms self-join on encrypted equality engages bench_text_hmac_idx (Hash Join or Nested Loop + Memoize + Index Scan, both acceptable). - distinct_encrypted_uses_hash_aggregate — confirms unbounded DISTINCT picks HashAggregate (the bounded-LIMIT variant biases toward IndexOnlyScan over the ORE btree opclass; that path is fine on full installs but unavailable on Supabase). Regression timing assertions in bench_regression_tests.rs (#[ignore]'d pending #202; remove the markers when it merges): - group_by_encrypted_under_threshold — 150ms (current ~309ms via plpgsql hash chain, ~70ms with chain inlined; threshold ~2x the inlined target). - self_join_encrypted_under_threshold — 350ms (current ~308ms, ~182ms with chain inlined; cardinality dominates so threshold is generous). - distinct_encrypted_under_threshold — 200ms (current ~515ms unbounded via ORE btree path, expected to drop into HashAggregate-driven territory after chain inlining). Each timing test panic message states the expected post-#202 number and the current observed number, so the diagnostic remains useful when the gate flips after the fix lands. 
--- tests/sqlx/tests/bench_plan_tests.rs | 68 +++++++++++++++++++- tests/sqlx/tests/bench_regression_tests.rs | 74 ++++++++++++++++++++++ 2 files changed, 141 insertions(+), 1 deletion(-) diff --git a/tests/sqlx/tests/bench_plan_tests.rs b/tests/sqlx/tests/bench_plan_tests.rs index 2cb85ec1..8120f6af 100644 --- a/tests/sqlx/tests/bench_plan_tests.rs +++ b/tests/sqlx/tests/bench_plan_tests.rs @@ -6,7 +6,9 @@ //! ANALYZE is run by the bench_setup fixture — planner statistics are populated at fixture load. use anyhow::Result; -use eql_tests::{assert_uses_index, get_bench_encrypted_int, get_bench_encrypted_text}; +use eql_tests::{ + assert_uses_index, explain_query, get_bench_encrypted_int, get_bench_encrypted_text, +}; use sqlx::PgPool; const BENCH_INT_ORE_IDX: &str = "bench_int_ore_idx"; @@ -157,3 +159,67 @@ async fn bare_ilike_uses_bloom_index(pool: PgPool) -> Result<()> { assert_uses_index(&pool, &sql, BENCH_TEXT_BLOOM_IDX).await?; Ok(()) } + +// ============================================================================ +// Hash-strategy plans: GROUP BY / JOIN / DISTINCT on encrypted columns engage +// the hash operator class (#196). The plan-shape assertions below cover the +// surface PR #196 enabled; the corresponding timing thresholds in +// bench_regression_tests.rs are #[ignore]'d pending the hash-chain inlining +// work tracked in #202. +// ============================================================================ + +/// `GROUP BY encrypted_col` engages HashAggregate via the hash operator class. +/// Without the hash op class registered in #196 this would fall back to +/// GroupAggregate-after-Sort or — worse — degenerate to a Nested-Loop self-comparison. 
+#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] +#[cfg_attr( + not(feature = "bench"), + ignore = "perf-bench: gated, run via mise test:bench" +)] +async fn group_by_encrypted_uses_hash_aggregate(pool: PgPool) -> Result<()> { + let sql = "SELECT count(*) FROM bench GROUP BY encrypted_text"; + let plan = explain_query(&pool, sql).await?; + assert!( + plan.contains("HashAggregate"), + "Expected GROUP BY to use HashAggregate. EXPLAIN output:\n{}", + plan + ); + Ok(()) +} + +/// JOIN on `a.encrypted_col = b.encrypted_col` engages the hmac functional index. +/// Acceptable plan shapes: Hash Join (preferred), or Nested Loop + Memoize + +/// Index Scan via `bench_text_hmac_idx` (current planner choice — fine since +/// the index lookup remains the per-probe cost). +#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] +#[cfg_attr( + not(feature = "bench"), + ignore = "perf-bench: gated, run via mise test:bench" +)] +async fn join_on_encrypted_uses_hmac_index(pool: PgPool) -> Result<()> { + let sql = "SELECT count(*) FROM bench a JOIN bench b \ + ON a.encrypted_text = b.encrypted_text"; + assert_uses_index(&pool, sql, BENCH_TEXT_HMAC_IDX).await?; + Ok(()) +} + +/// `SELECT DISTINCT encrypted_col FROM t` (unbounded) engages HashAggregate +/// via the hash operator class. The bounded variant (`... LIMIT N`) biases +/// the planner toward Index Only Scan over the ORE btree opclass — that's +/// fine on full installs but unavailable on Supabase, where this hash path +/// becomes the only viable one. 
+#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] +#[cfg_attr( + not(feature = "bench"), + ignore = "perf-bench: gated, run via mise test:bench" +)] +async fn distinct_encrypted_uses_hash_aggregate(pool: PgPool) -> Result<()> { + let sql = "SELECT DISTINCT encrypted_text FROM bench"; + let plan = explain_query(&pool, sql).await?; + assert!( + plan.contains("HashAggregate"), + "Expected DISTINCT to use HashAggregate. EXPLAIN output:\n{}", + plan + ); + Ok(()) +} diff --git a/tests/sqlx/tests/bench_regression_tests.rs b/tests/sqlx/tests/bench_regression_tests.rs index 2d49faa6..cc7921df 100644 --- a/tests/sqlx/tests/bench_regression_tests.rs +++ b/tests/sqlx/tests/bench_regression_tests.rs @@ -111,3 +111,77 @@ async fn ore_order_by_under_threshold(pool: PgPool) -> Result<()> { ); Ok(()) } + +// ============================================================================ +// Hash-strategy timing regressions: GROUP BY / JOIN / DISTINCT on encrypted +// columns. The plan shapes already engage the hash operator class (#196), but +// per-row cost is dominated by plpgsql call overhead in the +// `hash_encrypted` → `to_ste_vec_value` → `hmac_256` chain. +// +// All three are #[ignore]'d pending the chain inlining tracked in #202. +// Thresholds are set to the post-inlining target (measured by patching the +// chain to LANGUAGE sql IMMUTABLE in-place: ~70ms for GROUP BY, ~182ms for +// the self-join, with consistent plan shapes). Remove the #[ignore] when #202 +// merges and confirm green. +// ============================================================================ + +/// `GROUP BY encrypted_text` should be under 150ms at 10K rows. +/// Measured baseline today: ~309ms (HashAggregate + Seq Scan, plpgsql per-row +/// cost in hash_encrypted chain). Measured with chain inlined: ~70ms. +/// Threshold of 150ms is ~2× the inlined baseline to absorb CI variance. 
+#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] +#[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"] +async fn group_by_encrypted_under_threshold(pool: PgPool) -> Result<()> { + let stats: ExplainStats = explain_analyze_avg( + &pool, + "SELECT count(*) FROM bench GROUP BY encrypted_text", + 5, + ) + .await?; + assert!( + stats.execution_time_ms < 150.0, + "GROUP BY encrypted_text took {:.1}ms, threshold 150ms (~70ms expected after #202, currently ~309ms, node_type={})", + stats.execution_time_ms, stats.node_type + ); + Ok(()) +} + +/// Self-join on `a.encrypted_text = b.encrypted_text` should be under 350ms at +/// 10K rows (which produces ~1M result rows due to ~99 distinct values × ~100 +/// matches each — most of the time is intrinsic result cardinality, not the +/// per-probe cost). +/// Measured baseline today: ~308ms. Measured with chain inlined: ~182ms. +/// Threshold of 350ms catches a regression to seq scan (>1s) without flapping +/// on cardinality variance. +#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] +#[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"] +async fn self_join_encrypted_under_threshold(pool: PgPool) -> Result<()> { + let stats: ExplainStats = explain_analyze_avg( + &pool, + "SELECT count(*) FROM bench a JOIN bench b ON a.encrypted_text = b.encrypted_text", + 3, + ) + .await?; + assert!( + stats.execution_time_ms < 350.0, + "Self-join on encrypted_text took {:.1}ms, threshold 350ms (~182ms expected after #202, currently ~308ms, node_type={})", + stats.execution_time_ms, stats.node_type + ); + Ok(()) +} + +/// `SELECT DISTINCT encrypted_text` should be under 200ms at 10K rows once +/// HashAggregate becomes the planner choice (currently picks Index Only Scan +/// over the ORE btree opclass at ~132ms — usable but unavailable on Supabase). 
+#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] +#[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"] +async fn distinct_encrypted_under_threshold(pool: PgPool) -> Result<()> { + let stats: ExplainStats = + explain_analyze_avg(&pool, "SELECT DISTINCT encrypted_text FROM bench", 5).await?; + assert!( + stats.execution_time_ms < 200.0, + "DISTINCT encrypted_text took {:.1}ms, threshold 200ms (currently ~132ms via ORE btree on full install; want HashAggregate-driven path post-#202, node_type={})", + stats.execution_time_ms, stats.node_type + ); + Ok(()) +} From 7326fc24c123e10ed835f78debafa170ab3c58d5 Mon Sep 17 00:00:00 2001 From: Dan Draper Date: Mon, 11 May 2026 16:41:57 +1000 Subject: [PATCH 2/3] =?UTF-8?q?test(bench):=20correct=20hash-strategy=20re?= =?UTF-8?q?gression=20numbers=20=E2=80=94=20fast-path,=20not=20naive=20inl?= =?UTF-8?q?ining?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After measuring properly, naive plpgsql → LANGUAGE sql conversion of the hash_encrypted chain does not deliver the speedup originally cited: `to_ste_vec_value`'s per-row JSONB inspection/reconstruction dominates even when fully inlined. The actual #202 fix is a fast-path read of root-level `hm` (`coalesce(val.data ->> 'hm', ...)`) with the `to_ste_vec_value` unwrap reserved for single-element ste_vec-wrapped payloads. Updates the regression test docstrings, panic messages, and section header to reflect the measured fast-path numbers: - GROUP BY: expected ~73ms (was ~70ms — close, but framing was wrong). - Self-join: expected ~185ms (was ~182ms — same). - DISTINCT (previously TBD): measured ~72ms with fast-path applied. Thresholds unchanged: 150ms / 350ms / 200ms — all still have ~2x headroom over the fast-path numbers. 
--- tests/sqlx/tests/bench_regression_tests.rs | 45 ++++++++++++++-------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/tests/sqlx/tests/bench_regression_tests.rs b/tests/sqlx/tests/bench_regression_tests.rs index cc7921df..6edcfe63 100644 --- a/tests/sqlx/tests/bench_regression_tests.rs +++ b/tests/sqlx/tests/bench_regression_tests.rs @@ -118,17 +118,22 @@ async fn ore_order_by_under_threshold(pool: PgPool) -> Result<()> { // per-row cost is dominated by plpgsql call overhead in the // `hash_encrypted` → `to_ste_vec_value` → `hmac_256` chain. // -// All three are #[ignore]'d pending the chain inlining tracked in #202. -// Thresholds are set to the post-inlining target (measured by patching the -// chain to LANGUAGE sql IMMUTABLE in-place: ~70ms for GROUP BY, ~182ms for -// the self-join, with consistent plan shapes). Remove the #[ignore] when #202 -// merges and confirm green. +// All three are #[ignore]'d pending the hash_encrypted fast-path tracked in +// #202. The dominant cost on these queries isn't plpgsql call overhead (a +// naive plpgsql → LANGUAGE sql conversion of the existing body leaves them +// effectively unchanged); it's `to_ste_vec_value`'s per-row JSONB inspection +// and reconstruction. The #202 fix short-circuits via root-level `hm` +// (`coalesce(val.data ->> 'hm', ...)`), falling through to `to_ste_vec_value` +// only for single-element ste_vec-wrapped payloads. Thresholds below reflect +// measured numbers with that fast-path applied in-place. Remove the +// `#[ignore]` markers when #202 merges and confirm green. // ============================================================================ /// `GROUP BY encrypted_text` should be under 150ms at 10K rows. -/// Measured baseline today: ~309ms (HashAggregate + Seq Scan, plpgsql per-row -/// cost in hash_encrypted chain). Measured with chain inlined: ~70ms. -/// Threshold of 150ms is ~2× the inlined baseline to absorb CI variance. 
+/// Measured baseline today: ~309ms (HashAggregate + Seq Scan, dominated by +/// per-row `to_ste_vec_value` cost in the hash_encrypted chain). Measured +/// with the #202 fast-path applied: ~73ms. Threshold of 150ms is ~2x the +/// fast-path number to absorb CI variance. #[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] #[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"] async fn group_by_encrypted_under_threshold(pool: PgPool) -> Result<()> { @@ -140,7 +145,7 @@ async fn group_by_encrypted_under_threshold(pool: PgPool) -> Result<()> { .await?; assert!( stats.execution_time_ms < 150.0, - "GROUP BY encrypted_text took {:.1}ms, threshold 150ms (~70ms expected after #202, currently ~309ms, node_type={})", + "GROUP BY encrypted_text took {:.1}ms, threshold 150ms (~73ms expected after #202 fast-path, currently ~309ms, node_type={})", stats.execution_time_ms, stats.node_type ); Ok(()) @@ -150,9 +155,9 @@ async fn group_by_encrypted_under_threshold(pool: PgPool) -> Result<()> { /// 10K rows (which produces ~1M result rows due to ~99 distinct values × ~100 /// matches each — most of the time is intrinsic result cardinality, not the /// per-probe cost). -/// Measured baseline today: ~308ms. Measured with chain inlined: ~182ms. -/// Threshold of 350ms catches a regression to seq scan (>1s) without flapping -/// on cardinality variance. +/// Measured baseline today: ~308ms. Measured with the #202 fast-path applied: +/// ~185ms. Threshold of 350ms catches a regression to seq scan (>1s) without +/// flapping on cardinality variance. 
#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] #[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"] async fn self_join_encrypted_under_threshold(pool: PgPool) -> Result<()> { @@ -164,15 +169,21 @@ async fn self_join_encrypted_under_threshold(pool: PgPool) -> Result<()> { .await?; assert!( stats.execution_time_ms < 350.0, - "Self-join on encrypted_text took {:.1}ms, threshold 350ms (~182ms expected after #202, currently ~308ms, node_type={})", + "Self-join on encrypted_text took {:.1}ms, threshold 350ms (~185ms expected after #202 fast-path, currently ~308ms, node_type={})", stats.execution_time_ms, stats.node_type ); Ok(()) } -/// `SELECT DISTINCT encrypted_text` should be under 200ms at 10K rows once -/// HashAggregate becomes the planner choice (currently picks Index Only Scan -/// over the ORE btree opclass at ~132ms — usable but unavailable on Supabase). +/// Unbounded `SELECT DISTINCT encrypted_text` should be under 200ms at 10K rows. +/// Measured baseline today: ~515ms (HashAggregate over plpgsql hash_encrypted +/// chain). Measured with the #202 fast-path applied (`coalesce(val.data ->> +/// 'hm', ...)`): ~72ms. Threshold of 200ms is ~2.8x the fast-path number to +/// absorb CI variance. +/// +/// (The `... LIMIT N` variant biases the planner toward Index Only Scan over +/// the ORE btree opclass — fine on full installs but unavailable on Supabase. +/// This test exercises the unbounded path that engages HashAggregate.) 
#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] #[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"] async fn distinct_encrypted_under_threshold(pool: PgPool) -> Result<()> { @@ -180,7 +191,7 @@ async fn distinct_encrypted_under_threshold(pool: PgPool) -> Result<()> { explain_analyze_avg(&pool, "SELECT DISTINCT encrypted_text FROM bench", 5).await?; assert!( stats.execution_time_ms < 200.0, - "DISTINCT encrypted_text took {:.1}ms, threshold 200ms (currently ~132ms via ORE btree on full install; want HashAggregate-driven path post-#202, node_type={})", + "DISTINCT encrypted_text took {:.1}ms, threshold 200ms (~72ms expected after #202 fast-path, currently ~515ms, node_type={})", stats.execution_time_ms, stats.node_type ); Ok(()) From 02b539e62b30219b1c60394296295babe88fc395 Mon Sep 17 00:00:00 2001 From: Dan Draper Date: Mon, 11 May 2026 17:24:39 +1000 Subject: [PATCH 3/3] test(bench): cover GROUP BY on JSON field extracted from encrypted column MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the field-level GROUP BY scenario alongside the root-level coverage. The pattern — `GROUP BY eql_v2.jsonb_path_query_first(col, '<selector>')` — is the canonical "tell me how many rows per region" query against ste_vec encryption where the field has a `unique` index configured. The current bench fixture's field-level sv elements carry only OPE terms (`ocv` / `ocf`), no `hm`, which means field-level GROUP BY raises ("Cannot hash eql_v2_encrypted value: no hmac_256 index term found"). The new `bench_json_data.sql` fixture overlays `hm` onto the $.hello sv element of each bench row — synthesising what `@cipherstash/protect` produces when the $.hello path is configured with `unique`. Reuses the existing `bench` rows so the fixture cost stays the same. 
Plan assertion in `bench_plan_tests.rs` (gated on bench feature, passes today): `group_by_jsonb_field_uses_hash_aggregate` — confirms the planner engages `HashAggregate` (today as `Partial HashAggregate` under a parallel worker, then Sort + GroupAggregate for the merge — once #204 inlines the extractors, this may flatten to a single HashAggregate). Regression timing in `bench_regression_tests.rs` (#[ignore]'d pending #204): `group_by_jsonb_field_under_threshold` — threshold 50ms. Measured numbers at 10K rows: ~278ms on main, ~234ms with the #202 fast-path patched in, ~7ms achievable via raw JSONB extraction (`(col).data->'sv'->N->>'hm'`). The 50ms threshold targets the post-#204-extractor-inlining state. --- tests/sqlx/fixtures/bench_json_data.sql | 49 ++++++++++++++++++++++ tests/sqlx/tests/bench_plan_tests.rs | 40 ++++++++++++++++++ tests/sqlx/tests/bench_regression_tests.rs | 32 ++++++++++++++ 3 files changed, 121 insertions(+) create mode 100644 tests/sqlx/fixtures/bench_json_data.sql diff --git a/tests/sqlx/fixtures/bench_json_data.sql b/tests/sqlx/fixtures/bench_json_data.sql new file mode 100644 index 00000000..3002a8b1 --- /dev/null +++ b/tests/sqlx/fixtures/bench_json_data.sql @@ -0,0 +1,49 @@ +-- Fixture: bench_json_data.sql +-- +-- Builds the bench_json table by overlaying the existing `bench` rows +-- (loaded by the bench_data.sql fixture, which must run first) and adding +-- `hm` to the $.hello sv element of each row. This mirrors what +-- `@cipherstash/protect` would produce for a JSONB column where the +-- $.hello path is configured with a `unique` index — without that, the +-- field-level sv element carries only OPE terms (`ocv`) and field-level +-- GROUP BY / DISTINCT / hash joins on the extracted value raise: +-- +-- ERROR: Cannot hash eql_v2_encrypted value: no hmac_256 index term +-- found. Configure a `unique` index on the column for hash +-- operations (GROUP BY, DISTINCT, hash joins). 
+-- +-- The synthesised `hm` is the field's existing `ocv` hex string (already +-- deterministic over the plaintext at that selector) so it serves as a +-- valid equality token without us inventing a separate one. The shape +-- matches production: `c`, `s`, `ocv`, `hm` at the sv element level. +-- +-- Selector cheatsheet (matches Selectors:: in tests/sqlx/src/selectors.rs): +-- bca213de9ccce676fa849ff9c4807963 → $ (root, has b3 here today) +-- a7cea93975ed8c01f861ccb6bd082784 → $.hello (we add `hm` here) +-- 2517068c0d1f9d4d41d2c666211f785e → $.n (left alone) + +CREATE TABLE IF NOT EXISTS bench_json ( + id bigserial PRIMARY KEY, + e eql_v2_encrypted +); + +INSERT INTO bench_json (e) +SELECT (jsonb_build_object( + 'c', (encrypted_text).data ->> 'c', + 'i', (encrypted_text).data -> 'i', + 'v', 2, + 'hm', (encrypted_text).data ->> 'hm', + 'sv', ( + SELECT jsonb_agg( + CASE + WHEN elem ->> 's' = 'a7cea93975ed8c01f861ccb6bd082784' + THEN elem || jsonb_build_object('hm', elem ->> 'ocv') + ELSE elem + END + ) + FROM jsonb_array_elements((encrypted_text).data -> 'sv') elem + ) +)) :: eql_v2_encrypted +FROM bench; + +ANALYZE bench_json; diff --git a/tests/sqlx/tests/bench_plan_tests.rs b/tests/sqlx/tests/bench_plan_tests.rs index 8120f6af..0be9bab7 100644 --- a/tests/sqlx/tests/bench_plan_tests.rs +++ b/tests/sqlx/tests/bench_plan_tests.rs @@ -223,3 +223,43 @@ async fn distinct_encrypted_uses_hash_aggregate(pool: PgPool) -> Result<()> { ); Ok(()) } + +// ============================================================================ +// Field-level hash-strategy: GROUP BY on a JSON path extracted from an +// encrypted column. This is the "how many users per region" pattern against +// ste_vec encryption. +// +// Uses the bench_json fixture, which overlays `hm` onto the `$.hello` sv +// element of each bench row — simulating what `@cipherstash/protect` produces +// for a JSONB column where the `$.hello` path is configured with a `unique` +// index. 
Without that overlay, field-level GROUP BY raises today +// ("Cannot hash eql_v2_encrypted value: no hmac_256 index term found"). +// ============================================================================ + +/// Documented EQL form for field-level GROUP BY: +/// `GROUP BY eql_v2.jsonb_path_query_first(col, '<selector>')`. +/// The planner engages a parallel Partial HashAggregate + Sort + GroupAggregate +/// merge — i.e., `HashAggregate` appears in the plan even if it's not the top +/// node. The bare-`->` form (`col -> '<selector>'::text`) currently picks +/// Sort + GroupAggregate instead; once #204 inlines the extractors, the +/// planner has the option of flattening to a single HashAggregate. +#[sqlx::test(fixtures( + path = "../fixtures", + scripts("bench_data", "bench_setup", "bench_json_data") +))] +#[cfg_attr( + not(feature = "bench"), + ignore = "perf-bench: gated, run via mise test:bench" +)] +async fn group_by_jsonb_field_uses_hash_aggregate(pool: PgPool) -> Result<()> { + // Selectors::HELLO = $.hello — see tests/sqlx/src/selectors.rs. + let sql = "SELECT count(*) FROM bench_json \ + GROUP BY eql_v2.jsonb_path_query_first(e, 'a7cea93975ed8c01f861ccb6bd082784')"; + let plan = explain_query(&pool, sql).await?; + assert!( + plan.contains("HashAggregate"), + "Expected field-level GROUP BY plan to include a HashAggregate node. EXPLAIN output:\n{}", + plan + ); + Ok(()) +} diff --git a/tests/sqlx/tests/bench_regression_tests.rs b/tests/sqlx/tests/bench_regression_tests.rs index 6edcfe63..5335db95 100644 --- a/tests/sqlx/tests/bench_regression_tests.rs +++ b/tests/sqlx/tests/bench_regression_tests.rs @@ -196,3 +196,35 @@ async fn distinct_encrypted_under_threshold(pool: PgPool) -> Result<()> { ); Ok(()) } + +/// Field-level `GROUP BY eql_v2.jsonb_path_query_first(col, '<selector>')` — +/// the canonical pattern for "count rows per JSON field value" against an +/// encrypted column where the field has a `unique` index configured. 
+/// +/// Closing #202 (hash_encrypted fast-path) helps but doesn't fully solve this: +/// the dominant cost shifts to `jsonb_path_query_first`'s per-row plpgsql +/// overhead. The threshold below reflects what becomes achievable once #204 +/// inlines the JSONB field extractors. +/// +/// Measured at 10K rows on a synthesized fixture with `hm` overlaid at +/// $.hello: current main ~496ms, with #202 fast-path applied ~234ms, with +/// raw JSONB extraction bypass (proxy-emitted, no eql_v2 plpgsql in the +/// hot path) ~7ms. Threshold of 50ms is set near the bypass number — when +/// #204 inlines the extractors, the EQL form should converge on the same +/// ballpark. +#[sqlx::test(fixtures( + path = "../fixtures", + scripts("bench_data", "bench_setup", "bench_json_data") +))] +#[ignore = "#204: JSONB field extractors not yet inlined; remove ignore when #204 merges"] +async fn group_by_jsonb_field_under_threshold(pool: PgPool) -> Result<()> { + let sql = "SELECT count(*) FROM bench_json \ + GROUP BY eql_v2.jsonb_path_query_first(e, 'a7cea93975ed8c01f861ccb6bd082784')"; + let stats: ExplainStats = explain_analyze_avg(&pool, sql, 5).await?; + assert!( + stats.execution_time_ms < 50.0, + "GROUP BY field-level jsonb_path_query_first took {:.1}ms, threshold 50ms (~7ms achievable with full extractor inlining + #202, currently ~496ms on main, ~234ms with #202 fast-path only, node_type={})", + stats.execution_time_ms, stats.node_type + ); + Ok(()) +}