diff --git a/tests/sqlx/fixtures/bench_json_data.sql b/tests/sqlx/fixtures/bench_json_data.sql new file mode 100644 index 00000000..3002a8b1 --- /dev/null +++ b/tests/sqlx/fixtures/bench_json_data.sql @@ -0,0 +1,49 @@ +-- Fixture: bench_json_data.sql +-- +-- Builds the bench_json table by overlaying the existing `bench` rows +-- (loaded by the bench_data.sql fixture, which must run first) and adding +-- `hm` to the $.hello sv element of each row. This mirrors what +-- `@cipherstash/protect` would produce for a JSONB column where the +-- $.hello path is configured with a `unique` index — without that, the +-- field-level sv element carries only OPE terms (`ocv`) and field-level +-- GROUP BY / DISTINCT / hash joins on the extracted value raise: +-- +-- ERROR: Cannot hash eql_v2_encrypted value: no hmac_256 index term +-- found. Configure a `unique` index on the column for hash +-- operations (GROUP BY, DISTINCT, hash joins). +-- +-- The synthesised `hm` is the field's existing `ocv` hex string (already +-- deterministic over the plaintext at that selector) so it serves as a +-- valid equality token without us inventing a separate one. The shape +-- matches production: `c`, `s`, `ocv`, `hm` at the sv element level. +-- +-- Selector cheatsheet (matches Selectors:: in tests/sqlx/src/selectors.rs): +-- bca213de9ccce676fa849ff9c4807963 → $ (root, has b3 here today) +-- a7cea93975ed8c01f861ccb6bd082784 → $.hello (we add `hm` here) +-- 2517068c0d1f9d4d41d2c666211f785e → $.n (left alone) + +CREATE TABLE IF NOT EXISTS bench_json ( + id bigserial PRIMARY KEY, + e eql_v2_encrypted +); + +INSERT INTO bench_json (e) +SELECT (jsonb_build_object( + 'c', (encrypted_text).data ->> 'c', + 'i', (encrypted_text).data -> 'i', + 'v', 2, + 'hm', (encrypted_text).data ->> 'hm', + 'sv', ( + SELECT jsonb_agg( + CASE + WHEN elem ->> 's' = 'a7cea93975ed8c01f861ccb6bd082784' + THEN elem || jsonb_build_object('hm', elem ->> 'ocv') + ELSE elem + END + ) + FROM jsonb_array_elements((encrypted_text).data -> 'sv') elem + ) +)) :: eql_v2_encrypted +FROM bench; + +ANALYZE bench_json; diff --git a/tests/sqlx/tests/bench_plan_tests.rs b/tests/sqlx/tests/bench_plan_tests.rs index 2cb85ec1..0be9bab7 100644 --- a/tests/sqlx/tests/bench_plan_tests.rs +++ b/tests/sqlx/tests/bench_plan_tests.rs @@ -6,7 +6,9 @@ //! ANALYZE is run by the bench_setup fixture — planner statistics are populated at fixture load. use anyhow::Result; -use eql_tests::{assert_uses_index, get_bench_encrypted_int, get_bench_encrypted_text}; +use eql_tests::{ + assert_uses_index, explain_query, get_bench_encrypted_int, get_bench_encrypted_text, +}; use sqlx::PgPool; const BENCH_INT_ORE_IDX: &str = "bench_int_ore_idx"; @@ -157,3 +159,107 @@ async fn bare_ilike_uses_bloom_index(pool: PgPool) -> Result<()> { assert_uses_index(&pool, &sql, BENCH_TEXT_BLOOM_IDX).await?; Ok(()) } + +// ============================================================================ +// Hash-strategy plans: GROUP BY / JOIN / DISTINCT on encrypted columns engage +// the hash operator class (#196). The plan-shape assertions below cover the +// surface PR #196 enabled; the corresponding timing thresholds in +// bench_regression_tests.rs are #[ignore]'d pending the hash-chain inlining +// work tracked in #202. +// ============================================================================ + +/// `GROUP BY encrypted_col` engages HashAggregate via the hash operator class. +/// Without the hash op class registered in #196 this would fall back to +/// GroupAggregate-after-Sort or — worse — degenerate to a Nested-Loop self-comparison. +#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] +#[cfg_attr( + not(feature = "bench"), + ignore = "perf-bench: gated, run via mise test:bench" +)] +async fn group_by_encrypted_uses_hash_aggregate(pool: PgPool) -> Result<()> { + let sql = "SELECT count(*) FROM bench GROUP BY encrypted_text"; + let plan = explain_query(&pool, sql).await?; + assert!( + plan.contains("HashAggregate"), + "Expected GROUP BY to use HashAggregate. EXPLAIN output:\n{}", + plan + ); + Ok(()) +} + +/// JOIN on `a.encrypted_col = b.encrypted_col` engages the hmac functional index. +/// Acceptable plan shapes: Hash Join (preferred), or Nested Loop + Memoize + +/// Index Scan via `bench_text_hmac_idx` (current planner choice — fine since +/// the index lookup remains the per-probe cost). +#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] +#[cfg_attr( + not(feature = "bench"), + ignore = "perf-bench: gated, run via mise test:bench" +)] +async fn join_on_encrypted_uses_hmac_index(pool: PgPool) -> Result<()> { + let sql = "SELECT count(*) FROM bench a JOIN bench b \ + ON a.encrypted_text = b.encrypted_text"; + assert_uses_index(&pool, sql, BENCH_TEXT_HMAC_IDX).await?; + Ok(()) +} + +/// `SELECT DISTINCT encrypted_col FROM t` (unbounded) engages HashAggregate +/// via the hash operator class. The bounded variant (`... LIMIT N`) biases +/// the planner toward Index Only Scan over the ORE btree opclass — that's +/// fine on full installs but unavailable on Supabase, where this hash path +/// becomes the only viable one. +#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] +#[cfg_attr( + not(feature = "bench"), + ignore = "perf-bench: gated, run via mise test:bench" +)] +async fn distinct_encrypted_uses_hash_aggregate(pool: PgPool) -> Result<()> { + let sql = "SELECT DISTINCT encrypted_text FROM bench"; + let plan = explain_query(&pool, sql).await?; + assert!( + plan.contains("HashAggregate"), + "Expected DISTINCT to use HashAggregate. EXPLAIN output:\n{}", + plan + ); + Ok(()) +} + +// ============================================================================ +// Field-level hash-strategy: GROUP BY on a JSON path extracted from an +// encrypted column. This is the "how many users per region" pattern against +// ste_vec encryption. +// +// Uses the bench_json fixture, which overlays `hm` onto the `$.hello` sv +// element of each bench row — simulating what `@cipherstash/protect` produces +// for a JSONB column where the `$.hello` path is configured with a `unique` +// index. Without that overlay, field-level GROUP BY raises today +// ("Cannot hash eql_v2_encrypted value: no hmac_256 index term found"). +// ============================================================================ + +/// Documented EQL form for field-level GROUP BY: +/// `GROUP BY eql_v2.jsonb_path_query_first(col, '')`. +/// The planner engages a parallel Partial HashAggregate + Sort + GroupAggregate +/// merge — i.e., `HashAggregate` appears in the plan even if it's not the top +/// node. The bare-`->` form (`col -> ''::text`) currently picks +/// Sort + GroupAggregate instead; once #204 inlines the extractors, the +/// planner has the option of flattening to a single HashAggregate. +#[sqlx::test(fixtures( + path = "../fixtures", + scripts("bench_data", "bench_setup", "bench_json_data") +))] +#[cfg_attr( + not(feature = "bench"), + ignore = "perf-bench: gated, run via mise test:bench" +)] +async fn group_by_jsonb_field_uses_hash_aggregate(pool: PgPool) -> Result<()> { + // Selectors::HELLO = $.hello — see tests/sqlx/src/selectors.rs. + let sql = "SELECT count(*) FROM bench_json \ + GROUP BY eql_v2.jsonb_path_query_first(e, 'a7cea93975ed8c01f861ccb6bd082784')"; + let plan = explain_query(&pool, sql).await?; + assert!( + plan.contains("HashAggregate"), + "Expected field-level GROUP BY plan to include a HashAggregate node. EXPLAIN output:\n{}", + plan + ); + Ok(()) +} diff --git a/tests/sqlx/tests/bench_regression_tests.rs b/tests/sqlx/tests/bench_regression_tests.rs index 2d49faa6..5335db95 100644 --- a/tests/sqlx/tests/bench_regression_tests.rs +++ b/tests/sqlx/tests/bench_regression_tests.rs @@ -111,3 +111,120 @@ async fn ore_order_by_under_threshold(pool: PgPool) -> Result<()> { ); Ok(()) } + +// ============================================================================ +// Hash-strategy timing regressions: GROUP BY / JOIN / DISTINCT on encrypted +// columns. The plan shapes already engage the hash operator class (#196), but +// per-row cost is dominated by plpgsql call overhead in the +// `hash_encrypted` → `to_ste_vec_value` → `hmac_256` chain. +// +// All three are #[ignore]'d pending the hash_encrypted fast-path tracked in +// #202. The dominant cost on these queries isn't plpgsql call overhead (a +// naive plpgsql → LANGUAGE sql conversion of the existing body leaves them +// effectively unchanged); it's `to_ste_vec_value`'s per-row JSONB inspection +// and reconstruction. The #202 fix short-circuits via root-level `hm` +// (`coalesce(val.data ->> 'hm', ...)`), falling through to `to_ste_vec_value` +// only for single-element ste_vec-wrapped payloads. Thresholds below reflect +// measured numbers with that fast-path applied in-place. Remove the +// `#[ignore]` markers when #202 merges and confirm green. +// ============================================================================ + +/// `GROUP BY encrypted_text` should be under 150ms at 10K rows. +/// Measured baseline today: ~309ms (HashAggregate + Seq Scan, dominated by +/// per-row `to_ste_vec_value` cost in the hash_encrypted chain). Measured +/// with the #202 fast-path applied: ~73ms. Threshold of 150ms is ~2x the +/// fast-path number to absorb CI variance. +#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] +#[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"] +async fn group_by_encrypted_under_threshold(pool: PgPool) -> Result<()> { + let stats: ExplainStats = explain_analyze_avg( + &pool, + "SELECT count(*) FROM bench GROUP BY encrypted_text", + 5, + ) + .await?; + assert!( + stats.execution_time_ms < 150.0, + "GROUP BY encrypted_text took {:.1}ms, threshold 150ms (~73ms expected after #202 fast-path, currently ~309ms, node_type={})", + stats.execution_time_ms, stats.node_type + ); + Ok(()) +} + +/// Self-join on `a.encrypted_text = b.encrypted_text` should be under 350ms at +/// 10K rows (which produces ~1M result rows due to ~99 distinct values × ~100 +/// matches each — most of the time is intrinsic result cardinality, not the +/// per-probe cost). +/// Measured baseline today: ~308ms. Measured with the #202 fast-path applied: +/// ~185ms. Threshold of 350ms catches a regression to seq scan (>1s) without +/// flapping on cardinality variance. +#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] +#[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"] +async fn self_join_encrypted_under_threshold(pool: PgPool) -> Result<()> { + let stats: ExplainStats = explain_analyze_avg( + &pool, + "SELECT count(*) FROM bench a JOIN bench b ON a.encrypted_text = b.encrypted_text", + 3, + ) + .await?; + assert!( + stats.execution_time_ms < 350.0, + "Self-join on encrypted_text took {:.1}ms, threshold 350ms (~185ms expected after #202 fast-path, currently ~308ms, node_type={})", + stats.execution_time_ms, stats.node_type + ); + Ok(()) +} + +/// Unbounded `SELECT DISTINCT encrypted_text` should be under 200ms at 10K rows. +/// Measured baseline today: ~515ms (HashAggregate over plpgsql hash_encrypted +/// chain). Measured with the #202 fast-path applied (`coalesce(val.data ->> +/// 'hm', ...)`): ~72ms. Threshold of 200ms is ~2.8x the fast-path number to +/// absorb CI variance. +/// +/// (The `... LIMIT N` variant biases the planner toward Index Only Scan over +/// the ORE btree opclass — fine on full installs but unavailable on Supabase. +/// This test exercises the unbounded path that engages HashAggregate.) +#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))] +#[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"] +async fn distinct_encrypted_under_threshold(pool: PgPool) -> Result<()> { + let stats: ExplainStats = + explain_analyze_avg(&pool, "SELECT DISTINCT encrypted_text FROM bench", 5).await?; + assert!( + stats.execution_time_ms < 200.0, + "DISTINCT encrypted_text took {:.1}ms, threshold 200ms (~72ms expected after #202 fast-path, currently ~515ms, node_type={})", + stats.execution_time_ms, stats.node_type + ); + Ok(()) +} + +/// Field-level `GROUP BY eql_v2.jsonb_path_query_first(col, '')` — +/// the canonical pattern for "count rows per JSON field value" against an +/// encrypted column where the field has a `unique` index configured. +/// +/// Closing #202 (hash_encrypted fast-path) helps but doesn't fully solve this: +/// the dominant cost shifts to `jsonb_path_query_first`'s per-row plpgsql +/// overhead. The threshold below reflects what becomes achievable once #204 +/// inlines the JSONB field extractors. +/// +/// Measured at 10K rows on a synthesized fixture with `hm` overlaid at +/// $.hello: current main ~496ms, with #202 fast-path applied ~234ms, with +/// raw JSONB extraction bypass (proxy-emitted, no eql_v2 plpgsql in the +/// hot path) ~7ms. Threshold of 50ms is set near the bypass number — when +/// #204 inlines the extractors, the EQL form should converge on the same +/// ballpark. +#[sqlx::test(fixtures( + path = "../fixtures", + scripts("bench_data", "bench_setup", "bench_json_data") +))] +#[ignore = "#204: JSONB field extractors not yet inlined; remove ignore when #204 merges"] +async fn group_by_jsonb_field_under_threshold(pool: PgPool) -> Result<()> { + let sql = "SELECT count(*) FROM bench_json \ + GROUP BY eql_v2.jsonb_path_query_first(e, 'a7cea93975ed8c01f861ccb6bd082784')"; + let stats: ExplainStats = explain_analyze_avg(&pool, sql, 5).await?; + assert!( + stats.execution_time_ms < 50.0, + "GROUP BY field-level jsonb_path_query_first took {:.1}ms, threshold 50ms (~7ms achievable with full extractor inlining + #202, currently ~496ms on main, ~234ms with #202 fast-path only, node_type={})", + stats.execution_time_ms, stats.node_type + ); + Ok(()) +}