Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions tests/sqlx/fixtures/bench_json_data.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
-- Fixture: bench_json_data.sql
--
-- Builds the bench_json table by overlaying the existing `bench` rows
-- (loaded by the bench_data.sql fixture, which must run first) and adding
-- `hm` to the $.hello sv element of each row. This mirrors what
-- `@cipherstash/protect` would produce for a JSONB column where the
-- $.hello path is configured with a `unique` index — without that, the
-- field-level sv element carries only OPE terms (`ocv`) and field-level
-- GROUP BY / DISTINCT / hash joins on the extracted value raise:
--
-- ERROR: Cannot hash eql_v2_encrypted value: no hmac_256 index term
-- found. Configure a `unique` index on the column for hash
-- operations (GROUP BY, DISTINCT, hash joins).
--
-- The synthesised `hm` is the field's existing `ocv` hex string (already
-- deterministic over the plaintext at that selector) so it serves as a
-- valid equality token without us inventing a separate one. The shape
-- matches production: `c`, `s`, `ocv`, `hm` at the sv element level.
--
-- Selector cheatsheet (matches Selectors:: in tests/sqlx/src/selectors.rs):
-- bca213de9ccce676fa849ff9c4807963 → $ (root, has b3 here today)
-- a7cea93975ed8c01f861ccb6bd082784 → $.hello (we add `hm` here)
-- 2517068c0d1f9d4d41d2c666211f785e → $.n (left alone)

-- Target table for the field-level hash benchmarks. Created only if absent,
-- so loading the fixture into a database that already has it does not fail.
CREATE TABLE IF NOT EXISTS bench_json (
    -- Surrogate key; auto-assigned on insert.
    id bigserial,
    -- Encrypted payload rebuilt from the `bench` rows by the INSERT below.
    e eql_v2_encrypted,
    PRIMARY KEY (id)
);

-- Populate bench_json with one rebuilt payload per `bench` row.
--
-- The root object keeps `c`, `i` and `hm` from the source payload and pins
-- `v` to 2. The `sv` array is copied element by element; the element whose
-- selector `s` is the $.hello selector additionally receives an `hm` key
-- holding that element's own `ocv` value. All other elements pass through
-- unchanged.
--
-- WITH ORDINALITY + ORDER BY inside jsonb_agg pins the rebuilt `sv` array to
-- the source element order. Without an explicit ORDER BY the input order of
-- an aggregate is not guaranteed by PostgreSQL.
INSERT INTO bench_json (e)
SELECT (jsonb_build_object(
    'c', (encrypted_text).data ->> 'c',
    'i', (encrypted_text).data -> 'i',
    'v', 2,
    'hm', (encrypted_text).data ->> 'hm',
    'sv', (
        SELECT jsonb_agg(
            CASE
                -- $.hello selector: add the synthesised equality term.
                WHEN sv.elem ->> 's' = 'a7cea93975ed8c01f861ccb6bd082784'
                    THEN sv.elem || jsonb_build_object('hm', sv.elem ->> 'ocv')
                ELSE sv.elem
            END
            ORDER BY sv.ord
        )
        FROM jsonb_array_elements((encrypted_text).data -> 'sv')
            WITH ORDINALITY AS sv(elem, ord)
    )
)) :: eql_v2_encrypted
FROM bench;

-- Collect planner statistics for the freshly populated table so the plans
-- produced for queries against bench_json are based on its actual contents.
ANALYZE bench_json;
108 changes: 107 additions & 1 deletion tests/sqlx/tests/bench_plan_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
//! ANALYZE is run by the bench_setup fixture — planner statistics are populated at fixture load.

use anyhow::Result;
use eql_tests::{assert_uses_index, get_bench_encrypted_int, get_bench_encrypted_text};
use eql_tests::{
assert_uses_index, explain_query, get_bench_encrypted_int, get_bench_encrypted_text,
};
use sqlx::PgPool;

const BENCH_INT_ORE_IDX: &str = "bench_int_ore_idx";
Expand Down Expand Up @@ -157,3 +159,107 @@ async fn bare_ilike_uses_bloom_index(pool: PgPool) -> Result<()> {
assert_uses_index(&pool, &sql, BENCH_TEXT_BLOOM_IDX).await?;
Ok(())
}

// ============================================================================
// Hash-strategy plans: GROUP BY / JOIN / DISTINCT on encrypted columns engage
// the hash operator class (#196). The plan-shape assertions below cover the
// surface PR #196 enabled; the corresponding timing thresholds in
// bench_regression_tests.rs are #[ignore]'d pending the hash-chain inlining
// work tracked in #202.
// ============================================================================

/// Plan-shape check for `GROUP BY encrypted_col`: the EXPLAIN output must
/// mention `HashAggregate`, which relies on the hash operator class registered
/// in #196. The shapes this guards against are GroupAggregate-after-Sort and,
/// worse, a degenerate Nested-Loop self-comparison.
#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
#[cfg_attr(
    not(feature = "bench"),
    ignore = "perf-bench: gated, run via mise test:bench"
)]
async fn group_by_encrypted_uses_hash_aggregate(pool: PgPool) -> Result<()> {
    let explain_output =
        explain_query(&pool, "SELECT count(*) FROM bench GROUP BY encrypted_text").await?;

    // Only the presence of the node matters, not its position in the plan.
    let has_hash_aggregate = explain_output.contains("HashAggregate");
    assert!(
        has_hash_aggregate,
        "Expected GROUP BY to use HashAggregate. EXPLAIN output:\n{}",
        explain_output
    );
    Ok(())
}

/// JOIN on `a.encrypted_col = b.encrypted_col` engages the hmac functional index.
/// Acceptable plan shapes: Hash Join (preferred), or Nested Loop + Memoize +
/// Index Scan via `bench_text_hmac_idx` (current planner choice — fine since
/// the index lookup remains the per-probe cost).
#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
#[cfg_attr(
not(feature = "bench"),
ignore = "perf-bench: gated, run via mise test:bench"
)]
async fn join_on_encrypted_uses_hmac_index(pool: PgPool) -> Result<()> {
let sql = "SELECT count(*) FROM bench a JOIN bench b \
ON a.encrypted_text = b.encrypted_text";
assert_uses_index(&pool, sql, BENCH_TEXT_HMAC_IDX).await?;
Ok(())
Comment on lines +190 to +203
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Join assertion currently rejects a documented acceptable Hash Join plan

The test description allows either Hash Join or index-driven nested-loop plans, but the assertion only accepts index usage. This can fail valid plans.

Suggested patch
 async fn join_on_encrypted_uses_hmac_index(pool: PgPool) -> Result<()> {
     let sql = "SELECT count(*) FROM bench a JOIN bench b \
                ON a.encrypted_text = b.encrypted_text";
-    assert_uses_index(&pool, sql, BENCH_TEXT_HMAC_IDX).await?;
+    let plan = explain_query(&pool, sql).await?;
+    assert!(
+        plan.contains("Hash Join") || plan.contains(BENCH_TEXT_HMAC_IDX),
+        "Expected JOIN to use Hash Join or {}. EXPLAIN output:\n{}",
+        BENCH_TEXT_HMAC_IDX,
+        plan
+    );
     Ok(())
 }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
/// JOIN on `a.encrypted_col = b.encrypted_col` engages the hmac functional index.
/// Acceptable plan shapes: Hash Join (preferred), or Nested Loop + Memoize +
/// Index Scan via `bench_text_hmac_idx` (current planner choice — fine since
/// the index lookup remains the per-probe cost).
#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
#[cfg_attr(
not(feature = "bench"),
ignore = "perf-bench: gated, run via mise test:bench"
)]
async fn join_on_encrypted_uses_hmac_index(pool: PgPool) -> Result<()> {
let sql = "SELECT count(*) FROM bench a JOIN bench b \
ON a.encrypted_text = b.encrypted_text";
assert_uses_index(&pool, sql, BENCH_TEXT_HMAC_IDX).await?;
Ok(())
/// JOIN on `a.encrypted_col = b.encrypted_col` engages the hmac functional index.
/// Acceptable plan shapes: Hash Join (preferred), or Nested Loop + Memoize +
/// Index Scan via `bench_text_hmac_idx` (current planner choice — fine since
/// the index lookup remains the per-probe cost).
#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
#[cfg_attr(
not(feature = "bench"),
ignore = "perf-bench: gated, run via mise test:bench"
)]
async fn join_on_encrypted_uses_hmac_index(pool: PgPool) -> Result<()> {
let sql = "SELECT count(*) FROM bench a JOIN bench b \
ON a.encrypted_text = b.encrypted_text";
let plan = explain_query(&pool, sql).await?;
assert!(
plan.contains("Hash Join") || plan.contains(BENCH_TEXT_HMAC_IDX),
"Expected JOIN to use Hash Join or {}. EXPLAIN output:\n{}",
BENCH_TEXT_HMAC_IDX,
plan
);
Ok(())
}
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tests/sqlx/tests/bench_plan_tests.rs` around lines 153 - 166, The test
join_on_encrypted_uses_hmac_index currently calls assert_uses_index(&pool, sql,
BENCH_TEXT_HMAC_IDX) but the test comment allows either an index-driven
nested-loop plan or a Hash Join; update the assertion to accept both plan shapes
by either (A) changing the test to call a new helper (e.g.,
assert_uses_index_or_plan(&pool, sql, BENCH_TEXT_HMAC_IDX, "Hash Join")) that
succeeds if the plan contains BENCH_TEXT_HMAC_IDX OR the string "Hash Join", or
(B) extend assert_uses_index to accept an additional allowed_plan parameter and
treat the test as passing if the executed plan contains the index name OR the
allowed_plan ("Hash Join") — locate the test function
join_on_encrypted_uses_hmac_index and the helper assert_uses_index /
BENCH_TEXT_HMAC_IDX to implement this conditional check.

}

/// Plan-shape check for the unbounded `SELECT DISTINCT encrypted_col FROM t`:
/// the EXPLAIN output must mention `HashAggregate`, i.e. the hash operator
/// class is in play.
///
/// The bounded form (`... LIMIT N`) is deliberately not used here: it biases
/// the planner toward an Index Only Scan over the ORE btree opclass. That is
/// fine on full installs but unavailable on Supabase, where the hash path is
/// the only viable one.
#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
#[cfg_attr(
    not(feature = "bench"),
    ignore = "perf-bench: gated, run via mise test:bench"
)]
async fn distinct_encrypted_uses_hash_aggregate(pool: PgPool) -> Result<()> {
    let explain_output = explain_query(&pool, "SELECT DISTINCT encrypted_text FROM bench").await?;

    let has_hash_aggregate = explain_output.contains("HashAggregate");
    assert!(
        has_hash_aggregate,
        "Expected DISTINCT to use HashAggregate. EXPLAIN output:\n{}",
        explain_output
    );
    Ok(())
}

// ============================================================================
// Field-level hash-strategy: GROUP BY on a JSON path extracted from an
// encrypted column. This is the "how many users per region" pattern against
// ste_vec encryption.
//
// Uses the bench_json fixture, which overlays `hm` onto the `$.hello` sv
// element of each bench row — simulating what `@cipherstash/protect` produces
// for a JSONB column where the `$.hello` path is configured with a `unique`
// index. Without that overlay, field-level GROUP BY raises today
// ("Cannot hash eql_v2_encrypted value: no hmac_256 index term found").
// ============================================================================

/// Plan-shape check for the documented field-level GROUP BY form,
/// `GROUP BY eql_v2.jsonb_path_query_first(col, '<selector>')`.
///
/// The planner is expected to pick a parallel Partial HashAggregate + Sort +
/// GroupAggregate merge, so the assertion only requires that `HashAggregate`
/// appears somewhere in the plan, not that it is the top node. The bare-`->`
/// form (`col -> '<sel>'::text`) currently yields Sort + GroupAggregate
/// instead; once #204 inlines the extractors the planner may flatten this to
/// a single HashAggregate.
#[sqlx::test(fixtures(
    path = "../fixtures",
    scripts("bench_data", "bench_setup", "bench_json_data")
))]
#[cfg_attr(
    not(feature = "bench"),
    ignore = "perf-bench: gated, run via mise test:bench"
)]
async fn group_by_jsonb_field_uses_hash_aggregate(pool: PgPool) -> Result<()> {
    // The selector literal is Selectors::HELLO ($.hello), defined in
    // tests/sqlx/src/selectors.rs.
    let field_group_by = "SELECT count(*) FROM bench_json \
                          GROUP BY eql_v2.jsonb_path_query_first(e, 'a7cea93975ed8c01f861ccb6bd082784')";
    let explain_output = explain_query(&pool, field_group_by).await?;

    let has_hash_aggregate = explain_output.contains("HashAggregate");
    assert!(
        has_hash_aggregate,
        "Expected field-level GROUP BY plan to include a HashAggregate node. EXPLAIN output:\n{}",
        explain_output
    );
    Ok(())
}
117 changes: 117 additions & 0 deletions tests/sqlx/tests/bench_regression_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,120 @@ async fn ore_order_by_under_threshold(pool: PgPool) -> Result<()> {
);
Ok(())
}

// ============================================================================
// Hash-strategy timing regressions: GROUP BY / JOIN / DISTINCT on encrypted
// columns. The plan shapes already engage the hash operator class (#196), but
// per-row cost is dominated by the
// `hash_encrypted` → `to_ste_vec_value` → `hmac_256` chain.
//
// All three are #[ignore]'d pending the hash_encrypted fast-path tracked in
// #202. The dominant cost on these queries isn't plpgsql call overhead (a
// naive plpgsql → LANGUAGE sql conversion of the existing body leaves them
// effectively unchanged); it's `to_ste_vec_value`'s per-row JSONB inspection
// and reconstruction. The #202 fix short-circuits via root-level `hm`
// (`coalesce(val.data ->> 'hm', ...)`), falling through to `to_ste_vec_value`
// only for single-element ste_vec-wrapped payloads. Thresholds below reflect
// measured numbers with that fast-path applied in-place. Remove the
// `#[ignore]` markers when #202 merges and confirm green.
// ============================================================================

/// Timing guard: `GROUP BY encrypted_text` at 10K rows must average under
/// 150ms.
///
/// Reference measurements: ~309ms today (HashAggregate + Seq Scan, dominated
/// by the per-row `to_ste_vec_value` cost in the hash_encrypted chain) and
/// ~73ms with the #202 fast-path applied. The 150ms limit is roughly 2x the
/// fast-path figure, leaving headroom for CI variance.
#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
#[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"]
async fn group_by_encrypted_under_threshold(pool: PgPool) -> Result<()> {
    let query = "SELECT count(*) FROM bench GROUP BY encrypted_text";
    let timing: ExplainStats = explain_analyze_avg(&pool, query, 5).await?;

    let elapsed_ms = timing.execution_time_ms;
    assert!(
        elapsed_ms < 150.0,
        "GROUP BY encrypted_text took {:.1}ms, threshold 150ms (~73ms expected after #202 fast-path, currently ~309ms, node_type={})",
        elapsed_ms, timing.node_type
    );
    Ok(())
}

/// Timing guard: a self-join on `a.encrypted_text = b.encrypted_text` at 10K
/// rows must average under 350ms.
///
/// The join yields ~1M result rows (~99 distinct values × ~100 matches each),
/// so most of the time is intrinsic result cardinality rather than per-probe
/// cost. Reference measurements: ~308ms today, ~185ms with the #202 fast-path
/// applied. The 350ms limit catches a regression to seq scan (>1s) without
/// flapping on cardinality variance.
#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
#[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"]
async fn self_join_encrypted_under_threshold(pool: PgPool) -> Result<()> {
    let query = "SELECT count(*) FROM bench a JOIN bench b ON a.encrypted_text = b.encrypted_text";
    let timing: ExplainStats = explain_analyze_avg(&pool, query, 3).await?;

    let elapsed_ms = timing.execution_time_ms;
    assert!(
        elapsed_ms < 350.0,
        "Self-join on encrypted_text took {:.1}ms, threshold 350ms (~185ms expected after #202 fast-path, currently ~308ms, node_type={})",
        elapsed_ms, timing.node_type
    );
    Ok(())
}

/// Timing guard: unbounded `SELECT DISTINCT encrypted_text` at 10K rows must
/// average under 200ms.
///
/// Reference measurements: ~515ms today (HashAggregate over the plpgsql
/// hash_encrypted chain) and ~72ms with the #202 fast-path applied
/// (`coalesce(val.data ->> 'hm', ...)`). The 200ms limit is roughly 2.8x the
/// fast-path figure, leaving headroom for CI variance.
///
/// The `... LIMIT N` variant is intentionally avoided: it biases the planner
/// toward an Index Only Scan over the ORE btree opclass, which is fine on full
/// installs but unavailable on Supabase. This test exercises the unbounded
/// path that engages HashAggregate.
#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
#[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"]
async fn distinct_encrypted_under_threshold(pool: PgPool) -> Result<()> {
    let query = "SELECT DISTINCT encrypted_text FROM bench";
    let timing: ExplainStats = explain_analyze_avg(&pool, query, 5).await?;

    let elapsed_ms = timing.execution_time_ms;
    assert!(
        elapsed_ms < 200.0,
        "DISTINCT encrypted_text took {:.1}ms, threshold 200ms (~72ms expected after #202 fast-path, currently ~515ms, node_type={})",
        elapsed_ms, timing.node_type
    );
    Ok(())
}

/// Timing guard for field-level
/// `GROUP BY eql_v2.jsonb_path_query_first(col, '<selector>')`, the canonical
/// "count rows per JSON field value" query against an encrypted column whose
/// field has a `unique` index configured. Must average under 50ms.
///
/// The #202 hash_encrypted fast-path helps but is not sufficient: once it
/// lands, the dominant cost moves to the per-row plpgsql overhead of
/// `jsonb_path_query_first`. The limit therefore targets what becomes
/// achievable once #204 inlines the JSONB field extractors.
///
/// Reference measurements at 10K rows, on a synthesized fixture with `hm`
/// overlaid at $.hello:
/// - current main: ~496ms
/// - with the #202 fast-path applied: ~234ms
/// - raw JSONB extraction bypass (proxy-emitted, no eql_v2 plpgsql in the hot
///   path): ~7ms
///
/// The 50ms limit sits near the bypass figure; with #204 in place the EQL form
/// should converge on the same ballpark.
#[sqlx::test(fixtures(
    path = "../fixtures",
    scripts("bench_data", "bench_setup", "bench_json_data")
))]
#[ignore = "#204: JSONB field extractors not yet inlined; remove ignore when #204 merges"]
async fn group_by_jsonb_field_under_threshold(pool: PgPool) -> Result<()> {
    let field_group_by = "SELECT count(*) FROM bench_json \
                          GROUP BY eql_v2.jsonb_path_query_first(e, 'a7cea93975ed8c01f861ccb6bd082784')";
    let timing: ExplainStats = explain_analyze_avg(&pool, field_group_by, 5).await?;

    let elapsed_ms = timing.execution_time_ms;
    assert!(
        elapsed_ms < 50.0,
        "GROUP BY field-level jsonb_path_query_first took {:.1}ms, threshold 50ms (~7ms achievable with full extractor inlining + #202, currently ~496ms on main, ~234ms with #202 fast-path only, node_type={})",
        elapsed_ms, timing.node_type
    );
    Ok(())
}