cipherstash · coderdan · May 11, 2026 · May 11, 2026 · May 11, 2026 · coderabbitai
diff --git a/tests/sqlx/fixtures/bench_json_data.sql b/tests/sqlx/fixtures/bench_json_data.sql
@@ -0,0 +1,49 @@
+-- Fixture: bench_json_data.sql
+--
+-- Builds the bench_json table by overlaying the existing `bench` rows
+-- (loaded by the bench_data.sql fixture, which must run first) and adding
+-- `hm` to the $.hello sv element of each row. This mirrors what
+-- `@cipherstash/protect` would produce for a JSONB column where the
+-- $.hello path is configured with a `unique` index — without that, the
+-- field-level sv element carries only OPE terms (`ocv`) and field-level
+-- GROUP BY / DISTINCT / hash joins on the extracted value raise:
+--
+--     ERROR:  Cannot hash eql_v2_encrypted value: no hmac_256 index term
+--             found. Configure a `unique` index on the column for hash
+--             operations (GROUP BY, DISTINCT, hash joins).
+--
+-- The synthesised `hm` is the field's existing `ocv` hex string (already
+-- deterministic over the plaintext at that selector) so it serves as a
+-- valid equality token without us inventing a separate one. The shape
+-- matches production: `c`, `s`, `ocv`, `hm` at the sv element level.
+--
+-- Selector cheatsheet (matches Selectors:: in tests/sqlx/src/selectors.rs):
+--   bca213de9ccce676fa849ff9c4807963 → $       (root, has b3 here today)
+--   a7cea93975ed8c01f861ccb6bd082784 → $.hello (we add `hm` here)
+--   2517068c0d1f9d4d41d2c666211f785e → $.n     (left alone)
+
+CREATE TABLE IF NOT EXISTS bench_json (  -- no-op when the table is already present
+    id bigserial PRIMARY KEY,
+    e eql_v2_encrypted  -- rebuilt payload derived from bench.encrypted_text
+);
+
+INSERT INTO bench_json (e)  -- one output row per row of `bench` (no filter)
+SELECT (jsonb_build_object(
+    'c',  (encrypted_text).data ->> 'c',  -- root `c`, extracted as text
+    'i',  (encrypted_text).data -> 'i',  -- root `i`, kept as jsonb
+    'v',  2,  -- constant; not read from the source row
+    'hm', (encrypted_text).data ->> 'hm',  -- root-level hm carried over as-is (JSON null if the source has none)
+    'sv', (
+        SELECT jsonb_agg(  -- reassemble the sv array from its (possibly patched) elements
+            CASE
+                WHEN elem ->> 's' = 'a7cea93975ed8c01f861ccb6bd082784'  -- the $.hello selector from the cheatsheet
+                THEN elem || jsonb_build_object('hm', elem ->> 'ocv')  -- add (or overwrite) hm with this element's ocv string
+                ELSE elem  -- every other sv element passes through unchanged
+            END
+        )
+        FROM jsonb_array_elements((encrypted_text).data -> 'sv') elem  -- unnest the source sv array
+    )
+)) :: eql_v2_encrypted  -- cast the rebuilt jsonb to the encrypted column type
+FROM bench;
+
+ANALYZE bench_json;  -- collect planner statistics for the freshly loaded table
diff --git a/tests/sqlx/tests/bench_plan_tests.rs b/tests/sqlx/tests/bench_plan_tests.rs
@@ -6,7 +6,9 @@
 //! ANALYZE is run by the bench_setup fixture — planner statistics are populated at fixture load.
 
 use anyhow::Result;
-use eql_tests::{assert_uses_index, get_bench_encrypted_int, get_bench_encrypted_text};
+use eql_tests::{
+    assert_uses_index, explain_query, get_bench_encrypted_int, get_bench_encrypted_text,
+};
 use sqlx::PgPool;
 
 const BENCH_INT_ORE_IDX: &str = "bench_int_ore_idx";
@@ -157,3 +159,107 @@ async fn bare_ilike_uses_bloom_index(pool: PgPool) -> Result<()> {
     assert_uses_index(&pool, &sql, BENCH_TEXT_BLOOM_IDX).await?;
     Ok(())
 }
+
+// ============================================================================
+// Hash-strategy plans: GROUP BY / JOIN / DISTINCT on encrypted columns engage
+// the hash operator class (#196). The plan-shape assertions below cover the
+// surface PR #196 enabled; the corresponding timing thresholds in
+// bench_regression_tests.rs are #[ignore]'d pending the hash-chain inlining
+// work tracked in #202.
+// ============================================================================
+
+/// Plan-shape check for `GROUP BY encrypted_col`: the EXPLAIN output must
+/// contain a HashAggregate node, which depends on the hash operator class.
+/// Per #196, without that op class the query would fall back to
+/// GroupAggregate-after-Sort or — worse — a Nested-Loop self-comparison.
+#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
+#[cfg_attr(
+    not(feature = "bench"),
+    ignore = "perf-bench: gated, run via mise test:bench"
+)]
+async fn group_by_encrypted_uses_hash_aggregate(pool: PgPool) -> Result<()> {
+    let explain_output =
+        explain_query(&pool, "SELECT count(*) FROM bench GROUP BY encrypted_text").await?;
+    let has_hash_aggregate = explain_output.contains("HashAggregate");
+    assert!(
+        has_hash_aggregate,
+        "Expected GROUP BY to use HashAggregate. EXPLAIN output:\n{}",
+        explain_output
+    );
+    Ok(())
+}
+
+/// JOIN on `a.encrypted_col = b.encrypted_col` engages the hmac functional index.
+/// Acceptable plan shapes: Hash Join (preferred), or Nested Loop + Memoize +
+/// Index Scan via `bench_text_hmac_idx` (current planner choice — fine since
+/// the index lookup remains the per-probe cost).
+#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
+#[cfg_attr(
+    not(feature = "bench"),
+    ignore = "perf-bench: gated, run via mise test:bench"
+)]
+async fn join_on_encrypted_uses_hmac_index(pool: PgPool) -> Result<()> {
+    let sql = "SELECT count(*) FROM bench a JOIN bench b \
+               ON a.encrypted_text = b.encrypted_text";
+    assert_uses_index(&pool, sql, BENCH_TEXT_HMAC_IDX).await?;
+    Ok(())
-/// JOIN on `a.encrypted_col = b.encrypted_col` engages the hmac functional index.
-/// Acceptable plan shapes: Hash Join (preferred), or Nested Loop + Memoize +
-/// Index Scan via `bench_text_hmac_idx` (current planner choice — fine since
-/// the index lookup remains the per-probe cost).
-#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
-#[cfg_attr(
-    not(feature = "bench"),
-    ignore = "perf-bench: gated, run via mise test:bench"
-)]
-async fn join_on_encrypted_uses_hmac_index(pool: PgPool) -> Result<()> {
-    let sql = "SELECT count(*) FROM bench a JOIN bench b \
-               ON a.encrypted_text = b.encrypted_text";
-    assert_uses_index(&pool, sql, BENCH_TEXT_HMAC_IDX).await?;
-    Ok(())
+/// JOIN on `a.encrypted_col = b.encrypted_col` engages the hmac functional index.
+/// Acceptable plan shapes: Hash Join (preferred), or Nested Loop + Memoize +
+/// Index Scan via `bench_text_hmac_idx` (current planner choice — fine since
+/// the index lookup remains the per-probe cost).
+#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
+#[cfg_attr(
+    not(feature = "bench"),
+    ignore = "perf-bench: gated, run via mise test:bench"
+)]
+async fn join_on_encrypted_uses_hmac_index(pool: PgPool) -> Result<()> {
+    let sql = "SELECT count(*) FROM bench a JOIN bench b \
+               ON a.encrypted_text = b.encrypted_text";
+    let plan = explain_query(&pool, sql).await?;
+    assert!(
+        plan.contains("Hash Join") || plan.contains(BENCH_TEXT_HMAC_IDX),
+        "Expected JOIN to use Hash Join or {}. EXPLAIN output:\n{}",
+        BENCH_TEXT_HMAC_IDX,
+        plan
+    );
+    Ok(())
+}
-/// JOIN on `a.encrypted_col = b.encrypted_col` engages the hmac functional index.
-/// Acceptable plan shapes: Hash Join (preferred), or Nested Loop + Memoize +
-/// Index Scan via `bench_text_hmac_idx` (current planner choice — fine since
-/// the index lookup remains the per-probe cost).
-#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
-#[cfg_attr(
-    not(feature = "bench"),
-    ignore = "perf-bench: gated, run via mise test:bench"
-)]
-async fn join_on_encrypted_uses_hmac_index(pool: PgPool) -> Result<()> {
-    let sql = "SELECT count(*) FROM bench a JOIN bench b \
-               ON a.encrypted_text = b.encrypted_text";
-    assert_uses_index(&pool, sql, BENCH_TEXT_HMAC_IDX).await?;
-    Ok(())
+/// JOIN on `a.encrypted_col = b.encrypted_col` engages the hmac functional index.
+/// Acceptable plan shapes: Hash Join (preferred), or Nested Loop + Memoize +
+/// Index Scan via `bench_text_hmac_idx` (current planner choice — fine since
+/// the index lookup remains the per-probe cost).
+#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
+#[cfg_attr(
+    not(feature = "bench"),
+    ignore = "perf-bench: gated, run via mise test:bench"
+)]
+async fn join_on_encrypted_uses_hmac_index(pool: PgPool) -> Result<()> {
+    let sql = "SELECT count(*) FROM bench a JOIN bench b \
+               ON a.encrypted_text = b.encrypted_text";
+    let plan = explain_query(&pool, sql).await?;
+    assert!(
+        plan.contains("Hash Join") || plan.contains(BENCH_TEXT_HMAC_IDX),
+        "Expected JOIN to use Hash Join or {}. EXPLAIN output:\n{}",
+        BENCH_TEXT_HMAC_IDX,
+        plan
+    );
+    Ok(())
+}
+}
+
+/// Plan-shape check for the unbounded `SELECT DISTINCT encrypted_col FROM t`:
+/// the EXPLAIN output must contain a HashAggregate node (hash operator class).
+/// A bounded `... LIMIT N` query nudges the planner toward an Index Only Scan
+/// over the ORE btree opclass instead; that is fine on full installs but not
+/// available on Supabase, which leaves this hash path as the only viable one.
+#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
+#[cfg_attr(
+    not(feature = "bench"),
+    ignore = "perf-bench: gated, run via mise test:bench"
+)]
+async fn distinct_encrypted_uses_hash_aggregate(pool: PgPool) -> Result<()> {
+    let explain_output = explain_query(&pool, "SELECT DISTINCT encrypted_text FROM bench").await?;
+    let has_hash_aggregate = explain_output.contains("HashAggregate");
+    assert!(
+        has_hash_aggregate,
+        "Expected DISTINCT to use HashAggregate. EXPLAIN output:\n{}",
+        explain_output
+    );
+    Ok(())
+}
+
+// ============================================================================
+// Field-level hash-strategy: GROUP BY on a JSON path extracted from an
+// encrypted column. This is the "how many users per region" pattern against
+// ste_vec encryption.
+//
+// Uses the bench_json fixture, which overlays `hm` onto the `$.hello` sv
+// element of each bench row — simulating what `@cipherstash/protect` produces
+// for a JSONB column where the `$.hello` path is configured with a `unique`
+// index. Without that overlay, field-level GROUP BY raises today
+// ("Cannot hash eql_v2_encrypted value: no hmac_256 index term found").
+// ============================================================================
+
+/// Plan-shape check for the documented field-level GROUP BY form,
+/// `GROUP BY eql_v2.jsonb_path_query_first(col, '<selector>')`.
+///
+/// The expected plan is a parallel Partial HashAggregate + Sort +
+/// GroupAggregate merge, so `HashAggregate` shows up in the EXPLAIN text even
+/// when it is not the top node; the assertion therefore only checks for its
+/// presence. The bare-`->` form (`col -> '<sel>'::text`) currently plans as
+/// Sort + GroupAggregate; once #204 inlines the extractors the planner may
+/// flatten this to a single HashAggregate.
+#[sqlx::test(fixtures(
+    path = "../fixtures",
+    scripts("bench_data", "bench_setup", "bench_json_data")
+))]
+#[cfg_attr(
+    not(feature = "bench"),
+    ignore = "perf-bench: gated, run via mise test:bench"
+)]
+async fn group_by_jsonb_field_uses_hash_aggregate(pool: PgPool) -> Result<()> {
+    // The selector literal is Selectors::HELLO ($.hello) from tests/sqlx/src/selectors.rs.
+    let query = "SELECT count(*) FROM bench_json \
+               GROUP BY eql_v2.jsonb_path_query_first(e, 'a7cea93975ed8c01f861ccb6bd082784')";
+    let explain_output = explain_query(&pool, query).await?;
+    let has_hash_aggregate = explain_output.contains("HashAggregate");
+    assert!(
+        has_hash_aggregate,
+        "Expected field-level GROUP BY plan to include a HashAggregate node. EXPLAIN output:\n{}",
+        explain_output
+    );
+    Ok(())
+}
diff --git a/tests/sqlx/tests/bench_regression_tests.rs b/tests/sqlx/tests/bench_regression_tests.rs
@@ -111,3 +111,120 @@ async fn ore_order_by_under_threshold(pool: PgPool) -> Result<()> {
     );
     Ok(())
 }
+
+// ============================================================================
+// Hash-strategy timing regressions: GROUP BY / JOIN / DISTINCT on encrypted
+// columns. The plan shapes already engage the hash operator class (#196), but
+// per-row cost is dominated by the
+// `hash_encrypted` → `to_ste_vec_value` → `hmac_256` chain.
+//
+// All three are #[ignore]'d pending the hash_encrypted fast-path tracked in
+// #202. The dominant cost on these queries isn't plpgsql call overhead (a
+// naive plpgsql → LANGUAGE sql conversion of the existing body leaves them
+// effectively unchanged); it's `to_ste_vec_value`'s per-row JSONB inspection
+// and reconstruction. The #202 fix short-circuits via root-level `hm`
+// (`coalesce(val.data ->> 'hm', ...)`), falling through to `to_ste_vec_value`
+// only for single-element ste_vec-wrapped payloads. Thresholds below reflect
+// measured numbers with that fast-path applied in-place. Remove the
+// `#[ignore]` markers when #202 merges and confirm green.
+// ============================================================================
+
+/// Timing guard: `GROUP BY encrypted_text` at 10K rows must come in under
+/// 150ms, as reported by `explain_analyze_avg`.
+///
+/// Reference measurements: ~309ms today (HashAggregate + Seq Scan, with the
+/// per-row `to_ste_vec_value` cost in the hash_encrypted chain dominating)
+/// and ~73ms with the #202 fast-path applied. The 150ms limit is roughly 2x
+/// the fast-path figure, leaving headroom for CI variance.
+#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
+#[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"]
+async fn group_by_encrypted_under_threshold(pool: PgPool) -> Result<()> {
+    let sql = "SELECT count(*) FROM bench GROUP BY encrypted_text";
+    let stats: ExplainStats = explain_analyze_avg(&pool, sql, 5).await?;
+    let elapsed_ms = stats.execution_time_ms;
+    assert!(
+        elapsed_ms < 150.0,
+        "GROUP BY encrypted_text took {:.1}ms, threshold 150ms (~73ms expected after #202 fast-path, currently ~309ms, node_type={})",
+        elapsed_ms, stats.node_type
+    );
+    Ok(())
+}
+
+/// Timing guard: a self-join on `a.encrypted_text = b.encrypted_text` at 10K
+/// rows must come in under 350ms, as reported by `explain_analyze_avg`.
+///
+/// The join yields ~1M result rows (~99 distinct values × ~100 matches each),
+/// so most of the time is intrinsic result cardinality rather than per-probe
+/// cost. Reference measurements: ~308ms today, ~185ms with the #202 fast-path
+/// applied. The 350ms limit is meant to catch a regression to seq scan (>1s)
+/// without flapping on cardinality variance.
+#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
+#[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"]
+async fn self_join_encrypted_under_threshold(pool: PgPool) -> Result<()> {
+    let sql = "SELECT count(*) FROM bench a JOIN bench b ON a.encrypted_text = b.encrypted_text";
+    let stats: ExplainStats = explain_analyze_avg(&pool, sql, 3).await?;
+    let elapsed_ms = stats.execution_time_ms;
+    assert!(
+        elapsed_ms < 350.0,
+        "Self-join on encrypted_text took {:.1}ms, threshold 350ms (~185ms expected after #202 fast-path, currently ~308ms, node_type={})",
+        elapsed_ms, stats.node_type
+    );
+    Ok(())
+}
+
+/// Timing guard: the unbounded `SELECT DISTINCT encrypted_text` at 10K rows
+/// must come in under 200ms, as reported by `explain_analyze_avg`.
+///
+/// Reference measurements: ~515ms today (HashAggregate over the plpgsql
+/// hash_encrypted chain) and ~72ms with the #202 fast-path
+/// (`coalesce(val.data ->> 'hm', ...)`) applied. The 200ms limit is roughly
+/// 2.8x the fast-path figure, leaving headroom for CI variance.
+///
+/// The unbounded form is used on purpose: a `... LIMIT N` variant biases the
+/// planner toward Index Only Scan over the ORE btree opclass, which is fine on
+/// full installs but unavailable on Supabase, and would not exercise the
+/// HashAggregate path.
+#[sqlx::test(fixtures(path = "../fixtures", scripts("bench_data", "bench_setup")))]
+#[ignore = "#202: hash_encrypted chain not yet inlined; remove ignore when #202 merges"]
+async fn distinct_encrypted_under_threshold(pool: PgPool) -> Result<()> {
+    let sql = "SELECT DISTINCT encrypted_text FROM bench";
+    let stats: ExplainStats = explain_analyze_avg(&pool, sql, 5).await?;
+    let elapsed_ms = stats.execution_time_ms;
+    assert!(
+        elapsed_ms < 200.0,
+        "DISTINCT encrypted_text took {:.1}ms, threshold 200ms (~72ms expected after #202 fast-path, currently ~515ms, node_type={})",
+        elapsed_ms, stats.node_type
+    );
+    Ok(())
+}
+
+/// Timing guard for field-level
+/// `GROUP BY eql_v2.jsonb_path_query_first(col, '<selector>')` — the canonical
+/// "count rows per JSON field value" query against an encrypted column whose
+/// field has a `unique` index configured. Must come in under 50ms.
+///
+/// The #202 hash_encrypted fast-path helps here but is not sufficient: once it
+/// lands, the dominant cost moves to the per-row plpgsql overhead of
+/// `jsonb_path_query_first`. The limit reflects what becomes achievable once
+/// #204 inlines the JSONB field extractors.
+///
+/// Reference measurements at 10K rows, on a synthesized fixture with `hm`
+/// overlaid at $.hello: ~496ms on current main, ~234ms with the #202
+/// fast-path, and ~7ms with a raw JSONB extraction bypass (proxy-emitted, no
+/// eql_v2 plpgsql in the hot path). The 50ms limit sits near the bypass
+/// figure, which the EQL form should approach once the extractors are inlined.
+#[sqlx::test(fixtures(
+    path = "../fixtures",
+    scripts("bench_data", "bench_setup", "bench_json_data")
+))]
+#[ignore = "#204: JSONB field extractors not yet inlined; remove ignore when #204 merges"]
+async fn group_by_jsonb_field_under_threshold(pool: PgPool) -> Result<()> {
+    let query = "SELECT count(*) FROM bench_json \
+               GROUP BY eql_v2.jsonb_path_query_first(e, 'a7cea93975ed8c01f861ccb6bd082784')";
+    let stats: ExplainStats = explain_analyze_avg(&pool, query, 5).await?;
+    let elapsed_ms = stats.execution_time_ms;
+    assert!(
+        elapsed_ms < 50.0,
+        "GROUP BY field-level jsonb_path_query_first took {:.1}ms, threshold 50ms (~7ms achievable with full extractor inlining + #202, currently ~496ms on main, ~234ms with #202 fast-path only, node_type={})",
+        elapsed_ms, stats.node_type
+    );
+    Ok(())
+}