From d7a473ea60ee09023cace3b4475263fc767d31b2 Mon Sep 17 00:00:00 2001 From: Dan Draper Date: Thu, 14 May 2026 18:14:47 +1000 Subject: [PATCH] perf(eql_v2): inline hash_encrypted to SQL with data-hash fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `eql_v2.hash_encrypted(eql_v2_encrypted)` (the hash operator class FUNCTION 1 called per row by HashAggregate / hash joins / DISTINCT) flips from plpgsql-with-RAISE to a single-statement SQL function: `coalesce(hashtext(hmac_256(val)::text), hashtext((val).data::text))`. Why. On the bench branch `GROUP BY value` at 100k rows took ~29s and at 1M rows projected to ~55 minutes and never finished. The cost was per-row plpgsql interpreter overhead compounded with HashAggregate work_mem spillage. Inlinable SQL removes the plpgsql cost; when the inner `eql_v2.hmac_256(val)` is also inlinable (#205), the planner can fold the extractor into the calling query. What changes for callers. Two consequences of the flip, both detailed in U-002: * `GROUP BY` / `DISTINCT` / hash joins on a column lacking `hm` no longer raises. The previous RAISE was the loud-misconfig signal; the cost was paying plpgsql per row on the happy path. The flip falls back to `hashtext((val).data::text)` so distinct ciphertexts still hash distinctly — necessary to keep HashAggregate from degrading to O(N^2) on a single NULL-hash bucket — but rows are no longer guaranteed to group by encrypted plaintext on a misconfigured column. Audit at config time (`eql_v2.has_hmac_256(value)`), not query time. * `GROUP BY value` engages HashAggregate cleanly. The plan choice didn't change, but per-row cost drops from plpgsql-call overhead (~290us at 100k rows) to inlined SQL-function overhead. Splinter / pin_search_path. 
`hash_encrypted` joins the inline_critical_oids allowlist alongside the Phase 1 operators and the ore_block inner comparators — pinning a SET search_path would re-add the per-row plpgsql-equivalent overhead and undo the flip. Tests. Two existing RAISE-asserting tests (`hash_function_errors_without_hash_index`, `multi_element_ste_vec_raises_error`) replaced with fallback-asserting counterparts. A new `hash_encrypted_is_inlinable` test reads `pg_proc.prolang` / `pg_proc.proconfig` directly to assert the inlinability prerequisites and catch regressions where someone inadvertently re-pins or re-plpgsqls the function. The fallback tests auto-skip when the inner hmac_256 is still plpgsql (pre-#205 branches) since the fallback path is then unreachable. Stacked on dan/inline-range-operators (#211). The full perf story needs #205 + #211 + this; rebasing this branch onto main after #211 lands will pick up the inlinable hmac_256 chain. --- CHANGELOG.md | 4 +- docs/upgrading/v2.3.md | 14 ++-- src/encrypted/hash.sql | 64 +++++++++------ tasks/pin_search_path.sql | 8 ++ tasks/test/splinter.sh | 1 + tests/sqlx/tests/hash_operator_tests.rs | 100 +++++++++++++++++------- 6 files changed, 129 insertions(+), 62 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e71ff967..9b150df6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,9 +33,9 @@ Targeting `2.3.0` as a breaking release. Customers re-encrypt their data as part - **`=`, `<>`, `~~` (`LIKE`), `~~*` (`ILIKE`) on `eql_v2_encrypted` are now inlinable SQL functions.** The planner can structurally match these operators against the documented functional indexes (`eql_v2.hmac_256(col)` for equality, `eql_v2.bloom_filter(col)` for `LIKE`/`ILIKE`), so bare-form queries (`WHERE col = $1`) engage the index without per-query rewriting. Previously these operators wrapped multi-branch PL/pgSQL bodies that the planner could not inline, forcing seq scans on Supabase / managed Postgres installations that lack operator-class indexes. 
([#193](https://github.com/cipherstash/encrypt-query-language/pull/193), [#196](https://github.com/cipherstash/encrypt-query-language/pull/196)) - **`<`, `<=`, `>`, `>=` on `eql_v2_encrypted` are now inlinable SQL functions.** Same precedent as the `=` inlining above: the operator bodies reduce to `eql_v2.ore_block_u64_8_256(a) <op> eql_v2.ore_block_u64_8_256(b)`, so bare-form range queries (`WHERE col < $1`, `WHERE col > $1`, …) structurally match a functional btree index on `eql_v2.ore_block_u64_8_256(col)` (using the existing `eql_v2.ore_block_u64_8_256_operator_class`). Top-N sorts under `ORDER BY col LIMIT n` still need a Sort node (the natural-form sort key doesn't syntactically match the index expression), but each comparison now uses the inlined ORE-term path rather than a plpgsql `eql_v2.compare()` dispatch. The inner `eql_v2.ore_block_u64_8_256_{eq,neq,lt,lte,gt,gte}` helpers backing the ORE-term type's own operators are now declared `IMMUTABLE STRICT PARALLEL SAFE` and allowlisted in the post-build search-path pin so that the chain inlines cleanly through to index matching. **Behaviour to be aware of:** range queries against columns that carry only `ore_cllw_u64_8` / `ore_cllw_var_8` (CLLW ORE) or OPE terms now raise from the `ore_block_u64_8_256` extractor instead of dispatching through the old `eql_v2.compare()` priority list. Callers in that situation must rewrite to the relevant extractor form (e.g. `WHERE eql_v2.ore_cllw_u64_8(col) < eql_v2.ore_cllw_u64_8($1::jsonb)`) — see [U-005](docs/upgrading/v2.3.md#u-005-range-operators-are-block-ore-only). -- **`eql_v2.hmac_256(val jsonb)` and `eql_v2.hmac_256(val eql_v2_encrypted)` are now inlinable SQL.** Both 1-arg overloads flipped from plpgsql-with-RAISE to single-statement SQL returning NULL when `hm` is absent. This restores per-row extractor inlining inside the `=` / `<>` operator bodies. 
**Behaviour to be aware of:** `WHERE col = $1` on a column lacking `hm` now silently returns zero rows where it previously raised — see the amended [U-002](docs/upgrading/v2.3.md#u-002-equality-and-hashing-require-hmac). The loud RAISE-on-missing-hm path is retained in `eql_v2.hash_encrypted`, so `GROUP BY` / `DISTINCT` / hash joins still surface misconfiguration. ([#205](https://github.com/cipherstash/encrypt-query-language/issues/205)) +- **`eql_v2.hmac_256(val jsonb)` and `eql_v2.hmac_256(val eql_v2_encrypted)` are now inlinable SQL.** Both 1-arg overloads flipped from plpgsql-with-RAISE to single-statement SQL returning NULL when `hm` is absent. This restores per-row extractor inlining inside the `=` / `<>` operator bodies. **Behaviour to be aware of:** `WHERE col = $1` on a column lacking `hm` now silently returns zero rows where it previously raised — see the amended [U-002](docs/upgrading/v2.3.md#u-002-equality-and-hashing-require-hmac). ([#205](https://github.com/cipherstash/encrypt-query-language/issues/205)) - **`eql_v2_encrypted = eql_v2_encrypted` is now strictly hmac-based at the root.** Equality requires both sides to carry `hm` (hmac); otherwise the operator returns NULL (and the query returns zero rows). Previously, equality could silently fall through to a `NULL` comparison or to Blake3 on synthetic fixtures. **Behaviour to be aware of:** see [U-002](docs/upgrading/v2.3.md#u-002-equality-and-hashing-require-hmac). ([#196](https://github.com/cipherstash/encrypt-query-language/pull/196), [#205](https://github.com/cipherstash/encrypt-query-language/issues/205)) -- **`eql_v2.hash_encrypted(eql_v2_encrypted)` is now hmac-only.** Hash operations (`GROUP BY`, `DISTINCT`, hash joins) require the column to carry an `hm` index term; the previous Blake3 fallback has been removed. The function raises a clear error directing the caller to configure a `unique` index. 
([#196](https://github.com/cipherstash/encrypt-query-language/pull/196)) +- **`eql_v2.hash_encrypted(eql_v2_encrypted)` is now inlinable SQL.** The hash operator class FUNCTION 1 (called once per row by HashAggregate, hash joins, `DISTINCT`) flipped from plpgsql-with-RAISE to a single-statement SQL function: `coalesce(hashtext(hmac_256(val)::text), hashtext((val).data::text))`. Per-row plpgsql interpreter overhead disappears — at 100k rows, `GROUP BY value` drops from ~29s to tens of ms when `eql_v2.hmac_256(val)` is also inlinable; at 1M rows the prior plpgsql cost compounded with HashAggregate `work_mem` spillage and ran for hours, while the inlinable form scales linearly. The previous RAISE-on-missing-`hm` path is replaced with a fallback hash of the encrypted payload bytes, preserving row distinctness so misconfigured columns can't trigger pathological O(N²) bucket collisions. **Behaviour to be aware of:** `GROUP BY` / `DISTINCT` / hash joins on a column lacking `hm` no longer raises — see the amended [U-002](docs/upgrading/v2.3.md#u-002-equality-and-hashing-require-hmac). - **`ste_vec_contains` now requires `hm` on sv elements.** Element comparison uses `compare_hmac_256` when both sides carry `hm`, falling through to `eq`/`compare` for non-hash-indexed elements (e.g. future OPE-only shapes). The Blake3 path that previously lived inside `ste_vec_contains` is gone — every sv element now carries `hm` post-migration. See [U-004](docs/upgrading/v2.3.md#u-004-sv-element-equality-term-is-hm-not-b3). ([#205](https://github.com/cipherstash/encrypt-query-language/issues/205)) ### Removed diff --git a/docs/upgrading/v2.3.md b/docs/upgrading/v2.3.md index 5457985a..dbd4213b 100644 --- a/docs/upgrading/v2.3.md +++ b/docs/upgrading/v2.3.md @@ -74,18 +74,18 @@ DROP INDEX CONCURRENTLY users_email_idx; **What changed.** Two related operations are now strictly hmac-based at the root: - **`a = b` and `a <> b` on `eql_v2_encrypted`** require both sides to carry `hm` (hmac_256). 
Without it, the comparison returns NULL and the query returns zero rows. -- **`eql_v2.hash_encrypted(value)`** (used by `GROUP BY`, `DISTINCT`, hash joins) requires the value to carry `hm`. The previous Blake3 fallback has been removed; the function raises with a clear message when `hm` is missing. +- **`eql_v2.hash_encrypted(value)`** (used by `GROUP BY`, `DISTINCT`, hash joins) consults `hm` when present. The previous Blake3 fallback has been removed. **Why.** Pre-2.3, equality could route through hmac *or* Blake3 depending on what was present in the payload, which made plans non-deterministic and made it impossible to reason about which index would be engaged. The Blake3 fallback only activated on synthetic test fixtures — `@cipherstash/protect` never emitted root-level `b3` in production. Removing the fallback aligns the documented contract with what production data has always looked like. -**Action required.** Audit any column you query with `=`, `<>`, `GROUP BY`, `DISTINCT`, or hash joins. Each must carry `hm` in its payload. If you're using `@cipherstash/protect`, configure the column with the `unique` index type — that's what emits `hm`. If `unique` is already present, you're done. +**Action required.** Audit any column you query with `=`, `<>`, `GROUP BY`, `DISTINCT`, or hash joins. Each must carry `hm` in its payload for semantically correct results. If you're using `@cipherstash/protect`, configure the column with the `unique` index type — that's what emits `hm`. If `unique` is already present, you're done. -**Behavioural change to be aware of.** The error surface differs by operation: +**Behavioural change — both equality and hashing now fail open on missing `hm`.** Pre-2.3, equality could silently fall through to a NULL comparison on synthetic Blake3-only fixtures, and `hash_encrypted` raised loudly when `hm` was missing. 
The 2.3 cleanup makes both operations behave the same way on a column lacking `hm`: they return a value (NULL for equality, a fallback hash for `hash_encrypted`) instead of raising. The error surface that used to surface misconfiguration at query time is gone — audit happens at config time now. -- **`WHERE col = $1` / `WHERE col <> $1` on a column lacking `hm`**: silently returns zero rows. The `=` / `<>` operators inline to `hmac_256(a) = hmac_256(b)`; with `hm` missing, both extractors return NULL and the comparison evaluates to NULL — false in a WHERE context. Same shape as pre-2.3, but now deliberate and consistent rather than an accident of the dispatch chain. -- **`GROUP BY col` / `DISTINCT col` / hash joins**: raises loudly with a message naming the missing index term. The `hash_encrypted` function checks for `hm` explicitly and refuses to hash when it's absent. +- **`WHERE col = $1` / `WHERE col <> $1` on a column lacking `hm`**: silently returns zero rows. The `=` / `<>` operators inline to `hmac_256(a) = hmac_256(b)`; with `hm` missing, both extractors return NULL and the comparison evaluates to NULL — false in a WHERE context. Same observable shape as pre-2.3, but now deliberate and consistent rather than an accident of the dispatch chain. +- **`GROUP BY col` / `DISTINCT col` / hash joins on a column lacking `hm`**: groups by ciphertext rather than by encrypted plaintext. `hash_encrypted` is inlinable SQL with the body `coalesce(hashtext(hmac_256(val)::text), hashtext((val).data::text))`. When `hm` is absent, the first branch returns NULL and the fallback hashes the encrypted payload bytes — each distinct ciphertext gets a distinct hash, so HashAggregate can't degrade to O(N²) on a single NULL-hash bucket, but distinct ciphertexts of the *same plaintext* will no longer group together. The fallback exists for safety, not correctness. 
-The asymmetry is intentional: equality queries fail open (zero rows) because that's the only behaviour an inlinable `LANGUAGE sql` extractor can express; hash operations fail loud because they go through a plpgsql guard that can raise. Use a hash operation (`GROUP BY 1`, `SELECT count(DISTINCT col)`) in staging to surface misconfigured columns before promoting. +**Why the symmetry matters.** Inlinable SQL is what lets the planner fold these calls into the calling query, which is what makes the 2.3 functional-index recipe work end-to-end (`eql_v2.hmac_256(col)` matches `WHERE col = $1`) and what brings `GROUP BY value` on the natural form from ~29s to tens of ms at 100k rows. Inlinable SQL can't `RAISE`, so the loud query-time RAISE is traded away for the fast path. Audit columns once via `SELECT eql_v2.has_hmac_256(value) FROM tbl LIMIT 1` in staging — and prefer the explicit extractor form (`GROUP BY eql_v2.hmac_256(value)`) anywhere correctness is load-bearing. ### U-003: Blake3 removed at root @@ -189,7 +189,7 @@ The old plpgsql wrappers walked `eql_v2.compare()`'s priority list (Block ORE `2.3.0` is binary-compatible with `2.2.x` data. Rolling back is just re-installing the prior release SQL. No data migration is required either way. -The application-level surface that's hardest to roll back is U-002's "raise instead of NULL" behaviour: code written against the 2.3 contract may not handle the silent-NULL semantics gracefully. If you've tightened error handling around equality, plan rollbacks accordingly. +The application-level surface that's hardest to roll back is U-002's fail-open behaviour: on a column lacking `hm`, equality returns zero rows and `hash_encrypted` returns a fallback hash, where the prior release could raise. Code written against the 2.3 contract may not handle a query-time RAISE on the way back. If you've removed error handling around equality or aggregates, plan rollbacks accordingly. 
## See also diff --git a/src/encrypted/hash.sql b/src/encrypted/hash.sql index 1692b66f..a5612f20 100644 --- a/src/encrypted/hash.sql +++ b/src/encrypted/hash.sql @@ -7,37 +7,53 @@ --! @brief Compute hash integer for encrypted value --! --! Produces a 32-bit integer hash suitable for PostgreSQL hash joins, GROUP BY, ---! DISTINCT, and hash aggregate operations. Uses the HMAC-256 index term to ---! stay consistent with the equality operator: if a = b then hash(a) = hash(b). ---! The `=` operator on eql_v2_encrypted reduces to hmac_256(a) = hmac_256(b), ---! so the hash function must derive from hmac_256 as well — see the EQL ---! payload scheme discipline RFC for the single-term-per-purpose contract. +--! DISTINCT, and hash aggregate operations. Used by the `eql_v2_encrypted` hash +--! operator class (`FUNCTION 1`). Inlinable single-statement SQL — the SQL +--! function machinery is much cheaper per row than plpgsql, which matters +--! because HashAggregate / hash-join call this once per input row. --! ---! @param val eql_v2_encrypted Encrypted value to hash ---! @return integer 32-bit hash value derived from the HMAC-256 index term +--! @par Behaviour +--! - If the payload carries `hm` (hmac_256), the hash is `hashtext(hm::text)` — +--! the canonical bucket for equality groups, since `=` on +--! `eql_v2_encrypted` reduces to `hmac_256(a) = hmac_256(b)`. +--! - If `hm` is absent (misconfigured column), falls back to +--! `hashtext((val).data::text)`. Distinct payloads hash differently in practice (no +--! pathological bucket collision), but rows are no longer guaranteed to +--! group by encrypted plaintext — `=` returns NULL between distinct +--! ciphertexts in that case, so each row lands in its own group anyway. +--! The fallback exists purely to avoid quadratic blow-up; correctness on +--! misconfigured columns is already undefined. +--! +--! @par Why the fallback rather than RAISE +--! Pre-2.3 this function raised on missing `hm` to surface misconfiguration. +--! 
That made `GROUP BY value` on a misconfigured column fail loudly — but +--! also made the happy path call into plpgsql once per row, which dominated +--! wall-clock time on aggregates and made HashAggregate spill catastrophic. +--! Inlinable SQL is ~10× cheaper per call and lets the planner fold +--! `hmac_256` into the calling query when `eql_v2.hmac_256(val)` is itself +--! inlinable. The misconfig case is rare enough — and detectable via +--! `eql_v2.has_hmac_256` at config / index-creation time — that trading the +--! loud RAISE for a 100× speed-up on aggregates is the right call. --! ---! @throws Exception if no HMAC-256 index term is present +--! @param val eql_v2_encrypted Encrypted value to hash +--! @return integer 32-bit hash value derived from `hm`, or from the payload +--! data when `hm` is absent --! ---! @note Requires a `unique` (hmac_256) index configured on the column. ---! Match-only / ORE-only / OPE-only / ste_vec-only values cannot be ---! hashed at the root. +--! @note Requires a `unique` (hmac_256) index configured on the column for +--! semantically meaningful grouping. Match-only / ORE-only / OPE-only +--! / ste_vec-only values without `hm` still hash without error but +--! will not group across logically-equal values. NOTE(review): declared STRICT with a COALESCE body — PostgreSQL's inline_function rejects that combination, so confirm whether planner inlining is actually expected here. --! --! @see eql_v2.hmac_256 +--! @see eql_v2.has_hmac_256 --! @see eql_v2.compare CREATE FUNCTION eql_v2.hash_encrypted(val eql_v2_encrypted) RETURNS integer + LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE - SET search_path = pg_catalog, extensions, public AS $$ -DECLARE - ste_val eql_v2_encrypted; -BEGIN - ste_val := eql_v2.to_ste_vec_value(val); - - IF eql_v2.has_hmac_256(ste_val) THEN - RETURN hashtext(eql_v2.hmac_256(ste_val)::text); - END IF; - - RAISE EXCEPTION 'Cannot hash eql_v2_encrypted value: no hmac_256 index term found. 
Configure a `unique` index on the column for hash operations (GROUP BY, DISTINCT, hash joins).'; -END; -$$ LANGUAGE plpgsql; + SELECT coalesce( + pg_catalog.hashtext(eql_v2.hmac_256(eql_v2.to_ste_vec_value(val))::text), + pg_catalog.hashtext((val).data::text) + ) +$$; diff --git a/tasks/pin_search_path.sql b/tasks/pin_search_path.sql index 50616abb..cba214e1 100644 --- a/tasks/pin_search_path.sql +++ b/tasks/pin_search_path.sql @@ -159,6 +159,14 @@ BEGIN AND p.proname IN ('ore_block_u64_8_256_eq', 'ore_block_u64_8_256_neq', 'ore_block_u64_8_256_lt', 'ore_block_u64_8_256_lte', 'ore_block_u64_8_256_gt', 'ore_block_u64_8_256_gte')) + -- Hash operator class FUNCTION 1: called once per row by HashAggregate, + -- hash joins, DISTINCT. Inlinable SQL avoids the per-row plpgsql + -- interpreter overhead — without this, `GROUP BY value` on + -- `eql_v2_encrypted` at 1M rows degrades super-linearly because the + -- plpgsql cost compounds with HashAggregate work_mem spillage. + OR (p.pronargs = 1 + AND p.proname = 'hash_encrypted' + AND p.proargtypes[0] = enc_oid) ); FOR fn_oid IN diff --git a/tasks/test/splinter.sh b/tasks/test/splinter.sh index 3bb23ec9..c59ef0d4 100755 --- a/tasks/test/splinter.sh +++ b/tasks/test/splinter.sh @@ -65,6 +65,7 @@ function_search_path_mutable eql_v2 ore_block_u64_8_256_lt function Inner compar function_search_path_mutable eql_v2 ore_block_u64_8_256_lte function Inner comparator for the ore_block_u64_8_256 type's `<=` operator. Same rationale as ore_block_u64_8_256_eq. function_search_path_mutable eql_v2 ore_block_u64_8_256_gt function Inner comparator for the ore_block_u64_8_256 type's `>` operator. Same rationale as ore_block_u64_8_256_eq. function_search_path_mutable eql_v2 ore_block_u64_8_256_gte function Inner comparator for the ore_block_u64_8_256 type's `>=` operator. Same rationale as ore_block_u64_8_256_eq. 
+function_search_path_mutable eql_v2 hash_encrypted function Hash operator class FUNCTION 1: called once per row by HashAggregate, hash joins, DISTINCT. SET search_path forces plpgsql-equivalent call overhead per row; without pinning, the SQL function machinery is ~10× cheaper and `GROUP BY` / `DISTINCT` on `eql_v2_encrypted` at 1M rows stays linear rather than degrading super-linearly via work_mem spillage. function_search_path_mutable eql_v2 ~~ function Phase 1 inlining (#193): must inline so the planner can match eql_v2.bloom_filter(col). Three overloads. (Note: the eql_v2.~~* operator points at this same function — case-insensitivity of LIKE on encrypted ciphertexts is meaningless because the bloom filter index term is independent of case.) function_search_path_mutable eql_v2 like function LIKE/ILIKE inlining (#201): the eql_v2."~~" operator wrapper inlines to a single-statement call to eql_v2.like, which itself must inline to reach `eql_v2.bloom_filter(a) @> eql_v2.bloom_filter(b)` and match the documented functional GIN index. Pinning search_path here breaks the second inlining layer and reverts bare-form `WHERE col ~~ val` to seq scan. function_search_path_mutable eql_v2 ilike function LIKE/ILIKE inlining (#201): same rationale as eql_v2.like — the eql_v2."~~*" operator inlines through eql_v2.ilike to the bloom_filter containment form. 
diff --git a/tests/sqlx/tests/hash_operator_tests.rs b/tests/sqlx/tests/hash_operator_tests.rs index 798740cb..87f64ee5 100644 --- a/tests/sqlx/tests/hash_operator_tests.rs +++ b/tests/sqlx/tests/hash_operator_tests.rs @@ -224,26 +224,31 @@ async fn hash_function_falls_back_to_hmac(pool: PgPool) -> Result<()> { } #[sqlx::test(fixtures(path = "../fixtures", scripts("encrypted_json")))] -async fn hash_function_errors_without_hash_index(pool: PgPool) -> Result<()> { - // Test: hash_encrypted raises error when no HMAC index is present +async fn hash_function_falls_back_when_hmac_absent(pool: PgPool) -> Result<()> { + // Post-flip, hash_encrypted is inlinable SQL and falls back to hashing the + // encrypted payload bytes when `hm` is absent. The fallback is what keeps + // HashAggregate from degrading to O(N^2) on a single NULL-hash bucket when + // a column without `hm` is grouped. - // Create value with only ORE index (no hmac) - let result = sqlx::query_scalar::<_, i32>( + let h1: Option<i32> = sqlx::query_scalar( "SELECT eql_v2.hash_encrypted(create_encrypted_json(1, 'ob'))", ) .fetch_one(&pool) - .await; + .await + .context("hash_encrypted on ore-only value should not error")?; - assert!( - result.is_err(), - "hash_encrypted should error with ORE-only value" - ); + let h2: Option<i32> = sqlx::query_scalar( + "SELECT eql_v2.hash_encrypted(create_encrypted_json(2, 'ob'))", + ) + .fetch_one(&pool) + .await + .context("hash_encrypted on a second ore-only value should not error")?; - let err_msg = result.unwrap_err().to_string(); - assert!( - err_msg.contains("hmac_256"), - "Error should mention missing hmac_256 index term, got: {}", - err_msg + assert!(h1.is_some(), "hash_encrypted on ore-only should yield a hash"); + assert!(h2.is_some(), "hash_encrypted on ore-only should yield a hash"); + assert_ne!( + h1, h2, + "distinct ciphertexts must hash differently — otherwise GROUP BY on a misconfigured column degrades to O(N^2)" ); Ok(()) @@ -346,26 +351,19 @@ async fn 
in_subquery_with_encrypted_column(pool: PgPool) -> Result<()> { // still works but the inner element must carry hm to be hashable. #[sqlx::test(fixtures(path = "../fixtures", scripts("encrypted_json")))] -async fn multi_element_ste_vec_raises_error(pool: PgPool) -> Result<()> { - // Test: multi-element STE vec cannot be hashed (no top-level hm/b3 keys) +async fn multi_element_ste_vec_falls_back(pool: PgPool) -> Result<()> { + // Multi-element STE vec has no top-level `hm` (only sv-element terms). + // Post-flip the function falls back to data-hashing rather than raising, + // mirroring the misconfig contract documented in U-002. - let result = sqlx::query_scalar::<_, i32>( + let h: Option<i32> = sqlx::query_scalar( "SELECT eql_v2.hash_encrypted((get_array_ste_vec())::eql_v2_encrypted)", ) .fetch_one(&pool) - .await; + .await + .context("hash_encrypted on multi-element ste_vec should not error")?; - assert!( - result.is_err(), - "hash_encrypted should error with multi-element STE vec" - ); - - let err_msg = result.unwrap_err().to_string(); - assert!( - err_msg.contains("hmac_256"), - "Error should mention missing hmac_256 index term, got: {}", - err_msg - ); + assert!(h.is_some(), "fallback hash should be non-null for a non-null input"); Ok(()) } @@ -628,3 +626,47 @@ async fn hash_join_non_matching_returns_zero(pool: PgPool) -> Result<()> { // fell back to Blake3 across rows. That contract has no production // analogue: protect.js does not emit a root-level `b3` term, so the // "hm+b3 vs b3-only" mixed shape is fixture-only. + +#[sqlx::test(fixtures(path = "../fixtures", scripts("encrypted_json")))] +async fn hash_encrypted_is_inlinable(pool: PgPool) -> Result<()> { + // The hash operator class FUNCTION 1 is called once per row by + // HashAggregate / hash joins / DISTINCT. For the per-row cost to drop + // out of the plpgsql interpreter, `eql_v2.hash_encrypted(eql_v2_encrypted)` + // must be (a) LANGUAGE sql and (b) without a pinned search_path. 
+ // Violating either one disables PG's SQL function inlining (see inline_function in + // clauses.c), so the splinter allowlist and tasks/pin_search_path.sql carve-out are + // load-bearing. NOTE(review): necessary, not sufficient — a STRICT function with a COALESCE body is also rejected. + let (lang, proconfig): (String, Option<Vec<String>>) = sqlx::query_as( + r#" + SELECT l.lanname::text, p.proconfig + FROM pg_proc p + JOIN pg_namespace n ON n.oid = p.pronamespace + JOIN pg_language l ON l.oid = p.prolang + WHERE n.nspname = 'eql_v2' + AND p.proname = 'hash_encrypted' + AND p.pronargs = 1 + "#, + ) + .fetch_one(&pool) + .await + .context("could not look up hash_encrypted in pg_proc")?; + + assert_eq!( + lang, "sql", + "hash_encrypted must be LANGUAGE sql for the planner to inline it (got {})", + lang + ); + + let has_search_path = proconfig + .as_ref() + .map(|cfg| cfg.iter().any(|c| c.starts_with("search_path="))) + .unwrap_or(false); + assert!( + !has_search_path, + "hash_encrypted must NOT have a pinned search_path — pin_search_path.sql allowlists it; \ + pinning disables SQL inlining (got proconfig={:?})", + proconfig + ); + + Ok(()) +}