diff --git a/src/serving/masking.py b/src/serving/masking.py index 9258b94..58ff6ba 100644 --- a/src/serving/masking.py +++ b/src/serving/masking.py @@ -98,7 +98,11 @@ def mask_query_results( return [dict(row) for row in rows], False # Resolve projection lineage so a renamed/derived PII column is masked by # what it's *built from*, not by its output name. (audit_30_06_26.md D2) - source_columns = self._projection_source_columns(sql) + # The real result keys (projection order) let an *unaliased* expression be + # keyed by its true DuckDB output name. (audit_30 D2 follow-up: unaliased + # expression PII leak) + output_keys = list(rows[0].keys()) if rows else [] + source_columns = self._projection_source_columns(sql, output_keys) # Apply every matched entity's masking rules. A multi-entity JOIN must not # bypass masking — returning the rows unmasked leaked cleartext PII # (e.g. users_enriched JOIN orders_v2). Mask the union of all matched @@ -111,8 +115,10 @@ def mask_query_results( ] return masked_rows, masked_rows != rows - def _projection_source_columns(self, sql: str) -> dict[str, frozenset[str]] | None: - """Map each output column to the source column names that feed it. + def _projection_source_columns( + self, sql: str, output_keys: list[str] + ) -> dict[str, frozenset[str]] | None: + """Map each *result* column to the source column names that feed it. Resolves *true* projection lineage (through subqueries, CTEs and unions) so a PII column renamed at any nesting depth is masked by what it is @@ -121,9 +127,19 @@ def _projection_source_columns(self, sql: str) -> dict[str, frozenset[str]] | No ``SELECT contact FROM (SELECT email AS contact FROM users_enriched)`` — returned cleartext. (audit_30 D2 follow-up: subquery/CTE-alias bypass) + Projections are aligned *positionally* to the real DuckDB result keys + (``output_keys``; projection order == result-column order) so an + *unaliased* expression — ``SELECT upper(email) FROM users_enriched`` + (output column ``upper(email)``), ``SELECT email || '' …`` + (``(email || '')``) — is keyed by its true output name and masked by the + column it references. sqlglot's own rendering does not reproduce DuckDB's + column naming (case and parenthesisation differ), so the names must come + from the rows. (audit_30 D2 follow-up: unaliased-expression PII leak) + Returns ``None`` for a ``SELECT *`` (whose outputs are the source names - verbatim, so name-matching is correct) or unparseable SQL, so masking - falls back to matching rule fields against output names. + verbatim, so name-matching is correct), for unparseable SQL, or when the + projections can't be aligned 1:1 to the result keys, so masking falls + back to matching rule fields against output names. """ try: parsed = sqlglot.parse_one(sql, read="duckdb") @@ -132,29 +148,40 @@ def _projection_source_columns(self, sql: str) -> dict[str, frozenset[str]] | No select = parsed.find(exp.Select) if select is None: return None + projections = select.expressions if any( isinstance(projection, exp.Star) or projection.find(exp.Star) is not None - for projection in select.expressions + for projection in projections ): return None + # Align projections to the real result keys positionally. A count mismatch + # means an expansion we can't map 1:1 — fall back to name-matching rather + # than mis-key a column. + if not output_keys or len(projections) != len(output_keys): + return None mapping: dict[str, frozenset[str]] = {} - for projection in select.expressions: + for projection, output_key in zip(projections, output_keys, strict=False): output_name = projection.alias_or_name - if not output_name: - continue - # Union the *deep* lineage sources (which trace a renamed PII column - # through subqueries/CTEs) with the *shallow* columns named directly - # in this projection. Lineage is blind through an inner ``SELECT *`` - # (no schema to expand it), where the shallow scan still catches a - # direct ``email AS contact``; lineage catches the inner rename the - # shallow scan misses. Either alone leaks one shape — the union - # closes both. - deep = self._lineage_source_columns(output_name, sql) - if isinstance(deep, _UnresolvedSources): - mapping[output_name] = deep - continue - shallow = frozenset(col.name for col in projection.find_all(exp.Column) if col.name) - mapping[output_name] = deep | shallow + if output_name: + # Union the *deep* lineage sources (which trace a renamed PII + # column through subqueries/CTEs) with the *shallow* columns named + # directly in this projection. Lineage is blind through an inner + # ``SELECT *`` (handled fail-closed in _lineage_source_columns); + # the shallow scan still catches a direct ``email AS contact``. + # Either alone leaks one shape — the union closes both. + deep = self._lineage_source_columns(output_name, sql) + if isinstance(deep, _UnresolvedSources): + mapping[output_key] = deep + continue + shallow = frozenset(col.name for col in projection.find_all(exp.Column) if col.name) + mapping[output_key] = deep | shallow + else: + # An unaliased expression has no output name for lineage to key on, + # so mask by the columns it directly references (``upper(email)`` -> + # ``email``). A nested star was already excluded above. + mapping[output_key] = frozenset( + col.name for col in projection.find_all(exp.Column) if col.name + ) return mapping def _lineage_source_columns(self, output_name: str, sql: str) -> frozenset[str]: diff --git a/tests/unit/test_masking.py b/tests/unit/test_masking.py index 76148c6..3e71189 100644 --- a/tests/unit/test_masking.py +++ b/tests/unit/test_masking.py @@ -250,6 +250,46 @@ def test_mask_query_results_masks_pii_renamed_above_inner_select_star(tmp_path: assert rows == [{"c": "a***@example.com"}], sql +def test_mask_query_results_masks_unaliased_pii_expression(tmp_path: Path): + # An unaliased expression over PII has no `alias_or_name`, so the resolver + # skipped it and the output column kept DuckDB's rendered name (`upper(email)`, + # `(email || '')`) — which never matched a rule field, so cleartext leaked. + # Aligning projections positionally to the real result keys maps the column to + # the source it references and masks it. (audit_30 D2 follow-up: unaliased-expr) + masker = PiiMasker(_user_email_config(tmp_path)) + + for sql, out_col in ( + ("SELECT upper(email) FROM users_enriched", "upper(email)"), + ("SELECT email || '' FROM users_enriched", "(email || '')"), + ): + rows, masked = masker.mask_query_results( + sql, + [{out_col: "alice@example.com"}], + tenant="acme", + table_to_entity={"users_enriched": "user"}, + ) + + assert masked is True, sql + assert rows == [{out_col: "a***@example.com"}], sql + + +def test_mask_query_results_does_not_overmask_directly_named_nonpii(tmp_path: Path): + # Positional alignment must key each projection to its *own* result column: a + # directly-named non-PII column alongside a PII one stays untouched (no star, + # no derivation -> resolvable lineage, not the fail-closed sentinel). + masker = PiiMasker(_user_email_config(tmp_path)) + + rows, masked = masker.mask_query_results( + "SELECT email, user_id FROM users_enriched", + [{"email": "alice@example.com", "user_id": "U-1"}], + tenant="acme", + table_to_entity={"users_enriched": "user"}, + ) + + assert masked is True + assert rows == [{"email": "a***@example.com", "user_id": "U-1"}] + + def test_mask_query_results_masks_select_star_by_name(tmp_path: Path): # SELECT * has no resolvable projection lineage; masking falls back to # matching rule fields against the (canonical) output column names.