From c4bea1934e1bc61810632d29a398b96a070ad70f Mon Sep 17 00:00:00 2001 From: Pierre Massat Date: Mon, 15 Jun 2026 13:16:12 -0700 Subject: [PATCH 1/9] fix(eap): Keep existence if(...) out of IN/NOT_IN so the analyzer can name it The previous typed-NULL fix (#8032) made the isNull(...) occurrence of the existence if(mapContains, arrayElement, NULL) consistent, but EAP IN / NOT IN filters on optional attributes still died under the new ClickHouse analyzer: Code: 10. DB::Exception: Not found column sumOrNullIf(... in(if(..., arrayElement(...), _CAST(NULL_Nullable(String), 'Nullable(String)')), __set)) ... in block. There are only columns: ... in(if(..., arrayElement(...), NULL_Nullable(Nothing)), __set) ... The remaining divergence is the in(...) left operand itself: the analyzer keeps the explicit CAST when it names the projection column but folds it to a bare NULL_Nullable(Nothing) when it computes the block, so the aggregate column no longer matches its projection name. This happens regardless of whether the NULL is bare or typed, because it is the IN-operand coercion that diverges. Stop feeding the existence if(...) into in(...) entirely. For IN/NOT_IN on map-backed attributes, compare the raw arrayElement(...) value (which the existence pass leaves untouched, so no NULL constant survives inside in(...)) and guard it with an explicit mapContains: IN: and(mapContains, in(arrayElement, set)) NOT IN: not(and(mapContains, in(arrayElement, set))) mapContains is what distinguishes an absent key from a stored empty value ('' / 0, which arrayElement also returns for a missing key). This is equivalent to the old isNull(if(mapContains, arrayElement, NULL)) handling for every value list these filters accept: the typed array fields only carry scalars, so the set never contains NULL and the old has(set, NULL) branch was always constant false. The predicate now contains no NULL literal at all. Non-IN comparisons are unchanged, as are array/normalized/boolean keys (which carry no SubscriptableReference and so never produced the foldable if(...)). Verified semantically against a local ClickHouse via the existing OP_IN-in-conditional-aggregation e2e tests; the analyzer column-naming itself must still be replayed with enable_analyzer=1 before re-enabling the analyzer in production. Fixes SNUBA-B62 Fixes SNUBA-B6C Fixes SNUBA-A13 Co-Authored-By: Claude Opus 4.8 (1M context) Agent transcript: https://claudescope.sentry.dev/share/cWD_0FRYO1tqYclqE9vE0yfuaJS-O8RTXHO4feo9IzI --- snuba/web/rpc/common/common.py | 93 ++++++++++++++++++++++- tests/web/rpc/test_common.py | 133 ++++++++++++++++++++++++++++++++- 2 files changed, 223 insertions(+), 3 deletions(-) diff --git a/snuba/web/rpc/common/common.py b/snuba/web/rpc/common/common.py index ff0349ef27..62e50cf8fa 100644 --- a/snuba/web/rpc/common/common.py +++ b/snuba/web/rpc/common/common.py @@ -1,4 +1,5 @@ import json +from dataclasses import replace from datetime import datetime, timedelta, timezone from typing import Any, Callable, TypeVar, cast @@ -24,6 +25,7 @@ from snuba.query.dsl import Functions as f from snuba.query.dsl import ( and_cond, + arrayElement, column, in_cond, literal, @@ -289,6 +291,85 @@ def transform(exp: Expression) -> Expression: query.transform_expressions(transform) +def _contains_subscriptable_reference(exp: Expression) -> bool: + """True if ``exp`` has any ``SubscriptableReference`` in its subtree. + + Those are exactly the nodes ``add_existence_check_to_subscriptable_references`` + later wraps in ``if(mapContains(...), ..., NULL)``. When such a wrapped + reference ends up as the left operand of ``in(...)`` it triggers the new + analyzer column-name mismatch (see ``_analyzer_safe_in_expression``). + """ + found = False + + def visit(node: Expression) -> Expression: + nonlocal found + if isinstance(node, SubscriptableReference): + found = True + return node + + exp.transform(visit) + return found + + +def _subscriptable_references_to_array_element(exp: Expression) -> Expression: + """Lower every ``SubscriptableReference`` in ``exp`` to a bare ``arrayElement``. + + ``add_existence_check_to_subscriptable_references`` only rewrites + ``SubscriptableReference`` nodes, so pre-lowering them to ``arrayElement`` + keeps that pass from wrapping them in ``if(mapContains(...), ..., NULL)``. + The result carries no NULL constant, so it is safe as the left operand of + ``in(...)`` (see ``_analyzer_safe_in_expression``). ``arrayElement`` returns + the map's default ('' / 0) for a missing key rather than NULL — callers pair + this with an explicit ``mapContains`` guard to restore NULL semantics. + + All aliases are stripped: the lowered expression is structurally different + from the existence ``if(...)`` the SELECT clause may still carry for the same + attribute, so reusing the alias would collide ("different expression, same + alias"). Condition expressions don't need aliases anyway. + """ + + def transform(node: Expression) -> Expression: + if isinstance(node, SubscriptableReference): + return arrayElement(None, node.column, node.key) + if node.alias is not None: + return replace(node, alias=None) + return node + + return exp.transform(transform) + + +def _analyzer_safe_in_expression( + k_expression: Expression, v_expression: Expression, *, negated: bool +) -> Expression: + """Build an ``IN`` / ``NOT IN`` predicate the new ClickHouse analyzer can name + consistently. + + The straightforward form ``in(if(mapContains(...), arrayElement(...), NULL), set)`` + puts a NULL constant inside the ``in(...)`` left operand. The new analyzer + folds that NULL differently when it names the projection column versus when + it computes the block (it strips the redundant CAST in one place but not the + other), so the aggregate column no longer matches its projection name and the + query dies with ``Code: 10. DB::Exception: Not found column ... in block``. + + Instead of feeding the existence ``if(...)`` into ``in(...)`` we compare the + raw ``arrayElement(...)`` value and guard it with an explicit ``mapContains`` + so a missing key (which ``arrayElement`` reads as the column default '' / 0, + not NULL) can never match: + + * IN: key exists AND value in set + * NOT IN: NOT (key exists AND value in set) -> missing key is "not in" + + This is equivalent to the legacy ``isNull(if(mapContains, arrayElement, + NULL))`` handling for every value list these filters accept: the typed array + fields (``val_str_array`` etc.) only carry scalars, so the set never contains + NULL and the old ``has(set, NULL)`` branch was always a constant ``false``. + """ + exists = get_field_existence_expression(k_expression) + membership_value = _subscriptable_references_to_array_element(k_expression) + present = and_cond(exists, in_cond(membership_value, v_expression)) + return not_cond(present) if negated else present + + def _scalar_value(v: AttributeValue) -> bool | str | int | float | None: """Extract a Python scalar from an AttributeValue proto.""" match v.WhichOneof("value"): @@ -718,10 +799,14 @@ def trace_item_filters_to_expression( None, [literal(elem.val_str.lower()) for elem in v.val_array.values], ) - expr = in_cond(k_expression, v_expression) # note: v_expression must be an array # we redefine the way in works for nulls # now null in ['hi', null] is true + if _contains_subscriptable_reference(k_expression): + # Map-backed attribute: avoid feeding the existence if(...) into + # in(...), which the new analyzer canonicalizes inconsistently. + return _analyzer_safe_in_expression(k_expression, v_expression, negated=False) + expr = in_cond(k_expression, v_expression) expr_with_null = or_cond( expr, and_cond(f.isNull(k_expression), f.has(v_expression, literal(None))), @@ -741,10 +826,14 @@ def trace_item_filters_to_expression( None, [literal(elem.val_str.lower()) for elem in v.val_array.values], ) - expr = not_cond(in_cond(k_expression, v_expression)) # note: v_expression must be an array # we redefine the way not in works for nulls # now null not in ['hi'] is true + if _contains_subscriptable_reference(k_expression): + # Map-backed attribute: avoid feeding the existence if(...) into + # in(...), which the new analyzer canonicalizes inconsistently. + return _analyzer_safe_in_expression(k_expression, v_expression, negated=True) + expr = not_cond(in_cond(k_expression, v_expression)) expr_with_null = or_cond( expr, and_cond( diff --git a/tests/web/rpc/test_common.py b/tests/web/rpc/test_common.py index fe018e7756..8bd36205ff 100644 --- a/tests/web/rpc/test_common.py +++ b/tests/web/rpc/test_common.py @@ -31,9 +31,17 @@ from snuba.datasets.storages.factory import get_storage from snuba.datasets.storages.storage_key import StorageKey from snuba.protos.common import ATTRIBUTES_TO_COALESCE -from snuba.query.expressions import FunctionCall, Lambda, Literal +from snuba.query.expressions import ( + Expression, + FunctionCall, + Lambda, + Literal, + SubscriptableReference, +) +from snuba.query.logical import Query from snuba.web.rpc.common.common import ( _any_attribute_filter_to_expression, + add_existence_check_to_subscriptable_references, attribute_key_to_expression, next_monday, prev_monday, @@ -262,6 +270,129 @@ def test_exists_filter_on_non_coalesced_attribute_unchanged(self) -> None: assert expr.function_name == "mapContains" +class TestInNotInAnalyzerSafe: + """IN / NOT IN on a map-backed attribute must not feed the existence + ``if(mapContains(...), arrayElement(...), NULL)`` into ``in(...)``. + + That shape makes the new ClickHouse analyzer name the projection column and + the computed block inconsistently (it folds the NULL else-branch differently + in each), surfacing as ``Code: 10. DB::Exception: Not found column ... in + block`` (SNUBA-B62 / SNUBA-B6C / SNUBA-A13). We compare the raw + ``arrayElement`` value and guard NULL handling with an explicit + ``mapContains`` instead. + """ + + @staticmethod + def _walk(expr: Expression) -> list[Expression]: + nodes: list[Expression] = [] + + def visit(node: Expression) -> Expression: + nodes.append(node) + return node + + expr.transform(visit) + return nodes + + @staticmethod + def _make_in_filter( + name: str, + op: ComparisonFilter.Op.ValueType, + values: list[str], + ) -> TraceItemFilter: + return TraceItemFilter( + comparison_filter=ComparisonFilter( + key=AttributeKey(type=AttributeKey.Type.TYPE_STRING, name=name), + op=op, + value=AttributeValue(val_str_array=StrArray(values=values)), + ) + ) + + def _assert_in_operand_is_array_element(self, expr: Expression) -> None: + # The whole point: every in(...) left operand is a raw arrayElement, not + # an if(...) carrying a NULL constant. + in_calls = [ + n for n in self._walk(expr) if isinstance(n, FunctionCall) and n.function_name == "in" + ] + assert in_calls, "expected an in(...) call in the predicate" + for in_call in in_calls: + operand = in_call.parameters[0] + assert isinstance(operand, FunctionCall), operand + assert operand.function_name == "arrayElement", operand.function_name + + def _assert_no_subscriptable_references(self, expr: Expression) -> None: + # No SubscriptableReference means add_existence_check_to_subscriptable_references + # is a no-op here, so it can never reintroduce the if(...) inside in(...). + assert not any(isinstance(n, SubscriptableReference) for n in self._walk(expr)) + + def _assert_no_null_literals(self, expr: Expression) -> None: + # The predicate must contain no NULL literal at all -- not in in(...), + # and not in a has(set, NULL) term either. + assert not any(isinstance(n, Literal) and n.value is None for n in self._walk(expr)) + + def test_not_in_uses_array_element_and_map_contains(self) -> None: + item_filter = self._make_in_filter( + "some.custom.tag", ComparisonFilter.OP_NOT_IN, ["a", "b"] + ) + expr = trace_item_filters_to_expression(item_filter, attribute_key_to_expression) + self._assert_in_operand_is_array_element(expr) + self._assert_no_subscriptable_references(expr) + + # not( and(mapContains, in(arrayElement, set)) ) -- no NULL anywhere. + assert isinstance(expr, FunctionCall) and expr.function_name == "not" + names = {n.function_name for n in self._walk(expr) if isinstance(n, FunctionCall)} + assert {"not", "and", "in", "mapContains", "arrayElement"} <= names + assert "has" not in names + self._assert_no_null_literals(expr) + + def test_in_uses_array_element_and_map_contains(self) -> None: + item_filter = self._make_in_filter("some.custom.tag", ComparisonFilter.OP_IN, ["a", "b"]) + expr = trace_item_filters_to_expression(item_filter, attribute_key_to_expression) + self._assert_in_operand_is_array_element(expr) + self._assert_no_subscriptable_references(expr) + # and(mapContains, in(arrayElement, set)) -- no NULL anywhere. + assert isinstance(expr, FunctionCall) and expr.function_name == "and" + self._assert_no_null_literals(expr) + + def test_not_in_ignore_case_lowers_array_element(self) -> None: + item_filter = TraceItemFilter( + comparison_filter=ComparisonFilter( + key=AttributeKey(type=AttributeKey.Type.TYPE_STRING, name="some.custom.tag"), + op=ComparisonFilter.OP_NOT_IN, + value=AttributeValue(val_str_array=StrArray(values=["A", "B"])), + ignore_case=True, + ) + ) + expr = trace_item_filters_to_expression(item_filter, attribute_key_to_expression) + self._assert_no_subscriptable_references(expr) + # in(...) operand is lower(arrayElement(...)); set values are lowercased. + in_calls = [ + n for n in self._walk(expr) if isinstance(n, FunctionCall) and n.function_name == "in" + ] + assert len(in_calls) == 1 + operand = in_calls[0].parameters[0] + assert isinstance(operand, FunctionCall) and operand.function_name == "lower" + inner = operand.parameters[0] + assert isinstance(inner, FunctionCall) and inner.function_name == "arrayElement" + + def test_existence_pass_is_noop_on_in_predicate(self) -> None: + """add_existence_check_to_subscriptable_references must not reintroduce an + if(...) inside the in(...) — it has no SubscriptableReference to wrap.""" + item_filter = self._make_in_filter( + "some.custom.tag", ComparisonFilter.OP_NOT_IN, ["a", "b"] + ) + expr = trace_item_filters_to_expression(item_filter, attribute_key_to_expression) + + query = Query(from_clause=None, condition=expr) + add_existence_check_to_subscriptable_references(query) + transformed = query.get_condition() + assert transformed is not None + self._assert_in_operand_is_array_element(transformed) + # No if(...) wrapper anywhere — the analyzer-foldable shape is gone. + assert not any( + isinstance(n, FunctionCall) and n.function_name == "if" for n in self._walk(transformed) + ) + + @pytest.mark.redis_db @pytest.mark.clickhouse_db def test_convert_rpc_exception_to_proto_packs_details() -> None: From 57a905275ef44f46dce35bbd04198fe31a49fcf0 Mon Sep 17 00:00:00 2001 From: Pierre Massat Date: Mon, 15 Jun 2026 15:14:26 -0700 Subject: [PATCH 2/9] ref(eap): Build EQUALS/NOT_EQUALS/LIKE/NOT_LIKE from (mapContains, arrayElement) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generalize the IN/NOT_IN analyzer-safe rewrite to the other per-key comparison operators. The legacy idiom wrapped every map lookup in if(mapContains, arrayElement, NULL) and detected an absent key via isNull(...), which drags a foldable NULL constant into the WHERE / aggregate-condition predicate — the same shape that breaks the new ClickHouse analyzer for IN. isNull(if(mapContains, arrayElement, NULL)) is just not(mapContains), so each operator is rebuilt from a (value, exists) pair via the shared _map_backed_operands helper: EQUALS: and(exists, equals(value, v)); = null -> not(exists) NOT_EQUALS: not(and(exists, equals(value, v))); != null -> exists LIKE: and(exists, like(value, v)) NOT_LIKE: not(and(exists, like(value, v))) exists = mapContains is the only thing that separates an absent key from a stored empty value (arrayElement reads both as '' / 0), so the empty-vs-absent distinction is preserved. The predicates now carry no NULL literal. Coalesced attributes are excluded (_use_map_backed_operands): coalesce relies on each branch being NULL for a missing key to fall through to the next, which the if(..., NULL) wrapper provides but a bare arrayElement does not — they keep the legacy isNull-based handling. Normalized columns / booleans / arrays were already on the legacy path (no SubscriptableReference). Adds AST tests (TestComparisonAnalyzerSafe) asserting the new shapes carry no NULL literal / SubscriptableReference / foldable if(...), and end-to-end empty-vs-absent tests (TestEmptyVsAbsentComparison) pinning that each operator classifies a stored empty value and an absent key identically before and after. Co-Authored-By: Claude Opus 4.8 (1M context) --- snuba/web/rpc/common/common.py | 98 +++++++++++++- tests/web/rpc/test_common.py | 238 +++++++++++++++++++++++++++++++++ 2 files changed, 330 insertions(+), 6 deletions(-) diff --git a/snuba/web/rpc/common/common.py b/snuba/web/rpc/common/common.py index 62e50cf8fa..034d9a0a22 100644 --- a/snuba/web/rpc/common/common.py +++ b/snuba/web/rpc/common/common.py @@ -311,6 +311,34 @@ def visit(node: Expression) -> Expression: return found +def _contains_coalesce(exp: Expression) -> bool: + """True if ``exp`` has any ``coalesce(...)`` call in its subtree.""" + found = False + + def visit(node: Expression) -> Expression: + nonlocal found + if isinstance(node, FunctionCall) and node.function_name == "coalesce": + found = True + return node + + exp.transform(visit) + return found + + +def _use_map_backed_operands(k_expression: Expression) -> bool: + """Whether the analyzer-safe ``(mapContains, arrayElement)`` rewrite applies. + + Requires a map lookup (a ``SubscriptableReference``) and excludes *coalesced* + attributes. ``coalesce(if(mapContains(a), a[k], NULL), if(mapContains(b), ...))`` + relies on each branch being NULL for a missing key so it can fall through to + the next; a bare ``arrayElement`` returns the column default ('' / 0), never + NULL, so lowering it would pin the result to the first key. Coalesced keys + therefore keep the legacy ``isNull``-based handling (the ``if(..., NULL)`` + wrapper is load-bearing there). + """ + return _contains_subscriptable_reference(k_expression) and not _contains_coalesce(k_expression) + + def _subscriptable_references_to_array_element(exp: Expression) -> Expression: """Lower every ``SubscriptableReference`` in ``exp`` to a bare ``arrayElement``. @@ -364,12 +392,36 @@ def _analyzer_safe_in_expression( fields (``val_str_array`` etc.) only carry scalars, so the set never contains NULL and the old ``has(set, NULL)`` branch was always a constant ``false``. """ - exists = get_field_existence_expression(k_expression) - membership_value = _subscriptable_references_to_array_element(k_expression) - present = and_cond(exists, in_cond(membership_value, v_expression)) + value, exists = _map_backed_operands(k_expression) + present = and_cond(exists, in_cond(value, v_expression)) return not_cond(present) if negated else present +def _map_backed_operands(k_expression: Expression) -> tuple[Expression, Expression]: + """Return ``(value, exists)`` for a map-backed attribute key expression. + + ``value`` is the raw ``arrayElement(...)`` with every ``SubscriptableReference`` + lowered (so ``add_existence_check_to_subscriptable_references`` cannot wrap it + in ``if(..., NULL)``), and ``exists`` is the ``mapContains`` existence guard. + Per-key comparisons are then built as ``and(exists, cmp(value, v))`` (and + negated for ``!=`` / ``NOT LIKE`` / ``NOT IN``). This keeps every NULL constant + out of the WHERE / aggregate-condition predicate — see + ``_analyzer_safe_in_expression`` for why that matters to the new analyzer — + while preserving the absent-vs-empty distinction: a missing key has + ``exists = false``, whereas a stored empty value has ``exists = true`` and + ``value = ''`` (``arrayElement`` reads both as the column default, so the + ``mapContains`` guard is the only thing that tells them apart). + + Only valid when ``_contains_subscriptable_reference(k_expression)`` is true; + normalized columns / booleans / arrays keep their legacy ``isNull``-based + handling. + """ + return ( + _subscriptable_references_to_array_element(k_expression), + get_field_existence_expression(k_expression), + ) + + def _scalar_value(v: AttributeValue) -> bool | str | int | float | None: """Extract a Python scalar from an AttributeValue proto.""" match v.WhichOneof("value"): @@ -691,7 +743,8 @@ def trace_item_filters_to_expression( if value_type is None: raise BadSnubaRPCRequestException("comparison does not have a right hand side") - if v.is_null or value_type == "val_null": + v_is_null = v.is_null or value_type == "val_null" + if v_is_null: v_expression: Expression = literal(None) else: v_expression = _attribute_value_to_expression(v) @@ -703,6 +756,19 @@ def trace_item_filters_to_expression( return _type_array_includes_scalar_expression( k_expression, v, item_filter.comparison_filter.ignore_case ) + if _use_map_backed_operands(k_expression): + # Map-backed attribute: compare the raw arrayElement value guarded + # by mapContains, keeping NULL out of the predicate. + value, exists = _map_backed_operands(k_expression) + if v_is_null: + # `attr = null` means the key is absent. + return not_cond(exists) + lhs, rhs = ( + (f.lower(value), f.lower(v_expression)) + if item_filter.comparison_filter.ignore_case + else (value, v_expression) + ) + return and_cond(exists, f.equals(lhs, rhs)) else: expr = ( f.equals(f.lower(k_expression), f.lower(v_expression)) @@ -723,6 +789,18 @@ def trace_item_filters_to_expression( k_expression, v, item_filter.comparison_filter.ignore_case ) ) + if _use_map_backed_operands(k_expression): + # Negation of the OP_EQUALS form: an absent key is "not equal". + value, exists = _map_backed_operands(k_expression) + if v_is_null: + # `attr != null` means the key is present. + return exists + lhs, rhs = ( + (f.lower(value), f.lower(v_expression)) + if item_filter.comparison_filter.ignore_case + else (value, v_expression) + ) + return not_cond(and_cond(exists, f.equals(lhs, rhs))) else: expr = ( f.notEquals(f.lower(k_expression), f.lower(v_expression)) @@ -751,6 +829,9 @@ def trace_item_filters_to_expression( "the LIKE comparison is only supported on string and array keys" ) comparison_function = f.ilike if item_filter.comparison_filter.ignore_case else f.like + if _use_map_backed_operands(k_expression): + value, exists = _map_backed_operands(k_expression) + return and_cond(exists, comparison_function(value, v_expression)) return comparison_function(k_expression, v_expression) if op == ComparisonFilter.OP_NOT_LIKE: if k.type == AttributeKey.Type.TYPE_ARRAY: @@ -769,6 +850,11 @@ def trace_item_filters_to_expression( raise BadSnubaRPCRequestException( "the NOT LIKE comparison is only supported on string and array keys" ) + if _use_map_backed_operands(k_expression): + # Negation of the OP_LIKE form: an absent key is "not like". + like_fn = f.ilike if item_filter.comparison_filter.ignore_case else f.like + value, exists = _map_backed_operands(k_expression) + return not_cond(and_cond(exists, like_fn(value, v_expression))) comparison_function = ( f.notILike if item_filter.comparison_filter.ignore_case else f.notLike ) @@ -802,7 +888,7 @@ def trace_item_filters_to_expression( # note: v_expression must be an array # we redefine the way in works for nulls # now null in ['hi', null] is true - if _contains_subscriptable_reference(k_expression): + if _use_map_backed_operands(k_expression): # Map-backed attribute: avoid feeding the existence if(...) into # in(...), which the new analyzer canonicalizes inconsistently. return _analyzer_safe_in_expression(k_expression, v_expression, negated=False) @@ -829,7 +915,7 @@ def trace_item_filters_to_expression( # note: v_expression must be an array # we redefine the way not in works for nulls # now null not in ['hi'] is true - if _contains_subscriptable_reference(k_expression): + if _use_map_backed_operands(k_expression): # Map-backed attribute: avoid feeding the existence if(...) into # in(...), which the new analyzer canonicalizes inconsistently. return _analyzer_safe_in_expression(k_expression, v_expression, negated=True) diff --git a/tests/web/rpc/test_common.py b/tests/web/rpc/test_common.py index 8bd36205ff..e96c77221d 100644 --- a/tests/web/rpc/test_common.py +++ b/tests/web/rpc/test_common.py @@ -20,6 +20,7 @@ StrArray, ) from sentry_protos.snuba.v1.trace_item_filter_pb2 import ( + AndFilter, AnyAttributeFilter, ComparisonFilter, ExistsFilter, @@ -393,6 +394,102 @@ def test_existence_pass_is_noop_on_in_predicate(self) -> None: ) +class TestComparisonAnalyzerSafe: + """EQUALS / NOT_EQUALS / LIKE / NOT_LIKE on map-backed attributes are built + from a (mapContains, arrayElement) pair instead of the legacy + if(mapContains, arrayElement, NULL) + isNull(...) idiom, so no NULL constant + is fed into the WHERE / aggregate-condition predicate (same class of new + analyzer fragility as IN/NOT_IN), while the absent-vs-empty distinction is + preserved by the explicit mapContains guard. + """ + + @staticmethod + def _walk(expr: Expression) -> list[Expression]: + nodes: list[Expression] = [] + + def visit(node: Expression) -> Expression: + nodes.append(node) + return node + + expr.transform(visit) + return nodes + + def _cmp( + self, + op: ComparisonFilter.Op.ValueType, + *, + value: str | None = None, + is_null: bool = False, + ignore_case: bool = False, + ) -> Expression: + av = AttributeValue(val_null=True) if is_null else AttributeValue(val_str=value or "") + item_filter = TraceItemFilter( + comparison_filter=ComparisonFilter( + key=AttributeKey(type=AttributeKey.Type.TYPE_STRING, name="some.custom.tag"), + op=op, + value=av, + ignore_case=ignore_case, + ) + ) + return trace_item_filters_to_expression(item_filter, attribute_key_to_expression) + + def _fn_names(self, expr: Expression) -> set[str]: + return {n.function_name for n in self._walk(expr) if isinstance(n, FunctionCall)} + + def _assert_clean(self, expr: Expression) -> None: + # No NULL literal, no leftover SubscriptableReference, no foldable if(...). + for n in self._walk(expr): + assert not (isinstance(n, Literal) and n.value is None) + assert not isinstance(n, SubscriptableReference) + assert not (isinstance(n, FunctionCall) and n.function_name == "if") + + def test_equals_is_guarded_arrayelement(self) -> None: + expr = self._cmp(ComparisonFilter.OP_EQUALS, value="ok") + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "and" + assert {"and", "mapContains", "equals", "arrayElement"} <= self._fn_names(expr) + + def test_equals_null_is_not_map_contains(self) -> None: + # `attr = null` == key absent == not(mapContains(...)) + expr = self._cmp(ComparisonFilter.OP_EQUALS, is_null=True) + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "not" + assert self._fn_names(expr) == {"not", "mapContains"} + + def test_not_equals_is_negated_guarded_equals(self) -> None: + expr = self._cmp(ComparisonFilter.OP_NOT_EQUALS, value="ok") + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "not" + assert {"not", "and", "mapContains", "equals", "arrayElement"} <= self._fn_names(expr) + + def test_not_equals_null_is_map_contains(self) -> None: + # `attr != null` == key present == mapContains(...) + expr = self._cmp(ComparisonFilter.OP_NOT_EQUALS, is_null=True) + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "mapContains" + + def test_like_is_guarded_arrayelement(self) -> None: + expr = self._cmp(ComparisonFilter.OP_LIKE, value="%ok%") + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "and" + assert {"and", "mapContains", "like", "arrayElement"} <= self._fn_names(expr) + + def test_not_like_is_negated_guarded_like(self) -> None: + expr = self._cmp(ComparisonFilter.OP_NOT_LIKE, value="%ok%") + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "not" + names = self._fn_names(expr) + assert {"not", "and", "mapContains", "like", "arrayElement"} <= names + # positive like under the negation, not notLike + assert "notLike" not in names + + def test_not_like_ignore_case_uses_ilike(self) -> None: + expr = self._cmp(ComparisonFilter.OP_NOT_LIKE, value="%ok%", ignore_case=True) + self._assert_clean(expr) + names = self._fn_names(expr) + assert "ilike" in names and "notILike" not in names + + @pytest.mark.redis_db @pytest.mark.clickhouse_db def test_convert_rpc_exception_to_proto_packs_details() -> None: @@ -699,3 +796,144 @@ def test_no_match_returns_empty(self) -> None: ) ) assert colors == [] + + +@pytest.mark.eap +@pytest.mark.redis_db +class TestEmptyVsAbsentComparison: + """End-to-end: the migrated per-key comparisons (EQUALS / NOT_EQUALS / LIKE / + NOT_LIKE) must keep distinguishing a *stored empty value* from an *absent + key*, since arrayElement reads both as '' and only mapContains tells them + apart. Three spans, by the attribute under test: + - present, value "ok" (color "red") + - present, empty "" (color "blue") + - absent (color "green") + + A unique per-run batch tag (present on all three rows and AND-ed into every + query) isolates these rows, so the absent-key cases aren't polluted by other + rows in the shared table (which also lack the attribute under test). + """ + + # Custom attribute names not present in gen_item_message's default set + # (which injects e.g. status="ok"), so we fully control present/empty/absent. + ATTR = "test.empty_vs_absent.status" + BATCH_ATTR = "test.empty_vs_absent.batch" + + @pytest.fixture(autouse=True) + def setup(self, eap: None, redis_db: None) -> None: + self.batch = f"batch-{uuid.uuid4().hex}" + self.base_time = datetime.now(tz=timezone.utc).replace( + minute=0, second=0, microsecond=0 + ) - timedelta(hours=1) + self.start_ts = Timestamp(seconds=int((self.base_time - timedelta(hours=1)).timestamp())) + self.end_ts = Timestamp(seconds=int((self.base_time + timedelta(hours=2)).timestamp())) + batch = AnyValue(string_value=self.batch) + messages = [ + gen_item_message( + start_timestamp=self.base_time, + attributes={ + self.BATCH_ATTR: batch, + self.ATTR: AnyValue(string_value="ok"), + "color": AnyValue(string_value="red"), + }, + ), + gen_item_message( + start_timestamp=self.base_time + timedelta(minutes=1), + attributes={ + self.BATCH_ATTR: batch, + self.ATTR: AnyValue(string_value=""), + "color": AnyValue(string_value="blue"), + }, + ), + gen_item_message( + start_timestamp=self.base_time + timedelta(minutes=2), + attributes={ + self.BATCH_ATTR: batch, + "color": AnyValue(string_value="green"), + }, + ), + ] + storage = get_storage(StorageKey("eap_items")) + write_raw_unprocessed_events(storage, messages) # type: ignore + + def _execute( + self, + op: ComparisonFilter.Op.ValueType, + *, + value: str | None = None, + is_null: bool = False, + ) -> list[str]: + av = AttributeValue(val_null=True) if is_null else AttributeValue(val_str=value or "") + filt = TraceItemFilter( + and_filter=AndFilter( + filters=[ + TraceItemFilter( + comparison_filter=ComparisonFilter( + key=AttributeKey(type=AttributeKey.TYPE_STRING, name=self.BATCH_ATTR), + op=ComparisonFilter.OP_EQUALS, + value=AttributeValue(val_str=self.batch), + ) + ), + TraceItemFilter( + comparison_filter=ComparisonFilter( + key=AttributeKey(type=AttributeKey.TYPE_STRING, name=self.ATTR), + op=op, + value=av, + ) + ), + ] + ) + ) + message = TraceItemTableRequest( + meta=RequestMeta( + project_ids=[1], + organization_id=1, + cogs_category="test", + referrer="test", + start_timestamp=self.start_ts, + end_timestamp=self.end_ts, + request_id=uuid.uuid4().hex, + trace_item_type=TraceItemType.TRACE_ITEM_TYPE_SPAN, + ), + filter=filt, + columns=[Column(key=AttributeKey(type=AttributeKey.TYPE_STRING, name="color"))], + order_by=[ + TraceItemTableRequest.OrderBy( + column=Column(key=AttributeKey(type=AttributeKey.TYPE_STRING, name="color")) + ) + ], + limit=100, + ) + response = EndpointTraceItemTable().execute(message) + if not response.column_values: + return [] + return sorted(r.val_str for r in response.column_values[0].results) + + def test_equals_empty_matches_only_stored_empty(self) -> None: + # The critical case: `status = ''` must match the stored empty, NOT the + # absent key (arrayElement reads both as ''). + assert self._execute(ComparisonFilter.OP_EQUALS, value="") == ["blue"] + + def test_equals_value_matches_only_that_value(self) -> None: + assert self._execute(ComparisonFilter.OP_EQUALS, value="ok") == ["red"] + + def test_equals_null_matches_only_absent(self) -> None: + assert self._execute(ComparisonFilter.OP_EQUALS, is_null=True) == ["green"] + + def test_not_equals_value_includes_empty_and_absent(self) -> None: + assert self._execute(ComparisonFilter.OP_NOT_EQUALS, value="ok") == ["blue", "green"] + + def test_not_equals_empty_includes_value_and_absent(self) -> None: + assert self._execute(ComparisonFilter.OP_NOT_EQUALS, value="") == ["green", "red"] + + def test_not_equals_null_matches_only_present(self) -> None: + assert self._execute(ComparisonFilter.OP_NOT_EQUALS, is_null=True) == ["blue", "red"] + + def test_like_wildcard_matches_present_not_absent(self) -> None: + # '%' matches both present rows (incl. the stored empty) but never the + # absent key. + assert self._execute(ComparisonFilter.OP_LIKE, value="%") == ["blue", "red"] + + def test_not_like_wildcard_matches_only_absent(self) -> None: + # Present rows all `like '%'`, so only the absent key survives NOT LIKE. + assert self._execute(ComparisonFilter.OP_NOT_LIKE, value="%") == ["green"] From 5b70a74c82bd811d5b520a4fe152d4db75910789 Mon Sep 17 00:00:00 2001 From: Pierre Massat Date: Mon, 15 Jun 2026 15:40:58 -0700 Subject: [PATCH 3/9] ref(eap): Generate coalesced filter values as multiIf instead of coalesce MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit excluded coalesced attributes from the analyzer-safe rewrite, leaving common filter attributes (transaction, environment, http.method, release, user.email, ...) on the legacy coalesce(if(mapContains, ..., NULL), ...) path — still feeding a foldable NULL into the predicate. Instead of rewriting the generated coalesce, generate the NULL-free form directly from the AttributeKey for the filter path: a multiIf over the per-key mapContains, with the existence OR as the guard. value = multiIf(mapContains(k1), arrayElement(k1), ..., mapContains(k_{n-1}), arrayElement(k_{n-1}), arrayElement(kn)) exists = mapContains(k1) OR ... OR mapContains(kn) This preserves the first-present-key fallthrough (the multiIf else value is only reached when every key is absent, where exists is already false) and the empty-vs-absent distinction, with no NULL anywhere. _map_backed_operands now takes the AttributeKey and reuses _generate_subscriptable_reference per coalesced key, so it never builds a coalesce object. The legacy coalesce(if(..., NULL)) form remains only in SELECT (via attribute_key_to_expression), where NULL output is wanted. Coverage: TestComparisonAnalyzerSafe asserts the coalesced shape is a multiIf with no coalesce / NULL / SubscriptableReference / if(...). The existing test_coalesce_attributes (writes deprecated ai.model_id, filters on canonical gen_ai.response.model) exercises the fallthrough end-to-end through this path. Co-Authored-By: Claude Opus 4.8 (1M context) --- snuba/web/rpc/common/common.py | 161 ++++++++++++++++++--------------- tests/web/rpc/test_common.py | 21 +++++ 2 files changed, 108 insertions(+), 74 deletions(-) diff --git a/snuba/web/rpc/common/common.py b/snuba/web/rpc/common/common.py index 034d9a0a22..13a4e3a13a 100644 --- a/snuba/web/rpc/common/common.py +++ b/snuba/web/rpc/common/common.py @@ -14,7 +14,9 @@ from snuba import settings, state from snuba.protos.common import ( + ATTRIBUTES_TO_COALESCE, MalformedAttributeException, + _generate_subscriptable_reference, type_array_to_membership_array_expression, ) from snuba.protos.common import ( @@ -311,34 +313,6 @@ def visit(node: Expression) -> Expression: return found -def _contains_coalesce(exp: Expression) -> bool: - """True if ``exp`` has any ``coalesce(...)`` call in its subtree.""" - found = False - - def visit(node: Expression) -> Expression: - nonlocal found - if isinstance(node, FunctionCall) and node.function_name == "coalesce": - found = True - return node - - exp.transform(visit) - return found - - -def _use_map_backed_operands(k_expression: Expression) -> bool: - """Whether the analyzer-safe ``(mapContains, arrayElement)`` rewrite applies. - - Requires a map lookup (a ``SubscriptableReference``) and excludes *coalesced* - attributes. ``coalesce(if(mapContains(a), a[k], NULL), if(mapContains(b), ...))`` - relies on each branch being NULL for a missing key so it can fall through to - the next; a bare ``arrayElement`` returns the column default ('' / 0), never - NULL, so lowering it would pin the result to the first key. Coalesced keys - therefore keep the legacy ``isNull``-based handling (the ``if(..., NULL)`` - wrapper is load-bearing there). - """ - return _contains_subscriptable_reference(k_expression) and not _contains_coalesce(k_expression) - - def _subscriptable_references_to_array_element(exp: Expression) -> Expression: """Lower every ``SubscriptableReference`` in ``exp`` to a bare ``arrayElement``. @@ -366,8 +340,62 @@ def transform(node: Expression) -> Expression: return exp.transform(transform) +def _map_backed_operands(k: AttributeKey) -> tuple[Expression, Expression]: + """Build ``(value, exists)`` for a map-backed attribute key, directly and + without ever constructing a ``coalesce``. + + A plain attribute resolves to a single ``arrayElement``; a coalesced + attribute resolves to the value of the first of its (canonical + deprecated) + keys that is present. We express that with a NULL-free ``multiIf`` over the + per-key ``mapContains``: + + value = multiIf(mapContains(k1), arrayElement(k1), + ..., + mapContains(k_{n-1}), arrayElement(k_{n-1}), + arrayElement(kn)) # else: last key's raw value + exists = mapContains(k1) OR ... OR mapContains(kn) + + Per-key comparisons are then built as ``and(exists, cmp(value, v))`` (negated + for ``!=`` / ``NOT LIKE`` / ``NOT IN``). This keeps every NULL constant out of + the WHERE / aggregate-condition predicate — see ``_analyzer_safe_in_expression`` + for why that matters to the new analyzer — while preserving the + absent-vs-empty distinction: a missing key has ``exists = false``, whereas a + stored empty value has ``exists = true`` and ``value = ''`` (``arrayElement`` + reads both as the column default, so ``mapContains`` is the only thing that + tells them apart). The ``multiIf`` else value is only reached when every key + is absent, where ``exists`` is already false, so it never affects the result. + + The legacy ``coalesce(if(mapContains, ..., NULL), ...)`` form (built by + ``attribute_key_to_expression`` and used in SELECT) leans on each branch being + NULL to fall through; we avoid it entirely here. + + Only valid for map-backed keys (string/int/float) — i.e. exactly when + ``_contains_subscriptable_reference`` of the built key expression is true. + Normalized columns / booleans / arrays keep their legacy ``isNull`` handling. + """ + names = [k.name] + list(ATTRIBUTES_TO_COALESCE.get(k.name, ())) + branches = [_generate_subscriptable_reference(name, k.type) for name in names] + values = [_subscriptable_references_to_array_element(b) for b in branches] + existences = [get_field_existence_expression(b) for b in branches] + + exists = combine_or_conditions(existences) if len(existences) > 1 else existences[0] + if len(values) == 1: + value: Expression = values[0] + else: + args: list[Expression] = [] + for cond, val in zip(existences[:-1], values[:-1]): + args.extend((cond, val)) + args.append(values[-1]) + value = f.multiIf(*args) + return value, exists + + def _analyzer_safe_in_expression( - k_expression: Expression, v_expression: Expression, *, negated: bool + k: AttributeKey, + v_expression: Expression, + *, + negated: bool, + ignore_case: bool = False, ) -> Expression: """Build an ``IN`` / ``NOT IN`` predicate the new ClickHouse analyzer can name consistently. @@ -380,9 +408,9 @@ def _analyzer_safe_in_expression( query dies with ``Code: 10. DB::Exception: Not found column ... in block``. Instead of feeding the existence ``if(...)`` into ``in(...)`` we compare the - raw ``arrayElement(...)`` value and guard it with an explicit ``mapContains`` - so a missing key (which ``arrayElement`` reads as the column default '' / 0, - not NULL) can never match: + NULL-free value and guard it with an explicit ``mapContains`` so a missing key + (which ``arrayElement`` reads as the column default '' / 0, not NULL) can + never match: * IN: key exists AND value in set * NOT IN: NOT (key exists AND value in set) -> missing key is "not in" @@ -392,36 +420,13 @@ def _analyzer_safe_in_expression( fields (``val_str_array`` etc.) only carry scalars, so the set never contains NULL and the old ``has(set, NULL)`` branch was always a constant ``false``. """ - value, exists = _map_backed_operands(k_expression) + value, exists = _map_backed_operands(k) + if ignore_case: + value = f.lower(value) present = and_cond(exists, in_cond(value, v_expression)) return not_cond(present) if negated else present -def _map_backed_operands(k_expression: Expression) -> tuple[Expression, Expression]: - """Return ``(value, exists)`` for a map-backed attribute key expression. - - ``value`` is the raw ``arrayElement(...)`` with every ``SubscriptableReference`` - lowered (so ``add_existence_check_to_subscriptable_references`` cannot wrap it - in ``if(..., NULL)``), and ``exists`` is the ``mapContains`` existence guard. - Per-key comparisons are then built as ``and(exists, cmp(value, v))`` (and - negated for ``!=`` / ``NOT LIKE`` / ``NOT IN``). This keeps every NULL constant - out of the WHERE / aggregate-condition predicate — see - ``_analyzer_safe_in_expression`` for why that matters to the new analyzer — - while preserving the absent-vs-empty distinction: a missing key has - ``exists = false``, whereas a stored empty value has ``exists = true`` and - ``value = ''`` (``arrayElement`` reads both as the column default, so the - ``mapContains`` guard is the only thing that tells them apart). - - Only valid when ``_contains_subscriptable_reference(k_expression)`` is true; - normalized columns / booleans / arrays keep their legacy ``isNull``-based - handling. - """ - return ( - _subscriptable_references_to_array_element(k_expression), - get_field_existence_expression(k_expression), - ) - - def _scalar_value(v: AttributeValue) -> bool | str | int | float | None: """Extract a Python scalar from an AttributeValue proto.""" match v.WhichOneof("value"): @@ -756,10 +761,10 @@ def trace_item_filters_to_expression( return _type_array_includes_scalar_expression( k_expression, v, item_filter.comparison_filter.ignore_case ) - if _use_map_backed_operands(k_expression): + if _contains_subscriptable_reference(k_expression): # Map-backed attribute: compare the raw arrayElement value guarded # by mapContains, keeping NULL out of the predicate. - value, exists = _map_backed_operands(k_expression) + value, exists = _map_backed_operands(k) if v_is_null: # `attr = null` means the key is absent. return not_cond(exists) @@ -789,9 +794,9 @@ def trace_item_filters_to_expression( k_expression, v, item_filter.comparison_filter.ignore_case ) ) - if _use_map_backed_operands(k_expression): + if _contains_subscriptable_reference(k_expression): # Negation of the OP_EQUALS form: an absent key is "not equal". - value, exists = _map_backed_operands(k_expression) + value, exists = _map_backed_operands(k) if v_is_null: # `attr != null` means the key is present. return exists @@ -829,8 +834,8 @@ def trace_item_filters_to_expression( "the LIKE comparison is only supported on string and array keys" ) comparison_function = f.ilike if item_filter.comparison_filter.ignore_case else f.like - if _use_map_backed_operands(k_expression): - value, exists = _map_backed_operands(k_expression) + if _contains_subscriptable_reference(k_expression): + value, exists = _map_backed_operands(k) return and_cond(exists, comparison_function(value, v_expression)) return comparison_function(k_expression, v_expression) if op == ComparisonFilter.OP_NOT_LIKE: @@ -850,10 +855,10 @@ def trace_item_filters_to_expression( raise BadSnubaRPCRequestException( "the NOT LIKE comparison is only supported on string and array keys" ) - if _use_map_backed_operands(k_expression): + if _contains_subscriptable_reference(k_expression): # Negation of the OP_LIKE form: an absent key is "not like". like_fn = f.ilike if item_filter.comparison_filter.ignore_case else f.like - value, exists = _map_backed_operands(k_expression) + value, exists = _map_backed_operands(k) return not_cond(and_cond(exists, like_fn(value, v_expression))) comparison_function = ( f.notILike if item_filter.comparison_filter.ignore_case else f.notLike @@ -873,8 +878,8 @@ def trace_item_filters_to_expression( return f.greaterOrEquals(k_expression, v_expression) if op == ComparisonFilter.OP_IN: _check_non_string_values_cannot_ignore_case(item_filter.comparison_filter) - if item_filter.comparison_filter.ignore_case: - k_expression = f.lower(k_expression) + ignore_case = item_filter.comparison_filter.ignore_case + if ignore_case: if value_type == "val_str_array": v_expression = literals_array( None, @@ -888,10 +893,14 @@ def trace_item_filters_to_expression( # note: v_expression must be an array # we redefine the way in works for nulls # now null in ['hi', null] is true - if _use_map_backed_operands(k_expression): + if _contains_subscriptable_reference(k_expression): # Map-backed attribute: avoid feeding the existence if(...) into # in(...), which the new analyzer canonicalizes inconsistently. - return _analyzer_safe_in_expression(k_expression, v_expression, negated=False) + return _analyzer_safe_in_expression( + k, v_expression, negated=False, ignore_case=ignore_case + ) + if ignore_case: + k_expression = f.lower(k_expression) expr = in_cond(k_expression, v_expression) expr_with_null = or_cond( expr, @@ -900,8 +909,8 @@ def trace_item_filters_to_expression( return expr_with_null if op == ComparisonFilter.OP_NOT_IN: _check_non_string_values_cannot_ignore_case(item_filter.comparison_filter) - if item_filter.comparison_filter.ignore_case: - k_expression = f.lower(k_expression) + ignore_case = item_filter.comparison_filter.ignore_case + if ignore_case: if value_type == "val_str_array": v_expression = literals_array( None, @@ -915,10 +924,14 @@ def trace_item_filters_to_expression( # note: v_expression must be an array # we redefine the way not in works for nulls # now null not in ['hi'] is true - if _use_map_backed_operands(k_expression): + if _contains_subscriptable_reference(k_expression): # Map-backed attribute: avoid feeding the existence if(...) into # in(...), which the new analyzer canonicalizes inconsistently. - return _analyzer_safe_in_expression(k_expression, v_expression, negated=True) + return _analyzer_safe_in_expression( + k, v_expression, negated=True, ignore_case=ignore_case + ) + if ignore_case: + k_expression = f.lower(k_expression) expr = not_cond(in_cond(k_expression, v_expression)) expr_with_null = or_cond( expr, diff --git a/tests/web/rpc/test_common.py b/tests/web/rpc/test_common.py index e96c77221d..22b4be0f95 100644 --- a/tests/web/rpc/test_common.py +++ b/tests/web/rpc/test_common.py @@ -489,6 +489,27 @@ def test_not_like_ignore_case_uses_ilike(self) -> None: names = self._fn_names(expr) assert "ilike" in names and "notILike" not in names + def test_coalesced_attribute_uses_multi_if_not_coalesce(self) -> None: + # A coalesced attribute is generated as a NULL-free multiIf over the + # per-key mapContains (never a coalesce(if(..., NULL), ...)), so it is + # analyzer-safe too. + canonical = "transaction" + assert canonical in ATTRIBUTES_TO_COALESCE, "test precondition: key must be coalesced" + item_filter = TraceItemFilter( + comparison_filter=ComparisonFilter( + key=AttributeKey(type=AttributeKey.Type.TYPE_STRING, name=canonical), + op=ComparisonFilter.OP_EQUALS, + value=AttributeValue(val_str="t"), + ) + ) + expr = trace_item_filters_to_expression(item_filter, attribute_key_to_expression) + self._assert_clean(expr) + names = self._fn_names(expr) + assert "multiIf" in names + assert "coalesce" not in names + # existence is an OR of mapContains across canonical + deprecated keys + assert {"and", "equals", "mapContains", "or", "arrayElement"} <= names + @pytest.mark.redis_db @pytest.mark.clickhouse_db From 92d75f62aa1b7694dbf889bb7a26f23f9848e68e Mon Sep 17 00:00:00 2001 From: Pierre Massat Date: Mon, 15 Jun 2026 16:07:15 -0700 Subject: [PATCH 4/9] ref(eap): Condense analyzer-safe filter docstrings and tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No behavior change. Trim the repeated analyzer-bug explanation to one canonical description in _map_backed_operands (other helpers reference it), shorten inline comments, and merge the two near-duplicate AST test classes (TestInNotInAnalyzerSafe + TestComparisonAnalyzerSafe) into a single TestAnalyzerSafeFilters sharing one set of walk/assert helpers. Drops the redundant existence-pass no-op test (its guarantee — no SubscriptableReference / if(...) in the predicate — is already covered by _assert_clean). Co-Authored-By: Claude Opus 4.8 (1M context) --- snuba/web/rpc/common/common.py | 122 +++++------------ tests/web/rpc/test_common.py | 242 +++++++++------------------------ 2 files changed, 103 insertions(+), 261 deletions(-) diff --git a/snuba/web/rpc/common/common.py b/snuba/web/rpc/common/common.py index 13a4e3a13a..6f67edd0e7 100644 --- a/snuba/web/rpc/common/common.py +++ b/snuba/web/rpc/common/common.py @@ -294,13 +294,8 @@ def transform(exp: Expression) -> Expression: def _contains_subscriptable_reference(exp: Expression) -> bool: - """True if ``exp`` has any ``SubscriptableReference`` in its subtree. - - Those are exactly the nodes ``add_existence_check_to_subscriptable_references`` - later wraps in ``if(mapContains(...), ..., NULL)``. When such a wrapped - reference ends up as the left operand of ``in(...)`` it triggers the new - analyzer column-name mismatch (see ``_analyzer_safe_in_expression``). - """ + """True if ``exp`` contains a ``SubscriptableReference`` (a map lookup) — the + map-backed keys we route through ``_map_backed_operands``.""" found = False def visit(node: Expression) -> Expression: @@ -314,20 +309,11 @@ def visit(node: Expression) -> Expression: def _subscriptable_references_to_array_element(exp: Expression) -> Expression: - """Lower every ``SubscriptableReference`` in ``exp`` to a bare ``arrayElement``. - - ``add_existence_check_to_subscriptable_references`` only rewrites - ``SubscriptableReference`` nodes, so pre-lowering them to ``arrayElement`` - keeps that pass from wrapping them in ``if(mapContains(...), ..., NULL)``. - The result carries no NULL constant, so it is safe as the left operand of - ``in(...)`` (see ``_analyzer_safe_in_expression``). ``arrayElement`` returns - the map's default ('' / 0) for a missing key rather than NULL — callers pair - this with an explicit ``mapContains`` guard to restore NULL semantics. - - All aliases are stripped: the lowered expression is structurally different - from the existence ``if(...)`` the SELECT clause may still carry for the same - attribute, so reusing the alias would collide ("different expression, same - alias"). Condition expressions don't need aliases anyway. + """Lower ``SubscriptableReference``\\ s to bare ``arrayElement`` (no NULL). + + Aliases are stripped so the lowered form can't collide with the existence + ``if(...)`` the SELECT clause may carry for the same attribute (same alias, + different expression). """ def transform(node: Expression) -> Expression: @@ -341,37 +327,26 @@ def transform(node: Expression) -> Expression: def _map_backed_operands(k: AttributeKey) -> tuple[Expression, Expression]: - """Build ``(value, exists)`` for a map-backed attribute key, directly and - without ever constructing a ``coalesce``. - - A plain attribute resolves to a single ``arrayElement``; a coalesced - attribute resolves to the value of the first of its (canonical + deprecated) - keys that is present. We express that with a NULL-free ``multiIf`` over the - per-key ``mapContains``: - - value = multiIf(mapContains(k1), arrayElement(k1), - ..., - mapContains(k_{n-1}), arrayElement(k_{n-1}), - arrayElement(kn)) # else: last key's raw value + """Build ``(value, exists)`` for a map-backed key, NULL-free. + + The legacy ``if(mapContains, arrayElement, NULL)`` form puts a NULL constant + into the predicate; the new ClickHouse analyzer canonicalizes that NULL + inconsistently between an aggregate's projection name and its computed block + (notably inside ``in(...)``), so the column isn't found ("Code: 10 ... Not + found column ... in block"). Building from ``mapContains`` + ``arrayElement`` + avoids the NULL entirely: + + value = arrayElement(k) # single key + value = multiIf(mapContains(k1), arrayElement(k1), ..., # coalesced: + arrayElement(kn)) # first present exists = mapContains(k1) OR ... OR mapContains(kn) - Per-key comparisons are then built as ``and(exists, cmp(value, v))`` (negated - for ``!=`` / ``NOT LIKE`` / ``NOT IN``). This keeps every NULL constant out of - the WHERE / aggregate-condition predicate — see ``_analyzer_safe_in_expression`` - for why that matters to the new analyzer — while preserving the - absent-vs-empty distinction: a missing key has ``exists = false``, whereas a - stored empty value has ``exists = true`` and ``value = ''`` (``arrayElement`` - reads both as the column default, so ``mapContains`` is the only thing that - tells them apart). The ``multiIf`` else value is only reached when every key - is absent, where ``exists`` is already false, so it never affects the result. - - The legacy ``coalesce(if(mapContains, ..., NULL), ...)`` form (built by - ``attribute_key_to_expression`` and used in SELECT) leans on each branch being - NULL to fall through; we avoid it entirely here. - - Only valid for map-backed keys (string/int/float) — i.e. exactly when - ``_contains_subscriptable_reference`` of the built key expression is true. - Normalized columns / booleans / arrays keep their legacy ``isNull`` handling. + Callers build ``and(exists, cmp(value, v))`` (negated for !=/NOT LIKE/NOT IN). + ``exists`` — not the value — distinguishes a missing key from a stored empty + value, since ``arrayElement`` reads both as the '' / 0 default; the coalesced + ``multiIf`` else is only reached when all keys are absent (``exists`` false). + Only valid for map-backed keys (string/int/float); the legacy + ``coalesce(if(..., NULL))`` form remains in SELECT, where NULL output is wanted. """ names = [k.name] + list(ATTRIBUTES_TO_COALESCE.get(k.name, ())) branches = [_generate_subscriptable_reference(name, k.type) for name in names] @@ -397,29 +372,11 @@ def _analyzer_safe_in_expression( negated: bool, ignore_case: bool = False, ) -> Expression: - """Build an ``IN`` / ``NOT IN`` predicate the new ClickHouse analyzer can name - consistently. - - The straightforward form ``in(if(mapContains(...), arrayElement(...), NULL), set)`` - puts a NULL constant inside the ``in(...)`` left operand. The new analyzer - folds that NULL differently when it names the projection column versus when - it computes the block (it strips the redundant CAST in one place but not the - other), so the aggregate column no longer matches its projection name and the - query dies with ``Code: 10. DB::Exception: Not found column ... in block``. - - Instead of feeding the existence ``if(...)`` into ``in(...)`` we compare the - NULL-free value and guard it with an explicit ``mapContains`` so a missing key - (which ``arrayElement`` reads as the column default '' / 0, not NULL) can - never match: - - * IN: key exists AND value in set - * NOT IN: NOT (key exists AND value in set) -> missing key is "not in" - - This is equivalent to the legacy ``isNull(if(mapContains, arrayElement, - NULL))`` handling for every value list these filters accept: the typed array - fields (``val_str_array`` etc.) only carry scalars, so the set never contains - NULL and the old ``has(set, NULL)`` branch was always a constant ``false``. - """ + """``IN`` / ``NOT IN`` as ``[not] and(exists, in(value, set))`` via + ``_map_backed_operands`` — keeps the NULL-laden existence ``if(...)`` out of + ``in(...)`` (see that helper). Equivalent to the legacy null-handling: the + value lists only carry scalars, so the set never contains NULL (the old + ``has(set, NULL)`` branch was always constant ``false``).""" value, exists = _map_backed_operands(k) if ignore_case: value = f.lower(value) @@ -762,11 +719,9 @@ def trace_item_filters_to_expression( k_expression, v, item_filter.comparison_filter.ignore_case ) if _contains_subscriptable_reference(k_expression): - # Map-backed attribute: compare the raw arrayElement value guarded - # by mapContains, keeping NULL out of the predicate. + # Map-backed: NULL-free (exists, value) form (see _map_backed_operands). value, exists = _map_backed_operands(k) - if v_is_null: - # `attr = null` means the key is absent. + if v_is_null: # `attr = null` <=> key absent return not_cond(exists) lhs, rhs = ( (f.lower(value), f.lower(v_expression)) @@ -795,10 +750,9 @@ def trace_item_filters_to_expression( ) ) if _contains_subscriptable_reference(k_expression): - # Negation of the OP_EQUALS form: an absent key is "not equal". + # Negation of OP_EQUALS; an absent key is "not equal". value, exists = _map_backed_operands(k) - if v_is_null: - # `attr != null` means the key is present. + if v_is_null: # `attr != null` <=> key present return exists lhs, rhs = ( (f.lower(value), f.lower(v_expression)) @@ -856,7 +810,7 @@ def trace_item_filters_to_expression( "the NOT LIKE comparison is only supported on string and array keys" ) if _contains_subscriptable_reference(k_expression): - # Negation of the OP_LIKE form: an absent key is "not like". + # Negation of OP_LIKE; an absent key is "not like". like_fn = f.ilike if item_filter.comparison_filter.ignore_case else f.like value, exists = _map_backed_operands(k) return not_cond(and_cond(exists, like_fn(value, v_expression))) @@ -894,8 +848,7 @@ def trace_item_filters_to_expression( # we redefine the way in works for nulls # now null in ['hi', null] is true if _contains_subscriptable_reference(k_expression): - # Map-backed attribute: avoid feeding the existence if(...) into - # in(...), which the new analyzer canonicalizes inconsistently. + # Map-backed: keep the existence if(...) out of in() (see helper). return _analyzer_safe_in_expression( k, v_expression, negated=False, ignore_case=ignore_case ) @@ -925,8 +878,7 @@ def trace_item_filters_to_expression( # we redefine the way not in works for nulls # now null not in ['hi'] is true if _contains_subscriptable_reference(k_expression): - # Map-backed attribute: avoid feeding the existence if(...) into - # in(...), which the new analyzer canonicalizes inconsistently. + # Map-backed: keep the existence if(...) out of in() (see helper). return _analyzer_safe_in_expression( k, v_expression, negated=True, ignore_case=ignore_case ) diff --git a/tests/web/rpc/test_common.py b/tests/web/rpc/test_common.py index 22b4be0f95..ea73d307f8 100644 --- a/tests/web/rpc/test_common.py +++ b/tests/web/rpc/test_common.py @@ -39,10 +39,8 @@ Literal, SubscriptableReference, ) -from snuba.query.logical import Query from snuba.web.rpc.common.common import ( _any_attribute_filter_to_expression, - add_existence_check_to_subscriptable_references, attribute_key_to_expression, next_monday, prev_monday, @@ -271,16 +269,12 @@ def test_exists_filter_on_non_coalesced_attribute_unchanged(self) -> None: assert expr.function_name == "mapContains" -class TestInNotInAnalyzerSafe: - """IN / NOT IN on a map-backed attribute must not feed the existence - ``if(mapContains(...), arrayElement(...), NULL)`` into ``in(...)``. - - That shape makes the new ClickHouse analyzer name the projection column and - the computed block inconsistently (it folds the NULL else-branch differently - in each), surfacing as ``Code: 10. DB::Exception: Not found column ... in - block`` (SNUBA-B62 / SNUBA-B6C / SNUBA-A13). We compare the raw - ``arrayElement`` value and guard NULL handling with an explicit - ``mapContains`` instead. +class TestAnalyzerSafeFilters: + """Per-key filters on map-backed attributes are built from a NULL-free + (mapContains, arrayElement) pair instead of the legacy + if(mapContains, arrayElement, NULL) + isNull(...) idiom, so the new ClickHouse + analyzer names the aggregate column consistently (SNUBA-B62/B6C/A13). The + mapContains guard preserves the absent-vs-empty distinction. """ @staticmethod @@ -294,138 +288,35 @@ def visit(node: Expression) -> Expression: expr.transform(visit) return nodes - @staticmethod - def _make_in_filter( - name: str, - op: ComparisonFilter.Op.ValueType, - values: list[str], - ) -> TraceItemFilter: - return TraceItemFilter( - comparison_filter=ComparisonFilter( - key=AttributeKey(type=AttributeKey.Type.TYPE_STRING, name=name), - op=op, - value=AttributeValue(val_str_array=StrArray(values=values)), - ) - ) - - def _assert_in_operand_is_array_element(self, expr: Expression) -> None: - # The whole point: every in(...) left operand is a raw arrayElement, not - # an if(...) carrying a NULL constant. - in_calls = [ - n for n in self._walk(expr) if isinstance(n, FunctionCall) and n.function_name == "in" - ] - assert in_calls, "expected an in(...) call in the predicate" - for in_call in in_calls: - operand = in_call.parameters[0] - assert isinstance(operand, FunctionCall), operand - assert operand.function_name == "arrayElement", operand.function_name - - def _assert_no_subscriptable_references(self, expr: Expression) -> None: - # No SubscriptableReference means add_existence_check_to_subscriptable_references - # is a no-op here, so it can never reintroduce the if(...) inside in(...). - assert not any(isinstance(n, SubscriptableReference) for n in self._walk(expr)) - - def _assert_no_null_literals(self, expr: Expression) -> None: - # The predicate must contain no NULL literal at all -- not in in(...), - # and not in a has(set, NULL) term either. - assert not any(isinstance(n, Literal) and n.value is None for n in self._walk(expr)) - - def test_not_in_uses_array_element_and_map_contains(self) -> None: - item_filter = self._make_in_filter( - "some.custom.tag", ComparisonFilter.OP_NOT_IN, ["a", "b"] - ) - expr = trace_item_filters_to_expression(item_filter, attribute_key_to_expression) - self._assert_in_operand_is_array_element(expr) - self._assert_no_subscriptable_references(expr) - - # not( and(mapContains, in(arrayElement, set)) ) -- no NULL anywhere. - assert isinstance(expr, FunctionCall) and expr.function_name == "not" - names = {n.function_name for n in self._walk(expr) if isinstance(n, FunctionCall)} - assert {"not", "and", "in", "mapContains", "arrayElement"} <= names - assert "has" not in names - self._assert_no_null_literals(expr) - - def test_in_uses_array_element_and_map_contains(self) -> None: - item_filter = self._make_in_filter("some.custom.tag", ComparisonFilter.OP_IN, ["a", "b"]) - expr = trace_item_filters_to_expression(item_filter, attribute_key_to_expression) - self._assert_in_operand_is_array_element(expr) - self._assert_no_subscriptable_references(expr) - # and(mapContains, in(arrayElement, set)) -- no NULL anywhere. - assert isinstance(expr, FunctionCall) and expr.function_name == "and" - self._assert_no_null_literals(expr) - - def test_not_in_ignore_case_lowers_array_element(self) -> None: - item_filter = TraceItemFilter( - comparison_filter=ComparisonFilter( - key=AttributeKey(type=AttributeKey.Type.TYPE_STRING, name="some.custom.tag"), - op=ComparisonFilter.OP_NOT_IN, - value=AttributeValue(val_str_array=StrArray(values=["A", "B"])), - ignore_case=True, - ) - ) - expr = trace_item_filters_to_expression(item_filter, attribute_key_to_expression) - self._assert_no_subscriptable_references(expr) - # in(...) operand is lower(arrayElement(...)); set values are lowercased. - in_calls = [ - n for n in self._walk(expr) if isinstance(n, FunctionCall) and n.function_name == "in" - ] - assert len(in_calls) == 1 - operand = in_calls[0].parameters[0] - assert isinstance(operand, FunctionCall) and operand.function_name == "lower" - inner = operand.parameters[0] - assert isinstance(inner, FunctionCall) and inner.function_name == "arrayElement" - - def test_existence_pass_is_noop_on_in_predicate(self) -> None: - """add_existence_check_to_subscriptable_references must not reintroduce an - if(...) inside the in(...) — it has no SubscriptableReference to wrap.""" - item_filter = self._make_in_filter( - "some.custom.tag", ComparisonFilter.OP_NOT_IN, ["a", "b"] - ) - expr = trace_item_filters_to_expression(item_filter, attribute_key_to_expression) - - query = Query(from_clause=None, condition=expr) - add_existence_check_to_subscriptable_references(query) - transformed = query.get_condition() - assert transformed is not None - self._assert_in_operand_is_array_element(transformed) - # No if(...) wrapper anywhere — the analyzer-foldable shape is gone. - assert not any( - isinstance(n, FunctionCall) and n.function_name == "if" for n in self._walk(transformed) - ) - - -class TestComparisonAnalyzerSafe: - """EQUALS / NOT_EQUALS / LIKE / NOT_LIKE on map-backed attributes are built - from a (mapContains, arrayElement) pair instead of the legacy - if(mapContains, arrayElement, NULL) + isNull(...) idiom, so no NULL constant - is fed into the WHERE / aggregate-condition predicate (same class of new - analyzer fragility as IN/NOT_IN), while the absent-vs-empty distinction is - preserved by the explicit mapContains guard. - """ - - @staticmethod - def _walk(expr: Expression) -> list[Expression]: - nodes: list[Expression] = [] - - def visit(node: Expression) -> Expression: - nodes.append(node) - return node + def _fn_names(self, expr: Expression) -> set[str]: + return {n.function_name for n in self._walk(expr) if isinstance(n, FunctionCall)} - expr.transform(visit) - return nodes + def _assert_clean(self, expr: Expression) -> None: + # No NULL literal, no leftover SubscriptableReference, no foldable if(...). + for n in self._walk(expr): + assert not (isinstance(n, Literal) and n.value is None) + assert not isinstance(n, SubscriptableReference) + assert not (isinstance(n, FunctionCall) and n.function_name == "if") - def _cmp( + def _build( self, op: ComparisonFilter.Op.ValueType, *, + name: str = "some.custom.tag", value: str | None = None, + values: list[str] | None = None, is_null: bool = False, ignore_case: bool = False, ) -> Expression: - av = AttributeValue(val_null=True) if is_null else AttributeValue(val_str=value or "") + if values is not None: + av = AttributeValue(val_str_array=StrArray(values=values)) + elif is_null: + av = AttributeValue(val_null=True) + else: + av = AttributeValue(val_str=value or "") item_filter = TraceItemFilter( comparison_filter=ComparisonFilter( - key=AttributeKey(type=AttributeKey.Type.TYPE_STRING, name="some.custom.tag"), + key=AttributeKey(type=AttributeKey.Type.TYPE_STRING, name=name), op=op, value=av, ignore_case=ignore_case, @@ -433,81 +324,80 @@ def _cmp( ) return trace_item_filters_to_expression(item_filter, attribute_key_to_expression) - def _fn_names(self, expr: Expression) -> set[str]: - return {n.function_name for n in self._walk(expr) if isinstance(n, FunctionCall)} + def test_in(self) -> None: + expr = self._build(ComparisonFilter.OP_IN, values=["a", "b"]) + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "and" + assert {"and", "in", "mapContains", "arrayElement"} <= self._fn_names(expr) - def _assert_clean(self, expr: Expression) -> None: - # No NULL literal, no leftover SubscriptableReference, no foldable if(...). - for n in self._walk(expr): - assert not (isinstance(n, Literal) and n.value is None) - assert not isinstance(n, SubscriptableReference) - assert not (isinstance(n, FunctionCall) and n.function_name == "if") + def test_not_in(self) -> None: + expr = self._build(ComparisonFilter.OP_NOT_IN, values=["a", "b"]) + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "not" + names = self._fn_names(expr) + assert {"not", "and", "in", "mapContains", "arrayElement"} <= names + assert "has" not in names # no NULL-in-set handling needed + + def test_in_ignore_case_lowers_array_element(self) -> None: + expr = self._build(ComparisonFilter.OP_NOT_IN, values=["A", "B"], ignore_case=True) + self._assert_clean(expr) + (in_call,) = [ + n for n in self._walk(expr) if isinstance(n, FunctionCall) and n.function_name == "in" + ] + operand = in_call.parameters[0] + assert isinstance(operand, FunctionCall) and operand.function_name == "lower" + inner = operand.parameters[0] + assert isinstance(inner, FunctionCall) and inner.function_name == "arrayElement" - def test_equals_is_guarded_arrayelement(self) -> None: - expr = self._cmp(ComparisonFilter.OP_EQUALS, value="ok") + def test_equals(self) -> None: + expr = self._build(ComparisonFilter.OP_EQUALS, value="ok") self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "and" assert {"and", "mapContains", "equals", "arrayElement"} <= self._fn_names(expr) def test_equals_null_is_not_map_contains(self) -> None: - # `attr = null` == key absent == not(mapContains(...)) - expr = self._cmp(ComparisonFilter.OP_EQUALS, is_null=True) + expr = self._build(ComparisonFilter.OP_EQUALS, is_null=True) # attr = null <=> absent self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "not" assert self._fn_names(expr) == {"not", "mapContains"} def test_not_equals_is_negated_guarded_equals(self) -> None: - expr = self._cmp(ComparisonFilter.OP_NOT_EQUALS, value="ok") + expr = self._build(ComparisonFilter.OP_NOT_EQUALS, value="ok") self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "not" assert {"not", "and", "mapContains", "equals", "arrayElement"} <= self._fn_names(expr) def test_not_equals_null_is_map_contains(self) -> None: - # `attr != null` == key present == mapContains(...) - expr = self._cmp(ComparisonFilter.OP_NOT_EQUALS, is_null=True) + expr = self._build(ComparisonFilter.OP_NOT_EQUALS, is_null=True) # attr != null <=> present self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "mapContains" - def test_like_is_guarded_arrayelement(self) -> None: - expr = self._cmp(ComparisonFilter.OP_LIKE, value="%ok%") + def test_like(self) -> None: + expr = self._build(ComparisonFilter.OP_LIKE, value="%ok%") self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "and" assert {"and", "mapContains", "like", "arrayElement"} <= self._fn_names(expr) - def test_not_like_is_negated_guarded_like(self) -> None: - expr = self._cmp(ComparisonFilter.OP_NOT_LIKE, value="%ok%") + def test_not_like_negates_positive_like(self) -> None: + expr = self._build(ComparisonFilter.OP_NOT_LIKE, value="%ok%") self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "not" names = self._fn_names(expr) assert {"not", "and", "mapContains", "like", "arrayElement"} <= names - # positive like under the negation, not notLike - assert "notLike" not in names - - def test_not_like_ignore_case_uses_ilike(self) -> None: - expr = self._cmp(ComparisonFilter.OP_NOT_LIKE, value="%ok%", ignore_case=True) - self._assert_clean(expr) - names = self._fn_names(expr) - assert "ilike" in names and "notILike" not in names + assert "notLike" not in names # positive like under the negation + ilike_expr = self._build(ComparisonFilter.OP_NOT_LIKE, value="%ok%", ignore_case=True) + assert "ilike" in self._fn_names(ilike_expr) and "notILike" not in self._fn_names( + ilike_expr + ) def test_coalesced_attribute_uses_multi_if_not_coalesce(self) -> None: - # A coalesced attribute is generated as a NULL-free multiIf over the - # per-key mapContains (never a coalesce(if(..., NULL), ...)), so it is - # analyzer-safe too. - canonical = "transaction" - assert canonical in ATTRIBUTES_TO_COALESCE, "test precondition: key must be coalesced" - item_filter = TraceItemFilter( - comparison_filter=ComparisonFilter( - key=AttributeKey(type=AttributeKey.Type.TYPE_STRING, name=canonical), - op=ComparisonFilter.OP_EQUALS, - value=AttributeValue(val_str="t"), - ) - ) - expr = trace_item_filters_to_expression(item_filter, attribute_key_to_expression) + # Coalesced keys resolve via a NULL-free multiIf over per-key mapContains, + # never coalesce(if(..., NULL), ...), so they are analyzer-safe too. + assert "transaction" in ATTRIBUTES_TO_COALESCE + expr = self._build(ComparisonFilter.OP_EQUALS, name="transaction", value="t") self._assert_clean(expr) names = self._fn_names(expr) - assert "multiIf" in names - assert "coalesce" not in names - # existence is an OR of mapContains across canonical + deprecated keys + assert "multiIf" in names and "coalesce" not in names assert {"and", "equals", "mapContains", "or", "arrayElement"} <= names From feaddd936d29ce63daee5d66daedb1d3b9c3843a Mon Sep 17 00:00:00 2001 From: Pierre Massat Date: Mon, 15 Jun 2026 16:34:58 -0700 Subject: [PATCH 5/9] perf(eap): Drop the mapContains guard when the literal can't match the default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existence guard is only needed to stop an absent key — which arrayElement reads as the column default ('' / 0) — from matching. For EQUALS / NOT_EQUALS / IN / NOT_IN against a literal that isn't the default, that can't happen, so the guard is dead weight. Emit the bare comparison instead, giving the simplest analyzer-safe SQL: sentry.status NOT IN ['ok','cancelled','unknown'] -> not(in(arrayElement(attributes_string,'sentry.status'), ['ok','cancelled','unknown'])) The guard is kept when the literal IS the default (`= ''`, `= 0`, `IN ['', ...]`) so absent keys are still excluded, and always for LIKE/NOT_LIKE (a pattern may match the default) and the `= null` / `!= null` cases. The empty-vs-absent e2e tests still pass unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- snuba/web/rpc/common/common.py | 62 +++++++++++++++++++------ tests/web/rpc/test_common.py | 84 ++++++++++++++++++++++------------ 2 files changed, 102 insertions(+), 44 deletions(-) diff --git a/snuba/web/rpc/common/common.py b/snuba/web/rpc/common/common.py index 6f67edd0e7..8a57999cd6 100644 --- a/snuba/web/rpc/common/common.py +++ b/snuba/web/rpc/common/common.py @@ -341,10 +341,11 @@ def _map_backed_operands(k: AttributeKey) -> tuple[Expression, Expression]: arrayElement(kn)) # first present exists = mapContains(k1) OR ... OR mapContains(kn) - Callers build ``and(exists, cmp(value, v))`` (negated for !=/NOT LIKE/NOT IN). - ``exists`` — not the value — distinguishes a missing key from a stored empty - value, since ``arrayElement`` reads both as the '' / 0 default; the coalesced - ``multiIf`` else is only reached when all keys are absent (``exists`` false). + Callers build ``cmp(value, v)``, wrapped in ``and(exists, ...)`` only when the + literal could be the column default (see ``_comparison_can_match_column_default``) + — then ``exists``, not the value, distinguishes a missing key from a stored + empty value, since ``arrayElement`` reads both as the '' / 0 default. The + coalesced ``multiIf`` else is only reached when all keys are absent. Only valid for map-backed keys (string/int/float); the legacy ``coalesce(if(..., NULL))`` form remains in SELECT, where NULL output is wanted. """ @@ -371,16 +372,18 @@ def _analyzer_safe_in_expression( *, negated: bool, ignore_case: bool = False, + guard: bool = True, ) -> Expression: - """``IN`` / ``NOT IN`` as ``[not] and(exists, in(value, set))`` via - ``_map_backed_operands`` — keeps the NULL-laden existence ``if(...)`` out of - ``in(...)`` (see that helper). Equivalent to the legacy null-handling: the - value lists only carry scalars, so the set never contains NULL (the old - ``has(set, NULL)`` branch was always constant ``false``).""" + """``IN`` / ``NOT IN`` as ``[not] in(value, set)``, wrapped in + ``and(exists, ...)`` only when ``guard`` is set (see ``_map_backed_operands`` + and ``_comparison_can_match_column_default``). The value lists carry only + scalars, so the set never contains NULL — the legacy ``has(set, NULL)`` branch + was always ``false``.""" value, exists = _map_backed_operands(k) if ignore_case: value = f.lower(value) - present = and_cond(exists, in_cond(value, v_expression)) + membership = in_cond(value, v_expression) + present = and_cond(exists, membership) if guard else membership return not_cond(present) if negated else present @@ -403,6 +406,23 @@ def _scalar_value(v: AttributeValue) -> bool | str | int | float | None: raise NotImplementedError(f"not a scalar AttributeValue type: {other}") +def _comparison_can_match_column_default( + attr_type: AttributeKey.Type.ValueType, v: AttributeValue, value_type: str +) -> bool: + """True if any compared literal is the column default ('' / 0), which an + absent key also reads as — so the ``mapContains`` guard is needed to avoid + matching absent keys. When no literal is the default the guard is dropped (the + simplest form). LIKE/NOT_LIKE always guard; null comparisons are separate.""" + default: str | int = "" if attr_type == AttributeKey.Type.TYPE_STRING else 0 + if value_type == "val_array": + scalars: list[Any] = [_scalar_value(x) for x in v.val_array.values] + elif value_type in ("val_str_array", "val_int_array", "val_float_array", "val_double_array"): + scalars = list(getattr(v, value_type).values) + else: + scalars = [_scalar_value(v)] + return any(s == default for s in scalars) + + def _attribute_value_to_expression(v: AttributeValue) -> Expression: """Convert an AttributeValue proto to a Snuba Expression.""" value_type = v.WhichOneof("value") @@ -728,7 +748,11 @@ def trace_item_filters_to_expression( if item_filter.comparison_filter.ignore_case else (value, v_expression) ) - return and_cond(exists, f.equals(lhs, rhs)) + cmp = f.equals(lhs, rhs) + # mapContains guard only needed when '' / 0 could match an absent key. + if _comparison_can_match_column_default(k.type, v, value_type): + return and_cond(exists, cmp) + return cmp else: expr = ( f.equals(f.lower(k_expression), f.lower(v_expression)) @@ -759,7 +783,9 @@ def trace_item_filters_to_expression( if item_filter.comparison_filter.ignore_case else (value, v_expression) ) - return not_cond(and_cond(exists, f.equals(lhs, rhs))) + if _comparison_can_match_column_default(k.type, v, value_type): + return not_cond(and_cond(exists, f.equals(lhs, rhs))) + return f.notEquals(lhs, rhs) else: expr = ( f.notEquals(f.lower(k_expression), f.lower(v_expression)) @@ -850,7 +876,11 @@ def trace_item_filters_to_expression( if _contains_subscriptable_reference(k_expression): # Map-backed: keep the existence if(...) out of in() (see helper). return _analyzer_safe_in_expression( - k, v_expression, negated=False, ignore_case=ignore_case + k, + v_expression, + negated=False, + ignore_case=ignore_case, + guard=_comparison_can_match_column_default(k.type, v, value_type), ) if ignore_case: k_expression = f.lower(k_expression) @@ -880,7 +910,11 @@ def trace_item_filters_to_expression( if _contains_subscriptable_reference(k_expression): # Map-backed: keep the existence if(...) out of in() (see helper). return _analyzer_safe_in_expression( - k, v_expression, negated=True, ignore_case=ignore_case + k, + v_expression, + negated=True, + ignore_case=ignore_case, + guard=_comparison_can_match_column_default(k.type, v, value_type), ) if ignore_case: k_expression = f.lower(k_expression) diff --git a/tests/web/rpc/test_common.py b/tests/web/rpc/test_common.py index ea73d307f8..925b61d26b 100644 --- a/tests/web/rpc/test_common.py +++ b/tests/web/rpc/test_common.py @@ -324,19 +324,45 @@ def _build( ) return trace_item_filters_to_expression(item_filter, attribute_key_to_expression) - def test_in(self) -> None: + # --- pruned forms: literal != column default, so no mapContains guard --- + + def test_in_pruned(self) -> None: expr = self._build(ComparisonFilter.OP_IN, values=["a", "b"]) self._assert_clean(expr) - assert isinstance(expr, FunctionCall) and expr.function_name == "and" - assert {"and", "in", "mapContains", "arrayElement"} <= self._fn_names(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "in" + assert "mapContains" not in self._fn_names(expr) - def test_not_in(self) -> None: + def test_not_in_pruned(self) -> None: expr = self._build(ComparisonFilter.OP_NOT_IN, values=["a", "b"]) self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "not" names = self._fn_names(expr) - assert {"not", "and", "in", "mapContains", "arrayElement"} <= names - assert "has" not in names # no NULL-in-set handling needed + assert {"not", "in", "arrayElement"} <= names + assert "mapContains" not in names and "has" not in names + + def test_equals_pruned(self) -> None: + expr = self._build(ComparisonFilter.OP_EQUALS, value="ok") + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "equals" + assert "mapContains" not in self._fn_names(expr) + + def test_not_equals_pruned(self) -> None: + expr = self._build(ComparisonFilter.OP_NOT_EQUALS, value="ok") + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "notEquals" + assert "mapContains" not in self._fn_names(expr) + + def test_coalesced_pruned_uses_multi_if_no_outer_guard(self) -> None: + # Coalesced keys resolve via a NULL-free multiIf over per-key mapContains + # (never coalesce(if(..., NULL), ...)); with a non-default literal the + # outer existence OR-guard is dropped too. + assert "transaction" in ATTRIBUTES_TO_COALESCE + expr = self._build(ComparisonFilter.OP_EQUALS, name="transaction", value="t") + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "equals" + names = self._fn_names(expr) + assert "multiIf" in names and "coalesce" not in names + assert "or" not in names # no outer exists guard def test_in_ignore_case_lowers_array_element(self) -> None: expr = self._build(ComparisonFilter.OP_NOT_IN, values=["A", "B"], ignore_case=True) @@ -349,56 +375,54 @@ def test_in_ignore_case_lowers_array_element(self) -> None: inner = operand.parameters[0] assert isinstance(inner, FunctionCall) and inner.function_name == "arrayElement" - def test_equals(self) -> None: - expr = self._build(ComparisonFilter.OP_EQUALS, value="ok") + # --- guarded forms: literal IS the column default, so mapContains is kept --- + + def test_equals_default_value_keeps_guard(self) -> None: + expr = self._build(ComparisonFilter.OP_EQUALS, value="") # '' could match an absent key self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "and" assert {"and", "mapContains", "equals", "arrayElement"} <= self._fn_names(expr) + def test_in_with_default_value_keeps_guard(self) -> None: + expr = self._build(ComparisonFilter.OP_IN, values=["", "x"]) + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "and" + assert {"and", "in", "mapContains"} <= self._fn_names(expr) + + def test_coalesced_default_value_keeps_guard(self) -> None: + expr = self._build(ComparisonFilter.OP_EQUALS, name="transaction", value="") + self._assert_clean(expr) + assert isinstance(expr, FunctionCall) and expr.function_name == "and" + assert {"and", "or", "equals", "multiIf", "mapContains"} <= self._fn_names(expr) + + # --- always-guarded forms: null comparisons and LIKE/NOT_LIKE --- + def test_equals_null_is_not_map_contains(self) -> None: expr = self._build(ComparisonFilter.OP_EQUALS, is_null=True) # attr = null <=> absent self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "not" assert self._fn_names(expr) == {"not", "mapContains"} - def test_not_equals_is_negated_guarded_equals(self) -> None: - expr = self._build(ComparisonFilter.OP_NOT_EQUALS, value="ok") - self._assert_clean(expr) - assert isinstance(expr, FunctionCall) and expr.function_name == "not" - assert {"not", "and", "mapContains", "equals", "arrayElement"} <= self._fn_names(expr) - def test_not_equals_null_is_map_contains(self) -> None: expr = self._build(ComparisonFilter.OP_NOT_EQUALS, is_null=True) # attr != null <=> present self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "mapContains" - def test_like(self) -> None: + def test_like_keeps_guard(self) -> None: expr = self._build(ComparisonFilter.OP_LIKE, value="%ok%") self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "and" assert {"and", "mapContains", "like", "arrayElement"} <= self._fn_names(expr) - def test_not_like_negates_positive_like(self) -> None: + def test_not_like_keeps_guard_and_negates_positive_like(self) -> None: expr = self._build(ComparisonFilter.OP_NOT_LIKE, value="%ok%") self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "not" names = self._fn_names(expr) assert {"not", "and", "mapContains", "like", "arrayElement"} <= names assert "notLike" not in names # positive like under the negation - ilike_expr = self._build(ComparisonFilter.OP_NOT_LIKE, value="%ok%", ignore_case=True) - assert "ilike" in self._fn_names(ilike_expr) and "notILike" not in self._fn_names( - ilike_expr - ) - - def test_coalesced_attribute_uses_multi_if_not_coalesce(self) -> None: - # Coalesced keys resolve via a NULL-free multiIf over per-key mapContains, - # never coalesce(if(..., NULL), ...), so they are analyzer-safe too. - assert "transaction" in ATTRIBUTES_TO_COALESCE - expr = self._build(ComparisonFilter.OP_EQUALS, name="transaction", value="t") - self._assert_clean(expr) - names = self._fn_names(expr) - assert "multiIf" in names and "coalesce" not in names - assert {"and", "equals", "mapContains", "or", "arrayElement"} <= names + ilike = self._build(ComparisonFilter.OP_NOT_LIKE, value="%ok%", ignore_case=True) + assert "ilike" in self._fn_names(ilike) and "notILike" not in self._fn_names(ilike) @pytest.mark.redis_db From 2e369f7be7596bda56ddc60bea88f1e93fd35b90 Mon Sep 17 00:00:00 2001 From: Pierre Massat Date: Mon, 15 Jun 2026 16:45:52 -0700 Subject: [PATCH 6/9] ref(eap): Build map-backed filter operands directly instead of rewriting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _map_backed_operands now writes arrayElement / mapContains directly from the AttributeKey (using PROTO_TYPE_TO_ATTRIBUTE_COLUMN), instead of generating a SubscriptableReference via _generate_subscriptable_reference and then rewriting it to arrayElement. Drops the _subscriptable_references_to_array_element helper and the get_field_existence_expression round-trip for this path. Identical generated SQL (pure internal cleanup) — the renders and all unit/e2e tests are unchanged. As a side effect the operands are alias-free by construction (we never attach an alias), rather than building-with-alias then stripping; conditions don't need aliases and an alias here would collide with the SELECT clause's existence if(...) for the same attribute. Co-Authored-By: Claude Opus 4.8 (1M context) --- snuba/web/rpc/common/common.py | 52 +++++++++++++++------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/snuba/web/rpc/common/common.py b/snuba/web/rpc/common/common.py index 8a57999cd6..15fcbb5e2b 100644 --- a/snuba/web/rpc/common/common.py +++ b/snuba/web/rpc/common/common.py @@ -1,5 +1,4 @@ import json -from dataclasses import replace from datetime import datetime, timedelta, timezone from typing import Any, Callable, TypeVar, cast @@ -15,8 +14,9 @@ from snuba import settings, state from snuba.protos.common import ( ATTRIBUTES_TO_COALESCE, + PROTO_TYPE_TO_ATTRIBUTE_COLUMN, + PROTO_TYPE_TO_CLICKHOUSE_TYPE, MalformedAttributeException, - _generate_subscriptable_reference, type_array_to_membership_array_expression, ) from snuba.protos.common import ( @@ -308,33 +308,15 @@ def visit(node: Expression) -> Expression: return found -def _subscriptable_references_to_array_element(exp: Expression) -> Expression: - """Lower ``SubscriptableReference``\\ s to bare ``arrayElement`` (no NULL). - - Aliases are stripped so the lowered form can't collide with the existence - ``if(...)`` the SELECT clause may carry for the same attribute (same alias, - different expression). - """ - - def transform(node: Expression) -> Expression: - if isinstance(node, SubscriptableReference): - return arrayElement(None, node.column, node.key) - if node.alias is not None: - return replace(node, alias=None) - return node - - return exp.transform(transform) - - def _map_backed_operands(k: AttributeKey) -> tuple[Expression, Expression]: - """Build ``(value, exists)`` for a map-backed key, NULL-free. + """Build ``(value, exists)`` for a map-backed key directly, NULL-free. The legacy ``if(mapContains, arrayElement, NULL)`` form puts a NULL constant into the predicate; the new ClickHouse analyzer canonicalizes that NULL inconsistently between an aggregate's projection name and its computed block (notably inside ``in(...)``), so the column isn't found ("Code: 10 ... Not - found column ... in block"). Building from ``mapContains`` + ``arrayElement`` - avoids the NULL entirely: + found column ... in block"). We write ``mapContains`` + ``arrayElement`` + directly, so no NULL appears: value = arrayElement(k) # single key value = multiIf(mapContains(k1), arrayElement(k1), ..., # coalesced: @@ -345,14 +327,26 @@ def _map_backed_operands(k: AttributeKey) -> tuple[Expression, Expression]: literal could be the column default (see ``_comparison_can_match_column_default``) — then ``exists``, not the value, distinguishes a missing key from a stored empty value, since ``arrayElement`` reads both as the '' / 0 default. The - coalesced ``multiIf`` else is only reached when all keys are absent. - Only valid for map-backed keys (string/int/float); the legacy - ``coalesce(if(..., NULL))`` form remains in SELECT, where NULL output is wanted. + ``multiIf`` else is only reached when all keys are absent. + + Built without aliases: conditions don't need them, and an alias here would + collide with the SELECT clause's existence ``if(...)`` for the same attribute + (same alias, different expression). Only valid for map-backed keys + (string/int/float); booleans / normalized columns / arrays take the legacy + path, and SELECT keeps its own ``coalesce(...)`` representation untouched. """ + col_name = PROTO_TYPE_TO_ATTRIBUTE_COLUMN[k.type] names = [k.name] + list(ATTRIBUTES_TO_COALESCE.get(k.name, ())) - branches = [_generate_subscriptable_reference(name, k.type) for name in names] - values = [_subscriptable_references_to_array_element(b) for b in branches] - existences = [get_field_existence_expression(b) for b in branches] + + def _value(name: str) -> Expression: + elem = arrayElement(None, column(col_name), literal(name)) + # ints live in the float map and surface as Int64, so they need a cast. + if k.type == AttributeKey.Type.TYPE_INT: + return f.cast(elem, f"Nullable({PROTO_TYPE_TO_CLICKHOUSE_TYPE[k.type]})") + return elem + + values = [_value(name) for name in names] + existences = [f.mapContains(column(col_name), literal(name)) for name in names] exists = combine_or_conditions(existences) if len(existences) > 1 else existences[0] if len(values) == 1: From 983d82c3a9e38852394903b056e50700a239d84d Mon Sep 17 00:00:00 2001 From: Pierre Massat Date: Mon, 15 Jun 2026 20:32:21 -0700 Subject: [PATCH 7/9] fix(eap): Rewrite map-backed filter operands for virtual columns Map-backed filters now build arrayElement(attributes_string, key) directly instead of a SubscriptableReference, but _apply_virtual_columns only rewrote SubscriptableReference reads into the value-mapping transform. A filter on a virtual column (e.g. device.class) was therefore left comparing the raw public alias key against the unmapped literal, matching nothing and returning zero rows. Recognize the arrayElement form too so virtual-column filters compare against the mapped value, as they did before the map-backed rewrite. Co-Authored-By: Claude Opus 4.8 Agent transcript: https://claudescope.sentry.dev/share/P8gdU4fKq0U5g2jvhHBcfZ4HxsStWD1OJ1lagpQ_ldo --- .../R_eap_items/resolver_trace_item_table.py | 28 +++++++++--- .../test_endpoint_trace_item_table.py | 44 +++++++++++++++++++ 2 files changed, 67 insertions(+), 5 deletions(-) diff --git a/snuba/web/rpc/v1/resolvers/R_eap_items/resolver_trace_item_table.py b/snuba/web/rpc/v1/resolvers/R_eap_items/resolver_trace_item_table.py index 8d7260fa57..7b2a184a0e 100644 --- a/snuba/web/rpc/v1/resolvers/R_eap_items/resolver_trace_item_table.py +++ b/snuba/web/rpc/v1/resolvers/R_eap_items/resolver_trace_item_table.py @@ -37,12 +37,18 @@ from snuba.query.dsl import Functions as f from snuba.query.dsl import and_cond, if_cond, in_cond, literal, literals_array, or_cond from snuba.query.dsl import column as snuba_column +from snuba.query.expressions import ( + Column as ColumnExpr, +) from snuba.query.expressions import ( DangerousRawSQL, Expression, FunctionCall, SubscriptableReference, ) +from snuba.query.expressions import ( + Literal as LiteralExpr, +) from snuba.query.logical import Query from snuba.query.query_settings import HTTPQuerySettings from snuba.request import Request as SnubaRequest @@ -137,13 +143,25 @@ def _apply_virtual_columns( mapped_column_to_context = {c.to_column_name: c for c in virtual_column_contexts} def transform_expressions(expression: Expression) -> Expression: - # virtual columns will show up as `attr_str[virtual_column_name]` or `attr_num[virtual_column_name]` - if not isinstance(expression, SubscriptableReference): + # The attribute reads either as attributes_string[key] (SELECT / group by) or + # as arrayElement(attributes_string, key) — the form map-backed filters build + # directly (see _map_backed_operands). Both must map to the virtual value. + if isinstance(expression, SubscriptableReference): + if expression.column.column_name != "attributes_string": + return expression + key = str(expression.key.value) + elif ( + isinstance(expression, FunctionCall) + and expression.function_name == "arrayElement" + and isinstance(expression.parameters[0], ColumnExpr) + and expression.parameters[0].column_name == "attributes_string" + and isinstance(expression.parameters[1], LiteralExpr) + ): + key = str(expression.parameters[1].value) + else: return expression - if expression.column.column_name != "attributes_string": - return expression - context = mapped_column_to_context.get(str(expression.key.value)) + context = mapped_column_to_context.get(key) if context: attribute_expression = attribute_key_to_expression( AttributeKey( diff --git a/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py b/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py index db37dda848..eb1d559d42 100644 --- a/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py +++ b/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py @@ -3268,6 +3268,50 @@ def test_virtual_column_with_missing_attribute(self) -> None: AttributeValue(val_str="a") for _ in range(10) ] + [AttributeValue(val_str="default") for _ in range(5)] + def test_virtual_column_filter_uses_value_map(self) -> None: + # Regression: a filter on a virtual column must compare against the mapped + # value. Map-backed filters build arrayElement(attributes_string, key) + # directly (see _map_backed_operands), so _apply_virtual_columns has to + # rewrite that form too, not only the SubscriptableReference used by + # SELECT / group by — otherwise the filter matches the raw key/value and + # returns nothing. + span_ts = BASE_TIME - timedelta(minutes=1) + write_eap_item(span_ts, {"device.class": "1"}, 3) + write_eap_item(span_ts, {"device.class": "2"}, 5) + message = TraceItemTableRequest( + meta=RequestMeta( + project_ids=[1], + organization_id=1, + cogs_category="something", + referrer="something", + start_timestamp=START_TIMESTAMP, + end_timestamp=END_TIMESTAMP, + trace_item_type=TraceItemType.TRACE_ITEM_TYPE_SPAN, + ), + columns=[ + Column(key=AttributeKey(type=AttributeKey.TYPE_STRING, name="device.class.label")), + ], + filter=TraceItemFilter( + comparison_filter=ComparisonFilter( + key=AttributeKey(type=AttributeKey.TYPE_STRING, name="device.class.label"), + op=ComparisonFilter.OP_EQUALS, + value=AttributeValue(val_str="low"), + ) + ), + virtual_column_contexts=[ + VirtualColumnContext( + from_column_name="device.class", + to_column_name="device.class.label", + value_map={"1": "low", "2": "medium"}, + default_value="Unknown", + ), + ], + ) + response = EndpointTraceItemTable().execute(message) + assert response.column_values[0].results == [ + AttributeValue(val_str="low") for _ in range(3) + ] + def test_normal_mode_end_to_end(self) -> None: items_storage = get_storage(StorageKey("eap_items")) msg_timestamp = BASE_TIME - timedelta(minutes=1) From 96eae6ba1eef84b5b943519ef437ee63b8403dc9 Mon Sep 17 00:00:00 2001 From: Pierre Massat Date: Mon, 15 Jun 2026 20:55:44 -0700 Subject: [PATCH 8/9] fix(eap): Handle null values in map-backed IN/NOT_IN guard _comparison_can_match_column_default is always called for map-backed OP_IN/OP_NOT_IN, and it routes through _scalar_value, which raised NotImplementedError on val_null (a null filter value or a null array element). Translation crashed where the non-map-backed path builds in(k, NULL) fine. Treat val_null as None in _scalar_value so the guard sees a non-default scalar and SQL is built consistently. Co-Authored-By: Claude Opus 4.8 --- snuba/web/rpc/common/common.py | 2 +- tests/web/rpc/test_common.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/snuba/web/rpc/common/common.py b/snuba/web/rpc/common/common.py index 15fcbb5e2b..2ad3044d62 100644 --- a/snuba/web/rpc/common/common.py +++ b/snuba/web/rpc/common/common.py @@ -394,7 +394,7 @@ def _scalar_value(v: AttributeValue) -> bool | str | int | float | None: return v.val_double case "val_int": return v.val_int - case None: + case "val_null" | None: return None case other: raise NotImplementedError(f"not a scalar AttributeValue type: {other}") diff --git a/tests/web/rpc/test_common.py b/tests/web/rpc/test_common.py index 925b61d26b..5f4e88e056 100644 --- a/tests/web/rpc/test_common.py +++ b/tests/web/rpc/test_common.py @@ -408,6 +408,16 @@ def test_not_equals_null_is_map_contains(self) -> None: self._assert_clean(expr) assert isinstance(expr, FunctionCall) and expr.function_name == "mapContains" + def test_in_null_value_builds_without_raising(self) -> None: + # null isn't the column default, so the guard helper must treat it as a + # non-default scalar rather than raising (see _scalar_value / val_null). + expr = self._build(ComparisonFilter.OP_IN, is_null=True) + assert "in" in self._fn_names(expr) + + def test_not_in_null_value_builds_without_raising(self) -> None: + expr = self._build(ComparisonFilter.OP_NOT_IN, is_null=True) + assert {"not", "in"} <= self._fn_names(expr) + def test_like_keeps_guard(self) -> None: expr = self._build(ComparisonFilter.OP_LIKE, value="%ok%") self._assert_clean(expr) From 8d9b451800c07db4afe5aa0e1193b899cee1e77d Mon Sep 17 00:00:00 2001 From: Pierre Massat Date: Mon, 15 Jun 2026 21:07:04 -0700 Subject: [PATCH 9/9] fix(eap): Rewrite mapContains existence guard for virtual columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Map-backed filters build a mapContains(attributes_string, key) existence guard from the request attribute name. For a virtual column that key is the to_column_name, which is absent in storage, so LIKE / NOT_LIKE, null checks, and default-value guards kept the existence check on the virtual key and matched no (or every) row. _apply_virtual_columns now rewrites the mapContains form too, mapping the guard to the backing from_column_name's existence — alongside the arrayElement value rewrite. Co-Authored-By: Claude Opus 4.8 --- .../R_eap_items/resolver_trace_item_table.py | 49 +++++++++++-------- .../test_endpoint_trace_item_table.py | 42 ++++++++++++++++ 2 files changed, 70 insertions(+), 21 deletions(-) diff --git a/snuba/web/rpc/v1/resolvers/R_eap_items/resolver_trace_item_table.py b/snuba/web/rpc/v1/resolvers/R_eap_items/resolver_trace_item_table.py index 7b2a184a0e..7fdf7ff4a5 100644 --- a/snuba/web/rpc/v1/resolvers/R_eap_items/resolver_trace_item_table.py +++ b/snuba/web/rpc/v1/resolvers/R_eap_items/resolver_trace_item_table.py @@ -58,6 +58,7 @@ add_existence_check_to_subscriptable_references, attribute_key_to_expression, base_conditions_and, + get_field_existence_expression, timestamp_in_range_condition, trace_item_filters_to_expression, treeify_or_and_conditions, @@ -143,43 +144,49 @@ def _apply_virtual_columns( mapped_column_to_context = {c.to_column_name: c for c in virtual_column_contexts} def transform_expressions(expression: Expression) -> Expression: - # The attribute reads either as attributes_string[key] (SELECT / group by) or - # as arrayElement(attributes_string, key) — the form map-backed filters build - # directly (see _map_backed_operands). Both must map to the virtual value. + # An attribute read appears as attributes_string[key] (SELECT / group by) or + # as arrayElement / mapContains over attributes_string — the forms map-backed + # filters build directly (see _map_backed_operands). For a virtual column the + # value forms map to the value transform and the mapContains existence guard + # maps to the backing column, so filters resolve against from_column_name. + is_existence = False if isinstance(expression, SubscriptableReference): if expression.column.column_name != "attributes_string": return expression key = str(expression.key.value) elif ( isinstance(expression, FunctionCall) - and expression.function_name == "arrayElement" + and expression.function_name in ("arrayElement", "mapContains") and isinstance(expression.parameters[0], ColumnExpr) and expression.parameters[0].column_name == "attributes_string" and isinstance(expression.parameters[1], LiteralExpr) ): key = str(expression.parameters[1].value) + is_existence = expression.function_name == "mapContains" else: return expression context = mapped_column_to_context.get(key) - if context: - attribute_expression = attribute_key_to_expression( - AttributeKey( - name=context.from_column_name, - type=NORMALIZED_COLUMNS_EAP_ITEMS.get( - context.from_column_name, [AttributeKey.TYPE_STRING] - )[0], - ) - ) - return f.transform( - f.CAST(f.ifNull(attribute_expression, literal("")), "String"), - literals_array(None, [literal(k) for k in context.value_map.keys()]), - literals_array(None, [literal(v) for v in context.value_map.values()]), - literal(context.default_value if context.default_value != "" else "unknown"), - alias=context.to_column_name, - ) + if context is None: + return expression - return expression + from_column = attribute_key_to_expression( + AttributeKey( + name=context.from_column_name, + type=NORMALIZED_COLUMNS_EAP_ITEMS.get( + context.from_column_name, [AttributeKey.TYPE_STRING] + )[0], + ) + ) + if is_existence: + return get_field_existence_expression(from_column) + return f.transform( + f.CAST(f.ifNull(from_column, literal("")), "String"), + literals_array(None, [literal(k) for k in context.value_map.keys()]), + literals_array(None, [literal(v) for v in context.value_map.values()]), + literal(context.default_value if context.default_value != "" else "unknown"), + alias=context.to_column_name, + ) query.transform_expressions(transform_expressions) diff --git a/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py b/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py index eb1d559d42..d4630afc8d 100644 --- a/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py +++ b/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py @@ -3312,6 +3312,48 @@ def test_virtual_column_filter_uses_value_map(self) -> None: AttributeValue(val_str="low") for _ in range(3) ] + def test_virtual_column_like_filter_uses_backing_existence(self) -> None: + # LIKE (and null / default-value guards) build a mapContains existence + # check on the request key; for a virtual column that key is absent in + # storage, so _apply_virtual_columns must rewrite the existence guard to + # the backing column too — otherwise it matches nothing. + span_ts = BASE_TIME - timedelta(minutes=1) + write_eap_item(span_ts, {"device.class": "1"}, 3) + write_eap_item(span_ts, {"device.class": "2"}, 5) + message = TraceItemTableRequest( + meta=RequestMeta( + project_ids=[1], + organization_id=1, + cogs_category="something", + referrer="something", + start_timestamp=START_TIMESTAMP, + end_timestamp=END_TIMESTAMP, + trace_item_type=TraceItemType.TRACE_ITEM_TYPE_SPAN, + ), + columns=[ + Column(key=AttributeKey(type=AttributeKey.TYPE_STRING, name="device.class.label")), + ], + filter=TraceItemFilter( + comparison_filter=ComparisonFilter( + key=AttributeKey(type=AttributeKey.TYPE_STRING, name="device.class.label"), + op=ComparisonFilter.OP_LIKE, + value=AttributeValue(val_str="%low%"), + ) + ), + virtual_column_contexts=[ + VirtualColumnContext( + from_column_name="device.class", + to_column_name="device.class.label", + value_map={"1": "low", "2": "medium"}, + default_value="Unknown", + ), + ], + ) + response = EndpointTraceItemTable().execute(message) + assert response.column_values[0].results == [ + AttributeValue(val_str="low") for _ in range(3) + ] + def test_normal_mode_end_to_end(self) -> None: items_storage = get_storage(StorageKey("eap_items")) msg_timestamp = BASE_TIME - timedelta(minutes=1)