diff --git a/snuba/web/rpc/common/common.py b/snuba/web/rpc/common/common.py index 2ad3044d62..c5008e8881 100644 --- a/snuba/web/rpc/common/common.py +++ b/snuba/web/rpc/common/common.py @@ -1,4 +1,5 @@ import json +import math from datetime import datetime, timedelta, timezone from typing import Any, Callable, TypeVar, cast @@ -12,8 +13,10 @@ ) from snuba import settings, state +from snuba.clickhouse import DATETIME_FORMAT from snuba.protos.common import ( ATTRIBUTES_TO_COALESCE, + COLUMN_PREFIX, PROTO_TYPE_TO_ATTRIBUTE_COLUMN, PROTO_TYPE_TO_CLICKHOUSE_TYPE, MalformedAttributeException, @@ -237,7 +240,12 @@ def treeify_or_and_conditions(query: Query) -> None: Note: does not apply to the conditions of a from_clause subquery (the nested one) this is bc transform_expressions is not implemented for composite queries + + This also removes structurally-identical conjuncts from the top-level WHERE AND + (`A AND A` == `A`); see dedupe_and_conditions. Nothing downstream in the query + pipeline dedupes conditions, so without this the duplicate would reach ClickHouse. """ + dedupe_and_conditions(query) def transform(exp: Expression) -> Expression: if not isinstance(exp, FunctionCall): @@ -253,6 +261,42 @@ def transform(exp: Expression) -> Expression: query.transform_expressions(transform) +def dedupe_and_conditions(query: Query) -> None: + """Remove structurally-identical conjuncts from the query's top-level AND. + + ``A AND A`` is equivalent to ``A``, so dropping exact duplicates never changes + results. The motivating case: when a client sends a ``sentry.timestamp`` range + filter whose bounds equal the mandatory time-range condition, both now produce + byte-identical expressions (see timestamp_seconds_to_datetime_literal) and + collapse to a single condition. Distinct ranges are left untouched (the AND of + them naturally yields the tightest window). + + Conditions nested inside OR/NOT subtrees are treated as opaque leaves -- we only + flatten the top-level AND. The query is left unchanged unless a duplicate is + actually found, so existing queries keep their exact condition tree. + """ + condition = query.get_condition() + if condition is None: + return + + def flatten_and(exp: Expression) -> list[Expression]: + if isinstance(exp, FunctionCall) and exp.function_name == "and": + flattened: list[Expression] = [] + for param in exp.parameters: + flattened.extend(flatten_and(param)) + return flattened + return [exp] + + conjuncts = flatten_and(condition) + deduped: list[Expression] = [] + for conjunct in conjuncts: + if conjunct not in deduped: + deduped.append(conjunct) + + if len(deduped) != len(conjuncts): + query.set_ast_condition(combine_and_conditions(deduped)) + + # Map column -> the Nullable type its values resolve to, used to emit a typed # NULL as the `if(...)` else-branch below. A bare, untyped NULL default trips # the ClickHouse "new analyzer": it canonicalizes the very same `if(...)` @@ -725,6 +769,51 @@ def trace_item_filters_to_expression( else: v_expression = _attribute_value_to_expression(v) + # `sentry.timestamp` is a normalized column that `attribute_key_to_expression` + # maps to `CAST(timestamp, 'Float64')`. Wrapping the primary-key/partition + # column in a CAST prevents ClickHouse from using it for granule and partition + # pruning, so range filters on it scan far more data than necessary. It also + # duplicates the mandatory time-range condition (timestamp_in_range_condition) + # that is already applied on the raw column. For range comparisons, compare + # against the raw DateTime `timestamp` column instead so the condition is + # index- and partition-prunable. We reuse timestamp_seconds_to_datetime_literal + # so a bound equal to the mandatory range is byte-identical to it and gets + # collapsed by dedupe_timestamp_conditions. + if k.name == f"{COLUMN_PREFIX}timestamp" and value_type in ( + "val_int", + "val_float", + "val_double", + ): + scalar_value = _scalar_value(v) + assert isinstance(scalar_value, (int, float)) + raw_timestamp = column("timestamp") + # `timestamp` is a second-resolution DateTime, so a fractional bound must be + # rounded to the integer second that preserves the original + # `CAST(timestamp, 'Float64') OP value` result: `<`/`>=` round up (ceil) and + # `<=`/`>` round down (floor). Integer bounds are left unchanged, so the + # rewritten mandatory-range bounds stay byte-identical and + # dedupe_timestamp_conditions can still collapse them. + if op == ComparisonFilter.OP_LESS_THAN: + return f.less( + raw_timestamp, + timestamp_seconds_to_datetime_literal(math.ceil(scalar_value)), + ) + if op == ComparisonFilter.OP_LESS_THAN_OR_EQUALS: + return f.lessOrEquals( + raw_timestamp, + timestamp_seconds_to_datetime_literal(math.floor(scalar_value)), + ) + if op == ComparisonFilter.OP_GREATER_THAN: + return f.greater( + raw_timestamp, + timestamp_seconds_to_datetime_literal(math.floor(scalar_value)), + ) + if op == ComparisonFilter.OP_GREATER_THAN_OR_EQUALS: + return f.greaterOrEquals( + raw_timestamp, + timestamp_seconds_to_datetime_literal(math.ceil(scalar_value)), + ) + if op == ComparisonFilter.OP_EQUALS: _check_non_string_values_cannot_ignore_case(item_filter.comparison_filter) @@ -955,20 +1044,20 @@ def project_id_and_org_conditions(meta: RequestMeta) -> Expression: ) +def timestamp_seconds_to_datetime_literal(ts_seconds: int) -> Expression: + """Build the canonical ``toDateTime('YYYY-MM-DD HH:MM:SS')`` expression for a unix + timestamp. The mandatory time-range bounds and rewritten ``sentry.timestamp`` range + filters share this form so equal bounds produce structurally identical expressions + (which lets dedupe_timestamp_conditions collapse the duplicates).""" + return f.toDateTime( + datetime.fromtimestamp(ts_seconds, tz=timezone.utc).strftime(DATETIME_FORMAT) + ) + + def timestamp_in_range_condition(start_ts: int, end_ts: int) -> Expression: return and_cond( - f.less( - column("timestamp"), - f.toDateTime( - datetime.fromtimestamp(end_ts, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S") - ), - ), - f.greaterOrEquals( - column("timestamp"), - f.toDateTime( - datetime.fromtimestamp(start_ts, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S") - ), - ), + f.less(column("timestamp"), timestamp_seconds_to_datetime_literal(end_ts)), + f.greaterOrEquals(column("timestamp"), timestamp_seconds_to_datetime_literal(start_ts)), ) diff --git a/snuba/web/rpc/v1/resolvers/R_eap_items/resolver_trace_item_table.py b/snuba/web/rpc/v1/resolvers/R_eap_items/resolver_trace_item_table.py index 7fdf7ff4a5..a5408539fa 100644 --- a/snuba/web/rpc/v1/resolvers/R_eap_items/resolver_trace_item_table.py +++ b/snuba/web/rpc/v1/resolvers/R_eap_items/resolver_trace_item_table.py @@ -296,10 +296,20 @@ def _convert_order_by( ] ) elif x.column.HasField("key"): + # `sentry.timestamp` maps to `CAST(timestamp, 'Float64')`; the cast stops + # ClickHouse from sorting against the primary-key `timestamp` column. The + # cast is monotonic, so ordering by the raw DateTime column yields the same + # order while staying read-in-order friendly. (The i == 0 / single-project / + # no-groupby branch above already expands to the full table sort key; this + # covers `sentry.timestamp` ordering anywhere else.) + if x.column.key.name == "sentry.timestamp": + expression: Expression = snuba_column("timestamp") + else: + expression = attribute_key_to_expression(x.column.key) res.append( OrderBy( direction=direction, - expression=attribute_key_to_expression(x.column.key), + expression=expression, ) ) elif x.column.HasField("conditional_aggregation"): diff --git a/tests/web/rpc/test_common.py b/tests/web/rpc/test_common.py index 5f4e88e056..2cdc5aca90 100644 --- a/tests/web/rpc/test_common.py +++ b/tests/web/rpc/test_common.py @@ -32,6 +32,8 @@ from snuba.datasets.storages.factory import get_storage from snuba.datasets.storages.storage_key import StorageKey from snuba.protos.common import ATTRIBUTES_TO_COALESCE +from snuba.query.dsl import Functions as f +from snuba.query.dsl import and_cond, column from snuba.query.expressions import ( Expression, FunctionCall, @@ -39,12 +41,15 @@ Literal, SubscriptableReference, ) +from snuba.query.logical import Query from snuba.web.rpc.common.common import ( _any_attribute_filter_to_expression, attribute_key_to_expression, + dedupe_and_conditions, next_monday, prev_monday, trace_item_filters_to_expression, + treeify_or_and_conditions, use_sampling_factor, ) from snuba.web.rpc.common.exceptions import ( @@ -269,6 +274,146 @@ def test_exists_filter_on_non_coalesced_attribute_unchanged(self) -> None: assert expr.function_name == "mapContains" +class TestSentryTimestampFilter: + """Range filters on `sentry.timestamp` must target the raw DateTime `timestamp` + column (index- and partition-prunable) rather than CAST(timestamp, 'Float64').""" + + @staticmethod + def _range_filter(op: "ComparisonFilter.Op.ValueType") -> TraceItemFilter: + return TraceItemFilter( + comparison_filter=ComparisonFilter( + key=AttributeKey(type=AttributeKey.Type.TYPE_DOUBLE, name="sentry.timestamp"), + op=op, + value=AttributeValue(val_double=1781040732), + ) + ) + + @pytest.mark.parametrize( + "op,expected_function", + [ + (ComparisonFilter.OP_LESS_THAN, "less"), + (ComparisonFilter.OP_LESS_THAN_OR_EQUALS, "lessOrEquals"), + (ComparisonFilter.OP_GREATER_THAN, "greater"), + (ComparisonFilter.OP_GREATER_THAN_OR_EQUALS, "greaterOrEquals"), + ], + ) + def test_range_filter_uses_raw_timestamp_column( + self, op: "ComparisonFilter.Op.ValueType", expected_function: str + ) -> None: + expr = trace_item_filters_to_expression(self._range_filter(op), attribute_key_to_expression) + assert isinstance(expr, FunctionCall) + assert expr.function_name == expected_function + + # LHS is the bare `timestamp` column, not a CAST. + from snuba.query.expressions import Column as SnubaColumn + + lhs = expr.parameters[0] + assert isinstance(lhs, SnubaColumn) + assert lhs.column_name == "timestamp" + + # RHS is toDateTime(value) so it compares against a DateTime, enabling + # primary-key and partition (toMonday(timestamp)) pruning. + rhs = expr.parameters[1] + assert isinstance(rhs, FunctionCall) + assert rhs.function_name == "toDateTime" + + @pytest.mark.parametrize( + "op,expected_second", + [ + # `timestamp` is second-resolution, so fractional bounds round to the + # integer second that preserves `CAST(timestamp, 'Float64') OP value`: + # `<`/`>=` round up, `<=`/`>` round down. + (ComparisonFilter.OP_LESS_THAN, 1781040733), + (ComparisonFilter.OP_LESS_THAN_OR_EQUALS, 1781040732), + (ComparisonFilter.OP_GREATER_THAN, 1781040732), + (ComparisonFilter.OP_GREATER_THAN_OR_EQUALS, 1781040733), + ], + ) + def test_fractional_range_filter_rounds_to_preserve_semantics( + self, op: "ComparisonFilter.Op.ValueType", expected_second: int + ) -> None: + fractional = TraceItemFilter( + comparison_filter=ComparisonFilter( + key=AttributeKey(type=AttributeKey.Type.TYPE_DOUBLE, name="sentry.timestamp"), + op=op, + value=AttributeValue(val_double=1781040732.7), + ) + ) + expr = trace_item_filters_to_expression(fractional, attribute_key_to_expression) + assert isinstance(expr, FunctionCall) + rhs = expr.parameters[1] + assert isinstance(rhs, FunctionCall) + assert rhs.function_name == "toDateTime" + literal = rhs.parameters[0] + assert isinstance(literal, Literal) + expected = datetime.fromtimestamp(expected_second, tz=timezone.utc).strftime( + "%Y-%m-%d %H:%M:%S" + ) + assert literal.value == expected + + def test_equals_filter_unchanged(self) -> None: + """Non-range comparisons keep the existing CAST-based behavior.""" + expr = trace_item_filters_to_expression( + self._range_filter(ComparisonFilter.OP_EQUALS), attribute_key_to_expression + ) + # equals path wraps in the null-aware OR; the LHS of the equals is still the CAST. + assert isinstance(expr, FunctionCall) + assert expr.function_name == "or" + equals_expr = expr.parameters[0] + assert isinstance(equals_expr, FunctionCall) + assert equals_expr.function_name == "equals" + cast_expr = equals_expr.parameters[0] + assert isinstance(cast_expr, FunctionCall) + assert cast_expr.function_name == "cast" + + +class TestDedupeAndConditions: + """dedupe_and_conditions removes structurally-identical top-level AND conjuncts. + treeify_or_and_conditions runs it, so every endpoint that treeifies gets it.""" + + @staticmethod + def _conjuncts(condition: object) -> list[FunctionCall]: + out: list[FunctionCall] = [] + if isinstance(condition, FunctionCall) and condition.function_name == "and": + for param in condition.parameters: + out.extend(TestDedupeAndConditions._conjuncts(param)) + elif isinstance(condition, FunctionCall): + out.append(condition) + return out + + def test_duplicate_conjunct_removed(self) -> None: + a = f.greaterOrEquals(column("timestamp"), f.toDateTime("2026-06-09 21:30:00")) + b = f.equals(column("project_id"), 1) + query = Query(from_clause=None, condition=and_cond(a, b, a)) + + dedupe_and_conditions(query) + + conjuncts = self._conjuncts(query.get_condition()) + assert len(conjuncts) == 2 + assert a in conjuncts and b in conjuncts + + def test_no_duplicates_leaves_condition_untouched(self) -> None: + a = f.greaterOrEquals(column("timestamp"), f.toDateTime("2026-06-09 21:30:00")) + b = f.less(column("timestamp"), f.toDateTime("2026-06-10 21:40:00")) + original = and_cond(a, b) + query = Query(from_clause=None, condition=original) + + dedupe_and_conditions(query) + + # Unchanged object identity: we only rebuild when a duplicate is found. + assert query.get_condition() is original + + def test_treeify_also_dedupes(self) -> None: + a = f.equals(column("project_id"), 1) + b = f.equals(column("organization_id"), 2) + query = Query(from_clause=None, condition=and_cond(a, b, a)) + + treeify_or_and_conditions(query) + + conjuncts = self._conjuncts(query.get_condition()) + assert len(conjuncts) == 2 + + class TestAnalyzerSafeFilters: """Per-key filters on map-backed attributes are built from a NULL-free (mapContains, arrayElement) pair instead of the legacy diff --git a/tests/web/rpc/v1/test_endpoint_time_series/test_endpoint_time_series_timestamp_pruning.py b/tests/web/rpc/v1/test_endpoint_time_series/test_endpoint_time_series_timestamp_pruning.py new file mode 100644 index 0000000000..abdc21efd4 --- /dev/null +++ b/tests/web/rpc/v1/test_endpoint_time_series/test_endpoint_time_series_timestamp_pruning.py @@ -0,0 +1,171 @@ +"""Tests that `sentry.timestamp` range filters on the TimeSeries endpoint are +emitted against the raw `timestamp` DateTime column (never CAST to Float64) and +that conditions identical to the mandatory time-range bound are deduped. + +These build the query AST directly via `build_query` so we can assert on the +generated WHERE clause without needing a live ClickHouse. +""" + +from google.protobuf.timestamp_pb2 import Timestamp +from sentry_protos.snuba.v1.endpoint_time_series_pb2 import TimeSeriesRequest +from sentry_protos.snuba.v1.request_common_pb2 import RequestMeta, TraceItemType +from sentry_protos.snuba.v1.trace_item_attribute_pb2 import ( + AttributeAggregation, + AttributeKey, + AttributeValue, + Function, +) +from sentry_protos.snuba.v1.trace_item_filter_pb2 import ( + AndFilter, + ComparisonFilter, + TraceItemFilter, +) + +from snuba.query.expressions import Column, Expression, FunctionCall +from snuba.web.rpc.v1.resolvers.R_eap_items.resolver_time_series import build_query + +# Aligned to a 600s boundary so the meta window matches what the time-series +# bucketing produces; values are arbitrary fixed unix seconds. +START_TS = 1781040600 +END_TS = 1781127600 + +_RANGE_OPS = {"less", "lessOrEquals", "greater", "greaterOrEquals"} + + +def _flatten_and(exp: Expression | None) -> list[Expression]: + if isinstance(exp, FunctionCall) and exp.function_name == "and": + out: list[Expression] = [] + for param in exp.parameters: + out.extend(_flatten_and(param)) + return out + return [] if exp is None else [exp] + + +def _is_raw_timestamp_column(exp: Expression) -> bool: + return isinstance(exp, Column) and exp.column_name == "timestamp" + + +def _timestamp_range_conditions(condition: Expression | None) -> list[FunctionCall]: + """All top-level range comparisons whose left side is the raw `timestamp` column.""" + conditions = [] + for conjunct in _flatten_and(condition): + if ( + isinstance(conjunct, FunctionCall) + and conjunct.function_name in _RANGE_OPS + and conjunct.parameters + and _is_raw_timestamp_column(conjunct.parameters[0]) + ): + conditions.append(conjunct) + return conditions + + +def _contains_cast_of_timestamp(exp: Expression | None) -> bool: + """True if any `cast(timestamp, ...)` (the Float64 form) appears in the tree.""" + if isinstance(exp, FunctionCall): + if ( + exp.function_name == "cast" + and exp.parameters + and _is_raw_timestamp_column(exp.parameters[0]) + ): + return True + return any(_contains_cast_of_timestamp(p) for p in exp.parameters) + return False + + +def _make_request(*timestamp_filters: TraceItemFilter) -> TimeSeriesRequest: + request_filter = None + if len(timestamp_filters) == 1: + request_filter = timestamp_filters[0] + elif len(timestamp_filters) > 1: + request_filter = TraceItemFilter(and_filter=AndFilter(filters=list(timestamp_filters))) + + return TimeSeriesRequest( + meta=RequestMeta( + project_ids=[1], + organization_id=1, + cogs_category="something", + referrer="something", + start_timestamp=Timestamp(seconds=START_TS), + end_timestamp=Timestamp(seconds=END_TS), + trace_item_type=TraceItemType.TRACE_ITEM_TYPE_SPAN, + ), + aggregations=[ + AttributeAggregation( + aggregate=Function.FUNCTION_COUNT, + key=AttributeKey(type=AttributeKey.TYPE_FLOAT, name="sentry.duration"), + label="count", + ), + ], + granularity_secs=600, + filter=request_filter or TraceItemFilter(), + ) + + +def _ts_filter(op: "ComparisonFilter.Op.ValueType", seconds: int) -> TraceItemFilter: + return TraceItemFilter( + comparison_filter=ComparisonFilter( + key=AttributeKey(type=AttributeKey.TYPE_DOUBLE, name="sentry.timestamp"), + op=op, + value=AttributeValue(val_double=seconds), + ) + ) + + +def test_no_timestamp_filter_keeps_only_mandatory_bounds() -> None: + query = build_query(_make_request()) + conds = _timestamp_range_conditions(query.get_condition()) + # Just the mandatory lower + upper bound from the request time window. + assert len(conds) == 2 + assert not _contains_cast_of_timestamp(query.get_condition()) + + +def test_identical_range_is_deduped_to_one() -> None: + # A sentry.timestamp filter whose bounds exactly match the meta time window. + # Using `>=` for the lower bound and `<` for the upper bound mirrors the + # mandatory condition, so the two pairs are byte-identical and collapse. + query = build_query( + _make_request( + _ts_filter(ComparisonFilter.OP_GREATER_THAN_OR_EQUALS, START_TS), + _ts_filter(ComparisonFilter.OP_LESS_THAN, END_TS), + ) + ) + conds = _timestamp_range_conditions(query.get_condition()) + # The duplicate lower/upper bounds are pruned -> only one of each remains. + assert len(conds) == 2 + functions = sorted(c.function_name for c in conds) + assert functions == ["greaterOrEquals", "less"] + assert not _contains_cast_of_timestamp(query.get_condition()) + + +def test_different_range_keeps_both_bounds_as_intersection() -> None: + # A tighter window than the meta range: both the meta bounds and the filter + # bounds survive; ClickHouse's AND yields the smaller (intersected) window. + query = build_query( + _make_request( + _ts_filter(ComparisonFilter.OP_GREATER_THAN_OR_EQUALS, START_TS + 600), + _ts_filter(ComparisonFilter.OP_LESS_THAN, END_TS - 600), + ) + ) + conds = _timestamp_range_conditions(query.get_condition()) + # 2 mandatory + 2 distinct filter bounds, none collapsed. + assert len(conds) == 4 + assert not _contains_cast_of_timestamp(query.get_condition()) + + +def test_more_than_two_distinct_bounds_all_kept_and_not_float() -> None: + # Three distinct sentry.timestamp bounds (none equal to a meta bound) plus the + # two mandatory bounds: all are kept, and crucially none use CAST(.. Float64). + query = build_query( + _make_request( + _ts_filter(ComparisonFilter.OP_GREATER_THAN, START_TS + 100), + _ts_filter(ComparisonFilter.OP_GREATER_THAN_OR_EQUALS, START_TS + 600), + _ts_filter(ComparisonFilter.OP_LESS_THAN, END_TS - 600), + ) + ) + condition = query.get_condition() + conds = _timestamp_range_conditions(condition) + assert len(conds) == 5 + # Every timestamp comparison compares the raw DateTime column, not a Float cast. + assert not _contains_cast_of_timestamp(condition) + for cond in conds: + assert _is_raw_timestamp_column(cond.parameters[0]) diff --git a/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py b/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py index d4630afc8d..50f8a4e42b 100644 --- a/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py +++ b/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py @@ -4282,14 +4282,13 @@ def test_build_query_with_order_by_optimization_disabled_because_multiproject() request = _apply_labels_to_columns(request) query = build_query(request) + # The full sort-key expansion is disabled (multi-project), but ordering by + # sentry.timestamp still targets the raw `timestamp` column rather than a CAST so + # ClickHouse can read in primary-key order. assert query.get_orderby() == [ OrderBy( direction=OrderByDirection.DESC, - expression=f.cast( - snuba_column("timestamp"), - "String", - alias="sentry.timestamp_TYPE_STRING", - ), + expression=snuba_column("timestamp"), ), ] @@ -4325,14 +4324,13 @@ def test_build_query_with_order_by_optimization_disabled_because_groupby() -> No request = _apply_labels_to_columns(request) query = build_query(request) + # The full sort-key expansion is disabled (group by present), but ordering by + # sentry.timestamp still targets the raw `timestamp` column rather than a CAST so + # ClickHouse can read in primary-key order. assert query.get_orderby() == [ OrderBy( direction=OrderByDirection.DESC, - expression=f.cast( - snuba_column("timestamp"), - "String", - alias="sentry.timestamp_TYPE_STRING", - ), + expression=snuba_column("timestamp"), ), ]