Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 101 additions & 12 deletions snuba/web/rpc/common/common.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import math
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, TypeVar, cast

Expand All @@ -12,8 +13,10 @@
)

from snuba import settings, state
from snuba.clickhouse import DATETIME_FORMAT
from snuba.protos.common import (
ATTRIBUTES_TO_COALESCE,
COLUMN_PREFIX,
PROTO_TYPE_TO_ATTRIBUTE_COLUMN,
PROTO_TYPE_TO_CLICKHOUSE_TYPE,
MalformedAttributeException,
Expand Down Expand Up @@ -237,7 +240,12 @@ def treeify_or_and_conditions(query: Query) -> None:

Note: does not apply to the conditions of a from_clause subquery (the nested one)
this is bc transform_expressions is not implemented for composite queries

This also removes structurally-identical conjuncts from the top-level WHERE AND
(`A AND A` == `A`); see dedupe_and_conditions. Nothing downstream in the query
pipeline dedupes conditions, so without this the duplicate would reach ClickHouse.
"""
dedupe_and_conditions(query)

def transform(exp: Expression) -> Expression:
if not isinstance(exp, FunctionCall):
Expand All @@ -253,6 +261,42 @@ def transform(exp: Expression) -> Expression:
query.transform_expressions(transform)


def dedupe_and_conditions(query: Query) -> None:
"""Remove structurally-identical conjuncts from the query's top-level AND.

``A AND A`` is equivalent to ``A``, so dropping exact duplicates never changes
results. The motivating case: when a client sends a ``sentry.timestamp`` range
filter whose bounds equal the mandatory time-range condition, both now produce
byte-identical expressions (see timestamp_seconds_to_datetime_literal) and
collapse to a single condition. Distinct ranges are left untouched (the AND of
them naturally yields the tightest window).

Conditions nested inside OR/NOT subtrees are treated as opaque leaves -- we only
flatten the top-level AND. The query is left unchanged unless a duplicate is
actually found, so existing queries keep their exact condition tree.
"""
condition = query.get_condition()
if condition is None:
return

def flatten_and(exp: Expression) -> list[Expression]:
if isinstance(exp, FunctionCall) and exp.function_name == "and":
flattened: list[Expression] = []
for param in exp.parameters:
flattened.extend(flatten_and(param))
return flattened
return [exp]

conjuncts = flatten_and(condition)
deduped: list[Expression] = []
for conjunct in conjuncts:
if conjunct not in deduped:
Comment thread
sentry[bot] marked this conversation as resolved.
deduped.append(conjunct)

if len(deduped) != len(conjuncts):
query.set_ast_condition(combine_and_conditions(deduped))


# Map column -> the Nullable type its values resolve to, used to emit a typed
# NULL as the `if(...)` else-branch below. A bare, untyped NULL default trips
# the ClickHouse "new analyzer": it canonicalizes the very same `if(...)`
Expand Down Expand Up @@ -725,6 +769,51 @@ def trace_item_filters_to_expression(
else:
v_expression = _attribute_value_to_expression(v)

# `sentry.timestamp` is a normalized column that `attribute_key_to_expression`
# maps to `CAST(timestamp, 'Float64')`. Wrapping the primary-key/partition
# column in a CAST prevents ClickHouse from using it for granule and partition
# pruning, so range filters on it scan far more data than necessary. It also
# duplicates the mandatory time-range condition (timestamp_in_range_condition)
# that is already applied on the raw column. For range comparisons, compare
# against the raw DateTime `timestamp` column instead so the condition is
# index- and partition-prunable. We reuse timestamp_seconds_to_datetime_literal
# so a bound equal to the mandatory range is byte-identical to it and gets
# collapsed by dedupe_timestamp_conditions.
if k.name == f"{COLUMN_PREFIX}timestamp" and value_type in (
"val_int",
"val_float",
"val_double",
):
scalar_value = _scalar_value(v)
assert isinstance(scalar_value, (int, float))
raw_timestamp = column("timestamp")
# `timestamp` is a second-resolution DateTime, so a fractional bound must be
# rounded to the integer second that preserves the original
# `CAST(timestamp, 'Float64') OP value` result: `<`/`>=` round up (ceil) and
# `<=`/`>` round down (floor). Integer bounds are left unchanged, so the
# rewritten mandatory-range bounds stay byte-identical and
# dedupe_timestamp_conditions can still collapse them.
if op == ComparisonFilter.OP_LESS_THAN:
return f.less(
raw_timestamp,
timestamp_seconds_to_datetime_literal(math.ceil(scalar_value)),
)
if op == ComparisonFilter.OP_LESS_THAN_OR_EQUALS:
return f.lessOrEquals(
raw_timestamp,
timestamp_seconds_to_datetime_literal(math.floor(scalar_value)),
)
if op == ComparisonFilter.OP_GREATER_THAN:
return f.greater(
raw_timestamp,
timestamp_seconds_to_datetime_literal(math.floor(scalar_value)),
)
if op == ComparisonFilter.OP_GREATER_THAN_OR_EQUALS:
return f.greaterOrEquals(
raw_timestamp,
timestamp_seconds_to_datetime_literal(math.ceil(scalar_value)),
)

if op == ComparisonFilter.OP_EQUALS:
_check_non_string_values_cannot_ignore_case(item_filter.comparison_filter)

Expand Down Expand Up @@ -955,20 +1044,20 @@ def project_id_and_org_conditions(meta: RequestMeta) -> Expression:
)


def timestamp_seconds_to_datetime_literal(ts_seconds: int) -> Expression:
"""Build the canonical ``toDateTime('YYYY-MM-DD HH:MM:SS')`` expression for a unix
timestamp. The mandatory time-range bounds and rewritten ``sentry.timestamp`` range
filters share this form so equal bounds produce structurally identical expressions
(which lets dedupe_timestamp_conditions collapse the duplicates)."""
return f.toDateTime(
datetime.fromtimestamp(ts_seconds, tz=timezone.utc).strftime(DATETIME_FORMAT)
)


def timestamp_in_range_condition(start_ts: int, end_ts: int) -> Expression:
return and_cond(
f.less(
column("timestamp"),
f.toDateTime(
datetime.fromtimestamp(end_ts, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
),
),
f.greaterOrEquals(
column("timestamp"),
f.toDateTime(
datetime.fromtimestamp(start_ts, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
),
),
f.less(column("timestamp"), timestamp_seconds_to_datetime_literal(end_ts)),
f.greaterOrEquals(column("timestamp"), timestamp_seconds_to_datetime_literal(start_ts)),
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,20 @@ def _convert_order_by(
]
)
elif x.column.HasField("key"):
# `sentry.timestamp` maps to `CAST(timestamp, 'Float64')`; the cast stops
# ClickHouse from sorting against the primary-key `timestamp` column. The
# cast is monotonic, so ordering by the raw DateTime column yields the same
# order while staying read-in-order friendly. (The i == 0 / single-project /
# no-groupby branch above already expands to the full table sort key; this
# covers `sentry.timestamp` ordering anywhere else.)
if x.column.key.name == "sentry.timestamp":
expression: Expression = snuba_column("timestamp")
else:
expression = attribute_key_to_expression(x.column.key)
res.append(
OrderBy(
direction=direction,
expression=attribute_key_to_expression(x.column.key),
expression=expression,
)
)
elif x.column.HasField("conditional_aggregation"):
Expand Down
145 changes: 145 additions & 0 deletions tests/web/rpc/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,24 @@
from snuba.datasets.storages.factory import get_storage
from snuba.datasets.storages.storage_key import StorageKey
from snuba.protos.common import ATTRIBUTES_TO_COALESCE
from snuba.query.dsl import Functions as f
from snuba.query.dsl import and_cond, column
from snuba.query.expressions import (
Expression,
FunctionCall,
Lambda,
Literal,
SubscriptableReference,
)
from snuba.query.logical import Query
from snuba.web.rpc.common.common import (
_any_attribute_filter_to_expression,
attribute_key_to_expression,
dedupe_and_conditions,
next_monday,
prev_monday,
trace_item_filters_to_expression,
treeify_or_and_conditions,
use_sampling_factor,
)
from snuba.web.rpc.common.exceptions import (
Expand Down Expand Up @@ -269,6 +274,146 @@ def test_exists_filter_on_non_coalesced_attribute_unchanged(self) -> None:
assert expr.function_name == "mapContains"


class TestSentryTimestampFilter:
"""Range filters on `sentry.timestamp` must target the raw DateTime `timestamp`
column (index- and partition-prunable) rather than CAST(timestamp, 'Float64')."""

@staticmethod
def _range_filter(op: "ComparisonFilter.Op.ValueType") -> TraceItemFilter:
return TraceItemFilter(
comparison_filter=ComparisonFilter(
key=AttributeKey(type=AttributeKey.Type.TYPE_DOUBLE, name="sentry.timestamp"),
op=op,
value=AttributeValue(val_double=1781040732),
)
)

@pytest.mark.parametrize(
"op,expected_function",
[
(ComparisonFilter.OP_LESS_THAN, "less"),
(ComparisonFilter.OP_LESS_THAN_OR_EQUALS, "lessOrEquals"),
(ComparisonFilter.OP_GREATER_THAN, "greater"),
(ComparisonFilter.OP_GREATER_THAN_OR_EQUALS, "greaterOrEquals"),
],
)
def test_range_filter_uses_raw_timestamp_column(
self, op: "ComparisonFilter.Op.ValueType", expected_function: str
) -> None:
expr = trace_item_filters_to_expression(self._range_filter(op), attribute_key_to_expression)
assert isinstance(expr, FunctionCall)
assert expr.function_name == expected_function

# LHS is the bare `timestamp` column, not a CAST.
from snuba.query.expressions import Column as SnubaColumn

lhs = expr.parameters[0]
assert isinstance(lhs, SnubaColumn)
assert lhs.column_name == "timestamp"

# RHS is toDateTime(value) so it compares against a DateTime, enabling
# primary-key and partition (toMonday(timestamp)) pruning.
rhs = expr.parameters[1]
assert isinstance(rhs, FunctionCall)
assert rhs.function_name == "toDateTime"

@pytest.mark.parametrize(
"op,expected_second",
[
# `timestamp` is second-resolution, so fractional bounds round to the
# integer second that preserves `CAST(timestamp, 'Float64') OP value`:
# `<`/`>=` round up, `<=`/`>` round down.
(ComparisonFilter.OP_LESS_THAN, 1781040733),
(ComparisonFilter.OP_LESS_THAN_OR_EQUALS, 1781040732),
(ComparisonFilter.OP_GREATER_THAN, 1781040732),
(ComparisonFilter.OP_GREATER_THAN_OR_EQUALS, 1781040733),
],
)
def test_fractional_range_filter_rounds_to_preserve_semantics(
self, op: "ComparisonFilter.Op.ValueType", expected_second: int
) -> None:
fractional = TraceItemFilter(
comparison_filter=ComparisonFilter(
key=AttributeKey(type=AttributeKey.Type.TYPE_DOUBLE, name="sentry.timestamp"),
op=op,
value=AttributeValue(val_double=1781040732.7),
)
)
expr = trace_item_filters_to_expression(fractional, attribute_key_to_expression)
assert isinstance(expr, FunctionCall)
rhs = expr.parameters[1]
assert isinstance(rhs, FunctionCall)
assert rhs.function_name == "toDateTime"
literal = rhs.parameters[0]
assert isinstance(literal, Literal)
expected = datetime.fromtimestamp(expected_second, tz=timezone.utc).strftime(
"%Y-%m-%d %H:%M:%S"
)
assert literal.value == expected

def test_equals_filter_unchanged(self) -> None:
"""Non-range comparisons keep the existing CAST-based behavior."""
expr = trace_item_filters_to_expression(
self._range_filter(ComparisonFilter.OP_EQUALS), attribute_key_to_expression
)
# equals path wraps in the null-aware OR; the LHS of the equals is still the CAST.
assert isinstance(expr, FunctionCall)
assert expr.function_name == "or"
equals_expr = expr.parameters[0]
assert isinstance(equals_expr, FunctionCall)
assert equals_expr.function_name == "equals"
cast_expr = equals_expr.parameters[0]
assert isinstance(cast_expr, FunctionCall)
assert cast_expr.function_name == "cast"


class TestDedupeAndConditions:
"""dedupe_and_conditions removes structurally-identical top-level AND conjuncts.
treeify_or_and_conditions runs it, so every endpoint that treeifies gets it."""

@staticmethod
def _conjuncts(condition: object) -> list[FunctionCall]:
out: list[FunctionCall] = []
if isinstance(condition, FunctionCall) and condition.function_name == "and":
for param in condition.parameters:
out.extend(TestDedupeAndConditions._conjuncts(param))
elif isinstance(condition, FunctionCall):
out.append(condition)
return out

def test_duplicate_conjunct_removed(self) -> None:
a = f.greaterOrEquals(column("timestamp"), f.toDateTime("2026-06-09 21:30:00"))
b = f.equals(column("project_id"), 1)
query = Query(from_clause=None, condition=and_cond(a, b, a))

dedupe_and_conditions(query)

conjuncts = self._conjuncts(query.get_condition())
assert len(conjuncts) == 2
assert a in conjuncts and b in conjuncts

def test_no_duplicates_leaves_condition_untouched(self) -> None:
a = f.greaterOrEquals(column("timestamp"), f.toDateTime("2026-06-09 21:30:00"))
b = f.less(column("timestamp"), f.toDateTime("2026-06-10 21:40:00"))
original = and_cond(a, b)
query = Query(from_clause=None, condition=original)

dedupe_and_conditions(query)

# Unchanged object identity: we only rebuild when a duplicate is found.
assert query.get_condition() is original

def test_treeify_also_dedupes(self) -> None:
a = f.equals(column("project_id"), 1)
b = f.equals(column("organization_id"), 2)
query = Query(from_clause=None, condition=and_cond(a, b, a))

treeify_or_and_conditions(query)

conjuncts = self._conjuncts(query.get_condition())
assert len(conjuncts) == 2


class TestAnalyzerSafeFilters:
"""Per-key filters on map-backed attributes are built from a NULL-free
(mapContains, arrayElement) pair instead of the legacy
Expand Down
Loading
Loading