Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions datajunction-server/datajunction_server/sql/decompose.py
Original file line number Diff line number Diff line change
Expand Up @@ -1156,12 +1156,13 @@ def _expand_template(self, template: str, args: list) -> str:
def _short_hash(self, expression: str, query_ast: ast.Query) -> str:
"""Generate a short hash for the given expression.

``str(query_ast.select.from_)`` is whitespace-normalized so cosmetic
changes to AST string rendering (pretty-printing) don't perturb hashes
and rename every derived measure. ``expression`` is passed through
unchanged to preserve hash compatibility for expressions that already
contain newlines (e.g. CASE, IF).
Both the expression and the FROM clause are whitespace-normalized so
cosmetic changes to AST string rendering (pretty-printing, indentation,
newlines around AND/OR conjuncts) don't perturb hashes and silently
rename every derived measure. Component names are durable identifiers
for materialized columns, so the hash must be invariant to formatting.
"""
expression_normalized = " ".join(expression.split())
from_str = " ".join(str(query_ast.select.from_).split())
signature = expression + from_str
signature = expression_normalized + from_str
return hashlib.md5(signature.encode("utf-8")).hexdigest()[:8]
27 changes: 27 additions & 0 deletions datajunction-server/tests/sql/decompose_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,6 +795,33 @@ async def test_unsupported_aggregation_function(session: AsyncSession, create_me
)


@pytest.mark.asyncio
async def test_count_if_hash_invariant_to_formatting(
session: AsyncSession,
create_metric,
):
"""
Component hashes must be stable across cosmetic AST rendering changes.

A previous regression (pretty-printing PR) caused ``str(arg)`` of a
BinaryOp containing AND/OR to emit newlines between conjuncts, which
silently renamed every ``count_if`` measure with a multi-conjunct
condition. Component names are durable identifiers for materialized
columns, so the same metric query must always produce the same hash —
even if the same logical condition is parsed from differently-formatted
SQL text.
"""
single_line = await create_metric(
"SELECT COUNT_IF(a = 1 AND b = 2 AND c = 3) FROM parent_node",
)
multi_line = await create_metric(
"SELECT COUNT_IF(\n a = 1\n AND b = 2\n AND c = 3\n) FROM parent_node",
)
one, _ = await MetricComponentExtractor(single_line.id).extract(session)
two, _ = await MetricComponentExtractor(multi_line.id).extract(session)
assert [c.name for c in one] == [c.name for c in two]


@pytest.mark.asyncio
async def test_count_if(session: AsyncSession, create_metric):
"""
Expand Down
Loading