diff --git a/datajunction-server/datajunction_server/sql/decompose.py b/datajunction-server/datajunction_server/sql/decompose.py index fc4236e8d..b95672d3a 100644 --- a/datajunction-server/datajunction_server/sql/decompose.py +++ b/datajunction-server/datajunction_server/sql/decompose.py @@ -1156,12 +1156,13 @@ def _expand_template(self, template: str, args: list) -> str: def _short_hash(self, expression: str, query_ast: ast.Query) -> str: """Generate a short hash for the given expression. - ``str(query_ast.select.from_)`` is whitespace-normalized so cosmetic - changes to AST string rendering (pretty-printing) don't perturb hashes - and rename every derived measure. ``expression`` is passed through - unchanged to preserve hash compatibility for expressions that already - contain newlines (e.g. CASE, IF). + Both the expression and the FROM clause are whitespace-normalized so + cosmetic changes to AST string rendering (pretty-printing, indentation, + newlines around AND/OR conjuncts) don't perturb hashes and silently + rename every derived measure. Component names are durable identifiers + for materialized columns, so the hash must be invariant to formatting. """ + expression_normalized = " ".join(expression.split()) from_str = " ".join(str(query_ast.select.from_).split()) - signature = expression + from_str + signature = expression_normalized + from_str return hashlib.md5(signature.encode("utf-8")).hexdigest()[:8] diff --git a/datajunction-server/tests/sql/decompose_test.py b/datajunction-server/tests/sql/decompose_test.py index b4cf1ff4d..f8633316d 100644 --- a/datajunction-server/tests/sql/decompose_test.py +++ b/datajunction-server/tests/sql/decompose_test.py @@ -795,6 +795,33 @@ async def test_unsupported_aggregation_function(session: AsyncSession, create_me ) +@pytest.mark.asyncio +async def test_count_if_hash_invariant_to_formatting( + session: AsyncSession, + create_metric, +): + """ + Component hashes must be stable across cosmetic AST rendering changes. + + A previous regression (pretty-printing PR) caused ``str(arg)`` of a + BinaryOp containing AND/OR to emit newlines between conjuncts, which + silently renamed every ``count_if`` measure with a multi-conjunct + condition. Component names are durable identifiers for materialized + columns, so the same metric query must always produce the same hash — + even if the same logical condition is parsed from differently-formatted + SQL text. + """ + single_line = await create_metric( + "SELECT COUNT_IF(a = 1 AND b = 2 AND c = 3) FROM parent_node", + ) + multi_line = await create_metric( + "SELECT COUNT_IF(\n a = 1\n AND b = 2\n AND c = 3\n) FROM parent_node", + ) + one, _ = await MetricComponentExtractor(single_line.id).extract(session) + two, _ = await MetricComponentExtractor(multi_line.id).extract(session) + assert [c.name for c in one] == [c.name for c in two] + + @pytest.mark.asyncio async def test_count_if(session: AsyncSession, create_metric): """