diff --git a/datajunction-server/datajunction_server/construction/build_v3/combiners.py b/datajunction-server/datajunction_server/construction/build_v3/combiners.py index 8aa092ea9..033c92478 100644 --- a/datajunction-server/datajunction_server/construction/build_v3/combiners.py +++ b/datajunction-server/datajunction_server/construction/build_v3/combiners.py @@ -846,12 +846,17 @@ def _build_grain_group_from_preagg_table( if col.semantic_type in ("metric_component", "measure", "metric"): col_ref = ast.Column(name=ast.Name(col.name)) - # Find the component to get the merge function + # Find the component to get the merge function. The third + # clause matches legacy pre-agg / grain-group shapes where + # the LIMITED-DISTINCT column lived under the bare grain + # name (before the ` AS ` projection landed) — + # keeps re-aggregation correct against stale materializations. merge_func = None for comp in original_gg.components: if ( # pragma: no branch comp.name == col.name or original_gg.component_aliases.get(comp.name) == col.name + or comp.grain_alias == col.name ): merge_func = comp.merge break diff --git a/datajunction-server/datajunction_server/construction/build_v3/cube_matcher.py b/datajunction-server/datajunction_server/construction/build_v3/cube_matcher.py index 59c351fd4..880d64ffc 100644 --- a/datajunction-server/datajunction_server/construction/build_v3/cube_matcher.py +++ b/datajunction-server/datajunction_server/construction/build_v3/cube_matcher.py @@ -486,7 +486,19 @@ def build_synthetic_grain_group( for comp in decomposed.components: if comp.name not in component_aliases: # pragma: no branch - cube_col_name = mat_col_lookup.get(comp.name, comp.name) + # Legacy compat: cubes materialized before LIMITED-DISTINCT + # gained its ` AS ` projection had the column + # stored under the bare grain name (e.g. ``order_id``) + # rather than the hashed identity (``order_id_distinct_``). + # Try the hashed name first (current shape), then fall back + # to the grain_alias (legacy shape) before defaulting to + # ``comp.name``. Keeps stale cube configs queryable until + # they're re-materialized. + cube_col_name = mat_col_lookup.get(comp.name) + if cube_col_name is None and comp.grain_alias: + cube_col_name = mat_col_lookup.get(comp.grain_alias) + if cube_col_name is None: + cube_col_name = comp.name component_aliases[comp.name] = cube_col_name all_components.append(comp) diff --git a/datajunction-server/datajunction_server/construction/build_v3/measures.py b/datajunction-server/datajunction_server/construction/build_v3/measures.py index c2fd8df64..3108066c6 100644 --- a/datajunction-server/datajunction_server/construction/build_v3/measures.py +++ b/datajunction-server/datajunction_server/construction/build_v3/measures.py @@ -164,6 +164,43 @@ def _parse_type_string(type_str: str | None) -> ct.ColumnType | None: return _TYPE_STRING_MAP.get(normalized) +def register_limited_component( + component: MetricComponent, + metric_node: Node, + component_aliases: dict[str, str], + component_expressions: list[tuple[str, ast.Expression]], + component_metadata: list[tuple[str, MetricComponent, Node]], +) -> None: + """Register a LIMITED component (e.g. ``COUNT(DISTINCT)``). + + For plain-column DISTINCT, ``grain_alias`` is the bare column and + differs from ``component.name`` (the hashed identity). Emit an + extra `` AS `` projection so the hashed name + is addressable alongside the bare grain column, and route + ``component_aliases`` at the hashed name so combiner rewriters + stay consistent. Both call sites in ``build_grain_group_sql`` must + use this helper to keep the contract in one place. + """ + # The preagg-read path assumes LIMITED.merge is None to skip + # re-aggregation; a future LIMITED+merge variant would wrap the + # merge over the bare grain column and produce wrong SQL. + assert component.merge is None, ( + f"LIMITED component {component.name!r} has merge={component.merge!r}; " + f"update the preagg-read path in combiners.py before adding this." + ) + grain_alias = component.grain_alias or component.name + if grain_alias != component.name: + component_aliases[component.name] = component.name + component_expressions.append( + (component.name, make_column_ref(grain_alias)), + ) + component_metadata.append( + (component.name, component, metric_node), + ) + else: + component_aliases[component.name] = grain_alias + + def infer_component_type( component: MetricComponent, metric_type: str, @@ -1747,10 +1784,14 @@ def build_grain_group_from_preagg( f"Component {component.name} not found in pre-agg {preagg.id}", ) - # Always use the measure column name (component hash) as the output alias - # This ensures consistency with the non-preagg path - output_alias = measure_col - + # Always alias the output as ``component.name`` (the hashed + # identity) so the CTE's output column name matches the + # metric's persisted ``derived_expression`` regardless of how + # the pre-agg physically stored it. Legacy pre-aggs + # materialized before the LIMITED-DISTINCT change carry the + # column under the bare grain name; we read that bare column + # and alias it back to the hashed identity here. + output_alias = component.name component_aliases[component.name] = output_alias col_ref = ast.Column(name=ast.Name(measure_col)) @@ -1765,11 +1806,17 @@ def build_grain_group_from_preagg( aliased = ast.Alias(child=agg_expr, alias=ast.Name(output_alias)) select_items.append(aliased) else: - # No merge - output grain column directly, add to GROUP BY - select_items.append(col_ref) + # No merge - output grain column directly, add to GROUP BY. + # Alias to ``component.name`` so the CTE output name is + # the hashed identity regardless of the pre-agg's stored + # column name. + if measure_col != output_alias: + select_items.append( + ast.Alias(child=col_ref, alias=ast.Name(output_alias)), + ) + else: + select_items.append(col_ref) grain_col_names.append(measure_col) - output_alias = measure_col - component_aliases[component.name] = output_alias # Get type from pre-agg columns col_type = preagg.get_column_type(measure_col, default="double") @@ -1910,11 +1957,12 @@ def build_grain_group_sql( Aggregability.FULL, ) if orig_agg == Aggregability.LIMITED: - # LIMITED: grain column is already in GROUP BY, no output needed. - # grain_alias was set by _make_component: plain column → column name, - # complex expression → component.name. - component_aliases[component.name] = ( - component.grain_alias or component.name + register_limited_component( + component, + metric_node, + component_aliases, + component_expressions, + component_metadata, ) continue else: @@ -1929,12 +1977,14 @@ def build_grain_group_sql( component_aliases[component.name] = component_alias continue - # Skip LIMITED aggregability components with no aggregation - # These are represented by grain columns instead. - # grain_alias was set by _make_component: plain column → column name, - # complex expression → component.name. if component.rule.type == Aggregability.LIMITED and not component.aggregation: - component_aliases[component.name] = component.grain_alias or component.name + register_limited_component( + component, + metric_node, + component_aliases, + component_expressions, + component_metadata, + ) continue # Always use component.name for consistency - no special case for single-component diff --git a/datajunction-server/tests/api/cubes_test.py b/datajunction-server/tests/api/cubes_test.py index e5a0d0941..8c4c65f14 100644 --- a/datajunction-server/tests/api/cubes_test.py +++ b/datajunction-server/tests/api/cubes_test.py @@ -4964,6 +4964,7 @@ async def test_materialize_cube_returns_metric_combiners( order_id, SUM(line_total_sum_e1f61696) line_total_sum_e1f61696, hll_union_agg(customer_id_hll_23002251) customer_id_hll_23002251, + order_id_distinct_f93d50ab, week_order FROM analytics.preaggs.v3_revenue_orders_by_week GROUP BY category, order_id, week_order @@ -5003,6 +5004,14 @@ async def test_materialize_cube_returns_metric_combiners( "semantic_type": "metric_component", "type": "binary", }, + { + "column": None, + "name": "order_id_distinct_f93d50ab", + "node": None, + "semantic_entity": "v3.order_count:order_id_distinct_f93d50ab", + "semantic_type": "metric_component", + "type": "bigint", + }, { "column": None, "name": "week_order", diff --git a/datajunction-server/tests/construction/build_v3/djsql_test.py b/datajunction-server/tests/construction/build_v3/djsql_test.py index 5c1094afb..78a188ccb 100644 --- a/datajunction-server/tests/construction/build_v3/djsql_test.py +++ b/datajunction-server/tests/construction/build_v3/djsql_test.py @@ -144,12 +144,12 @@ async def test_derived_metric(self, client_with_build_v3): JOIN default.v3.order_items oi ON o.order_id = oi.order_id ), order_details_0 AS ( - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) AS avg_order_value + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value FROM order_details_0 GROUP BY order_details_0.status """, diff --git a/datajunction-server/tests/construction/build_v3/having_clause_test.py b/datajunction-server/tests/construction/build_v3/having_clause_test.py index 8bde5601f..27647eaac 100644 --- a/datajunction-server/tests/construction/build_v3/having_clause_test.py +++ b/datajunction-server/tests/construction/build_v3/having_clause_test.py @@ -99,17 +99,18 @@ async def test_metric_filter_with_less_than( order_details_0 AS ( SELECT t2.category, - t1.order_id + t1.order_id, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id ) SELECT order_details_0.category AS category, - COUNT( DISTINCT order_details_0.order_id) AS order_count + COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count FROM order_details_0 GROUP BY order_details_0.category - HAVING COUNT( DISTINCT order_details_0.order_id) < 100 + HAVING COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) < 100 """, ) @@ -194,17 +195,18 @@ async def test_metric_filter_with_equality( order_details_0 AS ( SELECT t2.category, - t1.order_id + t1.order_id, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id ) SELECT order_details_0.category AS category, - COUNT( DISTINCT order_details_0.order_id) AS order_count + COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count FROM order_details_0 GROUP BY order_details_0.category - HAVING COUNT(DISTINCT order_details_0.order_id) = 50 + HAVING COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) = 50 """, ) @@ -257,7 +259,8 @@ async def test_multiple_metric_filters_and( SELECT t2.category, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696 + SUM(t1.line_total) line_total_sum_e1f61696, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id @@ -265,12 +268,12 @@ async def test_multiple_metric_filters_and( SELECT order_details_0.category AS category, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - COUNT( DISTINCT order_details_0.order_id) AS order_count + COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count FROM order_details_0 GROUP BY order_details_0.category HAVING SUM(order_details_0.line_total_sum_e1f61696) > 5000 - AND COUNT( DISTINCT order_details_0.order_id) > 20 + AND COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) > 20 """, ) @@ -436,7 +439,8 @@ async def test_complex_mixed_filters( t2.category, t1.status, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696 + SUM(t1.line_total) line_total_sum_e1f61696, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id WHERE t2.category IN ('Electronics', 'Clothing') @@ -446,11 +450,11 @@ async def test_complex_mixed_filters( order_details_0.category AS category, order_details_0.status AS status, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - COUNT(DISTINCT order_details_0.order_id) AS order_count + COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count FROM order_details_0 WHERE order_details_0.category IN ('Electronics', 'Clothing') AND order_details_0.status = 'completed' GROUP BY order_details_0.category, order_details_0.status - HAVING SUM(order_details_0.line_total_sum_e1f61696) > 10000 AND COUNT( DISTINCT order_details_0.order_id) >= 50 + HAVING SUM(order_details_0.line_total_sum_e1f61696) > 10000 AND COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) >= 50 """, ) @@ -501,16 +505,17 @@ async def test_derived_metric_filter( SELECT t2.category, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696 + SUM(t1.line_total) line_total_sum_e1f61696, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id ) SELECT order_details_0.category AS category, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id), 0) AS avg_order_value + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value FROM order_details_0 GROUP BY order_details_0.category - HAVING SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id), 0) > 100 + HAVING SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) > 100 """, ) diff --git a/datajunction-server/tests/construction/build_v3/measures_sql_test.py b/datajunction-server/tests/construction/build_v3/measures_sql_test.py index 8d267bdea..2a931a7e1 100644 --- a/datajunction-server/tests/construction/build_v3/measures_sql_test.py +++ b/datajunction-server/tests/construction/build_v3/measures_sql_test.py @@ -1249,7 +1249,7 @@ async def test_three_metrics_same_parent(self, client_with_build_v3): ] # Columns: dimension grain column 3 raw metric columns - assert len(gg["columns"]) == 4 + assert len(gg["columns"]) == 5 assert gg["columns"][0] == { "name": "status", "type": "string", @@ -1272,7 +1272,7 @@ async def test_three_metrics_same_parent(self, client_with_build_v3): FROM default.v3.orders o JOIN default.v3.order_items oi ON o.order_id = oi.order_id ) - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, SUM(t1.quantity) quantity_sum_06b64d2e + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, SUM(t1.quantity) quantity_sum_06b64d2e, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id """, @@ -1299,7 +1299,7 @@ async def test_three_metrics_same_parent(self, client_with_build_v3): # total_revenue component assert components[1] == { - "name": "order_id", + "name": "order_id_distinct_f93d50ab", "expression": "order_id", "aggregation": None, "merge": None, @@ -1647,6 +1647,12 @@ async def test_cross_fact_metrics_different_aggregabilities( "semantic_entity": "v3.page_views_enriched.session_id", "semantic_type": "dimension", }, + { + "name": "session_id_distinct_e7c26e39", + "type": "bigint", + "semantic_entity": "v3.session_count:session_id_distinct_e7c26e39", + "semantic_type": "metric_component", + }, ] assert_sql_equal( gg_sessions["sql"], @@ -1660,7 +1666,7 @@ async def test_cross_fact_metrics_different_aggregabilities( SELECT product_id, category FROM default.v3.products ) - SELECT t2.category, t1.session_id + SELECT t2.category, t1.session_id, t1.session_id session_id_distinct_e7c26e39 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.session_id @@ -1721,6 +1727,12 @@ async def test_cross_fact_derived_metric(self, client_with_build_v3): "semantic_entity": "v3.order_details.order_id", "semantic_type": "dimension", }, + { + "name": "order_id_distinct_f93d50ab", + "type": "bigint", + "semantic_entity": "v3.order_count:order_id_distinct_f93d50ab", + "semantic_type": "metric_component", + }, ] # Joins order_details -> product for category dimension assert_sql_equal( @@ -1736,7 +1748,7 @@ async def test_cross_fact_derived_metric(self, client_with_build_v3): SELECT product_id, category FROM default.v3.products ) - SELECT t2.category, t1.order_id + SELECT t2.category, t1.order_id, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id @@ -1762,6 +1774,12 @@ async def test_cross_fact_derived_metric(self, client_with_build_v3): "semantic_entity": "v3.page_views_enriched.customer_id", "semantic_type": "dimension", }, + { + "name": "customer_id_distinct_dd4be7a5", + "type": "bigint", + "semantic_entity": "v3.visitor_count:customer_id_distinct_dd4be7a5", + "semantic_type": "metric_component", + }, ] # Joins page_views_enriched -> product for category dimension assert_sql_equal( @@ -1776,7 +1794,7 @@ async def test_cross_fact_derived_metric(self, client_with_build_v3): SELECT product_id, category FROM default.v3.products ) - SELECT t2.category, t1.customer_id + SELECT t2.category, t1.customer_id, t1.customer_id customer_id_distinct_dd4be7a5 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.customer_id @@ -1856,7 +1874,7 @@ async def test_cross_fact_multiple_derived_metrics(self, client_with_build_v3): SELECT product_id, category FROM default.v3.products ) - SELECT t2.category, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 + SELECT t2.category, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id @@ -1882,7 +1900,7 @@ async def test_cross_fact_multiple_derived_metrics(self, client_with_build_v3): SELECT product_id, category FROM default.v3.products ) - SELECT t2.category, t1.customer_id, COUNT(t1.view_id) view_id_count_f41e2db4 + SELECT t2.category, t1.customer_id, COUNT(t1.view_id) view_id_count_f41e2db4, t1.customer_id customer_id_distinct_dd4be7a5 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.customer_id @@ -2323,7 +2341,8 @@ async def test_count_distinct_grain_col_not_duplicated_when_also_a_dimension( gg = response.json()["grain_groups"][0] # order_id must appear exactly once in the SELECT (not duplicated as both - # dimension column and grain column) + # dimension column and grain column). The hashed identity projection + # for the LIMITED component is still emitted. assert_sql_equal( gg["sql"], """ @@ -2332,7 +2351,7 @@ async def test_count_distinct_grain_col_not_duplicated_when_also_a_dimension( FROM default.v3.orders o JOIN default.v3.order_items oi ON o.order_id = oi.order_id ) - SELECT t1.status, t1.order_id + SELECT t1.status, t1.order_id, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id """, @@ -2358,6 +2377,12 @@ async def test_count_distinct_grain_col_not_duplicated_when_also_a_dimension( "semantic_entity": "v3.order_details.order_id", "semantic_type": "dimension", }, + { + "name": "order_id_distinct_f93d50ab", + "type": "bigint", + "semantic_entity": "v3.order_count:order_id_distinct_f93d50ab", + "semantic_type": "metric_component", + }, ] @@ -3173,7 +3198,8 @@ async def test_temporal_filters_applied_when_cube_exists( t1.order_date date_id, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, - SUM(t1.quantity) quantity_sum_06b64d2e + SUM(t1.quantity) quantity_sum_06b64d2e, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.order_date, t1.order_id """, @@ -4156,7 +4182,8 @@ async def test_nested_derived_metric_decomposes_to_base_components( SELECT t1.status, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696 + SUM(t1.line_total) line_total_sum_e1f61696, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id """, @@ -4214,7 +4241,8 @@ async def test_nested_derived_window_metric_decomposes_to_base_components( t2.category, t3.week, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696 + SUM(t1.line_total) line_total_sum_e1f61696, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id @@ -4280,7 +4308,8 @@ async def test_nested_derived_cross_fact_decomposes_to_base_components( SELECT t2.category, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696 + SUM(t1.line_total) line_total_sum_e1f61696, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id @@ -4305,7 +4334,8 @@ async def test_nested_derived_cross_fact_decomposes_to_base_components( ) SELECT t2.category, t1.session_id, - COUNT(t1.view_id) view_id_count_f41e2db4 + COUNT(t1.view_id) view_id_count_f41e2db4, + t1.session_id session_id_distinct_e7c26e39 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.session_id """, @@ -6402,7 +6432,7 @@ async def test_nested_subquery_drops_fk_column(self, client_with_build_v3): ) AS wrap GROUP BY account_id, event_type ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_ccca78c0 FROM v3_events_nested t1 GROUP BY t1.event_type, t1.account_id """, @@ -6461,7 +6491,7 @@ async def test_self_join_filters_both_aliases(self, client_with_build_v3): WHERE a1.audit_date >= 20260101 AND a2.audit_date >= 20260101 ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_efcd5b4d FROM v3_events_self_join t1 GROUP BY t1.event_type, t1.account_id """, @@ -6528,7 +6558,7 @@ async def test_source_inside_inner_with_cte( SELECT account_id, event_type FROM v3_events_with_cte__inner_cte inner_cte ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_4062d29e FROM v3_events_with_cte t1 GROUP BY t1.event_type, t1.account_id """, @@ -6615,7 +6645,7 @@ async def test_source_inside_where_exists_subquery( AND log.audit_date >= 20260101 ) ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_7a788b39 FROM v3_events_exists t1 GROUP BY t1.event_type, t1.account_id """, @@ -6702,7 +6732,7 @@ async def test_set_op_with_nested_subquery_in_non_leading_arm( WHERE deep.audit_date >= 20260101 ) AS wrap ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_49f58cc7 FROM v3_events_union_nested t1 GROUP BY t1.event_type, t1.account_id """, @@ -6770,7 +6800,7 @@ async def test_multiple_filters_same_source_different_scopes( AND inner_src.audit_date <= 20260131 ) AS wrap ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_233abbf9 FROM v3_events_two_filters t1 GROUP BY t1.event_type, t1.account_id """, @@ -6835,7 +6865,7 @@ async def test_source_in_from_and_in_where_exists(self, client_with_build_v3): ) AND a.audit_date >= 20260101 ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_3d483180 FROM v3_events_from_exists t1 GROUP BY t1.event_type, t1.account_id """, @@ -6887,7 +6917,7 @@ async def test_source_without_alias(self, client_with_build_v3): FROM default.v3.audit_log_noalias WHERE src_audit_log_noalias.audit_date >= 20260101 ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_a0b1fd19 FROM v3_events_no_alias t1 GROUP BY t1.event_type, t1.account_id """, @@ -6965,7 +6995,7 @@ async def test_source_in_having_subquery(self, client_with_build_v3): WHERE log.audit_date >= 20260101 ) ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_4e8747df FROM v3_events_having t1 GROUP BY t1.event_type, t1.account_id """, @@ -7040,7 +7070,7 @@ async def test_source_in_select_list_scalar_subquery( WHERE log.audit_date >= 20260101) AS max_id FROM default.v3.acct_scalar AS acc ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_946a2255 FROM v3_events_scalar t1 GROUP BY t1.event_type, t1.account_id """, @@ -7181,7 +7211,7 @@ async def test_two_upstreams_linked_to_same_dim(self, client_with_build_v3): WHERE x.audit_date >= 20260101 AND y.audit_date >= 20260101 ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_ec521d7c FROM v3_events_two_upstreams t1 GROUP BY t1.event_type, t1.account_id """, @@ -7286,7 +7316,7 @@ async def test_transform_link_target(self, client_with_build_v3): SELECT account_id, event_type FROM v3_events_with_date ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_274a187b FROM v3_events_wrapping t1 GROUP BY t1.event_type, t1.account_id """, @@ -7385,7 +7415,7 @@ async def test_transform_link_plus_source_link_double_filter( v3_events_double_link_wrap AS ( SELECT account_id, event_type FROM v3_events_double_link ) - SELECT t1.event_type, t1.account_id + SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_b9c0262f FROM v3_events_double_link_wrap t1 GROUP BY t1.event_type, t1.account_id """, @@ -9806,3 +9836,357 @@ async def test_alias_substitution_skips_subquery_alias_collision( """, normalize_aliases=True, ) + + @pytest.mark.asyncio + async def test_count_distinct_plain_column_projects_hashed_alias( + self, + client_with_build_v3, + ): + """For ``COUNT(DISTINCT plain_col)``, the measures SQL must + project the column under BOTH the bare grain name and the + hashed identity, AND the combiner and persisted + ``derived_expression`` must reference the hashed identity — + otherwise consumers can't resolve the column. + """ + client = client_with_build_v3 + resp = await client.post( + "/nodes/source/", + json={ + "name": "v3.src_distinct_plain", + "columns": [ + {"name": "account_id", "type": "int"}, + {"name": "event_date", "type": "int"}, + ], + "mode": "published", + "catalog": "default", + "schema_": "v3", + "table": "distinct_plain_events", + }, + ) + assert resp.status_code in (200, 201), resp.json() + resp = await client.post( + "/nodes/transform/", + json={ + "name": "v3.distinct_plain_xform", + "query": ("SELECT account_id, event_date FROM v3.src_distinct_plain"), + "mode": "published", + "primary_key": ["account_id", "event_date"], + }, + ) + assert resp.status_code == 201, resp.json() + resp = await client.post( + "/nodes/metric/", + json={ + "name": "v3.distinct_plain_count", + "query": ( + "SELECT COUNT(DISTINCT account_id) FROM v3.distinct_plain_xform" + ), + "mode": "published", + }, + ) + assert resp.status_code == 201, resp.json() + + response = await client.get( + "/sql/measures/v3/", + params={ + "metrics": ["v3.distinct_plain_count"], + "dimensions": ["v3.distinct_plain_xform.event_date"], + }, + ) + assert response.status_code == 200, response.json() + body = response.json() + gg = body["grain_groups"][0] + + # Locate the hash-suffixed identity column emitted by the fix. + hashed_col = next( + ( + c["name"] + for c in gg["columns"] + if c["name"].startswith("account_id_distinct_") + ), + None, + ) + assert hashed_col is not None, ( + f"Expected an ``account_id_distinct_`` output column; " + f"got {[c['name'] for c in gg['columns']]}" + ) + + # SQL projects BOTH ``t1.account_id`` (grain/dim usage) AND + # ``t1.account_id AS account_id_distinct_`` (identity). + assert_sql_equal( + gg["sql"], + f""" + WITH v3_distinct_plain_xform AS ( + SELECT account_id, event_date + FROM default.v3.distinct_plain_events + ) + SELECT t1.event_date, + t1.account_id, + t1.account_id {hashed_col} + FROM v3_distinct_plain_xform t1 + GROUP BY t1.event_date, t1.account_id + """, + normalize_aliases=True, + ) + + # Combiner in the response and the metric node's persisted + # ``derived_expression`` both reference the hashed identity + # exactly. Both currently upper-case the function name; the + # contract is *equality between the two* — if either side + # drifts (different qualification, different case-folding, + # accidental rewrite), this test fails. + expected_combiner = f"COUNT( DISTINCT {hashed_col})" + assert body["metric_formulas"][0]["combiner"] == expected_combiner + metric_resp = await client.get("/metrics/v3.distinct_plain_count") + assert metric_resp.status_code == 200 + assert metric_resp.json()["derived_expression"] == expected_combiner + + @pytest.mark.asyncio + async def test_count_distinct_in_merged_grain_group_projects_hashed_alias( + self, + client_with_build_v3, + ): + """Same contract in the ``is_merged=True`` path (mixed- + aggregability metrics in one query). Pins that both LIMITED- + component branches in ``build_grain_group_sql`` route through + ``register_limited_component`` — the merged branch used to + skip the hashed projection independently. + """ + client = client_with_build_v3 + resp = await client.post( + "/nodes/source/", + json={ + "name": "v3.src_merged_distinct", + "columns": [ + {"name": "account_id", "type": "int"}, + {"name": "event_date", "type": "int"}, + {"name": "is_active", "type": "boolean"}, + {"name": "amount", "type": "double"}, + ], + "mode": "published", + "catalog": "default", + "schema_": "v3", + "table": "merged_distinct_events", + }, + ) + assert resp.status_code in (200, 201), resp.json() + resp = await client.post( + "/nodes/transform/", + json={ + "name": "v3.merged_distinct_xform", + "query": ( + "SELECT account_id, event_date, is_active, amount " + "FROM v3.src_merged_distinct" + ), + "mode": "published", + "primary_key": ["account_id", "event_date"], + }, + ) + assert resp.status_code == 201, resp.json() + # Plain-column DISTINCT (LIMITED, grain_alias=bare). + resp = await client.post( + "/nodes/metric/", + json={ + "name": "v3.merged_distinct_accounts", + "query": ( + "SELECT COUNT(DISTINCT account_id) FROM v3.merged_distinct_xform" + ), + "mode": "published", + }, + ) + assert resp.status_code == 201, resp.json() + # Complex DISTINCT (LIMITED, grain_alias==component.name). + resp = await client.post( + "/nodes/metric/", + json={ + "name": "v3.merged_distinct_active_accounts", + "query": ( + "SELECT COUNT(DISTINCT CASE WHEN is_active " + "THEN account_id ELSE NULL END) " + "FROM v3.merged_distinct_xform" + ), + "mode": "published", + }, + ) + assert resp.status_code == 201, resp.json() + # FULL aggregability metric — forces the grain group to merge + # across aggregability levels (drives is_merged=True). + resp = await client.post( + "/nodes/metric/", + json={ + "name": "v3.merged_distinct_total_amount", + "query": "SELECT SUM(amount) FROM v3.merged_distinct_xform", + "mode": "published", + }, + ) + assert resp.status_code == 201, resp.json() + + response = await client.get( + "/sql/measures/v3/", + params={ + "metrics": [ + "v3.merged_distinct_accounts", + "v3.merged_distinct_active_accounts", + "v3.merged_distinct_total_amount", + ], + "dimensions": ["v3.merged_distinct_xform.event_date"], + }, + ) + assert response.status_code == 200, response.json() + body = response.json() + + # The merged grain group is the LIMITED one (FULL gets merged + # in at the LIMITED level). Locate the hashed identity + # columns we'll substitute into the expected SQL & combiners. + gg = next( + g + for g in body["grain_groups"] + if "v3.merged_distinct_accounts" in g["metrics"] + ) + cols = [c["name"] for c in gg["columns"]] + # Pin uniqueness of each hashed identity so a future + # decompose-side naming change (e.g., a uniformity refactor + # that drops a column-name prefix) breaks the test rather + # than silently passing with both `next()` calls returning + # the same column. + plain_matches = [ + n + for n in cols + if n.startswith("account_id_distinct_") + and not n.startswith("is_active_account_id_distinct_") + ] + complex_matches = [ + n for n in cols if n.startswith("is_active_account_id_distinct_") + ] + amount_matches = [n for n in cols if n.startswith("amount_sum_")] + assert len(plain_matches) == 1, plain_matches + assert len(complex_matches) == 1, complex_matches + assert len(amount_matches) == 1, amount_matches + plain_hashed = plain_matches[0] + complex_hashed = complex_matches[0] + amount_sum_hashed = amount_matches[0] + assert plain_hashed != complex_hashed + + assert_sql_equal( + gg["sql"], + f""" + WITH v3_merged_distinct_xform AS ( + SELECT account_id, event_date, is_active, amount + FROM default.v3.merged_distinct_events + ) + SELECT t1.event_date, + CASE WHEN t1.is_active THEN t1.account_id ELSE NULL END {complex_hashed}, + t1.account_id, + SUM(t1.amount) {amount_sum_hashed}, + t1.account_id {plain_hashed} + FROM v3_merged_distinct_xform t1 + GROUP BY t1.event_date, {complex_hashed}, t1.account_id + """, + normalize_aliases=True, + ) + + combiners_by_metric = { + f["name"]: f["combiner"] for f in body["metric_formulas"] + } + assert combiners_by_metric == { + "v3.merged_distinct_accounts": f"COUNT( DISTINCT {plain_hashed})", + "v3.merged_distinct_active_accounts": f"COUNT( DISTINCT {complex_hashed})", + "v3.merged_distinct_total_amount": f"SUM({amount_sum_hashed})", + } + + @pytest.mark.asyncio + async def test_count_distinct_dedup_across_shared_components( + self, + client_with_build_v3, + ): + """Two metrics that decompose to the same LIMITED component + must produce a single `` AS `` projection, not + two — duplicate column names break SQL execution. Dedup is + held by ``seen_components`` in ``build_grain_group_sql``. + """ + client = client_with_build_v3 + resp = await client.post( + "/nodes/source/", + json={ + "name": "v3.src_dedup_distinct", + "columns": [ + {"name": "account_id", "type": "int"}, + {"name": "event_date", "type": "int"}, + ], + "mode": "published", + "catalog": "default", + "schema_": "v3", + "table": "dedup_distinct_events", + }, + ) + assert resp.status_code in (200, 201), resp.json() + resp = await client.post( + "/nodes/transform/", + json={ + "name": "v3.dedup_distinct_xform", + "query": "SELECT account_id, event_date FROM v3.src_dedup_distinct", + "mode": "published", + "primary_key": ["account_id", "event_date"], + }, + ) + assert resp.status_code == 201, resp.json() + # Two metrics with identical decomposition — both produce a + # COUNT(DISTINCT account_id) component with the same hash. + for name in ("v3.dedup_unique_accounts", "v3.dedup_distinct_accounts"): + resp = await client.post( + "/nodes/metric/", + json={ + "name": name, + "query": ( + "SELECT COUNT(DISTINCT account_id) FROM v3.dedup_distinct_xform" + ), + "mode": "published", + }, + ) + assert resp.status_code == 201, resp.json() + + response = await client.get( + "/sql/measures/v3/", + params={ + "metrics": [ + "v3.dedup_unique_accounts", + "v3.dedup_distinct_accounts", + ], + "dimensions": ["v3.dedup_distinct_xform.event_date"], + }, + ) + assert response.status_code == 200, response.json() + body = response.json() + gg = body["grain_groups"][0] + cols = [c["name"] for c in gg["columns"]] + + # The hashed identity column appears exactly once even though + # two metrics reference it. + hashed_matches = [n for n in cols if n.startswith("account_id_distinct_")] + assert len(hashed_matches) == 1, hashed_matches + hashed_col = hashed_matches[0] + + assert_sql_equal( + gg["sql"], + f""" + WITH v3_dedup_distinct_xform AS ( + SELECT account_id, event_date + FROM default.v3.dedup_distinct_events + ) + SELECT t1.event_date, + t1.account_id, + t1.account_id {hashed_col} + FROM v3_dedup_distinct_xform t1 + GROUP BY t1.event_date, t1.account_id + """, + normalize_aliases=True, + ) + + # Both metric formulas reference the same hashed column. + combiners_by_metric = { + f["name"]: f["combiner"] for f in body["metric_formulas"] + } + assert combiners_by_metric == { + "v3.dedup_unique_accounts": f"COUNT( DISTINCT {hashed_col})", + "v3.dedup_distinct_accounts": f"COUNT( DISTINCT {hashed_col})", + } diff --git a/datajunction-server/tests/construction/build_v3/metrics_sql_test.py b/datajunction-server/tests/construction/build_v3/metrics_sql_test.py index dfae474a6..b74d0b91a 100644 --- a/datajunction-server/tests/construction/build_v3/metrics_sql_test.py +++ b/datajunction-server/tests/construction/build_v3/metrics_sql_test.py @@ -242,7 +242,9 @@ async def test_derived_metric_ratio(self, client_with_build_v3): FROM default.v3.page_views ), order_details_0 AS ( - SELECT t2.category, t1.order_id + SELECT t2.category, + t1.order_id, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id @@ -255,7 +257,9 @@ async def test_derived_metric_ratio(self, client_with_build_v3): GROUP BY category ), page_views_enriched_0 AS ( - SELECT t2.category, t1.customer_id + SELECT t2.category, + t1.customer_id, + t1.customer_id customer_id_distinct_dd4be7a5 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.customer_id @@ -326,13 +330,13 @@ async def test_multiple_derived_metrics_same_fact(self, client_with_build_v3): JOIN default.v3.order_items oi ON o.order_id = oi.order_id ), order_details_0 AS ( - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, SUM(t1.quantity) quantity_sum_06b64d2e + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, SUM(t1.quantity) quantity_sum_06b64d2e, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) AS avg_order_value, - SUM(order_details_0.quantity_sum_06b64d2e) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) AS avg_items_per_order + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value, + SUM(order_details_0.quantity_sum_06b64d2e) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_items_per_order FROM order_details_0 GROUP BY order_details_0.status """, @@ -412,7 +416,8 @@ async def test_derived_metrics_cross_fact(self, client_with_build_v3): SELECT t2.category, t3.name name_customer, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696 + SUM(t1.line_total) line_total_sum_e1f61696, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_customer t3 ON t1.customer_id = t3.customer_id GROUP BY t2.category, t3.name, t1.order_id @@ -421,7 +426,8 @@ async def test_derived_metrics_cross_fact(self, client_with_build_v3): SELECT t2.category, t3.name name_customer, t1.customer_id, - COUNT(t1.view_id) view_id_count_f41e2db4 + COUNT(t1.view_id) view_id_count_f41e2db4, + t1.customer_id customer_id_distinct_dd4be7a5 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_customer t3 ON t1.customer_id = t3.customer_id GROUP BY t2.category, t3.name, t1.customer_id @@ -429,8 +435,8 @@ async def test_derived_metrics_cross_fact(self, client_with_build_v3): SELECT COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, COALESCE(order_details_0.name_customer, page_views_enriched_0.name_customer) AS name_customer, - CAST(COUNT( DISTINCT order_details_0.order_id) AS DOUBLE) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id), 0) AS conversion_rate, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id), 0) AS revenue_per_visitor, + CAST(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS DOUBLE) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5), 0) AS conversion_rate, + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5), 0) AS revenue_per_visitor, SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(SUM(page_views_enriched_0.view_id_count_f41e2db4), 0) AS revenue_per_page_view FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.category = page_views_enriched_0.category AND order_details_0.name_customer = page_views_enriched_0.name_customer GROUP BY 1, 2 @@ -511,13 +517,14 @@ async def test_pages_per_session_same_fact_derived(self, client_with_build_v3): page_views_enriched_0 AS ( SELECT t2.category, t1.session_id, - COUNT(t1.view_id) view_id_count_f41e2db4 + COUNT(t1.view_id) view_id_count_f41e2db4, + t1.session_id session_id_distinct_e7c26e39 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.session_id ) SELECT page_views_enriched_0.category AS category, - SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id), 0) AS pages_per_session + SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39), 0) AS pages_per_session FROM page_views_enriched_0 GROUP BY page_views_enriched_0.category """, @@ -1419,7 +1426,8 @@ async def test_period_over_period_metrics(self, client_with_build_v3): t3.month, t3.week, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696 + SUM(t1.line_total) line_total_sum_e1f61696, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id @@ -1430,7 +1438,7 @@ async def test_period_over_period_metrics(self, client_with_build_v3): order_details_0.category AS category, order_details_0.month AS month, order_details_0.week AS week, - COUNT(DISTINCT order_details_0.order_id) AS order_count, + COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue FROM order_details_0 GROUP BY order_details_0.category, order_details_0.month, order_details_0.week @@ -1439,7 +1447,7 @@ async def test_period_over_period_metrics(self, client_with_build_v3): SELECT order_details_0.category AS category, order_details_0.week AS week, - COUNT(DISTINCT order_details_0.order_id) AS order_count, + COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue FROM order_details_0 GROUP BY order_details_0.category, order_details_0.week @@ -1619,7 +1627,7 @@ async def test_cross_fact_full_plus_limited_fan_out( GROUP BY t2.category ), page_views_enriched_0 AS ( - SELECT t2.category, t1.customer_id + SELECT t2.category, t1.customer_id, t1.customer_id customer_id_distinct_dd4be7a5 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.customer_id @@ -1683,13 +1691,14 @@ async def test_derived_metric_with_base_metric_in_same_query( SELECT t1.status, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696 + SUM(t1.line_total) line_total_sum_e1f61696, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id), 0) AS avg_order_value, + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue FROM order_details_0 GROUP BY order_details_0.status @@ -2143,12 +2152,12 @@ async def test_nested_derived_metric_simple(self, client_with_build_v3): JOIN default.v3.order_items oi ON o.order_id = oi.order_id ), order_details_0 AS ( - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) / 50.0 * 100 AS aov_growth_index + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) / 50.0 * 100 AS aov_growth_index FROM order_details_0 GROUP BY order_details_0.status """, @@ -2208,7 +2217,7 @@ async def test_nested_derived_metric_with_window_function( FROM default.v3.products ), order_details_0 AS ( - SELECT t2.category, t3.week, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 + SELECT t2.category, t3.week, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id @@ -2218,9 +2227,9 @@ async def test_nested_derived_metric_with_window_function( SELECT order_details_0.category AS category, order_details_0.week AS week, - COUNT(DISTINCT order_details_0.order_id) AS order_count, + COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id), 0) AS avg_order_value + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value FROM order_details_0 GROUP BY order_details_0.category, order_details_0.week ) @@ -2300,20 +2309,20 @@ async def test_nested_derived_metric_cross_fact(self, client_with_build_v3): FROM default.v3.page_views ), order_details_0 AS ( - SELECT t2.category, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 + SELECT t2.category, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id ), page_views_enriched_0 AS ( - SELECT t2.category, t1.session_id, COUNT(t1.view_id) view_id_count_f41e2db4 + SELECT t2.category, t1.session_id, COUNT(t1.view_id) view_id_count_f41e2db4, t1.session_id session_id_distinct_e7c26e39 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.session_id ) SELECT COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) - / NULLIF(SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT(DISTINCT page_views_enriched_0.session_id), 0), 0) AS efficiency_ratio + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) + / NULLIF(SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT(DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39), 0), 0) AS efficiency_ratio FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.category = page_views_enriched_0.category GROUP BY 1 @@ -2683,7 +2692,8 @@ async def test_wow_with_count_distinct_at_daily_grain(self, client_with_build_v3 t2.category, t3.week, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696 + SUM(t1.line_total) line_total_sum_e1f61696, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id @@ -2694,7 +2704,7 @@ async def test_wow_with_count_distinct_at_daily_grain(self, client_with_build_v3 order_details_0.date_id_order AS date_id_order, order_details_0.category AS category, order_details_0.week AS week, - COUNT(DISTINCT order_details_0.order_id) AS order_count, + COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue FROM order_details_0 GROUP BY order_details_0.date_id_order, order_details_0.category, order_details_0.week @@ -2703,7 +2713,7 @@ async def test_wow_with_count_distinct_at_daily_grain(self, client_with_build_v3 SELECT order_details_0.category AS category, order_details_0.week AS week, - COUNT(DISTINCT order_details_0.order_id) AS order_count, + COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue FROM order_details_0 GROUP BY order_details_0.category, order_details_0.week @@ -2880,14 +2890,14 @@ async def test_cross_fact_wow_conversion_rate(self, client_with_build_v3): FROM default.v3.page_views ), order_details_0 AS ( - SELECT t2.category, t3.week, t1.order_id + SELECT t2.category, t3.week, t1.order_id, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id GROUP BY t2.category, t3.week, t1.order_id ), page_views_enriched_0 AS ( - SELECT t2.category, t3.week, t1.customer_id + SELECT t2.category, t3.week, t1.customer_id, t1.customer_id customer_id_distinct_dd4be7a5 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.page_date = t3.date_id @@ -2897,9 +2907,9 @@ async def test_cross_fact_wow_conversion_rate(self, client_with_build_v3): SELECT COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, COALESCE(order_details_0.week, page_views_enriched_0.week) AS week, - COUNT(DISTINCT order_details_0.order_id) AS order_count, - COUNT(DISTINCT page_views_enriched_0.customer_id) AS visitor_count, - CAST(COUNT(DISTINCT order_details_0.order_id) AS DOUBLE) / NULLIF(COUNT(DISTINCT page_views_enriched_0.customer_id), 0) AS conversion_rate + COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, + COUNT(DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5) AS visitor_count, + CAST(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS DOUBLE) / NULLIF(COUNT(DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5), 0) AS conversion_rate FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.category = page_views_enriched_0.category AND order_details_0.week = page_views_enriched_0.week GROUP BY 1, 2 @@ -3021,7 +3031,8 @@ async def test_cross_fact_window_metric_with_finer_grain( SELECT COALESCE(t1.order_date, t3.date_id) AS date_id, t2.category, t3.week, - t1.order_id + t1.order_id, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id GROUP BY COALESCE(t1.order_date, t3.date_id), t2.category, t3.week, t1.order_id @@ -3030,7 +3041,8 @@ async def test_cross_fact_window_metric_with_finer_grain( SELECT COALESCE(t1.page_date, t3.date_id) AS date_id, t2.category, t3.week, - t1.customer_id + t1.customer_id, + t1.customer_id customer_id_distinct_dd4be7a5 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.page_date = t3.date_id GROUP BY COALESCE(t1.page_date, t3.date_id), t2.category, t3.week, t1.customer_id @@ -3039,9 +3051,9 @@ async def test_cross_fact_window_metric_with_finer_grain( SELECT COALESCE(order_details_0.date_id, page_views_enriched_0.date_id) AS date_id, COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, COALESCE(order_details_0.week, page_views_enriched_0.week) AS week, - COUNT( DISTINCT order_details_0.order_id) AS order_count, - COUNT( DISTINCT page_views_enriched_0.customer_id) AS visitor_count, - CAST(COUNT( DISTINCT order_details_0.order_id) AS DOUBLE) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id), 0) AS conversion_rate + COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, + COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5) AS visitor_count, + CAST(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS DOUBLE) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5), 0) AS conversion_rate FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.date_id = page_views_enriched_0.date_id AND order_details_0.category = page_views_enriched_0.category AND order_details_0.week = page_views_enriched_0.week GROUP BY 1, 2, 3 ) @@ -3130,7 +3142,8 @@ async def test_cross_fact_window_on_derived_metric(self, client_with_build_v3): t2.category, t3.week, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696 + SUM(t1.line_total) line_total_sum_e1f61696, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id GROUP BY COALESCE(t1.order_date, t3.date_id), t2.category, t3.week, t1.order_id @@ -3140,7 +3153,8 @@ async def test_cross_fact_window_on_derived_metric(self, client_with_build_v3): t2.category, t3.week, t1.session_id, - COUNT(t1.view_id) view_id_count_f41e2db4 + COUNT(t1.view_id) view_id_count_f41e2db4, + t1.session_id session_id_distinct_e7c26e39 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.page_date = t3.date_id GROUP BY COALESCE(t1.page_date, t3.date_id), t2.category, t3.week, t1.session_id @@ -3149,13 +3163,13 @@ async def test_cross_fact_window_on_derived_metric(self, client_with_build_v3): SELECT COALESCE(order_details_0.date_id, page_views_enriched_0.date_id) AS date_id, COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, COALESCE(order_details_0.week, page_views_enriched_0.week) AS week, - COUNT( DISTINCT order_details_0.order_id) AS order_count, + COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, SUM(page_views_enriched_0.view_id_count_f41e2db4) AS page_view_count, - COUNT( DISTINCT page_views_enriched_0.session_id) AS session_count, + COUNT( DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39) AS session_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id), 0) AS avg_order_value, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id), 0) / NULLIF(SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id), 0), 0) AS efficiency_ratio, - SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id), 0) AS pages_per_session + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value, + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) / NULLIF(SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39), 0), 0) AS efficiency_ratio, + SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39), 0) AS pages_per_session FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.date_id = page_views_enriched_0.date_id AND order_details_0.category = page_views_enriched_0.category AND order_details_0.week = page_views_enriched_0.week GROUP BY 1, 2, 3 ) @@ -3244,7 +3258,8 @@ async def test_cross_fact_window_on_base_metrics(self, client_with_build_v3): SELECT COALESCE(t1.order_date, t3.date_id) AS date_id, t2.category, t3.week, - t1.order_id + t1.order_id, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id GROUP BY COALESCE(t1.order_date, t3.date_id), t2.category, t3.week, t1.order_id @@ -3253,7 +3268,8 @@ async def test_cross_fact_window_on_base_metrics(self, client_with_build_v3): SELECT COALESCE(t1.page_date, t3.date_id) AS date_id, t2.category, t3.week, - t1.customer_id + t1.customer_id, + t1.customer_id customer_id_distinct_dd4be7a5 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.page_date = t3.date_id GROUP BY COALESCE(t1.page_date, t3.date_id), t2.category, t3.week, t1.customer_id @@ -3262,8 +3278,8 @@ async def test_cross_fact_window_on_base_metrics(self, client_with_build_v3): SELECT COALESCE(order_details_0.date_id, page_views_enriched_0.date_id) AS date_id, COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, COALESCE(order_details_0.week, page_views_enriched_0.week) AS week, - COUNT( DISTINCT order_details_0.order_id) AS order_count, - COUNT( DISTINCT page_views_enriched_0.customer_id) AS visitor_count + COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, + COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5) AS visitor_count FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.date_id = page_views_enriched_0.date_id AND order_details_0.category = page_views_enriched_0.category AND order_details_0.week = page_views_enriched_0.week GROUP BY 1, 2, 3 ), @@ -3383,7 +3399,8 @@ async def test_multi_fact_window_metrics_same_grain(self, client_with_build_v3): SELECT t2.category, t3.week, t1.session_id, - COUNT(t1.view_id) view_id_count_f41e2db4 + COUNT(t1.view_id) view_id_count_f41e2db4, + t1.session_id session_id_distinct_e7c26e39 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.page_date = t3.date_id GROUP BY t2.category, t3.week, t1.session_id @@ -3392,9 +3409,9 @@ async def test_multi_fact_window_metrics_same_grain(self, client_with_build_v3): SELECT COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, COALESCE(order_details_0.week, page_views_enriched_0.week) AS week, SUM(page_views_enriched_0.view_id_count_f41e2db4) AS page_view_count, - COUNT( DISTINCT page_views_enriched_0.session_id) AS session_count, + COUNT( DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39) AS session_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id), 0) AS pages_per_session + SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39), 0) AS pages_per_session FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.category = page_views_enriched_0.category AND order_details_0.week = page_views_enriched_0.week GROUP BY 1, 2 ) @@ -3605,13 +3622,14 @@ async def test_order_by_multiple_columns(self, client_with_build_v3): order_details_0 AS ( SELECT t1.status, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696 + SUM(t1.line_total) line_total_sum_e1f61696, + t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - COUNT(DISTINCT order_details_0.order_id) AS order_count + COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count FROM order_details_0 GROUP BY order_details_0.status ORDER BY status ASC, total_revenue DESC @@ -5103,13 +5121,13 @@ async def test_full_and_limited_metrics_same_fact_computed_at_limited_grain( JOIN default.v3.order_items oi ON o.order_id = oi.order_id ), order_details_0 AS ( - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - COUNT(DISTINCT order_details_0.order_id) AS order_count + COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count FROM order_details_0 GROUP BY order_details_0.status """, diff --git a/datajunction-server/tests/construction/build_v3/preagg_substitution_test.py b/datajunction-server/tests/construction/build_v3/preagg_substitution_test.py index 1849dcf8a..77e91dc25 100644 --- a/datajunction-server/tests/construction/build_v3/preagg_substitution_test.py +++ b/datajunction-server/tests/construction/build_v3/preagg_substitution_test.py @@ -145,7 +145,7 @@ async def test_limited_metric_no_preagg(self, client_with_build_v3): FROM default.v3.orders o JOIN default.v3.order_items oi ON o.order_id = oi.order_id ) - SELECT t1.status, t1.order_id + SELECT t1.status, t1.order_id, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id """, @@ -173,12 +173,12 @@ async def test_limited_metric_no_preagg(self, client_with_build_v3): JOIN default.v3.order_items oi ON o.order_id = oi.order_id ), order_details_0 AS ( - SELECT t1.status, t1.order_id + SELECT t1.status, t1.order_id, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, - COUNT(DISTINCT order_details_0.order_id) AS order_count + COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count FROM order_details_0 GROUP BY order_details_0.status """, @@ -218,7 +218,7 @@ async def test_derived_metric_no_preagg(self, client_with_build_v3): FROM default.v3.orders o JOIN default.v3.order_items oi ON o.order_id = oi.order_id ) - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id """, @@ -246,12 +246,12 @@ async def test_derived_metric_no_preagg(self, client_with_build_v3): JOIN default.v3.order_items oi ON o.order_id = oi.order_id ), order_details_0 AS ( - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) AS avg_order_value + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value FROM order_details_0 GROUP BY order_details_0.status """, @@ -386,12 +386,12 @@ async def test_derived_metric_with_preagg(self, client_with_build_v3): WITH order_details_0 AS ( SELECT status, SUM(line_total_sum_e1f61696) line_total_sum_e1f61696, - order_id + order_id_distinct_f93d50ab FROM warehouse.preaggs.v3_order_metrics - GROUP BY status, order_id + GROUP BY status, order_id_distinct_f93d50ab ) SELECT order_details_0.status AS status, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) AS avg_order_value + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value FROM order_details_0 GROUP BY order_details_0.status """, @@ -574,13 +574,13 @@ async def test_cross_fact_with_partial_preagg(self, client_with_build_v3): GROUP BY customer_id ), page_views_enriched_0 AS ( - SELECT t1.customer_id + SELECT t1.customer_id, t1.customer_id customer_id_distinct_dd4be7a5 FROM v3_page_views_enriched t1 GROUP BY t1.customer_id ) SELECT COALESCE(order_details_0.customer_id, page_views_enriched_0.customer_id) AS customer_id, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id), 0) AS revenue_per_visitor + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5), 0) AS revenue_per_visitor FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.customer_id = page_views_enriched_0.customer_id GROUP BY 1 """, @@ -655,12 +655,12 @@ async def test_cross_fact_with_full_preagg_coverage(self, client_with_build_v3): GROUP BY customer_id ), page_views_enriched_0 AS ( - SELECT t1.customer_id + SELECT t1.customer_id, t1.customer_id customer_id_distinct_dd4be7a5 FROM v3_page_views_enriched t1 GROUP BY t1.customer_id ) SELECT COALESCE(order_details_0.customer_id, page_views_enriched_0.customer_id) AS customer_id, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id), 0) AS revenue_per_visitor + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5), 0) AS revenue_per_visitor FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.customer_id = page_views_enriched_0.customer_id GROUP BY 1 """, @@ -913,3 +913,94 @@ def test_deduplicates_repeated_components(self): ) # Only one component should appear in output despite two in input assert len(result.components) == 1 + + def test_aliases_legacy_bare_measure_col_to_hashed_component_name(self): + """build_grain_group_from_preagg aliases the pre-agg's measure + column to ``component.name`` when they differ. + + Legacy v2 pre-aggs (materialized before plain-column + COUNT(DISTINCT) projected the hashed identity) carry the + column under the bare grain name (e.g. ``account_id``) while + the component identity is hashed (``account_id_distinct_``). + ``component_aliases`` and the CTE's output column must be the + hashed name so the persisted ``derived_expression`` resolves — + the pre-agg path emits `` AS ``. + """ + from datajunction_server.database.availabilitystate import AvailabilityState + + node = self._make_node() + # Plain-column COUNT(DISTINCT account_id) component: name is + # hashed identity, expression is the bare column, LIMITED with + # no aggregation, and merge is None. + component = MetricComponent( + name="account_id_distinct_abc123", + expression="account_id", + aggregation=None, + merge=None, + rule=AggregationRule(type=Aggregability.LIMITED), + grain_alias="account_id", + ) + grain_group = GrainGroup( + parent_node=node, + aggregability=Aggregability.LIMITED, + grain_columns=[], + components=[(node, component)], + ) + + # Legacy pre-agg stores the measure under the BARE grain name. + # expr_hash matches the component so the measure is found. + expr_hash = compute_expression_hash("account_id") + measure = PreAggMeasure( + name="account_id", # legacy bare-named column + expression="account_id", + aggregation=None, + merge=None, + rule=AggregationRule(type=Aggregability.LIMITED), + expr_hash=expr_hash, + ) + avail = AvailabilityState( + catalog="wh", + schema_="preaggs", + table="legacy_tbl", + valid_through_ts=99999, + ) + preagg = PreAggregation( + node_revision_id=1, + grain_columns=[], + measures=[measure], + sql="SELECT 1", + grain_group_hash="abc", + preagg_hash="def", + availability=avail, + ) + + result = build_grain_group_from_preagg( + self._make_ctx(), + grain_group, + preagg, + resolved_dimensions=[], + components_per_metric={}, + ) + # component_aliases routes the hashed identity at component.name + # (matching the live-build path) so downstream combiner + # rewriters reference the hashed column consistently. + assert result.component_aliases == { + "account_id_distinct_abc123": "account_id_distinct_abc123", + } + # The CTE output exposes ``account_id_distinct_abc123`` even + # though the pre-agg physically stores ``account_id``. + column_names = {c.name for c in result.columns} + assert "account_id_distinct_abc123" in column_names + assert "account_id" not in column_names + # The generated SQL selects the bare physical ``account_id`` + # column from the legacy pre-agg and aliases it to the hashed + # identity. ``assert_sql_equal`` ignores whitespace and the + # optional ``AS`` keyword in alias syntax. + assert_sql_equal( + str(result.query), + """ + SELECT account_id account_id_distinct_abc123 + FROM wh.preaggs.legacy_tbl + GROUP BY account_id + """, + ) diff --git a/datajunction-server/tests/construction/build_v3/set_operations_test.py b/datajunction-server/tests/construction/build_v3/set_operations_test.py index 5097e3480..66a601ed9 100644 --- a/datajunction-server/tests/construction/build_v3/set_operations_test.py +++ b/datajunction-server/tests/construction/build_v3/set_operations_test.py @@ -89,12 +89,12 @@ async def test_union_all_transform_metric_generates_sql( WHERE status = 'shipped' ), orders_unified_0 AS ( - SELECT t1.status, t1.order_id + SELECT t1.status, t1.order_id, t1.order_id order_id_distinct_a853fcda FROM v3_orders_unified t1 GROUP BY t1.status, t1.order_id ) SELECT orders_unified_0.status AS status, - COUNT(DISTINCT orders_unified_0.order_id) AS unified_order_count + COUNT(DISTINCT orders_unified_0.order_id_distinct_a853fcda) AS unified_order_count FROM orders_unified_0 GROUP BY orders_unified_0.status """, @@ -138,12 +138,12 @@ async def test_union_transform_filter_pushed_into_primary_arm_only( WHERE status = 'shipped' ), orders_unified_0 AS ( - SELECT t1.status, t1.order_id + SELECT t1.status, t1.order_id, t1.order_id order_id_distinct_a853fcda FROM v3_orders_unified t1 GROUP BY t1.status, t1.order_id ) SELECT orders_unified_0.status AS status, - COUNT(DISTINCT orders_unified_0.order_id) AS unified_order_count + COUNT(DISTINCT orders_unified_0.order_id_distinct_a853fcda) AS unified_order_count FROM orders_unified_0 WHERE orders_unified_0.status = 'completed' GROUP BY orders_unified_0.status @@ -276,7 +276,7 @@ async def test_filter_only_dim_link_on_one_arm_source_only( FROM default.v3.orders_us WHERE src_orders_us.order_date >= 20260101 ) - SELECT t1.status, t1.order_id + SELECT t1.status, t1.order_id, t1.order_id order_id_distinct_3d2cb4c8 FROM v3_orders_global t1 GROUP BY t1.status, t1.order_id """, diff --git a/datajunction-server/tests/construction/build_v3/test_self_join_sql_generation.py b/datajunction-server/tests/construction/build_v3/test_self_join_sql_generation.py index d86e1cb73..9ece9f096 100644 --- a/datajunction-server/tests/construction/build_v3/test_self_join_sql_generation.py +++ b/datajunction-server/tests/construction/build_v3/test_self_join_sql_generation.py @@ -92,7 +92,8 @@ async def test_direct_self_join_employee_manager( ) SELECT t2.employee_name employee_name_manager, - t1.employee_id + t1.employee_id, + t1.employee_id employee_id_distinct_d13d9843 FROM default_employee t1 LEFT OUTER JOIN default_employee t2 ON t1.manager_employee_id = t2.employee_id GROUP BY @@ -125,7 +126,8 @@ async def test_direct_self_join_employee_manager( employee_0 AS ( SELECT t2.employee_name employee_name_manager, - t1.employee_id + t1.employee_id, + t1.employee_id employee_id_distinct_d13d9843 FROM default_employee t1 LEFT OUTER JOIN default_employee t2 ON t1.manager_employee_id = t2.employee_id GROUP BY @@ -133,7 +135,7 @@ async def test_direct_self_join_employee_manager( ) SELECT employee_0.employee_name_manager AS employee_name_manager, - COUNT(DISTINCT employee_0.employee_id) AS employee_count + COUNT(DISTINCT employee_0.employee_id_distinct_d13d9843) AS employee_count FROM employee_0 GROUP BY employee_0.employee_name_manager