diff --git a/datajunction-server/datajunction_server/construction/build_v3/combiners.py b/datajunction-server/datajunction_server/construction/build_v3/combiners.py index 033c92478..8aa092ea9 100644 --- a/datajunction-server/datajunction_server/construction/build_v3/combiners.py +++ b/datajunction-server/datajunction_server/construction/build_v3/combiners.py @@ -846,17 +846,12 @@ def _build_grain_group_from_preagg_table( if col.semantic_type in ("metric_component", "measure", "metric"): col_ref = ast.Column(name=ast.Name(col.name)) - # Find the component to get the merge function. The third - # clause matches legacy pre-agg / grain-group shapes where - # the LIMITED-DISTINCT column lived under the bare grain - # name (before the ` AS ` projection landed) — - # keeps re-aggregation correct against stale materializations. + # Find the component to get the merge function merge_func = None for comp in original_gg.components: if ( # pragma: no branch comp.name == col.name or original_gg.component_aliases.get(comp.name) == col.name - or comp.grain_alias == col.name ): merge_func = comp.merge break diff --git a/datajunction-server/datajunction_server/construction/build_v3/cube_matcher.py b/datajunction-server/datajunction_server/construction/build_v3/cube_matcher.py index 880d64ffc..59c351fd4 100644 --- a/datajunction-server/datajunction_server/construction/build_v3/cube_matcher.py +++ b/datajunction-server/datajunction_server/construction/build_v3/cube_matcher.py @@ -486,19 +486,7 @@ def build_synthetic_grain_group( for comp in decomposed.components: if comp.name not in component_aliases: # pragma: no branch - # Legacy compat: cubes materialized before LIMITED-DISTINCT - # gained its ` AS ` projection had the column - # stored under the bare grain name (e.g. ``order_id``) - # rather than the hashed identity (``order_id_distinct_``). - # Try the hashed name first (current shape), then fall back - # to the grain_alias (legacy shape) before defaulting to - # ``comp.name``. Keeps stale cube configs queryable until - # they're re-materialized. - cube_col_name = mat_col_lookup.get(comp.name) - if cube_col_name is None and comp.grain_alias: - cube_col_name = mat_col_lookup.get(comp.grain_alias) - if cube_col_name is None: - cube_col_name = comp.name + cube_col_name = mat_col_lookup.get(comp.name, comp.name) component_aliases[comp.name] = cube_col_name all_components.append(comp) diff --git a/datajunction-server/datajunction_server/construction/build_v3/measures.py b/datajunction-server/datajunction_server/construction/build_v3/measures.py index 3108066c6..c2fd8df64 100644 --- a/datajunction-server/datajunction_server/construction/build_v3/measures.py +++ b/datajunction-server/datajunction_server/construction/build_v3/measures.py @@ -164,43 +164,6 @@ def _parse_type_string(type_str: str | None) -> ct.ColumnType | None: return _TYPE_STRING_MAP.get(normalized) -def register_limited_component( - component: MetricComponent, - metric_node: Node, - component_aliases: dict[str, str], - component_expressions: list[tuple[str, ast.Expression]], - component_metadata: list[tuple[str, MetricComponent, Node]], -) -> None: - """Register a LIMITED component (e.g. ``COUNT(DISTINCT)``). - - For plain-column DISTINCT, ``grain_alias`` is the bare column and - differs from ``component.name`` (the hashed identity). Emit an - extra `` AS `` projection so the hashed name - is addressable alongside the bare grain column, and route - ``component_aliases`` at the hashed name so combiner rewriters - stay consistent. Both call sites in ``build_grain_group_sql`` must - use this helper to keep the contract in one place. - """ - # The preagg-read path assumes LIMITED.merge is None to skip - # re-aggregation; a future LIMITED+merge variant would wrap the - # merge over the bare grain column and produce wrong SQL. - assert component.merge is None, ( - f"LIMITED component {component.name!r} has merge={component.merge!r}; " - f"update the preagg-read path in combiners.py before adding this." - ) - grain_alias = component.grain_alias or component.name - if grain_alias != component.name: - component_aliases[component.name] = component.name - component_expressions.append( - (component.name, make_column_ref(grain_alias)), - ) - component_metadata.append( - (component.name, component, metric_node), - ) - else: - component_aliases[component.name] = grain_alias - - def infer_component_type( component: MetricComponent, metric_type: str, @@ -1784,14 +1747,10 @@ def build_grain_group_from_preagg( f"Component {component.name} not found in pre-agg {preagg.id}", ) - # Always alias the output as ``component.name`` (the hashed - # identity) so the CTE's output column name matches the - # metric's persisted ``derived_expression`` regardless of how - # the pre-agg physically stored it. Legacy pre-aggs - # materialized before the LIMITED-DISTINCT change carry the - # column under the bare grain name; we read that bare column - # and alias it back to the hashed identity here. - output_alias = component.name + # Always use the measure column name (component hash) as the output alias + # This ensures consistency with the non-preagg path + output_alias = measure_col + component_aliases[component.name] = output_alias col_ref = ast.Column(name=ast.Name(measure_col)) @@ -1806,17 +1765,11 @@ def build_grain_group_from_preagg( aliased = ast.Alias(child=agg_expr, alias=ast.Name(output_alias)) select_items.append(aliased) else: - # No merge - output grain column directly, add to GROUP BY. - # Alias to ``component.name`` so the CTE output name is - # the hashed identity regardless of the pre-agg's stored - # column name. - if measure_col != output_alias: - select_items.append( - ast.Alias(child=col_ref, alias=ast.Name(output_alias)), - ) - else: - select_items.append(col_ref) + # No merge - output grain column directly, add to GROUP BY + select_items.append(col_ref) grain_col_names.append(measure_col) + output_alias = measure_col + component_aliases[component.name] = output_alias # Get type from pre-agg columns col_type = preagg.get_column_type(measure_col, default="double") @@ -1957,12 +1910,11 @@ def build_grain_group_sql( Aggregability.FULL, ) if orig_agg == Aggregability.LIMITED: - register_limited_component( - component, - metric_node, - component_aliases, - component_expressions, - component_metadata, + # LIMITED: grain column is already in GROUP BY, no output needed. + # grain_alias was set by _make_component: plain column → column name, + # complex expression → component.name. + component_aliases[component.name] = ( + component.grain_alias or component.name ) continue else: @@ -1977,14 +1929,12 @@ def build_grain_group_sql( component_aliases[component.name] = component_alias continue + # Skip LIMITED aggregability components with no aggregation + # These are represented by grain columns instead. + # grain_alias was set by _make_component: plain column → column name, + # complex expression → component.name. if component.rule.type == Aggregability.LIMITED and not component.aggregation: - register_limited_component( - component, - metric_node, - component_aliases, - component_expressions, - component_metadata, - ) + component_aliases[component.name] = component.grain_alias or component.name continue # Always use component.name for consistency - no special case for single-component diff --git a/datajunction-server/tests/api/cubes_test.py b/datajunction-server/tests/api/cubes_test.py index 8c4c65f14..e5a0d0941 100644 --- a/datajunction-server/tests/api/cubes_test.py +++ b/datajunction-server/tests/api/cubes_test.py @@ -4964,7 +4964,6 @@ async def test_materialize_cube_returns_metric_combiners( order_id, SUM(line_total_sum_e1f61696) line_total_sum_e1f61696, hll_union_agg(customer_id_hll_23002251) customer_id_hll_23002251, - order_id_distinct_f93d50ab, week_order FROM analytics.preaggs.v3_revenue_orders_by_week GROUP BY category, order_id, week_order @@ -5004,14 +5003,6 @@ async def test_materialize_cube_returns_metric_combiners( "semantic_type": "metric_component", "type": "binary", }, - { - "column": None, - "name": "order_id_distinct_f93d50ab", - "node": None, - "semantic_entity": "v3.order_count:order_id_distinct_f93d50ab", - "semantic_type": "metric_component", - "type": "bigint", - }, { "column": None, "name": "week_order", diff --git a/datajunction-server/tests/construction/build_v3/djsql_test.py b/datajunction-server/tests/construction/build_v3/djsql_test.py index 78a188ccb..5c1094afb 100644 --- a/datajunction-server/tests/construction/build_v3/djsql_test.py +++ b/datajunction-server/tests/construction/build_v3/djsql_test.py @@ -144,12 +144,12 @@ async def test_derived_metric(self, client_with_build_v3): JOIN default.v3.order_items oi ON o.order_id = oi.order_id ), order_details_0 AS ( - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) AS avg_order_value FROM order_details_0 GROUP BY order_details_0.status """, diff --git a/datajunction-server/tests/construction/build_v3/having_clause_test.py b/datajunction-server/tests/construction/build_v3/having_clause_test.py index 27647eaac..8bde5601f 100644 --- a/datajunction-server/tests/construction/build_v3/having_clause_test.py +++ b/datajunction-server/tests/construction/build_v3/having_clause_test.py @@ -99,18 +99,17 @@ async def test_metric_filter_with_less_than( order_details_0 AS ( SELECT t2.category, - t1.order_id, - t1.order_id order_id_distinct_f93d50ab + t1.order_id FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id ) SELECT order_details_0.category AS category, - COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count + COUNT( DISTINCT order_details_0.order_id) AS order_count FROM order_details_0 GROUP BY order_details_0.category - HAVING COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) < 100 + HAVING COUNT( DISTINCT order_details_0.order_id) < 100 """, ) @@ -195,18 +194,17 @@ async def test_metric_filter_with_equality( order_details_0 AS ( SELECT t2.category, - t1.order_id, - t1.order_id order_id_distinct_f93d50ab + t1.order_id FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id ) SELECT order_details_0.category AS category, - COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count + COUNT( DISTINCT order_details_0.order_id) AS order_count FROM order_details_0 GROUP BY order_details_0.category - HAVING COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) = 50 + HAVING COUNT(DISTINCT order_details_0.order_id) = 50 """, ) @@ -259,8 +257,7 @@ async def test_multiple_metric_filters_and( SELECT t2.category, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id @@ -268,12 +265,12 @@ async def test_multiple_metric_filters_and( SELECT order_details_0.category AS category, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count + COUNT( DISTINCT order_details_0.order_id) AS order_count FROM order_details_0 GROUP BY order_details_0.category HAVING SUM(order_details_0.line_total_sum_e1f61696) > 5000 - AND COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) > 20 + AND COUNT( DISTINCT order_details_0.order_id) > 20 """, ) @@ -439,8 +436,7 @@ async def test_complex_mixed_filters( t2.category, t1.status, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id WHERE t2.category IN ('Electronics', 'Clothing') @@ -450,11 +446,11 @@ async def test_complex_mixed_filters( order_details_0.category AS category, order_details_0.status AS status, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count + COUNT(DISTINCT order_details_0.order_id) AS order_count FROM order_details_0 WHERE order_details_0.category IN ('Electronics', 'Clothing') AND order_details_0.status = 'completed' GROUP BY order_details_0.category, order_details_0.status - HAVING SUM(order_details_0.line_total_sum_e1f61696) > 10000 AND COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) >= 50 + HAVING SUM(order_details_0.line_total_sum_e1f61696) > 10000 AND COUNT( DISTINCT order_details_0.order_id) >= 50 """, ) @@ -505,17 +501,16 @@ async def test_derived_metric_filter( SELECT t2.category, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id ) SELECT order_details_0.category AS category, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id), 0) AS avg_order_value FROM order_details_0 GROUP BY order_details_0.category - HAVING SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) > 100 + HAVING SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id), 0) > 100 """, ) diff --git a/datajunction-server/tests/construction/build_v3/measures_sql_test.py b/datajunction-server/tests/construction/build_v3/measures_sql_test.py index 2a931a7e1..8d267bdea 100644 --- a/datajunction-server/tests/construction/build_v3/measures_sql_test.py +++ b/datajunction-server/tests/construction/build_v3/measures_sql_test.py @@ -1249,7 +1249,7 @@ async def test_three_metrics_same_parent(self, client_with_build_v3): ] # Columns: dimension grain column 3 raw metric columns - assert len(gg["columns"]) == 5 + assert len(gg["columns"]) == 4 assert gg["columns"][0] == { "name": "status", "type": "string", @@ -1272,7 +1272,7 @@ async def test_three_metrics_same_parent(self, client_with_build_v3): FROM default.v3.orders o JOIN default.v3.order_items oi ON o.order_id = oi.order_id ) - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, SUM(t1.quantity) quantity_sum_06b64d2e, t1.order_id order_id_distinct_f93d50ab + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, SUM(t1.quantity) quantity_sum_06b64d2e FROM v3_order_details t1 GROUP BY t1.status, t1.order_id """, @@ -1299,7 +1299,7 @@ async def test_three_metrics_same_parent(self, client_with_build_v3): # total_revenue component assert components[1] == { - "name": "order_id_distinct_f93d50ab", + "name": "order_id", "expression": "order_id", "aggregation": None, "merge": None, @@ -1647,12 +1647,6 @@ async def test_cross_fact_metrics_different_aggregabilities( "semantic_entity": "v3.page_views_enriched.session_id", "semantic_type": "dimension", }, - { - "name": "session_id_distinct_e7c26e39", - "type": "bigint", - "semantic_entity": "v3.session_count:session_id_distinct_e7c26e39", - "semantic_type": "metric_component", - }, ] assert_sql_equal( gg_sessions["sql"], @@ -1666,7 +1660,7 @@ async def test_cross_fact_metrics_different_aggregabilities( SELECT product_id, category FROM default.v3.products ) - SELECT t2.category, t1.session_id, t1.session_id session_id_distinct_e7c26e39 + SELECT t2.category, t1.session_id FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.session_id @@ -1727,12 +1721,6 @@ async def test_cross_fact_derived_metric(self, client_with_build_v3): "semantic_entity": "v3.order_details.order_id", "semantic_type": "dimension", }, - { - "name": "order_id_distinct_f93d50ab", - "type": "bigint", - "semantic_entity": "v3.order_count:order_id_distinct_f93d50ab", - "semantic_type": "metric_component", - }, ] # Joins order_details -> product for category dimension assert_sql_equal( @@ -1748,7 +1736,7 @@ async def test_cross_fact_derived_metric(self, client_with_build_v3): SELECT product_id, category FROM default.v3.products ) - SELECT t2.category, t1.order_id, t1.order_id order_id_distinct_f93d50ab + SELECT t2.category, t1.order_id FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id @@ -1774,12 +1762,6 @@ async def test_cross_fact_derived_metric(self, client_with_build_v3): "semantic_entity": "v3.page_views_enriched.customer_id", "semantic_type": "dimension", }, - { - "name": "customer_id_distinct_dd4be7a5", - "type": "bigint", - "semantic_entity": "v3.visitor_count:customer_id_distinct_dd4be7a5", - "semantic_type": "metric_component", - }, ] # Joins page_views_enriched -> product for category dimension assert_sql_equal( @@ -1794,7 +1776,7 @@ async def test_cross_fact_derived_metric(self, client_with_build_v3): SELECT product_id, category FROM default.v3.products ) - SELECT t2.category, t1.customer_id, t1.customer_id customer_id_distinct_dd4be7a5 + SELECT t2.category, t1.customer_id FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.customer_id @@ -1874,7 +1856,7 @@ async def test_cross_fact_multiple_derived_metrics(self, client_with_build_v3): SELECT product_id, category FROM default.v3.products ) - SELECT t2.category, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab + SELECT t2.category, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id @@ -1900,7 +1882,7 @@ async def test_cross_fact_multiple_derived_metrics(self, client_with_build_v3): SELECT product_id, category FROM default.v3.products ) - SELECT t2.category, t1.customer_id, COUNT(t1.view_id) view_id_count_f41e2db4, t1.customer_id customer_id_distinct_dd4be7a5 + SELECT t2.category, t1.customer_id, COUNT(t1.view_id) view_id_count_f41e2db4 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.customer_id @@ -2341,8 +2323,7 @@ async def test_count_distinct_grain_col_not_duplicated_when_also_a_dimension( gg = response.json()["grain_groups"][0] # order_id must appear exactly once in the SELECT (not duplicated as both - # dimension column and grain column). The hashed identity projection - # for the LIMITED component is still emitted. + # dimension column and grain column) assert_sql_equal( gg["sql"], """ @@ -2351,7 +2332,7 @@ async def test_count_distinct_grain_col_not_duplicated_when_also_a_dimension( FROM default.v3.orders o JOIN default.v3.order_items oi ON o.order_id = oi.order_id ) - SELECT t1.status, t1.order_id, t1.order_id order_id_distinct_f93d50ab + SELECT t1.status, t1.order_id FROM v3_order_details t1 GROUP BY t1.status, t1.order_id """, @@ -2377,12 +2358,6 @@ async def test_count_distinct_grain_col_not_duplicated_when_also_a_dimension( "semantic_entity": "v3.order_details.order_id", "semantic_type": "dimension", }, - { - "name": "order_id_distinct_f93d50ab", - "type": "bigint", - "semantic_entity": "v3.order_count:order_id_distinct_f93d50ab", - "semantic_type": "metric_component", - }, ] @@ -3198,8 +3173,7 @@ async def test_temporal_filters_applied_when_cube_exists( t1.order_date date_id, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, - SUM(t1.quantity) quantity_sum_06b64d2e, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.quantity) quantity_sum_06b64d2e FROM v3_order_details t1 GROUP BY t1.order_date, t1.order_id """, @@ -4182,8 +4156,7 @@ async def test_nested_derived_metric_decomposes_to_base_components( SELECT t1.status, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 GROUP BY t1.status, t1.order_id """, @@ -4241,8 +4214,7 @@ async def test_nested_derived_window_metric_decomposes_to_base_components( t2.category, t3.week, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id @@ -4308,8 +4280,7 @@ async def test_nested_derived_cross_fact_decomposes_to_base_components( SELECT t2.category, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id @@ -4334,8 +4305,7 @@ async def test_nested_derived_cross_fact_decomposes_to_base_components( ) SELECT t2.category, t1.session_id, - COUNT(t1.view_id) view_id_count_f41e2db4, - t1.session_id session_id_distinct_e7c26e39 + COUNT(t1.view_id) view_id_count_f41e2db4 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.session_id """, @@ -6432,7 +6402,7 @@ async def test_nested_subquery_drops_fk_column(self, client_with_build_v3): ) AS wrap GROUP BY account_id, event_type ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_ccca78c0 + SELECT t1.event_type, t1.account_id FROM v3_events_nested t1 GROUP BY t1.event_type, t1.account_id """, @@ -6491,7 +6461,7 @@ async def test_self_join_filters_both_aliases(self, client_with_build_v3): WHERE a1.audit_date >= 20260101 AND a2.audit_date >= 20260101 ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_efcd5b4d + SELECT t1.event_type, t1.account_id FROM v3_events_self_join t1 GROUP BY t1.event_type, t1.account_id """, @@ -6558,7 +6528,7 @@ async def test_source_inside_inner_with_cte( SELECT account_id, event_type FROM v3_events_with_cte__inner_cte inner_cte ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_4062d29e + SELECT t1.event_type, t1.account_id FROM v3_events_with_cte t1 GROUP BY t1.event_type, t1.account_id """, @@ -6645,7 +6615,7 @@ async def test_source_inside_where_exists_subquery( AND log.audit_date >= 20260101 ) ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_7a788b39 + SELECT t1.event_type, t1.account_id FROM v3_events_exists t1 GROUP BY t1.event_type, t1.account_id """, @@ -6732,7 +6702,7 @@ async def test_set_op_with_nested_subquery_in_non_leading_arm( WHERE deep.audit_date >= 20260101 ) AS wrap ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_49f58cc7 + SELECT t1.event_type, t1.account_id FROM v3_events_union_nested t1 GROUP BY t1.event_type, t1.account_id """, @@ -6800,7 +6770,7 @@ async def test_multiple_filters_same_source_different_scopes( AND inner_src.audit_date <= 20260131 ) AS wrap ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_233abbf9 + SELECT t1.event_type, t1.account_id FROM v3_events_two_filters t1 GROUP BY t1.event_type, t1.account_id """, @@ -6865,7 +6835,7 @@ async def test_source_in_from_and_in_where_exists(self, client_with_build_v3): ) AND a.audit_date >= 20260101 ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_3d483180 + SELECT t1.event_type, t1.account_id FROM v3_events_from_exists t1 GROUP BY t1.event_type, t1.account_id """, @@ -6917,7 +6887,7 @@ async def test_source_without_alias(self, client_with_build_v3): FROM default.v3.audit_log_noalias WHERE src_audit_log_noalias.audit_date >= 20260101 ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_a0b1fd19 + SELECT t1.event_type, t1.account_id FROM v3_events_no_alias t1 GROUP BY t1.event_type, t1.account_id """, @@ -6995,7 +6965,7 @@ async def test_source_in_having_subquery(self, client_with_build_v3): WHERE log.audit_date >= 20260101 ) ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_4e8747df + SELECT t1.event_type, t1.account_id FROM v3_events_having t1 GROUP BY t1.event_type, t1.account_id """, @@ -7070,7 +7040,7 @@ async def test_source_in_select_list_scalar_subquery( WHERE log.audit_date >= 20260101) AS max_id FROM default.v3.acct_scalar AS acc ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_946a2255 + SELECT t1.event_type, t1.account_id FROM v3_events_scalar t1 GROUP BY t1.event_type, t1.account_id """, @@ -7211,7 +7181,7 @@ async def test_two_upstreams_linked_to_same_dim(self, client_with_build_v3): WHERE x.audit_date >= 20260101 AND y.audit_date >= 20260101 ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_ec521d7c + SELECT t1.event_type, t1.account_id FROM v3_events_two_upstreams t1 GROUP BY t1.event_type, t1.account_id """, @@ -7316,7 +7286,7 @@ async def test_transform_link_target(self, client_with_build_v3): SELECT account_id, event_type FROM v3_events_with_date ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_274a187b + SELECT t1.event_type, t1.account_id FROM v3_events_wrapping t1 GROUP BY t1.event_type, t1.account_id """, @@ -7415,7 +7385,7 @@ async def test_transform_link_plus_source_link_double_filter( v3_events_double_link_wrap AS ( SELECT account_id, event_type FROM v3_events_double_link ) - SELECT t1.event_type, t1.account_id, t1.account_id account_id_distinct_b9c0262f + SELECT t1.event_type, t1.account_id FROM v3_events_double_link_wrap t1 GROUP BY t1.event_type, t1.account_id """, @@ -9836,357 +9806,3 @@ async def test_alias_substitution_skips_subquery_alias_collision( """, normalize_aliases=True, ) - - @pytest.mark.asyncio - async def test_count_distinct_plain_column_projects_hashed_alias( - self, - client_with_build_v3, - ): - """For ``COUNT(DISTINCT plain_col)``, the measures SQL must - project the column under BOTH the bare grain name and the - hashed identity, AND the combiner and persisted - ``derived_expression`` must reference the hashed identity — - otherwise consumers can't resolve the column. - """ - client = client_with_build_v3 - resp = await client.post( - "/nodes/source/", - json={ - "name": "v3.src_distinct_plain", - "columns": [ - {"name": "account_id", "type": "int"}, - {"name": "event_date", "type": "int"}, - ], - "mode": "published", - "catalog": "default", - "schema_": "v3", - "table": "distinct_plain_events", - }, - ) - assert resp.status_code in (200, 201), resp.json() - resp = await client.post( - "/nodes/transform/", - json={ - "name": "v3.distinct_plain_xform", - "query": ("SELECT account_id, event_date FROM v3.src_distinct_plain"), - "mode": "published", - "primary_key": ["account_id", "event_date"], - }, - ) - assert resp.status_code == 201, resp.json() - resp = await client.post( - "/nodes/metric/", - json={ - "name": "v3.distinct_plain_count", - "query": ( - "SELECT COUNT(DISTINCT account_id) FROM v3.distinct_plain_xform" - ), - "mode": "published", - }, - ) - assert resp.status_code == 201, resp.json() - - response = await client.get( - "/sql/measures/v3/", - params={ - "metrics": ["v3.distinct_plain_count"], - "dimensions": ["v3.distinct_plain_xform.event_date"], - }, - ) - assert response.status_code == 200, response.json() - body = response.json() - gg = body["grain_groups"][0] - - # Locate the hash-suffixed identity column emitted by the fix. - hashed_col = next( - ( - c["name"] - for c in gg["columns"] - if c["name"].startswith("account_id_distinct_") - ), - None, - ) - assert hashed_col is not None, ( - f"Expected an ``account_id_distinct_`` output column; " - f"got {[c['name'] for c in gg['columns']]}" - ) - - # SQL projects BOTH ``t1.account_id`` (grain/dim usage) AND - # ``t1.account_id AS account_id_distinct_`` (identity). - assert_sql_equal( - gg["sql"], - f""" - WITH v3_distinct_plain_xform AS ( - SELECT account_id, event_date - FROM default.v3.distinct_plain_events - ) - SELECT t1.event_date, - t1.account_id, - t1.account_id {hashed_col} - FROM v3_distinct_plain_xform t1 - GROUP BY t1.event_date, t1.account_id - """, - normalize_aliases=True, - ) - - # Combiner in the response and the metric node's persisted - # ``derived_expression`` both reference the hashed identity - # exactly. Both currently upper-case the function name; the - # contract is *equality between the two* — if either side - # drifts (different qualification, different case-folding, - # accidental rewrite), this test fails. - expected_combiner = f"COUNT( DISTINCT {hashed_col})" - assert body["metric_formulas"][0]["combiner"] == expected_combiner - metric_resp = await client.get("/metrics/v3.distinct_plain_count") - assert metric_resp.status_code == 200 - assert metric_resp.json()["derived_expression"] == expected_combiner - - @pytest.mark.asyncio - async def test_count_distinct_in_merged_grain_group_projects_hashed_alias( - self, - client_with_build_v3, - ): - """Same contract in the ``is_merged=True`` path (mixed- - aggregability metrics in one query). Pins that both LIMITED- - component branches in ``build_grain_group_sql`` route through - ``register_limited_component`` — the merged branch used to - skip the hashed projection independently. - """ - client = client_with_build_v3 - resp = await client.post( - "/nodes/source/", - json={ - "name": "v3.src_merged_distinct", - "columns": [ - {"name": "account_id", "type": "int"}, - {"name": "event_date", "type": "int"}, - {"name": "is_active", "type": "boolean"}, - {"name": "amount", "type": "double"}, - ], - "mode": "published", - "catalog": "default", - "schema_": "v3", - "table": "merged_distinct_events", - }, - ) - assert resp.status_code in (200, 201), resp.json() - resp = await client.post( - "/nodes/transform/", - json={ - "name": "v3.merged_distinct_xform", - "query": ( - "SELECT account_id, event_date, is_active, amount " - "FROM v3.src_merged_distinct" - ), - "mode": "published", - "primary_key": ["account_id", "event_date"], - }, - ) - assert resp.status_code == 201, resp.json() - # Plain-column DISTINCT (LIMITED, grain_alias=bare). - resp = await client.post( - "/nodes/metric/", - json={ - "name": "v3.merged_distinct_accounts", - "query": ( - "SELECT COUNT(DISTINCT account_id) FROM v3.merged_distinct_xform" - ), - "mode": "published", - }, - ) - assert resp.status_code == 201, resp.json() - # Complex DISTINCT (LIMITED, grain_alias==component.name). - resp = await client.post( - "/nodes/metric/", - json={ - "name": "v3.merged_distinct_active_accounts", - "query": ( - "SELECT COUNT(DISTINCT CASE WHEN is_active " - "THEN account_id ELSE NULL END) " - "FROM v3.merged_distinct_xform" - ), - "mode": "published", - }, - ) - assert resp.status_code == 201, resp.json() - # FULL aggregability metric — forces the grain group to merge - # across aggregability levels (drives is_merged=True). - resp = await client.post( - "/nodes/metric/", - json={ - "name": "v3.merged_distinct_total_amount", - "query": "SELECT SUM(amount) FROM v3.merged_distinct_xform", - "mode": "published", - }, - ) - assert resp.status_code == 201, resp.json() - - response = await client.get( - "/sql/measures/v3/", - params={ - "metrics": [ - "v3.merged_distinct_accounts", - "v3.merged_distinct_active_accounts", - "v3.merged_distinct_total_amount", - ], - "dimensions": ["v3.merged_distinct_xform.event_date"], - }, - ) - assert response.status_code == 200, response.json() - body = response.json() - - # The merged grain group is the LIMITED one (FULL gets merged - # in at the LIMITED level). Locate the hashed identity - # columns we'll substitute into the expected SQL & combiners. - gg = next( - g - for g in body["grain_groups"] - if "v3.merged_distinct_accounts" in g["metrics"] - ) - cols = [c["name"] for c in gg["columns"]] - # Pin uniqueness of each hashed identity so a future - # decompose-side naming change (e.g., a uniformity refactor - # that drops a column-name prefix) breaks the test rather - # than silently passing with both `next()` calls returning - # the same column. - plain_matches = [ - n - for n in cols - if n.startswith("account_id_distinct_") - and not n.startswith("is_active_account_id_distinct_") - ] - complex_matches = [ - n for n in cols if n.startswith("is_active_account_id_distinct_") - ] - amount_matches = [n for n in cols if n.startswith("amount_sum_")] - assert len(plain_matches) == 1, plain_matches - assert len(complex_matches) == 1, complex_matches - assert len(amount_matches) == 1, amount_matches - plain_hashed = plain_matches[0] - complex_hashed = complex_matches[0] - amount_sum_hashed = amount_matches[0] - assert plain_hashed != complex_hashed - - assert_sql_equal( - gg["sql"], - f""" - WITH v3_merged_distinct_xform AS ( - SELECT account_id, event_date, is_active, amount - FROM default.v3.merged_distinct_events - ) - SELECT t1.event_date, - CASE WHEN t1.is_active THEN t1.account_id ELSE NULL END {complex_hashed}, - t1.account_id, - SUM(t1.amount) {amount_sum_hashed}, - t1.account_id {plain_hashed} - FROM v3_merged_distinct_xform t1 - GROUP BY t1.event_date, {complex_hashed}, t1.account_id - """, - normalize_aliases=True, - ) - - combiners_by_metric = { - f["name"]: f["combiner"] for f in body["metric_formulas"] - } - assert combiners_by_metric == { - "v3.merged_distinct_accounts": f"COUNT( DISTINCT {plain_hashed})", - "v3.merged_distinct_active_accounts": f"COUNT( DISTINCT {complex_hashed})", - "v3.merged_distinct_total_amount": f"SUM({amount_sum_hashed})", - } - - @pytest.mark.asyncio - async def test_count_distinct_dedup_across_shared_components( - self, - client_with_build_v3, - ): - """Two metrics that decompose to the same LIMITED component - must produce a single `` AS `` projection, not - two — duplicate column names break SQL execution. Dedup is - held by ``seen_components`` in ``build_grain_group_sql``. - """ - client = client_with_build_v3 - resp = await client.post( - "/nodes/source/", - json={ - "name": "v3.src_dedup_distinct", - "columns": [ - {"name": "account_id", "type": "int"}, - {"name": "event_date", "type": "int"}, - ], - "mode": "published", - "catalog": "default", - "schema_": "v3", - "table": "dedup_distinct_events", - }, - ) - assert resp.status_code in (200, 201), resp.json() - resp = await client.post( - "/nodes/transform/", - json={ - "name": "v3.dedup_distinct_xform", - "query": "SELECT account_id, event_date FROM v3.src_dedup_distinct", - "mode": "published", - "primary_key": ["account_id", "event_date"], - }, - ) - assert resp.status_code == 201, resp.json() - # Two metrics with identical decomposition — both produce a - # COUNT(DISTINCT account_id) component with the same hash. - for name in ("v3.dedup_unique_accounts", "v3.dedup_distinct_accounts"): - resp = await client.post( - "/nodes/metric/", - json={ - "name": name, - "query": ( - "SELECT COUNT(DISTINCT account_id) FROM v3.dedup_distinct_xform" - ), - "mode": "published", - }, - ) - assert resp.status_code == 201, resp.json() - - response = await client.get( - "/sql/measures/v3/", - params={ - "metrics": [ - "v3.dedup_unique_accounts", - "v3.dedup_distinct_accounts", - ], - "dimensions": ["v3.dedup_distinct_xform.event_date"], - }, - ) - assert response.status_code == 200, response.json() - body = response.json() - gg = body["grain_groups"][0] - cols = [c["name"] for c in gg["columns"]] - - # The hashed identity column appears exactly once even though - # two metrics reference it. - hashed_matches = [n for n in cols if n.startswith("account_id_distinct_")] - assert len(hashed_matches) == 1, hashed_matches - hashed_col = hashed_matches[0] - - assert_sql_equal( - gg["sql"], - f""" - WITH v3_dedup_distinct_xform AS ( - SELECT account_id, event_date - FROM default.v3.dedup_distinct_events - ) - SELECT t1.event_date, - t1.account_id, - t1.account_id {hashed_col} - FROM v3_dedup_distinct_xform t1 - GROUP BY t1.event_date, t1.account_id - """, - normalize_aliases=True, - ) - - # Both metric formulas reference the same hashed column. - combiners_by_metric = { - f["name"]: f["combiner"] for f in body["metric_formulas"] - } - assert combiners_by_metric == { - "v3.dedup_unique_accounts": f"COUNT( DISTINCT {hashed_col})", - "v3.dedup_distinct_accounts": f"COUNT( DISTINCT {hashed_col})", - } diff --git a/datajunction-server/tests/construction/build_v3/metrics_sql_test.py b/datajunction-server/tests/construction/build_v3/metrics_sql_test.py index b74d0b91a..dfae474a6 100644 --- a/datajunction-server/tests/construction/build_v3/metrics_sql_test.py +++ b/datajunction-server/tests/construction/build_v3/metrics_sql_test.py @@ -242,9 +242,7 @@ async def test_derived_metric_ratio(self, client_with_build_v3): FROM default.v3.page_views ), order_details_0 AS ( - SELECT t2.category, - t1.order_id, - t1.order_id order_id_distinct_f93d50ab + SELECT t2.category, t1.order_id FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id @@ -257,9 +255,7 @@ async def test_derived_metric_ratio(self, client_with_build_v3): GROUP BY category ), page_views_enriched_0 AS ( - SELECT t2.category, - t1.customer_id, - t1.customer_id customer_id_distinct_dd4be7a5 + SELECT t2.category, t1.customer_id FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.customer_id @@ -330,13 +326,13 @@ async def test_multiple_derived_metrics_same_fact(self, client_with_build_v3): JOIN default.v3.order_items oi ON o.order_id = oi.order_id ), order_details_0 AS ( - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, SUM(t1.quantity) quantity_sum_06b64d2e, t1.order_id order_id_distinct_f93d50ab + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, SUM(t1.quantity) quantity_sum_06b64d2e FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value, - SUM(order_details_0.quantity_sum_06b64d2e) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_items_per_order + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) AS avg_order_value, + SUM(order_details_0.quantity_sum_06b64d2e) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) AS avg_items_per_order FROM order_details_0 GROUP BY order_details_0.status """, @@ -416,8 +412,7 @@ async def test_derived_metrics_cross_fact(self, client_with_build_v3): SELECT t2.category, t3.name name_customer, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_customer t3 ON t1.customer_id = t3.customer_id GROUP BY t2.category, t3.name, t1.order_id @@ -426,8 +421,7 @@ async def test_derived_metrics_cross_fact(self, client_with_build_v3): SELECT t2.category, t3.name name_customer, t1.customer_id, - COUNT(t1.view_id) view_id_count_f41e2db4, - t1.customer_id customer_id_distinct_dd4be7a5 + COUNT(t1.view_id) view_id_count_f41e2db4 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_customer t3 ON t1.customer_id = t3.customer_id GROUP BY t2.category, t3.name, t1.customer_id @@ -435,8 +429,8 @@ async def test_derived_metrics_cross_fact(self, client_with_build_v3): SELECT COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, COALESCE(order_details_0.name_customer, page_views_enriched_0.name_customer) AS name_customer, - CAST(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS DOUBLE) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5), 0) AS conversion_rate, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5), 0) AS revenue_per_visitor, + CAST(COUNT( DISTINCT order_details_0.order_id) AS DOUBLE) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id), 0) AS conversion_rate, + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id), 0) AS revenue_per_visitor, SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(SUM(page_views_enriched_0.view_id_count_f41e2db4), 0) AS revenue_per_page_view FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.category = page_views_enriched_0.category AND order_details_0.name_customer = page_views_enriched_0.name_customer GROUP BY 1, 2 @@ -517,14 +511,13 @@ async def test_pages_per_session_same_fact_derived(self, client_with_build_v3): page_views_enriched_0 AS ( SELECT t2.category, t1.session_id, - COUNT(t1.view_id) view_id_count_f41e2db4, - t1.session_id session_id_distinct_e7c26e39 + COUNT(t1.view_id) view_id_count_f41e2db4 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.session_id ) SELECT page_views_enriched_0.category AS category, - SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39), 0) AS pages_per_session + SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id), 0) AS pages_per_session FROM page_views_enriched_0 GROUP BY page_views_enriched_0.category """, @@ -1426,8 +1419,7 @@ async def test_period_over_period_metrics(self, client_with_build_v3): t3.month, t3.week, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id @@ -1438,7 +1430,7 @@ async def test_period_over_period_metrics(self, client_with_build_v3): order_details_0.category AS category, order_details_0.month AS month, order_details_0.week AS week, - COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, + COUNT(DISTINCT order_details_0.order_id) AS order_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue FROM order_details_0 GROUP BY order_details_0.category, order_details_0.month, order_details_0.week @@ -1447,7 +1439,7 @@ async def test_period_over_period_metrics(self, client_with_build_v3): SELECT order_details_0.category AS category, order_details_0.week AS week, - COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, + COUNT(DISTINCT order_details_0.order_id) AS order_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue FROM order_details_0 GROUP BY order_details_0.category, order_details_0.week @@ -1627,7 +1619,7 @@ async def test_cross_fact_full_plus_limited_fan_out( GROUP BY t2.category ), page_views_enriched_0 AS ( - SELECT t2.category, t1.customer_id, t1.customer_id customer_id_distinct_dd4be7a5 + SELECT t2.category, t1.customer_id FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.customer_id @@ -1691,14 +1683,13 @@ async def test_derived_metric_with_base_metric_in_same_query( SELECT t1.status, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value, + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id), 0) AS avg_order_value, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue FROM order_details_0 GROUP BY order_details_0.status @@ -2152,12 +2143,12 @@ async def test_nested_derived_metric_simple(self, client_with_build_v3): JOIN default.v3.order_items oi ON o.order_id = oi.order_id ), order_details_0 AS ( - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) / 50.0 * 100 AS aov_growth_index + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) / 50.0 * 100 AS aov_growth_index FROM order_details_0 GROUP BY order_details_0.status """, @@ -2217,7 +2208,7 @@ async def test_nested_derived_metric_with_window_function( FROM default.v3.products ), order_details_0 AS ( - SELECT t2.category, t3.week, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab + SELECT t2.category, t3.week, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id @@ -2227,9 +2218,9 @@ async def test_nested_derived_metric_with_window_function( SELECT order_details_0.category AS category, order_details_0.week AS week, - COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, + COUNT(DISTINCT order_details_0.order_id) AS order_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id), 0) AS avg_order_value FROM order_details_0 GROUP BY order_details_0.category, order_details_0.week ) @@ -2309,20 +2300,20 @@ async def test_nested_derived_metric_cross_fact(self, client_with_build_v3): FROM default.v3.page_views ), order_details_0 AS ( - SELECT t2.category, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab + SELECT t2.category, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.order_id ), page_views_enriched_0 AS ( - SELECT t2.category, t1.session_id, COUNT(t1.view_id) view_id_count_f41e2db4, t1.session_id session_id_distinct_e7c26e39 + SELECT t2.category, t1.session_id, COUNT(t1.view_id) view_id_count_f41e2db4 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id GROUP BY t2.category, t1.session_id ) SELECT COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) - / NULLIF(SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT(DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39), 0), 0) AS efficiency_ratio + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) + / NULLIF(SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT(DISTINCT page_views_enriched_0.session_id), 0), 0) AS efficiency_ratio FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.category = page_views_enriched_0.category GROUP BY 1 @@ -2692,8 +2683,7 @@ async def test_wow_with_count_distinct_at_daily_grain(self, client_with_build_v3 t2.category, t3.week, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id @@ -2704,7 +2694,7 @@ async def test_wow_with_count_distinct_at_daily_grain(self, client_with_build_v3 order_details_0.date_id_order AS date_id_order, order_details_0.category AS category, order_details_0.week AS week, - COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, + COUNT(DISTINCT order_details_0.order_id) AS order_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue FROM order_details_0 GROUP BY order_details_0.date_id_order, order_details_0.category, order_details_0.week @@ -2713,7 +2703,7 @@ async def test_wow_with_count_distinct_at_daily_grain(self, client_with_build_v3 SELECT order_details_0.category AS category, order_details_0.week AS week, - COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, + COUNT(DISTINCT order_details_0.order_id) AS order_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue FROM order_details_0 GROUP BY order_details_0.category, order_details_0.week @@ -2890,14 +2880,14 @@ async def test_cross_fact_wow_conversion_rate(self, client_with_build_v3): FROM default.v3.page_views ), order_details_0 AS ( - SELECT t2.category, t3.week, t1.order_id, t1.order_id order_id_distinct_f93d50ab + SELECT t2.category, t3.week, t1.order_id FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id GROUP BY t2.category, t3.week, t1.order_id ), page_views_enriched_0 AS ( - SELECT t2.category, t3.week, t1.customer_id, t1.customer_id customer_id_distinct_dd4be7a5 + SELECT t2.category, t3.week, t1.customer_id FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.page_date = t3.date_id @@ -2907,9 +2897,9 @@ async def test_cross_fact_wow_conversion_rate(self, client_with_build_v3): SELECT COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, COALESCE(order_details_0.week, page_views_enriched_0.week) AS week, - COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, - COUNT(DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5) AS visitor_count, - CAST(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS DOUBLE) / NULLIF(COUNT(DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5), 0) AS conversion_rate + COUNT(DISTINCT order_details_0.order_id) AS order_count, + COUNT(DISTINCT page_views_enriched_0.customer_id) AS visitor_count, + CAST(COUNT(DISTINCT order_details_0.order_id) AS DOUBLE) / NULLIF(COUNT(DISTINCT page_views_enriched_0.customer_id), 0) AS conversion_rate FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.category = page_views_enriched_0.category AND order_details_0.week = page_views_enriched_0.week GROUP BY 1, 2 @@ -3031,8 +3021,7 @@ async def test_cross_fact_window_metric_with_finer_grain( SELECT COALESCE(t1.order_date, t3.date_id) AS date_id, t2.category, t3.week, - t1.order_id, - t1.order_id order_id_distinct_f93d50ab + t1.order_id FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id GROUP BY COALESCE(t1.order_date, t3.date_id), t2.category, t3.week, t1.order_id @@ -3041,8 +3030,7 @@ async def test_cross_fact_window_metric_with_finer_grain( SELECT COALESCE(t1.page_date, t3.date_id) AS date_id, t2.category, t3.week, - t1.customer_id, - t1.customer_id customer_id_distinct_dd4be7a5 + t1.customer_id FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.page_date = t3.date_id GROUP BY COALESCE(t1.page_date, t3.date_id), t2.category, t3.week, t1.customer_id @@ -3051,9 +3039,9 @@ async def test_cross_fact_window_metric_with_finer_grain( SELECT COALESCE(order_details_0.date_id, page_views_enriched_0.date_id) AS date_id, COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, COALESCE(order_details_0.week, page_views_enriched_0.week) AS week, - COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, - COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5) AS visitor_count, - CAST(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS DOUBLE) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5), 0) AS conversion_rate + COUNT( DISTINCT order_details_0.order_id) AS order_count, + COUNT( DISTINCT page_views_enriched_0.customer_id) AS visitor_count, + CAST(COUNT( DISTINCT order_details_0.order_id) AS DOUBLE) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id), 0) AS conversion_rate FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.date_id = page_views_enriched_0.date_id AND order_details_0.category = page_views_enriched_0.category AND order_details_0.week = page_views_enriched_0.week GROUP BY 1, 2, 3 ) @@ -3142,8 +3130,7 @@ async def test_cross_fact_window_on_derived_metric(self, client_with_build_v3): t2.category, t3.week, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id GROUP BY COALESCE(t1.order_date, t3.date_id), t2.category, t3.week, t1.order_id @@ -3153,8 +3140,7 @@ async def test_cross_fact_window_on_derived_metric(self, client_with_build_v3): t2.category, t3.week, t1.session_id, - COUNT(t1.view_id) view_id_count_f41e2db4, - t1.session_id session_id_distinct_e7c26e39 + COUNT(t1.view_id) view_id_count_f41e2db4 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.page_date = t3.date_id GROUP BY COALESCE(t1.page_date, t3.date_id), t2.category, t3.week, t1.session_id @@ -3163,13 +3149,13 @@ async def test_cross_fact_window_on_derived_metric(self, client_with_build_v3): SELECT COALESCE(order_details_0.date_id, page_views_enriched_0.date_id) AS date_id, COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, COALESCE(order_details_0.week, page_views_enriched_0.week) AS week, - COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, + COUNT( DISTINCT order_details_0.order_id) AS order_count, SUM(page_views_enriched_0.view_id_count_f41e2db4) AS page_view_count, - COUNT( DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39) AS session_count, + COUNT( DISTINCT page_views_enriched_0.session_id) AS session_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) / NULLIF(SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39), 0), 0) AS efficiency_ratio, - SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39), 0) AS pages_per_session + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id), 0) AS avg_order_value, + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT order_details_0.order_id), 0) / NULLIF(SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id), 0), 0) AS efficiency_ratio, + SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id), 0) AS pages_per_session FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.date_id = page_views_enriched_0.date_id AND order_details_0.category = page_views_enriched_0.category AND order_details_0.week = page_views_enriched_0.week GROUP BY 1, 2, 3 ) @@ -3258,8 +3244,7 @@ async def test_cross_fact_window_on_base_metrics(self, client_with_build_v3): SELECT COALESCE(t1.order_date, t3.date_id) AS date_id, t2.category, t3.week, - t1.order_id, - t1.order_id order_id_distinct_f93d50ab + t1.order_id FROM v3_order_details t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.order_date = t3.date_id GROUP BY COALESCE(t1.order_date, t3.date_id), t2.category, t3.week, t1.order_id @@ -3268,8 +3253,7 @@ async def test_cross_fact_window_on_base_metrics(self, client_with_build_v3): SELECT COALESCE(t1.page_date, t3.date_id) AS date_id, t2.category, t3.week, - t1.customer_id, - t1.customer_id customer_id_distinct_dd4be7a5 + t1.customer_id FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.page_date = t3.date_id GROUP BY COALESCE(t1.page_date, t3.date_id), t2.category, t3.week, t1.customer_id @@ -3278,8 +3262,8 @@ async def test_cross_fact_window_on_base_metrics(self, client_with_build_v3): SELECT COALESCE(order_details_0.date_id, page_views_enriched_0.date_id) AS date_id, COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, COALESCE(order_details_0.week, page_views_enriched_0.week) AS week, - COUNT( DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count, - COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5) AS visitor_count + COUNT( DISTINCT order_details_0.order_id) AS order_count, + COUNT( DISTINCT page_views_enriched_0.customer_id) AS visitor_count FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.date_id = page_views_enriched_0.date_id AND order_details_0.category = page_views_enriched_0.category AND order_details_0.week = page_views_enriched_0.week GROUP BY 1, 2, 3 ), @@ -3399,8 +3383,7 @@ async def test_multi_fact_window_metrics_same_grain(self, client_with_build_v3): SELECT t2.category, t3.week, t1.session_id, - COUNT(t1.view_id) view_id_count_f41e2db4, - t1.session_id session_id_distinct_e7c26e39 + COUNT(t1.view_id) view_id_count_f41e2db4 FROM v3_page_views_enriched t1 LEFT OUTER JOIN v3_product t2 ON t1.product_id = t2.product_id LEFT OUTER JOIN v3_date t3 ON t1.page_date = t3.date_id GROUP BY t2.category, t3.week, t1.session_id @@ -3409,9 +3392,9 @@ async def test_multi_fact_window_metrics_same_grain(self, client_with_build_v3): SELECT COALESCE(order_details_0.category, page_views_enriched_0.category) AS category, COALESCE(order_details_0.week, page_views_enriched_0.week) AS week, SUM(page_views_enriched_0.view_id_count_f41e2db4) AS page_view_count, - COUNT( DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39) AS session_count, + COUNT( DISTINCT page_views_enriched_0.session_id) AS session_count, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id_distinct_e7c26e39), 0) AS pages_per_session + SUM(page_views_enriched_0.view_id_count_f41e2db4) / NULLIF(COUNT( DISTINCT page_views_enriched_0.session_id), 0) AS pages_per_session FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.category = page_views_enriched_0.category AND order_details_0.week = page_views_enriched_0.week GROUP BY 1, 2 ) @@ -3622,14 +3605,13 @@ async def test_order_by_multiple_columns(self, client_with_build_v3): order_details_0 AS ( SELECT t1.status, t1.order_id, - SUM(t1.line_total) line_total_sum_e1f61696, - t1.order_id order_id_distinct_f93d50ab + SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count + COUNT(DISTINCT order_details_0.order_id) AS order_count FROM order_details_0 GROUP BY order_details_0.status ORDER BY status ASC, total_revenue DESC @@ -5121,13 +5103,13 @@ async def test_full_and_limited_metrics_same_fact_computed_at_limited_grain( JOIN default.v3.order_items oi ON o.order_id = oi.order_id ), order_details_0 AS ( - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, SUM(order_details_0.line_total_sum_e1f61696) AS total_revenue, - COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count + COUNT(DISTINCT order_details_0.order_id) AS order_count FROM order_details_0 GROUP BY order_details_0.status """, diff --git a/datajunction-server/tests/construction/build_v3/preagg_substitution_test.py b/datajunction-server/tests/construction/build_v3/preagg_substitution_test.py index 77e91dc25..1849dcf8a 100644 --- a/datajunction-server/tests/construction/build_v3/preagg_substitution_test.py +++ b/datajunction-server/tests/construction/build_v3/preagg_substitution_test.py @@ -145,7 +145,7 @@ async def test_limited_metric_no_preagg(self, client_with_build_v3): FROM default.v3.orders o JOIN default.v3.order_items oi ON o.order_id = oi.order_id ) - SELECT t1.status, t1.order_id, t1.order_id order_id_distinct_f93d50ab + SELECT t1.status, t1.order_id FROM v3_order_details t1 GROUP BY t1.status, t1.order_id """, @@ -173,12 +173,12 @@ async def test_limited_metric_no_preagg(self, client_with_build_v3): JOIN default.v3.order_items oi ON o.order_id = oi.order_id ), order_details_0 AS ( - SELECT t1.status, t1.order_id, t1.order_id order_id_distinct_f93d50ab + SELECT t1.status, t1.order_id FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, - COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab) AS order_count + COUNT(DISTINCT order_details_0.order_id) AS order_count FROM order_details_0 GROUP BY order_details_0.status """, @@ -218,7 +218,7 @@ async def test_derived_metric_no_preagg(self, client_with_build_v3): FROM default.v3.orders o JOIN default.v3.order_items oi ON o.order_id = oi.order_id ) - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 GROUP BY t1.status, t1.order_id """, @@ -246,12 +246,12 @@ async def test_derived_metric_no_preagg(self, client_with_build_v3): JOIN default.v3.order_items oi ON o.order_id = oi.order_id ), order_details_0 AS ( - SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696, t1.order_id order_id_distinct_f93d50ab + SELECT t1.status, t1.order_id, SUM(t1.line_total) line_total_sum_e1f61696 FROM v3_order_details t1 GROUP BY t1.status, t1.order_id ) SELECT order_details_0.status AS status, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) AS avg_order_value FROM order_details_0 GROUP BY order_details_0.status """, @@ -386,12 +386,12 @@ async def test_derived_metric_with_preagg(self, client_with_build_v3): WITH order_details_0 AS ( SELECT status, SUM(line_total_sum_e1f61696) line_total_sum_e1f61696, - order_id_distinct_f93d50ab + order_id FROM warehouse.preaggs.v3_order_metrics - GROUP BY status, order_id_distinct_f93d50ab + GROUP BY status, order_id ) SELECT order_details_0.status AS status, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id_distinct_f93d50ab), 0) AS avg_order_value + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT(DISTINCT order_details_0.order_id), 0) AS avg_order_value FROM order_details_0 GROUP BY order_details_0.status """, @@ -574,13 +574,13 @@ async def test_cross_fact_with_partial_preagg(self, client_with_build_v3): GROUP BY customer_id ), page_views_enriched_0 AS ( - SELECT t1.customer_id, t1.customer_id customer_id_distinct_dd4be7a5 + SELECT t1.customer_id FROM v3_page_views_enriched t1 GROUP BY t1.customer_id ) SELECT COALESCE(order_details_0.customer_id, page_views_enriched_0.customer_id) AS customer_id, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5), 0) AS revenue_per_visitor + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id), 0) AS revenue_per_visitor FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.customer_id = page_views_enriched_0.customer_id GROUP BY 1 """, @@ -655,12 +655,12 @@ async def test_cross_fact_with_full_preagg_coverage(self, client_with_build_v3): GROUP BY customer_id ), page_views_enriched_0 AS ( - SELECT t1.customer_id, t1.customer_id customer_id_distinct_dd4be7a5 + SELECT t1.customer_id FROM v3_page_views_enriched t1 GROUP BY t1.customer_id ) SELECT COALESCE(order_details_0.customer_id, page_views_enriched_0.customer_id) AS customer_id, - SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id_distinct_dd4be7a5), 0) AS revenue_per_visitor + SUM(order_details_0.line_total_sum_e1f61696) / NULLIF(COUNT( DISTINCT page_views_enriched_0.customer_id), 0) AS revenue_per_visitor FROM order_details_0 FULL OUTER JOIN page_views_enriched_0 ON order_details_0.customer_id = page_views_enriched_0.customer_id GROUP BY 1 """, @@ -913,94 +913,3 @@ def test_deduplicates_repeated_components(self): ) # Only one component should appear in output despite two in input assert len(result.components) == 1 - - def test_aliases_legacy_bare_measure_col_to_hashed_component_name(self): - """build_grain_group_from_preagg aliases the pre-agg's measure - column to ``component.name`` when they differ. - - Legacy v2 pre-aggs (materialized before plain-column - COUNT(DISTINCT) projected the hashed identity) carry the - column under the bare grain name (e.g. ``account_id``) while - the component identity is hashed (``account_id_distinct_``). - ``component_aliases`` and the CTE's output column must be the - hashed name so the persisted ``derived_expression`` resolves — - the pre-agg path emits `` AS ``. - """ - from datajunction_server.database.availabilitystate import AvailabilityState - - node = self._make_node() - # Plain-column COUNT(DISTINCT account_id) component: name is - # hashed identity, expression is the bare column, LIMITED with - # no aggregation, and merge is None. - component = MetricComponent( - name="account_id_distinct_abc123", - expression="account_id", - aggregation=None, - merge=None, - rule=AggregationRule(type=Aggregability.LIMITED), - grain_alias="account_id", - ) - grain_group = GrainGroup( - parent_node=node, - aggregability=Aggregability.LIMITED, - grain_columns=[], - components=[(node, component)], - ) - - # Legacy pre-agg stores the measure under the BARE grain name. - # expr_hash matches the component so the measure is found. - expr_hash = compute_expression_hash("account_id") - measure = PreAggMeasure( - name="account_id", # legacy bare-named column - expression="account_id", - aggregation=None, - merge=None, - rule=AggregationRule(type=Aggregability.LIMITED), - expr_hash=expr_hash, - ) - avail = AvailabilityState( - catalog="wh", - schema_="preaggs", - table="legacy_tbl", - valid_through_ts=99999, - ) - preagg = PreAggregation( - node_revision_id=1, - grain_columns=[], - measures=[measure], - sql="SELECT 1", - grain_group_hash="abc", - preagg_hash="def", - availability=avail, - ) - - result = build_grain_group_from_preagg( - self._make_ctx(), - grain_group, - preagg, - resolved_dimensions=[], - components_per_metric={}, - ) - # component_aliases routes the hashed identity at component.name - # (matching the live-build path) so downstream combiner - # rewriters reference the hashed column consistently. - assert result.component_aliases == { - "account_id_distinct_abc123": "account_id_distinct_abc123", - } - # The CTE output exposes ``account_id_distinct_abc123`` even - # though the pre-agg physically stores ``account_id``. - column_names = {c.name for c in result.columns} - assert "account_id_distinct_abc123" in column_names - assert "account_id" not in column_names - # The generated SQL selects the bare physical ``account_id`` - # column from the legacy pre-agg and aliases it to the hashed - # identity. ``assert_sql_equal`` ignores whitespace and the - # optional ``AS`` keyword in alias syntax. - assert_sql_equal( - str(result.query), - """ - SELECT account_id account_id_distinct_abc123 - FROM wh.preaggs.legacy_tbl - GROUP BY account_id - """, - ) diff --git a/datajunction-server/tests/construction/build_v3/set_operations_test.py b/datajunction-server/tests/construction/build_v3/set_operations_test.py index 66a601ed9..5097e3480 100644 --- a/datajunction-server/tests/construction/build_v3/set_operations_test.py +++ b/datajunction-server/tests/construction/build_v3/set_operations_test.py @@ -89,12 +89,12 @@ async def test_union_all_transform_metric_generates_sql( WHERE status = 'shipped' ), orders_unified_0 AS ( - SELECT t1.status, t1.order_id, t1.order_id order_id_distinct_a853fcda + SELECT t1.status, t1.order_id FROM v3_orders_unified t1 GROUP BY t1.status, t1.order_id ) SELECT orders_unified_0.status AS status, - COUNT(DISTINCT orders_unified_0.order_id_distinct_a853fcda) AS unified_order_count + COUNT(DISTINCT orders_unified_0.order_id) AS unified_order_count FROM orders_unified_0 GROUP BY orders_unified_0.status """, @@ -138,12 +138,12 @@ async def test_union_transform_filter_pushed_into_primary_arm_only( WHERE status = 'shipped' ), orders_unified_0 AS ( - SELECT t1.status, t1.order_id, t1.order_id order_id_distinct_a853fcda + SELECT t1.status, t1.order_id FROM v3_orders_unified t1 GROUP BY t1.status, t1.order_id ) SELECT orders_unified_0.status AS status, - COUNT(DISTINCT orders_unified_0.order_id_distinct_a853fcda) AS unified_order_count + COUNT(DISTINCT orders_unified_0.order_id) AS unified_order_count FROM orders_unified_0 WHERE orders_unified_0.status = 'completed' GROUP BY orders_unified_0.status @@ -276,7 +276,7 @@ async def test_filter_only_dim_link_on_one_arm_source_only( FROM default.v3.orders_us WHERE src_orders_us.order_date >= 20260101 ) - SELECT t1.status, t1.order_id, t1.order_id order_id_distinct_3d2cb4c8 + SELECT t1.status, t1.order_id FROM v3_orders_global t1 GROUP BY t1.status, t1.order_id """, diff --git a/datajunction-server/tests/construction/build_v3/test_self_join_sql_generation.py b/datajunction-server/tests/construction/build_v3/test_self_join_sql_generation.py index 9ece9f096..d86e1cb73 100644 --- a/datajunction-server/tests/construction/build_v3/test_self_join_sql_generation.py +++ b/datajunction-server/tests/construction/build_v3/test_self_join_sql_generation.py @@ -92,8 +92,7 @@ async def test_direct_self_join_employee_manager( ) SELECT t2.employee_name employee_name_manager, - t1.employee_id, - t1.employee_id employee_id_distinct_d13d9843 + t1.employee_id FROM default_employee t1 LEFT OUTER JOIN default_employee t2 ON t1.manager_employee_id = t2.employee_id GROUP BY @@ -126,8 +125,7 @@ async def test_direct_self_join_employee_manager( employee_0 AS ( SELECT t2.employee_name employee_name_manager, - t1.employee_id, - t1.employee_id employee_id_distinct_d13d9843 + t1.employee_id FROM default_employee t1 LEFT OUTER JOIN default_employee t2 ON t1.manager_employee_id = t2.employee_id GROUP BY @@ -135,7 +133,7 @@ async def test_direct_self_join_employee_manager( ) SELECT employee_0.employee_name_manager AS employee_name_manager, - COUNT(DISTINCT employee_0.employee_id_distinct_d13d9843) AS employee_count + COUNT(DISTINCT employee_0.employee_id) AS employee_count FROM employee_0 GROUP BY employee_0.employee_name_manager