Feat: BigQuery - Handle forward_only changes to clustered_by (#3231)

erindru · web-flow · commit 9300f2724bb3 · 2024-10-10T10:26:10.000+13:00
diff --git a/sqlmesh/core/engine_adapter/bigquery.py b/sqlmesh/core/engine_adapter/bigquery.py
@@ -5,7 +5,7 @@
 from collections import defaultdict
 
 import pandas as pd
-from sqlglot import exp
+from sqlglot import exp, parse_one
 from sqlglot.transforms import remove_precision_parameterized_types
 
 from sqlmesh.core.dialect import to_schema
@@ -40,6 +40,9 @@
 NestedField = t.Tuple[str, str, t.List[str]]
 NestedFieldsDict = t.Dict[str, t.List[NestedField]]
 
+# used to tag AST nodes to be specially handled in alter_table()
+_CLUSTERING_META_KEY = "__sqlmesh_update_table_clustering"
+
 
 @set_catalog()
 class BigQueryEngineAdapter(InsertOverwriteWithMergeMixin, ClusteredByMixin):
@@ -243,6 +246,18 @@ def alter_table(
         if nested_fields:
             self._update_table_schema_nested_fields(nested_fields, alter_expressions[0].this)
 
+        # this is easier than trying to detect exp.Cluster nodes
+        # or exp.Command nodes that contain the string "DROP CLUSTERING KEY"
+        clustering_change_operations = [
+            e for e in non_nested_expressions if _CLUSTERING_META_KEY in e.meta
+        ]
+        for op in clustering_change_operations:
+            non_nested_expressions.remove(op)
+            table, cluster_by = op.meta[_CLUSTERING_META_KEY]
+            assert isinstance(table, str) or isinstance(table, exp.Table)
+
+            self._update_clustering_key(table, cluster_by)
+
         if non_nested_expressions:
             super().alter_table(non_nested_expressions)
 
@@ -847,25 +862,55 @@ def _get_data_objects(
         # resort to using SQL instead.
         schema = to_schema(schema_name)
         catalog = schema.catalog or self.default_catalog
-        query = exp.select(
-            exp.column("table_catalog").as_("catalog"),
-            exp.column("table_name").as_("name"),
-            exp.column("table_schema").as_("schema_name"),
-            exp.case()
-            .when(exp.column("table_type").eq("BASE TABLE"), exp.Literal.string("TABLE"))
-            .when(exp.column("table_type").eq("CLONE"), exp.Literal.string("TABLE"))
-            .when(exp.column("table_type").eq("EXTERNAL"), exp.Literal.string("TABLE"))
-            .when(exp.column("table_type").eq("SNAPSHOT"), exp.Literal.string("TABLE"))
-            .when(exp.column("table_type").eq("VIEW"), exp.Literal.string("VIEW"))
-            .when(
-                exp.column("table_type").eq("MATERIALIZED VIEW"),
-                exp.Literal.string("MATERIALIZED_VIEW"),
+        query = (
+            exp.select(
+                exp.column("table_catalog").as_("catalog"),
+                exp.column("table_name").as_("name"),
+                exp.column("table_schema").as_("schema_name"),
+                exp.case()
+                .when(exp.column("table_type").eq("BASE TABLE"), exp.Literal.string("TABLE"))
+                .when(exp.column("table_type").eq("CLONE"), exp.Literal.string("TABLE"))
+                .when(exp.column("table_type").eq("EXTERNAL"), exp.Literal.string("TABLE"))
+                .when(exp.column("table_type").eq("SNAPSHOT"), exp.Literal.string("TABLE"))
+                .when(exp.column("table_type").eq("VIEW"), exp.Literal.string("VIEW"))
+                .when(
+                    exp.column("table_type").eq("MATERIALIZED VIEW"),
+                    exp.Literal.string("MATERIALIZED_VIEW"),
+                )
+                .else_(exp.column("table_type"))
+                .as_("type"),
+                exp.column("clustering_key", "ci").as_("clustering_key"),
+            )
+            .with_(
+                "clustering_info",
+                as_=exp.select(
+                    exp.column("table_catalog"),
+                    exp.column("table_schema"),
+                    exp.column("table_name"),
+                    parse_one(
+                        "string_agg(column_name order by clustering_ordinal_position)",
+                        dialect=self.dialect,
+                    ).as_("clustering_key"),
+                )
+                .from_(
+                    exp.to_table(
+                        f"`{catalog}`.`{schema.db}`.INFORMATION_SCHEMA.COLUMNS",
+                        dialect=self.dialect,
+                    )
+                )
+                .where(exp.column("clustering_ordinal_position").is_(exp.not_(exp.null())))
+                .group_by("1", "2", "3"),
             )
-            .else_(exp.column("table_type"))
-            .as_("type"),
-        ).from_(
-            exp.to_table(
-                f"`{catalog}`.`{schema.db}`.INFORMATION_SCHEMA.TABLES", dialect=self.dialect
+            .from_(
+                exp.to_table(
+                    f"`{catalog}`.`{schema.db}`.INFORMATION_SCHEMA.TABLES", dialect=self.dialect
+                )
+            )
+            .join(
+                "clustering_info",
+                using=["table_catalog", "table_schema", "table_name"],
+                join_type="left",
+                join_alias="ci",
             )
         )
         if object_names:
@@ -886,10 +931,41 @@ def _get_data_objects(
                 schema=row.schema_name,  # type: ignore
                 name=row.name,  # type: ignore
                 type=DataObjectType.from_str(row.type),  # type: ignore
+                clustering_key=f"({row.clustering_key})" if row.clustering_key else None,  # type: ignore
             )
             for row in df.itertuples()
         ]
 
+    def _change_clustering_key_expr(
+        self, table: exp.Table, cluster_by: t.List[exp.Expression]
+    ) -> exp.Alter:  # builds the Alter node via super(), then tags it for alter_table()
+        expr = super()._change_clustering_key_expr(table=table, cluster_by=cluster_by)
+        expr.meta[_CLUSTERING_META_KEY] = (table, cluster_by)  # alter_table() pulls tagged nodes out and calls _update_clustering_key()
+        return expr
+
+    def _drop_clustering_key_expr(self, table: exp.Table) -> exp.Alter:  # same tagging as the change case
+        expr = super()._drop_clustering_key_expr(table=table)
+        expr.meta[_CLUSTERING_META_KEY] = (table, None)  # cluster_by=None makes _update_clustering_key() drop the key
+        return expr
+
+    def _update_clustering_key(
+        self, table_name: TableName, cluster_by: t.Optional[t.List[exp.Expression]]
+    ) -> None:  # sets or drops the clustering key through self.client.update_table instead of a SQL ALTER
+        cluster_by = cluster_by or []  # treat None the same as an empty list
+        bq_table = self._get_table(table_name)
+
+        rendered_columns = [c.sql(dialect=self.dialect) for c in cluster_by]  # each expression rendered to a SQL string
+        bq_table.clustering_fields = (
+            rendered_columns or None
+        )  # causes a drop of the key if cluster_by is empty or None
+
+        self._db_call(self.client.update_table, table=bq_table, fields=["clustering_fields"])  # presumably limits the update to this one property - confirm against the client docs
+
+        if cluster_by:
+            # BigQuery only applies new clustering going forward, so this rewrites the clustering columns to apply the new clustering to existing data
+            # ref: https://cloud.google.com/bigquery/docs/creating-clustered-tables#modifying-cluster-spec
+            self.execute(exp.update(table_name, {c: c for c in cluster_by}, where=exp.true()))  # UPDATE ... SET c = c WHERE TRUE: values unchanged, every row touched
+
     @property
     def _query_data(self) -> t.Any:
         return self._connection_pool.get_attribute("query_data")
@@ -971,7 +1047,7 @@ def select_partitions_expr(
     """Generates a SQL expression that aggregates partition values for a table.
 
     Args:
-        schema: The schema (BigQueyr dataset) of the table.
+        schema: The schema (BigQuery dataset) of the table.
         table_name: The name of the table.
         data_type: The data type of the partition column.
         granularity: The granularity of the partition. Supported values are: 'day', 'month', 'year' and 'hour'.
diff --git a/sqlmesh/core/engine_adapter/mixins.py b/sqlmesh/core/engine_adapter/mixins.py
@@ -3,7 +3,8 @@
 import logging
 import typing as t
 
-from sqlglot import exp
+from sqlglot import exp, parse_one
+from sqlglot.helper import seq_get
 
 from sqlmesh.core.engine_adapter.base import EngineAdapter
 from sqlmesh.core.engine_adapter.shared import InsertOverwriteStrategy, SourceQuery
@@ -337,3 +338,61 @@ def _build_clustered_by_exp(
         **kwargs: t.Any,
     ) -> t.Optional[exp.Cluster]:
         return exp.Cluster(expressions=[exp.column(col) for col in clustered_by])
+
+    def _parse_clustering_key(self, clustering_key: t.Optional[str]) -> t.List[exp.Expression]:
+        if not clustering_key:  # None or "" means there is no key to parse
+            return []
+
+        # Note: assumes `clustering_key` is a string like:
+        # - "(col_a)"
+        # - "(col_a, col_b)"
+        # - "func(col_a, transform(col_b))"
+        parsed_cluster_key = parse_one(clustering_key, dialect=self.dialect)
+
+        return parsed_cluster_key.expressions or [parsed_cluster_key.this]  # falls back to .this when .expressions is empty (presumably the single-column "(col_a)" case - confirm)
+
+    def get_alter_expressions(
+        self, current_table_name: TableName, target_table_name: TableName
+    ) -> t.List[exp.Alter]:  # parent's alter expressions plus, when needed, one clustering change or drop
+        expressions = super().get_alter_expressions(current_table_name, target_table_name)
+
+        # check for a change in clustering by comparing the data objects of both tables
+        current_table = exp.to_table(current_table_name)
+        target_table = exp.to_table(target_table_name)
+
+        current_table_info = seq_get(
+            self.get_data_objects(current_table.db, {current_table.name}), 0  # NOTE(review): only .db is passed, any catalog on the table is not forwarded - confirm this is intended
+        )
+        target_table_info = seq_get(self.get_data_objects(target_table.db, {target_table.name}), 0)
+
+        if current_table_info and target_table_info:  # compare only when both lookups returned an object
+            if target_table_info.is_clustered:
+                if target_table_info.clustering_key and (
+                    current_table_info.clustering_key != target_table_info.clustering_key  # keys are compared as raw strings
+                ):
+                    expressions.append(
+                        self._change_clustering_key_expr(
+                            current_table,  # the ALTER is issued against the current table...
+                            self._parse_clustering_key(target_table_info.clustering_key),  # ...using the target table's key
+                        )
+                    )
+            elif current_table_info.is_clustered:  # target has no key but current does, so drop it
+                expressions.append(self._drop_clustering_key_expr(current_table))
+
+        return expressions
+
+    def _change_clustering_key_expr(
+        self, table: exp.Table, cluster_by: t.List[exp.Expression]
+    ) -> exp.Alter:  # Alter node on `table` with one exp.Cluster action; a separate method so adapters can override it
+        return exp.Alter(
+            this=table,
+            kind="TABLE",
+            actions=[exp.Cluster(expressions=cluster_by)],  # the new clustering expressions, in the order given
+        )
+
+    def _drop_clustering_key_expr(self, table: exp.Table) -> exp.Alter:  # Alter node on `table` that drops its clustering key
+        return exp.Alter(
+            this=table,
+            kind="TABLE",
+            actions=[exp.Command(this="DROP", expression="CLUSTERING KEY")],  # the drop is expressed as a raw exp.Command, not a dedicated node
+        )
diff --git a/sqlmesh/core/engine_adapter/shared.py b/sqlmesh/core/engine_adapter/shared.py
@@ -164,6 +164,13 @@ class DataObject(PydanticModel):
     name: str
     type: DataObjectType
 
+    # for type=DataObjectType.Table, only if the DB supports it
+    clustering_key: t.Optional[str] = None
+
+    @property
+    def is_clustered(self) -> bool:  # derived from clustering_key; nothing is stored separately
+        return bool(self.clustering_key)  # None and "" both count as not clustered
+
 
 class CatalogSupport(Enum):
     UNSUPPORTED = 1
diff --git a/sqlmesh/core/engine_adapter/snowflake.py b/sqlmesh/core/engine_adapter/snowflake.py
@@ -6,8 +6,7 @@
 
 import pandas as pd
 from pandas.api.types import is_datetime64_any_dtype  # type: ignore
-from sqlglot import exp, parse_one
-from sqlglot.helper import seq_get
+from sqlglot import exp
 from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
 from sqlglot.optimizer.qualify_columns import quote_identifiers
 
@@ -33,14 +32,6 @@
     from sqlmesh.core.node import IntervalUnit
 
 
-class SnowflakeDataObject(DataObject):
-    clustering_key: t.Optional[str] = None
-
-    @property
-    def is_clustered(self) -> bool:
-        return bool(self.clustering_key)
-
-
 @set_catalog(
     override_mapping={
         "_get_data_objects": CatalogSupport.REQUIRES_SET_CATALOG,
@@ -348,7 +339,7 @@ def _get_data_objects(
         if df.empty:
             return []
         return [
-            SnowflakeDataObject(
+            DataObject(
                 catalog=row.catalog,  # type: ignore
                 schema=row.schema_name,  # type: ignore
                 name=row.name,  # type: ignore
@@ -433,50 +424,3 @@ def _create_column_comments(
                 f"Column comments for table '{table.alias_or_name}' not registered - this may be due to limited permissions.",
                 exc_info=True,
             )
-
-    def get_alter_expressions(
-        self, current_table_name: TableName, target_table_name: TableName
-    ) -> t.List[exp.Alter]:
-        schema_expressions = super().get_alter_expressions(current_table_name, target_table_name)
-        additional_expressions = []
-
-        # check for a change in clustering
-        current_table = exp.to_table(current_table_name)
-        target_table = exp.to_table(target_table_name)
-
-        current_table_info = t.cast(
-            SnowflakeDataObject,
-            seq_get(self.get_data_objects(current_table.db, {current_table.name}), 0),
-        )
-        target_table_info = t.cast(
-            SnowflakeDataObject,
-            seq_get(self.get_data_objects(target_table.db, {target_table.name}), 0),
-        )
-
-        if current_table_info and target_table_info:
-            if target_table_info.is_clustered:
-                if target_table_info.clustering_key and (
-                    current_table_info.clustering_key != target_table_info.clustering_key
-                ):
-                    # Note: If you create a table with eg `CLUSTER BY (c2, c1)` and read the info back from information_schema,
-                    # it gets returned as a string like "LINEAR(c2, c1)" which we need to parse back into a list of columns
-                    parsed_cluster_key = parse_one(
-                        target_table_info.clustering_key, dialect=self.dialect
-                    )
-                    additional_expressions.append(
-                        exp.Alter(
-                            this=current_table,
-                            kind="TABLE",
-                            actions=[exp.Cluster(expressions=parsed_cluster_key.expressions)],
-                        )
-                    )
-            elif current_table_info.is_clustered:
-                additional_expressions.append(
-                    exp.Alter(
-                        this=current_table,
-                        kind="TABLE",
-                        actions=[exp.Command(this="DROP", expression="CLUSTERING KEY")],
-                    )
-                )
-
-        return schema_expressions + additional_expressions
diff --git a/tests/core/engine_adapter/integration/test_integration_bigquery.py b/tests/core/engine_adapter/integration/test_integration_bigquery.py
diff --git a/tests/core/engine_adapter/integration/test_integration_snowflake.py b/tests/core/engine_adapter/integration/test_integration_snowflake.py