Skip to content

Commit d2c4025

Browse files
authored
Fix!: Snowflake adapter (#2870)
1 parent 7c1ed83 commit d2c4025

File tree

6 files changed

+224
-39
lines changed

6 files changed

+224
-39
lines changed

sqlmesh/core/engine_adapter/base.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2055,13 +2055,15 @@ def _get_data_objects(
20552055
"""
20562056
raise NotImplementedError()
20572057

2058-
def _get_temp_table(self, table: TableName, table_only: bool = False) -> exp.Table:
2058+
def _get_temp_table(
2059+
self, table: TableName, table_only: bool = False, quoted: bool = True
2060+
) -> exp.Table:
20592061
"""
20602062
Returns the name of the temp table that should be used for the given table name.
20612063
"""
20622064
table = t.cast(exp.Table, exp.to_table(table).copy())
20632065
table.set(
2064-
"this", exp.to_identifier(f"__temp_{table.name}_{random_id(short=True)}", quoted=True)
2066+
"this", exp.to_identifier(f"__temp_{table.name}_{random_id(short=True)}", quoted=quoted)
20652067
)
20662068

20672069
if table_only:

sqlmesh/core/engine_adapter/snowflake.py

Lines changed: 62 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,9 @@ def _df_to_source_queries(
200200
batch_size: int,
201201
target_table: TableName,
202202
) -> t.List[SourceQuery]:
203-
temp_table = self._get_temp_table(target_table or "pandas")
203+
temp_table = self._get_temp_table(
204+
target_table or "pandas", quoted=False
205+
) # write_pandas() re-quotes everything without checking if it's already quoted
204206

205207
def query_factory() -> Query:
206208
if snowpark and isinstance(df, snowpark.dataframe.DataFrame):
@@ -211,10 +213,10 @@ def query_factory() -> Query:
211213
# Workaround for https://github.com/snowflakedb/snowflake-connector-python/issues/1034
212214
# The above issue has already been fixed upstream, but we keep the following
213215
# line anyway in order to support a wider range of Snowflake versions.
214-
schema = f'"{temp_table.db}"'
216+
schema = temp_table.db
215217
if temp_table.catalog:
216-
schema = f'"{temp_table.catalog}".{schema}'
217-
self.cursor.execute(f"USE SCHEMA {schema}")
218+
schema = f"{temp_table.catalog}.{schema}"
219+
self.set_current_schema(schema)
218220

219221
# See: https://stackoverflow.com/a/75627721
220222
for column, kind in columns_to_types.items():
@@ -240,10 +242,14 @@ def query_factory() -> Query:
240242
df,
241243
temp_table.name,
242244
schema=temp_table.db or None,
243-
database=temp_table.catalog or None,
245+
database=normalize_identifiers(temp_table.catalog, dialect=self.dialect).sql(
246+
dialect=self.dialect
247+
)
248+
if temp_table.catalog
249+
else None,
244250
chunk_size=self.DEFAULT_BATCH_SIZE,
245251
overwrite=True,
246-
table_type="temp", # if you dont have this, it will convert the table we created above into a normal table and it wont get dropped when the session ends
252+
table_type="temp",
247253
)
248254
else:
249255
raise SQLMeshError(
@@ -252,7 +258,13 @@ def query_factory() -> Query:
252258

253259
return exp.select(*self._casted_columns(columns_to_types)).from_(temp_table)
254260

255-
return [SourceQuery(query_factory=query_factory)]
261+
# the cleanup_func technically isn't needed because the temp table gets dropped when the session ends
262+
# but boy does it make our multi-adapter integration tests easier to write
263+
return [
264+
SourceQuery(
265+
query_factory=query_factory, cleanup_func=lambda: self.drop_table(temp_table)
266+
)
267+
]
256268

257269
def _fetch_native_df(
258270
self, query: t.Union[exp.Expression, str], quote_identifiers: bool = False
@@ -280,6 +292,7 @@ def _get_data_objects(
280292

281293
schema = to_schema(schema_name)
282294
catalog_name = schema.catalog or self.get_current_catalog()
295+
283296
query = (
284297
exp.select(
285298
exp.column("TABLE_CATALOG").as_("catalog"),
@@ -308,6 +321,8 @@ def _get_data_objects(
308321
)
309322
.from_(exp.table_("TABLES", db="INFORMATION_SCHEMA", catalog=catalog_name))
310323
.where(exp.column("TABLE_SCHEMA").eq(schema.db))
324+
# Snowflake seems to have delayed internal metadata updates and will sometimes return duplicates
325+
.distinct()
311326
)
312327
if object_names:
313328
query = query.where(exp.column("TABLE_NAME").isin(*object_names))
@@ -328,10 +343,49 @@ def _get_data_objects(
328343
def set_current_catalog(self, catalog: str) -> None:
329344
self.execute(exp.Use(this=exp.to_identifier(catalog)))
330345

346+
def set_current_schema(self, schema: str) -> None:
347+
self.execute(exp.Use(kind="SCHEMA", this=to_schema(schema)))
348+
349+
def _to_sql(self, expression: exp.Expression, quote: bool = True, **kwargs: t.Any) -> str:
350+
# note: important to use self._default_catalog instead of the self.default_catalog property
351+
# otherwise we get RecursionError: maximum recursion depth exceeded
352+
# because it calls get_current_catalog(), which executes a query, which needs the default catalog, which calls get_current_catalog()... etc
353+
if self._default_catalog:
354+
# the purpose of this function is to identify instances where the default catalog is being used
355+
# (so that we can replace it with the actual catalog as specified in the gateway)
356+
#
357+
# we can't do a direct string comparison because the catalog value on the model
358+
# gets changed when it's normalized as part of generating `model.fqn`
359+
def unquote_and_lower(identifier: str) -> str:
360+
return exp.parse_identifier(identifier).name.lower()
361+
362+
default_catalog_unquoted = unquote_and_lower(self._default_catalog)
363+
default_catalog_normalized = normalize_identifiers(
364+
self._default_catalog, dialect=self.dialect
365+
)
366+
367+
def catalog_rewriter(node: exp.Expression) -> exp.Expression:
368+
if isinstance(node, exp.Table):
369+
if node.catalog:
370+
# only replace the catalog on the model with the target catalog if the two are functionally equivalent
371+
if unquote_and_lower(node.catalog) == default_catalog_unquoted:
372+
node.set("catalog", default_catalog_normalized)
373+
elif isinstance(node, exp.Use) and isinstance(node.this, exp.Identifier):
374+
if unquote_and_lower(node.this.output_name) == default_catalog_unquoted:
375+
node.set("this", default_catalog_normalized)
376+
return node
377+
378+
# Rewrite whatever default catalog is present on the query to be compatible with what the user supplied in the
379+
# Snowflake connection config. This is because the catalog present on the model gets normalized and quoted to match
380+
# the source dialect, which isn't always compatible with Snowflake
381+
expression = expression.transform(catalog_rewriter)
382+
383+
return super()._to_sql(expression=expression, quote=quote, **kwargs)
384+
331385
def _build_create_comment_column_exp(
332386
self, table: exp.Table, column_name: str, column_comment: str, table_kind: str = "TABLE"
333387
) -> exp.Comment | str:
334-
table_sql = table.sql(dialect=self.dialect, identify=True)
388+
table_sql = self._to_sql(table) # so that catalog replacement happens
335389
column_sql = exp.column(column_name).sql(dialect=self.dialect, identify=True)
336390

337391
truncated_comment = self._truncate_column_comment(column_comment)

sqlmesh/core/engine_adapter/spark.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -284,9 +284,7 @@ def _ensure_pyspark_df(
284284
return self.spark.createDataFrame(df, **kwargs) # type: ignore
285285

286286
def _get_temp_table(
287-
self,
288-
table: TableName,
289-
table_only: bool = False,
287+
self, table: TableName, table_only: bool = False, quoted: bool = True
290288
) -> exp.Table:
291289
"""
292290
Returns the name of the temp table that should be used for the given table name.

tests/conftest.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@
1717
from sqlglot import exp, maybe_parse, parse_one
1818
from sqlglot.dialects.dialect import DialectType
1919
from sqlglot.helper import ensure_list
20+
from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
2021

2122
from sqlmesh.core.config import DuckDBConnectionConfig
2223
from sqlmesh.core.context import Context
2324
from sqlmesh.core.engine_adapter import SparkEngineAdapter
2425
from sqlmesh.core.engine_adapter.base import EngineAdapter
26+
from sqlmesh.core.environment import EnvironmentNamingInfo
2527
from sqlmesh.core.macros import macro
2628
from sqlmesh.core.model import IncrementalByTimeRangeKind, SqlModel, model
2729
from sqlmesh.core.model.kind import OnDestructiveChange
@@ -126,7 +128,28 @@ def validate(
126128
*,
127129
env_name: t.Optional[str] = None,
128130
dialect: t.Optional[str] = None,
131+
environment_naming_info: t.Optional[EnvironmentNamingInfo] = None,
129132
) -> t.Dict[t.Any, t.Any]:
133+
if (
134+
env_name
135+
and dialect
136+
and environment_naming_info
137+
and environment_naming_info.normalize_name
138+
):
139+
# if the environment_naming_info was configured to normalize names, then Snapshot.qualified_view_name.table_for_environment()
140+
# returns schemas that contain the environment_name normalised for that engine
141+
#
142+
# in practice, this means "test_prod" becomes "TEST_PROD" on some engines so the final views are named like:
143+
#
144+
# "sushi__TEST_PROD"."waiter_as_customer_by_day"
145+
#
146+
# instead of:
147+
#
148+
# "sushi__test_prod"."waiter_as_customer_by_day"
149+
#
150+
# this matters for reading the data back below to validate it
151+
env_name = normalize_identifiers(env_name, dialect=dialect).name
152+
130153
"""
131154
Both start and end are inclusive.
132155
"""

tests/core/engine_adapter/test_integration.py

Lines changed: 50 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import pandas as pd
1212
import pytest
1313
from sqlglot import exp, parse_one
14+
from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
1415

1516
from sqlmesh import Config, Context, EngineAdapter
1617
from sqlmesh.cli.example_project import init_example_project
@@ -51,6 +52,7 @@ def __init__(
5152
self.gateway = gateway
5253
self._columns_to_types = columns_to_types
5354
self.test_id = random_id(short=True)
55+
self._context = None
5456

5557
@property
5658
def columns_to_types(self):
@@ -411,11 +413,14 @@ def create_context(
411413
self._context = Context(paths=".", config=config, gateway=self.gateway)
412414
return self._context
413415

414-
def cleanup(self, ctx: Context):
415-
schemas = []
416-
for _, model in ctx.models.items():
417-
schemas.append(model.schema_name)
418-
schemas.append(model.physical_schema)
416+
def cleanup(self, ctx: t.Optional[Context] = None):
417+
schemas = [self.schema(TEST_SCHEMA)]
418+
419+
ctx = ctx or self._context
420+
if ctx and ctx.models:
421+
for _, model in ctx.models.items():
422+
schemas.append(model.schema_name)
423+
schemas.append(model.physical_schema)
419424

420425
for schema_name in set(schemas):
421426
self.engine_adapter.drop_schema(
@@ -662,6 +667,14 @@ def ctx(engine_adapter, test_type, mark_gateway):
662667
return TestContext(test_type, engine_adapter, gateway)
663668

664669

670+
@pytest.fixture(autouse=True)
671+
def cleanup(ctx: TestContext):
672+
yield # run test
673+
674+
if ctx:
675+
ctx.cleanup()
676+
677+
665678
def test_catalog_operations(ctx: TestContext):
666679
if (
667680
ctx.engine_adapter.CATALOG_SUPPORT.is_unsupported
@@ -691,11 +704,11 @@ def test_catalog_operations(ctx: TestContext):
691704
ctx.engine_adapter.execute(f'CREATE DATABASE IF NOT EXISTS "{catalog_name}"')
692705
except Exception:
693706
pass
694-
current_catalog = ctx.engine_adapter.get_current_catalog()
707+
current_catalog = ctx.engine_adapter.get_current_catalog().lower()
695708
ctx.engine_adapter.set_current_catalog(catalog_name)
696-
assert ctx.engine_adapter.get_current_catalog() == catalog_name
709+
assert ctx.engine_adapter.get_current_catalog().lower() == catalog_name
697710
ctx.engine_adapter.set_current_catalog(current_catalog)
698-
assert ctx.engine_adapter.get_current_catalog() == current_catalog
711+
assert ctx.engine_adapter.get_current_catalog().lower() == current_catalog
699712

700713

701714
def test_drop_schema_catalog(ctx: TestContext, caplog):
@@ -782,21 +795,14 @@ def test_temp_table(ctx: TestContext):
782795
)
783796
table = ctx.table("example")
784797

785-
# The snowflake adapter persists the DataFrame to an intermediate table because we use the `write_pandas()` function from the Snowflake python library
786-
# Other adapters just use SQLGlot to convert the dataframe directly into a SELECT query
787-
expected_tables = 2 if ctx.dialect == "snowflake" and ctx.test_type == "df" else 1
788798
with ctx.engine_adapter.temp_table(ctx.input_data(input_data), table.sql()) as table_name:
789799
results = ctx.get_metadata_results()
790800
assert len(results.views) == 0
791-
assert len(results.tables) == expected_tables
801+
assert len(results.tables) == 1
792802
assert len(results.non_temp_tables) == 0
793803
assert len(results.materialized_views) == 0
794804
ctx.compare_with_current(table_name, input_data)
795805

796-
if ctx.dialect == "snowflake":
797-
# force the next query to create a new connection to prove temp tables have been dropped
798-
ctx.engine_adapter._connection_pool.close()
799-
800806
results = ctx.get_metadata_results()
801807
assert len(results.views) == len(results.tables) == len(results.non_temp_tables) == 0
802808

@@ -1735,6 +1741,14 @@ def test_sushi(mark_gateway: t.Tuple[str, str], ctx: TestContext):
17351741
personal_paths=[pathlib.Path("~/.sqlmesh/config.yaml").expanduser()],
17361742
)
17371743
_, gateway = mark_gateway
1744+
1745+
# clear cache from prior runs
1746+
cache_dir = pathlib.Path("./examples/sushi/.cache")
1747+
if cache_dir.exists():
1748+
import shutil
1749+
1750+
shutil.rmtree(cache_dir)
1751+
17381752
context = Context(paths="./examples/sushi", config=config, gateway=gateway)
17391753

17401754
# clean up any leftover schemas from previous runs (requires context)
@@ -1769,7 +1783,7 @@ def test_sushi(mark_gateway: t.Tuple[str, str], ctx: TestContext):
17691783

17701784
context._models.update({cust_rev_by_day_key: cust_rev_by_day_model_tbl_props})
17711785

1772-
context.plan(
1786+
plan: Plan = context.plan(
17731787
environment="test_prod",
17741788
start=start,
17751789
end=end,
@@ -1785,6 +1799,7 @@ def test_sushi(mark_gateway: t.Tuple[str, str], ctx: TestContext):
17851799
yesterday(),
17861800
env_name="test_prod",
17871801
dialect=ctx.dialect,
1802+
environment_naming_info=plan.environment_naming_info,
17881803
)
17891804

17901805
# Ensure table and column comments were correctly registered with engine
@@ -1977,10 +1992,13 @@ def validate_no_comments(
19771992
# confirm physical temp table comments are not registered
19781993
validate_no_comments("sqlmesh__sushi", table_name_suffix="__temp", check_temp_tables=True)
19791994
# confirm view layer comments are not registered in non-PROD environment
1980-
validate_no_comments("sushi__test_prod", is_physical_layer=False)
1995+
env_name = "test_prod"
1996+
if plan.environment_naming_info and plan.environment_naming_info.normalize_name:
1997+
env_name = normalize_identifiers(env_name, dialect=ctx.dialect).name
1998+
validate_no_comments(f"sushi__{env_name}", is_physical_layer=False)
19811999

19822000
# Ensure that the plan has been applied successfully.
1983-
no_change_plan = context.plan(
2001+
no_change_plan: Plan = context.plan(
19842002
environment="test_dev",
19852003
start=start,
19862004
end=end,
@@ -2000,6 +2018,7 @@ def validate_no_comments(
20002018
yesterday(),
20012019
env_name="test_dev",
20022020
dialect=ctx.dialect,
2021+
environment_naming_info=no_change_plan.environment_naming_info,
20032022
)
20042023

20052024
# confirm view layer comments are registered in PROD
@@ -2051,7 +2070,7 @@ def test_init_project(ctx: TestContext, mark_gateway: t.Tuple[str, str], tmp_pat
20512070
assert len(physical_layer_results.tables) == len(physical_layer_results.non_temp_tables) == 6
20522071

20532072
# make and validate unmodified dev environment
2054-
no_change_plan = context.plan(
2073+
no_change_plan: Plan = context.plan(
20552074
environment="test_dev",
20562075
skip_tests=True,
20572076
no_prompts=True,
@@ -2062,7 +2081,12 @@ def test_init_project(ctx: TestContext, mark_gateway: t.Tuple[str, str], tmp_pat
20622081

20632082
context.apply(no_change_plan)
20642083

2065-
dev_schema_results = ctx.get_metadata_results("sqlmesh_example__test_dev")
2084+
environment = no_change_plan.environment
2085+
first_snapshot = no_change_plan.environment.snapshots[0]
2086+
schema_name = first_snapshot.qualified_view_name.schema_for_environment(
2087+
environment, dialect=ctx.dialect
2088+
)
2089+
dev_schema_results = ctx.get_metadata_results(schema_name)
20662090
assert sorted(dev_schema_results.views) == [
20672091
"full_model",
20682092
"incremental_model",
@@ -2234,6 +2258,7 @@ def _mutate_config(current_gateway_name: str, config: Config):
22342258
connection.concurrent_tasks = 1
22352259

22362260
context = ctx.create_context(_mutate_config)
2261+
assert context.default_dialect == "duckdb"
22372262

22382263
schema = ctx.schema(TEST_SCHEMA)
22392264
seed_query = ctx.input_data(
@@ -2278,13 +2303,13 @@ def _mutate_config(current_gateway_name: str, config: Config):
22782303
try:
22792304
context.plan(auto_apply=True, no_prompts=True)
22802305

2281-
results = ctx.get_metadata_results(schema)
2306+
test_model = context.get_model(f"{schema}.test_model")
2307+
normalized_schema_name = test_model.fully_qualified_table.db
2308+
results = ctx.get_metadata_results(normalized_schema_name)
22822309
assert "test_model" in results.views
22832310

22842311
actual_df = (
2285-
ctx.get_current_data(f"{schema}.test_model")
2286-
.sort_values(by="event_date")
2287-
.reset_index(drop=True)
2312+
ctx.get_current_data(test_model.fqn).sort_values(by="event_date").reset_index(drop=True)
22882313
)
22892314
actual_df["event_date"] = actual_df["event_date"].astype(str)
22902315
assert actual_df.count()[0] == 3

0 commit comments

Comments
 (0)