feat!: support unicode characters

eakmanrq · eakmanrq · commit 34a58623fc62 · 2025-09-30T11:39:59.000-07:00
diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml
@@ -300,8 +300,8 @@ workflows:
           name: cloud_engine_<< matrix.engine >>
           context:
             - sqlmesh_cloud_database_integration
-          requires:
-            - engine_tests_docker
+#          requires:
+#            - engine_tests_docker
           matrix:
             parameters:
               engine:
@@ -313,10 +313,10 @@ workflows:
                 - athena
                 - fabric
                 - gcp-postgres
-          filters:
-            branches:
-              only:
-                - main
+#          filters:
+#            branches:
+#              only:
+#                - main
       - ui_style
       - ui_test
       - vscode_test
diff --git a/sqlmesh/core/engine_adapter/databricks.py b/sqlmesh/core/engine_adapter/databricks.py
@@ -34,6 +34,8 @@ class DatabricksEngineAdapter(SparkEngineAdapter):
     SUPPORTS_CLONING = True
     SUPPORTS_MATERIALIZED_VIEWS = True
     SUPPORTS_MATERIALIZED_VIEW_SCHEMA = True
+    # Spark sets this to False for compatibility when mixed with Trino, but that isn't a concern with Databricks
+    QUOTE_IDENTIFIERS_IN_VIEWS = True
     SCHEMA_DIFFER_KWARGS = {
         "support_positional_add": True,
         "nested_support": NestedSupport.ALL,
diff --git a/sqlmesh/utils/__init__.py b/sqlmesh/utils/__init__.py
@@ -13,6 +13,7 @@
 import types
 import typing as t
 import uuid
+import unicodedata
 from dataclasses import dataclass
 from collections import defaultdict
 from contextlib import contextmanager
@@ -289,11 +290,13 @@ def sqlglot_dialects() -> str:
     return "'" + "', '".join(Dialects.__members__.values()) + "'"
 
 
-NON_ALNUM = re.compile(r"[^a-zA-Z0-9_]")
+NON_WORD = re.compile(r"\W", flags=re.UNICODE)
 
 
 def sanitize_name(name: str) -> str:
-    return NON_ALNUM.sub("_", name)
+    s = unicodedata.normalize("NFC", name)
+    s = NON_WORD.sub("_", s)
+    return s
 
 
 def groupby(
diff --git a/tests/core/engine_adapter/integration/test_integration.py b/tests/core/engine_adapter/integration/test_integration.py
@@ -3990,3 +3990,32 @@ def _set_config(gateway: str, config: Config) -> None:
         was_evaluated=True,
         day_delta=4,
     )
+
+
+def test_unicode_characters(ctx: TestContext, tmp_path: Path):
+    if ctx.dialect in ["spark", "trino"]:
+        # It is possible that Trino could support this if we changed `QUOTE_IDENTIFIERS_IN_VIEWS` but that would
+        # break the compatibility it has when being mixed with Spark for compute
+        pytest.skip("Skipping as these engines have issues with unicode characters in model names")
+
+    model_name = "客户数据"
+    table = ctx.table(model_name).sql(dialect=ctx.dialect)
+    (tmp_path / "models").mkdir(exist_ok=True)
+
+    model_def = f"""
+    MODEL (
+        name {table},
+        kind FULL,
+        dialect '{ctx.dialect}'
+    );
+    SELECT 1 as id
+    """
+
+    (tmp_path / "models" / "客户数据.sql").write_text(model_def)
+
+    context = ctx.create_context(path=tmp_path)
+    context.plan(auto_apply=True, no_prompts=True)
+
+    results = ctx.get_metadata_results()
+    assert len(results.views) == 1
+    assert results.views[0].lower() == model_name
diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py
@@ -0,0 +1,22 @@
+import pytest
+
+from sqlmesh.utils import sanitize_name
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("simple", "simple"),
+        ("snake_case", "snake_case"),
+        ("客户数据", "客户数据"),  # pure Chinese kept
+        ("客户-数据 v2", "客户_数据_v2"),  # dash/space -> underscore
+        ("中文，逗号", "中文_逗号"),  # full-width comma -> underscore
+        ("a/b", "a_b"),  # slash -> underscore
+        ("spaces\tand\nnewlines", "spaces_and_newlines"),
+        ("data📦2025", "data_2025"),
+        ("MiXeD123_名字", "MiXeD123_名字"),
+        ("", ""),
+    ],
+)
+def test_sanitize_known_cases(raw, expected):
+    assert sanitize_name(raw) == expected
diff --git a/tests/utils/test_cache.py b/tests/utils/test_cache.py
@@ -39,6 +39,7 @@ def test_file_cache(tmp_path: Path, mocker: MockerFixture):
     loader.assert_called_once()
 
     assert "___test_model_" in cache._cache_entry_path('"test_model"').name
+    assert "客户数据" in cache._cache_entry_path("客户数据").name
 
 
 def test_optimized_query_cache(tmp_path: Path, mocker: MockerFixture):