Skip to content

Commit 34a5862

Browse files
committed
feat!: support unicode characters
1 parent a255e17 commit 34a5862

File tree

6 files changed

+65
-8
lines changed

6 files changed

+65
-8
lines changed

.circleci/continue_config.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,8 @@ workflows:
300300
name: cloud_engine_<< matrix.engine >>
301301
context:
302302
- sqlmesh_cloud_database_integration
303-
requires:
304-
- engine_tests_docker
303+
# requires:
304+
# - engine_tests_docker
305305
matrix:
306306
parameters:
307307
engine:
@@ -313,10 +313,10 @@ workflows:
313313
- athena
314314
- fabric
315315
- gcp-postgres
316-
filters:
317-
branches:
318-
only:
319-
- main
316+
# filters:
317+
# branches:
318+
# only:
319+
# - main
320320
- ui_style
321321
- ui_test
322322
- vscode_test

sqlmesh/core/engine_adapter/databricks.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ class DatabricksEngineAdapter(SparkEngineAdapter):
3434
SUPPORTS_CLONING = True
3535
SUPPORTS_MATERIALIZED_VIEWS = True
3636
SUPPORTS_MATERIALIZED_VIEW_SCHEMA = True
37+
# Spark has this set to false for compatibility when mixing with Trino but that isn't a concern with Databricks
38+
QUOTE_IDENTIFIERS_IN_VIEWS = True
3739
SCHEMA_DIFFER_KWARGS = {
3840
"support_positional_add": True,
3941
"nested_support": NestedSupport.ALL,

sqlmesh/utils/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import types
1414
import typing as t
1515
import uuid
16+
import unicodedata
1617
from dataclasses import dataclass
1718
from collections import defaultdict
1819
from contextlib import contextmanager
@@ -289,11 +290,13 @@ def sqlglot_dialects() -> str:
289290
return "'" + "', '".join(Dialects.__members__.values()) + "'"
290291

291292

292-
NON_ALNUM = re.compile(r"[^a-zA-Z0-9_]")
293+
NON_WORD = re.compile(r"\W", flags=re.UNICODE)
293294

294295

295296
def sanitize_name(name: str) -> str:
296-
return NON_ALNUM.sub("_", name)
297+
s = unicodedata.normalize("NFC", name)
298+
s = NON_WORD.sub("_", s)
299+
return s
297300

298301

299302
def groupby(

tests/core/engine_adapter/integration/test_integration.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3990,3 +3990,32 @@ def _set_config(gateway: str, config: Config) -> None:
39903990
was_evaluated=True,
39913991
day_delta=4,
39923992
)
3993+
3994+
3995+
def test_unicode_characters(ctx: TestContext, tmp_path: Path):
3996+
if ctx.dialect in ["spark", "trino"]:
3997+
# It is possible that Trino could support this if we changed `QUOTE_IDENTIFIERS_IN_VIEWS` but that would
3998+
# break the compatibility it has when being mixed with Spark for compute
3999+
pytest.skip("Skipping as these engines have issues with unicode characters in model names")
4000+
4001+
model_name = "客户数据"
4002+
table = ctx.table(model_name).sql(dialect=ctx.dialect)
4003+
(tmp_path / "models").mkdir(exist_ok=True)
4004+
4005+
model_def = f"""
4006+
MODEL (
4007+
name {table},
4008+
kind FULL,
4009+
dialect '{ctx.dialect}'
4010+
);
4011+
SELECT 1 as id
4012+
"""
4013+
4014+
(tmp_path / "models" / "客户数据.sql").write_text(model_def)
4015+
4016+
context = ctx.create_context(path=tmp_path)
4017+
context.plan(auto_apply=True, no_prompts=True)
4018+
4019+
results = ctx.get_metadata_results()
4020+
assert len(results.views) == 1
4021+
assert results.views[0].lower() == model_name

tests/utils/__init__.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import pytest
2+
3+
from sqlmesh.utils import sanitize_name
4+
5+
6+
@pytest.mark.parametrize(
7+
"raw,expected",
8+
[
9+
("simple", "simple"),
10+
("snake_case", "snake_case"),
11+
("客户数据", "客户数据"), # pure Chinese kept
12+
("客户-数据 v2", "客户_数据_v2"), # dash/space -> underscore
13+
("中文,逗号", "中文_逗号"), # full-width comma -> underscore
14+
("a/b", "a_b"), # slash -> underscore
15+
("spaces\tand\nnewlines", "spaces_and_newlines"),
16+
("data📦2025", "data_2025"),
17+
("MiXeD123_名字", "MiXeD123_名字"),
18+
("", ""),
19+
],
20+
)
21+
def test_sanitize_known_cases(raw, expected):
22+
assert sanitize_name(raw) == expected

tests/utils/test_cache.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def test_file_cache(tmp_path: Path, mocker: MockerFixture):
3939
loader.assert_called_once()
4040

4141
assert "___test_model_" in cache._cache_entry_path('"test_model"').name
42+
assert "客户数据" in cache._cache_entry_path("客户数据").name
4243

4344

4445
def test_optimized_query_cache(tmp_path: Path, mocker: MockerFixture):

0 commit comments

Comments
 (0)