Skip to content

Commit 0f6bcf0

Browse files
committed
fix: support unicode characters in file cache
1 parent a255e17 commit 0f6bcf0

File tree

4 files changed

+33
-2
lines changed

4 files changed

+33
-2
lines changed

sqlmesh/utils/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from functools import lru_cache, reduce, wraps
2222
from pathlib import Path
2323

24+
import unicodedata
2425
from sqlglot import exp
2526
from sqlglot.dialects.dialect import Dialects
2627

@@ -291,8 +292,14 @@ def sqlglot_dialects() -> str:
291292

292293
# Matches every character outside the ASCII set [a-zA-Z0-9_].
NON_ALNUM = re.compile(r"[^a-zA-Z0-9_]")

# Matches every character that is not a Unicode word character. For ``str``
# patterns ``\W`` is Unicode-aware by default in Python 3, so passing
# ``re.UNICODE`` explicitly is redundant.
NON_ALNUM_INCLUDE_UNICODE = re.compile(r"\W")

# Backward-compatible alias for the originally published (misspelled) name.
NON_ALUM_INCLUDE_UNICODE = NON_ALNUM_INCLUDE_UNICODE


def sanitize_name(name: str, *, include_unicode: bool = False) -> str:
    """Replace characters that are unsafe in identifiers/file names with ``_``.

    Args:
        name: The raw name to sanitize.
        include_unicode: When False (the default), every character outside
            ``[a-zA-Z0-9_]`` is replaced. When True, the name is first
            normalized to NFC and only non-word characters (``\\W``) are
            replaced, so Unicode letters and digits are preserved.

    Returns:
        The sanitized name. Each replaced character yields exactly one
        underscore; runs of underscores are not collapsed.
    """
    if include_unicode:
        # NFC first, so a base letter followed by a combining mark becomes a
        # single precomposed word character instead of "letter + _".
        normalized = unicodedata.normalize("NFC", name)
        return NON_ALNUM_INCLUDE_UNICODE.sub("_", normalized)
    return NON_ALNUM.sub("_", name)
297304

298305

sqlmesh/utils/cache.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def clear(self) -> None:
133133

134134
def _cache_entry_path(self, name: str, entry_id: str = "") -> Path:
135135
entry_file_name = "__".join(p for p in (self._cache_version, name, entry_id) if p)
136-
full_path = self._path / sanitize_name(entry_file_name)
136+
full_path = self._path / sanitize_name(entry_file_name, include_unicode=True)
137137
if IS_WINDOWS:
138138
# handle paths longer than 260 chars
139139
full_path = fix_windows_path(full_path)

tests/utils/__init__.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import pytest
2+
3+
from sqlmesh.utils import sanitize_name
4+
5+
6+
# NOTE(review): this test is shown under tests/utils/__init__.py; pytest's
# default ``python_files`` patterns (test_*.py / *_test.py) would not collect
# it unless the project overrides them -- confirm it actually runs.
# NOTE(review): the function name looks truncated ("..._no_") -- confirm the
# intended name.
@pytest.mark.parametrize(
    ("raw", "exclude_unicode", "include_unicode"),
    [
        # Plain ASCII identifiers pass through unchanged in both modes.
        ("simple", "simple", "simple"),
        ("snake_case", "snake_case", "snake_case"),
        # CJK characters are replaced in ASCII mode and kept in Unicode mode.
        ("客户数据", "____", "客户数据"),
        ("客户-数据 v2", "______v2", "客户_数据_v2"),
        ("中文,逗号", "_____", "中文_逗号"),
        # Separators and whitespace are replaced in both modes.
        ("a/b", "a_b", "a_b"),
        ("spaces\tand\nnewlines", "spaces_and_newlines", "spaces_and_newlines"),
        # Emoji are not word characters, so they are replaced in both modes.
        ("data📦2025", "data_2025", "data_2025"),
        ("MiXeD123_名字", "MiXeD123___", "MiXeD123_名字"),
        ("", "", ""),
    ],
)
def test_sanitize_name_no_(raw, exclude_unicode, include_unicode):
    """Check sanitize_name in its default mode and with include_unicode=True."""
    ascii_only = sanitize_name(raw)
    assert ascii_only == exclude_unicode

    unicode_aware = sanitize_name(raw, include_unicode=True)
    assert unicode_aware == include_unicode

tests/utils/test_cache.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def test_file_cache(tmp_path: Path, mocker: MockerFixture):
3939
loader.assert_called_once()
4040

4141
assert "___test_model_" in cache._cache_entry_path('"test_model"').name
42+
assert "客户数据" in cache._cache_entry_path("客户数据").name
4243

4344

4445
def test_optimized_query_cache(tmp_path: Path, mocker: MockerFixture):

0 commit comments

Comments
 (0)