Skip to content

Commit 679c0fd

Browse files
tobymao and izeigerman
authored and committed
fix: better downstream and identifier handling (#3757)
1 parent ad66273 commit 679c0fd

File tree

2 files changed

+61
-21
lines changed

2 files changed

+61
-21
lines changed

sqlmesh/core/selector.py

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66

77
from sqlglot import exp
88
from sqlglot.errors import ParseError
9-
from sqlglot.tokens import Token, Tokenizer, TokenType
10-
from sqlglot.dialects.dialect import DialectType
9+
from sqlglot.tokens import Token, TokenType, Tokenizer as BaseTokenizer
10+
from sqlglot.dialects.dialect import Dialect, DialectType
1111
from sqlglot.helper import seq_get
1212

1313
from sqlmesh.core.dialect import normalize_model_name
@@ -230,20 +230,25 @@ def evaluate(node: exp.Expression) -> t.Set[str]:
230230
return evaluate(node)
231231

232232

233-
class SelectorTokenizer(Tokenizer):
234-
SINGLE_TOKENS = {
235-
"(": TokenType.L_PAREN,
236-
")": TokenType.R_PAREN,
237-
"&": TokenType.AMP,
238-
"|": TokenType.PIPE,
239-
"^": TokenType.CARET,
240-
"+": TokenType.PLUS,
241-
"*": TokenType.STAR,
242-
":": TokenType.COLON,
243-
}
233+
class SelectorDialect(Dialect):
234+
IDENTIFIERS_CAN_START_WITH_DIGIT = True
244235

245-
KEYWORDS = {}
246-
IDENTIFIERS: t.List[str | t.Tuple[str, str]] = []
236+
class Tokenizer(BaseTokenizer):
237+
SINGLE_TOKENS = {
238+
"(": TokenType.L_PAREN,
239+
")": TokenType.R_PAREN,
240+
"&": TokenType.AMP,
241+
"|": TokenType.PIPE,
242+
"^": TokenType.CARET,
243+
"+": TokenType.PLUS,
244+
"*": TokenType.STAR,
245+
":": TokenType.COLON,
246+
}
247+
248+
KEYWORDS = {}
249+
IDENTIFIERS = ["\\"] # there are no identifiers but need to put something here
250+
IDENTIFIER_START = ""
251+
IDENTIFIER_END = ""
247252

248253

249254
class Git(exp.Expression):
@@ -259,7 +264,7 @@ class Direction(exp.Expression):
259264

260265

261266
def parse(selector: str, dialect: DialectType = None) -> exp.Expression:
262-
tokens = SelectorTokenizer().tokenize(selector)
267+
tokens = SelectorDialect().tokenize(selector)
263268
i = 0
264269

265270
def _curr() -> t.Optional[Token]:
@@ -304,29 +309,32 @@ def _parse_kind(kind: str) -> bool:
304309

305310
def _parse_var() -> exp.Expression:
306311
upstream = _match(TokenType.PLUS)
312+
downstream = None
307313
tag = _parse_kind("tag")
308314
git = False if tag else _parse_kind("git")
309315
lstar = "*" if _match(TokenType.STAR) else ""
310316
directions = {}
311317

312-
if _match(TokenType.VAR):
318+
if _match(TokenType.VAR) or _match(TokenType.NUMBER):
313319
name = _prev().text
314320
rstar = "*" if _match(TokenType.STAR) else ""
315321
downstream = _match(TokenType.PLUS)
316322
this: exp.Expression = exp.Var(this=f"{lstar}{name}{rstar}")
317323

318-
if upstream:
319-
directions["up"] = True
320-
if downstream:
321-
directions["down"] = True
322324
elif _match(TokenType.L_PAREN):
323325
this = exp.Paren(this=_parse_conjunction())
326+
downstream = _match(TokenType.PLUS)
324327
_match(TokenType.R_PAREN, True)
325328
elif lstar:
326329
this = exp.var("*")
327330
else:
328331
raise ParseError(_error("Expected model name."))
329332

333+
if upstream:
334+
directions["up"] = True
335+
if downstream:
336+
directions["down"] = True
337+
330338
if tag:
331339
this = Tag(this=this)
332340
if git:

tests/core/test_selector.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,38 @@ def test_select_models_missing_env(mocker: MockerFixture, make_snapshot):
512512
["model* & ^(tag:tag1 | tag:tag2)"],
513513
{'"model3"'},
514514
),
515+
(
516+
[
517+
("model1", "tag1", None),
518+
("model2", "tag2", {"model1"}),
519+
("model3", "tag3", {"model1"}),
520+
],
521+
["(model1*)+"],
522+
{'"model1"', '"model2"', '"model3"'},
523+
),
524+
(
525+
[
526+
("model1", "tag1", None),
527+
("model2", "tag2", {"model1"}),
528+
("model3", "tag3", {"model2"}),
529+
],
530+
["+(+model2*+)+"],
531+
{'"model1"', '"model2"', '"model3"'},
532+
),
533+
(
534+
[
535+
("model1", "tag1", None),
536+
("model2", "tag2", {"model1"}),
537+
("model3", "tag3", {"model1"}),
538+
],
539+
["(model* & ^*1)+"],
540+
{'"model2"', '"model3"'},
541+
),
542+
(
543+
[("model2", "tag1", None), ("model2_1", "tag2", None), ("model2_2", "tag3", None)],
544+
["*2_*"],
545+
{'"model2_1"', '"model2_2"'},
546+
),
515547
],
516548
)
517549
def test_expand_model_selections(

0 commit comments

Comments (0)