Feat: Support enabling audits in model defaults (#2947)

themisvaltinos · web-flow · commit 8e786fda59da · 2024-07-26T15:55:04.000+03:00
diff --git a/docs/concepts/audits.md b/docs/concepts/audits.md
@@ -88,6 +88,15 @@ SELECT * FROM @this_model
 WHERE @column >= @threshold;
 ```
 
+Alternatively, you can apply specific audits globally by including them in the model defaults configuration:
+
+```yaml linenums="1"
+model_defaults:
+  audits: 
+    - assert_positive_order_ids
+    - does_not_exceed_threshold(column := id, threshold := 1000)
+```
+
 ### Naming
 We recommended avoiding SQL keywords when naming audit parameters. Quote any audit argument that is also a SQL keyword.
 
diff --git a/docs/concepts/overview.md b/docs/concepts/overview.md
@@ -61,7 +61,7 @@ You create audits by writing SQL queries that should return 0 rows. For example,
 
 Audits are flexible &mdash; they can be tied to a specific model's contents, or you can use [macros](./macros/overview.md) to create audits that are usable by multiple models. SQLMesh also includes pre-made audits for common use cases, such as detecting NULL or duplicated values.
 
-You specify which audits should run for a model by including them in the model's metadata properties.
+You specify which audits should run for a model by including them in the model's metadata properties. To apply them globally across your project, include them in the model defaults configuration.
 
 SQLMesh automatically runs audits when you apply a `plan` to an environment, or you can run them on demand with the [`audit` command](../reference/cli.md#audit).
 
diff --git a/docs/reference/model_configuration.md b/docs/reference/model_configuration.md
@@ -51,6 +51,7 @@ The SQLMesh project-level `model_defaults` key supports the following options, d
 - storage_format
 - session_properties (on per key basis)
 - on_destructive_change (described [below](#incremental-models))
+- audits (described [here](../concepts/audits.md#generic-audits))
 
 
 ### Model Naming
diff --git a/sqlmesh/core/config/model.py b/sqlmesh/core/config/model.py
@@ -2,6 +2,7 @@
 
 import typing as t
 
+from sqlmesh.core.dialect import parse_one, extract_audit
 from sqlmesh.core.config.base import BaseConfig
 from sqlmesh.core.model.kind import (
     ModelKind,
@@ -10,6 +11,8 @@
     on_destructive_change_validator,
 )
 from sqlmesh.utils.date import TimeLike
+from sqlmesh.core.model.meta import AuditReference
+from sqlmesh.utils.pydantic import field_validator
 
 
 class ModelDefaultsConfig(BaseConfig):
@@ -27,6 +30,7 @@ class ModelDefaultsConfig(BaseConfig):
         storage_format: The storage format used to store the physical table, only applicable in certain engines.
             (eg. 'parquet')
         on_destructive_change: What should happen when a forward-only model requires a destructive schema change.
+        audits: The audits to be applied globally to all models in the project.
     """
 
     kind: t.Optional[ModelKind] = None
@@ -37,6 +41,14 @@ class ModelDefaultsConfig(BaseConfig):
     storage_format: t.Optional[str] = None
     on_destructive_change: t.Optional[OnDestructiveChange] = None
     session_properties: t.Optional[t.Dict[str, t.Any]] = None
+    audits: t.Optional[t.List[AuditReference]] = None
 
     _model_kind_validator = model_kind_validator
     _on_destructive_change_validator = on_destructive_change_validator
+
+    @field_validator("audits", mode="before")
+    def _audits_validator(cls, v: t.Any) -> t.Any:
+        if isinstance(v, list):
+            return [extract_audit(parse_one(audit)) for audit in v]
+
+        return v
diff --git a/sqlmesh/core/context.py b/sqlmesh/core/context.py
@@ -1907,6 +1907,7 @@ def _nodes_to_snapshots(nodes: t.Dict[str, Node]) -> t.Dict[str, Snapshot]:
                     audits=audits,
                     cache=fingerprint_cache,
                     ttl=ttl,
+                    config=self.config_for_node(node),
                 )
                 snapshots[snapshot.name] = snapshot
             return snapshots
diff --git a/sqlmesh/core/dialect.py b/sqlmesh/core/dialect.py
@@ -21,7 +21,7 @@
 from sqlglot.tokens import Token
 
 from sqlmesh.core.constants import MAX_MODEL_DEFINITION_SIZE
-from sqlmesh.utils.errors import SQLMeshError
+from sqlmesh.utils.errors import SQLMeshError, ConfigError
 from sqlmesh.utils.pandas import columns_to_types_from_df
 
 if t.TYPE_CHECKING:
@@ -1099,3 +1099,24 @@ def interpret_key_value_pairs(
     e: exp.Tuple,
 ) -> t.Dict[str, exp.Expression | str | int | float | bool]:
     return {i.this.name: interpret_expression(i.expression) for i in e.expressions}
+
+
+def extract_audit(v: exp.Expression) -> t.Tuple[str, t.Dict[str, exp.Expression]]:
+    kwargs = {}
+
+    if isinstance(v, exp.Anonymous):
+        func = v.name
+        args = v.expressions
+    elif isinstance(v, exp.Func):
+        func = v.sql_name()
+        args = list(v.args.values())
+    else:
+        return v.name.lower(), {}
+
+    for arg in args:
+        if not isinstance(arg, (exp.PropertyEQ, exp.EQ)):
+            raise ConfigError(
+                f"Function '{func}' must be called with key-value arguments like {func}(arg := value)."
+            )
+        kwargs[arg.left.name.lower()] = arg.right
+    return func.lower(), kwargs
diff --git a/sqlmesh/core/model/definition.py b/sqlmesh/core/model/definition.py
@@ -25,7 +25,7 @@
 from sqlmesh.core.macros import MacroRegistry, MacroStrTemplate, macro
 from sqlmesh.core.model.common import expression_validator
 from sqlmesh.core.model.kind import ModelKindName, SeedKind, ModelKind, FullKind, create_model_kind
-from sqlmesh.core.model.meta import ModelMeta
+from sqlmesh.core.model.meta import ModelMeta, AuditReference
 from sqlmesh.core.model.seed import CsvSeedReader, Seed, create_seed
 from sqlmesh.core.renderer import ExpressionRenderer, QueryRenderer
 from sqlmesh.utils import columns_to_types_all_known, str_to_bool, UniqueKeyDict
@@ -461,7 +461,11 @@ def ctas_query(self, **render_kwarg: t.Any) -> exp.Query:
             )
         return query
 
-    def referenced_audits(self, audits: t.Dict[str, ModelAudit]) -> t.List[ModelAudit]:
+    def referenced_audits(
+        self,
+        audits: t.Dict[str, ModelAudit],
+        default_audits: t.List[AuditReference] = [],
+    ) -> t.List[ModelAudit]:
         """Returns audits referenced in this model.
 
         Args:
@@ -471,7 +475,7 @@ def referenced_audits(self, audits: t.Dict[str, ModelAudit]) -> t.List[ModelAudi
 
         referenced_audits = []
 
-        for audit_name, _ in self.audits:
+        for audit_name, _ in self.audits + default_audits:
             if audit_name in self.inline_audits:
                 referenced_audits.append(self.inline_audits[audit_name])
             elif audit_name in audits:
diff --git a/sqlmesh/core/model/meta.py b/sqlmesh/core/model/meta.py
@@ -10,7 +10,7 @@
 from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
 
 from sqlmesh.core import dialect as d
-from sqlmesh.core.dialect import normalize_model_name
+from sqlmesh.core.dialect import normalize_model_name, extract_audit
 from sqlmesh.core.model.common import (
     bool_validator,
     default_catalog_validator,
@@ -85,32 +85,12 @@ class ModelMeta(_Node):
 
     @field_validator("audits", mode="before")
     def _audits_validator(cls, v: t.Any) -> t.Any:
-        def extract(v: exp.Expression) -> t.Tuple[str, t.Dict[str, exp.Expression]]:
-            kwargs = {}
-
-            if isinstance(v, exp.Anonymous):
-                func = v.name
-                args = v.expressions
-            elif isinstance(v, exp.Func):
-                func = v.sql_name()
-                args = list(v.args.values())
-            else:
-                return v.name.lower(), {}
-
-            for arg in args:
-                if not isinstance(arg, (exp.PropertyEQ, exp.EQ)):
-                    raise ConfigError(
-                        f"Function '{func}' must be called with key-value arguments like {func}(arg := value)."
-                    )
-                kwargs[arg.left.name.lower()] = arg.right
-            return func.lower(), kwargs
-
         if isinstance(v, (exp.Tuple, exp.Array)):
-            return [extract(i) for i in v.expressions]
+            return [extract_audit(i) for i in v.expressions]
         if isinstance(v, exp.Paren):
-            return [extract(v.this)]
+            return [extract_audit(v.this)]
         if isinstance(v, exp.Expression):
-            return [extract(v)]
+            return [extract_audit(v)]
         if isinstance(v, list):
             audits = []
 
diff --git a/sqlmesh/core/snapshot/definition.py b/sqlmesh/core/snapshot/definition.py
@@ -46,6 +46,7 @@
 if t.TYPE_CHECKING:
     from sqlglot.dialects.dialect import DialectType
     from sqlmesh.core.environment import EnvironmentNamingInfo
+    from sqlmesh.core.config import Config
 
 Interval = t.Tuple[int, int]
 Intervals = t.List[Interval]
@@ -617,6 +618,7 @@ def from_node(
         version: t.Optional[str] = None,
         audits: t.Optional[t.Dict[str, ModelAudit]] = None,
         cache: t.Optional[t.Dict[str, SnapshotFingerprint]] = None,
+        config: t.Optional[Config] = None,
     ) -> Snapshot:
         """Creates a new snapshot for a node.
 
@@ -634,8 +636,13 @@ def from_node(
         """
         created_ts = now_timestamp()
         kwargs = {}
+        default_audits = (
+            config.model_defaults.audits if (config and config.model_defaults.audits) else []
+        )
         if node.is_model:
-            kwargs["audits"] = tuple(t.cast(_Model, node).referenced_audits(audits or {}))
+            kwargs["audits"] = tuple(
+                t.cast(_Model, node).referenced_audits(audits or {}, default_audits)
+            )
 
         return cls(
             name=node.fqn,
diff --git a/tests/core/test_config.py b/tests/core/test_config.py
@@ -557,3 +557,30 @@ def test_load_duckdb_attach_config(tmp_path):
     assert attach_config_2.type == "postgres"
     assert attach_config_2.path == "dbname=postgres user=postgres host=127.0.0.1"
     assert attach_config_2.read_only is True
+
+
+def test_load_model_defaults_audits(tmp_path):
+    config_path = tmp_path / "config_model_defaults_audits.yaml"
+    with open(config_path, "w", encoding="utf-8") as fd:
+        fd.write(
+            """
+model_defaults:
+    dialect: ''
+    audits: 
+        - assert_positive_order_ids
+        - does_not_exceed_threshold(column := id, threshold := 1000)
+        """
+        )
+
+    config = load_config_from_paths(
+        Config,
+        project_paths=[config_path],
+    )
+
+    assert len(config.model_defaults.audits) == 2
+    assert config.model_defaults.audits[0] == ("assert_positive_order_ids", {})
+    assert config.model_defaults.audits[1][0] == "does_not_exceed_threshold"
+    assert type(config.model_defaults.audits[1][1]["column"]) == exp.Column
+    assert config.model_defaults.audits[1][1]["column"].this.this == "id"
+    assert type(config.model_defaults.audits[1][1]["threshold"]) == exp.Literal
+    assert config.model_defaults.audits[1][1]["threshold"].this == "1000"
diff --git a/tests/core/test_model.py b/tests/core/test_model.py
@@ -15,6 +15,7 @@
 
 from sqlmesh.core import constants as c
 from sqlmesh.core import dialect as d
+from sqlmesh.core.audit import ModelAudit
 from sqlmesh.core.config import (
     Config,
     NameInferenceConfig,
@@ -969,6 +970,39 @@ def test_audits():
     assert model.tags == ["foo"]
 
 
+def test_enable_audits_from_model_defaults():
+    expressions = d.parse(
+        """
+        MODEL (
+            name db.audit_model,
+        );
+        SELECT 1 as id;
+
+        AUDIT (
+    name assert_positive_order_ids,
+    );
+    SELECT *
+    FROM @this_model
+    WHERE
+    id < 0;
+    """
+    )
+
+    model = load_sql_based_model(expressions, path=Path("./examples/sushi/models/test_model.sql"))
+    assert len(model.audits) == 0
+    assert len(model.inline_audits) == 1
+
+    config = Config(
+        model_defaults=ModelDefaultsConfig(dialect="duckdb", audits=["assert_positive_order_ids"])
+    )
+    assert config.model_defaults.audits[0] == ("assert_positive_order_ids", {})
+
+    snapshot = Snapshot.from_node(model, nodes={}, config=config)
+    assert len(snapshot.audits) == 1
+    assert type(snapshot.audits[0]) == ModelAudit
+    assert snapshot.audits[0].query.sql() == "SELECT * FROM @this_model WHERE id < 0"
+
+
 def test_description(sushi_context):
     assert sushi_context.models['"memory"."sushi"."orders"'].description == "Table of sushi orders."
 

Original file line number	Diff line number	Diff line change
`@@ -1907,6 +1907,7 @@ def _nodes_to_snapshots(nodes: t.Dict[str, Node]) -> t.Dict[str, Snapshot]:`
`1907`	`1907`	`audits=audits,`
`1908`	`1908`	`cache=fingerprint_cache,`
`1909`	`1909`	`ttl=ttl,`
	`1910`	`+ config=self.config_for_node(node),`
`1910`	`1911`	`)`
`1911`	`1912`	`snapshots[snapshot.name] = snapshot`
`1912`	`1913`	`return snapshots`