Skip to content

Commit 0d86edc

Browse files
authored
feat: add the ability to have audits on external models (#2715)
1 parent 6758651 commit 0d86edc

File tree

11 files changed

+133
-29
lines changed

11 files changed

+133
-29
lines changed

docs/concepts/models/external_models.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,4 +91,24 @@ Files in the `external_models` directory must be `.yaml` files that follow the s
9191

9292
When SQLMesh loads the definitions, it will first load the models defined in `external_models.yaml` (or `schema.yaml`) and any models found in `external_models/*.yaml`.
9393

94-
Therefore, you can use `sqlmesh create_external_models` to manage the `external_models.yaml` file and then put any models that need to be defined manually inside the `external_models/` directory.
94+
Therefore, you can use `sqlmesh create_external_models` to manage the `external_models.yaml` file and then put any models that need to be defined manually inside the `external_models/` directory.
95+
96+
### External Audits
97+
It is possible to define [audits](../audits.md) on external models. This can be useful to check the data quality of upstream dependencies before your internal models are evaluated.
98+
99+
This example shows an external model with two audits.
100+
101+
```yaml
102+
- name: raw.demographics
103+
description: Table containing demographics information
104+
audits:
105+
- name: not_null
106+
columns: "[customer_id]"
107+
- name: accepted_range
108+
column: zip
109+
min_v: "'00000'"
110+
max_v: "'99999'"
111+
columns:
112+
customer_id: int
113+
zip: text
114+
```

examples/sushi/config.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,15 @@
2424
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
2525

2626

27+
defaults = {"dialect": "duckdb"}
28+
model_defaults = ModelDefaultsConfig(**defaults)
29+
model_defaults_iceberg = ModelDefaultsConfig(**defaults, storage_format="iceberg")
30+
31+
2732
# An in memory DuckDB config.
2833
config = Config(
2934
default_connection=DuckDBConnectionConfig(),
30-
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
35+
model_defaults=model_defaults,
3136
)
3237

3338
bigquery_config = Config(
@@ -38,21 +43,21 @@
3843
)
3944
},
4045
default_gateway="bq",
41-
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
46+
model_defaults=model_defaults,
4247
)
4348

4449
# A configuration used for SQLMesh tests.
4550
test_config = Config(
4651
gateways={"in_memory": GatewayConfig(connection=DuckDBConnectionConfig())},
4752
default_gateway="in_memory",
4853
plan=PlanConfig(auto_categorize_changes=CategorizerConfig(sql=AutoCategorizationMode.SEMI)),
49-
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
54+
model_defaults=model_defaults,
5055
)
5156

5257
# A stateful DuckDB config.
5358
local_config = Config(
5459
default_connection=DuckDBConnectionConfig(database=f"{DATA_DIR}/local.duckdb"),
55-
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
60+
model_defaults=model_defaults,
5661
)
5762

5863
airflow_config = Config(
@@ -65,21 +70,21 @@
6570
},
6671
)
6772
),
68-
model_defaults=ModelDefaultsConfig(dialect="duckdb", storage_format="iceberg"),
73+
model_defaults=model_defaults_iceberg,
6974
)
7075

7176

7277
airflow_config_docker = Config(
7378
default_scheduler=AirflowSchedulerConfig(airflow_url="http://airflow-webserver:8080/"),
7479
gateways=GatewayConfig(connection=SparkConnectionConfig()),
75-
model_defaults=ModelDefaultsConfig(dialect="duckdb", storage_format="iceberg"),
80+
model_defaults=model_defaults_iceberg,
7681
)
7782

7883
# A DuckDB config with a physical schema map.
7984
map_config = Config(
8085
default_connection=DuckDBConnectionConfig(),
8186
physical_schema_override={"sushi": "company_internal"},
82-
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
87+
model_defaults=model_defaults,
8388
)
8489

8590

@@ -114,13 +119,13 @@
114119
],
115120
),
116121
],
117-
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
122+
model_defaults=model_defaults,
118123
)
119124

120125

121126
environment_suffix_config = Config(
122127
default_connection=DuckDBConnectionConfig(),
123-
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
128+
model_defaults=model_defaults,
124129
environment_suffix_target=EnvironmentSuffixTarget.TABLE,
125130
)
126131

@@ -133,7 +138,7 @@
133138
local_catalogs = Config(
134139
default_connection=DuckDBConnectionConfig(catalogs=CATALOGS),
135140
default_test_connection=DuckDBConnectionConfig(catalogs=CATALOGS),
136-
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
141+
model_defaults=model_defaults,
137142
)
138143

139144
environment_catalog_mapping_config = Config(
@@ -144,7 +149,7 @@
144149
"dev_catalog": ":memory:",
145150
}
146151
),
147-
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
152+
model_defaults=model_defaults,
148153
environment_suffix_target=EnvironmentSuffixTarget.TABLE,
149154
environment_catalog_mapping={
150155
"^prod$": "prod_catalog",
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
- name: raw.demographics
22
description: Table containing demographics information
3+
start: 1 week ago
4+
audits:
5+
- name: not_null
6+
columns: "[customer_id]"
7+
- name: accepted_range
8+
column: zip
9+
min_v: "'00000'"
10+
max_v: "'99999'"
311
columns:
412
customer_id: int
513
zip: text

sqlmesh/core/loader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ def _load_external_models(self) -> UniqueKeyDict[str, Model]:
203203
model = create_external_model(
204204
**row,
205205
dialect=config.model_defaults.dialect,
206+
defaults=config.model_defaults.dict(),
206207
path=path,
207208
project=config.project,
208209
default_catalog=self._context.default_catalog,

sqlmesh/core/model/definition.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1791,6 +1791,7 @@ def create_external_model(
17911791
*,
17921792
dialect: t.Optional[str] = None,
17931793
path: Path = Path(),
1794+
defaults: t.Optional[t.Dict[str, t.Any]] = None,
17941795
**kwargs: t.Any,
17951796
) -> Model:
17961797
"""Creates an external model.
@@ -1804,6 +1805,7 @@ def create_external_model(
18041805
return _create_model(
18051806
ExternalModel,
18061807
name,
1808+
defaults=defaults,
18071809
dialect=dialect,
18081810
path=path,
18091811
kind=ModelKindName.EXTERNAL.value,

sqlmesh/core/model/meta.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -110,16 +110,29 @@ def extract(v: exp.Expression) -> t.Tuple[str, t.Dict[str, exp.Expression]]:
110110
if isinstance(v, exp.Expression):
111111
return [extract(v)]
112112
if isinstance(v, list):
113-
return [
114-
(
115-
entry[0].lower(),
116-
{
117-
key: d.parse(value)[0] if isinstance(value, str) else value
118-
for key, value in entry[1].items()
119-
},
113+
audits = []
114+
115+
for entry in v:
116+
if isinstance(entry, dict):
117+
args = entry
118+
name = entry.pop("name")
119+
elif isinstance(entry, (tuple, list)):
120+
name, args = entry
121+
else:
122+
raise ConfigError(f"Audit must be a dictionary or named tuple. Got {entry}.")
123+
124+
audits.append(
125+
(
126+
name.lower(),
127+
{
128+
key: d.parse_one(value) if isinstance(value, str) else value
129+
for key, value in args.items()
130+
},
131+
)
120132
)
121-
for entry in v
122-
]
133+
134+
return audits
135+
123136
return v
124137

125138
@field_validator("tags", mode="before")

sqlmesh/core/snapshot/definition.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,9 @@ def _table_name(self, version: str, is_deployable: bool) -> str:
319319
version: The snapshot version.
320320
is_deployable: Indicates whether to return the table name for deployment to production.
321321
"""
322+
if self.is_external:
323+
return self.name
324+
322325
is_dev_table = not is_deployable
323326
if is_dev_table:
324327
version = self.temp_version_get_or_generate()
@@ -742,6 +745,11 @@ def merge_intervals(self, other: t.Union[Snapshot, SnapshotIntervals]) -> None:
742745
for start, end in other.dev_intervals:
743746
self.add_interval(start, end, is_dev=True)
744747

748+
@property
749+
def evaluatable(self) -> bool:
750+
"""Whether or not a snapshot should be evaluated and have intervals."""
751+
return bool(not self.is_symbolic or self.model.audits)
752+
745753
def missing_intervals(
746754
self,
747755
start: TimeLike,
@@ -761,7 +769,6 @@ def missing_intervals(
761769
start: The start date/time of the interval (inclusive)
762770
end: The end date/time of the interval (inclusive if the type is date, exclusive otherwise)
763771
execution_time: The date/time time reference to use for execution time. Defaults to now.
764-
restatements: A set of snapshot names being restated
765772
deployability_index: Determines snapshots that are deployable in the context of this evaluation.
766773
ignore_cron: Whether to ignore the node's cron schedule.
767774
end_bounded: If set to true, the returned intervals will be bounded by the target end date, disregarding lookback,
@@ -795,7 +802,7 @@ def missing_intervals(
795802
self.intervals if deployability_index.is_representative(self) else self.dev_intervals
796803
)
797804

798-
if self.is_symbolic or (self.is_seed and intervals):
805+
if not self.evaluatable or (self.is_seed and intervals):
799806
return []
800807

801808
allow_partials = not end_bounded and self.is_model and self.model.allow_partials
@@ -1502,7 +1509,7 @@ def missing_intervals(
15021509
deployability_index = deployability_index or DeployabilityIndex.all_deployable()
15031510

15041511
for snapshot in snapshots:
1505-
if snapshot.is_symbolic:
1512+
if not snapshot.evaluatable:
15061513
continue
15071514
interval = restatements.get(snapshot.snapshot_id)
15081515
snapshot_start_date = start_dt

tests/core/test_integration.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2084,9 +2084,9 @@ def validate_state_sync_environment(
20842084
def validate_tables(snapshots: t.Iterable[Snapshot], context: Context) -> None:
20852085
adapter = context.engine_adapter
20862086
for snapshot in snapshots:
2087-
if not snapshot.is_model:
2087+
if not snapshot.is_model or snapshot.is_external:
20882088
continue
2089-
table_should_exist = not snapshot.is_symbolic
2089+
table_should_exist = not snapshot.is_embedded
20902090
assert adapter.table_exists(snapshot.table_name()) == table_should_exist
20912091
if table_should_exist:
20922092
assert select_all(snapshot.table_name(), adapter)

tests/core/test_scheduler.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22

33
import pytest
44
from pytest_mock.plugin import MockerFixture
5-
from sqlglot import parse_one
5+
from sqlglot import parse_one, parse
66

77
from sqlmesh.core.context import Context
88
from sqlmesh.core.environment import EnvironmentNamingInfo
9+
from sqlmesh.core.model import load_sql_based_model
910
from sqlmesh.core.model.definition import SqlModel
1011
from sqlmesh.core.model.kind import (
1112
IncrementalByTimeRangeKind,
@@ -14,7 +15,7 @@
1415
)
1516
from sqlmesh.core.node import IntervalUnit
1617
from sqlmesh.core.scheduler import Scheduler, compute_interval_params
17-
from sqlmesh.core.snapshot import Snapshot, SnapshotEvaluator
18+
from sqlmesh.core.snapshot import Snapshot, SnapshotEvaluator, SnapshotChangeCategory
1819
from sqlmesh.utils.date import to_datetime
1920
from sqlmesh.utils.errors import CircuitBreakerError
2021

@@ -417,3 +418,43 @@ def test_intervals_with_end_date_on_model(mocker: MockerFixture, make_snapshot):
417418
# generate for future days to ensure no future batches are loaded
418419
snapshot_to_batches = scheduler.batches(start="2023-02-01", end="2023-02-28")
419420
assert len(snapshot_to_batches) == 0
421+
422+
423+
def test_external_model_audit(mocker, make_snapshot):
424+
model = load_sql_based_model(
425+
parse( # type: ignore
426+
"""
427+
MODEL (
428+
name test_schema.test_model,
429+
kind EXTERNAL,
430+
columns (id int),
431+
audits not_null(columns := id)
432+
);
433+
434+
SELECT 1;
435+
"""
436+
),
437+
)
438+
439+
snapshot = make_snapshot(model)
440+
snapshot.categorize_as(SnapshotChangeCategory.BREAKING)
441+
442+
evaluator = SnapshotEvaluator(adapter=mocker.MagicMock())
443+
spy = mocker.spy(evaluator, "_audit")
444+
445+
scheduler = Scheduler(
446+
snapshots=[snapshot],
447+
snapshot_evaluator=evaluator,
448+
state_sync=mocker.MagicMock(),
449+
max_workers=2,
450+
default_catalog=None,
451+
)
452+
453+
scheduler.run(
454+
EnvironmentNamingInfo(),
455+
"2022-01-01",
456+
"2022-01-01",
457+
"2022-01-30",
458+
)
459+
460+
spy.assert_called_once()

tests/core/test_snapshot.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1746,3 +1746,10 @@ def test_missing_intervals_node_start_end(make_snapshot):
17461746
assert missing_intervals([snapshot], start="2024-03-01", end=to_datetime("2024-03-10")) == {}
17471747
assert missing_intervals([snapshot], start="2024-03-13", end="2024-03-14") == {}
17481748
assert missing_intervals([snapshot], start="2024-03-14", end="2024-03-30") == {}
1749+
1750+
1751+
def test_external_model_audits(sushi_context):
1752+
snapshot = sushi_context.get_snapshot("raw.demographics")
1753+
assert snapshot.evaluatable
1754+
assert len(snapshot.model.audits) == 2
1755+
assert snapshot.intervals

0 commit comments

Comments (0)