Commit f96488c

feat: add batch_size support to scd type 2 kinds (#4220)

1 parent 98519c2 commit f96488c

File tree

8 files changed: +160 additions, -17 deletions

docs/concepts/models/model_kinds.md

Lines changed: 67 additions & 10 deletions
@@ -1241,12 +1241,13 @@ This is the most accurate representation of the menu based on the source data pr

 ### Shared Configuration Options

 | Name                    | Description                                                                                                     | Type                      |
 |-------------------------|-----------------------------------------------------------------------------------------------------------------|---------------------------|
 | unique_key              | Unique key used for identifying rows between source and target                                                  | List of strings or string |
 | valid_from_name         | The name of the `valid_from` column to create in the target table. Default: `valid_from`                        | string                    |
 | valid_to_name           | The name of the `valid_to` column to create in the target table. Default: `valid_to`                            | string                    |
 | invalidate_hard_deletes | If set to `true`, when a record is missing from the source table it will be marked as invalid. Default: `false` | bool                      |
+| batch_size              | The maximum number of intervals that can be evaluated in a single backfill task. If `None`, all intervals are processed as part of a single task. See [Processing Source Table with Historical Data](#processing-source-table-with-historical-data) for more on this use case. Default: `None` | int |

 !!! tip "Important"

@@ -1273,10 +1274,66 @@ This is the most accurate representation of the menu based on the source data pr

 ### SCD Type 2 By Column Configuration Options

 | Name                         | Description                                                                                                                                                                                                                                | Type                      |
 |------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|
 | columns                      | The name of the columns to check for changes. `*` to represent that all columns should be checked.                                                                                                                                         | List of strings or string |
 | execution_time_as_valid_from | By default, when the model is first loaded `valid_from` is set to `1970-01-01 00:00:00` and future new rows will have `execution_time` of when the pipeline ran. This changes the behavior to always use `execution_time`. Default: `false` | bool                      |
+| updated_at_name              | If sourcing from a table that includes a timestamp to use as `valid_from`, set this property to that column. See [Processing Source Table with Historical Data](#processing-source-table-with-historical-data) for more on this use case. Default: `None` | string |
+
+### Processing Source Table with Historical Data
+
+The most common use case for SCD Type 2 is creating history for a table that doesn't already have it.
+In the restaurant menu example, the menu only tells you what is offered right now, but you want to know what was offered over time.
+In this case, the default setting of `None` for `batch_size` is the best option.
+
+Another use case is processing a source table that already contains history.
+A common example is a "daily snapshot" table created by a source system that takes a snapshot of the data at the end of each day.
+If your source table has historical records, like a "daily snapshot" table, set `batch_size` to `1` to process each interval (each day for a `@daily` cron) in sequential order.
+That way the historical records will be properly captured in the SCD Type 2 table.
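The batching rule described above (all intervals in one task when `batch_size` is `None`, one interval per task when it is `1`) can be sketched in plain Python. This is an illustrative model only — `batch_intervals` is a hypothetical helper, not SQLMesh's scheduler code:

```python
from datetime import date, timedelta

def batch_intervals(intervals, batch_size=None):
    """Group contiguous (start, end) intervals into evaluation batches.

    batch_size=None -> one batch spanning every interval (the default);
    batch_size=1    -> one batch per interval, evaluated in order, so each
    day of a daily snapshot source is processed sequentially.
    """
    if not intervals:
        return []
    if batch_size is None:
        return [(intervals[0][0], intervals[-1][1])]
    return [
        (intervals[i][0], intervals[min(i + batch_size, len(intervals)) - 1][1])
        for i in range(0, len(intervals), batch_size)
    ]

# Three daily intervals covering 2023-01-01 .. 2023-01-04
days = [
    (date(2023, 1, 1) + timedelta(d), date(2023, 1, 1) + timedelta(d + 1))
    for d in range(3)
]
```

With the three daily intervals above, this mirrors the expectations in the scheduler test added by this commit: one batch per day for `batch_size 1`, a single spanning batch for the default.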
+#### Example - Source from Daily Snapshot Table
+
+```sql linenums="1"
+MODEL (
+  name db.table,
+  kind SCD_TYPE_2_BY_COLUMN (
+    unique_key id,
+    columns [some_value],
+    updated_at_name ds,
+    batch_size 1
+  ),
+  start '2025-01-01',
+  cron '@daily'
+);
+SELECT
+  id,
+  some_value,
+  ds
+FROM source_table
+WHERE ds BETWEEN @start_ds AND @end_ds
+```
+
+This will process each day of the source table in sequential order (if there is more than one day to process), checking the `some_value` column for changes. If it changed, `valid_from` will be set to match the `ds` column (except for the first value, which would be `1970-01-01 00:00:00`).
+
+If the source data were the following:
+
+| id | some_value |     ds     |
+|----|------------|:----------:|
+| 1  | 1          | 2025-01-01 |
+| 1  | 2          | 2025-01-02 |
+| 1  | 3          | 2025-01-03 |
+| 1  | 3          | 2025-01-04 |
+
+then the resulting SCD Type 2 table would be:
+
+| id | some_value |     ds     |     valid_from      |      valid_to       |
+|----|------------|:----------:|:-------------------:|:-------------------:|
+| 1  | 1          | 2025-01-01 | 1970-01-01 00:00:00 | 2025-01-02 00:00:00 |
+| 1  | 2          | 2025-01-02 | 2025-01-02 00:00:00 | 2025-01-03 00:00:00 |
+| 1  | 3          | 2025-01-03 | 2025-01-03 00:00:00 | NULL                |
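The snapshot-to-SCD transformation in the two tables above can be sketched in Python. This is an illustrative reimplementation under simplified assumptions (in-memory rows, `%Y-%m-%d` dates) — SQLMesh actually performs this merge in SQL via the engine adapter, and `scd2_by_column` is a hypothetical name:

```python
from datetime import datetime

EPOCH = datetime(1970, 1, 1)

def scd2_by_column(rows, key="id", check="some_value", ts="ds"):
    """Collapse snapshot rows (sorted by key, then ts) into SCD Type 2
    records: the first version of each key opens at the epoch, and each
    change closes the prior record and opens a new one at the snapshot date."""
    out, open_idx = [], {}  # open_idx: key -> index of its open record
    for row in rows:
        k = row[key]
        if k in open_idx and out[open_idx[k]][check] == row[check]:
            continue  # unchanged: the open record still covers this snapshot
        when = datetime.strptime(row[ts], "%Y-%m-%d")
        valid_from = EPOCH
        if k in open_idx:
            out[open_idx[k]]["valid_to"] = when  # close the prior version
            valid_from = when
        out.append({key: k, check: row[check], ts: row[ts],
                    "valid_from": valid_from, "valid_to": None})
        open_idx[k] = len(out) - 1
    return out

source = [
    {"id": 1, "some_value": 1, "ds": "2025-01-01"},
    {"id": 1, "some_value": 2, "ds": "2025-01-02"},
    {"id": 1, "some_value": 3, "ds": "2025-01-03"},
    {"id": 1, "some_value": 3, "ds": "2025-01-04"},
]
```

Running `scd2_by_column(source)` reproduces the three rows of the result table above, including the epoch `valid_from` on the first version and the `NULL` `valid_to` on the still-current one.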

 ### Querying SCD Type 2 Models

sqlmesh/core/engine_adapter/base.py

Lines changed: 8 additions & 4 deletions
@@ -1411,7 +1411,7 @@ def scd_type_2_by_time(
         unique_key: t.Sequence[exp.Expression],
         valid_from_col: exp.Column,
         valid_to_col: exp.Column,
-        execution_time: TimeLike,
+        execution_time: t.Union[TimeLike, exp.Column],
         updated_at_col: exp.Column,
         invalidate_hard_deletes: bool = True,
         updated_at_as_valid_from: bool = False,
@@ -1445,7 +1445,7 @@ def scd_type_2_by_column(
         unique_key: t.Sequence[exp.Expression],
         valid_from_col: exp.Column,
         valid_to_col: exp.Column,
-        execution_time: TimeLike,
+        execution_time: t.Union[TimeLike, exp.Column],
         check_columns: t.Union[exp.Star, t.Sequence[exp.Column]],
         invalidate_hard_deletes: bool = True,
         execution_time_as_valid_from: bool = False,
@@ -1479,7 +1479,7 @@ def _scd_type_2(
         unique_key: t.Sequence[exp.Expression],
         valid_from_col: exp.Column,
         valid_to_col: exp.Column,
-        execution_time: TimeLike,
+        execution_time: t.Union[TimeLike, exp.Column],
         invalidate_hard_deletes: bool = True,
         updated_at_col: t.Optional[exp.Column] = None,
         check_columns: t.Optional[t.Union[exp.Star, t.Sequence[exp.Column]]] = None,
@@ -1554,7 +1554,11 @@ def remove_managed_columns(
         # column names and then remove them from the unmanaged_columns
         if check_columns and check_columns == exp.Star():
             check_columns = [exp.column(col) for col in unmanaged_columns_to_types]
-        execution_ts = to_time_column(execution_time, time_data_type, self.dialect, nullable=True)
+        execution_ts = (
+            exp.cast(execution_time, time_data_type, dialect=self.dialect)
+            if isinstance(execution_time, exp.Column)
+            else to_time_column(execution_time, time_data_type, self.dialect, nullable=True)
+        )
         if updated_at_as_valid_from:
             if not updated_at_col:
                 raise SQLMeshError(
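The branch added to `_scd_type_2` above is what lets `valid_from` come from a source column (the model's `updated_at_name`) rather than the run's execution timestamp. A rough sketch of that dispatch using plain string-building — the real code builds sqlglot expressions, and every name below is illustrative:

```python
from dataclasses import dataclass

@dataclass
class Column:
    """Stand-in for a column reference expression (illustrative only)."""
    name: str

def execution_ts_sql(execution_time, time_data_type="TIMESTAMP"):
    """If execution_time is a column reference (e.g. the model's
    updated_at_name), cast the column itself; otherwise render the
    concrete run timestamp as a literal of the time data type."""
    if isinstance(execution_time, Column):
        return f'CAST("{execution_time.name}" AS {time_data_type})'
    return f"CAST('{execution_time}' AS {time_data_type})"
```

With a column, every evaluated row carries its own timestamp from the source data; with a plain timestamp, all rows in the batch share the run's execution time.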

sqlmesh/core/engine_adapter/trino.py

Lines changed: 1 addition & 1 deletion
@@ -228,7 +228,7 @@ def _scd_type_2(
         unique_key: t.Sequence[exp.Expression],
         valid_from_col: exp.Column,
         valid_to_col: exp.Column,
-        execution_time: TimeLike,
+        execution_time: t.Union[TimeLike, exp.Column],
         invalidate_hard_deletes: bool = True,
         updated_at_col: t.Optional[exp.Column] = None,
         check_columns: t.Optional[t.Union[exp.Star, t.Sequence[exp.Column]]] = None,

sqlmesh/core/model/kind.py

Lines changed: 9 additions & 1 deletion
@@ -672,6 +672,7 @@ class _SCDType2Kind(_Incremental):
     valid_to_name: SQLGlotColumn = Field(exp.column("valid_to"), validate_default=True)
     invalidate_hard_deletes: SQLGlotBool = False
     time_data_type: exp.DataType = Field(exp.DataType.build("TIMESTAMP"), validate_default=True)
+    batch_size: t.Optional[SQLGlotPositiveInt] = None

     forward_only: SQLGlotBool = True
     disable_restatement: SQLGlotBool = True
@@ -711,6 +712,7 @@ def data_hash_values(self) -> t.List[t.Optional[str]]:
             gen(self.valid_to_name),
             str(self.invalidate_hard_deletes),
             gen(self.time_data_type),
+            gen(self.batch_size) if self.batch_size is not None else None,
         ]

     @property
@@ -781,6 +783,7 @@ class SCDType2ByColumnKind(_SCDType2Kind):
     name: t.Literal[ModelKindName.SCD_TYPE_2_BY_COLUMN] = ModelKindName.SCD_TYPE_2_BY_COLUMN
     columns: SQLGlotListOfColumnsOrStar
     execution_time_as_valid_from: SQLGlotBool = False
+    updated_at_name: t.Optional[SQLGlotColumn] = None

     @property
     def data_hash_values(self) -> t.List[t.Optional[str]]:
@@ -789,7 +792,12 @@ def data_hash_values(self) -> t.List[t.Optional[str]]:
             if isinstance(self.columns, list)
             else [gen(self.columns)]
         )
-        return [*super().data_hash_values, *columns_sql, str(self.execution_time_as_valid_from)]
+        return [
+            *super().data_hash_values,
+            *columns_sql,
+            str(self.execution_time_as_valid_from),
+            gen(self.updated_at_name) if self.updated_at_name is not None else None,
+        ]

     def to_expression(
         self, expressions: t.Optional[t.List[exp.Expression]] = None, **kwargs: t.Any
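Appending the new fields to `data_hash_values` changes the data hash of every model that sets them, which is why this commit also ships a state migration. A minimal sketch of the idea — illustrative only, not SQLMesh's actual hashing scheme:

```python
import hashlib

def data_hash(values):
    """Hash an ordered list of optional strings; None is encoded
    distinctly from any string so that appending a new field changes
    the hash exactly when the field is set."""
    h = hashlib.sha256()
    for v in values:
        # \x00 marks an unset field; set values are terminated by \x01
        h.update(b"\x00" if v is None else v.encode() + b"\x01")
    return h.hexdigest()
```

Any model whose hash changes is treated as a new snapshot, so the state store must be migrated to keep existing snapshots valid.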

sqlmesh/core/snapshot/evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -1761,7 +1761,7 @@ def insert(
             unique_key=model.unique_key,
             valid_from_col=model.kind.valid_from_name,
             valid_to_col=model.kind.valid_to_name,
-            execution_time=kwargs["execution_time"],
+            execution_time=model.kind.updated_at_name or kwargs["execution_time"],
             check_columns=model.kind.columns,
             invalidate_hard_deletes=model.kind.invalidate_hard_deletes,
             execution_time_as_valid_from=model.kind.execution_time_as_valid_from,
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+"""Add batch_size to SCD Type 2 models and add updated_at_name to the by column kind, which changes their data hash."""
+
+
+def migrate(state_sync, **kwargs):  # type: ignore
+    pass

tests/core/test_model.py

Lines changed: 2 additions & 0 deletions
@@ -4403,6 +4403,7 @@ def test_scd_type_2_by_column_overrides():
         forward_only False,
         disable_restatement False,
         invalidate_hard_deletes False,
+        batch_size 1
     ),
 );
 SELECT
@@ -4428,6 +4429,7 @@ def test_scd_type_2_by_column_overrides():
     assert scd_type_2_model.kind.is_scd_type_2
     assert scd_type_2_model.kind.is_materialized
     assert scd_type_2_model.kind.time_data_type == exp.DataType.build("TIMESTAMPTZ")
+    assert scd_type_2_model.kind.batch_size == 1
     assert not scd_type_2_model.kind.invalidate_hard_deletes
     assert not scd_type_2_model.kind.forward_only
     assert not scd_type_2_model.kind.disable_restatement

tests/core/test_scheduler.py

Lines changed: 67 additions & 0 deletions
@@ -13,6 +13,7 @@
     IncrementalByTimeRangeKind,
     IncrementalByUniqueKeyKind,
     TimeColumn,
+    SCDType2ByColumnKind,
 )
 from sqlmesh.core.node import IntervalUnit
 from sqlmesh.core.scheduler import (
@@ -810,3 +811,69 @@ def signal_base(batch: DatetimeRanges):
         snapshot_b: [(to_timestamp("2023-01-01"), to_timestamp("2023-01-04"))],
         snapshot_c: [(to_timestamp("2023-01-01"), to_timestamp("2023-01-02"))],
     }
+
+
+@pytest.mark.parametrize(
+    "batch_size, expected_batches",
+    [
+        (
+            1,
+            [
+                (to_timestamp("2023-01-01"), to_timestamp("2023-01-02")),
+                (to_timestamp("2023-01-02"), to_timestamp("2023-01-03")),
+                (to_timestamp("2023-01-03"), to_timestamp("2023-01-04")),
+            ],
+        ),
+        (
+            None,
+            [
+                (to_timestamp("2023-01-01"), to_timestamp("2023-01-04")),
+            ],
+        ),
+    ],
+)
+def test_scd_type_2_batch_size(
+    mocker: MockerFixture,
+    make_snapshot,
+    get_batched_missing_intervals,
+    batch_size: t.Optional[int],
+    expected_batches: t.List[t.Tuple[int, int]],
+):
+    """
+    Test that SCD_TYPE_2_BY_COLUMN models are batched correctly based on batch_size.
+    With batch_size=1, we expect 3 separate batches for 3 days.
+    Without a specified batch_size, we expect a single batch for the entire period.
+    """
+    start = to_datetime("2023-01-01")
+    end = to_datetime("2023-01-04")
+
+    # Configure kind params
+    kind_params = {}
+    if batch_size is not None:
+        kind_params["batch_size"] = batch_size
+
+    # Create the model and snapshot
+    model = SqlModel(
+        name="test_scd_model",
+        kind=SCDType2ByColumnKind(columns="valid_to", unique_key=["id"], **kind_params),
+        cron="@daily",
+        start=start,
+        query=parse_one("SELECT id, valid_from, valid_to FROM source"),
+    )
+    snapshot = make_snapshot(model)
+
+    # Setup scheduler
+    snapshot_evaluator = SnapshotEvaluator(adapters=mocker.MagicMock(), ddl_concurrent_tasks=1)
+    scheduler = Scheduler(
+        snapshots=[snapshot],
+        snapshot_evaluator=snapshot_evaluator,
+        state_sync=mocker.MagicMock(),
+        max_workers=2,
+        default_catalog=None,
+    )
+
+    # Get batches for the time period
+    batches = get_batched_missing_intervals(scheduler, start, end, end)[snapshot]
+
+    # Verify batches match expectations
+    assert batches == expected_batches
