Fix: Defer unpausing to right before env finalization when ensuring finalized snapshots (#2147)

izeigerman · web-flow · commit e8c4c4de22f5 · 2024-02-20T09:11:15.000-08:00
diff --git a/sqlmesh/core/context.py b/sqlmesh/core/context.py
@@ -1046,6 +1046,7 @@ def plan_builder(
                 enable_preview if enable_preview is not None else self.config.plan.enable_preview
             ),
             end_bounded=not run,
+            ensure_finalized_snapshots=self.config.plan.use_finalized_state,
         )
 
     def apply(
diff --git a/sqlmesh/core/plan/builder.py b/sqlmesh/core/plan/builder.py
@@ -58,6 +58,9 @@ class PlanBuilder:
         enable_preview: Whether to enable preview for forward-only models in development environments.
         end_bounded: If set to true, the missing intervals will be bounded by the target end date, disregarding lookback,
             allow_partials, and other attributes that could cause the intervals to exceed the target end date.
+        ensure_finalized_snapshots: Whether to compare against snapshots from the latest finalized
+            environment state, or to use whatever snapshots are in the current environment state even if
+            the environment is not finalized.
     """
 
     def __init__(
@@ -84,6 +87,7 @@ def __init__(
         default_end: t.Optional[TimeLike] = None,
         enable_preview: bool = False,
         end_bounded: bool = False,
+        ensure_finalized_snapshots: bool = False,
     ):
         self._context_diff = context_diff
         self._no_gaps = no_gaps
@@ -92,6 +96,7 @@ def __init__(
         self._forward_only = forward_only
         self._enable_preview = enable_preview
         self._end_bounded = end_bounded
+        self._ensure_finalized_snapshots = ensure_finalized_snapshots
         self._environment_ttl = environment_ttl
         self._categorizer_config = categorizer_config or CategorizerConfig()
         self._auto_categorization_enabled = auto_categorization_enabled
@@ -226,6 +231,7 @@ def build(self) -> Plan:
             effective_from=self._effective_from,
             execution_time=self._execution_time,
             end_bounded=self._end_bounded,
+            ensure_finalized_snapshots=self._ensure_finalized_snapshots,
         )
         self._latest_plan = plan
         return plan
diff --git a/sqlmesh/core/plan/definition.py b/sqlmesh/core/plan/definition.py
@@ -36,6 +36,7 @@ class Plan(PydanticModel, frozen=True):
     forward_only: bool
     include_unmodified: bool
     end_bounded: bool
+    ensure_finalized_snapshots: bool
 
     environment_ttl: t.Optional[str] = None
     environment_naming_info: EnvironmentNamingInfo
diff --git a/sqlmesh/core/plan/evaluator.py b/sqlmesh/core/plan/evaluator.py
@@ -217,7 +217,11 @@ def _promote(
                 [s for s in plan.snapshots.values() if s.is_paused],
                 plan.snapshots,
             )
-            self.state_sync.unpause_snapshots(promotion_result.added, plan.end)
+            if not plan.ensure_finalized_snapshots:
+                # Only unpause at this point if we don't have to use the finalized snapshots
+                # for subsequent plan applications. Otherwise, unpause right before finalizing
+                # the environment.
+                self.state_sync.unpause_snapshots(promotion_result.added, plan.end)
 
         return promotion_result
 
@@ -234,6 +238,12 @@ def _update_views(
             promotion_result: The result of the promotion.
             deployability_index: Indicates which snapshots are deployable in the context of this promotion.
         """
+        if not plan.is_dev and plan.ensure_finalized_snapshots:
+            # Unpause right before finalizing the environment when
+            # we need to use the finalized snapshots for subsequent plan applications.
+            # Otherwise, unpause right after updating the environment record.
+            self.state_sync.unpause_snapshots(promotion_result.added, plan.end)
+
         environment = plan.environment
 
         self.console.start_promotion_progress(
@@ -359,6 +369,7 @@ def _apply_plan(self, plan: Plan, plan_request_id: str) -> None:
             forward_only=plan.forward_only,
             models_to_backfill=plan.models_to_backfill,
             end_bounded=plan.end_bounded,
+            ensure_finalized_snapshots=plan.ensure_finalized_snapshots,
         )
         plan_dag_spec = create_plan_dag_spec(plan_application_request, self.state_sync)
         PlanDagState.from_state_sync(self.state_sync).add_dag_spec(plan_dag_spec)
@@ -428,6 +439,7 @@ def _apply_plan(self, plan: Plan, plan_request_id: str) -> None:
             forward_only=plan.forward_only,
             models_to_backfill=plan.models_to_backfill,
             end_bounded=plan.end_bounded,
+            ensure_finalized_snapshots=plan.ensure_finalized_snapshots,
         )
 
 
diff --git a/sqlmesh/schedulers/airflow/client.py b/sqlmesh/schedulers/airflow/client.py
@@ -197,6 +197,7 @@ def apply_plan(
         forward_only: bool = False,
         models_to_backfill: t.Optional[t.Set[str]] = None,
         end_bounded: bool = False,
+        ensure_finalized_snapshots: bool = False,
     ) -> None:
         request = common.PlanApplicationRequest(
             new_snapshots=list(new_snapshots),
@@ -213,6 +214,7 @@ def apply_plan(
             forward_only=forward_only,
             models_to_backfill=models_to_backfill,
             end_bounded=end_bounded,
+            ensure_finalized_snapshots=ensure_finalized_snapshots,
         )
 
         response = self._session.post(
diff --git a/sqlmesh/schedulers/airflow/common.py b/sqlmesh/schedulers/airflow/common.py
@@ -51,6 +51,7 @@ class PlanApplicationRequest(PydanticModel):
     forward_only: bool
     models_to_backfill: t.Optional[t.Set[str]]
     end_bounded: bool
+    ensure_finalized_snapshots: bool
 
     def is_selected_for_backfill(self, model_fqn: str) -> bool:
         return self.models_to_backfill is None or model_fqn in self.models_to_backfill
@@ -81,6 +82,7 @@ class PlanDagSpec(PydanticModel):
     deployability_index_for_creation: DeployabilityIndex = DeployabilityIndex.all_deployable()
     no_gaps_snapshot_names: t.Optional[t.Set[str]] = None
     models_to_backfill: t.Optional[t.Set[str]] = None
+    ensure_finalized_snapshots: bool = False
 
 
 class EnvironmentsResponse(PydanticModel):
diff --git a/sqlmesh/schedulers/airflow/dag_generator.py b/sqlmesh/schedulers/airflow/dag_generator.py
@@ -148,16 +148,14 @@ def _create_plan_application_dag(self, plan_dag_spec: common.PlanDagSpec) -> DAG
             plan_dag_spec.environment.name,
         )
 
-        environment = plan_dag_spec.environment
-
         all_snapshots = {
             **{s.snapshot_id: s for s in plan_dag_spec.new_snapshots},
-            **self._state_reader.get_snapshots(environment.snapshots),
+            **self._state_reader.get_snapshots(plan_dag_spec.environment.snapshots),
         }
 
         snapshots_to_create = [
             all_snapshots[snapshot.snapshot_id]
-            for snapshot in environment.snapshots
+            for snapshot in plan_dag_spec.environment.snapshots
             if snapshot.snapshot_id in all_snapshots
             and (
                 plan_dag_spec.models_to_backfill is None
@@ -216,22 +214,36 @@ def _create_plan_application_dag(self, plan_dag_spec: common.PlanDagSpec) -> DAG
             (
                 promote_start_task,
                 promote_end_task,
-            ) = self._create_promotion_demotion_tasks(plan_dag_spec, environment, all_snapshots)
-
-            update_views_task_pair = self._create_update_views_tasks(plan_dag_spec, all_snapshots)
-
-            finalize_task = self._create_finalize_task(environment)
+            ) = self._create_promotion_demotion_tasks(plan_dag_spec, all_snapshots)
 
             start_task >> create_start_task
             create_end_task >> backfill_before_promote_start_task
             backfill_before_promote_end_task >> promote_start_task
-            promote_end_task >> backfill_after_promote_start_task
 
+            update_views_task_pair = self._create_update_views_tasks(plan_dag_spec, all_snapshots)
             if update_views_task_pair:
                 backfill_after_promote_end_task >> update_views_task_pair[0]
-                update_views_task_pair[1] >> finalize_task
+                before_finalize_task = update_views_task_pair[1]
             else:
-                backfill_after_promote_end_task >> finalize_task
+                before_finalize_task = backfill_after_promote_end_task
+
+            unpause_snapshots_task = self._create_unpause_snapshots_task(plan_dag_spec)
+            if unpause_snapshots_task:
+                if not plan_dag_spec.ensure_finalized_snapshots:
+                    # Only unpause right after updating the environment record if we don't
+                    # have to use the finalized snapshots for subsequent plan applications.
+                    promote_end_task >> unpause_snapshots_task
+                    unpause_snapshots_task >> backfill_after_promote_start_task
+                else:
+                    # Otherwise, unpause right before finalizing the environment.
+                    promote_end_task >> backfill_after_promote_start_task
+                    before_finalize_task >> unpause_snapshots_task
+                    before_finalize_task = unpause_snapshots_task
+            else:
+                promote_end_task >> backfill_after_promote_start_task
+
+            finalize_task = self._create_finalize_task(plan_dag_spec.environment)
+            before_finalize_task >> finalize_task
 
             self._add_notification_target_tasks(plan_dag_spec, start_task, end_task, finalize_task)
             return dag
@@ -310,51 +322,52 @@ def _create_creation_tasks(
     def _create_promotion_demotion_tasks(
         self,
         request: common.PlanDagSpec,
-        environment: Environment,
         snapshots: t.Dict[SnapshotId, Snapshot],
     ) -> t.Tuple[BaseOperator, BaseOperator]:
         update_state_task = PythonOperator(
             task_id="snapshot_promotion_update_state",
             python_callable=promotion_update_state_task,
             op_kwargs={
-                "environment": environment,
+                "environment": request.environment,
                 "no_gaps_snapshot_names": (
                     request.no_gaps_snapshot_names if request.no_gaps else set()
                 ),
             },
         )
 
         start_task = update_state_task
-        end_task = update_state_task
-
-        if request.environment.promoted_snapshots:
-            if not request.is_dev and request.unpaused_dt:
-                migrate_tables_task = self._create_snapshot_migrate_tables_operator(
-                    [
-                        snapshots[s.snapshot_id]
-                        for s in request.environment.promoted_snapshots
-                        if snapshots[s.snapshot_id].is_paused
-                    ],
-                    request.ddl_concurrent_tasks,
-                    "snapshot_promotion_migrate_tables",
-                )
-
-                unpause_snapshots_task = PythonOperator(
-                    task_id="snapshot_promotion_unpause_snapshots",
-                    python_callable=promotion_unpause_snapshots_task,
-                    op_kwargs={
-                        "environment": environment,
-                        "unpaused_dt": request.unpaused_dt,
-                    },
-                    trigger_rule="none_failed",
-                )
-
-                update_state_task >> migrate_tables_task
-                migrate_tables_task >> unpause_snapshots_task
-                end_task = unpause_snapshots_task
+        end_task: BaseOperator = update_state_task
+
+        if request.environment.promoted_snapshots and not request.is_dev and request.unpaused_dt:
+            migrate_tables_task = self._create_snapshot_migrate_tables_operator(
+                [
+                    snapshots[s.snapshot_id]
+                    for s in request.environment.promoted_snapshots
+                    if snapshots[s.snapshot_id].is_paused
+                ],
+                request.ddl_concurrent_tasks,
+                "snapshot_promotion_migrate_tables",
+            )
+            update_state_task >> migrate_tables_task
+            end_task = migrate_tables_task
 
         return (start_task, end_task)
 
+    def _create_unpause_snapshots_task(
+        self, request: common.PlanDagSpec
+    ) -> t.Optional[BaseOperator]:
+        if request.is_dev or not request.unpaused_dt:
+            return None
+        return PythonOperator(
+            task_id="snapshot_promotion_unpause_snapshots",
+            python_callable=promotion_unpause_snapshots_task,
+            op_kwargs={
+                "environment": request.environment,
+                "unpaused_dt": request.unpaused_dt,
+            },
+            trigger_rule="none_failed",
+        )
+
     def _create_update_views_tasks(
         self, request: common.PlanDagSpec, snapshots: t.Dict[SnapshotId, Snapshot]
     ) -> t.Optional[t.Tuple[BaseOperator, BaseOperator]]:
diff --git a/sqlmesh/schedulers/airflow/plan.py b/sqlmesh/schedulers/airflow/plan.py
@@ -170,6 +170,7 @@ def create_plan_dag_spec(
         deployability_index_for_creation=deployability_index_for_creation,
         no_gaps_snapshot_names=no_gaps_snapshot_names,
         models_to_backfill=request.models_to_backfill,
+        ensure_finalized_snapshots=request.ensure_finalized_snapshots,
     )
 
 
diff --git a/tests/core/test_plan.py b/tests/core/test_plan.py
@@ -243,6 +243,7 @@ def test_missing_intervals_lookback(make_snapshot, mocker: MockerFixture):
         deployability_index=DeployabilityIndex.all_deployable(),
         restatements={},
         end_bounded=False,
+        ensure_finalized_snapshots=False,
     )
 
     assert not plan.missing_intervals
diff --git a/tests/core/test_plan_evaluator.py b/tests/core/test_plan_evaluator.py
@@ -100,6 +100,7 @@ def test_airflow_evaluator(sushi_plan: Plan, mocker: MockerFixture):
         forward_only=False,
         models_to_backfill=None,
         end_bounded=False,
+        ensure_finalized_snapshots=False,
     )
 
     airflow_client_mock.wait_for_dag_run_completion.assert_called_once()
diff --git a/tests/schedulers/airflow/test_client.py b/tests/schedulers/airflow/test_client.py
@@ -158,6 +158,7 @@ def test_apply_plan(mocker: MockerFixture, snapshot: Snapshot):
         "forward_only": False,
         "models_to_backfill": ['"test_model"'],
         "end_bounded": False,
+        "ensure_finalized_snapshots": False,
     }
 
 
diff --git a/tests/schedulers/airflow/test_plan.py b/tests/schedulers/airflow/test_plan.py
@@ -117,6 +117,7 @@ def test_create_plan_dag_spec(
         forward_only=True,
         models_to_backfill=None,
         end_bounded=False,
+        ensure_finalized_snapshots=False,
     )
 
     deleted_snapshot = SnapshotTableInfo(
@@ -240,6 +241,7 @@ def test_restatement(
         forward_only=True,
         models_to_backfill=None,
         end_bounded=False,
+        ensure_finalized_snapshots=False,
     )
     old_environment = Environment(
         name=environment_name,
@@ -344,6 +346,7 @@ def test_select_models_for_backfill(mocker: MockerFixture, random_name, make_sna
         forward_only=True,
         models_to_backfill={snapshot_b.name},
         end_bounded=False,
+        ensure_finalized_snapshots=False,
     )
 
     state_sync_mock = mocker.Mock()
@@ -412,6 +415,7 @@ def test_create_plan_dag_spec_duplicated_snapshot(
         forward_only=False,
         models_to_backfill=None,
         end_bounded=False,
+        ensure_finalized_snapshots=False,
     )
 
     dag_run_mock = mocker.Mock()
@@ -461,6 +465,7 @@ def test_create_plan_dag_spec_unbounded_end(
         forward_only=False,
         models_to_backfill=None,
         end_bounded=False,
+        ensure_finalized_snapshots=False,
     )
 
     state_sync_mock = mocker.Mock()

Original file line number	Diff line number	Diff line change
`@@ -1046,6 +1046,7 @@ def plan_builder(`
`1046`	`1046`	`enable_preview if enable_preview is not None else self.config.plan.enable_preview`
`1047`	`1047`	`),`
`1048`	`1048`	`end_bounded=not run,`
	`1049`	`+ ensure_finalized_snapshots=self.config.plan.use_finalized_state,`
`1049`	`1050`	`)`
`1050`	`1051`
`1051`	`1052`	`def apply(`
Original file line number	Diff line number	Diff line change
`@@ -170,6 +170,7 @@ def create_plan_dag_spec(`
`170`	`170`	`deployability_index_for_creation=deployability_index_for_creation,`
`171`	`171`	`no_gaps_snapshot_names=no_gaps_snapshot_names,`
`172`	`172`	`models_to_backfill=request.models_to_backfill,`
	`173`	`+ ensure_finalized_snapshots=request.ensure_finalized_snapshots,`
`173`	`174`	`)`
`174`	`175`
`175`	`176`
Original file line number	Diff line number	Diff line change
`@@ -243,6 +243,7 @@ def test_missing_intervals_lookback(make_snapshot, mocker: MockerFixture):`
`243`	`243`	`deployability_index=DeployabilityIndex.all_deployable(),`
`244`	`244`	`restatements={},`
`245`	`245`	`end_bounded=False,`
	`246`	`+ ensure_finalized_snapshots=False,`
`246`	`247`	`)`
`247`	`248`
`248`	`249`	`assert not plan.missing_intervals`
Original file line number	Diff line number	Diff line change
`@@ -100,6 +100,7 @@ def test_airflow_evaluator(sushi_plan: Plan, mocker: MockerFixture):`
`100`	`100`	`forward_only=False,`
`101`	`101`	`models_to_backfill=None,`
`102`	`102`	`end_bounded=False,`
	`103`	`+ ensure_finalized_snapshots=False,`
`103`	`104`	`)`
`104`	`105`
`105`	`106`	`airflow_client_mock.wait_for_dag_run_completion.assert_called_once()`