From 168ec9a32a776af3d8215729298044f736f555b7 Mon Sep 17 00:00:00 2001 From: Erin Drummond Date: Thu, 18 Sep 2025 02:14:31 +0000 Subject: [PATCH 1/2] Fix: Dont backfill seeds if the seed data didnt actually change --- sqlmesh/core/plan/builder.py | 6 ++ sqlmesh/core/plan/common.py | 3 +- tests/core/test_integration.py | 122 +++++++++++++++++++++++++++++++++ tests/core/test_plan.py | 61 +++++++++++++++++ 4 files changed, 191 insertions(+), 1 deletion(-) diff --git a/sqlmesh/core/plan/builder.py b/sqlmesh/core/plan/builder.py index 2eb4c54aeb..7963af62ed 100644 --- a/sqlmesh/core/plan/builder.py +++ b/sqlmesh/core/plan/builder.py @@ -712,6 +712,12 @@ def _categorize_snapshot( snapshot.categorize_as(SnapshotChangeCategory.INDIRECT_NON_BREAKING, forward_only) else: # Metadata updated. + if not forward_only and snapshot.is_seed: + # seeds with metadata updates should be categorized as forward_only to prevent unnecessary backfill + # backfill should only happen if the actual seed data changes, in which case it will be classed as + # directly modified / breaking and not forward only as this code path will not be hit + forward_only = True + snapshot.categorize_as(SnapshotChangeCategory.METADATA, forward_only) def _get_orphaned_indirect_change_category( diff --git a/sqlmesh/core/plan/common.py b/sqlmesh/core/plan/common.py index 99601cb484..2ae34fbfba 100644 --- a/sqlmesh/core/plan/common.py +++ b/sqlmesh/core/plan/common.py @@ -16,8 +16,9 @@ def should_force_rebuild(old: Snapshot, new: Snapshot) -> bool: if new.is_view and new.is_indirect_non_breaking and not new.is_forward_only: # View models always need to be rebuilt to reflect updated upstream dependencies return True - if new.is_seed: + if new.is_seed and not new.is_metadata: # Seed models always need to be rebuilt to reflect changes in the seed file + # Unless only their metadata has been updated (eg description added) and the seed file has not been touched return True return is_breaking_kind_change(old, new) diff --git a/tests/core/test_integration.py b/tests/core/test_integration.py index a0e41f329f..b91c7a1039 100644 --- a/tests/core/test_integration.py +++ b/tests/core/test_integration.py @@ -10662,3 +10662,125 @@ def _run_restatement_plan(tmp_path: Path, config: Config, q: queue.Queue): assert len(model_a.intervals) set_console(orig_console) + + +def test_seed_model_metadata_update_sets_forward_only(tmp_path: Path): + """ + Scenario: + - Create a seed model; perform initial population + - Modify the model with a metadata-only change and trigger a plan + + Outcome: + - The seed model is modified (metadata-only) but this should NOT trigger backfill + - To prevent backfill, the categorizer needs to set forward_only on the seed + """ + + models_path = tmp_path / "models" + seeds_path = tmp_path / "seeds" + models_path.mkdir() + seeds_path.mkdir() + + seed_model_path = models_path / "seed.sql" + seed_path = seeds_path / "seed_data.csv" + + seed_path.write_text("\n".join(["id,name", "1,test"])) + + seed_model_path.write_text(""" + MODEL ( + name test.source_data, + kind SEED ( + path '../seeds/seed_data.csv' + ) + ); + """) + + config = Config( + gateways={"": GatewayConfig(connection=DuckDBConnectionConfig())}, + model_defaults=ModelDefaultsConfig(dialect="duckdb", start="2024-01-01"), + ) + ctx = Context(paths=tmp_path, config=config) + + plan = ctx.plan(auto_apply=True) + + original_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"'] + assert not original_seed_snapshot.forward_only + assert plan.directly_modified == {original_seed_snapshot.snapshot_id} + assert plan.metadata_updated == set() + assert plan.missing_intervals + + # prove data loaded + assert ctx.engine_adapter.fetchall("select id, name from memory.test.source_data") == [ + (1, "test") + ] + + # prove no diff + ctx.load() + plan = ctx.plan(auto_apply=True) + assert not plan.has_changes + assert not plan.missing_intervals + + # make metadata-only change + seed_model_path.write_text(""" + MODEL ( + name test.source_data, + kind SEED ( + path '../seeds/seed_data.csv' + ), + description 'updated by test' + ); + """) + + ctx.load() + plan = ctx.plan(auto_apply=True) + assert plan.has_changes + + new_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"'] + assert ( + new_seed_snapshot.forward_only + ) # change needs to be applied as forward-only to prevent backfill + assert ( + new_seed_snapshot.version == original_seed_snapshot.version + ) # should be using the same physical table + assert ( + new_seed_snapshot.snapshot_id != original_seed_snapshot.snapshot_id + ) # but still be different due to the metadata change + assert plan.directly_modified == set() + assert plan.metadata_updated == {new_seed_snapshot.snapshot_id} + + # there should be no missing intervals to backfill since all we did is update a description + assert not plan.missing_intervals + + # there should still be no diff or missing intervals in 3 days time + assert new_seed_snapshot.model.interval_unit.is_day + with time_machine.travel(timedelta(days=3)): + ctx.clear_caches() + ctx.load() + plan = ctx.plan(auto_apply=True) + assert not plan.has_changes + assert not plan.missing_intervals + + # change seed data + seed_path.write_text("\n".join(["id,name", "1,test", "2,updated"])) + + # new plan - NOW we should backfill because data changed + ctx.load() + plan = ctx.plan(auto_apply=True) + assert plan.has_changes + + updated_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"'] + + assert ( + updated_seed_snapshot.snapshot_id + != new_seed_snapshot.snapshot_id + != original_seed_snapshot.snapshot_id + ) + assert not updated_seed_snapshot.forward_only + assert plan.directly_modified == {updated_seed_snapshot.snapshot_id} + assert plan.metadata_updated == set() + assert plan.missing_intervals + + # prove backfilled data loaded + assert ctx.engine_adapter.fetchall("select id, name from memory.test.source_data") == [ + (1, "test"), + (2, "updated"), + ] diff --git a/tests/core/test_plan.py b/tests/core/test_plan.py index 4f6b99a4ee..d61a245a50 100644 --- a/tests/core/test_plan.py +++ b/tests/core/test_plan.py @@ -1189,6 +1189,67 @@ def test_forward_only_plan_seed_models(make_snapshot, mocker: MockerFixture): assert not snapshot_a_updated.is_forward_only +def test_seed_model_metadata_change_categorized_forward_only( + make_snapshot: t.Callable[..., Snapshot], +): + snapshot_a = make_snapshot( + SeedModel( + name="a", + kind=SeedKind(path="./path/to/seed"), + seed=Seed(content="content"), + column_hashes={"col": "hash1"}, + depends_on=set(), + ) + ) + snapshot_a.categorize_as(SnapshotChangeCategory.BREAKING) + snapshot_a.add_interval("2022-01-01", now()) + + snapshot_a_metadata_updated = make_snapshot( + SeedModel( + name="a", + kind=SeedKind(path="./path/to/seed"), + seed=Seed(content="content"), + column_hashes={"col": "hash1"}, + depends_on=set(), + description="foo", + ) + ) + assert snapshot_a_metadata_updated.version is None + assert snapshot_a_metadata_updated.change_category is None + + context_diff = ContextDiff( + environment="prod", + is_new_environment=True, + is_unfinalized_environment=False, + normalize_environment_name=True, + create_from="prod", + create_from_env_exists=True, + added=set(), + removed_snapshots={}, + modified_snapshots={ + snapshot_a_metadata_updated.name: (snapshot_a_metadata_updated, snapshot_a) + }, + snapshots={snapshot_a_metadata_updated.snapshot_id: snapshot_a_metadata_updated}, + new_snapshots={snapshot_a_metadata_updated.snapshot_id: snapshot_a}, + previous_plan_id=None, + previously_promoted_snapshot_ids=set(), + previous_finalized_snapshots=None, + previous_gateway_managed_virtual_layer=False, + gateway_managed_virtual_layer=False, + environment_statements=[], + ) + + plan = PlanBuilder(context_diff).build() + assert snapshot_a_metadata_updated.change_category == SnapshotChangeCategory.METADATA + assert ( + snapshot_a_metadata_updated.is_forward_only + ) # needs to be forward_only to prevent backfill on a metadata change + assert ( + snapshot_a_metadata_updated.intervals == snapshot_a.intervals + ) # intervals should have been copied + assert not plan.missing_intervals # plan should have no missing intervals + + def test_start_inference(make_snapshot, mocker: MockerFixture): snapshot_a = make_snapshot( SqlModel(name="a", query=parse_one("select 1, ds"), start="2022-01-01") From 0d926ab35b2cb5e34aca9117009f8d007a2faa97 Mon Sep 17 00:00:00 2001 From: Erin Drummond Date: Thu, 18 Sep 2025 03:24:40 +0000 Subject: [PATCH 2/2] PR feedback --- sqlmesh/core/plan/builder.py | 6 ------ tests/core/test_integration.py | 8 ++------ tests/core/test_plan.py | 8 +++----- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/sqlmesh/core/plan/builder.py b/sqlmesh/core/plan/builder.py index 7963af62ed..2eb4c54aeb 100644 --- a/sqlmesh/core/plan/builder.py +++ b/sqlmesh/core/plan/builder.py @@ -712,12 +712,6 @@ def _categorize_snapshot( snapshot.categorize_as(SnapshotChangeCategory.INDIRECT_NON_BREAKING, forward_only) else: # Metadata updated. - if not forward_only and snapshot.is_seed: - # seeds with metadata updates should be categorized as forward_only to prevent unnecessary backfill - # backfill should only happen if the actual seed data changes, in which case it will be classed as - # directly modified / breaking and not forward only as this code path will not be hit - forward_only = True - snapshot.categorize_as(SnapshotChangeCategory.METADATA, forward_only) def _get_orphaned_indirect_change_category( diff --git a/tests/core/test_integration.py b/tests/core/test_integration.py index b91c7a1039..5b44d80149 100644 --- a/tests/core/test_integration.py +++ b/tests/core/test_integration.py @@ -10664,7 +10664,7 @@ def _run_restatement_plan(tmp_path: Path, config: Config, q: queue.Queue): set_console(orig_console) -def test_seed_model_metadata_update_sets_forward_only(tmp_path: Path): +def test_seed_model_metadata_update_does_not_trigger_backfill(tmp_path: Path): """ Scenario: - Create a seed model; perform initial population @@ -10672,7 +10672,7 @@ def test_seed_model_metadata_update_sets_forward_only(tmp_path: Path): Outcome: - The seed model is modified (metadata-only) but this should NOT trigger backfill - - To prevent backfill, the categorizer needs to set forward_only on the seed + - There should be no missing_intervals on the plan to backfill """ models_path = tmp_path / "models" @@ -10703,7 +10703,6 @@ def test_seed_model_metadata_update_sets_forward_only(tmp_path: Path): plan = ctx.plan(auto_apply=True) original_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"'] - assert not original_seed_snapshot.forward_only assert plan.directly_modified == {original_seed_snapshot.snapshot_id} assert plan.metadata_updated == set() assert plan.missing_intervals @@ -10735,9 +10734,6 @@ def test_seed_model_metadata_update_sets_forward_only(tmp_path: Path): assert plan.has_changes new_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"'] - assert ( - new_seed_snapshot.forward_only - ) # change needs to be applied as forward-only to prevent backfill assert ( new_seed_snapshot.version == original_seed_snapshot.version ) # should be using the same physical table diff --git a/tests/core/test_plan.py b/tests/core/test_plan.py index d61a245a50..59bc91d1bf 100644 --- a/tests/core/test_plan.py +++ b/tests/core/test_plan.py @@ -1189,7 +1189,7 @@ def test_forward_only_plan_seed_models(make_snapshot, mocker: MockerFixture): assert not snapshot_a_updated.is_forward_only -def test_seed_model_metadata_change_categorized_forward_only( +def test_seed_model_metadata_change_no_missing_intervals( make_snapshot: t.Callable[..., Snapshot], ): snapshot_a = make_snapshot( @@ -1241,13 +1241,11 @@ def test_seed_model_metadata_change_categorized_forward_only( plan = PlanBuilder(context_diff).build() assert snapshot_a_metadata_updated.change_category == SnapshotChangeCategory.METADATA - assert ( - snapshot_a_metadata_updated.is_forward_only - ) # needs to be forward_only to prevent backfill on a metadata change + assert not snapshot_a_metadata_updated.is_forward_only + assert not plan.missing_intervals # plan should have no missing intervals assert ( snapshot_a_metadata_updated.intervals == snapshot_a.intervals ) # intervals should have been copied - assert not plan.missing_intervals # plan should have no missing intervals def test_start_inference(make_snapshot, mocker: MockerFixture):