Skip to content

Commit 168ec9a

Browse files
committed
Fix: Dont backfill seeds if the seed data didnt actually change
1 parent df7d61b commit 168ec9a

File tree

4 files changed

+191
-1
lines changed

4 files changed

+191
-1
lines changed

sqlmesh/core/plan/builder.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,12 @@ def _categorize_snapshot(
712712
snapshot.categorize_as(SnapshotChangeCategory.INDIRECT_NON_BREAKING, forward_only)
713713
else:
714714
# Metadata updated.
715+
if not forward_only and snapshot.is_seed:
716+
# seeds with metadata updates should be categorized as forward_only to prevent unnecessary backfill
717+
# backfill should only happen if the actual seed data changes, in which case it will be classed as
718+
# directly modified / breaking and not forward only as this code path will not be hit
719+
forward_only = True
720+
715721
snapshot.categorize_as(SnapshotChangeCategory.METADATA, forward_only)
716722

717723
def _get_orphaned_indirect_change_category(

sqlmesh/core/plan/common.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ def should_force_rebuild(old: Snapshot, new: Snapshot) -> bool:
1616
if new.is_view and new.is_indirect_non_breaking and not new.is_forward_only:
1717
# View models always need to be rebuilt to reflect updated upstream dependencies
1818
return True
19-
if new.is_seed:
19+
if new.is_seed and not new.is_metadata:
2020
# Seed models always need to be rebuilt to reflect changes in the seed file
21+
# Unless only their metadata has been updated (eg description added) and the seed file has not been touched
2122
return True
2223
return is_breaking_kind_change(old, new)
2324

tests/core/test_integration.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10662,3 +10662,125 @@ def _run_restatement_plan(tmp_path: Path, config: Config, q: queue.Queue):
1066210662
assert len(model_a.intervals)
1066310663

1066410664
set_console(orig_console)
10665+
10666+
10667+
def test_seed_model_metadata_update_sets_forward_only(tmp_path: Path):
10668+
"""
10669+
Scenario:
10670+
- Create a seed model; perform initial population
10671+
- Modify the model with a metadata-only change and trigger a plan
10672+
10673+
Outcome:
10674+
- The seed model is modified (metadata-only) but this should NOT trigger backfill
10675+
- To prevent backfill, the categorizer needs to set forward_only on the seed
10676+
"""
10677+
10678+
models_path = tmp_path / "models"
10679+
seeds_path = tmp_path / "seeds"
10680+
models_path.mkdir()
10681+
seeds_path.mkdir()
10682+
10683+
seed_model_path = models_path / "seed.sql"
10684+
seed_path = seeds_path / "seed_data.csv"
10685+
10686+
seed_path.write_text("\n".join(["id,name", "1,test"]))
10687+
10688+
seed_model_path.write_text("""
10689+
MODEL (
10690+
name test.source_data,
10691+
kind SEED (
10692+
path '../seeds/seed_data.csv'
10693+
)
10694+
);
10695+
""")
10696+
10697+
config = Config(
10698+
gateways={"": GatewayConfig(connection=DuckDBConnectionConfig())},
10699+
model_defaults=ModelDefaultsConfig(dialect="duckdb", start="2024-01-01"),
10700+
)
10701+
ctx = Context(paths=tmp_path, config=config)
10702+
10703+
plan = ctx.plan(auto_apply=True)
10704+
10705+
original_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"']
10706+
assert not original_seed_snapshot.forward_only
10707+
assert plan.directly_modified == {original_seed_snapshot.snapshot_id}
10708+
assert plan.metadata_updated == set()
10709+
assert plan.missing_intervals
10710+
10711+
# prove data loaded
10712+
assert ctx.engine_adapter.fetchall("select id, name from memory.test.source_data") == [
10713+
(1, "test")
10714+
]
10715+
10716+
# prove no diff
10717+
ctx.load()
10718+
plan = ctx.plan(auto_apply=True)
10719+
assert not plan.has_changes
10720+
assert not plan.missing_intervals
10721+
10722+
# make metadata-only change
10723+
seed_model_path.write_text("""
10724+
MODEL (
10725+
name test.source_data,
10726+
kind SEED (
10727+
path '../seeds/seed_data.csv'
10728+
),
10729+
description 'updated by test'
10730+
);
10731+
""")
10732+
10733+
ctx.load()
10734+
plan = ctx.plan(auto_apply=True)
10735+
assert plan.has_changes
10736+
10737+
new_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"']
10738+
assert (
10739+
new_seed_snapshot.forward_only
10740+
) # change needs to be applied as forward-only to prevent backfill
10741+
assert (
10742+
new_seed_snapshot.version == original_seed_snapshot.version
10743+
) # should be using the same physical table
10744+
assert (
10745+
new_seed_snapshot.snapshot_id != original_seed_snapshot.snapshot_id
10746+
) # but still be different due to the metadata change
10747+
assert plan.directly_modified == set()
10748+
assert plan.metadata_updated == {new_seed_snapshot.snapshot_id}
10749+
10750+
# there should be no missing intervals to backfill since all we did is update a description
10751+
assert not plan.missing_intervals
10752+
10753+
# there should still be no diff or missing intervals in 3 days time
10754+
assert new_seed_snapshot.model.interval_unit.is_day
10755+
with time_machine.travel(timedelta(days=3)):
10756+
ctx.clear_caches()
10757+
ctx.load()
10758+
plan = ctx.plan(auto_apply=True)
10759+
assert not plan.has_changes
10760+
assert not plan.missing_intervals
10761+
10762+
# change seed data
10763+
seed_path.write_text("\n".join(["id,name", "1,test", "2,updated"]))
10764+
10765+
# new plan - NOW we should backfill because data changed
10766+
ctx.load()
10767+
plan = ctx.plan(auto_apply=True)
10768+
assert plan.has_changes
10769+
10770+
updated_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"']
10771+
10772+
assert (
10773+
updated_seed_snapshot.snapshot_id
10774+
!= new_seed_snapshot.snapshot_id
10775+
!= original_seed_snapshot.snapshot_id
10776+
)
10777+
assert not updated_seed_snapshot.forward_only
10778+
assert plan.directly_modified == {updated_seed_snapshot.snapshot_id}
10779+
assert plan.metadata_updated == set()
10780+
assert plan.missing_intervals
10781+
10782+
# prove backfilled data loaded
10783+
assert ctx.engine_adapter.fetchall("select id, name from memory.test.source_data") == [
10784+
(1, "test"),
10785+
(2, "updated"),
10786+
]

tests/core/test_plan.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,6 +1189,67 @@ def test_forward_only_plan_seed_models(make_snapshot, mocker: MockerFixture):
11891189
assert not snapshot_a_updated.is_forward_only
11901190

11911191

1192+
def test_seed_model_metadata_change_categorized_forward_only(
1193+
make_snapshot: t.Callable[..., Snapshot],
1194+
):
1195+
snapshot_a = make_snapshot(
1196+
SeedModel(
1197+
name="a",
1198+
kind=SeedKind(path="./path/to/seed"),
1199+
seed=Seed(content="content"),
1200+
column_hashes={"col": "hash1"},
1201+
depends_on=set(),
1202+
)
1203+
)
1204+
snapshot_a.categorize_as(SnapshotChangeCategory.BREAKING)
1205+
snapshot_a.add_interval("2022-01-01", now())
1206+
1207+
snapshot_a_metadata_updated = make_snapshot(
1208+
SeedModel(
1209+
name="a",
1210+
kind=SeedKind(path="./path/to/seed"),
1211+
seed=Seed(content="content"),
1212+
column_hashes={"col": "hash1"},
1213+
depends_on=set(),
1214+
description="foo",
1215+
)
1216+
)
1217+
assert snapshot_a_metadata_updated.version is None
1218+
assert snapshot_a_metadata_updated.change_category is None
1219+
1220+
context_diff = ContextDiff(
1221+
environment="prod",
1222+
is_new_environment=True,
1223+
is_unfinalized_environment=False,
1224+
normalize_environment_name=True,
1225+
create_from="prod",
1226+
create_from_env_exists=True,
1227+
added=set(),
1228+
removed_snapshots={},
1229+
modified_snapshots={
1230+
snapshot_a_metadata_updated.name: (snapshot_a_metadata_updated, snapshot_a)
1231+
},
1232+
snapshots={snapshot_a_metadata_updated.snapshot_id: snapshot_a_metadata_updated},
1233+
new_snapshots={snapshot_a_metadata_updated.snapshot_id: snapshot_a},
1234+
previous_plan_id=None,
1235+
previously_promoted_snapshot_ids=set(),
1236+
previous_finalized_snapshots=None,
1237+
previous_gateway_managed_virtual_layer=False,
1238+
gateway_managed_virtual_layer=False,
1239+
environment_statements=[],
1240+
)
1241+
1242+
plan = PlanBuilder(context_diff).build()
1243+
assert snapshot_a_metadata_updated.change_category == SnapshotChangeCategory.METADATA
1244+
assert (
1245+
snapshot_a_metadata_updated.is_forward_only
1246+
) # needs to be forward_only to prevent backfill on a metadata change
1247+
assert (
1248+
snapshot_a_metadata_updated.intervals == snapshot_a.intervals
1249+
) # intervals should have been copied
1250+
assert not plan.missing_intervals # plan should have no missing intervals
1251+
1252+
11921253
def test_start_inference(make_snapshot, mocker: MockerFixture):
11931254
snapshot_a = make_snapshot(
11941255
SqlModel(name="a", query=parse_one("select 1, ds"), start="2022-01-01")

0 commit comments

Comments
 (0)