From 5110d635635db561c06eaed899b41cd9c74206c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 13:11:54 +0200 Subject: [PATCH 01/29] docs: add implementation plan for resume mechanism Fixes #525 --- plans/525/resume-interrupted-runs.md | 176 +++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 plans/525/resume-interrupted-runs.md diff --git a/plans/525/resume-interrupted-runs.md b/plans/525/resume-interrupted-runs.md new file mode 100644 index 000000000..61e193ff5 --- /dev/null +++ b/plans/525/resume-interrupted-runs.md @@ -0,0 +1,176 @@ +--- +date: 2026-04-13 +authors: + - pboruta +issue: https://github.com/NVIDIA-NeMo/DataDesigner/issues/525 +--- + +# Plan: Resume interrupted dataset generation runs + +## Problem + +When a long-running `DataDesigner.create()` call is interrupted (machine crash, OOM kill, preemption), the user has to restart generation from scratch — even though completed batches are already durably written to disk and `metadata.json` tracks exactly how many finished. + +The situation is made worse by an existing safeguard that fires at the wrong time: `ArtifactStorage.resolved_dataset_name` detects the existing folder on the next run and silently creates a new timestamped directory, orphaning the previous partial results instead of resuming from them. + +## Proposed Solution + +Add `resume: bool = False` to `DataDesigner.create()`. When `resume=True` the engine reads `metadata.json` from the existing dataset directory, validates that the run parameters are compatible, and starts the batch loop from the first incomplete batch rather than from zero. + +Expected usage: + +```python +dd = DataDesigner(...) +dd.add_column(...) 
+
+# First run — interrupted after completing batch 7 of 20
+results = dd.create(config_builder, num_records=10_000)
+
+# After restart — picks up from batch 8
+results = dd.create(config_builder, num_records=10_000, resume=True)
+```
+
+## Design Decisions
+
+| Decision | Choice | Rationale |
+|---|---|---|
+| API surface | `resume: bool = False` on `DataDesigner.create()` | Opt-in flag keeps default behaviour unchanged. Users who want a clean re-run keep getting the timestamped-folder behaviour. |
+| Resume state source | Read `metadata.json` written after each completed batch | Already contains `num_completed_batches`, `target_num_records`, `buffer_size`, `actual_num_records`. No new persistence needed. |
+| Partial batch at crash time | Clear `tmp-partial-parquet-files/` at resume start | Simpler and safer than merging an incomplete parquet; losing one batch is acceptable since the user is already recovering from a crash. |
+| Compatibility validation | Raise `DatasetGenerationError` if `num_records` or `buffer_size` changed | Different `num_records` changes which rows land in which batch file, breaking the numbering invariant. `buffer_size` changes the file-per-batch mapping. Both must match. |
+| Async engine | Raise `DatasetGenerationError` if `DATA_DESIGNER_ASYNC_ENGINE=1` with `resume=True` | The async path uses a row-group scheduler rather than an indexed batch loop; resume would require a different strategy. Out of scope for v1. |
+| Already-complete runs | Detect and warn, return existing path | If `num_completed_batches == total_num_batches` the dataset is already complete; the user may have re-run by mistake. |
+| No metadata → error | Raise `DatasetGenerationError` | Resuming without a checkpoint is impossible; a clear error is better than silent fallback to a fresh run. 
| + +## Affected Files + +| File | Change | +|---|---| +| `packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py` | Add `resume: bool = False` field; modify `resolved_dataset_name` to skip timestamping when `resume=True`; add `clear_partial_results()` helper | +| `packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py` | Add `start_batch: int = 0` and `initial_actual_num_records: int = 0` to `start()` | +| `packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py` | Add `resume: bool = False` to `build()`; add `_load_resume_state()` private method; implement validation and batch-skip logic | +| `packages/data-designer/src/data_designer/interface/data_designer.py` | Add `resume: bool = False` to `create()` and `_create_resource_provider()`; pass through to `ArtifactStorage` and `builder.build()` | +| `packages/data-designer-engine/tests/engine/storage/test_artifact_storage.py` | Tests for resume flag on `resolved_dataset_name` and `clear_partial_results()` | +| `packages/data-designer-engine/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py` | Tests for `start_batch` and `initial_actual_num_records` parameters | +| `packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py` | Tests for resume validation, batch skipping, async engine error, already-complete detection | + +## Implementation Sketch + +### `ArtifactStorage` + +```python +class ArtifactStorage(BaseModel): + ... + resume: bool = False + + @cached_property + def resolved_dataset_name(self) -> str: + dataset_path = self.artifact_path / self.dataset_name + if dataset_path.exists() and len(list(dataset_path.iterdir())) > 0: + if self.resume: + return self.dataset_name # use existing folder as-is + # existing behaviour: create timestamped copy + new_dataset_name = f"{self.dataset_name}_{datetime.now().strftime(...)}" + ... 
+ return new_dataset_name + if self.resume: + raise ArtifactStorageError( + f"Cannot resume: no existing dataset found at {dataset_path!r}." + ) + return self.dataset_name + + def clear_partial_results(self) -> None: + """Remove any in-flight partial results left over from an interrupted run.""" + if self.partial_results_path.exists(): + shutil.rmtree(self.partial_results_path) +``` + +### `DatasetBatchManager.start()` + +```python +def start( + self, + *, + num_records: int, + buffer_size: int, + start_batch: int = 0, + initial_actual_num_records: int = 0, +) -> None: + ... + self.reset() + self._current_batch_number = start_batch + self._actual_num_records = initial_actual_num_records +``` + +### `DatasetBuilder.build()` — resume path + +```python +@dataclass +class _ResumeState: + num_completed_batches: int + actual_num_records: int + buffer_size: int + +def _load_resume_state(self, num_records: int, buffer_size: int) -> _ResumeState: + try: + metadata = self.artifact_storage.read_metadata() + except FileNotFoundError: + raise DatasetGenerationError("Cannot resume: metadata.json not found. ...") + + target = metadata.get("target_num_records") + if target != num_records: + raise DatasetGenerationError( + f"Cannot resume: num_records={num_records} does not match " + f"the original run's target_num_records={target}. ..." + ) + + meta_buffer_size = metadata.get("buffer_size") + if meta_buffer_size != buffer_size: + raise DatasetGenerationError( + f"Cannot resume: buffer_size={buffer_size} does not match " + f"the original run's buffer_size={meta_buffer_size}. ..." + ) + + return _ResumeState( + num_completed_batches=metadata["num_completed_batches"], + actual_num_records=metadata["actual_num_records"], + buffer_size=buffer_size, + ) + +def build(self, *, num_records, on_batch_complete=None, save_multimedia_to_disk=True, resume=False): + ... 
+ if resume and DATA_DESIGNER_ASYNC_ENGINE: + raise DatasetGenerationError("resume=True is not supported with DATA_DESIGNER_ASYNC_ENGINE.") + + buffer_size = self._resource_provider.run_config.buffer_size + + if resume: + state = self._load_resume_state(num_records, buffer_size) + if state.num_completed_batches * buffer_size >= num_records: + logger.warning("Dataset already complete — nothing to resume.") + return self.artifact_storage.final_dataset_path + self.artifact_storage.clear_partial_results() + self.batch_manager.start( + num_records=num_records, + buffer_size=buffer_size, + start_batch=state.num_completed_batches, + initial_actual_num_records=state.actual_num_records, + ) + for batch_idx in range(state.num_completed_batches, self.batch_manager.num_batches): + ... + else: + # existing path unchanged + self.batch_manager.start(num_records=num_records, buffer_size=buffer_size) + for batch_idx in range(self.batch_manager.num_batches): + ... +``` + +## Trade-offs Considered + +- **Automatic resume detection** (no flag, detect existing folder automatically): rejected — removes user intent. A user re-running a pipeline from scratch would be surprised by silent resumption. +- **Resume support for async engine**: deferred to a follow-up. The async scheduler's row-group model doesn't map 1:1 to batch indices; implementing it would require a separate mechanism. +- **Per-column resume** (resume from column N within an interrupted batch): out of scope. Requires per-column checkpointing and state reconstruction, significantly higher complexity. + +## Delivery + +Single PR implementing all changes listed in the affected-files table plus tests. No backwards-incompatible changes — `resume` defaults to `False` and all existing call sites are unaffected. 
From 6afb6383096bb061829399c0c18076cc44135c49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 13:12:02 +0200 Subject: [PATCH 02/29] feat(storage): add resume flag and clear_partial_results() - ArtifactStorage gains a `resume: bool = False` field - resolved_dataset_name skips timestamp logic when resume=True, returning the existing dataset folder name as-is - Raises ArtifactStorageError on resume=True when the target folder is absent or empty (no data to resume from) - New clear_partial_results() removes in-flight partial results left over from an interrupted run Fixes #525 --- .../engine/storage/artifact_storage.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py index 458eed689..907422d0d 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py @@ -47,6 +47,7 @@ class ArtifactStorage(BaseModel): partial_results_folder_name: str = "tmp-partial-parquet-files" dropped_columns_folder_name: str = "dropped-columns-parquet-files" processors_outputs_folder_name: str = PROCESSORS_OUTPUTS_FOLDER_NAME + resume: bool = False _media_storage: MediaStorage = PrivateAttr(default=None) @property @@ -67,12 +68,19 @@ def artifact_path_exists(self) -> bool: def resolved_dataset_name(self) -> str: dataset_path = self.artifact_path / self.dataset_name if dataset_path.exists() and len(list(dataset_path.iterdir())) > 0: + if self.resume: + return self.dataset_name new_dataset_name = f"{self.dataset_name}_{datetime.now().strftime('%m-%d-%Y_%H%M%S')}" logger.info( f"📂 Dataset path {str(dataset_path)!r} already exists. Dataset from this session" f"\n\t\t will be saved to {str(self.artifact_path / new_dataset_name)!r} instead." 
) return new_dataset_name + if self.resume: + raise ArtifactStorageError( + f"🛑 Cannot resume: no existing dataset found at {str(dataset_path)!r}. " + "Run without resume=True to start a new generation." + ) return self.dataset_name @property @@ -204,6 +212,11 @@ def load_dataset_with_dropped_columns(self) -> pd.DataFrame: df = lazy.pd.concat([df, df_dropped], axis=1) return df + def clear_partial_results(self) -> None: + """Remove any in-flight partial results left over from an interrupted run.""" + if self.partial_results_path.exists(): + shutil.rmtree(self.partial_results_path) + def move_partial_result_to_final_file_path(self, batch_number: int) -> Path: partial_result_path = self.create_batch_file_path(batch_number, batch_stage=BatchStage.PARTIAL_RESULT) if not partial_result_path.exists(): From 699f510f7b33c2391f1d16d87e950b4e78f9955f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 13:12:35 +0200 Subject: [PATCH 03/29] feat(batch-manager): add start_batch param to start() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DatasetBatchManager.start() now accepts: - start_batch: int = 0 — first batch index to process - initial_actual_num_records: int = 0 — records already on disk Both default to 0 so all existing call sites are unaffected. 
Fixes #525 --- .../dataset_builders/utils/dataset_batch_manager.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py index 757853300..63907c916 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py @@ -158,7 +158,14 @@ def reset(self, delete_files: bool = False) -> None: except OSError as e: raise DatasetBatchManagementError(f"🛑 Failed to delete directory {dir_path}: {e}") - def start(self, *, num_records: int, buffer_size: int) -> None: + def start( + self, + *, + num_records: int, + buffer_size: int, + start_batch: int = 0, + initial_actual_num_records: int = 0, + ) -> None: if num_records <= 0: raise DatasetBatchManagementError("🛑 num_records must be positive.") if buffer_size <= 0: @@ -169,6 +176,8 @@ def start(self, *, num_records: int, buffer_size: int) -> None: if remaining_records := num_records % buffer_size: self._num_records_list.append(remaining_records) self.reset() + self._current_batch_number = start_batch + self._actual_num_records = initial_actual_num_records def write(self) -> Path | None: """Write the current batch to a parquet file. 
From 866df6b0407cfa883e48d2bb1ba3603a14d89e9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 13:13:20 +0200 Subject: [PATCH 04/29] feat(builder): implement resume logic in DatasetBuilder - build() gains a resume: bool = False parameter - _load_resume_state() reads metadata.json and validates that num_records and buffer_size match the original run - _build_with_resume() skips completed batches, clears in-flight partial results, and continues from the first incomplete batch - Raises DatasetGenerationError with clear messages for: - missing metadata.json (interrupted before first batch completes) - num_records mismatch - buffer_size mismatch - DATA_DESIGNER_ASYNC_ENGINE=1 (not yet supported) - Logs a warning and returns early when dataset is already complete Fixes #525 --- .../dataset_builders/dataset_builder.py | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 985ca2c09..39f1521e6 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -10,6 +10,7 @@ import time import uuid import warnings +from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any, Callable @@ -94,6 +95,13 @@ def _is_async_trace_enabled(settings: RunConfig) -> bool: return settings.async_trace or os.environ.get("DATA_DESIGNER_ASYNC_TRACE", "0") == "1" +@dataclass +class _ResumeState: + num_completed_batches: int + actual_num_records: int + buffer_size: int + + class DatasetBuilder: def __init__( self, @@ -197,6 +205,7 @@ def build( num_records: int, on_batch_complete: Callable[[Path], None] | None = None, save_multimedia_to_disk: bool = True, + resume: bool = False, ) -> 
Path: """Build the dataset. @@ -206,11 +215,16 @@ def build( save_multimedia_to_disk: Whether to save generated multimedia (images, audio, video) to disk. If False, multimedia is stored directly in the DataFrame (e.g., images as base64). Default is True. + resume: If True, resume generation from the last completed batch found in the existing + artifact directory. The run parameters (num_records, buffer_size) must match those + of the original run. Any in-flight partial results from the interrupted run are + discarded. Not supported when DATA_DESIGNER_ASYNC_ENGINE is enabled. Returns: Path to the generated dataset directory. """ self._reset_run_state() + self._run_model_health_check_if_needed() self._run_mcp_tool_check_if_needed() self._write_builder_config() @@ -227,6 +241,8 @@ def build( self._use_async = DATA_DESIGNER_ASYNC_ENGINE and self._resolve_async_compatibility() if self._use_async: self._build_async(generators, num_records, buffer_size, on_batch_complete) + elif resume: + self._build_with_resume(generators, num_records, buffer_size, on_batch_complete) else: group_id = uuid.uuid4().hex self.batch_manager.start(num_records=num_records, buffer_size=buffer_size) @@ -246,6 +262,85 @@ def build( return self.artifact_storage.final_dataset_path + def _load_resume_state(self, num_records: int, buffer_size: int) -> _ResumeState: + """Read and validate resume state from an existing metadata.json. + + Raises: + DatasetGenerationError: If metadata is missing or incompatible with the current run parameters. + """ + try: + metadata = self.artifact_storage.read_metadata() + except FileNotFoundError: + raise DatasetGenerationError( + "🛑 Cannot resume: metadata.json not found in the existing dataset directory. " + "Run without resume=True to start a new generation." 
+ ) + + target = metadata.get("target_num_records") + if target != num_records: + raise DatasetGenerationError( + f"🛑 Cannot resume: num_records={num_records} does not match the original run's " + f"target_num_records={target}. Use the same num_records as the interrupted run, " + "or start a new run without resume=True." + ) + + meta_buffer_size = metadata.get("buffer_size") + if meta_buffer_size != buffer_size: + raise DatasetGenerationError( + f"🛑 Cannot resume: buffer_size={buffer_size} does not match the original run's " + f"buffer_size={meta_buffer_size}. Use the same buffer_size as the interrupted run, " + "or start a new run without resume=True." + ) + + return _ResumeState( + num_completed_batches=metadata["num_completed_batches"], + actual_num_records=metadata["actual_num_records"], + buffer_size=buffer_size, + ) + + def _build_with_resume( + self, + generators: list[ColumnGenerator], + num_records: int, + buffer_size: int, + on_batch_complete: Callable[[Path], None] | None, + ) -> None: + """Resume generation from the last completed batch.""" + state = self._load_resume_state(num_records, buffer_size) + + self.batch_manager.start( + num_records=num_records, + buffer_size=buffer_size, + start_batch=state.num_completed_batches, + initial_actual_num_records=state.actual_num_records, + ) + + if state.num_completed_batches >= self.batch_manager.num_batches: + logger.warning( + "⚠️ Dataset is already complete — all batches were found in the existing artifact directory. " + "Nothing to resume. Remove resume=True if you want to generate a new dataset." + ) + return + + logger.info( + f"▶️ Resuming from batch {state.num_completed_batches + 1} of {self.batch_manager.num_batches} " + f"({state.actual_num_records} records already generated)." 
+ ) + + self.artifact_storage.clear_partial_results() + + group_id = uuid.uuid4().hex + for batch_idx in range(state.num_completed_batches, self.batch_manager.num_batches): + logger.info(f"⏳ Processing batch {batch_idx + 1} of {self.batch_manager.num_batches}") + self._run_batch( + generators, + batch_mode="batch", + group_id=group_id, + current_batch_number=batch_idx, + on_batch_complete=on_batch_complete, + ) + self.batch_manager.finish() + def build_preview(self, *, num_records: int) -> pd.DataFrame: self._reset_run_state() self._run_model_health_check_if_needed() From e0b22d5029f6ad8dac85c4410025d38165265525 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 13:13:23 +0200 Subject: [PATCH 05/29] feat(interface): expose resume on DataDesigner.create() - create() gains resume: bool = False - _create_resource_provider() passes resume to ArtifactStorage - builder.build() receives the resume flag Fixes #525 --- .../src/data_designer/interface/data_designer.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/packages/data-designer/src/data_designer/interface/data_designer.py b/packages/data-designer/src/data_designer/interface/data_designer.py index cd928f611..d165fcdd7 100644 --- a/packages/data-designer/src/data_designer/interface/data_designer.py +++ b/packages/data-designer/src/data_designer/interface/data_designer.py @@ -202,6 +202,7 @@ def create( *, num_records: int = DEFAULT_NUM_RECORDS, dataset_name: str = "dataset", + resume: bool = False, ) -> DatasetCreationResults: """Create dataset and save results to the local artifact storage. @@ -219,6 +220,11 @@ def create( a datetime stamp. For example, if the dataset name is "awesome_dataset" and a directory with the same name already exists, the dataset will be saved to a new directory with the name "awesome_dataset_2025-01-01_12-00-00". + resume: If True, resume generation from the last completed batch found in the + existing dataset directory. 
The run parameters (num_records, buffer_size) must + match those of the original run. Any in-flight partial results from the + interrupted run are discarded. Not supported when DATA_DESIGNER_ASYNC_ENGINE + is enabled. Returns: DatasetCreationResults object with methods for loading the generated dataset, @@ -231,11 +237,11 @@ def create( logger.info("🎨 Creating Data Designer dataset") self._log_jinja_rendering_engine_mode() - resource_provider = self._create_resource_provider(dataset_name, config_builder) + resource_provider = self._create_resource_provider(dataset_name, config_builder, resume=resume) try: builder = self._create_dataset_builder(config_builder.build(), resource_provider) - builder.build(num_records=num_records) + builder.build(num_records=num_records, resume=resume) except DeprecationWarning: raise except Exception as e: @@ -546,7 +552,7 @@ def _create_dataset_profiler( ) def _create_resource_provider( - self, dataset_name: str, config_builder: DataDesignerConfigBuilder + self, dataset_name: str, config_builder: DataDesignerConfigBuilder, *, resume: bool = False ) -> ResourceProvider: ArtifactStorage.mkdir_if_needed(self._artifact_path) @@ -555,7 +561,9 @@ def _create_resource_provider( seed_dataset_source = seed_config.source return create_resource_provider( - artifact_storage=ArtifactStorage(artifact_path=self._artifact_path, dataset_name=dataset_name), + artifact_storage=ArtifactStorage( + artifact_path=self._artifact_path, dataset_name=dataset_name, resume=resume + ), model_configs=config_builder.model_configs, secret_resolver=self._secret_resolver, model_provider_registry=self._model_provider_registry, From 4b514b2b140d615ba5f86816353fbfa007697610 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 13:13:30 +0200 Subject: [PATCH 06/29] test: add tests for resume mechanism Covers: - ArtifactStorage.resolved_dataset_name with resume=True - ArtifactStorage.clear_partial_results() - DatasetBatchManager.start() with 
start_batch and initial_actual_num_records - DatasetBuilder.build(resume=True): missing metadata, num_records mismatch, buffer_size mismatch, already-complete detection Fixes #525 --- .../dataset_builders/test_dataset_builder.py | 96 +++++++++++++++++++ .../utils/test_dataset_batch_manager.py | 37 +++++++ .../engine/storage/test_artifact_storage.py | 58 +++++++++++ 3 files changed, 191 insertions(+) diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index 8c796374a..64aa893ab 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1433,3 +1433,99 @@ def test_skip_row_count_preserved_across_pipeline(stub_resource_provider, stub_m assert len(result) == 5, "Skip must not change the row count" assert result["seed_id"].tolist() == [1, 2, 3, 4, 5] +# --------------------------------------------------------------------------- +# Resume mechanism tests +# --------------------------------------------------------------------------- + + +import json as _json +from pathlib import Path as _Path + +from data_designer.engine.storage.artifact_storage import ArtifactStorage as _ArtifactStorage + + +def _write_metadata(dataset_dir: _Path, **fields) -> None: + """Write a metadata.json into an existing dataset folder.""" + dataset_dir.mkdir(parents=True, exist_ok=True) + (dataset_dir / "sentinel.txt").write_text("x") # make folder non-empty for resolved_dataset_name + (dataset_dir / "metadata.json").write_text(_json.dumps(fields)) + + +def _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, *, buffer_size: int = 2): + """Return a DatasetBuilder whose ArtifactStorage has resume=True.""" + storage = _ArtifactStorage(artifact_path=tmp_path, resume=True) + stub_resource_provider.artifact_storage = 
storage + stub_resource_provider.run_config = RunConfig(buffer_size=buffer_size) + return DatasetBuilder( + data_designer_config=stub_test_config_builder.build(), + resource_provider=stub_resource_provider, + ) + + +def test_build_resume_raises_without_metadata(stub_resource_provider, stub_test_config_builder, tmp_path): + """resume=True when only the folder exists (no metadata.json) raises DatasetGenerationError. + + This covers the case where a run was interrupted before any batch completed — the + folder was created by _write_builder_config but metadata.json was never written. + """ + # Pre-create the folder with content so resolved_dataset_name(resume=True) returns "dataset" + dataset_dir = tmp_path / "dataset" + dataset_dir.mkdir() + (dataset_dir / "builder_config.json").write_text("{}") # non-empty, no metadata + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path) + with pytest.raises(DatasetGenerationError, match="metadata.json not found"): + builder.build(num_records=4, resume=True) + + +def test_build_resume_raises_on_num_records_mismatch(stub_resource_provider, stub_test_config_builder, tmp_path): + """resume=True raises when num_records differs from the original run.""" + dataset_dir = tmp_path / "dataset" + _write_metadata( + dataset_dir, + target_num_records=10, + buffer_size=2, + num_completed_batches=2, + actual_num_records=4, + ) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + with pytest.raises(DatasetGenerationError, match="num_records=4 does not match"): + builder.build(num_records=4, resume=True) + + +def test_build_resume_raises_on_buffer_size_mismatch(stub_resource_provider, stub_test_config_builder, tmp_path): + """resume=True raises when buffer_size differs from the original run.""" + dataset_dir = tmp_path / "dataset" + _write_metadata( + dataset_dir, + target_num_records=4, + buffer_size=2, + num_completed_batches=1, + 
actual_num_records=2, + ) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=3) + with pytest.raises(DatasetGenerationError, match="buffer_size=3 does not match"): + builder.build(num_records=4, resume=True) + + +def test_build_resume_logs_warning_when_already_complete( + stub_resource_provider, stub_test_config_builder, tmp_path, caplog +): + """resume=True on a fully-complete dataset logs a warning and returns without generating.""" + dataset_dir = tmp_path / "dataset" + # 4 records, 2 per batch = 2 batches; num_completed_batches == 2 → already done + _write_metadata( + dataset_dir, + target_num_records=4, + buffer_size=2, + num_completed_batches=2, + actual_num_records=4, + ) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + with caplog.at_level(logging.WARNING): + builder.build(num_records=4, resume=True) + + assert any("already complete" in record.message for record in caplog.records) diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py b/packages/data-designer-engine/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py index e2529e1fa..cf3a600bf 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py @@ -451,3 +451,40 @@ def test_full_workflow(stub_batch_manager): # Verify all files exist assert stub_batch_manager.artifact_storage.metadata_file_path.exists() assert len(list(stub_batch_manager.artifact_storage.final_dataset_path.glob("*.parquet"))) == 3 + + +# --------------------------------------------------------------------------- +# start() with resume parameters +# --------------------------------------------------------------------------- + + +def test_start_with_start_batch(stub_batch_manager): + """start_batch 
shifts _current_batch_number so the loop skips already-done batches.""" + stub_batch_manager.start(num_records=10, buffer_size=3, start_batch=2) + + assert stub_batch_manager._current_batch_number == 2 + assert stub_batch_manager.num_batches == 4 + assert stub_batch_manager.buffer_is_empty is True + + +def test_start_with_initial_actual_num_records(stub_batch_manager): + """initial_actual_num_records pre-populates the running total for resumed runs.""" + stub_batch_manager.start(num_records=10, buffer_size=3, initial_actual_num_records=6) + + assert stub_batch_manager._actual_num_records == 6 + + +def test_start_with_start_batch_and_initial_actual_num_records(stub_batch_manager): + """Both resume params can be set together.""" + stub_batch_manager.start(num_records=10, buffer_size=3, start_batch=2, initial_actual_num_records=6) + + assert stub_batch_manager._current_batch_number == 2 + assert stub_batch_manager._actual_num_records == 6 + + +def test_start_default_values_unchanged(stub_batch_manager): + """Default call (no resume params) still starts at batch 0 with 0 actual records.""" + stub_batch_manager.start(num_records=10, buffer_size=3) + + assert stub_batch_manager._current_batch_number == 0 + assert stub_batch_manager._actual_num_records == 0 diff --git a/packages/data-designer-engine/tests/engine/storage/test_artifact_storage.py b/packages/data-designer-engine/tests/engine/storage/test_artifact_storage.py index 6206d5bbc..7dc1d8b1e 100644 --- a/packages/data-designer-engine/tests/engine/storage/test_artifact_storage.py +++ b/packages/data-designer-engine/tests/engine/storage/test_artifact_storage.py @@ -412,3 +412,61 @@ def test_standalone_load_processor_dataset_raises_file_not_found(tmp_path): """Standalone function raises FileNotFoundError (not ArtifactStorageError).""" with pytest.raises(FileNotFoundError, match="No artifacts found"): load_processor_dataset(tmp_path, "nonexistent") + + +# 
--------------------------------------------------------------------------- +# Resume flag tests +# --------------------------------------------------------------------------- + + +def test_resolved_dataset_name_creates_timestamped_copy_when_folder_exists(tmp_path): + """Default behaviour: existing non-empty folder gets a timestamped sibling.""" + existing = tmp_path / "dataset" + existing.mkdir() + (existing / "some_file.txt").write_text("x") + + storage = ArtifactStorage(artifact_path=tmp_path, dataset_name="dataset") + name = storage.resolved_dataset_name + assert name != "dataset" + assert name.startswith("dataset_") + + +def test_resolved_dataset_name_resume_uses_existing_folder(tmp_path): + """With resume=True, an existing non-empty folder is used as-is.""" + existing = tmp_path / "dataset" + existing.mkdir() + (existing / "some_file.txt").write_text("x") + + storage = ArtifactStorage(artifact_path=tmp_path, dataset_name="dataset", resume=True) + assert storage.resolved_dataset_name == "dataset" + + +def test_resolved_dataset_name_resume_raises_when_no_existing_folder(tmp_path): + """With resume=True, missing dataset folder raises ArtifactStorageError at init.""" + with pytest.raises(ArtifactStorageError, match="Cannot resume"): + ArtifactStorage(artifact_path=tmp_path, dataset_name="dataset", resume=True) + + +def test_resolved_dataset_name_resume_raises_when_folder_is_empty(tmp_path): + """With resume=True, an empty existing folder raises ArtifactStorageError at init.""" + (tmp_path / "dataset").mkdir() + + with pytest.raises(ArtifactStorageError, match="Cannot resume"): + ArtifactStorage(artifact_path=tmp_path, dataset_name="dataset", resume=True) + + +def test_clear_partial_results_removes_partial_folder(tmp_path, stub_sample_dataframe): + """clear_partial_results() deletes the partial results directory and its contents.""" + storage = ArtifactStorage(artifact_path=tmp_path) + storage.write_batch_to_parquet_file(0, stub_sample_dataframe, 
BatchStage.PARTIAL_RESULT) + assert storage.partial_results_path.exists() + + storage.clear_partial_results() + assert not storage.partial_results_path.exists() + + +def test_clear_partial_results_is_noop_when_no_partial_folder(tmp_path): + """clear_partial_results() does not raise when the partial results folder is absent.""" + storage = ArtifactStorage(artifact_path=tmp_path) + assert not storage.partial_results_path.exists() + storage.clear_partial_results() # must not raise From 1db497a9174568fbf3372bd09848f56e706b95e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 13:39:50 +0200 Subject: [PATCH 07/29] feat(builder): extend resume to async engine (DATA_DESIGNER_ASYNC_ENGINE=1) - Add _find_completed_row_group_ids() to scan parquet-files/ for already-written row groups by parsing batch_*.parquet filenames - _build_async() now accepts resume=True: loads metadata, finds completed row groups, clears partial results, and logs progress; returns early if all row groups are done - _prepare_async_run() accepts skip_row_groups, initial_actual_num_records, and initial_total_num_batches so the scheduler only processes remaining row groups and RowGroupBufferManager starts from the correct counts - RowGroupBufferManager.__init__ gains initial_actual_num_records and initial_total_num_batches params to seed the counters on resume - finalize_row_group closure now writes incremental metadata after each checkpoint so any run (resume or not) can be resumed if interrupted mid-way - Remove the guard that rejected resume=True with DATA_DESIGNER_ASYNC_ENGINE=1 - Add tests for all new paths --- .../dataset_builders/dataset_builder.py | 75 ++++++++++++++-- .../utils/row_group_buffer.py | 11 ++- .../dataset_builders/test_dataset_builder.py | 89 +++++++++++++++++++ .../utils/test_row_group_buffer.py | 32 +++++++ 4 files changed, 195 insertions(+), 12 deletions(-) diff --git 
a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 39f1521e6..a20402fdf 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -215,10 +215,10 @@ def build( save_multimedia_to_disk: Whether to save generated multimedia (images, audio, video) to disk. If False, multimedia is stored directly in the DataFrame (e.g., images as base64). Default is True. - resume: If True, resume generation from the last completed batch found in the existing - artifact directory. The run parameters (num_records, buffer_size) must match those - of the original run. Any in-flight partial results from the interrupted run are - discarded. Not supported when DATA_DESIGNER_ASYNC_ENGINE is enabled. + resume: If True, resume generation from the last completed batch (sync engine) or + row group (async engine) found in the existing artifact directory. The run parameters + (num_records, buffer_size) must match those of the original run. Any in-flight + partial results from the interrupted run are discarded. Returns: Path to the generated dataset directory. @@ -240,7 +240,7 @@ def build( self._use_async = DATA_DESIGNER_ASYNC_ENGINE and self._resolve_async_compatibility() if self._use_async: - self._build_async(generators, num_records, buffer_size, on_batch_complete) + self._build_async(generators, num_records, buffer_size, on_batch_complete, resume=resume) elif resume: self._build_with_resume(generators, num_records, buffer_size, on_batch_complete) else: @@ -426,12 +426,31 @@ def _resolve_async_compatibility(self) -> bool: return False return True + def _find_completed_row_group_ids(self) -> set[int]: + """Scan the final dataset directory for already-written row group parquet files. 
+ + Returns: + Set of row-group IDs (batch numbers) that have a parquet file in ``parquet-files/``. + """ + final_path = self.artifact_storage.final_dataset_path + if not final_path.exists(): + return set() + ids: set[int] = set() + for p in final_path.glob("batch_*.parquet"): + try: + ids.add(int(p.stem.split("_", 1)[1])) + except (ValueError, IndexError): + continue + return ids + def _build_async( self, generators: list[ColumnGenerator], num_records: int, buffer_size: int, on_batch_complete: Callable[[Path], None] | None = None, + *, + resume: bool = False, ) -> None: """Async task-queue builder path - dispatches tasks based on dependency readiness.""" logger.info("⚡ DATA_DESIGNER_ASYNC_ENGINE is enabled - using async task-queue builder") @@ -439,12 +458,39 @@ def _build_async( settings = self._resource_provider.run_config trace_enabled = _is_async_trace_enabled(settings) + skip_row_groups: frozenset[int] = frozenset() + initial_actual_num_records = 0 + initial_total_num_batches = 0 + + if resume: + state = self._load_resume_state(num_records, buffer_size) + completed_ids = self._find_completed_row_group_ids() + skip_row_groups = frozenset(completed_ids) + initial_actual_num_records = state.actual_num_records + initial_total_num_batches = state.num_completed_batches + self.artifact_storage.clear_partial_results() + + total_row_groups = -(-num_records // buffer_size) # ceiling division + if len(completed_ids) >= total_row_groups: + logger.warning( + "⚠️ Dataset is already complete — all row groups were found in the existing artifact " + "directory. Nothing to resume. Remove resume=True if you want to generate a new dataset." + ) + return + + logger.info( + f"▶️ Resuming async run: {len(completed_ids)} of {total_row_groups} row group(s) already " + f"complete ({state.actual_num_records} records), skipping them." 
+ ) + def finalize_row_group(rg_id: int) -> None: def on_complete(final_path: Path | str | None) -> None: if final_path is not None and on_batch_complete: on_batch_complete(final_path) buffer_manager.checkpoint_row_group(rg_id, on_complete=on_complete) + # Write incremental metadata after each row group so interrupted runs can be resumed. + buffer_manager.write_metadata(target_num_records=num_records, buffer_size=buffer_size) scheduler, buffer_manager = self._prepare_async_run( generators, @@ -455,6 +501,9 @@ def on_complete(final_path: Path | str | None) -> None: shutdown_error_window=settings.shutdown_error_window, disable_early_shutdown=settings.disable_early_shutdown, trace=trace_enabled, + skip_row_groups=skip_row_groups, + initial_actual_num_records=initial_actual_num_records, + initial_total_num_batches=initial_total_num_batches, ) # Telemetry snapshot @@ -483,7 +532,7 @@ def on_complete(final_path: Path | str | None) -> None: except Exception: logger.debug("Failed to emit batch telemetry for async run", exc_info=True) - # Write metadata + # Write final metadata (overwrites the last incremental write with identical content). buffer_manager.write_metadata(target_num_records=num_records, buffer_size=buffer_size) # Surface partial completion @@ -515,6 +564,9 @@ def _prepare_async_run( shutdown_error_window: int = 10, disable_early_shutdown: bool = False, trace: bool = False, + skip_row_groups: frozenset[int] = frozenset(), + initial_actual_num_records: int = 0, + initial_total_num_batches: int = 0, ) -> tuple[AsyncTaskScheduler, RowGroupBufferManager]: """Build a fully-wired scheduler and buffer manager for async generation. @@ -537,18 +589,23 @@ def _prepare_async_run( for gen in generators: gen.log_pre_generation() - # Partition into row groups + # Partition into row groups, skipping any already completed on resume. 
row_groups: list[tuple[int, int]] = [] remaining = num_records rg_id = 0 while remaining > 0: size = min(buffer_size, remaining) - row_groups.append((rg_id, size)) + if rg_id not in skip_row_groups: + row_groups.append((rg_id, size)) remaining -= size rg_id += 1 tracker = CompletionTracker.with_graph(graph, row_groups) - buffer_manager = RowGroupBufferManager(self.artifact_storage) + buffer_manager = RowGroupBufferManager( + self.artifact_storage, + initial_actual_num_records=initial_actual_num_records, + initial_total_num_batches=initial_total_num_batches, + ) # Pre-batch processor callback: runs after seed tasks complete for a row group. # If it raises, the scheduler propagates the error as DatasetGenerationError (fail-fast). diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/row_group_buffer.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/row_group_buffer.py index 5ddbfec8c..18cac9d0c 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/row_group_buffer.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/row_group_buffer.py @@ -28,13 +28,18 @@ class RowGroupBufferManager: exclusively by the async scheduler. 
""" - def __init__(self, artifact_storage: ArtifactStorage) -> None: + def __init__( + self, + artifact_storage: ArtifactStorage, + initial_actual_num_records: int = 0, + initial_total_num_batches: int = 0, + ) -> None: self._buffers: dict[int, list[dict]] = {} self._row_group_sizes: dict[int, int] = {} self._dropped: dict[int, set[int]] = {} self._artifact_storage = artifact_storage - self._actual_num_records: int = 0 - self._total_num_batches: int = 0 + self._actual_num_records: int = initial_actual_num_records + self._total_num_batches: int = initial_total_num_batches def init_row_group(self, row_group: int, size: int) -> None: """Allocate a buffer for *row_group* with *size* empty rows.""" diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index 64aa893ab..3f00ef639 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1529,3 +1529,92 @@ def test_build_resume_logs_warning_when_already_complete( builder.build(num_records=4, resume=True) assert any("already complete" in record.message for record in caplog.records) + + +# --------------------------------------------------------------------------- +# _find_completed_row_group_ids tests +# --------------------------------------------------------------------------- + + +def test_find_completed_row_group_ids_empty_dir(stub_resource_provider, stub_test_config_builder, tmp_path): + """Returns empty set when final_dataset_path does not exist.""" + dataset_dir = tmp_path / "dataset" + _write_metadata(dataset_dir, target_num_records=4, buffer_size=2, num_completed_batches=0, actual_num_records=0) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path) + assert builder._find_completed_row_group_ids() == set() + + +def 
test_find_completed_row_group_ids_with_files(stub_resource_provider, stub_test_config_builder, tmp_path): + """Returns correct IDs from batch_*.parquet files in parquet-files/.""" + dataset_dir = tmp_path / "dataset" + _write_metadata(dataset_dir, target_num_records=6, buffer_size=2, num_completed_batches=2, actual_num_records=4) + + parquet_dir = dataset_dir / "parquet-files" + parquet_dir.mkdir(parents=True) + (parquet_dir / "batch_00000.parquet").write_text("") + (parquet_dir / "batch_00002.parquet").write_text("") + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + assert builder._find_completed_row_group_ids() == {0, 2} + + +def test_find_completed_row_group_ids_ignores_non_batch_files( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """Non-batch files in parquet-files/ are silently ignored.""" + dataset_dir = tmp_path / "dataset" + _write_metadata(dataset_dir, target_num_records=4, buffer_size=2, num_completed_batches=1, actual_num_records=2) + + parquet_dir = dataset_dir / "parquet-files" + parquet_dir.mkdir(parents=True) + (parquet_dir / "batch_00001.parquet").write_text("") + (parquet_dir / "unrelated.parquet").write_text("") + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + assert builder._find_completed_row_group_ids() == {1} + + +# --------------------------------------------------------------------------- +# Async resume via _build_async tests +# --------------------------------------------------------------------------- + + +def _write_parquet_files(parquet_dir: _Path, row_group_ids: list[int]) -> None: + """Create stub batch_*.parquet files for the given row group IDs.""" + parquet_dir.mkdir(parents=True, exist_ok=True) + for rg_id in row_group_ids: + (parquet_dir / f"batch_{rg_id:05d}.parquet").write_text("") + + +def test_build_async_resume_logs_warning_when_already_complete( + stub_resource_provider, 
stub_test_config_builder, tmp_path, caplog +): + """Async resume on a fully-complete dataset logs a warning and returns without running.""" + dataset_dir = tmp_path / "dataset" + # 4 records at buffer_size=2 → 2 row groups (IDs 0 and 1) + _write_metadata(dataset_dir, target_num_records=4, buffer_size=2, num_completed_batches=2, actual_num_records=4) + _write_parquet_files(dataset_dir / "parquet-files", [0, 1]) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + + with caplog.at_level(logging.WARNING): + with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): + with patch.object(builder, "_run_model_health_check_if_needed"): + builder.build(num_records=4, resume=True) + + assert any("already complete" in record.message for record in caplog.records) + + +def test_build_async_resume_raises_without_metadata(stub_resource_provider, stub_test_config_builder, tmp_path): + """Async resume raises DatasetGenerationError when metadata.json is missing.""" + dataset_dir = tmp_path / "dataset" + dataset_dir.mkdir() + (dataset_dir / "builder_config.json").write_text("{}") + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path) + + with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): + with patch.object(builder, "_run_model_health_check_if_needed"): + with pytest.raises(DatasetGenerationError, match="metadata.json not found"): + builder.build(num_records=4, resume=True) diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/utils/test_row_group_buffer.py b/packages/data-designer-engine/tests/engine/dataset_builders/utils/test_row_group_buffer.py index 37b6f71ac..08dad3730 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/utils/test_row_group_buffer.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/utils/test_row_group_buffer.py @@ -223,3 +223,35 @@ def 
test_checkpoint_calls_on_complete_when_all_rows_dropped() -> None: callback.assert_called_once_with(None) storage.write_batch_to_parquet_file.assert_not_called() + + +def test_initial_actual_num_records() -> None: + """initial_actual_num_records pre-seeds the actual_num_records counter.""" + storage = _mock_artifact_storage() + storage.write_batch_to_parquet_file.return_value = "/fake/path.parquet" + storage.move_partial_result_to_final_file_path.return_value = "/fake/final.parquet" + + mgr = RowGroupBufferManager(storage, initial_actual_num_records=10) + mgr.init_row_group(0, 3) + mgr.update_batch(0, "col", ["a", "b", "c"]) + mgr.checkpoint_row_group(0) + + assert mgr.actual_num_records == 13 + + +def test_initial_total_num_batches_reflected_in_metadata() -> None: + """initial_total_num_batches pre-seeds the batch counter used by write_metadata.""" + storage = _mock_artifact_storage() + storage.write_batch_to_parquet_file.return_value = "/fake/path.parquet" + storage.move_partial_result_to_final_file_path.return_value = "/fake/final.parquet" + + mgr = RowGroupBufferManager(storage, initial_actual_num_records=5, initial_total_num_batches=2) + mgr.init_row_group(2, 2) + mgr.update_batch(2, "col", ["x", "y"]) + mgr.checkpoint_row_group(2) + + mgr.write_metadata(target_num_records=9, buffer_size=3) + + written = storage.write_metadata.call_args[0][0] + assert written["num_completed_batches"] == 3 # 2 initial + 1 new + assert written["actual_num_records"] == 7 # 5 initial + 2 new From 812d7dfd202683e5fe7feafddccf50828f381582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 13:48:03 +0200 Subject: [PATCH 08/29] fix(builder): skip after-generation processors when resume finds dataset already complete _build_with_resume and _build_async now return False when the dataset is already complete (early-return path), True otherwise. 
build() skips _processor_runner.run_after_generation() on False, preventing processors from calling shutil.rmtree and rewriting an already-finalized dataset. Fixes the issue raised in review: greptile P1 comment on PR #526. --- .../dataset_builders/dataset_builder.py | 32 +++++++++++----- .../dataset_builders/test_dataset_builder.py | 38 +++++++++++++++++++ 2 files changed, 61 insertions(+), 9 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index a20402fdf..82ea1adac 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -238,11 +238,12 @@ def build( start_time = time.perf_counter() buffer_size = self._resource_provider.run_config.buffer_size + generated = True self._use_async = DATA_DESIGNER_ASYNC_ENGINE and self._resolve_async_compatibility() if self._use_async: - self._build_async(generators, num_records, buffer_size, on_batch_complete, resume=resume) + generated = self._build_async(generators, num_records, buffer_size, on_batch_complete, resume=resume) elif resume: - self._build_with_resume(generators, num_records, buffer_size, on_batch_complete) + generated = self._build_with_resume(generators, num_records, buffer_size, on_batch_complete) else: group_id = uuid.uuid4().hex self.batch_manager.start(num_records=num_records, buffer_size=buffer_size) @@ -257,7 +258,8 @@ def build( ) self.batch_manager.finish() - self._processor_runner.run_after_generation(buffer_size) + if generated: + self._processor_runner.run_after_generation(buffer_size) self._resource_provider.model_registry.log_model_usage(time.perf_counter() - start_time) return self.artifact_storage.final_dataset_path @@ -304,8 +306,13 @@ def _build_with_resume( num_records: int, buffer_size: int, 
on_batch_complete: Callable[[Path], None] | None, - ) -> None: - """Resume generation from the last completed batch.""" + ) -> bool: + """Resume generation from the last completed batch. + + Returns: + False if the dataset was already complete (no new records generated), + True after successfully generating the remaining batches. + """ state = self._load_resume_state(num_records, buffer_size) self.batch_manager.start( @@ -320,7 +327,7 @@ def _build_with_resume( "⚠️ Dataset is already complete — all batches were found in the existing artifact directory. " "Nothing to resume. Remove resume=True if you want to generate a new dataset." ) - return + return False logger.info( f"▶️ Resuming from batch {state.num_completed_batches + 1} of {self.batch_manager.num_batches} " @@ -340,6 +347,7 @@ def _build_with_resume( on_batch_complete=on_batch_complete, ) self.batch_manager.finish() + return True def build_preview(self, *, num_records: int) -> pd.DataFrame: self._reset_run_state() @@ -451,8 +459,13 @@ def _build_async( on_batch_complete: Callable[[Path], None] | None = None, *, resume: bool = False, - ) -> None: - """Async task-queue builder path - dispatches tasks based on dependency readiness.""" + ) -> bool: + """Async task-queue builder path - dispatches tasks based on dependency readiness. + + Returns: + False if the dataset was already complete (no new records generated), + True after successfully running the scheduler. + """ logger.info("⚡ DATA_DESIGNER_ASYNC_ENGINE is enabled - using async task-queue builder") settings = self._resource_provider.run_config @@ -476,7 +489,7 @@ def _build_async( "⚠️ Dataset is already complete — all row groups were found in the existing artifact " "directory. Nothing to resume. Remove resume=True if you want to generate a new dataset." 
) - return + return False logger.info( f"▶️ Resuming async run: {len(completed_ids)} of {total_row_groups} row group(s) already " @@ -534,6 +547,7 @@ def on_complete(final_path: Path | str | None) -> None: # Write final metadata (overwrites the last incremental write with identical content). buffer_manager.write_metadata(target_num_records=num_records, buffer_size=buffer_size) + return True # Surface partial completion actual = self._actual_num_records diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index 3f00ef639..3d6283191 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1531,6 +1531,26 @@ def test_build_resume_logs_warning_when_already_complete( assert any("already complete" in record.message for record in caplog.records) +def test_build_resume_already_complete_does_not_run_after_generation_processors( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """When already complete, run_after_generation must NOT be called (would destroy the dataset).""" + dataset_dir = tmp_path / "dataset" + _write_metadata( + dataset_dir, + target_num_records=4, + buffer_size=2, + num_completed_batches=2, + actual_num_records=4, + ) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + with patch.object(builder._processor_runner, "run_after_generation") as mock_after: + builder.build(num_records=4, resume=True) + + mock_after.assert_not_called() + + # --------------------------------------------------------------------------- # _find_completed_row_group_ids tests # --------------------------------------------------------------------------- @@ -1618,3 +1638,21 @@ def test_build_async_resume_raises_without_metadata(stub_resource_provider, 
stub with patch.object(builder, "_run_model_health_check_if_needed"): with pytest.raises(DatasetGenerationError, match="metadata.json not found"): builder.build(num_records=4, resume=True) + + +def test_build_async_resume_already_complete_does_not_run_after_generation_processors( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """Async resume: when already complete, run_after_generation must NOT be called.""" + dataset_dir = tmp_path / "dataset" + _write_metadata(dataset_dir, target_num_records=4, buffer_size=2, num_completed_batches=2, actual_num_records=4) + _write_parquet_files(dataset_dir / "parquet-files", [0, 1]) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + + with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder._processor_runner, "run_after_generation") as mock_after: + builder.build(num_records=4, resume=True) + + mock_after.assert_not_called() From 4054610147cd89e3d2558eef8fa89ee8c1488d5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 14:26:45 +0200 Subject: [PATCH 09/29] fix(builder): use filesystem count for initial_total_num_batches on async resume Metadata can lag by one row group if a crash occurs between move_partial_result_to_final_file_path and write_metadata. Using len(completed_ids) from the filesystem scan instead of state.num_completed_batches ensures the final metadata reflects the actual number of parquet files present, not the potentially stale metadata count. 
--- .../dataset_builders/dataset_builder.py | 4 ++- .../dataset_builders/test_dataset_builder.py | 27 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 82ea1adac..ce25a9cf8 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -480,7 +480,9 @@ def _build_async( completed_ids = self._find_completed_row_group_ids() skip_row_groups = frozenset(completed_ids) initial_actual_num_records = state.actual_num_records - initial_total_num_batches = state.num_completed_batches + # Use filesystem count as source of truth — metadata may lag by one row group + # if a crash occurred between move_partial_result_to_final_file_path and write_metadata. + initial_total_num_batches = len(completed_ids) self.artifact_storage.clear_partial_results() total_row_groups = -(-num_records // buffer_size) # ceiling division diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index 3d6283191..fe18553aa 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1656,3 +1656,30 @@ def test_build_async_resume_already_complete_does_not_run_after_generation_proce builder.build(num_records=4, resume=True) mock_after.assert_not_called() + + +def test_find_completed_row_group_ids_used_for_initial_total_batches( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """initial_total_num_batches uses filesystem count, not metadata count. 
+ + Simulates the crash window: 2 parquet files exist on disk but metadata still + records num_completed_batches=1 (write_metadata crashed after the second + row group was moved to parquet-files/ but before metadata was updated). + Verifies that _find_completed_row_group_ids() (= 2) is used, not metadata (= 1). + """ + dataset_dir = tmp_path / "dataset" + # Metadata lags — says only 1 batch completed + _write_metadata(dataset_dir, target_num_records=4, buffer_size=2, num_completed_batches=1, actual_num_records=2) + # Filesystem truth — 2 row groups already written + _write_parquet_files(dataset_dir / "parquet-files", [0, 1]) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + # Both row groups are on disk → dataset is already complete → generated=False + with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder._processor_runner, "run_after_generation") as mock_after: + builder.build(num_records=4, resume=True) + + # Already complete based on filesystem count (2 files ≥ 2 row groups) — no generation needed + mock_after.assert_not_called() From 0bdf24ab677499b932550847d7d7d1c2a4183395 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 21:22:01 +0200 Subject: [PATCH 10/29] feat(results): add export() method and --output-format CLI flag Adds DatasetCreationResults.export(path, format=) supporting jsonl, csv, and parquet. The CLI create command gains --output-format / -f which writes dataset. alongside the parquet batch files. 
--- .../src/data_designer/cli/commands/create.py | 12 ++++ .../cli/controllers/generation_controller.py | 21 ++++++ .../src/data_designer/interface/results.py | 41 ++++++++++- .../tests/cli/commands/test_create_command.py | 31 +++++++- packages/data-designer/tests/cli/test_main.py | 1 + .../tests/interface/test_results.py | 71 +++++++++++++++++++ 6 files changed, 174 insertions(+), 3 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/commands/create.py b/packages/data-designer/src/data_designer/cli/commands/create.py index 3bf3265f0..1f9506dca 100644 --- a/packages/data-designer/src/data_designer/cli/commands/create.py +++ b/packages/data-designer/src/data_designer/cli/commands/create.py @@ -7,6 +7,7 @@ from data_designer.cli.controllers.generation_controller import GenerationController from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS +from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS def create_command( @@ -35,6 +36,16 @@ def create_command( "-o", help="Path where generated artifacts will be stored. Defaults to ./artifacts.", ), + output_format: str | None = typer.Option( + None, + "--output-format", + "-f", + help=( + f"Export the dataset to a single file after generation. " + f"Supported formats: {', '.join(SUPPORTED_EXPORT_FORMATS)}. " + "The file is written to //dataset.." + ), + ), ) -> None: """Create a full dataset and save results to disk. 
@@ -60,4 +71,5 @@ def create_command( num_records=num_records, dataset_name=dataset_name, artifact_path=artifact_path, + output_format=output_format, ) diff --git a/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py b/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py index 74a44c3cd..bdac0a95f 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py @@ -116,6 +116,7 @@ def run_create( num_records: int, dataset_name: str, artifact_path: str | None, + output_format: str | None = None, ) -> None: """Load config, create a full dataset, and save results to disk. @@ -124,6 +125,8 @@ def run_create( num_records: Number of records to generate. dataset_name: Name for the generated dataset folder. artifact_path: Path where generated artifacts will be stored, or None for default. + output_format: If set, export the dataset to a single file in this format after + generation. One of 'jsonl', 'csv', 'parquet'. """ config_builder = self._load_config(config_source) @@ -157,6 +160,24 @@ def run_create( console.print() print_success(f"Dataset created — {len(dataset)} record(s) generated") console.print(f" Artifacts saved to: [bold]{results.artifact_storage.base_dataset_path}[/bold]") + + if output_format is not None: + from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS + + if output_format not in SUPPORTED_EXPORT_FORMATS: + print_error( + f"Unsupported export format: {output_format!r}. " + f"Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." 
+ ) + raise typer.Exit(code=1) + export_path = results.artifact_storage.base_dataset_path / f"dataset.{output_format}" + try: + results.export(export_path, format=output_format) # type: ignore[arg-type] + except Exception as e: + print_error(f"Export failed: {e}") + raise typer.Exit(code=1) + console.print(f" Exported to: [bold]{export_path}[/bold]") + console.print() def _load_config(self, config_source: str) -> DataDesignerConfigBuilder: diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index 07692ff00..fce35f8d3 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -4,7 +4,7 @@ from __future__ import annotations from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.config_builder import DataDesignerConfigBuilder @@ -19,6 +19,9 @@ from data_designer.engine.dataset_builders.utils.task_model import TaskTrace +ExportFormat = Literal["jsonl", "csv", "parquet"] +SUPPORTED_EXPORT_FORMATS: tuple[str, ...] = ("jsonl", "csv", "parquet") + class DatasetCreationResults(WithRecordSamplerMixin): """Results container for a Data Designer dataset creation run. @@ -95,6 +98,42 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path: raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.") return self.artifact_storage.processors_outputs_path / processor_name + def export(self, path: Path | str, *, format: ExportFormat = "jsonl") -> Path: + """Export the generated dataset to a single file. + + Args: + path: Output file path. The extension is not inferred from *format* — + the exact path is used as-is. + format: Output format. One of ``'jsonl'``, ``'csv'``, or ``'parquet'``. + Defaults to ``'jsonl'``. 
+ + Returns: + Path to the written file. + + Raises: + ValueError: If an unsupported format is requested. + + Example: + >>> results = data_designer.create(config, num_records=1000) + >>> results.export("output.jsonl") + PosixPath('output.jsonl') + >>> results.export("output.csv", format="csv") + PosixPath('output.csv') + """ + if format not in SUPPORTED_EXPORT_FORMATS: + raise ValueError( + f"Unsupported export format: {format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." + ) + path = Path(path) + df = self.load_dataset() + if format == "jsonl": + df.to_json(path, orient="records", lines=True, force_ascii=False) + elif format == "csv": + df.to_csv(path, index=False) + elif format == "parquet": + df.to_parquet(path, index=False) + return path + def push_to_hub( self, repo_id: str, diff --git a/packages/data-designer/tests/cli/commands/test_create_command.py b/packages/data-designer/tests/cli/commands/test_create_command.py index 30cae9bc7..cde4fff06 100644 --- a/packages/data-designer/tests/cli/commands/test_create_command.py +++ b/packages/data-designer/tests/cli/commands/test_create_command.py @@ -18,7 +18,7 @@ def test_create_command_delegates_to_controller(mock_ctrl_cls: MagicMock) -> Non mock_ctrl = MagicMock() mock_ctrl_cls.return_value = mock_ctrl - create_command(config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None) + create_command(config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None, output_format=None) mock_ctrl_cls.assert_called_once() mock_ctrl.run_create.assert_called_once_with( @@ -26,6 +26,7 @@ def test_create_command_delegates_to_controller(mock_ctrl_cls: MagicMock) -> Non num_records=10, dataset_name="dataset", artifact_path=None, + output_format=None, ) @@ -40,6 +41,7 @@ def test_create_command_passes_custom_options(mock_ctrl_cls: MagicMock) -> None: num_records=100, dataset_name="my_data", artifact_path="/custom/output", + output_format=None, ) 
mock_ctrl.run_create.assert_called_once_with( @@ -47,6 +49,7 @@ def test_create_command_passes_custom_options(mock_ctrl_cls: MagicMock) -> None: num_records=100, dataset_name="my_data", artifact_path="/custom/output", + output_format=None, ) @@ -56,11 +59,35 @@ def test_create_command_default_artifact_path_is_none(mock_ctrl_cls: MagicMock) mock_ctrl = MagicMock() mock_ctrl_cls.return_value = mock_ctrl - create_command(config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None) + create_command(config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, output_format=None) mock_ctrl.run_create.assert_called_once_with( config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, + output_format=None, + ) + + +@patch("data_designer.cli.commands.create.GenerationController") +def test_create_command_passes_output_format(mock_ctrl_cls: MagicMock) -> None: + """Test create_command forwards --output-format to the controller.""" + mock_ctrl = MagicMock() + mock_ctrl_cls.return_value = mock_ctrl + + create_command( + config_source="config.yaml", + num_records=10, + dataset_name="dataset", + artifact_path=None, + output_format="jsonl", + ) + + mock_ctrl.run_create.assert_called_once_with( + config_source="config.yaml", + num_records=10, + dataset_name="dataset", + artifact_path=None, + output_format="jsonl", ) diff --git a/packages/data-designer/tests/cli/test_main.py b/packages/data-designer/tests/cli/test_main.py index 32d9cfc7d..15349d8e6 100644 --- a/packages/data-designer/tests/cli/test_main.py +++ b/packages/data-designer/tests/cli/test_main.py @@ -84,4 +84,5 @@ def test_app_dispatches_lazy_create_command(mock_controller_cls: Mock) -> None: num_records=DEFAULT_NUM_RECORDS, dataset_name="dataset", artifact_path=None, + output_format=None, ) diff --git a/packages/data-designer/tests/interface/test_results.py b/packages/data-designer/tests/interface/test_results.py index a28dd987e..802cfff6c 100644 --- 
a/packages/data-designer/tests/interface/test_results.py +++ b/packages/data-designer/tests/interface/test_results.py @@ -259,6 +259,77 @@ def test_load_dataset_independent_of_record_sampler_cache(stub_dataset_creation_ stub_artifact_storage.load_dataset.assert_called_once() +@pytest.mark.parametrize("fmt", ["jsonl", "csv", "parquet"]) +def test_export_writes_file(stub_dataset_creation_results, tmp_path, fmt): + """export() writes a file in the requested format.""" + out = tmp_path / f"out.{fmt}" + result = stub_dataset_creation_results.export(out, format=fmt) + assert result == out + assert out.exists() + assert out.stat().st_size > 0 + + +def test_export_jsonl_content(stub_dataset_creation_results, stub_dataframe, tmp_path): + """JSONL export writes one JSON object per line.""" + import json + + out = tmp_path / "out.jsonl" + stub_dataset_creation_results.export(out, format="jsonl") + lines = out.read_text(encoding="utf-8").splitlines() + assert len(lines) == len(stub_dataframe) + # Each line must be valid JSON + for line in lines: + json.loads(line) + + +def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, tmp_path): + """CSV export has a header row and one data row per record.""" + import data_designer.lazy_heavy_imports as lazy + + out = tmp_path / "out.csv" + stub_dataset_creation_results.export(out, format="csv") + loaded = lazy.pd.read_csv(out) + assert list(loaded.columns) == list(stub_dataframe.columns) + assert len(loaded) == len(stub_dataframe) + + +def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, tmp_path): + """Parquet export round-trips to the original DataFrame.""" + import data_designer.lazy_heavy_imports as lazy + + out = tmp_path / "out.parquet" + stub_dataset_creation_results.export(out, format="parquet") + loaded = lazy.pd.read_parquet(out) + lazy.pd.testing.assert_frame_equal(loaded.reset_index(drop=True), stub_dataframe.reset_index(drop=True)) + + +def 
test_export_default_format_is_jsonl(stub_dataset_creation_results, tmp_path): + """export() defaults to JSONL when no format is given.""" + import json + + out = tmp_path / "out.jsonl" + stub_dataset_creation_results.export(out) + lines = out.read_text(encoding="utf-8").splitlines() + # All lines must be valid JSON + for line in lines: + json.loads(line) + + +def test_export_unsupported_format_raises(stub_dataset_creation_results, tmp_path): + """export() raises ValueError for unknown formats.""" + with pytest.raises(ValueError, match="Unsupported export format"): + stub_dataset_creation_results.export(tmp_path / "out.xyz", format="xlsx") # type: ignore[arg-type] + + +def test_export_returns_path_object(stub_dataset_creation_results, tmp_path): + """export() returns a Path regardless of whether str or Path was passed.""" + from pathlib import Path + + out = tmp_path / "out.jsonl" + result = stub_dataset_creation_results.export(str(out)) + assert isinstance(result, Path) + + def test_preview_results_dataset_metadata() -> None: """Test that PreviewResults uses DatasetMetadata in display_sample_record.""" config_builder = MagicMock(spec=DataDesignerConfigBuilder) From 4401e4b446fa2315c0c6c3fc3c1f3558b1895c14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Tue, 14 Apr 2026 22:57:32 +0200 Subject: [PATCH 11/29] fix(builder): handle resume when metadata.json missing (interrupted before first batch) When a run is interrupted before any row group or batch completes, metadata.json is never written. Previously resume=True would raise DatasetGenerationError in this case. Now build() detects the missing file, logs an info message, clears any leftover partial results and falls back to a clean fresh run. This is the common scenario for small datasets (fewer records than buffer_size) where all records fit in a single row group. 
--- .../dataset_builders/dataset_builder.py | 11 +++++ .../dataset_builders/test_dataset_builder.py | 40 ++++++++++++++----- .../tests/cli/commands/test_create_command.py | 8 +++- 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index ce25a9cf8..f77be87de 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -238,6 +238,17 @@ def build( start_time = time.perf_counter() buffer_size = self._resource_provider.run_config.buffer_size + if resume and not self.artifact_storage.metadata_file_path.exists(): + # No metadata.json means the previous run was interrupted before any batch (sync) or + # row group (async) completed. Nothing to resume — discard any leftover partial + # results and start fresh. + logger.info( + "▶️ No metadata.json found — the previous run was interrupted before any batch " + "completed. Starting generation from the beginning." 
+            )
+            self.artifact_storage.clear_partial_results()
+            resume = False
+
         generated = True
         self._use_async = DATA_DESIGNER_ASYNC_ENGINE and self._resolve_async_compatibility()
         if self._use_async:
diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py
index fe18553aa..5bc2969bb 100644
--- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py
+++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py
@@ -1462,11 +1462,12 @@ def _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_p
     )
 
 
-def test_build_resume_raises_without_metadata(stub_resource_provider, stub_test_config_builder, tmp_path):
-    """resume=True when only the folder exists (no metadata.json) raises DatasetGenerationError.
+def test_build_resume_starts_fresh_without_metadata(stub_resource_provider, stub_test_config_builder, tmp_path, caplog):
+    """resume=True when only the folder exists (no metadata.json) logs an info message and starts fresh.
 
     This covers the case where a run was interrupted before any batch completed — the folder
     was created by _write_builder_config but metadata.json was never written.
+    Previously this raised DatasetGenerationError; now it logs an info message and restarts from batch 0.
""" # Pre-create the folder with content so resolved_dataset_name(resume=True) returns "dataset" dataset_dir = tmp_path / "dataset" @@ -1474,8 +1475,14 @@ def test_build_resume_raises_without_metadata(stub_resource_provider, stub_test_ (dataset_dir / "builder_config.json").write_text("{}") # non-empty, no metadata builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path) - with pytest.raises(DatasetGenerationError, match="metadata.json not found"): - builder.build(num_records=4, resume=True) + with caplog.at_level(logging.INFO): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_run_batch"): + with patch.object(builder.batch_manager, "finish"): + # resume=False is set internally; build dispatches to the normal (non-resume) path + builder.build(num_records=4, resume=True) + + assert any("interrupted before any batch completed" in record.message for record in caplog.records) def test_build_resume_raises_on_num_records_mismatch(stub_resource_provider, stub_test_config_builder, tmp_path): @@ -1626,18 +1633,31 @@ def test_build_async_resume_logs_warning_when_already_complete( assert any("already complete" in record.message for record in caplog.records) -def test_build_async_resume_raises_without_metadata(stub_resource_provider, stub_test_config_builder, tmp_path): - """Async resume raises DatasetGenerationError when metadata.json is missing.""" +def test_build_async_resume_starts_fresh_without_metadata( + stub_resource_provider, stub_test_config_builder, tmp_path, caplog +): + """Async resume with no metadata.json logs an info message and starts fresh. + + Previously this raised DatasetGenerationError; now it silently restarts from row group 0. + The log is emitted in build() before dispatching to _build_async, so mocking _build_async + does not suppress the message. 
+ """ dataset_dir = tmp_path / "dataset" dataset_dir.mkdir() (dataset_dir / "builder_config.json").write_text("{}") builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path) - with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): - with patch.object(builder, "_run_model_health_check_if_needed"): - with pytest.raises(DatasetGenerationError, match="metadata.json not found"): - builder.build(num_records=4, resume=True) + with caplog.at_level(logging.INFO): + with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_build_async", return_value=True) as mock_async: + builder.build(num_records=4, resume=True) + + # _build_async is called with resume=False because the no-metadata path resets the flag + _, kwargs = mock_async.call_args + assert kwargs.get("resume") is False + assert any("interrupted before any batch completed" in record.message for record in caplog.records) def test_build_async_resume_already_complete_does_not_run_after_generation_processors( diff --git a/packages/data-designer/tests/cli/commands/test_create_command.py b/packages/data-designer/tests/cli/commands/test_create_command.py index cde4fff06..fc779df7c 100644 --- a/packages/data-designer/tests/cli/commands/test_create_command.py +++ b/packages/data-designer/tests/cli/commands/test_create_command.py @@ -18,7 +18,9 @@ def test_create_command_delegates_to_controller(mock_ctrl_cls: MagicMock) -> Non mock_ctrl = MagicMock() mock_ctrl_cls.return_value = mock_ctrl - create_command(config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None, output_format=None) + create_command( + config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None, output_format=None + ) mock_ctrl_cls.assert_called_once() mock_ctrl.run_create.assert_called_once_with( @@ -59,7 +61,9 @@ def 
test_create_command_default_artifact_path_is_none(mock_ctrl_cls: MagicMock) mock_ctrl = MagicMock() mock_ctrl_cls.return_value = mock_ctrl - create_command(config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, output_format=None) + create_command( + config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, output_format=None + ) mock_ctrl.run_create.assert_called_once_with( config_source="config.yaml", From c2d0a77e6eb4d4547f04da909ab9c08289cd8dcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Tue, 14 Apr 2026 23:07:09 +0200 Subject: [PATCH 12/29] =?UTF-8?q?docs(interface):=20fix=20resume=20docstri?= =?UTF-8?q?ng=20=E2=80=94=20async=20engine=20is=20supported?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/data_designer/interface/data_designer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/data-designer/src/data_designer/interface/data_designer.py b/packages/data-designer/src/data_designer/interface/data_designer.py index d165fcdd7..767b66187 100644 --- a/packages/data-designer/src/data_designer/interface/data_designer.py +++ b/packages/data-designer/src/data_designer/interface/data_designer.py @@ -220,11 +220,12 @@ def create( a datetime stamp. For example, if the dataset name is "awesome_dataset" and a directory with the same name already exists, the dataset will be saved to a new directory with the name "awesome_dataset_2025-01-01_12-00-00". - resume: If True, resume generation from the last completed batch found in the - existing dataset directory. The run parameters (num_records, buffer_size) must - match those of the original run. Any in-flight partial results from the - interrupted run are discarded. Not supported when DATA_DESIGNER_ASYNC_ENGINE - is enabled. 
+ resume: If True, resume generation from the last completed batch (sync engine) + or row group (async engine) found in the existing dataset directory. If no + progress was checkpointed yet (i.e. the run was interrupted before the first + batch/row-group completed), generation restarts from the beginning. The run + parameters (num_records, buffer_size) must match those of the original run. + Any in-flight partial results from the interrupted run are discarded. Returns: DatasetCreationResults object with methods for loading the generated dataset, From 4ffd7f311861aea930965523a5914ee159026e92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Tue, 14 Apr 2026 23:22:31 +0200 Subject: [PATCH 13/29] fix(builder): derive initial_actual_num_records from filesystem in async resume In the crash window (row group written to disk but write_metadata crashed before updating the file), both initial_total_num_batches and initial_actual_num_records now use the filesystem-discovered completed_ids as source of truth. Previously initial_actual_num_records was read from potentially stale metadata, causing actual_num_records in the final metadata to be undercounted by one row group. Also adds a test covering the partial-resume crash-window scenario. 
--- .../dataset_builders/dataset_builder.py | 13 +++-- .../dataset_builders/test_dataset_builder.py | 51 +++++++++++++++++++ 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index f77be87de..29651b63c 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -487,13 +487,16 @@ def _build_async( initial_total_num_batches = 0 if resume: - state = self._load_resume_state(num_records, buffer_size) + self._load_resume_state(num_records, buffer_size) completed_ids = self._find_completed_row_group_ids() skip_row_groups = frozenset(completed_ids) - initial_actual_num_records = state.actual_num_records - # Use filesystem count as source of truth — metadata may lag by one row group - # if a crash occurred between move_partial_result_to_final_file_path and write_metadata. + # Use filesystem as source of truth for both counters — metadata may lag by one + # row group if a crash occurred between move_partial_result_to_final_file_path + # and write_metadata. initial_total_num_batches = len(completed_ids) + initial_actual_num_records = sum( + min(buffer_size, num_records - rg_id * buffer_size) for rg_id in completed_ids + ) self.artifact_storage.clear_partial_results() total_row_groups = -(-num_records // buffer_size) # ceiling division @@ -506,7 +509,7 @@ def _build_async( logger.info( f"▶️ Resuming async run: {len(completed_ids)} of {total_row_groups} row group(s) already " - f"complete ({state.actual_num_records} records), skipping them." + f"complete ({initial_actual_num_records} records), skipping them." 
) def finalize_row_group(rg_id: int) -> None: diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index 5bc2969bb..ca36f2cf6 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1703,3 +1703,54 @@ def test_find_completed_row_group_ids_used_for_initial_total_batches( # Already complete based on filesystem count (2 files ≥ 2 row groups) — no generation needed mock_after.assert_not_called() + + +def test_initial_actual_num_records_from_filesystem_in_crash_window( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """initial_actual_num_records is derived from filesystem, not stale metadata. + + Crash window scenario: row groups 0 and 1 are on disk but metadata only records + num_completed_batches=1 / actual_num_records=2 (write_metadata crashed after + the second row group was written but before it updated the file). + + With 6 records and buffer_size=2 (3 row groups total), the correct + initial_actual_num_records is 4 (groups 0+1), not 2 (stale metadata value). 
+ """ + import asyncio as stdlib_asyncio + + dataset_dir = tmp_path / "dataset" + # Metadata lags — says only 1 batch completed with 2 records + _write_metadata(dataset_dir, target_num_records=6, buffer_size=2, num_completed_batches=1, actual_num_records=2) + # Filesystem truth — 2 row groups already written (ids 0 and 1) + _write_parquet_files(dataset_dir / "parquet-files", [0, 1]) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + + captured: dict = {} + + def capturing_prepare(*args, **kwargs): + captured["initial_actual_num_records"] = kwargs.get("initial_actual_num_records", 0) + captured["initial_total_num_batches"] = kwargs.get("initial_total_num_batches", 0) + mock_scheduler = Mock() + mock_scheduler.traces = [] + mock_buffer_manager = Mock() + return mock_scheduler, mock_buffer_manager + + mock_future = Mock() + mock_future.result = Mock(return_value=None) + + # asyncio and ensure_async_engine_loop are lazy-imported in dataset_builder only when + # DATA_DESIGNER_ASYNC_ENGINE=True at module load time. Inject them for the duration + # of this test so _build_async can proceed past the early-return path. 
+ with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): + with patch.object(builder_mod, "asyncio", stdlib_asyncio, create=True): + with patch.object(builder_mod, "ensure_async_engine_loop", Mock(return_value=Mock()), create=True): + with patch.object(stdlib_asyncio, "run_coroutine_threadsafe", return_value=mock_future): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_prepare_async_run", side_effect=capturing_prepare): + builder.build(num_records=6, resume=True) + + # Filesystem says 2 groups done (IDs 0+1) → 2+2 = 4 records, not stale metadata value 2 + assert captured["initial_actual_num_records"] == 4 + assert captured["initial_total_num_batches"] == 2 From b89f1a150e3212c96cae9aed2811b88ab00a920c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Fri, 1 May 2026 00:26:33 +0200 Subject: [PATCH 14/29] feat(resume): replace resume: bool with ResumeMode enum (NEVER/ALWAYS/IF_POSSIBLE) - Introduces ResumeMode(StrEnum) in artifact_storage.py for use across all layers - Replaces resume: bool with resume: ResumeMode in DatasetBuilder.build(), DataDesigner.create(), ArtifactStorage, and _build_async() - Adds _check_resume_config_compatibility() using config fingerprints to support IF_POSSIBLE: falls back to a fresh run when config has changed since last run - Relaxes num_records validation from strict equality to num_records >= actual_num_records, allowing dataset extension on resume; buffer_size must still match exactly - Preserves exception chain with 'from exc' on FileNotFoundError in _load_resume_state - Exports ResumeMode from data_designer.interface for users to import - Adds skip_row_groups assertion test and IF_POSSIBLE storage behavior tests --- .../dataset_builders/dataset_builder.py | 89 +++++++++++++---- .../engine/storage/artifact_storage.py | 14 ++- .../dataset_builders/test_dataset_builder.py | 99 +++++++++++++++---- .../engine/storage/test_artifact_storage.py | 30 ++++-- 
.../src/data_designer/interface/__init__.py | 2 + .../data_designer/interface/data_designer.py | 25 +++-- 6 files changed, 201 insertions(+), 58 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 29651b63c..6ad00fd73 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -5,6 +5,7 @@ import contextlib import functools +import json import logging import os import time @@ -54,7 +55,7 @@ from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry from data_designer.engine.resources.resource_provider import ResourceProvider -from data_designer.engine.storage.artifact_storage import SDG_CONFIG_FILENAME, ArtifactStorage +from data_designer.engine.storage.artifact_storage import SDG_CONFIG_FILENAME, ArtifactStorage, ResumeMode from data_designer.engine.storage.media_storage import StorageMode if TYPE_CHECKING: @@ -205,7 +206,7 @@ def build( num_records: int, on_batch_complete: Callable[[Path], None] | None = None, save_multimedia_to_disk: bool = True, - resume: bool = False, + resume: ResumeMode = ResumeMode.NEVER, ) -> Path: """Build the dataset. @@ -215,10 +216,18 @@ def build( save_multimedia_to_disk: Whether to save generated multimedia (images, audio, video) to disk. If False, multimedia is stored directly in the DataFrame (e.g., images as base64). Default is True. - resume: If True, resume generation from the last completed batch (sync engine) or - row group (async engine) found in the existing artifact directory. The run parameters - (num_records, buffer_size) must match those of the original run. 
Any in-flight - partial results from the interrupted run are discarded. + resume: Controls how interrupted runs are handled. + + - ``ResumeMode.NEVER`` (default): always start a fresh generation run. + - ``ResumeMode.ALWAYS``: resume from the last completed batch (sync) or row group + (async) found in the existing artifact directory. Raises if no prior progress + exists or if the run parameters (buffer_size) are incompatible. ``num_records`` + may be equal to or greater than the number already generated. + - ``ResumeMode.IF_POSSIBLE``: like ``ALWAYS`` when the current config fingerprint + matches the stored config; otherwise starts a fresh run without raising an error. + + In all resume modes, in-flight partial results from the interrupted run are + discarded before generation continues. Returns: Path to the generated dataset directory. @@ -227,6 +236,17 @@ def build( self._run_model_health_check_if_needed() self._run_mcp_tool_check_if_needed() + + # For IF_POSSIBLE: resolve to ALWAYS or NEVER before touching the artifact directory. + if resume == ResumeMode.IF_POSSIBLE: + if not self._check_resume_config_compatibility(): + logger.info( + "▶️ Config has changed since the last run — starting a fresh generation (resume=IF_POSSIBLE)." + ) + resume = ResumeMode.NEVER + else: + resume = ResumeMode.ALWAYS + self._write_builder_config() # Set media storage mode based on parameters @@ -238,7 +258,7 @@ def build( start_time = time.perf_counter() buffer_size = self._resource_provider.run_config.buffer_size - if resume and not self.artifact_storage.metadata_file_path.exists(): + if resume == ResumeMode.ALWAYS and not self.artifact_storage.metadata_file_path.exists(): # No metadata.json means the previous run was interrupted before any batch (sync) or # row group (async) completed. Nothing to resume — discard any leftover partial # results and start fresh. @@ -247,13 +267,13 @@ def build( "completed. Starting generation from the beginning." 
) self.artifact_storage.clear_partial_results() - resume = False + resume = ResumeMode.NEVER generated = True self._use_async = DATA_DESIGNER_ASYNC_ENGINE and self._resolve_async_compatibility() if self._use_async: generated = self._build_async(generators, num_records, buffer_size, on_batch_complete, resume=resume) - elif resume: + elif resume == ResumeMode.ALWAYS: generated = self._build_with_resume(generators, num_records, buffer_size, on_batch_complete) else: group_id = uuid.uuid4().hex @@ -278,23 +298,27 @@ def build( def _load_resume_state(self, num_records: int, buffer_size: int) -> _ResumeState: """Read and validate resume state from an existing metadata.json. + ``num_records`` must be >= the number of records already generated (you may extend + the dataset, but cannot shrink it below what has been written). ``buffer_size`` must + exactly match the original run because it determines row-group boundaries. + Raises: DatasetGenerationError: If metadata is missing or incompatible with the current run parameters. """ try: metadata = self.artifact_storage.read_metadata() - except FileNotFoundError: + except FileNotFoundError as exc: raise DatasetGenerationError( "🛑 Cannot resume: metadata.json not found in the existing dataset directory. " - "Run without resume=True to start a new generation." - ) + "Run without resume=ResumeMode.ALWAYS to start a new generation." + ) from exc - target = metadata.get("target_num_records") - if target != num_records: + actual_num_records = metadata.get("actual_num_records", 0) + if num_records < actual_num_records: raise DatasetGenerationError( - f"🛑 Cannot resume: num_records={num_records} does not match the original run's " - f"target_num_records={target}. Use the same num_records as the interrupted run, " - "or start a new run without resume=True." + f"🛑 Cannot resume: num_records={num_records} is less than the {actual_num_records} " + "records already generated. 
Use num_records >= actual_num_records, " + "or start a new run without resume=ResumeMode.ALWAYS." ) meta_buffer_size = metadata.get("buffer_size") @@ -302,12 +326,12 @@ def _load_resume_state(self, num_records: int, buffer_size: int) -> _ResumeState raise DatasetGenerationError( f"🛑 Cannot resume: buffer_size={buffer_size} does not match the original run's " f"buffer_size={meta_buffer_size}. Use the same buffer_size as the interrupted run, " - "or start a new run without resume=True." + "or start a new run without resume=ResumeMode.ALWAYS." ) return _ResumeState( num_completed_batches=metadata["num_completed_batches"], - actual_num_records=metadata["actual_num_records"], + actual_num_records=actual_num_records, buffer_size=buffer_size, ) @@ -462,6 +486,29 @@ def _find_completed_row_group_ids(self) -> set[int]: continue return ids + def _check_resume_config_compatibility(self) -> bool: + """Compare the current config fingerprint against the stored builder_config.json. + + Returns True when the configs are compatible (same data-relevant fingerprint) or when + the stored config cannot be read (a warning is logged in that case so the corruption + is visible). Returns False when the fingerprints differ. 
+ """ + config_path = self.artifact_storage.base_dataset_path / SDG_CONFIG_FILENAME + if not config_path.exists(): + return True + try: + stored_data = json.loads(config_path.read_text()) + stored_config = BuilderConfig.model_validate(stored_data) + current_fp = self._data_designer_config.fingerprint()["config_hash"] + stored_fp = stored_config.data_designer.fingerprint()["config_hash"] + return current_fp == stored_fp + except Exception: + logger.warning( + "⚠️ Could not read stored config at %s for compatibility check — assuming compatible.", + config_path, + ) + return True + def _build_async( self, generators: list[ColumnGenerator], @@ -469,7 +516,7 @@ def _build_async( buffer_size: int, on_batch_complete: Callable[[Path], None] | None = None, *, - resume: bool = False, + resume: ResumeMode = ResumeMode.NEVER, ) -> bool: """Async task-queue builder path - dispatches tasks based on dependency readiness. @@ -486,7 +533,7 @@ def _build_async( initial_actual_num_records = 0 initial_total_num_batches = 0 - if resume: + if resume == ResumeMode.ALWAYS: self._load_resume_state(num_records, buffer_size) completed_ids = self._find_completed_row_group_ids() skip_row_groups = frozenset(completed_ids) diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py index 907422d0d..dd6d2e98a 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py @@ -38,6 +38,12 @@ class BatchStage(StrEnum): PROCESSORS_OUTPUTS = "processors_outputs_path" +class ResumeMode(StrEnum): + NEVER = "never" + ALWAYS = "always" + IF_POSSIBLE = "if_possible" + + class ArtifactStorage(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) @@ -47,7 +53,7 @@ class ArtifactStorage(BaseModel): partial_results_folder_name: str = 
"tmp-partial-parquet-files" dropped_columns_folder_name: str = "dropped-columns-parquet-files" processors_outputs_folder_name: str = PROCESSORS_OUTPUTS_FOLDER_NAME - resume: bool = False + resume: ResumeMode = ResumeMode.NEVER _media_storage: MediaStorage = PrivateAttr(default=None) @property @@ -68,7 +74,7 @@ def artifact_path_exists(self) -> bool: def resolved_dataset_name(self) -> str: dataset_path = self.artifact_path / self.dataset_name if dataset_path.exists() and len(list(dataset_path.iterdir())) > 0: - if self.resume: + if self.resume in (ResumeMode.ALWAYS, ResumeMode.IF_POSSIBLE): return self.dataset_name new_dataset_name = f"{self.dataset_name}_{datetime.now().strftime('%m-%d-%Y_%H%M%S')}" logger.info( @@ -76,10 +82,10 @@ def resolved_dataset_name(self) -> str: f"\n\t\t will be saved to {str(self.artifact_path / new_dataset_name)!r} instead." ) return new_dataset_name - if self.resume: + if self.resume == ResumeMode.ALWAYS: raise ArtifactStorageError( f"🛑 Cannot resume: no existing dataset found at {str(dataset_path)!r}. " - "Run without resume=True to start a new generation." + "Run without resume=ResumeMode.ALWAYS to start a new generation." 
) return self.dataset_name diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index ca36f2cf6..e0e175d18 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -33,6 +33,7 @@ from data_designer.engine.processing.processors.base import Processor from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry from data_designer.engine.resources.seed_reader import DataFrameSeedReader +from data_designer.engine.storage.artifact_storage import ResumeMode if TYPE_CHECKING: import pandas as pd @@ -1452,8 +1453,8 @@ def _write_metadata(dataset_dir: _Path, **fields) -> None: def _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, *, buffer_size: int = 2): - """Return a DatasetBuilder whose ArtifactStorage has resume=True.""" - storage = _ArtifactStorage(artifact_path=tmp_path, resume=True) + """Return a DatasetBuilder whose ArtifactStorage has resume=ResumeMode.ALWAYS.""" + storage = _ArtifactStorage(artifact_path=tmp_path, resume=ResumeMode.ALWAYS) stub_resource_provider.artifact_storage = storage stub_resource_provider.run_config = RunConfig(buffer_size=buffer_size) return DatasetBuilder( @@ -1480,25 +1481,49 @@ def test_build_resume_starts_fresh_without_metadata(stub_resource_provider, stub with patch.object(builder, "_run_batch"): with patch.object(builder.batch_manager, "finish"): # resume=False is set internally; build dispatches to the normal (non-resume) path - builder.build(num_records=4, resume=True) + builder.build(num_records=4, resume=ResumeMode.ALWAYS) assert any("interrupted before any batch completed" in record.message for record in caplog.records) -def test_build_resume_raises_on_num_records_mismatch(stub_resource_provider, 
stub_test_config_builder, tmp_path): - """resume=True raises when num_records differs from the original run.""" +def test_build_resume_raises_when_num_records_below_actual( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """resume=ALWAYS raises when num_records is less than what has already been generated.""" dataset_dir = tmp_path / "dataset" _write_metadata( dataset_dir, target_num_records=10, buffer_size=2, + num_completed_batches=3, + actual_num_records=6, + ) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + with pytest.raises(DatasetGenerationError, match="num_records=4 is less than the 6 records already generated"): + builder.build(num_records=4, resume=ResumeMode.ALWAYS) + + +def test_build_resume_allows_larger_num_records( + stub_resource_provider, stub_test_config_builder, tmp_path, caplog +): + """resume=ALWAYS succeeds when num_records > original target (extending the dataset).""" + dataset_dir = tmp_path / "dataset" + _write_metadata( + dataset_dir, + target_num_records=4, + buffer_size=2, num_completed_batches=2, actual_num_records=4, ) builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) - with pytest.raises(DatasetGenerationError, match="num_records=4 does not match"): - builder.build(num_records=4, resume=True) + with caplog.at_level(logging.WARNING): + with patch.object(builder, "_run_model_health_check_if_needed"): + # 6 > 4 already generated → not already complete, should start generating + # Here we just verify it does NOT raise on the num_records check + with patch.object(builder, "_build_with_resume", return_value=True): + builder.build(num_records=6, resume=ResumeMode.ALWAYS) def test_build_resume_raises_on_buffer_size_mismatch(stub_resource_provider, stub_test_config_builder, tmp_path): @@ -1514,7 +1539,7 @@ def test_build_resume_raises_on_buffer_size_mismatch(stub_resource_provider, stu builder = 
_make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=3) with pytest.raises(DatasetGenerationError, match="buffer_size=3 does not match"): - builder.build(num_records=4, resume=True) + builder.build(num_records=4, resume=ResumeMode.ALWAYS) def test_build_resume_logs_warning_when_already_complete( @@ -1533,7 +1558,7 @@ def test_build_resume_logs_warning_when_already_complete( builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) with caplog.at_level(logging.WARNING): - builder.build(num_records=4, resume=True) + builder.build(num_records=4, resume=ResumeMode.ALWAYS) assert any("already complete" in record.message for record in caplog.records) @@ -1553,7 +1578,7 @@ def test_build_resume_already_complete_does_not_run_after_generation_processors( builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) with patch.object(builder._processor_runner, "run_after_generation") as mock_after: - builder.build(num_records=4, resume=True) + builder.build(num_records=4, resume=ResumeMode.ALWAYS) mock_after.assert_not_called() @@ -1628,7 +1653,7 @@ def test_build_async_resume_logs_warning_when_already_complete( with caplog.at_level(logging.WARNING): with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): with patch.object(builder, "_run_model_health_check_if_needed"): - builder.build(num_records=4, resume=True) + builder.build(num_records=4, resume=ResumeMode.ALWAYS) assert any("already complete" in record.message for record in caplog.records) @@ -1652,11 +1677,11 @@ def test_build_async_resume_starts_fresh_without_metadata( with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): with patch.object(builder, "_run_model_health_check_if_needed"): with patch.object(builder, "_build_async", return_value=True) as mock_async: - builder.build(num_records=4, resume=True) + builder.build(num_records=4, resume=ResumeMode.ALWAYS) - 
# _build_async is called with resume=False because the no-metadata path resets the flag + # _build_async is called with resume=NEVER because the no-metadata path resets the mode _, kwargs = mock_async.call_args - assert kwargs.get("resume") is False + assert kwargs.get("resume") == ResumeMode.NEVER assert any("interrupted before any batch completed" in record.message for record in caplog.records) @@ -1673,7 +1698,7 @@ def test_build_async_resume_already_complete_does_not_run_after_generation_proce with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): with patch.object(builder, "_run_model_health_check_if_needed"): with patch.object(builder._processor_runner, "run_after_generation") as mock_after: - builder.build(num_records=4, resume=True) + builder.build(num_records=4, resume=ResumeMode.ALWAYS) mock_after.assert_not_called() @@ -1699,7 +1724,7 @@ def test_find_completed_row_group_ids_used_for_initial_total_batches( with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): with patch.object(builder, "_run_model_health_check_if_needed"): with patch.object(builder._processor_runner, "run_after_generation") as mock_after: - builder.build(num_records=4, resume=True) + builder.build(num_records=4, resume=ResumeMode.ALWAYS) # Already complete based on filesystem count (2 files ≥ 2 row groups) — no generation needed mock_after.assert_not_called() @@ -1749,8 +1774,48 @@ def capturing_prepare(*args, **kwargs): with patch.object(stdlib_asyncio, "run_coroutine_threadsafe", return_value=mock_future): with patch.object(builder, "_run_model_health_check_if_needed"): with patch.object(builder, "_prepare_async_run", side_effect=capturing_prepare): - builder.build(num_records=6, resume=True) + builder.build(num_records=6, resume=ResumeMode.ALWAYS) # Filesystem says 2 groups done (IDs 0+1) → 2+2 = 4 records, not stale metadata value 2 assert captured["initial_actual_num_records"] == 4 assert captured["initial_total_num_batches"] == 2 + + +def 
test_build_async_resume_skip_row_groups_contains_completed_ids( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """skip_row_groups passed to _prepare_async_run contains exactly the completed row group IDs. + + Verifies the set-based skip mechanism so the scheduler never re-generates a row group + that already has a parquet file on disk. + """ + import asyncio as stdlib_asyncio + + dataset_dir = tmp_path / "dataset" + # 6 records, buffer_size=2 → 3 row groups total; row groups 0 and 2 already on disk + _write_metadata(dataset_dir, target_num_records=6, buffer_size=2, num_completed_batches=2, actual_num_records=4) + _write_parquet_files(dataset_dir / "parquet-files", [0, 2]) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + + captured: dict = {} + + def capturing_prepare(*args, **kwargs): + captured["skip_row_groups"] = kwargs.get("skip_row_groups", frozenset()) + mock_scheduler = Mock() + mock_scheduler.traces = [] + mock_buffer_manager = Mock() + return mock_scheduler, mock_buffer_manager + + mock_future = Mock() + mock_future.result = Mock(return_value=None) + + with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): + with patch.object(builder_mod, "asyncio", stdlib_asyncio, create=True): + with patch.object(builder_mod, "ensure_async_engine_loop", Mock(return_value=Mock()), create=True): + with patch.object(stdlib_asyncio, "run_coroutine_threadsafe", return_value=mock_future): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_prepare_async_run", side_effect=capturing_prepare): + builder.build(num_records=6, resume=ResumeMode.ALWAYS) + + assert captured["skip_row_groups"] == frozenset({0, 2}) diff --git a/packages/data-designer-engine/tests/engine/storage/test_artifact_storage.py b/packages/data-designer-engine/tests/engine/storage/test_artifact_storage.py index 7dc1d8b1e..17e30d860 100644 --- 
a/packages/data-designer-engine/tests/engine/storage/test_artifact_storage.py +++ b/packages/data-designer-engine/tests/engine/storage/test_artifact_storage.py @@ -13,7 +13,7 @@ import data_designer.lazy_heavy_imports as lazy from data_designer.config.utils.io_helpers import load_processor_dataset from data_designer.engine.dataset_builders.errors import ArtifactStorageError -from data_designer.engine.storage.artifact_storage import ArtifactStorage, BatchStage +from data_designer.engine.storage.artifact_storage import ArtifactStorage, BatchStage, ResumeMode @pytest.fixture @@ -432,27 +432,43 @@ def test_resolved_dataset_name_creates_timestamped_copy_when_folder_exists(tmp_p def test_resolved_dataset_name_resume_uses_existing_folder(tmp_path): - """With resume=True, an existing non-empty folder is used as-is.""" + """With resume=ALWAYS, an existing non-empty folder is used as-is.""" existing = tmp_path / "dataset" existing.mkdir() (existing / "some_file.txt").write_text("x") - storage = ArtifactStorage(artifact_path=tmp_path, dataset_name="dataset", resume=True) + storage = ArtifactStorage(artifact_path=tmp_path, dataset_name="dataset", resume=ResumeMode.ALWAYS) assert storage.resolved_dataset_name == "dataset" def test_resolved_dataset_name_resume_raises_when_no_existing_folder(tmp_path): - """With resume=True, missing dataset folder raises ArtifactStorageError at init.""" + """With resume=ALWAYS, missing dataset folder raises ArtifactStorageError.""" with pytest.raises(ArtifactStorageError, match="Cannot resume"): - ArtifactStorage(artifact_path=tmp_path, dataset_name="dataset", resume=True) + ArtifactStorage(artifact_path=tmp_path, dataset_name="dataset", resume=ResumeMode.ALWAYS) def test_resolved_dataset_name_resume_raises_when_folder_is_empty(tmp_path): - """With resume=True, an empty existing folder raises ArtifactStorageError at init.""" + """With resume=ALWAYS, an empty existing folder raises ArtifactStorageError.""" (tmp_path / "dataset").mkdir() with 
pytest.raises(ArtifactStorageError, match="Cannot resume"): - ArtifactStorage(artifact_path=tmp_path, dataset_name="dataset", resume=True) + ArtifactStorage(artifact_path=tmp_path, dataset_name="dataset", resume=ResumeMode.ALWAYS) + + +def test_resolved_dataset_name_if_possible_uses_existing_folder(tmp_path): + """With resume=IF_POSSIBLE, an existing non-empty folder is used as-is.""" + existing = tmp_path / "dataset" + existing.mkdir() + (existing / "some_file.txt").write_text("x") + + storage = ArtifactStorage(artifact_path=tmp_path, dataset_name="dataset", resume=ResumeMode.IF_POSSIBLE) + assert storage.resolved_dataset_name == "dataset" + + +def test_resolved_dataset_name_if_possible_uses_clean_name_when_no_existing_folder(tmp_path): + """With resume=IF_POSSIBLE, a missing dataset folder results in a fresh run (no error).""" + storage = ArtifactStorage(artifact_path=tmp_path, dataset_name="dataset", resume=ResumeMode.IF_POSSIBLE) + assert storage.resolved_dataset_name == "dataset" def test_clear_partial_results_removes_partial_folder(tmp_path, stub_sample_dataframe): diff --git a/packages/data-designer/src/data_designer/interface/__init__.py b/packages/data-designer/src/data_designer/interface/__init__.py index a8a1bd61b..c5e426e35 100644 --- a/packages/data-designer/src/data_designer/interface/__init__.py +++ b/packages/data-designer/src/data_designer/interface/__init__.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: + from data_designer.engine.storage.artifact_storage import ResumeMode # noqa: F401 from data_designer.interface.data_designer import DataDesigner # noqa: F401 from data_designer.interface.errors import ( # noqa: F401 DataDesignerEarlyShutdownError, @@ -21,6 +22,7 @@ "DataDesignerGenerationError": ("data_designer.interface.errors", "DataDesignerGenerationError"), "DataDesignerProfilingError": ("data_designer.interface.errors", "DataDesignerProfilingError"), "DatasetCreationResults": ("data_designer.interface.results", 
"DatasetCreationResults"), + "ResumeMode": ("data_designer.engine.storage.artifact_storage", "ResumeMode"), } __all__ = list(_LAZY_IMPORTS.keys()) diff --git a/packages/data-designer/src/data_designer/interface/data_designer.py b/packages/data-designer/src/data_designer/interface/data_designer.py index 767b66187..7223f5616 100644 --- a/packages/data-designer/src/data_designer/interface/data_designer.py +++ b/packages/data-designer/src/data_designer/interface/data_designer.py @@ -61,7 +61,7 @@ PlaintextResolver, SecretResolver, ) -from data_designer.engine.storage.artifact_storage import ArtifactStorage +from data_designer.engine.storage.artifact_storage import ArtifactStorage, ResumeMode from data_designer.interface.errors import ( DataDesignerEarlyShutdownError, DataDesignerGenerationError, @@ -202,7 +202,7 @@ def create( *, num_records: int = DEFAULT_NUM_RECORDS, dataset_name: str = "dataset", - resume: bool = False, + resume: ResumeMode = ResumeMode.NEVER, ) -> DatasetCreationResults: """Create dataset and save results to the local artifact storage. @@ -220,12 +220,19 @@ def create( a datetime stamp. For example, if the dataset name is "awesome_dataset" and a directory with the same name already exists, the dataset will be saved to a new directory with the name "awesome_dataset_2025-01-01_12-00-00". - resume: If True, resume generation from the last completed batch (sync engine) - or row group (async engine) found in the existing dataset directory. If no - progress was checkpointed yet (i.e. the run was interrupted before the first - batch/row-group completed), generation restarts from the beginning. The run - parameters (num_records, buffer_size) must match those of the original run. - Any in-flight partial results from the interrupted run are discarded. + resume: Controls how interrupted runs are handled. + + - ``ResumeMode.NEVER`` (default): always start a fresh generation run. 
+ - ``ResumeMode.ALWAYS``: resume from the last completed batch (sync) or row group + (async) found in the existing dataset directory. ``num_records`` may be equal to + or greater than the number already generated (you can extend the dataset). The + ``buffer_size`` must match the original run. If no prior progress exists or the + run is incompatible, an error is raised. + - ``ResumeMode.IF_POSSIBLE``: like ``ALWAYS`` when the current config fingerprint + matches the stored config; otherwise starts a fresh run without raising an error. + + In all resume modes, in-flight partial results from the interrupted run are + discarded before generation continues. Returns: DatasetCreationResults object with methods for loading the generated dataset, @@ -553,7 +560,7 @@ def _create_dataset_profiler( ) def _create_resource_provider( - self, dataset_name: str, config_builder: DataDesignerConfigBuilder, *, resume: bool = False + self, dataset_name: str, config_builder: DataDesignerConfigBuilder, *, resume: ResumeMode = ResumeMode.NEVER ) -> ResourceProvider: ArtifactStorage.mkdir_if_needed(self._artifact_path) From 5a99f5926448f6f86931b7835b84ec1b59ca67fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Fri, 1 May 2026 23:15:13 +0200 Subject: [PATCH 15/29] fix(resume): invalidate resolved_dataset_name cache when IF_POSSIBLE downgrades to NEVER ArtifactStorage's Pydantic model validator accesses base_dataset_path at construction time, caching resolved_dataset_name under IF_POSSIBLE semantics before build() can set resume=NEVER. Pop the stale cache entry so the property re-resolves with the correct NEVER semantics (timestamped directory). Also fixes _check_resume_config_compatibility() to use artifact_path/dataset_name directly instead of base_dataset_path, and adds a regression test covering the cache-bypass scenario. 
--- .../dataset_builders/dataset_builder.py | 21 +++++++- .../dataset_builders/test_dataset_builder.py | 49 +++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 6ad00fd73..36d42c6ed 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -238,14 +238,25 @@ def build( self._run_mcp_tool_check_if_needed() # For IF_POSSIBLE: resolve to ALWAYS or NEVER before touching the artifact directory. + # _check_resume_config_compatibility() must NOT access base_dataset_path (which would + # cache resolved_dataset_name prematurely). After the decision, sync artifact_storage.resume + # so that resolved_dataset_name picks up the right semantics on its first real access. + # + # Also invalidate any stale resolved_dataset_name cache: ArtifactStorage's Pydantic + # validator accesses base_dataset_path at construction time, which caches resolved_dataset_name + # under the original resume=IF_POSSIBLE semantics. Popping it forces a fresh resolution. if resume == ResumeMode.IF_POSSIBLE: if not self._check_resume_config_compatibility(): logger.info( "▶️ Config has changed since the last run — starting a fresh generation (resume=IF_POSSIBLE)." 
) resume = ResumeMode.NEVER + self.artifact_storage.resume = ResumeMode.NEVER + self.artifact_storage.__dict__.pop("resolved_dataset_name", None) else: resume = ResumeMode.ALWAYS + self.artifact_storage.resume = ResumeMode.ALWAYS + self.artifact_storage.__dict__.pop("resolved_dataset_name", None) self._write_builder_config() @@ -492,8 +503,16 @@ def _check_resume_config_compatibility(self) -> bool: Returns True when the configs are compatible (same data-relevant fingerprint) or when the stored config cannot be read (a warning is logged in that case so the corruption is visible). Returns False when the fingerprints differ. + + Uses artifact_path / dataset_name directly — NOT base_dataset_path — to avoid + prematurely triggering the resolved_dataset_name cached_property before the + caller has had a chance to decide whether to resume or start fresh. """ - config_path = self.artifact_storage.base_dataset_path / SDG_CONFIG_FILENAME + config_path = ( + Path(self.artifact_storage.artifact_path) + / self.artifact_storage.dataset_name + / SDG_CONFIG_FILENAME + ) if not config_path.exists(): return True try: diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index e0e175d18..2bfcd859b 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1819,3 +1819,52 @@ def capturing_prepare(*args, **kwargs): builder.build(num_records=6, resume=ResumeMode.ALWAYS) assert captured["skip_row_groups"] == frozenset({0, 2}) + + +def test_if_possible_incompatible_config_does_not_overwrite_existing_dataset( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """IF_POSSIBLE + incompatible config must NOT resolve to the existing dataset directory. 
+ + Bug: _check_resume_config_compatibility() used base_dataset_path, triggering the + resolved_dataset_name cached_property while artifact_storage.resume was still IF_POSSIBLE. + The property cached the existing directory name; after resume was reset to NEVER locally, + artifact_storage.resume was never updated, so _write_builder_config() still wrote into the + old directory. + + Fix: _check_resume_config_compatibility() uses artifact_path/dataset_name directly and + build() syncs artifact_storage.resume = NEVER before the first real access to base_dataset_path. + """ + dataset_dir = tmp_path / "dataset" + dataset_dir.mkdir() + sentinel = dataset_dir / "important_file.txt" + sentinel.write_text("precious data") + + storage = _ArtifactStorage(artifact_path=tmp_path, resume=ResumeMode.IF_POSSIBLE) + stub_resource_provider.artifact_storage = storage + + builder = DatasetBuilder( + data_designer_config=stub_test_config_builder.build(), + resource_provider=stub_resource_provider, + ) + + # Simulate incompatible config and mock out all I/O so build() does not actually generate data + with patch.object(builder, "_check_resume_config_compatibility", return_value=False): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_run_mcp_tool_check_if_needed"): + with patch.object(builder, "_write_builder_config"): + with patch.object(builder, "_initialize_generators_and_graph", return_value=([], None)): + with patch.object(builder.batch_manager, "start"): + with patch.object(builder.batch_manager, "finish"): + with patch.object(builder._processor_runner, "run_after_generation"): + builder.build(num_records=2, resume=ResumeMode.IF_POSSIBLE) + + # artifact_storage.resume must be downgraded to NEVER so resolved_dataset_name uses NEVER semantics + assert storage.resume == ResumeMode.NEVER + + # resolved_dataset_name has not been cached yet (compat check bypassed base_dataset_path, + # _write_builder_config was mocked). 
Accessing it now must give a timestamped name. + assert sentinel.exists(), "Existing dataset directory must not be touched" + assert storage.resolved_dataset_name != "dataset", ( + "resolved_dataset_name must be a new timestamped directory, not the existing one" + ) From f69b3e7a8f91f508a872ee3b21032a83f4f94346 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Fri, 1 May 2026 23:21:13 +0200 Subject: [PATCH 16/29] fix(builder): move partial-completion warning before return in _build_async --- .../engine/dataset_builders/dataset_builder.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 36d42c6ed..636597b0d 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -509,9 +509,7 @@ def _check_resume_config_compatibility(self) -> bool: caller has had a chance to decide whether to resume or start fresh. """ config_path = ( - Path(self.artifact_storage.artifact_path) - / self.artifact_storage.dataset_name - / SDG_CONFIG_FILENAME + Path(self.artifact_storage.artifact_path) / self.artifact_storage.dataset_name / SDG_CONFIG_FILENAME ) if not config_path.exists(): return True @@ -629,7 +627,6 @@ def on_complete(final_path: Path | str | None) -> None: # Write final metadata (overwrites the last incremental write with identical content). 
buffer_manager.write_metadata(target_num_records=num_records, buffer_size=buffer_size) - return True # Surface partial completion actual = self._actual_num_records @@ -648,6 +645,8 @@ def on_complete(final_path: Path | str | None) -> None: else: logger.warning(base + "The dataset may be incomplete due to dropped rows.") + return True + def _prepare_async_run( self, generators: list[ColumnGenerator], From 4daf48b4b3290a48dcb420a597a3bbc0927b96fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 4 May 2026 21:07:46 +0200 Subject: [PATCH 17/29] fix(builder): IF_POSSIBLE now starts fresh when no dataset directory exists _check_resume_config_compatibility returned True when config_path was absent, even when the dataset directory itself didn't exist. This caused IF_POSSIBLE to upgrade to ALWAYS, which then raised ArtifactStorageError on the first-ever run because ALWAYS requires an existing directory. Fix: return False early when the dataset directory is absent. Also sets actual_num_records on mock buffer managers in two async resume tests that started failing after the partial-completion warning block was made reachable. --- .../dataset_builders/dataset_builder.py | 7 +-- .../dataset_builders/test_dataset_builder.py | 43 ++++++++++++++++--- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 636597b0d..d42a4fe8c 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -508,9 +508,10 @@ def _check_resume_config_compatibility(self) -> bool: prematurely triggering the resolved_dataset_name cached_property before the caller has had a chance to decide whether to resume or start fresh. 
""" - config_path = ( - Path(self.artifact_storage.artifact_path) / self.artifact_storage.dataset_name / SDG_CONFIG_FILENAME - ) + dataset_dir = Path(self.artifact_storage.artifact_path) / self.artifact_storage.dataset_name + if not dataset_dir.exists(): + return False + config_path = dataset_dir / SDG_CONFIG_FILENAME if not config_path.exists(): return True try: diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index 2bfcd859b..9e6fe1daa 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1434,6 +1434,8 @@ def test_skip_row_count_preserved_across_pipeline(stub_resource_provider, stub_m assert len(result) == 5, "Skip must not change the row count" assert result["seed_id"].tolist() == [1, 2, 3, 4, 5] + + # --------------------------------------------------------------------------- # Resume mechanism tests # --------------------------------------------------------------------------- @@ -1486,9 +1488,7 @@ def test_build_resume_starts_fresh_without_metadata(stub_resource_provider, stub assert any("interrupted before any batch completed" in record.message for record in caplog.records) -def test_build_resume_raises_when_num_records_below_actual( - stub_resource_provider, stub_test_config_builder, tmp_path -): +def test_build_resume_raises_when_num_records_below_actual(stub_resource_provider, stub_test_config_builder, tmp_path): """resume=ALWAYS raises when num_records is less than what has already been generated.""" dataset_dir = tmp_path / "dataset" _write_metadata( @@ -1504,9 +1504,7 @@ def test_build_resume_raises_when_num_records_below_actual( builder.build(num_records=4, resume=ResumeMode.ALWAYS) -def test_build_resume_allows_larger_num_records( - stub_resource_provider, stub_test_config_builder, 
tmp_path, caplog -): +def test_build_resume_allows_larger_num_records(stub_resource_provider, stub_test_config_builder, tmp_path, caplog): """resume=ALWAYS succeeds when num_records > original target (extending the dataset).""" dataset_dir = tmp_path / "dataset" _write_metadata( @@ -1760,6 +1758,7 @@ def capturing_prepare(*args, **kwargs): mock_scheduler = Mock() mock_scheduler.traces = [] mock_buffer_manager = Mock() + mock_buffer_manager.actual_num_records = 6 return mock_scheduler, mock_buffer_manager mock_future = Mock() @@ -1805,6 +1804,7 @@ def capturing_prepare(*args, **kwargs): mock_scheduler = Mock() mock_scheduler.traces = [] mock_buffer_manager = Mock() + mock_buffer_manager.actual_num_records = 6 return mock_scheduler, mock_buffer_manager mock_future = Mock() @@ -1868,3 +1868,34 @@ def test_if_possible_incompatible_config_does_not_overwrite_existing_dataset( assert storage.resolved_dataset_name != "dataset", ( "resolved_dataset_name must be a new timestamped directory, not the existing one" ) + + +def test_if_possible_starts_fresh_when_no_existing_directory( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """IF_POSSIBLE on a first-ever run (no dataset directory) must start fresh, not raise. + + Bug: _check_resume_config_compatibility returned True when config_path did not exist, + which caused IF_POSSIBLE to upgrade to ALWAYS. resolved_dataset_name then raised + ArtifactStorageError because ALWAYS requires an existing directory. + + Fix: return False when the dataset directory itself is absent. 
+ """ + storage = _ArtifactStorage(artifact_path=tmp_path, resume=ResumeMode.IF_POSSIBLE) + stub_resource_provider.artifact_storage = storage + + builder = DatasetBuilder( + data_designer_config=stub_test_config_builder.build(), + resource_provider=stub_resource_provider, + ) + + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_run_mcp_tool_check_if_needed"): + with patch.object(builder, "_write_builder_config"): + with patch.object(builder, "_initialize_generators_and_graph", return_value=([], None)): + with patch.object(builder.batch_manager, "start"): + with patch.object(builder.batch_manager, "finish"): + with patch.object(builder._processor_runner, "run_after_generation"): + builder.build(num_records=2, resume=ResumeMode.IF_POSSIBLE) + + assert storage.resume == ResumeMode.NEVER From 69c3e5582e977515c1639f327997a22b86f4760c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 4 May 2026 21:31:58 +0200 Subject: [PATCH 18/29] fix(builder): use original target_num_records in async resume record count When extending a non-aligned run (e.g. original num_records=5, buffer_size=2), the last completed row group has 1 record, not buffer_size=2. Using new num_records in the formula would overcount: min(2, 7-2*2)=2 instead of min(2, 5-2*2)=1. Fix: capture state from _load_resume_state (previously discarded) and pass state.target_num_records into the sum formula. Added target_num_records field to _ResumeState, populated from metadata.json. 
Test: test_build_async_resume_initial_actual_num_records_uses_original_target --- .../dataset_builders/dataset_builder.py | 8 +++- .../dataset_builders/test_dataset_builder.py | 44 +++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index d42a4fe8c..cd5703d58 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -101,6 +101,7 @@ class _ResumeState: num_completed_batches: int actual_num_records: int buffer_size: int + target_num_records: int class DatasetBuilder: @@ -344,6 +345,7 @@ def _load_resume_state(self, num_records: int, buffer_size: int) -> _ResumeState num_completed_batches=metadata["num_completed_batches"], actual_num_records=actual_num_records, buffer_size=buffer_size, + target_num_records=metadata["target_num_records"], ) def _build_with_resume( @@ -552,15 +554,17 @@ def _build_async( initial_total_num_batches = 0 if resume == ResumeMode.ALWAYS: - self._load_resume_state(num_records, buffer_size) + state = self._load_resume_state(num_records, buffer_size) completed_ids = self._find_completed_row_group_ids() skip_row_groups = frozenset(completed_ids) # Use filesystem as source of truth for both counters — metadata may lag by one # row group if a crash occurred between move_partial_result_to_final_file_path # and write_metadata. + # Use the original target (not the new num_records) so the last row group of a + # non-aligned run gets its true size, not buffer_size. 
initial_total_num_batches = len(completed_ids) initial_actual_num_records = sum( - min(buffer_size, num_records - rg_id * buffer_size) for rg_id in completed_ids + min(buffer_size, state.target_num_records - rg_id * buffer_size) for rg_id in completed_ids ) self.artifact_storage.clear_partial_results() diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index 9e6fe1daa..628e33c53 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1780,6 +1780,50 @@ def capturing_prepare(*args, **kwargs): assert captured["initial_total_num_batches"] == 2 +def test_build_async_resume_initial_actual_num_records_uses_original_target( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """initial_actual_num_records uses the original target_num_records, not the new num_records. + + When extending a non-aligned run (original num_records=5, buffer_size=2 → row groups [2,2,1]), + all 3 row groups completed. Resuming with num_records=7 must not use the new target in the + formula: min(2, 7-2*2)=min(2,3)=2 would give 6, but the actual data is 5 records. 
+ """ + import asyncio as stdlib_asyncio + + dataset_dir = tmp_path / "dataset" + # Original run: 5 records, buffer_size=2, all 3 row groups done + _write_metadata(dataset_dir, target_num_records=5, buffer_size=2, num_completed_batches=3, actual_num_records=5) + _write_parquet_files(dataset_dir / "parquet-files", [0, 1, 2]) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + + captured: dict = {} + + def capturing_prepare(*args, **kwargs): + captured["initial_actual_num_records"] = kwargs.get("initial_actual_num_records", 0) + mock_scheduler = Mock() + mock_scheduler.traces = [] + mock_buffer_manager = Mock() + mock_buffer_manager.actual_num_records = 7 + return mock_scheduler, mock_buffer_manager + + mock_future = Mock() + mock_future.result = Mock(return_value=None) + + with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): + with patch.object(builder_mod, "asyncio", stdlib_asyncio, create=True): + with patch.object(builder_mod, "ensure_async_engine_loop", Mock(return_value=Mock()), create=True): + with patch.object(stdlib_asyncio, "run_coroutine_threadsafe", return_value=mock_future): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_prepare_async_run", side_effect=capturing_prepare): + # Extend the dataset: new target is 7, original was 5 + builder.build(num_records=7, resume=ResumeMode.ALWAYS) + + # Row groups [2, 2, 1] from original 5-record run: 2+2+1=5, not 2+2+2=6 + assert captured["initial_actual_num_records"] == 5 + + def test_build_async_resume_skip_row_groups_contains_completed_ids( stub_resource_provider, stub_test_config_builder, tmp_path ): From 487def23e39e9541b7b6d135414839cb64cca207 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 4 May 2026 21:42:44 +0200 Subject: [PATCH 19/29] fix(builder): IF_POSSIBLE starts fresh on empty dataset directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Empty directory (crash between mkdir and first file write) was treated as compatible — _check_resume_config_compatibility returned True, IF_POSSIBLE upgraded to ALWAYS, which then raised ArtifactStorageError. Fix: treat empty directory the same as missing — return False from _check_resume_config_compatibility when any(dir.iterdir()) is False. Test: test_if_possible_starts_fresh_when_directory_is_empty --- .../dataset_builders/dataset_builder.py | 2 +- .../dataset_builders/test_dataset_builder.py | 35 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index cd5703d58..92d2e36b5 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -511,7 +511,7 @@ def _check_resume_config_compatibility(self) -> bool: caller has had a chance to decide whether to resume or start fresh. 
""" dataset_dir = Path(self.artifact_storage.artifact_path) / self.artifact_storage.dataset_name - if not dataset_dir.exists(): + if not dataset_dir.exists() or not any(dataset_dir.iterdir()): return False config_path = dataset_dir / SDG_CONFIG_FILENAME if not config_path.exists(): diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index 628e33c53..0d62b907b 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1943,3 +1943,38 @@ def test_if_possible_starts_fresh_when_no_existing_directory( builder.build(num_records=2, resume=ResumeMode.IF_POSSIBLE) assert storage.resume == ResumeMode.NEVER + + +def test_if_possible_starts_fresh_when_directory_is_empty( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """IF_POSSIBLE on an empty dataset directory must start fresh, not raise. + + Edge case: a prior run crashed in the window between mkdir and the first file write + inside _write_builder_config, leaving an empty directory. _check_resume_config_compatibility + previously returned True (config file absent → assume compatible), causing IF_POSSIBLE to + upgrade to ALWAYS, which then raised ArtifactStorageError because the directory is empty. + + Fix: treat an empty directory the same as a missing one — return False. 
+ """ + dataset_dir = tmp_path / "dataset" + dataset_dir.mkdir() # empty — no files written yet + + storage = _ArtifactStorage(artifact_path=tmp_path, resume=ResumeMode.IF_POSSIBLE) + stub_resource_provider.artifact_storage = storage + + builder = DatasetBuilder( + data_designer_config=stub_test_config_builder.build(), + resource_provider=stub_resource_provider, + ) + + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_run_mcp_tool_check_if_needed"): + with patch.object(builder, "_write_builder_config"): + with patch.object(builder, "_initialize_generators_and_graph", return_value=([], None)): + with patch.object(builder.batch_manager, "start"): + with patch.object(builder.batch_manager, "finish"): + with patch.object(builder._processor_runner, "run_after_generation"): + builder.build(num_records=2, resume=ResumeMode.IF_POSSIBLE) + + assert storage.resume == ResumeMode.NEVER From a08d9cc8b454d54bcc030d3f76486844d1301532 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 4 May 2026 22:05:31 +0200 Subject: [PATCH 20/29] fix(builder): ALWAYS raises DatasetGenerationError on config fingerprint mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ResumeMode.ALWAYS was documented to raise when column/model config changed, but _check_resume_config_compatibility() was only called in the IF_POSSIBLE branch. A user resuming with ALWAYS after changing the config would silently mix records from two different configs. 
Fix: - Refactor _check_resume_config_compatibility() to return _ConfigCompatibility enum (COMPATIBLE / INCOMPATIBLE / NO_PRIOR_DATASET) instead of bool so callers can distinguish 'no prior run' from 'configs differ' - Call the check for both ALWAYS and IF_POSSIBLE before _write_builder_config() - ALWAYS + INCOMPATIBLE → DatasetGenerationError - IF_POSSIBLE + INCOMPATIBLE → silent fresh start (existing behaviour) - IF_POSSIBLE + NO_PRIOR_DATASET → silent fresh start (existing behaviour) Test: test_build_resume_always_raises_on_config_mismatch --- .../dataset_builders/dataset_builder.py | 66 ++++++++++++------- .../dataset_builders/test_dataset_builder.py | 13 +++- 2 files changed, 54 insertions(+), 25 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 92d2e36b5..dd6724454 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -12,6 +12,7 @@ import uuid import warnings from dataclasses import dataclass +from enum import StrEnum from pathlib import Path from typing import TYPE_CHECKING, Any, Callable @@ -96,6 +97,12 @@ def _is_async_trace_enabled(settings: RunConfig) -> bool: return settings.async_trace or os.environ.get("DATA_DESIGNER_ASYNC_TRACE", "0") == "1" +class _ConfigCompatibility(StrEnum): + COMPATIBLE = "compatible" + INCOMPATIBLE = "incompatible" + NO_PRIOR_DATASET = "no_prior_dataset" + + @dataclass class _ResumeState: num_completed_batches: int @@ -238,26 +245,36 @@ def build( self._run_model_health_check_if_needed() self._run_mcp_tool_check_if_needed() - # For IF_POSSIBLE: resolve to ALWAYS or NEVER before touching the artifact directory. 
- # _check_resume_config_compatibility() must NOT access base_dataset_path (which would - # cache resolved_dataset_name prematurely). After the decision, sync artifact_storage.resume - # so that resolved_dataset_name picks up the right semantics on its first real access. + # For IF_POSSIBLE and ALWAYS: check config compatibility before touching the artifact + # directory. _check_resume_config_compatibility() must NOT access base_dataset_path + # (which would cache resolved_dataset_name prematurely). After the decision, sync + # artifact_storage.resume so that resolved_dataset_name picks up the right semantics + # on its first real access. # # Also invalidate any stale resolved_dataset_name cache: ArtifactStorage's Pydantic # validator accesses base_dataset_path at construction time, which caches resolved_dataset_name - # under the original resume=IF_POSSIBLE semantics. Popping it forces a fresh resolution. - if resume == ResumeMode.IF_POSSIBLE: - if not self._check_resume_config_compatibility(): - logger.info( - "▶️ Config has changed since the last run — starting a fresh generation (resume=IF_POSSIBLE)." + # under the original resume mode semantics. Popping it forces a fresh resolution. + if resume in (ResumeMode.IF_POSSIBLE, ResumeMode.ALWAYS): + compat = self._check_resume_config_compatibility() + if resume == ResumeMode.ALWAYS and compat == _ConfigCompatibility.INCOMPATIBLE: + raise DatasetGenerationError( + "🛑 Cannot resume: the current config does not match the config used in the interrupted run. " + "Use resume=ResumeMode.IF_POSSIBLE to start fresh automatically, or " + "resume=ResumeMode.NEVER to force a new run." 
) - resume = ResumeMode.NEVER - self.artifact_storage.resume = ResumeMode.NEVER - self.artifact_storage.__dict__.pop("resolved_dataset_name", None) - else: - resume = ResumeMode.ALWAYS - self.artifact_storage.resume = ResumeMode.ALWAYS - self.artifact_storage.__dict__.pop("resolved_dataset_name", None) + if resume == ResumeMode.IF_POSSIBLE: + if compat != _ConfigCompatibility.COMPATIBLE: + if compat == _ConfigCompatibility.INCOMPATIBLE: + logger.info( + "▶️ Config has changed since the last run — starting a fresh generation (resume=IF_POSSIBLE)." + ) + resume = ResumeMode.NEVER + self.artifact_storage.resume = ResumeMode.NEVER + self.artifact_storage.__dict__.pop("resolved_dataset_name", None) + else: + resume = ResumeMode.ALWAYS + self.artifact_storage.resume = ResumeMode.ALWAYS + self.artifact_storage.__dict__.pop("resolved_dataset_name", None) self._write_builder_config() @@ -499,12 +516,13 @@ def _find_completed_row_group_ids(self) -> set[int]: continue return ids - def _check_resume_config_compatibility(self) -> bool: + def _check_resume_config_compatibility(self) -> _ConfigCompatibility: """Compare the current config fingerprint against the stored builder_config.json. - Returns True when the configs are compatible (same data-relevant fingerprint) or when - the stored config cannot be read (a warning is logged in that case so the corruption - is visible). Returns False when the fingerprints differ. + Returns: + NO_PRIOR_DATASET — directory absent or empty (no prior run to resume from). + COMPATIBLE — fingerprints match, or stored config is unreadable (warning logged). + INCOMPATIBLE — fingerprints differ; continuing would mix records from two configs. 
Uses artifact_path / dataset_name directly — NOT base_dataset_path — to avoid prematurely triggering the resolved_dataset_name cached_property before the @@ -512,22 +530,22 @@ def _check_resume_config_compatibility(self) -> bool: """ dataset_dir = Path(self.artifact_storage.artifact_path) / self.artifact_storage.dataset_name if not dataset_dir.exists() or not any(dataset_dir.iterdir()): - return False + return _ConfigCompatibility.NO_PRIOR_DATASET config_path = dataset_dir / SDG_CONFIG_FILENAME if not config_path.exists(): - return True + return _ConfigCompatibility.COMPATIBLE try: stored_data = json.loads(config_path.read_text()) stored_config = BuilderConfig.model_validate(stored_data) current_fp = self._data_designer_config.fingerprint()["config_hash"] stored_fp = stored_config.data_designer.fingerprint()["config_hash"] - return current_fp == stored_fp + return _ConfigCompatibility.COMPATIBLE if current_fp == stored_fp else _ConfigCompatibility.INCOMPATIBLE except Exception: logger.warning( "⚠️ Could not read stored config at %s for compatibility check — assuming compatible.", config_path, ) - return True + return _ConfigCompatibility.COMPATIBLE def _build_async( self, diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index 0d62b907b..13909caa4 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -21,7 +21,7 @@ from data_designer.config.seed_source import LocalFileSeedSource from data_designer.config.seed_source_dataframe import DataFrameSeedSource from data_designer.engine.column_generators.generators.base import GenerationStrategy -from data_designer.engine.dataset_builders.dataset_builder import DatasetBuilder +from data_designer.engine.dataset_builders.dataset_builder import DatasetBuilder, 
_ConfigCompatibility from data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError from data_designer.engine.models.errors import ( FormattedLLMErrorMessage, @@ -1540,6 +1540,17 @@ def test_build_resume_raises_on_buffer_size_mismatch(stub_resource_provider, stu builder.build(num_records=4, resume=ResumeMode.ALWAYS) +def test_build_resume_always_raises_on_config_mismatch(stub_resource_provider, stub_test_config_builder, tmp_path): + """resume=ALWAYS raises DatasetGenerationError when the stored config fingerprint differs.""" + dataset_dir = tmp_path / "dataset" + _write_metadata(dataset_dir, target_num_records=4, buffer_size=2, num_completed_batches=1, actual_num_records=2) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path) + with patch.object(builder, "_check_resume_config_compatibility", return_value=_ConfigCompatibility.INCOMPATIBLE): + with pytest.raises(DatasetGenerationError, match="does not match the config used"): + builder.build(num_records=4, resume=ResumeMode.ALWAYS) + + def test_build_resume_logs_warning_when_already_complete( stub_resource_provider, stub_test_config_builder, tmp_path, caplog ): From e7c0f95d1045466693f567d3005db0d7a4e13fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Wed, 6 May 2026 19:29:36 +0200 Subject: [PATCH 21/29] =?UTF-8?q?fix(resume):=20address=20nabinchha=20revi?= =?UTF-8?q?ew=20=E2=80=94=20drop=20export=20collision,=20add=20CLI=20flag,?= =?UTF-8?q?=20fix=20edge=20cases?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit C1: drop commit 0bdf24ab — remove export() / --output-format from this PR; that feature belongs to #540 which has a superior streaming implementation C2: add --resume / -r flag to data-designer create CLI, thread ResumeMode through GenerationController.run_create() into DataDesigner.create() C3: fix already-complete warning text — replace stale "Remove resume=True" 
with "Use resume=ResumeMode.NEVER" in _build_with_resume and _build_async C4: fix docstrings — ALWAYS does NOT raise when no checkpoint exists (silently restarts from scratch); clarify num_records >= actual semantics C5: sync artifact_storage.resume = NEVER when no-metadata fallback fires so both state holders agree after the downgrade C6: fix return_value=False → _ConfigCompatibility.INCOMPATIBLE in IF_POSSIBLE test; drop 3 direct _find_completed_row_group_ids tests (private API, covered by build()) W1: add logger.warning when builder_config.json is absent (silent COMPATIBLE was footgun) W2: narrow except Exception → (OSError, json.JSONDecodeError, ValidationError) W3: run make check-all-fix — ruff reformatted test_if_possible_starts_fresh_when_directory_is_empty --- .../dataset_builders/dataset_builder.py | 21 ++++-- .../dataset_builders/test_dataset_builder.py | 50 +------------ .../src/data_designer/cli/commands/create.py | 27 ++++--- .../cli/controllers/generation_controller.py | 29 ++------ .../data_designer/interface/data_designer.py | 9 +-- .../src/data_designer/interface/results.py | 41 +---------- .../tests/cli/commands/test_create_command.py | 52 +++++++++++--- .../controllers/test_generation_controller.py | 70 +++++++++++++++--- packages/data-designer/tests/cli/test_main.py | 3 +- .../tests/interface/test_results.py | 71 ------------------- 10 files changed, 147 insertions(+), 226 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index dd6724454..ffe88374b 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -16,6 +16,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Callable +from pydantic import ValidationError + import 
data_designer.lazy_heavy_imports as lazy from data_designer.config.column_configs import CustomColumnConfig from data_designer.config.column_types import ColumnConfigT, DataDesignerColumnType @@ -228,9 +230,11 @@ def build( - ``ResumeMode.NEVER`` (default): always start a fresh generation run. - ``ResumeMode.ALWAYS``: resume from the last completed batch (sync) or row group - (async) found in the existing artifact directory. Raises if no prior progress - exists or if the run parameters (buffer_size) are incompatible. ``num_records`` - may be equal to or greater than the number already generated. + (async). ``buffer_size`` must match the original run. ``num_records`` may be + equal to or greater than what was already generated (you can extend the dataset); + ``num_records`` less than actual records so far raises ``DatasetGenerationError``. + If no checkpoint exists yet (interrupted before the first batch finished), silently + restarts from the beginning. Raises if the stored config is incompatible. - ``ResumeMode.IF_POSSIBLE``: like ``ALWAYS`` when the current config fingerprint matches the stored config; otherwise starts a fresh run without raising an error. @@ -297,6 +301,7 @@ def build( ) self.artifact_storage.clear_partial_results() resume = ResumeMode.NEVER + self.artifact_storage.resume = ResumeMode.NEVER generated = True self._use_async = DATA_DESIGNER_ASYNC_ENGINE and self._resolve_async_compatibility() @@ -390,7 +395,7 @@ def _build_with_resume( if state.num_completed_batches >= self.batch_manager.num_batches: logger.warning( "⚠️ Dataset is already complete — all batches were found in the existing artifact directory. " - "Nothing to resume. Remove resume=True if you want to generate a new dataset." + "Nothing to resume. Use resume=ResumeMode.NEVER if you want to generate a new dataset." 
) return False @@ -533,6 +538,10 @@ def _check_resume_config_compatibility(self) -> _ConfigCompatibility: return _ConfigCompatibility.NO_PRIOR_DATASET config_path = dataset_dir / SDG_CONFIG_FILENAME if not config_path.exists(): + logger.warning( + "⚠️ No builder_config.json found in %s — skipping config compatibility check on resume.", + dataset_dir, + ) return _ConfigCompatibility.COMPATIBLE try: stored_data = json.loads(config_path.read_text()) @@ -540,7 +549,7 @@ def _check_resume_config_compatibility(self) -> _ConfigCompatibility: current_fp = self._data_designer_config.fingerprint()["config_hash"] stored_fp = stored_config.data_designer.fingerprint()["config_hash"] return _ConfigCompatibility.COMPATIBLE if current_fp == stored_fp else _ConfigCompatibility.INCOMPATIBLE - except Exception: + except (OSError, json.JSONDecodeError, ValidationError): logger.warning( "⚠️ Could not read stored config at %s for compatibility check — assuming compatible.", config_path, @@ -590,7 +599,7 @@ def _build_async( if len(completed_ids) >= total_row_groups: logger.warning( "⚠️ Dataset is already complete — all row groups were found in the existing artifact " - "directory. Nothing to resume. Remove resume=True if you want to generate a new dataset." + "directory. Nothing to resume. Use resume=ResumeMode.NEVER if you want to generate a new dataset." 
) return False diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index 13909caa4..efdcd7d76 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1592,50 +1592,6 @@ def test_build_resume_already_complete_does_not_run_after_generation_processors( mock_after.assert_not_called() -# --------------------------------------------------------------------------- -# _find_completed_row_group_ids tests -# --------------------------------------------------------------------------- - - -def test_find_completed_row_group_ids_empty_dir(stub_resource_provider, stub_test_config_builder, tmp_path): - """Returns empty set when final_dataset_path does not exist.""" - dataset_dir = tmp_path / "dataset" - _write_metadata(dataset_dir, target_num_records=4, buffer_size=2, num_completed_batches=0, actual_num_records=0) - - builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path) - assert builder._find_completed_row_group_ids() == set() - - -def test_find_completed_row_group_ids_with_files(stub_resource_provider, stub_test_config_builder, tmp_path): - """Returns correct IDs from batch_*.parquet files in parquet-files/.""" - dataset_dir = tmp_path / "dataset" - _write_metadata(dataset_dir, target_num_records=6, buffer_size=2, num_completed_batches=2, actual_num_records=4) - - parquet_dir = dataset_dir / "parquet-files" - parquet_dir.mkdir(parents=True) - (parquet_dir / "batch_00000.parquet").write_text("") - (parquet_dir / "batch_00002.parquet").write_text("") - - builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) - assert builder._find_completed_row_group_ids() == {0, 2} - - -def test_find_completed_row_group_ids_ignores_non_batch_files( 
- stub_resource_provider, stub_test_config_builder, tmp_path -): - """Non-batch files in parquet-files/ are silently ignored.""" - dataset_dir = tmp_path / "dataset" - _write_metadata(dataset_dir, target_num_records=4, buffer_size=2, num_completed_batches=1, actual_num_records=2) - - parquet_dir = dataset_dir / "parquet-files" - parquet_dir.mkdir(parents=True) - (parquet_dir / "batch_00001.parquet").write_text("") - (parquet_dir / "unrelated.parquet").write_text("") - - builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) - assert builder._find_completed_row_group_ids() == {1} - - # --------------------------------------------------------------------------- # Async resume via _build_async tests # --------------------------------------------------------------------------- @@ -1904,7 +1860,7 @@ def test_if_possible_incompatible_config_does_not_overwrite_existing_dataset( ) # Simulate incompatible config and mock out all I/O so build() does not actually generate data - with patch.object(builder, "_check_resume_config_compatibility", return_value=False): + with patch.object(builder, "_check_resume_config_compatibility", return_value=_ConfigCompatibility.INCOMPATIBLE): with patch.object(builder, "_run_model_health_check_if_needed"): with patch.object(builder, "_run_mcp_tool_check_if_needed"): with patch.object(builder, "_write_builder_config"): @@ -1956,9 +1912,7 @@ def test_if_possible_starts_fresh_when_no_existing_directory( assert storage.resume == ResumeMode.NEVER -def test_if_possible_starts_fresh_when_directory_is_empty( - stub_resource_provider, stub_test_config_builder, tmp_path -): +def test_if_possible_starts_fresh_when_directory_is_empty(stub_resource_provider, stub_test_config_builder, tmp_path): """IF_POSSIBLE on an empty dataset directory must start fresh, not raise. 
Edge case: a prior run crashed in the window between mkdir and the first file write diff --git a/packages/data-designer/src/data_designer/cli/commands/create.py b/packages/data-designer/src/data_designer/cli/commands/create.py index 1f9506dca..51dda7db2 100644 --- a/packages/data-designer/src/data_designer/cli/commands/create.py +++ b/packages/data-designer/src/data_designer/cli/commands/create.py @@ -7,7 +7,7 @@ from data_designer.cli.controllers.generation_controller import GenerationController from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS -from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS +from data_designer.engine.storage.artifact_storage import ResumeMode def create_command( @@ -36,15 +36,17 @@ def create_command( "-o", help="Path where generated artifacts will be stored. Defaults to ./artifacts.", ), - output_format: str | None = typer.Option( - None, - "--output-format", - "-f", + resume: ResumeMode = typer.Option( + ResumeMode.NEVER, + "--resume", + "-r", help=( - f"Export the dataset to a single file after generation. " - f"Supported formats: {', '.join(SUPPORTED_EXPORT_FORMATS)}. " - "The file is written to //dataset.." + "Resume an interrupted generation run. " + "'never' (default): always start fresh. " + "'always': resume from the last checkpoint; raise if config changed. " + "'if_possible': resume if config matches, otherwise start fresh silently." ), + case_sensitive=False, ), ) -> None: """Create a full dataset and save results to disk. 
@@ -59,8 +61,11 @@ def create_command( # Create with custom settings data-designer create my_config.yaml --num-records 1000 --dataset-name my_dataset - # Create from a remote config URL - data-designer create https://example.com/my_config.json --dataset-name my_dataset + # Resume an interrupted run + data-designer create my_config.yaml --resume always + + # Resume if config unchanged, otherwise start fresh + data-designer create my_config.yaml --resume if_possible # Create from a Python module with custom output path data-designer create my_config.py --artifact-path /path/to/output @@ -71,5 +76,5 @@ def create_command( num_records=num_records, dataset_name=dataset_name, artifact_path=artifact_path, - output_format=output_format, + resume=resume, ) diff --git a/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py b/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py index bdac0a95f..e8436d819 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py @@ -16,6 +16,7 @@ from data_designer.cli.utils.sample_records_pager import PAGER_FILENAME, create_sample_records_pager from data_designer.config.errors import InvalidConfigError from data_designer.config.utils.constants import DEFAULT_DISPLAY_WIDTH +from data_designer.engine.storage.artifact_storage import ResumeMode from data_designer.interface import DataDesigner from data_designer.logging import LOG_INDENT @@ -116,7 +117,7 @@ def run_create( num_records: int, dataset_name: str, artifact_path: str | None, - output_format: str | None = None, + resume: ResumeMode = ResumeMode.NEVER, ) -> None: """Load config, create a full dataset, and save results to disk. @@ -125,8 +126,7 @@ def run_create( num_records: Number of records to generate. dataset_name: Name for the generated dataset folder. 
artifact_path: Path where generated artifacts will be stored, or None for default. - output_format: If set, export the dataset to a single file in this format after - generation. One of 'jsonl', 'csv', 'parquet'. + resume: Controls how interrupted runs are handled. """ config_builder = self._load_config(config_source) @@ -145,39 +145,20 @@ def run_create( config_builder, num_records=num_records, dataset_name=dataset_name, + resume=resume, ) except Exception as e: print_error(f"Dataset creation failed: {e}") raise typer.Exit(code=1) - dataset = results.load_dataset() - analysis = results.load_analysis() if analysis is not None: console.print() analysis.to_report() console.print() - print_success(f"Dataset created — {len(dataset)} record(s) generated") + print_success("Dataset created successfully") console.print(f" Artifacts saved to: [bold]{results.artifact_storage.base_dataset_path}[/bold]") - - if output_format is not None: - from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS - - if output_format not in SUPPORTED_EXPORT_FORMATS: - print_error( - f"Unsupported export format: {output_format!r}. " - f"Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." 
- ) - raise typer.Exit(code=1) - export_path = results.artifact_storage.base_dataset_path / f"dataset.{output_format}" - try: - results.export(export_path, format=output_format) # type: ignore[arg-type] - except Exception as e: - print_error(f"Export failed: {e}") - raise typer.Exit(code=1) - console.print(f" Exported to: [bold]{export_path}[/bold]") - console.print() def _load_config(self, config_source: str) -> DataDesignerConfigBuilder: diff --git a/packages/data-designer/src/data_designer/interface/data_designer.py b/packages/data-designer/src/data_designer/interface/data_designer.py index 7223f5616..6d53689df 100644 --- a/packages/data-designer/src/data_designer/interface/data_designer.py +++ b/packages/data-designer/src/data_designer/interface/data_designer.py @@ -224,10 +224,11 @@ def create( - ``ResumeMode.NEVER`` (default): always start a fresh generation run. - ``ResumeMode.ALWAYS``: resume from the last completed batch (sync) or row group - (async) found in the existing dataset directory. ``num_records`` may be equal to - or greater than the number already generated (you can extend the dataset). The - ``buffer_size`` must match the original run. If no prior progress exists or the - run is incompatible, an error is raised. + (async). ``buffer_size`` must match the original run. ``num_records`` may be + equal to or greater than what was already generated (you can extend the dataset); + ``num_records`` less than actual records so far raises ``DatasetGenerationError``. + If no checkpoint exists yet (interrupted before the first batch finished), silently + restarts from the beginning. Raises if the stored config is incompatible. - ``ResumeMode.IF_POSSIBLE``: like ``ALWAYS`` when the current config fingerprint matches the stored config; otherwise starts a fresh run without raising an error. 
diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index fce35f8d3..07692ff00 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -4,7 +4,7 @@ from __future__ import annotations from pathlib import Path -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.config_builder import DataDesignerConfigBuilder @@ -19,9 +19,6 @@ from data_designer.engine.dataset_builders.utils.task_model import TaskTrace -ExportFormat = Literal["jsonl", "csv", "parquet"] -SUPPORTED_EXPORT_FORMATS: tuple[str, ...] = ("jsonl", "csv", "parquet") - class DatasetCreationResults(WithRecordSamplerMixin): """Results container for a Data Designer dataset creation run. @@ -98,42 +95,6 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path: raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.") return self.artifact_storage.processors_outputs_path / processor_name - def export(self, path: Path | str, *, format: ExportFormat = "jsonl") -> Path: - """Export the generated dataset to a single file. - - Args: - path: Output file path. The extension is not inferred from *format* — - the exact path is used as-is. - format: Output format. One of ``'jsonl'``, ``'csv'``, or ``'parquet'``. - Defaults to ``'jsonl'``. - - Returns: - Path to the written file. - - Raises: - ValueError: If an unsupported format is requested. - - Example: - >>> results = data_designer.create(config, num_records=1000) - >>> results.export("output.jsonl") - PosixPath('output.jsonl') - >>> results.export("output.csv", format="csv") - PosixPath('output.csv') - """ - if format not in SUPPORTED_EXPORT_FORMATS: - raise ValueError( - f"Unsupported export format: {format!r}. 
Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." - ) - path = Path(path) - df = self.load_dataset() - if format == "jsonl": - df.to_json(path, orient="records", lines=True, force_ascii=False) - elif format == "csv": - df.to_csv(path, index=False) - elif format == "parquet": - df.to_parquet(path, index=False) - return path - def push_to_hub( self, repo_id: str, diff --git a/packages/data-designer/tests/cli/commands/test_create_command.py b/packages/data-designer/tests/cli/commands/test_create_command.py index fc779df7c..c676e994f 100644 --- a/packages/data-designer/tests/cli/commands/test_create_command.py +++ b/packages/data-designer/tests/cli/commands/test_create_command.py @@ -6,6 +6,7 @@ from unittest.mock import MagicMock, patch from data_designer.cli.commands.create import create_command +from data_designer.engine.storage.artifact_storage import ResumeMode # --------------------------------------------------------------------------- # create_command delegation tests @@ -19,7 +20,11 @@ def test_create_command_delegates_to_controller(mock_ctrl_cls: MagicMock) -> Non mock_ctrl_cls.return_value = mock_ctrl create_command( - config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None, output_format=None + config_source="config.yaml", + num_records=10, + dataset_name="dataset", + artifact_path=None, + resume=ResumeMode.NEVER, ) mock_ctrl_cls.assert_called_once() @@ -28,7 +33,7 @@ def test_create_command_delegates_to_controller(mock_ctrl_cls: MagicMock) -> Non num_records=10, dataset_name="dataset", artifact_path=None, - output_format=None, + resume=ResumeMode.NEVER, ) @@ -43,7 +48,7 @@ def test_create_command_passes_custom_options(mock_ctrl_cls: MagicMock) -> None: num_records=100, dataset_name="my_data", artifact_path="/custom/output", - output_format=None, + resume=ResumeMode.NEVER, ) mock_ctrl.run_create.assert_called_once_with( @@ -51,7 +56,7 @@ def test_create_command_passes_custom_options(mock_ctrl_cls: MagicMock) -> None: 
num_records=100, dataset_name="my_data", artifact_path="/custom/output", - output_format=None, + resume=ResumeMode.NEVER, ) @@ -62,7 +67,11 @@ def test_create_command_default_artifact_path_is_none(mock_ctrl_cls: MagicMock) mock_ctrl_cls.return_value = mock_ctrl create_command( - config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, output_format=None + config_source="config.yaml", + num_records=5, + dataset_name="ds", + artifact_path=None, + resume=ResumeMode.NEVER, ) mock_ctrl.run_create.assert_called_once_with( @@ -70,13 +79,36 @@ def test_create_command_default_artifact_path_is_none(mock_ctrl_cls: MagicMock) num_records=5, dataset_name="ds", artifact_path=None, - output_format=None, + resume=ResumeMode.NEVER, + ) + + +@patch("data_designer.cli.commands.create.GenerationController") +def test_create_command_passes_resume_always(mock_ctrl_cls: MagicMock) -> None: + """Test create_command forwards --resume always to the controller.""" + mock_ctrl = MagicMock() + mock_ctrl_cls.return_value = mock_ctrl + + create_command( + config_source="config.yaml", + num_records=10, + dataset_name="dataset", + artifact_path=None, + resume=ResumeMode.ALWAYS, + ) + + mock_ctrl.run_create.assert_called_once_with( + config_source="config.yaml", + num_records=10, + dataset_name="dataset", + artifact_path=None, + resume=ResumeMode.ALWAYS, ) @patch("data_designer.cli.commands.create.GenerationController") -def test_create_command_passes_output_format(mock_ctrl_cls: MagicMock) -> None: - """Test create_command forwards --output-format to the controller.""" +def test_create_command_passes_resume_if_possible(mock_ctrl_cls: MagicMock) -> None: + """Test create_command forwards --resume if_possible to the controller.""" mock_ctrl = MagicMock() mock_ctrl_cls.return_value = mock_ctrl @@ -85,7 +117,7 @@ def test_create_command_passes_output_format(mock_ctrl_cls: MagicMock) -> None: num_records=10, dataset_name="dataset", artifact_path=None, - output_format="jsonl", 
+ resume=ResumeMode.IF_POSSIBLE, ) mock_ctrl.run_create.assert_called_once_with( @@ -93,5 +125,5 @@ def test_create_command_passes_output_format(mock_ctrl_cls: MagicMock) -> None: num_records=10, dataset_name="dataset", artifact_path=None, - output_format="jsonl", + resume=ResumeMode.IF_POSSIBLE, ) diff --git a/packages/data-designer/tests/cli/controllers/test_generation_controller.py b/packages/data-designer/tests/cli/controllers/test_generation_controller.py index de4918cff..16d15a849 100644 --- a/packages/data-designer/tests/cli/controllers/test_generation_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_generation_controller.py @@ -14,6 +14,7 @@ from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.errors import InvalidConfigError from data_designer.config.utils.constants import DEFAULT_DISPLAY_WIDTH +from data_designer.engine.storage.artifact_storage import ResumeMode _CTRL = "data_designer.cli.controllers.generation_controller" _DW = DEFAULT_DISPLAY_WIDTH @@ -27,12 +28,9 @@ def _make_mock_preview_results(num_records: int) -> MagicMock: return mock_results -def _make_mock_create_results(num_records: int, base_path: str = "/output/artifacts/dataset") -> MagicMock: - """Create a mock CreateResults with the given number of records.""" +def _make_mock_create_results(base_path: str = "/output/artifacts/dataset") -> MagicMock: + """Create a mock DatasetCreationResults.""" mock_results = MagicMock() - mock_dataset = MagicMock() - mock_dataset.__len__ = MagicMock(return_value=num_records) - mock_results.load_dataset.return_value = mock_dataset mock_results.artifact_storage.base_dataset_path = base_path return mock_results @@ -677,14 +675,16 @@ def test_run_create_success(mock_load_config: MagicMock, mock_dd_cls: MagicMock) mock_dd = MagicMock() mock_dd_cls.return_value = mock_dd - mock_dd.create.return_value = _make_mock_create_results(10) + mock_dd.create.return_value = _make_mock_create_results() 
controller = GenerationController() controller.run_create(config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None) mock_load_config.assert_called_once_with("config.yaml") mock_dd_cls.assert_called_once_with(artifact_path=Path.cwd() / "artifacts") - mock_dd.create.assert_called_once_with(mock_builder, num_records=10, dataset_name="dataset") + mock_dd.create.assert_called_once_with( + mock_builder, num_records=10, dataset_name="dataset", resume=ResumeMode.NEVER + ) @patch(f"{_CTRL}.DataDesigner") @@ -694,7 +694,7 @@ def test_run_create_custom_options(mock_load_config: MagicMock, mock_dd_cls: Mag mock_load_config.return_value = MagicMock(spec=DataDesignerConfigBuilder) mock_dd = MagicMock() mock_dd_cls.return_value = mock_dd - mock_dd.create.return_value = _make_mock_create_results(100, "/custom/output/my_data") + mock_dd.create.return_value = _make_mock_create_results("/custom/output/my_data") controller = GenerationController() controller.run_create( @@ -705,7 +705,9 @@ def test_run_create_custom_options(mock_load_config: MagicMock, mock_dd_cls: Mag ) mock_dd_cls.assert_called_once_with(artifact_path=Path("/custom/output")) - mock_dd.create.assert_called_once_with(mock_load_config.return_value, num_records=100, dataset_name="my_data") + mock_dd.create.assert_called_once_with( + mock_load_config.return_value, num_records=100, dataset_name="my_data", resume=ResumeMode.NEVER + ) @patch(f"{_CTRL}.load_config_builder") @@ -743,7 +745,7 @@ def test_run_create_calls_to_report_when_analysis_present(mock_load_config: Magi mock_load_config.return_value = MagicMock(spec=DataDesignerConfigBuilder) mock_dd = MagicMock() mock_dd_cls.return_value = mock_dd - mock_results = _make_mock_create_results(10) + mock_results = _make_mock_create_results() mock_analysis = MagicMock() mock_results.load_analysis.return_value = mock_analysis mock_dd.create.return_value = mock_results @@ -762,7 +764,7 @@ def 
test_run_create_skips_report_when_analysis_is_none(mock_load_config: MagicMo mock_load_config.return_value = MagicMock(spec=DataDesignerConfigBuilder) mock_dd = MagicMock() mock_dd_cls.return_value = mock_dd - mock_results = _make_mock_create_results(10) + mock_results = _make_mock_create_results() mock_results.load_analysis.return_value = None mock_dd.create.return_value = mock_results @@ -772,3 +774,49 @@ def test_run_create_skips_report_when_analysis_is_none(mock_load_config: MagicMo # load_analysis() returns None, so to_report() must not be called. # If the code ignores the None check, an AttributeError propagates and the test fails. mock_results.load_analysis.assert_called_once() + + +@patch(f"{_CTRL}.DataDesigner") +@patch(f"{_CTRL}.load_config_builder") +def test_run_create_passes_resume_always(mock_load_config: MagicMock, mock_dd_cls: MagicMock) -> None: + """run_create forwards resume=ALWAYS to DataDesigner.create().""" + mock_load_config.return_value = MagicMock(spec=DataDesignerConfigBuilder) + mock_dd = MagicMock() + mock_dd_cls.return_value = mock_dd + mock_dd.create.return_value = _make_mock_create_results() + + controller = GenerationController() + controller.run_create( + config_source="config.yaml", + num_records=10, + dataset_name="dataset", + artifact_path=None, + resume=ResumeMode.ALWAYS, + ) + + mock_dd.create.assert_called_once_with( + mock_load_config.return_value, num_records=10, dataset_name="dataset", resume=ResumeMode.ALWAYS + ) + + +@patch(f"{_CTRL}.DataDesigner") +@patch(f"{_CTRL}.load_config_builder") +def test_run_create_passes_resume_if_possible(mock_load_config: MagicMock, mock_dd_cls: MagicMock) -> None: + """run_create forwards resume=IF_POSSIBLE to DataDesigner.create().""" + mock_load_config.return_value = MagicMock(spec=DataDesignerConfigBuilder) + mock_dd = MagicMock() + mock_dd_cls.return_value = mock_dd + mock_dd.create.return_value = _make_mock_create_results() + + controller = GenerationController() + 
controller.run_create( + config_source="config.yaml", + num_records=10, + dataset_name="dataset", + artifact_path=None, + resume=ResumeMode.IF_POSSIBLE, + ) + + mock_dd.create.assert_called_once_with( + mock_load_config.return_value, num_records=10, dataset_name="dataset", resume=ResumeMode.IF_POSSIBLE + ) diff --git a/packages/data-designer/tests/cli/test_main.py b/packages/data-designer/tests/cli/test_main.py index 15349d8e6..4d897bebb 100644 --- a/packages/data-designer/tests/cli/test_main.py +++ b/packages/data-designer/tests/cli/test_main.py @@ -10,6 +10,7 @@ from data_designer.cli.main import app, main from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS +from data_designer.engine.storage.artifact_storage import ResumeMode runner = CliRunner() @@ -84,5 +85,5 @@ def test_app_dispatches_lazy_create_command(mock_controller_cls: Mock) -> None: num_records=DEFAULT_NUM_RECORDS, dataset_name="dataset", artifact_path=None, - output_format=None, + resume=ResumeMode.NEVER, ) diff --git a/packages/data-designer/tests/interface/test_results.py b/packages/data-designer/tests/interface/test_results.py index 802cfff6c..a28dd987e 100644 --- a/packages/data-designer/tests/interface/test_results.py +++ b/packages/data-designer/tests/interface/test_results.py @@ -259,77 +259,6 @@ def test_load_dataset_independent_of_record_sampler_cache(stub_dataset_creation_ stub_artifact_storage.load_dataset.assert_called_once() -@pytest.mark.parametrize("fmt", ["jsonl", "csv", "parquet"]) -def test_export_writes_file(stub_dataset_creation_results, tmp_path, fmt): - """export() writes a file in the requested format.""" - out = tmp_path / f"out.{fmt}" - result = stub_dataset_creation_results.export(out, format=fmt) - assert result == out - assert out.exists() - assert out.stat().st_size > 0 - - -def test_export_jsonl_content(stub_dataset_creation_results, stub_dataframe, tmp_path): - """JSONL export writes one JSON object per line.""" - import json - - out = tmp_path / 
"out.jsonl" - stub_dataset_creation_results.export(out, format="jsonl") - lines = out.read_text(encoding="utf-8").splitlines() - assert len(lines) == len(stub_dataframe) - # Each line must be valid JSON - for line in lines: - json.loads(line) - - -def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, tmp_path): - """CSV export has a header row and one data row per record.""" - import data_designer.lazy_heavy_imports as lazy - - out = tmp_path / "out.csv" - stub_dataset_creation_results.export(out, format="csv") - loaded = lazy.pd.read_csv(out) - assert list(loaded.columns) == list(stub_dataframe.columns) - assert len(loaded) == len(stub_dataframe) - - -def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, tmp_path): - """Parquet export round-trips to the original DataFrame.""" - import data_designer.lazy_heavy_imports as lazy - - out = tmp_path / "out.parquet" - stub_dataset_creation_results.export(out, format="parquet") - loaded = lazy.pd.read_parquet(out) - lazy.pd.testing.assert_frame_equal(loaded.reset_index(drop=True), stub_dataframe.reset_index(drop=True)) - - -def test_export_default_format_is_jsonl(stub_dataset_creation_results, tmp_path): - """export() defaults to JSONL when no format is given.""" - import json - - out = tmp_path / "out.jsonl" - stub_dataset_creation_results.export(out) - lines = out.read_text(encoding="utf-8").splitlines() - # All lines must be valid JSON - for line in lines: - json.loads(line) - - -def test_export_unsupported_format_raises(stub_dataset_creation_results, tmp_path): - """export() raises ValueError for unknown formats.""" - with pytest.raises(ValueError, match="Unsupported export format"): - stub_dataset_creation_results.export(tmp_path / "out.xyz", format="xlsx") # type: ignore[arg-type] - - -def test_export_returns_path_object(stub_dataset_creation_results, tmp_path): - """export() returns a Path regardless of whether str or Path was passed.""" - from pathlib import Path - - 
out = tmp_path / "out.jsonl" - result = stub_dataset_creation_results.export(str(out)) - assert isinstance(result, Path) - - def test_preview_results_dataset_metadata() -> None: """Test that PreviewResults uses DatasetMetadata in display_sample_record.""" config_builder = MagicMock(spec=DataDesignerConfigBuilder) From 02821b847b5ea4faff45e88ebe7160d27251ad0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Wed, 6 May 2026 20:16:56 +0200 Subject: [PATCH 22/29] fix(builder): replace stdlib StrEnum with project compat shim for Python 3.10 --- .../data_designer/engine/dataset_builders/dataset_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index bc3a64a83..25a2df3c9 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -11,7 +11,6 @@ import time import uuid from dataclasses import dataclass -from enum import StrEnum from pathlib import Path from typing import TYPE_CHECKING, Any, Callable @@ -27,6 +26,7 @@ ProcessorConfig, ProcessorType, ) +from data_designer.config.utils.type_helpers import StrEnum from data_designer.config.utils.warning_helpers import warn_at_caller from data_designer.config.version import get_library_version from data_designer.engine.column_generators.generators.base import ( From b8c633c1978acf1229a647f44a2ba05f1dee375a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Wed, 6 May 2026 21:33:34 +0200 Subject: [PATCH 23/29] fix(builder): guard extension row groups in initial_actual_num_records formula on async resume When extending an async run (num_records > state.target_num_records) and a crash occurs after an extension row group is written to disk but before 
write_metadata, the formula `min(buffer_size, state.target_num_records - rg_id * buffer_size)` yields a negative value for any extension row group (rg_id * buffer_size >= target), making initial_actual_num_records silently undercount. The RowGroupBufferManager then starts at the wrong offset, and the final metadata reports an incorrect actual_num_records with a false partial-completion warning. Fix: use state.target_num_records for original row groups and num_records for extension row groups (guarded by rg_id * buffer_size < state.target_num_records). Covers the scenario with a new regression test. --- .../dataset_builders/dataset_builder.py | 8 +++- .../dataset_builders/test_dataset_builder.py | 47 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 25a2df3c9..507bdd426 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -601,7 +601,13 @@ def _build_async( # non-aligned run gets its true size, not buffer_size. 
initial_total_num_batches = len(completed_ids) initial_actual_num_records = sum( - min(buffer_size, state.target_num_records - rg_id * buffer_size) for rg_id in completed_ids + min( + buffer_size, + state.target_num_records - rg_id * buffer_size + if rg_id * buffer_size < state.target_num_records + else num_records - rg_id * buffer_size, + ) + for rg_id in completed_ids ) self.artifact_storage.clear_partial_results() diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index efdcd7d76..84ade04c2 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1791,6 +1791,53 @@ def capturing_prepare(*args, **kwargs): assert captured["initial_actual_num_records"] == 5 +def test_build_async_resume_initial_actual_num_records_extension_crash_window( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """Extension row groups on disk use new num_records in the size formula, not original target. + + Crash window: original run had num_records=5, buffer_size=2 (row groups [2,2,1], all done). + Extension starts with num_records=9; row group 3 (2 records) is written to disk but + write_metadata crashes before updating the file. On resume, completed_ids={0,1,2,3} + while metadata still reports target_num_records=5. + + Correct count: groups 0,1 → 2+2; group 2 (last original, non-aligned) → 1; group 3 + (extension) → min(2, 9-6)=2. Total = 7, not 4 (which the unguarded formula gives, + since min(2, 5-6) = -1). 
+ """ + import asyncio as stdlib_asyncio + + dataset_dir = tmp_path / "dataset" + _write_metadata(dataset_dir, target_num_records=5, buffer_size=2, num_completed_batches=3, actual_num_records=5) + _write_parquet_files(dataset_dir / "parquet-files", [0, 1, 2, 3]) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + + captured: dict = {} + + def capturing_prepare(*args, **kwargs): + captured["initial_actual_num_records"] = kwargs.get("initial_actual_num_records", 0) + mock_scheduler = Mock() + mock_scheduler.traces = [] + mock_buffer_manager = Mock() + mock_buffer_manager.actual_num_records = 9 + return mock_scheduler, mock_buffer_manager + + mock_future = Mock() + mock_future.result = Mock(return_value=None) + + with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): + with patch.object(builder_mod, "asyncio", stdlib_asyncio, create=True): + with patch.object(builder_mod, "ensure_async_engine_loop", Mock(return_value=Mock()), create=True): + with patch.object(stdlib_asyncio, "run_coroutine_threadsafe", return_value=mock_future): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_prepare_async_run", side_effect=capturing_prepare): + builder.build(num_records=9, resume=ResumeMode.ALWAYS) + + # 2+2+1 (original) + 2 (extension group 3) = 7, not 4 (which unguarded formula gives) + assert captured["initial_actual_num_records"] == 7 + + def test_build_async_resume_skip_row_groups_contains_completed_ids( stub_resource_provider, stub_test_config_builder, tmp_path ): From 0fef8d41b0f45a4d6f72deda802ed66b01ef7aa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Wed, 6 May 2026 21:51:28 +0200 Subject: [PATCH 24/29] fix(builder): pre-compute row-group list in _build_async to fix sizes on non-aligned extension resume The partitioning loop in _prepare_async_run decremented remaining by min(buffer_size, remaining) for every row group, including 
skipped ones. For a non-aligned original run (e.g. target=5, buffer_size=2, last group has 1 record), the loop deducted 2 for the skipped last group, leaving remaining one short. Extension row groups received smaller sizes than intended, so the generated dataset was silently short by the deficit and a false partial-completion warning fired. Fix: pre-compute the full row-group list with correct per-group sizes in _build_async where state.target_num_records is available, then pass it to _prepare_async_run as precomputed_row_groups (replacing the skip_row_groups param). Original groups use min(buffer_size, target - rg*bs); extension groups use min(buffer_size, extension_records - ext_idx*bs). Also updates the skip_row_groups test to assert on precomputed_row_groups and adds a regression test for the non-aligned extension case. --- .../dataset_builders/dataset_builder.py | 53 ++++++++++------- .../dataset_builders/test_dataset_builder.py | 57 +++++++++++++++++-- 2 files changed, 83 insertions(+), 27 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 507bdd426..201eb17d1 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -586,29 +586,30 @@ def _build_async( settings = self._resource_provider.run_config trace_enabled = _is_async_trace_enabled(settings) - skip_row_groups: frozenset[int] = frozenset() + precomputed_row_groups: list[tuple[int, int]] | None = None initial_actual_num_records = 0 initial_total_num_batches = 0 if resume == ResumeMode.ALWAYS: state = self._load_resume_state(num_records, buffer_size) completed_ids = self._find_completed_row_group_ids() - skip_row_groups = frozenset(completed_ids) # Use filesystem as source of truth for both counters — 
metadata may lag by one # row group if a crash occurred between move_partial_result_to_final_file_path # and write_metadata. # Use the original target (not the new num_records) so the last row group of a # non-aligned run gets its true size, not buffer_size. initial_total_num_batches = len(completed_ids) - initial_actual_num_records = sum( - min( - buffer_size, - state.target_num_records - rg_id * buffer_size - if rg_id * buffer_size < state.target_num_records - else num_records - rg_id * buffer_size, - ) - for rg_id in completed_ids - ) + original_target = state.target_num_records + + num_original_groups = -(-original_target // buffer_size) # ceil(original_target/buffer_size) + + def _rg_size(rg_id: int) -> int: + if rg_id < num_original_groups: + return min(buffer_size, original_target - rg_id * buffer_size) + ext_group_idx = rg_id - num_original_groups + return min(buffer_size, (num_records - original_target) - ext_group_idx * buffer_size) + + initial_actual_num_records = sum(_rg_size(rg_id) for rg_id in completed_ids) self.artifact_storage.clear_partial_results() total_row_groups = -(-num_records // buffer_size) # ceiling division @@ -624,6 +625,13 @@ def _build_async( f"complete ({initial_actual_num_records} records), skipping them." ) + # Pre-compute the full row-group list with correct per-group sizes so that + # non-aligned skipped groups deduct their actual on-disk record count rather + # than buffer_size, keeping extension group sizes accurate. 
+ precomputed_row_groups = [ + (rg_id, _rg_size(rg_id)) for rg_id in range(total_row_groups) if rg_id not in completed_ids + ] + def finalize_row_group(rg_id: int) -> None: def on_complete(final_path: Path | str | None) -> None: if final_path is not None and on_batch_complete: @@ -642,7 +650,7 @@ def on_complete(final_path: Path | str | None) -> None: shutdown_error_window=settings.shutdown_error_window, disable_early_shutdown=settings.disable_early_shutdown, trace=trace_enabled, - skip_row_groups=skip_row_groups, + precomputed_row_groups=precomputed_row_groups, initial_actual_num_records=initial_actual_num_records, initial_total_num_batches=initial_total_num_batches, ) @@ -707,7 +715,7 @@ def _prepare_async_run( shutdown_error_window: int = 10, disable_early_shutdown: bool = False, trace: bool = False, - skip_row_groups: frozenset[int] = frozenset(), + precomputed_row_groups: list[tuple[int, int]] | None = None, initial_actual_num_records: int = 0, initial_total_num_batches: int = 0, ) -> tuple[AsyncTaskScheduler, RowGroupBufferManager]: @@ -732,16 +740,17 @@ def _prepare_async_run( for gen in generators: gen.log_pre_generation() - # Partition into row groups, skipping any already completed on resume. 
- row_groups: list[tuple[int, int]] = [] - remaining = num_records - rg_id = 0 - while remaining > 0: - size = min(buffer_size, remaining) - if rg_id not in skip_row_groups: + if precomputed_row_groups is not None: + row_groups = precomputed_row_groups + else: + row_groups = [] + remaining = num_records + rg_id = 0 + while remaining > 0: + size = min(buffer_size, remaining) row_groups.append((rg_id, size)) - remaining -= size - rg_id += 1 + remaining -= size + rg_id += 1 tracker = CompletionTracker.with_graph(graph, row_groups) buffer_manager = RowGroupBufferManager( diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index 84ade04c2..c60586a9a 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1841,10 +1841,11 @@ def capturing_prepare(*args, **kwargs): def test_build_async_resume_skip_row_groups_contains_completed_ids( stub_resource_provider, stub_test_config_builder, tmp_path ): - """skip_row_groups passed to _prepare_async_run contains exactly the completed row group IDs. + """precomputed_row_groups passed to _prepare_async_run excludes already-completed row groups. - Verifies the set-based skip mechanism so the scheduler never re-generates a row group - that already has a parquet file on disk. + Verifies the skip mechanism so the scheduler never re-generates a row group that + already has a parquet file on disk. 6 records, buffer_size=2 → 3 row groups total; + row groups 0 and 2 already on disk → only row group 1 should be scheduled. 
""" import asyncio as stdlib_asyncio @@ -1858,7 +1859,7 @@ def test_build_async_resume_skip_row_groups_contains_completed_ids( captured: dict = {} def capturing_prepare(*args, **kwargs): - captured["skip_row_groups"] = kwargs.get("skip_row_groups", frozenset()) + captured["precomputed_row_groups"] = kwargs.get("precomputed_row_groups") mock_scheduler = Mock() mock_scheduler.traces = [] mock_buffer_manager = Mock() @@ -1876,7 +1877,53 @@ def capturing_prepare(*args, **kwargs): with patch.object(builder, "_prepare_async_run", side_effect=capturing_prepare): builder.build(num_records=6, resume=ResumeMode.ALWAYS) - assert captured["skip_row_groups"] == frozenset({0, 2}) + # Only rg_id=1 remains; rg_id=0 and rg_id=2 are already on disk + assert captured["precomputed_row_groups"] == [(1, 2)] + + +def test_build_async_resume_extension_non_aligned_row_group_sizes( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """Extension row groups get the correct size when the original run was non-aligned. + + Original run: num_records=5, buffer_size=2 → row groups [2, 2, 1], all completed. + Extending to num_records=7: the loop previously deducted 2 for rg_id=2 (instead of 1), + leaving remaining=1 so rg_id=3 received size 1 instead of 2. 7 records were never + generated; only 6 reached the dataset and a false partial-completion warning fired. + + After the fix, precomputed_row_groups must be [(3, 2)], not [(3, 1)]. 
+ """ + import asyncio as stdlib_asyncio + + dataset_dir = tmp_path / "dataset" + _write_metadata(dataset_dir, target_num_records=5, buffer_size=2, num_completed_batches=3, actual_num_records=5) + _write_parquet_files(dataset_dir / "parquet-files", [0, 1, 2]) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + + captured: dict = {} + + def capturing_prepare(*args, **kwargs): + captured["precomputed_row_groups"] = kwargs.get("precomputed_row_groups") + mock_scheduler = Mock() + mock_scheduler.traces = [] + mock_buffer_manager = Mock() + mock_buffer_manager.actual_num_records = 7 + return mock_scheduler, mock_buffer_manager + + mock_future = Mock() + mock_future.result = Mock(return_value=None) + + with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): + with patch.object(builder_mod, "asyncio", stdlib_asyncio, create=True): + with patch.object(builder_mod, "ensure_async_engine_loop", Mock(return_value=Mock()), create=True): + with patch.object(stdlib_asyncio, "run_coroutine_threadsafe", return_value=mock_future): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_prepare_async_run", side_effect=capturing_prepare): + builder.build(num_records=7, resume=ResumeMode.ALWAYS) + + # rg_id=3 should have 2 records (7-5=2 extension records, buffer_size=2), not 1 + assert captured["precomputed_row_groups"] == [(3, 2)] def test_if_possible_incompatible_config_does_not_overwrite_existing_dataset( From dad57b889de6fdc4caa8fd02dc1bcd845e9276b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Thu, 7 May 2026 22:57:44 +0200 Subject: [PATCH 25/29] chore: remove stale implementation plan for #525 The plan described the initial resume: bool design which has since been replaced by the full ResumeMode enum (NEVER/ALWAYS/IF_POSSIBLE), async engine support, filesystem reconciliation, and config compatibility checks. 
The PR description is the authoritative record of what shipped. --- plans/525/resume-interrupted-runs.md | 176 --------------------------- 1 file changed, 176 deletions(-) delete mode 100644 plans/525/resume-interrupted-runs.md diff --git a/plans/525/resume-interrupted-runs.md b/plans/525/resume-interrupted-runs.md deleted file mode 100644 index 61e193ff5..000000000 --- a/plans/525/resume-interrupted-runs.md +++ /dev/null @@ -1,176 +0,0 @@ ---- -date: 2026-04-13 -authors: - - pboruta -issue: https://github.com/NVIDIA-NeMo/DataDesigner/issues/525 ---- - -# Plan: Resume interrupted dataset generation runs - -## Problem - -When a long-running `DataDesigner.create()` call is interrupted (machine crash, OOM kill, preemption), the user has to restart generation from scratch — even though completed batches are already durably written to disk and `metadata.json` tracks exactly how many finished. - -The situation is made worse by an existing safeguard that fires at the wrong time: `ArtifactStorage.resolved_dataset_name` detects the existing folder on the next run and silently creates a new timestamped directory, orphaning the previous partial results instead of resuming from them. - -## Proposed Solution - -Add `resume: bool = False` to `DataDesigner.create()`. When `resume=True` the engine reads `metadata.json` from the existing dataset directory, validates that the run parameters are compatible, and starts the batch loop from the first incomplete batch rather than from zero. - -Expected usage: - -```python -dd = DataDesigner(...) -dd.add_column(...) - -# First run — interrupted at batch 7 of 20 -results = dd.create(config_builder, num_records=10_000) - -# After restart — picks up from batch 8 -results = dd.create(config_builder, num_records=10_000, resume=True) -``` - -## Design Decisions - -| Decision | Choice | Rationale | -|---|---|---| -| API surface | `resume: bool = False` on `DataDesigner.create()` | Opt-in flag keeps default behaviour unchanged. 
Users who want a clean re-run keep getting the timestamped-folder behaviour. | -| Resume state source | Read `metadata.json` written after each completed batch | Already contains `num_completed_batches`, `target_num_records`, `buffer_size`, `actual_num_records`. No new persistence needed. | -| Partial batch at crash time | Clear `tmp-partial-parquet-files/` at resume start | Simpler and safer than merging an incomplete parquet; losing one batch is acceptable since the user is already recovering from a crash. | -| Compatibility validation | Raise `DatasetGenerationError` if `num_records` or `buffer_size` changed | Different `num_records` changes which rows land in which batch file, breaking the numbering invariant. `buffer_size` changes the file-per-batch mapping. Both must match. | -| Async engine | Raise `DatasetGenerationError` if `DATA_DESIGNER_ASYNC_ENGINE=1` with `resume=True` | The async path uses a row-group scheduler rather than an indexed batch loop; resume would require a different strategy. Out of scope for v1. | -| Already-complete runs | Detect and warn, return existing path | If `num_completed_batches == total_num_batches` the dataset is already complete; the user may have re-run by mistake. | -| No metadata → error | Raise `DatasetGenerationError` | Resuming without a checkpoint is impossible; a clear error is better than silent fallback to a fresh run. 
| - -## Affected Files - -| File | Change | -|---|---| -| `packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py` | Add `resume: bool = False` field; modify `resolved_dataset_name` to skip timestamping when `resume=True`; add `clear_partial_results()` helper | -| `packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py` | Add `start_batch: int = 0` and `initial_actual_num_records: int = 0` to `start()` | -| `packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py` | Add `resume: bool = False` to `build()`; add `_load_resume_state()` private method; implement validation and batch-skip logic | -| `packages/data-designer/src/data_designer/interface/data_designer.py` | Add `resume: bool = False` to `create()` and `_create_resource_provider()`; pass through to `ArtifactStorage` and `builder.build()` | -| `packages/data-designer-engine/tests/engine/storage/test_artifact_storage.py` | Tests for resume flag on `resolved_dataset_name` and `clear_partial_results()` | -| `packages/data-designer-engine/tests/engine/dataset_builders/utils/test_dataset_batch_manager.py` | Tests for `start_batch` and `initial_actual_num_records` parameters | -| `packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py` | Tests for resume validation, batch skipping, async engine error, already-complete detection | - -## Implementation Sketch - -### `ArtifactStorage` - -```python -class ArtifactStorage(BaseModel): - ... - resume: bool = False - - @cached_property - def resolved_dataset_name(self) -> str: - dataset_path = self.artifact_path / self.dataset_name - if dataset_path.exists() and len(list(dataset_path.iterdir())) > 0: - if self.resume: - return self.dataset_name # use existing folder as-is - # existing behaviour: create timestamped copy - new_dataset_name = f"{self.dataset_name}_{datetime.now().strftime(...)}" - ... 
- return new_dataset_name - if self.resume: - raise ArtifactStorageError( - f"Cannot resume: no existing dataset found at {dataset_path!r}." - ) - return self.dataset_name - - def clear_partial_results(self) -> None: - """Remove any in-flight partial results left over from an interrupted run.""" - if self.partial_results_path.exists(): - shutil.rmtree(self.partial_results_path) -``` - -### `DatasetBatchManager.start()` - -```python -def start( - self, - *, - num_records: int, - buffer_size: int, - start_batch: int = 0, - initial_actual_num_records: int = 0, -) -> None: - ... - self.reset() - self._current_batch_number = start_batch - self._actual_num_records = initial_actual_num_records -``` - -### `DatasetBuilder.build()` — resume path - -```python -@dataclass -class _ResumeState: - num_completed_batches: int - actual_num_records: int - buffer_size: int - -def _load_resume_state(self, num_records: int, buffer_size: int) -> _ResumeState: - try: - metadata = self.artifact_storage.read_metadata() - except FileNotFoundError: - raise DatasetGenerationError("Cannot resume: metadata.json not found. ...") - - target = metadata.get("target_num_records") - if target != num_records: - raise DatasetGenerationError( - f"Cannot resume: num_records={num_records} does not match " - f"the original run's target_num_records={target}. ..." - ) - - meta_buffer_size = metadata.get("buffer_size") - if meta_buffer_size != buffer_size: - raise DatasetGenerationError( - f"Cannot resume: buffer_size={buffer_size} does not match " - f"the original run's buffer_size={meta_buffer_size}. ..." - ) - - return _ResumeState( - num_completed_batches=metadata["num_completed_batches"], - actual_num_records=metadata["actual_num_records"], - buffer_size=buffer_size, - ) - -def build(self, *, num_records, on_batch_complete=None, save_multimedia_to_disk=True, resume=False): - ... 
- if resume and DATA_DESIGNER_ASYNC_ENGINE: - raise DatasetGenerationError("resume=True is not supported with DATA_DESIGNER_ASYNC_ENGINE.") - - buffer_size = self._resource_provider.run_config.buffer_size - - if resume: - state = self._load_resume_state(num_records, buffer_size) - if state.num_completed_batches * buffer_size >= num_records: - logger.warning("Dataset already complete — nothing to resume.") - return self.artifact_storage.final_dataset_path - self.artifact_storage.clear_partial_results() - self.batch_manager.start( - num_records=num_records, - buffer_size=buffer_size, - start_batch=state.num_completed_batches, - initial_actual_num_records=state.actual_num_records, - ) - for batch_idx in range(state.num_completed_batches, self.batch_manager.num_batches): - ... - else: - # existing path unchanged - self.batch_manager.start(num_records=num_records, buffer_size=buffer_size) - for batch_idx in range(self.batch_manager.num_batches): - ... -``` - -## Trade-offs Considered - -- **Automatic resume detection** (no flag, detect existing folder automatically): rejected — removes user intent. A user re-running a pipeline from scratch would be surprised by silent resumption. -- **Resume support for async engine**: deferred to a follow-up. The async scheduler's row-group model doesn't map 1:1 to batch indices; implementing it would require a separate mechanism. -- **Per-column resume** (resume from column N within an interrupted batch): out of scope. Requires per-column checkpointing and state reconstruction, significantly higher complexity. - -## Delivery - -Single PR implementing all changes listed in the affected-files table plus tests. No backwards-incompatible changes — `resume` defaults to `False` and all existing call sites are unaffected. 
From 60b9d6bd93449ba3d7aa65876567341a64ee6023 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Thu, 7 May 2026 23:20:14 +0200 Subject: [PATCH 26/29] fix(engine): fix false 'already complete' when extension fits in last group's slack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit original_target=5, buffer_size=2 produces 3 groups [2,2,1]. Extending to num_records=6: ceil(6/2)=3 equalled len(completed_ids)=3, triggering the already-complete branch on both the async and sync paths — returning the 5-record dataset silently. Fix (async): replace ceil(num_records/bs) with num_original_groups + ceil(extension_records/bs) so any extension always adds new groups beyond num_original_groups. Fix (sync): add num_records_list param to DatasetBatchManager.start() and pass the correct per-batch sizes in _build_with_resume, giving the batch manager the right total batch count (4 instead of 3 in the example). --- .../dataset_builders/dataset_builder.py | 17 ++++- .../utils/dataset_batch_manager.py | 10 ++- .../dataset_builders/test_dataset_builder.py | 62 +++++++++++++++++++ 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 201eb17d1..3769f8da2 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -385,11 +385,22 @@ def _build_with_resume( """ state = self._load_resume_state(num_records, buffer_size) + # Compute the correct per-batch sizes. ceil(num_records/bs) is wrong for a + # non-aligned extension: original groups are immutable, so any extension always + # adds new groups beyond num_original_batches. 
+ original_target = state.target_num_records + num_original_batches = -(-original_target // buffer_size) + extension_records = num_records - original_target + num_extension_batches = -(-extension_records // buffer_size) + original_sizes = [min(buffer_size, original_target - i * buffer_size) for i in range(num_original_batches)] + extension_sizes = [min(buffer_size, extension_records - i * buffer_size) for i in range(num_extension_batches)] + self.batch_manager.start( num_records=num_records, buffer_size=buffer_size, start_batch=state.num_completed_batches, initial_actual_num_records=state.actual_num_records, + num_records_list=original_sizes + extension_sizes, ) if state.num_completed_batches >= self.batch_manager.num_batches: @@ -612,7 +623,11 @@ def _rg_size(rg_id: int) -> int: initial_actual_num_records = sum(_rg_size(rg_id) for rg_id in completed_ids) self.artifact_storage.clear_partial_results() - total_row_groups = -(-num_records // buffer_size) # ceiling division + # Original groups are immutable; any extension always needs new groups beyond + # num_original_groups — ceil(num_records/bs) gives the wrong count when the + # original run was non-aligned and the extension fits in the last group's slack. 
+ extension_records = num_records - original_target + total_row_groups = num_original_groups + -(-extension_records // buffer_size) if len(completed_ids) >= total_row_groups: logger.warning( "⚠️ Dataset is already complete — all row groups were found in the existing artifact " diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py index 63907c916..1811cd781 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py @@ -165,6 +165,7 @@ def start( buffer_size: int, start_batch: int = 0, initial_actual_num_records: int = 0, + num_records_list: list[int] | None = None, ) -> None: if num_records <= 0: raise DatasetBatchManagementError("🛑 num_records must be positive.") @@ -172,9 +173,12 @@ def start( raise DatasetBatchManagementError("🛑 buffer_size must be positive.") self._buffer_size = buffer_size - self._num_records_list = [buffer_size] * (num_records // buffer_size) - if remaining_records := num_records % buffer_size: - self._num_records_list.append(remaining_records) + if num_records_list is not None: + self._num_records_list = list(num_records_list) + else: + self._num_records_list = [buffer_size] * (num_records // buffer_size) + if remaining_records := num_records % buffer_size: + self._num_records_list.append(remaining_records) self.reset() self._current_batch_number = start_batch self._actual_num_records = initial_actual_num_records diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index c60586a9a..cec5da727 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py 
+++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1592,6 +1592,29 @@ def test_build_resume_already_complete_does_not_run_after_generation_processors( mock_after.assert_not_called() +def test_build_resume_not_already_complete_when_extension_fits_in_slack( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """Non-aligned extension fitting in the last group's slack must not falsely trigger 'already complete'. + + original_target=5, buffer_size=2 → 3 batches [2,2,1]; extending to num_records=6: + ceil(6/2)=3 == num_completed_batches=3 used to trigger the false 'already complete' branch. + Correct total_batches = 3 + ceil(1/2) = 4, so batch 3 (1 record) must be scheduled. + """ + dataset_dir = tmp_path / "dataset" + _write_metadata(dataset_dir, target_num_records=5, buffer_size=2, num_completed_batches=3, actual_num_records=5) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + + with patch.object(builder, "_run_batch") as mock_run_batch: + with patch.object(builder.batch_manager, "finish"): + with patch.object(builder, "_run_model_health_check_if_needed"): + builder.build(num_records=6, resume=ResumeMode.ALWAYS) + + mock_run_batch.assert_called_once() + assert mock_run_batch.call_args.kwargs["current_batch_number"] == 3 + + # --------------------------------------------------------------------------- # Async resume via _build_async tests # --------------------------------------------------------------------------- @@ -1926,6 +1949,45 @@ def capturing_prepare(*args, **kwargs): assert captured["precomputed_row_groups"] == [(3, 2)] +def test_build_async_resume_not_already_complete_when_extension_fits_in_slack( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """Non-aligned extension fitting in the last group's slack must not falsely trigger 'already complete'. 
+ + original_target=5, buffer_size=2 → 3 row groups; extending to num_records=6: + ceil(6/2)=3 == len(completed_ids)=3 used to trigger the false 'already complete' branch. + Correct total_row_groups = 3 + ceil(1/2) = 4, so _prepare_async_run must be called. + """ + import asyncio as stdlib_asyncio + + dataset_dir = tmp_path / "dataset" + _write_metadata(dataset_dir, target_num_records=5, buffer_size=2, num_completed_batches=3, actual_num_records=5) + _write_parquet_files(dataset_dir / "parquet-files", [0, 1, 2]) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + + def capturing_prepare(*args, **kwargs): + mock_scheduler = Mock() + mock_scheduler.traces = [] + mock_buffer_manager = Mock() + mock_buffer_manager.actual_num_records = 6 + return mock_scheduler, mock_buffer_manager + + mock_future = Mock() + mock_future.result = Mock(return_value=None) + + with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): + with patch.object(builder_mod, "asyncio", stdlib_asyncio, create=True): + with patch.object(builder_mod, "ensure_async_engine_loop", Mock(return_value=Mock()), create=True): + with patch.object(stdlib_asyncio, "run_coroutine_threadsafe", return_value=mock_future): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_prepare_async_run", side_effect=capturing_prepare) as mock_prepare: + builder.build(num_records=6, resume=ResumeMode.ALWAYS) + + # _prepare_async_run must be called — the dataset is NOT already complete + mock_prepare.assert_called_once() + + def test_if_possible_incompatible_config_does_not_overwrite_existing_dataset( stub_resource_provider, stub_test_config_builder, tmp_path ): From cd85e972cf48819046a07e52541d6c7f32e7a358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Fri, 8 May 2026 11:58:16 +0200 Subject: [PATCH 27/29] fix(engine): raise error when num_records is below original target on resume Prevents negative 
extension_records in async path which silently truncated the dataset and corrupted metadata without triggering a partial-completion warning. --- .../engine/dataset_builders/dataset_builder.py | 9 +++++++++ .../dataset_builders/test_dataset_builder.py | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 3769f8da2..1081d667c 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -355,6 +355,15 @@ def _load_resume_state(self, num_records: int, buffer_size: int) -> _ResumeState "or start a new run without resume=ResumeMode.ALWAYS." ) + target_num_records = metadata.get("target_num_records") + if target_num_records is not None and num_records < target_num_records: + raise DatasetGenerationError( + f"🛑 Cannot resume: num_records={num_records} is less than the original target " + f"({target_num_records}). To resume, use num_records >= {target_num_records} " + "(you may extend the dataset beyond the original target). " + "Use resume=ResumeMode.NEVER to start a new run." 
+ ) + meta_buffer_size = metadata.get("buffer_size") if meta_buffer_size != buffer_size: raise DatasetGenerationError( diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index cec5da727..3debe5b0c 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1504,6 +1504,24 @@ def test_build_resume_raises_when_num_records_below_actual(stub_resource_provide builder.build(num_records=4, resume=ResumeMode.ALWAYS) +def test_build_resume_raises_when_num_records_below_original_target( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """resume=ALWAYS raises when num_records is between actual and original target (negative extension_records).""" + dataset_dir = tmp_path / "dataset" + _write_metadata( + dataset_dir, + target_num_records=10, + buffer_size=2, + num_completed_batches=2, + actual_num_records=4, + ) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + with pytest.raises(DatasetGenerationError, match="num_records=7 is less than the original target"): + builder.build(num_records=7, resume=ResumeMode.ALWAYS) + + def test_build_resume_allows_larger_num_records(stub_resource_provider, stub_test_config_builder, tmp_path, caplog): """resume=ALWAYS succeeds when num_records > original target (extending the dataset).""" dataset_dir = tmp_path / "dataset" From 729ecf24ee7f9501488b4778c2bd9141fe6d4972 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Fri, 8 May 2026 12:46:01 +0200 Subject: [PATCH 28/29] =?UTF-8?q?fix(storage):=20refresh=20MediaStorage=20?= =?UTF-8?q?path=20after=20IF=5FPOSSIBLE=20=E2=86=92=20NEVER=20downgrade?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When 
build() detected an incompatible config and downgraded resume from IF_POSSIBLE to NEVER, _media_storage.base_path remained bound to the original directory while all other path properties resolved to the new timestamped directory — causing broken image references in image-column runs. --- .../dataset_builders/dataset_builder.py | 1 + .../engine/storage/artifact_storage.py | 10 ++++ .../dataset_builders/test_dataset_builder.py | 48 +++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 1081d667c..06ecdf4ab 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -275,6 +275,7 @@ def build( resume = ResumeMode.NEVER self.artifact_storage.resume = ResumeMode.NEVER self.artifact_storage.__dict__.pop("resolved_dataset_name", None) + self.artifact_storage.refresh_media_storage_path() else: resume = ResumeMode.ALWAYS self.artifact_storage.resume = ResumeMode.ALWAYS diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py index dd6d2e98a..cd106a991 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/artifact_storage.py @@ -158,6 +158,16 @@ def set_media_storage_mode(self, mode: StorageMode) -> None: """ self._media_storage.mode = mode + def refresh_media_storage_path(self) -> None: + """Re-point MediaStorage to the current base_dataset_path. + + Must be called after popping the resolved_dataset_name cache so that + _media_storage.base_path and .images_dir reflect the updated directory. 
+ """ + images_subdir = self._media_storage.images_dir.name + self._media_storage.base_path = self.base_dataset_path + self._media_storage.images_dir = self.base_dataset_path / images_subdir + @staticmethod def mkdir_if_needed(path: Path | str) -> Path: """Create the directory if it does not exist.""" diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index 3debe5b0c..e69893cba 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -2055,6 +2055,54 @@ def test_if_possible_incompatible_config_does_not_overwrite_existing_dataset( ) +def test_if_possible_incompatible_config_refreshes_media_storage_path( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """After IF_POSSIBLE → NEVER downgrade, _media_storage must point to the new timestamped dir. + + Bug: validate_folder_names initialises MediaStorage with base_dataset_path at Pydantic + construction time (while resume=IF_POSSIBLE), caching the original directory name. + After the cache pop and resume=NEVER, base_dataset_path resolves to a new timestamped + directory, but _media_storage.base_path still holds the old path — producing broken + image references for image-column datasets. + + Fix: refresh_media_storage_path() is called after the cache pop. 
+ """ + dataset_dir = tmp_path / "dataset" + dataset_dir.mkdir() + (dataset_dir / "existing_file.parquet").write_text("data") # non-empty dir triggers NEVER→timestamp + + storage = _ArtifactStorage(artifact_path=tmp_path, resume=ResumeMode.IF_POSSIBLE) + stub_resource_provider.artifact_storage = storage + + # Trigger validate_folder_names so _media_storage is initialised with IF_POSSIBLE semantics + # (non-empty dir + IF_POSSIBLE → resolved_dataset_name returns "dataset", not timestamped) + original_media_base = storage.media_storage.base_path + + builder = DatasetBuilder( + data_designer_config=stub_test_config_builder.build(), + resource_provider=stub_resource_provider, + ) + + with patch.object(builder, "_check_resume_config_compatibility", return_value=_ConfigCompatibility.INCOMPATIBLE): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_run_mcp_tool_check_if_needed"): + with patch.object(builder, "_write_builder_config"): + with patch.object(builder, "_initialize_generators_and_graph", return_value=([], None)): + with patch.object(builder.batch_manager, "start"): + with patch.object(builder.batch_manager, "finish"): + with patch.object(builder._processor_runner, "run_after_generation"): + builder.build(num_records=2, resume=ResumeMode.IF_POSSIBLE) + + new_media_base = storage.media_storage.base_path + assert new_media_base != original_media_base, ( + "media_storage.base_path must be updated to the new timestamped directory after IF_POSSIBLE → NEVER downgrade" + ) + assert new_media_base == storage.base_dataset_path, ( + "media_storage.base_path must match base_dataset_path after downgrade" + ) + + def test_if_possible_starts_fresh_when_no_existing_directory( stub_resource_provider, stub_test_config_builder, tmp_path ): From e64ec031efb78a63f6be2a9dd93397303f608b1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Fri, 8 May 2026 13:04:25 +0200 Subject: [PATCH 29/29] fix(engine): preserve 
original_target_num_records across extension resume writes After finalize_row_group successfully wrote incremental metadata during an extension run, target_num_records in metadata was updated to the extension target. A subsequent resume would read this as the original target, making _rg_size() incorrect for all row groups and silently corrupting actual_num_records. Stores original_target_num_records as an immutable field in metadata so the original group boundaries are always recoverable regardless of how many incremental writes have occurred. --- .../dataset_builders/dataset_builder.py | 20 +++++-- .../utils/dataset_batch_manager.py | 7 ++- .../utils/row_group_buffer.py | 5 +- .../dataset_builders/test_dataset_builder.py | 55 +++++++++++++++++++ 4 files changed, 81 insertions(+), 6 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 06ecdf4ab..09954b9b6 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -111,6 +111,7 @@ class _ResumeState: actual_num_records: int buffer_size: int target_num_records: int + original_target_num_records: int class DatasetBuilder: @@ -378,6 +379,7 @@ def _load_resume_state(self, num_records: int, buffer_size: int) -> _ResumeState actual_num_records=actual_num_records, buffer_size=buffer_size, target_num_records=metadata["target_num_records"], + original_target_num_records=metadata.get("original_target_num_records", metadata["target_num_records"]), ) def _build_with_resume( @@ -398,7 +400,7 @@ def _build_with_resume( # Compute the correct per-batch sizes. ceil(num_records/bs) is wrong for a # non-aligned extension: original groups are immutable, so any extension always # adds new groups beyond num_original_batches. 
- original_target = state.target_num_records + original_target = state.original_target_num_records num_original_batches = -(-original_target // buffer_size) extension_records = num_records - original_target num_extension_batches = -(-extension_records // buffer_size) @@ -411,6 +413,7 @@ def _build_with_resume( start_batch=state.num_completed_batches, initial_actual_num_records=state.actual_num_records, num_records_list=original_sizes + extension_sizes, + original_target_num_records=original_target, ) if state.num_completed_batches >= self.batch_manager.num_batches: @@ -610,6 +613,7 @@ def _build_async( precomputed_row_groups: list[tuple[int, int]] | None = None initial_actual_num_records = 0 initial_total_num_batches = 0 + original_target = num_records # immutable original target; overridden on resume if resume == ResumeMode.ALWAYS: state = self._load_resume_state(num_records, buffer_size) @@ -620,7 +624,7 @@ def _build_async( # Use the original target (not the new num_records) so the last row group of a # non-aligned run gets its true size, not buffer_size. initial_total_num_batches = len(completed_ids) - original_target = state.target_num_records + original_target = state.original_target_num_records num_original_groups = -(-original_target // buffer_size) # ceil(original_target/buffer_size) @@ -664,7 +668,11 @@ def on_complete(final_path: Path | str | None) -> None: buffer_manager.checkpoint_row_group(rg_id, on_complete=on_complete) # Write incremental metadata after each row group so interrupted runs can be resumed. 
- buffer_manager.write_metadata(target_num_records=num_records, buffer_size=buffer_size) + buffer_manager.write_metadata( + target_num_records=num_records, + original_target_num_records=original_target, + buffer_size=buffer_size, + ) scheduler, buffer_manager = self._prepare_async_run( generators, @@ -707,7 +715,11 @@ def on_complete(final_path: Path | str | None) -> None: logger.debug("Failed to emit batch telemetry for async run", exc_info=True) # Write final metadata (overwrites the last incremental write with identical content). - buffer_manager.write_metadata(target_num_records=num_records, buffer_size=buffer_size) + buffer_manager.write_metadata( + target_num_records=num_records, + original_target_num_records=original_target, + buffer_size=buffer_size, + ) # Surface partial completion actual = self._actual_num_records diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py index 1811cd781..f2bc39cdd 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py @@ -26,6 +26,7 @@ def __init__(self, artifact_storage: ArtifactStorage): self._num_records_list: list[int] | None = None self._buffer_size: int | None = None self._actual_num_records: int = 0 + self._original_target_num_records: int | None = None self.artifact_storage = artifact_storage @property @@ -87,9 +88,11 @@ def finish_batch(self, on_complete: Callable[[Path], None] | None = None) -> Pat self._actual_num_records += len(self._buffer) final_file_path = self.artifact_storage.move_partial_result_to_final_file_path(self._current_batch_number) + target = sum(self.num_records_list) self.artifact_storage.write_metadata( { - "target_num_records": sum(self.num_records_list), + 
"target_num_records": target, + "original_target_num_records": self._original_target_num_records or target, "actual_num_records": self._actual_num_records, "total_num_batches": self.num_batches, "buffer_size": self._buffer_size, @@ -166,6 +169,7 @@ def start( start_batch: int = 0, initial_actual_num_records: int = 0, num_records_list: list[int] | None = None, + original_target_num_records: int | None = None, ) -> None: if num_records <= 0: raise DatasetBatchManagementError("🛑 num_records must be positive.") @@ -173,6 +177,7 @@ def start( raise DatasetBatchManagementError("🛑 buffer_size must be positive.") self._buffer_size = buffer_size + self._original_target_num_records = original_target_num_records if num_records_list is not None: self._num_records_list = list(num_records_list) else: diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/row_group_buffer.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/row_group_buffer.py index 18cac9d0c..b4b28e9aa 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/row_group_buffer.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/row_group_buffer.py @@ -134,11 +134,14 @@ def checkpoint_row_group( self.free_row_group(row_group) - def write_metadata(self, target_num_records: int, buffer_size: int) -> None: + def write_metadata( + self, target_num_records: int, buffer_size: int, original_target_num_records: int | None = None + ) -> None: """Write final metadata after all row groups are checkpointed.""" self._artifact_storage.write_metadata( { "target_num_records": target_num_records, + "original_target_num_records": original_target_num_records or target_num_records, "actual_num_records": self._actual_num_records, "total_num_batches": self._total_num_batches, "buffer_size": buffer_size, diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py 
b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py index e69893cba..efb7d87f6 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py @@ -1879,6 +1879,61 @@ def capturing_prepare(*args, **kwargs): assert captured["initial_actual_num_records"] == 7 +def test_build_async_resume_stale_original_target_after_incremental_metadata_write( + stub_resource_provider, stub_test_config_builder, tmp_path +): + """original_target_num_records stays immutable even after an incremental metadata write. + + Scenario: original run had num_records=5, buffer_size=2 (row groups [2,2,1], all done). + Extension to num_records=9 starts; row group 3 (2 records) completes and finalize_row_group + writes metadata with target_num_records=9. Crash before row group 4. + + On second resume, metadata now shows target_num_records=9. Without the fix, original_target + would be read as 9, making num_original_groups=5 and producing wrong _rg_size values. + With the fix, original_target_num_records=5 is preserved in metadata, giving the correct + initial_actual_num_records=7 (2+2+1 original + 2 extension). 
+ """ + import asyncio as stdlib_asyncio + + dataset_dir = tmp_path / "dataset" + # Metadata reflects a post-incremental-write state: target updated to 9, original still 5 + _write_metadata( + dataset_dir, + target_num_records=9, + original_target_num_records=5, + buffer_size=2, + num_completed_batches=4, + actual_num_records=7, + ) + _write_parquet_files(dataset_dir / "parquet-files", [0, 1, 2, 3]) + + builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, buffer_size=2) + + captured: dict = {} + + def capturing_prepare(*args, **kwargs): + captured["initial_actual_num_records"] = kwargs.get("initial_actual_num_records", 0) + mock_scheduler = Mock() + mock_scheduler.traces = [] + mock_buffer_manager = Mock() + mock_buffer_manager.actual_num_records = 9 + return mock_scheduler, mock_buffer_manager + + mock_future = Mock() + mock_future.result = Mock(return_value=None) + + with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True): + with patch.object(builder_mod, "asyncio", stdlib_asyncio, create=True): + with patch.object(builder_mod, "ensure_async_engine_loop", Mock(return_value=Mock()), create=True): + with patch.object(stdlib_asyncio, "run_coroutine_threadsafe", return_value=mock_future): + with patch.object(builder, "_run_model_health_check_if_needed"): + with patch.object(builder, "_prepare_async_run", side_effect=capturing_prepare): + builder.build(num_records=9, resume=ResumeMode.ALWAYS) + + # original_target=5 → groups 0,1 → 2+2; group 2 → 1; group 3 (ext) → min(2,9-6)=2. Total=7 + assert captured["initial_actual_num_records"] == 7 + + def test_build_async_resume_skip_row_groups_contains_completed_ids( stub_resource_provider, stub_test_config_builder, tmp_path ):