From f3eb730739c7d44478cd8e86c81b05a7181bdd1a Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 7 May 2026 15:26:44 +0100 Subject: [PATCH] feat(retrieval-sdg): expose recipe entry point Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- plugins/data-designer-retrieval-sdg/README.md | 18 ++ .../pyproject.toml | 3 + .../src/data_designer_retrieval_sdg/config.py | 3 +- .../src/data_designer_retrieval_sdg/recipe.py | 170 ++++++++++++++++++ .../tests/test_recipe.py | 61 +++++++ 5 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/recipe.py create mode 100644 plugins/data-designer-retrieval-sdg/tests/test_recipe.py diff --git a/plugins/data-designer-retrieval-sdg/README.md b/plugins/data-designer-retrieval-sdg/README.md index 426f39d..024f12c 100644 --- a/plugins/data-designer-retrieval-sdg/README.md +++ b/plugins/data-designer-retrieval-sdg/README.md @@ -64,6 +64,24 @@ data-designer-retrieval-sdg generate \ --num-pairs 7 ``` +### Preview through Data Designer recipes + +When installed with a Data Designer CLI that supports recipe entry +points, this package also registers `retrieval-sdg` under +`data_designer.recipes`: + +```bash +data-designer preview --recipe retrieval-sdg -- \ + --input-dir ./my_documents \ + --num-pairs 2 +``` + +Recipe-specific options can be inspected without running generation: + +```bash +data-designer recipes help retrieval-sdg +``` + ### Convert to training format ```bash diff --git a/plugins/data-designer-retrieval-sdg/pyproject.toml b/plugins/data-designer-retrieval-sdg/pyproject.toml index bc57a50..566595c 100644 --- a/plugins/data-designer-retrieval-sdg/pyproject.toml +++ b/plugins/data-designer-retrieval-sdg/pyproject.toml @@ -27,6 +27,9 @@ classifiers = [ embedding-dedup = "data_designer_retrieval_sdg.plugins:embedding_dedup_plugin" document-chunker = "data_designer_retrieval_sdg.plugins:document_chunker_plugin" +[project.entry-points."data_designer.recipes"] +retrieval-sdg = "data_designer_retrieval_sdg.recipe:load_config_builder" + [project.scripts] data-designer-retrieval-sdg = "data_designer_retrieval_sdg.cli:main" diff --git a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/config.py b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/config.py index 30ac3c3..593915f 100644 --- a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/config.py +++ b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/config.py @@ -54,6 +54,7 @@ def side_effect_columns(self) -> list[str]: """Additional columns produced as side effects.""" return [] - def get_column_emoji(self) -> str: + @staticmethod + def get_column_emoji() -> str: """Emoji displayed in logs for this column type.""" return "🔍" diff --git a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/recipe.py b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/recipe.py new file mode 100644 index 0000000..53ff86d --- /dev/null +++ b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/recipe.py @@ -0,0 +1,170 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Typer-backed Data Designer recipe entry point for retrieval SDG.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import click +import data_designer.config as dd +import typer + +from data_designer_retrieval_sdg.pipeline import ( + DEFAULT_CHAT_MODEL, + DEFAULT_EMBED_MODEL, + DEFAULT_PROVIDER, + build_qa_generation_pipeline, +) +from data_designer_retrieval_sdg.seed_source import DocumentChunkerSeedSource + + +def load_config_builder(params: dd.DataDesignerScriptParams | None = None) -> dd.DataDesignerConfigBuilder: + """Build the retrieval SDG pipeline from forwarded Data Designer CLI args. + + Args: + params: Data Designer script parameters. ``params.argv`` contains the + arguments supplied after ``data-designer preview/create --recipe + retrieval-sdg --``. + + Returns: + A configured Data Designer config builder for retrieval SDG generation. + """ + argv = list(tuple(getattr(params, "argv", ()))) + command = typer.main.get_command(build_typer_app()) + config_builder = command.main( + args=argv, + prog_name="data-designer preview/create --recipe retrieval-sdg --", + standalone_mode=False, + ) + + if config_builder == 0 and any(arg in {"--help", "-h"} for arg in argv): + raise SystemExit(0) + if not isinstance(config_builder, dd.DataDesignerConfigBuilder): + raise TypeError(f"Recipe returned {type(config_builder).__name__}, expected DataDesignerConfigBuilder") + return config_builder + + +def build_typer_app() -> typer.Typer: + """Build the Typer app used for recipe inspection and execution. + + Returns: + Typer app describing the retrieval SDG recipe interface. + """ + app = typer.Typer(add_completion=False, help="Build the retrieval SDG Data Designer workflow.") + app.command(name=None, help="Build the retrieval SDG Data Designer workflow.")(recipe_command) + return app + + +def recipe_command( + input_dir: Annotated[Path, typer.Option("--input-dir", help="Directory containing text files")], + file_pattern: Annotated[str, typer.Option("--file-pattern", help="Filename glob (basenames only)")] = "*", + recursive: Annotated[ + bool, + typer.Option("--recursive/--no-recursive", help="Enable recursive search"), + ] = True, + file_extensions: Annotated[ + list[str] | None, + typer.Option( + "--file-extensions", + help="Allowed file extensions (use empty string '' to match files without extensions)", + ), + ] = None, + min_text_length: Annotated[int, typer.Option("--min-text-length", help="Minimum document text length")] = 50, + sentences_per_chunk: Annotated[int, typer.Option("--sentences-per-chunk", help="Sentences per chunk")] = 5, + num_sections: Annotated[int, typer.Option("--num-sections", help="Sections to divide chunks into")] = 1, + num_files: Annotated[int | None, typer.Option("--num-files", help="Max files to process")] = None, + multi_doc: Annotated[bool, typer.Option("--multi-doc", help="Enable multi-doc bundling")] = False, + bundle_size: Annotated[int, typer.Option("--bundle-size", help="Docs per bundle")] = 2, + bundle_strategy: Annotated[ + str, + typer.Option( + "--bundle-strategy", + help="Section splitting strategy", + click_type=click.Choice(["sequential", "doc_balanced", "interleaved"]), + ), + ] = "sequential", + max_docs_per_bundle: Annotated[int, typer.Option("--max-docs-per-bundle", help="Max docs per bundle")] = 3, + multi_doc_manifest: Annotated[ + Path | None, typer.Option("--multi-doc-manifest", help="Manifest for explicit bundles") + ] = None, + start_index: Annotated[int, typer.Option("--start-index", help="Start seed row index")] = 0, + end_index: Annotated[int, typer.Option("--end-index", help="End seed row index")] = 199, + max_artifacts_per_type: Annotated[int, typer.Option("--max-artifacts-per-type", help="Max artifacts per type")] = 2, + num_pairs: Annotated[int, typer.Option("--num-pairs", help="QA pairs per document")] = 7, + min_hops: Annotated[int, typer.Option("--min-hops", help="Min hops for multi-hop questions")] = 2, + max_hops: Annotated[int, typer.Option("--max-hops", help="Max hops for multi-hop questions")] = 4, + min_complexity: Annotated[int, typer.Option("--min-complexity", help="Min question complexity")] = 4, + similarity_threshold: Annotated[ + float, typer.Option("--similarity-threshold", help="Cosine threshold for QA-pair dedup") + ] = 0.9, + artifact_extraction_model: Annotated[ + str, typer.Option("--artifact-extraction-model", help="Artifact extraction model") + ] = DEFAULT_CHAT_MODEL, + artifact_extraction_provider: Annotated[ + str, typer.Option("--artifact-extraction-provider", help="Artifact extraction provider") + ] = DEFAULT_PROVIDER, + qa_generation_model: Annotated[str, typer.Option("--qa-generation-model", help="QA generation model")] = ( + DEFAULT_CHAT_MODEL + ), + qa_generation_provider: Annotated[str, typer.Option("--qa-generation-provider", help="QA generation provider")] = ( + DEFAULT_PROVIDER + ), + quality_judge_model: Annotated[str, typer.Option("--quality-judge-model", help="Quality judge model")] = ( + DEFAULT_CHAT_MODEL + ), + quality_judge_provider: Annotated[str, typer.Option("--quality-judge-provider", help="Quality judge provider")] = ( + DEFAULT_PROVIDER + ), + embed_model: Annotated[str, typer.Option("--embed-model", help="Embedding model")] = DEFAULT_EMBED_MODEL, + embed_provider: Annotated[str, typer.Option("--embed-provider", help="Embedding provider")] = DEFAULT_PROVIDER, + max_parallel_requests_for_gen: Annotated[ + int | None, typer.Option("--max-parallel-requests-for-gen", help="Max parallel generation requests") + ] = None, +) -> dd.DataDesignerConfigBuilder: + """Build the retrieval SDG Data Designer workflow. + + Returns: + A configured Data Designer config builder. + """ + if end_index < start_index: + raise click.BadParameter("--end-index must be greater than or equal to --start-index") + + seed_source = DocumentChunkerSeedSource( + path=str(input_dir), + file_pattern=file_pattern, + recursive=recursive, + file_extensions=file_extensions or [".txt", ".md", ".text"], + min_text_length=min_text_length, + sentences_per_chunk=sentences_per_chunk, + num_sections=num_sections, + num_files=num_files, + multi_doc=multi_doc, + bundle_size=bundle_size, + bundle_strategy=bundle_strategy, + max_docs_per_bundle=max_docs_per_bundle, + multi_doc_manifest=str(multi_doc_manifest) if multi_doc_manifest else None, + ) + + return build_qa_generation_pipeline( + seed_source=seed_source, + start_index=start_index, + end_index=end_index, + max_artifacts_per_type=max_artifacts_per_type, + num_pairs=num_pairs, + min_hops=min_hops, + max_hops=max_hops, + min_complexity=min_complexity, + similarity_threshold=similarity_threshold, + max_parallel_requests_for_gen=max_parallel_requests_for_gen, + artifact_extraction_model=artifact_extraction_model, + artifact_extraction_provider=artifact_extraction_provider, + qa_generation_model=qa_generation_model, + qa_generation_provider=qa_generation_provider, + quality_judge_model=quality_judge_model, + quality_judge_provider=quality_judge_provider, + embed_model=embed_model, + embed_provider=embed_provider, + ) diff --git a/plugins/data-designer-retrieval-sdg/tests/test_recipe.py b/plugins/data-designer-retrieval-sdg/tests/test_recipe.py new file mode 100644 index 0000000..743ec79 --- /dev/null +++ b/plugins/data-designer-retrieval-sdg/tests/test_recipe.py @@ -0,0 +1,61 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace + +import typer +from click.testing import CliRunner +from data_designer.config.config_builder import DataDesignerConfigBuilder + +from data_designer_retrieval_sdg.recipe import build_typer_app, load_config_builder +from data_designer_retrieval_sdg.seed_source import DocumentChunkerSeedSource + + +def test_load_config_builder_builds_retrieval_sdg_pipeline(tmp_path: Path) -> None: + """The recipe entry point builds the retrieval SDG workflow from argv.""" + params = SimpleNamespace( + argv=( + "--input-dir", + str(tmp_path), + "--num-pairs", + "2", + "--start-index", + "1", + "--end-index", + "4", + "--file-extensions", + ".txt", + ) + ) + + builder = load_config_builder(params) + + assert isinstance(builder, DataDesignerConfigBuilder) + seed_config = builder.get_seed_config() + assert seed_config is not None + assert isinstance(seed_config.source, DocumentChunkerSeedSource) + assert seed_config.source.path == str(tmp_path) + assert seed_config.source.file_extensions == [".txt"] + assert seed_config.selection_strategy is not None + assert seed_config.selection_strategy.start == 1 + assert seed_config.selection_strategy.end == 4 + assert [column.name for column in builder.get_column_configs()] == [ + "document_artifacts", + "qa_generation", + "deduplicated_qa_pairs", + "qa_evaluations", + ] + + +def test_build_typer_app_exposes_recipe_help() -> None: + """The recipe exposes Typer metadata for Data Designer inspection.""" + command = typer.main.get_command(build_typer_app()) + result = CliRunner().invoke(command, ["--help"]) + + assert result.exit_code == 0 + assert "Build the retrieval SDG Data Designer workflow." in result.output + assert "--input-dir" in result.output + assert "--num-pairs" in result.output