Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions plugins/data-designer-retrieval-sdg/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,24 @@ data-designer-retrieval-sdg generate \
--num-pairs 7
```

### Preview through Data Designer recipes

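This package registers a `retrieval-sdg` recipe under the
`data_designer.recipes` entry-point group. With a Data Designer CLI that
supports recipe entry points, you can preview the pipeline directly;
arguments after `--` are forwarded to the recipe: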

```bash
data-designer preview --recipe retrieval-sdg -- \
--input-dir ./my_documents \
--num-pairs 2
```

Recipe-specific options can be inspected without running generation:

```bash
data-designer recipes help retrieval-sdg
```

### Convert to training format

```bash
Expand Down
3 changes: 3 additions & 0 deletions plugins/data-designer-retrieval-sdg/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ classifiers = [
embedding-dedup = "data_designer_retrieval_sdg.plugins:embedding_dedup_plugin"
document-chunker = "data_designer_retrieval_sdg.plugins:document_chunker_plugin"

[project.entry-points."data_designer.recipes"]
retrieval-sdg = "data_designer_retrieval_sdg.recipe:load_config_builder"

[project.scripts]
data-designer-retrieval-sdg = "data_designer_retrieval_sdg.cli:main"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def side_effect_columns(self) -> list[str]:
"""Additional columns produced as side effects."""
return []

def get_column_emoji(self) -> str:
@staticmethod
def get_column_emoji() -> str:
"""Emoji displayed in logs for this column type."""
return "🔍"
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Typer-backed Data Designer recipe entry point for retrieval SDG."""

from __future__ import annotations

from pathlib import Path
from typing import Annotated

import click
import data_designer.config as dd
import typer

from data_designer_retrieval_sdg.pipeline import (
DEFAULT_CHAT_MODEL,
DEFAULT_EMBED_MODEL,
DEFAULT_PROVIDER,
build_qa_generation_pipeline,
)
from data_designer_retrieval_sdg.seed_source import DocumentChunkerSeedSource


def load_config_builder(params: dd.DataDesignerScriptParams | None = None) -> dd.DataDesignerConfigBuilder:
    """Build the retrieval SDG pipeline from forwarded Data Designer CLI args.

    Args:
        params: Data Designer script parameters. ``params.argv`` contains the
            arguments supplied after ``data-designer preview/create --recipe
            retrieval-sdg --``. A missing ``params``, a missing ``argv``
            attribute, or ``argv=None`` are all treated as "no arguments".

    Returns:
        A configured Data Designer config builder for retrieval SDG generation.

    Raises:
        SystemExit: With code 0 when help was requested and the command
            returned a zero exit code instead of a builder.
        TypeError: If the command returns anything other than a
            ``DataDesignerConfigBuilder``.
    """
    # Read argv defensively: ``params`` itself may be None, and an explicit
    # ``argv=None`` must not blow up with "NoneType is not iterable".
    forwarded = getattr(params, "argv", None)
    argv = [] if forwarded is None else list(forwarded)

    command = typer.main.get_command(build_typer_app())
    # standalone_mode=False makes the command hand back the callback's return
    # value (or an exit code) instead of terminating the process itself.
    config_builder = command.main(
        args=argv,
        prog_name="data-designer preview/create --recipe retrieval-sdg --",
        standalone_mode=False,
    )

    # A zero result together with a help flag means help text was printed and
    # no builder was produced; surface that as a clean exit.
    if config_builder == 0 and any(arg in {"--help", "-h"} for arg in argv):
        raise SystemExit(0)
    if not isinstance(config_builder, dd.DataDesignerConfigBuilder):
        raise TypeError(f"Recipe returned {type(config_builder).__name__}, expected DataDesignerConfigBuilder")
    return config_builder


def build_typer_app() -> typer.Typer:
    """Create the Typer app that exposes the retrieval SDG recipe.

    The app has completion disabled and registers ``recipe_command`` as its
    command, using the same help text for the app and the command.

    Returns:
        Typer app describing the retrieval SDG recipe interface.
    """
    description = "Build the retrieval SDG Data Designer workflow."
    recipe_app = typer.Typer(add_completion=False, help=description)
    register = recipe_app.command(name=None, help=description)
    register(recipe_command)
    return recipe_app


def recipe_command(
    input_dir: Annotated[Path, typer.Option("--input-dir", help="Directory containing text files")],
    file_pattern: Annotated[str, typer.Option("--file-pattern", help="Filename glob (basenames only)")] = "*",
    recursive: Annotated[
        bool,
        typer.Option("--recursive/--no-recursive", help="Enable recursive search"),
    ] = True,
    file_extensions: Annotated[
        list[str] | None,
        typer.Option(
            "--file-extensions",
            help="Allowed file extensions (use empty string '' to match files without extensions)",
        ),
    ] = None,
    min_text_length: Annotated[int, typer.Option("--min-text-length", help="Minimum document text length")] = 50,
    sentences_per_chunk: Annotated[int, typer.Option("--sentences-per-chunk", help="Sentences per chunk")] = 5,
    num_sections: Annotated[int, typer.Option("--num-sections", help="Sections to divide chunks into")] = 1,
    num_files: Annotated[int | None, typer.Option("--num-files", help="Max files to process")] = None,
    multi_doc: Annotated[bool, typer.Option("--multi-doc", help="Enable multi-doc bundling")] = False,
    bundle_size: Annotated[int, typer.Option("--bundle-size", help="Docs per bundle")] = 2,
    bundle_strategy: Annotated[
        str,
        typer.Option(
            "--bundle-strategy",
            help="Section splitting strategy",
            click_type=click.Choice(["sequential", "doc_balanced", "interleaved"]),
        ),
    ] = "sequential",
    max_docs_per_bundle: Annotated[int, typer.Option("--max-docs-per-bundle", help="Max docs per bundle")] = 3,
    multi_doc_manifest: Annotated[
        Path | None, typer.Option("--multi-doc-manifest", help="Manifest for explicit bundles")
    ] = None,
    start_index: Annotated[int, typer.Option("--start-index", help="Start seed row index")] = 0,
    end_index: Annotated[int, typer.Option("--end-index", help="End seed row index")] = 199,
    max_artifacts_per_type: Annotated[int, typer.Option("--max-artifacts-per-type", help="Max artifacts per type")] = 2,
    num_pairs: Annotated[int, typer.Option("--num-pairs", help="QA pairs per document")] = 7,
    min_hops: Annotated[int, typer.Option("--min-hops", help="Min hops for multi-hop questions")] = 2,
    max_hops: Annotated[int, typer.Option("--max-hops", help="Max hops for multi-hop questions")] = 4,
    min_complexity: Annotated[int, typer.Option("--min-complexity", help="Min question complexity")] = 4,
    similarity_threshold: Annotated[
        float, typer.Option("--similarity-threshold", help="Cosine threshold for QA-pair dedup")
    ] = 0.9,
    artifact_extraction_model: Annotated[
        str, typer.Option("--artifact-extraction-model", help="Artifact extraction model")
    ] = DEFAULT_CHAT_MODEL,
    artifact_extraction_provider: Annotated[
        str, typer.Option("--artifact-extraction-provider", help="Artifact extraction provider")
    ] = DEFAULT_PROVIDER,
    qa_generation_model: Annotated[str, typer.Option("--qa-generation-model", help="QA generation model")] = (
        DEFAULT_CHAT_MODEL
    ),
    qa_generation_provider: Annotated[str, typer.Option("--qa-generation-provider", help="QA generation provider")] = (
        DEFAULT_PROVIDER
    ),
    quality_judge_model: Annotated[str, typer.Option("--quality-judge-model", help="Quality judge model")] = (
        DEFAULT_CHAT_MODEL
    ),
    quality_judge_provider: Annotated[str, typer.Option("--quality-judge-provider", help="Quality judge provider")] = (
        DEFAULT_PROVIDER
    ),
    embed_model: Annotated[str, typer.Option("--embed-model", help="Embedding model")] = DEFAULT_EMBED_MODEL,
    embed_provider: Annotated[str, typer.Option("--embed-provider", help="Embedding provider")] = DEFAULT_PROVIDER,
    max_parallel_requests_for_gen: Annotated[
        int | None, typer.Option("--max-parallel-requests-for-gen", help="Max parallel generation requests")
    ] = None,
) -> dd.DataDesignerConfigBuilder:
    """Turn the recipe's CLI options into a retrieval SDG config builder.

    The document-related options are used to construct a
    ``DocumentChunkerSeedSource``; that seed source and the generation-related
    options are then handed to ``build_qa_generation_pipeline``.

    Returns:
        A configured Data Designer config builder.

    Raises:
        click.BadParameter: If ``--end-index`` is smaller than ``--start-index``.
    """
    # Reject an inverted seed-row window before building anything.
    if start_index > end_index:
        raise click.BadParameter("--end-index must be greater than or equal to --start-index")

    # No --file-extensions given: fall back to the default text extensions.
    allowed_extensions = file_extensions or [".txt", ".md", ".text"]
    # The seed source receives plain strings rather than Path objects.
    manifest = str(multi_doc_manifest) if multi_doc_manifest else None

    chunker_options = {
        "path": str(input_dir),
        "file_pattern": file_pattern,
        "recursive": recursive,
        "file_extensions": allowed_extensions,
        "min_text_length": min_text_length,
        "sentences_per_chunk": sentences_per_chunk,
        "num_sections": num_sections,
        "num_files": num_files,
        "multi_doc": multi_doc,
        "bundle_size": bundle_size,
        "bundle_strategy": bundle_strategy,
        "max_docs_per_bundle": max_docs_per_bundle,
        "multi_doc_manifest": manifest,
    }
    seed_source = DocumentChunkerSeedSource(**chunker_options)

    # Model/provider pairs, one per pipeline stage.
    model_options = {
        "artifact_extraction_model": artifact_extraction_model,
        "artifact_extraction_provider": artifact_extraction_provider,
        "qa_generation_model": qa_generation_model,
        "qa_generation_provider": qa_generation_provider,
        "quality_judge_model": quality_judge_model,
        "quality_judge_provider": quality_judge_provider,
        "embed_model": embed_model,
        "embed_provider": embed_provider,
    }

    return build_qa_generation_pipeline(
        seed_source=seed_source,
        start_index=start_index,
        end_index=end_index,
        max_artifacts_per_type=max_artifacts_per_type,
        num_pairs=num_pairs,
        min_hops=min_hops,
        max_hops=max_hops,
        min_complexity=min_complexity,
        similarity_threshold=similarity_threshold,
        max_parallel_requests_for_gen=max_parallel_requests_for_gen,
        **model_options,
    )
61 changes: 61 additions & 0 deletions plugins/data-designer-retrieval-sdg/tests/test_recipe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from pathlib import Path
from types import SimpleNamespace

import typer
from click.testing import CliRunner
from data_designer.config.config_builder import DataDesignerConfigBuilder

from data_designer_retrieval_sdg.recipe import build_typer_app, load_config_builder
from data_designer_retrieval_sdg.seed_source import DocumentChunkerSeedSource


def test_load_config_builder_builds_retrieval_sdg_pipeline(tmp_path: Path) -> None:
    """Forwarded argv is parsed into a seed source, row selection, and pipeline columns."""
    seed_dir = str(tmp_path)
    forwarded_args = (
        "--input-dir",
        seed_dir,
        "--num-pairs",
        "2",
        "--start-index",
        "1",
        "--end-index",
        "4",
        "--file-extensions",
        ".txt",
    )

    config_builder = load_config_builder(SimpleNamespace(argv=forwarded_args))

    assert isinstance(config_builder, DataDesignerConfigBuilder)

    # Seed source reflects the document options passed on the command line.
    seed = config_builder.get_seed_config()
    assert seed is not None
    assert isinstance(seed.source, DocumentChunkerSeedSource)
    assert seed.source.path == seed_dir
    assert seed.source.file_extensions == [".txt"]

    # Row window comes from --start-index / --end-index.
    assert seed.selection_strategy is not None
    assert seed.selection_strategy.start == 1
    assert seed.selection_strategy.end == 4

    # Pipeline columns appear in generation order.
    column_names = [column.name for column in config_builder.get_column_configs()]
    assert column_names == [
        "document_artifacts",
        "qa_generation",
        "deduplicated_qa_pairs",
        "qa_evaluations",
    ]


def test_build_typer_app_exposes_recipe_help() -> None:
    """``--help`` on the recipe command exits cleanly and lists its description and options."""
    recipe_cli = typer.main.get_command(build_typer_app())
    runner = CliRunner()

    outcome = runner.invoke(recipe_cli, ["--help"])

    assert outcome.exit_code == 0
    help_text = outcome.output
    assert "Build the retrieval SDG Data Designer workflow." in help_text
    assert "--input-dir" in help_text
    assert "--num-pairs" in help_text
Loading