NVIDIA-NeMo · shan-nvidia · Apr 29, 2026 · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026
@@ -7,4 +7,5 @@
 /.github/ @NVIDIA-NeMo/data_designer_reviewers
 
 # Plugins
+/plugins/data-designer-retrieval-sdg/ @NVIDIA-NeMo/data_designer_reviewers @shan-nvidia @oliverholworthy
 /plugins/data-designer-template/ @NVIDIA-NeMo/data_designer_reviewers
@@ -35,3 +35,6 @@ htmlcov/
 
 # Distribution
 *.tar.gz
+
+# CI artifacts
+*artifacts/
@@ -4,4 +4,6 @@ Auto-generated from plugin metadata. Do not edit manually.
 
 | Plugin | Version | Column Type | Description |
 |--------|---------|-------------|-------------|
+| data-designer-retrieval-sdg | 0.1.0 | `document-chunker` | Retriever SDG toolkit: registers the embedding-dedup column generator and document-chunker seed reader, plus a multi-step QA generation pipeline, CLI, and Automodel-compatible data conversion |
+| data-designer-retrieval-sdg | 0.1.0 | `embedding-dedup` | Retriever SDG toolkit: registers the embedding-dedup column generator and document-chunker seed reader, plus a multi-step QA generation pipeline, CLI, and Automodel-compatible data conversion |
 | data-designer-template | 0.1.0 | `text-transform` | Template Data Designer plugin — text transform column generator |
@@ -0,0 +1,3 @@
+# Owner(s) of this plugin — used to generate the root CODEOWNERS file.
+# GitHub accepts @username, @org/team, or email format.
+* @NVIDIA-NeMo/data_designer_reviewers @shan-nvidia @oliverholworthy
@@ -0,0 +1,125 @@
+# data-designer-retrieval-sdg
+
+Data Designer toolkit for **retriever synthetic data generation**. The
+package registers two `data_designer.plugins` entry points, ships a
+ready-made multi-step QA generation pipeline, and exposes a CLI that
+generates QA pairs and converts them into training formats compatible
+with [Automodel](https://github.com/NVIDIA-NeMo/Automodel) retriever
+finetuning.
+
+## Plugins
+
+The single PyPI package contributes two plugins to Data Designer's
+registries via `[project.entry-points."data_designer.plugins"]`:
+
+| Slug | Type | Purpose |
+|------|------|---------|
+| `embedding-dedup` | column generator | Generic cosine-similarity dedup of any list-valued column. Implements native `agenerate()` for the async engine. |
+| `document-chunker` | seed reader | Sentence-chunks a directory of text files and emits structured sections, with optional multi-document bundling. |
+
+Both are installed by the same `pip install data-designer-retrieval-sdg` and
+are discovered automatically through Python entry points.
+
+## Native async (`DATA_DESIGNER_ASYNC_ENGINE=1`)
+
+`embedding-dedup` implements `agenerate()` directly on top of
+`model.agenerate_text_embeddings`, so the column participates in
+Data Designer's async cell-level scheduler whenever the environment variable is set:
+
+```bash
+export DATA_DESIGNER_ASYNC_ENGINE=1
+data-designer-retrieval-sdg generate ...
+```
+
+The async engine requires Python 3.11+; without the env var the package
+runs on Python 3.10+ via the framework's sync bridge.
+
+## Installation
+
+```bash
+pip install data-designer-retrieval-sdg
+```
+
+For development inside the monorepo:
+
+```bash
+make sync                     # install all packages into .venv
+source .venv/bin/activate     # activate the virtual environment
+```
+
+Or prefix any command with `uv run`:
+
+```bash
+uv run data-designer-retrieval-sdg generate --help
+```
+
+## Quick start
+
+### Generate QA pairs
+
+```bash
+data-designer-retrieval-sdg generate \
+    --input-dir ./my_documents \
+    --output-dir ./generated_output \
+    --num-pairs 7
+```
+
+### Convert to training format
+
+```bash
+data-designer-retrieval-sdg convert ./generated_output \
+    --corpus-id my_corpus
+```
+
+### Use as a library
+
+```python
+from data_designer_retrieval_sdg import (
+    DocumentChunkerSeedSource,
+    build_qa_generation_pipeline,
+)
+
+seed_source = DocumentChunkerSeedSource(
+    path="./docs",
+    file_extensions=[".txt", ".md"],
+)
+config_builder = build_qa_generation_pipeline(seed_source)
+```
+
+## Plugin configuration examples
+
+### `embedding-dedup` column
+
+```python
+from data_designer_retrieval_sdg.config import EmbeddingDedupColumnConfig
+
+config_builder.add_column(
+    EmbeddingDedupColumnConfig(
+        name="deduplicated_qa_pairs",
+        source_column="qa_generation",   # upstream column with the items
+        items_key="pairs",               # key under the source column (pass None if the column is already a list)
+        text_field="question",           # field on each item to embed
+        model_alias="embed",             # registered embedding model alias
+        similarity_threshold=0.9,
+    )
+)
+```
+
+### `document-chunker` seed reader
+
+```python
+from data_designer_retrieval_sdg.seed_source import DocumentChunkerSeedSource
+
+seed_source = DocumentChunkerSeedSource(
+    path="./docs",
+    file_pattern="*",
+    recursive=True,
+    file_extensions=[".txt", ".md"],
+    sentences_per_chunk=5,
+    num_sections=1,
+    multi_doc=False,                # set True for bundle-per-row mode
+)
+```
+
+Output schema (one record per row): `file_name`, `text`, `chunks`,
+`sections_structured`, `bundle_id`, `bundle_members`, `is_multi_doc`.
@@ -0,0 +1,44 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+[project]
+name = "data-designer-retrieval-sdg"
+version = "0.1.0"
+description = "Retriever SDG toolkit: registers the embedding-dedup column generator and document-chunker seed reader, plus a multi-step QA generation pipeline, CLI, and Automodel-compatible data conversion"
+requires-python = ">=3.10"
+dependencies = [
+    "data-designer>=0.5.7",
+    "nltk>=3.9.2",
+    "pyyaml>=6.0",
+    "pyarrow>=14.0",
+]
+license = "Apache-2.0"
+readme = "README.md"
+authors = [
+    {name = "NVIDIA Corporation"},
+]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Programming Language :: Python :: 3",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+
+[project.entry-points."data_designer.plugins"]
+embedding-dedup = "data_designer_retrieval_sdg.plugins:embedding_dedup_plugin"
+document-chunker = "data_designer_retrieval_sdg.plugins:document_chunker_plugin"
+
+[project.scripts]
+data-designer-retrieval-sdg = "data_designer_retrieval_sdg.cli:main"
+
+[project.urls]
+Repository = "https://github.com/NVIDIA-NeMo/DataDesignerPlugins"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/data_designer_retrieval_sdg"]
+
+[tool.ruff]
+extend = "../../pyproject.toml"
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Data Designer plugins and pipeline for retriever synthetic data generation.
+
+The package registers two ``data_designer.plugins`` entry points:
+
+- ``embedding-dedup``: generic column generator that deduplicates list items by embedding cosine similarity.
+- ``document-chunker``: filesystem seed reader that loads text files,
+  sentence-chunks them, and emits structured sections.
+
+It also ships a ready-made four-column QA generation pipeline, a CLI for
+running the pipeline end-to-end (``generate``) and exporting to NeMo
+Retriever / BEIR formats (``convert``), and reusable post-processing
+helpers.
+"""
+
+from data_designer_retrieval_sdg.config import EmbeddingDedupColumnConfig
+from data_designer_retrieval_sdg.pipeline import build_qa_generation_pipeline
+from data_designer_retrieval_sdg.postprocess import (
+    filter_qa_pairs_by_quality,
+    load_positive_docs_with_modality,
+    postprocess_retriever_data,
+)
+from data_designer_retrieval_sdg.seed_source import DocumentChunkerSeedSource
+
+__all__ = [
+    "DocumentChunkerSeedSource",
+    "EmbeddingDedupColumnConfig",
+    "build_qa_generation_pipeline",
+    "filter_qa_pairs_by_quality",
+    "load_positive_docs_with_modality",
+    "postprocess_retriever_data",
+]