Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@
/.github/ @NVIDIA-NeMo/data_designer_reviewers

# Plugins
/plugins/data-designer-github/ @eric-tramel
/plugins/data-designer-template/ @NVIDIA-NeMo/data_designer_reviewers
1 change: 1 addition & 0 deletions docs/catalog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ Auto-generated from plugin metadata. Do not edit manually.

| Plugin | Version | Column Type | Description |
|--------|---------|-------------|-------------|
| data-designer-github | 0.1.0 | `github` | GitHub and local git repository seed reader for Data Designer |
| data-designer-template | 0.1.0 | `text-transform` | Template Data Designer plugin — text transform column generator |
3 changes: 3 additions & 0 deletions plugins/data-designer-github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Owner(s) of this plugin — used to generate the root CODEOWNERS file.
# GitHub accepts @username, @org/team, or email format.
* @eric-tramel
51 changes: 51 additions & 0 deletions plugins/data-designer-github/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# data-designer-github

GitHub and local git repository seed reader for
[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner).

## Installation

```bash
pip install data-designer-github
```

## Usage

This plugin provides a `github` seed source. Once installed, the seed reader is
automatically discovered by Data Designer.

```python
from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer.interface.data_designer import DataDesigner
from data_designer_github.config import GitHubSeedSource

builder = DataDesignerConfigBuilder()
builder.with_seed_dataset(
    GitHubSeedSource(
        repositories=["NVIDIA-NeMo/DataDesigner"],
        file_pattern="*.py",
        recursive=True,
    )
)

preview = DataDesigner().preview(builder, num_records=5)
print(preview.dataset[["repo_id", "relative_path", "code_lang", "content"]])
```

The reader can also scan local git repositories:

```python
builder.with_seed_dataset(
    GitHubSeedSource(
        path="/path/to/repos",
        repository_paths=["/path/to/one/repo"],
        file_pattern="*.py",
    )
)
```

Seed columns include repository metadata, file paths, language hints, file
content, and content SHA-256 hashes.

For the full plugin authoring guide, see the
[main repository docs](https://github.com/NVIDIA-NeMo/DataDesignerPlugins/blob/main/docs/adding-a-plugin.md).
36 changes: 36 additions & 0 deletions plugins/data-designer-github/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

[project]
name = "data-designer-github"
version = "0.1.0"
description = "GitHub and local git repository seed reader for Data Designer"
requires-python = ">=3.10"
dependencies = [
"data-designer>=0.5.7",
]
license = "Apache-2.0"
readme = "README.md"
authors = [
{name = "NVIDIA Corporation"},
]
classifiers = [
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
]

[project.entry-points."data_designer.plugins"]
github = "data_designer_github.plugin:plugin"

[project.urls]
Repository = "https://github.com/NVIDIA-NeMo/DataDesignerPlugins"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/data_designer_github"]

[tool.ruff]
extend = "../../pyproject.toml"
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
217 changes: 217 additions & 0 deletions plugins/data-designer-github/src/data_designer_github/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import codecs
from pathlib import Path
from typing import ClassVar, Literal

from data_designer.config.base import ConfigBase
from data_designer.config.seed_source import FileSystemSeedSource
from pydantic import Field, field_validator, model_validator
from typing_extensions import Self

# Default value for ``GitHubSeedSource.include_extensions``: lowercase, dotted
# file extensions, listed in sorted order.
DEFAULT_CODE_EXTENSIONS = [
    ".bash", ".c", ".cc", ".cfg", ".cpp", ".cs", ".css",
    ".go", ".h", ".hpp", ".html",
    ".java", ".js", ".json", ".jsx", ".kt", ".kts", ".lua",
    ".md", ".php", ".py", ".rb", ".rs",
    ".scala", ".sh", ".sql", ".swift",
    ".toml", ".ts", ".tsx", ".yaml", ".yml", ".zsh",
]

# Default value for ``GitHubSeedSource.include_file_names``: file names that
# carry no extension.
DEFAULT_CODE_FILENAMES = ["Dockerfile", "Makefile"]

# Default value for ``GitHubSeedSource.exclude_patterns``. Every root-level
# pattern is listed first, followed by the same patterns prefixed with "**/"
# (the nested-directory variants), preserving the root ordering in both halves.
DEFAULT_EXCLUDE_PATTERNS = [
    f"{prefix}{pattern}"
    for prefix in ("", "**/")
    for pattern in (
        ".git/*",
        ".git/**",
        ".mypy_cache/*",
        ".pytest_cache/*",
        ".ruff_cache/*",
        ".tox/*",
        ".venv/*",
        "__pycache__/*",
        "build/*",
        "dist/*",
        "node_modules/*",
        "venv/*",
    )
]


class GitHubSeedSource(FileSystemSeedSource, ConfigBase):
    """Seed source for reading code files from GitHub and local git repositories.

    At least one of ``path``, ``repositories`` or ``repository_paths`` must be
    supplied; validation fails otherwise. Validators on this class normalize
    ``include_extensions`` and reject blank list entries, unknown encodings and
    ``repository_paths`` entries that are not directories.
    """

    seed_type: Literal["github"] = "github"

    # --- Where repositories come from -------------------------------------
    path: str | None = Field(
        None,
        description=(
            "Optional local git repository path, or a directory whose immediate children are git repositories. "
            "Relative paths are resolved from the current working directory when the config is loaded."
        ),
    )
    repositories: list[str] = Field(
        default_factory=list,
        description=(
            "GitHub repositories to clone before reading. Each entry may be 'owner/name', "
            "'https://github.com/owner/name', or 'https://github.com/owner/name.git'."
        ),
    )
    repository_paths: list[str] = Field(
        default_factory=list,
        description="Additional local git repository paths to read.",
    )

    # --- Clone behaviour ----------------------------------------------------
    ref: str | None = Field(
        None,
        description="Optional branch, tag, or commit to check out after cloning GitHub repositories.",
    )
    clone_depth: int | None = Field(
        1,
        ge=1,
        description="Depth for GitHub clones. Set to null for a full clone.",
    )
    clone_timeout_seconds: int = Field(
        300,
        ge=1,
        description="Timeout for each git clone or checkout operation.",
    )

    # --- File selection -----------------------------------------------------
    include_extensions: list[str] | None = Field(
        default_factory=lambda: list(DEFAULT_CODE_EXTENSIONS),
        description=(
            "Lowercase file extensions to include. Values may include or omit the leading dot. "
            "Set to null to include every extension."
        ),
    )
    include_file_names: list[str] = Field(
        default_factory=lambda: list(DEFAULT_CODE_FILENAMES),
        description="Extensionless file names to include, such as Dockerfile or Makefile.",
    )
    exclude_patterns: list[str] = Field(
        default_factory=lambda: list(DEFAULT_EXCLUDE_PATTERNS),
        description="Relative path glob patterns to exclude from repository scans.",
    )

    # --- Content hydration --------------------------------------------------
    max_file_size_bytes: int = Field(
        1_000_000,
        ge=1,
        description="Maximum file size to hydrate into the content column.",
    )
    encoding: str = Field(
        "utf-8",
        description="Text encoding used when hydrating repository file contents.",
    )

    # Field names quoted in the "no source configured" error message.
    _source_fields: ClassVar[tuple[str, ...]] = ("path", "repositories", "repository_paths")

    @model_validator(mode="after")
    def validate_has_repository_source(self) -> Self:
        """Reject configurations that name no repository source at all.

        Raises:
            ValueError: If ``path`` is ``None`` and both ``repositories`` and
                ``repository_paths`` are empty.
        """
        has_source = self.path is not None or bool(self.repositories) or bool(self.repository_paths)
        if has_source:
            return self

        fields = ", ".join(self._source_fields)
        raise ValueError(f"At least one of {fields} must be provided.")

    @field_validator("encoding", mode="after")
    @classmethod
    def validate_encoding(cls, value: str) -> str:
        """Return ``value`` unchanged once Python's codec registry recognizes it.

        Raises:
            ValueError: If ``codecs.lookup`` does not know the encoding name.
        """
        try:
            codecs.lookup(value)
        except LookupError as error:
            raise ValueError(f"Unknown encoding: {value!r}. Use a valid Python codec name.") from error
        else:
            return value

    @field_validator("include_extensions", mode="after")
    @classmethod
    def normalize_include_extensions(cls, value: list[str] | None) -> list[str] | None:
        """Canonicalize extensions: trimmed, lowercased, dotted, unique and sorted.

        ``None`` is passed through untouched.

        Raises:
            ValueError: If any entry is empty after stripping whitespace.
        """
        if value is None:
            return None

        cleaned = [extension.strip().lower() for extension in value]
        if not all(cleaned):
            raise ValueError("include_extensions cannot contain empty values.")

        # A set drops duplicates that only differed by case, padding or the dot.
        dotted = {entry if entry.startswith(".") else f".{entry}" for entry in cleaned}
        return sorted(dotted)

    @field_validator("include_file_names", "exclude_patterns", mode="after")
    @classmethod
    def validate_non_empty_strings(cls, value: list[str]) -> list[str]:
        """Return the list unchanged, refusing blank or whitespace-only entries.

        Raises:
            ValueError: If any entry is empty after stripping whitespace.
        """
        if any(not entry.strip() for entry in value):
            raise ValueError("String lists cannot contain empty values.")
        return value

    @field_validator("repositories", mode="after")
    @classmethod
    def validate_repositories(cls, value: list[str]) -> list[str]:
        """Return the repository specs unchanged, refusing blank entries.

        Raises:
            ValueError: If any entry is empty after stripping whitespace.
        """
        if any(not spec.strip() for spec in value):
            raise ValueError("repositories cannot contain empty values.")
        return value

    @field_validator("repository_paths", mode="after")
    @classmethod
    def validate_repository_paths(cls, value: list[str]) -> list[str]:
        """Check that every explicit local repository path is an existing directory.

        Paths are user-expanded and resolved only for the check; the original
        strings are returned unmodified.

        Raises:
            ValueError: For the first path that is not a directory.
        """
        for raw_path in value:
            resolved = Path(raw_path).expanduser().resolve()
            if resolved.is_dir():
                continue
            raise ValueError(f"Repository path {resolved} is not a directory.")
        return value

    @property
    def runtime_path(self) -> str:
        """Return the resolved local scan root, computing and caching it on first use.

        A previously stored ``_runtime_path`` wins; otherwise ``path`` is
        user-expanded, resolved and cached.

        NOTE(review): ``_runtime_path`` is not declared in this class; presumably
        it is a private attribute provided by ``FileSystemSeedSource`` — confirm.

        Raises:
            ValueError: If nothing is cached yet and ``path`` is ``None``.
        """
        cached = self._runtime_path
        if cached is None:
            if self.path is None:
                raise ValueError("GitHubSeedSource.runtime_path is available after the seed reader is attached.")
            cached = str(Path(self.path).expanduser().resolve())
            self._runtime_path = cached
        return cached
Loading
Loading