From 47cf14ab00ad797a4dad4956c69cbcf3d5107f21 Mon Sep 17 00:00:00 2001 From: "Eric W. Tramel" Date: Tue, 5 May 2026 11:43:32 -0400 Subject: [PATCH 1/3] feat: add github seed reader plugin --- .github/CODEOWNERS | 1 + docs/catalog.md | 1 + plugins/data-designer-github/CODEOWNERS | 3 + plugins/data-designer-github/README.md | 51 +++ plugins/data-designer-github/pyproject.toml | 36 ++ .../src/data_designer_github/__init__.py | 2 + .../src/data_designer_github/config.py | 217 ++++++++++ .../src/data_designer_github/impl.py | 409 ++++++++++++++++++ .../src/data_designer_github/plugin.py | 10 + .../data-designer-github/tests/test_plugin.py | 104 +++++ pyproject.toml | 2 +- uv.lock | 14 +- 12 files changed, 848 insertions(+), 2 deletions(-) create mode 100644 plugins/data-designer-github/CODEOWNERS create mode 100644 plugins/data-designer-github/README.md create mode 100644 plugins/data-designer-github/pyproject.toml create mode 100644 plugins/data-designer-github/src/data_designer_github/__init__.py create mode 100644 plugins/data-designer-github/src/data_designer_github/config.py create mode 100644 plugins/data-designer-github/src/data_designer_github/impl.py create mode 100644 plugins/data-designer-github/src/data_designer_github/plugin.py create mode 100644 plugins/data-designer-github/tests/test_plugin.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index eb06565..394d44b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -7,4 +7,5 @@ /.github/ @NVIDIA-NeMo/data_designer_reviewers # Plugins +/plugins/data-designer-github/ @eric-tramel /plugins/data-designer-template/ @NVIDIA-NeMo/data_designer_reviewers diff --git a/docs/catalog.md b/docs/catalog.md index d3c1211..376d1bb 100644 --- a/docs/catalog.md +++ b/docs/catalog.md @@ -4,4 +4,5 @@ Auto-generated from plugin metadata. Do not edit manually. | Plugin | Version | Column Type | Description | |--------|---------|-------------|-------------| +| data-designer-github | 0.1.0 | `github` | GitHub and local git repository seed reader for Data Designer | | data-designer-template | 0.1.0 | `text-transform` | Template Data Designer plugin — text transform column generator | diff --git a/plugins/data-designer-github/CODEOWNERS b/plugins/data-designer-github/CODEOWNERS new file mode 100644 index 0000000..e0e141b --- /dev/null +++ b/plugins/data-designer-github/CODEOWNERS @@ -0,0 +1,3 @@ +# Owner(s) of this plugin — used to generate the root CODEOWNERS file. +# GitHub accepts @username, @org/team, or email format. +* @eric-tramel diff --git a/plugins/data-designer-github/README.md b/plugins/data-designer-github/README.md new file mode 100644 index 0000000..68671e3 --- /dev/null +++ b/plugins/data-designer-github/README.md @@ -0,0 +1,51 @@ +# data-designer-github + +GitHub and local git repository seed reader for +[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner). + +## Installation + +```bash +pip install data-designer-github +``` + +## Usage + +This plugin provides a `github` seed source. Once installed, the seed reader is +automatically discovered by Data Designer. + +```python +from data_designer.config.config_builder import DataDesignerConfigBuilder +from data_designer.interface.data_designer import DataDesigner +from data_designer_github.config import GitHubSeedSource + +builder = DataDesignerConfigBuilder() +builder.with_seed_dataset( + GitHubSeedSource( + repositories=["NVIDIA-NeMo/DataDesigner"], + file_pattern="*.py", + recursive=True, + ) +) + +preview = DataDesigner().preview(builder, num_records=5) +print(preview.dataset[["repo_id", "relative_path", "code_lang", "content"]]) +``` + +The reader can also scan local git repositories: + +```python +builder.with_seed_dataset( + GitHubSeedSource( + path="/path/to/repos", + repository_paths=["/path/to/one/repo"], + file_pattern="*.py", + ) +) +``` + +Seed columns include repository metadata, file paths, language hints, file +content, and content SHA-256 hashes. + +For the full plugin authoring guide, see the +[main repository docs](https://github.com/NVIDIA-NeMo/DataDesignerPlugins/blob/main/docs/adding-a-plugin.md). diff --git a/plugins/data-designer-github/pyproject.toml b/plugins/data-designer-github/pyproject.toml new file mode 100644 index 0000000..359c6f9 --- /dev/null +++ b/plugins/data-designer-github/pyproject.toml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +[project] +name = "data-designer-github" +version = "0.1.0" +description = "GitHub and local git repository seed reader for Data Designer" +requires-python = ">=3.10" +dependencies = [ + "data-designer>=0.5.7", +] +license = "Apache-2.0" +readme = "README.md" +authors = [ + {name = "NVIDIA Corporation"}, +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Programming Language :: Python :: 3", +] + +[project.entry-points."data_designer.plugins"] +github = "data_designer_github.plugin:plugin" + +[project.urls] +Repository = "https://github.com/NVIDIA-NeMo/DataDesignerPlugins" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/data_designer_github"] + +[tool.ruff] +extend = "../../pyproject.toml" diff --git a/plugins/data-designer-github/src/data_designer_github/__init__.py b/plugins/data-designer-github/src/data_designer_github/__init__.py new file mode 100644 index 0000000..52a7a9d --- /dev/null +++ b/plugins/data-designer-github/src/data_designer_github/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/plugins/data-designer-github/src/data_designer_github/config.py b/plugins/data-designer-github/src/data_designer_github/config.py new file mode 100644 index 0000000..6285de5 --- /dev/null +++ b/plugins/data-designer-github/src/data_designer_github/config.py @@ -0,0 +1,217 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import codecs +from pathlib import Path +from typing import ClassVar, Literal + +from data_designer.config.base import ConfigBase +from data_designer.config.seed_source import FileSystemSeedSource +from pydantic import Field, field_validator, model_validator +from typing_extensions import Self + +DEFAULT_CODE_EXTENSIONS = [ + ".bash", + ".c", + ".cc", + ".cfg", + ".cpp", + ".cs", + ".css", + ".go", + ".h", + ".hpp", + ".html", + ".java", + ".js", + ".json", + ".jsx", + ".kt", + ".kts", + ".lua", + ".md", + ".php", + ".py", + ".rb", + ".rs", + ".scala", + ".sh", + ".sql", + ".swift", + ".toml", + ".ts", + ".tsx", + ".yaml", + ".yml", + ".zsh", +] + +DEFAULT_CODE_FILENAMES = [ + "Dockerfile", + "Makefile", +] + +DEFAULT_EXCLUDE_PATTERNS = [ + ".git/*", + ".git/**", + ".mypy_cache/*", + ".pytest_cache/*", + ".ruff_cache/*", + ".tox/*", + ".venv/*", + "__pycache__/*", + "build/*", + "dist/*", + "node_modules/*", + "venv/*", + "**/.git/*", + "**/.git/**", + "**/.mypy_cache/*", + "**/.pytest_cache/*", + "**/.ruff_cache/*", + "**/.tox/*", + "**/.venv/*", + "**/__pycache__/*", + "**/build/*", + "**/dist/*", + "**/node_modules/*", + "**/venv/*", +] + + +class GitHubSeedSource(FileSystemSeedSource, ConfigBase): + """Seed source for reading code files from GitHub and local git repositories.""" + + seed_type: Literal["github"] = "github" + + path: str | None = Field( + None, + description=( + "Optional local git repository path, or a directory whose immediate children are git repositories. " + "Relative paths are resolved from the current working directory when the config is loaded." + ), + ) + repositories: list[str] = Field( + default_factory=list, + description=( + "GitHub repositories to clone before reading. Each entry may be 'owner/name', " + "'https://github.com/owner/name', or 'https://github.com/owner/name.git'." + ), + ) + repository_paths: list[str] = Field( + default_factory=list, + description="Additional local git repository paths to read.", + ) + ref: str | None = Field( + None, + description="Optional branch, tag, or commit to check out after cloning GitHub repositories.", + ) + clone_depth: int | None = Field( + 1, + ge=1, + description="Depth for GitHub clones. Set to null for a full clone.", + ) + clone_timeout_seconds: int = Field( + 300, + ge=1, + description="Timeout for each git clone or checkout operation.", + ) + include_extensions: list[str] | None = Field( + default_factory=lambda: list(DEFAULT_CODE_EXTENSIONS), + description=( + "Lowercase file extensions to include. Values may include or omit the leading dot. " + "Set to null to include every extension." + ), + ) + include_file_names: list[str] = Field( + default_factory=lambda: list(DEFAULT_CODE_FILENAMES), + description="Extensionless file names to include, such as Dockerfile or Makefile.", + ) + exclude_patterns: list[str] = Field( + default_factory=lambda: list(DEFAULT_EXCLUDE_PATTERNS), + description="Relative path glob patterns to exclude from repository scans.", + ) + max_file_size_bytes: int = Field( + 1_000_000, + ge=1, + description="Maximum file size to hydrate into the content column.", + ) + encoding: str = Field( + "utf-8", + description="Text encoding used when hydrating repository file contents.", + ) + + _source_fields: ClassVar[tuple[str, ...]] = ("path", "repositories", "repository_paths") + + @model_validator(mode="after") + def validate_has_repository_source(self) -> Self: + """Ensure the seed source has at least one repository source.""" + if self.path is None and not self.repositories and not self.repository_paths: + fields = ", ".join(self._source_fields) + raise ValueError(f"At least one of {fields} must be provided.") + return self + + @field_validator("encoding", mode="after") + @classmethod + def validate_encoding(cls, value: str) -> str: + """Validate that the configured text encoding exists.""" + try: + codecs.lookup(value) + except LookupError as error: + raise ValueError(f"Unknown encoding: {value!r}. Use a valid Python codec name.") from error + return value + + @field_validator("include_extensions", mode="after") + @classmethod + def normalize_include_extensions(cls, value: list[str] | None) -> list[str] | None: + """Normalize configured extensions to lowercase dotted values.""" + if value is None: + return None + + normalized: list[str] = [] + for extension in value: + stripped = extension.strip().lower() + if not stripped: + raise ValueError("include_extensions cannot contain empty values.") + normalized.append(stripped if stripped.startswith(".") else f".{stripped}") + return sorted(set(normalized)) + + @field_validator("include_file_names", "exclude_patterns", mode="after") + @classmethod + def validate_non_empty_strings(cls, value: list[str]) -> list[str]: + """Validate string list fields do not contain blank entries.""" + for item in value: + if not item.strip(): + raise ValueError("String lists cannot contain empty values.") + return value + + @field_validator("repositories", mode="after") + @classmethod + def validate_repositories(cls, value: list[str]) -> list[str]: + """Validate repository specs do not contain blank entries.""" + for repository in value: + if not repository.strip(): + raise ValueError("repositories cannot contain empty values.") + return value + + @field_validator("repository_paths", mode="after") + @classmethod + def validate_repository_paths(cls, value: list[str]) -> list[str]: + """Validate explicit local repository paths exist.""" + for repository_path in value: + path = Path(repository_path).expanduser().resolve() + if not path.is_dir(): + raise ValueError(f"Repository path {path} is not a directory.") + return value + + @property + def runtime_path(self) -> str: + """Return the resolved local scan root after a reader has prepared it.""" + if self._runtime_path is not None: + return self._runtime_path + if self.path is None: + raise ValueError("GitHubSeedSource.runtime_path is available after the seed reader is attached.") + self._runtime_path = str(Path(self.path).expanduser().resolve()) + return self._runtime_path diff --git a/plugins/data-designer-github/src/data_designer_github/impl.py b/plugins/data-designer-github/src/data_designer_github/impl.py new file mode 100644 index 0000000..c228b79 --- /dev/null +++ b/plugins/data-designer-github/src/data_designer_github/impl.py @@ -0,0 +1,409 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import hashlib +import logging +import re +import shutil +import subprocess +import tempfile +from dataclasses import dataclass +from fnmatch import fnmatchcase +from pathlib import Path +from typing import Any, ClassVar +from urllib.parse import urlparse + +import pandas as pd +from data_designer.engine.resources.seed_reader import ( + FileSystemSeedReader, + SeedReaderError, + SeedReaderFileSystemContext, +) + +from data_designer_github.config import GitHubSeedSource + +logger = logging.getLogger(__name__) + + +LANGUAGE_BY_EXTENSION = { + ".bash": "bash", + ".c": "c", + ".cc": "cpp", + ".cfg": "config", + ".cpp": "cpp", + ".cs": "csharp", + ".css": "css", + ".go": "go", + ".h": "c", + ".hpp": "cpp", + ".html": "html", + ".java": "java", + ".js": "javascript", + ".json": "json", + ".jsx": "javascript", + ".kt": "kotlin", + ".kts": "kotlin", + ".lua": "lua", + ".md": "markdown", + ".php": "php", + ".py": "python", + ".rb": "ruby", + ".rs": "rust", + ".scala": "scala", + ".sh": "shell", + ".sql": "sql", + ".swift": "swift", + ".toml": "toml", + ".ts": "typescript", + ".tsx": "typescript", + ".yaml": "yaml", + ".yml": "yaml", + ".zsh": "zsh", +} + +LANGUAGE_BY_FILENAME = { + "Dockerfile": "dockerfile", + "Makefile": "makefile", +} + + +@dataclass(frozen=True) +class RepositoryRoot: + """Prepared repository root available for manifest building.""" + + repo_id: str + repo_url: str | None + root_path: Path + source_kind: str + commit_sha: str | None + + +class GitHubSeedReader(FileSystemSeedReader[GitHubSeedSource]): + """Read code files from GitHub clones and local git repositories.""" + + output_columns: ClassVar[list[str] | None] = [ + "repo_id", + "repo_url", + "commit_sha", + "source_kind", + "repository_path", + "source_path", + "relative_path", + "file_name", + "file_extension", + "code_lang", + "size_bytes", + "content_sha256", + "content", + ] + + def _reset_attachment_state(self) -> None: + super()._reset_attachment_state() + temp_dir = getattr(self, "_temp_dir", None) + if temp_dir is not None: + temp_dir.cleanup() + self._temp_dir: tempfile.TemporaryDirectory[str] | None = None + self._repository_roots: list[RepositoryRoot] | None = None + + def build_manifest(self, *, context: SeedReaderFileSystemContext) -> pd.DataFrame | list[dict[str, Any]]: + """Build a cheap file manifest across every configured repository.""" + records: list[dict[str, Any]] = [] + for repository in self._get_repository_roots(context): + records.extend(self._build_repository_manifest(repository)) + return records + + def hydrate_row( + self, + *, + manifest_row: dict[str, Any], + context: SeedReaderFileSystemContext, + ) -> dict[str, Any] | list[dict[str, Any]]: + """Read file content and add it to a manifest row.""" + del context + source_path = Path(str(manifest_row["source_path"])) + try: + content_bytes = source_path.read_bytes() + content = content_bytes.decode(self.source.encoding) + except UnicodeDecodeError as error: + logger.warning( + "Skipping file %s because it cannot be decoded as %s: %s", + source_path, + self.source.encoding, + error, + ) + return [] + except OSError as error: + raise SeedReaderError(f"Failed to read repository file {source_path}: {error}") from error + + record = dict(manifest_row) + record["content_sha256"] = hashlib.sha256(content_bytes).hexdigest() + record["content"] = content + return record + + def _get_filesystem_context(self) -> SeedReaderFileSystemContext: + self._ensure_attached() + context = getattr(self, "_filesystem_context", None) + if context is not None: + return context + + runtime_root = self._prepare_runtime_root() + context = self.create_filesystem_context(runtime_root) + self._filesystem_context = context + return context + + def _prepare_runtime_root(self) -> Path: + self._temp_dir = tempfile.TemporaryDirectory(prefix="data-designer-github-") + runtime_root = Path(self._temp_dir.name).resolve() + + repository_roots = self._prepare_local_repositories() + clone_root = runtime_root / "github" + clone_root.mkdir(parents=True, exist_ok=True) + repository_roots.extend(self._clone_github_repositories(clone_root)) + + if not repository_roots: + raise SeedReaderError("GitHub seed source did not resolve any repositories.") + + self.source._runtime_path = str(runtime_root) + self._repository_roots = repository_roots + return runtime_root + + def _get_repository_roots(self, context: SeedReaderFileSystemContext) -> list[RepositoryRoot]: + del context + repository_roots = getattr(self, "_repository_roots", None) + if repository_roots is None: + raise SeedReaderError("Repository roots are not prepared.") + return repository_roots + + def _prepare_local_repositories(self) -> list[RepositoryRoot]: + local_paths = _resolve_local_repository_paths( + parent_path=self.source.path, + repository_paths=self.source.repository_paths, + ) + return [self._build_local_repository_root(path) for path in local_paths] + + def _clone_github_repositories(self, clone_root: Path) -> list[RepositoryRoot]: + repository_roots: list[RepositoryRoot] = [] + for repository_spec in self.source.repositories: + repo_id, repo_url = normalize_github_repository(repository_spec) + destination = clone_root / _safe_repo_directory_name(repo_id) + self._clone_repository(repo_url=repo_url, destination=destination) + if self.source.ref is not None: + _run_git( + ["checkout", "--quiet", self.source.ref], + cwd=destination, + timeout=self.source.clone_timeout_seconds, + ) + repository_roots.append( + RepositoryRoot( + repo_id=repo_id, + repo_url=repo_url, + root_path=destination, + source_kind="github", + commit_sha=_get_commit_sha(destination), + ) + ) + return repository_roots + + def _clone_repository(self, *, repo_url: str, destination: Path) -> None: + command = ["clone", "--quiet"] + if self.source.ref is not None and not _looks_like_commit_sha(self.source.ref): + command.extend(["--branch", self.source.ref]) + if self.source.clone_depth is not None: + command.extend(["--depth", str(self.source.clone_depth)]) + command.extend([repo_url, str(destination)]) + _run_git(command, timeout=self.source.clone_timeout_seconds) + + def _build_local_repository_root(self, root_path: Path) -> RepositoryRoot: + remote_url = _get_remote_url(root_path) + return RepositoryRoot( + repo_id=_repo_id_from_local_path(root_path, remote_url), + repo_url=remote_url, + root_path=root_path, + source_kind="git_repository", + commit_sha=_get_commit_sha(root_path), + ) + + def _build_repository_manifest(self, repository: RepositoryRoot) -> list[dict[str, Any]]: + records: list[dict[str, Any]] = [] + for file_path in self._iter_matching_files(repository.root_path): + relative_path = file_path.relative_to(repository.root_path).as_posix() + stat = file_path.stat() + records.append( + { + "repo_id": repository.repo_id, + "repo_url": repository.repo_url, + "commit_sha": repository.commit_sha, + "source_kind": repository.source_kind, + "repository_path": str(repository.root_path), + "source_path": str(file_path), + "relative_path": relative_path, + "file_name": file_path.name, + "file_extension": file_path.suffix.lower(), + "code_lang": _detect_language(file_path), + "size_bytes": stat.st_size, + "content_sha256": "", + "content": "", + } + ) + return records + + def _iter_matching_files(self, root_path: Path) -> list[Path]: + paths = ( + root_path.rglob(self.source.file_pattern) + if self.source.recursive + else root_path.glob(self.source.file_pattern) + ) + files = [path for path in paths if self._should_include_file(root_path=root_path, file_path=path)] + files.sort(key=lambda path: path.relative_to(root_path).as_posix()) + return files + + def _should_include_file(self, *, root_path: Path, file_path: Path) -> bool: + if not file_path.is_file(): + return False + + relative_path = file_path.relative_to(root_path).as_posix() + if any(fnmatchcase(relative_path, pattern) for pattern in self.source.exclude_patterns): + return False + + try: + file_size = file_path.stat().st_size + except OSError as error: + logger.warning("Skipping file %s because it cannot be stat'ed: %s", file_path, error) + return False + + if file_size > self.source.max_file_size_bytes: + return False + + if file_path.name in self.source.include_file_names: + return True + + include_extensions = self.source.include_extensions + return include_extensions is None or file_path.suffix.lower() in include_extensions + + +def normalize_github_repository(repository: str) -> tuple[str, str]: + """Normalize a GitHub repository spec to ``(owner/name, clone_url)``.""" + stripped = repository.strip() + parsed = urlparse(stripped) + + if parsed.scheme in {"http", "https"}: + if parsed.netloc.lower() != "github.com": + raise SeedReaderError(f"Expected a github.com repository URL, got {repository!r}.") + repo_id = parsed.path.strip("/").removesuffix(".git") + elif stripped.startswith("git@github.com:"): + repo_id = stripped.removeprefix("git@github.com:").removesuffix(".git").strip("/") + else: + repo_id = stripped.removesuffix(".git").strip("/") + + if not re.fullmatch(r"[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+", repo_id): + raise SeedReaderError(f"GitHub repository {repository!r} must use 'owner/name' or a github.com repository URL.") + + return repo_id, f"https://github.com/{repo_id}.git" + + +def _resolve_local_repository_paths(*, parent_path: str | None, repository_paths: list[str]) -> list[Path]: + roots: dict[Path, None] = {} + if parent_path is not None: + parent = Path(parent_path).expanduser().resolve() + top_level = _get_git_toplevel(parent) + if top_level is not None: + roots[top_level] = None + else: + for child in sorted(parent.iterdir()): + if child.is_dir(): + child_top_level = _get_git_toplevel(child) + if child_top_level is not None: + roots[child_top_level] = None + + for repository_path in repository_paths: + path = Path(repository_path).expanduser().resolve() + top_level = _get_git_toplevel(path) + if top_level is None: + raise SeedReaderError(f"Repository path {path} is not a git repository.") + roots[top_level] = None + + return list(roots) + + +def _get_git_toplevel(path: Path) -> Path | None: + result = subprocess.run( + ["git", "-C", str(path), "rev-parse", "--show-toplevel"], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + return None + return Path(result.stdout.strip()).resolve() + + +def _get_commit_sha(root_path: Path) -> str | None: + result = subprocess.run( + ["git", "-C", str(root_path), "rev-parse", "HEAD"], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + return None + return result.stdout.strip() + + +def _get_remote_url(root_path: Path) -> str | None: + result = subprocess.run( + ["git", "-C", str(root_path), "config", "--get", "remote.origin.url"], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + return None + return result.stdout.strip() or None + + +def _run_git(command: list[str], *, cwd: Path | None = None, timeout: int) -> None: + git = shutil.which("git") + if git is None: + raise SeedReaderError("git is required to read GitHub repositories, but it was not found on PATH.") + + try: + result = subprocess.run( + [git, *command], + cwd=None if cwd is None else str(cwd), + capture_output=True, + text=True, + check=False, + timeout=timeout, + ) + except subprocess.TimeoutExpired as error: + raise SeedReaderError(f"git {' '.join(command)} timed out after {timeout} seconds") from error + + if result.returncode != 0: + detail = result.stderr.strip() or result.stdout.strip() + raise SeedReaderError(f"git {' '.join(command)} failed: {detail}") + + +def _repo_id_from_local_path(root_path: Path, remote_url: str | None) -> str: + if remote_url: + try: + repo_id, _ = normalize_github_repository(remote_url) + return repo_id + except SeedReaderError: + pass + return root_path.name + + +def _safe_repo_directory_name(repo_id: str) -> str: + return repo_id.replace("/", "__") + + +def _looks_like_commit_sha(ref: str) -> bool: + return re.fullmatch(r"[0-9a-fA-F]{7,40}", ref) is not None + + +def _detect_language(file_path: Path) -> str: + if file_path.name in LANGUAGE_BY_FILENAME: + return LANGUAGE_BY_FILENAME[file_path.name] + return LANGUAGE_BY_EXTENSION.get(file_path.suffix.lower(), file_path.suffix.lower().removeprefix(".")) diff --git a/plugins/data-designer-github/src/data_designer_github/plugin.py b/plugins/data-designer-github/src/data_designer_github/plugin.py new file mode 100644 index 0000000..8a81988 --- /dev/null +++ b/plugins/data-designer-github/src/data_designer_github/plugin.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from data_designer.plugins.plugin import Plugin, PluginType + +plugin = Plugin( + config_qualified_name="data_designer_github.config.GitHubSeedSource", + impl_qualified_name="data_designer_github.impl.GitHubSeedReader", + plugin_type=PluginType.SEED_READER, +) diff --git a/plugins/data-designer-github/tests/test_plugin.py b/plugins/data-designer-github/tests/test_plugin.py new file mode 100644 index 0000000..be82445 --- /dev/null +++ b/plugins/data-designer-github/tests/test_plugin.py @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import subprocess +from pathlib import Path + +import pytest +from data_designer.config.config_builder import DataDesignerConfigBuilder +from data_designer.engine.secret_resolver import PlaintextResolver +from data_designer.engine.testing.utils import assert_valid_plugin +from data_designer.interface.data_designer import DataDesigner + +from data_designer_github.config import GitHubSeedSource +from data_designer_github.impl import GitHubSeedReader, normalize_github_repository +from data_designer_github.plugin import plugin + + +def test_valid_plugin() -> None: + assert_valid_plugin(plugin) + + +def test_normalize_github_repository() -> None: + assert normalize_github_repository("NVIDIA-NeMo/DataDesigner")[0] == "NVIDIA-NeMo/DataDesigner" + assert normalize_github_repository("https://github.com/NVIDIA-NeMo/DataDesigner.git")[0] == ( + "NVIDIA-NeMo/DataDesigner" + ) + + +def test_source_requires_at_least_one_repository_source() -> None: + with pytest.raises(ValueError, match="At least one"): + GitHubSeedSource() + + +def test_reader_hydrates_local_repository_files(tmp_path: Path) -> None: + repo = _create_git_repo(tmp_path / "sample-repo") + source = GitHubSeedSource(repository_paths=[str(repo)], file_pattern="*.py") + reader = GitHubSeedReader() + reader.attach(source, PlaintextResolver()) + + assert reader.get_seed_dataset_size() == 1 + batch = reader.create_batch_reader(batch_size=10, index_range=None, shuffle=False).read_next_batch() + rows = batch.to_pandas().to_dict(orient="records") + + assert len(rows) == 1 + row = rows[0] + assert row["repo_id"] == "sample-repo" + assert row["source_kind"] == "git_repository" + assert row["relative_path"] == "src/example.py" + assert row["file_name"] == "example.py" + assert row["file_extension"] == ".py" + assert row["code_lang"] == "python" + assert row["size_bytes"] > 0 + assert len(row["commit_sha"]) == 40 + assert len(row["content_sha256"]) == 64 + assert "def greet" in row["content"] + + +def test_parent_path_discovers_child_git_repositories(tmp_path: Path) -> None: + repo = _create_git_repo(tmp_path / "repos" / "child-repo") + source = GitHubSeedSource(path=str(repo.parent), file_pattern="*.py") + reader = GitHubSeedReader() + reader.attach(source, PlaintextResolver()) + + batch = reader.create_batch_reader(batch_size=10, index_range=None, shuffle=False).read_next_batch() + rows = batch.to_pandas().to_dict(orient="records") + + assert [row["repo_id"] for row in rows] == ["child-repo"] + + +def test_preview_uses_github_seed_reader(tmp_path: Path) -> None: + repo = _create_git_repo(tmp_path / "preview-repo") + builder = DataDesignerConfigBuilder() + builder.with_seed_dataset(GitHubSeedSource(repository_paths=[str(repo)], file_pattern="*.py")) + builder.add_column(name="_row_id", column_type="sampler", sampler_type="uuid", params={}) + + result = DataDesigner(artifact_path=tmp_path / "artifacts").preview(builder, num_records=1) + + assert result.dataset is not None + assert list(result.dataset["repo_id"]) == ["preview-repo"] + assert list(result.dataset["relative_path"]) == ["src/example.py"] + assert "def greet" in result.dataset["content"].iloc[0] + + +def _create_git_repo(path: Path) -> Path: + path.mkdir(parents=True) + src = path / "src" + src.mkdir() + (src / "example.py").write_text( + "import os\n\n\ndef greet(name: str) -> str:\n return f'hello {name} from {os.getcwd()}'\n", + encoding="utf-8", + ) + (path / "README.md").write_text("# Sample\n", encoding="utf-8") + _git(path, "init", "--quiet") + _git(path, "config", "user.email", "test@example.com") + _git(path, "config", "user.name", "Test User") + _git(path, "add", ".") + _git(path, "commit", "--quiet", "-m", "initial") + return path + + +def _git(cwd: Path, *args: str) -> None: + subprocess.run(["git", *args], cwd=cwd, check=True) diff --git a/pyproject.toml b/pyproject.toml index 296d95b..ace3cc4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ ignore = [ ] [tool.ruff.lint.isort] -known-first-party = ["ddp", "data_designer_template"] +known-first-party = ["ddp", "data_designer_github", "data_designer_template"] [tool.ruff.lint.flake8-tidy-imports] ban-relative-imports = "all" diff --git a/uv.lock b/uv.lock index bf6a01c..6529ed9 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.12'", @@ -9,6 +9,7 @@ resolution-markers = [ [manifest] members = [ + "data-designer-github", "data-designer-plugins-workspace", "data-designer-template", "ddp", @@ -422,6 +423,17 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/35/d4/3844529ae989be9e63b0b8f47c28492793993427dc7d54d6d2a923ad2acc/data_designer_engine-0.5.7-py3-none-any.whl", hash = "sha256:75cd7d5ad0b230ddf75950ba7f97c9ad75c54887ad1247cdf623dc008e31a418", size = 631945, upload-time = "2026-04-17T22:03:08.584Z" }, ] +[[package]] +name = "data-designer-github" +version = "0.1.0" +source = { editable = "plugins/data-designer-github" } +dependencies = [ + { name = "data-designer" }, +] + +[package.metadata] +requires-dist = [{ name = "data-designer", specifier = ">=0.5.7" }] + [[package]] name = "data-designer-plugins-workspace" version = "0.0.0" From ebf47e1f7b357e14fd24f3c0130f944365bfcae6 Mon Sep 17 00:00:00 2001 From: "Eric W. Tramel" Date: Tue, 5 May 2026 12:10:52 -0400 Subject: [PATCH 2/3] docs: clarify pull request guidance --- AGENTS.md | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index 435352f..22b9336 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -94,6 +94,48 @@ Pull request example: gh pr create ``` +# Pull Request Style + +Pull requests are read by humans and should make the intent of the work clear without requiring a reviewer to reverse-engineer the diff. + +Do not prefix PR titles with `[codex]`, `[agent]`, or similar automation labels unless the user explicitly asks for that. +Use a concise, human-readable title that describes the change itself, such as: + +```text +Add GitHub repository seed reader plugin +``` + +Avoid vague titles like `Update plugin`, `Fix things`, or `Add changes`. + +PR descriptions should be descriptive enough to explain the change, why it exists, how it works, and how someone uses it. +Use this structure by default: + +```markdown +## What + +Describe the user-facing or maintainer-facing change in concrete terms. + +## Why + +Explain the motivation, problem, or workflow this PR enables. + +## Usage + +Show a realistic usage example, command, configuration snippet, or before/after behavior. +For plugins, include a Data Designer config or Python snippet that demonstrates the new plugin. + +## How + +Summarize the implementation approach, important design choices, and any tradeoffs. + +## Validation + +List the exact checks run and any meaningful smoke-test output. +``` + +Usage is required when the PR adds or changes user-facing behavior. +If the PR is not user-facing, use the section to show the maintainer workflow or operational impact. + # Development Style Tests should be written around public interfaces. From b7e723762c1c34fb20f22fe8ad5b7c4a266354e7 Mon Sep 17 00:00:00 2001 From: "Eric W. Tramel" Date: Tue, 5 May 2026 12:12:51 -0400 Subject: [PATCH 3/3] Revert "docs: clarify pull request guidance" This reverts commit ebf47e1f7b357e14fd24f3c0130f944365bfcae6. --- AGENTS.md | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 22b9336..435352f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -94,48 +94,6 @@ Pull request example: gh pr create ``` -# Pull Request Style - -Pull requests are read by humans and should make the intent of the work clear without requiring a reviewer to reverse-engineer the diff. - -Do not prefix PR titles with `[codex]`, `[agent]`, or similar automation labels unless the user explicitly asks for that. -Use a concise, human-readable title that describes the change itself, such as: - -```text -Add GitHub repository seed reader plugin -``` - -Avoid vague titles like `Update plugin`, `Fix things`, or `Add changes`. - -PR descriptions should be descriptive enough to explain the change, why it exists, how it works, and how someone uses it. -Use this structure by default: - -```markdown -## What - -Describe the user-facing or maintainer-facing change in concrete terms. - -## Why - -Explain the motivation, problem, or workflow this PR enables. - -## Usage - -Show a realistic usage example, command, configuration snippet, or before/after behavior. -For plugins, include a Data Designer config or Python snippet that demonstrates the new plugin. - -## How - -Summarize the implementation approach, important design choices, and any tradeoffs. - -## Validation - -List the exact checks run and any meaningful smoke-test output. -``` - -Usage is required when the PR adds or changes user-facing behavior. -If the PR is not user-facing, use the section to show the maintainer workflow or operational impact. - # Development Style Tests should be written around public interfaces.