diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index eb06565..394d44b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -7,4 +7,5 @@ /.github/ @NVIDIA-NeMo/data_designer_reviewers # Plugins +/plugins/data-designer-github/ @eric-tramel /plugins/data-designer-template/ @NVIDIA-NeMo/data_designer_reviewers diff --git a/docs/plugins/data-designer-github/index.md b/docs/plugins/data-designer-github/index.md new file mode 100644 index 0000000..1243d43 --- /dev/null +++ b/docs/plugins/data-designer-github/index.md @@ -0,0 +1,79 @@ +# data-designer-github + +`data-designer-github` is a Data Designer seed reader for repository files. It +turns GitHub repositories or local git repositories into seed rows that carry +file content, path metadata, repository provenance, and commit identifiers. + +Use it when a workflow needs code repository data as the starting point for +generation, review, transformation, or indexing tasks. The reader is intentionally +file-oriented: each matching text file becomes one seed row, and downstream Data +Designer columns decide how to summarize, critique, rewrite, label, or enrich +that row. + +## Installation + +```bash +uv add data-designer data-designer-github +``` + +The plugin is discovered through the `data_designer.plugins` entry point once it +is installed in the same environment as Data Designer. + +## Seed source + +Use the `github` seed source when the seed dataset should come from one or more +repositories. + +| Field | Required | Description | +| --- | --- | --- | +| `path` | No | A local git repository path, or a directory whose immediate children are git repositories. | +| `repositories` | No | GitHub repositories to clone. Entries may be `owner/name`, `https://github.com/owner/name`, or `https://github.com/owner/name.git`. | +| `repository_paths` | No | Additional explicit local git repository paths to read. | +| `ref` | No | Branch, tag, or commit to check out for cloned GitHub repositories. 
| +| `clone_depth` | No | Shallow clone depth for GitHub repositories. Defaults to `1`; set to `None` for a full clone. | +| `clone_timeout_seconds` | No | Timeout for each clone or checkout operation. Defaults to `300`. | +| `file_pattern` | No | Inherited file glob from Data Designer's filesystem seed source. For example, `*.py`. | +| `recursive` | No | Whether `file_pattern` is applied recursively. | +| `include_extensions` | No | File extensions to include after the glob match. Defaults to common code and documentation extensions. Set to `None` to allow every extension. | +| `include_file_names` | No | Extensionless file names to include, such as `Dockerfile` and `Makefile`. | +| `exclude_patterns` | No | Relative path glob patterns to skip, including `.git`, cache, build, virtualenv, and dependency directories by default. | +| `max_file_size_bytes` | No | Maximum file size to hydrate into `content`. Defaults to `1_000_000`. | +| `encoding` | No | Text encoding used when reading file contents. Defaults to `utf-8`. | + +At least one of `path`, `repositories`, or `repository_paths` is required. + +## Output columns + +| Column | Description | +| --- | --- | +| `repo_id` | Repository identifier. GitHub repositories use `owner/name`; local repositories use their GitHub remote when available, otherwise the directory name. | +| `repo_url` | Remote origin URL when available. | +| `commit_sha` | Checked-out commit SHA for the repository. | +| `source_kind` | `github` for cloned repositories, or `git_repository` for local repositories. | +| `repository_path` | Local path used by the reader. GitHub repositories are cloned into a temporary runtime directory. | +| `source_path` | Absolute path to the file that produced the seed row. | +| `relative_path` | File path relative to the repository root. | +| `file_name` | Basename of the file. | +| `file_extension` | Lowercase file extension. | +| `code_lang` | Language hint inferred from the file name or extension. 
| +| `size_bytes` | File size at manifest time. | +| `content_sha256` | SHA-256 hash of the hydrated file bytes. | +| `content` | Decoded text content. | + +## Behavior + +When the reader is attached, it resolves local repository roots, clones any +configured GitHub repositories, records the checked-out commit, and builds a +manifest of matching files. File content is read during row hydration, so Data +Designer can batch and sample repository content using the same seed reader +interfaces as other filesystem-backed datasets. + +The plugin reads repository files only. It does not parse code into functions, +classes, symbols, dependency graphs, or AST nodes. If a workflow needs those +structures, use this reader to collect stable file-level inputs and add +downstream columns that perform the language-specific analysis. + +The plugin shells out to `git` for repository operations and does not manage +GitHub API tokens. Public repositories work directly. Private repositories +require the execution environment's git credential configuration to already have +access. diff --git a/docs/plugins/data-designer-github/usage.md b/docs/plugins/data-designer-github/usage.md new file mode 100644 index 0000000..7f96e8b --- /dev/null +++ b/docs/plugins/data-designer-github/usage.md @@ -0,0 +1,165 @@ +# Usage + +This tutorial walks through the common patterns for turning repositories into +Data Designer seed rows. The examples use the Python builder API, but the same +configuration fields apply when a workflow is built from serialized config. + +## Read a GitHub repository + +Start with a small repository and a narrow file pattern. This keeps previews +fast and makes it clear which rows are entering the workflow. 
+ +```python +from data_designer.config.config_builder import DataDesignerConfigBuilder +from data_designer.interface.data_designer import DataDesigner +from data_designer_github.config import GitHubSeedSource + +builder = DataDesignerConfigBuilder() +builder.with_seed_dataset( + GitHubSeedSource( + repositories=["pallets/markupsafe"], + file_pattern="*.py", + recursive=True, + ) +) + +builder.add_column( + name="_row_id", + column_type="sampler", + sampler_type="uuid", + params={}, +) + +preview = DataDesigner().preview(builder, num_records=5) +print(preview.dataset[["repo_id", "relative_path", "code_lang", "content"]]) +``` + +The seed rows contain repository provenance and file text. Downstream columns can +then ask questions such as "summarize this file", "identify risky APIs", "write +a short module description", or "extract candidate test scenarios" using the +`content`, `relative_path`, `code_lang`, and `commit_sha` columns. + +## Pin a branch, tag, or commit + +Use `ref` when the dataset must be reproducible against a specific branch, tag, +or commit. Branches and tags are passed to `git clone --branch`; commit SHAs are +checked out after cloning. + +```python +source = GitHubSeedSource( + repositories=["NVIDIA-NeMo/DataDesigner"], + ref="v0.5.7", + clone_depth=1, + file_pattern="*.py", + recursive=True, +) +``` + +For arbitrary commit SHAs, set `clone_depth=None` if the commit may not be +reachable from the shallow default clone. + +```python +source = GitHubSeedSource( + repositories=["NVIDIA-NeMo/DataDesigner"], + ref="0123456789abcdef0123456789abcdef01234567", + clone_depth=None, + file_pattern="*.py", + recursive=True, +) +``` + +## Read local repositories + +Local repositories are useful for private code, local experiments, or a checked +out monorepo that already exists on disk. 
+ +```python +source = GitHubSeedSource( + repository_paths=[ + "/workspace/services/api", + "/workspace/libraries/shared", + ], + file_pattern="*.py", + recursive=True, +) +``` + +If `path` points at a git repository, that repository is read. If `path` points +at a directory whose immediate children are git repositories, each child +repository is discovered and read. + +```python +source = GitHubSeedSource( + path="/workspace/repos", + file_pattern="*.ts", + recursive=True, +) +``` + +## Control which files become rows + +The reader first applies `file_pattern` and `recursive`, then filters by +extension, file name, exclude pattern, and file size. + +```python +source = GitHubSeedSource( + repositories=["NVIDIA-NeMo/DataDesigner"], + file_pattern="*", + recursive=True, + include_extensions=["py", "toml", "md"], + include_file_names=["Dockerfile", "Makefile"], + exclude_patterns=[ + ".git/**", + "**/__pycache__/**", + "**/build/**", + "**/dist/**", + "docs/generated/**", + ], + max_file_size_bytes=250_000, +) +``` + +Use `include_extensions=None` for broad repository inventory tasks where the +glob and exclude patterns should decide the candidate set. + +```python +source = GitHubSeedSource( + repositories=["owner/repo"], + file_pattern="LICENSE*", + recursive=False, + include_extensions=None, +) +``` + +## Typical workflows + +`data-designer-github` works best as the seed layer for file-level code +workflows: + +- Repository QA: score files for risky dependencies, missing license headers, or + stale implementation notes. +- Documentation generation: turn source files into module summaries, migration + notes, or API reference drafts. +- Test ideation: derive test scenarios from implementation files and route them + to a code-generation column. +- Code search preparation: create embeddings or labels from stable file content + and repository metadata. 
+- Dataset construction: sample representative code files from several projects + while preserving `repo_id`, `relative_path`, and `commit_sha` provenance. + +Because the reader emits full file content, prompts should account for file +length and language. A common pattern is to filter or sample seed rows first, +then generate focused columns that reference only the metadata and content each +task needs. + +## Operational notes + +The plugin requires `git` on `PATH`. GitHub repositories are cloned into a +temporary runtime directory for the reader attachment and local repositories are +read in place. Files that exceed `max_file_size_bytes` are skipped before +hydration. Files that cannot be decoded with `encoding` are skipped with a +warning rather than producing partial text. + +The reader does not call the GitHub API, manage credentials, or expand GitHub +issues and pull requests. It is scoped to repository file content so workflows +can compose repository-aware seed data with the rest of Data Designer. diff --git a/docs/plugins/index.md b/docs/plugins/index.md index 4e54e2e..d488636 100644 --- a/docs/plugins/index.md +++ b/docs/plugins/index.md @@ -5,6 +5,17 @@ Browse available Data Designer plugins by what they add to your data generation workflow.
+ + + data-designer-github + v0.1.0 + + GitHub and local git repository seed reader for Data Designer + + Column types + github + + data-designer-template diff --git a/plugins/data-designer-github/CODEOWNERS b/plugins/data-designer-github/CODEOWNERS new file mode 100644 index 0000000..e0e141b --- /dev/null +++ b/plugins/data-designer-github/CODEOWNERS @@ -0,0 +1,3 @@ +# Owner(s) of this plugin — used to generate the root CODEOWNERS file. +# GitHub accepts @username, @org/team, or email format. +* @eric-tramel diff --git a/plugins/data-designer-github/README.md b/plugins/data-designer-github/README.md new file mode 100644 index 0000000..68671e3 --- /dev/null +++ b/plugins/data-designer-github/README.md @@ -0,0 +1,51 @@ +# data-designer-github + +GitHub and local git repository seed reader for +[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner). + +## Installation + +```bash +pip install data-designer-github +``` + +## Usage + +This plugin provides a `github` seed source. Once installed, the seed reader is +automatically discovered by Data Designer. + +```python +from data_designer.config.config_builder import DataDesignerConfigBuilder +from data_designer.interface.data_designer import DataDesigner +from data_designer_github.config import GitHubSeedSource + +builder = DataDesignerConfigBuilder() +builder.with_seed_dataset( + GitHubSeedSource( + repositories=["NVIDIA-NeMo/DataDesigner"], + file_pattern="*.py", + recursive=True, + ) +) + +preview = DataDesigner().preview(builder, num_records=5) +print(preview.dataset[["repo_id", "relative_path", "code_lang", "content"]]) +``` + +The reader can also scan local git repositories: + +```python +builder.with_seed_dataset( + GitHubSeedSource( + path="/path/to/repos", + repository_paths=["/path/to/one/repo"], + file_pattern="*.py", + ) +) +``` + +Seed columns include repository metadata, file paths, language hints, file +content, and content SHA-256 hashes. 
+ +For the full plugin authoring guide, see the +[main repository docs](https://github.com/NVIDIA-NeMo/DataDesignerPlugins/blob/main/docs/adding-a-plugin.md). diff --git a/plugins/data-designer-github/docs/index.md b/plugins/data-designer-github/docs/index.md new file mode 100644 index 0000000..1243d43 --- /dev/null +++ b/plugins/data-designer-github/docs/index.md @@ -0,0 +1,79 @@ +# data-designer-github + +`data-designer-github` is a Data Designer seed reader for repository files. It +turns GitHub repositories or local git repositories into seed rows that carry +file content, path metadata, repository provenance, and commit identifiers. + +Use it when a workflow needs code repository data as the starting point for +generation, review, transformation, or indexing tasks. The reader is intentionally +file-oriented: each matching text file becomes one seed row, and downstream Data +Designer columns decide how to summarize, critique, rewrite, label, or enrich +that row. + +## Installation + +```bash +uv add data-designer data-designer-github +``` + +The plugin is discovered through the `data_designer.plugins` entry point once it +is installed in the same environment as Data Designer. + +## Seed source + +Use the `github` seed source when the seed dataset should come from one or more +repositories. + +| Field | Required | Description | +| --- | --- | --- | +| `path` | No | A local git repository path, or a directory whose immediate children are git repositories. | +| `repositories` | No | GitHub repositories to clone. Entries may be `owner/name`, `https://github.com/owner/name`, or `https://github.com/owner/name.git`. | +| `repository_paths` | No | Additional explicit local git repository paths to read. | +| `ref` | No | Branch, tag, or commit to check out for cloned GitHub repositories. | +| `clone_depth` | No | Shallow clone depth for GitHub repositories. Defaults to `1`; set to `None` for a full clone. 
| +| `clone_timeout_seconds` | No | Timeout for each clone or checkout operation. Defaults to `300`. | +| `file_pattern` | No | Inherited file glob from Data Designer's filesystem seed source. For example, `*.py`. | +| `recursive` | No | Whether `file_pattern` is applied recursively. | +| `include_extensions` | No | File extensions to include after the glob match. Defaults to common code and documentation extensions. Set to `None` to allow every extension. | +| `include_file_names` | No | Extensionless file names to include, such as `Dockerfile` and `Makefile`. | +| `exclude_patterns` | No | Relative path glob patterns to skip, including `.git`, cache, build, virtualenv, and dependency directories by default. | +| `max_file_size_bytes` | No | Maximum file size to hydrate into `content`. Defaults to `1_000_000`. | +| `encoding` | No | Text encoding used when reading file contents. Defaults to `utf-8`. | + +At least one of `path`, `repositories`, or `repository_paths` is required. + +## Output columns + +| Column | Description | +| --- | --- | +| `repo_id` | Repository identifier. GitHub repositories use `owner/name`; local repositories use their GitHub remote when available, otherwise the directory name. | +| `repo_url` | Remote origin URL when available. | +| `commit_sha` | Checked-out commit SHA for the repository. | +| `source_kind` | `github` for cloned repositories, or `git_repository` for local repositories. | +| `repository_path` | Local path used by the reader. GitHub repositories are cloned into a temporary runtime directory. | +| `source_path` | Absolute path to the file that produced the seed row. | +| `relative_path` | File path relative to the repository root. | +| `file_name` | Basename of the file. | +| `file_extension` | Lowercase file extension. | +| `code_lang` | Language hint inferred from the file name or extension. | +| `size_bytes` | File size at manifest time. | +| `content_sha256` | SHA-256 hash of the hydrated file bytes. 
| +| `content` | Decoded text content. | + +## Behavior + +When the reader is attached, it resolves local repository roots, clones any +configured GitHub repositories, records the checked-out commit, and builds a +manifest of matching files. File content is read during row hydration, so Data +Designer can batch and sample repository content using the same seed reader +interfaces as other filesystem-backed datasets. + +The plugin reads repository files only. It does not parse code into functions, +classes, symbols, dependency graphs, or AST nodes. If a workflow needs those +structures, use this reader to collect stable file-level inputs and add +downstream columns that perform the language-specific analysis. + +The plugin shells out to `git` for repository operations and does not manage +GitHub API tokens. Public repositories work directly. Private repositories +require the execution environment's git credential configuration to already have +access. diff --git a/plugins/data-designer-github/docs/usage.md b/plugins/data-designer-github/docs/usage.md new file mode 100644 index 0000000..7f96e8b --- /dev/null +++ b/plugins/data-designer-github/docs/usage.md @@ -0,0 +1,165 @@ +# Usage + +This tutorial walks through the common patterns for turning repositories into +Data Designer seed rows. The examples use the Python builder API, but the same +configuration fields apply when a workflow is built from serialized config. + +## Read a GitHub repository + +Start with a small repository and a narrow file pattern. This keeps previews +fast and makes it clear which rows are entering the workflow. 
+ +```python +from data_designer.config.config_builder import DataDesignerConfigBuilder +from data_designer.interface.data_designer import DataDesigner +from data_designer_github.config import GitHubSeedSource + +builder = DataDesignerConfigBuilder() +builder.with_seed_dataset( + GitHubSeedSource( + repositories=["pallets/markupsafe"], + file_pattern="*.py", + recursive=True, + ) +) + +builder.add_column( + name="_row_id", + column_type="sampler", + sampler_type="uuid", + params={}, +) + +preview = DataDesigner().preview(builder, num_records=5) +print(preview.dataset[["repo_id", "relative_path", "code_lang", "content"]]) +``` + +The seed rows contain repository provenance and file text. Downstream columns can +then ask questions such as "summarize this file", "identify risky APIs", "write +a short module description", or "extract candidate test scenarios" using the +`content`, `relative_path`, `code_lang`, and `commit_sha` columns. + +## Pin a branch, tag, or commit + +Use `ref` when the dataset must be reproducible against a specific branch, tag, +or commit. Branches and tags are passed to `git clone --branch`; commit SHAs are +checked out after cloning. + +```python +source = GitHubSeedSource( + repositories=["NVIDIA-NeMo/DataDesigner"], + ref="v0.5.7", + clone_depth=1, + file_pattern="*.py", + recursive=True, +) +``` + +For arbitrary commit SHAs, set `clone_depth=None` if the commit may not be +reachable from the shallow default clone. + +```python +source = GitHubSeedSource( + repositories=["NVIDIA-NeMo/DataDesigner"], + ref="0123456789abcdef0123456789abcdef01234567", + clone_depth=None, + file_pattern="*.py", + recursive=True, +) +``` + +## Read local repositories + +Local repositories are useful for private code, local experiments, or a checked +out monorepo that already exists on disk. 
+ +```python +source = GitHubSeedSource( + repository_paths=[ + "/workspace/services/api", + "/workspace/libraries/shared", + ], + file_pattern="*.py", + recursive=True, +) +``` + +If `path` points at a git repository, that repository is read. If `path` points +at a directory whose immediate children are git repositories, each child +repository is discovered and read. + +```python +source = GitHubSeedSource( + path="/workspace/repos", + file_pattern="*.ts", + recursive=True, +) +``` + +## Control which files become rows + +The reader first applies `file_pattern` and `recursive`, then filters by +extension, file name, exclude pattern, and file size. + +```python +source = GitHubSeedSource( + repositories=["NVIDIA-NeMo/DataDesigner"], + file_pattern="*", + recursive=True, + include_extensions=["py", "toml", "md"], + include_file_names=["Dockerfile", "Makefile"], + exclude_patterns=[ + ".git/**", + "**/__pycache__/**", + "**/build/**", + "**/dist/**", + "docs/generated/**", + ], + max_file_size_bytes=250_000, +) +``` + +Use `include_extensions=None` for broad repository inventory tasks where the +glob and exclude patterns should decide the candidate set. + +```python +source = GitHubSeedSource( + repositories=["owner/repo"], + file_pattern="LICENSE*", + recursive=False, + include_extensions=None, +) +``` + +## Typical workflows + +`data-designer-github` works best as the seed layer for file-level code +workflows: + +- Repository QA: score files for risky dependencies, missing license headers, or + stale implementation notes. +- Documentation generation: turn source files into module summaries, migration + notes, or API reference drafts. +- Test ideation: derive test scenarios from implementation files and route them + to a code-generation column. +- Code search preparation: create embeddings or labels from stable file content + and repository metadata. 
+- Dataset construction: sample representative code files from several projects + while preserving `repo_id`, `relative_path`, and `commit_sha` provenance. + +Because the reader emits full file content, prompts should account for file +length and language. A common pattern is to filter or sample seed rows first, +then generate focused columns that reference only the metadata and content each +task needs. + +## Operational notes + +The plugin requires `git` on `PATH`. GitHub repositories are cloned into a +temporary runtime directory for the reader attachment and local repositories are +read in place. Files that exceed `max_file_size_bytes` are skipped before +hydration. Files that cannot be decoded with `encoding` are skipped with a +warning rather than producing partial text. + +The reader does not call the GitHub API, manage credentials, or expand GitHub +issues and pull requests. It is scoped to repository file content so workflows +can compose repository-aware seed data with the rest of Data Designer. diff --git a/plugins/data-designer-github/pyproject.toml b/plugins/data-designer-github/pyproject.toml new file mode 100644 index 0000000..359c6f9 --- /dev/null +++ b/plugins/data-designer-github/pyproject.toml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +[project] +name = "data-designer-github" +version = "0.1.0" +description = "GitHub and local git repository seed reader for Data Designer" +requires-python = ">=3.10" +dependencies = [ + "data-designer>=0.5.7", +] +license = "Apache-2.0" +readme = "README.md" +authors = [ + {name = "NVIDIA Corporation"}, +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Programming Language :: Python :: 3", +] + +[project.entry-points."data_designer.plugins"] +github = "data_designer_github.plugin:plugin" + +[project.urls] +Repository = "https://github.com/NVIDIA-NeMo/DataDesignerPlugins" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/data_designer_github"] + +[tool.ruff] +extend = "../../pyproject.toml" diff --git a/plugins/data-designer-github/src/data_designer_github/__init__.py b/plugins/data-designer-github/src/data_designer_github/__init__.py new file mode 100644 index 0000000..52a7a9d --- /dev/null +++ b/plugins/data-designer-github/src/data_designer_github/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/plugins/data-designer-github/src/data_designer_github/config.py b/plugins/data-designer-github/src/data_designer_github/config.py new file mode 100644 index 0000000..6285de5 --- /dev/null +++ b/plugins/data-designer-github/src/data_designer_github/config.py @@ -0,0 +1,217 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
class GitHubSeedSource(FileSystemSeedSource, ConfigBase):
    """Configuration for a seed dataset built from repository files.

    Repositories can be named three ways, and at least one must be given:
    ``path`` (a local directory), ``repositories`` (GitHub specs to clone),
    and ``repository_paths`` (explicit local directories). The remaining
    fields control cloning and which files are admitted.
    """

    seed_type: Literal["github"] = "github"

    path: str | None = Field(
        default=None,
        description=(
            "Optional local git repository path, or a directory whose immediate children "
            "are git repositories. Relative paths are resolved from the current working "
            "directory when the config is loaded."
        ),
    )
    repositories: list[str] = Field(
        default_factory=list,
        description=(
            "GitHub repositories to clone before reading. "
            "Each entry may be 'owner/name', 'https://github.com/owner/name', "
            "or 'https://github.com/owner/name.git'."
        ),
    )
    repository_paths: list[str] = Field(
        default_factory=list,
        description="Additional local git repository paths to read.",
    )
    ref: str | None = Field(
        default=None,
        description="Optional branch, tag, or commit to check out after cloning GitHub repositories.",
    )
    clone_depth: int | None = Field(
        default=1,
        ge=1,
        description="Depth for GitHub clones. Set to null for a full clone.",
    )
    clone_timeout_seconds: int = Field(
        default=300,
        ge=1,
        description="Timeout for each git clone or checkout operation.",
    )
    include_extensions: list[str] | None = Field(
        # Copy the module default so instances never share one list object.
        default_factory=lambda: [*DEFAULT_CODE_EXTENSIONS],
        description=(
            "Lowercase file extensions to include. "
            "Values may include or omit the leading dot. "
            "Set to null to include every extension."
        ),
    )
    include_file_names: list[str] = Field(
        default_factory=lambda: [*DEFAULT_CODE_FILENAMES],
        description="Extensionless file names to include, such as Dockerfile or Makefile.",
    )
    exclude_patterns: list[str] = Field(
        default_factory=lambda: [*DEFAULT_EXCLUDE_PATTERNS],
        description="Relative path glob patterns to exclude from repository scans.",
    )
    max_file_size_bytes: int = Field(
        default=1_000_000,
        ge=1,
        description="Maximum file size to hydrate into the content column.",
    )
    encoding: str = Field(
        default="utf-8",
        description="Text encoding used when hydrating repository file contents.",
    )

    # Names of the fields that can supply repositories; used in the error message below.
    _source_fields: ClassVar[tuple[str, ...]] = ("path", "repositories", "repository_paths")

    @model_validator(mode="after")
    def validate_has_repository_source(self) -> Self:
        """Require that at least one repository source field is populated.

        Raises:
            ValueError: If ``path`` is ``None`` and both ``repositories`` and
                ``repository_paths`` are empty.
        """
        if self.path is not None or self.repositories or self.repository_paths:
            return self
        raise ValueError(f"At least one of {', '.join(self._source_fields)} must be provided.")

    @field_validator("encoding", mode="after")
    @classmethod
    def validate_encoding(cls, value: str) -> str:
        """Check that ``value`` names a codec Python can look up.

        Raises:
            ValueError: If ``codecs.lookup`` does not recognise the name.
        """
        try:
            codecs.lookup(value)
        except LookupError as lookup_error:
            raise ValueError(f"Unknown encoding: {value!r}. Use a valid Python codec name.") from lookup_error
        else:
            return value

    @field_validator("include_extensions", mode="after")
    @classmethod
    def normalize_include_extensions(cls, value: list[str] | None) -> list[str] | None:
        """Return extensions trimmed, lowercased, dot-prefixed, de-duplicated and sorted.

        ``None`` is passed through unchanged.

        Raises:
            ValueError: If any entry is empty once surrounding whitespace is removed.
        """
        if value is None:
            return None

        cleaned = [raw.strip().lower() for raw in value]
        if not all(cleaned):
            raise ValueError("include_extensions cannot contain empty values.")

        # Every entry is non-empty here, so indexing the first character is safe.
        dotted = {ext if ext[0] == "." else "." + ext for ext in cleaned}
        return sorted(dotted)

    @field_validator("include_file_names", "exclude_patterns", mode="after")
    @classmethod
    def validate_non_empty_strings(cls, value: list[str]) -> list[str]:
        """Reject lists containing an empty or whitespace-only entry; return the list unchanged.

        Raises:
            ValueError: If any entry is blank.
        """
        if any(not entry.strip() for entry in value):
            raise ValueError("String lists cannot contain empty values.")
        return value

    @field_validator("repositories", mode="after")
    @classmethod
    def validate_repositories(cls, value: list[str]) -> list[str]:
        """Reject repository specs that are empty or whitespace-only; return the list unchanged.

        Raises:
            ValueError: If any entry is blank.
        """
        if not all(spec.strip() for spec in value):
            raise ValueError("repositories cannot contain empty values.")
        return value

    @field_validator("repository_paths", mode="after")
    @classmethod
    def validate_repository_paths(cls, value: list[str]) -> list[str]:
        """Confirm that every explicit local repository path is an existing directory.

        Each path is user-expanded and resolved before the check. The
        configured strings themselves are returned unmodified.

        Raises:
            ValueError: For the first path that is not a directory.
        """
        for raw_path in value:
            resolved = Path(raw_path).expanduser().resolve()
            if resolved.is_dir():
                continue
            raise ValueError(f"Repository path {resolved} is not a directory.")
        return value

    @property
    def runtime_path(self) -> str:
        """Return the scan root, computing and caching it from ``path`` when unset.

        Raises:
            ValueError: If no runtime path has been stored and ``path`` is ``None``.
        """
        cached = self._runtime_path
        if cached is None:
            if self.path is None:
                raise ValueError("GitHubSeedSource.runtime_path is available after the seed reader is attached.")
            cached = str(Path(self.path).expanduser().resolve())
            self._runtime_path = cached
        return cached
def build_manifest(self, *, context: SeedReaderFileSystemContext) -> pd.DataFrame | list[dict[str, Any]]:
    """Collect the manifest rows of every prepared repository into one list.

    Repositories are visited in the order returned by
    ``_get_repository_roots``; within a repository, rows keep the order
    produced by ``_build_repository_manifest``.
    """
    roots = self._get_repository_roots(context)
    return [row for root in roots for row in self._build_repository_manifest(root)]
def _prepare_runtime_root(self) -> Path:
    """Create the temporary runtime root and resolve every configured repository.

    Local repositories are resolved first, then GitHub repositories are
    cloned into a ``github`` subdirectory of a fresh temporary directory.
    On success the resolved root is stored on ``self.source._runtime_path``
    and the repositories on ``self._repository_roots``.

    Returns:
        The resolved path of the temporary runtime directory.

    Raises:
        SeedReaderError: If no repository could be resolved. Errors raised
            while resolving or cloning repositories propagate unchanged.
    """
    temp_dir = tempfile.TemporaryDirectory(prefix="data-designer-github-")
    self._temp_dir = temp_dir
    try:
        runtime_root = Path(temp_dir.name).resolve()

        repository_roots = self._prepare_local_repositories()
        clone_root = runtime_root / "github"
        clone_root.mkdir(parents=True, exist_ok=True)
        repository_roots.extend(self._clone_github_repositories(clone_root))

        if not repository_roots:
            raise SeedReaderError("GitHub seed source did not resolve any repositories.")
    except BaseException:
        # A failed or interrupted clone can leave partial checkouts behind.
        # Remove them immediately instead of waiting for the next reset or
        # for garbage collection, and do not keep a handle to a dead directory.
        self._temp_dir = None
        temp_dir.cleanup()
        raise

    self.source._runtime_path = str(runtime_root)
    self._repository_roots = repository_roots
    return runtime_root
def _clone_repository(self, *, repo_url: str, destination: Path) -> None:
    """Clone ``repo_url`` into ``destination`` according to the source settings.

    A branch or tag ``ref`` is passed to ``git clone --branch`` so that a
    shallow clone already contains it. A ``ref`` that looks like a commit SHA
    cannot be given to ``--branch``, and a shallow clone only contains the
    tip of the default branch, so checking out any other commit afterwards
    would fail. For commit refs the depth limit is therefore skipped and the
    full history is cloned.

    Args:
        repo_url: Clone URL of the repository.
        destination: Directory the repository is cloned into.

    Raises:
        SeedReaderError: Propagated from ``_run_git`` when git is missing,
            the clone fails, or it exceeds ``clone_timeout_seconds``.
    """
    ref = self.source.ref
    ref_is_commit = ref is not None and _looks_like_commit_sha(ref)

    command = ["clone", "--quiet"]
    if ref is not None and not ref_is_commit:
        command.extend(["--branch", ref])
    # Only limit history when the wanted revision is guaranteed to be the
    # tip that the shallow clone fetches.
    if self.source.clone_depth is not None and not ref_is_commit:
        command.extend(["--depth", str(self.source.clone_depth)])
    command.extend([repo_url, str(destination)])
    _run_git(command, timeout=self.source.clone_timeout_seconds)
def normalize_github_repository(repository: str) -> tuple[str, str]:
    """Normalize a GitHub repository spec to ``(owner/name, clone_url)``.

    Accepted forms are ``owner/name``, ``http(s)://github.com/owner/name`` and
    ``git@github.com:owner/name``, each optionally carrying a ``.git`` suffix
    and surrounding slashes or whitespace.

    Args:
        repository: Repository spec as written in the seed source config.

    Returns:
        A tuple of the ``owner/name`` identifier and the HTTPS clone URL
        ``https://github.com/owner/name.git``.

    Raises:
        SeedReaderError: If the spec is a URL for a host other than
            github.com, cannot be parsed as a URL, or does not reduce to
            ``owner/name``.
    """
    stripped = repository.strip()
    try:
        parsed = urlparse(stripped)
    except ValueError as error:
        # urlparse rejects some malformed URLs (e.g. unbalanced brackets);
        # report them with the same exception type as every other bad spec.
        raise SeedReaderError(
            f"GitHub repository {repository!r} must use 'owner/name' or a github.com repository URL."
        ) from error

    if parsed.scheme in {"http", "https"}:
        if parsed.netloc.lower() != "github.com":
            raise SeedReaderError(f"Expected a github.com repository URL, got {repository!r}.")
        candidate = parsed.path
    elif stripped.startswith("git@github.com:"):
        candidate = stripped.removeprefix("git@github.com:")
    else:
        candidate = stripped

    # Trim slashes BEFORE dropping ".git": with the opposite order a spec such
    # as "owner/name.git/" keeps its suffix and produces a "....git.git" URL.
    # The final strip keeps accepting inputs the old order tolerated.
    repo_id = candidate.strip("/").removesuffix(".git").strip("/")

    if not re.fullmatch(r"[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+", repo_id):
        raise SeedReaderError(f"GitHub repository {repository!r} must use 'owner/name' or a github.com repository URL.")

    return repo_id, f"https://github.com/{repo_id}.git"
str(cwd), + capture_output=True, + text=True, + check=False, + timeout=timeout, + ) + except subprocess.TimeoutExpired as error: + raise SeedReaderError(f"git {' '.join(command)} timed out after {timeout} seconds") from error + + if result.returncode != 0: + detail = result.stderr.strip() or result.stdout.strip() + raise SeedReaderError(f"git {' '.join(command)} failed: {detail}") + + +def _repo_id_from_local_path(root_path: Path, remote_url: str | None) -> str: + if remote_url: + try: + repo_id, _ = normalize_github_repository(remote_url) + return repo_id + except SeedReaderError: + pass + return root_path.name + + +def _safe_repo_directory_name(repo_id: str) -> str: + return repo_id.replace("/", "__") + + +def _looks_like_commit_sha(ref: str) -> bool: + return re.fullmatch(r"[0-9a-fA-F]{7,40}", ref) is not None + + +def _detect_language(file_path: Path) -> str: + if file_path.name in LANGUAGE_BY_FILENAME: + return LANGUAGE_BY_FILENAME[file_path.name] + return LANGUAGE_BY_EXTENSION.get(file_path.suffix.lower(), file_path.suffix.lower().removeprefix(".")) diff --git a/plugins/data-designer-github/src/data_designer_github/plugin.py b/plugins/data-designer-github/src/data_designer_github/plugin.py new file mode 100644 index 0000000..8a81988 --- /dev/null +++ b/plugins/data-designer-github/src/data_designer_github/plugin.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
def test_normalize_github_repository() -> None:
    """Check that the short and the HTTPS ``.git`` spec yield the same repository id."""
    specs = (
        "NVIDIA-NeMo/DataDesigner",
        "https://github.com/NVIDIA-NeMo/DataDesigner.git",
    )
    for spec in specs:
        repo_id, _ = normalize_github_repository(spec)
        assert repo_id == "NVIDIA-NeMo/DataDesigner"
reader.attach(source, PlaintextResolver()) + + assert reader.get_seed_dataset_size() == 1 + batch = reader.create_batch_reader(batch_size=10, index_range=None, shuffle=False).read_next_batch() + rows = batch.to_pandas().to_dict(orient="records") + + assert len(rows) == 1 + row = rows[0] + assert row["repo_id"] == "sample-repo" + assert row["source_kind"] == "git_repository" + assert row["relative_path"] == "src/example.py" + assert row["file_name"] == "example.py" + assert row["file_extension"] == ".py" + assert row["code_lang"] == "python" + assert row["size_bytes"] > 0 + assert len(row["commit_sha"]) == 40 + assert len(row["content_sha256"]) == 64 + assert "def greet" in row["content"] + + +def test_parent_path_discovers_child_git_repositories(tmp_path: Path) -> None: + repo = _create_git_repo(tmp_path / "repos" / "child-repo") + source = GitHubSeedSource(path=str(repo.parent), file_pattern="*.py") + reader = GitHubSeedReader() + reader.attach(source, PlaintextResolver()) + + batch = reader.create_batch_reader(batch_size=10, index_range=None, shuffle=False).read_next_batch() + rows = batch.to_pandas().to_dict(orient="records") + + assert [row["repo_id"] for row in rows] == ["child-repo"] + + +def test_preview_uses_github_seed_reader(tmp_path: Path) -> None: + repo = _create_git_repo(tmp_path / "preview-repo") + builder = DataDesignerConfigBuilder() + builder.with_seed_dataset(GitHubSeedSource(repository_paths=[str(repo)], file_pattern="*.py")) + builder.add_column(name="_row_id", column_type="sampler", sampler_type="uuid", params={}) + + result = DataDesigner(artifact_path=tmp_path / "artifacts").preview(builder, num_records=1) + + assert result.dataset is not None + assert list(result.dataset["repo_id"]) == ["preview-repo"] + assert list(result.dataset["relative_path"]) == ["src/example.py"] + assert "def greet" in result.dataset["content"].iloc[0] + + +def _create_git_repo(path: Path) -> Path: + path.mkdir(parents=True) + src = path / "src" + src.mkdir() + (src 
/ "example.py").write_text( + "import os\n\n\ndef greet(name: str) -> str:\n return f'hello {name} from {os.getcwd()}'\n", + encoding="utf-8", + ) + (path / "README.md").write_text("# Sample\n", encoding="utf-8") + _git(path, "init", "--quiet") + _git(path, "config", "user.email", "test@example.com") + _git(path, "config", "user.name", "Test User") + _git(path, "add", ".") + _git(path, "commit", "--quiet", "-m", "initial") + return path + + +def _git(cwd: Path, *args: str) -> None: + subprocess.run(["git", *args], cwd=cwd, check=True) diff --git a/pyproject.toml b/pyproject.toml index cffc13f..838281c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ ignore = [ ] [tool.ruff.lint.isort] -known-first-party = ["ddp", "data_designer_template"] +known-first-party = ["ddp", "data_designer_github", "data_designer_template"] [tool.ruff.lint.flake8-tidy-imports] ban-relative-imports = "all" diff --git a/uv.lock b/uv.lock index 97c44fd..c8780c1 100644 --- a/uv.lock +++ b/uv.lock @@ -9,6 +9,7 @@ resolution-markers = [ [manifest] members = [ + "data-designer-github", "data-designer-plugins-workspace", "data-designer-template", "ddp", @@ -422,6 +423,17 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/35/d4/3844529ae989be9e63b0b8f47c28492793993427dc7d54d6d2a923ad2acc/data_designer_engine-0.5.7-py3-none-any.whl", hash = "sha256:75cd7d5ad0b230ddf75950ba7f97c9ad75c54887ad1247cdf623dc008e31a418", size = 631945, upload-time = "2026-04-17T22:03:08.584Z" }, ] +[[package]] +name = "data-designer-github" +version = "0.1.0" +source = { editable = "plugins/data-designer-github" } +dependencies = [ + { name = "data-designer" }, +] + +[package.metadata] +requires-dist = [{ name = "data-designer", specifier = ">=0.5.7" }] + [[package]] name = "data-designer-plugins-workspace" version = "0.0.0" diff --git a/zensical.toml b/zensical.toml index 3f1af80..8f3dec1 100644 --- a/zensical.toml +++ b/zensical.toml @@ -19,6 +19,10 @@ nav = [ {"Plugins" = [ {"Overview" = 
"plugins/index.md"}, # BEGIN GENERATED PLUGIN DOCS NAV + {"data-designer-github" = [ + {"Overview" = "plugins/data-designer-github/index.md"}, + {"Usage" = "plugins/data-designer-github/usage.md"}, + ]}, {"data-designer-template" = [ {"Overview" = "plugins/data-designer-template/index.md"}, {"Usage" = "plugins/data-designer-template/usage.md"},