From 817f31345fb345adad0188b98e5ea1332346267f Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 30 Mar 2026 17:13:53 +0530 Subject: [PATCH 01/23] add e2e tests for kernel builder cli. --- .github/workflows/test_e2e.yaml | 176 +++++++++++++ e2e-test-plan.md | 435 ++++++++++++++++++++++++++++++++ 2 files changed, 611 insertions(+) create mode 100644 .github/workflows/test_e2e.yaml create mode 100644 e2e-test-plan.md diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml new file mode 100644 index 00000000..21d1a108 --- /dev/null +++ b/.github/workflows/test_e2e.yaml @@ -0,0 +1,176 @@ +name: "E2E: kernel-builder init + build + upload + download" + +on: + push: + branches: [main] + pull_request: + branches: [main] + paths: + - "kernel-builder/**" + - "kernels/src/**" + - "nix-builder/**" + - "kernels-data/**" + - ".github/workflows/test_e2e.yaml" + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + init-and-build: + name: Init and build kernel + runs-on: + group: aws-highmemory-32-plus-nix + steps: + - uses: actions/checkout@v6 + - uses: DeterminateSystems/nix-installer-action@main + with: + extra-conf: | + max-jobs = 8 + cores = 12 + sandbox-fallback = false + - uses: cachix/cachix-action@v16 + with: + name: huggingface + authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}" + env: + USER: runner + + - name: Build kernel-builder CLI + run: cargo build --release --manifest-path kernel-builder/Cargo.toml + + - name: Init kernel project + run: | + cd /tmp + $GITHUB_WORKSPACE/target/release/kernel-builder init \ + --name kernels-test/e2e-test-kernel \ + --backends cuda + + - name: Validate scaffold + run: | + cd /tmp/e2e-test-kernel + test -f build.toml + test -f flake.nix + test -f torch-ext/e2e_test_kernel/__init__.py + test -f torch-ext/torch_binding.cpp + test -f torch-ext/torch_binding.h + test -f e2e_test_kernel_cuda/e2e_test_kernel.cu + test -f tests/test_e2e_test_kernel.py + test -f example.py + grep -q 'name = "e2e-test-kernel"' build.toml + grep -q 'repo-id = "kernels-test/e2e-test-kernel"' build.toml + grep -q 'backend = "cuda"' build.toml + + - name: Patch flake.nix to use local nix-builder + run: | + cd /tmp/e2e-test-kernel + sed -i 's|github:huggingface/kernels|path:'"$GITHUB_WORKSPACE"'|' flake.nix + + - name: Build one CUDA variant + run: | + cd /tmp/e2e-test-kernel + nix build .#ci -L + mkdir -p build + cp -rL result/* build/ + + - name: Verify build artifacts + run: | + cd /tmp/e2e-test-kernel + VARIANT_DIR=$(ls -d build/torch* | head -1) + echo "Built variant: $VARIANT_DIR" + test -f "$VARIANT_DIR/__init__.py" + test -f "$VARIANT_DIR/metadata.json" + ls "$VARIANT_DIR"/*.so + + - name: Upload built kernel + uses: actions/upload-artifact@v6 + with: + name: e2e-built-kernel + path: /tmp/e2e-test-kernel/ + + - name: Upload kernel-builder binary + uses: actions/upload-artifact@v6 + with: + name: kernel-builder-bin + path: ${{ github.workspace }}/target/release/kernel-builder + + upload: + name: Upload kernel to Hub + needs: init-and-build + runs-on: + group: aws-g6-12xlarge-plus + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + steps: + - name: Download built kernel + uses: actions/download-artifact@v7 + with: + name: e2e-built-kernel + path: /tmp/e2e-test-kernel + + - name: Download kernel-builder binary + uses: actions/download-artifact@v7 + with: + name: kernel-builder-bin + path: /tmp/bin + + - name: Make binary executable + run: chmod +x /tmp/bin/kernel-builder + + - name: Upload kernel to Hub + run: | + /tmp/bin/kernel-builder upload /tmp/e2e-test-kernel \ + --repo-id kernels-test/kernels-upload-test + + download-and-test: + name: Download and test kernel via get_kernel + needs: upload + runs-on: + group: aws-g6-12xlarge-plus + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + UV_PYTHON_PREFERENCE: only-managed + steps: + - uses: actions/checkout@v6 + + - name: Install uv and set Python version + uses: astral-sh/setup-uv@v7 + with: + python-version: "3.12" + + - name: Install Python deps + working-directory: ./kernels + run: | + uv sync --all-extras --dev + uv pip install torch==2.10.0 + + - name: Test get_kernel download and usage + working-directory: ./kernels + run: | + uv run python -c " + import torch + from kernels import get_kernel + + kernel = get_kernel('kernels-test/kernels-upload-test', version=1) + + x = torch.randn(1024, 1024, dtype=torch.float32, device='cuda') + result = kernel.e2e_test_kernel(x) + expected = x + 1.0 + torch.testing.assert_close(result, expected) + print('E2E test passed: get_kernel + correctness check') + " + + - name: Cleanup Hub repo + if: always() + working-directory: ./kernels + run: | + uv run python -c " + from huggingface_hub import HfApi + api = HfApi() + try: + api.delete_repo('kernels-test/kernels-upload-test') + print('Cleaned up repo') + except Exception as e: + print(f'Cleanup warning: {e}') + " diff --git a/e2e-test-plan.md b/e2e-test-plan.md new file mode 100644 index 00000000..dfc83fda --- /dev/null +++ b/e2e-test-plan.md @@ -0,0 +1,435 @@ +# E2E Test Plan: kernel-builder init + build + upload + get_kernel + +## Goal + +Validate the full lifecycle: `kernel-builder init` creates a valid project, the template builds successfully, the built kernel can be uploaded to the Hub, and `get_kernel()` can download and use it. + +--- + +## Overview + +A single GitHub Actions workflow with three jobs for fault isolation: + +1. **init-and-build** (Nix runner) -- `kernel-builder init`, validate scaffold, `nix build` one CUDA variant. +2. **upload** (GPU runner) -- `kernel-builder upload` the built artifacts to Hub. +3. **download-and-test** (GPU runner) -- `get_kernel()` download + correctness check, cleanup. + +If init+build fails, we know the template or build infra is broken. If upload fails, the Rust upload logic is broken. If download+test fails, the Python `get_kernel()` path is broken. Each job's failure points to a specific component. + +--- + +## Speed considerations + +- **Build exactly one variant.** Use the `ci` Nix target which selects one variant per framework (one CUDA variant in our case). +- **No matrix.** Single Python version (3.12), single Torch version (latest: 2.10), single backend (CUDA). +- **Cachix.** Leverage the existing `huggingface` Cachix cache so Nix derivations are fetched, not rebuilt. +- **Compile `kernel-builder` with `--release` once** in the build job and pass the binary via artifact to the GPU jobs (avoids compiling Rust twice). + +--- + +## Trigger paths + +The workflow should run on changes to the core components that affect this lifecycle: + +```yaml +on: + push: + branches: [main] + pull_request: + branches: [main] + paths: + - "kernel-builder/**" # CLI: init, build, upload commands + - "kernels/src/**" # Python library: get_kernel, variants, install + - "nix-builder/**" # Nix build infrastructure + - "kernels-data/**" # Shared config/data structures + workflow_dispatch: +``` + +This keeps the e2e tests focused on changes that could actually break the init/build/upload/download cycle, avoiding unnecessary runs on docs-only or CI config changes. + +--- + +## Job 1: `init-and-build` + +### What it validates + +- `kernel-builder init` produces a complete, well-formed project. +- The generated `build.toml` is valid. +- The project builds via Nix without errors. +- The build produces a working kernel variant. + +### Implementation + +**Runner:** `aws-highmemory-32-plus-nix` (matches existing `build_kernel.yaml`) + +#### Step 1: Build `kernel-builder` CLI + +```bash +cargo build --release --manifest-path kernel-builder/Cargo.toml +``` + +#### Step 2: Init a fresh kernel project + +```bash +cd /tmp +kernel-builder init --name kernels-test/e2e-test-kernel --backends cuda +``` + +Creates `/tmp/e2e-test-kernel/` with the full scaffold. + +#### Step 3: Validate the scaffold + +Quick checks that key files exist: + +```bash +cd /tmp/e2e-test-kernel +test -f build.toml +test -f flake.nix +test -f torch-ext/e2e_test_kernel/__init__.py +test -f torch-ext/torch_binding.cpp +test -f torch-ext/torch_binding.h +test -f e2e_test_kernel_cuda/e2e_test_kernel.cu +test -f tests/test_e2e_test_kernel.py +test -f example.py +``` + +Verify key fields in `build.toml`: +- `name = "e2e-test-kernel"` +- `repo-id = "kernels-test/e2e-test-kernel"` +- `backend = "cuda"` appears in a kernel section + +#### Step 4: Patch flake.nix for local nix-builder + +The init template generates a `flake.nix` that references `github:huggingface/kernels` (the remote repo). For CI, we need it to use the **local checkout** so the test validates the current code: + +```bash +cd /tmp/e2e-test-kernel +sed -i 's|github:huggingface/kernels|path:'"$GITHUB_WORKSPACE"'|' flake.nix +``` + +This changes `kernel-builder.url = "github:huggingface/kernels"` to `kernel-builder.url = "path:/path/to/checkout"`, matching how the example kernels work (`path:../../..`). + +#### Step 5: Build one CUDA variant with Nix + +```bash +cd /tmp/e2e-test-kernel +nix build .#ci -L +cp -rL result/* build/ +``` + +The `ci` Nix target builds exactly one variant per framework. This is the fastest path -- it's the same target the existing `build_kernel.yaml` CI uses. + +#### Step 6: Verify build artifacts + +```bash +VARIANT_DIR=$(ls -d build/torch* | head -1) +test -f "$VARIANT_DIR/__init__.py" +test -f "$VARIANT_DIR/metadata.json" +ls "$VARIANT_DIR"/*.so # At least one shared object +``` + +#### Step 7: Upload artifacts + +Upload the built kernel directory and `kernel-builder` binary for downstream jobs. + +--- + +## Job 2: `upload` + +### What it validates + +- `kernel-builder upload` successfully pushes build artifacts to a Hub repo. + +### Implementation + +**Runner:** `aws-g6-12xlarge-plus` (GPU runner -- needed for Job 3 anyway, reuse the same runner group) +**Depends on:** `init-and-build` + +#### Step 1: Download artifacts from Job 1 + +Download the built kernel directory and `kernel-builder` binary. + +#### Step 2: Upload kernel to Hub + +```bash +kernel-builder upload /tmp/e2e-test-kernel \ + --repo-id kernels-test/kernels-upload-test \ + --private +``` + +Uses `HF_TOKEN` secret for authentication. + +--- + +## Job 3: `download-and-test` + +### What it validates + +- `get_kernel()` can download the uploaded kernel and resolve the correct variant. +- The imported module is callable and produces correct results. + +### Implementation + +**Runner:** `aws-g6-12xlarge-plus` (GPU runner) +**Depends on:** `upload` + +#### Step 1: Install Python deps + +```bash +cd kernels +uv sync --all-extras --dev +uv pip install torch==2.10.0 +``` + +#### Step 2: Download and test via `get_kernel()` + +```python +import torch +from kernels import get_kernel + +kernel = get_kernel("kernels-test/kernels-upload-test", version=1) + +x = torch.randn(1024, 1024, dtype=torch.float32, device="cuda") +result = kernel.e2e_test_kernel(x) +expected = x + 1.0 +torch.testing.assert_close(result, expected) +print("E2E test passed!") +``` + +#### Step 3: Cleanup (always runs) + +Delete the Hub repo regardless of test outcome: + +```python +from huggingface_hub import HfApi +api = HfApi() +api.delete_repo("kernels-test/kernels-upload-test") +``` + +--- + +## Key risks and mitigations + +| Risk | Mitigation | +|------|------------| +| Init template `flake.nix` points to remote nix-builder | `sed` patch to use `path:$GITHUB_WORKSPACE` | +| Leftover repos on failure | `if: always()` cleanup step in Job 3 | +| Nix build is slow | Use `ci` target (one variant) + Cachix caching | +| Rust recompilation on GPU runner | Pass pre-built binary via artifact | + +--- + +## Complete workflow file + +```yaml +name: "E2E: kernel-builder init + build + upload + download" + +on: + push: + branches: [main] + pull_request: + branches: [main] + paths: + - "kernel-builder/**" + - "kernels/src/**" + - "nix-builder/**" + - "kernels-data/**" + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + init-and-build: + name: Init and build kernel + runs-on: + group: aws-highmemory-32-plus-nix + steps: + - uses: actions/checkout@v6 + - uses: DeterminateSystems/nix-installer-action@main + with: + extra-conf: | + max-jobs = 8 + cores = 12 + sandbox-fallback = false + - uses: cachix/cachix-action@v16 + with: + name: huggingface + authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}" + env: + USER: runner + + - name: Build kernel-builder CLI + run: cargo build --release --manifest-path kernel-builder/Cargo.toml + + - name: Init kernel project + run: | + cd /tmp + $GITHUB_WORKSPACE/target/release/kernel-builder init \ + --name kernels-test/e2e-test-kernel \ + --backends cuda + + - name: Validate scaffold + run: | + cd /tmp/e2e-test-kernel + test -f build.toml + test -f flake.nix + test -f torch-ext/e2e_test_kernel/__init__.py + test -f torch-ext/torch_binding.cpp + test -f torch-ext/torch_binding.h + test -f e2e_test_kernel_cuda/e2e_test_kernel.cu + test -f tests/test_e2e_test_kernel.py + test -f example.py + grep -q 'name = "e2e-test-kernel"' build.toml + grep -q 'repo-id = "kernels-test/e2e-test-kernel"' build.toml + grep -q 'backend = "cuda"' build.toml + + - name: Patch flake.nix to use local nix-builder + run: | + cd /tmp/e2e-test-kernel + sed -i 's|github:huggingface/kernels|path:'"$GITHUB_WORKSPACE"'|' flake.nix + + - name: Build one CUDA variant + run: | + cd /tmp/e2e-test-kernel + nix build .#ci -L + mkdir -p build + cp -rL result/* build/ + + - name: Verify build artifacts + run: | + cd /tmp/e2e-test-kernel + VARIANT_DIR=$(ls -d build/torch* | head -1) + echo "Built variant: $VARIANT_DIR" + test -f "$VARIANT_DIR/__init__.py" + test -f "$VARIANT_DIR/metadata.json" + ls "$VARIANT_DIR"/*.so + + - name: Upload built kernel + uses: actions/upload-artifact@v6 + with: + name: e2e-built-kernel + path: /tmp/e2e-test-kernel/ + + - name: Upload kernel-builder binary + uses: actions/upload-artifact@v6 + with: + name: kernel-builder-bin + path: ${{ github.workspace }}/target/release/kernel-builder + + upload: + name: Upload kernel to Hub + needs: init-and-build + runs-on: + group: aws-g6-12xlarge-plus + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + steps: + - name: Download built kernel + uses: actions/download-artifact@v7 + with: + name: e2e-built-kernel + path: /tmp/e2e-test-kernel + + - name: Download kernel-builder binary + uses: actions/download-artifact@v7 + with: + name: kernel-builder-bin + path: /tmp/bin + + - name: Make binary executable + run: chmod +x /tmp/bin/kernel-builder + + - name: Upload kernel to Hub + run: | + /tmp/bin/kernel-builder upload /tmp/e2e-test-kernel \ + --repo-id kernels-test/kernels-upload-test \ + + download-and-test: + name: Download and test kernel via get_kernel + needs: upload + runs-on: + group: aws-g6-12xlarge-plus + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + UV_PYTHON_PREFERENCE: only-managed + steps: + - uses: actions/checkout@v6 + + - name: Install uv and set Python version + uses: astral-sh/setup-uv@v7 + with: + python-version: "3.12" + + - name: Install Python deps + working-directory: ./kernels + run: | + uv sync --all-extras --dev + uv pip install torch==2.10.0 + + - name: Test get_kernel download and usage + working-directory: ./kernels + run: | + uv run python -c " + import torch + from kernels import get_kernel + + kernel = get_kernel('kernels-test/kernels-upload-test', version=1) + + x = torch.randn(1024, 1024, dtype=torch.float32, device='cuda') + result = kernel.e2e_test_kernel(x) + expected = x + 1.0 + torch.testing.assert_close(result, expected) + print('E2E test passed: get_kernel + correctness check') + " + + - name: Cleanup Hub repo + if: always() + working-directory: ./kernels + run: | + uv run python -c " + from huggingface_hub import HfApi + api = HfApi() + try: + api.delete_repo('kernels-test/kernels-upload-test') + print('Cleaned up repo') + except Exception as e: + print(f'Cleanup warning: {e}') + " +``` + +--- + +## Files to create + +| File | Purpose | +|------|---------| +| `.github/workflows/test_e2e.yaml` | The workflow above | + +No new Python test files needed -- the e2e test is self-contained in the workflow. + +--- + +## Execution flow + +``` +Job 1: init-and-build (Nix runner, no GPU) + 1. cargo build kernel-builder + 2. kernel-builder init --backends cuda + 3. validate scaffold files + build.toml + 4. patch flake.nix → local nix-builder + 5. nix build .#ci (one CUDA variant) + 6. verify artifacts (*.so, metadata.json) + → artifacts: built kernel dir + kernel-builder binary + +Job 2: upload (GPU runner) + 7. kernel-builder upload → Hub + ✗ Failure here = upload logic is broken + +Job 3: download-and-test (GPU runner) + 8. get_kernel() → download + import + 9. call kernel function + assert correctness + 10. delete Hub repo (always) + ✗ Failure here = Python get_kernel / variant resolution is broken +``` From 14a258abdccf911e624ed127ac7bc7cbca6b74ac Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 30 Mar 2026 17:31:58 +0530 Subject: [PATCH 02/23] fix more. --- .github/workflows/test_e2e.yaml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 21d1a108..7795289a 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -20,6 +20,8 @@ concurrency: jobs: init-and-build: name: Init and build kernel + outputs: + torch_version: ${{ steps.detect-variant.outputs.torch_version }} runs-on: group: aws-highmemory-32-plus-nix steps: @@ -74,7 +76,8 @@ jobs: mkdir -p build cp -rL result/* build/ - - name: Verify build artifacts + - name: Verify build artifacts and detect Torch version + id: detect-variant run: | cd /tmp/e2e-test-kernel VARIANT_DIR=$(ls -d build/torch* | head -1) @@ -83,6 +86,15 @@ jobs: test -f "$VARIANT_DIR/metadata.json" ls "$VARIANT_DIR"/*.so + # Extract Torch version from variant dir name (e.g. torch210 -> 2.10.0) + VARIANT_NAME=$(basename "$VARIANT_DIR") + TORCH_DIGITS=$(echo "$VARIANT_NAME" | grep -oP '(?<=torch)\d+') + TORCH_MAJOR=${TORCH_DIGITS:0:1} + TORCH_MINOR=${TORCH_DIGITS:1} + TORCH_VERSION="${TORCH_MAJOR}.${TORCH_MINOR}.0" + echo "Detected Torch version: $TORCH_VERSION" + echo "torch_version=$TORCH_VERSION" >> "$GITHUB_OUTPUT" + - name: Upload built kernel uses: actions/upload-artifact@v6 with: @@ -125,7 +137,7 @@ jobs: download-and-test: name: Download and test kernel via get_kernel - needs: upload + needs: [upload, init-and-build] runs-on: group: aws-g6-12xlarge-plus env: @@ -143,7 +155,7 @@ jobs: working-directory: ./kernels run: | uv sync --all-extras --dev - uv pip install torch==2.10.0 + uv pip install "torch==${{ needs.init-and-build.outputs.torch_version }}" - name: Test get_kernel download and usage working-directory: ./kernels From da37bc9f0d08366fbb4045b3ce8c736f7558afca Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 30 Mar 2026 17:42:44 +0530 Subject: [PATCH 03/23] fix cargo failure. --- .github/workflows/test_e2e.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 7795289a..d7d633e5 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -40,12 +40,12 @@ jobs: USER: runner - name: Build kernel-builder CLI - run: cargo build --release --manifest-path kernel-builder/Cargo.toml + run: nix build .#kernel-builder -L - name: Init kernel project run: | cd /tmp - $GITHUB_WORKSPACE/target/release/kernel-builder init \ + $GITHUB_WORKSPACE/result/bin/kernel-builder init \ --name kernels-test/e2e-test-kernel \ --backends cuda @@ -105,7 +105,7 @@ jobs: uses: actions/upload-artifact@v6 with: name: kernel-builder-bin - path: ${{ github.workspace }}/target/release/kernel-builder + path: ${{ github.workspace }}/result/bin/kernel-builder upload: name: Upload kernel to Hub From fe582588b24377984671792c2067a9c575df251d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 30 Mar 2026 18:44:57 +0530 Subject: [PATCH 04/23] empty From db6abc902a9bb25a3d7380f36e7a3657bb337073 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 30 Mar 2026 20:19:15 +0530 Subject: [PATCH 05/23] fix cachix. --- .github/workflows/test_e2e.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index d7d633e5..26c65071 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -111,10 +111,11 @@ jobs: name: Upload kernel to Hub needs: init-and-build runs-on: - group: aws-g6-12xlarge-plus + group: aws-highmemory-32-plus-nix env: HF_TOKEN: ${{ secrets.HF_TOKEN }} steps: + - uses: DeterminateSystems/nix-installer-action@main - name: Download built kernel uses: actions/download-artifact@v7 with: From 7abe3d8ed54863b996cde4bc9a338fdff3a76ba4 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 30 Mar 2026 20:32:53 +0530 Subject: [PATCH 06/23] up --- .github/workflows/test_e2e.yaml | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 26c65071..66683326 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -101,12 +101,6 @@ jobs: name: e2e-built-kernel path: /tmp/e2e-test-kernel/ - - name: Upload kernel-builder binary - uses: actions/upload-artifact@v6 - with: - name: kernel-builder-bin - path: ${{ github.workspace }}/result/bin/kernel-builder - upload: name: Upload kernel to Hub needs: init-and-build @@ -115,25 +109,27 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} steps: + - uses: actions/checkout@v6 - uses: DeterminateSystems/nix-installer-action@main + - uses: cachix/cachix-action@v16 + with: + name: huggingface + authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}" + env: + USER: runner + + - name: Build kernel-builder CLI + run: nix build .#kernel-builder -L + - name: Download built kernel uses: actions/download-artifact@v7 with: name: e2e-built-kernel path: /tmp/e2e-test-kernel - - name: Download kernel-builder binary - uses: actions/download-artifact@v7 - with: - name: kernel-builder-bin - path: /tmp/bin - - - name: Make binary executable - run: chmod +x /tmp/bin/kernel-builder - - name: Upload kernel to Hub run: | - /tmp/bin/kernel-builder upload /tmp/e2e-test-kernel \ + ./result/bin/kernel-builder upload /tmp/e2e-test-kernel \ --repo-id kernels-test/kernels-upload-test download-and-test: From 1031a965158d2159cfff4dcdcf8a3ed0c66d1222 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 30 Mar 2026 20:48:45 +0530 Subject: [PATCH 07/23] up --- .github/workflows/test_e2e.yaml | 44 ++++++--------------------------- 1 file changed, 7 insertions(+), 37 deletions(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 66683326..aa3c9736 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -18,12 +18,14 @@ concurrency: cancel-in-progress: true jobs: - init-and-build: - name: Init and build kernel + init-build-upload: + name: Init, build, and upload kernel outputs: torch_version: ${{ steps.detect-variant.outputs.torch_version }} runs-on: group: aws-highmemory-32-plus-nix + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} steps: - uses: actions/checkout@v6 - uses: DeterminateSystems/nix-installer-action@main @@ -95,46 +97,14 @@ jobs: echo "Detected Torch version: $TORCH_VERSION" echo "torch_version=$TORCH_VERSION" >> "$GITHUB_OUTPUT" - - name: Upload built kernel - uses: actions/upload-artifact@v6 - with: - name: e2e-built-kernel - path: /tmp/e2e-test-kernel/ - - upload: - name: Upload kernel to Hub - needs: init-and-build - runs-on: - group: aws-highmemory-32-plus-nix - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - steps: - - uses: actions/checkout@v6 - - uses: DeterminateSystems/nix-installer-action@main - - uses: cachix/cachix-action@v16 - with: - name: huggingface - authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}" - env: - USER: runner - - - name: Build kernel-builder CLI - run: nix build .#kernel-builder -L - - - name: Download built kernel - uses: actions/download-artifact@v7 - with: - name: e2e-built-kernel - path: /tmp/e2e-test-kernel - - name: Upload kernel to Hub run: | - ./result/bin/kernel-builder upload /tmp/e2e-test-kernel \ + $GITHUB_WORKSPACE/result/bin/kernel-builder upload /tmp/e2e-test-kernel \ --repo-id kernels-test/kernels-upload-test download-and-test: name: Download and test kernel via get_kernel - needs: [upload, init-and-build] + needs: init-build-upload runs-on: group: aws-g6-12xlarge-plus env: @@ -152,7 +122,7 @@ jobs: working-directory: ./kernels run: | uv sync --all-extras --dev - uv pip install "torch==${{ needs.init-and-build.outputs.torch_version }}" + uv pip install "torch==${{ needs.init-build-upload.outputs.torch_version }}" - name: Test get_kernel download and usage working-directory: ./kernels From 4af65e016e3943fa5ee8a201821ce402021100ca Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 09:16:47 +0530 Subject: [PATCH 08/23] remove plan --- e2e-test-plan.md | 435 ----------------------------------------------- 1 file changed, 435 deletions(-) delete mode 100644 e2e-test-plan.md diff --git a/e2e-test-plan.md b/e2e-test-plan.md deleted file mode 100644 index dfc83fda..00000000 --- a/e2e-test-plan.md +++ /dev/null @@ -1,435 +0,0 @@ -# E2E Test Plan: kernel-builder init + build + upload + get_kernel - -## Goal - -Validate the full lifecycle: `kernel-builder init` creates a valid project, the template builds successfully, the built kernel can be uploaded to the Hub, and `get_kernel()` can download and use it. - ---- - -## Overview - -A single GitHub Actions workflow with three jobs for fault isolation: - -1. **init-and-build** (Nix runner) -- `kernel-builder init`, validate scaffold, `nix build` one CUDA variant. -2. **upload** (GPU runner) -- `kernel-builder upload` the built artifacts to Hub. -3. **download-and-test** (GPU runner) -- `get_kernel()` download + correctness check, cleanup. - -If init+build fails, we know the template or build infra is broken. If upload fails, the Rust upload logic is broken. If download+test fails, the Python `get_kernel()` path is broken. Each job's failure points to a specific component. - ---- - -## Speed considerations - -- **Build exactly one variant.** Use the `ci` Nix target which selects one variant per framework (one CUDA variant in our case). -- **No matrix.** Single Python version (3.12), single Torch version (latest: 2.10), single backend (CUDA). -- **Cachix.** Leverage the existing `huggingface` Cachix cache so Nix derivations are fetched, not rebuilt. -- **Compile `kernel-builder` with `--release` once** in the build job and pass the binary via artifact to the GPU jobs (avoids compiling Rust twice). - ---- - -## Trigger paths - -The workflow should run on changes to the core components that affect this lifecycle: - -```yaml -on: - push: - branches: [main] - pull_request: - branches: [main] - paths: - - "kernel-builder/**" # CLI: init, build, upload commands - - "kernels/src/**" # Python library: get_kernel, variants, install - - "nix-builder/**" # Nix build infrastructure - - "kernels-data/**" # Shared config/data structures - workflow_dispatch: -``` - -This keeps the e2e tests focused on changes that could actually break the init/build/upload/download cycle, avoiding unnecessary runs on docs-only or CI config changes. - ---- - -## Job 1: `init-and-build` - -### What it validates - -- `kernel-builder init` produces a complete, well-formed project. -- The generated `build.toml` is valid. -- The project builds via Nix without errors. -- The build produces a working kernel variant. - -### Implementation - -**Runner:** `aws-highmemory-32-plus-nix` (matches existing `build_kernel.yaml`) - -#### Step 1: Build `kernel-builder` CLI - -```bash -cargo build --release --manifest-path kernel-builder/Cargo.toml -``` - -#### Step 2: Init a fresh kernel project - -```bash -cd /tmp -kernel-builder init --name kernels-test/e2e-test-kernel --backends cuda -``` - -Creates `/tmp/e2e-test-kernel/` with the full scaffold. - -#### Step 3: Validate the scaffold - -Quick checks that key files exist: - -```bash -cd /tmp/e2e-test-kernel -test -f build.toml -test -f flake.nix -test -f torch-ext/e2e_test_kernel/__init__.py -test -f torch-ext/torch_binding.cpp -test -f torch-ext/torch_binding.h -test -f e2e_test_kernel_cuda/e2e_test_kernel.cu -test -f tests/test_e2e_test_kernel.py -test -f example.py -``` - -Verify key fields in `build.toml`: -- `name = "e2e-test-kernel"` -- `repo-id = "kernels-test/e2e-test-kernel"` -- `backend = "cuda"` appears in a kernel section - -#### Step 4: Patch flake.nix for local nix-builder - -The init template generates a `flake.nix` that references `github:huggingface/kernels` (the remote repo). For CI, we need it to use the **local checkout** so the test validates the current code: - -```bash -cd /tmp/e2e-test-kernel -sed -i 's|github:huggingface/kernels|path:'"$GITHUB_WORKSPACE"'|' flake.nix -``` - -This changes `kernel-builder.url = "github:huggingface/kernels"` to `kernel-builder.url = "path:/path/to/checkout"`, matching how the example kernels work (`path:../../..`). - -#### Step 5: Build one CUDA variant with Nix - -```bash -cd /tmp/e2e-test-kernel -nix build .#ci -L -cp -rL result/* build/ -``` - -The `ci` Nix target builds exactly one variant per framework. This is the fastest path -- it's the same target the existing `build_kernel.yaml` CI uses. - -#### Step 6: Verify build artifacts - -```bash -VARIANT_DIR=$(ls -d build/torch* | head -1) -test -f "$VARIANT_DIR/__init__.py" -test -f "$VARIANT_DIR/metadata.json" -ls "$VARIANT_DIR"/*.so # At least one shared object -``` - -#### Step 7: Upload artifacts - -Upload the built kernel directory and `kernel-builder` binary for downstream jobs. - ---- - -## Job 2: `upload` - -### What it validates - -- `kernel-builder upload` successfully pushes build artifacts to a Hub repo. - -### Implementation - -**Runner:** `aws-g6-12xlarge-plus` (GPU runner -- needed for Job 3 anyway, reuse the same runner group) -**Depends on:** `init-and-build` - -#### Step 1: Download artifacts from Job 1 - -Download the built kernel directory and `kernel-builder` binary. - -#### Step 2: Upload kernel to Hub - -```bash -kernel-builder upload /tmp/e2e-test-kernel \ - --repo-id kernels-test/kernels-upload-test \ - --private -``` - -Uses `HF_TOKEN` secret for authentication. - ---- - -## Job 3: `download-and-test` - -### What it validates - -- `get_kernel()` can download the uploaded kernel and resolve the correct variant. -- The imported module is callable and produces correct results. - -### Implementation - -**Runner:** `aws-g6-12xlarge-plus` (GPU runner) -**Depends on:** `upload` - -#### Step 1: Install Python deps - -```bash -cd kernels -uv sync --all-extras --dev -uv pip install torch==2.10.0 -``` - -#### Step 2: Download and test via `get_kernel()` - -```python -import torch -from kernels import get_kernel - -kernel = get_kernel("kernels-test/kernels-upload-test", version=1) - -x = torch.randn(1024, 1024, dtype=torch.float32, device="cuda") -result = kernel.e2e_test_kernel(x) -expected = x + 1.0 -torch.testing.assert_close(result, expected) -print("E2E test passed!") -``` - -#### Step 3: Cleanup (always runs) - -Delete the Hub repo regardless of test outcome: - -```python -from huggingface_hub import HfApi -api = HfApi() -api.delete_repo("kernels-test/kernels-upload-test") -``` - ---- - -## Key risks and mitigations - -| Risk | Mitigation | -|------|------------| -| Init template `flake.nix` points to remote nix-builder | `sed` patch to use `path:$GITHUB_WORKSPACE` | -| Leftover repos on failure | `if: always()` cleanup step in Job 3 | -| Nix build is slow | Use `ci` target (one variant) + Cachix caching | -| Rust recompilation on GPU runner | Pass pre-built binary via artifact | - ---- - -## Complete workflow file - -```yaml -name: "E2E: kernel-builder init + build + upload + download" - -on: - push: - branches: [main] - pull_request: - branches: [main] - paths: - - "kernel-builder/**" - - "kernels/src/**" - - "nix-builder/**" - - "kernels-data/**" - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - init-and-build: - name: Init and build kernel - runs-on: - group: aws-highmemory-32-plus-nix - steps: - - uses: actions/checkout@v6 - - uses: DeterminateSystems/nix-installer-action@main - with: - extra-conf: | - max-jobs = 8 - cores = 12 - sandbox-fallback = false - - uses: cachix/cachix-action@v16 - with: - name: huggingface - authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}" - env: - USER: runner - - - name: Build kernel-builder CLI - run: cargo build --release --manifest-path kernel-builder/Cargo.toml - - - name: Init kernel project - run: | - cd /tmp - $GITHUB_WORKSPACE/target/release/kernel-builder init \ - --name kernels-test/e2e-test-kernel \ - --backends cuda - - - name: Validate scaffold - run: | - cd /tmp/e2e-test-kernel - test -f build.toml - test -f flake.nix - test -f torch-ext/e2e_test_kernel/__init__.py - test -f torch-ext/torch_binding.cpp - test -f torch-ext/torch_binding.h - test -f e2e_test_kernel_cuda/e2e_test_kernel.cu - test -f tests/test_e2e_test_kernel.py - test -f example.py - grep -q 'name = "e2e-test-kernel"' build.toml - grep -q 'repo-id = "kernels-test/e2e-test-kernel"' build.toml - grep -q 'backend = "cuda"' build.toml - - - name: Patch flake.nix to use local nix-builder - run: | - cd /tmp/e2e-test-kernel - sed -i 's|github:huggingface/kernels|path:'"$GITHUB_WORKSPACE"'|' flake.nix - - - name: Build one CUDA variant - run: | - cd /tmp/e2e-test-kernel - nix build .#ci -L - mkdir -p build - cp -rL result/* build/ - - - name: Verify build artifacts - run: | - cd /tmp/e2e-test-kernel - VARIANT_DIR=$(ls -d build/torch* | head -1) - echo "Built variant: $VARIANT_DIR" - test -f "$VARIANT_DIR/__init__.py" - test -f "$VARIANT_DIR/metadata.json" - ls "$VARIANT_DIR"/*.so - - - name: Upload built kernel - uses: actions/upload-artifact@v6 - with: - name: e2e-built-kernel - path: /tmp/e2e-test-kernel/ - - - name: Upload kernel-builder binary - uses: actions/upload-artifact@v6 - with: - name: kernel-builder-bin - path: ${{ github.workspace }}/target/release/kernel-builder - - upload: - name: Upload kernel to Hub - needs: init-and-build - runs-on: - group: aws-g6-12xlarge-plus - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - steps: - - name: Download built kernel - uses: actions/download-artifact@v7 - with: - name: e2e-built-kernel - path: /tmp/e2e-test-kernel - - - name: Download kernel-builder binary - uses: actions/download-artifact@v7 - with: - name: kernel-builder-bin - path: /tmp/bin - - - name: Make binary executable - run: chmod +x /tmp/bin/kernel-builder - - - name: Upload kernel to Hub - run: | - /tmp/bin/kernel-builder upload /tmp/e2e-test-kernel \ - --repo-id kernels-test/kernels-upload-test \ - - download-and-test: - name: Download and test kernel via get_kernel - needs: upload - runs-on: - group: aws-g6-12xlarge-plus - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - UV_PYTHON_PREFERENCE: only-managed - steps: - - uses: actions/checkout@v6 - - - name: Install uv and set Python version - uses: astral-sh/setup-uv@v7 - with: - python-version: "3.12" - - - name: Install Python deps - working-directory: ./kernels - run: | - uv sync --all-extras --dev - uv pip install torch==2.10.0 - - - name: Test get_kernel download and usage - working-directory: ./kernels - run: | - uv run python -c " - import torch - from kernels import get_kernel - - kernel = get_kernel('kernels-test/kernels-upload-test', version=1) - - x = torch.randn(1024, 1024, dtype=torch.float32, device='cuda') - result = kernel.e2e_test_kernel(x) - expected = x + 1.0 - torch.testing.assert_close(result, expected) - print('E2E test passed: get_kernel + correctness check') - " - - - name: Cleanup Hub repo - if: always() - working-directory: ./kernels - run: | - uv run python -c " - from huggingface_hub import HfApi - api = HfApi() - try: - api.delete_repo('kernels-test/kernels-upload-test') - print('Cleaned up repo') - except Exception as e: - print(f'Cleanup warning: {e}') - " -``` - ---- - -## Files to create - -| File | Purpose | -|------|---------| -| `.github/workflows/test_e2e.yaml` | The workflow above | - -No new Python test files needed -- the e2e test is self-contained in the workflow. - ---- - -## Execution flow - -``` -Job 1: init-and-build (Nix runner, no GPU) - 1. cargo build kernel-builder - 2. kernel-builder init --backends cuda - 3. validate scaffold files + build.toml - 4. patch flake.nix → local nix-builder - 5. nix build .#ci (one CUDA variant) - 6. verify artifacts (*.so, metadata.json) - → artifacts: built kernel dir + kernel-builder binary - -Job 2: upload (GPU runner) - 7. kernel-builder upload → Hub - ✗ Failure here = upload logic is broken - -Job 3: download-and-test (GPU runner) - 8. get_kernel() → download + import - 9. call kernel function + assert correctness - 10. delete Hub repo (always) - ✗ Failure here = Python get_kernel / variant resolution is broken -``` From d1152ffecd6b092a010159a9cc51c855f5dbe05a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 09:24:13 +0530 Subject: [PATCH 09/23] up --- .github/workflows/test_e2e.yaml | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index aa3c9736..d4f2073e 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -13,6 +13,9 @@ on: - ".github/workflows/test_e2e.yaml" workflow_dispatch: +env: + E2E_REPO_ID: kernels-test/kernels-upload-test + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true @@ -100,7 +103,7 @@ jobs: - name: Upload kernel to Hub run: | $GITHUB_WORKSPACE/result/bin/kernel-builder upload /tmp/e2e-test-kernel \ - --repo-id kernels-test/kernels-upload-test + --repo-id ${{ env.E2E_REPO_ID }} download-and-test: name: Download and test kernel via get_kernel @@ -131,7 +134,7 @@ jobs: import torch from kernels import get_kernel - kernel = get_kernel('kernels-test/kernels-upload-test', version=1) + kernel = get_kernel('${{ env.E2E_REPO_ID }}', version=1) x = torch.randn(1024, 1024, dtype=torch.float32, device='cuda') result = kernel.e2e_test_kernel(x) @@ -143,13 +146,4 @@ jobs: - name: Cleanup Hub repo if: always() working-directory: ./kernels - run: | - uv run python -c " - from huggingface_hub import HfApi - api = HfApi() - try: - api.delete_repo('kernels-test/kernels-upload-test') - print('Cleaned up repo') - except Exception as e: - print(f'Cleanup warning: {e}') - " + run: uv run huggingface-cli delete-repo ${{ env.E2E_REPO_ID }} --yes || true From 035577a9440c5431cc956264ae08070ba0b19ec8 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 31 Mar 2026 14:27:50 +0530 Subject: [PATCH 10/23] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Daniël de Kok --- .github/workflows/test_e2e.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index d4f2073e..5a7f882a 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -50,7 +50,7 @@ jobs: - name: Init kernel project run: | cd /tmp - $GITHUB_WORKSPACE/result/bin/kernel-builder init \ + nix run $GITHUB_WORKSPACE#kernel-builder -- init \ --name kernels-test/e2e-test-kernel \ --backends cuda From 380bf24eac03281f50ba73394a10e904fcc1fed3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 14:42:31 +0530 Subject: [PATCH 11/23] use kernel-builder buil --- .github/workflows/test_e2e.yaml | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 5a7f882a..6dd84963 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -23,8 +23,6 @@ concurrency: jobs: init-build-upload: name: Init, build, and upload kernel - outputs: - torch_version: ${{ steps.detect-variant.outputs.torch_version }} runs-on: group: aws-highmemory-32-plus-nix env: @@ -44,9 +42,6 @@ jobs: env: USER: runner - - name: Build kernel-builder CLI - run: nix build .#kernel-builder -L - - name: Init kernel project run: | cd /tmp @@ -74,15 +69,12 @@ jobs: cd /tmp/e2e-test-kernel sed -i 's|github:huggingface/kernels|path:'"$GITHUB_WORKSPACE"'|' flake.nix - - name: Build one CUDA variant + - name: Build kernel run: | cd /tmp/e2e-test-kernel - nix build .#ci -L - mkdir -p build - cp -rL result/* build/ + nix run $GITHUB_WORKSPACE#kernel-builder -- build . -L - - name: Verify build artifacts and detect Torch version - id: detect-variant + - name: Verify build artifacts run: | cd /tmp/e2e-test-kernel VARIANT_DIR=$(ls -d build/torch* | head -1) @@ -91,18 +83,9 @@ jobs: test -f "$VARIANT_DIR/metadata.json" ls "$VARIANT_DIR"/*.so - # Extract Torch version from variant dir name (e.g. torch210 -> 2.10.0) - VARIANT_NAME=$(basename "$VARIANT_DIR") - TORCH_DIGITS=$(echo "$VARIANT_NAME" | grep -oP '(?<=torch)\d+') - TORCH_MAJOR=${TORCH_DIGITS:0:1} - TORCH_MINOR=${TORCH_DIGITS:1} - TORCH_VERSION="${TORCH_MAJOR}.${TORCH_MINOR}.0" - echo "Detected Torch version: $TORCH_VERSION" - echo "torch_version=$TORCH_VERSION" >> "$GITHUB_OUTPUT" - - name: Upload kernel to Hub run: | - $GITHUB_WORKSPACE/result/bin/kernel-builder upload /tmp/e2e-test-kernel \ + nix run $GITHUB_WORKSPACE#kernel-builder -- upload /tmp/e2e-test-kernel \ --repo-id ${{ env.E2E_REPO_ID }} download-and-test: @@ -125,7 +108,7 @@ jobs: working-directory: ./kernels run: | uv sync --all-extras --dev - uv pip install "torch==${{ needs.init-build-upload.outputs.torch_version }}" + uv pip install torch - name: Test get_kernel download and usage working-directory: ./kernels From 8a721f8a0da3645b18e3a9d4595a1c4643d41e93 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 14:59:20 +0530 Subject: [PATCH 12/23] add --- .github/workflows/test_e2e.yaml | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 6dd84963..113a5e40 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -15,6 +15,7 @@ on: env: E2E_REPO_ID: kernels-test/kernels-upload-test + E2E_BRANCH: e2e-${{ github.event.pull_request.number || github.run_id }}-${{ github.run_attempt }} concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -86,7 +87,8 @@ jobs: - name: Upload kernel to Hub run: | nix run $GITHUB_WORKSPACE#kernel-builder -- upload /tmp/e2e-test-kernel \ - --repo-id ${{ env.E2E_REPO_ID }} + --repo-id ${{ env.E2E_REPO_ID }} \ + --branch ${{ env.E2E_BRANCH }} download-and-test: name: Download and test kernel via get_kernel @@ -117,7 +119,7 @@ jobs: import torch from kernels import get_kernel - kernel = get_kernel('${{ env.E2E_REPO_ID }}', version=1) + kernel = get_kernel('${{ env.E2E_REPO_ID }}', revision='${{ env.E2E_BRANCH }}') x = torch.randn(1024, 1024, dtype=torch.float32, device='cuda') result = kernel.e2e_test_kernel(x) @@ -125,8 +127,3 @@ jobs: torch.testing.assert_close(result, expected) print('E2E test passed: get_kernel + correctness check') " - - - name: Cleanup Hub repo - if: always() - working-directory: ./kernels - run: uv run huggingface-cli delete-repo ${{ env.E2E_REPO_ID }} --yes || true From 87f57fcb11be82ccd79d365d6e9842904c192dfa Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 31 Mar 2026 15:04:42 +0530 Subject: [PATCH 13/23] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Daniël de Kok --- .github/workflows/test_e2e.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 113a5e40..75f54e2d 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -47,7 +47,7 @@ jobs: run: | cd /tmp nix run $GITHUB_WORKSPACE#kernel-builder -- init \ - --name kernels-test/e2e-test-kernel \ + --name ${{ env.E2E_REPO_ID }} \ --backends cuda - name: Validate scaffold From 13d526830689d6cc9e69305ee0a863609de8639c Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 15:17:05 +0530 Subject: [PATCH 14/23] fix more --- .github/workflows/test_e2e.yaml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 75f54e2d..ca63cb9c 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -52,32 +52,32 @@ jobs: - name: Validate scaffold run: | - cd /tmp/e2e-test-kernel + cd /tmp/kernels-upload-test test -f build.toml test -f flake.nix - test -f torch-ext/e2e_test_kernel/__init__.py + test -f torch-ext/kernels_upload_test/__init__.py test -f torch-ext/torch_binding.cpp test -f torch-ext/torch_binding.h - test -f e2e_test_kernel_cuda/e2e_test_kernel.cu - test -f tests/test_e2e_test_kernel.py + test -f kernels_upload_test_cuda/kernels_upload_test.cu + test -f tests/test_kernels_upload_test.py test -f example.py - grep -q 'name = "e2e-test-kernel"' build.toml - grep -q 'repo-id = "kernels-test/e2e-test-kernel"' build.toml + grep -q 'name = "kernels-upload-test"' build.toml + grep -q 'repo-id = "kernels-test/kernels-upload-test"' build.toml grep -q 'backend = "cuda"' build.toml - name: Patch flake.nix to use local nix-builder run: | - cd /tmp/e2e-test-kernel + cd /tmp/kernels-upload-test sed -i 's|github:huggingface/kernels|path:'"$GITHUB_WORKSPACE"'|' flake.nix - name: Build kernel run: | - cd /tmp/e2e-test-kernel + cd /tmp/kernels-upload-test nix run $GITHUB_WORKSPACE#kernel-builder -- build . -L - name: Verify build artifacts run: | - cd /tmp/e2e-test-kernel + cd /tmp/kernels-upload-test VARIANT_DIR=$(ls -d build/torch* | head -1) echo "Built variant: $VARIANT_DIR" test -f "$VARIANT_DIR/__init__.py" @@ -86,7 +86,7 @@ jobs: - name: Upload kernel to Hub run: | - nix run $GITHUB_WORKSPACE#kernel-builder -- upload /tmp/e2e-test-kernel \ + nix run $GITHUB_WORKSPACE#kernel-builder -- upload /tmp/kernels-upload-test \ --repo-id ${{ env.E2E_REPO_ID }} \ --branch ${{ env.E2E_BRANCH }} @@ -122,7 +122,7 @@ jobs: kernel = get_kernel('${{ env.E2E_REPO_ID }}', revision='${{ env.E2E_BRANCH }}') x = torch.randn(1024, 1024, dtype=torch.float32, device='cuda') - result = kernel.e2e_test_kernel(x) + result = kernel.kernels_upload_test(x) expected = x + 1.0 torch.testing.assert_close(result, expected) print('E2E test passed: get_kernel + correctness check') From 0234a8ce940dec9bc6ba2fa8e1ebc7ba08daaffa Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 15:25:50 +0530 Subject: [PATCH 15/23] more --- .github/workflows/test_e2e.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index ca63cb9c..75279926 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -73,7 +73,7 @@ jobs: - name: Build kernel run: | cd /tmp/kernels-upload-test - nix run $GITHUB_WORKSPACE#kernel-builder -- build . -L + nix run $GITHUB_WORKSPACE#kernel-builder -- build-and-copy . -L - name: Verify build artifacts run: | From 7b5eb82a170dc501a9810e18761ce30fb69065d3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 15:39:13 +0530 Subject: [PATCH 16/23] remove repo-id from the upload command. --- .github/workflows/test_e2e.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 75279926..ae3a30d6 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -86,9 +86,7 @@ jobs: - name: Upload kernel to Hub run: | - nix run $GITHUB_WORKSPACE#kernel-builder -- upload /tmp/kernels-upload-test \ - --repo-id ${{ env.E2E_REPO_ID }} \ - --branch ${{ env.E2E_BRANCH }} + nix run $GITHUB_WORKSPACE#kernel-builder -- upload /tmp/kernels-upload-test --branch ${{ env.E2E_BRANCH }} download-and-test: name: Download and test kernel via get_kernel From 82e9067da68f54c4d256b399545ff13c6d3a557f Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 19:00:45 +0530 Subject: [PATCH 17/23] scope the token. --- .github/workflows/test_e2e.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index ae3a30d6..6557e6f8 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -26,8 +26,6 @@ jobs: name: Init, build, and upload kernel runs-on: group: aws-highmemory-32-plus-nix - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} steps: - uses: actions/checkout@v6 - uses: DeterminateSystems/nix-installer-action@main @@ -85,6 +83,8 @@ jobs: ls "$VARIANT_DIR"/*.so - name: Upload kernel to Hub + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | nix run $GITHUB_WORKSPACE#kernel-builder -- upload /tmp/kernels-upload-test --branch ${{ env.E2E_BRANCH }} @@ -94,7 +94,6 @@ jobs: runs-on: group: aws-g6-12xlarge-plus env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} UV_PYTHON_PREFERENCE: only-managed steps: - uses: actions/checkout@v6 @@ -112,6 +111,8 @@ jobs: - name: Test get_kernel download and usage working-directory: ./kernels + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | uv run python -c " import torch From 56beb52353d49a24cb1e82cbd9d95f21d5c5a8f7 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 19:34:33 +0530 Subject: [PATCH 18/23] upgrade forcefully. --- .github/workflows/test_e2e.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 6557e6f8..6d8c1df2 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -107,7 +107,7 @@ jobs: working-directory: ./kernels run: | uv sync --all-extras --dev - uv pip install torch + uv pip install --upgrade torch - name: Test get_kernel download and usage working-directory: ./kernels From a0f806ec965b03eee5cb68ffc3764aaa590e7d88 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 20:53:04 +0530 Subject: [PATCH 19/23] check --- .github/workflows/test_e2e.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 6d8c1df2..8b1c2e99 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -103,11 +103,21 @@ jobs: with: python-version: "3.12" + - name: Detect CUDA version + id: cuda + run: | + nvidia-smi + CUDA_RT=$(nvidia-smi | grep -oP 'CUDA Version: \K[\d.]+') + CUDA_TAG="cu$(echo $CUDA_RT | tr -d '.')" + echo "Detected CUDA: $CUDA_RT -> $CUDA_TAG" + echo "cuda_tag=$CUDA_TAG" >> "$GITHUB_OUTPUT" + - name: Install Python deps working-directory: ./kernels run: | uv sync --all-extras --dev - uv pip install --upgrade torch + uv pip install --upgrade torch --index-url https://download.pytorch.org/whl/${{ steps.cuda.outputs.cuda_tag }} + pip show torch - name: Test get_kernel download and usage working-directory: ./kernels From 66795d1c8c635389ceffe24bdbca03939bf2eeb3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 21:03:42 +0530 Subject: [PATCH 20/23] switch to test_kernels runner --- .github/workflows/test_e2e.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 8b1c2e99..20d11c14 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -92,7 +92,7 @@ jobs: name: Download and test kernel via get_kernel needs: init-build-upload runs-on: - group: aws-g6-12xlarge-plus + group: aws-g6-24xlarge env: UV_PYTHON_PREFERENCE: only-managed steps: From 46436ae70382276e91b30247163686ad3b9c0941 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 21:17:06 +0530 Subject: [PATCH 21/23] up --- .github/workflows/test_e2e.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 20d11c14..19ab1262 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -117,7 +117,7 @@ jobs: run: | uv sync --all-extras --dev uv pip install --upgrade torch --index-url https://download.pytorch.org/whl/${{ steps.cuda.outputs.cuda_tag }} - pip show torch + uv pip show torch - name: Test get_kernel download and usage working-directory: ./kernels From a82db5b5353367a60f20480027923dfe9bd222be Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 21:36:48 +0530 Subject: [PATCH 22/23] abi compilation issues. --- .github/workflows/test_e2e.yaml | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 19ab1262..10fc445f 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -103,21 +103,12 @@ jobs: with: python-version: "3.12" - - name: Detect CUDA version - id: cuda - run: | - nvidia-smi - CUDA_RT=$(nvidia-smi | grep -oP 'CUDA Version: \K[\d.]+') - CUDA_TAG="cu$(echo $CUDA_RT | tr -d '.')" - echo "Detected CUDA: $CUDA_RT -> $CUDA_TAG" - echo "cuda_tag=$CUDA_TAG" >> "$GITHUB_OUTPUT" - - name: Install Python deps working-directory: ./kernels run: | uv sync --all-extras --dev - uv pip install --upgrade torch --index-url https://download.pytorch.org/whl/${{ steps.cuda.outputs.cuda_tag }} - uv pip show torch + uv pip install --upgrade torch + uv run python -c "import torch; print(f'torch={torch.__version__}, cuda={torch.version.cuda}, cxx11_abi={torch.compiled_with_cxx11_abi()}')" - name: Test get_kernel download and usage working-directory: ./kernels From b5eab6440bf88afb63400886141456ed5f64d0be Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 31 Mar 2026 22:40:00 +0530 Subject: [PATCH 23/23] up --- .github/workflows/test_e2e.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_e2e.yaml b/.github/workflows/test_e2e.yaml index 10fc445f..44a2166e 100644 --- a/.github/workflows/test_e2e.yaml +++ b/.github/workflows/test_e2e.yaml @@ -108,14 +108,14 @@ jobs: run: | uv sync --all-extras --dev uv pip install --upgrade torch - uv run python -c "import torch; print(f'torch={torch.__version__}, cuda={torch.version.cuda}, cxx11_abi={torch.compiled_with_cxx11_abi()}')" + uv run --no-sync python -c "import torch; print(f'torch={torch.__version__}, cuda={torch.version.cuda}, cxx11_abi={torch.compiled_with_cxx11_abi()}')" - name: Test get_kernel download and usage working-directory: ./kernels env: HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | - uv run python -c " + uv run --no-sync python -c " import torch from kernels import get_kernel