From acfc41f6f6d8ab6cc9492f76a7648b40e7d37e58 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 2 Mar 2026 09:51:55 -0800
Subject: [PATCH 1/3] Add GPU CI tests via Modal L4

Builds pygpubench from source (CUDA 13.0, sm_80/90) and runs
test/grayscale.py on a Modal L4 GPU. Triggered on push/PR to master.
---
 .github/workflows/gpu-test.yml | 26 +++++++++++++++
 ci/modal_gpu_test.py           | 61 ++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 .github/workflows/gpu-test.yml
 create mode 100644 ci/modal_gpu_test.py

diff --git a/.github/workflows/gpu-test.yml b/.github/workflows/gpu-test.yml
new file mode 100644
index 0000000..503b695
--- /dev/null
+++ b/.github/workflows/gpu-test.yml
@@ -0,0 +1,26 @@
+name: GPU Tests
+on:
+  push:
+    branches: [master]
+  pull_request:
+    branches: [master]
+
+jobs:
+  gpu-test:
+    name: GPU tests (Modal L4)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install Modal
+        run: pip install modal
+
+      - name: Run GPU tests
+        env:
+          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+        run: modal run ci/modal_gpu_test.py
diff --git a/ci/modal_gpu_test.py b/ci/modal_gpu_test.py
new file mode 100644
index 0000000..3c40537
--- /dev/null
+++ b/ci/modal_gpu_test.py
@@ -0,0 +1,61 @@
+"""Run pygpubench GPU tests on a Modal L4 GPU.
+
+Usage: modal run ci/modal_gpu_test.py
+"""
+
+import modal
+from pathlib import Path
+
+_repo = Path(__file__).resolve().parent.parent
+
+image = (
+    modal.Image.from_registry(
+        "nvidia/cuda:13.0.1-cudnn-devel-ubuntu24.04", add_python="3.12"
+    )
+    .entrypoint([])
+    .apt_install("git", "g++-13", "cmake", "ninja-build")
+    .uv_pip_install("torch", index_url="https://download.pytorch.org/whl/cu130")
+    .env({
+        "CUDAARCHS": "80;90",
+        "CC": "gcc-13",
+        "CXX": "g++-13",
+        "CMAKE_GENERATOR": "Ninja",
+    })
+    .add_local_dir(str(_repo / "csrc"), remote_path="/root/pygpubench/csrc")
+    .add_local_dir(str(_repo / "python"), remote_path="/root/pygpubench/python")
+    .add_local_dir(str(_repo / "test"), remote_path="/root/pygpubench/test")
+    .add_local_file(str(_repo / "pyproject.toml"), remote_path="/root/pygpubench/pyproject.toml")
+    .add_local_file(str(_repo / "CMakeLists.txt"), remote_path="/root/pygpubench/CMakeLists.txt")
+    .add_local_file(str(_repo / "README.md"), remote_path="/root/pygpubench/README.md")
+)
+
+app = modal.App("pygpubench-ci", image=image)
+
+
+@app.function(gpu="L4", timeout=600)
+def run_tests():
+    import subprocess
+    import shutil
+    import glob
+    import sys
+    import os
+
+    # Mounts are read-only; copy to a writable location for the build
+    shutil.copytree("/root/pygpubench", "/tmp/pygpubench")
+    os.chdir("/tmp/pygpubench")
+
+    subprocess.run(["uv", "build", "--wheel"], check=True)
+
+    whl = glob.glob("dist/*.whl")[0]
+    subprocess.run([sys.executable, "-m", "pip", "install", whl], check=True)
+
+    # Run tests
+    os.chdir("/tmp/pygpubench/test")
+    result = subprocess.run([sys.executable, "grayscale.py"], text=True)
+    if result.returncode != 0:
+        raise SystemExit(1)
+
+
+@app.local_entrypoint()
+def main():
+    run_tests.remote()

From c0ac923f1dafadadecf44549eabe80e361a29bd9 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 2 Mar 2026 10:00:00 -0800
Subject: [PATCH 2/3] Run all test scripts in test/ directory automatically

---
 ci/modal_gpu_test.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/ci/modal_gpu_test.py b/ci/modal_gpu_test.py
index 3c40537..68c9d66 100644
--- a/ci/modal_gpu_test.py
+++ b/ci/modal_gpu_test.py
@@ -49,10 +49,19 @@ def run_tests():
     whl = glob.glob("dist/*.whl")[0]
     subprocess.run([sys.executable, "-m", "pip", "install", whl], check=True)
 
-    # Run tests
+    # Run all test scripts in test/
     os.chdir("/tmp/pygpubench/test")
-    result = subprocess.run([sys.executable, "grayscale.py"], text=True)
-    if result.returncode != 0:
+    failed = []
+    for test_file in sorted(glob.glob("*.py")):
+        if test_file == "submission.py":
+            continue
+        print(f"\n=== {test_file} ===")
+        result = subprocess.run([sys.executable, test_file], text=True)
+        if result.returncode != 0:
+            failed.append(test_file)
+
+    if failed:
+        print(f"\nFailed: {', '.join(failed)}")
         raise SystemExit(1)
 
 

From a22d170bfb99a0435b1cc42e9f26eef6576b0c2b Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 2 Mar 2026 10:33:43 -0800
Subject: [PATCH 3/3] Use pre-built wheel from CI instead of building from
 source on Modal

Simplifies the Modal script to take a pre-built wheel and a test directory
(--wheel, --test-dir), and moves the GPU test job into wheel.yml so it
reuses the wheel artifact.
---
 .github/workflows/gpu-test.yml | 26 ---------------
 .github/workflows/wheel.yml    | 27 +++++++++++++++
 ci/modal_gpu_test.py           | 61 +++++++++++++++++-----------------
 3 files changed, 57 insertions(+), 57 deletions(-)
 delete mode 100644 .github/workflows/gpu-test.yml

diff --git a/.github/workflows/gpu-test.yml b/.github/workflows/gpu-test.yml
deleted file mode 100644
index 503b695..0000000
--- a/.github/workflows/gpu-test.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-name: GPU Tests
-on:
-  push:
-    branches: [master]
-  pull_request:
-    branches: [master]
-
-jobs:
-  gpu-test:
-    name: GPU tests (Modal L4)
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.12"
-
-      - name: Install Modal
-        run: pip install modal
-
-      - name: Run GPU tests
-        env:
-          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
-          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
-        run: modal run ci/modal_gpu_test.py
diff --git a/.github/workflows/wheel.yml b/.github/workflows/wheel.yml
index a6c7f2f..0fc0f4d 100644
--- a/.github/workflows/wheel.yml
+++ b/.github/workflows/wheel.yml
@@ -58,6 +58,33 @@ jobs:
           name: wheel-${{ matrix.wheel_tag }}
           path: wheelhouse/pygpubench*.whl
 
+  gpu-test:
+    name: GPU tests (Modal L4)
+    needs: wheel
+    runs-on: ubuntu-24.04
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Download wheel
+        uses: actions/download-artifact@v4
+        with:
+          pattern: wheel-*
+          path: dist/
+          merge-multiple: true
+
+      - name: Install Modal
+        run: pip install modal
+
+      - name: Run GPU tests
+        env:
+          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+        run: modal run ci/modal_gpu_test.py --wheel dist/pygpubench*.whl --test-dir test
+
   release:
     name: Publish to GitHub Releases
     needs: wheel
diff --git a/ci/modal_gpu_test.py b/ci/modal_gpu_test.py
index 68c9d66..0e30399 100644
--- a/ci/modal_gpu_test.py
+++ b/ci/modal_gpu_test.py
@@ -1,58 +1,45 @@
 """Run pygpubench GPU tests on a Modal L4 GPU.
 
-Usage: modal run ci/modal_gpu_test.py
+Usage: modal run ci/modal_gpu_test.py --wheel <wheel_path> [--test-dir <test_dir>]
 """
 
 import modal
 from pathlib import Path
 
-_repo = Path(__file__).resolve().parent.parent
-
 image = (
     modal.Image.from_registry(
         "nvidia/cuda:13.0.1-cudnn-devel-ubuntu24.04", add_python="3.12"
     )
     .entrypoint([])
-    .apt_install("git", "g++-13", "cmake", "ninja-build")
     .uv_pip_install("torch", index_url="https://download.pytorch.org/whl/cu130")
-    .env({
-        "CUDAARCHS": "80;90",
-        "CC": "gcc-13",
-        "CXX": "g++-13",
-        "CMAKE_GENERATOR": "Ninja",
-    })
-    .add_local_dir(str(_repo / "csrc"), remote_path="/root/pygpubench/csrc")
-    .add_local_dir(str(_repo / "python"), remote_path="/root/pygpubench/python")
-    .add_local_dir(str(_repo / "test"), remote_path="/root/pygpubench/test")
-    .add_local_file(str(_repo / "pyproject.toml"), remote_path="/root/pygpubench/pyproject.toml")
-    .add_local_file(str(_repo / "CMakeLists.txt"), remote_path="/root/pygpubench/CMakeLists.txt")
-    .add_local_file(str(_repo / "README.md"), remote_path="/root/pygpubench/README.md")
 )
 
 app = modal.App("pygpubench-ci", image=image)
 
 
 @app.function(gpu="L4", timeout=600)
-def run_tests():
+def run_tests(whl_bytes: bytes, whl_name: str, test_files: dict[str, bytes]):
     import subprocess
-    import shutil
-    import glob
     import sys
     import os
 
-    # Mounts are read-only; copy to a writable location for the build
-    shutil.copytree("/root/pygpubench", "/tmp/pygpubench")
-    os.chdir("/tmp/pygpubench")
+    # Write wheel and install it
+    whl_path = f"/tmp/{whl_name}"
+    with open(whl_path, "wb") as f:
+        f.write(whl_bytes)
+    subprocess.run([sys.executable, "-m", "pip", "install", whl_path], check=True)
 
-    subprocess.run(["uv", "build", "--wheel"], check=True)
+    # Write test files
+    test_dir = "/tmp/tests"
+    os.makedirs(test_dir, exist_ok=True)
+    for name, content in test_files.items():
+        with open(os.path.join(test_dir, name), "wb") as f:
+            f.write(content)
 
-    whl = glob.glob("dist/*.whl")[0]
-    subprocess.run([sys.executable, "-m", "pip", "install", whl], check=True)
-
-    # Run all test scripts in test/
-    os.chdir("/tmp/pygpubench/test")
+    # Run all test scripts
+    os.chdir(test_dir)
     failed = []
-    for test_file in sorted(glob.glob("*.py")):
+    for test_file in sorted(test_files):
         if test_file == "submission.py":
             continue
         print(f"\n=== {test_file} ===")
@@ -66,5 +53,17 @@ def run_tests():
 
 
 @app.local_entrypoint()
-def main():
-    run_tests.remote()
+def main(wheel: str, test_dir: str = "test"):
+    """Send the wheel and every *.py file in test_dir to run_tests on Modal.
+
+    Exits with an error, without invoking run_tests, if no test script is found.
+    """
+    whl_path = Path(wheel)
+    whl_bytes = whl_path.read_bytes()
+    test_files = {f.name: f.read_bytes() for f in Path(test_dir).glob("*.py")}
+    # submission.py is skipped by run_tests, so it does not count as a test;
+    # without this guard an empty or mistyped test_dir passes with zero tests run.
+    if not test_files.keys() - {"submission.py"}:
+        raise SystemExit(f"no test scripts found in {test_dir!r}")
+
+    run_tests.remote(whl_bytes, whl_path.name, test_files)