diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml index fe76e261..f2e92d81 100644 --- a/.github/workflows/build_kernel.yaml +++ b/.github/workflows/build_kernel.yaml @@ -24,22 +24,22 @@ jobs: env: USER: github_runner - name: Build activation kernel - run: ( cd examples/activation && nix build .\#redistributable.torch27-cxx11-cu126-x86_64-linux ) + run: ( cd examples/activation && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) - name: Copy activation kernel run: cp -rL examples/activation/result activation-kernel - name: Build cutlass GEMM kernel - run: ( cd examples/cutlass-gemm && nix build .\#redistributable.torch27-cxx11-cu126-x86_64-linux ) + run: ( cd examples/cutlass-gemm && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) - name: Copy cutlass GEMM kernel run: cp -rL examples/cutlass-gemm/result cutlass-gemm-kernel - name: Build relu kernel - run: ( cd examples/relu && nix build .\#redistributable.torch27-cxx11-cu126-x86_64-linux ) + run: ( cd examples/relu && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) - name: Copy relu kernel run: cp -rL examples/relu/result relu-kernel - name: Build relu-backprop-compile kernel - run: ( cd examples/relu-backprop-compile && nix build .\#redistributable.torch27-cxx11-cu126-x86_64-linux ) + run: ( cd examples/relu-backprop-compile && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) - name: Copy relu-backprop-compile kernel run: cp -rL examples/relu-backprop-compile/result relu-backprop-compile-kernel @@ -51,7 +51,7 @@ jobs: run: ( cd examples/relu && nix build .#devShells.x86_64-linux.test ) - name: Build silu-and-mul-universal kernel - run: ( cd examples/silu-and-mul-universal && nix build .\#redistributable.torch27-cxx11-cu126-x86_64-linux ) + run: ( cd examples/silu-and-mul-universal && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) - name: Copy silu-and-mul-universal kernel run: cp -rL 
examples/silu-and-mul-universal/result silu-and-mul-universal-kernel diff --git a/.github/workflows/build_kernel_macos.yaml b/.github/workflows/build_kernel_macos.yaml index 7b6d9f79..9a8a995c 100644 --- a/.github/workflows/build_kernel_macos.yaml +++ b/.github/workflows/build_kernel_macos.yaml @@ -21,4 +21,4 @@ jobs: # For now we only test that there are no regressions in building macOS # kernels. Also run tests once we have a macOS runner. - name: Build relu kernel - run: ( cd examples/relu && nix build .\#redistributable.torch27-metal-aarch64-darwin -L ) + run: ( cd examples/relu && nix build .\#redistributable.torch29-metal-aarch64-darwin -L ) diff --git a/.github/workflows/build_kernel_rocm.yaml b/.github/workflows/build_kernel_rocm.yaml index e1733706..9e46b7ec 100644 --- a/.github/workflows/build_kernel_rocm.yaml +++ b/.github/workflows/build_kernel_rocm.yaml @@ -26,4 +26,4 @@ jobs: # For now we only test that there are no regressions in building ROCm # kernels. Also run tests once we have a ROCm runner. - name: Build relu kernel - run: ( cd examples/relu && nix build .\#redistributable.torch27-cxx11-rocm63-x86_64-linux -L ) + run: ( cd examples/relu && nix build .\#redistributable.torch29-cxx11-rocm63-x86_64-linux -L ) diff --git a/.github/workflows/build_kernel_xpu.yaml b/.github/workflows/build_kernel_xpu.yaml index e1ebc40e..49f7d773 100644 --- a/.github/workflows/build_kernel_xpu.yaml +++ b/.github/workflows/build_kernel_xpu.yaml @@ -26,4 +26,4 @@ jobs: # For now we only test that there are no regressions in building XPU # kernels. Also run tests once we have a XPU runner. 
- name: Build relu kernel - run: ( cd examples/relu && nix build .\#redistributable.torch28-cxx11-xpu20251-x86_64-linux -L ) + run: ( cd examples/relu && nix build .\#redistributable.torch29-cxx11-xpu20252-x86_64-linux -L ) diff --git a/build-variants.json b/build-variants.json index eb4e85d6..86e49cdc 100644 --- a/build-variants.json +++ b/build-variants.json @@ -1,14 +1,12 @@ { "aarch64-darwin": { "metal": [ - "torch27-metal-aarch64-darwin", "torch28-metal-aarch64-darwin", "torch29-metal-aarch64-darwin" ] }, "aarch64-linux": { "cuda": [ - "torch27-cxx11-cu128-aarch64-linux", "torch28-cxx11-cu129-aarch64-linux", "torch29-cxx11-cu126-aarch64-linux", "torch29-cxx11-cu128-aarch64-linux", @@ -17,9 +15,6 @@ }, "x86_64-linux": { "cuda": [ - "torch27-cxx11-cu118-x86_64-linux", - "torch27-cxx11-cu126-x86_64-linux", - "torch27-cxx11-cu128-x86_64-linux", "torch28-cxx11-cu126-x86_64-linux", "torch28-cxx11-cu128-x86_64-linux", "torch28-cxx11-cu129-x86_64-linux", @@ -28,14 +23,12 @@ "torch29-cxx11-cu130-x86_64-linux" ], "rocm": [ - "torch27-cxx11-rocm63-x86_64-linux", "torch28-cxx11-rocm63-x86_64-linux", "torch28-cxx11-rocm64-x86_64-linux", "torch29-cxx11-rocm63-x86_64-linux", "torch29-cxx11-rocm64-x86_64-linux" ], "xpu": [ - "torch27-cxx11-xpu20250-x86_64-linux", "torch28-cxx11-xpu20251-x86_64-linux", "torch29-cxx11-xpu20252-x86_64-linux" ] diff --git a/docs/build-variants.md b/docs/build-variants.md index ed7e3f2b..39b5c975 100644 --- a/docs/build-variants.md +++ b/docs/build-variants.md @@ -7,13 +7,11 @@ available. This list will be updated as new PyTorch versions are released. ## Metal aarch64-darwin -- `torch27-metal-aarch64-darwin` - `torch28-metal-aarch64-darwin` - `torch29-metal-aarch64-darwin` ## CUDA aarch64-linux -- `torch27-cxx11-cu128-aarch64-linux` - `torch28-cxx11-cu129-aarch64-linux` - `torch29-cxx11-cu126-aarch64-linux` - `torch29-cxx11-cu128-aarch64-linux` @@ -21,9 +19,6 @@ available. This list will be updated as new PyTorch versions are released. 
## CUDA x86_64-linux -- `torch27-cxx11-cu118-x86_64-linux` -- `torch27-cxx11-cu126-x86_64-linux` -- `torch27-cxx11-cu128-x86_64-linux` - `torch28-cxx11-cu126-x86_64-linux` - `torch28-cxx11-cu128-x86_64-linux` - `torch28-cxx11-cu129-x86_64-linux` @@ -33,7 +28,6 @@ available. This list will be updated as new PyTorch versions are released. ## ROCm x86_64-linux -- `torch27-cxx11-rocm63-x86_64-linux` - `torch28-cxx11-rocm63-x86_64-linux` - `torch28-cxx11-rocm64-x86_64-linux` - `torch29-cxx11-rocm63-x86_64-linux` @@ -41,7 +35,6 @@ available. This list will be updated as new PyTorch versions are released. ## XPU x86_64-linux -- `torch27-cxx11-xpu20250-x86_64-linux` - `torch28-cxx11-xpu20251-x86_64-linux` - `torch29-cxx11-xpu20252-x86_64-linux` diff --git a/docs/docker.md b/docs/docker.md index 74bfe568..20148954 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -186,7 +186,7 @@ To load a kernel locally, you should add the kernel build that is compatible wit ```bash -# PyTorch 2.6 and CUDA 12.6 -export PYTHONPATH="result/torch26-cxx11-cu126-x86_64-linux" +# PyTorch 2.9 and CUDA 12.6 +export PYTHONPATH="result/torch29-cxx11-cu126-x86_64-linux" ``` The kernel can then be imported as a Python module: diff --git a/docs/nix.md b/docs/nix.md index 2e2bc47b..b5fa1c93 100644 --- a/docs/nix.md +++ b/docs/nix.md @@ -84,7 +84,7 @@ using: ```bash $ rm -rf .venv # Remove existing venv if any. 
-$ nix develop .#devShells.torch27-cxx11-rocm63-x86_64-linux +$ nix develop .#devShells.torch29-cxx11-rocm64-x86_64-linux ``` ## Shell for testing a kernel diff --git a/examples/relu-specific-torch/flake.nix b/examples/relu-specific-torch/flake.nix index 0add5d7c..5b01e602 100644 --- a/examples/relu-specific-torch/flake.nix +++ b/examples/relu-specific-torch/flake.nix @@ -15,7 +15,7 @@ path = ./.; torchVersions = defaultVersions: [ { - torchVersion = "2.7"; + torchVersion = "2.9"; cudaVersion = "12.8"; cxx11Abi = true; systems = [ diff --git a/flake.lock b/flake.lock index a42e6625..5e6b0af8 100644 --- a/flake.lock +++ b/flake.lock @@ -73,11 +73,11 @@ "nixpkgs": "nixpkgs" }, "locked": { - "lastModified": 1760519460, - "narHash": "sha256-AL6vLcmL4nShgJ/Rqr7Rml1QMs/MuyCdfEqoGNHN8Jk=", + "lastModified": 1760620968, + "narHash": "sha256-8xJEJxO2MKwlDFVueQv6dM/iAwPVCKrYskie+j2vR60=", "owner": "huggingface", "repo": "hf-nix", - "rev": "6ca864b261ec3e9228d91f794ca0f10cd1766e9f", + "rev": "c7a79829e226b2275a404ad6b86915fb35f036e1", "type": "github" }, "original": { diff --git a/lib/build-sets.nix b/lib/build-sets.nix index 88c46041..8d407471 100644 --- a/lib/build-sets.nix +++ b/lib/build-sets.nix @@ -71,6 +71,7 @@ let cxx11Abi, system, bundleBuild ? false, + sourceBuild ? 
false, }: let pkgs = @@ -84,9 +85,15 @@ let pkgsByXpuVer.${xpuVersion} else throw "No compute framework set in Torch version"; - torch = pkgs.python3.pkgs."torch_${flattenVersion torchVersion}".override { - inherit cxx11Abi; - }; + torch = + if sourceBuild then + pkgs.python3.pkgs."torch_${flattenVersion torchVersion}".override { + inherit cxx11Abi; + } + else + pkgs.python3.pkgs."torch-bin_${flattenVersion torchVersion}".override { + inherit cxx11Abi; + }; extension = pkgs.callPackage ./torch-extension { inherit torch; }; in { diff --git a/lib/build.nix b/lib/build.nix index a8c24ae1..3f50f4ca 100644 --- a/lib/build.nix +++ b/lib/build.nix @@ -22,6 +22,7 @@ let isRocm isXpu ; + inherit (import ./build-variants.nix { inherit lib; }) computeFramework; in rec { resolveDeps = import ./deps.nix { inherit lib; }; @@ -29,22 +30,22 @@ rec { readToml = path: builtins.fromTOML (builtins.readFile path); validateBuildConfig = - buildConfig: + buildToml: let - kernels = lib.attrValues (buildConfig.kernel or { }); - hasOldUniversal = builtins.hasAttr "universal" (buildConfig.torch or { }); + kernels = lib.attrValues (buildToml.kernel or { }); + hasOldUniversal = builtins.hasAttr "universal" (buildToml.torch or { }); hasLanguage = lib.any (kernel: kernel ? language) kernels; in assert lib.assertMsg (!hasOldUniversal && !hasLanguage) '' build.toml seems to be of an older version, update it with: build2cmake update-build build.toml''; - buildConfig; + buildToml; backends = - buildConfig: + buildToml: let - kernels = lib.attrValues (buildConfig.kernel or { }); + kernels = lib.attrValues (buildToml.kernel or { }); kernelBackend = kernel: kernel.backend; init = { cuda = false; @@ -66,11 +67,11 @@ rec { # Filter buildsets that are applicable to a given kernel build config. 
filterApplicableBuildSets = - buildConfig: buildSets: + buildToml: buildSets: let - backends' = backends buildConfig; - minCuda = buildConfig.general.cuda-minver or "11.8"; - maxCuda = buildConfig.general.cuda-maxver or "99.9"; + backends' = backends buildToml; + minCuda = buildToml.general.cuda-minver or "11.8"; + maxCuda = buildToml.general.cuda-maxver or "99.9"; versionBetween = minver: maxver: ver: builtins.compareVersions ver minver >= 0 && builtins.compareVersions ver maxver <= 0; @@ -82,7 +83,7 @@ rec { || (isRocm buildSet.buildConfig && backends'.rocm) || (isMetal buildSet.buildConfig && backends'.metal) || (isXpu buildSet.buildConfig && backends'.xpu) - || (buildConfig.general.universal or false); + || (buildToml.general.universal or false); cudaVersionSupported = !(isCuda buildSet.buildConfig) || versionBetween minCuda maxCuda buildSet.pkgs.cudaPackages.cudaMajorMinorVersion; @@ -111,11 +112,13 @@ rec { }: let inherit (lib) fileset; - buildConfig = readBuildConfig path; - kernels = buildConfig.kernel or { }; + buildToml = readBuildConfig path; + kernels = lib.filterAttrs (_: kernel: computeFramework buildConfig == kernel.backend) ( + buildToml.kernel or { } + ); extraDeps = resolveDeps { inherit pkgs torch; - deps = lib.unique (lib.flatten (lib.mapAttrsToList (_: buildConfig: buildConfig.depends) kernels)); + deps = lib.unique (lib.flatten (lib.mapAttrsToList (_: kernel: kernel.depends) kernels)); }; # Use the mkSourceSet function to get the source @@ -125,11 +128,11 @@ rec { listMax = lib.foldl' lib.max 1; nvccThreads = listMax ( lib.mapAttrsToList ( - _: buildConfig: builtins.length (buildConfig.cuda-capabilities or supportedCudaCapabilities) - ) buildConfig.kernel + _: kernel: builtins.length (kernel.cuda-capabilities or supportedCudaCapabilities) + ) buildToml.kernel ); in - if buildConfig.general.universal then + if buildToml.general.universal then # No torch extension sources? Treat it as a noarch package. 
extension.mkNoArchExtension { @@ -138,7 +141,7 @@ rec { rev doGetKernelCheck ; - extensionName = buildConfig.general.name; + extensionName = buildToml.general.name; } else extension.mkExtension { @@ -151,7 +154,7 @@ rec { rev ; - extensionName = buildConfig.general.name; + extensionName = buildToml.general.name; doAbiCheck = true; }; @@ -198,9 +201,9 @@ rec { ; bundleOnly = true; }; - buildConfig = readBuildConfig path; + buildToml = readBuildConfig path; namePaths = - if buildConfig.general.universal then + if buildToml.general.universal then # Noarch, just get the first extension. { "torch-universal" = builtins.head (builtins.attrValues extensions); } else diff --git a/lib/deps.nix b/lib/deps.nix index bffecf01..9e8c6c81 100644 --- a/lib/deps.nix +++ b/lib/deps.nix @@ -30,7 +30,7 @@ let ]; "torch" = [ torch - torch.cxxdev + #torch.cxxdev ]; "cutlass_sycl" = [ torch.xpuPackages.cutlass-sycl ]; }; diff --git a/lib/torch-extension/arch.nix b/lib/torch-extension/arch.nix index 0bee5663..f5baaccf 100644 --- a/lib/torch-extension/arch.nix +++ b/lib/torch-extension/arch.nix @@ -129,7 +129,10 @@ stdenv.mkDerivation (prevAttrs: { ++ lib.optionals rocmSupport ( with rocmPackages; [ + hipcub-devel hipsparselt + rocprim-devel + rocthrust-devel rocwmma-devel ] ) @@ -145,14 +148,7 @@ stdenv.mkDerivation (prevAttrs: { env = lib.optionalAttrs cudaSupport { CUDAToolkit_ROOT = "${lib.getDev cudaPackages.cuda_nvcc}"; - TORCH_CUDA_ARCH_LIST = - if cudaPackages.cudaOlder "12.8" then - "7.0;7.5;8.0;8.6;8.9;9.0" - else if cudaPackages.cudaOlder "13.0" then - "7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0" - else - # sm_101 has been renamed to sm_110 in CUDA 13. 
- "7.5;8.0;8.6;8.9;9.0;10.0;11.0;12.0"; + TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities; } // lib.optionalAttrs rocmSupport { PYTORCH_ROCM_ARCH = lib.concatStringsSep ";" torch.rocmArchs; @@ -167,6 +163,9 @@ stdenv.mkDerivation (prevAttrs: { cmakeFlags = [ (lib.cmakeFeature "Python_EXECUTABLE" "${python3.withPackages (ps: [ torch ])}/bin/python") + # Fix: file RPATH_CHANGE could not write new RPATH, we are rewriting + # rpaths anyway. + (lib.cmakeBool "CMAKE_SKIP_RPATH" true) ] ++ lib.optionals cudaSupport [ (lib.cmakeFeature "CMAKE_CUDA_HOST_COMPILER" "${stdenv.cc}/bin/g++") diff --git a/tests/Dockerfile.test-kernel b/tests/Dockerfile.test-kernel index c6270c5f..3ec8c7ad 100644 --- a/tests/Dockerfile.test-kernel +++ b/tests/Dockerfile.test-kernel @@ -1,11 +1,11 @@ # syntax=docker/dockerfile:1.4 ARG PYTHON_VERSION=3.10 # Ideally we'd test with 11.8, but the GELU kernel is subtly off. -ARG CUDA_VERSION=12.1.0 -ARG UBUNTU_VERSION=18.04 -ARG TORCH_VERSION=2.5.0 +ARG CUDA_VERSION=12.6.0 +ARG UBI_VERSION=8 +ARG TORCH_VERSION=2.9.0 -FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} as base +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubi${UBI_VERSION} as base # Set environment variables ENV DEBIAN_FRONTEND=noninteractive \ @@ -15,11 +15,8 @@ ENV DEBIAN_FRONTEND=noninteractive \ NVIDIA_DRIVER_CAPABILITIES=compute,utility # Install system dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - curl \ - python3 \ - python3-pip \ - && rm -rf /var/lib/apt/lists/* +RUN dnf install -y \ + curl # Install uv package manager RUN curl -LsSf https://astral.sh/uv/install.sh | sh @@ -47,10 +44,10 @@ WORKDIR /app/kernel-test RUN CUDA_MAJOR_MINOR=$(echo ${CUDA_VERSION} | cut -d'.' 
-f1,2) && \ case ${CUDA_MAJOR_MINOR} in \ - "11.8") CUDA_TAG="cu118" ;; \ - "12.1") CUDA_TAG="cu121" ;; \ - "12.2") CUDA_TAG="cu122" ;; \ "12.4") CUDA_TAG="cu124" ;; \ + "12.6") CUDA_TAG="cu126" ;; \ + "12.8") CUDA_TAG="cu128" ;; \ + "13.0") CUDA_TAG="cu130" ;; \ *) CUDA_TAG="" ;; \ esac && \ if [ -n "${CUDA_TAG}" ]; then \ diff --git a/versions.nix b/versions.nix index 5b43fea1..f6ee6fb5 100644 --- a/versions.nix +++ b/versions.nix @@ -1,59 +1,4 @@ [ - { - torchVersion = "2.7"; - cudaVersion = "11.8"; - cxx11Abi = true; - systems = [ "x86_64-linux" ]; - bundleBuild = true; - } - { - torchVersion = "2.7"; - cudaVersion = "12.6"; - cxx11Abi = true; - systems = [ - "x86_64-linux" - ]; - bundleBuild = true; - } - { - torchVersion = "2.7"; - cudaVersion = "12.8"; - cxx11Abi = true; - systems = [ - "x86_64-linux" - "aarch64-linux" - ]; - bundleBuild = true; - } - { - torchVersion = "2.7"; - rocmVersion = "6.3.4"; - cxx11Abi = true; - systems = [ "x86_64-linux" ]; - bundleBuild = true; - } - { - torchVersion = "2.7"; - rocmVersion = "6.4.2"; - cxx11Abi = true; - systems = [ "x86_64-linux" ]; - bundleBuild = false; - } - { - torchVersion = "2.7"; - xpuVersion = "2025.0.2"; - cxx11Abi = true; - systems = [ "x86_64-linux" ]; - bundleBuild = true; - } - { - torchVersion = "2.7"; - cxx11Abi = true; - metal = true; - systems = [ "aarch64-darwin" ]; - bundleBuild = true; - } - { torchVersion = "2.8"; xpuVersion = "2025.1.3"; @@ -178,5 +123,6 @@ "x86_64-linux" "aarch64-linux" ]; + sourceBuild = true; } ]