huggingface · danieldk · Oct 16, 2025 · Oct 9, 2025 · Oct 15, 2025 · Oct 16, 2025
diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml
@@ -24,22 +24,22 @@ jobs:
         env:
           USER: github_runner
       - name: Build activation kernel
-        run: ( cd examples/activation && nix build .\#redistributable.torch27-cxx11-cu126-x86_64-linux )
+        run: ( cd examples/activation && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux )
       - name: Copy activation kernel
         run: cp -rL examples/activation/result activation-kernel
 
       - name: Build cutlass GEMM kernel
-        run: ( cd examples/cutlass-gemm && nix build .\#redistributable.torch27-cxx11-cu126-x86_64-linux )
+        run: ( cd examples/cutlass-gemm && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux )
       - name: Copy cutlass GEMM kernel
         run: cp -rL examples/cutlass-gemm/result cutlass-gemm-kernel
 
       - name: Build relu kernel
-        run: ( cd examples/relu && nix build .\#redistributable.torch27-cxx11-cu126-x86_64-linux )
+        run: ( cd examples/relu && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux )
       - name: Copy relu kernel
         run: cp -rL examples/relu/result relu-kernel
 
       - name: Build relu-backprop-compile kernel
-        run: ( cd examples/relu-backprop-compile && nix build .\#redistributable.torch27-cxx11-cu126-x86_64-linux )
+        run: ( cd examples/relu-backprop-compile && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux )
       - name: Copy relu-backprop-compile kernel
         run: cp -rL examples/relu-backprop-compile/result relu-backprop-compile-kernel
 
@@ -51,7 +51,7 @@ jobs:
         run: ( cd examples/relu && nix build .#devShells.x86_64-linux.test )
 
       - name: Build silu-and-mul-universal kernel
-        run: ( cd examples/silu-and-mul-universal && nix build .\#redistributable.torch27-cxx11-cu126-x86_64-linux )
+        run: ( cd examples/silu-and-mul-universal && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux )
       - name: Copy silu-and-mul-universal kernel
         run: cp -rL examples/silu-and-mul-universal/result silu-and-mul-universal-kernel
 

diff --git a/.github/workflows/build_kernel_macos.yaml b/.github/workflows/build_kernel_macos.yaml
@@ -21,4 +21,4 @@ jobs:
       # For now we only test that there are no regressions in building macOS
       # kernels. Also run tests once we have a macOS runner.
       - name: Build relu kernel
-        run: ( cd examples/relu && nix build .\#redistributable.torch27-metal-aarch64-darwin -L )
+        run: ( cd examples/relu && nix build .\#redistributable.torch29-metal-aarch64-darwin -L )
diff --git a/.github/workflows/build_kernel_rocm.yaml b/.github/workflows/build_kernel_rocm.yaml
@@ -26,4 +26,4 @@ jobs:
       # For now we only test that there are no regressions in building ROCm
       # kernels. Also run tests once we have a ROCm runner.
       - name: Build relu kernel
-        run: ( cd examples/relu && nix build .\#redistributable.torch27-cxx11-rocm63-x86_64-linux -L )
+        run: ( cd examples/relu && nix build .\#redistributable.torch29-cxx11-rocm63-x86_64-linux -L )
diff --git a/.github/workflows/build_kernel_xpu.yaml b/.github/workflows/build_kernel_xpu.yaml
@@ -26,4 +26,4 @@ jobs:
       # For now we only test that there are no regressions in building XPU
-      # kernels. Also run tests once we have a XPU runner.
+      # kernels. Also run tests once we have an XPU runner.
       - name: Build relu kernel
-        run: ( cd examples/relu && nix build .\#redistributable.torch28-cxx11-xpu20251-x86_64-linux -L )
+        run: ( cd examples/relu && nix build .\#redistributable.torch29-cxx11-xpu20252-x86_64-linux -L )
diff --git a/build-variants.json b/build-variants.json
@@ -1,14 +1,12 @@
 {
   "aarch64-darwin": {
     "metal": [
-      "torch27-metal-aarch64-darwin",
       "torch28-metal-aarch64-darwin",
       "torch29-metal-aarch64-darwin"
     ]
   },
   "aarch64-linux": {
     "cuda": [
-      "torch27-cxx11-cu128-aarch64-linux",
       "torch28-cxx11-cu129-aarch64-linux",
       "torch29-cxx11-cu126-aarch64-linux",
       "torch29-cxx11-cu128-aarch64-linux",
@@ -17,9 +15,6 @@
   },
   "x86_64-linux": {
     "cuda": [
-      "torch27-cxx11-cu118-x86_64-linux",
-      "torch27-cxx11-cu126-x86_64-linux",
-      "torch27-cxx11-cu128-x86_64-linux",
       "torch28-cxx11-cu126-x86_64-linux",
       "torch28-cxx11-cu128-x86_64-linux",
       "torch28-cxx11-cu129-x86_64-linux",
@@ -28,14 +23,12 @@
       "torch29-cxx11-cu130-x86_64-linux"
     ],
     "rocm": [
-      "torch27-cxx11-rocm63-x86_64-linux",
       "torch28-cxx11-rocm63-x86_64-linux",
       "torch28-cxx11-rocm64-x86_64-linux",
       "torch29-cxx11-rocm63-x86_64-linux",
       "torch29-cxx11-rocm64-x86_64-linux"
     ],
     "xpu": [
-      "torch27-cxx11-xpu20250-x86_64-linux",
       "torch28-cxx11-xpu20251-x86_64-linux",
       "torch29-cxx11-xpu20252-x86_64-linux"
     ]

diff --git a/docs/build-variants.md b/docs/build-variants.md
@@ -7,23 +7,18 @@ available. This list will be updated as new PyTorch versions are released.
 
 ## Metal aarch64-darwin
 
-- `torch27-metal-aarch64-darwin`
 - `torch28-metal-aarch64-darwin`
 - `torch29-metal-aarch64-darwin`
 
 ## CUDA aarch64-linux
 
-- `torch27-cxx11-cu128-aarch64-linux`
 - `torch28-cxx11-cu129-aarch64-linux`
 - `torch29-cxx11-cu126-aarch64-linux`
 - `torch29-cxx11-cu128-aarch64-linux`
 - `torch29-cxx11-cu130-aarch64-linux`
 
 ## CUDA x86_64-linux
 
-- `torch27-cxx11-cu118-x86_64-linux`
-- `torch27-cxx11-cu126-x86_64-linux`
-- `torch27-cxx11-cu128-x86_64-linux`
 - `torch28-cxx11-cu126-x86_64-linux`
 - `torch28-cxx11-cu128-x86_64-linux`
 - `torch28-cxx11-cu129-x86_64-linux`
@@ -33,15 +28,13 @@ available. This list will be updated as new PyTorch versions are released.
 
 ## ROCm x86_64-linux
 
-- `torch27-cxx11-rocm63-x86_64-linux`
 - `torch28-cxx11-rocm63-x86_64-linux`
 - `torch28-cxx11-rocm64-x86_64-linux`
 - `torch29-cxx11-rocm63-x86_64-linux`
 - `torch29-cxx11-rocm64-x86_64-linux`
 
 ## XPU x86_64-linux
 
-- `torch27-cxx11-xpu20250-x86_64-linux`
 - `torch28-cxx11-xpu20251-x86_64-linux`
 - `torch29-cxx11-xpu20252-x86_64-linux`
 

diff --git a/docs/docker.md b/docs/docker.md
@@ -186,7 +186,7 @@ To load a kernel locally, you should add the kernel build that is compatible wit
 
 ```bash
-# PyTorch 2.6 and CUDA 12.6
-export PYTHONPATH="result/torch26-cxx11-cu126-x86_64-linux"
+# PyTorch 2.9 and CUDA 12.6
+export PYTHONPATH="result/torch29-cxx11-cu126-x86_64-linux"
 ```
 
 The kernel can then be imported as a Python module:

diff --git a/docs/nix.md b/docs/nix.md
@@ -84,7 +84,7 @@ using:
 
 ```bash
 $ rm -rf .venv  # Remove existing venv if any.
-$ nix develop .#devShells.torch27-cxx11-rocm63-x86_64-linux
+$ nix develop .#devShells.torch29-cxx11-rocm64-x86_64-linux
 ```
 
 ## Shell for testing a kernel

diff --git a/examples/relu-specific-torch/flake.nix b/examples/relu-specific-torch/flake.nix
@@ -15,7 +15,7 @@
       path = ./.;
       torchVersions = defaultVersions: [
         {
-          torchVersion = "2.7";
+          torchVersion = "2.9";
           cudaVersion = "12.8";
           cxx11Abi = true;
           systems = [

diff --git a/flake.lock b/flake.lock
diff --git a/lib/build-sets.nix b/lib/build-sets.nix
@@ -71,6 +71,7 @@ let
       cxx11Abi,
       system,
       bundleBuild ? false,
+      sourceBuild ? false,
     }:
     let
       pkgs =
@@ -84,9 +85,15 @@ let
           pkgsByXpuVer.${xpuVersion}
         else
           throw "No compute framework set in Torch version";
-      torch = pkgs.python3.pkgs."torch_${flattenVersion torchVersion}".override {
-        inherit cxx11Abi;
-      };
+      torch =
+        if sourceBuild then
+          pkgs.python3.pkgs."torch_${flattenVersion torchVersion}".override {
+            inherit cxx11Abi;
+          }
+        else
+          pkgs.python3.pkgs."torch-bin_${flattenVersion torchVersion}".override {
+            inherit cxx11Abi;
+          };
       extension = pkgs.callPackage ./torch-extension { inherit torch; };
     in
     {

diff --git a/lib/build.nix b/lib/build.nix
@@ -22,29 +22,30 @@ let
     isRocm
     isXpu
     ;
+  inherit (import ./build-variants.nix { inherit lib; }) computeFramework;
 in
 rec {
   resolveDeps = import ./deps.nix { inherit lib; };
 
   readToml = path: builtins.fromTOML (builtins.readFile path);
 
   validateBuildConfig =
-    buildConfig:
+    buildToml:
     let
-      kernels = lib.attrValues (buildConfig.kernel or { });
-      hasOldUniversal = builtins.hasAttr "universal" (buildConfig.torch or { });
+      kernels = lib.attrValues (buildToml.kernel or { });
+      hasOldUniversal = builtins.hasAttr "universal" (buildToml.torch or { });
       hasLanguage = lib.any (kernel: kernel ? language) kernels;
 
     in
     assert lib.assertMsg (!hasOldUniversal && !hasLanguage) ''
       build.toml seems to be of an older version, update it with:
             build2cmake update-build build.toml'';
-    buildConfig;
+    buildToml;
 
   backends =
-    buildConfig:
+    buildToml:
     let
-      kernels = lib.attrValues (buildConfig.kernel or { });
+      kernels = lib.attrValues (buildToml.kernel or { });
       kernelBackend = kernel: kernel.backend;
       init = {
         cuda = false;
@@ -66,11 +67,11 @@ rec {
 
   # Filter buildsets that are applicable to a given kernel build config.
   filterApplicableBuildSets =
-    buildConfig: buildSets:
+    buildToml: buildSets:
     let
-      backends' = backends buildConfig;
-      minCuda = buildConfig.general.cuda-minver or "11.8";
-      maxCuda = buildConfig.general.cuda-maxver or "99.9";
+      backends' = backends buildToml;
+      minCuda = buildToml.general.cuda-minver or "11.8";
+      maxCuda = buildToml.general.cuda-maxver or "99.9";
       versionBetween =
         minver: maxver: ver:
         builtins.compareVersions ver minver >= 0 && builtins.compareVersions ver maxver <= 0;
@@ -82,7 +83,7 @@ rec {
             || (isRocm buildSet.buildConfig && backends'.rocm)
             || (isMetal buildSet.buildConfig && backends'.metal)
             || (isXpu buildSet.buildConfig && backends'.xpu)
-            || (buildConfig.general.universal or false);
+            || (buildToml.general.universal or false);
           cudaVersionSupported =
             !(isCuda buildSet.buildConfig)
             || versionBetween minCuda maxCuda buildSet.pkgs.cudaPackages.cudaMajorMinorVersion;
@@ -111,11 +112,13 @@ rec {
     }:
     let
       inherit (lib) fileset;
-      buildConfig = readBuildConfig path;
-      kernels = buildConfig.kernel or { };
+      buildToml = readBuildConfig path;
+      kernels = lib.filterAttrs (_: kernel: computeFramework buildConfig == kernel.backend) (
+        buildToml.kernel or { }
+      );
       extraDeps = resolveDeps {
         inherit pkgs torch;
-        deps = lib.unique (lib.flatten (lib.mapAttrsToList (_: buildConfig: buildConfig.depends) kernels));
+        deps = lib.unique (lib.flatten (lib.mapAttrsToList (_: kernel: kernel.depends) kernels));
       };
 
       # Use the mkSourceSet function to get the source
@@ -125,11 +128,11 @@ rec {
       listMax = lib.foldl' lib.max 1;
       nvccThreads = listMax (
         lib.mapAttrsToList (
-          _: buildConfig: builtins.length (buildConfig.cuda-capabilities or supportedCudaCapabilities)
-        ) buildConfig.kernel
+          _: kernel: builtins.length (kernel.cuda-capabilities or supportedCudaCapabilities)
+        ) kernels # only the kernels selected for this build's backend
       );
     in
-    if buildConfig.general.universal then
+    if buildToml.general.universal then
       # No torch extension sources? Treat it as a noarch package.
 
       extension.mkNoArchExtension {
@@ -138,7 +141,7 @@ rec {
           rev
           doGetKernelCheck
           ;
-        extensionName = buildConfig.general.name;
+        extensionName = buildToml.general.name;
       }
     else
       extension.mkExtension {
@@ -151,7 +154,7 @@ rec {
           rev
           ;
 
-        extensionName = buildConfig.general.name;
+        extensionName = buildToml.general.name;
         doAbiCheck = true;
       };
 
@@ -198,9 +201,9 @@ rec {
           ;
         bundleOnly = true;
       };
-      buildConfig = readBuildConfig path;
+      buildToml = readBuildConfig path;
       namePaths =
-        if buildConfig.general.universal then
+        if buildToml.general.universal then
           # Noarch, just get the first extension.
           { "torch-universal" = builtins.head (builtins.attrValues extensions); }
         else

diff --git a/lib/deps.nix b/lib/deps.nix
@@ -30,7 +30,7 @@ let
     ];
     "torch" = [
       torch
-      torch.cxxdev
+      # NOTE(review): torch.cxxdev is disabled here without explanation; confirm why, then restore or delete it.
     ];
     "cutlass_sycl" = [ torch.xpuPackages.cutlass-sycl ];
   };

diff --git a/lib/torch-extension/arch.nix b/lib/torch-extension/arch.nix
@@ -129,7 +129,10 @@ stdenv.mkDerivation (prevAttrs: {
   ++ lib.optionals rocmSupport (
     with rocmPackages;
     [
+      hipcub-devel
       hipsparselt
+      rocprim-devel
+      rocthrust-devel
       rocwmma-devel
     ]
   )
@@ -145,14 +148,7 @@ stdenv.mkDerivation (prevAttrs: {
   env =
     lib.optionalAttrs cudaSupport {
       CUDAToolkit_ROOT = "${lib.getDev cudaPackages.cuda_nvcc}";
-      TORCH_CUDA_ARCH_LIST =
-        if cudaPackages.cudaOlder "12.8" then
-          "7.0;7.5;8.0;8.6;8.9;9.0"
-        else if cudaPackages.cudaOlder "13.0" then
-          "7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0"
-        else
-          # sm_101 has been renamed to sm_110 in CUDA 13.
-          "7.5;8.0;8.6;8.9;9.0;10.0;11.0;12.0";
+      TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;
     }
     // lib.optionalAttrs rocmSupport {
       PYTORCH_ROCM_ARCH = lib.concatStringsSep ";" torch.rocmArchs;
@@ -167,6 +163,9 @@ stdenv.mkDerivation (prevAttrs: {
 
   cmakeFlags = [
     (lib.cmakeFeature "Python_EXECUTABLE" "${python3.withPackages (ps: [ torch ])}/bin/python")
+    # Work around CMake's "file RPATH_CHANGE could not write new RPATH"
+    # error. Skipping RPATHs is fine because we rewrite them anyway.
+    (lib.cmakeBool "CMAKE_SKIP_RPATH" true)
   ]
   ++ lib.optionals cudaSupport [
     (lib.cmakeFeature "CMAKE_CUDA_HOST_COMPILER" "${stdenv.cc}/bin/g++")