Skip to content
This repository was archived by the owner on Apr 6, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/build_kernel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ jobs:
- name: Copy relu kernel
run: cp -rL examples/relu/result relu-kernel

- name: Build relu kernel (CPU)
run: ( cd examples/relu && nix build .\#redistributable.torch29-cxx11-cpu-x86_64-linux )
- name: Copy relu kernel (CPU)
run: cp -rL examples/relu/result relu-kernel-cpu

- name: Build cutlass GEMM kernel
run: ( cd examples/cutlass-gemm && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux )
- name: Copy cutlass GEMM kernel
Expand Down Expand Up @@ -66,6 +71,7 @@ jobs:
activation-kernel
cutlass-gemm-kernel
relu-kernel
relu-kernel-cpu
relu-backprop-compile-kernel
silu-and-mul-universal-kernel

Expand Down
12 changes: 12 additions & 0 deletions build-variants.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
{
"aarch64-darwin": {
"cpu": [
"torch28-cpu-aarch64-darwin",
"torch29-cpu-aarch64-darwin"
],
"metal": [
"torch28-metal-aarch64-darwin",
"torch29-metal-aarch64-darwin"
]
},
"aarch64-linux": {
"cpu": [
"torch28-cxx11-cpu-aarch64-linux",
"torch29-cxx11-cpu-aarch64-linux"
],
"cuda": [
"torch28-cxx11-cu129-aarch64-linux",
"torch29-cxx11-cu126-aarch64-linux",
Expand All @@ -14,6 +22,10 @@
]
},
"x86_64-linux": {
"cpu": [
"torch28-cxx11-cpu-x86_64-linux",
"torch29-cxx11-cpu-x86_64-linux"
],
"cuda": [
"torch28-cxx11-cu126-x86_64-linux",
"torch28-cxx11-cu128-x86_64-linux",
Expand Down
24 changes: 20 additions & 4 deletions build2cmake/src/config/v2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ impl Build {
self.kernels
.values()
.map(|kernel| match kernel {
Kernel::Cpu { .. } => Backend::Cpu,
Kernel::Cuda { .. } => Backend::Cuda,
Kernel::Metal { .. } => Backend::Metal,
Kernel::Rocm { .. } => Backend::Rocm,
Expand Down Expand Up @@ -96,6 +97,13 @@ impl Torch {
#[derive(Debug, Deserialize, Serialize)]
#[serde(deny_unknown_fields, rename_all = "kebab-case", tag = "backend")]
pub enum Kernel {
#[serde(rename_all = "kebab-case")]
Cpu {
cxx_flags: Option<Vec<String>>,
depends: Vec<Dependencies>,
include: Option<Vec<String>>,
src: Vec<String>,
},
#[serde(rename_all = "kebab-case")]
Cuda {
cuda_capabilities: Option<Vec<String>>,
Expand Down Expand Up @@ -135,7 +143,8 @@ pub enum Kernel {
impl Kernel {
pub fn cxx_flags(&self) -> Option<&[String]> {
match self {
Kernel::Cuda { cxx_flags, .. }
Kernel::Cpu { cxx_flags, .. }
| Kernel::Cuda { cxx_flags, .. }
| Kernel::Metal { cxx_flags, .. }
| Kernel::Rocm { cxx_flags, .. }
| Kernel::Xpu { cxx_flags, .. } => cxx_flags.as_deref(),
Expand All @@ -144,7 +153,8 @@ impl Kernel {

pub fn include(&self) -> Option<&[String]> {
match self {
Kernel::Cuda { include, .. }
Kernel::Cpu { include, .. }
| Kernel::Cuda { include, .. }
| Kernel::Metal { include, .. }
| Kernel::Rocm { include, .. }
| Kernel::Xpu { include, .. } => include.as_deref(),
Expand All @@ -153,6 +163,7 @@ impl Kernel {

pub fn backend(&self) -> Backend {
match self {
Kernel::Cpu { .. } => Backend::Cpu,
Kernel::Cuda { .. } => Backend::Cuda,
Kernel::Metal { .. } => Backend::Metal,
Kernel::Rocm { .. } => Backend::Rocm,
Expand All @@ -162,7 +173,8 @@ impl Kernel {

pub fn depends(&self) -> &[Dependencies] {
match self {
Kernel::Cuda { depends, .. }
Kernel::Cpu { depends, .. }
| Kernel::Cuda { depends, .. }
| Kernel::Metal { depends, .. }
| Kernel::Rocm { depends, .. }
| Kernel::Xpu { depends, .. } => depends,
Expand All @@ -171,7 +183,8 @@ impl Kernel {

pub fn src(&self) -> &[String] {
match self {
Kernel::Cuda { src, .. }
Kernel::Cpu { src, .. }
| Kernel::Cuda { src, .. }
| Kernel::Metal { src, .. }
| Kernel::Rocm { src, .. }
| Kernel::Xpu { src, .. } => src,
Expand All @@ -182,6 +195,7 @@ impl Kernel {
#[derive(Clone, Copy, Debug, Deserialize, Eq, Ord, PartialEq, PartialOrd, Serialize)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub enum Backend {
Cpu,
Cuda,
Metal,
Rocm,
Expand All @@ -191,6 +205,7 @@ pub enum Backend {
impl Display for Backend {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Backend::Cpu => write!(f, "cpu"),
Backend::Cuda => write!(f, "cuda"),
Backend::Metal => write!(f, "metal"),
Backend::Rocm => write!(f, "rocm"),
Expand All @@ -204,6 +219,7 @@ impl FromStr for Backend {

fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"cpu" => Ok(Backend::Cpu),
"cuda" => Ok(Backend::Cuda),
"metal" => Ok(Backend::Metal),
"rocm" => Ok(Backend::Rocm),
Expand Down
5 changes: 4 additions & 1 deletion build2cmake/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ use minijinja::Environment;

mod torch;
use torch::{
write_torch_ext_cuda, write_torch_ext_metal, write_torch_ext_universal, write_torch_ext_xpu,
write_torch_ext_cpu, write_torch_ext_cuda, write_torch_ext_metal, write_torch_ext_universal,
write_torch_ext_xpu,
};

mod config;
Expand Down Expand Up @@ -178,6 +179,7 @@ fn generate_torch(
};

let file_set = match backend {
Backend::Cpu => write_torch_ext_cpu(&env, &build, target_dir.clone(), ops_id)?,
Backend::Cuda | Backend::Rocm => {
write_torch_ext_cuda(&env, backend, &build, target_dir.clone(), ops_id)?
}
Expand Down Expand Up @@ -376,6 +378,7 @@ fn get_generated_files(

for backend in build.backends() {
let set = match backend {
Backend::Cpu => write_torch_ext_cpu(env, build, target_dir.clone(), ops_id.clone())?,
Backend::Cuda | Backend::Rocm => {
write_torch_ext_cuda(env, backend, build, target_dir.clone(), ops_id.clone())?
}
Expand Down
24 changes: 24 additions & 0 deletions build2cmake/src/templates/cpu/kernel.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Source files of this kernel.
set({{kernel_name}}_SRC
{{ sources }}
)

{% if includes %}
# Include directories that are attached to the sources of this kernel only.
# TODO: check if CLion supports this:
# https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories
set_source_files_properties(
{{'${' + kernel_name + '_SRC}'}}
PROPERTIES INCLUDE_DIRECTORIES "{{ includes }}")
{% endif %}

{% if cxx_flags %}
# Append the C++ flags per source file rather than on the target. All kernels
# are appended to the same source list, so target-level options would
# presumably also be applied to the sources of other kernels.
foreach(_KERNEL_SRC {{'${' + kernel_name + '_SRC}'}})
set_property(
SOURCE ${_KERNEL_SRC}
APPEND PROPERTY
COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:{{ cxx_flags }}>"
)
endforeach()
Comment on lines +14 to +20
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we set the flags for each source file separately?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC we have to set it on the whole target otherwise, which would result in applying the flags for other kernels as well.

{% endif %}

# Add C++ sources to main source list
list(APPEND SRC {{'"${' + kernel_name + '_SRC}"'}})
28 changes: 28 additions & 0 deletions build2cmake/src/templates/cpu/preamble.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Preamble of the generated CMake project for CPU builds: declares the project,
# locates Python and Torch, and marks the build as a CPU kernel build.
cmake_minimum_required(VERSION 3.26)
project({{name}} LANGUAGES CXX)

# NOTE(review): the CMake documentation asks for CMAKE_OSX_DEPLOYMENT_TARGET to
# be set before the first project() call; confirm that setting it here has the
# intended effect on macOS builds.
set(CMAKE_OSX_DEPLOYMENT_TARGET "15.0" CACHE STRING "Minimum macOS deployment version")

install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)

include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)

if(DEFINED Python3_EXECUTABLE)
  # Allow passing through the interpreter (e.g. from setup.py).
  find_package(Python3 COMPONENTS Development Development.SABIModule Interpreter)
  if (NOT Python3_FOUND)
    # Report the interpreter that was actually requested by the caller.
    message(FATAL_ERROR "Unable to find python matching: ${Python3_EXECUTABLE}.")
  endif()
else()
  find_package(Python3 REQUIRED COMPONENTS Development Development.SABIModule Interpreter)
endif()

# Make the CMake package files shipped with the Python torch package discoverable.
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")

find_package(Torch REQUIRED)

# Defined for every translation unit of the CPU build.
add_compile_definitions(CPU_KERNEL)
121 changes: 121 additions & 0 deletions build2cmake/src/templates/cpu/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import logging
import os
from shutil import which, move
import subprocess
import sys
from pathlib import Path

from setuptools import Extension, find_packages, setup
from setuptools.command.build_ext import build_ext

logger = logging.getLogger(__name__)


def is_sccache_available() -> bool:
    """Return ``True`` when an ``sccache`` executable is found on ``PATH``."""
    sccache_path = which("sccache")
    return sccache_path is not None


def is_ccache_available() -> bool:
    """Return ``True`` when a ``ccache`` executable is found on ``PATH``."""
    ccache_path = which("ccache")
    return ccache_path is not None


def is_ninja_available() -> bool:
    """Return ``True`` when a ``ninja`` executable is found on ``PATH``."""
    ninja_path = which("ninja")
    return ninja_path is not None


class CMakeExtension(Extension):
    """Extension without sources of its own; it only records a CMake source directory.

    Args:
        name: Dotted name of the extension module.
        sourcedir: Directory of the CMake project; resolved to an absolute path
            and stored as ``sourcedir``.
    """

    def __init__(self, name: str, sourcedir: str = "") -> None:
        # The source list is left empty and the limited Python API is requested.
        super().__init__(name, sources=[], py_limited_api=True)
        resolved_dir = Path(sourcedir).resolve()
        self.sourcedir = os.fspath(resolved_dir)


class CMakeBuild(build_ext):
    """``build_ext`` command that configures and builds an extension with CMake.

    The build reads the following environment variables:

    * ``DEBUG``: used when the command's own ``debug`` option is ``None``; a
      non-zero value selects a ``Debug`` build, otherwise ``Release`` is used.
    * ``CMAKE_GENERATOR``: when unset or ``Ninja``, the ``ninja`` Python
      package is used as the build program if it can be imported.
    * ``CMAKE_ARGS``: space-separated extra arguments for the configure step.
    * ``MAX_JOBS``: number of parallel build jobs; defaults to the number of
      CPUs available to this process.
    """

    def build_extension(self, ext: CMakeExtension) -> None:
        """Run the CMake configure and build steps for ``ext``.

        Raises:
            subprocess.CalledProcessError: If a CMake invocation exits with a
                non-zero status.
        """
        ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)
        extdir = ext_fullpath.parent.resolve()

        debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
        cfg = "Debug" if debug else "Release"

        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")

        cmake_args = [
            # Have CMake place the built library next to the Python package.
            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}",
            # Build against the interpreter that is running this script.
            f"-DPython3_EXECUTABLE={sys.executable}",
            f"-DCMAKE_BUILD_TYPE={cfg}",  # not used on MSVC, but no harm
        ]
        build_args = []
        if "CMAKE_ARGS" in os.environ:
            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]

        if not cmake_generator or cmake_generator == "Ninja":
            try:
                import ninja

                ninja_executable_path = Path(ninja.BIN_DIR) / "ninja"
                cmake_args += [
                    "-GNinja",
                    f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
                ]
            except ImportError:
                # Best effort: fall back to CMake's default generator.
                pass

        # Use a compiler cache when one is installed, preferring sccache.
        if is_sccache_available():
            launcher = "sccache"
        elif is_ccache_available():
            launcher = "ccache"
        else:
            launcher = None
        if launcher is not None:
            cmake_args += [
                f"-DCMAKE_{lang}_COMPILER_LAUNCHER={launcher}"
                for lang in ("C", "CXX", "HIP", "OBJC", "OBJCXX")
            ]

        num_jobs = os.getenv("MAX_JOBS", None)
        if num_jobs is not None:
            num_jobs = int(num_jobs)
            logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
        else:
            try:
                # os.sched_getaffinity() isn't universally available, so fall
                # back to os.cpu_count() if we get an error here.
                num_jobs = len(os.sched_getaffinity(0))
            except AttributeError:
                num_jobs = os.cpu_count()

        # Forward the job count to the build step so that MAX_JOBS takes
        # effect. os.cpu_count() may return None; in that case (or for a count
        # of zero) the build tool's default parallelism is used.
        if num_jobs:
            build_args += ["--parallel", str(num_jobs)]

        build_temp = Path(self.build_temp) / ext.name
        build_temp.mkdir(parents=True, exist_ok=True)

        # Configure, then build, inside the per-extension build directory.
        subprocess.run(
            ["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True
        )
        subprocess.run(
            ["cmake", "--build", ".", *build_args], cwd=build_temp, check=True
        )


# Package definition. The double-brace placeholders and the brace-percent
# conditional below are template markers that are substituted when this file is
# rendered.
setup(
name="{{ name }}",
# The version is just a stub, it's not used by the final build artefact.
version="0.1.0",
# A single native extension, built through CMake by the CMakeBuild command.
ext_modules=[CMakeExtension("{{ name }}.{{ ops_name }}")],
cmdclass={"build_ext": CMakeBuild},
# The Python sources of the package live in the torch-ext directory.
packages=find_packages(where="torch-ext", include=["{{ name }}*"]),
package_dir={"": "torch-ext"},
{% if data_globs %}
package_data={"{{ name }}": [ {{ data_globs }} ]},
{% endif %}
zip_safe=False,
install_requires=["torch"],
python_requires=">=3.9",
)
13 changes: 13 additions & 0 deletions build2cmake/src/templates/cpu/torch-binding.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Source files of the Torch binding.
set(TORCH_{{name}}_SRC
{{ src|join(' ') }}
)

{% if includes %}
# Include directories that are attached to the binding sources only.
# TODO: check if CLion supports this:
# https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories
set_source_files_properties(
{{'${TORCH_' + name + '_SRC}'}}
PROPERTIES INCLUDE_DIRECTORIES "{{ includes }}")
{% endif %}

# Add the binding sources to the main source list.
list(APPEND SRC {{'"${TORCH_' + name + '_SRC}"'}})
9 changes: 9 additions & 0 deletions build2cmake/src/templates/cpu/torch-extension.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Define the extension target from the sources collected in SRC.
# NOTE(review): define_gpu_extension_target is presumably provided by
# cmake/utils.cmake. GPU_LANG, GPU_FLAGS and GPU_ARCHES are not set in the CPU
# preamble template; confirm where they are defined for CPU builds, or that
# empty values are intended.
define_gpu_extension_target(
{{ ops_name }}
DESTINATION {{ ops_name }}
LANGUAGE ${GPU_LANG}
SOURCES ${SRC}
COMPILE_FLAGS ${GPU_FLAGS}
ARCHITECTURES ${GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
Loading
Loading