From c04e0e621032cd48c494a0ccdbdb3108d7f0f42d Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Sat, 6 Dec 2025 12:05:20 -0600
Subject: [PATCH 1/4] docs: add tutorial on raising loops [skip ci]

---
 docs/Project.toml | 1 +
 docs/src/.vitepress/config.mts | 2 +
 docs/src/tutorials/raising.md | 104 +++++++++++++++++++++++++++++++++
 3 files changed, 107 insertions(+)
 create mode 100644 docs/src/tutorials/raising.md

diff --git a/docs/Project.toml b/docs/Project.toml
index c0ccd2bbcb..550ce7eb64 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -3,6 +3,7 @@
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterVitepress = "4710194d-e776-4893-9690-8d956a29c365"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+PrettyChairmarks = "aafa11c5-44f9-44a1-b829-427e6ce1ffc2"
 Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
 ReactantCore = "a3311ec8-5e00-46d5-b541-4f83e724a433"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/docs/src/.vitepress/config.mts b/docs/src/.vitepress/config.mts
index 2853e7abd5..792461abb5 100644
--- a/docs/src/.vitepress/config.mts
+++ b/docs/src/.vitepress/config.mts
@@ -101,6 +101,7 @@ export default defineConfig({
          text: "Persistent Compilation Cache",
          link: "/tutorials/persistent_compile_cache",
        },
+       { text: "Raising", link: "/tutorials/raising" }
      ],
    },
    {
@@ -185,6 +186,7 @@ export default defineConfig({
          text: "Persistent Compilation Cache",
          link: "/tutorials/persistent_compile_cache",
        },
+       { text: "Raising", link: "/tutorials/raising" }
      ],
    }
  ],
diff --git a/docs/src/tutorials/raising.md b/docs/src/tutorials/raising.md
new file mode 100644
index 0000000000..a807259b17
--- /dev/null
+++ b/docs/src/tutorials/raising.md
@@ -0,0 +1,104 @@
# Raising

## Raising GPU Kernels



## Raising Scalar Loops to Tensor IR

We will implement a simple N-body simulation in Reactant. Instead of using broadcasting or
high-level abstractions, we will use loops and scalar operations to implement it.

```@example raising_stablehlo
using Reactant, PrettyChairmarks

Reactant.allowscalar(true) # generally not recommended to turn on globally
```

We will implement a naive function to compute the attractive force between each pair of
particles in the system.

```@example raising_stablehlo
function compute_attractive_force(
    positions::AbstractMatrix, masses::AbstractVector, G::Number
)
    N = size(positions, 2)
    F = similar(positions, N, N)

    @trace for i in 1:N
        @trace for j in 1:N
            dx = positions[1, i] - positions[1, j]
            dy = positions[2, i] - positions[2, j]
            dz = positions[3, i] - positions[3, j]

            # dx is zero when i == j, so the diagonal never divides by zero
            invr² = ifelse(i == j, dx, inv(dx^2 + dy^2 + dz^2))

            Fx = G * masses[i] * masses[j] * invr² * dx
            Fy = G * masses[i] * masses[j] * invr² * dy
            Fz = G * masses[i] * masses[j] * invr² * dz
            F[i, j] = Fx + Fy + Fz
        end
    end

    return F
end
```

```@example raising_stablehlo
positions = randn(Float32, 3, 1024)
masses = rand(Float32, 1024) .* 10

positions_ra = Reactant.to_rarray(positions)
masses_ra = Reactant.to_rarray(masses)
nothing # hide
```

Let's see what the HLO IR looks like for this function (without enabling loop raising).

```@example raising_stablehlo
@code_hlo compile_options = CompileOptions(;
    disable_auto_batching_passes=true
) compute_attractive_force(positions_ra, masses_ra, 2.0f0)
```

This IR has a nested loop, but that won't work nicely on GPUs/TPUs. Even on CPUs, XLA
often doesn't do a great job with loops.
By default, Reactant will attempt to raise these loops to a tensor IR.

```@example raising_stablehlo
hlo = @code_hlo compute_attractive_force(positions_ra, masses_ra, 2.0f0)
@assert !contains(repr(hlo), "stablehlo.while") #hide
hlo
```

This IR doesn't contain any loops; instead, it is written in a tensor IR! Let's ensure
that the values are identical.

```@example raising_stablehlo
y_jl = compute_attractive_force(positions, masses, 2.0f0)
y_ra = @jit compute_attractive_force(positions_ra, masses_ra, 2.0f0)
maximum(abs, Array(y_ra) .- y_jl)
```

Let's time the execution of the two versions.

```@example raising_stablehlo
fn1 = @compile sync=true compile_options=CompileOptions(;
    disable_auto_batching_passes=true
) compute_attractive_force(positions_ra, masses_ra, 2.0f0)
fn2 = @compile sync=true compute_attractive_force(positions_ra, masses_ra, 2.0f0)
```

Runtime for the non-raised function:

```@example raising_stablehlo
@bs fn1(positions_ra, masses_ra, 2.0f0)
```

Runtime for the raised function:

```@example raising_stablehlo
@bs fn2(positions_ra, masses_ra, 2.0f0)
```

From 5a47dd628c0ebc7bc4c77e561b49dab1036bfa6f Mon Sep 17 00:00:00 2001
From: Paul Berg
Date: Tue, 9 Dec 2025 10:52:48 +0100
Subject: [PATCH 2/4] Update raising.md

---
 docs/src/tutorials/raising.md | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/docs/src/tutorials/raising.md b/docs/src/tutorials/raising.md
index a807259b17..aea394e687 100644
--- a/docs/src/tutorials/raising.md
+++ b/docs/src/tutorials/raising.md
@@ -2,7 +2,16 @@

## Raising GPU Kernels

-
+Kernel raising refers to Reactant's ability to transform a program written in a GPU kernel style, that is, kernel functions that are evaluated over a grid of blocks and threads with operations performed at the scalar level. The transformation raises the program to a tensor-style function (in the StableHLO dialect) where operations are broadcast.
+
+This transformation enables several features:
+
+ - Running the raised compute kernel on hardware that the original kernel was not designed to run on (_e.g._ running a CUDA kernel on a TPU).
+ - Enabling further optimizations: since the raised kernel is now indistinguishable from the rest of the program, it can be optimized together with it. For example, two sequential kernel launches operating on each other's results can be fused if they are both raised, resulting in a single kernel launch in the final optimized StableHLO program.
+ - Lastly, automatic differentiation in Reactant is currently not supported for GPU kernels. Raising enables Enzyme to differentiate the raised kernel. For this to work, one must use the `raise_first` compilation option to make sure the kernels are raised before Enzyme performs automatic differentiation on the program.
+
+!!! note
+    Not all classes of kernels are currently raisable to StableHLO. If your kernel encounters an error while being raised, please open an issue on [the Reactant.jl repository](https://github.com/EnzymeAD/Reactant.jl/issues/new?labels=raising).
## Raising Scalar Loops to Tensor IR

From db94b19605fbf Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Thu, 11 Dec 2025 23:13:11 -0500
Subject: [PATCH 3/4] docs: warmup

---
 docs/src/tutorials/raising.md | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/docs/src/tutorials/raising.md b/docs/src/tutorials/raising.md
index aea394e687..8ce2f7082e 100644
--- a/docs/src/tutorials/raising.md
+++ b/docs/src/tutorials/raising.md
@@ -2,16 +2,29 @@

## Raising GPU Kernels

Kernel raising refers to Reactant's ability to transform a program written in a GPU kernel
style, that is, kernel functions that are evaluated over a grid of blocks and threads with
operations performed at the scalar level. The transformation raises the program to a
tensor-style function (in the StableHLO dialect) where operations are broadcast.

This transformation enables several features:

- Running the raised compute kernel on hardware that the original kernel was not designed
  to run on (_e.g._ running a CUDA kernel on a TPU).
- Enabling further optimizations: since the raised kernel is now indistinguishable from the
  rest of the program, it can be optimized together with it. For example, two sequential
  kernel launches operating on each other's results can be fused if they are both raised,
  resulting in a single kernel launch in the final optimized StableHLO program.
- Lastly, automatic differentiation in Reactant is currently not supported for GPU kernels.
  Raising enables Enzyme to differentiate the raised kernel. For this to work, one must use
  the `raise_first` compilation option to make sure the kernels are raised before Enzyme
  performs automatic differentiation on the program.

!!! note

    Not all classes of kernels are currently raisable to StableHLO. If your kernel
    encounters an error while being raised, please open an issue on
    [the Reactant.jl repository](https://github.com/EnzymeAD/Reactant.jl/issues/new?labels=raising).
## Raising Scalar Loops to Tensor IR

@@ -103,11 +116,13 @@ fn2 = @compile sync=true compute_attractive_force(positions_ra, masses_ra, 2.0f0
```

Runtime for the non-raised function:

```@example raising_stablehlo
+fn1(positions_ra, masses_ra, 2.0f0) #hide
@bs fn1(positions_ra, masses_ra, 2.0f0)
```

Runtime for the raised function:

```@example raising_stablehlo
+fn2(positions_ra, masses_ra, 2.0f0) #hide
@bs fn2(positions_ra, masses_ra, 2.0f0)
```

From 263e2e2f868ea306816c3b59100090a40dc90720 Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Fri, 12 Dec 2025 09:38:10 -0500
Subject: [PATCH 4/4] docs: add a kernel raising example

---
 docs/Project.toml | 2 ++
 docs/src/tutorials/raising.md | 42 +++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/docs/Project.toml b/docs/Project.toml
index 550ce7eb64..3de01d2f95 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,8 +1,10 @@
 [deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterVitepress = "4710194d-e776-4893-9690-8d956a29c365"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 PrettyChairmarks = "aafa11c5-44f9-44a1-b829-427e6ce1ffc2"
 Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
 ReactantCore = "a3311ec8-5e00-46d5-b541-4f83e724a433"
diff --git a/docs/src/tutorials/raising.md b/docs/src/tutorials/raising.md
index 8ce2f7082e..ab7ab48486 100644
--- a/docs/src/tutorials/raising.md
+++ b/docs/src/tutorials/raising.md
@@ -26,6 +26,48 @@
### Example

```@example raising_kernelabstractions_to_stablehlo
using Reactant
using KernelAbstractions
using CUDA # needs to be loaded for raising, even if CUDA is not functional on your system
```

!!! tip

    We could also have implemented the kernel directly with CUDA.jl instead of
    KernelAbstractions.jl.

We will implement a simple kernel to compute the square of a vector.

```@example raising_kernelabstractions_to_stablehlo
@kernel function square_kernel!(y, @Const(x))
    i = @index(Global)
    @inbounds y[i] = x[i] * x[i]
end

function square(x)
    y = similar(x)
    backend = KernelAbstractions.get_backend(x)
    kernel! = square_kernel!(backend)
    kernel!(y, x; ndrange=length(x))
    return y
end
```

```@example raising_kernelabstractions_to_stablehlo
x = Reactant.to_rarray(collect(1:1:64) ./ 64)
nothing # hide
```

Let's see what the HLO IR looks like for this function. Raising is enabled automatically
on backends that the original kernel was not designed to run on (such as TPU). To enable
raising on other backends, pass the `raise=true` option.

```@example raising_kernelabstractions_to_stablehlo
@code_hlo raise=true square(x)
```

## Raising Scalar Loops to Tensor IR

We will implement a simple N-body simulation in Reactant. Instead of using
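A quick sanity check for the raised kernel is to execute it and compare the result against plain Julia broadcasting. The sketch below is an illustrative addition: it reuses the `square` function defined in the tutorial and assumes `@compile` accepts the same `raise=true` keyword that is shown with `@code_hlo` above.

```julia
# Sketch: run the raised kernel and compare against plain Julia broadcasting.
# Assumes `@compile raise=true` is accepted, mirroring the `@code_hlo raise=true` call above.
using Reactant, KernelAbstractions, CUDA

x_jl = collect(1:1:64) ./ 64
x = Reactant.to_rarray(x_jl)

square_raised = @compile raise=true square(x)   # `square` as defined in the tutorial
err = maximum(abs, Array(square_raised(x)) .- x_jl .^ 2)
err  # expected to be (close to) zero
```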