diff --git a/.gitmodules b/.gitmodules
index 59eacdf..468fa55 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,3 @@
-[submodule "third_party/local/WebGPU-distribution"]
- path = third_party/local/WebGPU-distribution
- url = https://github.com/eliemichel/WebGPU-distribution.git
- branch = dawn
[submodule "third_party/llm.c"]
path = third_party/llm.c
url = https://github.com/karpathy/llm.c
diff --git a/Makefile b/Makefile
index 3f69c6e..8e5d67b 100644
--- a/Makefile
+++ b/Makefile
@@ -19,13 +19,42 @@ pch:
mkdir -p build && $(CXX) -std=c++17 $(INCLUDES) -x c++-header gpu.hpp -o build/gpu.hpp.pch
# TODO(avh): change extension based on platform
-lib:
- mkdir -p build && $(CXX) -std=c++17 $(INCLUDES) -L$(LIBDIR) -ldawn -ldl -shared -fPIC gpu.cpp -o build/libgpucpp.dylib
+# Get the current OS name
+OS = $(shell uname | tr -d '\n')
+# Set the specific variables for each platform
+LIB_PATH ?= /usr/lib
+HEADER_PATH ?= /usr/include
+ifeq ($(OS), Linux)
+OS_TYPE ?= Linux
+GPU_CPP_LIB_NAME ?= libgpucpp.so
+DAWN_LIB_NAME ?= libwebgpu_dawn.so
+else ifeq ($(OS), Darwin)
+OS_TYPE ?= macOS
+GPU_CPP_LIB_NAME ?= libgpucpp.dylib
+DAWN_LIB_NAME ?= libwebgpu_dawn.dylib
+else
+OS_TYPE ?= unknown
+endif
+
+lib: check-clang dawnlib
+ mkdir -p build && $(CXX) -std=c++17 $(INCLUDES) -L$(LIBDIR) -lwebgpu_dawn -ldl -shared -fPIC gpu.cpp -o build/$(GPU_CPP_LIB_NAME)
+ python3 build.py
+ cp third_party/lib/$(DAWN_LIB_NAME) build/
+
+install:
+ cp build/$(GPU_CPP_LIB_NAME) $(LIB_PATH)
+ cp build/$(DAWN_LIB_NAME) $(LIB_PATH)
+ cp build/gpu.hpp $(HEADER_PATH)
+
+uninstall:
+ rm $(LIB_PATH)/$(GPU_CPP_LIB_NAME)
+ rm $(LIB_PATH)/$(DAWN_LIB_NAME)
+ rm $(HEADER_PATH)/gpu.hpp
examples/hello_world/build/hello_world: check-clang dawnlib examples/hello_world/run.cpp check-linux-vulkan
$(LIBSPEC) && cd examples/hello_world && make build/hello_world && ./build/hello_world
-dawnlib: $(if $(wildcard third_party/lib/libdawn.so third_party/lib/libdawn.dylib),,run_setup)
+dawnlib: $(if $(wildcard third_party/lib/libwebgpu_dawn.so third_party/lib/libwebgpu_dawn.dylib),,run_setup)
run_setup: check-python
python3 setup.py
@@ -42,7 +71,7 @@ all: dawnlib check-clang check-linux-vulkan lib pch
# Test 16-bit floating point type
test-half: dawnlib check-clang
- $(LIBSPEC) && clang++ -std=c++17 $(INCLUDES) numeric_types/half.cpp -L$(LIBDIR) -ldawn -ldl -o build/half && ./build/half
+ $(LIBSPEC) && clang++ -std=c++17 $(INCLUDES) numeric_types/half.cpp -L$(LIBDIR) -lwebgpu_dawn -ldl -o build/half && ./build/half
docs: Doxyfile
doxygen Doxyfile
@@ -73,7 +102,7 @@ all-cmake: check-clang check-cmake
################################################################################
clean-dawnlib:
- rm -f third_party/lib/libdawn.so third_party/lib/libdawn.dylib
+ rm -f third_party/lib/libwebgpu_dawn.so third_party/lib/libwebgpu_dawn.dylib
clean:
read -r -p "This will delete the contents of build/*. Are you sure? [CTRL-C to abort] " response && rm -rf build/*
@@ -90,21 +119,30 @@ clean:
rm -f build/half
clean-all:
- read -r -p "This will delete the contents of build/* and third_party/*. Are you sure? [CTRL-C to abort] " response && rm -rf build/* third_party/fetchcontent/* third_party/gpu-build third_party/gpu-subbuild third_party/gpu-src third_party/lib/libdawn.so third_party/lib/libdawn.dylib
+ read -r -p "This will delete the contents of build/* and third_party/*. Are you sure? [CTRL-C to abort] " response && rm -rf build/* third_party/fetchcontent/* third_party/gpu-build third_party/gpu-subbuild third_party/gpu-src third_party/lib/libwebgpu_dawn.so third_party/lib/libwebgpu_dawn.dylib
################################################################################
# Checks
################################################################################
+# Check all
+check-all: check-os check-clang check-cmake check-python
+
+# Check the OS
+check-os:
+ifeq ($(OS_TYPE), unknown)
+$(error Unsupported operating system)
+endif
+
# check for the existence of clang++ and cmake
check-clang:
- @command -v clang++ >/dev/null 2>&1 || { echo >&2 "Please install clang++ with 'sudo apt-get install clang' or 'brew install llvm'"; exit 1; }
+ @command -v clang++ >/dev/null 2>&1 || { echo -e >&2 "Clang++ is not installed. Please install clang++ to continue.\nOn Debian / Ubuntu: 'sudo apt-get install clang'\nOn macOS: 'brew install llvm'\nOn CentOS: 'sudo yum install clang'"; exit 1; }
check-cmake:
- @command -v cmake >/dev/null 2>&1 || { echo >&2 "Please install cmake with 'sudo apt-get install cmake' or 'brew install cmake'"; exit 1; }
+ @command -v cmake >/dev/null 2>&1 || { echo -e >&2 "CMake is not installed. Please install CMake to continue.\nOn Debian / Ubuntu: 'sudo apt-get install cmake'\nOn macOS: 'brew install cmake'\nOn CentOS: 'sudo yum install cmake'"; exit 1; }
check-python:
- @command -v python3 >/dev/null 2>&1 || { echo >&2 "Python needs to be installed and in your path."; exit 1; }
+ @command -v python3 >/dev/null 2>&1 || { echo -e >&2 "Python 3 is not installed. Please install Python 3 to continue.\nOn Debian / Ubuntu: 'sudo apt-get install python3'\nOn CentOS: 'sudo yum install python3'"; exit 1; }
check-linux-vulkan:
@echo "Checking system type and Vulkan availability..."
@@ -113,7 +151,7 @@ check-linux-vulkan:
echo "Vulkan is installed."; \
vulkaninfo; \
else \
- echo "Vulkan is not installed. Please install Vulkan drivers to continue. On Debian / Ubuntu: sudo apt install libvulkan1 mesa-vulkan-drivers vulkan-tools"; \
+ echo -e "Vulkan is not installed. Please install Vulkan drivers to continue.\nOn Debian / Ubuntu: 'sudo apt install libvulkan1 mesa-vulkan-drivers vulkan-tools'.\nOn Centos: 'sudo yum install vulkan vulkan-tools.'"; \
exit 1; \
fi \
else \
diff --git a/README.md b/README.md
index 4b69bef..46340b7 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ GPU code in C++ projects and have it run on Nvidia, Intel, AMD, and other GPUs.
The same C++ code can work on a wide variety of laptops, workstations, mobile
devices or virtually any hardware with Vulkan, Metal, or DirectX support.
-## Technical Objectives: Lightweight, Fast Iteration, and Low Boilerplate
+## Objectives: Lightweight, Fast Iteration, and Low Boilerplate
With gpu.cpp we want to enable a high-leverage library for individual developers and researchers to incorporate GPU computation into programs relying on nothing more than a standard C++ compiler as tooling. Our goals are:
@@ -189,7 +189,7 @@ illustrate how to use gpu.cpp as a library.
After you have run `make` in the top-level directory which retrieves the prebuilt Dawn shared library, you can run each example by navigating to its directory and running `make` from the example's directory.
-An example of tiled matrix multiplication is in [examples/matmul](https://github.com/AnswerDotAI/gpu.cpp/blob/main/examples/matmul/). This implements a WebGPU version of the first few kernels of Simon Boehm's [How to Optimize a CUDA Matmul Kernel for cuBLAS-like Performance: a Worklog](https://siboehm.com/articles/22/CUDA-MMM) post. It currently runs at ~ 2.5+ TFLOPs on a Macbook Pro M1 Max laptop, which has a theoretical peak of 10.4 TFLOPs. Contributions to optimize this further are welcome.
+An example of tiled matrix multiplication is in [examples/matmul](https://github.com/AnswerDotAI/gpu.cpp/blob/main/examples/matmul/). This implements a WebGPU version of the first few kernels of Simon Boehm's [How to Optimize a CUDA Matmul Kernel for cuBLAS-like Performance: a Worklog](https://siboehm.com/articles/22/CUDA-MMM) post. It currently runs at ~ 3.5+ TFLOPs on a MacBook Pro M1 Max laptop. Contributions to optimize this further are welcome.
A parallel physics simulation of an ensemble of double pendulums simulated in parallel with different initial conditions on the GPU is shown in [examples/physics](https://github.com/AnswerDotAI/gpu.cpp/tree/main/examples/physics).
@@ -198,9 +198,7 @@ A parallel physics simulation of an ensemble of double pendulums simulated in pa
-We also show some examples of signed distance function computations, rendered in the terminal as ascii. A 3D SDF of spheres is shown in [examples/render](https://github.com/AnswerDotAI/gpu.cpp/tree/main/examples/render]) and a shadertoy-like live-reloading example is in [examples/shadertui](https://github.com/AnswerDotAI/gpu.cpp/tree/main/examples/shadertui).
-
-Interestingly, given a starting example, LLMs such as Claude 3.5 Sonnet can be quite capable at writing low-level WGSL code for you - the other shaders in the shadertui example are written by the LLM.
+We also show some examples of signed distance function computations, rendered in the terminal as ascii. A 3D SDF of spheres is shown in [examples/render](https://github.com/AnswerDotAI/gpu.cpp/tree/main/examples/render) and a shadertoy-like live-reloading example is in [examples/shadertui](https://github.com/AnswerDotAI/gpu.cpp/tree/main/examples/shadertui).

@@ -232,22 +230,16 @@ gpu.cpp lets us implement and drop-in any algorithm with fine-grained control of
gpu.cpp is meant for developers with some familiarity with C++ and GPU programming. It is not a high-level numerical computing or machine learning framework or inference engine, though it can be used in support of such implementations.
-Second, in spite of the name, WebGPU has native implementations decoupled from the web and the browser. gpu.cpp leverages WebGPU as a portable _native_ GPU API first and foremost, with the possibility of running in the browser being a convenient additional benefit in the future.
-
-If you find it counterintuitive, as many do, that WebGPU is a native technology and not just for the web, watch Elie Michel's excellent talk ["WebGPU is Not Just About the Web"](https://www.youtube.com/watch?v=qHrx41aOTUQ).
+Second, in spite of the name, WebGPU has native implementations decoupled from the web and the browser. If you find it counterintuitive, watch Elie Michel's excellent talk ["WebGPU is Not Just About the Web"](https://www.youtube.com/watch?v=qHrx41aOTUQ).
Finally, the focus of gpu.cpp is general-purpose GPU computation rather than rendering/graphics on the GPU, although it can be useful for offline rendering or video processing use cases. We may explore directions with graphics in the future, but for now our focus is GPU compute.
## Limitations and Upcoming Features
-_API Improvements_ - gpu.cpp is a work-in-progress and there are many features and improvements to come. At this early stage, we expect the API design to evolve as we identify improvements / needs from use cases. In particular, the handling of structured parameters and asynchronous dispatch will undergo refinement and maturation in the short-term.
-
_Browser Targets_ - In spite of using WebGPU we haven't tested builds targeting the browser yet though this is a short-term priority.
_Reusable Kernel Library_ - Currently the core library is strictly the operations and types for interfacing with the WebGPU API, with some specific use case example WGSL implementations in `examples/`. Over time, as kernel implementations mature we may migrate some of the reusable operations from specific examples into a small reusable kernel library.
-_More Use Case Examples and Tests_ - Expect an iteration loop of use cases to design tweaks and improvements, which in turn make the use cases cleaner and easier to write. One short term use cases to flesh out the kernels from [llm.c](https://github.com/karpathy/llm.c) in WebGPU form. As these mature into a reusable kernel library, we hope to help realize the potential for WebGPU compute in AI.
-
## Troubleshooting
If you run into issues building the project, please open an issue.
diff --git a/bindings/haskell/CHANGELOG.md b/bindings/haskell/CHANGELOG.md
new file mode 100644
index 0000000..d20679e
--- /dev/null
+++ b/bindings/haskell/CHANGELOG.md
@@ -0,0 +1,5 @@
+# Revision history for gpu-cpp
+
+## 0.1.0.0 -- 2024-12-28
+
+* First version.
diff --git a/bindings/haskell/Makefile b/bindings/haskell/Makefile
new file mode 100644
index 0000000..7ca37a0
--- /dev/null
+++ b/bindings/haskell/Makefile
@@ -0,0 +1,3 @@
+all:
+ cabal configure --extra-include-dirs=$(PWD)/../.. --extra-include-dirs=$(PWD)/../../third_party/headers --extra-lib-dirs=$(PWD)/../../third_party/lib
+ cabal build .
diff --git a/bindings/haskell/app/Main.hs b/bindings/haskell/app/Main.hs
new file mode 100644
index 0000000..ba1ae6d
--- /dev/null
+++ b/bindings/haskell/app/Main.hs
@@ -0,0 +1,37 @@
+module Main where
+
+import GpuCpp.Types
+import GpuCpp
+import qualified Data.Vector.Storable as V
+import Foreign.C.Types
+
+main :: IO ()
+main = do
+ context <- createContext
+ input <- createTensor context [12] kf32
+ output <- createTensor context [12] kf32
+ kernelCode <- createKernelCode
+ (
+ "const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)\n" <>
+ "@group(0) @binding(0) var
inp: array<{{precision}}>;\n" <>
+ "@group(0) @binding(1) var out: array<{{precision}}>;\n" <>
+ "@group(0) @binding(1) var dummy: array<{{precision}}>;\n" <>
+ "@compute @workgroup_size({{workgroupSize}})\n" <>
+ "fn main(\n" <>
+ " @builtin(global_invocation_id) GlobalInvocationID: vec3) {\n" <>
+ " let i: u32 = GlobalInvocationID.x;\n" <>
+ " if (i < arrayLength(&inp)) {\n" <>
+ " let x: f32 = inp[i];\n" <>
+ " out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR \n" <>
+ " * (x + .044715 * x * x * x))), x, x > 10.0);\n" <>
+ " }\n" <>
+ "}\n"
+ )
+ 256
+ kf32
+ kernel <- createKernel context kernelCode [input, output] [0,0] [12,1,1]
+ toGpu context (V.fromList [1 :: CFloat,2,3,4,1,2,3,4,1,2,3,4]) input
+ async <- dispatchKernel context kernel
+ wait context async
+ vec <- toCpu context output :: IO (V.Vector CFloat)
+ print vec
diff --git a/bindings/haskell/gpu-cpp.cabal b/bindings/haskell/gpu-cpp.cabal
new file mode 100644
index 0000000..90cb4fa
--- /dev/null
+++ b/bindings/haskell/gpu-cpp.cabal
@@ -0,0 +1,49 @@
+cabal-version: 3.0
+name: gpu-cpp
+version: 0.1.0.0
+license: BSD-3-Clause
+author: Junji Hashimoto
+maintainer: junji.hashimoto@gmail.com
+category: Math
+build-type: Simple
+
+extra-doc-files: CHANGELOG.md
+
+common warnings
+ ghc-options: -Wall
+
+library
+ import: warnings
+ exposed-modules: GpuCpp
+ , GpuCpp.Types
+ build-depends: base ^>=4.18.1.0
+ , inline-c
+ , inline-c-cpp
+ , containers
+ , template-haskell
+ , safe-exceptions
+ , vector
+ hs-source-dirs: src
+ default-language: Haskell2010
+ ghc-options: -optcxx-std=c++17
+ extra-libraries: webgpu_dawn
+
+executable gpu-cpp
+ import: warnings
+ main-is: Main.hs
+ build-depends: base ^>=4.18.1.0
+ , gpu-cpp
+ , vector
+ hs-source-dirs: app
+ default-language: Haskell2010
+
+test-suite gpu-cpp-test
+ import: warnings
+ default-language: Haskell2010
+ type: exitcode-stdio-1.0
+ hs-source-dirs: test
+ main-is: Main.hs
+ build-depends: base ^>=4.18.1.0
+ , gpu-cpp
+ , vector
+ , hspec
diff --git a/bindings/haskell/src/GpuCpp.hs b/bindings/haskell/src/GpuCpp.hs
new file mode 100644
index 0000000..2177ecf
--- /dev/null
+++ b/bindings/haskell/src/GpuCpp.hs
@@ -0,0 +1,207 @@
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE PolyKinds #-}
+{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE QuasiQuotes #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TypeApplications #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE FlexibleInstances #-}
+
+module GpuCpp where
+
+import qualified Language.C.Inline.Cpp as C
+import qualified Language.C.Inline.Cpp.Unsafe as C
+import qualified Language.C.Inline.Context as C
+import Foreign.C.String
+import Foreign.C.Types
+import GHC.Int
+import GHC.ForeignPtr(mallocPlainForeignPtrBytes)
+import Foreign
+import Control.Monad (forM_)
+import GpuCpp.Types
+import Control.Exception.Safe (bracket)
+import qualified Data.Vector.Storable as V
+
+C.context $ C.cppCtx <> mempty { C.ctxTypesTable = typeTable }
+
+C.include ""
+C.include ""
+C.include ""
+
+[C.emitBlock|
+struct GpuAsync {
+ std::promise<void> promise;
+ std::future<void> future;
+ GpuAsync(): future(promise.get_future()){
+ }
+};
+
+gpu::Shape vector_to_shape(const std::vector &dims) {
+ switch(dims.size()){
+ case 1:
+ return gpu::Shape{(unsigned long)dims[0]};
+ break;
+ case 2:
+ return gpu::Shape{(unsigned long)dims[0],(unsigned long)dims[1]};
+ break;
+ case 3:
+ return gpu::Shape{(unsigned long)dims[0],(unsigned long)dims[1],(unsigned long)dims[2]};
+ break;
+ case 4:
+ return gpu::Shape{(unsigned long)dims[0],(unsigned long)dims[1],(unsigned long)dims[2],(unsigned long)dims[3]};
+ break;
+ case 5:
+ return gpu::Shape{(unsigned long)dims[0],(unsigned long)dims[1],(unsigned long)dims[2],(unsigned long)dims[3],(unsigned long)dims[4]};
+ break;
+ }
+ return gpu::Shape{0};
+}
+|]
+
+kf32 :: CInt
+kf32 = [C.pure| int { (int)gpu::kf32 } |]
+
+createContext :: IO (ForeignPtr Context)
+createContext =
+ [C.throwBlock| gpu::Context* { return new gpu::Context(gpu::createContext()); }|] >>=
+ newForeignPtr
+ [C.funPtr| void deleteContext(gpu::Context* ptr) { delete ptr; }|]
+
+
+createKernelCode :: String -> CInt -> CInt -> IO (ForeignPtr KernelCode)
+createKernelCode kernelString workgroupSize precision =
+ withCString kernelString $ \pData ->
+ [C.throwBlock| gpu::KernelCode* { return new gpu::KernelCode($(char* pData), $(int workgroupSize), (gpu::NumType)$(int precision)); }|] >>=
+ newForeignPtr
+ [C.funPtr| void deleteKernelCode(gpu::KernelCode* ptr) { delete ptr; }|]
+
+
+dispatchKernel :: ForeignPtr Context -> ForeignPtr Kernel -> IO (ForeignPtr GpuAsync)
+dispatchKernel context kernel =
+ withForeignPtr context $ \c ->
+ withForeignPtr kernel $ \k ->
+ [C.throwBlock| GpuAsync* {
+ auto async = new GpuAsync();
+ gpu::dispatchKernel(*$(gpu::Context* c), *$(gpu::Kernel* k), async->promise);
+ return async; }|] >>=
+ newForeignPtr
+ [C.funPtr| void deleteGpuAsync(GpuAsync* ptr) { delete ptr; }|]
+
+wait :: ForeignPtr Context -> ForeignPtr GpuAsync -> IO ()
+wait context async =
+ withForeignPtr context $ \c ->
+ withForeignPtr async $ \a ->
+ [C.throwBlock| void {
+ gpu::wait(*$(gpu::Context* c), $(GpuAsync* a)->future);
+ }|]
+
+instance WithVector CInt Int64 where
+ withVector shape func =
+ bracket
+ (do
+ let len = fromIntegral $ length shape
+ vec <- [C.throwBlock| std::vector<int64_t>* {
+ return new std::vector<int64_t>($(int len));
+ }|]
+ ptr <- [C.throwBlock| int64_t* {
+ return $(std::vector<int64_t>* vec)->data();
+ }|]
+ pokeArray ptr (map fromIntegral shape)
+ return vec
+ )
+ (\vec -> [C.block| void { delete $(std::vector<int64_t>* vec); }|])
+ (\vec -> func vec)
+
+instance WithVector CInt CSize where
+ withVector shape func =
+ bracket
+ (do
+ let len = fromIntegral $ length shape
+ vec <- [C.throwBlock| std::vector<size_t>* {
+ return new std::vector<size_t>($(int len));
+ }|]
+ ptr <- [C.throwBlock| size_t* {
+ return $(std::vector<size_t>* vec)->data();
+ }|]
+ pokeArray ptr (map fromIntegral shape)
+ return vec
+ )
+ (\vec -> [C.block| void { delete $(std::vector<size_t>* vec); }|])
+ (\vec -> func vec)
+
+instance WithVector (Ptr Tensor) Tensor where
+ withVector ptrs func =
+ bracket (do
+ vec <- [C.throwBlock| std::vector<gpu::Tensor>* { return new std::vector<gpu::Tensor>(); }|]
+ forM_ ptrs $ do
+ \ptr -> [C.throwBlock| void { $(std::vector<gpu::Tensor>* vec)->push_back(*$(gpu::Tensor* ptr)); }|]
+ return vec
+ )
+ (\vec -> [C.block| void { delete $(std::vector<gpu::Tensor>* vec); }|])
+ (\vec -> func vec)
+
+withForeignPtrs :: [ForeignPtr a] -> ([Ptr a] -> IO b) -> IO b
+withForeignPtrs [] func = func []
+withForeignPtrs (x:xs) func =
+ withForeignPtr x $ \x' ->
+ withForeignPtrs xs $ \xs' ->
+ func (x':xs')
+
+createKernel :: ForeignPtr Context -> ForeignPtr KernelCode -> [ForeignPtr Tensor] -> [Int] -> [Int] -> IO (ForeignPtr Kernel)
+createKernel context kernelCode dataBindings viewOffsets totalWorkgroups =
+ withForeignPtr context $ \c ->
+ withForeignPtr kernelCode $ \k ->
+ withForeignPtrs dataBindings $ \b ->
+ withVector b $ \b' ->
+ withVector @CInt (map fromIntegral viewOffsets) $ \v ->
+ withVector @CInt (map fromIntegral totalWorkgroups) $ \w ->
+ [C.throwBlock| gpu::Kernel* {
+ return new gpu::Kernel(gpu::createKernel(
+ *$(gpu::Context* c),
+ *$(gpu::KernelCode* k),
+ $(std::vector<gpu::Tensor>* b')->data(),
+ $(std::vector<gpu::Tensor>* b')->size(),
+ $(std::vector<size_t>* v)->data(),
+ vector_to_shape(*$(std::vector<int64_t>* w))));
+ }|] >>=
+ newForeignPtr
+ [C.funPtr| void deleteKernel(gpu::Kernel* ptr) { delete ptr; }|]
+
+createTensor :: ForeignPtr Context -> [CInt] -> CInt -> IO (ForeignPtr Tensor)
+createTensor context shape dtype =
+ withVector shape $ \s ->
+ withForeignPtr context $ \c ->
+ [C.throwBlock| gpu::Tensor* {
+ return new gpu::Tensor(gpu::createTensor(*$(gpu::Context* c), vector_to_shape(*$(std::vector<int64_t>* s)), (gpu::NumType)$(int dtype)));
+ }|] >>=
+ newForeignPtr
+ [C.funPtr| void deleteTensor(gpu::Tensor* ptr) { delete ptr; }|]
+
+createVector :: forall a. Storable a => Int -> IO (V.Vector a)
+createVector n = do
+ ptr <- mallocPlainForeignPtrBytes (n * sizeOf (undefined :: a))
+ return $ V.unsafeFromForeignPtr ptr 0 n
+
+instance GpuStorable CFloat where
+ toGpu context array tensor =
+ withForeignPtr context $ \c ->
+ withForeignPtr tensor $ \t ->
+ V.unsafeWith array $ \ptr ->
+ [C.throwBlock| void {
+ gpu::toGPU(*$(gpu::Context* c), $(float* ptr), *$(gpu::Tensor* t));
+ }|]
+ toCpu context tensor =
+ withForeignPtr context $ \c ->
+ withForeignPtr tensor $ \t -> do
+ (size :: CInt) <- [C.block| int {
+ size_t u = sizeof(float);
+ size_t len = $(gpu::Tensor* t)->data.size;
+ return len/u;
+ }|]
+ array <- createVector (fromIntegral size)
+ V.unsafeWith array $ \ptr ->
+ [C.throwBlock| void {
+ gpu::toCPU(*$(gpu::Context* c), *$(gpu::Tensor* t), $(float* ptr), $(int size) * sizeof(float));
+ }|]
+ return array
diff --git a/bindings/haskell/src/GpuCpp/Types.hs b/bindings/haskell/src/GpuCpp/Types.hs
new file mode 100644
index 0000000..3905aa7
--- /dev/null
+++ b/bindings/haskell/src/GpuCpp/Types.hs
@@ -0,0 +1,40 @@
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE PolyKinds #-}
+{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE QuasiQuotes #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+
+module GpuCpp.Types where
+
+import qualified Language.C.Types as C
+import qualified Language.Haskell.TH.Lib as TH
+import qualified Data.Map as Map
+import Foreign
+import qualified Data.Vector.Storable as V
+
+data Context
+data Tensor
+data Kernel
+data KernelCode
+data GpuAsync
+data StdVector a
+
+typeTable :: Map.Map C.TypeSpecifier TH.TypeQ
+typeTable = Map.fromList [
+ (C.TypeName "gpu::Context", [t|Context|])
+ , (C.TypeName "gpu::Tensor", [t|Tensor|])
+ , (C.TypeName "gpu::Kernel", [t|Kernel|])
+ , (C.TypeName "gpu::KernelCode", [t|KernelCode|])
+ , (C.TypeName "GpuAsync", [t|GpuAsync|])
+ , (C.TypeName "std::vector", [t|StdVector|])
+ ]
+
+
+class WithVector a b where
+ withVector :: [a] -> (Ptr (StdVector b) -> IO c) -> IO c
+
+class GpuStorable a where
+ toGpu :: ForeignPtr Context -> V.Vector a -> ForeignPtr Tensor -> IO ()
+ toCpu :: ForeignPtr Context -> ForeignPtr Tensor -> IO (V.Vector a)
+
diff --git a/bindings/haskell/test/Main.hs b/bindings/haskell/test/Main.hs
new file mode 100644
index 0000000..d66e5c1
--- /dev/null
+++ b/bindings/haskell/test/Main.hs
@@ -0,0 +1,49 @@
+module Main (main) where
+
+import Test.Hspec
+import GpuCpp.Types
+import GpuCpp
+import qualified Data.Vector.Storable as V
+import Foreign.C.Types
+
+gelu :: String
+gelu= "const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)\n" <>
+ "@group(0) @binding(0) var inp: array<{{precision}}>;\n" <>
+ "@group(0) @binding(1) var out: array<{{precision}}>;\n" <>
+ "@group(0) @binding(1) var dummy: array<{{precision}}>;\n" <>
+ "@compute @workgroup_size({{workgroupSize}})\n" <>
+ "fn main(\n" <>
+ " @builtin(global_invocation_id) GlobalInvocationID: vec3) {\n" <>
+ " let i: u32 = GlobalInvocationID.x;\n" <>
+ " if (i < arrayLength(&inp)) {\n" <>
+ " let x: f32 = inp[i];\n" <>
+ " out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR \n" <>
+ " * (x + .044715 * x * x * x))), x, x > 10.0);\n" <>
+ " }\n" <>
+ "}\n"
+
+main :: IO ()
+main = do
+ hspec $ do
+ describe "toCPU and toGPU" $ do
+ it "writes and reads back" $ do
+ context <- createContext
+ input <- createTensor context [12] kf32
+ toGpu context (V.fromList [1 :: CFloat,2,3,4,1,2,3,4,1,2,3,4]) input
+ output <- toCpu context input :: IO (V.Vector CFloat)
+ V.toList output `shouldBe` [1,2,3,4,1,2,3,4,1,2,3,4]
+ describe "call kernel" $ do
+ it "gelu" $ do
+ context <- createContext
+ input <- createTensor context [12] kf32
+ output <- createTensor context [12] kf32
+ kernelCode <- createKernelCode gelu 256 kf32
+ kernel <- createKernel context kernelCode [input, output] [0,0] [12,1,1]
+ toGpu context (V.fromList [1 :: CFloat,2,3,4,1,2,3,4,1,2,3,4]) input
+ async <- dispatchKernel context kernel
+ wait context async
+ vec <- toCpu context output :: IO (V.Vector CFloat)
+ V.toList (V.zipWith (\a b -> abs (a - b))
+ vec
+ (V.fromList [0.841192,1.9545977,2.9963627,3.9999297,0.841192,1.9545977,2.9963627,3.9999297,0.841192,1.9545977,2.9963627,3.9999297]))
+ `shouldSatisfy` all (< 0.001)
diff --git a/bindings/python/Makefile b/bindings/python/Makefile
new file mode 100644
index 0000000..78e0b58
--- /dev/null
+++ b/bindings/python/Makefile
@@ -0,0 +1,25 @@
+CXX=clang++
+PYTHON=python3
+GPUCPP ?= $(PWD)/../..
+LIBDIR ?= $(GPUCPP)/third_party/lib
+LIBSPEC ?= . $(GPUCPP)/source
+
+ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/null 2>&1 ; echo $$?),0)
+ STDLIB :=
+else
+ STDLIB := -stdlib=libc++
+endif
+
+FLAGS=-shared -fPIC -std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib -lwebgpu_dawn \
+ `python3 -m pybind11 --includes` \
+ `python3-config --includes --ldflags`
+
+SUFFIX=$(shell $(PYTHON)-config --extension-suffix)
+
+gpu_cpp$(SUFFIX): gpu_cpp.cpp
+ $(CXX) $(FLAGS) -o $@ $<
+
+test: test_gpu_cpp.py gpu_cpp$(SUFFIX)
+ $(PYTHON) test_gpu_cpp.py
+
+.PHONY: test
diff --git a/bindings/python/gpu_cpp.cpp b/bindings/python/gpu_cpp.cpp
new file mode 100644
index 0000000..8bd762d
--- /dev/null
+++ b/bindings/python/gpu_cpp.cpp
@@ -0,0 +1,111 @@
+#include "gpu.hpp"
+#include <future>
+#include <string>
+#include <vector>
+
+using namespace gpu;
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+namespace py = pybind11;
+
+Shape vector_to_shape(const std::vector &dims) {
+ switch(dims.size()){
+ case 1:
+ return Shape{(unsigned long)dims[0]};
+ break;
+ case 2:
+ return Shape{(unsigned long)dims[0],(unsigned long)dims[1]};
+ break;
+ case 3:
+ return Shape{(unsigned long)dims[0],(unsigned long)dims[1],(unsigned long)dims[2]};
+ break;
+ case 4:
+ return Shape{(unsigned long)dims[0],(unsigned long)dims[1],(unsigned long)dims[2],(unsigned long)dims[3]};
+ break;
+ case 5:
+ return Shape{(unsigned long)dims[0],(unsigned long)dims[1],(unsigned long)dims[2],(unsigned long)dims[3],(unsigned long)dims[4]};
+ break;
+ }
+ return Shape{0};
+}
+
+Context* py_createContext() {
+ return new Context(createContext());
+}
+
+KernelCode* py_createKernelCode(const std::string &pData, size_t workgroupSize, int precision) {
+ return new KernelCode(pData, workgroupSize, (NumType)precision);
+}
+
+Kernel py_createKernel(Context *ctx, const KernelCode *code,
+ // const Tensor *dataBindings, size_t numTensors,
+ const py::list& dataBindings_py,
+ // const size_t *viewOffsets,
+ const py::list& viewOffsets_py,
+ const std::vector &totalWorkgroups){
+ std::vector bindings;
+ for (auto item : dataBindings_py) {
+ bindings.push_back(item.cast());
+ }
+ std::vector viewOffsets;
+ for (auto item : viewOffsets_py) {
+ viewOffsets.push_back(item.cast());
+ }
+ return createKernel(*ctx, *code, bindings.data(), bindings.size(), viewOffsets.data(), vector_to_shape(totalWorkgroups));
+}
+
+Tensor* py_createTensor(Context *ctx, const std::vector &dims, int dtype) {
+ return new Tensor(createTensor(*ctx, vector_to_shape(dims), (NumType)dtype));
+}
+
+py::array_t<float> py_toCPU_float(Context *ctx, Tensor* tensor) {
+ auto result = py::array_t<float>(tensor->data.size/sizeof(float));
+ py::buffer_info buf = result.request();
+ toCPU(*ctx, *tensor, static_cast<float *>(buf.ptr), tensor->data.size);
+ return result;
+}
+
+
+void py_toGPU_float(Context *ctx, py::array_t<float> array, Tensor *tensor) {
+ py::buffer_info buf = array.request();
+ float *ptr = static_cast<float *>(buf.ptr);
+ toGPU(*ctx, ptr, *tensor);
+}
+
+struct GpuAsync {
+ std::promise<void> promise;
+ std::future<void> future;
+ GpuAsync(): future(promise.get_future()){
+ }
+};
+
+GpuAsync* py_dispatchKernel(Context *ctx, Kernel kernel) {
+ auto async = new GpuAsync();
+ dispatchKernel(*ctx, kernel, async->promise);
+ return async;
+}
+
+void py_wait(Context *ctx, GpuAsync* async) {
+ wait(*ctx, async->future);
+}
+
+PYBIND11_MODULE(gpu_cpp, m) {
+ m.doc() = "gpu.cpp plugin";
+ py::class_(m, "Context");
+ py::class_(m, "Tensor");
+ py::class_>(m, "Kernel");
+ py::class_(m, "KernelCode");
+ py::class_(m, "GpuAsync");
+ m.def("create_context", &py_createContext, py::return_value_policy::take_ownership);
+ m.def("create_tensor", &py_createTensor, py::return_value_policy::take_ownership);
+ m.def("create_kernel", &py_createKernel);
+ m.def("create_kernel_code", &py_createKernelCode, py::return_value_policy::take_ownership);
+ m.def("dispatch_kernel", &py_dispatchKernel, py::return_value_policy::take_ownership);
+ m.def("wait", &py_wait, "Wait for GPU");
+ m.def("to_cpu_float", &py_toCPU_float);
+ m.def("to_gpu_float", &py_toGPU_float);
+ m.attr("kf32") = (int)kf32;
+}
diff --git a/bindings/python/test_gpu_cpp.py b/bindings/python/test_gpu_cpp.py
new file mode 100644
index 0000000..ad50c6a
--- /dev/null
+++ b/bindings/python/test_gpu_cpp.py
@@ -0,0 +1,39 @@
+import gpu_cpp as gpu
+import numpy as np
+
+ctx = gpu.create_context()
+
+N = 12
+
+input = gpu.create_tensor(ctx, [N], gpu.kf32)
+output = gpu.create_tensor(ctx, [N], gpu.kf32)
+kernel_code = gpu.create_kernel_code(
+ """
+ const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
+ @group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
+ @group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
+ @group(0) @binding(1) var<storage, read_write> dummy: array<{{precision}}>;
+ @compute @workgroup_size({{workgroupSize}})
+ fn main(
+ @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
+ let i: u32 = GlobalInvocationID.x;
+ if (i < arrayLength(&inp)) {
+ let x: f32 = inp[i];
+ out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR
+ * (x + .044715 * x * x * x))), x, x > 10.0);
+ }
+ }
+ """,
+ 256,
+ gpu.kf32
+ )
+
+kernel = gpu.create_kernel(ctx, kernel_code, [input, output], [0,0], [12,1,1])
+
+gpu.to_gpu_float(ctx, np.array([1,2,3,4,1,2,3,4,1,2,3,4],np.float32), input)
+
+gpu_async = gpu.dispatch_kernel(ctx, kernel)
+
+gpu.wait(ctx, gpu_async)
+
+print(gpu.to_cpu_float(ctx, output))
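For reference, the values printed above can be cross-checked on the CPU with NumPy using the same tanh-based GELU approximation as the WGSL kernel in the test (a sketch, not part of this diff; the commented assertion assumes the `ctx` and `output` objects from the test script):

```python
import numpy as np

def gelu_ref(x):
    # Same tanh approximation and x > 10.0 passthrough as the WGSL kernel above
    scale = 0.7978845608028654  # sqrt(2.0 / PI)
    return np.where(x > 10.0, x, 0.5 * x * (1.0 + np.tanh(scale * (x + 0.044715 * x ** 3))))

x = np.array([1, 2, 3, 4] * 3, dtype=np.float32)
print(gelu_ref(x))
# e.g. assert np.allclose(gpu.to_cpu_float(ctx, output), gelu_ref(x), atol=1e-3)
```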
diff --git a/build.py b/build.py
new file mode 100644
index 0000000..ffb5e0d
--- /dev/null
+++ b/build.py
@@ -0,0 +1,32 @@
+# Dictionary of header files and their relative paths
+header_files = {
+ "#include \"webgpu/webgpu.h\"": "third_party/headers/webgpu/webgpu.h",
+ "#include \"numeric_types/half.hpp\"": "numeric_types/half.hpp",
+ "#include \"utils/logging.hpp\"": "utils/logging.hpp"
+}
+
+def main():
+ # File paths
+ source_file_path = "gpu.hpp"
+ output_file_path = "build/gpu.hpp"
+
+ # Open source file and read contents
+ with open(source_file_path, "r") as source:
+ file_contents = source.read()
+
+ # Iterate over the header files
+ for key, value in header_files.items():
+
+ # Replace header files
+ with open(value, "r") as header_file:
+ header_file_contents = header_file.read()
+ file_contents = file_contents.replace(key, header_file_contents)
+
+
+ # Open output file
+ with open(output_file_path, "w") as output:
+ # Write contents to output file
+ output.write(file_contents)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
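As context for the `python3 build.py` step added to the `lib` target above: the script is a plain textual inliner that replaces each mapped `#include` line in `gpu.hpp` with the contents of the referenced header, producing a single-file `build/gpu.hpp`. A minimal sketch of the same substitution on an in-memory string (hypothetical paths, not part of this diff):

```python
# Toy illustration of the include-inlining build.py performs.
header_files = {'#include "half.hpp"': "half.hpp"}  # hypothetical mapping

source = '#include "half.hpp"\nnamespace gpu {}\n'  # stand-in for gpu.hpp
for include_line, path in header_files.items():
    # build.py reads the replacement text from `path`; a stub string is used here.
    source = source.replace(include_line, "/* contents of " + path + " */")
print(source)
```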
diff --git a/examples/Makefile b/examples/Makefile
index 6420619..3036e22 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -14,7 +14,7 @@ else
endif
FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib
-LFLAGS=-ldl -ldawn
+LFLAGS=-ldl -lwebgpu_dawn
.PHONY: default all_release all_debug dawnlib run_setup check-python
.PHONY: $(addsuffix _release, $(TARGETS))
diff --git a/examples/README.md b/examples/README.md
index b73de3b..bfd513e 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -18,7 +18,6 @@ directory of the repository.
| [shadertui](shadertui) | An example of runtime live reloading of WGSL - demonstrated using a terminal shadertoy-like ascii rendering. |
| [render](render) | GPU ascii rendering of a signed distance function for two rotating 3D spheres. |
| [physics](physics) | Parallel physics simulation of a double pendulum with each thread starting at a different initial condition. |
-| [web](web) | A minimal example of how to use gpu.cpp to build a WebAssembly module that runs in the browser. Before building this example, make sure you've installed the emscripten sdk by following the [instructions here](https://emscripten.org/docs/getting_started/downloads.html) and run `source emsdk_env.sh` from the `emsdk/` directory that was created when you cloned the emscripten repository. |
## Advanced Examples
@@ -27,4 +26,3 @@ directory of the repository.
| [float16](float16) | Hello World example using the float16 WebGPU extension, instead of the default float32. |
| [matmul](matmul) | Tiled matrix multiplication. |
| [transpose](transpose) | Tiled matrix transpose. |
-| [webgpu_from_scratch](webgpu_from_scratch) | A minimal from-scratch example of how to use WebGPU directly without this library. This is useful to understand the code internals of gpu.cpp. Note this takes a while to build as it compiles the WebGPU C API implementation. |
diff --git a/examples/float16/Makefile b/examples/float16/Makefile
index 54835d9..51e895a 100644
--- a/examples/float16/Makefile
+++ b/examples/float16/Makefile
@@ -9,12 +9,12 @@ ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/nu
else
STDLIB := -stdlib=libc++
endif
-FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -ldawn
+FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -lwebgpu_dawn
run: ./build/$(TARGET) dawnlib
$(LIBSPEC) && ./build/$(TARGET)
-dawnlib: $(if $(wildcard $(GPUCPP)/third_party/lib/libdawn.so $(GPUCPP)/third_party/lib/libdawn.dylib),,run_setup)
+dawnlib: $(if $(wildcard $(GPUCPP)/third_party/lib/libwebgpu_dawn.so $(GPUCPP)/third_party/lib/libwebgpu_dawn.dylib),,run_setup)
run_setup: check-python
cd $(GPUCPP) && python3 setup.py
diff --git a/examples/gpu_puzzles/Makefile b/examples/gpu_puzzles/Makefile
index 849240c..90dfc2d 100644
--- a/examples/gpu_puzzles/Makefile
+++ b/examples/gpu_puzzles/Makefile
@@ -9,8 +9,8 @@ ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/nu
else
STDLIB := -stdlib=libc++
endif
-FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -ldawn
-FLAGS_KEY=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib key.cpp -ldl -ldawn
+FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -lwebgpu_dawn
+FLAGS_KEY=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib key.cpp -ldl -lwebgpu_dawn
run: ./build/$(TARGET)
$(LIBSPEC) && ./build/$(TARGET)
diff --git a/examples/hello_world/Makefile b/examples/hello_world/Makefile
index 085c7ea..7e64553 100644
--- a/examples/hello_world/Makefile
+++ b/examples/hello_world/Makefile
@@ -9,12 +9,12 @@ ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/nu
else
STDLIB := -stdlib=libc++
endif
-FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -ldawn
+FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -lwebgpu_dawn
run: ./build/$(TARGET) dawnlib
$(LIBSPEC) && ./build/$(TARGET)
-dawnlib: $(if $(wildcard $(GPUCPP)/third_party/lib/libdawn.so $(GPUCPP)/third_party/lib/libdawn.dylib),,run_setup)
+dawnlib: $(if $(wildcard $(GPUCPP)/third_party/lib/libwebgpu_dawn.so $(GPUCPP)/third_party/lib/libwebgpu_dawn.dylib),,run_setup)
run_setup: check-python
cd $(GPUCPP) && python3 setup.py
diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp
index 3fbafc4..7453869 100644
--- a/examples/hello_world/run.cpp
+++ b/examples/hello_world/run.cpp
@@ -3,9 +3,7 @@
#include
#include
-using namespace gpu; // createContext, createTensor, createKernel,
- // createShader, dispatchKernel, wait, toCPU
- // Tensor, Kernel, Context, Shape, kf32
+using namespace gpu;
static const char *kGelu = R"(
const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
@@ -29,6 +27,7 @@ int main(int argc, char **argv) {
printf("\nHello gpu.cpp!\n");
printf("--------------\n\n");
+ // std::unique_ptr<Context> ctx = createContext();
Context ctx = createContext();
static constexpr size_t N = 10000;
std::array<float, N> inputArr, outputArr;
@@ -41,7 +40,7 @@ int main(int argc, char **argv) {
std::future<void> future = promise.get_future();
Kernel op = createKernel(ctx, {kGelu, 256, kf32},
Bindings{input, output},
- /* nWorkgroups */ {cdiv(N, 256), 1, 1});
+ {cdiv(N, 256), 1, 1});
dispatchKernel(ctx, op, promise);
wait(ctx, future);
toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
@@ -50,5 +49,4 @@ int main(int argc, char **argv) {
}
printf(" ...\n\n");
printf("Computed %zu values of GELU(x)\n\n", N);
- return 0;
}
diff --git a/examples/matmul/Makefile b/examples/matmul/Makefile
index 78d3c0e..03cd20e 100644
--- a/examples/matmul/Makefile
+++ b/examples/matmul/Makefile
@@ -10,7 +10,7 @@ ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/nu
else
STDLIB := -stdlib=libc++
endif
-FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -ldawn
+FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -lwebgpu_dawn
run: ./build/$(TARGET)
$(LIBSPEC) && ./build/$(TARGET)
@@ -28,7 +28,7 @@ build/$(TARGET): run.cpp
build/$(TARGET)_with_metal_profiler: run.cpp
mkdir -p build && $(CXX) $(FLAGS) -o ./build/$(TARGET)_with_metal_profiler $(GPUCPP)/experimental/profiler/metal.mm -framework metal -framework Foundation -DMETAL_PROFILER -g
- install_name_tool -change @rpath/libdawn.dylib $(GPUCPP)/third_party/lib/libdawn.dylib ./build/$(TARGET)_with_metal_profiler
+ install_name_tool -change @rpath/libwebgpu_dawn.dylib $(GPUCPP)/third_party/lib/libwebgpu_dawn.dylib ./build/$(TARGET)_with_metal_profiler
watch:
@command -v entr >/dev/null 2>&1 || { echo >&2 "Please install entr with 'brew install entr' or 'sudo apt-get install entr'"; exit 1; }
diff --git a/examples/matmul/run.cpp b/examples/matmul/run.cpp
index 3db4f78..42d7009 100644
--- a/examples/matmul/run.cpp
+++ b/examples/matmul/run.cpp
@@ -792,13 +792,40 @@ void runTest(int version, size_t M, size_t K, size_t N,
}
// Allocate GPU buffers and copy data
- Context ctx = createContext(
- {}, {},
- /*device descriptor, enabling f16 in WGSL*/
- {
+ WGPUDeviceDescriptor devDescriptor = {};
+ devDescriptor.requiredFeatureCount = 1;
+ devDescriptor.requiredFeatures = std::array{WGPUFeatureName_ShaderF16}.data();
+
+ Context ctx;
+ if (numtype == kf16) {
+ ctx = createContext(
+ {}, {},
+ /*device descriptor, enabling f16 in WGSL*/
+ {
.requiredFeatureCount = 1,
- .requiredFeatures = std::array{WGPUFeatureName_ShaderF16}.data(),
- });
+ .requiredFeatures = std::array{WGPUFeatureName_ShaderF16}.data()
+ });
+ if (ctx.adapterStatus != WGPURequestAdapterStatus_Success) {
+ LOG(kDefLog, kError, "Failed to create adapter with f16 support, try running an f32 test instead (`export MATMUL_VERSION=9).");
+ exit(1);
+ }
+ if (ctx.deviceStatus != WGPURequestDeviceStatus_Success) {
+ LOG(kDefLog, kError, "Failed to create device with f16 support, try running an f32 test instead. (`export MATMUL_VERSION=9)");
+ exit(1);
+ }
+ }
+
+ if (numtype == kf32) {
+ ctx = createContext({}, {}, {});
+ if (ctx.adapterStatus != WGPURequestAdapterStatus_Success ||
+ ctx.deviceStatus != WGPURequestDeviceStatus_Success) {
+ LOG(kDefLog, kError, "Failed to create adapter or device");
+ // stop execution
+ exit(1);
+ } else {
+ LOG(kDefLog, kInfo, "Successfully created adapter and device");
+ }
+ }
Tensor input = createTensor(ctx, Shape{M, K}, numtype, inputPtr.get());
Tensor weights = createTensor(ctx, Shape{N, K}, numtype, weightsPtr.get()); // column-major
@@ -810,8 +837,6 @@ void runTest(int version, size_t M, size_t K, size_t N,
#endif
// Initialize Kernel and bind GPU buffers
-
-
// pre-allocate for async dispatch
std::array<std::promise<void>, nIter> promises;
std::array<std::future<void>, nIter> futures;
@@ -823,10 +848,6 @@ void runTest(int version, size_t M, size_t K, size_t N,
kernels[i] = selectMatmul(ctx, version, {input, weights, outputs[i]}, M, K, N, numtype);
}
-#ifndef METAL_PROFILER
- printf("[ Press enter to start tests ... ]\n");
- getchar();
-#endif
LOG(kDefLog, kInfo, "Dispatching Kernel version %d: %s, %d iterations ...",
version, versionToStr(version).c_str(), nIter);
diff --git a/examples/physics/Makefile b/examples/physics/Makefile
index 7cdd3f5..10cfb13 100644
--- a/examples/physics/Makefile
+++ b/examples/physics/Makefile
@@ -9,7 +9,7 @@ ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/nu
else
STDLIB := -stdlib=libc++
endif
-FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -ldawn
+FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -lwebgpu_dawn
run: ./build/$(TARGET)
$(LIBSPEC) && ./build/$(TARGET)
diff --git a/examples/render/Makefile b/examples/render/Makefile
index 552bbf0..d07048c 100644
--- a/examples/render/Makefile
+++ b/examples/render/Makefile
@@ -9,7 +9,7 @@ ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/nu
else
STDLIB := -stdlib=libc++
endif
-FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -ldawn
+FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -lwebgpu_dawn
run: ./build/$(TARGET)
$(LIBSPEC) && ./build/$(TARGET)
diff --git a/examples/shadertui/Makefile b/examples/shadertui/Makefile
index 82daef1..81c740b 100644
--- a/examples/shadertui/Makefile
+++ b/examples/shadertui/Makefile
@@ -10,7 +10,7 @@ ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/nu
else
STDLIB := -stdlib=libc++
endif
-FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -ldawn
+FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -lwebgpu_dawn
run: ./build/$(TARGET)
diff --git a/examples/shadertui/shader.wgsl b/examples/shadertui/shader.wgsl
index 84d3206..7d95150 100644
--- a/examples/shadertui/shader.wgsl
+++ b/examples/shadertui/shader.wgsl
@@ -2,60 +2,39 @@
@group(0) @binding(1) var<uniform> params: Params;
struct Params {
- time: f32,
- screenwidth: u32,
- screenheight: u32,
+ time: f32,
+ screenwidth: u32,
+ screenheight: u32,
};
-struct Particle {
- position: vec2,
- velocity: vec2,
- life: f32,
-}
-
-const NUM_PARTICLES: u32 = 1000u;
-const PARTICLE_LIFE: f32 = 9.0;
-const EMISSION_RATE: f32 = 300.0;
+const MAX_ITERATIONS: u32 = 100;
-fn rand(n: f32) -> f32 {
- return fract(sin(n) * 43758.5453123);
-}
-
-fn initialize_particle(id: f32, time: f32) -> Particle {
- let random1 = rand(id * 0.01 + time * 0.1);
- let random2 = rand(id * 0.02 + time * 0.1);
- let angle = random1 * 2.0 * 3.14159;
- let speed = 0.05 + random2 * 0.05;
-
- return Particle(
- vec2(0.5, 0.5),
- vec2(cos(angle) * speed, sin(angle) * speed),
- PARTICLE_LIFE
- );
+fn complex_mul(a: vec2<f32>, b: vec2<f32>) -> vec2<f32> {
+ return vec2<f32>(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
}
@compute @workgroup_size(16, 16, 1)
fn main(@builtin(global_invocation_id) globalID : vec3<u32>) {
let resolution = vec2<f32>(f32(params.screenwidth), f32(params.screenheight));
- let uv = vec2(f32(globalID.x) / resolution.x, f32(globalID.y) / resolution.y);
- let idx = globalID.y * params.screenwidth + globalID.x;
+ let uv = (vec2<f32>(globalID.xy) - 0.5 * resolution) / min(resolution.x, resolution.y);
+
+ // Animate the Julia set parameters
+ let t = params.time * 0.3;
+ let c = 0.7885 * vec2<f32>(cos(t), sin(t));
- var color: f32 = 0.0;
+ var z = uv * 3.0;
+ var i: u32 = 0u;
- for (var i: f32 = 0.0; i < f32(NUM_PARTICLES); i += 1.0) {
- let spawn_time = i / EMISSION_RATE;
- let particle_age = fract((params.time - spawn_time) / PARTICLE_LIFE) * PARTICLE_LIFE;
-
- if (particle_age < PARTICLE_LIFE) {
- var particle = initialize_particle(i, spawn_time);
- particle.position += particle.velocity * particle_age;
-
- let distance = length(uv - particle.position);
- if (distance < 0.005) {
- color += 0.5 * (1.0 - particle_age / PARTICLE_LIFE);
- }
+ for (; i < MAX_ITERATIONS; i = i + 1u) {
+ z = complex_mul(z, z) + c;
+ if (dot(z, z) > 4.0) {
+ break;
}
}
- out[idx] = min(color, 1.0);
+ let smooth_i = f32(i) + 1.0 - log2(log2(dot(z, z)));
+ let color = 0.5 + 0.5 * cos(3.0 + smooth_i * 0.15 + vec3<f32>(0.0, 0.6, 1.0));
+
+ let idx = globalID.y * params.screenwidth + globalID.x;
+ out[idx] = (color.r + color.g + color.b) / 3.0;
}
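For reference, a CPU sketch in Python (not part of this diff; `julia_intensity` is an illustrative name) of the per-pixel Julia-set iteration and smooth-coloring math that the new shader computes:

```python
import math

MAX_ITERATIONS = 100

def julia_intensity(uv_x, uv_y, time):
    # c = 0.7885 * (cos(t), sin(t)) with t = time * 0.3, as in the WGSL above
    t = time * 0.3
    cx, cy = 0.7885 * math.cos(t), 0.7885 * math.sin(t)
    zx, zy = uv_x * 3.0, uv_y * 3.0
    i = 0
    while i < MAX_ITERATIONS:
        # complex_mul(z, z) + c
        zx, zy = zx * zx - zy * zy + cx, 2.0 * zx * zy + cy
        if zx * zx + zy * zy > 4.0:
            break
        i += 1
    mag2 = max(zx * zx + zy * zy, 1e-12)
    smooth_i = i + 1.0 - math.log2(max(math.log2(mag2), 1e-12))
    # Average the three cosine color channels into one grayscale value
    return sum(0.5 + 0.5 * math.cos(3.0 + smooth_i * 0.15 + p) for p in (0.0, 0.6, 1.0)) / 3.0

print(julia_intensity(0.1, 0.2, 1.0))
```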
diff --git a/examples/transpose/Makefile b/examples/transpose/Makefile
index dca2fb6..1495c96 100644
--- a/examples/transpose/Makefile
+++ b/examples/transpose/Makefile
@@ -10,7 +10,7 @@ ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/nu
else
STDLIB := -stdlib=libc++
endif
-FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -ldawn
+FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -lwebgpu_dawn
run: ./build/$(TARGET)
$(LIBSPEC) && ./build/$(TARGET)
diff --git a/examples/webgpu_from_scratch/CMakeLists.txt b/examples/webgpu_from_scratch/CMakeLists.txt
deleted file mode 100644
index 8804628..0000000
--- a/examples/webgpu_from_scratch/CMakeLists.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-cmake_minimum_required(VERSION 3.11)
-project(wgpu_tutorial)
-
-include(FetchContent)
-
-FetchContent_Declare(
- webgpu-backend-dawn
- GIT_REPOSITORY https://github.com/eliemichel/WebGPU-distribution
- GIT_TAG dawn-6376
- GIT_SHALLOW TRUE
-)
-FetchContent_MakeAvailable(webgpu-backend-dawn)
-
-FetchContent_Declare(spdlog
- GIT_REPOSITORY https://github.com/gabime/spdlog.git
- GIT_TAG 27cb4c76708608465c413f6d0e6b8d99a4d84302
-)
-FetchContent_MakeAvailable(spdlog)
-
-add_executable(wgpu_tutorial run.cpp)
-target_link_libraries(wgpu_tutorial webgpu spdlog)
diff --git a/examples/webgpu_from_scratch/Makefile b/examples/webgpu_from_scratch/Makefile
deleted file mode 100644
index 6e0878f..0000000
--- a/examples/webgpu_from_scratch/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-run:
- mkdir -p build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Debug -DWEBGPU_BACKEND=DAWN -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON && make wgpu_tutorial && ./wgpu_tutorial
-
-watch:
- mkdir -p build && cd build && ls ../* | entr -s "cmake .. -DCMAKE_BUILD_TYPE=Debug -DWEBGPU_BACKEND=DAWN -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON && make wgpu_tutorial && ./wgpu_tutorial"
-
-clean:
- read -r -p "Are you sure? [CTRL-C to abort] " response && rm -rf build/*
diff --git a/examples/webgpu_from_scratch/run.cpp b/examples/webgpu_from_scratch/run.cpp
deleted file mode 100644
index 38f9b98..0000000
--- a/examples/webgpu_from_scratch/run.cpp
+++ /dev/null
@@ -1,446 +0,0 @@
-#include
-#include
-
-#include "webgpu/webgpu.h"
-#include "spdlog/spdlog.h"
-/*
- * Approximate GELU kernel definition, implemented as a WGSL.
- * In general GPU device code for WEBGPU is written in the WGSL domain specific
- * language.
- *
- * Here inp and out correspond to bindings 0 and 1 respectively. In the main
- * code, we create buffers for these bindings and populate them with data.
- *
- */
-const char *kShaderGELU = R"(
-const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
-@group(0) @binding(0) var<storage, read_write> inp: array<f32>;
-@group(0) @binding(1) var<storage, read_write> out: array<f32>;
-@compute @workgroup_size(256)
-fn main(
- @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
- let i: u32 = GlobalInvocationID.x;
- // Ensure we do not access out of bounds
- if (i < 3072) {
- let x: f32 = inp[i];
- let cube: f32 = 0.044715 * x * x * x;
- out[i] = 0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR * (x + cube)));
- }
-}
-)";
-
-/*
- * Convenience function to check if a condition is true, if not log an error
- * message and exit.
- *
- * @param condition: The condition to check.
- * @param message: The error message to log if the condition is false.
- * @param file: The file where the error occurred.
- * @param line: The line where the error occurred.
- */
-inline void check(bool condition, const char *message,
- const char *file = "unkown", int line = -1) {
- if (!condition) {
- spdlog::error("Error in file {} line {}:\n{}", file, line, message);
- exit(1);
- } else {
- spdlog::trace("Success in file {} line {}:\n{}", file, line, message);
- }
-}
-
-/*
- * Convenience function to display the first few elements of an array. A more
- * robust/extensive version of this is in array_utils.hpp this is minimal to keep
- * this example self-contained.
- *
- * @param a: The array to show.
- * @param name: The name of the array.
- * @return: A string representation of the array.
- */
-template <size_t N>
-std::string show(std::array<float, N> a, std::string name) {
- std::string output = "\n\n";
- if (name != "") {
- output += name + " (" + std::to_string(N) + ") : \n";
- }
- for (size_t i = 0; i < N; i++) {
- output += std::to_string(a[i]) + "\n";
- if (i > 10) {
- output += "...\n";
- break;
- }
- }
- return output;
-}
-
-int main() {
-
- static constexpr size_t N = 3072;
-
- // Host data - input and output arrays on the CPU
- std::array<float, N> inputArr;
- std::array<float, N> outputArr;
- for (size_t i = 0; i < N; i++) {
- // Populate input array with a range of dummy values
- inputArr[i] = static_cast<float>(i);
- }
-
- // API representations for interfacing with the GPU
- WGPUInstance instance; // The instance is the top-level context object for
- // WebGPU. It is used to create adapters.
- WGPUAdapter adapter; // The adapter is the physical device that WebGPU uses
- // to interface with the GPU.
- WGPUDevice device; // The device is the logical device that WebGPU uses to
- // interface with the adapter.
- WGPUQueue queue; // The queue is used to submit work to the GPU.
-
- // Buffers - buffers are used to store data on the GPU.
- WGPUBuffer inputBuffer; // The input buffer is used to store the input data.
- WGPUBuffer outputBuffer; // The output buffer is used to store the output data.
- WGPUBuffer readbackBuffer; // The readback buffer is used to copy the output
- // data from the GPU back to the CPU.
- WGPUCommandBuffer commandBuffer; // The command buffer is used to store the
- // sequence of operations to be executed on
- // the GPU.
-
- // Async management - polling the GPU is asynchronous, so we need to manage
- // the async work.
- std::promise<void> promise; // used to signal when the work is done.
- std::future<void> future; // used to wait for the work to be done.
-
- // Here we initialize the instance, adapter, device, and queue.
- spdlog::info("Setting up GPU Context");
- {
- const WGPUInstanceDescriptor desc = {};
- WGPURequestAdapterOptions adapterOpts = {};
- WGPUDeviceDescriptor devDescriptor = {};
- spdlog::info("Creating instance");
- {
- instance = wgpuCreateInstance(&desc);
- check(instance, "Initialize WebGPU", __FILE__, __LINE__);
- }
- spdlog::info("Requesting adapter");
- {
- struct AdapterData {
- WGPUAdapter adapter = nullptr;
- bool requestEnded = false;
- };
- AdapterData adapterData;
- auto onAdapterRequestEnded = [](WGPURequestAdapterStatus status,
- WGPUAdapter adapter, char const *message,
- void *pUserData) {
- AdapterData &adapterData = *reinterpret_cast(pUserData);
- check(status == WGPURequestAdapterStatus_Success,
- "Request WebGPU adapter", __FILE__, __LINE__);
- adapterData.adapter = adapter;
- adapterData.requestEnded = true;
- };
- wgpuInstanceRequestAdapter(instance, &adapterOpts, onAdapterRequestEnded,
- (void *)&adapterData);
- assert(adapterData.requestEnded);
- adapter = adapterData.adapter;
- check(adapter, "Get WebGPU adapter", __FILE__, __LINE__);
- }
- spdlog::info("Requesting device");
- {
- struct DeviceData {
- WGPUDevice device = nullptr;
- bool requestEnded = false;
- };
- DeviceData devData;
- auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status,
- WGPUDevice device, char const *message,
- void *pUserData) {
- DeviceData &devData = *reinterpret_cast(pUserData);
- check(status == WGPURequestDeviceStatus_Success,
- "Could not get WebGPU device.", __FILE__, __LINE__);
- spdlog::info("Device Request succeeded {}",
- static_cast<void *>(device));
- devData.device = device;
- devData.requestEnded = true;
- };
- devDescriptor.deviceLostCallback =
- [](WGPUDeviceLostReason reason, char const *message, void *userdata) {
- spdlog::error("Device lost:\n{}", message);
- };
- wgpuAdapterRequestDevice(adapter, &devDescriptor, onDeviceRequestEnded,
- (void *)&devData);
- assert(devData.requestEnded);
- device = devData.device;
- spdlog::info("Setting error callback");
- wgpuDeviceSetUncapturedErrorCallback(
- device,
- [](WGPUErrorType type, char const *message, void *devData) {
- spdlog::error("Device uncaptured error: {}", message);
- },
- nullptr);
- wgpuDeviceSetLoggingCallback(
- device,
- [](WGPULoggingType level, const char *message, void *userdata) {
- spdlog::info("WebGPU Validation: {}", message);
- },
- NULL);
- }
- // Queue
- spdlog::info("Instantiating device queue");
- queue = wgpuDeviceGetQueue(device);
- }
-
- // Here we setup the binding group layout. The binding group layout is used to
- // define the layout of the bind group - e.g. how many buffers are going to be
- // used and what their sizes are.
- //
- // The general pattern of using the WebGPU API is to populate a configuration
- // using a descriptor type (*Descriptor), and then pass the descriptor to a
- // factory function (*Create*) operation which returns a handle to the
- // object. Sometimes the descriptors can be hierarchical and nested, but
- // ultimately they are still just an elaborate set of configuration
- // parameters.
- //
- // For example, here we populate a WGPUBindGroupLayoutDescriptor and then
- // pass that to the wgpuDeviceCreateBindGroupLayout() function to get back a
- // WGPUBindGroupLayout.
- spdlog::info("Setting up binding group layout");
- WGPUBindGroupLayout bgLayout;
- static constexpr uint32_t bufferSize =
- static_cast<uint32_t>(sizeof(float) * N);
- spdlog::info("Buffer size: {}, number of elements {}", bufferSize, N);
- {
- WGPUBindGroupLayoutEntry bgLayoutEntries[2];
- bgLayoutEntries[0] = (WGPUBindGroupLayoutEntry){
- .binding = 0,
- .visibility = WGPUShaderStage_Compute,
- .buffer =
- (WGPUBufferBindingLayout){
- .type = WGPUBufferBindingType_Storage,
- .minBindingSize = bufferSize,
- },
- };
- bgLayoutEntries[1] = (WGPUBindGroupLayoutEntry){
- .binding = 1,
- .visibility = WGPUShaderStage_Compute,
- .buffer =
- (WGPUBufferBindingLayout){
- .type = WGPUBufferBindingType_Storage,
- .minBindingSize = bufferSize,
- },
- };
- spdlog::info("Creating Binding Group Layout Description");
- WGPUBindGroupLayoutDescriptor bgLayoutDesc = {
- .entryCount = std::size(bgLayoutEntries),
- .entries = bgLayoutEntries,
- };
- bgLayout = wgpuDeviceCreateBindGroupLayout(device, &bgLayoutDesc);
- }
-
- // After setting up the binding group layout we initialize the buffers by
- // interacting with the device.
- spdlog::info("Create buffers: input, output, and readback");
- {
- WGPUBufferDescriptor inputBufferDesc = {
- .usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst,
- .size = bufferSize,
- };
- inputBuffer = wgpuDeviceCreateBuffer(device, &inputBufferDesc);
- WGPUBufferDescriptor outputBufferDesc = {
- .usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
- WGPUBufferUsage_CopySrc,
- .size = bufferSize,
- };
- outputBuffer = wgpuDeviceCreateBuffer(device, &outputBufferDesc);
- WGPUBufferDescriptor readbackBufferDescriptor = {
- .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
- .size = bufferSize,
- };
- readbackBuffer = wgpuDeviceCreateBuffer(device, &readbackBufferDescriptor);
- check(inputBuffer, "Create input buffer", __FILE__, __LINE__);
- check(outputBuffer, "Create output buffer", __FILE__, __LINE__);
- check(readbackBuffer, "Create readback buffer", __FILE__, __LINE__);
- }
-
- // We create the bind group with references to the buffers and initialize the
- // binding group. Does this seem redundant with the binding group layout?
- // Probably.
- // The bind group is used to bind the buffers to the compute pipeline.
- // The bind group layout is used to define the layout of the bind group.
- spdlog::info("Create the bind group");
- WGPUBindGroup bindGroup;
- {
- WGPUBindGroupEntry bindGroupEntries[2];
- bindGroupEntries[0] = (WGPUBindGroupEntry){
- .binding = 0,
- .buffer = inputBuffer,
- .offset = 0,
- .size = bufferSize,
- };
- bindGroupEntries[1] = (WGPUBindGroupEntry){
- .binding = 1,
- .buffer = outputBuffer,
- .offset = 0,
- .size = bufferSize,
- };
- WGPUBindGroupDescriptor bindGroupDesc = {
- .layout = bgLayout,
- .entryCount = std::size(bindGroupEntries),
- .entries = bindGroupEntries,
- };
- bindGroup = wgpuDeviceCreateBindGroup(device, &bindGroupDesc);
- }
-
- // We create the compute pipeline with the shader module and pipeline layout.
- // The compute pipeline is used to run the compute shader.
- spdlog::info("Creating the compute pipeline");
- WGPUComputePipeline computePipeline;
- {
- WGPUPipelineLayout pipelineLayout;
- WGPUPipelineLayoutDescriptor pipelineLayoutDesc = {
- .bindGroupLayoutCount = 1,
- .bindGroupLayouts = &bgLayout,
- };
- pipelineLayout =
- wgpuDeviceCreatePipelineLayout(device, &pipelineLayoutDesc);
- WGPUShaderModuleWGSLDescriptor wgslDesc = {
- .code = kShaderGELU,
- };
- wgslDesc.chain.sType = WGPUSType_ShaderModuleWGSLDescriptor;
- WGPUShaderModuleDescriptor shaderModuleDesc = {};
- shaderModuleDesc.nextInChain = &wgslDesc.chain;
- shaderModuleDesc.label = "shader";
- WGPUComputePipelineDescriptor computePipelineDesc = {};
- computePipelineDesc.layout = pipelineLayout;
- computePipelineDesc.compute.module =
- wgpuDeviceCreateShaderModule(device, &shaderModuleDesc);
- computePipelineDesc.compute.entryPoint = "main";
- computePipeline =
- wgpuDeviceCreateComputePipeline(device, &computePipelineDesc);
- check(computePipeline, "Create compute pipeline", __FILE__, __LINE__);
- }
-
- // We create the command encoder and the compute pass encoder. The command
- // encoder is used to encode commands for the GPU. The compute pass encoder is
- // used to encode commands for the compute pipeline.
- spdlog::info("Create the command encoder");
- {
- static constexpr uint32_t kWorkgroupSize = 256; // This needs to match the
- // workgroup size in the
- // shader.
- WGPUCommandEncoder commandEncoder;
- WGPUComputePassEncoder computePassEncoder;
- commandEncoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
- computePassEncoder =
- wgpuCommandEncoderBeginComputePass(commandEncoder, nullptr);
- wgpuComputePassEncoderSetPipeline(computePassEncoder, computePipeline);
- wgpuComputePassEncoderSetBindGroup(computePassEncoder, 0, bindGroup, 0,
- nullptr);
- wgpuComputePassEncoderDispatchWorkgroups(
- computePassEncoder, (N + (kWorkgroupSize - 1)) / kWorkgroupSize, 1, 1);
- wgpuComputePassEncoderEnd(computePassEncoder);
- wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, outputBuffer, 0,
- readbackBuffer, 0, bufferSize);
- commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
- check(commandBuffer, "Create command buffer", __FILE__, __LINE__);
- }
- spdlog::info("Initializing promise and future");
- promise = std::promise<void>();
- future = promise.get_future();
-
- spdlog::info("Copying input data to GPU");
- wgpuQueueWriteBuffer(queue, inputBuffer, 0, inputArr.data(), bufferSize);
-
- // Submit the command buffer and launch the kernel. The command buffer is
- // submitted to the queue and a callback is set up to handle the completion of
- // the job which updates the promise. A while loop is used to wait for the
- // promise to be set.
- spdlog::info("Submit the command buffer and launching the kernel");
- struct CallbackData {
- WGPUBuffer buffer;
- size_t bufferSize;
- float *output;
- std::promise<void> *promise;
- };
- {
-
- // Submit the command buffer
- wgpuQueueSubmit(queue, 1, &commandBuffer);
- CallbackData callbackData =
- CallbackData{readbackBuffer, sizeof(outputArr), nullptr, &promise};
- // Set up the callback for when the work is done
- wgpuQueueOnSubmittedWorkDone(
- queue,
- [](WGPUQueueWorkDoneStatus status, void *callbackData) {
- spdlog::info("QueueOnSubmittedWorkDone status: {}",
- WGPUQueueWorkDoneStatus_Success == status);
- check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
- __FILE__, __LINE__);
- const auto *data = static_cast<const CallbackData *>(callbackData);
- data->promise->set_value();
- },
- &callbackData);
- // Wait for the promise to be set
- while (future.wait_for(std::chrono::seconds(0)) !=
- std::future_status::ready) {
- wgpuInstanceProcessEvents(instance);
- }
- }
-
- // Copy the output data back to the CPU. This requires its own command encoder
- // and command buffer. As with the computation a job is asynchronously
- // submitted to the queue and a callback is set up to handle the completion
- // of the job which updates the promise.
- //
- // The execution blocks on the future until the promise is set, after which
- // the result of the computation is copied to the outputArr array and is
- // printed.
- spdlog::info("Copying output to the CPU");
- {
- // reset the promise and future
- promise = std::promise<void>();
- future = promise.get_future();
- spdlog::info("Setting up command encoder and command buffer for copying "
- "output to the CPU");
- {
- WGPUCommandEncoder commandEncoder;
- WGPUComputePassEncoder computePassEncoder;
- commandEncoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
- wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, outputBuffer, 0,
- readbackBuffer, 0, bufferSize);
- commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
- check(commandBuffer, "Create command buffer", __FILE__, __LINE__);
- }
- wgpuQueueSubmit(queue, 1, &commandBuffer);
- CallbackData callbackData = {readbackBuffer, bufferSize, outputArr.data(),
- &promise};
- wgpuQueueOnSubmittedWorkDone(
- queue,
- [](WGPUQueueWorkDoneStatus status, void *callbackData) {
- spdlog::info("QueueOnSubmittedWorkDone status: {}",
- WGPUQueueWorkDoneStatus_Success == status);
- check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
- __FILE__, __LINE__);
- const auto *data = static_cast<const CallbackData *>(callbackData);
- wgpuBufferMapAsync(
- data->buffer, WGPUMapMode_Read, 0, bufferSize,
- [](WGPUBufferMapAsyncStatus status, void *captureData) {
- const auto *data = static_cast<const CallbackData *>(captureData);
- check(status == WGPUBufferMapAsyncStatus_Success,
- "Map readbackBuffer", __FILE__, __LINE__);
- const void *mappedData = wgpuBufferGetConstMappedRange(
- data->buffer, /*offset=*/0, data->bufferSize);
- check(mappedData, "Get mapped range", __FILE__, __LINE__);
- memcpy(data->output, mappedData, data->bufferSize);
- wgpuBufferUnmap(data->buffer);
- data->promise->set_value();
- },
- callbackData);
- },
- &callbackData);
- while (future.wait_for(std::chrono::seconds(0)) !=
- std::future_status::ready) {
- wgpuInstanceProcessEvents(instance);
- }
- }
-
- spdlog::info("{}", show(inputArr, "GELU Input"));
- spdlog::info("{}", show(outputArr, "GELU Output"));
- spdlog::info("Done with GELU kernel");
-}
diff --git a/experimental/kernels/Makefile b/experimental/kernels/Makefile
index c233ef5..e2d89b1 100644
--- a/experimental/kernels/Makefile
+++ b/experimental/kernels/Makefile
@@ -16,7 +16,7 @@ CXXFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -I. -Iunittest_l
CFLAGS=-Ofast -march=native -I. -Iunittest_llmc
# CFLAGS=-O2 -march=native -I. -Iunittest_llmc
-LDFLAGS=$(STDLIB) -L$(GPUCPP)/third_party/lib -ldl -ldawn
+LDFLAGS=$(STDLIB) -L$(GPUCPP)/third_party/lib -ldl -lwebgpu_dawn -fsanitize=address
FLAGS=$(CXXFLAGS) $(LDFLAGS)
ifeq ($(shell [ -d /opt/homebrew/opt/libomp/lib ] && echo "exists"), exists)
@@ -29,6 +29,10 @@ endif
default: run-native
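+# Build and run the standalone reduction-kernel experiment (reduce.cpp).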
+build/reduce: reduce.cpp kernels.h
+ $(CC) $(CFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $<
+ $(LIBSPEC) && build/reduce
+
run_llm.c: ./build/test_gpt2 dawnlib
$(LIBSPEC) && $<
@@ -85,7 +89,7 @@ build/test_gpt2_with_metal_profiler: llm.c build/unittest_kernels.o gpt2_124M.bi
mkdir -p build
$(call preprocess_file)
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ llm.c/test_gpt2.c build/unittest_kernels.o -I$(GPUCPP) $(GPUCPP)/experimental/profiler/metal.mm -framework metal -framework Foundation -DMETAL_PROFILER -g
- install_name_tool -change @rpath/libdawn.dylib $(GPUCPP)/third_party/lib/libdawn.dylib $@
+ install_name_tool -change @rpath/libwebgpu_dawn.dylib $(GPUCPP)/third_party/lib/libwebgpu_dawn.dylib $@
build/train_gpt2: llm.c build/unittest_kernels.o gpt2_124M.bin
mkdir -p build
@@ -95,10 +99,14 @@ build/train_gpt2: llm.c build/unittest_kernels.o gpt2_124M.bin
build/ops.o: ops.cpp ops.hpp kernels.h llm.c
mkdir -p build && $(CXX) $(CXXFLAGS) -c -o $@ $<
-build/gpt2_webgpu: llm.c gpt2_124M.bin llm.c
+build/gpt2_webgpu: llm.c gpt2_124M.bin gpt2_webgpu.cpp ops.cpp
mkdir -p build
$(CC) $(CXXFLAGS) -Illm.c $(LDFLAGS) -o $@ gpt2_webgpu.cpp ops.cpp
+build/gpt2_webgpu_aot: llm.c gpt2_124M.bin gpt2_webgpu_aot.cpp ops_aot.cpp
+ mkdir -p build
+ $(CC) $(CXXFLAGS) -Illm.c $(LDFLAGS) -o $@ gpt2_webgpu_aot.cpp ops_aot.cpp
+
build/gpt2_webgpu.html: check-emsdk gpt2_webgpu.cpp term.html llm.c
em++ gpt2_webgpu.cpp ops.cpp \
--preload-file gpt2_tokenizer.bin@/gpt2_tokenizer.bin \
@@ -116,8 +124,8 @@ watch-web:
watch-native:
ls *.cpp *.c *.hpp *.h | entr -s "rm -f build/gpt2_webgpu && rm -f build/ops.o && make build/gpt2_webgpu"
-run-native: build/gpt2_webgpu
- . $(GPUCPP)/source && ./build/gpt2_webgpu
+run-native: build/gpt2_webgpu_aot
+ . $(GPUCPP)/source && ./build/gpt2_webgpu_aot
# server: build/train_gpt2.html build/test_gpt2.html build/gpt2_gpucpp.html
server: build/gpt2_webgpu.html
@@ -131,7 +139,7 @@ server: build/gpt2_webgpu.html
build/unittest_kernels.o: unittest_llmc/unittest_kernels.cpp unittest_llmc/unittest_kernels.h kernels.h
mkdir -p build && $(CXX) $(CXXFLAGS) -DNDEBUG -c -o $@ $<
-dawnlib: $(if $(wildcard $(GPUCPP)/third_party/lib/libdawn.so $(GPUCPP)/third_party/lib/libdawn.dylib),,run_setup)
+dawnlib: $(if $(wildcard $(GPUCPP)/third_party/lib/libwebgpu_dawn.so $(GPUCPP)/third_party/lib/libwebgpu_dawn.dylib),,run_setup)
run_setup: check-python
cd $(GPUCPP) && python3 setup.py
diff --git a/experimental/kernels/gpt2_webgpu_aot.cpp b/experimental/kernels/gpt2_webgpu_aot.cpp
new file mode 100644
index 0000000..966fb7a
--- /dev/null
+++ b/experimental/kernels/gpt2_webgpu_aot.cpp
@@ -0,0 +1,1109 @@
+#include "gpu.hpp"
+#include "ops_aot.hpp"
+/*
+This file trains the GPT-2 model.
+This version is the clean, minimal, reference. As such:
+- it runs on CPU.
+- it does not make the code too complex; it is readable.
+- it does not use any processor-specific instructions, intrinsics and such.
+- it _does_ use a few OpenMP pragmas because this is a large speedup at very low cost
+There will be other versions of this code that specialize it and make it fast.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <stdint.h>
+#include <assert.h>
+#include <math.h>
+#include <time.h>
+#include <string.h>
+#include <unistd.h>
+#include <vector>
+#ifdef OMP
+#include <omp.h>
+#endif
+// our own utilities
+// defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck, mallocCheck
+#include "llmc/utils.h"
+// defines: tokenizer_init, tokenizer_decode, tokenizer_free
+#include "llmc/tokenizer.h"
+// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free
+#include "llmc/dataloader.h"
+
+using namespace gpu;
+
+// ----------------------------------------------------------------------------
+// GPT-2 model definition
+
+typedef struct {
+ int max_seq_len; // max sequence length, e.g. 1024
+ int vocab_size; // vocab size, e.g. 50257
+ int padded_vocab_size; // padded to e.g. %128==0, 50304
+ int num_layers; // number of layers, e.g. 12
+ int num_heads; // number of heads in attention, e.g. 12
+ int channels; // number of channels, e.g. 768
+} GPT2Config;
+
+// the parameters of the model
+#define NUM_PARAMETER_TENSORS 16
+typedef struct {
+ Tensor wte; // (V, C)
+ Tensor wpe; // (maxT, C)
+ std::vector<Tensor> ln1w; // (L, C)
+ std::vector<Tensor> ln1b; // (L, C)
+ std::vector<Tensor> qkvw; // (L, 3*C, C)
+ std::vector<Tensor> qkvb; // (L, 3*C)
+ std::vector<Tensor> attprojw; // (L, C, C)
+ std::vector<Tensor> attprojb; // (L, C)
+ std::vector<Tensor> ln2w; // (L, C)
+ std::vector<Tensor> ln2b; // (L, C)
+ std::vector<Tensor> fcw; // (L, 4*C, C)
+ std::vector<Tensor> fcb; // (L, 4*C)
+ std::vector<Tensor> fcprojw; // (L, C, 4*C)
+ std::vector<Tensor> fcprojb; // (L, C)
+ Tensor lnfw; // (C)
+ Tensor lnfb; // (C)
+} ParameterTensors;
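+
+// Note on layout: unlike the CPU reference, which views all parameters as one
+// contiguous float* block, the per-layer weights here are separate GPU tensors.
+// Each std::vector<Tensor> has length L (num_layers), and entry l holds
+// param_sizes[i] / L elements for layer l.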
+
+void fill_in_parameter_sizes(size_t* param_sizes, GPT2Config config) {
+ size_t Vp = config.padded_vocab_size;
+ size_t C = config.channels;
+ size_t maxT = config.max_seq_len;
+ size_t L = config.num_layers;
+ param_sizes[0] = Vp * C; // wte
+ param_sizes[1] = maxT * C; // wpe
+ param_sizes[2] = L * C; // ln1w
+ param_sizes[3] = L * C; // ln1b
+ param_sizes[4] = L * (3 * C) * C; // qkvw
+ param_sizes[5] = L * (3 * C); // qkvb
+ param_sizes[6] = L * C * C; // attprojw
+ param_sizes[7] = L * C; // attprojb
+ param_sizes[8] = L * C; // ln2w
+ param_sizes[9] = L * C; // ln2b
+ param_sizes[10] = L * (4 * C) * C; // fcw
+ param_sizes[11] = L * (4 * C); // fcb
+ param_sizes[12] = L * C * (4 * C); // fcprojw
+ param_sizes[13] = L * C; // fcprojb
+ param_sizes[14] = C; // lnfw
+ param_sizes[15] = C; // lnfb
+}
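+
+// Example, assuming the GPT-2 124M checkpoint (L=12, C=768, maxT=1024, Vp=50304):
+// wte alone is Vp*C = 50304*768 = 38,633,472 floats, wpe is maxT*C = 786,432,
+// and summing all 16 entries gives the ~124M total printed at checkpoint load.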
+
+// allocate memory for the parameters and point the individual tensors to the right places
+void malloc_and_point_parameters(Context& ctx, GPT2Config config, ParameterTensors* params, size_t* param_sizes) {
+ size_t L = config.num_layers;
+ params->wte = createTensor(ctx, Shape{param_sizes[0]}, kf32);
+ params->wpe = createTensor(ctx, Shape{param_sizes[1]}, kf32);
+
+ params->ln1w.resize(L);
+ params->ln1b.resize(L);
+ params->qkvw.resize(L);
+ params->qkvb.resize(L);
+ params->attprojw.resize(L);
+ params->attprojb.resize(L);
+ params->ln2w.resize(L);
+ params->ln2b.resize(L);
+ params->fcw.resize(L);
+ params->fcb.resize(L);
+ params->fcprojw.resize(L);
+ params->fcprojb.resize(L);
+ for(int l = 0; l < L ; l++) {
+ params->ln1w[l] = createTensor(ctx, Shape{param_sizes[2]/config.num_layers}, kf32);
+ params->ln1b[l] = createTensor(ctx, Shape{param_sizes[3]/config.num_layers}, kf32);
+ params->qkvw[l] = createTensor(ctx, Shape{param_sizes[4]/config.num_layers}, kf32);
+ params->qkvb[l] = createTensor(ctx, Shape{param_sizes[5]/config.num_layers}, kf32);
+ params->attprojw[l] = createTensor(ctx, Shape{param_sizes[6]/config.num_layers}, kf32);
+ params->attprojb[l] = createTensor(ctx, Shape{param_sizes[7]/config.num_layers}, kf32);
+ params->ln2w[l] = createTensor(ctx, Shape{param_sizes[8]/config.num_layers}, kf32);
+ params->ln2b[l] = createTensor(ctx, Shape{param_sizes[9]/config.num_layers}, kf32);
+ params->fcw[l] = createTensor(ctx, Shape{param_sizes[10]/config.num_layers}, kf32);
+ params->fcb[l] = createTensor(ctx, Shape{param_sizes[11]/config.num_layers}, kf32);
+ params->fcprojw[l] = createTensor(ctx, Shape{param_sizes[12]/config.num_layers}, kf32);
+ params->fcprojb[l] = createTensor(ctx, Shape{param_sizes[13]/config.num_layers}, kf32);
+ }
+ params->lnfw = createTensor(ctx, Shape{param_sizes[14]}, kf32);
+ params->lnfb = createTensor(ctx, Shape{param_sizes[15]}, kf32);
+}
+
+
+#define NUM_ACTIVATION_TENSORS 23
+typedef struct {
+ Tensor encoded; // (B, T, C)
+ std::vector<Tensor> ln1; // (L, B, T, C)
+ std::vector<Tensor> ln1_mean; // (L, B, T)
+ std::vector<Tensor> ln1_rstd; // (L, B, T)
+ std::vector<Tensor> qkv; // (L, B, T, 3*C)
+ std::vector<Tensor> atty; // (L, B, T, C)
+ std::vector<Tensor> preatt; // (L, B, NH, T, T)
+ std::vector<Tensor> att; // (L, B, NH, T, T)
+ std::vector<Tensor> attproj; // (L, B, T, C)
+ std::vector<Tensor> residual2; // (L, B, T, C)
+ std::vector<Tensor> ln2; // (L, B, T, C)
+ std::vector<Tensor> ln2_mean; // (L, B, T)
+ std::vector<Tensor> ln2_rstd; // (L, B, T)
+ std::vector<Tensor> fch; // (L, B, T, 4*C)
+ std::vector<Tensor> fch_gelu; // (L, B, T, 4*C)
+ std::vector<Tensor> fcproj; // (L, B, T, C)
+ std::vector<Tensor> residual3; // (L, B, T, C)
+ Tensor lnf; // (B, T, C)
+ Tensor lnf_mean; // (B, T)
+ Tensor lnf_rstd; // (B, T)
+ Tensor logits; // (B, T, V)
+ Tensor probs; // (B, T, V)
+ Tensor losses; // (B, T)
+} ActivationTensors;
+
+typedef struct {
+ Kernel encoder_forward;
+ std::vector<Kernel> layernorm_forward;
+ std::vector<Kernel> qkv_projection_forward;
+ std::vector<Kernel> attention_forward;
+ std::vector<Kernel> attention_projection_forward;
+ std::vector<Kernel> residual_forward;
+ std::vector<Kernel> ff_up_forward;
+ std::vector<Kernel> gelu_forward;
+ std::vector<Kernel> ff_down_forward;
+ std::vector<Kernel> residual2_forward;
+ Kernel layernorm_final_forward;
+ Kernel matmul_final_forward;
+ Kernel softmax_final_forward;
+ Kernel crossentropy_forward;
+
+ Kernel crossentropy_softmax_backward;
+ Kernel matmul_final_backward;
+ Kernel layernorm_final_backward;
+ std::vector<Kernel> residual2_backward;
+ std::vector<Kernel> ff_down_backward;
+ std::vector<Kernel> gelu_backward;
+ std::vector<Kernel> ff_up_backward;
+ std::vector<Kernel> layernorm2_backward;
+ std::vector<Kernel> attention_projection_backward;
+ std::vector<Kernel> attention_backward;
+ std::vector<Kernel> qkv_projection_backward;
+ std::vector<Kernel> layernorm1_backward;
+ Kernel encoder_backward;
+} Kernels;
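+
+// Ahead-of-time kernel plan: each op gets its own Kernel handle (one per layer
+// for the vectors above). Kernels are created once, lazily, on the first
+// gpt2_forward call against fixed tensor bindings, then re-dispatched every
+// step; backward kernels are only built when backward_enabled is set.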
+
+void fill_in_activation_sizes(size_t* act_sizes, GPT2Config config, int B, int T) {
+ size_t C = config.channels;
+ size_t NH = config.num_heads;
+ size_t L = config.num_layers;
+ size_t Vp = config.padded_vocab_size;
+ act_sizes[0] = B * T * C; // encoded
+ act_sizes[1] = L * B * T * C; // ln1
+ act_sizes[2] = L * B * T; // ln1_mean
+ act_sizes[3] = L * B * T; // ln1_rstd
+ act_sizes[4] = L * B * T * 3 * C; // qkv
+ act_sizes[5] = L * B * T * C; // atty
+ act_sizes[6] = L * B * NH * T * T; // preatt
+ act_sizes[7] = L * B * NH * T * T; // att
+ act_sizes[8] = L * B * T * C; // attproj
+ act_sizes[9] = L * B * T * C; // residual2
+ act_sizes[10] = L * B * T * C; // ln2
+ act_sizes[11] = L * B * T; // ln2_mean
+ act_sizes[12] = L * B * T; // ln2_rstd
+ act_sizes[13] = L * B * T * 4 * C; // fch
+ act_sizes[14] = L * B * T * 4 * C; // fch_gelu
+ act_sizes[15] = L * B * T * C; // fcproj
+ act_sizes[16] = L * B * T * C; // residual3
+ act_sizes[17] = B * T * C; // lnf
+ act_sizes[18] = B * T; // lnf_mean
+ act_sizes[19] = B * T; // lnf_rstd
+ act_sizes[20] = B * T * Vp; // logits
+ act_sizes[21] = B * T * Vp; // probs
+ act_sizes[22] = B * T; // losses
+}
+
+void malloc_and_point_activations(Context& ctx, GPT2Config config, ActivationTensors* acts, size_t* act_sizes) {
+ size_t L = config.num_layers;
+ acts->encoded = createTensor(ctx, Shape{act_sizes[0]}, kf32);
+ acts->ln1.resize(L);
+ acts->ln1_mean.resize(L);
+ acts->ln1_rstd.resize(L);
+ acts->qkv.resize(L);
+ acts->atty.resize(L);
+ acts->preatt.resize(L);
+ acts->att.resize(L);
+ acts->attproj.resize(L);
+ acts->residual2.resize(L);
+ acts->ln2.resize(L);
+ acts->ln2_mean.resize(L);
+ acts->ln2_rstd.resize(L);
+ acts->fch.resize(L);
+ acts->fch_gelu.resize(L);
+ acts->fcproj.resize(L);
+ acts->residual3.resize(L);
+ for (int l = 0; l < L; l++) {
+ acts->ln1[l] = createTensor(ctx, Shape{act_sizes[1]/config.num_layers}, kf32);
+ acts->ln1_mean[l] = createTensor(ctx, Shape{act_sizes[2]/config.num_layers}, kf32);
+ acts->ln1_rstd[l] = createTensor(ctx, Shape{act_sizes[3]/config.num_layers}, kf32);
+ acts->qkv[l] = createTensor(ctx, Shape{act_sizes[4]/config.num_layers}, kf32);
+ acts->atty[l] = createTensor(ctx, Shape{act_sizes[5]/config.num_layers}, kf32);
+ acts->preatt[l] = createTensor(ctx, Shape{act_sizes[6]/config.num_layers}, kf32);
+ acts->att[l] = createTensor(ctx, Shape{act_sizes[7]/config.num_layers}, kf32);
+ acts->attproj[l] = createTensor(ctx, Shape{act_sizes[8]/config.num_layers}, kf32);
+ acts->residual2[l] = createTensor(ctx, Shape{act_sizes[9]/config.num_layers}, kf32);
+ acts->ln2[l] = createTensor(ctx, Shape{act_sizes[10]/config.num_layers}, kf32);
+ acts->ln2_mean[l] = createTensor(ctx, Shape{act_sizes[11]/config.num_layers}, kf32);
+ acts->ln2_rstd[l] = createTensor(ctx, Shape{act_sizes[12]/config.num_layers}, kf32);
+ acts->fch[l] = createTensor(ctx, Shape{act_sizes[13]/config.num_layers}, kf32);
+ acts->fch_gelu[l] = createTensor(ctx, Shape{act_sizes[14]/config.num_layers}, kf32);
+ acts->fcproj[l] = createTensor(ctx, Shape{act_sizes[15]/config.num_layers}, kf32);
+ acts->residual3[l] = createTensor(ctx, Shape{act_sizes[16]/config.num_layers}, kf32);
+ }
+ acts->lnf = createTensor(ctx, Shape{act_sizes[17]}, kf32);
+ acts->lnf_mean = createTensor(ctx, Shape{act_sizes[18]}, kf32);
+ acts->lnf_rstd = createTensor(ctx, Shape{act_sizes[19]}, kf32);
+ acts->logits = createTensor(ctx, Shape{act_sizes[20]}, kf32);
+ acts->probs = createTensor(ctx, Shape{act_sizes[21]}, kf32);
+ acts->losses = createTensor(ctx, Shape{act_sizes[22]}, kf32);
+}
+
+void gpu_alloc(Context& ctx, Tensor* tensors, size_t* sizes, size_t n) {
+ for (size_t i = 0; i < n; i++) {
+ tensors[i] = createTensor(ctx, Shape{sizes[i]}, kf32);
+ }
+}
+
+typedef struct {
+ GPT2Config config;
+ // the weights (parameters) of the model, and their sizes
+ ParameterTensors params;
+ size_t param_sizes[NUM_PARAMETER_TENSORS];
+ float* params_memory;
+ size_t num_parameters;
+ // gradients of the weights
+ ParameterTensors grads;
+ float* grads_memory;
+ // buffers for the AdamW optimizer
+ float* m_memory;
+ float* v_memory;
+ // the activations of the model, and their sizes
+ ActivationTensors acts;
+ size_t act_sizes[NUM_ACTIVATION_TENSORS];
+ float* acts_memory;
+ size_t num_activations;
+ // gradients of the activations
+ ActivationTensors grads_acts;
+ float* grads_acts_memory;
+ // other run state configuration
+ int batch_size; // the batch size (B) of current forward pass
+ int seq_len; // the sequence length (T) of current forward pass
+ Tensor inputs; // the input tokens for the current forward pass
+ Tensor targets; // the target tokens for the current forward pass
+ float mean_loss; // after a forward pass with targets, will be populated with the mean loss
+ float* mean_loss_buffer;
+ float* probs_buffer;
+
+ Tensor nullTensor;
+
+ // kernels
+ Kernels kernels;
+ bool backward_enabled;
+} GPT2;
+
+void gpt2_build_from_checkpoint(Context& ctx, GPT2 *model, const char* checkpoint_path) {
+ printf("Building GPT-2 model from checkpoint '%s'\n", checkpoint_path);
+ // read in model from a checkpoint file
+ FILE *model_file = fopenCheck(checkpoint_path, "rb");
+ int model_header[256];
+ freadCheck(model_header, sizeof(int), 256, model_file);
+ if (model_header[0] != 20240326) { printf("Bad magic model file\n"); exit(1); }
+ if (model_header[1] != 3) {
+ printf("Bad version in model file\n");
+ printf("---> HINT: try to re-run `python train_gpt2.py`\n");
+ exit(1);
+ }
+
+ // read in hyperparameters
+ size_t maxT, V, Vp, L, NH, C; // size_t to prevent int overflow
+ model->config.max_seq_len = maxT = model_header[2];
+ model->config.vocab_size = V = model_header[3];
+#ifdef __EMSCRIPTEN__
+ model->config.num_layers = L = 12; // TODO(avh): Debugging only hack - revert this
+#else
+ model->config.num_layers = L = model_header[4];
+#endif
+ model->config.num_heads = NH = model_header[5];
+ model->config.channels = C = model_header[6];
+ model->config.padded_vocab_size = Vp = model_header[7];
+ printf("[GPT-2]\n");
+ printf("max_seq_len: %zu\n", maxT);
+ printf("vocab_size: %zu\n", V);
+ printf("padded_vocab_size: %zu\n", Vp);
+ printf("num_layers: %zu\n", L);
+ printf("num_heads: %zu\n", NH);
+ printf("channels: %zu\n", C);
+
+ // allocate space for all the parameters and read them in
+ fill_in_parameter_sizes(model->param_sizes, model->config);
+ // count the number of parameters
+ size_t num_parameters = 0;
+ for (size_t i = 0; i < NUM_PARAMETER_TENSORS; i++) {
+ num_parameters += model->param_sizes[i];
+ }
+ printf("num_parameters: %zu\n", num_parameters);
+ model->num_parameters = num_parameters;
+
+ // read in all the parameters from file
+ malloc_and_point_parameters(ctx, model->config, &model->params, model->param_sizes);
+ model->params_memory = (float*)mallocCheck(num_parameters * sizeof(float));
+ freadCheck(model->params_memory, sizeof(float), num_parameters, model_file);
+ fcloseCheck(model_file);
+
+ // transfer to GPU memory
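+ // The flat params_memory staging buffer is walked with `iter`, uploading each
+ // tensor's slice via toGPU in the same order used by fill_in_parameter_sizes.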
+ float* iter = model->params_memory;
+ toGPU(ctx, iter, model->params.wte);
+ iter += model->param_sizes[0];
+ toGPU(ctx, iter, model->params.wpe);
+ iter += model->param_sizes[1];
+ for (int l = 0; l < L; l++) {
+ toGPU(ctx, iter, model->params.ln1w[l]);
+ iter += model->param_sizes[2]/L;
+ toGPU(ctx, iter, model->params.ln1b[l]);
+ iter += model->param_sizes[3]/L;
+ toGPU(ctx, iter, model->params.qkvw[l]);
+ iter += model->param_sizes[4]/L;
+ toGPU(ctx, iter, model->params.qkvb[l]);
+ iter += model->param_sizes[5]/L;
+ toGPU(ctx, iter, model->params.attprojw[l]);
+ iter += model->param_sizes[6]/L;
+ toGPU(ctx, iter, model->params.attprojb[l]);
+ iter += model->param_sizes[7]/L;
+ toGPU(ctx, iter, model->params.ln2w[l]);
+ iter += model->param_sizes[8]/L;
+ toGPU(ctx, iter, model->params.ln2b[l]);
+ iter += model->param_sizes[9]/L;
+ toGPU(ctx, iter, model->params.fcw[l]);
+ iter += model->param_sizes[10]/L;
+ toGPU(ctx, iter, model->params.fcb[l]);
+ iter += model->param_sizes[11]/L;
+ toGPU(ctx, iter, model->params.fcprojw[l]);
+ iter += model->param_sizes[12]/L;
+ toGPU(ctx, iter, model->params.fcprojb[l]);
+ iter += model->param_sizes[13]/L;
+ }
+ toGPU(ctx, iter, model->params.lnfw);
+ iter += model->param_sizes[14];
+ toGPU(ctx, iter, model->params.lnfb);
+ iter += model->param_sizes[15];
+
+
+ // other inits
+ model->acts_memory = NULL;
+ model->grads_memory = NULL;
+ model->m_memory = NULL;
+ model->v_memory = NULL;
+ model->grads_acts_memory = NULL;
+ model->batch_size = 0;
+ model->seq_len = 0;
+ model->mean_loss = -1.0f; // -1.0f will designate no loss
+ model->mean_loss_buffer = NULL;
+ model->probs_buffer = NULL;
+ model->backward_enabled = false;
+
+ printf("Model build complete\n");
+
+}
+
+
+void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, size_t B, size_t T) {
+ // targets are optional; a Shape{1} placeholder tensor (model->nullTensor) signals "no targets"
+
+ // ensure the model was initialized or error out
+ if (model->params_memory == NULL) {
+ printf("Error: model was not initialized properly.\n");
+ exit(1);
+ }
+
+ // convenience parameters (size_t to help prevent int overflow)
+ size_t V = model->config.vocab_size;
+ size_t Vp = model->config.padded_vocab_size;
+ size_t L = model->config.num_layers;
+ size_t NH = model->config.num_heads;
+ size_t C = model->config.channels;
+
+ // // validate inputs, all indices must be in the range [0, V)
+ // for(int i = 0; i < B * T; i++) {
+ // assert(0 <= inputs[i] && inputs[i] < V);
+ // if (targets != NULL) {
+ // assert(0 <= targets[i] && targets[i] < V);
+ // }
+ // }
+
+ // allocate space for all the activations if needed (done here, lazily)
+ if(model->acts_memory == NULL) {
+ // record the current B,T as well
+ model->batch_size = B;
+ model->seq_len = T;
+ // and now allocate the space
+ fill_in_activation_sizes(model->act_sizes, model->config, B, T);
+ model->mean_loss_buffer = (float*)mallocCheck(sizeof(float) * model->batch_size * model->seq_len);
+ model->probs_buffer = (float*)mallocCheck(sizeof(float) * model->batch_size * model->seq_len * Vp);
+
+ // TODO(avh): this is just a resource test for now, eventually deprecate CPU allocations
+ size_t num_activations = 0;
+ for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
+ num_activations += model->act_sizes[i];
+ }
+ printf("num_activations: %zu\n", num_activations);
+ model->num_activations = num_activations;
+ printf("Allocating %.2f MB for activations\n", num_activations * sizeof(float) / (1024.0f * 1024.0f));
+ malloc_and_point_activations(ctx, model->config, &model->acts, model->act_sizes);
+ // also create memory for caching inputs and targets
+ //model->inputs = (int*)mallocCheck(B * T * sizeof(int));
+ //model->targets = (int*)mallocCheck(B * T * sizeof(int)); // might be unused if we never have targets but it's small
+ model->inputs = createTensor(ctx, Shape{B * T}, ki32);
+ model->targets = createTensor(ctx, Shape{B * T}, ki32);
+ } else {
+ // validate B,T is consistent with how we've allocated the memory before
+ // in principle we could get more clever here in the future, for now this is safest
+ if (B != model->batch_size || T != model->seq_len) {
+ printf("Model: B=%d T=%d, Desired: B=%d T=%d\n", model->batch_size, model->seq_len, (int)B, (int)T);
+ exit(EXIT_FAILURE);
+ }
+ }
+ // create all kernels ahead of time
+ if (model->kernels.encoder_forward == nullptr) {
+ printf("Creating Kernels\n");
+ Kernels& kernels = model->kernels;
+ kernels.layernorm_forward.resize(L);
+ kernels.layernorm1_backward.resize(L);
+ kernels.qkv_projection_forward.resize(L);
+ kernels.qkv_projection_backward.resize(L);
+ kernels.attention_forward.resize(L);
+ kernels.attention_backward.resize(L);
+ kernels.attention_projection_forward.resize(L);
+ kernels.attention_projection_backward.resize(L);
+ kernels.residual_forward.resize(L);
+ kernels.residual2_forward.resize(L);
+ kernels.residual2_backward.resize(L);
+ kernels.ff_up_forward.resize(L);
+ kernels.ff_up_backward.resize(L);
+ kernels.gelu_forward.resize(L);
+ kernels.gelu_backward.resize(L);
+ kernels.ff_down_forward.resize(L);
+ kernels.ff_down_backward.resize(L);
+ for (int l = 0; l < L; ++l) {
+ kernels.layernorm_forward[l] = layernorm_forward(ctx, model->acts.ln1[l], model->acts.ln1_mean[l], model->acts.ln1_rstd[l],
+ /*input=*/ model->acts.residual3[l], /*weight=*/ model->params.ln1w[l], /*bias=*/ model->params.ln1b[l],
+ B, T, C);
+ kernels.qkv_projection_forward[l] = matmul_forward(ctx, model->acts.qkv[l], model->acts.ln1[l], model->params.qkvw[l], model->params.qkvb[l], B, T, C, 3*C);
+ kernels.attention_forward[l] = attention_forward(ctx, model->acts.atty[l], model->acts.preatt[l], model->acts.att[l], model->acts.qkv[l], B, T, C, NH);
+ kernels.attention_projection_forward[l] = matmul_forward(ctx, model->acts.attproj[l], model->acts.atty[l], model->params.attprojw[l], model->params.attprojb[l], B, T, C, C);
+ kernels.residual_forward[l] = residual_forward(ctx, model->acts.residual2[l], model->acts.residual3[l], model->acts.attproj[l], B*T*C);
+ kernels.ff_up_forward[l] = matmul_forward(ctx, model->acts.fch[l], model->acts.ln2[l], model->params.fcw[l], model->params.fcb[l], B, T, C, 4*C);
+ kernels.gelu_forward[l] = gelu_forward(ctx, model->acts.fch_gelu[l], model->acts.fch[l], B*T*4*C);
+ kernels.ff_down_forward[l] = matmul_forward(ctx, model->acts.fcproj[l], model->acts.fch_gelu[l], model->params.fcw[l], model->params.fcb[l], B, T, 4*C, C);
+ kernels.residual2_forward[l] = residual_forward(ctx, model->acts.residual3[l], model->acts.residual2[l], model->acts.fcproj[l], B*T*C);
+ }
+ kernels.crossentropy_forward = crossentropy_forward(ctx, model->acts.losses, model->acts.probs, targets, B, T, Vp);
+
+ kernels.encoder_forward = encoder_forward(ctx, model->acts.encoded, inputs, model->params.wte, model->params.wpe, B, T, C); // encoding goes into residual[0]
+ if(model->backward_enabled)
+ kernels.encoder_backward = encoder_backward(ctx, model->params.wte, model->params.wpe, model->acts.encoded, inputs, B, T, C);
+ kernels.layernorm_final_forward = layernorm_forward(ctx, model->acts.lnf, model->acts.lnf_mean, model->acts.lnf_rstd,
+ /*input=*/ model->acts.residual3[L-1], /*weight=*/ model->params.lnfw, /*bias=*/ model->params.lnfb,
+ B, T, C);
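+ // A Shape{1} dummy tensor stands in for absent bindings (no bias in the final
+ // matmul, no targets during generation); consumers detect it by its size of 1.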
+ Tensor nullTensor = createTensor(ctx, Shape{1}, kf32);
+ model->nullTensor = nullTensor;
+ kernels.matmul_final_forward = matmul_forward(ctx, model->acts.logits, model->acts.lnf, model->params.wte, nullTensor, B, T, C, Vp);
+ kernels.softmax_final_forward = softmax_forward(ctx, model->acts.probs, model->acts.logits, B, T, V, Vp);
+ if(model->backward_enabled)
+ kernels.crossentropy_softmax_backward = crossentropy_softmax_backward(ctx, model->acts.logits, model->acts.losses, model->acts.probs, targets, B, T, V, Vp);
+ if(model->backward_enabled)
+ kernels.matmul_final_backward = matmul_backward(ctx, model->acts.lnf, model->params.wte, nullTensor, model->acts.logits,
+ model->acts.lnf, model->params.wte, B, T, C, Vp);
+ if(model->backward_enabled)
+ kernels.layernorm_final_backward = layernorm_backward(ctx, model->acts.residual3[L-1], model->params.lnfw, model->params.lnfb,
+ model->acts.lnf, model->acts.residual3[L-1], model->params.lnfw,
+ model->acts.lnf_mean, model->acts.lnf_rstd, B, T, C);
+ printf("Created Kernels\n");
+ }
+
+ printf("Cache inputs/targets\n");
+ printf("Forward pass\n");
+ // forward pass
+ ParameterTensors params = model->params; // for brevity
+ ActivationTensors acts = model->acts;
+ float* residual;
+ printf("Encoding\n");
+ //printf("inputs[0] = %d\n", inputs[0]);
+ // encoder_forward(ctx, acts.encoded, inputs, params.wte, params.wpe, B, T, C); // encoding goes into residual[0]
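+ // Dispatch pattern used for every kernel below: create a one-shot
+ // promise/future pair, enqueue the pre-built kernel with dispatchKernel, and
+ // block in wait() until the GPU signals completion.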
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.encoder_forward, promise);
+ wait(ctx, future);
+ }
+ for (int l = 0; l < L; l++) {
+ printf("Forward Pass Layer %d\n", l);
+
+ // now do the forward pass
+ printf(" [Forward] : LayerNorm1\n");
+ // layernorm_forward(ctx, l_ln1, l_ln1_mean, l_ln1_rstd, residual, l_ln1w, l_ln1b, B, T, C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.layernorm_forward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Forward] : QKV Projection\n");
+ // matmul_forward(ctx, l_qkv, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.qkv_projection_forward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Forward] : Attention\n");
+ // attention_forward(ctx, l_atty, l_preatt, l_att, l_qkv, B, T, C, NH);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.attention_forward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Forward] : Attention Projection\n");
+ // matmul_forward(ctx, l_attproj, l_atty, l_attprojw, l_attprojb, B, T, C, C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.attention_projection_forward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Forward] : Residual1\n");
+ // residual_forward(ctx, l_residual2, residual, l_attproj, B*T*C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.residual_forward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Forward] : LayerNorm2\n");
+ // layernorm_forward(ctx, l_ln2, l_ln2_mean, l_ln2_rstd, l_residual2, l_ln2w, l_ln2b, B, T, C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.layernorm_forward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Forward] : FF Up\n");
+ // matmul_forward(ctx, l_fch, l_ln2, l_fcw, l_fcb, B, T, C, 4*C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.ff_up_forward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Forward] : GELU\n");
+ // gelu_forward(ctx, l_fch_gelu, l_fch, B*T*4*C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.gelu_forward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Forward] : FF Down\n");
+ // matmul_forward(ctx, l_fcproj, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 4*C, C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.ff_down_forward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Forward] : Residual2\n");
+ // residual_forward(ctx, l_residual3, l_residual2, l_fcproj, B*T*C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.residual2_forward[l], promise);
+ wait(ctx, future);
+ }
+ }
+ // residual = acts.residual3.data() + (L-1) * B * T * C; // last residual is in residual3
+ // layernorm_forward(ctx, acts.lnf, acts.lnf_mean, acts.lnf_rstd, residual, params.lnfw, params.lnfb, B, T, C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.layernorm_final_forward, promise);
+ wait(ctx, future);
+ }
+ // matmul_forward(ctx, acts.logits, acts.lnf, params.wte, NULL, B, T, C, Vp);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.matmul_final_forward, promise);
+ wait(ctx, future);
+ }
+ // softmax_forward(ctx, acts.probs, acts.logits, B, T, V, Vp);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.softmax_final_forward, promise);
+ wait(ctx, future);
+ }
+
+ printf("Crossentropy\n");
+ // also forward the cross-entropy loss function if we have the targets
+ // When targets' shape is (1), it means we don't have targets
+ if (targets.shape[0] != 1) {
+ // crossentropy_forward(ctx, model->acts.losses, model->acts.probs, targets, B, T, Vp);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.crossentropy_forward, promise);
+ wait(ctx, future);
+ }
+ // for convenience also evaluate the mean loss
+ float mean_loss = 0.0f;
+ toCPU(ctx, model->acts.losses, model->mean_loss_buffer, B*T * sizeof(float));
+ for (int i=0; i<B*T; i++) { mean_loss += model->mean_loss_buffer[i]; }
+ mean_loss /= B*T;
+ model->mean_loss = mean_loss;
+ } else {
+ // if we don't have targets, we don't have a loss
+ model->mean_loss = -1.0f;
+ }
+ printf("Forward pass done\n");
+}
+
+void gpt2_zero_grad(GPT2 *model) {
+ if(model->grads_memory != NULL) { memset(model->grads_memory, 0, model->num_parameters * sizeof(float)); }
+ if(model->grads_acts_memory != NULL) { memset(model->grads_acts_memory, 0, model->num_activations * sizeof(float)); }
+}
+
+void gpt2_backward(Context& ctx, GPT2 *model) {
+ printf("Backward pass\n");
+
+ // double check we forwarded previously, with targets
+ if (model->mean_loss == -1.0f) {
+ printf("Error: must forward with targets before backward\n");
+ exit(1);
+ }
+
+ // lazily allocate the memory for gradients of the weights and activations, if needed
+ if (model->grads_memory == NULL) {
+ printf("Allocating %.2f MB for gradients\n", model->num_parameters * sizeof(float) / (1024.0f * 1024.0f));
+ malloc_and_point_parameters(ctx, model->config, &model->grads, model->param_sizes);
+ malloc_and_point_activations(ctx, model->config, &model->grads_acts, model->act_sizes);
+ gpt2_zero_grad(model);
+ }
+
+ // convenience shortcuts (and size_t to help prevent int overflow)
+ size_t B = model->batch_size;
+ size_t T = model->seq_len;
+ size_t V = model->config.vocab_size;
+ size_t Vp = model->config.padded_vocab_size;
+ size_t L = model->config.num_layers;
+ size_t NH = model->config.num_heads;
+ size_t C = model->config.channels;
+
+ // backward pass: go in the reverse order of the forward pass, and call backward() functions
+ ParameterTensors params = model->params; // for brevity
+ ParameterTensors grads = model->grads;
+ ActivationTensors acts = model->acts;
+ ActivationTensors grads_acts = model->grads_acts;
+
+ // we kick off the chain rule by filling in dlosses with 1.0f/(B*T)
+ // technically this is a small, inline backward() pass of calculating
+ // total, final loss as the mean over all losses over all (B,T) positions in the batch
+ float dloss_mean = 1.0f / (B*T);
+ for (int i = 0; i < B*T; i++) { model->mean_loss_buffer[i] = dloss_mean; }
+ toGPU(ctx, model->mean_loss_buffer, model->acts.losses);
+ //toGPU(ctx, grads_acts.losses.data, model->acts_.data[22]);
+
+ // crossentropy_softmax_backward(ctx, grads_acts.logits, grads_acts.losses, acts.probs, model->targets, B, T, V, Vp);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.crossentropy_softmax_backward, promise);
+ wait(ctx, future);
+ }
+ // matmul_backward(ctx, grads_acts.lnf, grads.wte, NULL, grads_acts.logits, acts.lnf, params.wte, B, T, C, Vp);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.matmul_final_backward, promise);
+ wait(ctx, future);
+ }
+ // layernorm_backward(ctx, dresidual, grads.lnfw, grads.lnfb, grads_acts.lnf, residual, params.lnfw, acts.lnf_mean, acts.lnf_rstd, B, T, C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.layernorm_final_backward, promise);
+ wait(ctx, future);
+ }
+
+ for (int l = L-1; l >= 0; l--) {
+ printf("Backward Pass Layer %d\n", l);
+ // backprop this layer
+ printf(" [Backward] : Residual2\n");
+ // residual_backward(ctx, dl_residual2, dl_fcproj, dl_residual3, B*T*C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.residual2_backward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Backward] : FF Down \n");
+ // matmul_backward(ctx, dl_fch_gelu, dl_fcprojw, dl_fcprojb, dl_fcproj, l_fch_gelu, l_fcprojw, B, T, 4*C, C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.ff_down_backward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Backward] : GELU\n");
+ // gelu_backward(ctx, dl_fch, l_fch, dl_fch_gelu, B*T*4*C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.gelu_backward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Backward] : FF Up\n");
+ // matmul_backward(ctx, dl_ln2, dl_fcw, dl_fcb, dl_fch, l_ln2, l_fcw, B, T, C, 4*C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.ff_up_backward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Backward] : LayerNorm2\n");
+ // layernorm_backward(ctx, dl_residual2, dl_ln2w, dl_ln2b, dl_ln2, l_residual2, l_ln2w, l_ln2_mean, l_ln2_rstd, B, T, C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.layernorm2_backward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Backward] : Residual1\n");
+ // residual_backward(ctx, dresidual, dl_attproj, dl_residual2, B*T*C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.residual_forward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Backward] : Attention Projection\n");
+ // matmul_backward(ctx, dl_atty, dl_attprojw, dl_attprojb, dl_attproj, l_atty, l_attprojw, B, T, C, C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.attention_projection_backward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Backward] : Attention\n");
+ // attention_backward(ctx, dl_qkv, dl_preatt, dl_att, dl_atty, l_qkv, l_att, B, T, C, NH);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.attention_backward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Backward] : QKV Projection\n");
+ // matmul_backward(ctx, dl_ln1, dl_qkvw, dl_qkvb, dl_qkv, l_ln1, l_qkvw, B, T, C, 3*C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.qkv_projection_backward[l], promise);
+ wait(ctx, future);
+ }
+ printf(" [Backward] : LayerNorm1\n");
+ // layernorm_backward(ctx, dresidual, dl_ln1w, dl_ln1b, dl_ln1, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.layernorm1_backward[l], promise);
+ wait(ctx, future);
+ }
+ }
+ // encoder_backward(ctx, grads.wte, grads.wpe, grads_acts.encoded, model->inputs, B, T, C);
+ {
+ std::promise<void> promise;
+ std::future<void> future = promise.get_future();
+ dispatchKernel(ctx, model->kernels.encoder_backward, promise);
+ wait(ctx, future);
+ }
+ // toCPU(ctx, model->params_.data[0], model->grads.wte.data, model->param_sizes[0] * sizeof(float));
+ // toCPU(ctx, model->params_.data[1], model->grads.wpe.data, model->param_sizes[1] * sizeof(float));
+}
+
+void gpt2_update(Context& ctx, GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) {
+ // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
+
+ // lazily allocate the memory for m_memory and v_memory
+ if (model->m_memory == NULL) {
+ model->m_memory = (float*)calloc(model->num_parameters, sizeof(float));
+ model->v_memory = (float*)calloc(model->num_parameters, sizeof(float));
+ }
+
+ // Copy the parameters to the CPU
+ float* iter = model->params_memory;
+ toCPU(ctx, model->params.wte, iter, model->param_sizes[0] * sizeof(float));
+ iter += model->param_sizes[0];
+ toCPU(ctx, model->params.wpe, iter, model->param_sizes[1] * sizeof(float));
+ iter += model->param_sizes[1];
+ size_t L = model->config.num_layers;
+ for (int l = 0; l < L; l++) {
+ toCPU(ctx, model->params.ln1w[l], iter, model->param_sizes[2]/L * sizeof(float));
+ iter += model->param_sizes[2]/L;
+ toCPU(ctx, model->params.ln1b[l], iter, model->param_sizes[3]/L * sizeof(float));
+ iter += model->param_sizes[3]/L;
+ toCPU(ctx, model->params.qkvw[l], iter, model->param_sizes[4]/L * sizeof(float));
+ iter += model->param_sizes[4]/L;
+ toCPU(ctx, model->params.qkvb[l], iter, model->param_sizes[5]/L * sizeof(float));
+ iter += model->param_sizes[5]/L;
+ toCPU(ctx, model->params.attprojw[l], iter, model->param_sizes[6]/L * sizeof(float));
+ iter += model->param_sizes[6]/L;
+ toCPU(ctx, model->params.attprojb[l], iter, model->param_sizes[7]/L * sizeof(float));
+ iter += model->param_sizes[7]/L;
+ toCPU(ctx, model->params.ln2w[l], iter, model->param_sizes[8]/L * sizeof(float));
+ iter += model->param_sizes[8]/L;
+ toCPU(ctx, model->params.ln2b[l], iter, model->param_sizes[9]/L * sizeof(float));
+ iter += model->param_sizes[9]/L;
+ toCPU(ctx, model->params.fcw[l], iter, model->param_sizes[10]/L * sizeof(float));
+ iter += model->param_sizes[10]/L;
+ toCPU(ctx, model->params.fcb[l], iter, model->param_sizes[11]/L * sizeof(float));
+ iter += model->param_sizes[11]/L;
+ toCPU(ctx, model->params.fcprojw[l], iter, model->param_sizes[12]/L * sizeof(float));
+ iter += model->param_sizes[12]/L;
+ toCPU(ctx, model->params.fcprojb[l], iter, model->param_sizes[13]/L * sizeof(float));
+ iter += model->param_sizes[13]/L;
+ }
+ toCPU(ctx, model->params.lnfw, iter, model->param_sizes[14] * sizeof(float));
+ iter += model->param_sizes[14];
+ toCPU(ctx, model->params.lnfb, iter, model->param_sizes[15] * sizeof(float));
+ iter += model->param_sizes[15];
+
+
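+ // AdamW update implemented by the loop below:
+ //   m_t = beta1*m_{t-1} + (1-beta1)*g        v_t = beta2*v_{t-1} + (1-beta2)*g^2
+ //   m_hat = m_t / (1 - beta1^t)              v_hat = v_t / (1 - beta2^t)
+ //   theta <- theta - lr * (m_hat / (sqrt(v_hat) + eps) + weight_decay*theta)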
+ for (size_t i = 0; i < model->num_parameters; i++) {
+ float param = model->params_memory[i];
+ float grad = model->grads_memory[i];
+
+ // update the first moment (momentum)
+ float m = beta1 * model->m_memory[i] + (1.0f - beta1) * grad;
+ // update the second moment (RMSprop)
+ float v = beta2 * model->v_memory[i] + (1.0f - beta2) * grad * grad;
+ // bias-correct both moments
+ float m_hat = m / (1.0f - powf(beta1, t));
+ float v_hat = v / (1.0f - powf(beta2, t));
+
+ // update
+ model->m_memory[i] = m;
+ model->v_memory[i] = v;
+ model->params_memory[i] -= learning_rate * (m_hat / (sqrtf(v_hat) + eps) + weight_decay * param);
+ }
+ // toGPU(ctx, model->params_memory, model->params_.data[0]);
+ // toGPU(ctx, model->params_memory + model->param_sizes[0], model->params_.data[1]);
+ iter = model->params_memory;
+ toGPU(ctx, iter, model->params.wte);
+ iter += model->param_sizes[0];
+ toGPU(ctx, iter, model->params.wpe);
+ iter += model->param_sizes[1];
+ for (int l = 0; l < L; l++) {
+ toGPU(ctx, iter, model->params.ln1w[l]);
+ iter += model->param_sizes[2]/L;
+ toGPU(ctx, iter, model->params.ln1b[l]);
+ iter += model->param_sizes[3]/L;
+ toGPU(ctx, iter, model->params.qkvw[l]);
+ iter += model->param_sizes[4]/L;
+ toGPU(ctx, iter, model->params.qkvb[l]);
+ iter += model->param_sizes[5]/L;
+ toGPU(ctx, iter, model->params.attprojw[l]);
+ iter += model->param_sizes[6]/L;
+ toGPU(ctx, iter, model->params.attprojb[l]);
+ iter += model->param_sizes[7]/L;
+ toGPU(ctx, iter, model->params.ln2w[l]);
+ iter += model->param_sizes[8]/L;
+ toGPU(ctx, iter, model->params.ln2b[l]);
+ iter += model->param_sizes[9]/L;
+ toGPU(ctx, iter, model->params.fcw[l]);
+ iter += model->param_sizes[10]/L;
+ toGPU(ctx, iter, model->params.fcb[l]);
+ iter += model->param_sizes[11]/L;
+ toGPU(ctx, iter, model->params.fcprojw[l]);
+ iter += model->param_sizes[12]/L;
+ toGPU(ctx, iter, model->params.fcprojb[l]);
+ iter += model->param_sizes[13]/L;
+ }
+ toGPU(ctx, iter, model->params.lnfw);
+ iter += model->param_sizes[14];
+ toGPU(ctx, iter, model->params.lnfb);
+ iter += model->param_sizes[15];
+}
+
+void gpt2_free(GPT2 *model) {
+ free(model->params_memory);
+ free(model->grads_memory);
+ free(model->m_memory);
+ free(model->v_memory);
+ free(model->acts_memory);
+ free(model->grads_acts_memory);
+ // free(model->inputs);
+ // free(model->targets);
+ free(model->mean_loss_buffer);
+}
+
+#ifndef TESTING
+// if we are TESTING (see test_gpt2.c), we'll skip the int main below
+// ----------------------------------------------------------------------------
+// sampler
+
+unsigned int random_u32(uint64_t *state) {
+ // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A
+ *state ^= *state >> 12;
+ *state ^= *state << 25;
+ *state ^= *state >> 27;
+ return (*state * 0x2545F4914F6CDD1Dull) >> 32;
+}
+float random_f32(uint64_t *state) { // random float32 in [0,1)
+ return (random_u32(state) >> 8) / 16777216.0f;
+}
+
+int sample_mult(float* probabilities, int n, float coin) {
+ // sample index from probabilities (they must sum to 1!)
+ // coin is a random number in [0, 1), usually from random_f32()
+ float cdf = 0.0f;
+ for (int i = 0; i < n; i++) {
+ cdf += probabilities[i];
+ if (coin < cdf) {
+ return i;
+ }
+ }
+ return n - 1; // in case of rounding errors
+}
+
+// ----------------------------------------------------------------------------
+// main training loop
+int main() {
+
+ setLogLevel(kWarn);
+
+ printf("Creating GPU context\n");
+ WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
+ gpu::Context ctx = gpu::createContext({}, {}, {
+ .requiredLimits = &requiredLimits
+ });
+
+ // build the GPT-2 model from a checkpoint
+ GPT2 model;
+ gpt2_build_from_checkpoint(ctx, &model, "gpt2_124M.bin");
+
+ // build the DataLoaders from tokens files. for now use tiny_shakespeare if available, else tiny_stories
+ const char* tiny_stories_train = "dev/data/tinystories/TinyStories_train.bin";
+ const char* tiny_stories_val = "dev/data/tinystories/TinyStories_val.bin";
+ const char* tiny_shakespeare_train = "dev/data/tinyshakespeare/tiny_shakespeare_train.bin";
+ const char* tiny_shakespeare_val = "dev/data/tinyshakespeare/tiny_shakespeare_val.bin";
+ const char* train_tokens = access(tiny_shakespeare_train, F_OK) != -1 ? tiny_shakespeare_train : tiny_stories_train;
+ const char* val_tokens = access(tiny_shakespeare_val, F_OK) != -1 ? tiny_shakespeare_val : tiny_stories_val;
+ constexpr int B = 4; // batch size 4 (i.e. 4 independent token sequences will be trained on)
+ constexpr int T = 64; // sequence length 64 (i.e. each sequence is 64 tokens long). must be <= maxT, which is 1024 for GPT-2
+ DataLoader train_loader, val_loader;
+ dataloader_init(&train_loader, train_tokens, B, T, 0, 1, 1);
+ dataloader_init(&val_loader, val_tokens, B, T, 0, 1, 0);
+ printf("train dataset num_batches: %zu\n", train_loader.num_tokens / (B*T));
+ printf("val dataset num_batches: %zu\n", val_loader.num_tokens / (B*T));
+ int val_num_batches = 5;
+
+ // build the Tokenizer
+ Tokenizer tokenizer;
+ tokenizer_init(&tokenizer, "gpt2_tokenizer.bin");
+
+ // some memory for generating samples from the model
+ uint64_t rng_state = 1337;
+ // int* gen_tokens = (int*)mallocCheck(B * T * sizeof(int));
+ const int genT = 64; // number of steps of inference we will do
+
+ // train
+ struct timespec start, end;
+ Tensor inputs = createTensor(ctx, Shape{B, T}, ki32);
+ Tensor targets = createTensor(ctx, Shape{B, T}, ki32);
+ Tensor gen_tokens = createTensor(ctx, Shape{B, T}, ki32);
+ int* gen_tokens_cpu = (int*)mallocCheck(B * T * sizeof(int));
+ printf("Starting training\n");
+ for (int step = 0; step <= 40; step++) {
+ printf("Step %d\n", step);
+
+ // once in a while estimate the validation loss
+ if (step % 10 == 0) {
+ float val_loss = 0.0f;
+ dataloader_reset(&val_loader);
+ for (int i = 0; i < val_num_batches; i++) {
+ dataloader_next_batch(&val_loader);
+ toGPU(ctx, val_loader.inputs, inputs);
+ toGPU(ctx, val_loader.targets, targets);
+ gpt2_forward(ctx, &model, inputs, targets, B, T);
+ val_loss += model.mean_loss;
+ }
+ val_loss /= val_num_batches;
+ printf("val loss %f\n", val_loss);
+ }
+
+ // once in a while do model inference to print generated text
+ if (step > 0 && step % 20 == 0) {
+ // fill up gen_tokens with the GPT2_EOT, which kicks off the generation
+ for(int i = 0; i < B * T; ++i) {
+ gen_tokens_cpu[i] = tokenizer.eot_token;
+ }
+ toGPU(ctx, gen_tokens_cpu, gen_tokens);
+ // now sample from the model autoregressively
+ printf("generating:\n---\n");
+ for (int t = 1; t < genT; t++) {
+ // note that inference is very wasteful here because for each token
+ // we re-calculate the forward pass for all of (B,T) positions from scratch
+ // but the inference here is just for sanity checking anyway
+ // and we can maybe optimize a bit more later, with careful tests
+ gpt2_forward(ctx, &model, gen_tokens, model.nullTensor, B, T);
+ // furthermore, below we're only using b=0 (i.e. the first row) of all B rows
+ // we're in principle running B "inference streams" in parallel here
+ // but only using position 0
+ // get the Vp-dimensional vector probs[0, t-1, :]
+ toCPU(ctx, model.acts.probs, model.probs_buffer, B * T * model.config.padded_vocab_size * sizeof(float));
+ float* probs = model.probs_buffer + (t-1) * model.config.padded_vocab_size;
+
+ float coin = random_f32(&rng_state);
+ // note we're only sampling from the first V elements, ignoring padding
+ // (the probabilities in the padded region should be zero anyway)
+ int next_token = sample_mult(probs, model.config.vocab_size, coin);
+ gen_tokens_cpu[t] = next_token;
+ toGPU(ctx, gen_tokens_cpu, gen_tokens);
+ // print the generated token, either using the Tokenizer or a fallback
+ if (tokenizer.init_ok) {
+ const char* token_str = tokenizer_decode(&tokenizer, next_token);
+ safe_printf(token_str);
+ } else {
+ // fall back to printing the token id
+ printf("%d ", next_token);
+ }
+ fflush(stdout);
+ }
+ printf("\n---\n");
+ }
+
+ // do a training step
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ dataloader_next_batch(&train_loader);
+ toGPU(ctx, train_loader.inputs, inputs);
+ toGPU(ctx, train_loader.targets, targets);
+ gpt2_forward(ctx, &model, inputs, targets, B, T);
+ if (model.backward_enabled) {
+ gpt2_zero_grad(&model);
+ gpt2_backward(ctx, &model);
+ gpt2_update(ctx, &model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, step+1);
+ }
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
+ printf("step %d: train loss %f (took %f ms)\n", step, model.mean_loss, time_elapsed_s * 1000);
+ }
+
+ // free
+ dataloader_free(&train_loader);
+ dataloader_free(&val_loader);
+ tokenizer_free(&tokenizer);
+ gpt2_free(&model);
+ // free(gen_tokens);
+ return 0;
+}
+#endif
diff --git a/experimental/kernels/kernels.h b/experimental/kernels/kernels.h
index 212c075..62a461e 100644
--- a/experimental/kernels/kernels.h
+++ b/experimental/kernels/kernels.h
@@ -309,6 +309,104 @@ fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
}
}
}
+
+)";
+
+
+static const char *kShaderMatmul2DTiling = R"(
+@group(0) @binding(0) var<storage, read_write> inp : array<{{precision}}>;
+@group(0) @binding(1) var<storage, read_write> weight : array<{{precision}}>;
+@group(0) @binding(2) var<storage, read_write> bias : array<{{precision}}>;
+@group(0) @binding(3) var<storage, read_write> out : array<{{precision}}>;
+@group(0) @binding(4) var<uniform> params : Params;
+struct Params {
+ B: u32,
+ T: u32,
+ C: u32,
+ OC: u32,
+};
+var<workgroup> tileInp: array<{{precision}}, {{BT}} * {{BC}}>;
+var<workgroup> tileWeight: array<{{precision}}, {{BOC}} * {{BC}}>;
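+
+// Tile-size placeholders are substituted at kernel-creation time:
+//   {{BT}} x {{BC}}  - input tile staged in workgroup memory (tileInp)
+//   {{BOC}} x {{BC}} - weight tile staged in workgroup memory (tileWeight)
+//   {{TT}} x {{TOC}} - per-thread register tile accumulated in threadResults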
+
+@compute @workgroup_size({{workgroupSize}})
+fn main(
+ @builtin(local_invocation_id) localID : vec3<u32>,
+ @builtin(workgroup_id) groupid : vec3<u32>) {
+ let B : u32 = params.B;
+ let T : u32 = params.T;
+ let C : u32 = params.C;
+ let OC : u32 = params.OC;
+
+ var localT: array<{{precision}}, {{TT}}>;
+ var localOC: array<{{precision}}, {{TOC}}>;
+
+ let outB: u32 = groupid.x;
+ let outT: u32 = groupid.y;
+ let outOC: u32 = groupid.z;
+ let numThread: u32 = ({{BT}} * {{BOC}}) / ({{TT}} * {{TOC}});
+
+ // position of the first c element computed by the thread
+ let threadRow: u32 = (localID.x / ({{BOC}} / {{TOC}})) * {{TT}};
+ let threadCol: u32 = (localID.x % ({{BOC}} / {{TOC}})) * {{TOC}};
+
+ // inpPtr and weightPtr are the starting positions of the tiles in a and b,
+ // incremented in the bkidx loop.
+ // outPtr is the starting position of the tile in c which is fixed.
+
+ var inpPtr = (outB * T + outT * {{BT}}) * C; // BTC
+ var weightPtr = outOC * {{BOC}} * C; //OCC
+ var threadResults: array<{{precision}}, {{TT}} * {{TOC}}>;
+ let outPtr = (outB * T + outT * {{BT}}) * OC + outOC * {{BOC}}; //BTOC
+ let biasPtr = outOC * {{BOC}};
+
+ for (var bkidx: u32 = 0; bkidx < C; bkidx += {{BC}}) {
+      // Load the BOC x BC weight tile using numThread (= BT * BOC / (TT * TOC)) threads.
+      // The number of iterations == BOC * BC / (BT * BOC / (TT * TOC))
+ for (var idx: u32 = 0; idx < {{NUM_TILEW}}; idx++) {
+ tileWeight[localID.x + idx * numThread] = weight[weightPtr + ((localID.x + idx * numThread) / {{BC}}) * C + ((localID.x + idx * numThread) % {{BC}})];
+ }
+ weightPtr += {{BC}};
+
+ // Load tile
+      // Load the BT x BC input tile using numThread (= BT * BOC / (TT * TOC)) threads.
+      // The number of iterations == BT * BC / (BT * BOC / (TT * TOC))
+ for (var idx: u32 = 0; idx < {{NUM_TILEI}}; idx++) {
+ tileInp[localID.x + idx * numThread] = inp[inpPtr + ((localID.x + idx * numThread) / {{BC}}) * C + (localID.x + idx * numThread) % {{BC}}];
+ }
+ inpPtr += {{BC}};
+
+ workgroupBarrier();
+ // Compute tile
+ for (var dotIdx: u32 = 0; dotIdx < {{BC}}; dotIdx = dotIdx + 1) {
+ for (var idx: u32 = 0; idx < {{TT}}; idx++) {
+ localT[idx] = tileInp[(threadRow + idx) * {{BC}} + dotIdx];
+ }
+ for (var idx: u32 = 0; idx < {{TOC}}; idx++) {
+ localOC[idx] = tileWeight[(threadCol + idx) * {{BC}} + dotIdx];
+ }
+ for (var resIdxT: u32 = 0; resIdxT < {{TT}}; resIdxT++) {
+ for (var resIdxOC: u32 = 0; resIdxOC < {{TOC}}; resIdxOC++) {
+ threadResults[resIdxT * {{TOC}} + resIdxOC] += localT[resIdxT] * localOC[resIdxOC];
+ }
+ }
+ }
+ workgroupBarrier();
+ }
+
+ if (arrayLength(&bias) == 1) {
+ for (var resIdxT: u32 = 0; resIdxT < {{TT}}; resIdxT++) {
+ for (var resIdxOC: u32 = 0; resIdxOC < {{TOC}}; resIdxOC++) {
+ out[outPtr + (threadRow + resIdxT) * OC + threadCol + resIdxOC] = threadResults[resIdxT * {{TOC}} + resIdxOC];
+ }
+ }
+ } else {
+ for (var resIdxT: u32 = 0; resIdxT < {{TT}}; resIdxT++) {
+ for (var resIdxOC: u32 = 0; resIdxOC < {{TOC}}; resIdxOC++) {
+ out[outPtr + (threadRow + resIdxT) * OC + threadCol + resIdxOC] = threadResults[resIdxT * {{TOC}} + resIdxOC] + bias[biasPtr + threadCol + resIdxOC];
+ }
+ }
+ }
+}
)";
static const char *kShaderMatmulBackward = R"(
@@ -683,6 +781,78 @@ fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
}
)";
+static const char *kSum = R"(
+@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
+@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
+var<workgroup> buffer: array<{{precision}}, 1024>;
+@compute @workgroup_size({{workgroupSize}})
+fn main(
+    @builtin(global_invocation_id) globalID : vec3<u32>,
+    @builtin(local_invocation_id) localID : vec3<u32>,
+    @builtin(workgroup_id) groupid : vec3<u32>,
+    @builtin(num_workgroups) numGroups : vec3<u32>) {
+  let blockSize3d: vec3<u32> = vec3<u32>({{workgroupSize}});
+ let blockSize: u32 = blockSize3d.x;
+ let threadId: u32 = localID.x;
+ let blockId: u32 = groupid.x + groupid.y * numGroups.x;
+ let blockStart = blockId * blockSize * 2 + threadId;
+
+ buffer[threadId] = inp[blockStart] + inp[blockStart + blockSize];
+ workgroupBarrier();
+ var stride: u32 = blockSize / 2;
+
+ if (blockSize >= 1024 && threadId < 512) {
+ buffer[threadId] += buffer[threadId + 512];
+ }
+ workgroupBarrier();
+
+ if (blockSize >= 512 && threadId < 256) {
+ buffer[threadId] += buffer[threadId + 256];
+ }
+ workgroupBarrier();
+
+ if (blockSize >= 256 && threadId < 128) {
+ buffer[threadId] += buffer[threadId + 128];
+ }
+ workgroupBarrier();
+
+ if (threadId < 64) {
+ buffer[threadId] += buffer[threadId + 64];
+ }
+ workgroupBarrier();
+
+ if (threadId < 32) {
+ buffer[threadId] += buffer[threadId + 32];
+ }
+ workgroupBarrier();
+
+ if (threadId < 16) {
+ buffer[threadId] += buffer[threadId + 16];
+ }
+ workgroupBarrier();
+
+ if (threadId < 8) {
+ buffer[threadId] += buffer[threadId + 8];
+ }
+ workgroupBarrier();
+
+ if (threadId < 4) {
+ buffer[threadId] += buffer[threadId + 4];
+ }
+ workgroupBarrier();
+
+ if (threadId < 2) {
+ buffer[threadId] += buffer[threadId + 2];
+ }
+ workgroupBarrier();
+
+ if (threadId == 0) {
+ buffer[0] += buffer[1];
+ out[blockId] = buffer[0];
+ }
+}
+)";
+
} // namespace gpu
#endif // KERNELS_H
diff --git a/experimental/kernels/ops.cpp b/experimental/kernels/ops.cpp
index 67fc679..0e9c076 100644
--- a/experimental/kernels/ops.cpp
+++ b/experimental/kernels/ops.cpp
@@ -6,6 +6,7 @@
#include "kernels.h"
#include "ops.hpp"
+#include "experimental/wgsl.h" // loopUnrolling
using namespace gpu;
@@ -22,27 +23,39 @@ void encoder_forward(Context& ctx, float* out,
uint32_t C;
};
setLogLevel(kError);
- printf("Creating tensors\n");
- printf("Creating input tensor\%pn", inp);
- Tensor input = createTensor(ctx, Shape{b * t}, ki32, inp);
- printf("Created input tensor\n");
- Tensor wte_t = createTensor(ctx, Shape{v, c}, kf32, wte);
- printf("Created wte tensor\n");
- Tensor wpe_t = createTensor(ctx, Shape{t, c}, kf32, wpe);
- printf("Created wpe tensor\n");
- Tensor output = createTensor(ctx, Shape{b * t * c}, kf32);
- printf("Created tensors\n");
+  // Generate the cache key from the arguments.
+ std::string key = "encoder_forward_" + std::to_string(B) + "_" + std::to_string(T) + "_" + std::to_string(C);
+ Kernel op;
+ if (ctx.kernelPool.data.find(key) == ctx.kernelPool.data.end()) {
+ Tensor input = createTensor(ctx, Shape{b * t}, ki32);
+ Tensor wte_t = createTensor(ctx, Shape{v, c}, kf32);
+ Tensor wpe_t = createTensor(ctx, Shape{t, c}, kf32);
+ Tensor output = createTensor(ctx, Shape{b * t * c}, kf32);
+ op = createKernel(ctx, {kShaderEncoder, 256, kf32},
+ Bindings{input, wte_t, wpe_t, output},
+ /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+ /* params */
+ EncoderParams{
+                       static_cast<uint32_t>(b),
+                       static_cast<uint32_t>(t),
+                       static_cast<uint32_t>(c)
+ },
+ nullptr,
+ key.c_str());
+ } else {
+ op = ctx.kernelPool.data[key];
+ }
+ Tensor& input = ctx.pool.data[op->buffers[0]];
+ Tensor& wte_t = ctx.pool.data[op->buffers[1]];
+ Tensor& wpe_t = ctx.pool.data[op->buffers[2]];
+ Tensor& output = ctx.pool.data[op->buffers[3]];
+
+ toGPU(ctx, inp, input);
+ toGPU(ctx, wte, wte_t);
+ toGPU(ctx, wpe, wpe_t);
+
   std::promise<void> promise;
   std::future<void> future = promise.get_future();
- Kernel op = createKernel(ctx, {kShaderEncoder, 256, kf32},
- Bindings{input, wte_t, wpe_t, output},
- /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
- /* params */
- EncoderParams{
-                       static_cast<uint32_t>(b),
-                       static_cast<uint32_t>(t),
-                       static_cast<uint32_t>(c)
- });
dispatchKernel(ctx, op, promise);
wait(ctx, future);
toCPU(ctx, output, out, b * t * c * sizeof(float));
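The rewritten op above builds the kernel and its tensors once per call shape, keyed in ctx.kernelPool, and on later calls only pays for toGPU, dispatch, and toCPU. The same lookup idiom, reduced to a generic skeleton (hypothetical helper; the real code keys ctx.kernelPool.data directly with an "<op>_<B>_<T>_<C>" string, as in the hunk above):

    #include <map>
    #include <string>

    // Build an entry the first time a key (i.e. a call shape) is seen,
    // then return the cached instance on every later call.
    template <typename Kernel, typename MakeFn>
    Kernel& getOrCreate(std::map<std::string, Kernel>& pool,
                        const std::string& key, MakeFn make) {
      auto it = pool.find(key);
      if (it == pool.end()) {
        it = pool.emplace(key, make()).first;  // first call: create kernel + tensors
      }
      return it->second;                       // later calls: reuse
    }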
@@ -61,21 +74,40 @@ void encoder_backward(Context& ctx, float* dwte, float* dwpe,
uint32_t C;
};
setLogLevel(kError);
- Tensor dwte_t = createTensor(ctx, Shape{v, c}, kf32, dwte);
- Tensor dwpe_t = createTensor(ctx, Shape{t, c}, kf32, dwpe);
- Tensor dout_t = createTensor(ctx, Shape{b * t * c}, kf32, dout);
- Tensor input = createTensor(ctx, Shape{b * t}, ki32, inp);
+  // Generate the cache key from the arguments.
+ std::string key = "encoder_backward_" + std::to_string(B) + "_" + std::to_string(T) + "_" + std::to_string(C);
+ Kernel op;
+ if (ctx.kernelPool.data.find(key) == ctx.kernelPool.data.end()) {
+ Tensor dwte_t = createTensor(ctx, Shape{v, c}, kf32);
+ Tensor dwpe_t = createTensor(ctx, Shape{t, c}, kf32);
+ Tensor dout_t = createTensor(ctx, Shape{b * t * c}, kf32);
+ Tensor input = createTensor(ctx, Shape{b * t}, ki32);
+ op = createKernel(ctx, {kShaderEncoderBackward, 256, kf32},
+ Bindings{dwte_t, dwpe_t, dout_t, input},
+ /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+ /* params */
+ EncoderParams{
+                       static_cast<uint32_t>(b),
+ static_cast