From 408c96bf289706ab5467d2f08d96eeb584604c8c Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 03:32:26 -0800
Subject: [PATCH 01/14] got kernel and .so build working on WSL2 for legacy
 optix7

---
 CMakeLists.txt             |  49 +++++++---
 crtx/compileOptiX.sh       |  79 ++++++++-------
 crtx/cuew/cuew.c           |   2 +-
 crtx/dllmain.cpp           | 193 +++++++++++++++++++++++--------------
 crtx/internal.h            | 144 +++++++++++----------------
 crtx/kernel.cu             |  10 ++
 rtxpy/rtx.py               |   1 +
 rtxpy/tests/test_simple.py |   1 +
 8 files changed, 273 insertions(+), 206 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4703283..448db92 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,4 @@
 cmake_minimum_required(VERSION 3.10)
-
 project(rtxpy)
 
 set(CMAKE_CXX_STANDARD 11)
@@ -8,25 +7,53 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 add_definitions(-DRTX_EXPORTS)
 
 if (WIN32)
-	add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
 endif()
 
-SET(SOURCE_DIR "crtx")
+set(SOURCE_DIR "crtx")
 
 set(HEADERS
-	${SOURCE_DIR}/common.h
-	${SOURCE_DIR}/internal.h
-	${SOURCE_DIR}/rtx.h
+  ${SOURCE_DIR}/common.h
+  ${SOURCE_DIR}/internal.h
+  ${SOURCE_DIR}/rtx.h
 )
 
 set(SOURCES
-	${SOURCE_DIR}/dllmain.cpp
-	${SOURCE_DIR}/cuew/cuew.c
+  ${SOURCE_DIR}/dllmain.cpp
+  ${SOURCE_DIR}/cuew/cuew.c
 )
 
 add_library(${PROJECT_NAME} SHARED ${HEADERS} ${SOURCES})
+target_compile_definitions(${PROJECT_NAME} PRIVATE CUDA_NO_PROTOTYPES OPTIX_DONT_INCLUDE_CUDA)
+
+# ---- CUDA toolkit path (override with -DCUDA_TOOLKIT_ROOT_DIR=<path>) ----
+set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda" CACHE PATH "Root directory of the CUDA toolkit installation")
+set(CUDA_INCLUDE_DIR "${CUDA_TOOLKIT_ROOT_DIR}/include")
+set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+
+target_include_directories(${PROJECT_NAME} PRIVATE
+  ${SOURCE_DIR}
+  ${SOURCE_DIR}/optix_7.1/include
+  ${SOURCE_DIR}/optix_7.1
+  ${SOURCE_DIR}/cuew
+  ${CUDA_INCLUDE_DIR}
+)
+
+# Link search paths:
+# - CUDA toolkit libs (cudart, etc.) live here
+# - WSL provides the NVIDIA driver libcuda.so here
+target_link_directories(${PROJECT_NAME} PRIVATE
+  ${CUDA_LIB_DIR}
+  /usr/lib/wsl/lib
+)
+
+target_link_libraries(${PROJECT_NAME} PRIVATE
+  cuda      # libcuda.so (driver API)
+  dl
+  pthread
+)
 
-target_include_directories(${PROJECT_NAME} PUBLIC
-	${SOURCE_DIR}/optix_7.1
-	${SOURCE_DIR}/cuew
+# Ensure runtime can find libcuda.so on WSL
+target_link_options(${PROJECT_NAME} PRIVATE
+  "-Wl,-rpath,/usr/lib/wsl/lib"
 )
diff --git a/crtx/compileOptiX.sh b/crtx/compileOptiX.sh
index f8e004a..f452986 100644
--- a/crtx/compileOptiX.sh
+++ b/crtx/compileOptiX.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+set -euo pipefail
 
 unameOut="$(uname -s)"
 case "${unameOut}" in
@@ -6,48 +7,58 @@ case "${unameOut}" in
     Darwin*)    machine=Mac;;
     CYGWIN*)    machine=Cygwin;;
     MINGW*)     machine=MinGw;;
-    *)          machine="UNKNOWN:${unameOut}"
+    *)          machine="UNKNOWN:${unameOut}";;
 esac
 
-if [ ! -d "external/shaders" ]
-then
-	mkdir external/shaders
-fi
+mkdir -p external/shaders
+
+OPTIX_VERSION=7.1.0
 
 if [ "${machine}" == "Linux" ]
 then
-	echo "Setting up variables for Linux"
-	export OPTIX_VERSION=7.1.0
-	export INCLUDES="-I'/<PATH_TO>/NVIDIA-OptiX-SDK-${OPTIX_VERSION}-linux64-x86_64/include'"
-	export INCLUDES="$INCLUDES -I'../include'"
-	export INCLUDES="$INCLUDES -I'/usr/local/cuda/samples/common/inc'" #For math_helper.h
-	export NVCC="/usr/local/cuda/bin/nvcc"
-	export COMPILER="g++"
-else 
-	if [ "${machine}" == "MinGw" ]
-	then
-		echo "Setting up variables for Windows (Git Bash)"
-
-		export OPTIX_VERSION=7.1.0
-		export CUDA_VERSION=11.4
-		export INCLUDES=(-I"/c/ProgramData/NVIDIA Corporation/OptiX SDK $OPTIX_VERSION/include" -I"../include" -I"/c/ProgramData/NVIDIA Corporation/CUDA Samples/v${CUDA_VERSION}/common/inc")
-		export NVCC="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${CUDA_VERSION}/bin/nvcc"
-		# You may need to update the path to a valid compiler. This points to MSVS 2019 compiler
-		export COMPILER="/c/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Tools/MSVC/14.29.30037/bin/Hostx64/x64"
-	else
-		echo "Unsupported OS : ${machine}"
-	fi
-fi
+    echo "Setting up variables for Linux"
 
-echo "Compiling for OptiX $OPTIX_VERSION"
-echo "NVCC compiler currently set: $NVCC"
-echo "C++ compiler currently set: $COMPILER"
+    NVCC="/usr/local/cuda/bin/nvcc"
+    COMPILER="g++"
 
-export NVCC_FLAGS="-m64 --std c++11 --use_fast_math -cudart static -arch sm_50 -Xptxas -v"
+    INCLUDES=(
+        -I"./optix_7.1"                         # <-- OptiX 7.1 headers vendored in this repo
+        -I"../include"
+        -I"/usr/local/cuda/samples/common/inc"  # For helper_math.h / math_helper.h (CUDA samples)
+    )
 
-if [ -f "kernel.ptx" ]
+elif [ "${machine}" == "MinGw" ]
 then
-	rm kernel.ptx
+    echo "Setting up variables for Windows (Git Bash)"
+
+    CUDA_VERSION=11.4
+    INCLUDES=(
+        -I"./optix_7.1"  # <-- also use vendored headers on Windows
+        -I"../include"
+        -I"/c/ProgramData/NVIDIA Corporation/CUDA Samples/v${CUDA_VERSION}/common/inc"
+    )
+
+    NVCC="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${CUDA_VERSION}/bin/nvcc"
+    COMPILER="/c/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Tools/MSVC/14.29.30037/bin/Hostx64/x64"
+else
+    echo "Unsupported OS : ${machine}"
+    exit 1
 fi
 
-exec "$NVCC" $NVCC_FLAGS -ccbin "$COMPILER" "${INCLUDES[@]}" -ptx -o kernel.ptx  kernel.cu >> cudaoutput.txt | tee
+echo "Compiling for OptiX ${OPTIX_VERSION}"
+echo "NVCC compiler currently set: ${NVCC}"
+echo "C++ compiler currently set: ${COMPILER}"
+
+NVCC_FLAGS=(
+    -m64
+    --std=c++11
+    --use_fast_math
+    -cudart=static
+    -arch="${CUDA_ARCH:-sm_86}"  # set CUDA_ARCH (e.g. sm_75) to target another GPU generation
+    -Xptxas -v
+)
+
+rm -f kernel.ptx
+
+# Show compiler output (stdout and stderr) on the terminal and append it to the log.
+"${NVCC}" "${NVCC_FLAGS[@]}" -ccbin "${COMPILER}" "${INCLUDES[@]}" -ptx -o kernel.ptx kernel.cu 2>&1 | tee -a cudaoutput.txt
diff --git a/crtx/cuew/cuew.c b/crtx/cuew/cuew.c
index b7a1aa2..e34f191 100644
--- a/crtx/cuew/cuew.c
+++ b/crtx/cuew/cuew.c
@@ -25,7 +25,7 @@
 #  endif
 #endif
 
-#include <cuew.h>
+#include <cuew/cuew.h>
 #include <assert.h>
 #include <stdio.h>
 #include <string.h>
diff --git a/crtx/dllmain.cpp b/crtx/dllmain.cpp
index e0bbe37..6daad23 100644
--- a/crtx/dllmain.cpp
+++ b/crtx/dllmain.cpp
@@ -1,5 +1,5 @@
-#include <assert.h>
-#include "cuew.h"
+#define CUDA_NO_PROTOTYPES
+#include "cuew/cuew.h"          // this pulls in cuda.h safely (no prototypes)
 
 #define OPTIX_DONT_INCLUDE_CUDA
 #include <optix.h>
@@ -7,15 +7,17 @@
 #include <optix_stack_size.h>
 #include <optix_stubs.h>
 
+// now your other includes:
+#include <assert.h>
+#include <climits>
 #include <sstream>
-
 #include <vector>
 #include <fstream>
 
 struct float3 { float x,y,z; };
-struct int3   {   int i[3]; };
-#include "common.h"
+struct int3   { int i[3]; };
 
+#include "common.h"
 #include "rtx.h"
 #include "internal.h"
 
@@ -43,56 +45,64 @@ std::string getTextFileContents(const char* fileName) {
     return res;
 }
 
-
 int createModule(State& state)
 {
     char   log[2048];  // For error reporting from OptiX creation functions
-    size_t logSize = sizeof( log );
+    size_t logSize = sizeof(log);
 
     OptixModuleCompileOptions module_compile_options = {};
-    module_compile_options.maxRegisterCount          = OPTIX_COMPILE_DEFAULT_MAX_REGISTER_COUNT;
-
-    module_compile_options.optLevel   = OPTIX_COMPILE_OPTIMIZATION_DEFAULT;
-    module_compile_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+    module_compile_options.maxRegisterCount = OPTIX_COMPILE_DEFAULT_MAX_REGISTER_COUNT;
+    module_compile_options.optLevel         = OPTIX_COMPILE_OPTIMIZATION_DEFAULT;
+    module_compile_options.debugLevel       = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
 
     state.pipeline_compile_options.usesMotionBlur        = false;
     state.pipeline_compile_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_GAS;
     state.pipeline_compile_options.numPayloadValues      = 4;
     state.pipeline_compile_options.numAttributeValues    = 2;
-    state.pipeline_compile_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;  // should be OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW;
+    state.pipeline_compile_options.exceptionFlags        = OPTIX_EXCEPTION_FLAG_NONE;
     state.pipeline_compile_options.pipelineLaunchParamsVariableName = "params";
 
-#if 0
-    std::string input2 = getTextFileContents("kernel.ptx");
-    size_t      inputSize2 = input2.length();
-    input2 += "\0\0\0";
-    unsigned* asd = reinterpret_cast<unsigned*>(&input2[0]);
-    for (int i = 0; i < inputSize2/4; i++) {
-        fprintf(stderr, "0x%x, ", asd[i]);
-        if (i % 20 == 0 && i > 0) {
-            fprintf(stderr, "\n");
+    std::string ptx;
+    try {
+        // Path is relative to the *current working directory* when your .so is loaded.
+        // If you run from repo root, this is likely "crtx/kernel.ptx".
+        // If you run from crtx/, this is "kernel.ptx".
+        //
+        // Start simple:
+        //   bash compileOptiX.sh   (from crtx)
+        //   pytest ...             (from repo root)
+        //
+        // So try "crtx/kernel.ptx" first, then fallback to "kernel.ptx".
+        try {
+            ptx = load_ptx_file("crtx/kernel.ptx");
+        } catch (...) {
+            ptx = load_ptx_file("kernel.ptx");
         }
+    } catch (const std::exception& e) {
+        fprintf(stderr, "[RTX] Failed to load PTX: %s\n", e.what());
+        return -1;
     }
-#endif
 
-    std::string input(reinterpret_cast<const char*>(buff));
-    size_t      inputSize = input.length();
+    const char* input     = ptx.c_str();
+    const size_t inputSize = ptx.size();
 
     OPTIX_CHECK_LOG(
         optixModuleCreateFromPTX(
-            state.context, 
-            &module_compile_options, 
+            state.context,
+            &module_compile_options,
             &state.pipeline_compile_options,
-            &input[0], 
-            inputSize, 
-            log, 
+            input,
+            inputSize,
+            log,
             &logSize,
             &state.ptx_module
         )
     );
+
     return 0;
 }
 
+
 int createProgramGroups(State& state)
 {
     char   log[2048];
@@ -376,61 +386,83 @@ int buildRTX_internal(State& state, uint64_t hash, float3* verts, int64_t numVer
 
 int cleanup_internal(State& state)
 {
-    //In case cleanup has already been called, don't call it again
+    // If CUDA context was never created, nothing to do.
     if (state.cuda.context == 0) {
+        state.valid = false;
         return 0;
     }
+
     state.valid = false;
+
+    // Free scene memory safely
     state.scene.freeMem();
-    
-    OPTIX_CHECK(optixPipelineDestroy(state.pipeline));
-    OPTIX_CHECK(optixProgramGroupDestroy(state.raygen));
-    OPTIX_CHECK(optixProgramGroupDestroy(state.miss));
-    OPTIX_CHECK(optixProgramGroupDestroy(state.hit));
-    state.pipeline = 0;
-    state.raygen = 0;
-    state.miss = 0;
-    state.hit = 0;
-    
-    OPTIX_CHECK(optixModuleDestroy(state.ptx_module));
-    OPTIX_CHECK(optixDeviceContextDestroy(state.context));
-    state.ptx_module = 0;
-    state.context = 0;
+    state.scene.hash = uint64_t(-1);
 
-    CUresult err = CUDA_SUCCESS;
-    err = cuMemFree(state.sbt.raygenRecord);
-    assert(err == CUDA_SUCCESS);
-    err = cuMemFree(state.sbt.missRecordBase);
-    assert(err == CUDA_SUCCESS);
-    err = cuMemFree(state.sbt.hitgroupRecordBase);
-    assert(err == CUDA_SUCCESS);
+    // OptiX objects (ONLY destroy if non-zero)
+    if (state.pipeline) {
+        optixPipelineDestroy(state.pipeline);
+        state.pipeline = 0;
+    }
+    if (state.raygen) {
+        optixProgramGroupDestroy(state.raygen);
+        state.raygen = 0;
+    }
+    if (state.miss) {
+        optixProgramGroupDestroy(state.miss);
+        state.miss = 0;
+    }
+    if (state.hit) {
+        optixProgramGroupDestroy(state.hit);
+        state.hit = 0;
+    }
+    if (state.ptx_module) {
+        optixModuleDestroy(state.ptx_module);
+        state.ptx_module = 0;
+    }
+    if (state.context) {
+        optixDeviceContextDestroy(state.context);
+        state.context = 0;
+    }
+
+    // Free SBT buffers (ONLY if non-zero)
+    if (state.sbt.raygenRecord) {
+        cuMemFree(state.sbt.raygenRecord);
+    }
+    if (state.sbt.missRecordBase) {
+        cuMemFree(state.sbt.missRecordBase);
+    }
+    if (state.sbt.hitgroupRecordBase) {
+        cuMemFree(state.sbt.hitgroupRecordBase);
+    }
     memset(&state.sbt, 0, sizeof(state.sbt));
 
-    err = cuMemFree(state.d_params);
-    assert(err == CUDA_SUCCESS);
-    err = cuMemFree(state.d_rays);
-    assert(err == CUDA_SUCCESS);
-    err = cuMemFree(state.d_hits);
-    assert(err == CUDA_SUCCESS);
+    // Free params/rays/hits (ONLY if non-zero)
+    if (state.d_params) cuMemFree(state.d_params);
+    if (state.d_rays)   cuMemFree(state.d_rays);
+    if (state.d_hits)   cuMemFree(state.d_hits);
+
     state.d_params = 0;
-    state.d_hits = 0;
-    state.d_rays = 0;
-    state.d_hits_size = 0;
+    state.d_rays   = 0;
+    state.d_hits   = 0;
     state.d_rays_size = 0;
+    state.d_hits_size = 0;
 
-    err = cuStreamDestroy(state.cuda.stream);
-    assert(err == CUDA_SUCCESS);
+    // Stream/context teardown
+    if (state.cuda.stream) {
+        cuStreamDestroy(state.cuda.stream);
+        state.cuda.stream = 0;
+    }
 
-    err = cuDevicePrimaryCtxRelease(state.cuda.device);
-    assert(err == CUDA_SUCCESS);
+    // CUdevice is an integer handle and 0 is a valid device, not an "unset"
+    // marker; context != 0 was checked on entry, so release unconditionally.
+    cuDevicePrimaryCtxRelease(state.cuda.device);
+    state.cuda.device = 0;
 
-    state.cuda.stream = 0;
-    state.cuda.device = 0;
     state.cuda.context = 0;
-
-    return err;
+    return 0;
 }
 
+
 int traceRTX_internal(State& state, Ray* rays, Hit* hits, int size) {
     if (!state.valid) {
         fprintf(stderr, "State is invalid!");
@@ -555,22 +587,35 @@ int initBuffers_internal(State& state, int numRays) {
         fprintf(stderr, "State is invalid!");
         return -2;
     }
+
     CUresult err = CUDA_SUCCESS;
-    size_t newRaysSize = sizeof(Ray) * numRays;
-    size_t newHitsSize = sizeof(Hit) * numRays;
+    size_t newRaysSize = sizeof(Ray) * (size_t)numRays;
+    size_t newHitsSize = sizeof(Hit) * (size_t)numRays;
+
     if (newRaysSize != state.d_rays_size || newHitsSize != state.d_hits_size) {
-        err = cuMemFree(state.d_rays);
-        CHECK_CUDA_LOG(err, "Failed to deallocate old input data buffer");
-        err = cuMemFree(state.d_hits);
-        CHECK_CUDA_LOG(err, "Failed to deallocate old output data buffer");
 
+        // Only free if we actually have something allocated
+        if (state.d_rays) {
+            err = cuMemFree(state.d_rays);
+            CHECK_CUDA_LOG(err, "Failed to deallocate old input data buffer");
+            state.d_rays = 0;
+        }
+        if (state.d_hits) {
+            err = cuMemFree(state.d_hits);
+            CHECK_CUDA_LOG(err, "Failed to deallocate old output data buffer");
+            state.d_hits = 0;
+        }
+
+        // Allocate new buffers
         state.d_rays_size = newRaysSize;
         err = cuMemAlloc(&state.d_rays, state.d_rays_size);
         CHECK_CUDA_LOG(err, "Failed to allocate input data buffer");
+
         state.d_hits_size = newHitsSize;
         err = cuMemAlloc(&state.d_hits, state.d_hits_size);
         CHECK_CUDA_LOG(err, "Failed to allocate output data buffer");
     }
+
     return err;
 }
 
diff --git a/crtx/internal.h b/crtx/internal.h
index 5f69a1b..7ebfe30 100644
--- a/crtx/internal.h
+++ b/crtx/internal.h
@@ -1,5 +1,13 @@
 #pragma once
 
+#include <stdint.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string>
+#include <fstream>
+#include <sstream>
+#include <stdexcept>  // std::runtime_error, thrown by load_ptx_file()
+// NOTE: dllmain.cpp must include <cuda.h> and <optix.h> BEFORE including this file.
 #ifdef WIN32
 #define __align__(X) __declspec(align(X))
 #else
@@ -12,12 +20,10 @@ struct Record {
     T data;
 };
 
-struct Empty {
-};
-
+struct Empty {};
 typedef Record<Empty>   RayGenSbtRecord;
-typedef Record<Empty>     MissSbtRecord;
-typedef Record<Empty> HitGroupSbtRecord;
+typedef Record<Empty>   MissSbtRecord;
+typedef Record<Empty>   HitGroupSbtRecord;
 
 #define OPTIX_CHECK( call )                                                    \
     do                                                                         \
@@ -39,12 +45,12 @@ typedef Record<Empty> HitGroupSbtRecord;
     {                                                                          \
         OptixResult res = call;                                                \
         const size_t sizeof_log_returned = logSize;                            \
-        logSize = sizeof( log ); /* reset sizeof_log for future calls */       \
+        logSize = sizeof( log );                                               \
         if( res != OPTIX_SUCCESS )                                             \
         {                                                                      \
             std::stringstream ss;                                              \
             ss << "Optix call '" << #call << "' failed with code("             \
-            << res << "): " __FILE__ ":"                                       \
+               << res << "): " __FILE__ ":"                                    \
                << __LINE__ << ")\nLog:\n" << log                               \
                << ( sizeof_log_returned > sizeof( log ) ? "<TRUNCATED>" : "" ) \
                << "\n";                                                        \
@@ -53,6 +59,23 @@ typedef Record<Empty> HitGroupSbtRecord;
         }                                                                      \
     } while( 0 )
 
+inline void checkCuda(int err, const char* msg, const char* fn, int line) {
+    if (!msg || msg[0] == '\0') {
+        fprintf(stderr, "CUDA Error[%d] at %s[%d]\n", err, fn, line);
+    } else {
+        fprintf(stderr, "CUDA Error[%d] at %s[%d] : %s\n", err, fn, line, msg);
+    }
+}
+
+#define CHECK_CUDA_LOG(err, msg) \
+    do { \
+        if ((err) != CUDA_SUCCESS) { \
+            checkCuda(int(err), msg, __FUNCTION__, __LINE__); \
+            return err; \
+        } \
+    } while (false)
+
+#define CHECK_CUDA(err) CHECK_CUDA_LOG(err, "")
 
 struct Scene {
     Scene() {
@@ -60,9 +83,8 @@ struct Scene {
         memory = 0;
         hash = uint64_t(-1);
     }
-    ~Scene() {
-        freeMem();
-    }
+    ~Scene() { freeMem(); }
+
     void freeMem() {
         if (memory) {
             cuMemFree(memory);
@@ -84,98 +106,48 @@ struct State {
     OptixPipeline               pipeline = 0;
 
     OptixProgramGroup raygen = 0;
-    OptixProgramGroup miss = 0;
-    OptixProgramGroup hit = 0;
+    OptixProgramGroup miss   = 0;
+    OptixProgramGroup hit    = 0;
 
     OptixShaderBindingTable sbt = {};
 
     CUdeviceptr d_params = 0;
 
     CUdeviceptr d_rays = 0;
-    size_t d_rays_size = 0;
+    size_t      d_rays_size = 0;
     CUdeviceptr d_hits = 0;
-    size_t d_hits_size = 0;
+    size_t      d_hits_size = 0;
 
     struct {
-        CUdevice device = 0;
-        CUstream stream = 0;
+        CUdevice  device  = 0;
+        CUstream  stream  = 0;
         CUcontext context = 0;
     } cuda;
 
     bool valid = false;
 };
 
-inline void checkCuda(int err, const char* msg, const char* fn, int line) {
-    if (!msg) {
-        fprintf(stderr, "CUDA Error[%d] at %s[%d]\n", err, fn, line);
+// Read a PTX file into a std::string (throws on failure)
+inline std::string load_ptx_file(const char* path)
+{
+    std::ifstream f(path, std::ios::in | std::ios::binary);
+    if (!f) {
+        throw std::runtime_error(std::string("Could not open PTX file: ") + path);
     }
-    else {
-        fprintf(stderr, "CUDA Error[%d] at %s[%d] : %s\n", err, fn, line, msg);
-    }
-}
 
-#define CHECK_CUDA_LOG(err, msg) \
-    do { \
-        if (err != CUDA_SUCCESS) { \
-            checkCuda(int(err), msg, __FUNCTION__, __LINE__); \
-            return err; \
-        }\
-    } while (false)
+    f.seekg(0, std::ios::end);
+    std::streamoff size = f.tellg();
+    f.seekg(0, std::ios::beg);
 
-#define CHECK_CUDA(err) CHECK_CUDA_LOG(err, "")
+    if (size <= 0) {
+        throw std::runtime_error(std::string("PTX file is empty: ") + path);
+    }
 
-// Our GPU source code
-static const uint32_t buff[] = {
-	0x2f0a2f2f, 0x6547202f, 0x6172656e, 0x20646574, 0x4e207962, 0x49444956, 0x564e2041, 0x43204d56, 0x69706d6f, 0xa72656c, 0x2f0a2f2f, 0x6f43202f, 0x6c69706d, 0x42207265, 0x646c6975, 0x3a444920, 0x2d4c4320, 0x33303033, 0x31313433, 0x202f2f0a, 0x61647543,
-	0x6d6f6320, 0x616c6970, 0x6e6f6974, 0x6f6f7420, 0x202c736c, 0x656c6572, 0x20657361, 0x342e3131, 0x3156202c, 0x2e342e31, 0x2f0a3834, 0x6142202f, 0x20646573, 0x4e206e6f, 0x204d5656, 0x2e302e37, 0x2f2f0a31, 0x762e0a0a, 0x69737265, 0x37206e6f,
-	0x2e0a342e, 0x67726174, 0x73207465, 0x30355f6d, 0x64612e0a, 0x73657264, 0x69735f73, 0x3620657a, 0x90a0a34, 0x2e202f2f, 0x626f6c67, 0x5f5f096c, 0x67796172, 0x5f5f6e65, 0x6e69616d, 0x6f632e0a, 0x2074736e, 0x696c612e, 0x38206e67, 0x38622e20,
-	0x72617020, 0x5b736d61, 0x3b5d3432, 0x762e0a0a, 0x62697369, 0x2e20656c, 0x72746e65, 0x5f5f2079, 0x67796172, 0x5f5f6e65, 0x6e69616d, 0x7b0a2928, 0x722e090a, 0x2e206765, 0x20323366, 0x3c662509, 0x3b3e3031, 0x722e090a, 0x2e206765, 0x20323362,
-	0x3c722509, 0x3b3e3632, 0x722e090a, 0x2e206765, 0x20343662, 0x64722509, 0x3e30313c, 0xa0a0a3b, 0x202f2f09, 0x69676562, 0x6e69206e, 0x656e696c, 0x6d736120, 0x6163090a, 0x28206c6c, 0x29317225, 0x6f5f202c, 0x78697470, 0x7465675f, 0x75616c5f,
-	0x5f68636e, 0x65646e69, 0x2c785f78, 0x3b292820, 0x2f2f090a, 0x646e6520, 0x6c6e6920, 0x20656e69, 0xa6d7361, 0x202f2f09, 0x69676562, 0x6e69206e, 0x656e696c, 0x6d736120, 0x6163090a, 0x28206c6c, 0x29327225, 0x6f5f202c, 0x78697470, 0x7465675f,
-	0x75616c5f, 0x5f68636e, 0x65646e69, 0x2c795f78, 0x3b292820, 0x2f2f090a, 0x646e6520, 0x6c6e6920, 0x20656e69, 0xa6d7361, 0x202f2f09, 0x69676562, 0x6e69206e, 0x656e696c, 0x6d736120, 0x6163090a, 0x28206c6c, 0x29337225, 0x6f5f202c, 0x78697470,
-	0x7465675f, 0x75616c5f, 0x5f68636e, 0x65646e69, 0x2c7a5f78, 0x3b292820, 0x2f2f090a, 0x646e6520, 0x6c6e6920, 0x20656e69, 0xa6d7361, 0x202f2f09, 0x69676562, 0x6e69206e, 0x656e696c, 0x6d736120, 0x6163090a, 0x28206c6c, 0x29347225, 0x6f5f202c,
-	0x78697470, 0x7465675f, 0x75616c5f, 0x5f68636e, 0x656d6964, 0x6f69736e, 0x2c785f6e, 0x3b292820, 0x2f2f090a, 0x646e6520, 0x6c6e6920, 0x20656e69, 0xa6d7361, 0x202f2f09, 0x69676562, 0x6e69206e, 0x656e696c, 0x6d736120, 0x6163090a, 0x28206c6c,
-	0x29357225, 0x6f5f202c, 0x78697470, 0x7465675f, 0x75616c5f, 0x5f68636e, 0x656d6964, 0x6f69736e, 0x2c795f6e, 0x3b292820, 0x2f2f090a, 0x646e6520, 0x6c6e6920, 0x20656e69, 0xa6d7361, 0x64616d09, 0x2e6f6c2e, 0x20323373, 0x32722509, 0x25202c30,
-	0x202c3572, 0x2c337225, 0x32722520, 0x6d090a3b, 0x6c2e6461, 0x33732e6f, 0x25092032, 0x2c313272, 0x32722520, 0x25202c30, 0x202c3472, 0x3b317225, 0x646c090a, 0x6e6f632e, 0x752e7473, 0x9203436, 0x32647225, 0x705b202c, 0x6d617261, 0x5d382b73,
-	0x63090a3b, 0x2e617476, 0x672e6f74, 0x61626f6c, 0x36752e6c, 0x25092034, 0x2c336472, 0x64722520, 0x90a3b32, 0x2e6c756d, 0x65646977, 0x3233752e, 0x72250920, 0x202c3464, 0x31327225, 0x3233202c, 0x61090a3b, 0x732e6464, 0x9203436, 0x35647225,
-	0x7225202c, 0x202c3364, 0x34647225, 0x6c090a3b, 0x6c672e64, 0x6c61626f, 0x3233662e, 0x66250920, 0x5b202c31, 0x35647225, 0x90a3b5d, 0x672e646c, 0x61626f6c, 0x33662e6c, 0x25092032, 0x202c3266, 0x6472255b, 0x5d342b35, 0x6c090a3b, 0x6c672e64,
-	0x6c61626f, 0x3233662e, 0x66250920, 0x5b202c33, 0x35647225, 0x3b5d382b, 0x646c090a, 0x6f6c672e, 0x2e6c6162, 0x20323366, 0x37662509, 0x255b202c, 0x2b356472, 0x3b5d3231, 0x646c090a, 0x6f6c672e, 0x2e6c6162, 0x20323366, 0x34662509, 0x255b202c,
-	0x2b356472, 0x3b5d3631, 0x646c090a, 0x6f6c672e, 0x2e6c6162, 0x20323366, 0x35662509, 0x255b202c, 0x2b356472, 0x3b5d3032, 0x646c090a, 0x6f6c672e, 0x2e6c6162, 0x20323366, 0x36662509, 0x255b202c, 0x2b356472, 0x3b5d3432, 0x646c090a, 0x6f6c672e,
-	0x2e6c6162, 0x20323366, 0x38662509, 0x255b202c, 0x2b356472, 0x3b5d3832, 0x646c090a, 0x6e6f632e, 0x752e7473, 0x9203436, 0x31647225, 0x705b202c, 0x6d617261, 0xa3b5d73, 0x766f6d09, 0x3233662e, 0x66250920, 0x30202c39, 0x30303066, 0x30303030,
-	0x90a3b30, 0x2e766f6d, 0x20323375, 0x31722509, 0x31202c34, 0x6d090a3b, 0x752e766f, 0x9203233, 0x35317225, 0x3b30202c, 0x2f2f090a, 0x67656220, 0x69206e69, 0x6e696c6e, 0x73612065, 0x63090a6d, 0x206c6c61, 0x37722528, 0x7225202c, 0x25202c38,
-	0x202c3972, 0x30317225, 0x5f202c29, 0x6974706f, 0x72745f78, 0x5f656361, 0x28202c34, 0x31647225, 0x6625202c, 0x25202c31, 0x202c3266, 0x2c336625, 0x34662520, 0x6625202c, 0x25202c35, 0x202c3666, 0x2c376625, 0x38662520, 0x6625202c, 0x25202c39,
-	0x2c343172, 0x31722520, 0x25202c35, 0x2c353172, 0x31722520, 0x25202c34, 0x2c353172, 0x32722520, 0x25202c32, 0x2c333272, 0x32722520, 0x25202c34, 0x29353272, 0x2f090a3b, 0x6e65202f, 0x6e692064, 0x656e696c, 0x6d736120, 0x646c090a, 0x6e6f632e,
-	0x752e7473, 0x9203436, 0x36647225, 0x705b202c, 0x6d617261, 0x36312b73, 0x90a3b5d, 0x61747663, 0x2e6f742e, 0x626f6c67, 0x752e6c61, 0x9203436, 0x37647225, 0x7225202c, 0xa3b3664, 0x6c756d09, 0x6469772e, 0x33752e65, 0x25092032, 0x2c386472,
-	0x32722520, 0x31202c31, 0x90a3b36, 0x2e646461, 0x20343673, 0x64722509, 0x25202c39, 0x2c376472, 0x64722520, 0x90a3b38, 0x672e7473, 0x61626f6c, 0x33752e6c, 0x5b092032, 0x39647225, 0x25202c5d, 0xa3b3772, 0x2e747309, 0x626f6c67, 0x752e6c61,
-	0x9203233, 0x6472255b, 0x5d342b39, 0x7225202c, 0x90a3b38, 0x672e7473, 0x61626f6c, 0x33752e6c, 0x5b092032, 0x39647225, 0x2c5d382b, 0x39722520, 0x73090a3b, 0x6c672e74, 0x6c61626f, 0x3233752e, 0x255b0920, 0x2b396472, 0x2c5d3231, 0x31722520,
-	0x90a3b30, 0x3b746572, 0xa7d0a0a, 0x202f2f09, 0x6f6c672e, 0x5f096c62, 0x73696d5f, 0x6d5f5f73, 0xa737369, 0x7369762e, 0x656c6269, 0x6e652e20, 0x20797274, 0x696d5f5f, 0x5f5f7373, 0x7373696d, 0x7b0a2928, 0x722e090a, 0x2e206765, 0x20323362,
-	0x3c722509, 0xa3b3e35, 0x6d090a0a, 0x752e766f, 0x9203233, 0x2c317225, 0x30312d20, 0x33313238, 0x32333430, 0x2f090a3b, 0x6562202f, 0x206e6967, 0x696c6e69, 0x6120656e, 0x90a6d73, 0x6c6c6163, 0x706f5f20, 0x5f786974, 0x5f746573, 0x6c796170,
-	0x5f64616f, 0x28202c30, 0x29317225, 0x2f090a3b, 0x6e65202f, 0x6e692064, 0x656e696c, 0x6d736120, 0x6f6d090a, 0x33752e76, 0x25092032, 0x202c3272, 0x35363031, 0x32333533, 0xa3b3631, 0x202f2f09, 0x69676562, 0x6e69206e, 0x656e696c, 0x6d736120,
-	0x6163090a, 0x5f206c6c, 0x6974706f, 0x65735f78, 0x61705f74, 0x616f6c79, 0x2c315f64, 0x72252820, 0xa3b2932, 0x202f2f09, 0x20646e65, 0x696c6e69, 0x6120656e, 0x90a6d73, 0x2e766f6d, 0x20323375, 0x34722509, 0x3b30202c, 0x2f2f090a, 0x67656220,
-	0x69206e69, 0x6e696c6e, 0x73612065, 0x63090a6d, 0x206c6c61, 0x74706f5f, 0x735f7869, 0x705f7465, 0x6f6c7961, 0x325f6461, 0x2528202c, 0x3b293472, 0x2f2f090a, 0x646e6520, 0x6c6e6920, 0x20656e69, 0xa6d7361, 0x202f2f09, 0x69676562, 0x6e69206e,
-	0x656e696c, 0x6d736120, 0x6163090a, 0x5f206c6c, 0x6974706f, 0x65735f78, 0x61705f74, 0x616f6c79, 0x2c335f64, 0x72252820, 0xa3b2934, 0x202f2f09, 0x20646e65, 0x696c6e69, 0x6120656e, 0x90a6d73, 0x3b746572, 0xa7d0a0a, 0x202f2f09, 0x6f6c672e,
-	0x5f096c62, 0x6f6c635f, 0x74736573, 0x5f746968, 0x6968635f, 0x762e0a74, 0x62697369, 0x2e20656c, 0x72746e65, 0x5f5f2079, 0x736f6c63, 0x68747365, 0x5f5f7469, 0x74696863, 0x7b0a2928, 0x722e090a, 0x2e206765, 0x20323366, 0x3c662509, 0x3b3e3733,
-	0x722e090a, 0x2e206765, 0x20323362, 0x3c722509, 0x3b3e3031, 0x722e090a, 0x2e206765, 0x20343662, 0x64722509, 0x3b3e333c, 0x90a0a0a, 0x62202f2f, 0x6e696765, 0x6c6e6920, 0x20656e69, 0xa6d7361, 0x6c616309, 0x2528206c, 0x2c293166, 0x706f5f20,
-	0x5f786974, 0x5f746567, 0x5f796172, 0x78616d74, 0x2928202c, 0x2f090a3b, 0x6e65202f, 0x6e692064, 0x656e696c, 0x6d736120, 0x7663090a, 0x7a722e74, 0x74662e69, 0x33752e7a, 0x33662e32, 0x25092032, 0x202c3972, 0x3b316625, 0x2f2f090a, 0x67656220,
-	0x69206e69, 0x6e696c6e, 0x73612065, 0x63090a6d, 0x206c6c61, 0x64722528, 0x202c2931, 0x74706f5f, 0x675f7869, 0x675f7465, 0x745f7361, 0x65766172, 0x62617372, 0x685f656c, 0x6c646e61, 0x28202c65, 0x90a3b29, 0x65202f2f, 0x6920646e, 0x6e696c6e,
-	0x73612065, 0x2f090a6d, 0x6562202f, 0x206e6967, 0x696c6e69, 0x6120656e, 0x90a6d73, 0x6c6c6163, 0x72252820, 0x202c2931, 0x74706f5f, 0x725f7869, 0x5f646165, 0x6d697270, 0x76697469, 0x64695f65, 0x28202c78, 0x90a3b29, 0x65202f2f, 0x6920646e,
-	0x6e696c6e, 0x73612065, 0x2f090a6d, 0x6562202f, 0x206e6967, 0x696c6e69, 0x6120656e, 0x90a6d73, 0x6c6c6163, 0x72252820, 0x202c2932, 0x74706f5f, 0x725f7869, 0x5f646165, 0x5f746273, 0x5f736167, 0x2c786469, 0x3b292820, 0x2f2f090a, 0x646e6520,
-	0x6c6e6920, 0x20656e69, 0xa6d7361, 0x202f2f09, 0x69676562, 0x6e69206e, 0x656e696c, 0x6d736120, 0x6163090a, 0x28206c6c, 0x29326625, 0x6f5f202c, 0x78697470, 0x7465675f, 0x7961725f, 0x6d69745f, 0x28202c65, 0x90a3b29, 0x65202f2f, 0x6920646e,
-	0x6e696c6e, 0x73612065, 0x2f090a6d, 0x6562202f, 0x206e6967, 0x696c6e69, 0x6120656e, 0x90a6d73, 0x6c6c6163, 0x66252820, 0x25202c33, 0x202c3466, 0x2c356625, 0x36662520, 0x6625202c, 0x25202c37, 0x202c3866, 0x2c396625, 0x31662520, 0x25202c30,
-	0x29313166, 0x6f5f202c, 0x78697470, 0x7465675f, 0x6972745f, 0x6c676e61, 0x65765f65, 0x78657472, 0x7461645f, 0x28202c61, 0x31647225, 0x7225202c, 0x25202c31, 0x202c3272, 0x29326625, 0x2f090a3b, 0x6e65202f, 0x6e692064, 0x656e696c, 0x6d736120,
-	0x7573090a, 0x74662e62, 0x33662e7a, 0x25092032, 0x2c333166, 0x36662520, 0x6625202c, 0x90a3b33, 0x2e627573, 0x2e7a7466, 0x20323366, 0x31662509, 0x25202c34, 0x202c3766, 0x3b346625, 0x7573090a, 0x74662e62, 0x33662e7a, 0x25092032, 0x2c353166,
-	0x38662520, 0x6625202c, 0x90a3b35, 0x2e627573, 0x2e7a7466, 0x20323366, 0x31662509, 0x25202c36, 0x202c3966, 0x3b336625, 0x7573090a, 0x74662e62, 0x33662e7a, 0x25092032, 0x2c373166, 0x31662520, 0x25202c30, 0xa3b3466, 0x62757309, 0x7a74662e,
-	0x3233662e, 0x66250920, 0x202c3831, 0x31316625, 0x6625202c, 0x90a3b35, 0x2e6c756d, 0x2e7a7466, 0x20323366, 0x31662509, 0x25202c39, 0x2c343166, 0x31662520, 0x90a3b38, 0x2e6c756d, 0x2e7a7466, 0x20323366, 0x32662509, 0x25202c30, 0x2c353166,
-	0x31662520, 0x90a3b37, 0x2e627573, 0x2e7a7466, 0x20323366, 0x32662509, 0x25202c31, 0x2c393166, 0x32662520, 0x90a3b30, 0x2e6c756d, 0x2e7a7466, 0x20323366, 0x32662509, 0x25202c32, 0x2c333166, 0x31662520, 0x90a3b38, 0x2e6c756d, 0x2e7a7466,
-	0x20323366, 0x32662509, 0x25202c33, 0x2c353166, 0x31662520, 0x90a3b36, 0x2e627573, 0x2e7a7466, 0x20323366, 0x32662509, 0x25202c34, 0x2c323266, 0x32662520, 0x90a3b33, 0x2e6c756d, 0x2e7a7466, 0x20323366, 0x32662509, 0x25202c35, 0x2c333166,
-	0x31662520, 0x90a3b37, 0x2e6c756d, 0x2e7a7466, 0x20323366, 0x32662509, 0x25202c36, 0x2c343166, 0x31662520, 0x90a3b36, 0x2e627573, 0x2e7a7466, 0x20323366, 0x32662509, 0x25202c37, 0x2c353266, 0x32662520, 0x90a3b36, 0x2e6c756d, 0x2e7a7466,
-	0x20323366, 0x32662509, 0x25202c38, 0x2c343266, 0x32662520, 0x90a3b34, 0x2e616d66, 0x662e6e72, 0x662e7a74, 0x9203233, 0x39326625, 0x6625202c, 0x202c3132, 0x31326625, 0x6625202c, 0xa3b3832, 0x616d6609, 0x2e6e722e, 0x2e7a7466, 0x20323366,
-	0x33662509, 0x25202c30, 0x2c373266, 0x32662520, 0x25202c37, 0x3b393266, 0x7372090a, 0x2e747271, 0x72707061, 0x662e786f, 0x662e7a74, 0x9203233, 0x31336625, 0x6625202c, 0xa3b3033, 0x6c756d09, 0x7a74662e, 0x3233662e, 0x66250920, 0x202c3233,
-	0x31336625, 0x6625202c, 0xa3b3132, 0x6c756d09, 0x7a74662e, 0x3233662e, 0x66250920, 0x202c3333, 0x34326625, 0x6625202c, 0xa3b3133, 0x67656e09, 0x7a74662e, 0x3233662e, 0x66250920, 0x202c3433, 0x33336625, 0x6d090a3b, 0x662e6c75, 0x662e7a74,
-	0x9203233, 0x35336625, 0x6625202c, 0x202c3133, 0x37326625, 0x63090a3b, 0x722e7476, 0x33662e6e, 0x33752e32, 0x25092032, 0x2c363366, 0x39722520, 0x6d090a3b, 0x622e766f, 0x9203233, 0x2c357225, 0x33662520, 0x90a3b36, 0x62202f2f, 0x6e696765,
-	0x6c6e6920, 0x20656e69, 0xa6d7361, 0x6c616309, 0x6f5f206c, 0x78697470, 0x7465735f, 0x7961705f, 0x64616f6c, 0x202c305f, 0x35722528, 0x90a3b29, 0x65202f2f, 0x6920646e, 0x6e696c6e, 0x73612065, 0x6d090a6d, 0x622e766f, 0x9203233, 0x2c367225,
-	0x33662520, 0x90a3b32, 0x62202f2f, 0x6e696765, 0x6c6e6920, 0x20656e69, 0xa6d7361, 0x6c616309, 0x6f5f206c, 0x78697470, 0x7465735f, 0x7961705f, 0x64616f6c, 0x202c315f, 0x36722528, 0x90a3b29, 0x65202f2f, 0x6920646e, 0x6e696c6e, 0x73612065,
-	0x6d090a6d, 0x622e766f, 0x9203233, 0x2c377225, 0x33662520, 0x90a3b34, 0x62202f2f, 0x6e696765, 0x6c6e6920, 0x20656e69, 0xa6d7361, 0x6c616309, 0x6f5f206c, 0x78697470, 0x7465735f, 0x7961705f, 0x64616f6c, 0x202c325f, 0x37722528, 0x90a3b29,
-	0x65202f2f, 0x6920646e, 0x6e696c6e, 0x73612065, 0x6d090a6d, 0x622e766f, 0x9203233, 0x2c387225, 0x33662520, 0x90a3b35, 0x62202f2f, 0x6e696765, 0x6c6e6920, 0x20656e69, 0xa6d7361, 0x6c616309, 0x6f5f206c, 0x78697470, 0x7465735f, 0x7961705f,
-	0x64616f6c, 0x202c335f, 0x38722528, 0x90a3b29, 0x65202f2f, 0x6920646e, 0x6e696c6e, 0x73612065, 0x72090a6d, 0xa3b7465, 0xa0a7d0a, 0x0
-};
\ No newline at end of file
+    std::string ptx;
+    ptx.resize(static_cast<size_t>(size));
+    f.read(&ptx[0], size);
+    if (!f) {
+        throw std::runtime_error(std::string("Failed to read PTX file: ") + path);
+    }
+    return ptx;
+}
diff --git a/crtx/kernel.cu b/crtx/kernel.cu
index b9cccfe..35d982a 100644
--- a/crtx/kernel.cu
+++ b/crtx/kernel.cu
@@ -1,6 +1,16 @@
 #include <optix.h>
 #include "common.h"
 
+// ---- Compatibility helpers (older sample code used these names) ----
+#ifndef int_as_float  // #ifndef only detects a macro; a function named int_as_float would not suppress the define
+#define int_as_float __int_as_float  // unprefixed name expands to __int_as_float
+#endif
+
+#ifndef float_as_int  // same macro-only guard for float_as_int
+#define float_as_int __float_as_int  // unprefixed name expands to __float_as_int
+#endif
+// -------------------------------------------------------------------
+
 typedef unsigned long long uint64_t;
 
 extern "C" {
diff --git a/rtxpy/rtx.py b/rtxpy/rtx.py
index 54f8932..1828977 100644
--- a/rtxpy/rtx.py
+++ b/rtxpy/rtx.py
@@ -18,6 +18,7 @@ def free_optix_resources():
     c_lib = None
 
 class RTX():
+
     def __init__(self):
         global c_lib
         if c_lib != None:
diff --git a/rtxpy/tests/test_simple.py b/rtxpy/tests/test_simple.py
index fa016f5..f3021ba 100644
--- a/rtxpy/tests/test_simple.py
+++ b/rtxpy/tests/test_simple.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pytest
+
 from rtxpy import RTX, has_cupy
 
 

From ed8b52e279136e1684f4d4d4cd168724ba52c528 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 03:34:31 -0800
Subject: [PATCH 02/14] helper file while testing

---
 clean_build.sh | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 clean_build.sh

diff --git a/clean_build.sh b/clean_build.sh
new file mode 100644
index 0000000..5138a27
--- /dev/null
+++ b/clean_build.sh
@@ -0,0 +1,5 @@
+set -e  # abort on the first failing step so cmake never runs outside ./build
+rm -rf build  # start from an empty build tree
+mkdir build
+cd build
+cmake .. && cmake --build . -j  # configure, then build only if configure succeeded

From e66918d2dbac7b597283273c8459b9ee4fcde114 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 03:35:09 -0800
Subject: [PATCH 03/14] added optix_9.1 headers

---
 crtx/optix_9.1/internal/optix_device_impl.h   | 2785 +++++++++++++++++
 .../internal/optix_device_impl_coop_vec.h     |  951 ++++++
 .../optix_device_impl_transformations.h       |  422 +++
 crtx/optix_9.1/internal/optix_micromap_impl.h |  185 ++
 crtx/optix_9.1/optix.h                        |   38 +
 crtx/optix_9.1/optix_denoiser_tiling.h        |  363 +++
 crtx/optix_9.1/optix_device.h                 | 2440 +++++++++++++++
 crtx/optix_9.1/optix_function_table.h         |  444 +++
 .../optix_function_table_definition.h         |   59 +
 crtx/optix_9.1/optix_host.h                   | 1225 ++++++++
 crtx/optix_9.1/optix_micromap.h               |   76 +
 crtx/optix_9.1/optix_stack_size.h             |  345 ++
 crtx/optix_9.1/optix_stubs.h                  |  828 +++++
 crtx/optix_9.1/optix_types.h                  | 2747 ++++++++++++++++
 14 files changed, 12908 insertions(+)
 create mode 100644 crtx/optix_9.1/internal/optix_device_impl.h
 create mode 100644 crtx/optix_9.1/internal/optix_device_impl_coop_vec.h
 create mode 100644 crtx/optix_9.1/internal/optix_device_impl_transformations.h
 create mode 100644 crtx/optix_9.1/internal/optix_micromap_impl.h
 create mode 100644 crtx/optix_9.1/optix.h
 create mode 100644 crtx/optix_9.1/optix_denoiser_tiling.h
 create mode 100644 crtx/optix_9.1/optix_device.h
 create mode 100644 crtx/optix_9.1/optix_function_table.h
 create mode 100644 crtx/optix_9.1/optix_function_table_definition.h
 create mode 100644 crtx/optix_9.1/optix_host.h
 create mode 100644 crtx/optix_9.1/optix_micromap.h
 create mode 100644 crtx/optix_9.1/optix_stack_size.h
 create mode 100644 crtx/optix_9.1/optix_stubs.h
 create mode 100644 crtx/optix_9.1/optix_types.h

diff --git a/crtx/optix_9.1/internal/optix_device_impl.h b/crtx/optix_9.1/internal/optix_device_impl.h
new file mode 100644
index 0000000..556fd85
--- /dev/null
+++ b/crtx/optix_9.1/internal/optix_device_impl.h
@@ -0,0 +1,2785 @@
+/*
+* SPDX-FileCopyrightText: Copyright (c) 2019 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+* SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+*
+* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+* property and proprietary rights in and to this material, related
+* documentation and any modifications thereto. Any use, reproduction,
+* disclosure or distribution of this material and related documentation
+* without an express license agreement from NVIDIA CORPORATION or
+* its affiliates is strictly prohibited.
+*/
+/**
+* @file   optix_device_impl.h
+* @author NVIDIA Corporation
+* @brief  OptiX public API
+*
+* OptiX public API Reference - Device side implementation
+*/
+
+#if !defined( __OPTIX_INCLUDE_INTERNAL_HEADERS__ )
+#error("optix_device_impl.h is an internal header file and must not be used directly.  Please use optix_device.h or optix.h instead.")
+#endif
+
+#ifndef OPTIX_OPTIX_DEVICE_IMPL_H
+#define OPTIX_OPTIX_DEVICE_IMPL_H
+
+#include "internal/optix_device_impl_transformations.h"
+
+#ifndef __CUDACC_RTC__
+#include <initializer_list>
+#include <type_traits>
+#endif
+
+namespace optix_internal {
+template <typename...>
+struct TypePack{};  // empty tag type; the payload static_asserts in this header compare two TypePack instantiations with std::is_same
+}  // namespace optix_internal
+
+template <typename... Payload>
+static __forceinline__ __device__ void optixTrace( OptixTraversableHandle handle,
+                                                   float3                 rayOrigin,
+                                                   float3                 rayDirection,
+                                                   float                  tmin,
+                                                   float                  tmax,
+                                                   float                  rayTime,
+                                                   OptixVisibilityMask    visibilityMask,
+                                                   unsigned int           rayFlags,
+                                                   unsigned int           SBToffset,
+                                                   unsigned int           SBTstride,
+                                                   unsigned int           missSBTIndex,
+                                                   Payload&...            payload )
+{  // Forwards the ray parameters and up to 32 payload values to _optix_trace_typed_32 using OPTIX_PAYLOAD_TYPE_DEFAULT, then copies the results back into payload.
+    static_assert( sizeof...( Payload ) <= 32, "Only up to 32 payload values are allowed." );
+    // std::is_same compares each type in the two TypePacks to make sure that all types are unsigned int.
+    // TypePack 1    unsigned int    T0      T1      T2   ...   Tn-1        Tn
+    // TypePack 2      T0            T1      T2      T3   ...   Tn        unsigned int
+#ifndef __CUDACC_RTC__  // std::is_same needs <type_traits>, which this header only includes when __CUDACC_RTC__ is undefined
+    static_assert( std::is_same<optix_internal::TypePack<unsigned int, Payload...>, optix_internal::TypePack<Payload..., unsigned int>>::value,
+                   "All payload parameters need to be unsigned int." );
+#endif
+
+    OptixPayloadTypeID type = OPTIX_PAYLOAD_TYPE_DEFAULT;
+    float              ox = rayOrigin.x, oy = rayOrigin.y, oz = rayOrigin.z;
+    float              dx = rayDirection.x, dy = rayDirection.y, dz = rayDirection.z;
+    unsigned int p[33]       = { 0, payload... };  // p[0] is a placeholder; payload values start at p[1], unused slots are zero-initialized
+    int          payloadSize = (int)sizeof...( Payload );  // number of payload arguments actually supplied
+    asm volatile(
+        "call"
+        "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%26,%27,%28,%"
+        "29,%30,%31),"
+        "_optix_trace_typed_32,"
+        "(%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%50,%51,%52,%53,%54,%55,%56,%57,%58,%"
+        "59,%60,%61,%62,%63,%64,%65,%66,%67,%68,%69,%70,%71,%72,%73,%74,%75,%76,%77,%78,%79,%80);"
+        : "=r"( p[1] ), "=r"( p[2] ), "=r"( p[3] ), "=r"( p[4] ), "=r"( p[5] ), "=r"( p[6] ), "=r"( p[7] ),
+          "=r"( p[8] ), "=r"( p[9] ), "=r"( p[10] ), "=r"( p[11] ), "=r"( p[12] ), "=r"( p[13] ), "=r"( p[14] ),
+          "=r"( p[15] ), "=r"( p[16] ), "=r"( p[17] ), "=r"( p[18] ), "=r"( p[19] ), "=r"( p[20] ), "=r"( p[21] ),
+          "=r"( p[22] ), "=r"( p[23] ), "=r"( p[24] ), "=r"( p[25] ), "=r"( p[26] ), "=r"( p[27] ), "=r"( p[28] ),
+          "=r"( p[29] ), "=r"( p[30] ), "=r"( p[31] ), "=r"( p[32] )
+        : "r"( type ), "l"( handle ), "f"( ox ), "f"( oy ), "f"( oz ), "f"( dx ), "f"( dy ), "f"( dz ), "f"( tmin ),
+          "f"( tmax ), "f"( rayTime ), "r"( visibilityMask ), "r"( rayFlags ), "r"( SBToffset ), "r"( SBTstride ),
+          "r"( missSBTIndex ), "r"( payloadSize ), "r"( p[1] ), "r"( p[2] ), "r"( p[3] ), "r"( p[4] ), "r"( p[5] ),
+          "r"( p[6] ), "r"( p[7] ), "r"( p[8] ), "r"( p[9] ), "r"( p[10] ), "r"( p[11] ), "r"( p[12] ), "r"( p[13] ),
+          "r"( p[14] ), "r"( p[15] ), "r"( p[16] ), "r"( p[17] ), "r"( p[18] ), "r"( p[19] ), "r"( p[20] ),
+          "r"( p[21] ), "r"( p[22] ), "r"( p[23] ), "r"( p[24] ), "r"( p[25] ), "r"( p[26] ), "r"( p[27] ),
+          "r"( p[28] ), "r"( p[29] ), "r"( p[30] ), "r"( p[31] ), "r"( p[32] )
+        : );
+    unsigned int index = 1;  // write-back starts at p[1]
+    (void)std::initializer_list<unsigned int>{index, ( payload = p[index++] )...};  // braced-list elements evaluate left to right, so p[1], p[2], ... land in the payload references in declaration order
+}
+
+template <typename... Payload>
+static __forceinline__ __device__ void optixTraverse( OptixTraversableHandle handle,
+                                                      float3                 rayOrigin,
+                                                      float3                 rayDirection,
+                                                      float                  tmin,
+                                                      float                  tmax,
+                                                      float                  rayTime,
+                                                      OptixVisibilityMask    visibilityMask,
+                                                      unsigned int           rayFlags,
+                                                      unsigned int           SBToffset,
+                                                      unsigned int           SBTstride,
+                                                      unsigned int           missSBTIndex,
+                                                      Payload&... payload )
+{  // Forwards the ray parameters and up to 32 payload values to _optix_hitobject_traverse using OPTIX_PAYLOAD_TYPE_DEFAULT, then copies the results back into payload.
+    static_assert( sizeof...( Payload ) <= 32, "Only up to 32 payload values are allowed." );
+    // std::is_same compares each type in the two TypePacks to make sure that all types are unsigned int.
+    // TypePack 1    unsigned int    T0      T1      T2   ...   Tn-1        Tn
+    // TypePack 2      T0            T1      T2      T3   ...   Tn        unsigned int
+#ifndef __CUDACC_RTC__  // std::is_same needs <type_traits>, which this header only includes when __CUDACC_RTC__ is undefined
+    static_assert( std::is_same<optix_internal::TypePack<unsigned int, Payload...>, optix_internal::TypePack<Payload..., unsigned int>>::value,
+                   "All payload parameters need to be unsigned int." );
+#endif
+
+    OptixPayloadTypeID type = OPTIX_PAYLOAD_TYPE_DEFAULT;
+    float              ox = rayOrigin.x, oy = rayOrigin.y, oz = rayOrigin.z;
+    float              dx = rayDirection.x, dy = rayDirection.y, dz = rayDirection.z;
+    unsigned int p[33]       = {0, payload...};  // p[0] is a placeholder; payload values start at p[1], unused slots are zero-initialized
+    int          payloadSize = (int)sizeof...( Payload );  // number of payload arguments actually supplied
+    asm volatile(
+        "call"
+        "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%26,%27,%28,%"
+        "29,%30,%31),"
+        "_optix_hitobject_traverse,"
+        "(%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%50,%51,%52,%53,%54,%55,%56,%57,%58,%"
+        "59,%60,%61,%62,%63,%64,%65,%66,%67,%68,%69,%70,%71,%72,%73,%74,%75,%76,%77,%78,%79,%80);"
+        : "=r"( p[1] ), "=r"( p[2] ), "=r"( p[3] ), "=r"( p[4] ), "=r"( p[5] ), "=r"( p[6] ), "=r"( p[7] ),
+          "=r"( p[8] ), "=r"( p[9] ), "=r"( p[10] ), "=r"( p[11] ), "=r"( p[12] ), "=r"( p[13] ), "=r"( p[14] ),
+          "=r"( p[15] ), "=r"( p[16] ), "=r"( p[17] ), "=r"( p[18] ), "=r"( p[19] ), "=r"( p[20] ), "=r"( p[21] ),
+          "=r"( p[22] ), "=r"( p[23] ), "=r"( p[24] ), "=r"( p[25] ), "=r"( p[26] ), "=r"( p[27] ), "=r"( p[28] ),
+          "=r"( p[29] ), "=r"( p[30] ), "=r"( p[31] ), "=r"( p[32] )
+        : "r"( type ), "l"( handle ), "f"( ox ), "f"( oy ), "f"( oz ), "f"( dx ), "f"( dy ), "f"( dz ), "f"( tmin ),
+          "f"( tmax ), "f"( rayTime ), "r"( visibilityMask ), "r"( rayFlags ), "r"( SBToffset ), "r"( SBTstride ),
+          "r"( missSBTIndex ), "r"( payloadSize ), "r"( p[1] ), "r"( p[2] ), "r"( p[3] ), "r"( p[4] ), "r"( p[5] ),
+          "r"( p[6] ), "r"( p[7] ), "r"( p[8] ), "r"( p[9] ), "r"( p[10] ), "r"( p[11] ), "r"( p[12] ), "r"( p[13] ),
+          "r"( p[14] ), "r"( p[15] ), "r"( p[16] ), "r"( p[17] ), "r"( p[18] ), "r"( p[19] ), "r"( p[20] ),
+          "r"( p[21] ), "r"( p[22] ), "r"( p[23] ), "r"( p[24] ), "r"( p[25] ), "r"( p[26] ), "r"( p[27] ),
+          "r"( p[28] ), "r"( p[29] ), "r"( p[30] ), "r"( p[31] ), "r"( p[32] )
+        : );
+    unsigned int index = 1;  // write-back starts at p[1]
+    (void)std::initializer_list<unsigned int>{index, ( payload = p[index++] )...};  // braced-list elements evaluate left to right, so p[1], p[2], ... land in the payload references in declaration order
+}
+
+template <typename... Payload>
+static __forceinline__ __device__ void optixTrace( OptixPayloadTypeID     type,
+                                                   OptixTraversableHandle handle,
+                                                   float3                 rayOrigin,
+                                                   float3                 rayDirection,
+                                                   float                  tmin,
+                                                   float                  tmax,
+                                                   float                  rayTime,
+                                                   OptixVisibilityMask    visibilityMask,
+                                                   unsigned int           rayFlags,
+                                                   unsigned int           SBToffset,
+                                                   unsigned int           SBTstride,
+                                                   unsigned int           missSBTIndex,
+                                                   Payload&...            payload )
+{  // Same call as the overload without `type`, but the caller-supplied payload type is passed to _optix_trace_typed_32.
+    // std::is_same compares each type in the two TypePacks to make sure that all types are unsigned int.
+    // TypePack 1    unsigned int    T0      T1      T2   ...   Tn-1        Tn
+    // TypePack 2      T0            T1      T2      T3   ...   Tn        unsigned int
+    static_assert( sizeof...( Payload ) <= 32, "Only up to 32 payload values are allowed." );
+#ifndef __CUDACC_RTC__  // std::is_same needs <type_traits>, which this header only includes when __CUDACC_RTC__ is undefined
+    static_assert( std::is_same<optix_internal::TypePack<unsigned int, Payload...>, optix_internal::TypePack<Payload..., unsigned int>>::value,
+                   "All payload parameters need to be unsigned int." );
+#endif
+
+    float        ox = rayOrigin.x, oy = rayOrigin.y, oz = rayOrigin.z;
+    float        dx = rayDirection.x, dy = rayDirection.y, dz = rayDirection.z;
+    unsigned int p[33]       = {0, payload...};  // p[0] is a placeholder; payload values start at p[1], unused slots are zero-initialized
+    int          payloadSize = (int)sizeof...( Payload );  // number of payload arguments actually supplied
+
+    asm volatile(
+        "call"
+        "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%26,%27,%28,%"
+        "29,%30,%31),"
+        "_optix_trace_typed_32,"
+        "(%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%50,%51,%52,%53,%54,%55,%56,%57,%58,%"
+        "59,%60,%61,%62,%63,%64,%65,%66,%67,%68,%69,%70,%71,%72,%73,%74,%75,%76,%77,%78,%79,%80);"
+        : "=r"( p[1] ), "=r"( p[2] ), "=r"( p[3] ), "=r"( p[4] ), "=r"( p[5] ), "=r"( p[6] ), "=r"( p[7] ),
+          "=r"( p[8] ), "=r"( p[9] ), "=r"( p[10] ), "=r"( p[11] ), "=r"( p[12] ), "=r"( p[13] ), "=r"( p[14] ),
+          "=r"( p[15] ), "=r"( p[16] ), "=r"( p[17] ), "=r"( p[18] ), "=r"( p[19] ), "=r"( p[20] ), "=r"( p[21] ),
+          "=r"( p[22] ), "=r"( p[23] ), "=r"( p[24] ), "=r"( p[25] ), "=r"( p[26] ), "=r"( p[27] ), "=r"( p[28] ),
+          "=r"( p[29] ), "=r"( p[30] ), "=r"( p[31] ), "=r"( p[32] )
+        : "r"( type ), "l"( handle ), "f"( ox ), "f"( oy ), "f"( oz ), "f"( dx ), "f"( dy ), "f"( dz ), "f"( tmin ),
+          "f"( tmax ), "f"( rayTime ), "r"( visibilityMask ), "r"( rayFlags ), "r"( SBToffset ), "r"( SBTstride ),
+          "r"( missSBTIndex ), "r"( payloadSize ), "r"( p[1] ), "r"( p[2] ), "r"( p[3] ), "r"( p[4] ), "r"( p[5] ),
+          "r"( p[6] ), "r"( p[7] ), "r"( p[8] ), "r"( p[9] ), "r"( p[10] ), "r"( p[11] ), "r"( p[12] ), "r"( p[13] ),
+          "r"( p[14] ), "r"( p[15] ), "r"( p[16] ), "r"( p[17] ), "r"( p[18] ), "r"( p[19] ), "r"( p[20] ),
+          "r"( p[21] ), "r"( p[22] ), "r"( p[23] ), "r"( p[24] ), "r"( p[25] ), "r"( p[26] ), "r"( p[27] ),
+          "r"( p[28] ), "r"( p[29] ), "r"( p[30] ), "r"( p[31] ), "r"( p[32] )
+        : );
+    unsigned int index = 1;  // write-back starts at p[1]
+    (void)std::initializer_list<unsigned int>{index, ( payload = p[index++] )...};  // braced-list elements evaluate left to right, so p[1], p[2], ... land in the payload references in declaration order
+}
+
+template <typename... Payload>
+static __forceinline__ __device__ void optixTraverse( OptixPayloadTypeID     type,
+                                                      OptixTraversableHandle handle,
+                                                      float3                 rayOrigin,
+                                                      float3                 rayDirection,
+                                                      float                  tmin,
+                                                      float                  tmax,
+                                                      float                  rayTime,
+                                                      OptixVisibilityMask    visibilityMask,
+                                                      unsigned int           rayFlags,
+                                                      unsigned int           SBToffset,
+                                                      unsigned int           SBTstride,
+                                                      unsigned int           missSBTIndex,
+                                                      Payload&... payload )
+{  // Same call as the overload without `type`, but the caller-supplied payload type is passed to _optix_hitobject_traverse.
+    // std::is_same compares each type in the two TypePacks to make sure that all types are unsigned int.
+    // TypePack 1    unsigned int    T0      T1      T2   ...   Tn-1        Tn
+    // TypePack 2      T0            T1      T2      T3   ...   Tn        unsigned int
+    static_assert( sizeof...( Payload ) <= 32, "Only up to 32 payload values are allowed." );
+#ifndef __CUDACC_RTC__  // std::is_same needs <type_traits>, which this header only includes when __CUDACC_RTC__ is undefined
+    static_assert( std::is_same<optix_internal::TypePack<unsigned int, Payload...>, optix_internal::TypePack<Payload..., unsigned int>>::value,
+                   "All payload parameters need to be unsigned int." );
+#endif
+
+    float        ox = rayOrigin.x, oy = rayOrigin.y, oz = rayOrigin.z;
+    float        dx = rayDirection.x, dy = rayDirection.y, dz = rayDirection.z;
+    unsigned int p[33]       = {0, payload...};  // p[0] is a placeholder; payload values start at p[1], unused slots are zero-initialized
+    int          payloadSize = (int)sizeof...( Payload );  // number of payload arguments actually supplied
+    asm volatile(
+        "call"
+        "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%26,%27,%28,%"
+        "29,%30,%31),"
+        "_optix_hitobject_traverse,"
+        "(%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%50,%51,%52,%53,%54,%55,%56,%57,%58,%"
+        "59,%60,%61,%62,%63,%64,%65,%66,%67,%68,%69,%70,%71,%72,%73,%74,%75,%76,%77,%78,%79,%80);"
+        : "=r"( p[1] ), "=r"( p[2] ), "=r"( p[3] ), "=r"( p[4] ), "=r"( p[5] ), "=r"( p[6] ), "=r"( p[7] ),
+          "=r"( p[8] ), "=r"( p[9] ), "=r"( p[10] ), "=r"( p[11] ), "=r"( p[12] ), "=r"( p[13] ), "=r"( p[14] ),
+          "=r"( p[15] ), "=r"( p[16] ), "=r"( p[17] ), "=r"( p[18] ), "=r"( p[19] ), "=r"( p[20] ), "=r"( p[21] ),
+          "=r"( p[22] ), "=r"( p[23] ), "=r"( p[24] ), "=r"( p[25] ), "=r"( p[26] ), "=r"( p[27] ), "=r"( p[28] ),
+          "=r"( p[29] ), "=r"( p[30] ), "=r"( p[31] ), "=r"( p[32] )
+        : "r"( type ), "l"( handle ), "f"( ox ), "f"( oy ), "f"( oz ), "f"( dx ), "f"( dy ), "f"( dz ), "f"( tmin ),
+          "f"( tmax ), "f"( rayTime ), "r"( visibilityMask ), "r"( rayFlags ), "r"( SBToffset ), "r"( SBTstride ),
+          "r"( missSBTIndex ), "r"( payloadSize ), "r"( p[1] ), "r"( p[2] ), "r"( p[3] ), "r"( p[4] ), "r"( p[5] ),
+          "r"( p[6] ), "r"( p[7] ), "r"( p[8] ), "r"( p[9] ), "r"( p[10] ), "r"( p[11] ), "r"( p[12] ), "r"( p[13] ),
+          "r"( p[14] ), "r"( p[15] ), "r"( p[16] ), "r"( p[17] ), "r"( p[18] ), "r"( p[19] ), "r"( p[20] ),
+          "r"( p[21] ), "r"( p[22] ), "r"( p[23] ), "r"( p[24] ), "r"( p[25] ), "r"( p[26] ), "r"( p[27] ),
+          "r"( p[28] ), "r"( p[29] ), "r"( p[30] ), "r"( p[31] ), "r"( p[32] )
+        : );
+    unsigned int index = 1;  // write-back starts at p[1]
+    (void)std::initializer_list<unsigned int>{index, ( payload = p[index++] )...};  // braced-list elements evaluate left to right, so p[1], p[2], ... land in the payload references in declaration order
+}
+
+static __forceinline__ __device__ void optixReorder( unsigned int coherenceHint, unsigned int numCoherenceHintBits )
+{  // Passes the coherence hint and its bit count to the _optix_hitobject_reorder intrinsic; nothing is returned.
+    asm volatile(
+         "call"
+         "(),"
+         "_optix_hitobject_reorder,"
+         "(%0,%1);"
+         :
+         : "r"( coherenceHint ), "r"( numCoherenceHintBits )
+         : );
+}
+
+static __forceinline__ __device__ void optixReorder()
+{  // No-argument overload: calls _optix_hitobject_reorder with both the hint and the hint bit count set to 0.
+    unsigned int coherenceHint        = 0;
+    unsigned int numCoherenceHintBits = 0;
+    asm volatile(
+         "call"
+         "(),"
+         "_optix_hitobject_reorder,"
+         "(%0,%1);"
+         :
+         : "r"( coherenceHint ), "r"( numCoherenceHintBits )
+         : );
+}
+
+template <typename... Payload>
+static __forceinline__ __device__ void optixInvoke( OptixPayloadTypeID type, Payload&... payload )
+{  // Passes the caller-supplied payload type and up to 32 payload values to _optix_hitobject_invoke, then copies the results back into payload.
+    // std::is_same compares each type in the two TypePacks to make sure that all types are unsigned int.
+    // TypePack 1    unsigned int    T0      T1      T2   ...   Tn-1        Tn
+    // TypePack 2      T0            T1      T2      T3   ...   Tn        unsigned int
+    static_assert( sizeof...( Payload ) <= 32, "Only up to 32 payload values are allowed." );
+#ifndef __CUDACC_RTC__  // std::is_same needs <type_traits>, which this header only includes when __CUDACC_RTC__ is undefined
+    static_assert( std::is_same<optix_internal::TypePack<unsigned int, Payload...>, optix_internal::TypePack<Payload..., unsigned int>>::value,
+                   "All payload parameters need to be unsigned int." );
+#endif
+
+    unsigned int p[33]       = {0, payload...};  // p[0] is a placeholder; payload values start at p[1], unused slots are zero-initialized
+    int          payloadSize = (int)sizeof...( Payload );  // number of payload arguments actually supplied
+
+    asm volatile(
+        "call"
+        "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%26,%27,%28,%"
+        "29,%30,%31),"
+        "_optix_hitobject_invoke,"
+        "(%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%50,%51,%52,%53,%54,%55,%56,%57,%58,%"
+        "59,%60,%61,%62,%63,%64,%65);"
+        : "=r"( p[1] ), "=r"( p[2] ), "=r"( p[3] ), "=r"( p[4] ), "=r"( p[5] ), "=r"( p[6] ), "=r"( p[7] ),
+          "=r"( p[8] ), "=r"( p[9] ), "=r"( p[10] ), "=r"( p[11] ), "=r"( p[12] ), "=r"( p[13] ), "=r"( p[14] ),
+          "=r"( p[15] ), "=r"( p[16] ), "=r"( p[17] ), "=r"( p[18] ), "=r"( p[19] ), "=r"( p[20] ), "=r"( p[21] ),
+          "=r"( p[22] ), "=r"( p[23] ), "=r"( p[24] ), "=r"( p[25] ), "=r"( p[26] ), "=r"( p[27] ), "=r"( p[28] ),
+          "=r"( p[29] ), "=r"( p[30] ), "=r"( p[31] ), "=r"( p[32] )
+        : "r"( type ), "r"( payloadSize ), "r"( p[1] ), "r"( p[2] ),
+          "r"( p[3] ), "r"( p[4] ), "r"( p[5] ), "r"( p[6] ), "r"( p[7] ), "r"( p[8] ), "r"( p[9] ), "r"( p[10] ),
+          "r"( p[11] ), "r"( p[12] ), "r"( p[13] ), "r"( p[14] ), "r"( p[15] ), "r"( p[16] ), "r"( p[17] ),
+          "r"( p[18] ), "r"( p[19] ), "r"( p[20] ), "r"( p[21] ), "r"( p[22] ), "r"( p[23] ), "r"( p[24] ),
+          "r"( p[25] ), "r"( p[26] ), "r"( p[27] ), "r"( p[28] ), "r"( p[29] ), "r"( p[30] ), "r"( p[31] ), "r"( p[32] )
+        : );
+
+    unsigned int index = 1;  // write-back starts at p[1]
+    (void)std::initializer_list<unsigned int>{index, ( payload = p[index++] )...};  // braced-list elements evaluate left to right, so p[1], p[2], ... land in the payload references in declaration order
+}
+
+template <typename... Payload>
+static __forceinline__ __device__ void optixInvoke( Payload&... payload )
+{  // Same call as the overload taking `type`, but OPTIX_PAYLOAD_TYPE_DEFAULT is passed to _optix_hitobject_invoke.
+    // std::is_same compares each type in the two TypePacks to make sure that all types are unsigned int.
+    // TypePack 1    unsigned int    T0      T1      T2   ...   Tn-1        Tn
+    // TypePack 2      T0            T1      T2      T3   ...   Tn        unsigned int
+    static_assert( sizeof...( Payload ) <= 32, "Only up to 32 payload values are allowed." );
+#ifndef __CUDACC_RTC__  // std::is_same needs <type_traits>, which this header only includes when __CUDACC_RTC__ is undefined
+    static_assert( std::is_same<optix_internal::TypePack<unsigned int, Payload...>, optix_internal::TypePack<Payload..., unsigned int>>::value,
+                   "All payload parameters need to be unsigned int." );
+#endif
+
+    OptixPayloadTypeID type        = OPTIX_PAYLOAD_TYPE_DEFAULT;
+    unsigned int       p[33]       = {0, payload...};  // p[0] is a placeholder; payload values start at p[1], unused slots are zero-initialized
+    int                payloadSize = (int)sizeof...( Payload );  // number of payload arguments actually supplied
+
+    asm volatile(
+        "call"
+        "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%26,%27,%28,%"
+        "29,%30,%31),"
+        "_optix_hitobject_invoke,"
+        "(%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%50,%51,%52,%53,%54,%55,%56,%57,%58,%"
+        "59,%60,%61,%62,%63,%64,%65);"
+        : "=r"( p[1] ), "=r"( p[2] ), "=r"( p[3] ), "=r"( p[4] ), "=r"( p[5] ), "=r"( p[6] ), "=r"( p[7] ),
+          "=r"( p[8] ), "=r"( p[9] ), "=r"( p[10] ), "=r"( p[11] ), "=r"( p[12] ), "=r"( p[13] ), "=r"( p[14] ),
+          "=r"( p[15] ), "=r"( p[16] ), "=r"( p[17] ), "=r"( p[18] ), "=r"( p[19] ), "=r"( p[20] ), "=r"( p[21] ),
+          "=r"( p[22] ), "=r"( p[23] ), "=r"( p[24] ), "=r"( p[25] ), "=r"( p[26] ), "=r"( p[27] ), "=r"( p[28] ),
+          "=r"( p[29] ), "=r"( p[30] ), "=r"( p[31] ), "=r"( p[32] )
+        : "r"( type ), "r"( payloadSize ), "r"( p[1] ), "r"( p[2] ),
+          "r"( p[3] ), "r"( p[4] ), "r"( p[5] ), "r"( p[6] ), "r"( p[7] ), "r"( p[8] ), "r"( p[9] ), "r"( p[10] ),
+          "r"( p[11] ), "r"( p[12] ), "r"( p[13] ), "r"( p[14] ), "r"( p[15] ), "r"( p[16] ), "r"( p[17] ),
+          "r"( p[18] ), "r"( p[19] ), "r"( p[20] ), "r"( p[21] ), "r"( p[22] ), "r"( p[23] ), "r"( p[24] ),
+          "r"( p[25] ), "r"( p[26] ), "r"( p[27] ), "r"( p[28] ), "r"( p[29] ), "r"( p[30] ), "r"( p[31] ), "r"( p[32] )
+        : );
+
+    unsigned int index = 1;  // write-back starts at p[1]
+    (void)std::initializer_list<unsigned int>{index, ( payload = p[index++] )...};  // braced-list elements evaluate left to right, so p[1], p[2], ... land in the payload references in declaration order
+}
+
+static __forceinline__ __device__ void optixMakeHitObject( OptixTraversableHandle        handle,
+                                                           float3                        rayOrigin,
+                                                           float3                        rayDirection,
+                                                           float                         tmin,
+                                                           float                         rayTime,
+                                                           unsigned int                  rayFlags,
+                                                           OptixTraverseData             traverseData,
+                                                           const OptixTraversableHandle* transforms,
+                                                           unsigned int                  numTransforms )
+{  // Calls _optix_hitobject_make_with_traverse_data_v2 with the ray description, the 20 traverseData words, and the transform list; nothing is returned.
+    float ox = rayOrigin.x, oy = rayOrigin.y, oz = rayOrigin.z;  // float3 arguments are split into scalar operands for the asm call
+    float dx = rayDirection.x, dy = rayDirection.y, dz = rayDirection.z;
+
+    asm volatile(
+        "call"
+        "(),"
+        "_optix_hitobject_make_with_traverse_data_v2,"
+        "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%26,%27,%28,%29,%30,%31);"
+        :
+        : "l"( handle ), "f"( ox ), "f"( oy ), "f"( oz ), "f"( dx ), "f"( dy ), "f"( dz ), "f"( tmin ), "f"( rayTime ), "r"( rayFlags ),
+          "r"( traverseData.data[0] ), "r"( traverseData.data[1] ), "r"( traverseData.data[2] ),
+          "r"( traverseData.data[3] ), "r"( traverseData.data[4] ), "r"( traverseData.data[5] ),
+          "r"( traverseData.data[6] ), "r"( traverseData.data[7] ), "r"( traverseData.data[8] ),
+          "r"( traverseData.data[9] ), "r"( traverseData.data[10] ), "r"( traverseData.data[11] ),
+          "r"( traverseData.data[12] ), "r"( traverseData.data[13] ), "r"( traverseData.data[14] ),
+          "r"( traverseData.data[15] ), "r"( traverseData.data[16] ), "r"( traverseData.data[17] ),
+          "r"( traverseData.data[18] ), "r"( traverseData.data[19] ), "l"( transforms ), "r"( numTransforms )
+        : );
+}
+
+ static __forceinline__ __device__ void optixMakeMissHitObject( unsigned int missSBTIndex,
+                                                                float3       rayOrigin,
+                                                                float3       rayDirection,
+                                                                float        tmin,
+                                                                float        tmax,
+                                                                float        rayTime,
+                                                                unsigned int rayFlags )
+{  // Calls _optix_hitobject_make_miss_v2 with the miss SBT index, the ray components, tmin/tmax, rayTime and rayFlags; nothing is returned.
+    float ox = rayOrigin.x, oy = rayOrigin.y, oz = rayOrigin.z;  // float3 arguments are split into scalar operands for the asm call
+    float dx = rayDirection.x, dy = rayDirection.y, dz = rayDirection.z;
+
+    asm volatile(
+         "call"
+         "(),"
+         "_optix_hitobject_make_miss_v2,"
+         "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10);"
+         :
+         : "r"( missSBTIndex ), "f"( ox ), "f"( oy ), "f"( oz ), "f"( dx ), "f"( dy ), "f"( dz ), "f"( tmin ),
+           "f"( tmax ), "f"( rayTime ), "r"( rayFlags )
+         : );
+}
+
+static __forceinline__ __device__ void optixMakeNopHitObject()
+{  // Calls the _optix_hitobject_make_nop intrinsic, which takes no operands and returns nothing.
+    asm volatile(
+         "call"
+         "(),"
+         "_optix_hitobject_make_nop,"
+         "();"
+         :
+         :
+         : );
+}
+
+static __forceinline__ __device__ void optixHitObjectGetTraverseData( OptixTraverseData* data )
+{  // Calls _optix_hitobject_get_traverse_data and stores its 20 output words into data->data[0..19]; data is dereferenced without a null check.
+    asm volatile(
+         "call"
+         "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19),"
+         "_optix_hitobject_get_traverse_data,"
+         "();"
+         : "=r"( data->data[0] ), "=r"( data->data[1] ), "=r"( data->data[2] ), "=r"( data->data[3] ), "=r"( data->data[4] ),
+           "=r"( data->data[5] ), "=r"( data->data[6] ), "=r"( data->data[7] ), "=r"( data->data[8] ), "=r"( data->data[9] ),
+           "=r"( data->data[10] ), "=r"( data->data[11] ), "=r"( data->data[12] ), "=r"( data->data[13] ), "=r"( data->data[14] ),
+           "=r"( data->data[15] ), "=r"( data->data[16] ), "=r"( data->data[17] ), "=r"( data->data[18] ), "=r"( data->data[19] )
+         :
+         : );
+}
+
+static __forceinline__ __device__ bool optixHitObjectIsHit()
+{  // Returns the result of the _optix_hitobject_is_hit intrinsic as a bool.
+    unsigned int result;
+    asm volatile(
+         "call (%0), _optix_hitobject_is_hit,"
+         "();"
+         : "=r"( result )
+         :
+         : );
+    return result;  // implicit unsigned int -> bool conversion: any non-zero value is true
+}
+
+static __forceinline__ __device__ bool optixHitObjectIsMiss()
+{  // Returns the result of the _optix_hitobject_is_miss intrinsic as a bool.
+    unsigned int result;
+    asm volatile(
+         "call (%0), _optix_hitobject_is_miss,"
+         "();"
+         : "=r"( result )
+         :
+         : );
+    return result;  // implicit unsigned int -> bool conversion: any non-zero value is true
+}
+
+static __forceinline__ __device__ bool optixHitObjectIsNop()
+{  // Returns the result of the _optix_hitobject_is_nop intrinsic as a bool.
+    unsigned int result;
+    asm volatile(
+         "call (%0), _optix_hitobject_is_nop,"
+         "();"
+         : "=r"( result )
+         :
+         : );
+    return result;  // implicit unsigned int -> bool conversion: any non-zero value is true
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetInstanceId()
+{  // Returns the 32-bit value produced by the _optix_hitobject_get_instance_id intrinsic.
+    unsigned int result;
+    asm volatile(
+         "call (%0), _optix_hitobject_get_instance_id,"
+         "();"
+         : "=r"( result )
+         :
+         : );
+    return result;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetInstanceIndex()
+{  // Returns the 32-bit value produced by the _optix_hitobject_get_instance_idx intrinsic.
+    unsigned int result;
+    asm volatile(
+         "call (%0), _optix_hitobject_get_instance_idx,"
+         "();"
+         : "=r"( result )
+         :
+         : );
+    return result;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetPrimitiveIndex()
+{  // Returns the 32-bit value produced by the _optix_hitobject_get_primitive_idx intrinsic.
+    unsigned int result;
+    asm volatile(
+         "call (%0), _optix_hitobject_get_primitive_idx,"
+         "();"
+         : "=r"( result )
+         :
+         : );
+    return result;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetTransformListSize()
+{  // Returns the 32-bit value produced by the _optix_hitobject_get_transform_list_size intrinsic.
+    unsigned int result;
+    asm volatile(
+         "call (%0), _optix_hitobject_get_transform_list_size,"
+         "();"
+         : "=r"( result )
+         :
+         : );
+    return result;
+}
+
+static __forceinline__ __device__ OptixTraversableHandle optixHitObjectGetTransformListHandle( unsigned int index )
+{
+    // Passes index to the _optix_hitobject_get_transform_list_handle
+    // intrinsic and returns the 64-bit value it produces.
+    unsigned long long handle;
+    asm volatile( "call (%0), _optix_hitobject_get_transform_list_handle,(%1);"
+                  : "=l"( handle )
+                  : "r"( index )
+                  : );
+    return handle;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetSbtGASIndex()
+{
+    // Returns, unmodified, the 32-bit value produced by the
+    // _optix_hitobject_get_sbt_gas_idx intrinsic.
+    unsigned int sbtGasIdx;
+    asm volatile( "call (%0), _optix_hitobject_get_sbt_gas_idx,();"
+                  : "=r"( sbtGasIdx )
+                  :
+                  : );
+    return sbtGasIdx;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetHitKind()
+{
+    // Returns, unmodified, the 32-bit value produced by the
+    // _optix_hitobject_get_hitkind intrinsic.
+    unsigned int hitKind;
+    asm volatile( "call (%0), _optix_hitobject_get_hitkind,();"
+                  : "=r"( hitKind )
+                  :
+                  : );
+    return hitKind;
+}
+
+static __forceinline__ __device__ float3 optixHitObjectGetWorldRayOrigin()
+{
+    // Each component is fetched by its own intrinsic call
+    // (_optix_hitobject_get_world_ray_origin_x / _y / _z); the three floats
+    // are then packed with make_float3.
+    float ox;
+    float oy;
+    float oz;
+
+    asm volatile( "call (%0), _optix_hitobject_get_world_ray_origin_x,();"
+                  : "=f"( ox )
+                  :
+                  : );
+    asm volatile( "call (%0), _optix_hitobject_get_world_ray_origin_y,();"
+                  : "=f"( oy )
+                  :
+                  : );
+    asm volatile( "call (%0), _optix_hitobject_get_world_ray_origin_z,();"
+                  : "=f"( oz )
+                  :
+                  : );
+    return make_float3( ox, oy, oz );
+}
+
+static __forceinline__ __device__ float3 optixHitObjectGetWorldRayDirection()
+{
+    // Each component is fetched by its own intrinsic call
+    // (_optix_hitobject_get_world_ray_direction_x / _y / _z); the three
+    // floats are then packed with make_float3.
+    float dx;
+    float dy;
+    float dz;
+
+    asm volatile( "call (%0), _optix_hitobject_get_world_ray_direction_x,();"
+                  : "=f"( dx )
+                  :
+                  : );
+    asm volatile( "call (%0), _optix_hitobject_get_world_ray_direction_y,();"
+                  : "=f"( dy )
+                  :
+                  : );
+    asm volatile( "call (%0), _optix_hitobject_get_world_ray_direction_z,();"
+                  : "=f"( dz )
+                  :
+                  : );
+    return make_float3( dx, dy, dz );
+}
+
+static __forceinline__ __device__ float optixHitObjectGetRayTmin()
+{
+    // Returns, unmodified, the float produced by the
+    // _optix_hitobject_get_ray_tmin intrinsic.
+    float tmin;
+    asm volatile( "call (%0), _optix_hitobject_get_ray_tmin,();"
+                  : "=f"( tmin )
+                  :
+                  : );
+    return tmin;
+}
+
+static __forceinline__ __device__ float optixHitObjectGetRayTmax()
+{
+    // Returns, unmodified, the float produced by the
+    // _optix_hitobject_get_ray_tmax intrinsic.
+    float tmax;
+    asm volatile( "call (%0), _optix_hitobject_get_ray_tmax,();"
+                  : "=f"( tmax )
+                  :
+                  : );
+    return tmax;
+}
+
+static __forceinline__ __device__ float optixHitObjectGetRayTime()
+{
+    // Returns, unmodified, the float produced by the
+    // _optix_hitobject_get_ray_time intrinsic.
+    float rayTime;
+    asm volatile( "call (%0), _optix_hitobject_get_ray_time,();"
+                  : "=f"( rayTime )
+                  :
+                  : );
+    return rayTime;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_0()
+{
+    // Calls the _optix_hitobject_get_attribute intrinsic with the constant
+    // index 0 and returns the 32-bit value it yields.
+    unsigned int attr;
+    asm volatile( "call (%0), _optix_hitobject_get_attribute,(%1);"
+                  : "=r"( attr )
+                  : "r"( 0 )
+                  : );
+    return attr;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_1()
+{
+    // Calls the _optix_hitobject_get_attribute intrinsic with the constant
+    // index 1 and returns the 32-bit value it yields.
+    unsigned int attr;
+    asm volatile( "call (%0), _optix_hitobject_get_attribute,(%1);"
+                  : "=r"( attr )
+                  : "r"( 1 )
+                  : );
+    return attr;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_2()
+{
+    // Calls the _optix_hitobject_get_attribute intrinsic with the constant
+    // index 2 and returns the 32-bit value it yields.
+    unsigned int attr;
+    asm volatile( "call (%0), _optix_hitobject_get_attribute,(%1);"
+                  : "=r"( attr )
+                  : "r"( 2 )
+                  : );
+    return attr;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_3()
+{
+    // Calls the _optix_hitobject_get_attribute intrinsic with the constant
+    // index 3 and returns the 32-bit value it yields.
+    unsigned int attr;
+    asm volatile( "call (%0), _optix_hitobject_get_attribute,(%1);"
+                  : "=r"( attr )
+                  : "r"( 3 )
+                  : );
+    return attr;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_4()
+{
+    // Calls the _optix_hitobject_get_attribute intrinsic with the constant
+    // index 4 and returns the 32-bit value it yields.
+    unsigned int attr;
+    asm volatile( "call (%0), _optix_hitobject_get_attribute,(%1);"
+                  : "=r"( attr )
+                  : "r"( 4 )
+                  : );
+    return attr;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_5()
+{
+    // Calls the _optix_hitobject_get_attribute intrinsic with the constant
+    // index 5 and returns the 32-bit value it yields.
+    unsigned int attr;
+    asm volatile( "call (%0), _optix_hitobject_get_attribute,(%1);"
+                  : "=r"( attr )
+                  : "r"( 5 )
+                  : );
+    return attr;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_6()
+{
+    // Calls the _optix_hitobject_get_attribute intrinsic with the constant
+    // index 6 and returns the 32-bit value it yields.
+    unsigned int attr;
+    asm volatile( "call (%0), _optix_hitobject_get_attribute,(%1);"
+                  : "=r"( attr )
+                  : "r"( 6 )
+                  : );
+    return attr;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_7()
+{
+    // Calls the _optix_hitobject_get_attribute intrinsic with the constant
+    // index 7 and returns the 32-bit value it yields.
+    unsigned int attr;
+    asm volatile( "call (%0), _optix_hitobject_get_attribute,(%1);"
+                  : "=r"( attr )
+                  : "r"( 7 )
+                  : );
+    return attr;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetSbtRecordIndex()
+{
+    // Returns, unmodified, the 32-bit value produced by the
+    // _optix_hitobject_get_sbt_record_index intrinsic.
+    unsigned int recordIdx;
+    asm volatile( "call (%0), _optix_hitobject_get_sbt_record_index,();"
+                  : "=r"( recordIdx )
+                  :
+                  : );
+    return recordIdx;
+}
+
+static __forceinline__ __device__ void optixHitObjectSetSbtRecordIndex( unsigned int sbtRecordIndex )
+{
+    // Hands sbtRecordIndex to the _optix_hitobject_set_sbt_record_index
+    // intrinsic; the call has no output operands.
+    asm volatile( "call (), _optix_hitobject_set_sbt_record_index,(%0);"
+                  :
+                  : "r"( sbtRecordIndex )
+                  : );
+}
+
+static __forceinline__ __device__ CUdeviceptr optixHitObjectGetSbtDataPointer()
+{
+    // Returns the 64-bit value produced by the
+    // _optix_hitobject_get_sbt_data_pointer intrinsic.
+    unsigned long long sbtData;
+    asm volatile( "call (%0), _optix_hitobject_get_sbt_data_pointer,();"
+                  : "=l"( sbtData )
+                  :
+                  : );
+    return sbtData;
+}
+
+// Returns the 64-bit result of _optix_hitobject_get_gas_traversable_handle, cast to OptixTraversableHandle.
+static __forceinline__ __device__ OptixTraversableHandle optixHitObjectGetGASTraversableHandle()
+{
+    unsigned long long handle;  // volatile asm below: repeated reads must not be merged or dropped by the compiler
+    asm volatile( "call (%0), _optix_hitobject_get_gas_traversable_handle, ();" : "=l"( handle ) : );
+    return (OptixTraversableHandle)handle;
+}
+
+// Returns, unmodified, the 32-bit value produced by the _optix_hitobject_get_ray_flags intrinsic.
+static __forceinline__ __device__ unsigned int optixHitObjectGetRayFlags()
+{
+    unsigned int rayFlags;  // volatile asm below: repeated reads must not be merged or dropped by the compiler
+    asm volatile( "call (%0), _optix_hitobject_get_ray_flags, ();" : "=r"( rayFlags ) : );
+    return rayFlags;
+}
+
+
+static __forceinline__ __device__ void optixSetPayload_0( unsigned int p )
+{   // Passes slot index 0 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 0 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_1( unsigned int p )
+{   // Passes slot index 1 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 1 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_2( unsigned int p )
+{   // Passes slot index 2 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 2 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_3( unsigned int p )
+{   // Passes slot index 3 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 3 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_4( unsigned int p )
+{   // Passes slot index 4 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 4 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_5( unsigned int p )
+{   // Passes slot index 5 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 5 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_6( unsigned int p )
+{   // Passes slot index 6 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 6 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_7( unsigned int p )
+{   // Passes slot index 7 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 7 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_8( unsigned int p )
+{   // Passes slot index 8 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 8 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_9( unsigned int p )
+{   // Passes slot index 9 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 9 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_10( unsigned int p )
+{   // Passes slot index 10 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 10 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_11( unsigned int p )
+{   // Passes slot index 11 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 11 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_12( unsigned int p )
+{   // Passes slot index 12 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 12 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_13( unsigned int p )
+{   // Passes slot index 13 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 13 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_14( unsigned int p )
+{   // Passes slot index 14 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 14 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_15( unsigned int p )
+{   // Passes slot index 15 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 15 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_16( unsigned int p )
+{   // Passes slot index 16 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 16 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_17( unsigned int p )
+{   // Passes slot index 17 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 17 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_18( unsigned int p )
+{   // Passes slot index 18 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 18 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_19( unsigned int p )
+{   // Passes slot index 19 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 19 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_20( unsigned int p )
+{   // Passes slot index 20 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 20 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_21( unsigned int p )
+{   // Passes slot index 21 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 21 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_22( unsigned int p )
+{   // Passes slot index 22 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 22 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_23( unsigned int p )
+{   // Passes slot index 23 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 23 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_24( unsigned int p )
+{   // Passes slot index 24 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 24 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_25( unsigned int p )
+{   // Passes slot index 25 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 25 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_26( unsigned int p )
+{   // Passes slot index 26 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 26 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_27( unsigned int p )
+{   // Passes slot index 27 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 27 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_28( unsigned int p )
+{   // Passes slot index 28 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 28 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_29( unsigned int p )
+{   // Passes slot index 29 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 29 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_30( unsigned int p )
+{   // Passes slot index 30 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 30 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ void optixSetPayload_31( unsigned int p )
+{   // Passes slot index 31 and the value p to the _optix_set_payload intrinsic.
+    asm volatile( "call _optix_set_payload, (%0, %1);" : : "r"( 31 ), "r"( p ) : );
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_0()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 0
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 0 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_1()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 1
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 1 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_2()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 2
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 2 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_3()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 3
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 3 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_4()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 4
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 4 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_5()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 5
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 5 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_6()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 6
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 6 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_7()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 7
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 7 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_8()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 8
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 8 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_9()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 9
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 9 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_10()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 10
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 10 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_11()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 11
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 11 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_12()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 12
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 12 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_13()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 13
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 13 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_14()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 14
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 14 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_15()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 15
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 15 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_16()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 16
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 16 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_17()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 17
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 17 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_18()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 18
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 18 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_19()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 19
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 19 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_20()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 20
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 20 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_21()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 21
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 21 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_22()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 22
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 22 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_23()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 23
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 23 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_24()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 24
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 24 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_25()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 25
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 25 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_26()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 26
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 26 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_27()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 27
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 27 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_28()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 28
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 28 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_29()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 29
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 29 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_30()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 30
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 30 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ unsigned int optixGetPayload_31()
+{
+    unsigned int payload;  // set by the _optix_get_payload intrinsic, called with slot index 31
+    asm volatile( "call (%0), _optix_get_payload, (%1);" : "=r"( payload ) : "r"( 31 ) : );
+    return payload;
+}
+
+static __forceinline__ __device__ void optixSetPayloadTypes( unsigned int types )
+{   // Passes types, unmodified, to the _optix_set_payload_types intrinsic; nothing is returned.
+    asm volatile( "call _optix_set_payload_types, (%0);" : : "r"( types ) : );
+}
+
+static __forceinline__ __device__ unsigned int optixUndefinedValue()
+{
+    unsigned int undef;  // set by the _optix_undef_value intrinsic (non-volatile asm, no inputs)
+    asm( "call (%0), _optix_undef_value, ();" : "=r"( undef ) : );
+    return undef;
+}
+
+__device__ __forceinline__ unsigned int optixGetRemainingTraceDepth()
+{
+    // Returns, unmodified, the 32-bit value produced by _optix_get_remaining_trace_depth.
+    // NOTE(review): no `static` here, unlike the neighbouring wrappers - confirm against the declaration.
+    unsigned int depth;
+    asm( "call (%0), _optix_get_remaining_trace_depth,();"
+         : "=r"( depth )
+         :
+         : );
+    return depth;
+}
+
+static __forceinline__ __device__ float3 optixGetWorldRayOrigin()
+{
+    float ox, oy, oz;  // one intrinsic call per component; packed by make_float3 below
+    asm( "call (%0), _optix_get_world_ray_origin_x, ();" : "=f"( ox ) : );
+    asm( "call (%0), _optix_get_world_ray_origin_y, ();" : "=f"( oy ) : );
+    asm( "call (%0), _optix_get_world_ray_origin_z, ();" : "=f"( oz ) : );
+    return make_float3( ox, oy, oz );
+}
+
+static __forceinline__ __device__ float3 optixGetWorldRayDirection()
+{
+    float dx, dy, dz;  // one intrinsic call per component; packed by make_float3 below
+    asm( "call (%0), _optix_get_world_ray_direction_x, ();" : "=f"( dx ) : );
+    asm( "call (%0), _optix_get_world_ray_direction_y, ();" : "=f"( dy ) : );
+    asm( "call (%0), _optix_get_world_ray_direction_z, ();" : "=f"( dz ) : );
+    return make_float3( dx, dy, dz );
+}
+
+static __forceinline__ __device__ float3 optixGetObjectRayOrigin()
+{
+    float ox, oy, oz;  // one intrinsic call per component; packed by make_float3 below
+    asm( "call (%0), _optix_get_object_ray_origin_x, ();" : "=f"( ox ) : );
+    asm( "call (%0), _optix_get_object_ray_origin_y, ();" : "=f"( oy ) : );
+    asm( "call (%0), _optix_get_object_ray_origin_z, ();" : "=f"( oz ) : );
+    return make_float3( ox, oy, oz );
+}
+
+static __forceinline__ __device__ float3 optixGetObjectRayDirection()
+{
+    float dx, dy, dz;  // one intrinsic call per component; packed by make_float3 below
+    asm( "call (%0), _optix_get_object_ray_direction_x, ();" : "=f"( dx ) : );
+    asm( "call (%0), _optix_get_object_ray_direction_y, ();" : "=f"( dy ) : );
+    asm( "call (%0), _optix_get_object_ray_direction_z, ();" : "=f"( dz ) : );
+    return make_float3( dx, dy, dz );
+}
+
+static __forceinline__ __device__ float optixGetRayTmin()
+{
+    float tmin;  // set by the _optix_get_ray_tmin intrinsic
+    asm( "call (%0), _optix_get_ray_tmin, ();" : "=f"( tmin ) : );
+    return tmin;
+}
+
+static __forceinline__ __device__ float optixGetRayTmax()
+{
+    float tmax;  // set by the _optix_get_ray_tmax intrinsic
+    asm( "call (%0), _optix_get_ray_tmax, ();" : "=f"( tmax ) : );
+    return tmax;
+}
+
+static __forceinline__ __device__ float optixGetRayTime()
+{
+    float rayTime;  // set by the _optix_get_ray_time intrinsic
+    asm( "call (%0), _optix_get_ray_time, ();" : "=f"( rayTime ) : );
+    return rayTime;
+}
+
+static __forceinline__ __device__ unsigned int optixGetRayFlags()
+{
+    unsigned int rayFlags;  // set by the _optix_get_ray_flags intrinsic
+    asm( "call (%0), _optix_get_ray_flags, ();" : "=r"( rayFlags ) : );
+    return rayFlags;
+}
+
+static __forceinline__ __device__ unsigned int optixGetRayVisibilityMask()
+{
+    unsigned int visMask;  // set by the _optix_get_ray_visibility_mask intrinsic
+    asm( "call (%0), _optix_get_ray_visibility_mask, ();" : "=r"( visMask ) : );
+    return visMask;
+}
+
+static __forceinline__ __device__ OptixTraversableHandle optixGetInstanceTraversableFromIAS( OptixTraversableHandle ias,
+                                                                                             unsigned int           instIdx )
+{
+    unsigned long long instHandle;  // 64-bit result for the (ias, instIdx) pair passed to the intrinsic
+    asm( "call (%0), _optix_get_instance_traversable_from_ias, (%1, %2);" : "=l"( instHandle )
+         : "l"( ias ), "r"( instIdx ) );
+    return (OptixTraversableHandle)instHandle;
+}
+
+
+static __forceinline__ __device__ void optixGetTriangleVertexData( OptixTraversableHandle gas,
+                                                                   unsigned int           primIdx,
+                                                                   unsigned int           sbtGASIndex,
+                                                                   float                  time,
+                                                                   float3                 data[3] )
+{
+    // One _optix_get_triangle_vertex_data call writes nine floats: x, y, z of data[0], data[1], data[2].
+    asm( "call (%0, %1, %2, %3, %4, %5, %6, %7, %8), _optix_get_triangle_vertex_data, (%9, %10, %11, %12);"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ),
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ),
+           "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z )
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time ) : );
+}
+
+
+static __forceinline__ __device__ void optixGetTriangleVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                             unsigned int           primIdx,
+                                                                             unsigned int           sbtGASIndex,
+                                                                             float                  time,
+                                                                             float3                 data[3] )
+{
+    // One _optix_get_triangle_vertex_data_from_handle call writes nine floats: x, y, z of data[0..2].
+    asm( "call (%0, %1, %2, %3, %4, %5, %6, %7, %8), _optix_get_triangle_vertex_data_from_handle, (%9, %10, %11, %12);"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ),
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ),
+           "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z )
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time ) : );
+}
+
+static __forceinline__ __device__ void optixGetTriangleVertexData( float3 data[3] )
+{
+    // No inputs: _optix_get_triangle_vertex_data_current_hit writes x, y, z of data[0..2].
+    asm( "call (%0, %1, %2, %3, %4, %5, %6, %7, %8), _optix_get_triangle_vertex_data_current_hit, ();"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ),
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ),
+           "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ) : );
+}
+
+static __forceinline__ __device__ void optixHitObjectGetTriangleVertexData( float3 data[3] )
+{
+    // Writes x, y, z of data[0..2] from one _optix_hitobject_get_triangle_vertex_data call (no inputs).
+    // volatile, like the scalar optixHitObjectGet* accessors, so repeated reads are never merged or dropped.
+    asm volatile( "call (%0, %1, %2, %3, %4, %5, %6, %7, %8), _optix_hitobject_get_triangle_vertex_data, ();"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[1].x ), "=f"( data[1].y ),
+           "=f"( data[1].z ), "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ) : );
+}
+
+
+static __forceinline__ __device__ void optixGetLinearCurveVertexData( OptixTraversableHandle gas,
+                                                                      unsigned int           primIdx,
+                                                                      unsigned int           sbtGASIndex,
+                                                                      float                  time,
+                                                                      float4                 data[2] )
+{
+    // One _optix_get_linear_curve_vertex_data call writes eight floats: x, y, z, w of data[0] and data[1].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7), "
+         "_optix_get_linear_curve_vertex_data, (%8, %9, %10, %11);"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ),
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w )
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time ) : );
+}
+
+static __forceinline__ __device__ void optixGetLinearCurveVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                unsigned int           primIdx,
+                                                                                unsigned int           sbtGASIndex,
+                                                                                float                  time,
+                                                                                float4                 data[2] )
+{
+    // One _optix_get_linear_curve_vertex_data_from_handle call writes x, y, z, w of data[0] and data[1].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7), "
+         "_optix_get_linear_curve_vertex_data_from_handle, (%8, %9, %10, %11);"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ),
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w )
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time ) : );
+}
+
+static __forceinline__ __device__ void optixGetLinearCurveVertexData( float4 data[2] )
+{
+    // No inputs: _optix_get_linear_curve_vertex_data_current_hit writes x, y, z, w of data[0] and data[1].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7), "
+         "_optix_get_linear_curve_vertex_data_current_hit, ();"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ),
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ) : );
+}
+
+static __forceinline__ __device__ void optixHitObjectGetLinearCurveVertexData( float4 data[2] )
+{
+    // Writes x, y, z, w of data[0] and data[1] from one _optix_hitobject_get_linear_curve_vertex_data call.
+    // volatile, like the scalar optixHitObjectGet* accessors, so repeated reads are never merged or dropped.
+    asm volatile( "call (%0, %1, %2, %3,  %4, %5, %6, %7), "
+                  "_optix_hitobject_get_linear_curve_vertex_data, ();"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ),
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ) : );
+}
+
+static __forceinline__ __device__ void optixGetQuadraticBSplineVertexData( OptixTraversableHandle gas,
+                                                                           unsigned int         primIdx,
+                                                                           unsigned int         sbtGASIndex,
+                                                                           float                time,
+                                                                           float4               data[3] )
+{
+    // One _optix_get_quadratic_bspline_vertex_data call writes twelve floats: x, y, z, w of data[0..2].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11), "
+         "_optix_get_quadratic_bspline_vertex_data, (%12, %13, %14, %15);"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ),
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ),
+           "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w )
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time ) : );
+}
+
+static __forceinline__ __device__ void optixGetQuadraticBSplineVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                 unsigned int           primIdx,
+                                                                                 unsigned int           sbtGASIndex,
+                                                                                 float                  time,
+                                                                                 float4                 data[3] )
+{
+    // One _optix_get_quadratic_bspline_vertex_data_from_handle call writes x, y, z, w of data[0..2].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11), "
+         "_optix_get_quadratic_bspline_vertex_data_from_handle, (%12, %13, %14, %15);"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ),
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ),
+           "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w )
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time ) : );
+}
+
+static __forceinline__ __device__ void optixGetQuadraticBSplineVertexData( float4 data[3] )
+{
+    // No inputs: _optix_get_quadratic_bspline_vertex_data_current_hit writes x, y, z, w of data[0..2].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11), "
+         "_optix_get_quadratic_bspline_vertex_data_current_hit, ();"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ),
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ),
+           "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w ) : );
+}
+
+static __forceinline__ __device__ void optixHitObjectGetQuadraticBSplineVertexData( float4 data[3] )
+{
+    // Writes x, y, z, w of data[0..2]; volatile (as in the scalar optixHitObjectGet* accessors) so reads are never merged.
+    asm volatile( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11), "
+                  "_optix_hitobject_get_quadratic_bspline_vertex_data, ();"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ),
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ),
+           "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w ) : );
+}
+
+static __forceinline__ __device__ void optixGetQuadraticBSplineRocapsVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                           unsigned int primIdx,
+                                                                                           unsigned int sbtGASIndex,
+                                                                                           float        time,
+                                                                                           float4       data[3] )
+{
+    // One _optix_get_quadratic_bspline_rocaps_vertex_data_from_handle call writes x, y, z, w of data[0..2].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11), "
+         "_optix_get_quadratic_bspline_rocaps_vertex_data_from_handle, (%12, %13, %14, %15);"
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ), "=f"( data[1].y ),
+           "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w )
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time ) : );
+}
+
+static __forceinline__ __device__ void optixGetQuadraticBSplineRocapsVertexData( float4 data[3] )
+{  // Stores the 12 float results of the "_current_hit" intrinsic named below in data[0..2]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11), _optix_get_quadratic_bspline_rocaps_vertex_data_current_hit, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ), "=f"( data[1].y ),  // %0-%5: data[0].xyzw, data[1].xy
+           "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w )  // %6-%11: data[1].zw, data[2].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixHitObjectGetQuadraticBSplineRocapsVertexData( float4 data[3] )
+{  // Stores the 12 float results of the "_hitobject_" intrinsic named below in data[0..2]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11), _optix_hitobject_get_quadratic_bspline_rocaps_vertex_data, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ), "=f"( data[1].y ),  // %0-%5: data[0].xyzw, data[1].xy
+           "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w )  // %6-%11: data[1].zw, data[2].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixGetCubicBSplineVertexData( OptixTraversableHandle gas,
+                                                                       unsigned int         primIdx,
+                                                                       unsigned int         sbtGASIndex,
+                                                                       float                time,
+                                                                       float4               data[4] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to _optix_get_cubic_bspline_vertex_data and stores its 16 float results in data[0..3].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_cubic_bspline_vertex_data, "
+         "(%16, %17, %18, %19);"  // %16-%19 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ),  // %0-%3   -> data[0]
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ),  // %4-%7   -> data[1]
+           "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w ),  // %8-%11  -> data[2]
+           "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %12-%15 -> data[3]
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}  // NOTE(review): same operands as optixGetCubicBSplineVertexDataFromHandle but a different intrinsic name; presumably the older entry point - confirm.
+
+static __forceinline__ __device__ void optixGetCubicBSplineVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                             unsigned int           primIdx,
+                                                                             unsigned int           sbtGASIndex,
+                                                                             float                  time,
+                                                                             float4                 data[4] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to _optix_get_cubic_bspline_vertex_data_from_handle and stores its 16 float results in data[0..3].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_cubic_bspline_vertex_data_from_handle, "
+         "(%16, %17, %18, %19);"  // %16-%19 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ),  // %0-%3   -> data[0]
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ),  // %4-%7   -> data[1]
+           "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w ),  // %8-%11  -> data[2]
+           "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %12-%15 -> data[3]
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}
+
+static __forceinline__ __device__ void optixGetCubicBSplineVertexData( float4 data[4] )
+{  // Stores the 16 float results of _optix_get_cubic_bspline_vertex_data_current_hit in data[0..3]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_cubic_bspline_vertex_data_current_hit, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ),  // %0-%3   -> data[0]
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ),  // %4-%7   -> data[1]
+           "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w ),  // %8-%11  -> data[2]
+           "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %12-%15 -> data[3]
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixHitObjectGetCubicBSplineVertexData( float4 data[4] )
+{  // Stores the 16 float results of _optix_hitobject_get_cubic_bspline_vertex_data in data[0..3]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_hitobject_get_cubic_bspline_vertex_data, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ),  // %0-%3   -> data[0]
+           "=f"( data[1].x ), "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ),  // %4-%7   -> data[1]
+           "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w ),  // %8-%11  -> data[2]
+           "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %12-%15 -> data[3]
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixGetCubicBSplineRocapsVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                       unsigned int           primIdx,
+                                                                                       unsigned int sbtGASIndex,
+                                                                                       float        time,
+                                                                                       float4       data[4] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to the intrinsic named below and stores its 16 float results in data[0..3].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_cubic_bspline_rocaps_vertex_data_from_handle, "
+         "(%16, %17, %18, %19);"  // %16-%19 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}
+
+static __forceinline__ __device__ void optixGetCubicBSplineRocapsVertexData( float4 data[4] )
+{  // Stores the 16 float results of the "_current_hit" intrinsic named below in data[0..3]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_cubic_bspline_rocaps_vertex_data_current_hit, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixHitObjectGetCubicBSplineRocapsVertexData( float4 data[4] )
+{  // Stores the 16 float results of the "_hitobject_" intrinsic named below in data[0..3]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_hitobject_get_cubic_bspline_rocaps_vertex_data, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixGetCatmullRomVertexData( OptixTraversableHandle gas,
+                                                                     unsigned int           primIdx,
+                                                                     unsigned int           sbtGASIndex,
+                                                                     float                  time,
+                                                                     float4                 data[4] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to _optix_get_catmullrom_vertex_data and stores its 16 float results in data[0..3].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_catmullrom_vertex_data, "
+         "(%16, %17, %18, %19);"  // %16-%19 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}  // NOTE(review): same operands as optixGetCatmullRomVertexDataFromHandle but a different intrinsic name; presumably the older entry point - confirm.
+
+static __forceinline__ __device__ void optixGetCatmullRomVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                           unsigned int           primIdx,
+                                                                           unsigned int           sbtGASIndex,
+                                                                           float                  time,
+                                                                           float4                 data[4] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to _optix_get_catmullrom_vertex_data_from_handle and stores its 16 float results in data[0..3].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_catmullrom_vertex_data_from_handle, "
+         "(%16, %17, %18, %19);"  // %16-%19 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}
+
+static __forceinline__ __device__ void optixGetCatmullRomVertexData( float4 data[4] )
+{  // Stores the 16 float results of _optix_get_catmullrom_vertex_data_current_hit in data[0..3]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_catmullrom_vertex_data_current_hit, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixHitObjectGetCatmullRomVertexData( float4 data[4] )
+{  // Stores the 16 float results of _optix_hitobject_get_catmullrom_vertex_data in data[0..3]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_hitobject_get_catmullrom_vertex_data, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixGetCatmullRomRocapsVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                     unsigned int           primIdx,
+                                                                                     unsigned int           sbtGASIndex,
+                                                                                     float                  time,
+                                                                                     float4                 data[4] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to the intrinsic named below and stores its 16 float results in data[0..3].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_catmullrom_rocaps_vertex_data_from_handle, "
+         "(%16, %17, %18, %19);"  // %16-%19 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}
+
+static __forceinline__ __device__ void optixGetCatmullRomRocapsVertexData( float4 data[4] )
+{  // Stores the 16 float results of the "_current_hit" intrinsic named below in data[0..3]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_catmullrom_rocaps_vertex_data_current_hit, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixHitObjectGetCatmullRomRocapsVertexData( float4 data[4] )
+{  // Stores the 16 float results of the "_hitobject_" intrinsic named below in data[0..3]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_hitobject_get_catmullrom_rocaps_vertex_data, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixGetCubicBezierVertexData( OptixTraversableHandle gas,
+                                                                      unsigned int           primIdx,
+                                                                      unsigned int           sbtGASIndex,
+                                                                      float                  time,
+                                                                      float4                 data[4] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to _optix_get_cubic_bezier_vertex_data and stores its 16 float results in data[0..3].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_cubic_bezier_vertex_data, "
+         "(%16, %17, %18, %19);"  // %16-%19 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}  // NOTE(review): same operands as optixGetCubicBezierVertexDataFromHandle but a different intrinsic name; presumably the older entry point - confirm.
+
+static __forceinline__ __device__ void optixGetCubicBezierVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                            unsigned int           primIdx,
+                                                                            unsigned int           sbtGASIndex,
+                                                                            float                  time,
+                                                                            float4                 data[4] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to _optix_get_cubic_bezier_vertex_data_from_handle and stores its 16 float results in data[0..3].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_cubic_bezier_vertex_data_from_handle, "
+         "(%16, %17, %18, %19);"  // %16-%19 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}
+
+static __forceinline__ __device__ void optixGetCubicBezierVertexData( float4 data[4] )
+{  // Stores the 16 float results of _optix_get_cubic_bezier_vertex_data_current_hit in data[0..3]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_cubic_bezier_vertex_data_current_hit, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixHitObjectGetCubicBezierVertexData( float4 data[4] )
+{  // Stores the 16 float results of _optix_hitobject_get_cubic_bezier_vertex_data in data[0..3]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_hitobject_get_cubic_bezier_vertex_data, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixGetCubicBezierRocapsVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                      unsigned int           primIdx,
+                                                                                      unsigned int sbtGASIndex,
+                                                                                      float        time,
+                                                                                      float4       data[4] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to the intrinsic named below and stores its 16 float results in data[0..3].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_cubic_bezier_rocaps_vertex_data_from_handle, "
+         "(%16, %17, %18, %19);"  // %16-%19 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}
+
+static __forceinline__ __device__ void optixGetCubicBezierRocapsVertexData( float4 data[4] )
+{  // Stores the 16 float results of the "_current_hit" intrinsic named below in data[0..3]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_get_cubic_bezier_rocaps_vertex_data_current_hit, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixHitObjectGetCubicBezierRocapsVertexData( float4 data[4] )
+{  // Stores the 16 float results of the "_hitobject_" intrinsic named below in data[0..3]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11,  %12, %13, %14, %15), "
+         "_optix_hitobject_get_cubic_bezier_rocaps_vertex_data, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ),  // %0-%4: data[0].xyzw, data[1].x
+           "=f"( data[1].y ), "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ),  // %5-%9: data[1].yzw, data[2].xy
+           "=f"( data[2].z ), "=f"( data[2].w ), "=f"( data[3].x ), "=f"( data[3].y ), "=f"( data[3].z ), "=f"( data[3].w )  // %10-%15: data[2].zw, data[3].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixGetRibbonVertexData( OptixTraversableHandle gas,
+                                                                 unsigned int           primIdx,
+                                                                 unsigned int           sbtGASIndex,
+                                                                 float                  time,
+                                                                 float4                 data[3] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to _optix_get_ribbon_vertex_data and stores its 12 float results in data[0..2].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11), _optix_get_ribbon_vertex_data, "
+         "(%12, %13, %14, %15);"  // %12-%15 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ), "=f"( data[1].y ),  // %0-%5: data[0].xyzw, data[1].xy
+           "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w )  // %6-%11: data[1].zw, data[2].xyzw
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}  // NOTE(review): same operands as optixGetRibbonVertexDataFromHandle but a different intrinsic name; presumably the older entry point - confirm.
+
+static __forceinline__ __device__ void optixGetRibbonVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                           unsigned int           primIdx,
+                                                                           unsigned int           sbtGASIndex,
+                                                                           float                  time,
+                                                                           float4                 data[3] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to _optix_get_ribbon_vertex_data_from_handle and stores its 12 float results in data[0..2].
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11), _optix_get_ribbon_vertex_data_from_handle, "
+         "(%12, %13, %14, %15);"  // %12-%15 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ), "=f"( data[1].y ),  // %0-%5: data[0].xyzw, data[1].xy
+           "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w )  // %6-%11: data[1].zw, data[2].xyzw
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}
+
+static __forceinline__ __device__ void optixGetRibbonVertexData( float4 data[3] )
+{  // Stores the 12 float results of _optix_get_ribbon_vertex_data_current_hit in data[0..2]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11), _optix_get_ribbon_vertex_data_current_hit, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ), "=f"( data[1].y ),  // %0-%5: data[0].xyzw, data[1].xy
+           "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w )  // %6-%11: data[1].zw, data[2].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixHitObjectGetRibbonVertexData( float4 data[3] )
+{  // Stores the 12 float results of _optix_hitobject_get_ribbon_vertex_data in data[0..2]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3,  %4, %5, %6, %7,  %8, %9, %10, %11), _optix_hitobject_get_ribbon_vertex_data, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w ), "=f"( data[1].x ), "=f"( data[1].y ),  // %0-%5: data[0].xyzw, data[1].xy
+           "=f"( data[1].z ), "=f"( data[1].w ), "=f"( data[2].x ), "=f"( data[2].y ), "=f"( data[2].z ), "=f"( data[2].w )  // %6-%11: data[1].zw, data[2].xyzw
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ float3 optixGetRibbonNormal( OptixTraversableHandle gas,
+                                                               unsigned int           primIdx,
+                                                               unsigned int           sbtGASIndex,
+                                                               float                  time,
+                                                               float2                 ribbonParameters )
+{  // Returns the three floats produced by _optix_get_ribbon_normal for the six scalar inputs below, packed as a float3.
+    float3 normal;  // filled component-wise by the asm outputs %0-%2
+    asm( "call (%0, %1, %2), _optix_get_ribbon_normal, "
+         "(%3, %4, %5, %6, %7, %8);"  // %3-%8 are the six inputs listed after the outputs
+         : "=f"( normal.x ), "=f"( normal.y ), "=f"( normal.z )
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time ),  // %3-%6: "l" = 64-bit, "r" = 32-bit, "f" = float operand
+           "f"( ribbonParameters.x ), "f"( ribbonParameters.y )  // %7-%8: the float2 is passed as two separate floats
+         : );  // empty clobber list
+    return normal;
+}  // NOTE(review): same operands as optixGetRibbonNormalFromHandle but a different intrinsic name; presumably the older entry point - confirm.
+
+static __forceinline__ __device__ float3 optixGetRibbonNormalFromHandle( OptixTraversableHandle gas,
+                                                                         unsigned int           primIdx,
+                                                                         unsigned int           sbtGASIndex,
+                                                                         float                  time,
+                                                                         float2                 ribbonParameters )
+{  // Returns the three floats produced by _optix_get_ribbon_normal_from_handle for the six scalar inputs below, packed as a float3.
+    float3 normal;  // filled component-wise by the asm outputs %0-%2
+    asm( "call (%0, %1, %2), _optix_get_ribbon_normal_from_handle, "
+         "(%3, %4, %5, %6, %7, %8);"  // %3-%8 are the six inputs listed after the outputs
+         : "=f"( normal.x ), "=f"( normal.y ), "=f"( normal.z )
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time ),  // %3-%6: "l" = 64-bit, "r" = 32-bit, "f" = float operand
+           "f"( ribbonParameters.x ), "f"( ribbonParameters.y )  // %7-%8: the float2 is passed as two separate floats
+         : );  // empty clobber list
+    return normal;
+}
+
+static __forceinline__ __device__ float3 optixGetRibbonNormal( float2 ribbonParameters )
+{  // Returns the three floats produced by _optix_get_ribbon_normal_current_hit for ribbonParameters, packed as a float3.
+    float3 normal;  // filled component-wise by the asm outputs %0-%2
+    asm( "call (%0, %1, %2), _optix_get_ribbon_normal_current_hit, "
+         "(%3, %4);"  // %3-%4 are the two inputs listed after the outputs
+         : "=f"( normal.x ), "=f"( normal.y ), "=f"( normal.z )
+         : "f"( ribbonParameters.x ), "f"( ribbonParameters.y )  // the float2 is passed as two separate floats
+         : );  // empty clobber list
+    return normal;
+}
+
+static __forceinline__ __device__ float3 optixHitObjectGetRibbonNormal( float2 ribbonParameters )
+{  // Returns the three floats produced by _optix_hitobject_get_ribbon_normal for ribbonParameters, packed as a float3.
+    float3 normal;  // filled component-wise by the asm outputs %0-%2
+    asm( "call (%0, %1, %2), _optix_hitobject_get_ribbon_normal, "
+         "(%3, %4);"  // %3-%4 are the two inputs listed after the outputs
+         : "=f"( normal.x ), "=f"( normal.y ), "=f"( normal.z )
+         : "f"( ribbonParameters.x ), "f"( ribbonParameters.y )  // the float2 is passed as two separate floats
+         : );  // empty clobber list
+    return normal;
+}
+
+static __forceinline__ __device__ void optixGetSphereData( OptixTraversableHandle gas,
+                                                           unsigned int           primIdx,
+                                                           unsigned int           sbtGASIndex,
+                                                           float                  time,
+                                                           float4                 data[1] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to _optix_get_sphere_data and stores its four float results in data[0].
+    asm( "call (%0, %1, %2, %3), "
+         "_optix_get_sphere_data, "
+         "(%4, %5, %6, %7);"  // %4-%7 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w )  // %0-%3 -> data[0]
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}  // NOTE(review): same operands as optixGetSphereDataFromHandle but a different intrinsic name; presumably the older entry point - confirm.
+
+static __forceinline__ __device__ void optixGetSphereDataFromHandle( OptixTraversableHandle gas,
+                                                                     unsigned int           primIdx,
+                                                                     unsigned int           sbtGASIndex,
+                                                                     float                  time,
+                                                                     float4                 data[1] )
+{  // Passes (gas, primIdx, sbtGASIndex, time) to _optix_get_sphere_data_from_handle and stores its four float results in data[0].
+    asm( "call (%0, %1, %2, %3), "
+         "_optix_get_sphere_data_from_handle, "
+         "(%4, %5, %6, %7);"  // %4-%7 are the four inputs listed after the outputs
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w )  // %0-%3 -> data[0]
+         : "l"( gas ), "r"( primIdx ), "r"( sbtGASIndex ), "f"( time )  // "l" = 64-bit, "r" = 32-bit, "f" = float operand
+         : );  // empty clobber list
+}
+
+static __forceinline__ __device__ void optixGetSphereData( float4 data[1] )
+{  // Stores the four float results of _optix_get_sphere_data_current_hit in data[0]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3), "
+         "_optix_get_sphere_data_current_hit, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w )  // %0-%3 -> data[0]
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ void optixHitObjectGetSphereData( float4 data[1] )
+{  // Stores the four float results of _optix_hitobject_get_sphere_data in data[0]; the wrapper passes no arguments.
+    asm( "call (%0, %1, %2, %3), "
+         "_optix_hitobject_get_sphere_data, "
+         "();"  // empty argument list
+         : "=f"( data[0].x ), "=f"( data[0].y ), "=f"( data[0].z ), "=f"( data[0].w )  // %0-%3 -> data[0]
+         : );  // no input operands
+}
+
+static __forceinline__ __device__ OptixTraversableHandle optixGetGASTraversableHandle()
+{  // Returns the value produced by the _optix_get_gas_traversable_handle intrinsic, which takes no arguments.
+    unsigned long long handle;  // receives the raw 64-bit result (constraint "=l")
+    asm( "call (%0), _optix_get_gas_traversable_handle, ();" : "=l"( handle ) : );
+    return (OptixTraversableHandle)handle;  // cast the raw value to the public handle type
+}
+
+static __forceinline__ __device__ float optixGetGASMotionTimeBegin( OptixTraversableHandle handle )
+{  // Returns the float produced by the _optix_get_gas_motion_time_begin intrinsic for `handle`.
+    float f0;  // receives the intrinsic's single float result (constraint "=f")
+    asm( "call (%0), _optix_get_gas_motion_time_begin, (%1);" : "=f"( f0 ) : "l"( handle ) : );  // handle is passed as a 64-bit operand
+    return f0;
+}
+
+static __forceinline__ __device__ float optixGetGASMotionTimeEnd( OptixTraversableHandle handle )
+{  // Returns the float produced by the _optix_get_gas_motion_time_end intrinsic for `handle`.
+    float f0;  // receives the intrinsic's single float result (constraint "=f")
+    asm( "call (%0), _optix_get_gas_motion_time_end, (%1);" : "=f"( f0 ) : "l"( handle ) : );  // handle is passed as a 64-bit operand
+    return f0;
+}
+
+static __forceinline__ __device__ unsigned int optixGetGASMotionStepCount( OptixTraversableHandle handle )
+{  // Returns the 32-bit value produced by the _optix_get_gas_motion_step_count intrinsic for `handle`.
+    unsigned int u0;  // receives the intrinsic's single 32-bit result (constraint "=r")
+    asm( "call (%0), _optix_get_gas_motion_step_count, (%1);" : "=r"( u0 ) : "l"( handle ) : );  // handle is passed as a 64-bit operand
+    return u0;
+}
+
+template<typename HitState>
+static __forceinline__ __device__ void optixGetWorldToObjectTransformMatrix( const HitState& hs, float m[12] )
+{
+    // Rows of the 3x4 result. They start out as the identity, which is what is reported
+    // when the hit state carries no transforms.
+    float4 row0 = { 1.0f, 0.0f, 0.0f, 0.0f };
+    float4 row1 = { 0.0f, 1.0f, 0.0f, 0.0f };
+    float4 row2 = { 0.0f, 0.0f, 1.0f, 0.0f };
+
+    // A non-empty transform list replaces the identity with the world-to-object matrix
+    // computed by optix_impl.
+    if( hs.getTransformListSize() != 0 )
+        optix_impl::optixGetWorldToObjectTransformMatrix( hs, row0, row1, row2 );
+
+    // Flatten row by row into the caller's 12-float array.
+    m[0]  = row0.x;
+    m[1]  = row0.y;
+    m[2]  = row0.z;
+    m[3]  = row0.w;
+    m[4]  = row1.x;
+    m[5]  = row1.y;
+    m[6]  = row1.z;
+    m[7]  = row1.w;
+    m[8]  = row2.x;
+    m[9]  = row2.y;
+    m[10] = row2.z;
+    m[11] = row2.w;
+}
+
+static __forceinline__ __device__ void optixGetWorldToObjectTransformMatrix( float m[12] )
+{  // Convenience overload: runs the templated overload on a default-constructed OptixIncomingHitObject.
+    optixGetWorldToObjectTransformMatrix( OptixIncomingHitObject{}, m );
+}
+
+static __forceinline__ __device__ void optixHitObjectGetWorldToObjectTransformMatrix( float m[12] )
+{  // Runs the templated optixGetWorldToObjectTransformMatrix on a default-constructed OptixOutgoingHitObject.
+    optixGetWorldToObjectTransformMatrix( OptixOutgoingHitObject{}, m );
+}
+
+template<typename HitState>
+static __forceinline__ __device__ void optixGetObjectToWorldTransformMatrix( const HitState& hs, float m[12] )
+{
+    // Rows of the 3x4 result. They start out as the identity, which is what is reported
+    // when the hit state carries no transforms.
+    float4 row0 = { 1.0f, 0.0f, 0.0f, 0.0f };
+    float4 row1 = { 0.0f, 1.0f, 0.0f, 0.0f };
+    float4 row2 = { 0.0f, 0.0f, 1.0f, 0.0f };
+
+    // A non-empty transform list replaces the identity with the object-to-world matrix
+    // computed by optix_impl.
+    if( hs.getTransformListSize() != 0 )
+        optix_impl::optixGetObjectToWorldTransformMatrix( hs, row0, row1, row2 );
+
+    // Flatten row by row into the caller's 12-float array.
+    m[0]  = row0.x;
+    m[1]  = row0.y;
+    m[2]  = row0.z;
+    m[3]  = row0.w;
+    m[4]  = row1.x;
+    m[5]  = row1.y;
+    m[6]  = row1.z;
+    m[7]  = row1.w;
+    m[8]  = row2.x;
+    m[9]  = row2.y;
+    m[10] = row2.z;
+    m[11] = row2.w;
+}
+
+static __forceinline__ __device__ void optixGetObjectToWorldTransformMatrix( float m[12] )
+{  // Convenience overload: runs the templated overload on a default-constructed OptixIncomingHitObject.
+    optixGetObjectToWorldTransformMatrix( OptixIncomingHitObject{}, m );
+}
+
+static __forceinline__ __device__ void optixHitObjectGetObjectToWorldTransformMatrix( float m[12] )
+{  // Runs the templated optixGetObjectToWorldTransformMatrix on a default-constructed OptixOutgoingHitObject.
+    optixGetObjectToWorldTransformMatrix( OptixOutgoingHitObject{}, m );
+}
+
+template<typename HitState>
+static __forceinline__ __device__ float3 optixTransformPointFromWorldToObjectSpace( const HitState& hs, float3 point )
+{
+    // Only a non-empty transform list changes the point: it is then mapped through the
+    // world-to-object matrix assembled by optix_impl. Otherwise it is returned as given.
+    if( hs.getTransformListSize() != 0 )
+    {
+        float4 row0, row1, row2;
+        optix_impl::optixGetWorldToObjectTransformMatrix( hs, row0, row1, row2 );
+        return optix_impl::optixTransformPoint( row0, row1, row2, point );
+    }
+    return point;
+}
+
+static __forceinline__ __device__ float3 optixTransformPointFromWorldToObjectSpace( float3 point )
+{  // Convenience overload: runs the templated overload on a default-constructed OptixIncomingHitObject.
+    return optixTransformPointFromWorldToObjectSpace( OptixIncomingHitObject{}, point );
+}
+
+static __forceinline__ __device__ float3 optixHitObjectTransformPointFromWorldToObjectSpace( float3 point )
+{  // Runs the templated optixTransformPointFromWorldToObjectSpace on a default-constructed OptixOutgoingHitObject.
+    return optixTransformPointFromWorldToObjectSpace( OptixOutgoingHitObject{}, point );
+}
+
+template<typename HitState>
+static __forceinline__ __device__ float3 optixTransformVectorFromWorldToObjectSpace( const HitState& hs, float3 vec )
+{
+    // Only a non-empty transform list changes the vector: it is then mapped through the
+    // world-to-object matrix assembled by optix_impl. Otherwise it is returned as given.
+    if( hs.getTransformListSize() != 0 )
+    {
+        float4 row0, row1, row2;
+        optix_impl::optixGetWorldToObjectTransformMatrix( hs, row0, row1, row2 );
+        return optix_impl::optixTransformVector( row0, row1, row2, vec );
+    }
+    return vec;
+}
+
+static __forceinline__ __device__ float3 optixTransformVectorFromWorldToObjectSpace( float3 vec )
+{  // Convenience overload: runs the templated overload on a default-constructed OptixIncomingHitObject.
+    return optixTransformVectorFromWorldToObjectSpace( OptixIncomingHitObject{}, vec );
+}
+
+static __forceinline__ __device__ float3 optixHitObjectTransformVectorFromWorldToObjectSpace( float3 vec )
+{  // Runs the templated optixTransformVectorFromWorldToObjectSpace on a default-constructed OptixOutgoingHitObject.
+    return optixTransformVectorFromWorldToObjectSpace( OptixOutgoingHitObject{}, vec );
+}
+
+template<typename HitState>
+static __forceinline__ __device__ float3 optixTransformNormalFromWorldToObjectSpace( const HitState& hs, float3 normal )
+{
+    // Only a non-empty transform list changes the normal. Note the matrix direction:
+    // optixTransformNormal is fed the object-to-world matrix, i.e. the inverse of
+    // optixGetWorldToObjectTransformMatrix().
+    if( hs.getTransformListSize() != 0 )
+    {
+        float4 row0, row1, row2;
+        optix_impl::optixGetObjectToWorldTransformMatrix( hs, row0, row1, row2 );
+        return optix_impl::optixTransformNormal( row0, row1, row2, normal );
+    }
+    return normal;
+}
+
+static __forceinline__ __device__ float3 optixTransformNormalFromWorldToObjectSpace( float3 normal )
+{  // Forwards to the templated overload with a default-constructed OptixIncomingHitObject.
+    return optixTransformNormalFromWorldToObjectSpace( OptixIncomingHitObject{}, normal );
+}
+
+static __forceinline__ __device__ float3 optixHitObjectTransformNormalFromWorldToObjectSpace( float3 normal )
+{  // Forwards to the templated overload with a default-constructed OptixOutgoingHitObject.
+    return optixTransformNormalFromWorldToObjectSpace( OptixOutgoingHitObject{}, normal );
+}
+
+template<typename HitState>
+static __forceinline__ __device__ float3 optixTransformPointFromObjectToWorldSpace( const HitState& hs, float3 point )
+{  // Transforms `point` with the object-to-world matrix obtained for hit state `hs`.
+    if( hs.getTransformListSize() == 0 )
+        return point;  // empty transform list: the input is returned unchanged
+
+    float4 m0, m1, m2;  // filled by the matrix getter below, then handed to optixTransformPoint
+    optix_impl::optixGetObjectToWorldTransformMatrix( hs, m0, m1, m2 );
+    return optix_impl::optixTransformPoint( m0, m1, m2, point );
+}
+
+static __forceinline__ __device__ float3 optixTransformPointFromObjectToWorldSpace( float3 point )
+{  // Forwards to the templated overload with a default-constructed OptixIncomingHitObject.
+    return optixTransformPointFromObjectToWorldSpace( OptixIncomingHitObject{}, point );
+}
+
+static __forceinline__ __device__ float3 optixHitObjectTransformPointFromObjectToWorldSpace( float3 point )
+{  // Forwards to the templated overload with a default-constructed OptixOutgoingHitObject.
+    return optixTransformPointFromObjectToWorldSpace( OptixOutgoingHitObject{}, point );
+}
+
+template<typename HitState>
+static __forceinline__ __device__ float3 optixTransformVectorFromObjectToWorldSpace( const HitState& hs, float3 vec )
+{  // Transforms `vec` with the object-to-world matrix obtained for hit state `hs`.
+    if( hs.getTransformListSize() == 0 )
+        return vec;  // empty transform list: the input is returned unchanged
+
+    float4 m0, m1, m2;  // filled by the matrix getter below, then handed to optixTransformVector
+    optix_impl::optixGetObjectToWorldTransformMatrix( hs, m0, m1, m2 );
+    return optix_impl::optixTransformVector( m0, m1, m2, vec );
+}
+
+static __forceinline__ __device__ float3 optixTransformVectorFromObjectToWorldSpace( float3 vec )
+{  // Forwards to the templated overload with a default-constructed OptixIncomingHitObject.
+    return optixTransformVectorFromObjectToWorldSpace( OptixIncomingHitObject{}, vec );
+}
+
+static __forceinline__ __device__ float3 optixHitObjectTransformVectorFromObjectToWorldSpace( float3 vec )
+{  // Forwards to the templated overload with a default-constructed OptixOutgoingHitObject.
+    return optixTransformVectorFromObjectToWorldSpace( OptixOutgoingHitObject{}, vec );
+}
+
+template<typename HitState>
+static __forceinline__ __device__ float3 optixTransformNormalFromObjectToWorldSpace( const HitState& hs, float3 normal )
+{  // Transforms `normal` via optixTransformNormal, which is fed the world-to-object matrix of `hs`.
+    if( hs.getTransformListSize() == 0 )
+        return normal;  // empty transform list: the input is returned unchanged
+
+    float4 m0, m1, m2;  // NOTE(review): the opposite-direction matrix looks intentional (normals presumably use the inverse-transpose) - confirm in optix_impl::optixTransformNormal
+    optix_impl::optixGetWorldToObjectTransformMatrix( hs, m0, m1, m2 );  // inverse of optixGetObjectToWorldTransformMatrix()
+    return optix_impl::optixTransformNormal( m0, m1, m2, normal );
+}
+
+static __forceinline__ __device__ float3 optixTransformNormalFromObjectToWorldSpace( float3 normal )
+{  // Forwards to the templated overload with a default-constructed OptixIncomingHitObject.
+    return optixTransformNormalFromObjectToWorldSpace( OptixIncomingHitObject{}, normal );
+}
+
+static __forceinline__ __device__ float3 optixHitObjectTransformNormalFromObjectToWorldSpace( float3 normal )
+{  // Forwards to the templated overload with a default-constructed OptixOutgoingHitObject.
+    return optixTransformNormalFromObjectToWorldSpace( OptixOutgoingHitObject{}, normal );
+}
+
+static __forceinline__ __device__ unsigned int optixGetTransformListSize()
+{  // Calls the argument-less _optix_get_transform_list_size intrinsic and returns its result.
+    unsigned int u0;
+    asm( "call (%0), _optix_get_transform_list_size, ();" : "=r"( u0 ) : );
+    return u0;
+}
+
+static __forceinline__ __device__ OptixTraversableHandle optixGetTransformListHandle( unsigned int index )
+{  // Passes `index` to _optix_get_transform_list_handle and returns the 64-bit value it yields.
+    unsigned long long u0;
+    asm( "call (%0), _optix_get_transform_list_handle, (%1);" : "=l"( u0 ) : "r"( index ) : );
+    return u0;
+}
+
+static __forceinline__ __device__ OptixTransformType optixGetTransformTypeFromHandle( OptixTraversableHandle handle )
+{  // Passes `handle` to _optix_get_transform_type_from_handle; the int result is cast to OptixTransformType.
+    int i0;
+    asm( "call (%0), _optix_get_transform_type_from_handle, (%1);" : "=r"( i0 ) : "l"( handle ) : );
+    return (OptixTransformType)i0;
+}
+
+static __forceinline__ __device__ const OptixStaticTransform* optixGetStaticTransformFromHandle( OptixTraversableHandle handle )
+{  // Passes `handle` to _optix_get_static_transform_from_handle; the 64-bit result is cast to a pointer.
+    unsigned long long ptr;
+    asm( "call (%0), _optix_get_static_transform_from_handle, (%1);" : "=l"( ptr ) : "l"( handle ) : );
+    return (const OptixStaticTransform*)ptr;
+}
+
+static __forceinline__ __device__ const OptixSRTMotionTransform* optixGetSRTMotionTransformFromHandle( OptixTraversableHandle handle )
+{  // Passes `handle` to _optix_get_srt_motion_transform_from_handle; the 64-bit result is cast to a pointer.
+    unsigned long long ptr;
+    asm( "call (%0), _optix_get_srt_motion_transform_from_handle, (%1);" : "=l"( ptr ) : "l"( handle ) : );
+    return (const OptixSRTMotionTransform*)ptr;
+}
+
+static __forceinline__ __device__ const OptixMatrixMotionTransform* optixGetMatrixMotionTransformFromHandle( OptixTraversableHandle handle )
+{  // Passes `handle` to _optix_get_matrix_motion_transform_from_handle; the 64-bit result is cast to a pointer.
+    unsigned long long ptr;
+    asm( "call (%0), _optix_get_matrix_motion_transform_from_handle, (%1);" : "=l"( ptr ) : "l"( handle ) : );
+    return (const OptixMatrixMotionTransform*)ptr;
+}
+
+static __forceinline__ __device__ unsigned int optixGetInstanceIdFromHandle( OptixTraversableHandle handle )
+{  // Passes `handle` to _optix_get_instance_id_from_handle and returns the result.
+    int i0;  // read as int, implicitly converted to unsigned int on return
+    asm( "call (%0), _optix_get_instance_id_from_handle, (%1);" : "=r"( i0 ) : "l"( handle ) : );
+    return i0;
+}
+
+static __forceinline__ __device__ OptixTraversableHandle optixGetInstanceChildFromHandle( OptixTraversableHandle handle )
+{  // Passes `handle` to _optix_get_instance_child_from_handle; the 64-bit result is returned as a handle.
+    unsigned long long i0;
+    asm( "call (%0), _optix_get_instance_child_from_handle, (%1);" : "=l"( i0 ) : "l"( handle ) : );
+    return (OptixTraversableHandle)i0;
+}
+
+static __forceinline__ __device__ const float4* optixGetInstanceTransformFromHandle( OptixTraversableHandle handle )
+{  // Passes `handle` to _optix_get_instance_transform_from_handle; the 64-bit result is cast to const float4*.
+    unsigned long long ptr;
+    asm( "call (%0), _optix_get_instance_transform_from_handle, (%1);" : "=l"( ptr ) : "l"( handle ) : );
+    return (const float4*)ptr;
+}
+
+static __forceinline__ __device__ const float4* optixGetInstanceInverseTransformFromHandle( OptixTraversableHandle handle )
+{  // Passes `handle` to _optix_get_instance_inverse_transform_from_handle; the 64-bit result is cast to const float4*.
+    unsigned long long ptr;
+    asm( "call (%0), _optix_get_instance_inverse_transform_from_handle, (%1);" : "=l"( ptr ) : "l"( handle ) : );
+    return (const float4*)ptr;
+}
+
+static __device__ __forceinline__ CUdeviceptr optixGetGASPointerFromHandle( OptixTraversableHandle handle )
+{  // Passes `handle` to _optix_get_gas_ptr_from_handle; the 64-bit result is returned as a CUdeviceptr.
+    unsigned long long ptr;
+    asm( "call (%0), _optix_get_gas_ptr_from_handle, (%1);" : "=l"( ptr ) : "l"( handle ) : );
+    return (CUdeviceptr)ptr;
+}
+static __forceinline__ __device__ bool optixReportIntersection( float hitT, unsigned int hitKind )
+{  // Calls _optix_report_intersection_0 with hitT and hitKind (no attribute values).
+    int ret;  // int result of the intrinsic, converted to bool on return
+    asm volatile(
+        "call (%0), _optix_report_intersection_0"
+        ", (%1, %2);"
+        : "=r"( ret )
+        : "f"( hitT ), "r"( hitKind )
+        : );
+    return ret;
+}
+
+static __forceinline__ __device__ bool optixReportIntersection( float hitT, unsigned int hitKind, unsigned int a0 )
+{  // Calls _optix_report_intersection_1 with hitT, hitKind and one extra 32-bit value (a0).
+    int ret;  // int result of the intrinsic, converted to bool on return
+    asm volatile(
+        "call (%0), _optix_report_intersection_1"
+        ", (%1, %2, %3);"
+        : "=r"( ret )
+        : "f"( hitT ), "r"( hitKind ), "r"( a0 )
+        : );
+    return ret;
+}
+
+static __forceinline__ __device__ bool optixReportIntersection( float hitT, unsigned int hitKind, unsigned int a0, unsigned int a1 )
+{  // Calls _optix_report_intersection_2 with hitT, hitKind and two extra 32-bit values (a0, a1).
+    int ret;  // int result of the intrinsic, converted to bool on return
+    asm volatile(
+        "call (%0), _optix_report_intersection_2"
+        ", (%1, %2, %3, %4);"
+        : "=r"( ret )
+        : "f"( hitT ), "r"( hitKind ), "r"( a0 ), "r"( a1 )
+        : );
+    return ret;
+}
+
+static __forceinline__ __device__ bool optixReportIntersection( float hitT, unsigned int hitKind, unsigned int a0, unsigned int a1, unsigned int a2 )
+{  // Calls _optix_report_intersection_3 with hitT, hitKind and three extra 32-bit values (a0..a2).
+    int ret;  // int result of the intrinsic, converted to bool on return
+    asm volatile(
+        "call (%0), _optix_report_intersection_3"
+        ", (%1, %2, %3, %4, %5);"
+        : "=r"( ret )
+        : "f"( hitT ), "r"( hitKind ), "r"( a0 ), "r"( a1 ), "r"( a2 )
+        : );
+    return ret;
+}
+
+static __forceinline__ __device__ bool optixReportIntersection( float        hitT,
+                                                                unsigned int hitKind,
+                                                                unsigned int a0,
+                                                                unsigned int a1,
+                                                                unsigned int a2,
+                                                                unsigned int a3 )
+{  // Calls _optix_report_intersection_4 with hitT, hitKind and four extra 32-bit values (a0..a3).
+    int ret;  // int result of the intrinsic, converted to bool on return
+    asm volatile(
+        "call (%0), _optix_report_intersection_4"
+        ", (%1, %2, %3, %4, %5, %6);"
+        : "=r"( ret )
+        : "f"( hitT ), "r"( hitKind ), "r"( a0 ), "r"( a1 ), "r"( a2 ), "r"( a3 )
+        : );
+    return ret;
+}
+
+static __forceinline__ __device__ bool optixReportIntersection( float        hitT,
+                                                                unsigned int hitKind,
+                                                                unsigned int a0,
+                                                                unsigned int a1,
+                                                                unsigned int a2,
+                                                                unsigned int a3,
+                                                                unsigned int a4 )
+{  // Calls _optix_report_intersection_5 with hitT, hitKind and five extra 32-bit values (a0..a4).
+    int ret;  // int result of the intrinsic, converted to bool on return
+    asm volatile(
+        "call (%0), _optix_report_intersection_5"
+        ", (%1, %2, %3, %4, %5, %6, %7);"
+        : "=r"( ret )
+        : "f"( hitT ), "r"( hitKind ), "r"( a0 ), "r"( a1 ), "r"( a2 ), "r"( a3 ), "r"( a4 )
+        : );
+    return ret;
+}
+
+static __forceinline__ __device__ bool optixReportIntersection( float        hitT,
+                                                                unsigned int hitKind,
+                                                                unsigned int a0,
+                                                                unsigned int a1,
+                                                                unsigned int a2,
+                                                                unsigned int a3,
+                                                                unsigned int a4,
+                                                                unsigned int a5 )
+{  // Calls _optix_report_intersection_6 with hitT, hitKind and six extra 32-bit values (a0..a5).
+    int ret;  // int result of the intrinsic, converted to bool on return
+    asm volatile(
+        "call (%0), _optix_report_intersection_6"
+        ", (%1, %2, %3, %4, %5, %6, %7, %8);"
+        : "=r"( ret )
+        : "f"( hitT ), "r"( hitKind ), "r"( a0 ), "r"( a1 ), "r"( a2 ), "r"( a3 ), "r"( a4 ), "r"( a5 )
+        : );
+    return ret;
+}
+
+static __forceinline__ __device__ bool optixReportIntersection( float        hitT,
+                                                                unsigned int hitKind,
+                                                                unsigned int a0,
+                                                                unsigned int a1,
+                                                                unsigned int a2,
+                                                                unsigned int a3,
+                                                                unsigned int a4,
+                                                                unsigned int a5,
+                                                                unsigned int a6 )
+{  // Calls _optix_report_intersection_7 with hitT, hitKind and seven extra 32-bit values (a0..a6).
+    int ret;  // int result of the intrinsic, converted to bool on return
+    asm volatile(
+        "call (%0), _optix_report_intersection_7"
+        ", (%1, %2, %3, %4, %5, %6, %7, %8, %9);"
+        : "=r"( ret )
+        : "f"( hitT ), "r"( hitKind ), "r"( a0 ), "r"( a1 ), "r"( a2 ), "r"( a3 ), "r"( a4 ), "r"( a5 ), "r"( a6 )
+        : );
+    return ret;
+}
+
+static __forceinline__ __device__ bool optixReportIntersection( float        hitT,
+                                                                unsigned int hitKind,
+                                                                unsigned int a0,
+                                                                unsigned int a1,
+                                                                unsigned int a2,
+                                                                unsigned int a3,
+                                                                unsigned int a4,
+                                                                unsigned int a5,
+                                                                unsigned int a6,
+                                                                unsigned int a7 )
+{  // Calls _optix_report_intersection_8 with hitT, hitKind and eight extra 32-bit values (a0..a7).
+    int ret;  // int result of the intrinsic, converted to bool on return
+    asm volatile(
+        "call (%0), _optix_report_intersection_8"
+        ", (%1, %2, %3, %4, %5, %6, %7, %8, %9, %10);"
+        : "=r"( ret )
+        : "f"( hitT ), "r"( hitKind ), "r"( a0 ), "r"( a1 ), "r"( a2 ), "r"( a3 ), "r"( a4 ), "r"( a5 ), "r"( a6 ), "r"( a7 )
+        : );
+    return ret;
+}
+
+#define OPTIX_DEFINE_optixGetAttribute_BODY( which )                                                                   \
+    unsigned int ret;                                                                                                  \
+    asm( "call (%0), _optix_get_attribute_" #which ", ();" : "=r"( ret ) : );                                          \
+    return ret;
+
+static __forceinline__ __device__ unsigned int optixGetAttribute_0()
+{  // Each optixGetAttribute_N expands the body macro: it calls _optix_get_attribute_N and returns the result.
+    OPTIX_DEFINE_optixGetAttribute_BODY( 0 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetAttribute_1()
+{
+    OPTIX_DEFINE_optixGetAttribute_BODY( 1 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetAttribute_2()
+{
+    OPTIX_DEFINE_optixGetAttribute_BODY( 2 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetAttribute_3()
+{
+    OPTIX_DEFINE_optixGetAttribute_BODY( 3 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetAttribute_4()
+{
+    OPTIX_DEFINE_optixGetAttribute_BODY( 4 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetAttribute_5()
+{
+    OPTIX_DEFINE_optixGetAttribute_BODY( 5 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetAttribute_6()
+{
+    OPTIX_DEFINE_optixGetAttribute_BODY( 6 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetAttribute_7()
+{
+    OPTIX_DEFINE_optixGetAttribute_BODY( 7 );
+}
+
+#undef OPTIX_DEFINE_optixGetAttribute_BODY  // the body macro is only needed for the eight getters above
+
+static __forceinline__ __device__ void optixTerminateRay()
+{  // Calls the _optix_terminate_ray intrinsic; it takes no arguments and yields no value.
+    asm volatile( "call _optix_terminate_ray, ();" );
+}
+
+static __forceinline__ __device__ void optixIgnoreIntersection()
+{  // Calls the _optix_ignore_intersection intrinsic; it takes no arguments and yields no value.
+    asm volatile( "call _optix_ignore_intersection, ();" );
+}
+
+static __forceinline__ __device__ unsigned int optixGetPrimitiveIndex()
+{  // Returns the value produced by the _optix_read_primitive_idx intrinsic.
+    unsigned int u0;
+    asm( "call (%0), _optix_read_primitive_idx, ();" : "=r"( u0 ) : );
+    return u0;
+}
+
+static __forceinline__ __device__ unsigned int optixGetClusterId()
+{  // Returns the value produced by the _optix_get_cluster_id intrinsic.
+    unsigned int u0;
+    asm( "call (%0), _optix_get_cluster_id, ();" : "=r"( u0 ) : );
+    return u0;
+}
+
+static __forceinline__ __device__ unsigned int optixHitObjectGetClusterId()
+{  // Returns the value produced by the _optix_hitobject_get_cluster_id intrinsic.
+    unsigned int u0;
+    asm( "call (%0), _optix_hitobject_get_cluster_id, ();" : "=r"( u0 ) : );
+    return u0;
+}
+
+static __forceinline__ __device__ unsigned int optixGetSbtGASIndex()
+{  // Returns the value produced by the _optix_read_sbt_gas_idx intrinsic.
+    unsigned int u0;
+    asm( "call (%0), _optix_read_sbt_gas_idx, ();" : "=r"( u0 ) : );
+    return u0;
+}
+
+static __forceinline__ __device__ unsigned int optixGetInstanceId()
+{  // Returns the value produced by the _optix_read_instance_id intrinsic.
+    unsigned int u0;
+    asm( "call (%0), _optix_read_instance_id, ();" : "=r"( u0 ) : );
+    return u0;
+}
+
+static __forceinline__ __device__ unsigned int optixGetInstanceIndex()
+{  // Returns the value produced by the _optix_read_instance_idx intrinsic.
+    unsigned int u0;
+    asm( "call (%0), _optix_read_instance_idx, ();" : "=r"( u0 ) : );
+    return u0;
+}
+
+static __forceinline__ __device__ unsigned int optixGetHitKind()
+{  // Returns the value produced by the _optix_get_hit_kind intrinsic.
+    unsigned int u0;
+    asm( "call (%0), _optix_get_hit_kind, ();" : "=r"( u0 ) : );
+    return u0;
+}
+
+static __forceinline__ __device__ OptixPrimitiveType optixGetPrimitiveType(unsigned int hitKind)
+{  // Passes `hitKind` to _optix_get_primitive_type_from_hit_kind; the result is cast to OptixPrimitiveType.
+    unsigned int u0;
+    asm( "call (%0), _optix_get_primitive_type_from_hit_kind, (%1);" : "=r"( u0 ) : "r"( hitKind ) );
+    return (OptixPrimitiveType)u0;
+}
+
+static __forceinline__ __device__ bool optixIsBackFaceHit( unsigned int hitKind )
+{  // Passes `hitKind` to _optix_get_backface_from_hit_kind and reports whether the result equals 1.
+    unsigned int u0;
+    asm( "call (%0), _optix_get_backface_from_hit_kind, (%1);" : "=r"( u0 ) : "r"( hitKind ) );
+    return (u0 == 0x1);  // any value other than exactly 1 yields false
+}
+
+static __forceinline__ __device__ bool optixIsFrontFaceHit( unsigned int hitKind )
+{  // Defined as the logical negation of optixIsBackFaceHit( hitKind ).
+    return !optixIsBackFaceHit( hitKind );
+}
+
+
+static __forceinline__ __device__ OptixPrimitiveType optixGetPrimitiveType()
+{  // Argument-less form: applies the hit-kind overload to optixGetHitKind().
+    return optixGetPrimitiveType( optixGetHitKind() );
+}
+
+static __forceinline__ __device__ bool optixIsBackFaceHit()
+{  // Argument-less form: applies the hit-kind overload to optixGetHitKind().
+    return optixIsBackFaceHit( optixGetHitKind() );
+}
+
+static __forceinline__ __device__ bool optixIsFrontFaceHit()
+{  // Argument-less form: applies the hit-kind overload to optixGetHitKind().
+    return optixIsFrontFaceHit( optixGetHitKind() );
+}
+
+static __forceinline__ __device__ bool optixIsTriangleHit()
+{  // True when either triangle face test succeeds; both callees are defined further down, so they must be declared before this point (not visible here).
+    return optixIsTriangleFrontFaceHit() || optixIsTriangleBackFaceHit();
+}
+
+static __forceinline__ __device__ bool optixIsTriangleFrontFaceHit()
+{  // Compares optixGetHitKind() against OPTIX_HIT_KIND_TRIANGLE_FRONT_FACE.
+    return optixGetHitKind() == OPTIX_HIT_KIND_TRIANGLE_FRONT_FACE;
+}
+
+static __forceinline__ __device__ bool optixIsTriangleBackFaceHit()
+{  // Compares optixGetHitKind() against OPTIX_HIT_KIND_TRIANGLE_BACK_FACE.
+    return optixGetHitKind() == OPTIX_HIT_KIND_TRIANGLE_BACK_FACE;
+}
+
+
+static __forceinline__ __device__ float optixGetCurveParameter()
+{  // Returns the single float produced by the _optix_get_curve_parameter intrinsic.
+    float f0;
+    asm( "call (%0), _optix_get_curve_parameter, ();" : "=f"( f0 ) : );
+    return f0;
+}
+
+static __forceinline__ __device__ float optixHitObjectGetCurveParameter()
+{  // Returns the single float produced by the _optix_hitobject_get_curve_parameter intrinsic.
+    float f0;
+    asm( "call (%0), _optix_hitobject_get_curve_parameter, ();" : "=f"( f0 ) : );
+    return f0;
+}
+
+static __forceinline__ __device__ float2 optixGetRibbonParameters()
+{  // _optix_get_ribbon_parameters yields two floats in one call; they are packed into a float2 in output order.
+    float f0, f1;
+    asm( "call (%0, %1), _optix_get_ribbon_parameters, ();" : "=f"( f0 ), "=f"( f1 ) : );
+    return make_float2( f0, f1 );
+}
+
+static __forceinline__ __device__ float2 optixHitObjectGetRibbonParameters()
+{  // _optix_hitobject_get_ribbon_parameters yields two floats in one call; they are packed into a float2 in output order.
+    float f0, f1;
+    asm( "call (%0, %1), _optix_hitobject_get_ribbon_parameters, ();" : "=f"( f0 ), "=f"( f1 ) : );
+    return make_float2( f0, f1 );
+}
+
+static __forceinline__ __device__ float2 optixGetTriangleBarycentrics()
+{  // _optix_get_triangle_barycentrics yields two floats in one call; they are packed into a float2 in output order.
+    float f0, f1;
+    asm( "call (%0, %1), _optix_get_triangle_barycentrics, ();" : "=f"( f0 ), "=f"( f1 ) : );
+    return make_float2( f0, f1 );
+}
+
+static __forceinline__ __device__ float2 optixHitObjectGetTriangleBarycentrics()
+{  // _optix_hitobject_get_triangle_barycentrics yields two floats in one call; they are packed into a float2 in output order.
+    float f0, f1;
+    asm( "call (%0, %1), _optix_hitobject_get_triangle_barycentrics, ();" : "=f"( f0 ), "=f"( f1 ) : );
+    return make_float2( f0, f1 );
+}
+
+static __forceinline__ __device__ uint3 optixGetLaunchIndex()
+{  // Reads the x, y and z launch-index components through three separate intrinsic calls.
+    unsigned int u0, u1, u2;  // x, y, z respectively
+    asm( "call (%0), _optix_get_launch_index_x, ();" : "=r"( u0 ) : );
+    asm( "call (%0), _optix_get_launch_index_y, ();" : "=r"( u1 ) : );
+    asm( "call (%0), _optix_get_launch_index_z, ();" : "=r"( u2 ) : );
+    return make_uint3( u0, u1, u2 );
+}
+
+static __forceinline__ __device__ uint3 optixGetLaunchDimensions()
+{  // Reads the x, y and z launch-dimension components through three separate intrinsic calls.
+    unsigned int u0, u1, u2;  // x, y, z respectively
+    asm( "call (%0), _optix_get_launch_dimension_x, ();" : "=r"( u0 ) : );
+    asm( "call (%0), _optix_get_launch_dimension_y, ();" : "=r"( u1 ) : );
+    asm( "call (%0), _optix_get_launch_dimension_z, ();" : "=r"( u2 ) : );
+    return make_uint3( u0, u1, u2 );
+}
+
+static __forceinline__ __device__ CUdeviceptr optixGetSbtDataPointer()
+{  // Returns the 64-bit value produced by _optix_get_sbt_data_ptr_64, cast to CUdeviceptr.
+    unsigned long long ptr;
+    asm( "call (%0), _optix_get_sbt_data_ptr_64, ();" : "=l"( ptr ) : );
+    return (CUdeviceptr)ptr;
+}
+
+static __forceinline__ __device__ void optixThrowException( int exceptionCode )
+{  // Calls _optix_throw_exception_0 with the exception code only (no detail values).
+    asm volatile(
+        "call _optix_throw_exception_0, (%0);"
+        : /* no return value */
+        : "r"( exceptionCode )
+        : );
+}
+
+static __forceinline__ __device__ void optixThrowException( int exceptionCode, unsigned int exceptionDetail0 )
+{  // Calls _optix_throw_exception_1 with the exception code and one detail value.
+    asm volatile(
+        "call _optix_throw_exception_1, (%0, %1);"
+        : /* no return value */
+        : "r"( exceptionCode ), "r"( exceptionDetail0 )
+        : );
+}
+
+static __forceinline__ __device__ void optixThrowException( int exceptionCode, unsigned int exceptionDetail0, unsigned int exceptionDetail1 )
+{  // Calls _optix_throw_exception_2 with the exception code and two detail values.
+    asm volatile(
+        "call _optix_throw_exception_2, (%0, %1, %2);"
+        : /* no return value */
+        : "r"( exceptionCode ), "r"( exceptionDetail0 ), "r"( exceptionDetail1 )
+        : );
+}
+
+static __forceinline__ __device__ void optixThrowException( int exceptionCode, unsigned int exceptionDetail0, unsigned int exceptionDetail1, unsigned int exceptionDetail2 )
+{  // Calls _optix_throw_exception_3 with the exception code and three detail values.
+    asm volatile(
+        "call _optix_throw_exception_3, (%0, %1, %2, %3);"
+        : /* no return value */
+        : "r"( exceptionCode ), "r"( exceptionDetail0 ), "r"( exceptionDetail1 ), "r"( exceptionDetail2 )
+        : );
+}
+
+static __forceinline__ __device__ void optixThrowException( int exceptionCode, unsigned int exceptionDetail0, unsigned int exceptionDetail1, unsigned int exceptionDetail2, unsigned int exceptionDetail3 )
+{  // Calls _optix_throw_exception_4 with the exception code and four detail values.
+    asm volatile(
+        "call _optix_throw_exception_4, (%0, %1, %2, %3, %4);"
+        : /* no return value */
+        : "r"( exceptionCode ), "r"( exceptionDetail0 ), "r"( exceptionDetail1 ), "r"( exceptionDetail2 ), "r"( exceptionDetail3 )
+        : );
+}
+
+static __forceinline__ __device__ void optixThrowException( int exceptionCode, unsigned int exceptionDetail0, unsigned int exceptionDetail1, unsigned int exceptionDetail2, unsigned int exceptionDetail3, unsigned int exceptionDetail4 )
+{  // Calls _optix_throw_exception_5 with the exception code and five detail values.
+    asm volatile(
+        "call _optix_throw_exception_5, (%0, %1, %2, %3, %4, %5);"
+        : /* no return value */
+        : "r"( exceptionCode ), "r"( exceptionDetail0 ), "r"( exceptionDetail1 ), "r"( exceptionDetail2 ), "r"( exceptionDetail3 ), "r"( exceptionDetail4 )
+        : );
+}
+
+static __forceinline__ __device__ void optixThrowException( int exceptionCode, unsigned int exceptionDetail0, unsigned int exceptionDetail1, unsigned int exceptionDetail2, unsigned int exceptionDetail3, unsigned int exceptionDetail4, unsigned int exceptionDetail5 )
+{  // Calls _optix_throw_exception_6 with the exception code and six detail values.
+    asm volatile(
+        "call _optix_throw_exception_6, (%0, %1, %2, %3, %4, %5, %6);"
+        : /* no return value */
+        : "r"( exceptionCode ), "r"( exceptionDetail0 ), "r"( exceptionDetail1 ), "r"( exceptionDetail2 ), "r"( exceptionDetail3 ), "r"( exceptionDetail4 ), "r"( exceptionDetail5 )
+        : );
+}
+
+static __forceinline__ __device__ void optixThrowException( int exceptionCode, unsigned int exceptionDetail0, unsigned int exceptionDetail1, unsigned int exceptionDetail2, unsigned int exceptionDetail3, unsigned int exceptionDetail4, unsigned int exceptionDetail5, unsigned int exceptionDetail6 )
+{  // Calls _optix_throw_exception_7 with the exception code and seven detail values.
+    asm volatile(
+        "call _optix_throw_exception_7, (%0, %1, %2, %3, %4, %5, %6, %7);"
+        : /* no return value */
+        : "r"( exceptionCode ), "r"( exceptionDetail0 ), "r"( exceptionDetail1 ), "r"( exceptionDetail2 ), "r"( exceptionDetail3 ), "r"( exceptionDetail4 ), "r"( exceptionDetail5 ), "r"( exceptionDetail6 )
+        : );
+}
+
+static __forceinline__ __device__ void optixThrowException( int exceptionCode, unsigned int exceptionDetail0, unsigned int exceptionDetail1, unsigned int exceptionDetail2, unsigned int exceptionDetail3, unsigned int exceptionDetail4, unsigned int exceptionDetail5, unsigned int exceptionDetail6, unsigned int exceptionDetail7 )
+{  // Calls _optix_throw_exception_8 with the exception code and eight detail values.
+    asm volatile(
+        "call _optix_throw_exception_8, (%0, %1, %2, %3, %4, %5, %6, %7, %8);"
+        : /* no return value */
+        : "r"( exceptionCode ), "r"( exceptionDetail0 ), "r"( exceptionDetail1 ), "r"( exceptionDetail2 ), "r"( exceptionDetail3 ), "r"( exceptionDetail4 ), "r"( exceptionDetail5 ), "r"( exceptionDetail6 ), "r"( exceptionDetail7 )
+        : );
+}
+
+static __forceinline__ __device__ int optixGetExceptionCode()
+{  // Returns the signed value produced by the _optix_get_exception_code intrinsic.
+    int s0;
+    asm( "call (%0), _optix_get_exception_code, ();" : "=r"( s0 ) : );
+    return s0;
+}
+
+#define OPTIX_DEFINE_optixGetExceptionDetail_BODY( which )                                                             \
+    unsigned int ret;                                                                                                  \
+    asm( "call (%0), _optix_get_exception_detail_" #which ", ();" : "=r"( ret ) : );                                   \
+    return ret;
+
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_0()
+{  // Each optixGetExceptionDetail_N expands the body macro: it calls _optix_get_exception_detail_N and returns the result.
+    OPTIX_DEFINE_optixGetExceptionDetail_BODY( 0 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_1()
+{
+    OPTIX_DEFINE_optixGetExceptionDetail_BODY( 1 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_2()
+{
+    OPTIX_DEFINE_optixGetExceptionDetail_BODY( 2 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_3()
+{
+    OPTIX_DEFINE_optixGetExceptionDetail_BODY( 3 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_4()
+{
+    OPTIX_DEFINE_optixGetExceptionDetail_BODY( 4 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_5()
+{
+    OPTIX_DEFINE_optixGetExceptionDetail_BODY( 5 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_6()
+{
+    OPTIX_DEFINE_optixGetExceptionDetail_BODY( 6 );
+}
+
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_7()
+{
+    OPTIX_DEFINE_optixGetExceptionDetail_BODY( 7 );
+}
+
+#undef OPTIX_DEFINE_optixGetExceptionDetail_BODY  // the body macro is only needed for the eight getters above
+
+
+static __forceinline__ __device__ char* optixGetExceptionLineInfo()
+{  // Returns the 64-bit value produced by _optix_get_exception_line_info, cast to char*.
+    unsigned long long ptr;
+    asm( "call (%0), _optix_get_exception_line_info, ();" : "=l"(ptr) : );
+    return (char*)ptr;
+}
+
+template <typename ReturnT, typename... ArgTypes>
+static __forceinline__ __device__ ReturnT optixDirectCall( unsigned int sbtIndex, ArgTypes... args )
+{  // Resolves `sbtIndex` to a callable address via _optix_call_direct_callable, then invokes it with `args`.
+    unsigned long long func;  // 64-bit value yielded by the intrinsic, used below as a function address
+    asm( "call (%0), _optix_call_direct_callable,(%1);" : "=l"( func ) : "r"( sbtIndex ) : );
+    using funcT = ReturnT ( * )( ArgTypes... );  // the signature comes solely from the caller-supplied template arguments
+    funcT call  = ( funcT )( func );
+    return call( args... );
+}
+
+template <typename ReturnT, typename... ArgTypes>
+static __forceinline__ __device__ ReturnT optixContinuationCall( unsigned int sbtIndex, ArgTypes... args )
+{  // Resolves `sbtIndex` to a callable address via _optix_call_continuation_callable, then invokes it with `args`.
+    unsigned long long func;  // 64-bit value yielded by the intrinsic, used below as a function address
+    asm( "call (%0), _optix_call_continuation_callable,(%1);" : "=l"( func ) : "r"( sbtIndex ) : );
+    using funcT = ReturnT ( * )( ArgTypes... );  // the signature comes solely from the caller-supplied template arguments
+    funcT call  = ( funcT )( func );
+    return call( args... );
+}
+
+static __forceinline__ __device__ uint4 optixTexFootprint2D( unsigned long long tex, unsigned int texInfo, float x, float y, unsigned int* singleMipLevel )
+{  // Calls _optix_tex_footprint_2d_v2; the addresses of the local `result` and of `singleMipLevel` are passed as 64-bit integers.
+    uint4              result;  // returned to the caller; presumably written by the intrinsic through resultPtr - confirm
+    unsigned long long resultPtr         = reinterpret_cast<unsigned long long>( &result );
+    unsigned long long singleMipLevelPtr = reinterpret_cast<unsigned long long>( singleMipLevel );
+    // Cast float args to integers, because the intrinsics take .b32 arguments when compiled to PTX.
+    asm volatile(
+        "call _optix_tex_footprint_2d_v2"
+        ", (%0, %1, %2, %3, %4, %5);"
+        :
+        : "l"( tex ), "r"( texInfo ), "r"( __float_as_uint( x ) ), "r"( __float_as_uint( y ) ),
+          "l"( singleMipLevelPtr ), "l"( resultPtr )
+        : );
+    return result;
+}
+
+static __forceinline__ __device__ uint4 optixTexFootprint2DGrad( unsigned long long tex,
+                                                                 unsigned int       texInfo,
+                                                                 float              x,
+                                                                 float              y,
+                                                                 float              dPdx_x,
+                                                                 float              dPdx_y,
+                                                                 float              dPdy_x,
+                                                                 float              dPdy_y,
+                                                                 bool               coarse,
+                                                                 unsigned int*      singleMipLevel )
+{  // Calls _optix_tex_footprint_2d_grad_v2; the addresses of the local `result` and of `singleMipLevel` are passed as 64-bit integers.
+    uint4              result;  // returned to the caller; presumably written by the intrinsic through resultPtr - confirm
+    unsigned long long resultPtr         = reinterpret_cast<unsigned long long>( &result );
+    unsigned long long singleMipLevelPtr = reinterpret_cast<unsigned long long>( singleMipLevel );
+    // Cast float args to integers, because the intrinsics take .b32 arguments when compiled to PTX.
+    asm volatile(
+        "call _optix_tex_footprint_2d_grad_v2"
+        ", (%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10);"
+        :
+        : "l"( tex ), "r"( texInfo ), "r"( __float_as_uint( x ) ), "r"( __float_as_uint( y ) ),
+          "r"( __float_as_uint( dPdx_x ) ), "r"( __float_as_uint( dPdx_y ) ), "r"( __float_as_uint( dPdy_x ) ),
+          "r"( __float_as_uint( dPdy_y ) ), "r"( static_cast<unsigned int>( coarse ) ), "l"( singleMipLevelPtr ), "l"( resultPtr )
+        : );
+
+    return result;
+}
+
+static __forceinline__ __device__ uint4
+optixTexFootprint2DLod( unsigned long long tex, unsigned int texInfo, float x, float y, float level, bool coarse, unsigned int* singleMipLevel )
+{  // Calls _optix_tex_footprint_2d_lod_v2; the addresses of the local `result` and of `singleMipLevel` are passed as 64-bit integers.
+    uint4              result;  // returned to the caller; presumably written by the intrinsic through resultPtr - confirm
+    unsigned long long resultPtr         = reinterpret_cast<unsigned long long>( &result );
+    unsigned long long singleMipLevelPtr = reinterpret_cast<unsigned long long>( singleMipLevel );
+    // Cast float args to integers, because the intrinsics take .b32 arguments when compiled to PTX.
+    asm volatile(
+        "call _optix_tex_footprint_2d_lod_v2"
+        ", (%0, %1, %2, %3, %4, %5, %6, %7);"
+        :
+        : "l"( tex ), "r"( texInfo ), "r"( __float_as_uint( x ) ), "r"( __float_as_uint( y ) ),
+          "r"( __float_as_uint( level ) ), "r"( static_cast<unsigned int>( coarse ) ), "l"( singleMipLevelPtr ), "l"( resultPtr )
+        : );
+    return result;
+}
+
+#endif // OPTIX_OPTIX_DEVICE_IMPL_H
diff --git a/crtx/optix_9.1/internal/optix_device_impl_coop_vec.h b/crtx/optix_9.1/internal/optix_device_impl_coop_vec.h
new file mode 100644
index 0000000..7e936db
--- /dev/null
+++ b/crtx/optix_9.1/internal/optix_device_impl_coop_vec.h
@@ -0,0 +1,951 @@
+/*
+* SPDX-FileCopyrightText: Copyright (c) 2019 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+* SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+*
+* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+* property and proprietary rights in and to this material, related
+* documentation and any modifications thereto. Any use, reproduction,
+* disclosure or distribution of this material and related documentation
+* without an express license agreement from NVIDIA CORPORATION or
+* its affiliates is strictly prohibited.
+*/
+/// @file optix_device_impl_coop_vec.h
+/// @author NVIDIA Corporation
+/// @brief  OptiX public API header
+///
+
+#ifndef OPTIX_OPTIX_DEVICE_IMPL_COOP_VEC_H
+#define OPTIX_OPTIX_DEVICE_IMPL_COOP_VEC_H
+
+#if !defined( __OPTIX_INCLUDE_INTERNAL_HEADERS__ )  // only set when this file is pulled in through the public OptiX headers
+#error("optix_device_impl_coop_vec.h is an internal header file and must not be used directly.  Please use optix_device.h or optix.h instead.")
+#endif  // !defined( __OPTIX_INCLUDE_INTERNAL_HEADERS__ )
+
+namespace optix_internal {
+
+typedef enum OptixCoopVecOp  // operation selector; used as the VectorOp template argument and first operand of the _optix_vector_op* calls in this header
+{  // values are consecutive, 0x2A20 through 0x2A2B
+    OPTIX_COOP_VEC_OP_UNKNOWN = 0x2A20,
+    OPTIX_COOP_VEC_OP_EXP2    = 0x2A21,
+    OPTIX_COOP_VEC_OP_LOG2    = 0x2A22,
+    OPTIX_COOP_VEC_OP_TANH    = 0x2A23,
+    OPTIX_COOP_VEC_OP_MAX     = 0x2A24,
+    OPTIX_COOP_VEC_OP_MIN     = 0x2A25,
+    OPTIX_COOP_VEC_OP_FFMA    = 0x2A26,
+    OPTIX_COOP_VEC_OP_MUL     = 0x2A27,
+    OPTIX_COOP_VEC_OP_ADD     = 0x2A28,
+    OPTIX_COOP_VEC_OP_SUB     = 0x2A29,
+    OPTIX_COOP_VEC_OP_CVT     = 0x2A2A,
+    OPTIX_COOP_VEC_OP_STEP    = 0x2A2B,
+} OptixCoopVecOp;
+}  // end namespace optix_internal
+
+#if !defined( OPTIX_DONT_INCLUDE_CUDA )
+// If OPTIX_DONT_INCLUDE_CUDA is defined, cuda driver types must be defined through other
+// means before including optix headers.
+#include <cuda_fp16.h>
+#endif
+
+
+namespace optix_internal {
+namespace coop_vec_type_traits {
+// clang-format off
+
+// We need to implement code that is available in <type_traits> since nvrtc does not support the <type_traits> header.
+// Custom is_float implementation - specialized only for half and float
+template <typename T> struct is_float { static const bool value = false; };
+template <> struct is_float<float> { static const bool value = true; };
+template <> struct is_float<half> { static const bool value = true; };
+
+template <typename T> struct is_integral { static const bool value = !is_float<T>::value; };  // everything that is not float/half counts as integral
+
+template <typename T> struct is_signed_impl { static const bool value = static_cast<T>(-1) < static_cast<T>(0); };  // true when -1 converted to T stays negative
+
+// If it's a float type, it's signed. Otherwise use the generic test.
+template <typename T>
+struct is_signed { static const bool value = is_float<T>::value ? true : is_signed_impl<T>::value; };
+// NVRTC is stricter about template instantiation requirements and requires both branches of a ternary operator
+// to be syntactically valid during compilation, so we need to explicitly specialize half to bypass the generic
+// is_signed_impl template that uses static_cast, avoiding the ambiguous conversion issue entirely.
+template <> struct is_signed<half> { static const bool value = true; };
+
+template <bool is_integral, bool is_signed, size_t byte_size> struct TT;  // maps (integral?, signed?, byte size) to an element type; only the combinations below are defined
+template <> struct TT<true,  true,  1> { static const OptixCoopVecElemType value = OPTIX_COOP_VEC_ELEM_TYPE_INT8; };
+template <> struct TT<true,  false, 1> { static const OptixCoopVecElemType value = OPTIX_COOP_VEC_ELEM_TYPE_UINT8; };
+template <> struct TT<true,  true,  4> { static const OptixCoopVecElemType value = OPTIX_COOP_VEC_ELEM_TYPE_INT32; };
+template <> struct TT<true,  false, 4> { static const OptixCoopVecElemType value = OPTIX_COOP_VEC_ELEM_TYPE_UINT32; };
+template <> struct TT<false, true,  4> { static const OptixCoopVecElemType value = OPTIX_COOP_VEC_ELEM_TYPE_FLOAT32; };
+template <> struct TT<false, true,  2> { static const OptixCoopVecElemType value = OPTIX_COOP_VEC_ELEM_TYPE_FLOAT16; };
+
+template< size_t byte_size > struct TB;  // unsigned integer type of the given byte size (1, 2 or 4 only)
+template<> struct TB<1> { using bitType = unsigned char; };
+template<> struct TB<2> { using bitType = unsigned short; };
+template<> struct TB<4> { using bitType = unsigned int; };
+// clang-format on
+
+// The primary template derives the element type of built-in types from the traits above; special
+// types like half are handled through the is_float / is_signed specializations of those traits.
+template <typename T>
+struct OptixCoopVecElemTypeTrait
+{
+    static const OptixCoopVecElemType elementType =
+        TT<coop_vec_type_traits::is_integral<T>::value, coop_vec_type_traits::is_signed<T>::value, sizeof( T )>::value;
+    using bitType = typename TB<sizeof( T )>::bitType;  // same-size unsigned type, used to move raw element bits
+};
+}  // end namespace coop_vec_type_traits
+}  // end namespace optix_internal
+
+namespace optix_internal {
+
+template <typename VecTOut>
+struct OptixCoopVecLoadASMGenerator  // wraps the _optix_vector_load_* calls for one output vector type
+{
+    static const OptixCoopVecElemType outputElementType =
+        optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTOut::value_type>::elementType;
+    using outputBitType =
+        typename optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTOut::value_type>::bitType;
+
+    __forceinline__ __device__ static VecTOut generateASMPtr( CUdeviceptr ptr )
+    {
+        VecTOut result;  // NOTE(review): presumably written by the call through result.data(); no output operand is declared
+        asm( "call"
+             "(),"
+             "_optix_vector_load_ptr,"
+             "(%0,%1,%2,%3);"
+             :
+             : "r"( outputElementType ), "r"( VecTOut::size ), "l"( ptr ), "l"( result.data() ) );
+        return result;
+    }
+    __forceinline__ __device__ static VecTOut generateASM( CUdeviceptr ptr )
+    {
+        if( VecTOut::size > 64 || sizeof( typename VecTOut::value_type ) > sizeof( unsigned int ) )
+            return generateASMPtr( ptr );  // more than 64 elements, or elements wider than 32 bits: pointer-based call
+        else
+        {
+            // This code needs to live in an else block; otherwise the compiler will
+            // complain about the loop being unreachable.
+            unsigned int O[64];  // one 32-bit slot per element; only the first VecTOut::size slots are read back
+            if( VecTOut::size <= 16 )  // up to 16 elements use the 16-output call, everything else the 64-output call
+                asm( "call"
+                     "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15),"
+                     "_optix_vector_load_16xi32,"
+                     "(%16,%17,%18);"
+                     : "=r"( O[0] ), "=r"( O[1] ), "=r"( O[2] ), "=r"( O[3] ), "=r"( O[4] ), "=r"( O[5] ), "=r"( O[6] ),
+                       "=r"( O[7] ), "=r"( O[8] ), "=r"( O[9] ), "=r"( O[10] ), "=r"( O[11] ), "=r"( O[12] ),
+                       "=r"( O[13] ), "=r"( O[14] ), "=r"( O[15] )
+                     : "r"( outputElementType ), "r"( VecTOut::size ), "l"( ptr ) );
+            else
+                asm( "call"
+                     "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%"
+                     "26,%27,%28,%29,%30,%31,%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%"
+                     "50,%51,%52,%53,%54,%55,%56,%57,%58,%59,%60,%61,%62,%63),"
+                     "_optix_vector_load_64xi32,"
+                     "(%64,%65,%66);"
+                     : "=r"( O[0] ), "=r"( O[1] ), "=r"( O[2] ), "=r"( O[3] ), "=r"( O[4] ), "=r"( O[5] ), "=r"( O[6] ),
+                       "=r"( O[7] ), "=r"( O[8] ), "=r"( O[9] ), "=r"( O[10] ), "=r"( O[11] ), "=r"( O[12] ),
+                       "=r"( O[13] ), "=r"( O[14] ), "=r"( O[15] ), "=r"( O[16] ), "=r"( O[17] ), "=r"( O[18] ),
+                       "=r"( O[19] ), "=r"( O[20] ), "=r"( O[21] ), "=r"( O[22] ), "=r"( O[23] ), "=r"( O[24] ),
+                       "=r"( O[25] ), "=r"( O[26] ), "=r"( O[27] ), "=r"( O[28] ), "=r"( O[29] ), "=r"( O[30] ),
+                       "=r"( O[31] ), "=r"( O[32] ), "=r"( O[33] ), "=r"( O[34] ), "=r"( O[35] ), "=r"( O[36] ),
+                       "=r"( O[37] ), "=r"( O[38] ), "=r"( O[39] ), "=r"( O[40] ), "=r"( O[41] ), "=r"( O[42] ),
+                       "=r"( O[43] ), "=r"( O[44] ), "=r"( O[45] ), "=r"( O[46] ), "=r"( O[47] ), "=r"( O[48] ),
+                       "=r"( O[49] ), "=r"( O[50] ), "=r"( O[51] ), "=r"( O[52] ), "=r"( O[53] ), "=r"( O[54] ),
+                       "=r"( O[55] ), "=r"( O[56] ), "=r"( O[57] ), "=r"( O[58] ), "=r"( O[59] ), "=r"( O[60] ),
+                       "=r"( O[61] ), "=r"( O[62] ), "=r"( O[63] )
+                     : "r"( outputElementType ), "r"( VecTOut::size ), "l"( ptr ) );
+
+            VecTOut result;
+            for( unsigned int i = 0; i < VecTOut::size; ++i )
+            {
+                outputBitType o = O[i];  // narrow the 32-bit slot to the element's bit width
+                result[i]       = *( reinterpret_cast<typename VecTOut::value_type*>( &( o ) ) );  // reinterpret those bits as value_type
+            }
+            return result;
+        }
+    }
+};
+
+
+template <OptixCoopVecOp VectorOp, typename VecTOut, typename VecTIn>
+struct OptixCoopVecASMGenerator  // wraps the _optix_vector_op{1,2,3}_* calls for operation VectorOp with one, two or three input vectors
+{
+    static const OptixCoopVecElemType outputElementType =
+        optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTOut::value_type>::elementType;
+    using outputBitType =
+        typename optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTOut::value_type>::bitType;
+    static const OptixCoopVecElemType inputElementType =
+        optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTIn::value_type>::elementType;
+    using inputBitType =
+        typename optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTIn::value_type>::bitType;
+
+    __forceinline__ __device__ static VecTOut generateASMPtr( const VecTIn& vecA )
+    {
+        VecTOut result;  // NOTE(review): presumably written by the call through result.data(); no output operand is declared
+        asm( "call"
+             "(),"
+             "_optix_vector_op1_ptr,"
+             "(%0,%1,%2,%3,%4,%5,%6);"
+             :
+             : "r"( VectorOp ), "r"( outputElementType ), "r"( VecTOut::size ), "r"( inputElementType ),
+               "r"( VecTIn::size ), "l"( vecA.data() ), "l"( result.data() ) );
+        return result;
+    }
+
+    __forceinline__ __device__ static VecTOut generateASMPtr( const VecTIn& vecA, const VecTIn& vecB )
+    {
+        VecTOut result;  // NOTE(review): presumably written by the call through result.data(); no output operand is declared
+        asm( "call"
+             "(),"
+             "_optix_vector_op2_ptr,"
+             "(%0,%1,%2,%3,%4,%5,%6,%7);"
+             :
+             : "r"( VectorOp ), "r"( outputElementType ), "r"( VecTOut::size ), "r"( inputElementType ),
+               "r"( VecTIn::size ), "l"( vecA.data() ), "l"( vecB.data() ), "l"( result.data() ) );
+        return result;
+    }
+
+    __forceinline__ __device__ static VecTOut generateASMPtr( const VecTIn& vecA, const VecTIn& vecB, const VecTIn& vecC )
+    {
+        VecTOut result;  // NOTE(review): presumably written by the call through result.data(); no output operand is declared
+        asm( "call"
+             "(),"
+             "_optix_vector_op3_ptr,"
+             "(%0,%1,%2,%3,%4,%5,%6,%7,%8);"
+             :
+             : "r"( VectorOp ), "r"( outputElementType ), "r"( VecTOut::size ), "r"( inputElementType ),
+               "r"( VecTIn::size ), "l"( vecA.data() ), "l"( vecB.data() ), "l"( vecC.data() ), "l"( result.data() ) );
+        return result;
+    }
+
+    __forceinline__ __device__ static VecTOut generateASM( const VecTIn& vecA )
+    {
+        if( VecTIn::size > 64 || VecTOut::size > 64 || sizeof( typename VecTIn::value_type ) > sizeof( unsigned int )
+            || sizeof( typename VecTOut::value_type ) > sizeof( unsigned int ) )
+            return generateASMPtr( vecA );  // more than 64 elements, or elements wider than 32 bits: pointer-based call
+        else
+        {
+            // This code needs to live in an else block; otherwise the compiler will
+            // complain about the loop being unreachable.
+            unsigned int IA[64];  // slots at index >= VecTIn::size stay uninitialized but are still passed as operands
+            unsigned int O[64];   // only the first VecTOut::size slots are read back
+            for( unsigned int i = 0; i < VecTIn::size; ++i )
+            {
+                IA[i] = *( reinterpret_cast<const inputBitType*>( &( vecA[i] ) ) );  // raw element bits, widened into a 32-bit slot
+            }
+            if( VecTOut::size <= 16 && VecTIn::size <= 16 )  // both vectors fit in 16 slots: 16-wide call, otherwise the 64-wide call
+                asm( "call"
+                     "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15),"
+                     "_optix_vector_op1_16xi32,"
+                     "(%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%26,%27,%28,%29,%30,%31,%32,%33,%34,%35,%36);"
+                     : "=r"( O[0] ), "=r"( O[1] ), "=r"( O[2] ), "=r"( O[3] ), "=r"( O[4] ), "=r"( O[5] ), "=r"( O[6] ),
+                       "=r"( O[7] ), "=r"( O[8] ), "=r"( O[9] ), "=r"( O[10] ), "=r"( O[11] ), "=r"( O[12] ),
+                       "=r"( O[13] ), "=r"( O[14] ), "=r"( O[15] )
+                     : "r"( VectorOp ), "r"( outputElementType ), "r"( VecTOut::size ), "r"( inputElementType ),
+                       "r"( VecTIn::size ), "r"( IA[0] ), "r"( IA[1] ), "r"( IA[2] ), "r"( IA[3] ), "r"( IA[4] ),
+                       "r"( IA[5] ), "r"( IA[6] ), "r"( IA[7] ), "r"( IA[8] ), "r"( IA[9] ), "r"( IA[10] ),
+                       "r"( IA[11] ), "r"( IA[12] ), "r"( IA[13] ), "r"( IA[14] ), "r"( IA[15] ) );
+            else
+                asm( "call"
+                     "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%"
+                     "26,%27,%28,%29,%30,%31,%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%"
+                     "50,%51,%52,%53,%54,%55,%56,%57,%58,%59,%60,%61,%62,%63),"
+                     "_optix_vector_op1_64xi32,"
+                     "(%64,%65,%66,%67,%68,%69,%70,%71,%72,%73,%74,%75,%76,%77,%78,%79,%80,%81,%82,%83,%84,%85,%86,%87,"
+                     "%88,%89,%90,%91,%92,%93,%94,%95,%96,%97,%98,%99,%100,%101,%102,%103,%104,%105,%106,%107,%108,%"
+                     "109,%110,%111,%112,%113,%114,%115,%116,%117,%118,%119,%120,%121,%122,%123,%124,%125,%126,%127,%"
+                     "128,%129,%130,%131,%132);"
+                     : "=r"( O[0] ), "=r"( O[1] ), "=r"( O[2] ), "=r"( O[3] ), "=r"( O[4] ), "=r"( O[5] ), "=r"( O[6] ),
+                       "=r"( O[7] ), "=r"( O[8] ), "=r"( O[9] ), "=r"( O[10] ), "=r"( O[11] ), "=r"( O[12] ),
+                       "=r"( O[13] ), "=r"( O[14] ), "=r"( O[15] ), "=r"( O[16] ), "=r"( O[17] ), "=r"( O[18] ),
+                       "=r"( O[19] ), "=r"( O[20] ), "=r"( O[21] ), "=r"( O[22] ), "=r"( O[23] ), "=r"( O[24] ),
+                       "=r"( O[25] ), "=r"( O[26] ), "=r"( O[27] ), "=r"( O[28] ), "=r"( O[29] ), "=r"( O[30] ),
+                       "=r"( O[31] ), "=r"( O[32] ), "=r"( O[33] ), "=r"( O[34] ), "=r"( O[35] ), "=r"( O[36] ),
+                       "=r"( O[37] ), "=r"( O[38] ), "=r"( O[39] ), "=r"( O[40] ), "=r"( O[41] ), "=r"( O[42] ),
+                       "=r"( O[43] ), "=r"( O[44] ), "=r"( O[45] ), "=r"( O[46] ), "=r"( O[47] ), "=r"( O[48] ),
+                       "=r"( O[49] ), "=r"( O[50] ), "=r"( O[51] ), "=r"( O[52] ), "=r"( O[53] ), "=r"( O[54] ),
+                       "=r"( O[55] ), "=r"( O[56] ), "=r"( O[57] ), "=r"( O[58] ), "=r"( O[59] ), "=r"( O[60] ),
+                       "=r"( O[61] ), "=r"( O[62] ), "=r"( O[63] )
+                     : "r"( VectorOp ), "r"( outputElementType ), "r"( VecTOut::size ), "r"( inputElementType ),
+                       "r"( VecTIn::size ), "r"( IA[0] ), "r"( IA[1] ), "r"( IA[2] ), "r"( IA[3] ), "r"( IA[4] ),
+                       "r"( IA[5] ), "r"( IA[6] ), "r"( IA[7] ), "r"( IA[8] ), "r"( IA[9] ), "r"( IA[10] ),
+                       "r"( IA[11] ), "r"( IA[12] ), "r"( IA[13] ), "r"( IA[14] ), "r"( IA[15] ), "r"( IA[16] ),
+                       "r"( IA[17] ), "r"( IA[18] ), "r"( IA[19] ), "r"( IA[20] ), "r"( IA[21] ), "r"( IA[22] ),
+                       "r"( IA[23] ), "r"( IA[24] ), "r"( IA[25] ), "r"( IA[26] ), "r"( IA[27] ), "r"( IA[28] ),
+                       "r"( IA[29] ), "r"( IA[30] ), "r"( IA[31] ), "r"( IA[32] ), "r"( IA[33] ), "r"( IA[34] ),
+                       "r"( IA[35] ), "r"( IA[36] ), "r"( IA[37] ), "r"( IA[38] ), "r"( IA[39] ), "r"( IA[40] ),
+                       "r"( IA[41] ), "r"( IA[42] ), "r"( IA[43] ), "r"( IA[44] ), "r"( IA[45] ), "r"( IA[46] ),
+                       "r"( IA[47] ), "r"( IA[48] ), "r"( IA[49] ), "r"( IA[50] ), "r"( IA[51] ), "r"( IA[52] ),
+                       "r"( IA[53] ), "r"( IA[54] ), "r"( IA[55] ), "r"( IA[56] ), "r"( IA[57] ), "r"( IA[58] ),
+                       "r"( IA[59] ), "r"( IA[60] ), "r"( IA[61] ), "r"( IA[62] ), "r"( IA[63] ) );
+
+            VecTOut result;
+            for( unsigned int i = 0; i < VecTOut::size; ++i )
+            {
+                outputBitType o = O[i];  // narrow the 32-bit slot to the element's bit width
+                result[i]       = *( reinterpret_cast<typename VecTOut::value_type*>( &( o ) ) );  // reinterpret those bits as value_type
+            }
+            return result;
+        }
+    }
+
+    __forceinline__ __device__ static VecTOut generateASM( const VecTIn& vecA, const VecTIn& vecB )
+    {
+        if( VecTIn::size > 64 || VecTOut::size > 64 || sizeof( typename VecTIn::value_type ) > sizeof( unsigned int )
+            || sizeof( typename VecTOut::value_type ) > sizeof( unsigned int ) )
+            return generateASMPtr( vecA, vecB );  // more than 64 elements, or elements wider than 32 bits: pointer-based call
+        else
+        {
+            // This code needs to live in an else block; otherwise the compiler will
+            // complain about the loop being unreachable.
+            unsigned int IA[64];  // slots at index >= VecTIn::size stay uninitialized but are still passed as operands
+            unsigned int IB[64];
+            unsigned int O[64];   // only the first VecTOut::size slots are read back
+            for( unsigned int i = 0; i < VecTIn::size; ++i )
+            {
+                IA[i] = *( reinterpret_cast<const inputBitType*>( &( vecA[i] ) ) );  // raw element bits, widened into a 32-bit slot
+                IB[i] = *( reinterpret_cast<const inputBitType*>( &( vecB[i] ) ) );
+            }
+            if( VecTOut::size <= 16 && VecTIn::size <= 16 )  // both vectors fit in 16 slots: 16-wide call, otherwise the 64-wide call
+                asm( "call"
+                     "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15),"
+                     "_optix_vector_op2_16xi32,"
+                     "(%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%26,%27,%28,%29,%30,%31,%32,%33,%34,%35,%36,%37,%38,%39,"
+                     "%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%50,%51,%52);"
+                     : "=r"( O[0] ), "=r"( O[1] ), "=r"( O[2] ), "=r"( O[3] ), "=r"( O[4] ), "=r"( O[5] ), "=r"( O[6] ),
+                       "=r"( O[7] ), "=r"( O[8] ), "=r"( O[9] ), "=r"( O[10] ), "=r"( O[11] ), "=r"( O[12] ),
+                       "=r"( O[13] ), "=r"( O[14] ), "=r"( O[15] )
+                     : "r"( VectorOp ), "r"( outputElementType ), "r"( VecTOut::size ), "r"( inputElementType ),
+                       "r"( VecTIn::size ), "r"( IA[0] ), "r"( IA[1] ), "r"( IA[2] ), "r"( IA[3] ), "r"( IA[4] ),
+                       "r"( IA[5] ), "r"( IA[6] ), "r"( IA[7] ), "r"( IA[8] ), "r"( IA[9] ), "r"( IA[10] ), "r"( IA[11] ),
+                       "r"( IA[12] ), "r"( IA[13] ), "r"( IA[14] ), "r"( IA[15] ), "r"( IB[0] ), "r"( IB[1] ), "r"( IB[2] ),
+                       "r"( IB[3] ), "r"( IB[4] ), "r"( IB[5] ), "r"( IB[6] ), "r"( IB[7] ), "r"( IB[8] ), "r"( IB[9] ),
+                       "r"( IB[10] ), "r"( IB[11] ), "r"( IB[12] ), "r"( IB[13] ), "r"( IB[14] ), "r"( IB[15] ) );
+            else
+                asm( "call"
+                     "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%"
+                     "26,%27,%28,%29,%30,%31,%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%"
+                     "50,%51,%52,%53,%54,%55,%56,%57,%58,%59,%60,%61,%62,%63),"
+                     "_optix_vector_op2_64xi32,"
+                     "(%64,%65,%66,%67,%68,%69,%70,%71,%72,%73,%74,%75,%76,%77,%78,%79,%80,%81,%82,%83,%84,%85,%86,%87,"
+                     "%88,%89,%90,%91,%92,%93,%94,%95,%96,%97,%98,%99,%100,%101,%102,%103,%104,%105,%106,%107,%108,%"
+                     "109,%110,%111,%112,%113,%114,%115,%116,%117,%118,%119,%120,%121,%122,%123,%124,%125,%126,%127,%"
+                     "128,%129,%130,%131,%132,%133,%134,%135,%136,%137,%138,%139,%140,%141,%142,%143,%144,%145,%146,%"
+                     "147,%148,%149,%150,%151,%152,%153,%154,%155,%156,%157,%158,%159,%160,%161,%162,%163,%164,%165,%"
+                     "166,%167,%168,%169,%170,%171,%172,%173,%174,%175,%176,%177,%178,%179,%180,%181,%182,%183,%184,%"
+                     "185,%186,%187,%188,%189,%190,%191,%192,%193,%194,%195,%196);"
+                     : "=r"( O[0] ), "=r"( O[1] ), "=r"( O[2] ), "=r"( O[3] ), "=r"( O[4] ), "=r"( O[5] ), "=r"( O[6] ),
+                       "=r"( O[7] ), "=r"( O[8] ), "=r"( O[9] ), "=r"( O[10] ), "=r"( O[11] ), "=r"( O[12] ),
+                       "=r"( O[13] ), "=r"( O[14] ), "=r"( O[15] ), "=r"( O[16] ), "=r"( O[17] ), "=r"( O[18] ),
+                       "=r"( O[19] ), "=r"( O[20] ), "=r"( O[21] ), "=r"( O[22] ), "=r"( O[23] ), "=r"( O[24] ),
+                       "=r"( O[25] ), "=r"( O[26] ), "=r"( O[27] ), "=r"( O[28] ), "=r"( O[29] ), "=r"( O[30] ),
+                       "=r"( O[31] ), "=r"( O[32] ), "=r"( O[33] ), "=r"( O[34] ), "=r"( O[35] ), "=r"( O[36] ),
+                       "=r"( O[37] ), "=r"( O[38] ), "=r"( O[39] ), "=r"( O[40] ), "=r"( O[41] ), "=r"( O[42] ),
+                       "=r"( O[43] ), "=r"( O[44] ), "=r"( O[45] ), "=r"( O[46] ), "=r"( O[47] ), "=r"( O[48] ),
+                       "=r"( O[49] ), "=r"( O[50] ), "=r"( O[51] ), "=r"( O[52] ), "=r"( O[53] ), "=r"( O[54] ),
+                       "=r"( O[55] ), "=r"( O[56] ), "=r"( O[57] ), "=r"( O[58] ), "=r"( O[59] ), "=r"( O[60] ),
+                       "=r"( O[61] ), "=r"( O[62] ), "=r"( O[63] )
+                     : "r"( VectorOp ), "r"( outputElementType ), "r"( VecTOut::size ), "r"( inputElementType ),
+                       "r"( VecTIn::size ), "r"( IA[0] ), "r"( IA[1] ), "r"( IA[2] ), "r"( IA[3] ), "r"( IA[4] ),
+                       "r"( IA[5] ), "r"( IA[6] ), "r"( IA[7] ), "r"( IA[8] ), "r"( IA[9] ), "r"( IA[10] ), "r"( IA[11] ),
+                       "r"( IA[12] ), "r"( IA[13] ), "r"( IA[14] ), "r"( IA[15] ), "r"( IA[16] ), "r"( IA[17] ),
+                       "r"( IA[18] ), "r"( IA[19] ), "r"( IA[20] ), "r"( IA[21] ), "r"( IA[22] ), "r"( IA[23] ),
+                       "r"( IA[24] ), "r"( IA[25] ), "r"( IA[26] ), "r"( IA[27] ), "r"( IA[28] ), "r"( IA[29] ),
+                       "r"( IA[30] ), "r"( IA[31] ), "r"( IA[32] ), "r"( IA[33] ), "r"( IA[34] ), "r"( IA[35] ),
+                       "r"( IA[36] ), "r"( IA[37] ), "r"( IA[38] ), "r"( IA[39] ), "r"( IA[40] ), "r"( IA[41] ),
+                       "r"( IA[42] ), "r"( IA[43] ), "r"( IA[44] ), "r"( IA[45] ), "r"( IA[46] ), "r"( IA[47] ),
+                       "r"( IA[48] ), "r"( IA[49] ), "r"( IA[50] ), "r"( IA[51] ), "r"( IA[52] ), "r"( IA[53] ),
+                       "r"( IA[54] ), "r"( IA[55] ), "r"( IA[56] ), "r"( IA[57] ), "r"( IA[58] ), "r"( IA[59] ),
+                       "r"( IA[60] ), "r"( IA[61] ), "r"( IA[62] ), "r"( IA[63] ), "r"( IB[0] ), "r"( IB[1] ), "r"( IB[2] ),
+                       "r"( IB[3] ), "r"( IB[4] ), "r"( IB[5] ), "r"( IB[6] ), "r"( IB[7] ), "r"( IB[8] ), "r"( IB[9] ),
+                       "r"( IB[10] ), "r"( IB[11] ), "r"( IB[12] ), "r"( IB[13] ), "r"( IB[14] ), "r"( IB[15] ),
+                       "r"( IB[16] ), "r"( IB[17] ), "r"( IB[18] ), "r"( IB[19] ), "r"( IB[20] ), "r"( IB[21] ),
+                       "r"( IB[22] ), "r"( IB[23] ), "r"( IB[24] ), "r"( IB[25] ), "r"( IB[26] ), "r"( IB[27] ),
+                       "r"( IB[28] ), "r"( IB[29] ), "r"( IB[30] ), "r"( IB[31] ), "r"( IB[32] ), "r"( IB[33] ),
+                       "r"( IB[34] ), "r"( IB[35] ), "r"( IB[36] ), "r"( IB[37] ), "r"( IB[38] ), "r"( IB[39] ),
+                       "r"( IB[40] ), "r"( IB[41] ), "r"( IB[42] ), "r"( IB[43] ), "r"( IB[44] ), "r"( IB[45] ),
+                       "r"( IB[46] ), "r"( IB[47] ), "r"( IB[48] ), "r"( IB[49] ), "r"( IB[50] ), "r"( IB[51] ),
+                       "r"( IB[52] ), "r"( IB[53] ), "r"( IB[54] ), "r"( IB[55] ), "r"( IB[56] ), "r"( IB[57] ),
+                       "r"( IB[58] ), "r"( IB[59] ), "r"( IB[60] ), "r"( IB[61] ), "r"( IB[62] ), "r"( IB[63] ) );
+
+            VecTOut result;
+            for( unsigned int i = 0; i < VecTOut::size; ++i )
+            {
+                outputBitType o = O[i];  // narrow the 32-bit slot to the element's bit width
+                result[i]       = *( reinterpret_cast<typename VecTOut::value_type*>( &( o ) ) );  // reinterpret those bits as value_type
+            }
+            return result;
+        }
+    }
+
+    __forceinline__ __device__ static VecTOut generateASM( const VecTIn& vecA, const VecTIn& vecB, const VecTIn& vecC )
+    {
+        if( VecTIn::size > 64 || VecTOut::size > 64 || sizeof( typename VecTIn::value_type ) > sizeof( unsigned int )
+            || sizeof( typename VecTOut::value_type ) > sizeof( unsigned int ) )
+            return generateASMPtr( vecA, vecB, vecC );  // more than 64 elements, or elements wider than 32 bits: pointer-based call
+        else
+        {
+            // This code needs to live in an else block; otherwise the compiler will
+            // complain about the loop being unreachable.
+            unsigned int IA[64];  // slots at index >= VecTIn::size stay uninitialized but are still passed as operands
+            unsigned int IB[64];
+            unsigned int IC[64];
+            unsigned int O[64];   // only the first VecTOut::size slots are read back
+            for( unsigned int i = 0; i < VecTIn::size; ++i )
+            {
+                IA[i] = *( reinterpret_cast<const inputBitType*>( &( vecA[i] ) ) );  // raw element bits, widened into a 32-bit slot
+                IB[i] = *( reinterpret_cast<const inputBitType*>( &( vecB[i] ) ) );
+                IC[i] = *( reinterpret_cast<const inputBitType*>( &( vecC[i] ) ) );
+            }
+            if( VecTOut::size <= 16 && VecTIn::size <= 16 )  // both vectors fit in 16 slots: 16-wide call, otherwise the 64-wide call
+                asm( "call"
+                     "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15),"
+                     "_optix_vector_op3_16xi32,"
+                     "(%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%26,%27,%28,%29,%30,%31,%32,%33,%34,%35,%36,%37,%38,%39,"
+                     "%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%50,%51,%52,%53,%54,%55,%56,%57,%58,%59,%60,%61,%62,%63,%"
+                     "64,%65,%66,%67,%68);"
+                     : "=r"( O[0] ), "=r"( O[1] ), "=r"( O[2] ), "=r"( O[3] ), "=r"( O[4] ), "=r"( O[5] ), "=r"( O[6] ),
+                       "=r"( O[7] ), "=r"( O[8] ), "=r"( O[9] ), "=r"( O[10] ), "=r"( O[11] ), "=r"( O[12] ),
+                       "=r"( O[13] ), "=r"( O[14] ), "=r"( O[15] )
+                     : "r"( VectorOp ), "r"( outputElementType ), "r"( VecTOut::size ), "r"( inputElementType ),
+                       "r"( VecTIn::size ), "r"( IA[0] ), "r"( IA[1] ), "r"( IA[2] ), "r"( IA[3] ), "r"( IA[4] ),
+                       "r"( IA[5] ), "r"( IA[6] ), "r"( IA[7] ), "r"( IA[8] ), "r"( IA[9] ), "r"( IA[10] ),
+                       "r"( IA[11] ), "r"( IA[12] ), "r"( IA[13] ), "r"( IA[14] ), "r"( IA[15] ), "r"( IB[0] ),
+                       "r"( IB[1] ), "r"( IB[2] ), "r"( IB[3] ), "r"( IB[4] ), "r"( IB[5] ), "r"( IB[6] ), "r"( IB[7] ),
+                       "r"( IB[8] ), "r"( IB[9] ), "r"( IB[10] ), "r"( IB[11] ), "r"( IB[12] ), "r"( IB[13] ),
+                       "r"( IB[14] ), "r"( IB[15] ), "r"( IC[0] ), "r"( IC[1] ), "r"( IC[2] ), "r"( IC[3] ),
+                       "r"( IC[4] ), "r"( IC[5] ), "r"( IC[6] ), "r"( IC[7] ), "r"( IC[8] ), "r"( IC[9] ),
+                       "r"( IC[10] ), "r"( IC[11] ), "r"( IC[12] ), "r"( IC[13] ), "r"( IC[14] ), "r"( IC[15] ) );
+            else
+                asm( "call"
+                     "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%"
+                     "26,%27,%28,%29,%30,%31,%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%"
+                     "50,%51,%52,%53,%54,%55,%56,%57,%58,%59,%60,%61,%62,%63),"
+                     "_optix_vector_op3_64xi32,"
+                     "(%64,%65,%66,%67,%68,%69,%70,%71,%72,%73,%74,%75,%76,%77,%78,%79,%80,%81,%82,%83,%84,%85,%86,%87,"
+                     "%88,%89,%90,%91,%92,%93,%94,%95,%96,%97,%98,%99,%100,%101,%102,%103,%104,%105,%106,%107,%108,%"
+                     "109,%110,%111,%112,%113,%114,%115,%116,%117,%118,%119,%120,%121,%122,%123,%124,%125,%126,%127,%"
+                     "128,%129,%130,%131,%132,%133,%134,%135,%136,%137,%138,%139,%140,%141,%142,%143,%144,%145,%146,%"
+                     "147,%148,%149,%150,%151,%152,%153,%154,%155,%156,%157,%158,%159,%160,%161,%162,%163,%164,%165,%"
+                     "166,%167,%168,%169,%170,%171,%172,%173,%174,%175,%176,%177,%178,%179,%180,%181,%182,%183,%184,%"
+                     "185,%186,%187,%188,%189,%190,%191,%192,%193,%194,%195,%196,%197,%198,%199,%200,%201,%202,%203,%"
+                     "204,%205,%206,%207,%208,%209,%210,%211,%212,%213,%214,%215,%216,%217,%218,%219,%220,%221,%222,%"
+                     "223,%224,%225,%226,%227,%228,%229,%230,%231,%232,%233,%234,%235,%236,%237,%238,%239,%240,%241,%"
+                     "242,%243,%244,%245,%246,%247,%248,%249,%250,%251,%252,%253,%254,%255,%256,%257,%258,%259,%260);"
+                     : "=r"( O[0] ), "=r"( O[1] ), "=r"( O[2] ), "=r"( O[3] ), "=r"( O[4] ), "=r"( O[5] ), "=r"( O[6] ),
+                       "=r"( O[7] ), "=r"( O[8] ), "=r"( O[9] ), "=r"( O[10] ), "=r"( O[11] ), "=r"( O[12] ),
+                       "=r"( O[13] ), "=r"( O[14] ), "=r"( O[15] ), "=r"( O[16] ), "=r"( O[17] ), "=r"( O[18] ),
+                       "=r"( O[19] ), "=r"( O[20] ), "=r"( O[21] ), "=r"( O[22] ), "=r"( O[23] ), "=r"( O[24] ),
+                       "=r"( O[25] ), "=r"( O[26] ), "=r"( O[27] ), "=r"( O[28] ), "=r"( O[29] ), "=r"( O[30] ),
+                       "=r"( O[31] ), "=r"( O[32] ), "=r"( O[33] ), "=r"( O[34] ), "=r"( O[35] ), "=r"( O[36] ),
+                       "=r"( O[37] ), "=r"( O[38] ), "=r"( O[39] ), "=r"( O[40] ), "=r"( O[41] ), "=r"( O[42] ),
+                       "=r"( O[43] ), "=r"( O[44] ), "=r"( O[45] ), "=r"( O[46] ), "=r"( O[47] ), "=r"( O[48] ),
+                       "=r"( O[49] ), "=r"( O[50] ), "=r"( O[51] ), "=r"( O[52] ), "=r"( O[53] ), "=r"( O[54] ),
+                       "=r"( O[55] ), "=r"( O[56] ), "=r"( O[57] ), "=r"( O[58] ), "=r"( O[59] ), "=r"( O[60] ),
+                       "=r"( O[61] ), "=r"( O[62] ), "=r"( O[63] )
+                     : "r"( VectorOp ), "r"( outputElementType ), "r"( VecTOut::size ), "r"( inputElementType ),
+                       "r"( VecTIn::size ), "r"( IA[0] ), "r"( IA[1] ), "r"( IA[2] ), "r"( IA[3] ), "r"( IA[4] ),
+                       "r"( IA[5] ), "r"( IA[6] ), "r"( IA[7] ), "r"( IA[8] ), "r"( IA[9] ), "r"( IA[10] ),
+                       "r"( IA[11] ), "r"( IA[12] ), "r"( IA[13] ), "r"( IA[14] ), "r"( IA[15] ), "r"( IA[16] ),
+                       "r"( IA[17] ), "r"( IA[18] ), "r"( IA[19] ), "r"( IA[20] ), "r"( IA[21] ), "r"( IA[22] ),
+                       "r"( IA[23] ), "r"( IA[24] ), "r"( IA[25] ), "r"( IA[26] ), "r"( IA[27] ), "r"( IA[28] ),
+                       "r"( IA[29] ), "r"( IA[30] ), "r"( IA[31] ), "r"( IA[32] ), "r"( IA[33] ), "r"( IA[34] ),
+                       "r"( IA[35] ), "r"( IA[36] ), "r"( IA[37] ), "r"( IA[38] ), "r"( IA[39] ), "r"( IA[40] ),
+                       "r"( IA[41] ), "r"( IA[42] ), "r"( IA[43] ), "r"( IA[44] ), "r"( IA[45] ), "r"( IA[46] ),
+                       "r"( IA[47] ), "r"( IA[48] ), "r"( IA[49] ), "r"( IA[50] ), "r"( IA[51] ), "r"( IA[52] ),
+                       "r"( IA[53] ), "r"( IA[54] ), "r"( IA[55] ), "r"( IA[56] ), "r"( IA[57] ), "r"( IA[58] ),
+                       "r"( IA[59] ), "r"( IA[60] ), "r"( IA[61] ), "r"( IA[62] ), "r"( IA[63] ), "r"( IB[0] ),
+                       "r"( IB[1] ), "r"( IB[2] ), "r"( IB[3] ), "r"( IB[4] ), "r"( IB[5] ), "r"( IB[6] ), "r"( IB[7] ),
+                       "r"( IB[8] ), "r"( IB[9] ), "r"( IB[10] ), "r"( IB[11] ), "r"( IB[12] ), "r"( IB[13] ),
+                       "r"( IB[14] ), "r"( IB[15] ), "r"( IB[16] ), "r"( IB[17] ), "r"( IB[18] ), "r"( IB[19] ),
+                       "r"( IB[20] ), "r"( IB[21] ), "r"( IB[22] ), "r"( IB[23] ), "r"( IB[24] ), "r"( IB[25] ),
+                       "r"( IB[26] ), "r"( IB[27] ), "r"( IB[28] ), "r"( IB[29] ), "r"( IB[30] ), "r"( IB[31] ),
+                       "r"( IB[32] ), "r"( IB[33] ), "r"( IB[34] ), "r"( IB[35] ), "r"( IB[36] ), "r"( IB[37] ),
+                       "r"( IB[38] ), "r"( IB[39] ), "r"( IB[40] ), "r"( IB[41] ), "r"( IB[42] ), "r"( IB[43] ),
+                       "r"( IB[44] ), "r"( IB[45] ), "r"( IB[46] ), "r"( IB[47] ), "r"( IB[48] ), "r"( IB[49] ),
+                       "r"( IB[50] ), "r"( IB[51] ), "r"( IB[52] ), "r"( IB[53] ), "r"( IB[54] ), "r"( IB[55] ),
+                       "r"( IB[56] ), "r"( IB[57] ), "r"( IB[58] ), "r"( IB[59] ), "r"( IB[60] ), "r"( IB[61] ),
+                       "r"( IB[62] ), "r"( IB[63] ), "r"( IC[0] ), "r"( IC[1] ), "r"( IC[2] ), "r"( IC[3] ),
+                       "r"( IC[4] ), "r"( IC[5] ), "r"( IC[6] ), "r"( IC[7] ), "r"( IC[8] ), "r"( IC[9] ),
+                       "r"( IC[10] ), "r"( IC[11] ), "r"( IC[12] ), "r"( IC[13] ), "r"( IC[14] ), "r"( IC[15] ),
+                       "r"( IC[16] ), "r"( IC[17] ), "r"( IC[18] ), "r"( IC[19] ), "r"( IC[20] ), "r"( IC[21] ),
+                       "r"( IC[22] ), "r"( IC[23] ), "r"( IC[24] ), "r"( IC[25] ), "r"( IC[26] ), "r"( IC[27] ),
+                       "r"( IC[28] ), "r"( IC[29] ), "r"( IC[30] ), "r"( IC[31] ), "r"( IC[32] ), "r"( IC[33] ),
+                       "r"( IC[34] ), "r"( IC[35] ), "r"( IC[36] ), "r"( IC[37] ), "r"( IC[38] ), "r"( IC[39] ),
+                       "r"( IC[40] ), "r"( IC[41] ), "r"( IC[42] ), "r"( IC[43] ), "r"( IC[44] ), "r"( IC[45] ),
+                       "r"( IC[46] ), "r"( IC[47] ), "r"( IC[48] ), "r"( IC[49] ), "r"( IC[50] ), "r"( IC[51] ),
+                       "r"( IC[52] ), "r"( IC[53] ), "r"( IC[54] ), "r"( IC[55] ), "r"( IC[56] ), "r"( IC[57] ),
+                       "r"( IC[58] ), "r"( IC[59] ), "r"( IC[60] ), "r"( IC[61] ), "r"( IC[62] ), "r"( IC[63] ) );
+            VecTOut result;
+            for( unsigned int i = 0; i < VecTOut::size; ++i )
+            {
+                outputBitType o = O[i];  // narrow the 32-bit slot to the element's bit width
+                result[i]       = *( reinterpret_cast<typename VecTOut::value_type*>( &o ) );  // reinterpret those bits as value_type
+            }
+            return result;
+        }
+    }
+};
+
+}  // end namespace optix_internal
+
+template <typename VecTOut>  // VecTOut appears only in the return type, so callers must name it explicitly.
+static __forceinline__ __device__ VecTOut optixCoopVecLoad( CUdeviceptr ptr )
+{
+    return optix_internal::OptixCoopVecLoadASMGenerator<VecTOut>::generateASM( ptr );  // all work happens in the generator
+}
+
+template <typename VecTOut, typename T>  // Typed-pointer convenience overload; T is deduced, VecTOut is not.
+static __forceinline__ __device__ VecTOut optixCoopVecLoad( T* ptr )
+{
+    return optixCoopVecLoad<VecTOut>( reinterpret_cast<CUdeviceptr>( ptr ) );  // reuse the CUdeviceptr overload
+}
+
+
+template <typename VecT>  // Unary op: forwards vec to the OPTIX_COOP_VEC_OP_EXP2 generator; result type equals input type.
+static __forceinline__ __device__ VecT optixCoopVecExp2( const VecT& vec )
+{
+    return optix_internal::OptixCoopVecASMGenerator<optix_internal::OPTIX_COOP_VEC_OP_EXP2, VecT, VecT>::generateASM( vec );
+}
+
+template <typename VecT>  // Unary op: forwards vec to the OPTIX_COOP_VEC_OP_LOG2 generator; result type equals input type.
+static __forceinline__ __device__ VecT optixCoopVecLog2( const VecT& vec )
+{
+    return optix_internal::OptixCoopVecASMGenerator<optix_internal::OPTIX_COOP_VEC_OP_LOG2, VecT, VecT>::generateASM( vec );
+}
+
+template <typename VecT>  // Unary op: forwards vec to the OPTIX_COOP_VEC_OP_TANH generator; result type equals input type.
+static __forceinline__ __device__ VecT optixCoopVecTanh( const VecT& vec )
+{
+    return optix_internal::OptixCoopVecASMGenerator<optix_internal::OPTIX_COOP_VEC_OP_TANH, VecT, VecT>::generateASM( vec );
+}
+
+template <typename VecTOut, typename VecTIn>  // Forwards vec to the OPTIX_COOP_VEC_OP_CVT generator; VecTOut is caller-named, VecTIn deduced.
+static __forceinline__ __device__ VecTOut optixCoopVecCvt( const VecTIn& vec )
+{
+    return optix_internal::OptixCoopVecASMGenerator<optix_internal::OPTIX_COOP_VEC_OP_CVT, VecTOut, VecTIn>::generateASM( vec );
+}
+
+template <typename VecT>  // Binary op: forwards both operands to the OPTIX_COOP_VEC_OP_MIN generator.
+static __forceinline__ __device__ VecT optixCoopVecMin( const VecT& vecA, const VecT& vecB )
+{
+    return optix_internal::OptixCoopVecASMGenerator<optix_internal::OPTIX_COOP_VEC_OP_MIN, VecT, VecT>::generateASM( vecA, vecB );
+}
+
+template <typename VecT>  // Scalar convenience overload: wraps B in a VecT, then calls optixCoopVecMin on two vectors.
+static __forceinline__ __device__ VecT optixCoopVecMin( const VecT& vecA, typename VecT::value_type B )
+{
+    const VecT scalarAsVec( B );
+    return optixCoopVecMin( vecA, scalarAsVec );
+}
+
+template <typename VecT>  // Binary op: forwards both operands to the OPTIX_COOP_VEC_OP_MAX generator.
+static __forceinline__ __device__ VecT optixCoopVecMax( const VecT& vecA, const VecT& vecB )
+{
+    return optix_internal::OptixCoopVecASMGenerator<optix_internal::OPTIX_COOP_VEC_OP_MAX, VecT, VecT>::generateASM( vecA, vecB );
+}
+
+template <typename VecT>  // Scalar convenience overload: wraps B in a VecT, then calls optixCoopVecMax on two vectors.
+static __forceinline__ __device__ VecT optixCoopVecMax( const VecT& vecA, typename VecT::value_type B )
+{
+    const VecT scalarAsVec( B );
+    return optixCoopVecMax( vecA, scalarAsVec );
+}
+
+template <typename VecT>  // Binary op: forwards both operands to the OPTIX_COOP_VEC_OP_MUL generator.
+static __forceinline__ __device__ VecT optixCoopVecMul( const VecT& vecA, const VecT& vecB )
+{
+    return optix_internal::OptixCoopVecASMGenerator<optix_internal::OPTIX_COOP_VEC_OP_MUL, VecT, VecT>::generateASM( vecA, vecB );
+}
+
+template <typename VecT>  // Binary op: forwards both operands to the OPTIX_COOP_VEC_OP_ADD generator.
+static __forceinline__ __device__ VecT optixCoopVecAdd( const VecT& vecA, const VecT& vecB )
+{
+    return optix_internal::OptixCoopVecASMGenerator<optix_internal::OPTIX_COOP_VEC_OP_ADD, VecT, VecT>::generateASM( vecA, vecB );
+}
+
+template <typename VecT>  // Binary op: forwards both operands to the OPTIX_COOP_VEC_OP_SUB generator.
+static __forceinline__ __device__ VecT optixCoopVecSub( const VecT& vecA, const VecT& vecB )
+{
+    return optix_internal::OptixCoopVecASMGenerator<optix_internal::OPTIX_COOP_VEC_OP_SUB, VecT, VecT>::generateASM( vecA, vecB );
+}
+
+template <typename VecT>  // Binary op: forwards both operands to the OPTIX_COOP_VEC_OP_STEP generator.
+static __forceinline__ __device__ VecT optixCoopVecStep( const VecT& vecA, const VecT& vecB )
+{
+    return optix_internal::OptixCoopVecASMGenerator<optix_internal::OPTIX_COOP_VEC_OP_STEP, VecT, VecT>::generateASM( vecA, vecB );
+}
+
+template <typename VecT>  // Ternary op: forwards all three operands to the OPTIX_COOP_VEC_OP_FFMA generator.
+static __forceinline__ __device__ VecT optixCoopVecFFMA( const VecT& vecA, const VecT& vecB, const VecT& vecC )
+{
+    return optix_internal::OptixCoopVecASMGenerator<optix_internal::OPTIX_COOP_VEC_OP_FFMA, VecT, VecT>::generateASM( vecA, vecB, vecC );
+}
+
+
+namespace optix_internal {
+template <typename VecTOut, typename VecTIn, OptixCoopVecElemType inputInterpretation, OptixCoopVecMatrixLayout matrixLayout, bool transpose, unsigned int N, unsigned int K, OptixCoopVecElemType matrixElementType, OptixCoopVecElemType biasElementType>
+struct OptixCoopVecMatMulASMGenerator  // Emits the _optix_matvecmul_{ptr,16xi32,64xi32} call for one VecTIn -> VecTOut combination.
+{
+    static const OptixCoopVecElemType outputElementType =
+        optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTOut::value_type>::elementType;
+    using outputBitType =
+        typename optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTOut::value_type>::bitType;
+    static const OptixCoopVecElemType inputElementType =
+        optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTIn::value_type>::elementType;
+    using inputBitType =
+        typename optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTIn::value_type>::bitType;
+
+    __forceinline__ __device__ static VecTOut generateASMPtr( const VecTIn& inputVector,  // pointer-passing variant
+                                                              CUdeviceptr   matrix,
+                                                              unsigned      matrixOffsetInBytes,
+                                                              unsigned      rowColumnStrideInBytes,
+                                                              CUdeviceptr   bias,
+                                                              unsigned      biasOffsetInBytes )
+    {
+        VecTOut result;  // handed to the call by address (result.data()); presumably filled in by the callee
+        // clang-format off
+        asm( "call"
+             "(),"
+             "_optix_matvecmul_ptr,"
+             "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17);"
+             :
+             : "r"( outputElementType ), "r"( VecTOut::size ),
+               "r"( inputElementType), "r"( VecTIn::size ), "r"( inputInterpretation ),
+               "r"( N ), "r"( K ),
+               "l"( matrix ), "r"( matrixOffsetInBytes ), "r"( rowColumnStrideInBytes ),
+               "r"( matrixLayout ), "r"( (unsigned)transpose ), "r"( matrixElementType ),
+               "l"( bias ), "r"( biasOffsetInBytes ), "r"( biasElementType ),
+               "l"( inputVector.data() ), "l"( result.data() )
+          );
+        // clang-format on
+        return result;
+    }
+
+    __forceinline__ __device__ static VecTOut generateASM( const VecTIn& inputVector,  // entry point; picks one of the three call variants
+                                                           CUdeviceptr   matrix,
+                                                           unsigned      matrixOffsetInBytes,
+                                                           unsigned      rowColumnStrideInBytes,
+                                                           CUdeviceptr   bias,
+                                                           unsigned      biasOffsetInBytes )
+    {
+        // If too many elements or elements too large, fall back to the pointer passing method
+        if( VecTIn::size > 64 || VecTOut::size > 64 || sizeof( typename VecTIn::value_type ) > sizeof( unsigned int )
+            || sizeof( typename VecTOut::value_type ) > sizeof( unsigned int ) )
+            return generateASMPtr( inputVector, matrix, matrixOffsetInBytes, rowColumnStrideInBytes, bias, biasOffsetInBytes );
+        else
+        {
+            // This code needs to live in an else block, otherwise the compiler will
+            // complain about the loop being unreachable.
+            unsigned int I[64];  // input elements, one unsigned int slot each; slots at or past VecTIn::size are never written
+            unsigned int O[64];  // output slots; only the first VecTOut::size are read back below
+            for( unsigned int i = 0; i < VecTIn::size; ++i )
+            {
+                I[i] = *( reinterpret_cast<const inputBitType*>( &( inputVector[i] ) ) );  // copy the element's bits, widened to unsigned int
+            }
+            if( VecTOut::size <= 16 && VecTIn::size <= 16 )  // both vectors fit the 16-slot variant
+                asm( "call"
+                     "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15),"
+                     "_optix_matvecmul_16xi32,"
+                     "(%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%26,%27,%28,%29,%30,%31,%32,%33,%34,%35,%36,%37,%38,%39,"
+                     "%40,%41,%42,%43,%44,%45,%46,%47);"
+                     : "=r"( O[0] ), "=r"( O[1] ), "=r"( O[2] ), "=r"( O[3] ), "=r"( O[4] ), "=r"( O[5] ), "=r"( O[6] ),
+                       "=r"( O[7] ), "=r"( O[8] ), "=r"( O[9] ), "=r"( O[10] ), "=r"( O[11] ), "=r"( O[12] ),
+                       "=r"( O[13] ), "=r"( O[14] ), "=r"( O[15] )
+                     : "r"( outputElementType ), "r"( VecTOut::size ), "r"( inputElementType ), "r"( VecTIn::size ),
+                       "r"( inputInterpretation ), "r"( N ), "r"( K ), "l"( matrix ), "r"( matrixOffsetInBytes ),
+                       "r"( rowColumnStrideInBytes ), "r"( matrixLayout ), "r"( (unsigned)transpose ), "r"( matrixElementType ),
+                       "l"( bias ), "r"( biasOffsetInBytes ), "r"( biasElementType ), "r"( I[0] ), "r"( I[1] ),
+                       "r"( I[2] ), "r"( I[3] ), "r"( I[4] ), "r"( I[5] ), "r"( I[6] ), "r"( I[7] ), "r"( I[8] ),
+                       "r"( I[9] ), "r"( I[10] ), "r"( I[11] ), "r"( I[12] ), "r"( I[13] ), "r"( I[14] ), "r"( I[15] ) );
+            else  // otherwise use the 64-slot variant
+                asm( "call"
+                     "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%"
+                     "26,%27,%28,%29,%30,%31,%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%"
+                     "50,%51,%52,%53,%54,%55,%56,%57,%58,%59,%60,%61,%62,%63),"
+                     "_optix_matvecmul_64xi32,"
+                     "(%64,%65,%66,%67,%68,%69,%70,%71,%72,%73,%74,%75,%76,%77,%78,%79,%80,%81,%82,%83,%84,%85,%86,%87,"
+                     "%88,%89,%90,%91,%92,%93,%94,%95,%96,%97,%98,%99,%100,%101,%102,%103,%104,%105,%106,%107,%108,%"
+                     "109,%110,%111,%112,%113,%114,%115,%116,%117,%118,%119,%120,%121,%122,%123,%124,%125,%126,%127,%"
+                     "128,%129,%130,%131,%132,%133,%134,%135,%136,%137,%138,%139,%140,%141,%142,%143);"
+                     : "=r"( O[0] ), "=r"( O[1] ), "=r"( O[2] ), "=r"( O[3] ), "=r"( O[4] ), "=r"( O[5] ), "=r"( O[6] ),
+                       "=r"( O[7] ), "=r"( O[8] ), "=r"( O[9] ), "=r"( O[10] ), "=r"( O[11] ), "=r"( O[12] ),
+                       "=r"( O[13] ), "=r"( O[14] ), "=r"( O[15] ), "=r"( O[16] ), "=r"( O[17] ), "=r"( O[18] ),
+                       "=r"( O[19] ), "=r"( O[20] ), "=r"( O[21] ), "=r"( O[22] ), "=r"( O[23] ), "=r"( O[24] ),
+                       "=r"( O[25] ), "=r"( O[26] ), "=r"( O[27] ), "=r"( O[28] ), "=r"( O[29] ), "=r"( O[30] ),
+                       "=r"( O[31] ), "=r"( O[32] ), "=r"( O[33] ), "=r"( O[34] ), "=r"( O[35] ), "=r"( O[36] ),
+                       "=r"( O[37] ), "=r"( O[38] ), "=r"( O[39] ), "=r"( O[40] ), "=r"( O[41] ), "=r"( O[42] ),
+                       "=r"( O[43] ), "=r"( O[44] ), "=r"( O[45] ), "=r"( O[46] ), "=r"( O[47] ), "=r"( O[48] ),
+                       "=r"( O[49] ), "=r"( O[50] ), "=r"( O[51] ), "=r"( O[52] ), "=r"( O[53] ), "=r"( O[54] ),
+                       "=r"( O[55] ), "=r"( O[56] ), "=r"( O[57] ), "=r"( O[58] ), "=r"( O[59] ), "=r"( O[60] ),
+                       "=r"( O[61] ), "=r"( O[62] ), "=r"( O[63] )
+                     : "r"( outputElementType ), "r"( VecTOut::size ), "r"( inputElementType ), "r"( VecTIn::size ),
+                       "r"( inputInterpretation ), "r"( N ), "r"( K ), "l"( matrix ), "r"( matrixOffsetInBytes ),
+                       "r"( rowColumnStrideInBytes ), "r"( matrixLayout ), "r"( (unsigned)transpose ),
+                       "r"( matrixElementType ), "l"( bias ), "r"( biasOffsetInBytes ), "r"( biasElementType ), "r"( I[0] ),
+                       "r"( I[1] ), "r"( I[2] ), "r"( I[3] ), "r"( I[4] ), "r"( I[5] ), "r"( I[6] ), "r"( I[7] ),
+                       "r"( I[8] ), "r"( I[9] ), "r"( I[10] ), "r"( I[11] ), "r"( I[12] ), "r"( I[13] ), "r"( I[14] ),
+                       "r"( I[15] ), "r"( I[16] ), "r"( I[17] ), "r"( I[18] ), "r"( I[19] ), "r"( I[20] ), "r"( I[21] ),
+                       "r"( I[22] ), "r"( I[23] ), "r"( I[24] ), "r"( I[25] ), "r"( I[26] ), "r"( I[27] ), "r"( I[28] ),
+                       "r"( I[29] ), "r"( I[30] ), "r"( I[31] ), "r"( I[32] ), "r"( I[33] ), "r"( I[34] ), "r"( I[35] ),
+                       "r"( I[36] ), "r"( I[37] ), "r"( I[38] ), "r"( I[39] ), "r"( I[40] ), "r"( I[41] ), "r"( I[42] ),
+                       "r"( I[43] ), "r"( I[44] ), "r"( I[45] ), "r"( I[46] ), "r"( I[47] ), "r"( I[48] ), "r"( I[49] ),
+                       "r"( I[50] ), "r"( I[51] ), "r"( I[52] ), "r"( I[53] ), "r"( I[54] ), "r"( I[55] ), "r"( I[56] ),
+                       "r"( I[57] ), "r"( I[58] ), "r"( I[59] ), "r"( I[60] ), "r"( I[61] ), "r"( I[62] ), "r"( I[63] ) );
+            VecTOut result;
+            for( unsigned int i = 0; i < VecTOut::size; ++i )
+            {
+                outputBitType o = O[i];  // narrow the slot back to the element's bit-pattern type
+                result[i]       = *( reinterpret_cast<typename VecTOut::value_type*>( &( o ) ) );
+            }
+            return result;
+        }
+    }
+};
+
+template <typename VecTIn>
+struct OptixCoopVecReduceSumAccumulateASMGenerator  // Emits the _optix_reduce_sum_accumulate_{ptr,16xi32,64xi32} call for VecTIn.
+{
+    static const OptixCoopVecElemType inputElementType =
+        optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTIn::value_type>::elementType;
+    using inputBitType =
+        typename optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTIn::value_type>::bitType;
+
+    __forceinline__ __device__ static void generateASMPtr( const VecTIn& vecA, CUdeviceptr outputVector, unsigned offsetInBytes )  // passes vecA by address
+    {
+        asm volatile(
+            "call"
+            "(),"
+            "_optix_reduce_sum_accumulate_ptr,"
+            "(%0,%1,%2,%3,%4);"
+            :
+            : "r"( inputElementType ), "r"( VecTIn::size ), "l"( outputVector ), "r"( offsetInBytes ), "l"( vecA.data() ) );
+    }
+
+    __forceinline__ __device__ static void generateASM( const VecTIn& vecA, CUdeviceptr outputVector, unsigned offsetInBytes )  // entry point; picks a call variant
+    {
+        if( VecTIn::size > 64 || sizeof( typename VecTIn::value_type ) > sizeof( unsigned int ) )  // too many or too wide elements: pass by pointer
+            return generateASMPtr( vecA, outputVector, offsetInBytes );
+        else
+        {
+            // This code needs to live in an else block, otherwise the compiler will
+            // complain about the loop being unreachable.
+            unsigned int IA[64];  // one unsigned int slot per element; slots at or past VecTIn::size are never written
+            for( unsigned int i = 0; i < VecTIn::size; ++i )
+            {
+                IA[i] = *( reinterpret_cast<const inputBitType*>( &( vecA[i] ) ) );  // copy the element's bits, widened to unsigned int
+            }
+            if( VecTIn::size <= 16 )  // fits the 16-slot variant
+                asm volatile(
+                    "call"
+                    "(),"
+                    "_optix_reduce_sum_accumulate_16xi32,"
+                    "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19);"
+                    :
+                    : "r"( inputElementType ), "r"( VecTIn::size ), "l"( outputVector ), "r"( offsetInBytes ),
+                      "r"( IA[0] ), "r"( IA[1] ), "r"( IA[2] ), "r"( IA[3] ), "r"( IA[4] ), "r"( IA[5] ), "r"( IA[6] ),
+                      "r"( IA[7] ), "r"( IA[8] ), "r"( IA[9] ), "r"( IA[10] ), "r"( IA[11] ), "r"( IA[12] ),
+                      "r"( IA[13] ), "r"( IA[14] ), "r"( IA[15] ) );
+            else  // otherwise use the 64-slot variant
+                asm volatile(
+                    "call"
+                    "(),"
+                    "_optix_reduce_sum_accumulate_64xi32,"
+                    "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%"
+                    "26,%27,%28,%29,%30,%31,%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%"
+                    "50,%51,%52,%53,%54,%55,%56,%57,%58,%59,%60,%61,%62,%63,"
+                    "%64,%65,%66,%67);"
+                    :
+                    : "r"( inputElementType ), "r"( VecTIn::size ), "l"( outputVector ), "r"( offsetInBytes ), "r"( IA[0] ),
+                      "r"( IA[1] ), "r"( IA[2] ), "r"( IA[3] ), "r"( IA[4] ), "r"( IA[5] ), "r"( IA[6] ), "r"( IA[7] ),
+                      "r"( IA[8] ), "r"( IA[9] ), "r"( IA[10] ), "r"( IA[11] ), "r"( IA[12] ), "r"( IA[13] ), "r"( IA[14] ),
+                      "r"( IA[15] ), "r"( IA[16] ), "r"( IA[17] ), "r"( IA[18] ), "r"( IA[19] ), "r"( IA[20] ),
+                      "r"( IA[21] ), "r"( IA[22] ), "r"( IA[23] ), "r"( IA[24] ), "r"( IA[25] ), "r"( IA[26] ),
+                      "r"( IA[27] ), "r"( IA[28] ), "r"( IA[29] ), "r"( IA[30] ), "r"( IA[31] ), "r"( IA[32] ),
+                      "r"( IA[33] ), "r"( IA[34] ), "r"( IA[35] ), "r"( IA[36] ), "r"( IA[37] ), "r"( IA[38] ),
+                      "r"( IA[39] ), "r"( IA[40] ), "r"( IA[41] ), "r"( IA[42] ), "r"( IA[43] ), "r"( IA[44] ),
+                      "r"( IA[45] ), "r"( IA[46] ), "r"( IA[47] ), "r"( IA[48] ), "r"( IA[49] ), "r"( IA[50] ),
+                      "r"( IA[51] ), "r"( IA[52] ), "r"( IA[53] ), "r"( IA[54] ), "r"( IA[55] ), "r"( IA[56] ), "r"( IA[57] ),
+                      "r"( IA[58] ), "r"( IA[59] ), "r"( IA[60] ), "r"( IA[61] ), "r"( IA[62] ), "r"( IA[63] ) );
+        }
+    }
+};
+
+template <typename VecTA, typename VecTB, OptixCoopVecMatrixLayout matrixLayout>
+struct OptixCoopVecOuterProductAccumulateASMGenerator  // Emits the _optix_outer_product_accumulate_{ptr,16xi32,64xi32} call.
+{
+    static const OptixCoopVecElemType vecAElementType =
+        optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTA::value_type>::elementType;
+    using vecABitType =
+        typename optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTA::value_type>::bitType;
+    static const OptixCoopVecElemType vecBElementType =
+        optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTB::value_type>::elementType;
+    using vecBBitType =
+        typename optix_internal::coop_vec_type_traits::OptixCoopVecElemTypeTrait<typename VecTB::value_type>::bitType;
+
+    __forceinline__ __device__ static void generateASMPtr( const VecTA& vecA,  // pointer-passing variant: both vectors go by address
+                                                           const VecTB& vecB,
+                                                           CUdeviceptr  outputMatrix,
+                                                           unsigned     offsetInBytes,
+                                                           unsigned     rowColumnStrideInBytes )
+    {
+        asm volatile(
+            "call"
+            "(),"
+            "_optix_outer_product_accumulate_ptr,"
+            "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9);"
+            :
+            : "r"( vecAElementType ), "r"( VecTA::size ), "r"( vecBElementType ), "r"( VecTB::size ), "l"( outputMatrix ),
+              "r"( offsetInBytes ), "r"( matrixLayout ), "r"( rowColumnStrideInBytes ), "l"( vecA.data() ), "l"( vecB.data() ) );
+    }
+
+    __forceinline__ __device__ static void generateASM( const VecTA& vecA,  // entry point; picks one of the three call variants
+                                                        const VecTB& vecB,
+                                                        CUdeviceptr  outputMatrix,
+                                                        unsigned     offsetInBytes,
+                                                        unsigned     rowColumnStrideInBytes )
+    {
+        if( VecTA::size > 64 || VecTB::size > 64 || sizeof( typename VecTA::value_type ) > sizeof( unsigned int )
+            || sizeof( typename VecTB::value_type ) > sizeof( unsigned int ) )
+            return generateASMPtr( vecA, vecB, outputMatrix, offsetInBytes, rowColumnStrideInBytes );
+        else
+        {
+            // This code needs to live in an else block, otherwise the compiler will
+            // complain about the loop being unreachable.
+            unsigned int IA[64];  // vecA elements, one unsigned int slot each; slots at or past VecTA::size are never written
+            unsigned int IB[64];  // vecB elements, one unsigned int slot each; slots at or past VecTB::size are never written
+            for( unsigned int i = 0; i < VecTA::size; ++i )
+            {
+                IA[i] = *( reinterpret_cast<const vecABitType*>( &( vecA[i] ) ) );
+            }
+            for( unsigned int i = 0; i < VecTB::size; ++i )
+            {
+                IB[i] = *( reinterpret_cast<const vecBBitType*>( &( vecB[i] ) ) );
+            }
+            if( VecTB::size <= 16 && VecTA::size <= 16 )  // both vectors fit the 16-slot variant
+                asm volatile(
+                    "call"
+                    "(),"
+                    "_optix_outer_product_accumulate_16xi32,"
+                    "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%"
+                    "26,%27,%28,%29,%30,%31,%32,%33,%34,%35,%36,%37,%38,%39);"
+                    :
+                    : "r"( vecAElementType ), "r"( VecTA::size ), "r"( vecBElementType ), "r"( VecTB::size ),
+                      "l"( outputMatrix ), "r"( offsetInBytes ), "r"( matrixLayout ), "r"( rowColumnStrideInBytes ),
+                      "r"( IA[0] ), "r"( IA[1] ), "r"( IA[2] ), "r"( IA[3] ), "r"( IA[4] ), "r"( IA[5] ), "r"( IA[6] ),
+                      "r"( IA[7] ), "r"( IA[8] ), "r"( IA[9] ), "r"( IA[10] ), "r"( IA[11] ), "r"( IA[12] ),
+                      "r"( IA[13] ), "r"( IA[14] ), "r"( IA[15] ), "r"( IB[0] ), "r"( IB[1] ), "r"( IB[2] ),
+                      "r"( IB[3] ), "r"( IB[4] ), "r"( IB[5] ), "r"( IB[6] ), "r"( IB[7] ), "r"( IB[8] ), "r"( IB[9] ),
+                      "r"( IB[10] ), "r"( IB[11] ), "r"( IB[12] ), "r"( IB[13] ), "r"( IB[14] ), "r"( IB[15] ) );
+            else  // otherwise use the 64-slot variant
+                asm volatile(
+                    "call"
+                    "(),"
+                    "_optix_outer_product_accumulate_64xi32,"
+                    "(%0,%1,%2,%3,%4,%5,%6,%7,%8,%9,%10,%11,%12,%13,%14,%15,%16,%17,%18,%19,%20,%21,%22,%23,%24,%25,%"
+                    "26,%27,%28,%29,%30,%31,%32,%33,%34,%35,%36,%37,%38,%39,%40,%41,%42,%43,%44,%45,%46,%47,%48,%49,%"
+                    "50,%51,%52,%53,%54,%55,%56,%57,%58,%59,%60,%61,%62,%63,%64,%65,%66,%67,%68,%69,%70,%71,%72,%73,%"
+                    "74,%75,%76,%77,%78,%79,%80,%81,%82,%83,%84,%85,%86,%87,%88,%89,%90,%91,%92,%93,%94,%95,%96,%97,%"
+                    "98,%99,%100,%101,%102,%103,%104,%105,%106,%107,%108,%109,%110,%111,%112,%113,%114,%115,%116,%117,"
+                    "%118,%119,%120,%121,%122,%123,%124,%125,%126,%127,%128,%129,%130,%131,%132,%133,%134,%135);"
+                    :
+                    : "r"( vecAElementType ), "r"( VecTA::size ), "r"( vecBElementType ), "r"( VecTB::size ),
+                      "l"( outputMatrix ), "r"( offsetInBytes ), "r"( matrixLayout ), "r"( rowColumnStrideInBytes ),
+                      "r"( IA[0] ), "r"( IA[1] ), "r"( IA[2] ), "r"( IA[3] ), "r"( IA[4] ), "r"( IA[5] ), "r"( IA[6] ),
+                      "r"( IA[7] ), "r"( IA[8] ), "r"( IA[9] ), "r"( IA[10] ), "r"( IA[11] ), "r"( IA[12] ),
+                      "r"( IA[13] ), "r"( IA[14] ), "r"( IA[15] ), "r"( IA[16] ), "r"( IA[17] ), "r"( IA[18] ),
+                      "r"( IA[19] ), "r"( IA[20] ), "r"( IA[21] ), "r"( IA[22] ), "r"( IA[23] ), "r"( IA[24] ),
+                      "r"( IA[25] ), "r"( IA[26] ), "r"( IA[27] ), "r"( IA[28] ), "r"( IA[29] ), "r"( IA[30] ),
+                      "r"( IA[31] ), "r"( IA[32] ), "r"( IA[33] ), "r"( IA[34] ), "r"( IA[35] ), "r"( IA[36] ),
+                      "r"( IA[37] ), "r"( IA[38] ), "r"( IA[39] ), "r"( IA[40] ), "r"( IA[41] ), "r"( IA[42] ),
+                      "r"( IA[43] ), "r"( IA[44] ), "r"( IA[45] ), "r"( IA[46] ), "r"( IA[47] ), "r"( IA[48] ),
+                      "r"( IA[49] ), "r"( IA[50] ), "r"( IA[51] ), "r"( IA[52] ), "r"( IA[53] ), "r"( IA[54] ),
+                      "r"( IA[55] ), "r"( IA[56] ), "r"( IA[57] ), "r"( IA[58] ), "r"( IA[59] ), "r"( IA[60] ),
+                      "r"( IA[61] ), "r"( IA[62] ), "r"( IA[63] ), "r"( IB[0] ), "r"( IB[1] ), "r"( IB[2] ),
+                      "r"( IB[3] ), "r"( IB[4] ), "r"( IB[5] ), "r"( IB[6] ), "r"( IB[7] ), "r"( IB[8] ), "r"( IB[9] ),
+                      "r"( IB[10] ), "r"( IB[11] ), "r"( IB[12] ), "r"( IB[13] ), "r"( IB[14] ), "r"( IB[15] ),
+                      "r"( IB[16] ), "r"( IB[17] ), "r"( IB[18] ), "r"( IB[19] ), "r"( IB[20] ), "r"( IB[21] ),
+                      "r"( IB[22] ), "r"( IB[23] ), "r"( IB[24] ), "r"( IB[25] ), "r"( IB[26] ), "r"( IB[27] ),
+                      "r"( IB[28] ), "r"( IB[29] ), "r"( IB[30] ), "r"( IB[31] ), "r"( IB[32] ), "r"( IB[33] ),
+                      "r"( IB[34] ), "r"( IB[35] ), "r"( IB[36] ), "r"( IB[37] ), "r"( IB[38] ), "r"( IB[39] ),
+                      "r"( IB[40] ), "r"( IB[41] ), "r"( IB[42] ), "r"( IB[43] ), "r"( IB[44] ), "r"( IB[45] ),
+                      "r"( IB[46] ), "r"( IB[47] ), "r"( IB[48] ), "r"( IB[49] ), "r"( IB[50] ), "r"( IB[51] ),
+                      "r"( IB[52] ), "r"( IB[53] ), "r"( IB[54] ), "r"( IB[55] ), "r"( IB[56] ), "r"( IB[57] ),
+                      "r"( IB[58] ), "r"( IB[59] ), "r"( IB[60] ), "r"( IB[61] ), "r"( IB[62] ), "r"( IB[63] ) );
+        }
+    }
+};
+}  // end namespace optix_internal
+
+
+template <typename VecTOut,  // Matrix-vector multiply with a bias operand; all template arguments are spelled out by the caller.
+          typename VecTIn,
+          OptixCoopVecElemType     inputInterpretation,
+          OptixCoopVecMatrixLayout matrixLayout,
+          bool                     transpose,
+          unsigned int             N,
+          unsigned int             K,
+          OptixCoopVecElemType     matrixElementType,
+          OptixCoopVecElemType     biasElementType>
+static __forceinline__ __device__ VecTOut optixCoopVecMatMul( const VecTIn& inputVector,
+                                                              CUdeviceptr matrix,  // 64 byte aligned, Array of KxN elements
+                                                              unsigned    matrixOffsetInBytes,  // 64 byte aligned
+                                                              CUdeviceptr bias,  // 16 byte aligned, Array of N elements
+                                                              unsigned    biasOffsetInBytes,  // 16 byte aligned
+                                                              unsigned    rowColumnStrideInBytes )
+{
+    using MatMulGen = optix_internal::OptixCoopVecMatMulASMGenerator<VecTOut, VecTIn, inputInterpretation, matrixLayout, transpose, N, K, matrixElementType, biasElementType>;
+    return MatMulGen::generateASM( inputVector, matrix, matrixOffsetInBytes, rowColumnStrideInBytes, bias, biasOffsetInBytes );  // generator takes the stride before the bias pair
+}
+
+template <typename VecTOut,  // Matrix-vector multiply without a bias operand; all template arguments are spelled out by the caller.
+          typename VecTIn,
+          OptixCoopVecElemType     inputInterpretation,
+          OptixCoopVecMatrixLayout matrixLayout,
+          bool                     transpose,
+          unsigned int             N,
+          unsigned int             K,
+          OptixCoopVecElemType     matrixElementType>
+static __forceinline__ __device__ VecTOut optixCoopVecMatMul( const VecTIn& inputVector,
+                                                              CUdeviceptr matrix,  // 64 byte aligned, Array of KxN elements
+                                                              unsigned matrixOffsetInBytes,  // 64 byte aligned
+                                                              unsigned rowColumnStrideInBytes )
+{
+    // No bias: the generator is instantiated with OPTIX_COOP_VEC_ELEM_TYPE_UNKNOWN and receives 0 for bias pointer and offset.
+    using NoBiasGen =
+        optix_internal::OptixCoopVecMatMulASMGenerator<VecTOut, VecTIn, inputInterpretation, matrixLayout, transpose, N, K,
+                                                       matrixElementType, OPTIX_COOP_VEC_ELEM_TYPE_UNKNOWN>;
+    return NoBiasGen::generateASM( inputVector, matrix, matrixOffsetInBytes, rowColumnStrideInBytes, 0, 0 );
+}
+
+template <typename VecTIn>  // Public entry point; hands all three arguments unchanged to the ReduceSumAccumulate generator.
+static __forceinline__ __device__ void optixCoopVecReduceSumAccumulate( const VecTIn& inputVector, CUdeviceptr outputVector, unsigned offsetInBytes )
+{
+    optix_internal::OptixCoopVecReduceSumAccumulateASMGenerator<VecTIn>::generateASM( inputVector, outputVector, offsetInBytes );
+}
+
+template <typename VecTA, typename VecTB, OptixCoopVecMatrixLayout matrixLayout>  // matrixLayout is last and not deducible, so callers name all three
+static __forceinline__ __device__ void optixCoopVecOuterProductAccumulate( const VecTA& vecA,
+                                                                           const VecTB& vecB,
+                                                                           CUdeviceptr  outputMatrix,
+                                                                           unsigned     offsetInBytes,
+                                                                           unsigned     rowColumnStrideInBytes )
+{
+    using OuterProductGen = optix_internal::OptixCoopVecOuterProductAccumulateASMGenerator<VecTA, VecTB, matrixLayout>;
+    OuterProductGen::generateASM( vecA, vecB, outputMatrix, offsetInBytes, rowColumnStrideInBytes );
+}
+
+
+template <unsigned int N, unsigned int K, OptixCoopVecElemType elementType, OptixCoopVecMatrixLayout layout, unsigned int rowColumnStrideInBytes>
+static __forceinline__ __device__ unsigned int optixCoopVecGetMatrixSize()  // every input is a template argument; none are runtime values
+{
+    unsigned int size;  // sole output operand of the call below
+    asm( "call"
+         "(%0),"
+         "_optix_coop_vec_get_matrix_size,"
+         "(%1,%2,%3,%4,%5);"
+         : "=r"( size )
+         : "r"( N ), "r"( K ), "r"( elementType ), "r"( layout ), "r"( rowColumnStrideInBytes ) );
+    return size;
+}
+
+#endif  // #ifndef OPTIX_OPTIX_DEVICE_IMPL_COOP_VEC_H
diff --git a/crtx/optix_9.1/internal/optix_device_impl_transformations.h b/crtx/optix_9.1/internal/optix_device_impl_transformations.h
new file mode 100644
index 0000000..dd5b958
--- /dev/null
+++ b/crtx/optix_9.1/internal/optix_device_impl_transformations.h
@@ -0,0 +1,422 @@
+/* 
+* SPDX-FileCopyrightText: Copyright (c) 2019 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+* SPDX-License-Identifier: LicenseRef-NvidiaProprietary 
+* 
+* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 
+* property and proprietary rights in and to this material, related 
+* documentation and any modifications thereto. Any use, reproduction, 
+* disclosure or distribution of this material and related documentation 
+* without an express license agreement from NVIDIA CORPORATION or 
+* its affiliates is strictly prohibited. 
+*/
+/**
+* @file   optix_device_impl_transformations.h
+* @author NVIDIA Corporation
+* @brief  OptiX public API
+*
+* OptiX public API Reference - Device side implementation for transformation helper functions.
+*/
+
+#if !defined( __OPTIX_INCLUDE_INTERNAL_HEADERS__ )
+#error("optix_device_impl_transformations.h is an internal header file and must not be used directly.  Please use optix_device.h or optix.h instead.")
+#endif
+
+#ifndef OPTIX_OPTIX_DEVICE_IMPL_TRANSFORMATIONS_H
+#define OPTIX_OPTIX_DEVICE_IMPL_TRANSFORMATIONS_H
+
+namespace optix_impl {
+
+static __forceinline__ __device__ float4 optixAddFloat4( const float4& a, const float4& b )
+{
+    return { a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w };  // component-wise sum
+}
+
+static __forceinline__ __device__ float4 optixMulFloat4( const float4& a, float b )
+{
+    return { a.x * b, a.y * b, a.z * b, a.w * b };  // every component scaled by b
+}
+
+static __forceinline__ __device__ uint4 optixLdg( unsigned long long addr )
+{
+    uint4        loaded;     // receives the four 32-bit words read from addr
+    const uint4* globalPtr;  // addr after conversion to a global-space address
+    asm volatile( "cvta.to.global.u64 %0, %1;" : "=l"( globalPtr ) : "l"( addr ) );
+    asm volatile( "ld.global.v4.u32 {%0,%1,%2,%3}, [%4];"
+                  : "=r"( loaded.x ), "=r"( loaded.y ), "=r"( loaded.z ), "=r"( loaded.w )
+                  : "l"( globalPtr ) );
+    return loaded;
+}
+
+template <class T>
+static __forceinline__ __device__ T optixLoadReadOnlyAlign16( const T* ptr )
+{
+    // Copies *ptr into a local staging object 16 bytes at a time via optixLdg (debug builds may keep this temporary).
+    // T itself may not enforce 16B alignment, so the staging object is aligned explicitly to keep the uint4 stores valid.
+    __align__(16) T staged;
+    for( unsigned int byteOfs = 0; byteOfs < (unsigned int)sizeof( T ); byteOfs += 16 )
+        *(uint4*)( (char*)&staged + byteOfs ) = optixLdg( (unsigned long long)( (char*)ptr + byteOfs ) );
+    return staged;
+}
+
+// Multiplies the row vector vec with the 3x4 matrix whose rows are m0, m1 and m2
+static __forceinline__ __device__ float4 optixMultiplyRowMatrix( const float4 vec, const float4 m0, const float4 m1, const float4 m2 )
+{
+    // Each component combines vec.x, vec.y, vec.z with the matching column of the matrix;
+    // only the fourth component additionally adds vec.w.
+    const float4 product = make_float4( vec.x * m0.x + vec.y * m1.x + vec.z * m2.x,
+                                        vec.x * m0.y + vec.y * m1.y + vec.z * m2.y,
+                                        vec.x * m0.z + vec.y * m1.z + vec.z * m2.z,
+                                        vec.x * m0.w + vec.y * m1.w + vec.z * m2.w + vec.w );
+
+    return product;
+}
+
+// Converts the SRT transformation srt into a 3x4 matrix with rows m0, m1, and m2 (all twelve components are overwritten)
+static __forceinline__ __device__ void optixGetMatrixFromSrt( float4& m0, float4& m1, float4& m2, const OptixSRTData& srt )
+{
+    // quaternion taken from srt; assumed to be normalized (it is not renormalized here)
+    const float4 q = {srt.qx, srt.qy, srt.qz, srt.qw};
+
+    const float sqw = q.w * q.w;
+    const float sqx = q.x * q.x;
+    const float sqy = q.y * q.y;
+    const float sqz = q.z * q.z;
+
+    const float xy = q.x * q.y;
+    const float zw = q.z * q.w;
+    const float xz = q.x * q.z;
+    const float yw = q.y * q.w;
+    const float yz = q.y * q.z;
+    const float xw = q.x * q.w;
+    // Step 1: fill .x/.y/.z of the three rows with the 3x3 matrix derived from the quaternion products above
+    m0.x = ( sqx - sqy - sqz + sqw );
+    m0.y = 2.0f * ( xy - zw );
+    m0.z = 2.0f * ( xz + yw );
+
+    m1.x = 2.0f * ( xy + zw );
+    m1.y = ( -sqx + sqy - sqz + sqw );
+    m1.z = 2.0f * ( yz - xw );
+
+    m2.x = 2.0f * ( xz - yw );
+    m2.y = 2.0f * ( yz + xw );
+    m2.z = ( -sqx - sqy + sqz + sqw );
+    // Step 2: fourth column = (3x3 part) * (pvx, pvy, pvz) + (tx, ty, tz); must run before .x/.y/.z are modified below
+    m0.w = m0.x * srt.pvx + m0.y * srt.pvy + m0.z * srt.pvz + srt.tx;
+    m1.w = m1.x * srt.pvx + m1.y * srt.pvy + m1.z * srt.pvz + srt.ty;
+    m2.w = m2.x * srt.pvx + m2.y * srt.pvy + m2.z * srt.pvz + srt.tz;
+    // Step 3: fold in sx, sy, sz, a, b, c column by column; .z goes first because it reads the still unmodified .x and .y
+    m0.z = m0.x * srt.b + m0.y * srt.c + m0.z * srt.sz;
+    m1.z = m1.x * srt.b + m1.y * srt.c + m1.z * srt.sz;
+    m2.z = m2.x * srt.b + m2.y * srt.c + m2.z * srt.sz;
+    // .y next: it reads the still unmodified .x
+    m0.y = m0.x * srt.a + m0.y * srt.sy;
+    m1.y = m1.x * srt.a + m1.y * srt.sy;
+    m2.y = m2.x * srt.a + m2.y * srt.sy;
+    // .x last: nothing after this point depends on its old value
+    m0.x = m0.x * srt.sx;
+    m1.x = m1.x * srt.sx;
+    m2.x = m2.x * srt.sx;
+}
+
+// Inverts a 3x4 matrix in place, treating .x/.y/.z of the rows as the 3x3 linear part and .w as the translation column
+static __forceinline__ __device__ void optixInvertMatrix( float4& m0, float4& m1, float4& m2 )
+{
+    const float det3 =
+        m0.x * ( m1.y * m2.z - m1.z * m2.y ) - m0.y * ( m1.x * m2.z - m1.z * m2.x ) + m0.z * ( m1.x * m2.y - m1.y * m2.x );
+
+    const float inv_det3 = 1.0f / det3;  // note: a singular matrix (det3 == 0) is not handled specially
+
+    float inv3[3][3];  // inverse of the 3x3 part: 2x2 sub-determinants scaled by 1/det3
+    inv3[0][0] = inv_det3 * ( m1.y * m2.z - m2.y * m1.z );
+    inv3[0][1] = inv_det3 * ( m0.z * m2.y - m2.z * m0.y );
+    inv3[0][2] = inv_det3 * ( m0.y * m1.z - m1.y * m0.z );
+
+    inv3[1][0] = inv_det3 * ( m1.z * m2.x - m2.z * m1.x );
+    inv3[1][1] = inv_det3 * ( m0.x * m2.z - m2.x * m0.z );
+    inv3[1][2] = inv_det3 * ( m0.z * m1.x - m1.z * m0.x );
+
+    inv3[2][0] = inv_det3 * ( m1.x * m2.y - m2.x * m1.y );
+    inv3[2][1] = inv_det3 * ( m0.y * m2.x - m2.y * m0.x );
+    inv3[2][2] = inv_det3 * ( m0.x * m1.y - m1.x * m0.y );
+
+    const float b[3] = {m0.w, m1.w, m2.w};  // original translation column, saved before the rows are overwritten
+    // Write back: each row receives its inv3 row, and the new translation is -(inv3 row) dotted with b
+    m0.x = inv3[0][0];
+    m0.y = inv3[0][1];
+    m0.z = inv3[0][2];
+    m0.w = -inv3[0][0] * b[0] - inv3[0][1] * b[1] - inv3[0][2] * b[2];
+
+    m1.x = inv3[1][0];
+    m1.y = inv3[1][1];
+    m1.z = inv3[1][2];
+    m1.w = -inv3[1][0] * b[0] - inv3[1][1] * b[1] - inv3[1][2] * b[2];
+
+    m2.x = inv3[2][0];
+    m2.y = inv3[2][1];
+    m2.z = inv3[2][2];
+    m2.w = -inv3[2][0] * b[0] - inv3[2][1] * b[1] - inv3[2][2] * b[2];
+}
+
+static __forceinline__ __device__ void optixLoadInterpolatedMatrixKey( float4& m0, float4& m1, float4& m2, const float4* matrix, const float t1 )
+{
+    m0 = optixLoadReadOnlyAlign16( matrix + 0 );
+    m1 = optixLoadReadOnlyAlign16( matrix + 1 );
+    m2 = optixLoadReadOnlyAlign16( matrix + 2 );
+
+    // Only touch the second key (matrix[3..5]) when it contributes; the branch also keeps the loads from overlapping (spills)
+    if( !( t1 > 0.0f ) )
+        return;
+
+    const float w0 = 1.0f - t1;
+    m0 = optixAddFloat4( optixMulFloat4( m0, w0 ), optixMulFloat4( optixLoadReadOnlyAlign16( matrix + 3 ), t1 ) );
+    m1 = optixAddFloat4( optixMulFloat4( m1, w0 ), optixMulFloat4( optixLoadReadOnlyAlign16( matrix + 4 ), t1 ) );
+    m2 = optixAddFloat4( optixMulFloat4( m2, w0 ), optixMulFloat4( optixLoadReadOnlyAlign16( matrix + 5 ), t1 ) );
+}
+
+static __forceinline__ __device__ void optixLoadInterpolatedSrtKey( float4&       srt0,
+                                                                    float4&       srt1,
+                                                                    float4&       srt2,
+                                                                    float4&       srt3,
+                                                                    const float4* srt,
+                                                                    const float   t1 )
+{
+    srt0 = optixLoadReadOnlyAlign16( srt + 0 );
+    srt1 = optixLoadReadOnlyAlign16( srt + 1 );
+    srt2 = optixLoadReadOnlyAlign16( srt + 2 );
+    srt3 = optixLoadReadOnlyAlign16( srt + 3 );
+
+    // Only touch the second key (srt[4..7]) when it contributes; the branch also keeps the loads from overlapping (spills)
+    if( !( t1 > 0.0f ) )
+        return;
+
+    const float w0 = 1.0f - t1;
+    srt0 = optixAddFloat4( optixMulFloat4( srt0, w0 ), optixMulFloat4( optixLoadReadOnlyAlign16( srt + 4 ), t1 ) );
+    srt1 = optixAddFloat4( optixMulFloat4( srt1, w0 ), optixMulFloat4( optixLoadReadOnlyAlign16( srt + 5 ), t1 ) );
+    srt2 = optixAddFloat4( optixMulFloat4( srt2, w0 ), optixMulFloat4( optixLoadReadOnlyAlign16( srt + 6 ), t1 ) );
+    srt3 = optixAddFloat4( optixMulFloat4( srt3, w0 ), optixMulFloat4( optixLoadReadOnlyAlign16( srt + 7 ), t1 ) );
+
+    float rcpLen = 1.f / sqrt( srt2.y * srt2.y + srt2.z * srt2.z + srt2.w * srt2.w + srt3.x * srt3.x );  // rescale (srt2.y, srt2.z, srt2.w, srt3.x) to unit length; presumably the quaternion - TODO confirm
+    srt2.y *= rcpLen;
+    srt2.z *= rcpLen;
+    srt2.w *= rcpLen;
+    srt3.x *= rcpLen;
+}
+
+static __forceinline__ __device__ void optixResolveMotionKey( float& localt, int& key, const OptixMotionOptions& options, const float globalt )
+{
+    const float lastInterval = (float)( options.numKeys - 1 );
+    const float begin        = options.timeBegin;
+    const float end          = options.timeEnd;
+
+    // The motion flags need no inspection here: for data reached through a valid transform list handle,
+    // globalt is either in range or the vanish flags are not set.
+
+    // Map globalt into key space; the result is NaN or lies in [0,lastInterval]
+    float keySpaceTime = max( 0.f, min( lastInterval, lastInterval * __fdividef( globalt - begin, end - begin ) ) );
+
+    // NaN compares unequal to itself (for example when timeBegin=timeEnd); fall back to the first key
+    if( keySpaceTime != keySpaceTime )
+        keySpaceTime = 0.f;
+
+    // Index of the left key, capped at lastInterval - 1 so that key + 1 is still a valid key
+    const float leftKey = fminf( floorf( keySpaceTime ), lastInterval - 1 );
+    localt = keySpaceTime - leftKey;
+    key    = (int)leftKey;
+}
+
+// Evaluates a matrix motion transformation at the given time and returns the interpolated 3x4 matrix in trf0, trf1, trf2.
+static __forceinline__ __device__ void optixGetInterpolatedTransformation( float4&                           trf0,
+                                                                           float4&                           trf1,
+                                                                           float4&                           trf2,
+                                                                           const OptixMatrixMotionTransform* transformData,
+                                                                           const float                       time )
+{
+    // Resolve the left key index and the position of `time` relative to that key
+    int   leftKey;
+    float blend;
+    optixResolveMotionKey( blend, leftKey, optixLoadReadOnlyAlign16( transformData ).motionOptions, time );
+
+    // View the key data as consecutive float4 rows, starting at the left key
+    const float4* keyRows = (const float4*)( &transformData->transform[leftKey][0] );
+
+    // Fetch the left key and blend it with the following key where required
+    optixLoadInterpolatedMatrixKey( trf0, trf1, trf2, keyRows, blend );
+}
+
+// Evaluates an SRT motion transformation at the given time and returns the equivalent 3x4 matrix in trf0, trf1, trf2.
+static __forceinline__ __device__ void optixGetInterpolatedTransformation( float4&                        trf0,
+                                                                           float4&                        trf1,
+                                                                           float4&                        trf2,
+                                                                           const OptixSRTMotionTransform* transformData,
+                                                                           const float                    time )
+{
+    // Resolve the left key index and the position of `time` relative to that key
+    int   leftKey;
+    float blend;
+    optixResolveMotionKey( blend, leftKey, optixLoadReadOnlyAlign16( transformData ).motionOptions, time );
+
+    // View the left SRT key as raw float4 data
+    const float4* keyData = reinterpret_cast<const float4*>( &transformData->srtData[leftKey] );
+
+    // Load and interpolate the SRT keys
+    float4 packed[4];
+    optixLoadInterpolatedSrtKey( packed[0], packed[1], packed[2], packed[3], keyData, blend );
+
+    OptixSRTData srt = {packed[0].x, packed[0].y, packed[0].z, packed[0].w, packed[1].x, packed[1].y, packed[1].z, packed[1].w,
+                        packed[2].x, packed[2].y, packed[2].z, packed[2].w, packed[3].x, packed[3].y, packed[3].z, packed[3].w};
+
+    // Turn the interpolated SRT values into matrix rows
+    optixGetMatrixFromSrt( trf0, trf1, trf2, srt );
+}
+
+// Returns the 3x4 transformation matrix of a single traversable handle at the given time, in the direction chosen by objectToWorld.
+static __forceinline__ __device__ void optixGetInterpolatedTransformationFromHandle( float4&                      trf0,
+                                                                                     float4&                      trf1,
+                                                                                     float4&                      trf2,
+                                                                                     const OptixTraversableHandle handle,
+                                                                                     const float                  time,
+                                                                                     const bool objectToWorld )
+{
+    const OptixTransformType kind           = optixGetTransformTypeFromHandle( handle );
+    const bool               isMatrixMotion = ( kind == OPTIX_TRANSFORM_TYPE_MATRIX_MOTION_TRANSFORM );
+
+    // Motion transforms: interpolate the keys at `time`, then invert the result if world-to-object was requested
+    if( isMatrixMotion || kind == OPTIX_TRANSFORM_TYPE_SRT_MOTION_TRANSFORM )
+    {
+        if( isMatrixMotion )
+        {
+            const OptixMatrixMotionTransform* motion = optixGetMatrixMotionTransformFromHandle( handle );
+            optixGetInterpolatedTransformation( trf0, trf1, trf2, motion, time );
+        }
+        else
+        {
+            const OptixSRTMotionTransform* motion = optixGetSRTMotionTransformFromHandle( handle );
+            optixGetInterpolatedTransformation( trf0, trf1, trf2, motion, time );
+        }
+        if( !objectToWorld )
+            optixInvertMatrix( trf0, trf1, trf2 );
+        return;
+    }
+
+    // Instances and static transforms provide both directions; load the three rows of the requested one
+    if( kind == OPTIX_TRANSFORM_TYPE_INSTANCE || kind == OPTIX_TRANSFORM_TYPE_STATIC_TRANSFORM )
+    {
+        const float4* rows;
+        if( kind == OPTIX_TRANSFORM_TYPE_STATIC_TRANSFORM )
+        {
+            const OptixStaticTransform* staticData = optixGetStaticTransformFromHandle( handle );
+            rows = (const float4*)( objectToWorld ? staticData->transform : staticData->invTransform );
+        }
+        else
+            rows = objectToWorld ? optixGetInstanceTransformFromHandle( handle ) :
+                                   optixGetInstanceInverseTransformFromHandle( handle );
+        trf0 = optixLoadReadOnlyAlign16( rows + 0 );
+        trf1 = optixLoadReadOnlyAlign16( rows + 1 );
+        trf2 = optixLoadReadOnlyAlign16( rows + 2 );
+        return;
+    }
+
+    // Every other handle type yields the identity
+    trf0 = {1.0f, 0.0f, 0.0f, 0.0f};
+    trf1 = {0.0f, 1.0f, 0.0f, 0.0f};
+    trf2 = {0.0f, 0.0f, 1.0f, 0.0f};
+}
+
+// Builds the world-to-object matrix of a hit object by walking its transform list from first to last entry at the hit's ray time.
+template<typename HitState>
+static __forceinline__ __device__ void optixGetWorldToObjectTransformMatrix( const HitState& hs, float4& m0, float4& m1, float4& m2 )
+{
+    const unsigned int count   = hs.getTransformListSize();
+    const float        rayTime = hs.getRayTime();
+
+#pragma unroll 1
+    for( unsigned int level = 0; level < count; ++level )
+    {
+        float4                       cur0, cur1, cur2;
+        const OptixTraversableHandle handle = hs.getTransformListHandle( level );
+        optixGetInterpolatedTransformationFromHandle( cur0, cur1, cur2, handle, rayTime, /*objectToWorld*/ false );
+
+        // The first visited entry seeds the result
+        if( level == 0 )
+        {
+            m0 = cur0;
+            m1 = cur1;
+            m2 = cur2;
+            continue;
+        }
+
+        // m := cur * m
+        // (the old rows are copied first because m0, m1, m2 are overwritten one after another)
+        const float4 old0 = m0, old1 = m1, old2 = m2;
+        m0 = optixMultiplyRowMatrix( cur0, old0, old1, old2 );
+        m1 = optixMultiplyRowMatrix( cur1, old0, old1, old2 );
+        m2 = optixMultiplyRowMatrix( cur2, old0, old1, old2 );
+    }
+}
+
+// Builds the object-to-world matrix of a hit object by walking its transform list from last to first entry at the hit's ray time.
+template<typename HitState>
+static __forceinline__ __device__ void optixGetObjectToWorldTransformMatrix( const HitState& hs, float4& m0, float4& m1, float4& m2 )
+{
+    const int   count   = hs.getTransformListSize();
+    const float rayTime = hs.getRayTime();
+
+#pragma unroll 1
+    for( int level = count - 1; level >= 0; --level )
+    {
+        float4                       cur0, cur1, cur2;
+        const OptixTraversableHandle handle = hs.getTransformListHandle( level );
+        optixGetInterpolatedTransformationFromHandle( cur0, cur1, cur2, handle, rayTime, /*objectToWorld*/ true );
+
+        // The first visited entry (the last one in the list) seeds the result
+        if( level == count - 1 )
+        {
+            m0 = cur0;
+            m1 = cur1;
+            m2 = cur2;
+            continue;
+        }
+
+        // m := cur * m
+        // (the old rows are copied first because m0, m1, m2 are overwritten one after another)
+        const float4 old0 = m0, old1 = m1, old2 = m2;
+        m0 = optixMultiplyRowMatrix( cur0, old0, old1, old2 );
+        m1 = optixMultiplyRowMatrix( cur1, old0, old1, old2 );
+        m2 = optixMultiplyRowMatrix( cur2, old0, old1, old2 );
+    }
+}
+
+// Applies the 3x4 matrix with rows m0, m1, m2 to the point p, including the translation column (.w).
+static __forceinline__ __device__ float3 optixTransformPoint( const float4& m0, const float4& m1, const float4& m2, const float3& p )
+{
+    // Component i is row i dotted with (p.x, p.y, p.z, 1)
+    const float3 transformed = make_float3( m0.x * p.x + m0.y * p.y + m0.z * p.z + m0.w,
+                                            m1.x * p.x + m1.y * p.y + m1.z * p.z + m1.w,
+                                            m2.x * p.x + m2.y * p.y + m2.z * p.z + m2.w );
+    return transformed;
+}
+
+// Applies only the 3x3 linear part of the 3x4 matrix with rows m0, m1, m2 to the vector v (the .w column is ignored).
+static __forceinline__ __device__ float3 optixTransformVector( const float4& m0, const float4& m1, const float4& m2, const float3& v )
+{
+    // Component i is the .x/.y/.z part of row i dotted with v
+    const float3 transformed = make_float3( m0.x * v.x + m0.y * v.y + m0.z * v.z,
+                                            m1.x * v.x + m1.y * v.y + m1.z * v.z,
+                                            m2.x * v.x + m2.y * v.y + m2.z * v.z );
+    return transformed;
+}
+
+// Applies the transpose of the 3x3 linear part of the 3x4 matrix with rows m0, m1, m2 to the normal n.
+// The caller is expected to pass the inverse of the actual transformation matrix.
+static __forceinline__ __device__ float3 optixTransformNormal( const float4& m0, const float4& m1, const float4& m2, const float3& n )
+{
+    // Component i is column i of the 3x3 part dotted with n, which is what multiplying by the transpose means
+    const float3 transformed = make_float3( m0.x * n.x + m1.x * n.y + m2.x * n.z,
+                                            m0.y * n.x + m1.y * n.y + m2.y * n.z,
+                                            m0.z * n.x + m1.z * n.y + m2.z * n.z );
+    return transformed;
+}
+
+}  // namespace optix_impl
+
+#endif // OPTIX_OPTIX_DEVICE_IMPL_TRANSFORMATIONS_H
diff --git a/crtx/optix_9.1/internal/optix_micromap_impl.h b/crtx/optix_9.1/internal/optix_micromap_impl.h
new file mode 100644
index 0000000..a98ef28
--- /dev/null
+++ b/crtx/optix_9.1/internal/optix_micromap_impl.h
@@ -0,0 +1,185 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ 
+
+
+/**
+* @file   optix_micromap_impl.h
+* @author NVIDIA Corporation
+* @brief  OptiX micromap helper functions
+*/
+
+#ifndef OPTIX_OPTIX_MICROMAP_IMPL_H
+#define OPTIX_OPTIX_MICROMAP_IMPL_H
+
+#ifndef OPTIX_MICROMAP_FUNC
+#ifdef __CUDACC__
+#define OPTIX_MICROMAP_FUNC __device__
+#else
+#define OPTIX_MICROMAP_FUNC
+#endif
+#endif
+
+namespace optix_impl {
+
+/** \addtogroup optix_utilities
+@{
+*/
+
+#define OPTIX_MICROMAP_INLINE_FUNC OPTIX_MICROMAP_FUNC inline
+
+#ifdef __CUDACC__
+// CUDA compilation: the device implementation of __uint_as_float is declared in cuda_runtime.h
+#else
+// Host stand-in for __uint_as_float: reinterprets the bits of x as a float through a union
+OPTIX_MICROMAP_INLINE_FUNC float __uint_as_float( unsigned int x )
+{
+    union { unsigned int asUint; float asFloat; } pun;
+    pun.asUint = x;
+    return pun.asFloat;
+}
+#endif
+
+// Extract even bits: packs bits 0,2,4,...,30 of x into bits 0..15 of the result; the odd bits of x are discarded
+OPTIX_MICROMAP_INLINE_FUNC unsigned int extractEvenBits( unsigned int x )
+{
+    x &= 0x55555555;                      // keep only the even-numbered bits
+    x = ( x | ( x >> 1 ) ) & 0x33333333;  // pack neighbouring kept bits into groups of 2
+    x = ( x | ( x >> 2 ) ) & 0x0f0f0f0f;  // groups of 4
+    x = ( x | ( x >> 4 ) ) & 0x00ff00ff;  // groups of 8
+    x = ( x | ( x >> 8 ) ) & 0x0000ffff;  // all 16 kept bits now sit in the low half
+    return x;
+}
+
+
+// Calculate a running exclusive-or (XOR) over the bits of x using log(n) XORs and shifts: result bit i is the XOR of input bits i..i+15 (bits above 31 count as zero)
+OPTIX_MICROMAP_INLINE_FUNC unsigned int prefixEor( unsigned int x )
+{
+    x ^= x >> 1;  // each bit now covers 2 input bits
+    x ^= x >> 2;  // 4 input bits
+    x ^= x >> 4;  // 8 input bits
+    x ^= x >> 8;  // 16 input bits
+    return x;
+}
+
+// Convert distance along the curve (index) to discrete barycentrics; u, v and w are always written
+OPTIX_MICROMAP_INLINE_FUNC void index2dbary( unsigned int index, unsigned int& u, unsigned int& v, unsigned int& w )
+{
+    unsigned int b0 = extractEvenBits( index );       // bits 0,2,4,... of index
+    unsigned int b1 = extractEvenBits( index >> 1 );  // bits 1,3,5,... of index
+
+    unsigned int fx = prefixEor( b0 );        // running XOR over b0
+    unsigned int fy = prefixEor( b0 & ~b1 );  // running XOR over the positions where b0 is set and b1 is clear
+
+    unsigned int t = fy ^ b1;
+
+    u = ( fx & ~t ) | ( b0 & ~t ) | ( ~b0 & ~fx & t );
+    v = fy ^ b0;
+    w = ( ~fx & ~t ) | ( b0 & ~t ) | ( ~b0 & fx & t );
+}
+
+// Compute barycentrics of a sub or micro triangle wrt a base triangle.  The order of the returned
+// bary0, bary1, bary2 matters and allows for using this function for sub triangles and the
+// conversion from sub triangle to base triangle barycentric space
+OPTIX_MICROMAP_INLINE_FUNC void micro2bary( unsigned int index, unsigned int subdivisionLevel, float2& bary0, float2& bary1, float2& bary2 )
+{
+    if( subdivisionLevel == 0 )  // level 0: return the corners (0,0), (1,0), (0,1) without looking at index
+    {
+        bary0 = { 0, 0 };
+        bary1 = { 1, 0 };
+        bary2 = { 0, 1 };
+        return;
+    }
+
+    unsigned int iu, iv, iw;
+    index2dbary( index, iu, iv, iw );
+
+    // we need to only look at "level" bits
+    iu = iu & ( ( 1 << subdivisionLevel ) - 1 );
+    iv = iv & ( ( 1 << subdivisionLevel ) - 1 );
+    iw = iw & ( ( 1 << subdivisionLevel ) - 1 );
+
+    int yFlipped = ( iu & 1 ) ^ ( iv & 1 ) ^ ( iw & 1 ) ^ 1;
+    // NOTE(review): the first shift below uses index itself as the count on a 64-bit constant; for index >= 64 that is undefined in C++ - confirm the intended behaviour
+    int xFlipped = ( ( 0x8888888888888888ull ^ 0xf000f000f000f000ull ^ 0xffff000000000000ull ) >> index ) & 1;
+    xFlipped    ^= ( ( 0x8888888888888888ull ^ 0xf000f000f000f000ull ^ 0xffff000000000000ull ) >> ( index >> 6 ) ) & 1;
+
+    const float levelScale = __uint_as_float( ( 127u - subdivisionLevel ) << 23 );  // float bit pattern with exponent field 127 - level and zero mantissa, i.e. 2^-subdivisionLevel
+
+    // scale the barycentric step (one unit at this level) to the global space/scale
+    float du = 1.f * levelScale;
+    float dv = 1.f * levelScale;
+
+    // scale the barycentric coordinate to the global space/scale
+    float u = (float)iu * levelScale;
+    float v = (float)iv * levelScale;
+
+    //     c        d
+    //      x-----x
+    //     / \   /
+    //    /   \ /
+    //   x-----x
+    //  a        b
+    //
+    // !xFlipped && !yFlipped: abc
+    // !xFlipped &&  yFlipped: cdb
+    //  xFlipped && !yFlipped: bac
+    //  xFlipped &&  yFlipped: dcb
+
+    bary0 = { u + xFlipped * du    , v + yFlipped * dv };
+    bary1 = { u + (1-xFlipped) * du, v + yFlipped * dv };
+    bary2 = { u + yFlipped * du    , v + (1-yFlipped) * dv };
+}
+
+// avoid any conflicts due to multiple definitions
+#define OPTIX_MICROMAP_FLOAT2_SUB(a,b) { a.x - b.x, a.y - b.y }
+
+// Compute barycentrics for micro triangle from base barycentrics
+OPTIX_MICROMAP_INLINE_FUNC float2 base2micro( const float2& baseBarycentrics, const float2 microVertexBaseBarycentrics[3] )
+{
+    float2 baryV0P  = OPTIX_MICROMAP_FLOAT2_SUB( baseBarycentrics, microVertexBaseBarycentrics[0] );
+    float2 baryV0V1 = OPTIX_MICROMAP_FLOAT2_SUB( microVertexBaseBarycentrics[1], microVertexBaseBarycentrics[0] );
+    float2 baryV0V2 = OPTIX_MICROMAP_FLOAT2_SUB( microVertexBaseBarycentrics[2], microVertexBaseBarycentrics[0] );
+
+    float  rdetA = 1.f / ( baryV0V1.x * baryV0V2.y - baryV0V1.y * baryV0V2.x );
+    float4 A     = { baryV0V2.y, -baryV0V2.x, -baryV0V1.y, baryV0V1.x };
+
+    float2 localUV;
+    localUV.x = rdetA * ( baryV0P.x * A.x + baryV0P.y * A.y );
+    localUV.y = rdetA * ( baryV0P.x * A.z + baryV0P.y * A.w );
+
+    return localUV;
+}
+#undef OPTIX_MICROMAP_FLOAT2_SUB
+
+/*@}*/  // end group optix_utilities
+
+}  // namespace optix_impl
+
+#endif  // OPTIX_OPTIX_MICROMAP_IMPL_H
diff --git a/crtx/optix_9.1/optix.h b/crtx/optix_9.1/optix.h
new file mode 100644
index 0000000..080f473
--- /dev/null
+++ b/crtx/optix_9.1/optix.h
@@ -0,0 +1,38 @@
+
+/*
+* SPDX-FileCopyrightText: Copyright (c) 2009 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+* SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+*
+* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+* property and proprietary rights in and to this material, related
+* documentation and any modifications thereto. Any use, reproduction,
+* disclosure or distribution of this material and related documentation
+* without an express license agreement from NVIDIA CORPORATION or
+* its affiliates is strictly prohibited.
+*/
+/// @file
+/// @author NVIDIA Corporation
+/// @brief  OptiX public API header
+///
+/// Includes the host api if compiling host code, includes the cuda api if compiling device code.
+/// For the math library routines include optix_math.h
+
+#ifndef OPTIX_OPTIX_H
+#define OPTIX_OPTIX_H
+
+/// The OptiX version.
+///
+/// - major =  OPTIX_VERSION/10000
+/// - minor = (OPTIX_VERSION%10000)/100
+/// - micro =  OPTIX_VERSION%100
+#define OPTIX_VERSION 90100
+
+
+#ifdef __CUDACC__
+#include "optix_device.h"
+#else
+#include "optix_host.h"
+#endif
+
+
+#endif  // OPTIX_OPTIX_H
diff --git a/crtx/optix_9.1/optix_denoiser_tiling.h b/crtx/optix_9.1/optix_denoiser_tiling.h
new file mode 100644
index 0000000..4dbd131
--- /dev/null
+++ b/crtx/optix_9.1/optix_denoiser_tiling.h
@@ -0,0 +1,363 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2019 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/// @file
+/// @author NVIDIA Corporation
+/// @brief  OptiX public API header
+
+#ifndef OPTIX_DENOISER_TILING_H
+#define OPTIX_DENOISER_TILING_H
+
+#include <optix.h>
+
+#include <algorithm>
+#include <vector>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** \addtogroup optix_utilities
+@{
+*/
+
+/// Tile definition: one rectangular piece of an image produced by splitting it for tiled denoising
+///
+/// see #optixUtilDenoiserSplitImage
+///
+struct OptixUtilDenoiserImageTile
+{
+    // input tile image (optixUtilDenoiserSplitImage points it into the full input image and keeps that image's row stride)
+    OptixImage2D input;
+
+    // output tile image
+    OptixImage2D output;
+
+    // overlap offsets in x and y, parameters for #optixUtilDenoiserInvoke
+    unsigned int inputOffsetX;
+    unsigned int inputOffsetY;
+};
+
+/// Return the pixel stride of the image in bytes: image.pixelStrideInBytes if that member is non-zero,
+/// otherwise the tightly packed size of one pixel of image.format.
+/// Returns OPTIX_ERROR_INVALID_VALUE if the stride must be derived from a format without a known pixel size.
+///
+/// \param[in]  image              Image containing the pixel stride and the pixel format
+/// \param[out] pixelStrideInBytes Resulting pixel stride in bytes; left at zero when an error is returned
+///
+inline OptixResult optixUtilGetPixelStride( const OptixImage2D& image, unsigned int& pixelStrideInBytes )
+{
+    pixelStrideInBytes = image.pixelStrideInBytes;
+    if( pixelStrideInBytes == 0 )
+    {
+        switch( image.format )
+        {
+            case OPTIX_PIXEL_FORMAT_HALF1:
+                pixelStrideInBytes = 1 * sizeof( short );
+                break;
+            case OPTIX_PIXEL_FORMAT_HALF2:
+                pixelStrideInBytes = 2 * sizeof( short );
+                break;
+            case OPTIX_PIXEL_FORMAT_HALF3:
+                pixelStrideInBytes = 3 * sizeof( short );
+                break;
+            case OPTIX_PIXEL_FORMAT_HALF4:
+                pixelStrideInBytes = 4 * sizeof( short );
+                break;
+            case OPTIX_PIXEL_FORMAT_FLOAT1:
+                pixelStrideInBytes = 1 * sizeof( float );
+                break;
+            case OPTIX_PIXEL_FORMAT_FLOAT2:
+                pixelStrideInBytes = 2 * sizeof( float );
+                break;
+            case OPTIX_PIXEL_FORMAT_FLOAT3:
+                pixelStrideInBytes = 3 * sizeof( float );
+                break;
+            case OPTIX_PIXEL_FORMAT_FLOAT4:
+                pixelStrideInBytes = 4 * sizeof( float );
+                break;
+            case OPTIX_PIXEL_FORMAT_UCHAR3:
+                pixelStrideInBytes = 3 * sizeof( char );
+                break;
+            case OPTIX_PIXEL_FORMAT_UCHAR4:
+                pixelStrideInBytes = 4 * sizeof( char );
+                break;
+            case OPTIX_PIXEL_FORMAT_INTERNAL_GUIDE_LAYER:
+            default:  // unknown or corrupted format value: report it instead of silently succeeding with a zero stride
+                return OPTIX_ERROR_INVALID_VALUE;
+        }
+    }
+    return OPTIX_SUCCESS;
+}
+
+/// Split image into 2D tiles given horizontal and vertical tile size
+///
+/// \param[in]  input            full resolution input image to be split
+/// \param[in]  output           full resolution output image
+/// \param[in]  overlapWindowSizeInPixels    see #OptixDenoiserSizes, #optixDenoiserComputeMemoryResources
+/// \param[in]  tileWidth        maximum width of tiles
+/// \param[in]  tileHeight       maximum height of tiles
+/// \param[out] tiles            list of tiles covering the input image
+/// \return OPTIX_ERROR_INVALID_VALUE for a zero tile size or an empty input image, any error from #optixUtilGetPixelStride, else OPTIX_SUCCESS
+inline OptixResult optixUtilDenoiserSplitImage(
+                                               const OptixImage2D&                     input,
+                                               const OptixImage2D&                     output,
+                                               unsigned int                            overlapWindowSizeInPixels,
+                                               unsigned int                            tileWidth,
+                                               unsigned int                            tileHeight,
+                                               std::vector<OptixUtilDenoiserImageTile>&    tiles )
+{
+    if( tileWidth == 0 || tileHeight == 0 || input.width == 0 || input.height == 0 )  // empty input would divide by zero below
+        return OPTIX_ERROR_INVALID_VALUE;
+
+    unsigned int inPixelStride, outPixelStride;
+    if( const OptixResult res = optixUtilGetPixelStride( input, inPixelStride ) )
+        return res;
+    if( const OptixResult res = optixUtilGetPixelStride( output, outPixelStride ) )
+        return res;
+    // Extent of each input tile: tile size plus overlap on both sides, clamped to the image size.
+    int inp_w = std::min( tileWidth + 2 * overlapWindowSizeInPixels, input.width );
+    int inp_h = std::min( tileHeight + 2 * overlapWindowSizeInPixels, input.height );
+    int inp_y = 0, copied_y = 0;
+    // Integer ratio of output to input resolution, applied to output tile offsets and sizes.
+    int upscaleX = output.width / input.width;
+    int upscaleY = output.height / input.height;
+
+    do
+    {
+        int inputOffsetY = inp_y == 0 ? 0 : std::max( (int)overlapWindowSizeInPixels, inp_h - ( (int)input.height - inp_y ) );
+        int copy_y       = inp_y == 0 ? std::min( input.height, tileHeight + overlapWindowSizeInPixels ) :
+                                  std::min( tileHeight, input.height - copied_y );
+
+        int inp_x = 0, copied_x = 0;
+        do
+        {
+            int inputOffsetX = inp_x == 0 ? 0 : std::max( (int)overlapWindowSizeInPixels, inp_w - ( (int)input.width - inp_x ) );
+            int copy_x = inp_x == 0 ? std::min( input.width, tileWidth + overlapWindowSizeInPixels ) :
+                                      std::min( tileWidth, input.width - copied_x );
+            // The input tile starts inputOffsetX/Y pixels before inp_x/inp_y so that it includes the leading overlap.
+            OptixUtilDenoiserImageTile tile;
+            tile.input.data               = input.data + (size_t)( inp_y - inputOffsetY ) * input.rowStrideInBytes
+                                            + (size_t)( inp_x - inputOffsetX ) * inPixelStride;
+            tile.input.width              = inp_w;
+            tile.input.height             = inp_h;
+            tile.input.rowStrideInBytes   = input.rowStrideInBytes;
+            tile.input.pixelStrideInBytes = input.pixelStrideInBytes;
+            tile.input.format             = input.format;
+
+            tile.output.data               = output.data + (size_t)( upscaleY * inp_y ) * output.rowStrideInBytes
+                                             + (size_t)( upscaleX * inp_x ) * outPixelStride;
+            tile.output.width              = upscaleX * copy_x;
+            tile.output.height             = upscaleY * copy_y;
+            tile.output.rowStrideInBytes   = output.rowStrideInBytes;
+            tile.output.pixelStrideInBytes = output.pixelStrideInBytes;
+            tile.output.format             = output.format;
+
+            tile.inputOffsetX = inputOffsetX;
+            tile.inputOffsetY = inputOffsetY;
+
+            tiles.push_back( tile );
+            // The first tile advances by tile size plus overlap; subsequent tiles advance by tile size only.
+            inp_x += inp_x == 0 ? tileWidth + overlapWindowSizeInPixels : tileWidth;
+            copied_x += copy_x;
+        } while( inp_x < static_cast<int>( input.width ) );
+
+        inp_y += inp_y == 0 ? tileHeight + overlapWindowSizeInPixels : tileHeight;
+        copied_y += copy_y;
+    } while( inp_y < static_cast<int>( input.height ) );
+
+    return OPTIX_SUCCESS;
+}
+
+/// Runs the denoiser on the input layers, tile by tile when the image exceeds the tile size.
+///
+/// Runs the denoiser on the input layers on a single GPU and stream using #optixDenoiserInvoke.
+/// If the input layers' dimensions are larger than the specified tile size, the image is divided into
+/// tiles using #optixUtilDenoiserSplitImage, and multiple back-to-back invocations are performed in
+/// order to reuse the scratch space.  Multiple tiles can be invoked concurrently if
+/// #optixUtilDenoiserSplitImage is used directly and multiple scratch allocations for each concurrent
+/// invocation are used.
+///
+/// The input parameters are the same as #optixDenoiserInvoke except for the addition of the maximum tile size.
+///
+/// \return OPTIX_ERROR_INVALID_VALUE if guideLayer or layers is null or numLayers is 0, the first error
+///         reported while splitting images or invoking the denoiser, or OPTIX_SUCCESS otherwise.
+/// \param[in] denoiser
+/// \param[in] stream
+/// \param[in] params
+/// \param[in] denoiserState
+/// \param[in] denoiserStateSizeInBytes
+/// \param[in] guideLayer
+/// \param[in] layers
+/// \param[in] numLayers
+/// \param[in] scratch
+/// \param[in] scratchSizeInBytes
+/// \param[in] overlapWindowSizeInPixels
+/// \param[in] tileWidth
+/// \param[in] tileHeight
+inline OptixResult optixUtilDenoiserInvokeTiled(
+                                                OptixDenoiser                   denoiser,
+                                                CUstream                        stream,
+                                                const OptixDenoiserParams*      params,
+                                                CUdeviceptr                     denoiserState,
+                                                size_t                          denoiserStateSizeInBytes,
+                                                const OptixDenoiserGuideLayer*  guideLayer,
+                                                const OptixDenoiserLayer*       layers,
+                                                unsigned int                    numLayers,
+                                                CUdeviceptr                     scratch,
+                                                size_t                          scratchSizeInBytes,
+                                                unsigned int                    overlapWindowSizeInPixels,
+                                                unsigned int                    tileWidth,
+                                                unsigned int                    tileHeight )
+{
+    if( !guideLayer || !layers || numLayers == 0 )  // layer 0 drives the tile loop, so at least one layer is required
+        return OPTIX_ERROR_INVALID_VALUE;
+    // A previousOutput twice as wide as the input selects 2x scaling of the previous-output and internal guide tiles.
+    const unsigned int upscale = layers[0].previousOutput.width == 2 * layers[0].input.width ? 2 : 1;
+
+    std::vector<std::vector<OptixUtilDenoiserImageTile>> tiles( numLayers );
+    std::vector<std::vector<OptixUtilDenoiserImageTile>> prevTiles( numLayers );
+    for( unsigned int l = 0; l < numLayers; l++ )
+    {
+        if( const OptixResult res = optixUtilDenoiserSplitImage( layers[l].input, layers[l].output,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth, tileHeight, tiles[l] ) )
+            return res;
+
+        if( layers[l].previousOutput.data )
+        {
+            OptixImage2D dummyOutput = layers[l].previousOutput;
+            if( const OptixResult res = optixUtilDenoiserSplitImage( layers[l].previousOutput, dummyOutput,
+                                                                 upscale * overlapWindowSizeInPixels,
+                                                                 upscale * tileWidth, upscale * tileHeight, prevTiles[l] ) )
+                return res;
+        }
+    }
+
+    std::vector<OptixUtilDenoiserImageTile> albedoTiles;
+    if( guideLayer->albedo.data )
+    {
+        OptixImage2D dummyOutput = guideLayer->albedo;
+        if( const OptixResult res = optixUtilDenoiserSplitImage( guideLayer->albedo, dummyOutput,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth, tileHeight, albedoTiles ) )
+            return res;
+    }
+
+    std::vector<OptixUtilDenoiserImageTile> normalTiles;
+    if( guideLayer->normal.data )
+    {
+        OptixImage2D dummyOutput = guideLayer->normal;
+        if( const OptixResult res = optixUtilDenoiserSplitImage( guideLayer->normal, dummyOutput,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth, tileHeight, normalTiles ) )
+            return res;
+    }
+
+    std::vector<OptixUtilDenoiserImageTile> flowTiles;
+    if( guideLayer->flow.data )
+    {
+        OptixImage2D dummyOutput = guideLayer->flow;
+        if( const OptixResult res = optixUtilDenoiserSplitImage( guideLayer->flow, dummyOutput,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth, tileHeight, flowTiles ) )
+            return res;
+    }
+
+    std::vector<OptixUtilDenoiserImageTile> flowTrustTiles;
+    if( guideLayer->flowTrustworthiness.data )
+    {
+        OptixImage2D dummyOutput = guideLayer->flowTrustworthiness;
+        if( const OptixResult res = optixUtilDenoiserSplitImage( guideLayer->flowTrustworthiness, dummyOutput,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth, tileHeight, flowTrustTiles ) )
+            return res;
+    }
+
+    std::vector<OptixUtilDenoiserImageTile> internalGuideLayerTiles;
+    if( guideLayer->previousOutputInternalGuideLayer.data && guideLayer->outputInternalGuideLayer.data )
+    {
+        if( const OptixResult res = optixUtilDenoiserSplitImage( guideLayer->previousOutputInternalGuideLayer,
+                                                                 guideLayer->outputInternalGuideLayer,
+                                                                 upscale * overlapWindowSizeInPixels,
+                                                                 upscale * tileWidth, upscale * tileHeight, internalGuideLayerTiles ) )
+            return res;
+    }
+    // One denoiser invocation per tile; the number of tiles is taken from layer 0.
+    for( size_t t = 0; t < tiles[0].size(); t++ )
+    {
+        std::vector<OptixDenoiserLayer> tlayers;
+        for( unsigned int l = 0; l < numLayers; l++ )
+        {
+            OptixDenoiserLayer layer = {};
+            layer.input  = ( tiles[l] )[t].input;
+            layer.output = ( tiles[l] )[t].output;
+            if( layers[l].previousOutput.data )
+                layer.previousOutput = ( prevTiles[l] )[t].input;
+            layer.type = layers[l].type;
+            tlayers.push_back( layer );
+        }
+
+        OptixDenoiserGuideLayer gl = {};
+        if( guideLayer->albedo.data )
+            gl.albedo = albedoTiles[t].input;
+
+        if( guideLayer->normal.data )
+            gl.normal = normalTiles[t].input;
+
+        if( guideLayer->flow.data )
+            gl.flow = flowTiles[t].input;
+
+        if( guideLayer->flowTrustworthiness.data )
+            gl.flowTrustworthiness = flowTrustTiles[t].input;
+        // Internal guide layer tiles exist only when both the previous and the output image were supplied.
+        if( t < internalGuideLayerTiles.size() )
+        {
+            gl.previousOutputInternalGuideLayer = internalGuideLayerTiles[t].input;
+            gl.outputInternalGuideLayer = internalGuideLayerTiles[t].output;
+        }
+
+        if( const OptixResult res =
+                optixDenoiserInvoke( denoiser, stream, params, denoiserState, denoiserStateSizeInBytes,
+                                     &gl, &tlayers[0], numLayers,
+                                     ( tiles[0] )[t].inputOffsetX, ( tiles[0] )[t].inputOffsetY,
+                                     scratch, scratchSizeInBytes ) )
+            return res;
+    }
+    return OPTIX_SUCCESS;
+}
+
+/**@}*/  // end group optix_utilities
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // OPTIX_DENOISER_TILING_H 
diff --git a/crtx/optix_9.1/optix_device.h b/crtx/optix_9.1/optix_device.h
new file mode 100644
index 0000000..c718305
--- /dev/null
+++ b/crtx/optix_9.1/optix_device.h
@@ -0,0 +1,2440 @@
+/*
+* SPDX-FileCopyrightText: Copyright (c) 2010 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+* SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+*
+* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+* property and proprietary rights in and to this material, related
+* documentation and any modifications thereto. Any use, reproduction,
+* disclosure or distribution of this material and related documentation
+* without an express license agreement from NVIDIA CORPORATION or
+* its affiliates is strictly prohibited.
+*/
+/// @file
+/// @author NVIDIA Corporation
+/// @brief  OptiX public API header
+///
+/// OptiX public API Reference - Device API declarations
+
+#ifndef OPTIX_OPTIX_DEVICE_H
+#define OPTIX_OPTIX_DEVICE_H
+
+#if defined( __cplusplus ) && ( __cplusplus < 201103L ) && !defined( _WIN32 )
+#error Device code for OptiX requires at least C++11. Consider adding "--std c++11" to the nvcc command-line.
+#endif
+
+#include "optix_types.h"
+
+/// \defgroup optix_device_api Device API
+/// \brief OptiX Device API
+
+/** \addtogroup optix_device_api
+@{
+*/
+
+
+/// Initiates a ray tracing query starting with the given traversable.
+///
+/// \param[in] handle
+/// \param[in] rayOrigin
+/// \param[in] rayDirection
+/// \param[in] tmin
+/// \param[in] tmax
+/// \param[in] rayTime
+/// \param[in] visibilityMask really only 8 bits
+/// \param[in] rayFlags       really only 16 bits, combination of OptixRayFlags
+/// \param[in] SBToffset      really only 4 bits
+/// \param[in] SBTstride      really only 4 bits
+/// \param[in] missSBTIndex   specifies the miss program invoked on a miss
+/// \param[in,out] payload    up to 32 unsigned int values that hold the payload
+///
+/// Available in RG, CH, MS, CC
+template <typename... Payload>
+static __forceinline__ __device__ void optixTrace( OptixTraversableHandle handle,
+                                                   float3                 rayOrigin,
+                                                   float3                 rayDirection,
+                                                   float                  tmin,
+                                                   float                  tmax,
+                                                   float                  rayTime,
+                                                   OptixVisibilityMask    visibilityMask,
+                                                   unsigned int           rayFlags,
+                                                   unsigned int           SBToffset,
+                                                   unsigned int           SBTstride,
+                                                   unsigned int           missSBTIndex,
+                                                   Payload&...            payload );
+
+/// Similar to optixTrace, but does not invoke closesthit or miss. Instead, it overwrites the
+/// current outgoing hit object with the results of traversing the ray. The outgoing hit object may
+/// be invoked at some later point with optixInvoke. The outgoing hit object can also be queried
+/// through various functions such as optixHitObjectIsHit or optixHitObjectGetAttribute_0.
+///
+/// \param[in] handle
+/// \param[in] rayOrigin
+/// \param[in] rayDirection
+/// \param[in] tmin
+/// \param[in] tmax
+/// \param[in] rayTime
+/// \param[in] visibilityMask really only 8 bits
+/// \param[in] rayFlags       really only 16 bits, combination of OptixRayFlags
+/// \param[in] SBToffset      really only 4 bits
+/// \param[in] SBTstride      really only 4 bits
+/// \param[in] missSBTIndex   specifies the miss program invoked on a miss
+/// \param[in,out] payload    up to 32 unsigned int values that hold the payload
+///
+/// Available in RG, CH, MS, CC, DC
+template <typename... Payload>
+static __forceinline__ __device__ void optixTraverse( OptixTraversableHandle handle,
+                                                      float3                 rayOrigin,
+                                                      float3                 rayDirection,
+                                                      float                  tmin,
+                                                      float                  tmax,
+                                                      float                  rayTime,
+                                                      OptixVisibilityMask    visibilityMask,
+                                                      unsigned int           rayFlags,
+                                                      unsigned int           SBToffset,
+                                                      unsigned int           SBTstride,
+                                                      unsigned int           missSBTIndex,
+                                                      Payload&... payload );
+
+/// Initiates a ray tracing query starting with the given traversable.
+///
+/// \param[in] type
+/// \param[in] handle
+/// \param[in] rayOrigin
+/// \param[in] rayDirection
+/// \param[in] tmin
+/// \param[in] tmax
+/// \param[in] rayTime
+/// \param[in] visibilityMask really only 8 bits
+/// \param[in] rayFlags       really only 16 bits, combination of OptixRayFlags
+/// \param[in] SBToffset      really only 4 bits
+/// \param[in] SBTstride      really only 4 bits
+/// \param[in] missSBTIndex   specifies the miss program invoked on a miss
+/// \param[in,out] payload    up to 32 unsigned int values that hold the payload
+///
+/// Available in RG, CH, MS, CC
+template <typename... Payload>
+static __forceinline__ __device__ void optixTrace( OptixPayloadTypeID     type,
+                                                   OptixTraversableHandle handle,
+                                                   float3                 rayOrigin,
+                                                   float3                 rayDirection,
+                                                   float                  tmin,
+                                                   float                  tmax,
+                                                   float                  rayTime,
+                                                   OptixVisibilityMask    visibilityMask,
+                                                   unsigned int           rayFlags,
+                                                   unsigned int           SBToffset,
+                                                   unsigned int           SBTstride,
+                                                   unsigned int           missSBTIndex,
+                                                   Payload&...            payload );
+
+/// Similar to optixTrace, but does not invoke closesthit or miss. Instead, it overwrites the
+/// current outgoing hit object with the results of traversing the ray. The outgoing hit object may
+/// be invoked at some later point with optixInvoke. The outgoing hit object can also be queried
+/// through various functions such as optixHitObjectIsHit or optixHitObjectGetAttribute_0.
+///
+/// \param[in] type
+/// \param[in] handle
+/// \param[in] rayOrigin
+/// \param[in] rayDirection
+/// \param[in] tmin
+/// \param[in] tmax
+/// \param[in] rayTime
+/// \param[in] visibilityMask really only 8 bits
+/// \param[in] rayFlags       really only 16 bits, combination of OptixRayFlags
+/// \param[in] SBToffset      really only 4 bits
+/// \param[in] SBTstride      really only 4 bits
+/// \param[in] missSBTIndex   specifies the miss program invoked on a miss
+/// \param[in,out] payload    up to 32 unsigned int values that hold the payload
+///
+/// Available in RG, CH, MS, CC, DC
+template <typename... Payload>
+static __forceinline__ __device__ void optixTraverse( OptixPayloadTypeID     type,
+                                                      OptixTraversableHandle handle,
+                                                      float3                 rayOrigin,
+                                                      float3                 rayDirection,
+                                                      float                  tmin,
+                                                      float                  tmax,
+                                                      float                  rayTime,
+                                                      OptixVisibilityMask    visibilityMask,
+                                                      unsigned int           rayFlags,
+                                                      unsigned int           SBToffset,
+                                                      unsigned int           SBTstride,
+                                                      unsigned int           missSBTIndex,
+                                                      Payload&... payload );
+
+/// Reorder the current thread using the current outgoing hit object and the coherence hint bits
+/// provided.  Note that the coherence hint will take away some of the bits used in the hit object
+/// for sorting, so care should be taken to reduce the number of hint bits as much as possible. Nop
+/// hit objects can use more coherence hint bits. Bits are taken from the least significant bit
+/// range. The maximum value of numCoherenceHintBitsFromLSB is implementation defined and can vary.
+///
+/// \param[in] coherenceHint
+/// \param[in] numCoherenceHintBitsFromLSB
+///
+/// Available in RG
+static __forceinline__ __device__ void optixReorder( unsigned int coherenceHint, unsigned int numCoherenceHintBitsFromLSB );
+
+/// Reorder the current thread using the hit object only, i.e., without further coherence hints.
+///
+/// Available in RG
+static __forceinline__ __device__ void optixReorder();
+
+/// Invokes closesthit, miss or nop based on the current outgoing hit object. After execution the
+/// current outgoing hit object will be set to nop. An implied nop hit object is always assumed to
+/// exist even if there are no calls to optixTraverse, optixMakeMissHitObject, optixMakeHitObject
+/// or optixMakeNopHitObject.
+///
+/// \param[in,out] payload       up to 32 unsigned int values that hold the payload
+///
+/// Available in RG, CH, MS, CC
+template <typename... Payload>
+static __forceinline__ __device__ void optixInvoke( Payload&... payload );
+
+/// Invokes closesthit, miss or nop based on the current outgoing hit object. After execution the
+/// current outgoing hit object will be set to nop. An implied nop hit object is always assumed to
+/// exist even if there are no calls to optixTraverse, optixMakeMissHitObject, optixMakeHitObject
+/// or optixMakeNopHitObject.
+///
+/// \param[in] type
+/// \param[in,out] payload       up to 32 unsigned int values that hold the payload
+///
+/// Available in RG, CH, MS, CC
+template <typename... Payload>
+static __forceinline__ __device__ void optixInvoke( OptixPayloadTypeID type, Payload&... payload );
+
+/// Constructs an outgoing hit object from the hit object data provided. The traverseData needs to be collected from a previous hit
+/// object using #optixHitObjectGetTraverseData.
+/// This hit object will now become the current outgoing hit object and will overwrite the current outgoing hit object.
+///
+/// \param[in] handle
+/// \param[in] rayOrigin
+/// \param[in] rayDirection
+/// \param[in] tmin
+/// \param[in] rayTime
+/// \param[in] rayFlags       really only 16 bits, combination of OptixRayFlags
+/// \param[in] traverseData
+/// \param[in] transforms
+/// \param[in] numTransforms
+///
+/// Available in RG, CH, MS, CC
+static __forceinline__ __device__ void optixMakeHitObject( OptixTraversableHandle        handle,
+                                                           float3                        rayOrigin,
+                                                           float3                        rayDirection,
+                                                           float                         tmin,
+                                                           float                         rayTime,
+                                                           unsigned int                  rayFlags,
+                                                           OptixTraverseData             traverseData,
+                                                           const OptixTraversableHandle* transforms,
+                                                           unsigned int                  numTransforms );
+
+/// Constructs an outgoing hit object from the miss information provided. The SBT record index is
+/// explicitly specified as an argument. This hit object will now become the current outgoing hit
+/// object and will overwrite the current outgoing hit object.
+///
+/// \param[in] missSBTIndex   specifies the miss program invoked on a miss
+/// \param[in] rayOrigin
+/// \param[in] rayDirection
+/// \param[in] tmin
+/// \param[in] tmax
+/// \param[in] rayTime
+/// \param[in] rayFlags       really only 16 bits, combination of OptixRayFlags
+///
+/// Available in RG, CH, MS, CC
+static __forceinline__ __device__ void optixMakeMissHitObject( unsigned int missSBTIndex,
+                                                               float3       rayOrigin,
+                                                               float3       rayDirection,
+                                                               float        tmin,
+                                                               float        tmax,
+                                                               float        rayTime,
+                                                               unsigned int rayFlags );
+
+/// Constructs an outgoing hit object that when invoked does nothing (neither the miss nor the
+/// closest hit shader will be invoked). This hit object will now become the current outgoing hit
+/// object and will overwrite the current outgoing hit object. Accessors such as
+/// #optixHitObjectGetInstanceId will return 0 or 0 filled structs. Only #optixHitObjectIsNop
+/// will return a non-zero result.
+///
+/// Available in RG, CH, MS, CC
+static __forceinline__ __device__ void optixMakeNopHitObject();
+
+/// Serializes the current outgoing hit object so that it can be recreated at a later
+/// point using #optixMakeHitObject.
+///
+/// \param[out] data
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectGetTraverseData( OptixTraverseData* data );
+
+/// Returns true if the current outgoing hit object contains a hit.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ bool optixHitObjectIsHit();
+
+/// Returns true if the current outgoing hit object contains a miss.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ bool optixHitObjectIsMiss();
+
+/// Returns true if the current outgoing hit object contains neither a hit nor miss. If executed
+/// with optixInvoke, no operation will result. An implied nop hit object is always assumed to exist
+/// even if there are no calls such as optixTraverse to explicitly create one.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ bool optixHitObjectIsNop();
+
+/// Returns the SBT record index associated with the hit or miss program for the current outgoing
+/// hit object.
+///
+/// Returns 0 for nop hit objects.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ unsigned int optixHitObjectGetSbtRecordIndex();
+
+/// Sets the SBT record index in the current outgoing hit object.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectSetSbtRecordIndex( unsigned int sbtRecordIndex );
+
+/// Returns the traversable handle for the Geometry Acceleration Structure (GAS) associated
+/// with the current outgoing hit object.
+/// Returns 0 if the hit object is not a hit.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ OptixTraversableHandle optixHitObjectGetGASTraversableHandle();
+
+/// Writes the 32-bit payload at the given slot index. There are up to 32 payload values available. The
+/// number of payload values is configured with OptixPipelineCompileOptions::numPayloadValues or with
+/// OptixPayloadType parameters set in OptixModuleCompileOptions.
+///
+/// Available in IS, AH, CH, MS
+static __forceinline__ __device__ void optixSetPayload_0( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_1( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_2( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_3( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_4( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_5( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_6( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_7( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_8( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_9( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_10( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_11( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_12( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_13( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_14( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_15( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_16( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_17( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_18( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_19( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_20( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_21( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_22( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_23( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_24( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_25( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_26( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_27( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_28( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_29( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_30( unsigned int p );
+static __forceinline__ __device__ void optixSetPayload_31( unsigned int p );
+
+/// Returns the 32-bit payload at the given slot index. There are up to 32 payload values available. The
+/// number of payload values is configured with OptixPipelineCompileOptions::numPayloadValues or with
+/// OptixPayloadType parameters set in OptixModuleCompileOptions.
+///
+/// Available in IS, AH, CH, MS
+static __forceinline__ __device__ unsigned int optixGetPayload_0();
+static __forceinline__ __device__ unsigned int optixGetPayload_1();
+static __forceinline__ __device__ unsigned int optixGetPayload_2();
+static __forceinline__ __device__ unsigned int optixGetPayload_3();
+static __forceinline__ __device__ unsigned int optixGetPayload_4();
+static __forceinline__ __device__ unsigned int optixGetPayload_5();
+static __forceinline__ __device__ unsigned int optixGetPayload_6();
+static __forceinline__ __device__ unsigned int optixGetPayload_7();
+static __forceinline__ __device__ unsigned int optixGetPayload_8();
+static __forceinline__ __device__ unsigned int optixGetPayload_9();
+static __forceinline__ __device__ unsigned int optixGetPayload_10();
+static __forceinline__ __device__ unsigned int optixGetPayload_11();
+static __forceinline__ __device__ unsigned int optixGetPayload_12();
+static __forceinline__ __device__ unsigned int optixGetPayload_13();
+static __forceinline__ __device__ unsigned int optixGetPayload_14();
+static __forceinline__ __device__ unsigned int optixGetPayload_15();
+static __forceinline__ __device__ unsigned int optixGetPayload_16();
+static __forceinline__ __device__ unsigned int optixGetPayload_17();
+static __forceinline__ __device__ unsigned int optixGetPayload_18();
+static __forceinline__ __device__ unsigned int optixGetPayload_19();
+static __forceinline__ __device__ unsigned int optixGetPayload_20();
+static __forceinline__ __device__ unsigned int optixGetPayload_21();
+static __forceinline__ __device__ unsigned int optixGetPayload_22();
+static __forceinline__ __device__ unsigned int optixGetPayload_23();
+static __forceinline__ __device__ unsigned int optixGetPayload_24();
+static __forceinline__ __device__ unsigned int optixGetPayload_25();
+static __forceinline__ __device__ unsigned int optixGetPayload_26();
+static __forceinline__ __device__ unsigned int optixGetPayload_27();
+static __forceinline__ __device__ unsigned int optixGetPayload_28();
+static __forceinline__ __device__ unsigned int optixGetPayload_29();
+static __forceinline__ __device__ unsigned int optixGetPayload_30();
+static __forceinline__ __device__ unsigned int optixGetPayload_31();
+
+/// Specify the supported payload types for a program.
+///
+/// The supported types are specified as a bitwise combination of payload types. (See
+/// OptixPayloadTypeID) May only be called once per program.
+///
+/// Must be called at the top of the program.
+///
+/// Available in IS, AH, CH, MS
+static __forceinline__ __device__ void optixSetPayloadTypes( unsigned int typeMask );
+
+/// Returns an undefined value.
+///
+/// Available anywhere
+static __forceinline__ __device__ unsigned int optixUndefinedValue();
+
+/// If non-zero it is legal to call optixTrace or optixTraverse without triggering an
+/// OPTIX_EXCEPTION_CODE_TRACE_DEPTH_EXCEEDED exception. In the case of optixTrace it
+/// represents the number of recursive calls that are remaining and counts down.
+///
+/// Value is in the range of [0..OptixPipelineLinkOptions::maxTraceDepth], and
+/// maxTraceDepth has a maximum value of 31.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ unsigned int optixGetRemainingTraceDepth();
+
+/// Returns the rayOrigin passed into optixTrace.
+///
+/// May be more expensive to call in IS and AH than their object space counterparts, so effort
+/// should be made to use the object space ray in those programs.
+///
+/// Available in IS, AH, CH, MS
+static __forceinline__ __device__ float3 optixGetWorldRayOrigin();
+
+/// Returns the rayOrigin passed into optixTraverse, optixMakeHitObject or optixMakeMissHitObject.
+///
+/// Returns [0, 0, 0] for nop hit objects.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float3 optixHitObjectGetWorldRayOrigin();
+
+/// Returns the rayDirection passed into optixTrace.
+///
+/// May be more expensive to call in IS and AH than their object space counterparts, so effort
+/// should be made to use the object space ray in those programs.
+///
+/// Available in IS, AH, CH, MS
+static __forceinline__ __device__ float3 optixGetWorldRayDirection();
+
+/// Returns the rayDirection passed into optixTraverse, optixMakeHitObject or optixMakeMissHitObject.
+///
+/// Returns [0, 0, 0] for nop hit objects.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float3 optixHitObjectGetWorldRayDirection();
+
+/// Returns the current object space ray origin based on the current transform stack.
+///
+/// Available in IS and AH
+static __forceinline__ __device__ float3 optixGetObjectRayOrigin();
+
+/// Returns the current object space ray direction based on the current transform stack.
+///
+/// Available in IS and AH
+static __forceinline__ __device__ float3 optixGetObjectRayDirection();
+
+/// Returns the tmin passed into optixTrace.
+///
+/// Available in IS, AH, CH, MS
+static __forceinline__ __device__ float optixGetRayTmin();
+
+/// Returns the tmin passed into optixTraverse, optixMakeHitObject or optixMakeMissHitObject.
+///
+/// Returns 0.0f for nop hit objects.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float optixHitObjectGetRayTmin();
+
+/// In IS and CH returns the current smallest reported hitT or the tmax passed into optixTrace if no
+/// hit has been reported
+///
+/// In AH returns the hitT value as passed in to optixReportIntersection
+///
+/// In MS returns the tmax passed into optixTrace
+///
+/// Available in IS, AH, CH, MS
+static __forceinline__ __device__ float optixGetRayTmax();
+
+/// If the hit object is a hit, returns the smallest reported hitT
+///
+/// If the hit object is a miss, returns the tmax passed into optixTraverse, optixMakeHitObject or
+/// optixMakeMissHitObject.
+///
+/// Returns 0 for nop hit objects.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float optixHitObjectGetRayTmax();
+
+/// Returns the rayTime passed into optixTrace.
+///
+/// Returns 0 if motion is disabled.
+///
+/// Available in IS, AH, CH, MS
+static __forceinline__ __device__ float optixGetRayTime();
+
+/// Returns the rayTime passed into optixTraverse, optixMakeHitObject or optixMakeMissHitObject.
+///
+/// Returns 0 for nop hit objects or when motion is disabled.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float optixHitObjectGetRayTime();
+
+/// Returns the rayFlags passed into optixTrace
+///
+/// Available in IS, AH, CH, MS
+static __forceinline__ __device__ unsigned int optixGetRayFlags();
+
+/// Returns the rayFlags passed into optixTrace associated with the current outgoing hit object.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ unsigned int optixHitObjectGetRayFlags();
+
+/// Returns the visibilityMask passed into optixTrace
+///
+/// Available in IS, AH, CH, MS
+static __forceinline__ __device__ unsigned int optixGetRayVisibilityMask();
+
+/// Returns the traversable handle of a given instance in an Instance Acceleration Structure (IAS).
+///
+/// To obtain instance traversables by index, the IAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_INSTANCE_ACCESS.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ OptixTraversableHandle optixGetInstanceTraversableFromIAS( OptixTraversableHandle ias, unsigned int instIdx );
+
+/// [DEPRECATED] Returns the object space triangle vertex positions of a given triangle in a Geometry Acceleration
+/// Structure (GAS) at a given motion time.
+/// This function is deprecated, use optixGetTriangleVertexDataFromHandle for random access triangle vertex data fetch or
+/// the overload optixGetTriangleVertexData( float3 data[3] ) for a current triangle hit vertex data fetch.
+///
+/// To access vertex data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetTriangleVertexData( OptixTraversableHandle gas,
+                                                                   unsigned int           primIdx,
+                                                                   unsigned int           sbtGASIndex,
+                                                                   float                  time,
+                                                                   float3                 data[3] );
+
+/// Performs a random access fetch of the object space vertex positions of a given triangle in a Geometry Acceleration
+/// Structure (GAS) at a given motion time.
+///
+/// To access vertex data of any triangle, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+/// If only the vertex data of a currently intersected triangle is required, it is recommended to
+/// use function optixGetTriangleVertexData. A data fetch of the currently hit primitive does NOT
+/// require building the corresponding GAS with flag OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetTriangleVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                             unsigned int           primIdx,
+                                                                             unsigned int           sbtGASIndex,
+                                                                             float                  time,
+                                                                             float3                 data[3] );
+
+/// Returns the object space triangle vertex positions of the currently intersected triangle at the current ray time.
+///
+/// Similar to the random access variant optixGetTriangleVertexDataFromHandle, but does not require setting flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS when building the corresponding GAS.
+///
+/// It is only valid to call this function if the return value of optixGetPrimitiveType( optixGetHitKind() ) equals OPTIX_PRIMITIVE_TYPE_TRIANGLE.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ void optixGetTriangleVertexData( float3 data[3] );
+
+/// Returns the object space triangle vertex positions of the intersected triangle for a valid outgoing hit object.
+/// It is the hit object's counterpart of optixGetTriangleVertexData( float3 data[3] ).
+///
+/// It is only valid to call this function if the return value of optixGetPrimitiveType( optixHitObjectGetHitKind() ) equals OPTIX_PRIMITIVE_TYPE_TRIANGLE.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectGetTriangleVertexData( float3 data[3] );
+
+
+/// Deprecated. Call either optixGetLinearCurveVertexData( float4 data[2] ) for a current-hit data fetch,
+///  or optixGetLinearCurveVertexDataFromHandle( ... ) for a random-access data fetch.
+///
+/// Returns the object space curve control vertex data of a linear curve in a Geometry Acceleration
+/// Structure (GAS) at a given motion time.
+///
+/// To access vertex data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetLinearCurveVertexData( OptixTraversableHandle gas,
+                                                                      unsigned int           primIdx,
+                                                                      unsigned int           sbtGASIndex,
+                                                                      float                  time,
+                                                                      float4                 data[2] );
+
+/// Performs a random access fetch of the object space curve control vertex data of a linear curve in a Geometry Acceleration
+/// Structure (GAS) at a given motion time.
+///
+/// To access vertex data of any curve, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+/// If only the vertex data of a currently intersected linear curve is required, it is recommended to
+/// use function optixGetLinearCurveVertexData. A data fetch of the currently hit primitive does NOT
+/// require building the corresponding GAS with flag OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetLinearCurveVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                unsigned int           primIdx,
+                                                                                unsigned int           sbtGASIndex,
+                                                                                float                  time,
+                                                                                float4                 data[2] );
+
+/// Returns the object space control vertex data of the currently intersected linear curve at the current ray time.
+///
+/// Similar to the random access variant optixGetLinearCurveVertexDataFromHandle, but does not require setting flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS when building the corresponding GAS.
+///
+/// It is only valid to call this function if the return value of optixGetPrimitiveType( optixGetHitKind() ) equals OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ void optixGetLinearCurveVertexData( float4 data[2] );
+
+/// Returns the object space control vertex data of the currently intersected linear curve for a valid outgoing hit object.
+/// It is the hit object's counterpart of optixGetLinearCurveVertexData( float4 data[2] ).
+///
+/// It is only valid to call this function if the return value of optixGetPrimitiveType( optixHitObjectGetHitKind() ) equals OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectGetLinearCurveVertexData( float4 data[2] );
+
+/// Returns the object space curve control vertex data of a quadratic BSpline curve in a Geometry
+/// Acceleration Structure (GAS) at a given motion time.
+///
+/// To access vertex data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetQuadraticBSplineVertexData( OptixTraversableHandle gas,
+                                                                           unsigned int           primIdx,
+                                                                           unsigned int           sbtGASIndex,
+                                                                           float                  time,
+                                                                           float4                 data[3] );
+
+/// Returns the object space curve control vertex data of a quadratic BSpline curve in a Geometry
+/// Acceleration Structure (GAS) at a given motion time.
+///
+/// To access vertex data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetQuadraticBSplineVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                     unsigned int           primIdx,
+                                                                                     unsigned int           sbtGASIndex,
+                                                                                     float                  time,
+                                                                                     float4                 data[3] );
+static __forceinline__ __device__ void optixGetQuadraticBSplineRocapsVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                           unsigned int primIdx,
+                                                                                           unsigned int sbtGASIndex,
+                                                                                           float        time,
+                                                                                           float4       data[3] );
+
+/// Returns the object space curve control vertex data of the currently intersected quadratic
+/// BSpline curve. Unlike the FromHandle variant, no GAS handle or motion time is passed in.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ void optixGetQuadraticBSplineVertexData( float4 data[3] );
+static __forceinline__ __device__ void optixGetQuadraticBSplineRocapsVertexData( float4 data[3] );
+
+/// Returns the object space curve control vertex data of a quadratic BSpline curve for a valid outgoing hit object.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// It is only valid to call this function if the return value of optixGetPrimitiveType( optixHitObjectGetHitKind() )
+/// equals OPTIX_PRIMITIVE_TYPE_ROUND_QUADRATIC_BSPLINE.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectGetQuadraticBSplineVertexData( float4 data[3] );
+static __forceinline__ __device__ void optixHitObjectGetQuadraticBSplineRocapsVertexData( float4 data[3] );
+
+/// Deprecated. Call either optixGetCubicBSplineVertexData( float4 data[4] ) for current hit
+/// curve data, or optixGetCubicBSplineVertexDataFromHandle() for random access curve data.
+///
+/// Returns the object space curve control vertex data of a cubic BSpline curve in a Geometry
+/// Acceleration Structure (GAS) at a given motion time.
+///
+/// To access vertex data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetCubicBSplineVertexData( OptixTraversableHandle gas,
+                                                                       unsigned int           primIdx,
+                                                                       unsigned int           sbtGASIndex,
+                                                                       float                  time,
+                                                                       float4                 data[4] );
+
+/// Returns the object space curve control vertex data of a cubic BSpline curve in a Geometry
+/// Acceleration Structure (GAS) at a given motion time.
+///
+/// To access vertex data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetCubicBSplineVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                 unsigned int           primIdx,
+                                                                                 unsigned int           sbtGASIndex,
+                                                                                 float                  time,
+                                                                                 float4                 data[4] );
+static __forceinline__ __device__ void optixGetCubicBSplineRocapsVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                       unsigned int           primIdx,
+                                                                                       unsigned int sbtGASIndex,
+                                                                                       float        time,
+                                                                                       float4       data[4] );
+
+/// Returns the object space curve control vertex data of the currently intersected cubic BSpline
+/// curve. Unlike the FromHandle variant, no GAS handle or motion time is passed in.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ void optixGetCubicBSplineVertexData( float4 data[4] );
+static __forceinline__ __device__ void optixGetCubicBSplineRocapsVertexData( float4 data[4] );
+
+/// Returns the object space curve control vertex data of a cubic BSpline curve for a valid
+/// outgoing hit object.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// It is only valid to call this function if the return value of optixGetPrimitiveType( optixHitObjectGetHitKind() )
+/// equals OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectGetCubicBSplineVertexData( float4 data[4] );
+/// See #optixHitObjectGetCubicBSplineVertexData for further documentation
+///
+/// It is only valid to call this function if the return value of optixGetPrimitiveType( optixHitObjectGetHitKind() )
+/// equals OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE_ROCAPS.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectGetCubicBSplineRocapsVertexData( float4 data[4] );
+
+/// Deprecated. Call either optixGetCatmullRomVertexData( float4 data[4] ) for current hit
+/// data, or optixGetCatmullRomVertexDataFromHandle() for random access data.
+///
+/// Returns the object space curve control vertex data of a CatmullRom spline curve in a Geometry
+/// Acceleration Structure (GAS) at a given motion time.
+///
+/// To access vertex data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetCatmullRomVertexData( OptixTraversableHandle gas,
+                                                                     unsigned int           primIdx,
+                                                                     unsigned int           sbtGASIndex,
+                                                                     float                  time,
+                                                                     float4                 data[4] );
+
+/// Returns the object space curve control vertex data of a CatmullRom spline curve in a Geometry
+/// Acceleration Structure (GAS) at a given motion time.
+///
+/// To access vertex data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetCatmullRomVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                               unsigned int           primIdx,
+                                                                               unsigned int           sbtGASIndex,
+                                                                               float                  time,
+                                                                               float4                 data[4] );
+static __forceinline__ __device__ void optixGetCatmullRomRocapsVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                     unsigned int           primIdx,
+                                                                                     unsigned int           sbtGASIndex,
+                                                                                     float                  time,
+                                                                                     float4                 data[4] );
+
+/// Returns the object space curve control vertex data of the currently intersected CatmullRom
+/// spline curve. Unlike the FromHandle variant, no GAS handle or motion time is passed in.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ void optixGetCatmullRomVertexData( float4 data[4] );
+static __forceinline__ __device__ void optixGetCatmullRomRocapsVertexData( float4 data[4] );
+
+/// Returns the object space curve control vertex data of a CatmullRom spline curve for a valid
+/// outgoing hit object.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// It is only valid to call this function if the return value of optixGetPrimitiveType( optixHitObjectGetHitKind() )
+/// equals OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectGetCatmullRomVertexData( float4 data[4] );
+static __forceinline__ __device__ void optixHitObjectGetCatmullRomRocapsVertexData( float4 data[4] );
+
+/// Deprecated. Call either optixGetCubicBezierVertexData( float4 data[4] ) for current hit
+/// data, or optixGetCubicBezierVertexDataFromHandle() for random access data.
+///
+/// Returns the object space curve control vertex data of a cubic Bezier curve in a Geometry
+/// Acceleration Structure (GAS) at a given motion time.
+///
+/// To access vertex data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetCubicBezierVertexData( OptixTraversableHandle gas,
+                                                                      unsigned int           primIdx,
+                                                                      unsigned int           sbtGASIndex,
+                                                                      float                  time,
+                                                                      float4                 data[4] );
+
+/// Returns the object space curve control vertex data of a cubic Bezier curve in a Geometry
+/// Acceleration Structure (GAS) at a given motion time.
+///
+/// To access vertex data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetCubicBezierVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                unsigned int           primIdx,
+                                                                                unsigned int           sbtGASIndex,
+                                                                                float                  time,
+                                                                                float4                 data[4] );
+static __forceinline__ __device__ void optixGetCubicBezierRocapsVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                                      unsigned int           primIdx,
+                                                                                      unsigned int sbtGASIndex,
+                                                                                      float        time,
+                                                                                      float4       data[4] );
+
+/// Returns the object space curve control vertex data of the currently intersected cubic Bezier
+/// curve. Unlike the FromHandle variant, no GAS handle or motion time is passed in.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ void optixGetCubicBezierVertexData( float4 data[4] );
+static __forceinline__ __device__ void optixGetCubicBezierRocapsVertexData( float4 data[4] );
+
+/// Returns the object space curve control vertex data of a cubic Bezier curve for a valid
+/// outgoing hit object.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// It is only valid to call this function if the return value of optixGetPrimitiveType( optixHitObjectGetHitKind() )
+/// equals OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BEZIER.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectGetCubicBezierVertexData( float4 data[4] );
+static __forceinline__ __device__ void optixHitObjectGetCubicBezierRocapsVertexData( float4 data[4] );
+
+/// Deprecated. Call either optixGetRibbonVertexData( float4 data[3] ) for current hit
+/// data, or optixGetRibbonVertexDataFromHandle() for random access.
+///
+/// Returns the object space curve control vertex data of a ribbon (flat quadratic BSpline) in a
+/// Geometry Acceleration Structure (GAS) at a given motion time.
+///
+/// To access vertex data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetRibbonVertexData( OptixTraversableHandle gas,
+                                                                 unsigned int           primIdx,
+                                                                 unsigned int           sbtGASIndex,
+                                                                 float                  time,
+                                                                 float4                 data[3] );
+
+/// Returns the object space curve control vertex data of a ribbon (flat quadratic BSpline) in a
+/// Geometry Acceleration Structure (GAS) at a given motion time.
+///
+/// To access vertex data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetRibbonVertexDataFromHandle( OptixTraversableHandle gas,
+                                                                           unsigned int           primIdx,
+                                                                           unsigned int           sbtGASIndex,
+                                                                           float                  time,
+                                                                           float4                 data[3] );
+
+/// Returns the object space curve control vertex data of a ribbon (flat quadratic BSpline) in a
+/// Geometry Acceleration Structure (GAS) at a given motion time.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ void optixGetRibbonVertexData( float4 data[3] );
+
+/// Returns the object space curve control vertex data of a ribbon (flat quadratic BSpline) for a valid
+/// outgoing hit object.
+///
+/// data[i] = {x,y,z,w} with {x,y,z} the position and w the radius of control vertex i.
+///
+/// It is only valid to call this function if the return value of optixGetPrimitiveType( optixHitObjectGetHitKind() )
+/// equals OPTIX_PRIMITIVE_TYPE_FLAT_QUADRATIC_BSPLINE.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectGetRibbonVertexData( float4 data[3] );
+
+/// Deprecated. Call either optixGetRibbonNormal( float2 ribbonParameters ) for current hit
+/// data, or optixGetRibbonNormalFromHandle() for random access.
+///
+/// Returns ribbon normal at intersection reported by optixReportIntersection.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ float3 optixGetRibbonNormal( OptixTraversableHandle gas,
+                                                               unsigned int           primIdx,
+                                                               unsigned int           sbtGASIndex,
+                                                               float                  time,
+                                                               float2                 ribbonParameters );
+
+/// Returns ribbon normal at intersection reported by optixReportIntersection.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ float3 optixGetRibbonNormalFromHandle( OptixTraversableHandle gas,
+                                                                         unsigned int           primIdx,
+                                                                         unsigned int           sbtGASIndex,
+                                                                         float                  time,
+                                                                         float2                 ribbonParameters );
+
+/// Returns ribbon normal at intersection reported by optixReportIntersection.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ float3 optixGetRibbonNormal( float2 ribbonParameters );
+
+/// Returns ribbon normal at intersection reported by optixReportIntersection.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float3 optixHitObjectGetRibbonNormal( float2 ribbonParameters );
+
+/// Deprecated. Call either optixGetSphereData( float4 data[1] ) for current hit
+/// sphere data, or optixGetSphereDataFromHandle() for random access sphere data.
+///
+/// Returns the object space sphere data, center point and radius, in a Geometry Acceleration
+/// Structure (GAS) at a given motion time.
+///
+/// To access sphere data, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[0] = {x,y,z,w} with {x,y,z} the position of the sphere center and w the radius.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetSphereData( OptixTraversableHandle gas,
+                                                           unsigned int           primIdx,
+                                                           unsigned int           sbtGASIndex,
+                                                           float                  time,
+                                                           float4                 data[1] );
+
+/// Performs a random access fetch of the object space sphere data, center point and radius, in a Geometry Acceleration
+/// Structure (GAS) at a given motion time.
+///
+/// To access sphere data via random access, the GAS must be built using the flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+/// If only the data of a currently intersected sphere is required, it is recommended to
+/// use function optixGetSphereData. A data fetch of the currently hit primitive does NOT
+/// require building the corresponding GAS with flag OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS.
+///
+/// data[0] = {x,y,z,w} with {x,y,z} the position of the sphere center and w the radius.
+///
+/// If motion is disabled via OptixPipelineCompileOptions::usesMotionBlur, or the GAS does not
+/// contain motion, the time parameter is ignored.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ void optixGetSphereDataFromHandle( OptixTraversableHandle gas,
+                                                                     unsigned int           primIdx,
+                                                                     unsigned int           sbtGASIndex,
+                                                                     float                  time,
+                                                                     float4                 data[1] );
+
+/// Returns the object space sphere data of the currently intersected sphere at the current ray time.
+///
+/// Similar to the random access variant optixGetSphereDataFromHandle, but does not require setting flag
+/// OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS when building the corresponding GAS.
+///
+/// It is only valid to call this function if the return value of optixGetPrimitiveType( optixGetHitKind() ) equals OPTIX_PRIMITIVE_TYPE_SPHERE.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ void optixGetSphereData( float4 data[1] );
+
+/// Returns the object space sphere data of the currently intersected sphere for a valid outgoing hit object.
+/// It is the hit object's counterpart of optixGetSphereData( float4 data[1] ).
+///
+/// It is only valid to call this function if the return value of optixGetPrimitiveType( optixHitObjectGetHitKind() ) equals OPTIX_PRIMITIVE_TYPE_SPHERE.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectGetSphereData( float4 data[1] );
+
+/// Returns the traversable handle for the Geometry Acceleration Structure (GAS) containing the
+/// current hit.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ OptixTraversableHandle optixGetGASTraversableHandle();
+
+/// Returns the motion begin time of a GAS (see OptixMotionOptions)
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ float optixGetGASMotionTimeBegin( OptixTraversableHandle gas );
+
+/// Returns the motion end time of a GAS (see OptixMotionOptions)
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ float optixGetGASMotionTimeEnd( OptixTraversableHandle gas );
+
+/// Returns the number of motion steps of a GAS (see OptixMotionOptions)
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ unsigned int optixGetGASMotionStepCount( OptixTraversableHandle gas );
+
+/// Returns the world-to-object transformation matrix resulting from the current active
+/// transformation list.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ void optixGetWorldToObjectTransformMatrix( float m[12] );
+
+/// Returns the object-to-world transformation matrix resulting from the current active
+/// transformation list.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ void optixGetObjectToWorldTransformMatrix( float m[12] );
+
+/// Transforms the point using world-to-object transformation matrix resulting from the current
+/// active transformation list.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ float3 optixTransformPointFromWorldToObjectSpace( float3 point );
+
+/// Transforms the vector using world-to-object transformation matrix resulting from the current
+/// active transformation list.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ float3 optixTransformVectorFromWorldToObjectSpace( float3 vec );
+
+/// Transforms the normal using world-to-object transformation matrix resulting from the current
+/// active transformation list.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ float3 optixTransformNormalFromWorldToObjectSpace( float3 normal );
+
+/// Transforms the point using object-to-world transformation matrix resulting from the current
+/// active transformation list.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ float3 optixTransformPointFromObjectToWorldSpace( float3 point );
+
+/// Transforms the vector using object-to-world transformation matrix resulting from the current
+/// active transformation list.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ float3 optixTransformVectorFromObjectToWorldSpace( float3 vec );
+
+/// Transforms the normal using object-to-world transformation matrix resulting from the current
+/// active transformation list.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ float3 optixTransformNormalFromObjectToWorldSpace( float3 normal );
+
+/// Returns the world-to-object transformation matrix resulting from the
+/// transformation list of the current outgoing hit object.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectGetWorldToObjectTransformMatrix( float m[12] );
+
+/// Returns the object-to-world transformation matrix resulting from the
+/// transformation list of the current outgoing hit object.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ void optixHitObjectGetObjectToWorldTransformMatrix( float m[12] );
+
+/// Transforms the point using world-to-object transformation matrix resulting from the
+/// transformation list of the current outgoing hit object.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float3 optixHitObjectTransformPointFromWorldToObjectSpace( float3 point );
+
+/// Transforms the vector using world-to-object transformation matrix resulting from the
+/// transformation list of the current outgoing hit object.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float3 optixHitObjectTransformVectorFromWorldToObjectSpace( float3 vec );
+
+/// Transforms the normal using world-to-object transformation matrix resulting from the
+/// transformation list of the current outgoing hit object.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float3 optixHitObjectTransformNormalFromWorldToObjectSpace( float3 normal );
+
+/// Transforms the point using object-to-world transformation matrix resulting from the
+/// transformation list of the current outgoing hit object.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float3 optixHitObjectTransformPointFromObjectToWorldSpace( float3 point );
+
+/// Transforms the vector using object-to-world transformation matrix resulting from the
+/// transformation list of the current outgoing hit object.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float3 optixHitObjectTransformVectorFromObjectToWorldSpace( float3 vec );
+
+/// Transforms the normal using object-to-world transformation matrix resulting from the
+/// transformation list of the current outgoing hit object.
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float3 optixHitObjectTransformNormalFromObjectToWorldSpace( float3 normal );
+
+/// Returns the world-to-object transformation matrix resulting from the transformation list of the
+/// templated hit object. Users may implement getRayTime, getTransformListSize, and getTransformListHandle
+/// in their own structs, or inherit them from Optix[Incoming|Outgoing]HitObject. Here is an example:
+///
+/// struct FixedTimeHitState : OptixIncomingHitObject {
+///   float time;
+///   __forceinline__ __device__ float getRayTime() const { return time; }
+/// };
+/// ...
+/// optixGetWorldToObjectTransformMatrix( FixedTimeHitState{ 0.4f }, m );
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+template <typename HitState>
+static __forceinline__ __device__ void optixGetWorldToObjectTransformMatrix( const HitState& hs, float m[12] );
+
+/// Returns the object-to-world transformation matrix resulting from the transformation list
+/// of the templated hit object (see optixGetWorldToObjectTransformMatrix for example usage).
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+template <typename HitState>
+static __forceinline__ __device__ void optixGetObjectToWorldTransformMatrix( const HitState& hs, float m[12] );
+
+/// Transforms the point using world-to-object transformation matrix resulting from the transformation
+/// list of the templated hit object (see optixGetWorldToObjectTransformMatrix for example usage).
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+template <typename HitState>
+static __forceinline__ __device__ float3 optixTransformPointFromWorldToObjectSpace( const HitState& hs, float3 point );
+
+/// Transforms the vector using world-to-object transformation matrix resulting from the transformation
+/// list of the templated hit object (see optixGetWorldToObjectTransformMatrix for example usage).
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+template <typename HitState>
+static __forceinline__ __device__ float3 optixTransformVectorFromWorldToObjectSpace( const HitState& hs, float3 vec );
+
+/// Transforms the normal using world-to-object transformation matrix resulting from the transformation
+/// list of the templated hit object (see optixGetWorldToObjectTransformMatrix for example usage).
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+template <typename HitState>
+static __forceinline__ __device__ float3 optixTransformNormalFromWorldToObjectSpace( const HitState& hs, float3 normal );
+
+/// Transforms the point using object-to-world transformation matrix resulting from the transformation
+/// list of the templated hit object (see optixGetWorldToObjectTransformMatrix for example usage).
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+template <typename HitState>
+static __forceinline__ __device__ float3 optixTransformPointFromObjectToWorldSpace( const HitState& hs, float3 point );
+
+/// Transforms the vector using object-to-world transformation matrix resulting from the transformation
+/// list of the templated hit object (see optixGetWorldToObjectTransformMatrix for example usage).
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+template <typename HitState>
+static __forceinline__ __device__ float3 optixTransformVectorFromObjectToWorldSpace( const HitState& hs, float3 vec );
+
+/// Transforms the normal using object-to-world transformation matrix resulting from the transformation
+/// list of the templated hit object (see optixGetWorldToObjectTransformMatrix for example usage).
+///
+/// The cost of this function may be proportional to the size of the transformation list.
+///
+/// Available in IS, AH, CH
+template <typename HitState>
+static __forceinline__ __device__ float3 optixTransformNormalFromObjectToWorldSpace( const HitState& hs, float3 normal );
+
+/// Returns the number of transforms on the current transform list.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ unsigned int optixGetTransformListSize();
+
+/// Returns the number of transforms associated with the current outgoing hit object's transform
+/// list.
+///
+/// Returns zero when there is no hit (miss and nop).
+///
+/// See #optixGetTransformListSize()
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ unsigned int optixHitObjectGetTransformListSize();
+
+/// Returns the traversable handle for a transform in the current transform list.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ OptixTraversableHandle optixGetTransformListHandle( unsigned int index );
+
+/// Returns the traversable handle for a transform in the current transform list associated with the
+/// outgoing hit object.
+///
+/// Results are undefined if the hit object is a miss.
+///
+/// See #optixGetTransformListHandle()
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ OptixTraversableHandle optixHitObjectGetTransformListHandle( unsigned int index );
+
+struct OptixIncomingHitObject  // HitState provider for the templated transform helpers; forwards to the current-hit accessors.
+{
+    __forceinline__ __device__ float        getRayTime() const { return optixGetRayTime(); }
+    __forceinline__ __device__ unsigned int getTransformListSize() const { return optixGetTransformListSize(); }
+    __forceinline__ __device__ OptixTraversableHandle getTransformListHandle( unsigned int index ) const
+    {
+        return optixGetTransformListHandle( index );  // handle of entry 'index' in the current transform list
+    }
+};
+
+struct OptixOutgoingHitObject  // HitState provider for the templated transform helpers; forwards to the outgoing hit object accessors.
+{
+    __forceinline__ __device__ float        getRayTime() const { return optixHitObjectGetRayTime(); }
+    __forceinline__ __device__ unsigned int getTransformListSize() const
+    {
+        return optixHitObjectGetTransformListSize();  // documented to be zero when there is no hit (miss and nop)
+    }
+    __forceinline__ __device__ OptixTraversableHandle getTransformListHandle( unsigned int index ) const
+    {
+        return optixHitObjectGetTransformListHandle( index );  // documented as undefined if the hit object is a miss
+    }
+};
+
+/// Returns the transform type of a traversable handle from a transform list.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ OptixTransformType optixGetTransformTypeFromHandle( OptixTraversableHandle handle );
+
+/// Returns a pointer to an OptixStaticTransform from its traversable handle.
+///
+/// Returns 0 if the traversable is not of type OPTIX_TRANSFORM_TYPE_STATIC_TRANSFORM.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ const OptixStaticTransform* optixGetStaticTransformFromHandle( OptixTraversableHandle handle );
+
+/// Returns a pointer to an OptixSRTMotionTransform from its traversable handle.
+///
+/// Returns 0 if the traversable is not of type OPTIX_TRANSFORM_TYPE_SRT_MOTION_TRANSFORM.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ const OptixSRTMotionTransform* optixGetSRTMotionTransformFromHandle( OptixTraversableHandle handle );
+
+/// Returns a pointer to an OptixMatrixMotionTransform from its traversable handle.
+///
+/// Returns 0 if the traversable is not of type OPTIX_TRANSFORM_TYPE_MATRIX_MOTION_TRANSFORM.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ const OptixMatrixMotionTransform* optixGetMatrixMotionTransformFromHandle( OptixTraversableHandle handle );
+
+/// Returns instanceId from an OptixInstance traversable.
+///
+/// Returns 0 if the traversable handle does not reference an OptixInstance.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ unsigned int optixGetInstanceIdFromHandle( OptixTraversableHandle handle );
+
+/// Returns child traversable handle from an OptixInstance traversable.
+///
+/// Returns 0 if the traversable handle does not reference an OptixInstance.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ OptixTraversableHandle optixGetInstanceChildFromHandle( OptixTraversableHandle handle );
+
+/// Returns object-to-world transform from an OptixInstance traversable.
+///
+/// Returns 0 if the traversable handle does not reference an OptixInstance.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ const float4* optixGetInstanceTransformFromHandle( OptixTraversableHandle handle );
+
+/// Returns world-to-object transform from an OptixInstance traversable.
+///
+/// Returns 0 if the traversable handle does not reference an OptixInstance.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ const float4* optixGetInstanceInverseTransformFromHandle( OptixTraversableHandle handle );
+
+/// Returns a pointer to the geometry acceleration structure from its traversable handle.
+///
+/// Returns 0 if the traversable is not a geometry acceleration structure.
+///
+/// Available in all OptiX program types
+static __device__ __forceinline__ CUdeviceptr optixGetGASPointerFromHandle( OptixTraversableHandle handle );
+/// Reports an intersection (overload without attributes).
+///
+/// If optixGetRayTmin() <= hitT <= optixGetRayTmax(), the any hit program associated with this
+/// intersection program (via the SBT entry) is called.
+///
+/// The AH program can do one of three things:
+/// 1. call optixIgnoreIntersection - no hit is recorded, optixReportIntersection returns false
+/// 2. call optixTerminateRay       -    hit is recorded, optixReportIntersection does not return, no further traversal occurs,
+///                                                       and the associated closest hit program is called
+/// 3. neither                      -    hit is recorded, optixReportIntersection returns true
+///
+/// hitKind - Only the 7 least significant bits should be written [0..127].  Any values above 127
+/// are reserved for built in intersection.  The value can be queried with optixGetHitKind() in AH
+/// and CH.
+///
+/// The attributes specified with a0..a7 are available in the AH and CH programs.  Note that the
+/// attributes available in the CH program correspond to the closest recorded intersection.  The
+/// number of attributes in registers and memory can be configured in the pipeline.
+///
+/// \param[in] hitT
+/// \param[in] hitKind
+///
+/// Available in IS
+static __forceinline__ __device__ bool optixReportIntersection( float hitT, unsigned int hitKind );
+
+/// Reports an intersection (overload with 1 attribute register).
+///
+/// \see #optixReportIntersection(float,unsigned int)
+///
+/// Available in IS
+static __forceinline__ __device__ bool optixReportIntersection( float hitT, unsigned int hitKind, unsigned int a0 );
+
+/// Reports an intersection (overload with 2 attribute registers).
+///
+/// \see #optixReportIntersection(float,unsigned int)
+///
+/// Available in IS
+static __forceinline__ __device__ bool optixReportIntersection( float hitT, unsigned int hitKind, unsigned int a0, unsigned int a1 );
+
+/// Reports an intersection (overload with 3 attribute registers).
+///
+/// \see #optixReportIntersection(float,unsigned int)
+///
+/// Available in IS
+static __forceinline__ __device__ bool optixReportIntersection( float hitT, unsigned int hitKind, unsigned int a0, unsigned int a1, unsigned int a2 );
+
+/// Reports an intersection (overload with 4 attribute registers).
+///
+/// \see #optixReportIntersection(float,unsigned int)
+///
+/// Available in IS
+static __forceinline__ __device__ bool optixReportIntersection( float        hitT,
+                                                                unsigned int hitKind,
+                                                                unsigned int a0,
+                                                                unsigned int a1,
+                                                                unsigned int a2,
+                                                                unsigned int a3 );
+
+/// Reports an intersection (overload with 5 attribute registers).
+///
+/// \see #optixReportIntersection(float,unsigned int)
+///
+/// Available in IS
+static __forceinline__ __device__ bool optixReportIntersection( float        hitT,
+                                                                unsigned int hitKind,
+                                                                unsigned int a0,
+                                                                unsigned int a1,
+                                                                unsigned int a2,
+                                                                unsigned int a3,
+                                                                unsigned int a4 );
+
+/// Reports an intersection (overload with 6 attribute registers).
+///
+/// \see #optixReportIntersection(float,unsigned int)
+///
+/// Available in IS
+static __forceinline__ __device__ bool optixReportIntersection( float        hitT,
+                                                                unsigned int hitKind,
+                                                                unsigned int a0,
+                                                                unsigned int a1,
+                                                                unsigned int a2,
+                                                                unsigned int a3,
+                                                                unsigned int a4,
+                                                                unsigned int a5 );
+
+/// Reports an intersection (overload with 7 attribute registers).
+///
+/// \see #optixReportIntersection(float,unsigned int)
+///
+/// Available in IS
+static __forceinline__ __device__ bool optixReportIntersection( float        hitT,
+                                                                unsigned int hitKind,
+                                                                unsigned int a0,
+                                                                unsigned int a1,
+                                                                unsigned int a2,
+                                                                unsigned int a3,
+                                                                unsigned int a4,
+                                                                unsigned int a5,
+                                                                unsigned int a6 );
+
+/// Reports an intersection (overload with 8 attribute registers).
+///
+/// \see #optixReportIntersection(float,unsigned int)
+///
+/// Available in IS
+static __forceinline__ __device__ bool optixReportIntersection( float        hitT,
+                                                                unsigned int hitKind,
+                                                                unsigned int a0,
+                                                                unsigned int a1,
+                                                                unsigned int a2,
+                                                                unsigned int a3,
+                                                                unsigned int a4,
+                                                                unsigned int a5,
+                                                                unsigned int a6,
+                                                                unsigned int a7 );
+
+/// Returns the attribute at the given slot index. There are up to 8 attributes available. The
+/// number of attributes is configured with OptixPipelineCompileOptions::numAttributeValues.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ unsigned int optixGetAttribute_0();
+static __forceinline__ __device__ unsigned int optixGetAttribute_1();
+static __forceinline__ __device__ unsigned int optixGetAttribute_2();
+static __forceinline__ __device__ unsigned int optixGetAttribute_3();
+static __forceinline__ __device__ unsigned int optixGetAttribute_4();
+static __forceinline__ __device__ unsigned int optixGetAttribute_5();
+static __forceinline__ __device__ unsigned int optixGetAttribute_6();
+static __forceinline__ __device__ unsigned int optixGetAttribute_7();
+
+
+/// Return the attribute at the given slot index for the current outgoing hit object. There are up
+/// to 8 attributes available. The number of attributes is configured with
+/// OptixPipelineCompileOptions::numAttributeValues.
+///
+/// Results are undefined if the hit object is a miss.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_0();
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_1();
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_2();
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_3();
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_4();
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_5();
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_6();
+static __forceinline__ __device__ unsigned int optixHitObjectGetAttribute_7();
+
+/// Records the hit, stops traversal, and proceeds to CH.
+///
+/// Available in AH
+static __forceinline__ __device__ void optixTerminateRay();
+
+/// Discards the hit, and returns control to the calling optixReportIntersection or built-in
+/// intersection routine.
+///
+/// Available in AH
+static __forceinline__ __device__ void optixIgnoreIntersection();
+
+
+/// For a given OptixBuildInputTriangleArray the number of primitives is defined as
+///
+/// "(OptixBuildInputTriangleArray::indexBuffer == 0) ? OptixBuildInputTriangleArray::numVertices/3 :
+///                                                     OptixBuildInputTriangleArray::numIndexTriplets;".
+///
+/// For a given OptixBuildInputCustomPrimitiveArray the number of primitives is defined as numAabbs.
+///
+/// The primitive index returns the index into the array of primitives plus the
+/// primitiveIndexOffset.
+///
+/// In IS and AH this corresponds to the currently intersected primitive.
+///
+/// In CH this corresponds to the primitive index of the closest intersected primitive.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ unsigned int optixGetPrimitiveIndex();
+
+
+/// Returns the user-provided cluster ID of the intersected CLAS of a hit.
+///
+/// Returns OPTIX_CLUSTER_ID_INVALID if the closest (or current) intersection 
+/// is not a cluster.
+///
+/// see also OptixPipelineCompileOptions::allowClusteredGeometry
+///
+/// Available in AH, CH
+static __forceinline__ __device__ unsigned int optixGetClusterId();
+
+/// Returns the user-provided cluster ID associated with the current outgoing hit object.
+///
+/// Returns OPTIX_CLUSTER_ID_INVALID if the closest intersection is not a cluster,
+/// or if the hit object is a miss.
+///
+/// see also OptixPipelineCompileOptions::allowClusteredGeometry
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ unsigned int optixHitObjectGetClusterId();
+
+
+/// Return the primitive index associated with the current outgoing hit object.
+///
+/// Results are undefined if the hit object is a miss.
+///
+/// See #optixGetPrimitiveIndex() for more details.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ unsigned int optixHitObjectGetPrimitiveIndex();
+
+/// Returns the SBT GAS index of the primitive associated with the current intersection.
+///
+/// In IS and AH this corresponds to the currently intersected primitive.
+///
+/// In CH this corresponds to the SBT GAS index of the closest intersected primitive.
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ unsigned int optixGetSbtGASIndex();
+
+/// Return the SBT GAS index of the closest intersected primitive associated with the current
+/// outgoing hit object.
+///
+/// Results are undefined if the hit object is a miss.
+///
+/// See #optixGetSbtGASIndex() for details on the version for the incoming hit object.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ unsigned int optixHitObjectGetSbtGASIndex();
+
+
+/// Returns the OptixInstance::instanceId of the instance within the top level acceleration
+/// structure associated with the current intersection.
+///
+/// When building an acceleration structure using OptixBuildInputInstanceArray each OptixInstance
+/// has a user supplied instanceId.  OptixInstance objects reference another acceleration structure.
+/// During traversal the acceleration structures are visited top down.  In the IS and AH programs
+/// the OptixInstance::instanceId corresponding to the most recently visited OptixInstance is
+/// returned when calling optixGetInstanceId().  In CH optixGetInstanceId() returns the
+/// OptixInstance::instanceId when the hit was recorded with optixReportIntersection.  In the case
+/// where there is no OptixInstance visited, optixGetInstanceId returns 0
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ unsigned int optixGetInstanceId();
+
+/// Returns the OptixInstance::instanceId of the instance within the top level acceleration
+/// structure associated with the outgoing hit object.
+///
+/// Results are undefined if the hit object is a miss.
+///
+/// See #optixGetInstanceId().
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ unsigned int optixHitObjectGetInstanceId();
+
+/// Returns the zero-based index of the instance within its instance acceleration structure
+/// associated with the current intersection.
+///
+/// In the IS and AH programs the index corresponding to the most recently visited OptixInstance is
+/// returned when calling optixGetInstanceIndex().  In CH optixGetInstanceIndex() returns the index
+/// when the hit was recorded with optixReportIntersection.  In the case where there is no
+/// OptixInstance visited, optixGetInstanceIndex returns 0
+///
+/// Available in IS, AH, CH
+static __forceinline__ __device__ unsigned int optixGetInstanceIndex();
+
+/// Returns the zero-based index of the instance within its instance acceleration structure
+/// associated with the outgoing hit object.
+///
+/// Results are undefined if the hit object is a miss.
+///
+/// See #optixGetInstanceIndex().
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ unsigned int optixHitObjectGetInstanceIndex();
+
+/// Returns the 8 bit hit kind associated with the current hit.
+///
+/// Use optixGetPrimitiveType() to interpret the hit kind.  For custom intersections (primitive type
+/// OPTIX_PRIMITIVE_TYPE_CUSTOM), this is the 7-bit hitKind passed to optixReportIntersection().
+/// Hit kinds greater than 127 are reserved for built-in primitives.
+///
+/// Available in AH and CH
+static __forceinline__ __device__ unsigned int optixGetHitKind();
+
+/// Returns the 8 bit hit kind associated with the current outgoing hit object.
+///
+/// Results are undefined if the hit object is a miss.
+///
+/// See #optixGetHitKind().
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ unsigned int optixHitObjectGetHitKind();
+
+/// Function interpreting the result of #optixGetHitKind().
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ OptixPrimitiveType optixGetPrimitiveType( unsigned int hitKind );
+
+/// Function interpreting the result of #optixGetHitKind().
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ bool optixIsFrontFaceHit( unsigned int hitKind );
+
+/// Function interpreting the result of #optixGetHitKind().
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ bool optixIsBackFaceHit( unsigned int hitKind );
+
+/// Function interpreting the hit kind associated with the current optixReportIntersection.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ OptixPrimitiveType optixGetPrimitiveType();
+
+/// Function interpreting the hit kind associated with the current optixReportIntersection.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ bool optixIsFrontFaceHit();
+
+/// Function interpreting the hit kind associated with the current optixReportIntersection.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ bool optixIsBackFaceHit();
+
+/// Convenience function interpreting the result of #optixGetHitKind().
+///
+/// Available in AH, CH
+static __forceinline__ __device__ bool optixIsTriangleHit();
+
+/// Convenience function interpreting the result of #optixGetHitKind().
+///
+/// Available in AH, CH
+static __forceinline__ __device__ bool optixIsTriangleFrontFaceHit();
+
+/// Convenience function interpreting the result of #optixGetHitKind().
+///
+/// Available in AH, CH
+static __forceinline__ __device__ bool optixIsTriangleBackFaceHit();
+
+
+/// Convenience function that returns the first two attributes as floats.
+///
+/// When using OptixBuildInputTriangleArray objects, during intersection with a triangle, the barycentric coordinates of the hit
+/// are stored into the first two attribute registers.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ float2 optixGetTriangleBarycentrics();
+
+/// Returns the barycentric coordinates of the hit point on an intersected triangle.
+///
+/// This function is the hit object's equivalent to optixGetTriangleBarycentrics().
+/// It is only valid to call this function if the return value of
+/// optixGetPrimitiveType( optixHitObjectGetHitKind() ) equals OPTIX_PRIMITIVE_TYPE_TRIANGLE.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float2 optixHitObjectGetTriangleBarycentrics();
+
+/// Returns the curve parameter associated with the current intersection when using
+/// OptixBuildInputCurveArray objects.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ float optixGetCurveParameter();
+
+/// Returns the curve parameter associated with the intersection of a curve.
+///
+/// This function is the hit object's equivalent to optixGetCurveParameter().
+/// It is only valid to call this function if the return value of
+/// optixGetPrimitiveType( optixHitObjectGetHitKind() ) equals a primitive type that can
+/// be used to build an AS with OptixBuildInputCurveArray objects.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float optixHitObjectGetCurveParameter();
+
+/// Returns the ribbon parameters along directrix (length) and generator (width) of the current
+/// intersection when using OptixBuildInputCurveArray objects with curveType
+/// OPTIX_PRIMITIVE_TYPE_FLAT_QUADRATIC_BSPLINE.
+///
+/// Available in AH, CH
+static __forceinline__ __device__ float2 optixGetRibbonParameters();
+
+/// Returns the ribbon parameters along directrix (length) and generator (width) of the current
+/// curve intersection with primitive type OPTIX_PRIMITIVE_TYPE_FLAT_QUADRATIC_BSPLINE.
+///
+/// This function is the hit object's equivalent to optixGetRibbonParameters().
+/// It is only valid to call this function if the return value of
+/// optixGetPrimitiveType( optixHitObjectGetHitKind() ) equals OPTIX_PRIMITIVE_TYPE_FLAT_QUADRATIC_BSPLINE.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ float2 optixHitObjectGetRibbonParameters();
+
+/// Available in any program, it returns the current launch index within the launch dimensions
+/// specified by optixLaunch on the host.
+///
+/// The raygen program is typically only launched once per launch index.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ uint3 optixGetLaunchIndex();
+
+/// Available in any program, it returns the dimensions of the current launch specified by
+/// optixLaunch on the host.
+///
+/// Available in all OptiX program types
+static __forceinline__ __device__ uint3 optixGetLaunchDimensions();
+
+/// Returns the generic memory space pointer to the data region (past the header) of the
+/// currently active SBT record corresponding to the current program.
+///
+/// Note that optixGetSbtDataPointer is not available in OptiX-enabled functions, because
+/// there is no SBT entry associated with the function.
+///
+/// Available in RG, IS, AH, CH, MS, EX, DC, CC
+static __forceinline__ __device__ CUdeviceptr optixGetSbtDataPointer();
+
+/// Device pointer address for the SBT associated with the hit or miss program for the current
+/// outgoing hit object.
+///
+/// Returns 0 for nop hit objects.
+///
+/// Available in RG, CH, MS, CC, DC
+static __forceinline__ __device__ CUdeviceptr optixHitObjectGetSbtDataPointer();
+
+/// Throws a user exception with the given exception code (overload without exception details).
+///
+/// The exception code must be in the range from 0 to 2^30 - 1. Up to 8 optional exception details
+/// can be passed. They can be queried in the EX program using optixGetExceptionDetail_0() to
+/// ..._7().
+///
+/// The exception details must not be used to encode pointers to the stack since the current stack
+/// is not preserved in the EX program.
+///
+/// Not available in EX
+///
+/// \param[in] exceptionCode The exception code to be thrown.
+///
+/// Available in RG, IS, AH, CH, MS, DC, CC
+static __forceinline__ __device__ void optixThrowException( int exceptionCode );
+
+/// Throws a user exception with the given exception code (overload with 1 exception detail).
+///
+/// \see #optixThrowException(int)
+///
+/// Available in RG, IS, AH, CH, MS, DC, CC
+static __forceinline__ __device__ void optixThrowException( int exceptionCode, unsigned int exceptionDetail0 );
+
+/// Throws a user exception with the given exception code (overload with 2 exception details).
+///
+/// \see #optixThrowException(int)
+///
+/// Available in RG, IS, AH, CH, MS, DC, CC
+static __forceinline__ __device__ void optixThrowException( int exceptionCode,
+                                                            unsigned int exceptionDetail0,
+                                                            unsigned int exceptionDetail1 );
+
+/// Throws a user exception with the given exception code (overload with 3 exception details).
+///
+/// \see #optixThrowException(int)
+///
+/// Available in RG, IS, AH, CH, MS, DC, CC
+static __forceinline__ __device__ void optixThrowException( int exceptionCode,
+                                                            unsigned int exceptionDetail0,
+                                                            unsigned int exceptionDetail1,
+                                                            unsigned int exceptionDetail2 );
+
+/// Throws a user exception with the given exception code (overload with 4 exception details).
+///
+/// \see #optixThrowException(int)
+///
+/// Available in RG, IS, AH, CH, MS, DC, CC
+static __forceinline__ __device__ void optixThrowException( int exceptionCode,
+                                                            unsigned int exceptionDetail0,
+                                                            unsigned int exceptionDetail1,
+                                                            unsigned int exceptionDetail2,
+                                                            unsigned int exceptionDetail3 );
+
+/// Throws a user exception with the given exception code (overload with 5 exception details).
+///
+/// \see #optixThrowException(int)
+///
+/// Available in RG, IS, AH, CH, MS, DC, CC
+static __forceinline__ __device__ void optixThrowException( int exceptionCode,
+                                                            unsigned int exceptionDetail0,
+                                                            unsigned int exceptionDetail1,
+                                                            unsigned int exceptionDetail2,
+                                                            unsigned int exceptionDetail3,
+                                                            unsigned int exceptionDetail4 );
+
+/// Throws a user exception with the given exception code (overload with 6 exception details).
+///
+/// \see #optixThrowException(int)
+///
+/// Available in RG, IS, AH, CH, MS, DC, CC
+static __forceinline__ __device__ void optixThrowException( int exceptionCode,
+                                                            unsigned int exceptionDetail0,
+                                                            unsigned int exceptionDetail1,
+                                                            unsigned int exceptionDetail2,
+                                                            unsigned int exceptionDetail3,
+                                                            unsigned int exceptionDetail4,
+                                                            unsigned int exceptionDetail5 );
+
+/// Throws a user exception with the given exception code (overload with 7 exception
+/// details).
+///
+/// \see #optixThrowException(int)
+///
+/// Available in RG, IS, AH, CH, MS, DC, CC
+static __forceinline__ __device__ void optixThrowException( int exceptionCode,
+                                                            unsigned int exceptionDetail0,
+                                                            unsigned int exceptionDetail1,
+                                                            unsigned int exceptionDetail2,
+                                                            unsigned int exceptionDetail3,
+                                                            unsigned int exceptionDetail4,
+                                                            unsigned int exceptionDetail5,
+                                                            unsigned int exceptionDetail6 );
+
+/// Throws a user exception with the given exception code (overload with 8 exception details).
+///
+/// \see #optixThrowException(int)
+///
+/// Available in RG, IS, AH, CH, MS, DC, CC
+static __forceinline__ __device__ void optixThrowException( int exceptionCode,
+                                                            unsigned int exceptionDetail0,
+                                                            unsigned int exceptionDetail1,
+                                                            unsigned int exceptionDetail2,
+                                                            unsigned int exceptionDetail3,
+                                                            unsigned int exceptionDetail4,
+                                                            unsigned int exceptionDetail5,
+                                                            unsigned int exceptionDetail6,
+                                                            unsigned int exceptionDetail7 );
+
+/// Returns the exception code.
+///
+/// Available in EX
+static __forceinline__ __device__ int optixGetExceptionCode();
+
+/// Returns the 32-bit exception detail at slot 0.
+///
+/// The behavior is undefined if the exception is not a user exception, or the used overload
+/// #optixThrowException() did not provide the queried exception detail.
+///
+/// Available in EX
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_0();
+
+/// Returns the 32-bit exception detail at slot 1.
+///
+/// \see #optixGetExceptionDetail_0()
+///
+/// Available in EX
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_1();
+
+/// Returns the 32-bit exception detail at slot 2.
+///
+/// \see #optixGetExceptionDetail_0()
+///
+/// Available in EX
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_2();
+
+/// Returns the 32-bit exception detail at slot 3.
+///
+/// \see #optixGetExceptionDetail_0()
+///
+/// Available in EX
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_3();
+
+/// Returns the 32-bit exception detail at slot 4.
+///
+/// \see #optixGetExceptionDetail_0()
+///
+/// Available in EX
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_4();
+
+/// Returns the 32-bit exception detail at slot 5.
+///
+/// \see #optixGetExceptionDetail_0()
+///
+/// Available in EX
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_5();
+
+/// Returns the 32-bit exception detail at slot 6.
+///
+/// \see #optixGetExceptionDetail_0()
+///
+/// Available in EX
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_6();
+
+/// Returns the 32-bit exception detail at slot 7.
+///
+/// \see #optixGetExceptionDetail_0()
+///
+/// Available in EX
+static __forceinline__ __device__ unsigned int optixGetExceptionDetail_7();
+
+
+/// Returns a string that includes information about the source location that caused the current
+/// exception.
+///
+/// The source location is only available for user exceptions.
+/// Line information needs to be present in the input PTX and
+/// OptixModuleCompileOptions::debugLevel may not be set to OPTIX_COMPILE_DEBUG_LEVEL_NONE.
+///
+/// Returns a NULL pointer if no line information is available.
+///
+/// Available in EX
+static __forceinline__ __device__ char* optixGetExceptionLineInfo();
+
+/// Creates a call to the direct callable program at the specified SBT entry.
+///
+/// This will call the program that was specified in the
+/// OptixProgramGroupCallables::entryFunctionNameDC in the module specified by
+/// OptixProgramGroupCallables::moduleDC.
+///
+/// The address of the SBT entry is calculated by:
+///  OptixShaderBindingTable::callablesRecordBase + ( OptixShaderBindingTable::callablesRecordStrideInBytes * sbtIndex ).
+///
+/// Direct callable programs are allowed to call optixTrace, but any secondary trace calls invoked
+/// from subsequently called CH, MS and callable programs will result in an error.
+///
+/// Behavior is undefined if there is no direct callable program at the specified SBT entry.
+///
+/// Behavior is undefined if the number of arguments that are being passed in does not match the
+/// number of parameters expected by the program that is called. In validation mode an exception
+/// will be generated.
+///
+/// \param[in] sbtIndex The offset of the SBT entry of the direct callable program to call relative
+/// to OptixShaderBindingTable::callablesRecordBase.
+/// \param[in] args The arguments to pass to the direct callable program.
+///
+/// Available in RG, IS, AH, CH, MS, DC, CC
+template <typename ReturnT, typename... ArgTypes>
+static __forceinline__ __device__ ReturnT optixDirectCall( unsigned int sbtIndex, ArgTypes... args );
+
+
+/// Creates a call to the continuation callable program at the specified SBT entry.
+///
+/// This will call the program that was specified in the
+/// OptixProgramGroupCallables::entryFunctionNameCC in the module specified by
+/// OptixProgramGroupCallables::moduleCC.
+///
+/// The address of the SBT entry is calculated by:
+///  OptixShaderBindingTable::callablesRecordBase + ( OptixShaderBindingTable::callablesRecordStrideInBytes * sbtIndex ).
+///
+/// As opposed to direct callable programs, continuation callable programs are allowed to make
+/// secondary optixTrace calls.
+///
+/// Behavior is undefined if there is no continuation callable program at the specified SBT entry.
+///
+/// Behavior is undefined if the number of arguments that are being passed in does not match the
+/// number of parameters expected by the program that is called. In validation mode an exception
+/// will be generated.
+///
+/// \param[in] sbtIndex The offset of the SBT entry of the continuation callable program to call relative to OptixShaderBindingTable::callablesRecordBase.
+/// \param[in] args The arguments to pass to the continuation callable program.
+///
+/// Available in RG, CH, MS, CC
+template <typename ReturnT, typename... ArgTypes>
+static __forceinline__ __device__ ReturnT optixContinuationCall( unsigned int sbtIndex, ArgTypes... args );
+
+
+/// optixTexFootprint2D calculates the footprint of a corresponding 2D texture fetch (non-mipmapped).
+///
+/// On Turing and subsequent architectures, a texture footprint instruction allows user programs to
+/// determine the set of texels that would be accessed by an equivalent filtered texture lookup.
+///
+/// \param[in] tex      CUDA texture object (cast to 64-bit integer)
+/// \param[in] texInfo  Texture info packed into 32-bit integer, described below.
+/// \param[in] x        Texture coordinate
+/// \param[in] y        Texture coordinate
+/// \param[out] singleMipLevel  Result indicating whether the footprint spans only a single miplevel.
+///
+/// The texture info argument is a packed 32-bit integer with the following layout:
+///
+///   texInfo[31:29] = reserved (3 bits)
+///   texInfo[28:24] = miplevel count (5 bits)
+///   texInfo[23:20] = log2 of tile width (4 bits)
+///   texInfo[19:16] = log2 of tile height (4 bits)
+///   texInfo[15:10] = reserved (6 bits)
+///   texInfo[9:8]   = horizontal wrap mode (2 bits) (CUaddress_mode)
+///   texInfo[7:6]   = vertical wrap mode (2 bits) (CUaddress_mode)
+///   texInfo[5]     = mipmap filter mode (1 bit) (CUfilter_mode)
+///   texInfo[4:0]   = maximum anisotropy (5 bits)
+///
+/// Returns a 16-byte structure (as a uint4) that stores the footprint of a texture request at a
+/// particular "granularity", which has the following layout:
+///
+///    struct Texture2DFootprint
+///    {
+///        unsigned long long mask;
+///        unsigned int tileY : 12;
+///        unsigned int reserved1 : 4;
+///        unsigned int dx : 3;
+///        unsigned int dy : 3;
+///        unsigned int reserved2 : 2;
+///        unsigned int granularity : 4;
+///        unsigned int reserved3 : 4;
+///        unsigned int tileX : 12;
+///        unsigned int level : 4;
+///        unsigned int reserved4 : 16;
+///    };
+///
+/// The granularity indicates the size of texel groups that are represented by an 8x8 bitmask. For
+/// example, a granularity of 12 indicates texel groups that are 128x64 texels in size. In a
+/// footprint call, the returned granularity will either be the actual granularity of the result, or
+/// 0 if the footprint call was able to honor the requested granularity (the usual case).
+///
+/// level is the mip level of the returned footprint. Two footprint calls are needed to get the
+/// complete footprint when a texture call spans multiple mip levels.
+///
+/// mask is an 8x8 bitmask of texel groups that are covered, or partially covered, by the footprint.
+/// tileX and tileY give the starting position of the mask in 8x8 texel-group blocks.  For example,
+/// suppose a granularity of 12 (128x64 texels), and tileX=3 and tileY=4. In this case, bit 0 of the
+/// mask (the low order bit) corresponds to texel group coordinates (3*8, 4*8), and texel
+/// coordinates (3*8*128, 4*8*64), within the specified mip level.
+///
+/// If nonzero, dx and dy specify a "toroidal rotation" of the bitmask.  Toroidal rotation of a
+/// coordinate in the mask simply means that its value is reduced by 8.  Continuing the example from
+/// above, if dx=0 and dy=0 the mask covers texel groups (3*8, 4*8) to (3*8+7, 4*8+7) inclusive.
+/// If, on the other hand, dx=2, the rightmost 2 columns in the mask have their x coordinates
+/// reduced by 8, and similarly for dy.
+///
+/// See the OptiX SDK for sample code that illustrates how to unpack the result.
+///
+/// Available anywhere
+static __forceinline__ __device__ uint4 optixTexFootprint2D( unsigned long long tex, unsigned int texInfo, float x, float y, unsigned int* singleMipLevel );
+
+/// optixTexFootprint2DLod calculates the footprint of a corresponding 2D texture fetch (tex2DLod)
+/// \param[in] tex      CUDA texture object (cast to 64-bit integer)
+/// \param[in] texInfo  Texture info packed into 32-bit integer (layout described in #optixTexFootprint2D).
+/// \param[in] x        Texture coordinate
+/// \param[in] y        Texture coordinate
+/// \param[in] level    Level of detail (lod)
+/// \param[in] coarse   Requests footprint from coarse miplevel, when the footprint spans two levels.
+/// \param[out] singleMipLevel  Result indicating whether the footprint spans only a single miplevel.
+/// \see #optixTexFootprint2D(unsigned long long,unsigned int,float,float,unsigned int*)
+///
+/// Available anywhere
+static __forceinline__ __device__ uint4
+optixTexFootprint2DLod( unsigned long long tex, unsigned int texInfo, float x, float y, float level, bool coarse, unsigned int* singleMipLevel );
+
+/// optixTexFootprint2DGrad calculates the footprint of a corresponding 2D texture fetch (tex2DGrad)
+/// \param[in] tex      CUDA texture object (cast to 64-bit integer)
+/// \param[in] texInfo  Texture info packed into 32-bit integer (layout described in #optixTexFootprint2D).
+/// \param[in] x        Texture coordinate
+/// \param[in] y        Texture coordinate
+/// \param[in] dPdx_x   Derivative of x coordinate, which determines level of detail.
+/// \param[in] dPdx_y   Derivative of x coordinate, which determines level of detail.
+/// \param[in] dPdy_x   Derivative of y coordinate, which determines level of detail.
+/// \param[in] dPdy_y   Derivative of y coordinate, which determines level of detail.
+/// \param[in] coarse   Requests footprint from coarse miplevel, when the footprint spans two levels.
+/// \param[out] singleMipLevel  Result indicating whether the footprint spans only a single miplevel.
+/// \see #optixTexFootprint2D(unsigned long long,unsigned int,float,float,unsigned int*)
+///
+/// Available anywhere
+static __forceinline__ __device__ uint4 optixTexFootprint2DGrad( unsigned long long tex,
+                                                                 unsigned int       texInfo,
+                                                                 float              x,
+                                                                 float              y,
+                                                                 float              dPdx_x,
+                                                                 float              dPdx_y,
+                                                                 float              dPdy_x,
+                                                                 float              dPdy_y,
+                                                                 bool               coarse,
+                                                                 unsigned int*      singleMipLevel );
+
+/**@}*/  // end group optix_device_api
+
+#define __OPTIX_INCLUDE_INTERNAL_HEADERS__
+
+#include "internal/optix_device_impl.h"
+
+
+// If you manually define OPTIX_INCLUDE_COOPERATIVE_VECTOR to override the default behavior, you must
+// set it to 0 or 1 and not simply define it with no value (which will default it to have a value of 0).
+#ifndef OPTIX_INCLUDE_COOPERATIVE_VECTOR
+#  define OPTIX_INCLUDE_COOPERATIVE_VECTOR_UNSET
+#  define OPTIX_INCLUDE_COOPERATIVE_VECTOR 1
+#endif
+
+#if OPTIX_INCLUDE_COOPERATIVE_VECTOR
+/// \addtogroup optix_device_api
+/// \defgroup optix_device_api_coop_vec Cooperative Vector
+/// \ingroup optix_device_api
+///@{
+///
+
+/// Load the vector from global memory. The memory address must be 16 byte aligned
+/// regardless of the type and number of elements in the vector.
+///
+/// Available anywhere
+template <typename VecTOut>
+static __forceinline__ __device__ VecTOut optixCoopVecLoad( CUdeviceptr ptr );
+/// Load the vector from global memory. The memory address must be 16 byte aligned
+/// regardless of the type and number of elements in the vector.
+///
+/// Available anywhere
+template <typename VecTOut, typename T>
+static __forceinline__ __device__ VecTOut optixCoopVecLoad( T* ptr );
+
+
+/// Following functions are designed to facilitate activation function evaluation between
+/// calls to optixCoopVecMatMul. Utilizing only these functions on the activation vectors
+/// will typically improve performance.
+///
+/// Available anywhere
+template <typename VecT>
+static __forceinline__ __device__ VecT optixCoopVecExp2( const VecT& vec );
+///
+/// Available anywhere
+template <typename VecT>
+static __forceinline__ __device__ VecT optixCoopVecLog2( const VecT& vec );
+///
+/// Available anywhere
+template <typename VecT>
+static __forceinline__ __device__ VecT optixCoopVecTanh( const VecT& vec );
+/// Convert from VecTIn to VecTOut. Not all conversions are supported, only integral to 16
+/// or 32-bit floating point.
+///
+/// Available anywhere
+template <typename VecTOut, typename VecTIn>
+static __forceinline__ __device__ VecTOut optixCoopVecCvt( const VecTIn& vec );
+///
+/// Available anywhere
+template <typename VecT>
+static __forceinline__ __device__ VecT optixCoopVecMin( const VecT& vecA, const VecT& vecB );
+///
+/// Available anywhere
+template <typename VecT>
+static __forceinline__ __device__ VecT optixCoopVecMin( const VecT& vecA, typename VecT::value_type B );
+///
+/// Available anywhere
+template <typename VecT>
+static __forceinline__ __device__ VecT optixCoopVecMax( const VecT& vecA, const VecT& vecB );
+///
+/// Available anywhere
+template <typename VecT>
+static __forceinline__ __device__ VecT optixCoopVecMax( const VecT& vecA, typename VecT::value_type B );
+///
+/// Available anywhere
+template <typename VecT>
+static __forceinline__ __device__ VecT optixCoopVecMul( const VecT& vecA, const VecT& vecB );
+///
+/// Available anywhere
+template <typename VecT>
+static __forceinline__ __device__ VecT optixCoopVecAdd( const VecT& vecA, const VecT& vecB );
+///
+/// Available anywhere
+template <typename VecT>
+static __forceinline__ __device__ VecT optixCoopVecSub( const VecT& vecA, const VecT& vecB );
+/// Returns result[i] = ( vecA[i] < vecB[i] ) ? 0 : 1;
+///
+/// Available anywhere
+template <typename VecT>
+static __forceinline__ __device__ VecT optixCoopVecStep( const VecT& vecA, const VecT& vecB );
+///
+/// Available anywhere
+template <typename VecT>
+static __forceinline__ __device__ VecT optixCoopVecFFMA( const VecT& vecA, const VecT& vecB, const VecT& vecC );
+
+/// Computes a vector matrix multiplication with an addition of a bias.
+///
+/// \code
+///           A * B           + C     = D
+/// Does matrix * inputVector + bias  = output
+///       [NxK]   [Kx1]         [Nx1] = [Nx1]
+/// \endcode
+///
+/// Not all combinations of inputType and matrixElementType are supported. See the
+/// following table for supported configurations.
+///
+/// inputType  | inputInterpretation | matrixElementType | biasElementType | outputType
+/// -----------|---------------------|-------------------|-----------------|-----------
+/// FLOAT16    | FLOAT16             | FLOAT16           | FLOAT16         | FLOAT16
+/// FLOAT16    | FLOAT8_E4M3         | FLOAT8_E4M3       | FLOAT16         | FLOAT16
+/// FLOAT16    | FLOAT8_E5M2         | FLOAT8_E5M2       | FLOAT16         | FLOAT16
+/// FLOAT16    | UINT8/INT8          | UINT8/INT8        | UINT32/INT32    | UINT32/INT32
+/// FLOAT32    | UINT8/INT8          | UINT8/INT8        | UINT32/INT32    | UINT32/INT32
+/// UINT8/INT8 | UINT8/INT8          | UINT8/INT8        | UINT32/INT32    | UINT32/INT32
+///
+/// If either the input or matrix is signed, then the bias and output must also be signed.
+///
+/// When matrixElementType is OPTIX_COOP_VEC_ELEM_TYPE_FLOAT8_E4M3 or
+/// OPTIX_COOP_VEC_ELEM_TYPE_FLOAT8_E5M2 the matrixLayout must be either
+/// OPTIX_COOP_VEC_MATRIX_LAYOUT_INFERENCING_OPTIMAL or
+/// OPTIX_COOP_VEC_MATRIX_LAYOUT_TRAINING_OPTIMAL.
+///
+/// When the inputVector's element type does not match the inputInterpretation,
+/// an arithmetic cast is performed on the input values to match the
+/// inputInterpretation.
+///
+/// If transpose is true, the matrix is treated as being stored transposed in memory
+/// (stored as KxN instead of NxK). Set other parameters as if the matrix was not
+/// transposed in memory. Not all matrix element types or matrix layouts support
+/// transpose. Only OPTIX_COOP_VEC_ELEM_TYPE_FLOAT16 is supported. Only
+/// OPTIX_COOP_VEC_MATRIX_LAYOUT_INFERENCING_OPTIMAL and
+/// OPTIX_COOP_VEC_MATRIX_LAYOUT_TRAINING_OPTIMAL are supported.
+///
+/// The bias pointer is assumed to not be null and may be dereferenced. If you wish to do
+/// the matrix multiply without a bias then use the overloaded version of this function
+/// that does not take the bias.
+///
+/// For row and column ordered matrix layouts, the stride will assume tight packing when
+/// rowColumnStrideInBytes is a constant immediate 0 (computed values or loaded from
+/// memory will not work). Ignored for other matrix layouts. Value must be 16 byte
+/// aligned.
+///
+/// \tparam VecTOut             Type must match biasElementType and size must match N
+/// \tparam VecTIn              Type must be i32, f16 or f32 type and size must match K
+/// \tparam inputInterpretation Must match matrixElementType
+/// \tparam matrixLayout        The layout of the matrix in memory
+/// \tparam transpose           Whether the data in memory for matrix is transposed from the specified layout
+/// \tparam N                   Must match VecTOut::size
+/// \tparam K                   Must match VecTIn::size
+/// \tparam matrixElementType   Type of elements stored in memory
+/// \tparam biasElementType     Type of elements stored in memory, must also match VecTOut::elementType
+///
+/// \param[in] inputVector
+/// \param[in] matrix                 pointer to global memory. Array of NxK elements. 64 byte aligned. Must not be modified during use.
+/// \param[in] matrixOffsetInBytes    offset to start of matrix data. Using the same value for matrix with different offsets for all layers yields more efficient execution. 64 byte aligned.
+/// \param[in] bias                   pointer to global memory. Array of N elements. 16 byte aligned. Must not be modified during use.
+/// \param[in] biasOffsetInBytes      offset to start of bias data. Using the same value for bias with different offsets for all layers yields more efficient execution. 16 byte aligned.
+/// \param[in] rowColumnStrideInBytes for row or column major matrix layouts, this identifies the stride between columns or rows.
+///
+/// Available in all OptiX program types
+template <
+    typename VecTOut,
+    typename VecTIn,
+    OptixCoopVecElemType inputInterpretation,
+    OptixCoopVecMatrixLayout matrixLayout,
+    bool transpose,
+    unsigned int N,
+    unsigned int K,
+    OptixCoopVecElemType matrixElementType,
+    OptixCoopVecElemType biasElementType>
+static __forceinline__ __device__ VecTOut optixCoopVecMatMul( const VecTIn& inputVector,
+                                                              CUdeviceptr matrix,  // 64 byte aligned, Array of KxN elements
+                                                              unsigned    matrixOffsetInBytes,  // 64 byte aligned
+                                                              CUdeviceptr bias,  // 16 byte aligned, Array of N elements
+                                                              unsigned    biasOffsetInBytes,  // 16 byte aligned
+                                                              unsigned    rowColumnStrideInBytes = 0 );
+
+/// Same as #optixCoopVecMatMul, but without the bias parameters.
+template <typename VecTOut, typename VecTIn, OptixCoopVecElemType inputInterpretation, OptixCoopVecMatrixLayout matrixLayout, bool transpose, unsigned int N, unsigned int K, OptixCoopVecElemType matrixElementType>
+static __forceinline__ __device__ VecTOut optixCoopVecMatMul( const VecTIn& inputVector,
+                                                              CUdeviceptr matrix,  // 64 byte aligned, Array of KxN elements
+                                                              unsigned matrixOffsetInBytes,  // 64 byte aligned
+                                                              unsigned rowColumnStrideInBytes = 0 );
+
+/// Performs a component-wise atomic add reduction of the vector into global memory
+/// starting at \a offsetInBytes bytes after \a outputVector.
+///
+/// VecTIn::elementType must be of type OPTIX_COOP_VEC_ELEM_TYPE_FLOAT16 or
+/// OPTIX_COOP_VEC_ELEM_TYPE_FLOAT32. The memory backed by \a outputVector + \a offsetInBytes
+/// must be large enough to accommodate VecTIn::size elements.  The type of data in
+/// \a outputVector must match VecTIn::elementType. No type conversion is performed.
+/// \a outputVector + \a offsetInBytes must be 4 byte aligned.
+///
+/// \tparam VecTIn Type of inputVector
+///
+/// \param[in] inputVector
+/// \param[in] outputVector  pointer to global memory on the device, sum with \a offsetInBytes must be a multiple of 4
+/// \param[in] offsetInBytes offset in bytes from \a outputVector, sum with \a outputVector must be a multiple of 4
+///
+/// Available in all OptiX program types
+template <typename VecTIn>
+static __forceinline__ __device__ void optixCoopVecReduceSumAccumulate( const VecTIn& inputVector,
+                                                                        CUdeviceptr   outputVector,
+                                                                        unsigned      offsetInBytes );
+
+/// Produces a matrix outer product of the input vecA and vecB ( vecA * transpose(vecB) )
+/// and does a component-wise atomic add reduction of the result into global memory
+/// starting \a offsetInBytes bytes after \a outputMatrix. The dimensions of the matrix are
+/// [VecTA::size, VecTB::size]. VecTA::elementType, VecTB::elementType and the element
+/// type of the matrix must be the same, no type conversion is performed. The element type
+/// must be OPTIX_COOP_VEC_ELEM_TYPE_FLOAT16.
+///
+/// outputMatrix + offsetInBytes must be 4B aligned, but performance may be better with
+/// 128 byte alignments.
+///
+/// The output matrix will be in matrixLayout layout, though currently only
+/// OPTIX_COOP_VEC_MATRIX_LAYOUT_TRAINING_OPTIMAL layout is supported.
+///
+/// \tparam VecTA        Type of vecA
+/// \tparam VecTB        Type of vecB
+/// \tparam matrixLayout Layout of matrix stored in outputMatrix
+///
+/// \param [in] vecA
+/// \param [in] vecB
+/// \param [in] outputMatrix           pointer to global memory on the device, sum with \a offsetInBytes must be a multiple of 4
+/// \param [in] offsetInBytes          offset in bytes from \a outputMatrix, sum with \a outputMatrix must be a multiple of 4
+/// \param [in] rowColumnStrideInBytes stride between rows or columns, zero takes natural stride, ignored for optimal layouts
+///
+/// Available in all OptiX program types
+template <typename VecTA, typename VecTB, OptixCoopVecMatrixLayout matrixLayout = OPTIX_COOP_VEC_MATRIX_LAYOUT_TRAINING_OPTIMAL>
+static __forceinline__ __device__ void optixCoopVecOuterProductAccumulate( const VecTA& vecA,
+                                                                           const VecTB& vecB,
+                                                                           CUdeviceptr  outputMatrix,
+                                                                           unsigned     offsetInBytes,
+                                                                           unsigned     rowColumnStrideInBytes = 0 );
+
+/// This function is intended strictly for matrix layouts that must be computed through
+/// the host API, #optixCoopVecMatrixComputeSize, but is needed on the device. For optimal
+/// performance the offsets to each layer in a network should be constant, so this
+/// function can be used to facilitate calculating the offset for subsequent layers in
+/// shader code. It can also be used for calculating the size of row and column major
+/// matrices, but the rowColumnStrideInBytes template parameter must be specified, so that
+/// it can be calculated during compilation.
+///
+/// For row and column ordered matrix layouts, when rowColumnStrideInBytes is 0, the
+/// stride will assume tight packing.
+///
+/// Results will be rounded to the next multiple of 64 to make it easy to pack the
+/// matrices in memory and have the correct alignment.
+///
+/// Results are in number of bytes, and should match the output of the host function
+/// #optixCoopVecMatrixComputeSize.
+///
+/// \tparam N, K        dimensions of the matrix
+/// \tparam elementType Type of the matrix elements
+/// \tparam layout      Layout of the matrix
+///
+/// Available anywhere
+template <unsigned int N, unsigned int K, OptixCoopVecElemType elementType, OptixCoopVecMatrixLayout layout = OPTIX_COOP_VEC_MATRIX_LAYOUT_INFERENCING_OPTIMAL, unsigned int rowColumnStrideInBytes = 0>
+static __forceinline__ __device__ unsigned int optixCoopVecGetMatrixSize();
+
+/// Fixed-size vector of N elements of type T. The API does not require the use of this
+/// class specifically, but any replacement must provide the interface spelled out by the
+/// public members of this class. Note that not all types of T are supported: only 8 and
+/// 32 bit signed and unsigned integral types along with 16 and 32 bit floating point values.
+template <typename T, unsigned int N>
+class OptixCoopVec
+{
+  public:
+    static const unsigned int size = N;  ///< Number of elements, fixed at compile time by N.
+    using value_type               = T;  ///< Type of each element.
+    // Default construction assigns nothing to m_data; the one-argument form copies val into every element.
+    __forceinline__ __device__ OptixCoopVec() {}
+    __forceinline__ __device__ OptixCoopVec( const value_type& val )
+    {
+        for( unsigned int i = 0; i < size; ++i )
+            m_data[i]       = val;
+    }
+    __forceinline__ __device__ const value_type& operator[]( unsigned int index ) const { return m_data[index]; }
+    __forceinline__ __device__ value_type& operator[]( unsigned int index ) { return m_data[index]; }
+    // operator[] above indexes m_data with no range check; data() below returns the array itself.
+    __forceinline__ __device__ const value_type* data() const { return m_data; }
+    __forceinline__ __device__ value_type* data() { return m_data; }
+    // Storage is protected rather than private, so derived classes can access it.
+  protected:
+    value_type m_data[size];  // the size elements, stored contiguously
+};
+
+/**@}*/  // end group optix_device_api
+
+#include "internal/optix_device_impl_coop_vec.h"
+
+#endif //  OPTIX_INCLUDE_COOPERATIVE_VECTOR
+
+#ifdef OPTIX_INCLUDE_COOPERATIVE_VECTOR_UNSET
+#  undef OPTIX_INCLUDE_COOPERATIVE_VECTOR
+#  undef OPTIX_INCLUDE_COOPERATIVE_VECTOR_UNSET
+#endif
+
+
+#endif  // OPTIX_OPTIX_DEVICE_H
diff --git a/crtx/optix_9.1/optix_function_table.h b/crtx/optix_9.1/optix_function_table.h
new file mode 100644
index 0000000..04ae85c
--- /dev/null
+++ b/crtx/optix_9.1/optix_function_table.h
@@ -0,0 +1,444 @@
+/*
+* SPDX-FileCopyrightText: Copyright (c) 2019 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+* SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+*
+* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+* property and proprietary rights in and to this material, related
+* documentation and any modifications thereto. Any use, reproduction,
+* disclosure or distribution of this material and related documentation
+* without an express license agreement from NVIDIA CORPORATION or
+* its affiliates is strictly prohibited.
+*/
+/// @file
+/// @author NVIDIA Corporation
+/// @brief  OptiX public API header
+
+#ifndef OPTIX_OPTIX_FUNCTION_TABLE_H
+#define OPTIX_OPTIX_FUNCTION_TABLE_H
+
+/// The OptiX ABI version.
+#define OPTIX_ABI_VERSION 118
+
+#ifndef OPTIX_DEFINE_ABI_VERSION_ONLY
+
+#include "optix_types.h"
+
+#if !defined( OPTIX_DONT_INCLUDE_CUDA )
+// If OPTIX_DONT_INCLUDE_CUDA is defined, cuda driver types must be defined through other
+// means before including optix headers.
+#include <cuda.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup optix_function_table Function Table
+/// \brief OptiX Function Table
+
+/** \addtogroup optix_function_table
+@{
+*/
+
+/// The function table containing all API functions.
+///
+/// See #optixInit() and #optixInitWithHandle().
+typedef struct OptixFunctionTable
+{
+    /// \name Error handling
+    //@ {
+
+    /// See ::optixGetErrorName().
+    const char* ( *optixGetErrorName )( OptixResult result );
+
+    /// See ::optixGetErrorString().
+    const char* ( *optixGetErrorString )( OptixResult result );
+
+    //@ }
+    /// \name Device context
+    //@ {
+
+    /// See ::optixDeviceContextCreate().
+    OptixResult ( *optixDeviceContextCreate )( CUcontext fromContext, const OptixDeviceContextOptions* options, OptixDeviceContext* context );
+
+    /// See ::optixDeviceContextDestroy().
+    OptixResult ( *optixDeviceContextDestroy )( OptixDeviceContext context );
+
+    /// See ::optixDeviceContextGetProperty().
+    OptixResult ( *optixDeviceContextGetProperty )( OptixDeviceContext context, OptixDeviceProperty property, void* value, size_t sizeInBytes );
+
+    /// See ::optixDeviceContextSetLogCallback().
+    OptixResult ( *optixDeviceContextSetLogCallback )( OptixDeviceContext context,
+                                                       OptixLogCallback   callbackFunction,
+                                                       void*              callbackData,
+                                                       unsigned int       callbackLevel );
+
+    /// See ::optixDeviceContextSetCacheEnabled().
+    OptixResult ( *optixDeviceContextSetCacheEnabled )( OptixDeviceContext context, int enabled );
+
+    /// See ::optixDeviceContextSetCacheLocation().
+    OptixResult ( *optixDeviceContextSetCacheLocation )( OptixDeviceContext context, const char* location );
+
+    /// See ::optixDeviceContextSetCacheDatabaseSizes().
+    OptixResult ( *optixDeviceContextSetCacheDatabaseSizes )( OptixDeviceContext context, size_t lowWaterMark, size_t highWaterMark );
+
+    /// See ::optixDeviceContextGetCacheEnabled().
+    OptixResult ( *optixDeviceContextGetCacheEnabled )( OptixDeviceContext context, int* enabled );
+
+    /// See ::optixDeviceContextGetCacheLocation().
+    OptixResult ( *optixDeviceContextGetCacheLocation )( OptixDeviceContext context, char* location, size_t locationSize );
+
+    /// See ::optixDeviceContextGetCacheDatabaseSizes().
+    OptixResult ( *optixDeviceContextGetCacheDatabaseSizes )( OptixDeviceContext context, size_t* lowWaterMark, size_t* highWaterMark );
+
+    //@ }
+    /// \name Modules
+    //@ {
+
+    /// See ::optixModuleCreate().
+    OptixResult ( *optixModuleCreate )( OptixDeviceContext                 context,
+                                        const OptixModuleCompileOptions*   moduleCompileOptions,
+                                        const OptixPipelineCompileOptions* pipelineCompileOptions,
+                                        const char*                        input,
+                                        size_t                             inputSize,
+                                        char*                              logString,
+                                        size_t*                            logStringSize,
+                                        OptixModule*                       module );
+
+    /// See ::optixModuleCreateWithTasks().
+    OptixResult ( *optixModuleCreateWithTasks )( OptixDeviceContext                 context,
+                                                 const OptixModuleCompileOptions*   moduleCompileOptions,
+                                                 const OptixPipelineCompileOptions* pipelineCompileOptions,
+                                                 const char*                        input,
+                                                 size_t                             inputSize,
+                                                 char*                              logString,
+                                                 size_t*                            logStringSize,
+                                                 OptixModule*                       module,
+                                                 OptixTask*                         firstTask );
+
+    /// See ::optixModuleGetCompilationState().
+    OptixResult ( *optixModuleGetCompilationState )( OptixModule module, OptixModuleCompileState* state );
+
+    /// See ::optixModuleCancelCreation().
+    OptixResult ( *optixModuleCancelCreation )( OptixModule module, OptixCreationFlags flags );
+
+	  OptixResult ( *optixStub )( void );
+
+    /// See ::optixDeviceContextCancelCreations().
+    OptixResult ( *optixDeviceContextCancelCreations )( OptixDeviceContext context, OptixCreationFlags flags );
+
+    /// See ::optixModuleDestroy().
+    OptixResult ( *optixModuleDestroy )( OptixModule module );
+
+    /// See ::optixBuiltinISModuleGet().
+    OptixResult( *optixBuiltinISModuleGet )( OptixDeviceContext                 context,
+                                             const OptixModuleCompileOptions*   moduleCompileOptions,
+                                             const OptixPipelineCompileOptions* pipelineCompileOptions,
+                                             const OptixBuiltinISOptions*       builtinISOptions,
+                                             OptixModule*                       builtinModule);
+
+    //@ }
+    /// \name Tasks
+    //@ {
+
+    /// See ::optixTaskExecute().
+    OptixResult ( *optixTaskExecute )( OptixTask     task,
+                                       OptixTask*    additionalTasks,
+                                       unsigned int  maxNumAdditionalTasks,
+                                       unsigned int* numAdditionalTasksCreated );
+
+    /// See ::optixTaskGetSerializationKey().
+    OptixResult ( *optixTaskGetSerializationKey )( OptixTask task, void* key, size_t* size );
+
+    /// See ::optixTaskSerializeOutput().
+    OptixResult ( *optixTaskSerializeOutput )( OptixTask task, void* data, size_t* size );
+
+    /// See ::optixTaskDeserializeOutput().
+    OptixResult ( *optixTaskDeserializeOutput )( OptixTask     task,
+                                                 const void*   data,
+                                                 size_t        size,
+                                                 OptixTask*    additionalTasks,
+                                                 unsigned int  maxNumAdditionalTasks,
+                                                 unsigned int* numAdditionalTasksCreated );
+
+    //@ }
+    /// \name Program groups
+    //@ {
+
+    /// See ::optixProgramGroupCreate().
+    OptixResult ( *optixProgramGroupCreate )( OptixDeviceContext              context,
+                                              const OptixProgramGroupDesc*    programDescriptions,
+                                              unsigned int                    numProgramGroups,
+                                              const OptixProgramGroupOptions* options,
+                                              char*                           logString,
+                                              size_t*                         logStringSize,
+                                              OptixProgramGroup*              programGroups );
+
+    /// See ::optixProgramGroupDestroy().
+    OptixResult ( *optixProgramGroupDestroy )( OptixProgramGroup programGroup );
+
+    /// See ::optixProgramGroupGetStackSize().
+    OptixResult ( *optixProgramGroupGetStackSize )( OptixProgramGroup programGroup, OptixStackSizes* stackSizes, OptixPipeline pipeline );
+
+    //@ }
+    /// \name Pipeline
+    //@ {
+
+    /// See ::optixPipelineCreate().
+    OptixResult ( *optixPipelineCreate )( OptixDeviceContext                 context,
+                                          const OptixPipelineCompileOptions* pipelineCompileOptions,
+                                          const OptixPipelineLinkOptions*    pipelineLinkOptions,
+                                          const OptixProgramGroup*           programGroups,
+                                          unsigned int                       numProgramGroups,
+                                          char*                              logString,
+                                          size_t*                            logStringSize,
+                                          OptixPipeline*                     pipeline );
+
+    /// See ::optixPipelineDestroy().
+    OptixResult ( *optixPipelineDestroy )( OptixPipeline pipeline );
+
+    /// See ::optixPipelineSetStackSizeFromCallDepths().
+    OptixResult ( *optixPipelineSetStackSizeFromCallDepths )( OptixPipeline pipeline,
+                                                              unsigned int  maxTraceDepth,
+                                                              unsigned int  maxContinuationCallableDepth,
+                                                              unsigned int  maxDirectCallableDepthFromState,
+                                                              unsigned int  maxDirectCallableDepthFromTraversal,
+                                                              unsigned int  maxTraversableGraphDepth);
+
+    /// See ::optixPipelineSetStackSize().
+    OptixResult ( *optixPipelineSetStackSize )( OptixPipeline pipeline,
+                                                unsigned int  directCallableStackSizeFromTraversal,
+                                                unsigned int  directCallableStackSizeFromState,
+                                                unsigned int  continuationStackSize,
+                                                unsigned int  maxTraversableGraphDepth );
+
+    OptixResult ( *optixPipelineSymbolMemcpyAsync )( OptixPipeline                 pipeline,
+                                                     const char*                   name,
+                                                     void*                         mem,
+                                                     size_t                        sizeInBytes,
+                                                     size_t                        offsetInBytes,
+                                                     OptixPipelineSymbolMemcpyKind kind,
+                                                     CUstream                      stream );
+
+    //@ }
+    /// \name Acceleration structures
+    //@ {
+
+    /// See ::optixAccelComputeMemoryUsage().
+    OptixResult ( *optixAccelComputeMemoryUsage )( OptixDeviceContext            context,
+                                                   const OptixAccelBuildOptions* accelOptions,
+                                                   const OptixBuildInput*        buildInputs,
+                                                   unsigned int                  numBuildInputs,
+                                                   OptixAccelBufferSizes*        bufferSizes );
+
+    /// See ::optixAccelBuild().
+    OptixResult ( *optixAccelBuild )( OptixDeviceContext            context,
+                                      CUstream                      stream,
+                                      const OptixAccelBuildOptions* accelOptions,
+                                      const OptixBuildInput*        buildInputs,
+                                      unsigned int                  numBuildInputs,
+                                      CUdeviceptr                   tempBuffer,
+                                      size_t                        tempBufferSizeInBytes,
+                                      CUdeviceptr                   outputBuffer,
+                                      size_t                        outputBufferSizeInBytes,
+                                      OptixTraversableHandle*       outputHandle,
+                                      const OptixAccelEmitDesc*     emittedProperties,
+                                      unsigned int                  numEmittedProperties );
+
+    /// See ::optixAccelGetRelocationInfo().
+    OptixResult ( *optixAccelGetRelocationInfo )( OptixDeviceContext context, OptixTraversableHandle handle, OptixRelocationInfo* info );
+
+
+    /// See ::optixCheckRelocationCompatibility().
+    OptixResult ( *optixCheckRelocationCompatibility )( OptixDeviceContext         context,
+                                                        const OptixRelocationInfo* info,
+                                                        int*                       compatible );
+
+    /// See ::optixAccelRelocate().
+    OptixResult ( *optixAccelRelocate )( OptixDeviceContext         context,
+                                         CUstream                   stream,
+                                         const OptixRelocationInfo* info,
+                                         const OptixRelocateInput*  relocateInputs,
+                                         size_t                     numRelocateInputs,
+                                         CUdeviceptr                targetAccel,
+                                         size_t                     targetAccelSizeInBytes,
+                                         OptixTraversableHandle*    targetHandle );
+
+
+    /// See ::optixAccelCompact().
+    OptixResult ( *optixAccelCompact )( OptixDeviceContext      context,
+                                        CUstream                stream,
+                                        OptixTraversableHandle  inputHandle,
+                                        CUdeviceptr             outputBuffer,
+                                        size_t                  outputBufferSizeInBytes,
+                                        OptixTraversableHandle* outputHandle );
+
+    OptixResult ( *optixAccelEmitProperty )( OptixDeviceContext        context,
+                                             CUstream                  stream,
+                                             OptixTraversableHandle    handle,
+                                             const OptixAccelEmitDesc* emittedProperty );
+
+    /// See ::optixConvertPointerToTraversableHandle().
+    OptixResult ( *optixConvertPointerToTraversableHandle )( OptixDeviceContext      onDevice,
+                                                             CUdeviceptr             pointer,
+                                                             OptixTraversableType    traversableType,
+                                                             OptixTraversableHandle* traversableHandle );
+
+    /// See ::optixOpacityMicromapArrayComputeMemoryUsage().
+    OptixResult ( *optixOpacityMicromapArrayComputeMemoryUsage )( OptixDeviceContext                         context,
+                                                                  const OptixOpacityMicromapArrayBuildInput* buildInput,
+                                                                  OptixMicromapBufferSizes*                 bufferSizes );
+
+    /// See ::optixOpacityMicromapArrayBuild().
+    OptixResult ( *optixOpacityMicromapArrayBuild )( OptixDeviceContext                         context,
+                                                     CUstream                                   stream,
+                                                     const OptixOpacityMicromapArrayBuildInput* buildInput,
+                                                     const OptixMicromapBuffers*               buffers );
+
+    /// See ::optixOpacityMicromapArrayGetRelocationInfo().
+    OptixResult ( *optixOpacityMicromapArrayGetRelocationInfo )( OptixDeviceContext   context,
+                                                                 CUdeviceptr          opacityMicromapArray,
+                                                                 OptixRelocationInfo* info );
+
+    /// See ::optixOpacityMicromapArrayRelocate().
+    OptixResult ( *optixOpacityMicromapArrayRelocate )( OptixDeviceContext         context,
+                                                        CUstream                   stream,
+                                                        const OptixRelocationInfo* info,
+                                                        CUdeviceptr                targetOpacityMicromapArray,
+                                                        size_t                     targetOpacityMicromapArraySizeInBytes );
+
+    OptixResult ( *stub1 )( void );
+    OptixResult ( *stub2 )( void );
+
+    /// See ::optixClusterAccelComputeMemoryUsage().
+    OptixResult ( *optixClusterAccelComputeMemoryUsage )( OptixDeviceContext                 context,
+                                                          OptixClusterAccelBuildMode         buildMode,
+                                                          const OptixClusterAccelBuildInput* buildInput,
+                                                          OptixAccelBufferSizes*             bufferSizes );
+
+    /// See ::optixClusterAccelBuild().
+    OptixResult ( *optixClusterAccelBuild )( OptixDeviceContext                    context,
+                                             CUstream                              stream,
+                                             const OptixClusterAccelBuildModeDesc* buildModeDesc,
+                                             const OptixClusterAccelBuildInput*    buildInput,
+                                             CUdeviceptr                           argsArray,
+                                             CUdeviceptr                           argsCount,
+                                             unsigned int                          argsStrideInBytes );
+
+    //@ }
+    /// \name Launch
+    //@ {
+
+    /// See ::optixSbtRecordPackHeader().
+    OptixResult ( *optixSbtRecordPackHeader )( OptixProgramGroup programGroup, void* sbtRecordHeaderHostPointer );
+
+    /// See ::optixLaunch().
+    OptixResult ( *optixLaunch )( OptixPipeline                  pipeline,
+                                  CUstream                       stream,
+                                  CUdeviceptr                    pipelineParams,
+                                  size_t                         pipelineParamsSize,
+                                  const OptixShaderBindingTable* sbt,
+                                  unsigned int                   width,
+                                  unsigned int                   height,
+                                  unsigned int                   depth );
+
+    //@ }
+    /// \name Cooperative Vector
+    //@ {
+
+    /// See ::optixCoopVecMatrixConvert().
+    OptixResult ( *optixCoopVecMatrixConvert )( OptixDeviceContext             context,
+                                                CUstream                       stream,
+                                                unsigned int                   numNetworks,
+                                                const OptixNetworkDescription* inputNetworkDescription,
+                                                CUdeviceptr                    inputNetworks,
+                                                size_t                         inputNetworkStrideInBytes,
+                                                const OptixNetworkDescription* outputNetworkDescription,
+                                                CUdeviceptr                    outputNetworks,
+                                                size_t                         outputNetworkStrideInBytes );
+
+    /// See ::optixCoopVecMatrixComputeSize().
+    OptixResult ( *optixCoopVecMatrixComputeSize )( OptixDeviceContext       context,
+                                                    unsigned int             N,
+                                                    unsigned int             K,
+                                                    OptixCoopVecElemType     elementType,
+                                                    OptixCoopVecMatrixLayout layout,
+                                                    size_t                   rowColumnStrideInBytes,
+                                                    size_t*                  sizeInBytes );
+
+    //@ }
+    /// \name Denoiser
+    //@ {
+
+    /// See ::optixDenoiserCreate().
+    OptixResult ( *optixDenoiserCreate )( OptixDeviceContext context, OptixDenoiserModelKind modelKind, const OptixDenoiserOptions* options, OptixDenoiser* returnHandle );
+
+    /// See ::optixDenoiserDestroy().
+    OptixResult ( *optixDenoiserDestroy )( OptixDenoiser handle );
+
+    /// See ::optixDenoiserComputeMemoryResources().
+    OptixResult ( *optixDenoiserComputeMemoryResources )( const OptixDenoiser handle,
+                                                          unsigned int        maximumInputWidth,
+                                                          unsigned int        maximumInputHeight,
+                                                          OptixDenoiserSizes* returnSizes );
+
+    /// See ::optixDenoiserSetup().
+    OptixResult ( *optixDenoiserSetup )( OptixDenoiser denoiser,
+                                         CUstream      stream,
+                                         unsigned int  inputWidth,
+                                         unsigned int  inputHeight,
+                                         CUdeviceptr   state,
+                                         size_t        stateSizeInBytes,
+                                         CUdeviceptr   scratch,
+                                         size_t        scratchSizeInBytes );
+
+    /// See ::optixDenoiserInvoke().
+    OptixResult ( *optixDenoiserInvoke )( OptixDenoiser                   denoiser,
+                                          CUstream                        stream,
+                                          const OptixDenoiserParams*      params,
+                                          CUdeviceptr                     denoiserState,
+                                          size_t                          denoiserStateSizeInBytes,
+                                          const OptixDenoiserGuideLayer * guideLayer,
+                                          const OptixDenoiserLayer *      layers,
+                                          unsigned int                    numLayers,
+                                          unsigned int                    inputOffsetX,
+                                          unsigned int                    inputOffsetY,
+                                          CUdeviceptr                     scratch,
+                                          size_t                          scratchSizeInBytes );
+
+    /// See ::optixDenoiserComputeIntensity().
+    OptixResult ( *optixDenoiserComputeIntensity )( OptixDenoiser       handle,
+                                                    CUstream            stream,
+                                                    const OptixImage2D* inputImage,
+                                                    CUdeviceptr         outputIntensity,
+                                                    CUdeviceptr         scratch,
+                                                    size_t              scratchSizeInBytes );
+
+    /// See ::optixDenoiserComputeAverageColor().
+    OptixResult ( *optixDenoiserComputeAverageColor )( OptixDenoiser       handle,
+                                                       CUstream            stream,
+                                                       const OptixImage2D* inputImage,
+                                                       CUdeviceptr         outputAverageColor,
+                                                       CUdeviceptr         scratch,
+                                                       size_t              scratchSizeInBytes );
+
+    /// See ::optixDenoiserCreateWithUserModel().
+    OptixResult ( *optixDenoiserCreateWithUserModel )( OptixDeviceContext context, const void * data, size_t dataSizeInBytes, OptixDenoiser* returnHandle );
+    //@ }
+
+} OptixFunctionTable;
+
+// define global function table variable with ABI specific name.
+#define OPTIX_CONCATENATE_ABI_VERSION(prefix, macro) OPTIX_CONCATENATE_ABI_VERSION_IMPL(prefix, macro)
+#define OPTIX_CONCATENATE_ABI_VERSION_IMPL(prefix, macro) prefix ## _ ## macro
+#define OPTIX_FUNCTION_TABLE_SYMBOL OPTIX_CONCATENATE_ABI_VERSION(g_optixFunctionTable, OPTIX_ABI_VERSION)
+
+/**@}*/  // end group optix_function_table
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* OPTIX_DEFINE_ABI_VERSION_ONLY */
+
+#endif /* OPTIX_OPTIX_FUNCTION_TABLE_H */
diff --git a/crtx/optix_9.1/optix_function_table_definition.h b/crtx/optix_9.1/optix_function_table_definition.h
new file mode 100644
index 0000000..5432fd7
--- /dev/null
+++ b/crtx/optix_9.1/optix_function_table_definition.h
@@ -0,0 +1,59 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2019 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/// @file
+/// @author NVIDIA Corporation
+/// @brief  OptiX public API header
+
+#ifndef OPTIX_OPTIX_FUNCTION_TABLE_DEFINITION_H
+#define OPTIX_OPTIX_FUNCTION_TABLE_DEFINITION_H
+
+#include "optix_function_table.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** \addtogroup optix_function_table
+@{
+*/
+
+/// If the stubs in optix_stubs.h are used, then the function table needs to be defined in exactly
+/// one translation unit. This can be achieved by including this header file in that translation
+/// unit.
+OptixFunctionTable OPTIX_FUNCTION_TABLE_SYMBOL;
+
+/**@}*/  // end group optix_function_table
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // OPTIX_OPTIX_FUNCTION_TABLE_DEFINITION_H
diff --git a/crtx/optix_9.1/optix_host.h b/crtx/optix_9.1/optix_host.h
new file mode 100644
index 0000000..9a8b543
--- /dev/null
+++ b/crtx/optix_9.1/optix_host.h
@@ -0,0 +1,1225 @@
+/*
+* SPDX-FileCopyrightText: Copyright (c) 2010 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+* SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+*
+* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+* property and proprietary rights in and to this material, related
+* documentation and any modifications thereto. Any use, reproduction,
+* disclosure or distribution of this material and related documentation
+* without an express license agreement from NVIDIA CORPORATION or
+* its affiliates is strictly prohibited.
+*/
+/// @file
+/// @author NVIDIA Corporation
+/// @brief  OptiX public API header
+///
+/// OptiX host include file -- includes the host api if compiling host code.
+/// For the math library routines include optix_math.h
+
+#ifndef OPTIX_OPTIX_HOST_H
+#define OPTIX_OPTIX_HOST_H
+
+/// Mixing multiple SDKs in a single application will result in symbol collisions.
+/// To enable different compilation units to use different SDKs, use OPTIX_ENABLE_SDK_MIXING.
+#ifndef OPTIXAPI
+# ifdef OPTIX_ENABLE_SDK_MIXING
+#   define OPTIXAPI static
+# else  // OPTIX_ENABLE_SDK_MIXING
+#   ifdef __cplusplus
+#     define OPTIXAPI extern "C"
+#   else  // __cplusplus
+#     define OPTIXAPI
+#   endif  // __cplusplus
+# endif  // OPTIX_ENABLE_SDK_MIXING
+#endif  // OPTIXAPI
+
+#include "optix_types.h"
+#if !defined( OPTIX_DONT_INCLUDE_CUDA )
+// If OPTIX_DONT_INCLUDE_CUDA is defined, cuda driver types must be defined through other
+// means before including optix headers.
+#include <cuda.h>
+#endif
+
+
+
+/// \defgroup optix_host_api Host API
+/// \brief OptiX Host API
+
+/// \defgroup optix_host_api_error_handling Error handling
+/// \ingroup optix_host_api
+///@{
+
+/// Returns a string containing the name of an error code in the enum.
+///
+/// Output is a string representation of the enum.  For example "OPTIX_SUCCESS" for
+/// OPTIX_SUCCESS and "OPTIX_ERROR_INVALID_VALUE" for OPTIX_ERROR_INVALID_VALUE.
+///
+/// If the error code is not recognized, "Unrecognized OptixResult code" is returned.
+///
+/// \param[in] result  OptixResult enum to generate string name for
+///
+/// \see #optixGetErrorString
+OPTIXAPI const char* optixGetErrorName( OptixResult result );
+
+/// Returns the description string for an error code.
+///
+/// Output is a string description of the enum.  For example "Success" for
+/// OPTIX_SUCCESS and "Invalid value" for OPTIX_ERROR_INVALID_VALUE.
+///
+/// If the error code is not recognized, "Unrecognized OptixResult code" is returned.
+///
+/// \param[in] result  OptixResult enum to generate string description for
+///
+/// \see #optixGetErrorName
+OPTIXAPI const char* optixGetErrorString( OptixResult result );
+
+///@}
+/// \defgroup optix_host_api_device_context Device context
+/// \ingroup optix_host_api
+///@{
+
+/// Create a device context associated with the CUDA context specified with 'fromContext'.
+///
+/// If zero is specified for 'fromContext', OptiX will use the current CUDA context. The
+/// CUDA context should be initialized before calling optixDeviceContextCreate.
+///
+/// \param[in] fromContext
+/// \param[in] options
+/// \param[out] context
+/// \return
+/// - OPTIX_ERROR_CUDA_NOT_INITIALIZED
+///   If using zero for 'fromContext' and CUDA has not been initialized yet on the calling
+///   thread.
+/// - OPTIX_ERROR_CUDA_ERROR
+///   CUDA operation failed.
+/// - OPTIX_ERROR_HOST_OUT_OF_MEMORY
+///   Heap allocation failed.
+/// - OPTIX_ERROR_INTERNAL_ERROR
+///   Internal error
+OPTIXAPI OptixResult optixDeviceContextCreate( CUcontext fromContext, const OptixDeviceContextOptions* options, OptixDeviceContext* context );
+
+/// Destroys all CPU and GPU state associated with the device.
+///
+/// It will attempt to block on CUDA streams that have launch work outstanding.
+///
+/// Any API objects, such as OptixModule and OptixPipeline, not already destroyed will be
+/// destroyed.
+///
+/// Thread safety: A device context must not be destroyed while it is still in use by concurrent API calls in other threads.
+OPTIXAPI OptixResult optixDeviceContextDestroy( OptixDeviceContext context );
+
+/// Query properties of a device context.
+///
+/// \param[in] context     the device context to query the property for
+/// \param[in] property    the property to query
+/// \param[out] value      pointer to the returned value
+/// \param[in] sizeInBytes size of output
+OPTIXAPI OptixResult optixDeviceContextGetProperty( OptixDeviceContext context, OptixDeviceProperty property, void* value, size_t sizeInBytes );
+
+/// Sets the current log callback method.
+///
+/// See #OptixLogCallback for more details.
+///
+/// Thread safety: It is guaranteed that the callback itself (callbackFunction and callbackData) are updated atomically.
+/// It is not guaranteed that the callback itself (callbackFunction and callbackData) and the callbackLevel are updated
+/// atomically. It is unspecified when concurrent API calls using the same context start to make use of the new
+/// callback method.
+///
+/// \param[in] context          the device context
+/// \param[in] callbackFunction the callback function to call
+/// \param[in] callbackData     pointer to data passed to callback function while invoking it
+/// \param[in] callbackLevel    callback level
+OPTIXAPI OptixResult optixDeviceContextSetLogCallback( OptixDeviceContext context,
+                                                       OptixLogCallback   callbackFunction,
+                                                       void*              callbackData,
+                                                       unsigned int       callbackLevel );
+
+/// Enables or disables the disk cache.
+///
+/// If caching was previously disabled, enabling it will attempt to initialize
+/// the disk cache database using the currently configured cache location. An
+/// error will be returned if initialization fails.
+///
+/// Note that no in-memory cache is used, so no caching behavior will be observed if the disk cache
+/// is disabled.
+///
+/// The cache can be disabled by setting the environment variable OPTIX_CACHE_MAXSIZE=0.
+/// The environment variable takes precedence over this setting.
+/// See #optixDeviceContextSetCacheDatabaseSizes for additional information.
+///
+/// Note that the disk cache can be disabled by the environment variable, but it cannot be enabled
+/// via the environment if it is disabled via the API.
+///
+/// \param[in] context the device context
+/// \param[in] enabled 1 to enable, 0 to disable
+OPTIXAPI OptixResult optixDeviceContextSetCacheEnabled( OptixDeviceContext context, int enabled );
+
+/// Sets the location of the disk cache.
+///
+/// The location is specified by a directory. This directory should not be used for other purposes
+/// and will be created if it does not exist. An error will be returned if it is not possible to
+/// create the disk cache at the specified location for any reason (e.g., the path is invalid or
+/// the directory is not writable). Caching will be disabled if the disk cache cannot be
+/// initialized in the new location. If caching is disabled, no error will be returned until caching
+/// is enabled. If the disk cache is located on a network file share, behavior is undefined.
+///
+/// The location of the disk cache can be overridden with the environment variable OPTIX_CACHE_PATH.
+/// The environment variable takes precedence over this setting.
+///
+/// The default location depends on the operating system:
+/// - Windows: %LOCALAPPDATA%\\NVIDIA\\OptixCache
+/// - Linux:   /var/tmp/OptixCache_\<username\> (or /tmp/OptixCache_\<username\> if the first choice is not usable),
+///            the underscore and username suffix are omitted if the username cannot be obtained
+/// - MacOS X: /Library/Application Support/NVIDIA/OptixCache
+///
+/// \param[in] context  the device context
+/// \param[in] location directory of disk cache
+OPTIXAPI OptixResult optixDeviceContextSetCacheLocation( OptixDeviceContext context, const char* location );
+
+/// Sets the low and high water marks for disk cache garbage collection.
+///
+/// Garbage collection is triggered when a new entry is written to the cache and
+/// the current cache data size plus the size of the cache entry that is about
+/// to be inserted exceeds the high water mark. Garbage collection proceeds until
+/// the size reaches the low water mark. Garbage collection will always free enough
+/// space to insert the new entry without exceeding the low water mark. Setting
+/// either limit to zero will disable garbage collection. An error will be returned
+/// if both limits are non-zero and the high water mark is smaller than the low water mark.
+///
+/// Note that garbage collection is performed only on writes to the disk cache. No garbage
+/// collection is triggered on disk cache initialization or immediately when calling this function,
+/// but on subsequent inserting of data into the database.
+///
+/// If the size of a compiled module exceeds the value configured for the high water
+/// mark and garbage collection is enabled, the module will not be added to the cache
+/// and a warning will be added to the log.
+///
+/// The high water mark can be overridden with the environment variable OPTIX_CACHE_MAXSIZE.
+/// The environment variable takes precedence over the function parameters.  The low water mark
+/// will be set to half the value of OPTIX_CACHE_MAXSIZE.  Setting OPTIX_CACHE_MAXSIZE to 0 will
+/// disable the disk cache, but will not alter the contents of the cache.  Negative and non-integer
+/// values will be ignored.
+///
+/// \param[in] context       the device context
+/// \param[in] lowWaterMark  the low water mark
+/// \param[in] highWaterMark the high water mark
+OPTIXAPI OptixResult optixDeviceContextSetCacheDatabaseSizes( OptixDeviceContext context, size_t lowWaterMark, size_t highWaterMark );
+
+/// Indicates whether the disk cache is enabled or disabled.
+///
+/// \param[in] context   the device context
+/// \param[out] enabled  1 if enabled, 0 if disabled
+OPTIXAPI OptixResult optixDeviceContextGetCacheEnabled( OptixDeviceContext context, int* enabled );
+/// Returns the location of the disk cache.  If the cache has been disabled by setting the environment
+/// variable OPTIX_CACHE_MAXSIZE=0, this function will return an empty string.
+///
+/// \param[in] context      the device context
+/// \param[out] location    directory of disk cache, null terminated if locationSize > 0
+/// \param[in] locationSize locationSize
+OPTIXAPI OptixResult optixDeviceContextGetCacheLocation( OptixDeviceContext context, char* location, size_t locationSize );
+
+/// Returns the low and high water marks for disk cache garbage collection.  If the cache has been disabled by
+/// setting the environment variable OPTIX_CACHE_MAXSIZE=0, this function will return 0 for the low and high
+/// water marks.
+///
+/// \param[in] context        the device context
+/// \param[out] lowWaterMark  the low water mark
+/// \param[out] highWaterMark the high water mark
+OPTIXAPI OptixResult optixDeviceContextGetCacheDatabaseSizes( OptixDeviceContext context, size_t* lowWaterMark, size_t* highWaterMark );
+
+///@}
+/// \defgroup optix_host_api_pipelines Pipelines
+/// \ingroup optix_host_api
+///@{
+
+/// logString is an optional buffer that contains compiler feedback and errors.  This
+/// information is also passed to the context logger (if enabled), however it may be
+/// difficult to correlate output to the logger to specific API invocations when using
+/// multiple threads.  The output to logString will only contain feedback for this specific
+/// invocation of this API call.
+///
+/// logStringSize as input should be a pointer to the number of bytes backing logString.
+/// Upon return it contains the length of the log message (including the null terminator)
+/// which may be greater than the input value.  In this case, the log message will be
+/// truncated to fit into logString.
+///
+/// If logString or logStringSize are NULL, no output is written to logString.  If
+/// logStringSize points to a value that is zero, no output is written.  This does not
+/// affect output to the context logger if enabled.
+///
+/// \param[in] context
+/// \param[in] pipelineCompileOptions
+/// \param[in] pipelineLinkOptions
+/// \param[in] programGroups          array of ProgramGroup objects
+/// \param[in] numProgramGroups       number of ProgramGroup objects
+/// \param[out] logString             Information will be written to this string. If logStringSize > 0 logString will be null terminated.
+/// \param[in,out] logStringSize
+/// \param[out] pipeline
+OPTIXAPI OptixResult optixPipelineCreate( OptixDeviceContext                 context,
+                                          const OptixPipelineCompileOptions* pipelineCompileOptions,
+                                          const OptixPipelineLinkOptions*    pipelineLinkOptions,
+                                          const OptixProgramGroup*           programGroups,
+                                          unsigned int                       numProgramGroups,
+                                          char*                              logString,
+                                          size_t*                            logStringSize,
+                                          OptixPipeline*                     pipeline );
+
+/// Thread safety: A pipeline must not be destroyed while it is still in use by concurrent API calls in other threads.
+OPTIXAPI OptixResult optixPipelineDestroy( OptixPipeline pipeline );
+
+/// Sets the stack size for a pipeline based on the given depth parameters.
+///
+/// When the pipeline is created the stack sizes for a pipeline are configured based on the depth values
+/// that were given in the OptixPipelineLinkOptions. This method allows reconfiguring the pipeline
+/// to new values for the maximum trace depth and the maximum callable depths which includes a recalculation
+/// of the stack sizes.
+///
+/// \param[in]  pipeline                             The pipeline to set the stack size for.
+/// \param[in]  maxTraceDepth                        The maximum trace recursion depth. See #OptixPipelineLinkOptions::maxTraceDepth.
+/// \param[in]  maxContinuationCallableDepth         The maximum depth of continuation callable call graphs. See #OptixPipelineLinkOptions::maxContinuationCallableDepth.
+/// \param[in]  maxDirectCallableDepthFromState      The maximum depth of direct callable call graphs called from RG, CH, MS or CC. See #OptixPipelineLinkOptions::maxDirectCallableDepthFromState.
+/// \param[in]  maxDirectCallableDepthFromTraversal  The maximum depth of direct callable call graphs called from IS or AH. See #OptixPipelineLinkOptions::maxDirectCallableDepthFromTraversal.
+/// \param[in]  maxTraversableGraphDepth             The maximum depth of a traversable graph passed to trace.
+OPTIXAPI OptixResult optixPipelineSetStackSizeFromCallDepths( OptixPipeline pipeline,
+                                                              unsigned int  maxTraceDepth,
+                                                              unsigned int  maxContinuationCallableDepth,
+                                                              unsigned int  maxDirectCallableDepthFromState,
+                                                              unsigned int  maxDirectCallableDepthFromTraversal,
+                                                              unsigned int  maxTraversableGraphDepth );
+
+/// Sets the stack sizes for a pipeline.
+///
+/// Users are encouraged to see the programming guide and the implementations of the helper functions
+/// to understand how to construct the stack sizes based on their particular needs.
+///
+/// If this method is not used, an internal default implementation is used. The default implementation is correct (but
+/// not necessarily optimal) as long as the maximum depth of call trees of CC programs is at most 2, and no DC programs or motion transforms are used.
+///
+/// The maxTraversableGraphDepth corresponds to the maximal number of traversables visited when calling trace.
+/// Every acceleration structure and motion transform count as one level of traversal.
+/// E.g., for a simple IAS (instance acceleration structure) -> GAS (geometry acceleration structure)
+/// traversal graph, the maxTraversableGraphDepth is two.
+/// For IAS -> MT (motion transform) -> GAS, the maxTraversableGraphDepth is three.
+/// Note that it does not matter whether an IAS or GAS has motion or not, it always counts as one.
+/// Launching optix with exceptions turned on (see #OPTIX_EXCEPTION_FLAG_TRACE_DEPTH) will throw an exception
+/// if the specified maxTraversableGraphDepth is too small.
+///
+/// \param[in] pipeline                             The pipeline to configure the stack size for.
+/// \param[in] directCallableStackSizeFromTraversal The direct stack size requirement for direct callables invoked from IS or AH.
+/// \param[in] directCallableStackSizeFromState     The direct stack size requirement for direct callables invoked from RG, MS, or CH.
+/// \param[in] continuationStackSize                The continuation stack requirement.
+/// \param[in] maxTraversableGraphDepth             The maximum depth of a traversable graph passed to trace.
+OPTIXAPI OptixResult optixPipelineSetStackSize( OptixPipeline pipeline,
+                                                unsigned int  directCallableStackSizeFromTraversal,
+                                                unsigned int  directCallableStackSizeFromState,
+                                                unsigned int  continuationStackSize,
+                                                unsigned int  maxTraversableGraphDepth );
+
+/// Copies data from or to a global symbol in the pipeline.
+/// Depending on the given kind of the copy operation, the mem parameter acts as the source or the
+/// target of the operation.
+/// The sizeInBytes parameter determines how many bytes are copied.
+/// The offsetInBytes parameter determines the offset in bytes in the target memory.
+/// 
+/// \param[in]  pipeline                       The pipeline to get the symbol address from/to.
+/// \param[in]  name                           The name of the symbol to copy data from/to.
+/// \param[in]  mem                            The memory where to copy data from/to. Depending on the kind of the copy operation this is either a host or a device pointer.
+/// \param[in]  sizeInBytes                    The amount of bytes to copy.
+/// \param[in]  offsetInBytes                  The offset in the symbol's memory to copy the data from/to.
+/// \param[in]  kind                           A flag that determines the direction of the copy operation.
+/// \param[in]  stream                         The CUstream to execute the asynchronous operation in.
+OPTIXAPI OptixResult optixPipelineSymbolMemcpyAsync( OptixPipeline                 pipeline,
+                                                     const char*                   name,
+                                                     void*                         mem,
+                                                     size_t                        sizeInBytes,
+                                                     size_t                        offsetInBytes,
+                                                     OptixPipelineSymbolMemcpyKind kind,
+                                                     CUstream                      stream );
+
+///@}
+/// \defgroup optix_host_api_modules Modules
+/// \ingroup optix_host_api
+///@{
+
+/// Compiling programs into a module. These programs can be passed in as either PTX or OptiX-IR.
+///
+/// See the Programming Guide for details, as well as how to generate these encodings from CUDA sources.
+///
+/// logString is an optional buffer that contains compiler feedback and errors.  This
+/// information is also passed to the context logger (if enabled), however it may be
+/// difficult to correlate output to the logger to specific API invocations when using
+/// multiple threads.  The output to logString will only contain feedback for this specific
+/// invocation of this API call.
+///
+/// logStringSize as input should be a pointer to the number of bytes backing logString.
+/// Upon return it contains the length of the log message (including the null terminator)
+/// which may be greater than the input value.  In this case, the log message will be
+/// truncated to fit into logString.
+///
+/// If logString or logStringSize are NULL, no output is written to logString.  If
+/// logStringSize points to a value that is zero, no output is written.  This does not
+/// affect output to the context logger if enabled.
+///
+/// \param[in] context
+/// \param[in] moduleCompileOptions
+/// \param[in] pipelineCompileOptions All modules in a pipeline need to use the same values for the pipeline compile options.
+/// \param[in] input                  Pointer to the input code.
+/// \param[in] inputSize              Parsing proceeds up to inputSize characters. Or, when reading PTX input, the first NUL byte, whichever occurs first.
+/// \param[out] logString             Information will be written to this string. If logStringSize > 0 logString will be null terminated.
+/// \param[in,out] logStringSize
+/// \param[out] module
+///
+/// \return OPTIX_ERROR_INVALID_VALUE - context is 0, moduleCompileOptions is 0, pipelineCompileOptions is 0, input is 0, module is 0.
+OPTIXAPI OptixResult optixModuleCreate( OptixDeviceContext                 context,
+                                        const OptixModuleCompileOptions*   moduleCompileOptions,
+                                        const OptixPipelineCompileOptions* pipelineCompileOptions,
+                                        const char*                        input,
+                                        size_t                             inputSize,
+                                        char*                              logString,
+                                        size_t*                            logStringSize,
+                                        OptixModule*                       module );
+
+/// This function is designed to do just enough work to create the OptixTask return
+/// parameter and is expected to be fast enough to run without needing parallel execution. A
+/// single thread could generate all the OptixTask objects for further processing in a
+/// work pool.
+///
+/// Options are similar to #optixModuleCreate(), aside from the return parameter,
+/// firstTask.
+///
+/// The memory used to hold the input should be live until all tasks are finished.
+///
+/// It is illegal to call #optixModuleDestroy() if any OptixTask objects are currently
+/// being executed. In that case OPTIX_ERROR_ILLEGAL_DURING_TASK_EXECUTE will be returned.
+///
+/// If an invocation of optixTaskExecute fails, the OptixModule will be marked as
+/// OPTIX_MODULE_COMPILE_STATE_IMPENDING_FAILURE if there are outstanding tasks or
+/// OPTIX_MODULE_COMPILE_STATE_FAILURE if there are no outstanding tasks. Subsequent calls
+/// to #optixTaskExecute() may execute additional work to collect compilation errors
+/// generated from the input. Currently executing tasks will not necessarily be terminated
+/// immediately but at the next opportunity.
+///
+/// Logging will continue to be directed to the logger installed with the
+/// OptixDeviceContext. If logString is provided to #optixModuleCreateWithTasks(),
+/// it will contain all the compiler feedback from all executed tasks. The lifetime of the
+/// memory pointed to by logString should extend from calling
+/// #optixModuleCreateWithTasks() to when the compilation state is either
+/// OPTIX_MODULE_COMPILE_STATE_FAILED or OPTIX_MODULE_COMPILE_STATE_COMPLETED. OptiX will
+/// not write to the logString outside of execution of
+/// #optixModuleCreateWithTasks() or #optixTaskExecute(). If the compilation state
+/// is OPTIX_MODULE_COMPILE_STATE_IMPENDING_FAILURE and no further execution of
+/// #optixTaskExecute() is performed the logString may be reclaimed by the application
+/// before calling #optixModuleDestroy(). The contents of logString will contain output
+/// from currently completed tasks.
+///
+/// All OptixTask objects associated with a given OptixModule will be cleaned up when
+/// #optixModuleDestroy() is called regardless of whether the compilation was successful
+/// or not. If the compilation state is OPTIX_MODULE_COMPILE_STATE_IMPENDING_FAILURE, any
+/// unstarted OptixTask objects do not need to be executed though there is no harm doing
+/// so.
+///
+/// \see #optixModuleCreate
+OPTIXAPI OptixResult optixModuleCreateWithTasks( OptixDeviceContext                 context,
+                                                 const OptixModuleCompileOptions*   moduleCompileOptions,
+                                                 const OptixPipelineCompileOptions* pipelineCompileOptions,
+                                                 const char*                        input,
+                                                 size_t                             inputSize,
+                                                 char*                              logString,
+                                                 size_t*                            logStringSize,
+                                                 OptixModule*                       module,
+                                                 OptixTask*                         firstTask );
+
+/// When creating a module with tasks, the current state of the module can be queried
+/// using this function.
+///
+/// Thread safety: Safe to call from any thread until optixModuleDestroy is called.
+///
+/// \see #optixModuleCreateWithTasks
+OPTIXAPI OptixResult optixModuleGetCompilationState( OptixModule module, OptixModuleCompileState* state );
+
+/// Used to cancel task-based module creation. A canceled module will transition to
+/// OPTIX_MODULE_COMPILE_STATE_IMPENDING_FAILURE if there are unfinished tasks that
+/// have been returned to the user, or OPTIX_MODULE_COMPILE_STATE_FAILED if all
+/// returned tasks have finished executing, at which point it should be treated as
+/// any other module that has failed compilation. The user may continue executing
+/// tasks of a canceled module, they will simply return OPTIX_ERROR_CREATION_CANCELED
+/// without performing any compilation and without creating new tasks.
+///
+/// Conditionally blocks (see #OptixCreationFlags)
+///
+/// Thread safety: Safe to call from any thread
+OPTIXAPI OptixResult optixModuleCancelCreation( OptixModule module, OptixCreationFlags flags );
+
+
+/// Used to cancel creation of all modules associated with an OptixDeviceContext.
+/// Conditionally blocks (see #OptixCreationFlags)
+///
+/// Thread safety: Safe to call from any thread
+OPTIXAPI OptixResult optixDeviceContextCancelCreations( OptixDeviceContext context, OptixCreationFlags flags );
+
+/// Call for OptixModule objects created with optixModuleCreate and optixModuleDeserialize.
+///
+/// Modules must not be destroyed while they are still used by any program group.
+///
+/// Thread safety: A module must not be destroyed while it is still in use by concurrent API calls in other threads.
+OPTIXAPI OptixResult optixModuleDestroy( OptixModule module );
+
+/// Returns a module containing the intersection program for the built-in primitive type specified
+/// by the builtinISOptions.  This module must be used as the moduleIS for the OptixProgramGroupHitgroup
+/// in any SBT record for that primitive type.  (The entryFunctionNameIS should be null.)
+OPTIXAPI OptixResult optixBuiltinISModuleGet( OptixDeviceContext                 context,
+                                              const OptixModuleCompileOptions*   moduleCompileOptions,
+                                              const OptixPipelineCompileOptions* pipelineCompileOptions,
+                                              const OptixBuiltinISOptions*       builtinISOptions,
+                                              OptixModule*                       builtinModule );
+
+///@}
+/// \defgroup optix_host_api_tasks Tasks
+/// \ingroup optix_host_api
+///@{
+
+/// Each OptixTask should be executed with #optixTaskExecute(). If additional parallel
+/// work is found, new OptixTask objects will be returned in additionalTasks along with
+/// the number of additional tasks in numAdditionalTasksCreated. The parameter
+/// additionalTasks should point to a user allocated array of minimum size
+/// maxNumAdditionalTasks. OptiX can generate up to maxNumAdditionalTasks additional tasks.
+///
+/// Each task can be executed in parallel and in any order.
+///
+/// Thread safety: Safe to call from any thread until #optixModuleDestroy() is called for
+/// any associated task.
+///
+/// \see #optixModuleCreateWithTasks
+///
+/// \param[in] task the OptixTask to execute
+/// \param[out] additionalTasks pointer to array of OptixTask objects to be filled in
+/// \param[in] maxNumAdditionalTasks maximum number of additional OptixTask objects
+/// \param[out] numAdditionalTasksCreated number of OptixTask objects created by OptiX and written into additionalTasks
+OPTIXAPI OptixResult optixTaskExecute( OptixTask     task,
+                                       OptixTask*    additionalTasks,
+                                       unsigned int  maxNumAdditionalTasks,
+                                       unsigned int* numAdditionalTasksCreated );
+
+/// Retrieve the task's serialization key and its size.
+/// This function is expected to be called twice: once to get the size and once to retrieve the key after space for it has been allocated.
+/// If the size of the key is zero, the task is not serializable and should be executed through #optixTaskExecute().
+///
+/// \param[in] task the OptixTask whose key to retrieve
+/// \param[out] key characters representing the key without string-terminating '\0'. If nullptr, no output will be written
+/// \param[out] size size of the key. Will be 0 for non-serializable tasks.
+/// \return success
+OPTIXAPI OptixResult optixTaskGetSerializationKey( OptixTask task, void* key, size_t* size );
+
+/// Retrieve the serialized data of the task's output.
+/// This function is expected to be called twice: once to get the size and once to retrieve the data after space for it has been allocated.
+/// Calling #optixTaskSerializeOutput() before calling #optixTaskExecute() will return an error. Calling #optixTaskSerializeOutput()
+/// after calling #optixTaskDeserializeOutput() will return an error.
+///
+/// \param[in] task the OptixTask whose output data to retrieve
+/// \param[out] data allocated space big enough to hold the output. If nullptr, no output will be written
+/// \param[out] size size of the data. Will be 0 for non-serializable tasks.
+OPTIXAPI OptixResult optixTaskSerializeOutput( OptixTask task, void* data, size_t* size );
+
+/// Given the serialized task output, deserialize it and return potential new dependent tasks similar to #optixTaskExecute().
+/// Calling #optixTaskDeserializeOutput() on a completed (either executed or deserialized) task will return an error.
+///
+/// \param[in] task the OptixTask to be deserialized
+/// \param[in] data the serialized task's output
+/// \param[in] size the size of the serialized task's output
+/// \param[out] additionalTasks pointer to array of OptixTask objects to be filled in
+/// \param[in] maxNumAdditionalTasks maximum number of additional OptixTask objects
+/// \param[out] numAdditionalTasksCreated number of OptixTask objects created by OptiX and written into additionalTasks
+OPTIXAPI OptixResult optixTaskDeserializeOutput( OptixTask     task,
+                                                 const void*   data,
+                                                 size_t        size,
+                                                 OptixTask*    additionalTasks,
+                                                 unsigned int  maxNumAdditionalTasks,
+                                                 unsigned int* numAdditionalTasksCreated );
+
+///@}
+/// \defgroup optix_host_api_program_groups Program groups
+/// \ingroup optix_host_api
+///@{
+
+/// Returns the stack sizes for the given program group. When programs in this \p programGroup are relying on external functions,
+/// the corresponding stack sizes can only be correctly retrieved when all functions are known after linking, i.e. when a pipeline
+/// has been created. When \p pipeline is set to NULL, the stack size will be calculated excluding external functions. In this case
+/// a warning will be issued if external functions are referenced by the OptixModule.
+///
+/// \param[in] programGroup the program group
+/// \param[out] stackSizes  the corresponding stack sizes
+/// \param[in] pipeline     considering the program group within the given pipeline, can be NULL
+OPTIXAPI OptixResult optixProgramGroupGetStackSize( OptixProgramGroup programGroup, OptixStackSizes* stackSizes, OptixPipeline pipeline );
+
+/// logString is an optional buffer that contains compiler feedback and errors.  This
+/// information is also passed to the context logger (if enabled), however it may be
+/// difficult to correlate output to the logger to specific API invocations when using
+/// multiple threads.  The output to logString will only contain feedback for this specific
+/// invocation of this API call.
+///
+/// logStringSize as input should be a pointer to the number of bytes backing logString.
+/// Upon return it contains the length of the log message (including the null terminator)
+/// which may be greater than the input value.  In this case, the log message will be
+/// truncated to fit into logString.
+///
+/// If logString or logStringSize are NULL, no output is written to logString.  If
+/// logStringSize points to a value that is zero, no output is written.  This does not
+/// affect output to the context logger if enabled.
+///
+/// Creates numProgramGroups OptixProgramGroup objects from the specified
+/// OptixProgramGroupDesc array.  The size of the arrays must match.
+///
+/// \param[in] context
+/// \param[in] programDescriptions    N * OptixProgramGroupDesc
+/// \param[in] numProgramGroups       N
+/// \param[in] options
+/// \param[out] logString             Information will be written to this string. If logStringSize > 0 logString will be null terminated.
+/// \param[in,out] logStringSize
+/// \param[out] programGroups
+OPTIXAPI OptixResult optixProgramGroupCreate( OptixDeviceContext              context,
+                                              const OptixProgramGroupDesc*    programDescriptions,
+                                              unsigned int                    numProgramGroups,
+                                              const OptixProgramGroupOptions* options,
+                                              char*                           logString,
+                                              size_t*                         logStringSize,
+                                              OptixProgramGroup*              programGroups );
+
+/// Thread safety: A program group must not be destroyed while it is still in use by concurrent API calls in other threads.
+OPTIXAPI OptixResult optixProgramGroupDestroy( OptixProgramGroup programGroup );
+
+/// \param[in]  programGroup               the program group containing the program(s)
+/// \param[out] sbtRecordHeaderHostPointer  the result sbt record header
+OPTIXAPI OptixResult optixSbtRecordPackHeader( OptixProgramGroup programGroup, void* sbtRecordHeaderHostPointer );
+
+///@}
+/// \defgroup optix_host_api_launches Launches
+/// \ingroup optix_host_api
+///@{
+
+/// Where the magic happens.
+///
+/// The stream and pipeline must belong to the same device context.  Multiple launches
+/// may be issued in parallel from multiple threads to different streams.
+///
+/// pipelineParamsSize number of bytes are copied from the device memory pointed to by
+/// pipelineParams before launch.  It is an error if pipelineParamsSize is greater than the
+/// size of the variable declared in modules and identified by
+/// OptixPipelineCompileOptions::pipelineLaunchParamsVariableName. If the launch params
+/// variable was optimized out or not found in the modules linked to the pipeline then
+/// the pipelineParams and pipelineParamsSize parameters are ignored.
+///
+/// sbt points to the shader binding table, which defines shader
+/// groupings and their resources. See the SBT spec.
+///
+/// \param[in] pipeline
+/// \param[in] stream
+/// \param[in] pipelineParams
+/// \param[in] pipelineParamsSize
+/// \param[in] sbt
+/// \param[in] width              number of elements to compute
+/// \param[in] height             number of elements to compute
+/// \param[in] depth              number of elements to compute
+///
+/// Thread safety: In the current implementation concurrent launches to the same pipeline are not
+/// supported.  Concurrent launches require separate OptixPipeline objects.
+OPTIXAPI OptixResult optixLaunch( OptixPipeline                  pipeline,
+                                  CUstream                       stream,
+                                  CUdeviceptr                    pipelineParams,
+                                  size_t                         pipelineParamsSize,
+                                  const OptixShaderBindingTable* sbt,
+                                  unsigned int                   width,
+                                  unsigned int                   height,
+                                  unsigned int                   depth );
+
+///@}
+/// \defgroup optix_host_api_acceleration_structures Acceleration structures
+/// \ingroup optix_host_api
+///@{
+
+/// \param[in] context
+/// \param[in] accelOptions   options for the accel build
+/// \param[in] buildInputs    an array of OptixBuildInput objects
+/// \param[in] numBuildInputs number of elements in buildInputs (must be at least 1)
+/// \param[out] bufferSizes   fills in buffer sizes
+OPTIXAPI OptixResult optixAccelComputeMemoryUsage( OptixDeviceContext            context,
+                                                   const OptixAccelBuildOptions* accelOptions,
+                                                   const OptixBuildInput*        buildInputs,
+                                                   unsigned int                  numBuildInputs,
+                                                   OptixAccelBufferSizes*        bufferSizes );
+
+/// \param[in] context
+/// \param[in] stream
+/// \param[in] accelOptions             accel options
+/// \param[in] buildInputs              an array of OptixBuildInput objects
+/// \param[in] numBuildInputs           must be >= 1 for GAS, and == 1 for IAS
+/// \param[in] tempBuffer               must be a multiple of OPTIX_ACCEL_BUFFER_BYTE_ALIGNMENT
+/// \param[in] tempBufferSizeInBytes
+/// \param[in] outputBuffer             must be a multiple of OPTIX_ACCEL_BUFFER_BYTE_ALIGNMENT
+/// \param[in] outputBufferSizeInBytes
+/// \param[out] outputHandle
+/// \param[in] emittedProperties         types of requested properties and output buffers
+/// \param[in] numEmittedProperties      number of post-build properties to populate (may be zero)
+OPTIXAPI OptixResult optixAccelBuild( OptixDeviceContext            context,
+                                      CUstream                      stream,
+                                      const OptixAccelBuildOptions* accelOptions,
+                                      const OptixBuildInput*        buildInputs,
+                                      unsigned int                  numBuildInputs,
+                                      CUdeviceptr                   tempBuffer,
+                                      size_t                        tempBufferSizeInBytes,
+                                      CUdeviceptr                   outputBuffer,
+                                      size_t                        outputBufferSizeInBytes,
+                                      OptixTraversableHandle*       outputHandle,
+                                      const OptixAccelEmitDesc*     emittedProperties,
+                                      unsigned int                  numEmittedProperties );
+
+/// Obtain relocation information, stored in OptixRelocationInfo, for a given context
+/// and acceleration structure's traversable handle.
+///
+/// The relocation information can be passed to optixCheckRelocationCompatibility to
+/// determine if an acceleration structure, referenced by 'handle', can be relocated to a
+/// different device's memory space (see #optixCheckRelocationCompatibility).
+///
+/// When used with optixAccelRelocate, it provides data necessary for doing the relocation.
+///
+/// If the acceleration structure data associated with 'handle' is copied multiple times,
+/// the same OptixRelocationInfo can also be used on all copies.
+///
+/// \param[in] context
+/// \param[in] handle
+/// \param[out] info
+/// \return OPTIX_ERROR_INVALID_VALUE will be returned for traversable handles that are not from
+/// acceleration structure builds.
+OPTIXAPI OptixResult optixAccelGetRelocationInfo( OptixDeviceContext context, OptixTraversableHandle handle, OptixRelocationInfo* info );
+
+/// Checks if an optix data structure built using another OptixDeviceContext (that was
+/// used to fill in 'info') is compatible with the OptixDeviceContext specified in the
+/// 'context' parameter.
+///
+/// Any device is always compatible with itself.
+///
+/// \param[in] context
+/// \param[in] info
+/// \param[out] compatible If OPTIX_SUCCESS is returned 'compatible' will have the value of either:
+/// - 0: This context is not compatible with the optix data structure associated with 'info'.
+/// - 1: This context is compatible.
+OPTIXAPI OptixResult optixCheckRelocationCompatibility( OptixDeviceContext context, const OptixRelocationInfo* info, int* compatible );
+
+/// optixAccelRelocate is called to update the acceleration structure after it has been
+/// relocated.  Relocation is necessary when the acceleration structure's location in device
+/// memory has changed.  optixAccelRelocate does not copy the memory.  This function only
+/// operates on the relocated memory whose new location is specified by 'targetAccel'.
+/// optixAccelRelocate also returns the new OptixTraversableHandle associated with
+/// 'targetAccel'.  The original memory (source) is not required to be valid, only the
+/// OptixRelocationInfo.
+///
+/// Before calling optixAccelRelocate, optixCheckRelocationCompatibility should be
+/// called to ensure the copy will be compatible with the destination device context.
+///
+/// The memory pointed to by 'targetAccel' should be allocated with the same size as the
+/// source acceleration.  Similar to the 'outputBuffer' used in optixAccelBuild, this
+/// pointer must be a multiple of OPTIX_ACCEL_BUFFER_BYTE_ALIGNMENT.
+///
+/// The memory in 'targetAccel' must be allocated as long as the accel is in use.
+///
+/// The instance traversables referenced by an IAS and the
+/// micromaps referenced by a triangle GAS may themselves require relocation.
+/// 'relocateInputs' and 'numRelocateInputs' should be used to specify the relocated
+/// traversables and micromaps. After relocation, the relocated accel will reference
+/// these relocated traversables and micromaps instead of their sources.
+/// The number of relocate inputs 'numRelocateInputs' must match the number of build
+/// inputs 'numBuildInputs' used to build the source accel. Relocation inputs
+/// correspond with build inputs used to build the source accel and should appear in
+/// the same order (see #optixAccelBuild).
+/// 'relocateInputs' and 'numRelocateInputs' may be zero, preserving any references
+/// to traversables and micromaps from the source accel.
+///
+/// \param[in] context
+/// \param[in] stream
+/// \param[in] info
+/// \param[in] relocateInputs
+/// \param[in] numRelocateInputs
+/// \param[in] targetAccel
+/// \param[in] targetAccelSizeInBytes
+/// \param[out] targetHandle
+OPTIXAPI OptixResult optixAccelRelocate( OptixDeviceContext         context,
+                                         CUstream                   stream,
+                                         const OptixRelocationInfo* info,
+                                         const OptixRelocateInput*  relocateInputs,
+                                         size_t                     numRelocateInputs,
+                                         CUdeviceptr                targetAccel,
+                                         size_t                     targetAccelSizeInBytes,
+                                         OptixTraversableHandle*    targetHandle );
+
+/// After building an acceleration structure, it can be copied in a compacted form to reduce
+/// memory.  In order to be compacted, OPTIX_BUILD_FLAG_ALLOW_COMPACTION must be supplied in
+/// OptixAccelBuildOptions::buildFlags passed to optixAccelBuild.
+///
+/// 'outputBuffer' is the pointer to where the compacted acceleration structure will be
+/// written.  This pointer must be a multiple of OPTIX_ACCEL_BUFFER_BYTE_ALIGNMENT.
+///
+/// The size of the memory specified in 'outputBufferSizeInBytes' should be at least the
+/// value computed using the OPTIX_PROPERTY_TYPE_COMPACTED_SIZE that was reported during
+/// optixAccelBuild.
+///
+/// \param[in] context
+/// \param[in] stream
+/// \param[in] inputHandle
+/// \param[in] outputBuffer
+/// \param[in] outputBufferSizeInBytes
+/// \param[out] outputHandle
+OPTIXAPI OptixResult optixAccelCompact( OptixDeviceContext      context,
+                                        CUstream                stream,
+                                        OptixTraversableHandle  inputHandle,
+                                        CUdeviceptr             outputBuffer,
+                                        size_t                  outputBufferSizeInBytes,
+                                        OptixTraversableHandle* outputHandle );
+
+/// Emit a single property after an acceleration structure was built.
+/// The result buffer of the 'emittedProperty' needs to be large enough to hold the
+/// requested property (\see #OptixAccelPropertyType).
+///
+/// \param[in] context
+/// \param[in] stream
+/// \param[in] handle
+/// \param[in] emittedProperty    type of requested property and output buffer
+OPTIXAPI OptixResult optixAccelEmitProperty( OptixDeviceContext        context,
+                                             CUstream                  stream,
+                                             OptixTraversableHandle    handle,
+                                             const OptixAccelEmitDesc* emittedProperty );
+
+/// \param[in] onDevice
+/// \param[in] pointer            pointer to traversable allocated in OptixDeviceContext. This pointer must be a multiple of OPTIX_TRANSFORM_BYTE_ALIGNMENT
+/// \param[in] traversableType    Type of OptixTraversableHandle to create
+/// \param[out] traversableHandle traversable handle. traversableHandle must be in host memory
+OPTIXAPI OptixResult optixConvertPointerToTraversableHandle( OptixDeviceContext      onDevice,
+                                                             CUdeviceptr             pointer,
+                                                             OptixTraversableType    traversableType,
+                                                             OptixTraversableHandle* traversableHandle );
+
+
+/// Determine the amount of memory necessary for an Opacity Micromap Array build.
+///
+/// \param[in] context
+/// \param[in] buildInput
+/// \param[out] bufferSizes
+OPTIXAPI OptixResult optixOpacityMicromapArrayComputeMemoryUsage( OptixDeviceContext                         context,
+                                                                  const OptixOpacityMicromapArrayBuildInput* buildInput,
+                                                                  OptixMicromapBufferSizes* bufferSizes );
+
+/// Construct an array of Opacity Micromaps.
+///
+/// Each triangle within an instance/GAS may reference one opacity micromap to give finer
+/// control over alpha behavior. An opacity micromap consists of a set of 4^N micro-triangles
+/// in a triangular uniform barycentric grid. Multiple opacity micromaps are collected (built)
+/// into an opacity micromap array with this function. Each geometry in a GAS may bind a
+/// single opacity micromap array and can use opacity micromaps from that array only.
+///
+/// Each micro-triangle within an opacity micromap can be in one of four states: Transparent,
+/// Opaque, Unknown-Transparent or Unknown-Opaque. During traversal, if a triangle with an
+/// opacity micromap attached is intersected, the opacity micromap is queried to categorize
+/// the hit as either opaque, unknown (alpha) or a miss. Geometry, ray or instance flags that
+/// modify the alpha/opaque behavior are applied _after_ this opacity micromap query.
+///
+/// The opacity micromap query may operate in 2-state mode (alpha testing) or 4-state mode (AHS culling),
+/// depending on the opacity micromap type and ray/instance flags. When operating in 2-state
+/// mode, alpha hits will not be reported, and transparent and opaque hits must be accurate.
+///
+/// \param[in] context
+/// \param[in] stream
+/// \param[in] buildInput             a single build input object referencing many opacity micromaps
+/// \param[in] buffers                the buffers used for build
+OPTIXAPI OptixResult optixOpacityMicromapArrayBuild( OptixDeviceContext                         context,
+                                                     CUstream                                   stream,
+                                                     const OptixOpacityMicromapArrayBuildInput* buildInput,
+                                                     const OptixMicromapBuffers*                buffers );
+
+/// Obtain relocation information, stored in OptixRelocationInfo, for a given context
+/// and opacity micromap array.
+///
+/// The relocation information can be passed to optixCheckRelocationCompatibility to
+/// determine if an opacity micromap array, referenced by buffers, can be relocated to a
+/// different device's memory space (see #optixCheckRelocationCompatibility).
+///
+/// When used with optixOpacityMicromapArrayRelocate, it provides data necessary for doing the relocation.
+///
+/// If the opacity micromap array data associated with 'opacityMicromapArray' is copied multiple times,
+/// the same OptixRelocationInfo can also be used on all copies.
+///
+/// \param[in]  context
+/// \param[in]  opacityMicromapArray
+/// \param[out] info
+OPTIXAPI OptixResult optixOpacityMicromapArrayGetRelocationInfo( OptixDeviceContext   context,
+                                                                 CUdeviceptr          opacityMicromapArray,
+                                                                 OptixRelocationInfo* info );
+
+/// optixOpacityMicromapArrayRelocate is called to update the opacity micromap array after it has been
+/// relocated.  Relocation is necessary when the opacity micromap array's location in device
+/// memory has changed.  optixOpacityMicromapArrayRelocate does not copy the memory.  This function only
+/// operates on the relocated memory whose new location is specified by 'targetOpacityMicromapArray'.
+/// The original memory (source) is not required to be valid, only the
+/// OptixRelocationInfo.
+///
+/// Before calling optixOpacityMicromapArrayRelocate, optixCheckRelocationCompatibility should be called
+/// to ensure the copy will be compatible with the destination device context.
+///
+/// The memory pointed to by 'targetOpacityMicromapArray' should be allocated with the same size as the
+/// source opacity micromap array.  Similar to the 'OptixMicromapBuffers::output' used in optixOpacityMicromapArrayBuild,
+/// this pointer must be a multiple of OPTIX_OPACITY_MICROMAP_ARRAY_BUFFER_BYTE_ALIGNMENT.
+///
+/// The memory in 'targetOpacityMicromapArray' must be allocated as long as the opacity micromap array is in use.
+///
+/// Note that any Acceleration Structures built using the original memory (source) as input will
+/// still be associated with this original memory. To associate existing (possibly relocated)
+/// Acceleration Structures with the relocated opacity micromap array, use optixAccelBuild
+/// to update the existing Acceleration Structures (see OPTIX_BUILD_OPERATION_UPDATE).
+///
+/// \param[in] context
+/// \param[in] stream
+/// \param[in] info
+/// \param[in] targetOpacityMicromapArray
+/// \param[in] targetOpacityMicromapArraySizeInBytes
+OPTIXAPI OptixResult optixOpacityMicromapArrayRelocate( OptixDeviceContext         context,
+                                                        CUstream                   stream,
+                                                        const OptixRelocationInfo* info,
+                                                        CUdeviceptr                targetOpacityMicromapArray,
+                                                        size_t                     targetOpacityMicromapArraySizeInBytes );
+
+
+/// Host side conservative memory computation for a subsequent optixClusterAccelBuild call with the same build mode and input.
+/// For implicit builds, the output buffer size contains the required size for holding all build outputs as specified in buildInput->maxArgsCount.
+/// For explicit builds, the output buffer size contains the required size for holding a single build output.
+/// The temp buffer of any optixClusterAccelBuild call must be at least as big as reported by optixClusterAccelComputeMemoryUsage.
+/// optixClusterAccelComputeMemoryUsage always returns 0 for OptixAccelBufferSizes::tempUpdateSizeInBytes.
+///
+/// \param[in] context
+/// \param[in] buildMode          Select the kind of output target (implicit: single buffer, explicit: per-build buffers, getSize: compact size computation for future explicit builds)
+/// \param[in] buildInput         A single input, describes the type of object to build and limits over all objects' arguments
+/// \param[out] bufferSizes
+OPTIXAPI OptixResult optixClusterAccelComputeMemoryUsage( OptixDeviceContext                 context,
+                                                          OptixClusterAccelBuildMode         buildMode,
+                                                          const OptixClusterAccelBuildInput* buildInput,
+                                                          OptixAccelBufferSizes*             bufferSizes );
+
+/// Entry point to building one type of cluster objects: a CLAS, a Cluster template, or a GAS-over-CLAS.
+/// This is an indirect build function: all build arguments are read from device memory, with only the output location, type of build and limits passed on the host.
+/// This is a multi build function: more than one object can be built at once, but only of one type. The supplied limits must bound the inputs (Args) of all builds.
+/// Output buffer size constraints for implicit and explicit builds:
+///   implicit: The output and temp buffer must be at least as big as reported by a corresponding optixClusterAccelComputeMemoryUsage call.
+///   explicit: The output buffers must be at least as big as reported by a corresponding optixClusterAccelBuild call with the getSize mode and all device data supplied.
+///             The temp buffer must be at least as big as reported by a corresponding optixClusterAccelComputeMemoryUsage call.
+///    getSize: No output buffer is used. The temp buffer must be at least as big as reported by a corresponding optixClusterAccelComputeMemoryUsage call.
+/// Consequently, calling optixClusterAccelBuild with the getSize mode and subsequently building with the explicit mode is more memory efficient, but slower compared to
+/// building with the implicit mode.
+///
+/// \param[in] context
+/// \param[in] stream
+/// \param[in] buildModeDesc      A single input, describes where to write data for the selected build mode
+/// \param[in] buildInput         A single input, describes the type of object to build and limits over all objects' arguments
+/// \param[in] argsArray          Pointer to arguments array in device memory, describes each object to build:
+///  OptixClusterAccelBuildInputTrianglesArgs when using OPTIX_CLUSTER_ACCEL_BUILD_TYPE_CLUSTERS_FROM_TRIANGLES
+///  OptixClusterAccelBuildInputTrianglesArgs when using OPTIX_CLUSTER_ACCEL_BUILD_TYPE_TEMPLATES_FROM_TRIANGLES
+///  OptixClusterAccelBuildInputGridsArgs     when using OPTIX_CLUSTER_ACCEL_BUILD_TYPE_TEMPLATES_FROM_GRIDS
+///  OptixClusterAccelBuildInputTemplatesArgs when using OPTIX_CLUSTER_ACCEL_BUILD_TYPE_CLUSTERS_FROM_TEMPLATES
+///  OptixClusterAccelBuildInputClustersArgs  when using OPTIX_CLUSTER_ACCEL_BUILD_TYPE_GASES_FROM_CLUSTERS
+/// \param[in] argsCount          Optional pointer to device memory, storing the number of objects to build, if null is provided, uses maxArgCount from buildInput
+/// \param[in] argsStrideInBytes  Optional stride of args objects, if 0 is provided, uses natural stride of Args type
+OPTIXAPI OptixResult optixClusterAccelBuild( OptixDeviceContext                    context,
+                                             CUstream                              stream,
+                                             const OptixClusterAccelBuildModeDesc* buildModeDesc,
+                                             const OptixClusterAccelBuildInput*    buildInput,
+                                             CUdeviceptr                           argsArray,
+                                             CUdeviceptr                           argsCount,
+                                             unsigned int                          argsStrideInBytes );
+
+
+///@}
+/// \defgroup optix_host_api_coop_vec Cooperative Vector
+/// \ingroup optix_host_api
+///@{
+
+/// Convert matrices from one layout and/or element type to another.
+///
+/// One use case is to convert a matrix in OPTIX_COOP_VEC_MATRIX_LAYOUT_ROW_MAJOR or
+/// OPTIX_COOP_VEC_MATRIX_LAYOUT_COLUMN_MAJOR into OPTIX_COOP_VEC_MATRIX_LAYOUT_INFERENCING_OPTIMAL.
+///
+/// The base address + offset of each matrix must be aligned to at least 64
+/// bytes. This is similar to the requirements of #optixCoopVecMatMul.
+///
+/// Type conversion is possible, but is limited. If the input elementType and output
+/// elementType are not equal, then one must be OPTIX_COOP_VEC_ELEM_TYPE_FLOAT32 or
+/// OPTIX_COOP_VEC_ELEM_TYPE_FLOAT16 and the other must be a lower-precision
+/// floating-point type. If the output elementType is OPTIX_COOP_VEC_ELEM_TYPE_FLOAT8_E4M3
+/// or OPTIX_COOP_VEC_ELEM_TYPE_FLOAT8_E5M2, then the output layout must be
+/// OPTIX_COOP_VEC_MATRIX_LAYOUT_INFERENCING_OPTIMAL or
+/// OPTIX_COOP_VEC_MATRIX_LAYOUT_TRAINING_OPTIMAL.
+///
+
+
+
+/// \param[in] context
+/// \param[in] stream
+/// \param[in] numNetworks                number of networks to convert
+/// \param[in] inputNetworkDescription    description of the input network matrix topology (one per invocation)
+/// \param[in] inputNetworks              base pointer to array of matrices that match the input topology specified in network
+/// \param[in] inputNetworkStrideInBytes  number of bytes between input networks, ignored if numNetworks is one
+/// \param[in] outputNetworkDescription   description of the output network matrix topology (one per invocation)
+/// \param[in] outputNetworks             base pointer to array of matrices that match the output topology specified in network
+/// \param[in] outputNetworkStrideInBytes number of bytes between output networks, ignored if numNetworks is one
+OPTIXAPI OptixResult optixCoopVecMatrixConvert( OptixDeviceContext             context,
+                                                CUstream                       stream,
+                                                unsigned int                   numNetworks,
+                                                const OptixNetworkDescription* inputNetworkDescription,
+                                                CUdeviceptr                    inputNetworks,
+                                                size_t                         inputNetworkStrideInBytes,
+                                                const OptixNetworkDescription* outputNetworkDescription,
+                                                CUdeviceptr                    outputNetworks,
+                                                size_t                         outputNetworkStrideInBytes );
+
+
+
+/// For row and column ordered matrix layouts, when \a rowColumnStrideInBytes is 0, the
+/// stride will assume tight packing.
+///
+/// Results will be rounded to the next multiple of 64 to make it easy to pack the
+/// matrices in memory and have the correct alignment.
+///
+/// \param[in]  context
+/// \param[in]  N
+/// \param[in]  K
+/// \param[in]  elementType
+/// \param[in]  layout
+/// \param[in]  rowColumnStrideInBytes Ignored for optimal layouts
+/// \param[out] sizeInBytes            Output size of the matrix in bytes
+OPTIXAPI OptixResult optixCoopVecMatrixComputeSize( OptixDeviceContext       context,
+                                                    unsigned int             N,
+                                                    unsigned int             K,
+                                                    OptixCoopVecElemType     elementType,
+                                                    OptixCoopVecMatrixLayout layout,
+                                                    size_t                   rowColumnStrideInBytes,
+                                                    size_t*                  sizeInBytes );
+
+
+///@}
+/// \defgroup optix_host_api_denoiser Denoiser
+/// \ingroup optix_host_api
+///@{
+
+/// Creates a denoiser object with the given options, using built-in inference models
+///
+/// 'modelKind' selects the model used for inference.
+/// Inference for the built-in models can be guided (giving hints to improve image quality) with
+/// albedo and normal vector images in the guide layer (see 'optixDenoiserInvoke').
+/// Use of these images must be enabled in 'OptixDenoiserOptions'.
+///
+/// \param[in] context
+/// \param[in] modelKind
+/// \param[in] options
+/// \param[out] denoiser
+OPTIXAPI OptixResult optixDenoiserCreate( OptixDeviceContext          context,
+                                          OptixDenoiserModelKind      modelKind,
+                                          const OptixDenoiserOptions* options,
+                                          OptixDenoiser*              denoiser );
+
+/// Creates a denoiser object with the given options, using a provided inference model
+///
+/// 'userData' and 'userDataSizeInBytes' provide a user model for inference.
+/// The memory passed in userData will be accessed only during the invocation of this function and
+/// can be freed after it returns.
+/// The user model must export only one weight set which determines both the model kind and the
+/// required set of guide images.
+///
+/// \param[in] context
+/// \param[in] userData
+/// \param[in] userDataSizeInBytes
+/// \param[out] denoiser
+OPTIXAPI OptixResult optixDenoiserCreateWithUserModel( OptixDeviceContext context,
+                                                       const void*        userData,
+                                                       size_t             userDataSizeInBytes,
+                                                       OptixDenoiser*     denoiser );
+
+/// Destroys the denoiser object and any associated host resources.
+OPTIXAPI OptixResult optixDenoiserDestroy( OptixDenoiser denoiser );
+
+/// Computes the GPU memory resources required to execute the denoiser.
+///
+/// Memory for state and scratch buffers must be allocated with the sizes in 'returnSizes' and scratch memory
+/// passed to optixDenoiserSetup, optixDenoiserInvoke,
+/// optixDenoiserComputeIntensity and optixDenoiserComputeAverageColor.
+/// For tiled denoising an overlap area ('overlapWindowSizeInPixels') must be added to each tile on all sides
+/// which increases the amount of
+/// memory needed to denoise a tile. In case of tiling use withOverlapScratchSizeInBytes for scratch memory size.
+/// If only full resolution images are denoised, withoutOverlapScratchSizeInBytes can be used which is always
+/// smaller than withOverlapScratchSizeInBytes.
+///
+/// 'outputWidth' and 'outputHeight' are the dimensions of the image to be denoised (without overlap in case tiling
+/// is being used).
+/// 'outputWidth' and 'outputHeight' must be greater than or equal to the dimensions passed to optixDenoiserSetup.
+///
+/// \param[in] denoiser
+/// \param[in] outputWidth
+/// \param[in] outputHeight
+/// \param[out] returnSizes
+OPTIXAPI OptixResult optixDenoiserComputeMemoryResources( const OptixDenoiser denoiser,
+                                                          unsigned int        outputWidth,
+                                                          unsigned int        outputHeight,
+                                                          OptixDenoiserSizes* returnSizes );
+
+/// Initializes the state required by the denoiser.
+///
+/// 'inputWidth' and 'inputHeight' must include overlap on both sides of the image if tiling is being used. The overlap is
+/// returned by #optixDenoiserComputeMemoryResources.
+/// For subsequent calls to #optixDenoiserInvoke 'inputWidth' and 'inputHeight' are the maximum dimensions
+/// of the input layers. Dimensions of the input layers passed to #optixDenoiserInvoke may be different in each
+/// invocation; however, they must always be smaller than 'inputWidth' and 'inputHeight' passed to #optixDenoiserSetup.
+///
+/// \param[in] denoiser
+/// \param[in] stream
+/// \param[in] inputWidth
+/// \param[in] inputHeight
+/// \param[in] denoiserState
+/// \param[in] denoiserStateSizeInBytes
+/// \param[in] scratch
+/// \param[in] scratchSizeInBytes
+OPTIXAPI OptixResult optixDenoiserSetup( OptixDenoiser denoiser,
+                                         CUstream      stream,
+                                         unsigned int  inputWidth,
+                                         unsigned int  inputHeight,
+                                         CUdeviceptr   denoiserState,
+                                         size_t        denoiserStateSizeInBytes,
+                                         CUdeviceptr   scratch,
+                                         size_t        scratchSizeInBytes );
+
+/// Invokes denoiser on a set of input data and produces at least one output image.
+/// State memory must be available during the execution of the
+/// denoiser (or until optixDenoiserSetup is called with a new state memory pointer).
+/// Scratch memory passed is used only for the duration of this function.
+/// Scratch and state memory must each have a size greater than or equal to the sizes as returned by
+/// optixDenoiserComputeMemoryResources.
+///
+/// 'inputOffsetX' and 'inputOffsetY' are pixel offsets in the 'inputLayers' image
+/// specifying the beginning of the image without overlap. When denoising an entire image without tiling
+/// there is no overlap and 'inputOffsetX' and 'inputOffsetY' must be zero. When denoising a tile which is
+/// adjacent to one of the four sides of the entire image the corresponding offsets must also be zero since
+/// there is no overlap at the side adjacent to the image border.
+///
+/// 'guideLayer' provides additional information to the denoiser. When providing albedo and normal vector
+/// guide images, the corresponding fields in the 'OptixDenoiserOptions' must be
+/// enabled, see #optixDenoiserCreate.
+/// 'guideLayer' must not be null. If a guide image in 'OptixDenoiserOptions' is not enabled, the
+/// corresponding image in 'OptixDenoiserGuideLayer' is ignored.
+///
+/// If OPTIX_DENOISER_MODEL_KIND_TEMPORAL or OPTIX_DENOISER_MODEL_KIND_TEMPORAL_AOV is selected, a 2d flow
+/// image must be given in 'OptixDenoiserGuideLayer'.
+/// It describes for each pixel the flow from the previous to the current frame (a 2d vector in pixel space).
+/// The denoised beauty/AOV of the previous frame must be given in 'previousOutput'.
+/// If this image is not available in the first frame of a sequence, the noisy beauty/AOV from the first frame
+/// and zero flow vectors could be given as a substitute.
+/// For non-temporal model kinds the flow image in 'OptixDenoiserGuideLayer' is ignored.
+/// 'previousOutput' and
+/// 'output' may refer to the same buffer if tiling is not used, i.e. 'previousOutput' is first read by this function and later
+/// overwritten with the denoised result. 'output' can be passed as 'previousOutput' to the next frame.
+/// In other model kinds (not temporal) 'previousOutput' is ignored.
+///
+/// The beauty layer must be given as the first entry in 'layers'.
+/// In AOV type model kinds (OPTIX_DENOISER_MODEL_KIND_AOV or in user defined models implementing
+/// kernel-prediction) additional layers for the AOV images can be given.
+/// In each layer the noisy input image is given in 'input', the denoised output is written into the
+/// 'output' image. input and output images may refer to the same buffer, with the restriction that
+/// the pixel formats must be identical for input and output when the blend mode is selected (see
+/// #OptixDenoiserParams).
+///
+/// If OPTIX_DENOISER_MODEL_KIND_TEMPORAL or OPTIX_DENOISER_MODEL_KIND_TEMPORAL_AOV is selected, the denoised
+/// image from the previous frame must be given in 'previousOutput' in the layer. 'previousOutput' and
+/// 'output' may refer to the same buffer if tiling is not used, i.e. 'previousOutput' is first read by this function and later
+/// overwritten with the denoised result. 'output' can be passed as 'previousOutput' to the next frame.
+/// In addition, 'previousOutputInternalGuideLayer' and 'outputInternalGuideLayer' must both be allocated regardless
+/// of tiling mode. The pixel format must be OPTIX_PIXEL_FORMAT_INTERNAL_GUIDE_LAYER and the dimension must be identical
+/// to the other input layers. In the first frame memory in 'previousOutputInternalGuideLayer' must either contain valid
+/// data from previous denoiser runs or be set to zero.
+/// In other model kinds (not temporal) 'previousOutput' and the internal guide layers are ignored.
+///
+/// If OPTIX_DENOISER_MODEL_KIND_TEMPORAL or OPTIX_DENOISER_MODEL_KIND_TEMPORAL_AOV is selected, the
+/// normal vector guide image must be given as 3d vectors in camera space. In the other models only
+/// the x and y channels are used and other channels are ignored.
+///
+/// \param[in] denoiser
+/// \param[in] stream
+/// \param[in] params
+/// \param[in] denoiserState
+/// \param[in] denoiserStateSizeInBytes
+/// \param[in] guideLayer
+/// \param[in] layers
+/// \param[in] numLayers
+/// \param[in] inputOffsetX
+/// \param[in] inputOffsetY
+/// \param[in] scratch
+/// \param[in] scratchSizeInBytes
+OPTIXAPI OptixResult optixDenoiserInvoke( OptixDenoiser                  denoiser,
+                                          CUstream                       stream,
+                                          const OptixDenoiserParams*     params,
+                                          CUdeviceptr                    denoiserState,
+                                          size_t                         denoiserStateSizeInBytes,
+                                          const OptixDenoiserGuideLayer* guideLayer,
+                                          const OptixDenoiserLayer*      layers,
+                                          unsigned int                   numLayers,
+                                          unsigned int                   inputOffsetX,
+                                          unsigned int                   inputOffsetY,
+                                          CUdeviceptr                    scratch,
+                                          size_t                         scratchSizeInBytes );
+
+/// Computes the logarithmic average intensity of the given image. The returned value 'outputIntensity'
+/// is multiplied with the RGB values of the input image/tile in optixDenoiserInvoke if given in the parameter
+/// OptixDenoiserParams::hdrIntensity (otherwise 'hdrIntensity' must be a null pointer). This is useful for
+/// denoising HDR images which are very dark or bright.
+/// When denoising tiles the intensity of the entire image should be computed, i.e. not per tile to get
+/// consistent results.
+///
+/// For each RGB pixel in the inputImage the intensity is calculated and summed if it is greater than 1e-8f:
+/// intensity = log(r * 0.212586f + g * 0.715170f + b * 0.072200f).
+/// The function returns 0.18 / exp(sum of intensities / number of summed pixels).
+/// More details could be found in the Reinhard tonemapping paper:
+/// http://www.cmap.polytechnique.fr/~peyre/cours/x2005signal/hdr_photographic.pdf
+///
+/// The size of scratch memory required can be queried with #optixDenoiserComputeMemoryResources.
+///
+/// data type unsigned char is not supported for 'inputImage', it must be 3 or 4 component half/float.
+///
+/// \param[in] denoiser
+/// \param[in] stream
+/// \param[in] inputImage
+/// \param[out] outputIntensity    single float
+/// \param[in] scratch
+/// \param[in] scratchSizeInBytes
+OPTIXAPI OptixResult optixDenoiserComputeIntensity( OptixDenoiser       denoiser,
+                                                    CUstream            stream,
+                                                    const OptixImage2D* inputImage,
+                                                    CUdeviceptr         outputIntensity,
+                                                    CUdeviceptr         scratch,
+                                                    size_t              scratchSizeInBytes );
+
+/// Computes the logarithmic average of each of the first three channels of the given image.
+/// When denoising tiles the intensity of the entire image should be computed, i.e. not per tile to get
+/// consistent results.
+///
+/// The size of scratch memory required can be queried with #optixDenoiserComputeMemoryResources.
+///
+/// data type unsigned char is not supported for 'inputImage', it must be 3 or 4 component half/float.
+///
+/// \param[in] denoiser
+/// \param[in] stream
+/// \param[in] inputImage
+/// \param[out] outputAverageColor three floats
+/// \param[in] scratch
+/// \param[in] scratchSizeInBytes
+OPTIXAPI OptixResult optixDenoiserComputeAverageColor( OptixDenoiser       denoiser,
+                                                       CUstream            stream,
+                                                       const OptixImage2D* inputImage,
+                                                       CUdeviceptr         outputAverageColor,
+                                                       CUdeviceptr         scratch,
+                                                       size_t              scratchSizeInBytes );
+
+///@}
+
+#include "optix_function_table.h"
+
+#endif  // OPTIX_OPTIX_HOST_H
diff --git a/crtx/optix_9.1/optix_micromap.h b/crtx/optix_9.1/optix_micromap.h
new file mode 100644
index 0000000..144c35e
--- /dev/null
+++ b/crtx/optix_9.1/optix_micromap.h
@@ -0,0 +1,76 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+* @file   optix_micromap.h
+* @author NVIDIA Corporation
+* @brief  OptiX micromap helper functions
+*
+* OptiX micromap helper functions. Useable on either host or device.
+*/
+
+#ifndef OPTIX_OPTIX_MICROMAP_H
+#define OPTIX_OPTIX_MICROMAP_H
+
+#if !defined( OPTIX_DONT_INCLUDE_CUDA )
+// If OPTIX_DONT_INCLUDE_CUDA is defined, cuda driver type float2 must be defined through other
+// means before including optix headers.
+#include <vector_types.h>
+#endif
+#include "internal/optix_micromap_impl.h"
+
+/// Converts a micromap triangle index to the three base-triangle barycentric coordinates of the micro-triangle vertices in the base triangle.
+/// The base triangle is the triangle that the micromap is applied to.
+/// Note that for displaced micro-meshes this function can be used to compute a UV mapping from sub triangle to base triangle.
+/// This is a thin public wrapper: the computation itself is done by optix_impl::micro2bary.
+/// \param[in]  micromapTriangleIndex  Index of a micro- or sub triangle within a micromap.
+/// \param[in]  subdivisionLevel       Number of subdivision levels of the micromap or number of subdivision levels being considered (for sub triangles).
+/// \param[out] baseBarycentrics0      Barycentric coordinates in the space of the base triangle of vertex 0 of the micromap triangle.
+/// \param[out] baseBarycentrics1      Barycentric coordinates in the space of the base triangle of vertex 1 of the micromap triangle.
+/// \param[out] baseBarycentrics2      Barycentric coordinates in the space of the base triangle of vertex 2 of the micromap triangle.
+OPTIX_MICROMAP_INLINE_FUNC void optixMicromapIndexToBaseBarycentrics( unsigned int micromapTriangleIndex,
+                                                                      unsigned int subdivisionLevel,
+                                                                      float2&      baseBarycentrics0,
+                                                                      float2&      baseBarycentrics1,
+                                                                      float2&      baseBarycentrics2 )
+{
+    optix_impl::micro2bary( micromapTriangleIndex, subdivisionLevel, baseBarycentrics0, baseBarycentrics1, baseBarycentrics2 );  // all arguments forwarded unchanged; presumably declared in internal/optix_micromap_impl.h
+}
+
+/// Maps barycentrics in the space of the base triangle to barycentrics of a micro triangle.
+/// The vertices of the micro triangle are defined by its barycentrics in the space of the base triangle (three entries, one per vertex).
+/// These can be queried for a DMM hit by using optixGetMicroTriangleBarycentricsData(). Returns the value computed by optix_impl::base2micro.
+OPTIX_MICROMAP_INLINE_FUNC float2 optixBaseBarycentricsToMicroBarycentrics( float2 baseBarycentrics,
+                                                                            float2 microVertexBaseBarycentrics[3] )
+{
+    return optix_impl::base2micro( baseBarycentrics, microVertexBaseBarycentrics );  // thin wrapper: both arguments forwarded unchanged
+}
+
+#endif  // OPTIX_OPTIX_MICROMAP_H
diff --git a/crtx/optix_9.1/optix_stack_size.h b/crtx/optix_9.1/optix_stack_size.h
new file mode 100644
index 0000000..3b88b30
--- /dev/null
+++ b/crtx/optix_9.1/optix_stack_size.h
@@ -0,0 +1,345 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2019 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/// @file
+/// @author NVIDIA Corporation
+/// @brief  OptiX public API header
+
+#ifndef OPTIX_OPTIX_STACK_SIZE_H
+#define OPTIX_OPTIX_STACK_SIZE_H
+
+#include "optix.h"
+
+#include <algorithm>
+#include <cstring>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** \addtogroup optix_utilities
+@{
+*/
+
+/// Retrieves direct and continuation stack sizes for each program in the program group and accumulates the upper bounds
+/// in the corresponding output variables based on the semantic type of the program. Before the first invocation of this
+/// function with a given instance of #OptixStackSizes, the members of that instance should be set to 0.
+/// If the programs rely on external functions, passing the current pipeline will consider these as well. Otherwise, a null pointer
+/// can be passed instead. When external functions are present, a warning will be issued for these cases. Returns OPTIX_ERROR_INVALID_VALUE if stackSizes is null, the error from optixProgramGroupGetStackSize if that call fails, and OPTIX_SUCCESS otherwise.
+inline OptixResult optixUtilAccumulateStackSizes( OptixProgramGroup programGroup, OptixStackSizes* stackSizes, OptixPipeline pipeline )
+{
+    if( !stackSizes )  // no accumulator to write into
+        return OPTIX_ERROR_INVALID_VALUE;
+
+    OptixStackSizes localStackSizes;  // passed by address to the query below, which is expected to fill it
+    OptixResult     result = optixProgramGroupGetStackSize( programGroup, &localStackSizes, pipeline );
+    if( result != OPTIX_SUCCESS )  // propagate the query error; *stackSizes has not been modified at this point
+        return result;
+
+    stackSizes->cssRG = std::max( stackSizes->cssRG, localStackSizes.cssRG );  // each field keeps the running maximum over all calls
+    stackSizes->cssMS = std::max( stackSizes->cssMS, localStackSizes.cssMS );
+    stackSizes->cssCH = std::max( stackSizes->cssCH, localStackSizes.cssCH );
+    stackSizes->cssAH = std::max( stackSizes->cssAH, localStackSizes.cssAH );
+    stackSizes->cssIS = std::max( stackSizes->cssIS, localStackSizes.cssIS );
+    stackSizes->cssCC = std::max( stackSizes->cssCC, localStackSizes.cssCC );
+    stackSizes->dssDC = std::max( stackSizes->dssDC, localStackSizes.dssDC );
+
+    return OPTIX_SUCCESS;
+}
+
+/// Computes the stack size values needed to configure a pipeline.
+///
+/// See the programming guide for an explanation of the formula. Returns OPTIX_ERROR_INVALID_VALUE if stackSizes is null, OPTIX_SUCCESS otherwise.
+///
+/// \param[in] stackSizes                              Accumulated stack sizes of all programs in the call graph.
+/// \param[in] maxTraceDepth                           Maximum depth of #optixTrace() calls.
+/// \param[in] maxCCDepth                              Maximum depth of call trees of continuation callables.
+/// \param[in] maxDCDepth                              Maximum depth of call trees of direct callables.
+/// \param[out] directCallableStackSizeFromTraversal   Direct stack size requirement for direct callables invoked from
+///                                                    IS or AH. May be null, in which case it is not written.
+/// \param[out] directCallableStackSizeFromState       Direct stack size requirement for direct callables invoked from
+///                                                    RG, MS, or CH. May be null, in which case it is not written.
+/// \param[out] continuationStackSize                  Continuation stack requirement. May be null, in which case it is not written.
+inline OptixResult optixUtilComputeStackSizes( const OptixStackSizes* stackSizes,
+                                               unsigned int           maxTraceDepth,
+                                               unsigned int           maxCCDepth,
+                                               unsigned int           maxDCDepth,
+                                               unsigned int*          directCallableStackSizeFromTraversal,
+                                               unsigned int*          directCallableStackSizeFromState,
+                                               unsigned int*          continuationStackSize )
+{
+    if( !stackSizes )  // the only mandatory pointer argument
+        return OPTIX_ERROR_INVALID_VALUE;
+
+    const unsigned int cssRG = stackSizes->cssRG;  // local copies keep the formula below readable
+    const unsigned int cssMS = stackSizes->cssMS;
+    const unsigned int cssCH = stackSizes->cssCH;
+    const unsigned int cssAH = stackSizes->cssAH;
+    const unsigned int cssIS = stackSizes->cssIS;
+    const unsigned int cssCC = stackSizes->cssCC;
+    const unsigned int dssDC = stackSizes->dssDC;
+
+    if( directCallableStackSizeFromTraversal )  // this variant does not split by call site: both outputs get the same value
+        *directCallableStackSizeFromTraversal = maxDCDepth * dssDC;
+    if( directCallableStackSizeFromState )
+        *directCallableStackSizeFromState = maxDCDepth * dssDC;
+
+    // upper bound on continuation stack used by call trees of continuation callables
+    unsigned int cssCCTree = maxCCDepth * cssCC;
+
+    // upper bound on continuation stack used by CH or MS programs including the call tree of
+    // continuation callables
+    unsigned int cssCHOrMSPlusCCTree = std::max( cssCH, cssMS ) + cssCCTree;
+
+    // clang-format off
+    if( continuationStackSize )
+        *continuationStackSize
+            = cssRG + cssCCTree
+            + ( std::max( maxTraceDepth, 1u ) - 1 ) * cssCHOrMSPlusCCTree  // factor is maxTraceDepth - 1, clamped so it is 0 when maxTraceDepth is 0
+            + std::min( maxTraceDepth, 1u ) * std::max( cssCHOrMSPlusCCTree, cssIS + cssAH );  // factor is 0 when maxTraceDepth is 0, otherwise 1
+    // clang-format on
+
+    return OPTIX_SUCCESS;
+}
+
+/// Computes the stack size values needed to configure a pipeline.
+///
+/// This variant is similar to #optixUtilComputeStackSizes(), except that it expects the values dssDC and
+/// maxDCDepth split by call site semantic.
+///
+/// See programming guide for an explanation of the formula.
+///
+/// \param[in] stackSizes                              Accumulated stack sizes of all programs in the call graph.
+/// \param[in] dssDCFromTraversal                      Accumulated direct stack size of all DC programs invoked from IS
+///                                                    or AH.
+/// \param[in] dssDCFromState                          Accumulated direct stack size of all DC programs invoked from RG,
+///                                                    MS, or CH.
+/// \param[in] maxTraceDepth                           Maximum depth of #optixTrace() calls.
+/// \param[in] maxCCDepth                              Maximum depth of call trees of continuation callables.
+/// \param[in] maxDCDepthFromTraversal                 Maximum depth of call trees of direct callables invoked from IS
+///                                                    or AH.
+/// \param[in] maxDCDepthFromState                     Maximum depth of call trees of direct callables invoked from RG,
+///                                                    MS, or CH.
+/// \param[out] directCallableStackSizeFromTraversal   Direct stack size requirement for direct callables invoked from
+///                                                    IS or AH.
+/// \param[out] directCallableStackSizeFromState       Direct stack size requirement for direct callables invoked from
+///                                                    RG, MS, or CH.
+/// \param[out] continuationStackSize                  Continuation stack requirement.
+inline OptixResult optixUtilComputeStackSizesDCSplit( const OptixStackSizes* stackSizes,
+                                                      unsigned int           dssDCFromTraversal,
+                                                      unsigned int           dssDCFromState,
+                                                      unsigned int           maxTraceDepth,
+                                                      unsigned int           maxCCDepth,
+                                                      unsigned int           maxDCDepthFromTraversal,
+                                                      unsigned int           maxDCDepthFromState,
+                                                      unsigned int*          directCallableStackSizeFromTraversal,
+                                                      unsigned int*          directCallableStackSizeFromState,
+                                                      unsigned int*          continuationStackSize )
+{
+    if( !stackSizes )  // the only required pointer; each output pointer below may be null
+        return OPTIX_ERROR_INVALID_VALUE;
+
+    const unsigned int cssRG = stackSizes->cssRG;
+    const unsigned int cssMS = stackSizes->cssMS;
+    const unsigned int cssCH = stackSizes->cssCH;
+    const unsigned int cssAH = stackSizes->cssAH;
+    const unsigned int cssIS = stackSizes->cssIS;
+    const unsigned int cssCC = stackSizes->cssCC;
+    // stackSizes->dssDC is not read here: dssDCFromTraversal and dssDCFromState take its place
+
+    if( directCallableStackSizeFromTraversal )
+        *directCallableStackSizeFromTraversal = maxDCDepthFromTraversal * dssDCFromTraversal;
+    if( directCallableStackSizeFromState )
+        *directCallableStackSizeFromState = maxDCDepthFromState * dssDCFromState;
+
+    // upper bound on continuation stack used by call trees of continuation callables
+    unsigned int cssCCTree = maxCCDepth * cssCC;  // unsigned product, not checked for overflow
+
+    // upper bound on continuation stack used by CH or MS programs including the call tree of
+    // continuation callables
+    unsigned int cssCHOrMSPlusCCTree = std::max( cssCH, cssMS ) + cssCCTree;
+
+    // clang-format off
+    if( continuationStackSize )
+        *continuationStackSize
+            = cssRG + cssCCTree  // RG plus one CC call tree
+            + ( std::max( maxTraceDepth, 1u ) - 1 ) * cssCHOrMSPlusCCTree  // (maxTraceDepth - 1) CH/MS levels; 0 when maxTraceDepth is 0 or 1
+            + std::min( maxTraceDepth, 1u ) * std::max( cssCHOrMSPlusCCTree, cssIS + cssAH );  // one more level, CH/MS or IS + AH, whichever is larger; 0 when maxTraceDepth is 0
+    // clang-format on
+
+    return OPTIX_SUCCESS;
+}
+
+/// Computes the stack size values needed to configure a pipeline.
+///
+/// This variant is similar to #optixUtilComputeStackSizes(), except that it expects the value cssCCTree
+/// instead of cssCC and maxCCDepth.
+///
+/// See programming guide for an explanation of the formula.
+///
+/// \param[in] stackSizes                              Accumulated stack sizes of all programs in the call graph.
+/// \param[in] cssCCTree                               Maximum stack size used by call trees of continuation callables.
+/// \param[in] maxTraceDepth                           Maximum depth of #optixTrace() calls.
+/// \param[in] maxDCDepth                              Maximum depth of call trees of direct callables.
+/// \param[out] directCallableStackSizeFromTraversal   Direct stack size requirement for direct callables invoked from
+///                                                    IS or AH.
+/// \param[out] directCallableStackSizeFromState       Direct stack size requirement for direct callables invoked from
+///                                                    RG, MS, or CH.
+/// \param[out] continuationStackSize                  Continuation stack requirement.
+inline OptixResult optixUtilComputeStackSizesCssCCTree( const OptixStackSizes* stackSizes,
+                                                        unsigned int           cssCCTree,
+                                                        unsigned int           maxTraceDepth,
+                                                        unsigned int           maxDCDepth,
+                                                        unsigned int*          directCallableStackSizeFromTraversal,
+                                                        unsigned int*          directCallableStackSizeFromState,
+                                                        unsigned int*          continuationStackSize )
+{
+    if( !stackSizes )  // the only required pointer; each output pointer below may be null
+        return OPTIX_ERROR_INVALID_VALUE;
+
+    const unsigned int cssRG = stackSizes->cssRG;
+    const unsigned int cssMS = stackSizes->cssMS;
+    const unsigned int cssCH = stackSizes->cssCH;
+    const unsigned int cssAH = stackSizes->cssAH;
+    const unsigned int cssIS = stackSizes->cssIS;
+    // stackSizes->cssCC is not read here: the caller passes the finished cssCCTree bound directly
+    const unsigned int dssDC = stackSizes->dssDC;
+
+    if( directCallableStackSizeFromTraversal )
+        *directCallableStackSizeFromTraversal = maxDCDepth * dssDC;  // same value for both call sites
+    if( directCallableStackSizeFromState )
+        *directCallableStackSizeFromState = maxDCDepth * dssDC;
+
+    // upper bound on continuation stack used by CH or MS programs including the call tree of
+    // continuation callables
+    unsigned int cssCHOrMSPlusCCTree = std::max( cssCH, cssMS ) + cssCCTree;
+
+    // clang-format off
+    if( continuationStackSize )
+        *continuationStackSize
+            = cssRG + cssCCTree  // RG plus one CC call tree
+            + ( std::max( maxTraceDepth, 1u ) - 1 ) * cssCHOrMSPlusCCTree  // (maxTraceDepth - 1) CH/MS levels; 0 when maxTraceDepth is 0 or 1
+            + std::min( maxTraceDepth, 1u ) * std::max( cssCHOrMSPlusCCTree, cssIS + cssAH );  // one more level, CH/MS or IS + AH, whichever is larger; 0 when maxTraceDepth is 0
+    // clang-format on
+
+    return OPTIX_SUCCESS;
+}
+
+/// Computes the stack size values needed to configure a pipeline.
+///
+/// This variant is a specialization of #optixUtilComputeStackSizes() for a simple path tracer with the following
+/// assumptions: There are only two ray types, camera rays and shadow rays. There are only RG, MS, and CH programs, and
+/// no AH, IS, CC, or DC programs. The camera rays invoke only the miss and closest hit programs MS1 and CH1,
+/// respectively. The CH1 program might trace shadow rays, which invoke only the miss and closest hit programs MS2 and
+/// CH2, respectively.
+///
+/// For flexibility, we allow for each of CH1 and CH2 not just one single program group, but an array of program
+/// groups, and compute the maxima of the stack size requirements per array.
+///
+/// See programming guide for an explanation of the formula.
+///
+/// If the programs rely on external functions, passing the current pipeline will consider these as well. Otherwise, a null pointer
+/// can be passed instead. When external functions are present, a warning will be issued for these cases.
+inline OptixResult optixUtilComputeStackSizesSimplePathTracer( OptixProgramGroup        programGroupRG,
+                                                               OptixProgramGroup        programGroupMS1,
+                                                               const OptixProgramGroup* programGroupCH1,
+                                                               unsigned int             programGroupCH1Count,
+                                                               OptixProgramGroup        programGroupMS2,
+                                                               const OptixProgramGroup* programGroupCH2,
+                                                               unsigned int             programGroupCH2Count,
+                                                               unsigned int* directCallableStackSizeFromTraversal,
+                                                               unsigned int* directCallableStackSizeFromState,
+                                                               unsigned int* continuationStackSize,
+                                                               OptixPipeline pipeline )
+{
+    if( !programGroupCH1 && ( programGroupCH1Count > 0 ) )  // a null array is accepted only with a zero count
+        return OPTIX_ERROR_INVALID_VALUE;
+    if( !programGroupCH2 && ( programGroupCH2Count > 0 ) )
+        return OPTIX_ERROR_INVALID_VALUE;
+
+    OptixResult result;  // the first failing query below is returned to the caller unchanged
+
+    OptixStackSizes stackSizesRG = {};
+    result                       = optixProgramGroupGetStackSize( programGroupRG, &stackSizesRG, pipeline );
+    if( result != OPTIX_SUCCESS )
+        return result;
+
+    OptixStackSizes stackSizesMS1 = {};
+    result                        = optixProgramGroupGetStackSize( programGroupMS1, &stackSizesMS1, pipeline );
+    if( result != OPTIX_SUCCESS )
+        return result;
+
+    OptixStackSizes stackSizesCH1 = {};  // stays all-zero when programGroupCH1Count is 0
+    for( unsigned int i = 0; i < programGroupCH1Count; ++i )
+    {
+        result = optixUtilAccumulateStackSizes( programGroupCH1[i], &stackSizesCH1, pipeline );
+        if( result != OPTIX_SUCCESS )
+            return result;
+    }
+
+    OptixStackSizes stackSizesMS2 = {};
+    result                        = optixProgramGroupGetStackSize( programGroupMS2, &stackSizesMS2, pipeline );
+    if( result != OPTIX_SUCCESS )
+        return result;
+
+    OptixStackSizes stackSizesCH2 = {};  // stays all-zero when programGroupCH2Count is 0
+    // accumulate over all CH2 program groups, starting from the zero-initialized value above
+    for( unsigned int i = 0; i < programGroupCH2Count; ++i )
+    {
+        result = optixUtilAccumulateStackSizes( programGroupCH2[i], &stackSizesCH2, pipeline );
+        if( result != OPTIX_SUCCESS )
+            return result;
+    }
+
+    const unsigned int cssRG  = stackSizesRG.cssRG;
+    const unsigned int cssMS1 = stackSizesMS1.cssMS;
+    const unsigned int cssCH1 = stackSizesCH1.cssCH;
+    const unsigned int cssMS2 = stackSizesMS2.cssMS;
+    const unsigned int cssCH2 = stackSizesCH2.cssCH;
+    // no AH, IS, CC, or DC programs
+
+    if( directCallableStackSizeFromTraversal )
+        *directCallableStackSizeFromTraversal = 0;  // no DC programs, so no direct-callable stack
+    if( directCallableStackSizeFromState )
+        *directCallableStackSizeFromState = 0;
+    // RG, then either MS1 or CH1; CH1 may additionally trace a shadow ray that ends in MS2 or CH2
+    if( continuationStackSize )
+        *continuationStackSize = cssRG + std::max( cssMS1, cssCH1 + std::max( cssMS2, cssCH2 ) );
+
+    return OPTIX_SUCCESS;
+}
+
+/**@}*/  // end group optix_utilities
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // OPTIX_OPTIX_STACK_SIZE_H
diff --git a/crtx/optix_9.1/optix_stubs.h b/crtx/optix_9.1/optix_stubs.h
new file mode 100644
index 0000000..17e90ea
--- /dev/null
+++ b/crtx/optix_9.1/optix_stubs.h
@@ -0,0 +1,828 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2019 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/// @file
+/// @author NVIDIA Corporation
+/// @brief  OptiX public API header
+
+#ifndef OPTIX_OPTIX_STUBS_H
+#define OPTIX_OPTIX_STUBS_H
+
+#include "optix_function_table.h"
+
+#ifdef _WIN32
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN 1
+#endif
+#include <windows.h>
+// The cfgmgr32 header is necessary for interrogating driver information in the registry.
+// For convenience the library is also linked in automatically using the #pragma command.
+#include <cfgmgr32.h>
+#pragma comment( lib, "Cfgmgr32.lib" )
+#include <string.h>
+#else
+#include <dlfcn.h>
+#endif
+
+/// Mixing multiple SDKs in a single application will result in symbol collisions.
+/// To enable different compilation units to use different SDKs, use OPTIX_ENABLE_SDK_MIXING.
+#ifndef OPTIXAPI
+# ifdef OPTIX_ENABLE_SDK_MIXING
+#   define OPTIXAPI static
+# else  // OPTIX_ENABLE_SDK_MIXING
+#   ifdef __cplusplus
+#     define OPTIXAPI extern "C"
+#   else  // __cplusplus
+#     define OPTIXAPI
+#   endif  // __cplusplus
+# endif  // OPTIX_ENABLE_SDK_MIXING
+#endif  // OPTIXAPI
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The function table needs to be defined in exactly one translation unit. This can be
+// achieved by including optix_function_table_definition.h in that translation unit.
+extern OptixFunctionTable OPTIX_FUNCTION_TABLE_SYMBOL;
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef _WIN32
+#if defined( _MSC_VER )
+// Visual Studio produces warnings suggesting strcpy and friends being replaced with _s
+// variants. All the string lengths and allocation sizes have been calculated and should
+// be safe, so we are disabling this warning to increase compatibility.
+#pragma warning( push )
+#pragma warning( disable : 4996 )
+#endif
+static void* optixLoadWindowsDllFromName( const char* optixDllName )
+{
+    void* handle = NULL;
+    // Returns a handle to the loaded DLL, or NULL when it could not be loaded from either location.
+    // First attempt: <system directory>\<optixDllName>.
+    // Get the size of the path first, then allocate
+    unsigned int size = GetSystemDirectoryA( NULL, 0 );
+    if( size == 0 )
+    {
+        // Couldn't get the system path size, so bail
+        return NULL;
+    }
+    size_t pathSize   = size + 1 + strlen( optixDllName );  // queried size + one separator + DLL name
+    char*  systemPath = (char*)malloc( pathSize );
+    if( systemPath == NULL )
+        return NULL;
+    if( GetSystemDirectoryA( systemPath, size ) != size - 1 )  // the second call is expected to report one less than the queried size
+    {
+        // Something went wrong
+        free( systemPath );
+        return NULL;
+    }
+    strcat( systemPath, "\\" );
+    strcat( systemPath, optixDllName );
+    handle = LoadLibraryA( systemPath );
+    free( systemPath );
+    if( handle )
+        return handle;
+
+    // If we didn't find it, go looking in the registry.  Since nvoptix.dll doesn't
+    // have its own registry entry, we are going to look for the opengl driver which lives
+    // next to nvoptix.dll.  0 (null) will be returned if any errors occurred.
+
+    static const char* deviceInstanceIdentifiersGUID = "{4d36e968-e325-11ce-bfc1-08002be10318}";  // class filter for the device list (display adapter setup class)
+    const ULONG        flags                         = CM_GETIDLIST_FILTER_CLASS | CM_GETIDLIST_FILTER_PRESENT;
+    ULONG              deviceListSize                = 0;
+    if( CM_Get_Device_ID_List_SizeA( &deviceListSize, deviceInstanceIdentifiersGUID, flags ) != CR_SUCCESS )
+    {
+        return NULL;
+    }
+    char* deviceNames = (char*)malloc( deviceListSize );
+    if( deviceNames == NULL )
+        return NULL;
+    if( CM_Get_Device_ID_ListA( deviceInstanceIdentifiersGUID, deviceNames, deviceListSize, flags ) )  // any non-zero result is treated as failure
+    {
+        free( deviceNames );
+        return NULL;
+    }
+    DEVINST devID   = 0;
+    char*   dllPath = NULL;
+
+    // Continue to the next device if errors are encountered.
+    for( char* deviceName = deviceNames; *deviceName; deviceName += strlen( deviceName ) + 1 )  // back-to-back NUL-terminated names; stops at an empty one
+    {
+        if( CM_Locate_DevNodeA( &devID, deviceName, CM_LOCATE_DEVNODE_NORMAL ) != CR_SUCCESS )
+        {
+            continue;
+        }
+        HKEY regKey = 0;
+        if( CM_Open_DevNode_Key( devID, KEY_QUERY_VALUE, 0, RegDisposition_OpenExisting, &regKey, CM_REGISTRY_SOFTWARE ) != CR_SUCCESS )
+        {
+            continue;
+        }
+        const char* valueName = "OpenGLDriverName";
+        DWORD       valueSize = 0;
+        LSTATUS     ret       = RegQueryValueExA( regKey, valueName, NULL, NULL, NULL, &valueSize );  // size query only: no data buffer is passed
+        if( ret != ERROR_SUCCESS )
+        {
+            RegCloseKey( regKey );
+            continue;
+        }
+        char* regValue = (char*)malloc( valueSize );
+        if( regValue == NULL )
+        {
+            RegCloseKey( regKey );
+            continue;
+        }
+        ret = RegQueryValueExA( regKey, valueName, NULL, NULL, (LPBYTE)regValue, &valueSize );
+        if( ret != ERROR_SUCCESS )
+        {
+            free( regValue );
+            RegCloseKey( regKey );
+            continue;
+        }
+        // Strip the opengl driver dll name from the string then create a new string with
+        // the path and the nvoptix.dll name
+        for( int i = (int)valueSize - 1; i >= 0 && regValue[i] != '\\'; --i )  // NOTE(review): the strlen() below relies on this leaving a NUL inside the buffer - confirm for values ending in a backslash
+            regValue[i] = '\0';  // zero everything after the last backslash, keeping the backslash itself
+        size_t newPathSize = strlen( regValue ) + strlen( optixDllName ) + 1;  // directory incl. trailing backslash + DLL name + terminator
+        dllPath            = (char*)malloc( newPathSize );
+        if( dllPath == NULL )
+        {
+            free( regValue );
+            RegCloseKey( regKey );
+            continue;
+        }
+        strcpy( dllPath, regValue );
+        strcat( dllPath, optixDllName );
+        free( regValue );
+        RegCloseKey( regKey );
+        handle = LoadLibraryA( (LPCSTR)dllPath );
+        free( dllPath );
+        if( handle )
+            break;  // stop at the first device whose driver directory yields a loadable DLL
+    }
+    free( deviceNames );
+    return handle;
+}
+#if defined( _MSC_VER )
+#pragma warning( pop )
+#endif
+
+static void* optixLoadWindowsDll( void )  // loads nvoptix.dll; NULL when it cannot be loaded
+{
+    return optixLoadWindowsDllFromName( "nvoptix.dll" );  // system directory first, then next to the OpenGL driver
+}
+#endif
+
+/// \defgroup optix_utilities Utilities
+/// \brief OptiX Utilities
+
+/** \addtogroup optix_utilities
+@{
+*/
+
+/// Loads the OptiX library and initializes the function table used by the stubs below.
+///
+/// handlePtr must not be nullptr (OPTIX_ERROR_INVALID_VALUE otherwise); the OS-specific library handle is stored in *handlePtr.
+///
+/// \see #optixUninitWithHandle
+OPTIXAPI inline OptixResult optixInitWithHandle( void** handlePtr )
+{
+    // Make sure these functions get initialized to zero in case the DLL and function
+    // table can't be loaded
+    OPTIX_FUNCTION_TABLE_SYMBOL.optixGetErrorName   = 0;
+    OPTIX_FUNCTION_TABLE_SYMBOL.optixGetErrorString = 0;
+
+    if( !handlePtr )
+        return OPTIX_ERROR_INVALID_VALUE;
+
+#ifdef _WIN32
+    *handlePtr = optixLoadWindowsDll();
+    if( !*handlePtr )
+        return OPTIX_ERROR_LIBRARY_NOT_FOUND;
+
+    void* symbol = (void*)GetProcAddress( (HMODULE)*handlePtr, "optixQueryFunctionTable" );
+    if( !symbol )
+        return OPTIX_ERROR_ENTRY_SYMBOL_NOT_FOUND;  // the library stays loaded; *handlePtr still holds its handle
+#else
+    *handlePtr = dlopen( "libnvoptix.so.1", RTLD_NOW );
+    if( !*handlePtr )
+        return OPTIX_ERROR_LIBRARY_NOT_FOUND;
+
+    void* symbol = dlsym( *handlePtr, "optixQueryFunctionTable" );
+    if( !symbol )
+        return OPTIX_ERROR_ENTRY_SYMBOL_NOT_FOUND;  // the library stays loaded; *handlePtr still holds its handle
+#endif
+
+    OptixQueryFunctionTable_t* optixQueryFunctionTable = (OptixQueryFunctionTable_t*)symbol;
+    // Ask the library to fill the function table; its result is returned to the caller unchanged.
+    return optixQueryFunctionTable( OPTIX_ABI_VERSION, 0, 0, 0, &OPTIX_FUNCTION_TABLE_SYMBOL, sizeof( OPTIX_FUNCTION_TABLE_SYMBOL ) );
+}
+
+/// Loads the OptiX library and initializes the function table used by the stubs below.
+///
+/// A variant of #optixInitWithHandle() that does not make the handle to the loaded library available.
+OPTIXAPI inline OptixResult optixInit( void )
+{
+    void* handle;  // set by optixInitWithHandle() and then discarded, so it cannot be passed to optixUninitWithHandle()
+    return optixInitWithHandle( &handle );
+}
+
+/// Unloads the OptiX library and zeros the function table used by the stubs below.  Takes the
+/// handle returned by optixInitWithHandle.  All OptixDeviceContext objects must be destroyed
+/// before calling this function, or the behavior is undefined.
+///
+/// \see #optixInitWithHandle
+OPTIXAPI inline OptixResult optixUninitWithHandle( void* handle )
+{
+    if( !handle )
+        return OPTIX_ERROR_INVALID_VALUE;
+#ifdef _WIN32
+    if( !FreeLibrary( (HMODULE)handle ) )
+        return OPTIX_ERROR_LIBRARY_UNLOAD_FAILURE;  // the function table is left as it was
+#else
+    if( dlclose( handle ) )
+        return OPTIX_ERROR_LIBRARY_UNLOAD_FAILURE;  // the function table is left as it was
+#endif
+    OptixFunctionTable empty  // zero-initialized table: "{}" when compiled as C++, "= { 0 }" when compiled as C
+#ifdef __cplusplus
+      {}
+#else
+        = { 0 }
+#endif
+        ;
+    OPTIX_FUNCTION_TABLE_SYMBOL = empty;  // reached only after the library was unloaded successfully
+    return OPTIX_SUCCESS;
+}
+
+
+/**@}*/  // end group optix_utilities
+
+#ifndef OPTIX_DOXYGEN_SHOULD_SKIP_THIS
+
+// Stub functions that forward calls to the corresponding function pointer in the function table.
+
+OPTIXAPI inline const char* optixGetErrorName( OptixResult result )
+{
+    if( OPTIX_FUNCTION_TABLE_SYMBOL.optixGetErrorName )  // use the loaded library's implementation when the table entry is set
+        return OPTIX_FUNCTION_TABLE_SYMBOL.optixGetErrorName( result );
+
+    // No table entry (library not loaded, or optixInitWithHandle() reset it): fall back to a
+    // built-in set of names covering the errors related to loading the library.
+    switch( result )
+    {
+        case OPTIX_SUCCESS:
+            return "OPTIX_SUCCESS";
+        case OPTIX_ERROR_INVALID_VALUE:
+            return "OPTIX_ERROR_INVALID_VALUE";
+        case OPTIX_ERROR_UNSUPPORTED_ABI_VERSION:
+            return "OPTIX_ERROR_UNSUPPORTED_ABI_VERSION";
+        case OPTIX_ERROR_FUNCTION_TABLE_SIZE_MISMATCH:
+            return "OPTIX_ERROR_FUNCTION_TABLE_SIZE_MISMATCH";
+        case OPTIX_ERROR_INVALID_ENTRY_FUNCTION_OPTIONS:
+            return "OPTIX_ERROR_INVALID_ENTRY_FUNCTION_OPTIONS";
+        case OPTIX_ERROR_LIBRARY_NOT_FOUND:
+            return "OPTIX_ERROR_LIBRARY_NOT_FOUND";
+        case OPTIX_ERROR_ENTRY_SYMBOL_NOT_FOUND:
+            return "OPTIX_ERROR_ENTRY_SYMBOL_NOT_FOUND";
+        case OPTIX_ERROR_LIBRARY_UNLOAD_FAILURE:
+            return "OPTIX_ERROR_LIBRARY_UNLOAD_FAILURE";
+        default:  // any other code has no built-in name in this fallback
+            return "Unknown OptixResult code";
+    }
+}
+
+OPTIXAPI inline const char* optixGetErrorString( OptixResult result )
+{
+    if( OPTIX_FUNCTION_TABLE_SYMBOL.optixGetErrorString )  // use the loaded library's implementation when the table entry is set
+        return OPTIX_FUNCTION_TABLE_SYMBOL.optixGetErrorString( result );
+
+    // No table entry (library not loaded, or optixInitWithHandle() reset it): fall back to a
+    // built-in set of descriptions covering the errors related to loading the library.
+    switch( result )
+    {
+        case OPTIX_SUCCESS:
+            return "Success";
+        case OPTIX_ERROR_INVALID_VALUE:
+            return "Invalid value";
+        case OPTIX_ERROR_UNSUPPORTED_ABI_VERSION:
+            return "Unsupported ABI version";
+        case OPTIX_ERROR_FUNCTION_TABLE_SIZE_MISMATCH:
+            return "Function table size mismatch";
+        case OPTIX_ERROR_INVALID_ENTRY_FUNCTION_OPTIONS:
+            return "Invalid options to entry function";
+        case OPTIX_ERROR_LIBRARY_NOT_FOUND:
+            return "Library not found";
+        case OPTIX_ERROR_ENTRY_SYMBOL_NOT_FOUND:
+            return "Entry symbol not found";
+        case OPTIX_ERROR_LIBRARY_UNLOAD_FAILURE:
+            return "Library could not be unloaded";
+        default:  // any other code has no built-in description in this fallback
+            return "Unknown OptixResult code";
+    }
+}
+
+OPTIXAPI inline OptixResult optixDeviceContextCreate( CUcontext fromContext, const OptixDeviceContextOptions* options, OptixDeviceContext* context )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDeviceContextCreate( fromContext, options, context );
+}
+
+OPTIXAPI inline OptixResult optixDeviceContextDestroy( OptixDeviceContext context )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDeviceContextDestroy( context );
+}
+
+OPTIXAPI inline OptixResult optixDeviceContextGetProperty( OptixDeviceContext context, OptixDeviceProperty property, void* value, size_t sizeInBytes )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDeviceContextGetProperty( context, property, value, sizeInBytes );
+}
+
+OPTIXAPI inline OptixResult optixDeviceContextSetLogCallback( OptixDeviceContext context,
+                                                              OptixLogCallback   callbackFunction,
+                                                              void*              callbackData,
+                                                              unsigned int       callbackLevel )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDeviceContextSetLogCallback( context, callbackFunction, callbackData, callbackLevel );
+}
+
+OPTIXAPI inline OptixResult optixDeviceContextSetCacheEnabled( OptixDeviceContext context, int enabled )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDeviceContextSetCacheEnabled( context, enabled );
+}
+
+OPTIXAPI inline OptixResult optixDeviceContextSetCacheLocation( OptixDeviceContext context, const char* location )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDeviceContextSetCacheLocation( context, location );
+}
+
+OPTIXAPI inline OptixResult optixDeviceContextSetCacheDatabaseSizes( OptixDeviceContext context, size_t lowWaterMark, size_t highWaterMark )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDeviceContextSetCacheDatabaseSizes( context, lowWaterMark, highWaterMark );
+}
+
+OPTIXAPI inline OptixResult optixDeviceContextGetCacheEnabled( OptixDeviceContext context, int* enabled )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDeviceContextGetCacheEnabled( context, enabled );
+}
+
+OPTIXAPI inline OptixResult optixDeviceContextGetCacheLocation( OptixDeviceContext context, char* location, size_t locationSize )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDeviceContextGetCacheLocation( context, location, locationSize );
+}
+
+OPTIXAPI inline OptixResult optixDeviceContextGetCacheDatabaseSizes( OptixDeviceContext context, size_t* lowWaterMark, size_t* highWaterMark )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDeviceContextGetCacheDatabaseSizes( context, lowWaterMark, highWaterMark );
+}
+
+OPTIXAPI inline OptixResult optixModuleCreate( OptixDeviceContext                 context,
+                                               const OptixModuleCompileOptions*   moduleCompileOptions,
+                                               const OptixPipelineCompileOptions* pipelineCompileOptions,
+                                               const char*                        input,
+                                               size_t                             inputSize,
+                                               char*                              logString,
+                                               size_t*                            logStringSize,
+                                               OptixModule*                       module )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixModuleCreate( context, moduleCompileOptions, pipelineCompileOptions, input,
+                                                          inputSize, logString, logStringSize, module );
+}
+
+OPTIXAPI inline OptixResult optixModuleCreateWithTasks( OptixDeviceContext                 context,
+                                                        const OptixModuleCompileOptions*   moduleCompileOptions,
+                                                        const OptixPipelineCompileOptions* pipelineCompileOptions,
+                                                        const char*                        input,
+                                                        size_t                             inputSize,
+                                                        char*                              logString,
+                                                        size_t*                            logStringSize,
+                                                        OptixModule*                       module,
+                                                        OptixTask*                         firstTask )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixModuleCreateWithTasks( context, moduleCompileOptions, pipelineCompileOptions, input,
+                                                                   inputSize, logString, logStringSize, module, firstTask );
+}
+
+OPTIXAPI inline OptixResult optixModuleGetCompilationState( OptixModule module, OptixModuleCompileState* state )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixModuleGetCompilationState( module, state );
+}
+
+OPTIXAPI inline OptixResult optixModuleCancelCreation( OptixModule module, OptixCreationFlags flags )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixModuleCancelCreation( module, flags );
+}
+
+
+OPTIXAPI inline OptixResult optixDeviceContextCancelCreations( OptixDeviceContext context, OptixCreationFlags flags )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDeviceContextCancelCreations( context, flags );
+}
+
+OPTIXAPI inline OptixResult optixModuleDestroy( OptixModule module )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixModuleDestroy( module );
+}
+
+OPTIXAPI inline OptixResult optixBuiltinISModuleGet( OptixDeviceContext                 context,
+                                                     const OptixModuleCompileOptions*   moduleCompileOptions,
+                                                     const OptixPipelineCompileOptions* pipelineCompileOptions,
+                                                     const OptixBuiltinISOptions*       builtinISOptions,
+                                                     OptixModule*                       builtinModule )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixBuiltinISModuleGet( context, moduleCompileOptions, pipelineCompileOptions,
+                                                                builtinISOptions, builtinModule );
+}
+
+OPTIXAPI inline OptixResult optixTaskExecute( OptixTask     task,
+                                              OptixTask*    additionalTasks,
+                                              unsigned int  maxNumAdditionalTasks,
+                                              unsigned int* numAdditionalTasksCreated )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixTaskExecute( task, additionalTasks, maxNumAdditionalTasks, numAdditionalTasksCreated );
+}
+
+OPTIXAPI inline OptixResult optixTaskGetSerializationKey( OptixTask task, void* key, size_t* size )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixTaskGetSerializationKey( task, key, size );
+}
+
+OPTIXAPI inline OptixResult optixTaskSerializeOutput( OptixTask task, void* data, size_t* size )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixTaskSerializeOutput( task, data, size );
+}
+
+OPTIXAPI inline OptixResult optixTaskDeserializeOutput( OptixTask     task,
+                                                        const void*   data,
+                                                        size_t        size,
+                                                        OptixTask*    additionalTasks,
+                                                        unsigned int  maxNumAdditionalTasks,
+                                                        unsigned int* numAdditionalTasksCreated )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixTaskDeserializeOutput( task, data, size, additionalTasks,
+                                                                   maxNumAdditionalTasks, numAdditionalTasksCreated );
+}
+
+OPTIXAPI inline OptixResult optixProgramGroupCreate( OptixDeviceContext              context,
+                                                     const OptixProgramGroupDesc*    programDescriptions,
+                                                     unsigned int                    numProgramGroups,
+                                                     const OptixProgramGroupOptions* options,
+                                                     char*                           logString,
+                                                     size_t*                         logStringSize,
+                                                     OptixProgramGroup*              programGroups )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixProgramGroupCreate( context, programDescriptions, numProgramGroups, options,
+                                                                logString, logStringSize, programGroups );
+}
+
+OPTIXAPI inline OptixResult optixProgramGroupDestroy( OptixProgramGroup programGroup )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixProgramGroupDestroy( programGroup );
+}
+
+OPTIXAPI inline OptixResult optixProgramGroupGetStackSize( OptixProgramGroup programGroup, OptixStackSizes* stackSizes, OptixPipeline pipeline )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixProgramGroupGetStackSize( programGroup, stackSizes, pipeline );
+}
+
+OPTIXAPI inline OptixResult optixPipelineCreate( OptixDeviceContext                 context,
+                                                 const OptixPipelineCompileOptions* pipelineCompileOptions,
+                                                 const OptixPipelineLinkOptions*    pipelineLinkOptions,
+                                                 const OptixProgramGroup*           programGroups,
+                                                 unsigned int                       numProgramGroups,
+                                                 char*                              logString,
+                                                 size_t*                            logStringSize,
+                                                 OptixPipeline*                     pipeline )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixPipelineCreate( context, pipelineCompileOptions, pipelineLinkOptions, programGroups,
+                                                            numProgramGroups, logString, logStringSize, pipeline );
+}
+
+OPTIXAPI inline OptixResult optixPipelineDestroy( OptixPipeline pipeline )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixPipelineDestroy( pipeline );
+}
+
+OPTIXAPI inline OptixResult optixPipelineSetStackSizeFromCallDepths( OptixPipeline pipeline,
+                                                                     unsigned int  maxTraceDepth,
+                                                                     unsigned int  maxContinuationCallableDepth,
+                                                                     unsigned int  maxDirectCallableDepthFromState,
+                                                                     unsigned int  maxDirectCallableDepthFromTraversal,
+                                                                     unsigned int  maxTraversableGraphDepth )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixPipelineSetStackSizeFromCallDepths( pipeline, maxTraceDepth, maxContinuationCallableDepth,
+                                                                                maxDirectCallableDepthFromState,
+                                                                                maxDirectCallableDepthFromTraversal, maxTraversableGraphDepth );
+}
+
+OPTIXAPI inline OptixResult optixPipelineSetStackSize( OptixPipeline pipeline,
+                                                       unsigned int  directCallableStackSizeFromTraversal,
+                                                       unsigned int  directCallableStackSizeFromState,
+                                                       unsigned int  continuationStackSize,
+                                                       unsigned int  maxTraversableGraphDepth )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixPipelineSetStackSize( pipeline, directCallableStackSizeFromTraversal,
+                                                                  directCallableStackSizeFromState,
+                                                                  continuationStackSize, maxTraversableGraphDepth );
+}
+
+OPTIXAPI inline OptixResult optixPipelineSymbolMemcpyAsync( OptixPipeline                 pipeline,
+                                                            const char*                   name,
+                                                            void*                         mem,
+                                                            size_t                        sizeInBytes,
+                                                            size_t                        offsetInBytes,
+                                                            OptixPipelineSymbolMemcpyKind kind,
+                                                            CUstream                      stream )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixPipelineSymbolMemcpyAsync( pipeline, name, mem, sizeInBytes, offsetInBytes, kind, stream );
+}
+
+OPTIXAPI inline OptixResult optixAccelComputeMemoryUsage( OptixDeviceContext            context,
+                                                          const OptixAccelBuildOptions* accelOptions,
+                                                          const OptixBuildInput*        buildInputs,
+                                                          unsigned int                  numBuildInputs,
+                                                          OptixAccelBufferSizes*        bufferSizes )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixAccelComputeMemoryUsage( context, accelOptions, buildInputs, numBuildInputs, bufferSizes );
+}
+
+OPTIXAPI inline OptixResult optixAccelBuild( OptixDeviceContext            context,
+                                             CUstream                      stream,
+                                             const OptixAccelBuildOptions* accelOptions,
+                                             const OptixBuildInput*        buildInputs,
+                                             unsigned int                  numBuildInputs,
+                                             CUdeviceptr                   tempBuffer,
+                                             size_t                        tempBufferSizeInBytes,
+                                             CUdeviceptr                   outputBuffer,
+                                             size_t                        outputBufferSizeInBytes,
+                                             OptixTraversableHandle*       outputHandle,
+                                             const OptixAccelEmitDesc*     emittedProperties,
+                                             unsigned int                  numEmittedProperties )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixAccelBuild( context, stream, accelOptions, buildInputs, numBuildInputs, tempBuffer,
+                                                        tempBufferSizeInBytes, outputBuffer, outputBufferSizeInBytes,
+                                                        outputHandle, emittedProperties, numEmittedProperties );
+}
+
+
+OPTIXAPI inline OptixResult optixAccelGetRelocationInfo( OptixDeviceContext context, OptixTraversableHandle handle, OptixRelocationInfo* info )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixAccelGetRelocationInfo( context, handle, info );
+}
+
+
+OPTIXAPI inline OptixResult optixCheckRelocationCompatibility( OptixDeviceContext context, const OptixRelocationInfo* info, int* compatible )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixCheckRelocationCompatibility( context, info, compatible );
+}
+
+OPTIXAPI inline OptixResult optixAccelRelocate( OptixDeviceContext         context,
+                                                CUstream                   stream,
+                                                const OptixRelocationInfo* info,
+                                                const OptixRelocateInput*  relocateInputs,
+                                                size_t                     numRelocateInputs,
+                                                CUdeviceptr                targetAccel,
+                                                size_t                     targetAccelSizeInBytes,
+                                                OptixTraversableHandle*    targetHandle )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixAccelRelocate( context, stream, info, relocateInputs, numRelocateInputs,
+                                                           targetAccel, targetAccelSizeInBytes, targetHandle );
+}
+
+OPTIXAPI inline OptixResult optixAccelCompact( OptixDeviceContext      context,
+                                               CUstream                stream,
+                                               OptixTraversableHandle  inputHandle,
+                                               CUdeviceptr             outputBuffer,
+                                               size_t                  outputBufferSizeInBytes,
+                                               OptixTraversableHandle* outputHandle )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixAccelCompact( context, stream, inputHandle, outputBuffer,
+                                                          outputBufferSizeInBytes, outputHandle );
+}
+
+OPTIXAPI inline OptixResult optixAccelEmitProperty( OptixDeviceContext        context,
+                                                    CUstream                  stream,
+                                                    OptixTraversableHandle    handle,
+                                                    const OptixAccelEmitDesc* emittedProperty )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixAccelEmitProperty( context, stream, handle, emittedProperty );
+}
+
+OPTIXAPI inline OptixResult optixConvertPointerToTraversableHandle( OptixDeviceContext      onDevice,
+                                                                    CUdeviceptr             pointer,
+                                                                    OptixTraversableType    traversableType,
+                                                                    OptixTraversableHandle* traversableHandle )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixConvertPointerToTraversableHandle( onDevice, pointer, traversableType, traversableHandle );
+}
+
+OPTIXAPI inline OptixResult optixOpacityMicromapArrayComputeMemoryUsage( OptixDeviceContext context,
+                                                                         const OptixOpacityMicromapArrayBuildInput* buildInput,
+                                                                         OptixMicromapBufferSizes* bufferSizes )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixOpacityMicromapArrayComputeMemoryUsage( context, buildInput, bufferSizes );
+}
+
+OPTIXAPI inline OptixResult optixOpacityMicromapArrayBuild( OptixDeviceContext                         context,
+                                                            CUstream                                   stream,
+                                                            const OptixOpacityMicromapArrayBuildInput* buildInput,
+                                                            const OptixMicromapBuffers*                buffers )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixOpacityMicromapArrayBuild( context, stream, buildInput, buffers );
+}
+
+OPTIXAPI inline OptixResult optixOpacityMicromapArrayGetRelocationInfo( OptixDeviceContext   context,
+                                                                        CUdeviceptr          opacityMicromapArray,
+                                                                        OptixRelocationInfo* info )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixOpacityMicromapArrayGetRelocationInfo( context, opacityMicromapArray, info );
+}
+
+OPTIXAPI inline OptixResult optixOpacityMicromapArrayRelocate( OptixDeviceContext         context,
+                                                               CUstream                   stream,
+                                                               const OptixRelocationInfo* info,
+                                                               CUdeviceptr                targetOpacityMicromapArray,
+                                                               size_t targetOpacityMicromapArraySizeInBytes )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixOpacityMicromapArrayRelocate( context, stream, info, targetOpacityMicromapArray,
+                                                                          targetOpacityMicromapArraySizeInBytes );
+}
+
+
+OPTIXAPI inline OptixResult optixClusterAccelComputeMemoryUsage( OptixDeviceContext                 context,
+                                                                 OptixClusterAccelBuildMode         buildMode,
+                                                                 const OptixClusterAccelBuildInput* buildInput,
+                                                                 OptixAccelBufferSizes*             bufferSizes )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixClusterAccelComputeMemoryUsage( context, buildMode, buildInput, bufferSizes );
+}
+
+OPTIXAPI inline OptixResult optixClusterAccelBuild( OptixDeviceContext                    context,
+                                                    CUstream                              stream,
+                                                    const OptixClusterAccelBuildModeDesc* buildModeDesc,
+                                                    const OptixClusterAccelBuildInput*    buildInput,
+                                                    CUdeviceptr                           argsArray,
+                                                    CUdeviceptr                           argsCount,
+                                                    unsigned int                          argsStrideInBytes )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixClusterAccelBuild( context, stream, buildModeDesc, buildInput, argsArray,
+                                                               argsCount, argsStrideInBytes );
+}
+
+OPTIXAPI inline OptixResult optixSbtRecordPackHeader( OptixProgramGroup programGroup, void* sbtRecordHeaderHostPointer )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixSbtRecordPackHeader( programGroup, sbtRecordHeaderHostPointer );
+}
+
+OPTIXAPI inline OptixResult optixLaunch( OptixPipeline                  pipeline,
+                                         CUstream                       stream,
+                                         CUdeviceptr                    pipelineParams,
+                                         size_t                         pipelineParamsSize,
+                                         const OptixShaderBindingTable* sbt,
+                                         unsigned int                   width,
+                                         unsigned int                   height,
+                                         unsigned int                   depth )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixLaunch( pipeline, stream, pipelineParams, pipelineParamsSize, sbt, width, height, depth );
+}
+
+OPTIXAPI inline OptixResult optixCoopVecMatrixConvert( OptixDeviceContext             context,
+                                                       CUstream                       stream,
+                                                       unsigned int                   numNetworks,
+                                                       const OptixNetworkDescription* inputNetworkDescription,
+                                                       CUdeviceptr                    inputNetworks,
+                                                       size_t                         inputNetworkStrideInBytes,
+                                                       const OptixNetworkDescription* outputNetworkDescription,
+                                                       CUdeviceptr                    outputNetworks,
+                                                       size_t                         outputNetworkStrideInBytes )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixCoopVecMatrixConvert( context, stream, numNetworks, inputNetworkDescription,
+                                                                  inputNetworks, inputNetworkStrideInBytes, outputNetworkDescription,
+                                                                  outputNetworks, outputNetworkStrideInBytes );
+}
+
+OPTIXAPI inline OptixResult optixCoopVecMatrixComputeSize( OptixDeviceContext       context,
+                                                           unsigned int             N,
+                                                           unsigned int             K,
+                                                           OptixCoopVecElemType     elementType,
+                                                           OptixCoopVecMatrixLayout layout,
+                                                           size_t                   rowColumnStrideInBytes,
+                                                           size_t*                  sizeInBytes )
+{
+
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixCoopVecMatrixComputeSize( context, N, K, elementType, layout,
+                                                                      rowColumnStrideInBytes, sizeInBytes );
+}
+OPTIXAPI inline OptixResult optixDenoiserCreate( OptixDeviceContext          context,
+                                                 OptixDenoiserModelKind      modelKind,
+                                                 const OptixDenoiserOptions* options,
+                                                 OptixDenoiser*              returnHandle )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDenoiserCreate( context, modelKind, options, returnHandle );
+}
+
+OPTIXAPI inline OptixResult optixDenoiserCreateWithUserModel( OptixDeviceContext context,
+                                                              const void*        data,
+                                                              size_t             dataSizeInBytes,
+                                                              OptixDenoiser*     returnHandle )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDenoiserCreateWithUserModel( context, data, dataSizeInBytes, returnHandle );
+}
+
+OPTIXAPI inline OptixResult optixDenoiserDestroy( OptixDenoiser handle )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDenoiserDestroy( handle );
+}
+
+OPTIXAPI inline OptixResult optixDenoiserComputeMemoryResources( const OptixDenoiser handle,
+                                                                 unsigned int        maximumInputWidth,
+                                                                 unsigned int        maximumInputHeight,
+                                                                 OptixDenoiserSizes* returnSizes )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDenoiserComputeMemoryResources( handle, maximumInputWidth, maximumInputHeight, returnSizes );
+}
+
+OPTIXAPI inline OptixResult optixDenoiserSetup( OptixDenoiser denoiser,
+                                                CUstream      stream,
+                                                unsigned int  inputWidth,
+                                                unsigned int  inputHeight,
+                                                CUdeviceptr   denoiserState,
+                                                size_t        denoiserStateSizeInBytes,
+                                                CUdeviceptr   scratch,
+                                                size_t        scratchSizeInBytes )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDenoiserSetup( denoiser, stream, inputWidth, inputHeight, denoiserState,
+                                                           denoiserStateSizeInBytes, scratch, scratchSizeInBytes );
+}
+
+OPTIXAPI inline OptixResult optixDenoiserInvoke( OptixDenoiser                  handle,
+                                                 CUstream                       stream,
+                                                 const OptixDenoiserParams*     params,
+                                                 CUdeviceptr                    denoiserData,
+                                                 size_t                         denoiserDataSize,
+                                                 const OptixDenoiserGuideLayer* guideLayer,
+                                                 const OptixDenoiserLayer*      layers,
+                                                 unsigned int                   numLayers,
+                                                 unsigned int                   inputOffsetX,
+                                                 unsigned int                   inputOffsetY,
+                                                 CUdeviceptr                    scratch,
+                                                 size_t                         scratchSizeInBytes )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDenoiserInvoke( handle, stream, params, denoiserData, denoiserDataSize,
+                                                            guideLayer, layers, numLayers, inputOffsetX, inputOffsetY,
+                                                            scratch, scratchSizeInBytes );
+}
+
+OPTIXAPI inline OptixResult optixDenoiserComputeIntensity( OptixDenoiser       handle,
+                                                           CUstream            stream,
+                                                           const OptixImage2D* inputImage,
+                                                           CUdeviceptr         outputIntensity,
+                                                           CUdeviceptr         scratch,
+                                                           size_t              scratchSizeInBytes )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDenoiserComputeIntensity( handle, stream, inputImage, outputIntensity,
+                                                                      scratch, scratchSizeInBytes );
+}
+
+OPTIXAPI inline OptixResult optixDenoiserComputeAverageColor( OptixDenoiser       handle,
+                                                              CUstream            stream,
+                                                              const OptixImage2D* inputImage,
+                                                              CUdeviceptr         outputAverageColor,
+                                                              CUdeviceptr         scratch,
+                                                              size_t              scratchSizeInBytes )
+{
+    return OPTIX_FUNCTION_TABLE_SYMBOL.optixDenoiserComputeAverageColor( handle, stream, inputImage, outputAverageColor,
+                                                                         scratch, scratchSizeInBytes );
+}
+
+#endif  // OPTIX_DOXYGEN_SHOULD_SKIP_THIS
+
+#endif  // OPTIX_OPTIX_STUBS_H
diff --git a/crtx/optix_9.1/optix_types.h b/crtx/optix_9.1/optix_types.h
new file mode 100644
index 0000000..09d178f
--- /dev/null
+++ b/crtx/optix_9.1/optix_types.h
@@ -0,0 +1,2747 @@
+
+/*
+* SPDX-FileCopyrightText: Copyright (c) 2019 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+* SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+*
+* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+* property and proprietary rights in and to this material, related
+* documentation and any modifications thereto. Any use, reproduction,
+* disclosure or distribution of this material and related documentation
+* without an express license agreement from NVIDIA CORPORATION or
+* its affiliates is strictly prohibited.
+*/
+/// @file
+/// @author NVIDIA Corporation
+/// @brief  OptiX public API header
+///
+/// OptiX types include file -- defines types and enums used by the API.
+///
+
+#ifndef OPTIX_OPTIX_TYPES_H
+#define OPTIX_OPTIX_TYPES_H
+
+#if !defined(__CUDACC_RTC__)
+#include <stddef.h> /* for size_t */
+#endif
+
+
+
+/// \defgroup optix_types Types
+/// \brief OptiX Types
+
+/** \addtogroup optix_types
+@{
+*/
+
+// This typedef should match the one in cuda.h in order to avoid compilation errors.
+#if defined(_WIN64) || defined(__LP64__)
+/// CUDA device pointer
+typedef unsigned long long CUdeviceptr;
+#else
+/// CUDA device pointer
+typedef unsigned int CUdeviceptr;
+#endif
+
+/// Opaque type representing a device context
+typedef struct OptixDeviceContext_t* OptixDeviceContext;
+
+/// Opaque type representing a module
+typedef struct OptixModule_t* OptixModule;
+
+/// Opaque type representing a program group
+typedef struct OptixProgramGroup_t* OptixProgramGroup;
+
+/// Opaque type representing a pipeline
+typedef struct OptixPipeline_t* OptixPipeline;
+
+/// Opaque type representing a denoiser instance
+typedef struct OptixDenoiser_t* OptixDenoiser;
+
+/// Opaque type representing a work task
+typedef struct OptixTask_t* OptixTask;
+
+/// Traversable handle
+typedef unsigned long long OptixTraversableHandle;
+
+/// Visibility mask
+typedef unsigned int OptixVisibilityMask;
+
+/// Size of the SBT record headers.
+#define OPTIX_SBT_RECORD_HEADER_SIZE ( (size_t)32 )
+
+/// Alignment requirement for device pointers in OptixShaderBindingTable.
+#define OPTIX_SBT_RECORD_ALIGNMENT 16ull
+
+/// Alignment requirement for output and temporary buffers for acceleration structures.
+#define OPTIX_ACCEL_BUFFER_BYTE_ALIGNMENT 128ull
+
+/// Alignment requirement for OptixBuildInputInstanceArray::instances.
+#define OPTIX_INSTANCE_BYTE_ALIGNMENT 16ull
+
+/// Alignment requirement for OptixBuildInputCustomPrimitiveArray::aabbBuffers
+#define OPTIX_AABB_BUFFER_BYTE_ALIGNMENT 8ull
+
+/// Alignment requirement for OptixBuildInputTriangleArray::preTransform
+#define OPTIX_GEOMETRY_TRANSFORM_BYTE_ALIGNMENT 16ull
+
+/// Alignment requirement for OptixStaticTransform, OptixMatrixMotionTransform, OptixSRTMotionTransform.
+#define OPTIX_TRANSFORM_BYTE_ALIGNMENT 64ull
+
+/// Alignment requirement for OptixOpacityMicromapArrayBuildInput::perMicromapDescBuffer.
+#define OPTIX_OPACITY_MICROMAP_DESC_BUFFER_BYTE_ALIGNMENT 8ull
+
+/// Maximum number of registers allowed. Defaults to no explicit limit.
+#define OPTIX_COMPILE_DEFAULT_MAX_REGISTER_COUNT 0
+
+/// Maximum number of payload types allowed.
+#define OPTIX_COMPILE_DEFAULT_MAX_PAYLOAD_TYPE_COUNT 8
+
+/// Maximum number of payload values allowed.
+#define OPTIX_COMPILE_DEFAULT_MAX_PAYLOAD_VALUE_COUNT 32
+
+/// Opacity micromaps encode the states of microtriangles in either 1 bit (2-state) or 2 bits (4-state) using
+/// the following values.
+#define OPTIX_OPACITY_MICROMAP_STATE_TRANSPARENT          ( 0 )
+#define OPTIX_OPACITY_MICROMAP_STATE_OPAQUE               ( 1 )
+#define OPTIX_OPACITY_MICROMAP_STATE_UNKNOWN_TRANSPARENT  ( 2 )
+#define OPTIX_OPACITY_MICROMAP_STATE_UNKNOWN_OPAQUE       ( 3 )
+
+/// Predefined index to indicate that a triangle in the BVH build doesn't have an associated opacity micromap,
+/// and that it should revert to one of the four possible states for the full triangle.
+#define OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_TRANSPARENT             ( -1 )
+#define OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_OPAQUE                  ( -2 )
+#define OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_UNKNOWN_TRANSPARENT     ( -3 )
+#define OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_UNKNOWN_OPAQUE          ( -4 )
+/// Predefined index to indicate that no opacity micromap applies for a triangle. The opaque/non-opaque state is determined by the geometry flags,
+/// as is the case for triangles in instances with the OPTIX_INSTANCE_FLAG_DISABLE_OPACITY_MICROMAPS flag set.
+/// This special index is only available for the opacity micromap index array supplied to OptixClusterAccelBuildInputTrianglesArgs.
+/// This special index does NOT require the cluster to be built with OPTIX_CLUSTER_ACCEL_CLUSTER_FLAG_ALLOW_DISABLE_OPACITY_MICROMAPS.
+#define OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_CLUSTER_SKIP_OPACITY_MICROMAP ( -5 )
+
+/// Alignment requirement for opacity micromap array buffers
+#define OPTIX_OPACITY_MICROMAP_ARRAY_BUFFER_BYTE_ALIGNMENT 128ull
+
+/// Maximum subdivision level for opacity micromaps
+#define OPTIX_OPACITY_MICROMAP_MAX_SUBDIVISION_LEVEL 12
+
+/// Result codes returned from API functions
+///
+/// All host side API functions return OptixResult with the exception of optixGetErrorName
+/// and optixGetErrorString.  When successful OPTIX_SUCCESS is returned.  All return codes
+/// except for OPTIX_SUCCESS should be assumed to be errors as opposed to warnings.
+///
+/// \see #optixGetErrorName(), #optixGetErrorString()
+typedef enum OptixResult
+{
+    OPTIX_SUCCESS                               = 0,
+    OPTIX_ERROR_INVALID_VALUE                   = 7001,
+    OPTIX_ERROR_HOST_OUT_OF_MEMORY              = 7002,
+    OPTIX_ERROR_INVALID_OPERATION               = 7003,
+    OPTIX_ERROR_FILE_IO_ERROR                   = 7004,
+    OPTIX_ERROR_INVALID_FILE_FORMAT             = 7005,
+    OPTIX_ERROR_DISK_CACHE_INVALID_PATH         = 7010,
+    OPTIX_ERROR_DISK_CACHE_PERMISSION_ERROR     = 7011,
+    OPTIX_ERROR_DISK_CACHE_DATABASE_ERROR       = 7012,
+    OPTIX_ERROR_DISK_CACHE_INVALID_DATA         = 7013,
+    OPTIX_ERROR_LAUNCH_FAILURE                  = 7050,
+    OPTIX_ERROR_INVALID_DEVICE_CONTEXT          = 7051,
+    OPTIX_ERROR_CUDA_NOT_INITIALIZED            = 7052,
+    OPTIX_ERROR_VALIDATION_FAILURE              = 7053,
+    OPTIX_ERROR_INVALID_INPUT                   = 7200,
+    OPTIX_ERROR_INVALID_LAUNCH_PARAMETER        = 7201,
+    OPTIX_ERROR_INVALID_PAYLOAD_ACCESS          = 7202,
+    OPTIX_ERROR_INVALID_ATTRIBUTE_ACCESS        = 7203,
+    OPTIX_ERROR_INVALID_FUNCTION_USE            = 7204,
+    OPTIX_ERROR_INVALID_FUNCTION_ARGUMENTS      = 7205,
+    OPTIX_ERROR_PIPELINE_OUT_OF_CONSTANT_MEMORY = 7250,
+    OPTIX_ERROR_PIPELINE_LINK_ERROR             = 7251,
+    OPTIX_ERROR_ILLEGAL_DURING_TASK_EXECUTE     = 7270,
+    OPTIX_ERROR_CREATION_CANCELED               = 7290,
+    OPTIX_ERROR_INTERNAL_COMPILER_ERROR         = 7299,
+    OPTIX_ERROR_DENOISER_MODEL_NOT_SET          = 7300,
+    OPTIX_ERROR_DENOISER_NOT_INITIALIZED        = 7301,
+    OPTIX_ERROR_NOT_COMPATIBLE                  = 7400,
+    OPTIX_ERROR_PAYLOAD_TYPE_MISMATCH           = 7500,
+    OPTIX_ERROR_PAYLOAD_TYPE_RESOLUTION_FAILED  = 7501,
+    OPTIX_ERROR_PAYLOAD_TYPE_ID_INVALID         = 7502,
+    OPTIX_ERROR_NOT_SUPPORTED                   = 7800,
+    OPTIX_ERROR_UNSUPPORTED_ABI_VERSION         = 7801,
+    OPTIX_ERROR_FUNCTION_TABLE_SIZE_MISMATCH    = 7802,
+    OPTIX_ERROR_INVALID_ENTRY_FUNCTION_OPTIONS  = 7803,
+    OPTIX_ERROR_LIBRARY_NOT_FOUND               = 7804,
+    OPTIX_ERROR_ENTRY_SYMBOL_NOT_FOUND          = 7805,
+    OPTIX_ERROR_LIBRARY_UNLOAD_FAILURE          = 7806,
+    OPTIX_ERROR_DEVICE_OUT_OF_MEMORY            = 7807,
+    OPTIX_ERROR_INVALID_POINTER                 = 7808,
+    OPTIX_ERROR_SYMBOL_NOT_FOUND                = 7809,
+    OPTIX_ERROR_CUDA_ERROR                      = 7900,
+    OPTIX_ERROR_INTERNAL_ERROR                  = 7990,
+    OPTIX_ERROR_UNKNOWN                         = 7999,
+} OptixResult;
+
+/// Parameters used for #optixDeviceContextGetProperty()
+///
+/// \see #optixDeviceContextGetProperty()
+typedef enum OptixDeviceProperty
+{
+    /// Maximum value for OptixPipelineLinkOptions::maxTraceDepth. sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_MAX_TRACE_DEPTH = 0x2001,
+
+    /// Maximum value to pass into optixPipelineSetStackSize for parameter
+    /// maxTraversableGraphDepth. sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_MAX_TRAVERSABLE_GRAPH_DEPTH = 0x2002,
+
+    /// The maximum number of primitives (over all build inputs) as input to a single
+    /// Geometry Acceleration Structure (GAS). sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_MAX_PRIMITIVES_PER_GAS = 0x2003,
+
+    /// The maximum number of instances (over all build inputs) as input to a single
+    /// Instance Acceleration Structure (IAS). sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCES_PER_IAS = 0x2004,
+
+    /// The RT core version supported by the device (0 for no support, 10 for version
+    /// 1.0). sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_RTCORE_VERSION = 0x2005,
+
+    /// The maximum value for #OptixInstance::instanceId. sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID = 0x2006,
+
+    /// The number of bits available for the #OptixInstance::visibilityMask.
+    /// Higher bits must be set to zero. sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_NUM_BITS_INSTANCE_VISIBILITY_MASK = 0x2007,
+
+    /// The maximum number of shader binding table (SBT) records per
+    /// Geometry Acceleration Structure (GAS). sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_MAX_SBT_RECORDS_PER_GAS = 0x2008,
+
+    /// The maximum summed value of #OptixInstance::sbtOffset.
+    /// Also the maximum summed value of sbt offsets of all ancestor
+    /// instances of a GAS in a traversable graph. sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_MAX_SBT_OFFSET = 0x2009,
+
+    /// Returns a flag specifying capabilities of the optixReorder() device function.  See
+    /// OptixDevicePropertyShaderExecutionReorderingFlags for documentation on the values
+    /// that can be returned. sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_SHADER_EXECUTION_REORDERING = 0x200A,
+
+    /// Returns a flag specifying whether cooperative vector support is enabled for this
+    /// device. See OptixDevicePropertyCoopVecFlags for documentation on the values that
+    /// can be returned. sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_COOP_VEC = 0x200B,
+
+    /// Returns a flag specifying support for cluster acceleration structure builds.  See
+    /// OptixDevicePropertyClusterAccelFlags for documentation on the values
+    /// that can be returned. sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_CLUSTER_ACCEL = 0x2020,
+
+    /// Returns the maximum number of unique vertices per cluster in a cluster acceleration structure (CLAS) build.
+    /// sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_MAX_CLUSTER_VERTICES = 0x2021,
+
+    /// Returns the maximum number of triangles per cluster in a cluster acceleration structure (CLAS) build.
+    /// sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_MAX_CLUSTER_TRIANGLES = 0x2022,
+
+    /// Returns the maximum resolution per cluster in a structured cluster
+    /// acceleration structure (CLAS) build. sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_MAX_STRUCTURED_GRID_RESOLUTION = 0x2023,
+
+    /// Returns the maximum SBT index allowed in a cluster acceleration structure (CLAS) build.
+    /// sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_MAX_CLUSTER_SBT_INDEX = 0x2024,
+
+    /// Returns the maximum number of clusters (CLAS) as input to a single
+    /// Geometry Acceleration Structure (GAS). sizeof( unsigned int )
+    OPTIX_DEVICE_PROPERTY_LIMIT_MAX_CLUSTERS_PER_GAS = 0x2025,
+} OptixDeviceProperty;
+
+/// Type of the callback function used for log messages.
+///
+/// \param[in] level      The log level indicates the severity of the message. See below for
+///                       possible values.
+/// \param[in] tag        A terse message category description (e.g., 'SCENE STAT').
+/// \param[in] message    Null terminated log message (without newline at the end).
+/// \param[in] cbdata     Callback data that was provided with the callback pointer.
+///
+/// It is the user's responsibility to ensure thread safety within this function.
+///
+/// The following log levels are defined.
+///
+///   0   disable   Setting the callback level to 0 will disable all messages.  The callback
+///                 function will not be called in this case.
+///   1   fatal     A non-recoverable error. The context and/or OptiX itself might no longer
+///                 be in a usable state.
+///   2   error     A recoverable error, e.g., when passing invalid call parameters.
+///   3   warning   Hints that OptiX might not behave exactly as requested by the user or
+///                 may perform slower than expected.
+///   4   print     Status or progress messages.
+///
+/// Higher levels might occur.
+///
+/// \see #optixDeviceContextSetLogCallback(), #OptixDeviceContextOptions
+typedef void ( *OptixLogCallback )( unsigned int level, const char* tag, const char* message, void* cbdata );
+
+/// Validation mode settings.
+///
+/// When enabled, certain device code utilities will be enabled to provide as good debug and
+/// error checking facilities as possible.
+///
+/// OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_OFF has the value 0; see #OptixDeviceContextOptions::validationMode.
+/// \see #optixDeviceContextCreate()
+typedef enum OptixDeviceContextValidationMode
+{
+    OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_OFF = 0,
+    OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL = 0xFFFFFFFF
+} OptixDeviceContextValidationMode;
+
+/// Parameters used for #optixDeviceContextCreate()
+///
+/// \see #optixDeviceContextCreate()
+typedef struct OptixDeviceContextOptions
+{
+    /// Function pointer used when OptiX wishes to generate messages
+    OptixLogCallback logCallbackFunction;
+    /// Pointer stored and passed to logCallbackFunction (as its cbdata argument) when a message is generated
+    void* logCallbackData;
+    /// Maximum callback level to generate messages for (see #OptixLogCallback for the level values)
+    int logCallbackLevel;
+    /// Validation mode of context.
+    OptixDeviceContextValidationMode validationMode;
+} OptixDeviceContextOptions;
+
+/// Kinds used to interpret the source and target of memory copies when using
+/// #optixPipelineSymbolMemcpyAsync(). The values are consecutive enumerators (0x21A0..0x21A3), not individual bits.
+///
+/// \see #optixPipelineSymbolMemcpyAsync()
+typedef enum OptixPipelineSymbolMemcpyKind
+{
+    OPTIX_PIPELINE_SYMBOL_MEMCPY_KIND_FROM_DEVICE = 0x21A0,
+    OPTIX_PIPELINE_SYMBOL_MEMCPY_KIND_FROM_HOST   = 0x21A1,
+    OPTIX_PIPELINE_SYMBOL_MEMCPY_KIND_TO_DEVICE   = 0x21A2,
+    OPTIX_PIPELINE_SYMBOL_MEMCPY_KIND_TO_HOST     = 0x21A3,
+} OptixPipelineSymbolMemcpyKind;
+
+/// Flags used to interpret the result of #optixDeviceContextGetProperty() and
+/// OPTIX_DEVICE_PROPERTY_SHADER_EXECUTION_REORDERING
+///
+/// \see #optixDeviceContextGetProperty()
+typedef enum OptixDevicePropertyShaderExecutionReorderingFlags
+{
+    /// optixReorder() acts as a no-op, and no thread reordering is performed. Note that
+    /// it is still legal to call this device function; no errors will be generated.
+    OPTIX_DEVICE_PROPERTY_SHADER_EXECUTION_REORDERING_FLAG_NONE     = 0,
+
+    /// Standard thread reordering is supported.
+    OPTIX_DEVICE_PROPERTY_SHADER_EXECUTION_REORDERING_FLAG_STANDARD = 1 << 0,
+} OptixDevicePropertyShaderExecutionReorderingFlags;
+
+/// Flags used to interpret the result of #optixDeviceContextGetProperty() and
+/// OPTIX_DEVICE_PROPERTY_CLUSTER_ACCEL
+///
+/// \see #optixDeviceContextGetProperty()
+typedef enum OptixDevicePropertyClusterAccelFlags
+{
+    /// Cluster acceleration structure builds are not supported.
+    OPTIX_DEVICE_PROPERTY_CLUSTER_ACCEL_FLAG_NONE     = 0,
+
+    /// Cluster acceleration structure builds are supported.
+    OPTIX_DEVICE_PROPERTY_CLUSTER_ACCEL_FLAG_STANDARD = 1 << 0,
+} OptixDevicePropertyClusterAccelFlags;
+
+/// Flags used by #OptixBuildInputTriangleArray::flags,
+/// #OptixBuildInputSphereArray::flags, #OptixBuildInputCurveArray::flag
+/// and #OptixBuildInputCustomPrimitiveArray::flags
+typedef enum OptixGeometryFlags
+{
+    /// No flags set
+    OPTIX_GEOMETRY_FLAG_NONE = 0,
+
+    /// Disables the invocation of the anyhit program.
+    /// Can be overridden by OPTIX_INSTANCE_FLAG_ENFORCE_ANYHIT and OPTIX_RAY_FLAG_ENFORCE_ANYHIT.
+    OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT = 1u << 0,
+
+    /// If set, an intersection with the primitive will trigger one and only one
+    /// invocation of the anyhit program.  Otherwise, the anyhit program may be invoked
+    /// more than once.
+    OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL = 1u << 1,
+
+    /// Prevent triangles from getting culled due to their orientation.
+    /// Effectively ignores ray flags
+    /// OPTIX_RAY_FLAG_CULL_BACK_FACING_TRIANGLES and OPTIX_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES.
+    OPTIX_GEOMETRY_FLAG_DISABLE_TRIANGLE_FACE_CULLING = 1u << 2,
+} OptixGeometryFlags;
+
+/// Legacy type: A subset of the hit kinds for built-in primitive intersections.
+/// Prefer optixGetPrimitiveType(), together with
+/// optixIsFrontFaceHit() or optixIsBackFaceHit(), over comparing against these values.
+///
+/// \see #optixGetHitKind()
+typedef enum OptixHitKind
+{
+    /// Ray hit the triangle on the front face
+    OPTIX_HIT_KIND_TRIANGLE_FRONT_FACE = 0xFE,
+    /// Ray hit the triangle on the back face
+    OPTIX_HIT_KIND_TRIANGLE_BACK_FACE = 0xFF
+} OptixHitKind;
+
+/// Format of indices used in #OptixBuildInputTriangleArray::indexFormat.
+typedef enum OptixIndicesFormat
+{
+    /// No indices, this format must only be used in combination with triangle soups, i.e., numIndexTriplets must be zero
+    OPTIX_INDICES_FORMAT_NONE = 0,
+    /// Three unsigned bytes per index triplet
+    OPTIX_INDICES_FORMAT_UNSIGNED_BYTE3 = 0x2101,
+    /// Three unsigned shorts per index triplet
+    OPTIX_INDICES_FORMAT_UNSIGNED_SHORT3 = 0x2102,
+    /// Three unsigned ints per index triplet
+    OPTIX_INDICES_FORMAT_UNSIGNED_INT3 = 0x2103
+} OptixIndicesFormat;
+
+/// Format of vertices used in #OptixBuildInputTriangleArray::vertexFormat.
+typedef enum OptixVertexFormat
+{
+    OPTIX_VERTEX_FORMAT_NONE      = 0,       ///< No vertices
+    OPTIX_VERTEX_FORMAT_FLOAT3    = 0x2121,  ///< Vertices are represented by three floats
+    OPTIX_VERTEX_FORMAT_FLOAT2    = 0x2122,  ///< Vertices are represented by two floats
+    OPTIX_VERTEX_FORMAT_HALF3     = 0x2123,  ///< Vertices are represented by three halfs
+    OPTIX_VERTEX_FORMAT_HALF2     = 0x2124,  ///< Vertices are represented by two halfs
+    OPTIX_VERTEX_FORMAT_SNORM16_3 = 0x2125,  ///< NOTE(review): presumably three 16-bit signed-normalized values per vertex - confirm in the OptiX docs
+    OPTIX_VERTEX_FORMAT_SNORM16_2 = 0x2126   ///< NOTE(review): presumably two 16-bit signed-normalized values per vertex - confirm in the OptiX docs
+} OptixVertexFormat;
+
+/// Format of transform used in #OptixBuildInputTriangleArray::transformFormat.
+typedef enum OptixTransformFormat
+{
+    OPTIX_TRANSFORM_FORMAT_NONE           = 0,       ///< No transform; this is the value obtained by zero initialization
+    OPTIX_TRANSFORM_FORMAT_MATRIX_FLOAT12 = 0x21E1,  ///< 3x4 row-major affine matrix
+} OptixTransformFormat;
+
+/// Specifies whether to use a 2- or 4-state opacity micromap format.
+typedef enum OptixOpacityMicromapFormat
+{
+    /// Invalid format
+    OPTIX_OPACITY_MICROMAP_FORMAT_NONE = 0,
+    /// 0: Transparent, 1: Opaque
+    OPTIX_OPACITY_MICROMAP_FORMAT_2_STATE = 1,
+    /// 0: Transparent, 1: Opaque, 2: Unknown-Transparent, 3: Unknown-Opaque
+    OPTIX_OPACITY_MICROMAP_FORMAT_4_STATE = 2,
+} OptixOpacityMicromapFormat;
+
+/// Indexing mode of triangles to opacity micromaps in an array, used in #OptixBuildInputOpacityMicromap::indexingMode.
+typedef enum OptixOpacityMicromapArrayIndexingMode
+{
+    /// No opacity micromap is used
+    OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_NONE = 0,
+    /// An implicit linear mapping of triangles to opacity micromaps in the
+    /// opacity micromap array is used. triangle[i] will use opacityMicromapArray[i].
+    OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_LINEAR = 1,
+    /// OptixBuildInputOpacityMicromap::indexBuffer provides a per triangle array of predefined indices
+    /// and/or indices into OptixBuildInputOpacityMicromap::opacityMicromapArray.
+    /// See OptixBuildInputOpacityMicromap::indexBuffer for more details.
+    OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_INDEXED = 2,
+} OptixOpacityMicromapArrayIndexingMode;
+
+/// Opacity micromap usage count for acceleration structure builds.
+/// Specifies how many opacity micromaps of a specific type are referenced by triangles when building the AS.
+/// Note that while this is similar to OptixOpacityMicromapHistogramEntry, the usage count specifies how many opacity micromaps
+/// of a specific type are referenced by triangles in the AS.
+typedef struct OptixOpacityMicromapUsageCount
+{
+    /// Number of opacity micromaps with this format and subdivision level referenced by triangles in the corresponding
+    /// triangle build input at AS build time.
+    unsigned int count;
+    /// Number of micro-triangles is 4^level. Valid levels are [0, 12]
+    unsigned int subdivisionLevel;
+    /// Opacity micromap format.
+    OptixOpacityMicromapFormat format;
+} OptixOpacityMicromapUsageCount;
+/// Opacity micromap inputs of a triangle build input. \see #OptixBuildInputTriangleArray::opacityMicromap
+typedef struct OptixBuildInputOpacityMicromap
+{
+    /// Indexing mode of triangle to opacity micromap array mapping.
+    OptixOpacityMicromapArrayIndexingMode indexingMode;
+
+    /// Device pointer to an opacity micromap array used by this build input array.
+    /// This buffer is required when #OptixBuildInputOpacityMicromap::indexingMode is
+    /// OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_LINEAR or OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_INDEXED.
+    /// Must be zero if #OptixBuildInputOpacityMicromap::indexingMode is OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_NONE.
+    CUdeviceptr  opacityMicromapArray;
+
+    /// int16 or int32 buffer specifying which opacity micromap index to use for each triangle.
+    /// Instead of an actual index, one of the predefined indices
+    /// OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_(FULLY_TRANSPARENT | FULLY_OPAQUE | FULLY_UNKNOWN_TRANSPARENT | FULLY_UNKNOWN_OPAQUE)
+    /// can be used to indicate that there is no opacity micromap for this particular triangle
+    /// but the triangle is in a uniform state and the selected behavior is applied
+    /// to the entire triangle.
+    /// This buffer is required when #OptixBuildInputOpacityMicromap::indexingMode is OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_INDEXED.
+    /// Must be zero if #OptixBuildInputOpacityMicromap::indexingMode is
+    /// OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_LINEAR or OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_NONE.
+    CUdeviceptr  indexBuffer;
+
+    /// 0, 2 or 4 (unused, 16 or 32 bit)
+    /// Must be non-zero when #OptixBuildInputOpacityMicromap::indexingMode is OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_INDEXED.
+    unsigned int indexSizeInBytes;
+
+    /// Opacity micromap index buffer stride. If set to zero, indices are assumed to be tightly
+    /// packed and stride is inferred from #OptixBuildInputOpacityMicromap::indexSizeInBytes.
+    unsigned int indexStrideInBytes;
+
+    /// Constant offset to non-negative opacity micromap indices
+    unsigned int indexOffset;
+
+    /// Number of OptixOpacityMicromapUsageCount entries in micromapUsageCounts.
+    unsigned int numMicromapUsageCounts;
+    /// List of number of usages of opacity micromaps of format and subdivision combinations.
+    /// Counts with equal format and subdivision combination (duplicates) are added together.
+    const OptixOpacityMicromapUsageCount* micromapUsageCounts;
+} OptixBuildInputOpacityMicromap;
+/// Opacity micromap inputs for relocation. \see #OptixRelocateInputTriangleArray::opacityMicromap
+typedef struct OptixRelocateInputOpacityMicromap
+{
+    /// Device pointer to a relocated opacity micromap array used by the source build input array.
+    /// May be zero when no micromaps were used in the source accel, or the referenced opacity
+    /// micromaps don't require relocation (for example relocation of a GAS on the source device).
+    CUdeviceptr  opacityMicromapArray;
+} OptixRelocateInputOpacityMicromap;
+
+
+
+/// Triangle inputs
+///
+/// \see #OptixBuildInput::triangleArray
+typedef struct OptixBuildInputTriangleArray
+{
+    /// Points to host array of device pointers, one per motion step. Host array size must match the number of
+    /// motion keys as set in #OptixMotionOptions (or an array of size 1 if OptixMotionOptions::numKeys is set
+    /// to 0 or 1). Each per motion key device pointer must point to an array of vertices of the
+    /// triangles in the format as described by vertexFormat. The minimum alignment must match the natural
+    /// alignment of the type as specified in the vertexFormat, i.e., for OPTIX_VERTEX_FORMAT_FLOATX 4-byte,
+    /// for all others a 2-byte alignment. However, a 16-byte stride (and buffer alignment) is recommended for
+    /// vertices of format OPTIX_VERTEX_FORMAT_FLOAT3 for GAS build performance.
+    const CUdeviceptr* vertexBuffers;
+
+    /// Number of vertices in each buffer in OptixBuildInputTriangleArray::vertexBuffers.
+    unsigned int numVertices;
+
+    /// \see #OptixVertexFormat
+    OptixVertexFormat vertexFormat;
+
+    /// Stride between vertices. If set to zero, vertices are assumed to be tightly
+    /// packed and stride is inferred from vertexFormat.
+    unsigned int vertexStrideInBytes;
+
+    /// Optional pointer to array of index triplets (element type given by indexFormat), one triplet per triangle.
+    /// The minimum alignment must match the natural alignment of the type as specified in the indexFormat, i.e.,
+    /// for OPTIX_INDICES_FORMAT_UNSIGNED_INT3 4-byte and for OPTIX_INDICES_FORMAT_UNSIGNED_SHORT3 a 2-byte alignment.
+    CUdeviceptr indexBuffer;
+
+    /// Size of array in OptixBuildInputTriangleArray::indexBuffer. For build, needs to be zero if indexBuffer is \c nullptr.
+    unsigned int numIndexTriplets;
+
+    /// \see #OptixIndicesFormat
+    OptixIndicesFormat indexFormat;
+
+    /// Stride between triplets of indices. If set to zero, indices are assumed to be tightly
+    /// packed and stride is inferred from indexFormat.
+    unsigned int indexStrideInBytes;
+
+    /// Optional pointer to array of floats
+    /// representing a 3x4 row major affine
+    /// transformation matrix. This pointer must be a multiple of OPTIX_GEOMETRY_TRANSFORM_BYTE_ALIGNMENT
+    CUdeviceptr preTransform;
+
+    /// Array of flags, to specify flags per sbt record,
+    /// combinations of OptixGeometryFlags describing the
+    /// primitive behavior, size must match numSbtRecords
+    const unsigned int* flags;
+
+    /// Number of sbt records available to the sbt index offset override.
+    unsigned int numSbtRecords;
+
+    /// Device pointer to per-primitive local sbt index offset buffer. May be NULL.
+    /// Every entry must be in range [0,numSbtRecords-1].
+    /// Size needs to be the number of primitives.
+    CUdeviceptr sbtIndexOffsetBuffer;
+
+    /// Size of type of the sbt index offset. Needs to be 0, 1, 2 or 4 (8, 16 or 32 bit).
+    unsigned int sbtIndexOffsetSizeInBytes;
+
+    /// Stride between the index offsets. If set to zero, the offsets are assumed to be tightly
+    /// packed and the stride matches the size of the type (sbtIndexOffsetSizeInBytes).
+    unsigned int sbtIndexOffsetStrideInBytes;
+
+    /// Primitive index bias, applied in optixGetPrimitiveIndex().
+    /// Sum of primitiveIndexOffset and number of triangles must not overflow 32bits.
+    unsigned int primitiveIndexOffset;
+
+    /// \see #OptixTransformFormat
+    OptixTransformFormat transformFormat;
+
+    /// Optional opacity micromap inputs.
+    OptixBuildInputOpacityMicromap opacityMicromap;
+
+} OptixBuildInputTriangleArray;
+
+/// Triangle inputs for relocation
+///
+/// \see #OptixRelocateInput::triangleArray
+typedef struct OptixRelocateInputTriangleArray
+{
+    /// Number of sbt records available to the sbt index offset override.
+    /// Must match #OptixBuildInputTriangleArray::numSbtRecords of the source build input.
+    unsigned int numSbtRecords;
+
+    /// Opacity micromap inputs.
+    OptixRelocateInputOpacityMicromap opacityMicromap;
+} OptixRelocateInputTriangleArray;
+
+/// Builtin primitive types
+/// \see #OptixPrimitiveTypeFlags for the corresponding flags that may be bitwise combined
+typedef enum OptixPrimitiveType
+{
+    /// Custom primitive.
+    OPTIX_PRIMITIVE_TYPE_CUSTOM                        = 0x2500,
+    /// B-spline curve of degree 2 with circular cross-section.
+    OPTIX_PRIMITIVE_TYPE_ROUND_QUADRATIC_BSPLINE       = 0x2501,
+    /// B-spline curve of degree 3 with circular cross-section.
+    OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE           = 0x2502,
+    /// Piecewise linear curve with circular cross-section.
+    OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR                  = 0x2503,
+    /// CatmullRom curve with circular cross-section.
+    OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM              = 0x2504,
+    /// B-spline curve of degree 2 with oriented, flat cross-section.
+    OPTIX_PRIMITIVE_TYPE_FLAT_QUADRATIC_BSPLINE        = 0x2505,
+    /// Sphere.
+    OPTIX_PRIMITIVE_TYPE_SPHERE                        = 0x2506,
+    /// Bezier curve of degree 3 with circular cross-section.
+    OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BEZIER            = 0x2507,
+    /// B-spline curve of degree 2 with circular cross-section, using rocaps intersection.
+    OPTIX_PRIMITIVE_TYPE_ROUND_QUADRATIC_BSPLINE_ROCAPS = 0x2508,
+    /// B-spline curve of degree 3 with circular cross-section, using rocaps intersection.
+    OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE_ROCAPS    = 0x2509,
+    /// CatmullRom curve with circular cross-section, using rocaps intersection.
+    OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM_ROCAPS       = 0x250A,
+    /// Bezier curve of degree 3 with circular cross-section, using rocaps intersection.
+    OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BEZIER_ROCAPS     = 0x250B,
+    /// Triangle.
+    OPTIX_PRIMITIVE_TYPE_TRIANGLE                      = 0x2531,
+} OptixPrimitiveType;
+
+/// Builtin primitive type flags; may be bitwise combined.
+///
+/// \see #OptixPipelineCompileOptions::usesPrimitiveTypeFlags
+typedef enum OptixPrimitiveTypeFlags
+{
+    /// Custom primitive.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM                       = 1 << 0,
+    /// B-spline curve of degree 2 with circular cross-section.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_QUADRATIC_BSPLINE      = 1 << 1,
+    /// B-spline curve of degree 3 with circular cross-section.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE          = 1 << 2,
+    /// Piecewise linear curve with circular cross-section.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_LINEAR                 = 1 << 3,
+    /// CatmullRom curve with circular cross-section.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM             = 1 << 4,
+    /// B-spline curve of degree 2 with oriented, flat cross-section.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_FLAT_QUADRATIC_BSPLINE       = 1 << 5,
+    /// Sphere.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_SPHERE                       = 1 << 6,
+    /// Bezier curve of degree 3 with circular cross-section.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BEZIER           = 1 << 7,
+    /// B-spline curve of degree 2 with circular cross-section, using rocaps intersection.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_QUADRATIC_BSPLINE_ROCAPS = 1 << 8,
+    /// B-spline curve of degree 3 with circular cross-section, using rocaps intersection.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE_ROCAPS   = 1 << 9,
+    /// CatmullRom curve with circular cross-section, using rocaps intersection.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM_ROCAPS      = 1 << 10,
+    /// Bezier curve of degree 3 with circular cross-section, using rocaps intersection.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BEZIER_ROCAPS    = 1 << 11,
+    /// Triangle. Uses bit 31 rather than the next free low bit. NOTE(review): 1 << 31 shifts into the sign bit where int is 32 bits.
+    OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE                     = 1 << 31,
+} OptixPrimitiveTypeFlags;
+
+/// Curve end cap types, for non-linear curves
+/// \see #OptixBuildInputCurveArray::endcapFlags
+typedef enum OptixCurveEndcapFlags
+{
+    /// Default end caps. Round end caps for linear, no end caps for quadratic/cubic.
+    OPTIX_CURVE_ENDCAP_DEFAULT                        = 0,
+    /// Flat end caps at both ends of quadratic/cubic curve segments. Not valid for linear.
+    OPTIX_CURVE_ENDCAP_ON                             = 1 << 0,
+} OptixCurveEndcapFlags;
+
+/// Curve inputs
+///
+/// A curve is a swept surface defined by a 3D spline curve and a varying width (radius). A curve (or "strand") of
+/// degree d (3=cubic, 2=quadratic, 1=linear) is represented by N > d vertices and N width values, and comprises N - d segments.
+/// Each segment is defined by d+1 consecutive vertices. Each curve may have a different number of vertices.
+///
+/// OptiX describes the curve array as a list of curve segments. The primitive id is the segment number.
+/// It is the user's responsibility to maintain a mapping between curves and curve segments.
+/// Each index buffer entry i = indexBuffer[primid] specifies the start of a curve segment,
+/// represented by d+1 consecutive vertices in the vertex buffer,
+/// and d+1 consecutive widths in the width buffer. Width is interpolated the same
+/// way vertices are interpolated, that is, using the curve basis.
+///
+/// Each curve build input has only one SBT record.
+/// To create curves with different materials in the same BVH, use multiple build inputs.
+///
+/// \see #OptixBuildInput::curveArray
+typedef struct OptixBuildInputCurveArray
+{
+    /// Curve degree and basis
+    /// \see #OptixPrimitiveType
+    OptixPrimitiveType curveType;
+    /// Number of primitives. Each primitive is a polynomial curve segment.
+    unsigned int numPrimitives;
+
+    /// Pointer to host array of device pointers, one per motion step. Host array size must match number of
+    /// motion keys as set in #OptixMotionOptions (or an array of size 1 if OptixMotionOptions::numKeys is set
+    /// to 1). Each per-motion-key device pointer must point to an array of floats (the vertices of the
+    /// curves).
+    const CUdeviceptr* vertexBuffers;
+    /// Number of vertices in each buffer in vertexBuffers.
+    unsigned int numVertices;
+    /// Stride between vertices. If set to zero, vertices are assumed to be tightly
+    /// packed and stride is sizeof( float3 ).
+    unsigned int vertexStrideInBytes;
+
+    /// Parallel to vertexBuffers: a device pointer per motion step, each with numVertices float values,
+    /// specifying the curve width (radius) corresponding to each vertex.
+    const CUdeviceptr* widthBuffers;
+    /// Stride between widths. If set to zero, widths are assumed to be tightly
+    /// packed and stride is sizeof( float ).
+    unsigned int widthStrideInBytes;
+
+    /// Reserved for future use.
+    const CUdeviceptr* normalBuffers;
+    /// Reserved for future use.
+    unsigned int normalStrideInBytes;
+
+    /// Device pointer to array of unsigned ints, one per curve segment.
+    /// This buffer is required (unlike for OptixBuildInputTriangleArray).
+    /// Each index is the start of degree+1 consecutive vertices in vertexBuffers,
+    /// and corresponding widths in widthBuffers and normals in normalBuffers.
+    /// These define a single segment. Size of array is numPrimitives.
+    CUdeviceptr indexBuffer;
+    /// Stride between indices. If set to zero, indices are assumed to be tightly
+    /// packed and stride is sizeof( unsigned int ).
+    unsigned int indexStrideInBytes;
+
+    /// Combination of OptixGeometryFlags describing the
+    /// primitive behavior.
+    unsigned int flag;
+
+    /// Primitive index bias, applied in optixGetPrimitiveIndex().
+    /// Sum of primitiveIndexOffset and number of primitives must not overflow 32bits.
+    unsigned int primitiveIndexOffset;
+
+    /// End cap flags, see OptixCurveEndcapFlags
+    unsigned int endcapFlags;
+} OptixBuildInputCurveArray;
+
+/// Sphere inputs
+///
+/// A sphere is defined by a center point and a radius.
+/// Each center point is represented by a vertex in the vertex buffer.
+/// There is either a single radius for all spheres, or the radii are represented by entries in the radius buffer.
+///
+/// The vertex buffers and radius buffers point to a host array of device pointers, one per motion step.
+/// Host array size must match the number of motion keys as set in #OptixMotionOptions (or an array of size 1 if OptixMotionOptions::numKeys is set
+/// to 0 or 1). Each per motion key device pointer must point to an array of vertices corresponding to the center points of the spheres, or
+/// an array of 1 or N radii. Vertices use format OPTIX_VERTEX_FORMAT_FLOAT3; each radius is a single float (#OptixVertexFormat has no scalar float entry).
+///
+/// \see #OptixBuildInput::sphereArray
+typedef struct OptixBuildInputSphereArray
+{
+  /// Pointer to host array of device pointers, one per motion step. Host array size must match number of
+  /// motion keys as set in #OptixMotionOptions (or an array of size 1 if OptixMotionOptions::numKeys is set
+  /// to 1). Each per-motion-key device pointer must point to an array of floats (the center points of
+  /// the spheres).
+  const CUdeviceptr* vertexBuffers;
+
+  /// Stride between vertices. If set to zero, vertices are assumed to be tightly
+  /// packed and stride is sizeof( float3 ).
+  unsigned int vertexStrideInBytes;
+  /// Number of vertices in each buffer in vertexBuffers.
+  unsigned int numVertices;
+
+  /// Parallel to vertexBuffers: a device pointer per motion step, each with either one or numVertices
+  /// float values (see singleRadius), specifying the sphere radius corresponding to each vertex.
+  const CUdeviceptr* radiusBuffers;
+  /// Stride between radii. If set to zero, radii are assumed to be tightly
+  /// packed and stride is sizeof( float ).
+  unsigned int radiusStrideInBytes;
+  /// Boolean value indicating whether a single radius per radius buffer is used,
+  /// or the number of radii in radiusBuffers equals numVertices.
+  int singleRadius;
+
+  /// Array of flags, to specify flags per sbt record,
+  /// combinations of OptixGeometryFlags describing the
+  /// primitive behavior, size must match numSbtRecords
+  const unsigned int* flags;
+
+  /// Number of sbt records available to the sbt index offset override.
+  unsigned int numSbtRecords;
+  /// Device pointer to per-primitive local sbt index offset buffer. May be NULL.
+  /// Every entry must be in range [0,numSbtRecords-1].
+  /// Size needs to be the number of primitives.
+  CUdeviceptr sbtIndexOffsetBuffer;
+  /// Size of type of the sbt index offset. Needs to be 0, 1, 2 or 4 (8, 16 or 32 bit).
+  unsigned int sbtIndexOffsetSizeInBytes;
+  /// Stride between the sbt index offsets. If set to zero, the offsets are assumed to be tightly
+  /// packed and the stride matches the size of the type (sbtIndexOffsetSizeInBytes).
+  unsigned int sbtIndexOffsetStrideInBytes;
+
+  /// Primitive index bias, applied in optixGetPrimitiveIndex().
+  /// Sum of primitiveIndexOffset and number of primitives must not overflow 32bits.
+  unsigned int primitiveIndexOffset;
+} OptixBuildInputSphereArray;
+
+/// Axis-aligned bounding box (AABB), given by its lower and upper extents along X, Y and Z.
+typedef struct OptixAabb
+{
+    float minX;  ///< Lower extent in X direction.
+    float minY;  ///< Lower extent in Y direction.
+    float minZ;  ///< Lower extent in Z direction.
+    float maxX;  ///< Upper extent in X direction.
+    float maxY;  ///< Upper extent in Y direction.
+    float maxZ;  ///< Upper extent in Z direction.
+} OptixAabb;
+
+/// Custom primitive inputs
+///
+/// \see #OptixBuildInput::customPrimitiveArray
+typedef struct OptixBuildInputCustomPrimitiveArray
+{
+    /// Points to host array of device pointers to AABBs (type OptixAabb), one per motion step.
+    /// Host array size must match number of motion keys as set in OptixMotionOptions (or an array of size 1
+    /// if OptixMotionOptions::numKeys is set to 1).
+    /// Each device pointer must be a multiple of OPTIX_AABB_BUFFER_BYTE_ALIGNMENT.
+    const CUdeviceptr* aabbBuffers;
+
+    /// Number of primitives in each buffer (i.e., per motion step) in
+    /// #OptixBuildInputCustomPrimitiveArray::aabbBuffers.
+    unsigned int numPrimitives;
+
+    /// Stride between AABBs (per motion key). If set to zero, the aabbs are assumed to be tightly
+    /// packed and the stride is assumed to be sizeof( OptixAabb ).
+    /// If non-zero, the value must be a multiple of OPTIX_AABB_BUFFER_BYTE_ALIGNMENT.
+    unsigned int strideInBytes;
+
+    /// Array of flags, to specify flags per sbt record,
+    /// combinations of OptixGeometryFlags describing the
+    /// primitive behavior, size must match numSbtRecords
+    const unsigned int* flags;
+
+    /// Number of sbt records available to the sbt index offset override.
+    unsigned int numSbtRecords;
+
+    /// Device pointer to per-primitive local sbt index offset buffer. May be NULL.
+    /// Every entry must be in range [0,numSbtRecords-1].
+    /// Size needs to be the number of primitives.
+    CUdeviceptr sbtIndexOffsetBuffer;
+
+    /// Size of type of the sbt index offset. Needs to be 0, 1, 2 or 4 (8, 16 or 32 bit).
+    unsigned int sbtIndexOffsetSizeInBytes;
+
+    /// Stride between the index offsets. If set to zero, the offsets are assumed to be tightly
+    /// packed and the stride matches the size of the type (sbtIndexOffsetSizeInBytes).
+    unsigned int sbtIndexOffsetStrideInBytes;
+
+    /// Primitive index bias, applied in optixGetPrimitiveIndex().
+    /// Sum of primitiveIndexOffset and number of primitives must not overflow 32bits.
+    unsigned int primitiveIndexOffset;
+} OptixBuildInputCustomPrimitiveArray;
+
+/// Instance and instance pointer inputs
+///
+/// \see #OptixBuildInput::instanceArray
+typedef struct OptixBuildInputInstanceArray
+{
+    /// If OptixBuildInput::type is OPTIX_BUILD_INPUT_TYPE_INSTANCE_POINTERS, instances should be
+    /// interpreted as an array of pointers instead of an array of structs.
+    ///
+    /// This pointer must be a multiple of OPTIX_INSTANCE_BYTE_ALIGNMENT if
+    /// OptixBuildInput::type is OPTIX_BUILD_INPUT_TYPE_INSTANCES. The array elements must
+    /// be a multiple of OPTIX_INSTANCE_BYTE_ALIGNMENT if OptixBuildInput::type is
+    /// OPTIX_BUILD_INPUT_TYPE_INSTANCE_POINTERS.
+    CUdeviceptr instances;
+
+    /// Number of elements in #OptixBuildInputInstanceArray::instances.
+    unsigned int numInstances;
+
+    /// Only valid for OPTIX_BUILD_INPUT_TYPE_INSTANCES.
+    /// Defines the stride between instances. A stride of 0 indicates a tight packing, i.e.,
+    /// stride = sizeof( OptixInstance )
+    unsigned int instanceStride;
+} OptixBuildInputInstanceArray;
+
+/// Instance and instance pointer inputs
+///
+/// \see #OptixRelocateInput::instanceArray
+typedef struct OptixRelocateInputInstanceArray
+{
+    /// Number of elements in #OptixRelocateInputInstanceArray::traversableHandles.
+    /// Must match #OptixBuildInputInstanceArray::numInstances of the source build input.
+    unsigned int numInstances;
+
+    /// These are the traversable handles of the instances (See OptixInstance::traversableHandle)
+    /// These can be used when also relocating the instances.  No updates to
+    /// the bounds are performed.  Use optixAccelBuild to update the bounds.
+    /// 'traversableHandles' may be zero when the traversables are not relocated
+    /// (i.e. relocation of an IAS on the source device).
+    CUdeviceptr traversableHandles;
+
+} OptixRelocateInputInstanceArray;
+
+/// Enum to distinguish the different build input types.
+///
+/// \see #OptixBuildInput::type
+typedef enum OptixBuildInputType
+{
+    /// Triangle inputs. \see #OptixBuildInputTriangleArray
+    OPTIX_BUILD_INPUT_TYPE_TRIANGLES = 0x2141,
+    /// Custom primitive inputs. \see #OptixBuildInputCustomPrimitiveArray
+    OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES = 0x2142,
+    /// Instance inputs. \see #OptixBuildInputInstanceArray
+    OPTIX_BUILD_INPUT_TYPE_INSTANCES = 0x2143,
+    /// Instance pointer inputs. \see #OptixBuildInputInstanceArray
+    OPTIX_BUILD_INPUT_TYPE_INSTANCE_POINTERS = 0x2144,
+    /// Curve inputs. \see #OptixBuildInputCurveArray
+    OPTIX_BUILD_INPUT_TYPE_CURVES = 0x2145,
+    /// Sphere inputs. \see #OptixBuildInputSphereArray
+    OPTIX_BUILD_INPUT_TYPE_SPHERES = 0x2146
+} OptixBuildInputType;
+
+/// Build inputs.
+///
+/// All of them support motion and the size of the data arrays needs to match the number of motion steps
+///
+/// \see #optixAccelComputeMemoryUsage(), #optixAccelBuild()
+typedef struct OptixBuildInput
+{
+    /// The type of the build input.
+    OptixBuildInputType type;
+
+    union
+    {
+        char pad[1024];
+        /// Triangle inputs.
+        OptixBuildInputTriangleArray triangleArray;
+        /// Curve inputs.
+        OptixBuildInputCurveArray curveArray;
+        /// Sphere inputs.
+        OptixBuildInputSphereArray sphereArray;
+        /// Custom primitive inputs.
+        OptixBuildInputCustomPrimitiveArray customPrimitiveArray;
+        /// Instance and instance pointer inputs.
+        OptixBuildInputInstanceArray instanceArray;
+    };
+} OptixBuildInput;
+
+/// Relocation inputs.
+///
+/// \see #optixAccelRelocate()
+typedef struct OptixRelocateInput
+{
+    /// The type of the build input to relocate.
+    OptixBuildInputType type;
+
+    union
+    {
+        /// Instance and instance pointer inputs.
+        OptixRelocateInputInstanceArray instanceArray;
+
+        /// Triangle inputs.
+        OptixRelocateInputTriangleArray triangleArray;
+
+        /// Inputs of any of the other types don't require any relocation data.
+    };
+} OptixRelocateInput;
+
+/// Flags set on the #OptixInstance::flags.
+///
+/// These can be or'ed together to combine multiple flags.
+typedef enum OptixInstanceFlags
+{
+    /// No special flag set
+    OPTIX_INSTANCE_FLAG_NONE = 0,
+
+    /// Prevent triangles from getting culled due to their orientation.
+    /// Effectively ignores ray flags
+    /// OPTIX_RAY_FLAG_CULL_BACK_FACING_TRIANGLES and OPTIX_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES.
+    OPTIX_INSTANCE_FLAG_DISABLE_TRIANGLE_FACE_CULLING = 1u << 0,
+
+    /// Flip triangle orientation.
+    /// This affects front/backface culling as well as the reported face in case of a hit.
+    OPTIX_INSTANCE_FLAG_FLIP_TRIANGLE_FACING = 1u << 1,
+
+    /// Disable anyhit programs for all geometries of the instance.
+    /// Can be overridden by OPTIX_RAY_FLAG_ENFORCE_ANYHIT.
+    /// This flag is mutually exclusive with OPTIX_INSTANCE_FLAG_ENFORCE_ANYHIT.
+    OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT = 1u << 2,
+
+    /// Enables anyhit programs for all geometries of the instance.
+    /// Overrides OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT
+    /// Can be overridden by OPTIX_RAY_FLAG_DISABLE_ANYHIT.
+    /// This flag is mutually exclusive with OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT.
+    OPTIX_INSTANCE_FLAG_ENFORCE_ANYHIT = 1u << 3,
+
+
+    /// Force 4-state opacity micromaps to behave as 2-state opacity micromaps during traversal.
+    OPTIX_INSTANCE_FLAG_FORCE_OPACITY_MICROMAP_2_STATE = 1u << 4,
+    /// Don't perform opacity micromap query for this instance. Triangle GAS must be built with ALLOW_DISABLE_OPACITY_MICROMAPS for this to be valid.
+    /// Clusters in a GAS must be built with OPTIX_CLUSTER_ACCEL_CLUSTER_FLAG_ALLOW_DISABLE_OPACITY_MICROMAPS for this to be valid.
+    /// This flag overrides FORCE_OPACITY_MICROMAP_2_STATE instance and ray flags.
+    OPTIX_INSTANCE_FLAG_DISABLE_OPACITY_MICROMAPS = 1u << 5,
+
+} OptixInstanceFlags;
+
+/// Instances
+///
+/// \see #OptixBuildInputInstanceArray::instances
+typedef struct OptixInstance
+{
+    /// affine object-to-world transformation as 3x4 matrix in row-major layout
+    float transform[12];
+
+    /// Application supplied ID. The maximal ID can be queried using OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID.
+    unsigned int instanceId;
+
+    /// SBT record offset.
+    /// In a traversable graph with multiple levels of instance acceleration structure (IAS) objects, offsets are summed together.
+    /// The maximal SBT offset can be queried using OPTIX_DEVICE_PROPERTY_LIMIT_MAX_SBT_OFFSET.
+    unsigned int sbtOffset;
+
+    /// Visibility mask. If rayMask & instanceMask == 0 the instance is culled. The number of available bits can be
+    /// queried using OPTIX_DEVICE_PROPERTY_LIMIT_NUM_BITS_INSTANCE_VISIBILITY_MASK.
+    unsigned int visibilityMask;
+
+    /// Any combination of OptixInstanceFlags is allowed.
+    unsigned int flags;
+
+    /// Set with an OptixTraversableHandle.
+    OptixTraversableHandle traversableHandle;
+
+    /// round up to 80-byte, to ensure 16-byte alignment
+    unsigned int pad[2];
+} OptixInstance;
+
+/// Builder Options
+///
+/// Used for #OptixAccelBuildOptions::buildFlags. Can be or'ed together.
+typedef enum OptixBuildFlags
+{
+    /// No special flags set.
+    OPTIX_BUILD_FLAG_NONE = 0,
+
+    /// Allow updating the build with new vertex positions with subsequent calls to
+    /// optixAccelBuild.
+    OPTIX_BUILD_FLAG_ALLOW_UPDATE = 1u << 0,
+
+    OPTIX_BUILD_FLAG_ALLOW_COMPACTION = 1u << 1,
+
+    /// This flag is mutually exclusive with OPTIX_BUILD_FLAG_PREFER_FAST_BUILD.
+    OPTIX_BUILD_FLAG_PREFER_FAST_TRACE = 1u << 2,
+
+    /// This flag is mutually exclusive with OPTIX_BUILD_FLAG_PREFER_FAST_TRACE.
+    OPTIX_BUILD_FLAG_PREFER_FAST_BUILD = 1u << 3,
+
+    /// Allow random access to build input vertices
+    /// See optixGetTriangleVertexDataFromHandle
+    ///     optixGetLinearCurveVertexDataFromHandle
+    ///     optixGetQuadraticBSplineVertexDataFromHandle
+    ///     optixGetCubicBSplineVertexDataFromHandle
+    ///     optixGetCatmullRomVertexDataFromHandle
+    ///     optixGetCubicBezierVertexDataFromHandle
+    ///     optixGetQuadraticBSplineRocapsVertexDataFromHandle
+    ///     optixGetCubicBSplineRocapsVertexDataFromHandle
+    ///     optixGetCatmullRomRocapsVertexDataFromHandle
+    ///     optixGetCubicBezierRocapsVertexDataFromHandle
+    ///     optixGetRibbonVertexDataFromHandle
+    ///     optixGetRibbonNormalFromHandle
+    ///     optixGetSphereDataFromHandle
+    OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS = 1u << 4,
+
+    /// Allow random access to instances
+    /// See optixGetInstanceTraversableFromIAS
+    OPTIX_BUILD_FLAG_ALLOW_RANDOM_INSTANCE_ACCESS = 1u << 5,
+
+    /// Support updating the opacity micromap array and opacity micromap indices on refits.
+    /// May increase AS size and may have a small negative impact on traversal performance.
+    /// If this flag is absent, all opacity micromap inputs must remain unchanged between the initial AS builds and their subsequent refits.
+    OPTIX_BUILD_FLAG_ALLOW_OPACITY_MICROMAP_UPDATE = 1u << 6,
+
+    /// If enabled, any instances referencing this GAS are allowed to disable the opacity micromap test through the DISABLE_OPACITY_MICROMAPS instance flag.
+    /// Note that the GAS will not be optimized for the attached opacity micromap arrays if this flag is set,
+    /// which may result in reduced traversal performance.
+    OPTIX_BUILD_FLAG_ALLOW_DISABLE_OPACITY_MICROMAPS = 1u << 7,
+} OptixBuildFlags;
+
+
+/// Flags defining behavior of opacity micromaps in an opacity micromap array.
+typedef enum OptixOpacityMicromapFlags
+{
+    OPTIX_OPACITY_MICROMAP_FLAG_NONE = 0,
+
+    /// This flag is mutually exclusive with OPTIX_OPACITY_MICROMAP_FLAG_PREFER_FAST_BUILD.
+    OPTIX_OPACITY_MICROMAP_FLAG_PREFER_FAST_TRACE = 1 << 0,
+
+    /// This flag is mutually exclusive with OPTIX_OPACITY_MICROMAP_FLAG_PREFER_FAST_TRACE.
+    OPTIX_OPACITY_MICROMAP_FLAG_PREFER_FAST_BUILD = 1 << 1,
+} OptixOpacityMicromapFlags;
+
+/// Opacity micromap descriptor.
+typedef struct OptixOpacityMicromapDesc
+{
+    /// Byte offset to opacity micromap in data input buffer of opacity micromap array build
+    unsigned int  byteOffset;
+    /// Number of micro-triangles is 4^level. Valid levels are [0, 12]
+    unsigned short subdivisionLevel;
+    /// OptixOpacityMicromapFormat
+    unsigned short format;
+} OptixOpacityMicromapDesc;
+
+/// Opacity micromap histogram entry.
+/// Specifies how many opacity micromaps of a specific type are input to the opacity micromap array build.
+/// Note that while this is similar to OptixOpacityMicromapUsageCount, the histogram entry specifies how many opacity micromaps
+/// of a specific type are combined into an opacity micromap array.
+typedef struct OptixOpacityMicromapHistogramEntry
+{
+    /// Number of opacity micromaps with the format and subdivision level that are input to the opacity micromap array build.
+    unsigned int               count;
+    /// Number of micro-triangles is 4^level. Valid levels are [0, 12].
+    unsigned int               subdivisionLevel;
+    /// Opacity micromap format.
+    OptixOpacityMicromapFormat format;
+} OptixOpacityMicromapHistogramEntry;
+
+/// Inputs to opacity micromap array construction.
+typedef struct OptixOpacityMicromapArrayBuildInput
+{
+    /// Applies to all opacity micromaps in array.
+    unsigned int flags;
+
+    /// 128B aligned base pointer for raw opacity micromap input data.
+    CUdeviceptr inputBuffer;
+
+    /// One OptixOpacityMicromapDesc entry per opacity micromap.
+    /// This device pointer must be a multiple of OPTIX_OPACITY_MICROMAP_DESC_BYTE_ALIGNMENT.
+    CUdeviceptr perMicromapDescBuffer;
+
+    /// Stride between OptixOpacityMicromapDescs in perMicromapDescBuffer.
+    /// If set to zero, the opacity micromap descriptors are assumed to be tightly packed and the stride is assumed to be sizeof( OptixOpacityMicromapDesc ).
+    /// This stride must be a multiple of OPTIX_OPACITY_MICROMAP_DESC_BYTE_ALIGNMENT.
+    unsigned int perMicromapDescStrideInBytes;
+
+    /// Number of OptixOpacityMicromapHistogramEntry.
+    unsigned int numMicromapHistogramEntries;
+    /// Histogram over opacity micromaps of input format and subdivision combinations.
+    /// Counts of entries with equal format and subdivision combination (duplicates) are added together.
+    const OptixOpacityMicromapHistogramEntry* micromapHistogramEntries;
+} OptixOpacityMicromapArrayBuildInput;
+
+/// Conservative memory requirements for building an opacity micromap array
+typedef struct OptixMicromapBufferSizes
+{
+    size_t outputSizeInBytes;
+    size_t tempSizeInBytes;
+} OptixMicromapBufferSizes;
+
+/// Buffer inputs for opacity micromap array builds.
+typedef struct OptixMicromapBuffers
+{
+    /// Output buffer
+    CUdeviceptr output;
+    /// Output buffer size
+    size_t outputSizeInBytes;
+    /// Temp buffer
+    CUdeviceptr temp;
+    /// Temp buffer size
+    size_t tempSizeInBytes;
+} OptixMicromapBuffers;
+
+
+/// Enum to specify the acceleration build operation.
+///
+/// Used in OptixAccelBuildOptions, which is then passed to optixAccelBuild and
+/// optixAccelComputeMemoryUsage, this enum indicates whether to do a build or an update
+/// of the acceleration structure.
+///
+/// Acceleration structure updates utilize the same acceleration structure, but with
+/// updated bounds.  Updates are typically much faster than builds, however, large
+/// perturbations can degrade the quality of the acceleration structure.
+///
+/// \see #optixAccelComputeMemoryUsage(), #optixAccelBuild(), #OptixAccelBuildOptions
+typedef enum OptixBuildOperation
+{
+    /// Perform a full build operation
+    OPTIX_BUILD_OPERATION_BUILD = 0x2161,
+    /// Perform an update using new bounds
+    OPTIX_BUILD_OPERATION_UPDATE = 0x2162,
+} OptixBuildOperation;
+
+/// Enum to specify motion flags.
+///
+/// \see #OptixMotionOptions::flags.
+typedef enum OptixMotionFlags
+{
+    OPTIX_MOTION_FLAG_NONE         = 0,
+    OPTIX_MOTION_FLAG_START_VANISH = 1u << 0,
+    OPTIX_MOTION_FLAG_END_VANISH   = 1u << 1
+} OptixMotionFlags;
+
+/// Motion options
+///
+/// \see #OptixAccelBuildOptions::motionOptions, #OptixMatrixMotionTransform::motionOptions,
+///      #OptixSRTMotionTransform::motionOptions
+typedef struct OptixMotionOptions
+{
+    /// If numKeys > 1, motion is enabled. timeBegin,
+    /// timeEnd and flags are all ignored when motion is disabled.
+    unsigned short numKeys;
+
+    /// Combinations of #OptixMotionFlags
+    unsigned short flags;
+
+    /// Point in time where motion starts. Must be less than timeEnd.
+    float timeBegin;
+
+    /// Point in time where motion ends. Must be greater than timeBegin.
+    float timeEnd;
+} OptixMotionOptions;
+
+/// Build options for acceleration structures.
+///
+/// \see #optixAccelComputeMemoryUsage(), #optixAccelBuild()
+typedef struct OptixAccelBuildOptions
+{
+    /// Combinations of OptixBuildFlags
+    unsigned int buildFlags;
+
+    /// If OPTIX_BUILD_OPERATION_UPDATE the output buffer is assumed to contain the result
+    /// of a full build with OPTIX_BUILD_FLAG_ALLOW_UPDATE set and using the same number of
+    /// primitives.  It is updated incrementally to reflect the current position of the
+    /// primitives.
+    /// If a BLAS has been built with OPTIX_BUILD_FLAG_ALLOW_OPACITY_MICROMAP_UPDATE, new opacity micromap arrays
+    /// and opacity micromap indices may be provided to the refit.
+    OptixBuildOperation operation;
+
+    /// Options for motion.
+    OptixMotionOptions motionOptions;
+} OptixAccelBuildOptions;
+
+/// Struct for querying builder allocation requirements.
+///
+/// Once queried the sizes should be used to allocate device memory of at least these sizes.
+///
+/// \see #optixAccelComputeMemoryUsage()
+typedef struct OptixAccelBufferSizes
+{
+    /// The size in bytes required for the outputBuffer parameter to optixAccelBuild when
+    /// doing a build (OPTIX_BUILD_OPERATION_BUILD).
+    size_t outputSizeInBytes;
+
+    /// The size in bytes required for the tempBuffer parameter to optixAccelBuild when
+    /// doing a build (OPTIX_BUILD_OPERATION_BUILD).
+    size_t tempSizeInBytes;
+
+    /// The size in bytes required for the tempBuffer parameter to optixAccelBuild
+    /// when doing an update (OPTIX_BUILD_OPERATION_UPDATE).  This value can be different
+    /// than tempSizeInBytes used for a full build.  Only non-zero if
+    /// OPTIX_BUILD_FLAG_ALLOW_UPDATE flag is set in OptixAccelBuildOptions.
+    size_t tempUpdateSizeInBytes;
+} OptixAccelBufferSizes;
+
+/// Properties which can be emitted during acceleration structure build.
+///
+/// \see #OptixAccelEmitDesc::type.
+typedef enum OptixAccelPropertyType
+{
+    /// Size of a compacted acceleration structure. The device pointer points to a uint64.
+    OPTIX_PROPERTY_TYPE_COMPACTED_SIZE = 0x2181,
+
+    /// OptixAabb * numMotionSteps
+    OPTIX_PROPERTY_TYPE_AABBS = 0x2182,
+} OptixAccelPropertyType;
+
+/// Specifies a type and output destination for emitted post-build properties.
+///
+/// \see #optixAccelBuild()
+typedef struct OptixAccelEmitDesc
+{
+    /// Output buffer for the properties
+    CUdeviceptr result;
+
+    /// Requested property
+    OptixAccelPropertyType type;
+} OptixAccelEmitDesc;
+
+/// Used to store information related to relocation of optix data structures.
+///
+/// \see #optixOpacityMicromapArrayGetRelocationInfo(), #optixOpacityMicromapArrayRelocate(),
+/// #optixAccelGetRelocationInfo(), #optixAccelRelocate(), #optixCheckRelocationCompatibility()
+typedef struct OptixRelocationInfo
+{
+    /// Opaque data, used internally, should not be modified
+    unsigned long long info[4];
+} OptixRelocationInfo;
+
+/// Static transform
+///
+/// The device address of instances of this type must be a multiple of OPTIX_TRANSFORM_BYTE_ALIGNMENT.
+///
+/// \see #optixConvertPointerToTraversableHandle()
+typedef struct OptixStaticTransform
+{
+    /// The traversable transformed by this transformation
+    OptixTraversableHandle child;
+
+    /// Padding to make the transformations 16 byte aligned
+    unsigned int pad[2];
+
+    /// Affine object-to-world transformation as 3x4 matrix in row-major layout
+    float transform[12];
+
+    /// Affine world-to-object transformation as 3x4 matrix in row-major layout
+    /// Must be the inverse of the transform matrix
+    float invTransform[12];
+} OptixStaticTransform;
+
+/// Represents a matrix motion transformation.
+///
+/// The device address of instances of this type must be a multiple of OPTIX_TRANSFORM_BYTE_ALIGNMENT.
+///
+/// This struct, as defined here, handles only N=2 motion keys due to the fixed array length of its transform member.
+/// The following example shows how to create instances for an arbitrary number N of motion keys:
+///
+/// \code
+/// float matrixData[N][12];
+/// ... // setup matrixData
+///
+/// size_t transformSizeInBytes = sizeof( OptixMatrixMotionTransform ) + ( N-2 ) * 12 * sizeof( float );
+/// OptixMatrixMotionTransform* matrixMotionTransform = (OptixMatrixMotionTransform*) malloc( transformSizeInBytes );
+/// memset( matrixMotionTransform, 0, transformSizeInBytes );
+///
+/// ... // setup other members of matrixMotionTransform
+/// matrixMotionTransform->motionOptions.numKeys = N;
+/// memcpy( matrixMotionTransform->transform, matrixData, N * 12 * sizeof( float ) );
+///
+/// ... // copy matrixMotionTransform to device memory
+/// free( matrixMotionTransform )
+/// \endcode
+///
+/// \see #optixConvertPointerToTraversableHandle()
+typedef struct OptixMatrixMotionTransform
+{
+    /// The traversable that is transformed by this transformation
+    OptixTraversableHandle child;
+
+    /// The motion options for this transformation.
+    /// Must have at least two motion keys.
+    OptixMotionOptions motionOptions;
+
+    /// Padding to make the transformation 16 byte aligned
+    unsigned int pad[3];
+
+    /// Affine object-to-world transformation as 3x4 matrix in row-major layout
+    float transform[2][12];
+} OptixMatrixMotionTransform;
+
+/// Represents an SRT transformation.
+///
+/// An SRT transformation can represent a smooth rotation with fewer motion keys than a matrix transformation. Each
+/// motion key is constructed from elements taken from a matrix S, a quaternion R, and a translation T.
+///
+/// The scaling matrix
+/// \f$S = \begin{bmatrix} sx & a & b & pvx \\ 0 & sy & c & pvy \\ 0 & 0  & sz & pvz \end{bmatrix}\f$
+//      [ sx   a   b  pvx ]
+//  S = [  0  sy   c  pvy ]
+//      [  0   0  sz  pvz ]
+/// defines an affine transformation that can include scale, shear, and a translation.
+/// The translation allows defining the pivot point for the subsequent rotation.
+///
+/// The quaternion R = [ qx, qy, qz, qw ] describes a rotation with angular component qw = cos(theta/2) and other
+/// components [ qx, qy, qz ] = sin(theta/2) * [ ax, ay, az ] where the axis [ ax, ay, az ] is normalized.
+///
+/// The translation matrix
+/// \f$T = \begin{bmatrix} 1 & 0 & 0 & tx \\ 0 & 1 & 0 & ty \\ 0 & 0 & 1 & tz \end{bmatrix}\f$
+//      [  1  0  0 tx ]
+//  T = [  0  1  0 ty ]
+//      [  0  0  1 tz ]
+/// defines another translation that is applied after the rotation. Typically, this translation includes
+/// the inverse translation from the matrix S to reverse the translation for the pivot point for R.
+///
+/// To obtain the effective transformation at time t, the elements of the components of S, R, and T will be interpolated
+/// linearly. The components are then multiplied to obtain the combined transformation C = T * R * S. The transformation
+/// C is the effective object-to-world transformation at time t, and C^(-1) is the effective world-to-object
+/// transformation at time t.
+///
+/// \see #OptixSRTMotionTransform::srtData, #optixConvertPointerToTraversableHandle()
+typedef struct OptixSRTData
+{
+    /// \name Parameters describing the SRT transformation
+    /// @{
+    float sx, a, b, pvx, sy, c, pvy, sz, pvz, qx, qy, qz, qw, tx, ty, tz;
+    /// @}
+} OptixSRTData;
+
+
+/// Represents an SRT motion transformation.
+///
+/// The device address of instances of this type must be a multiple of OPTIX_TRANSFORM_BYTE_ALIGNMENT.
+///
+/// This struct, as defined here, handles only N=2 motion keys due to the fixed array length of its srtData member.
+/// The following example shows how to create instances for an arbitrary number N of motion keys:
+///
+/// \code
+/// OptixSRTData srtData[N];
+/// ... // setup srtData
+///
+/// size_t transformSizeInBytes = sizeof( OptixSRTMotionTransform ) + ( N-2 ) * sizeof( OptixSRTData );
+/// OptixSRTMotionTransform* srtMotionTransform = (OptixSRTMotionTransform*) malloc( transformSizeInBytes );
+/// memset( srtMotionTransform, 0, transformSizeInBytes );
+///
+/// ... // setup other members of srtMotionTransform
+/// srtMotionTransform->motionOptions.numKeys   = N;
+/// memcpy( srtMotionTransform->srtData, srtData, N * sizeof( OptixSRTData ) );
+///
+/// ... // copy srtMotionTransform to device memory
+/// free( srtMotionTransform )
+/// \endcode
+///
+/// \see #optixConvertPointerToTraversableHandle()
+typedef struct OptixSRTMotionTransform
+{
+    /// The traversable transformed by this transformation
+    OptixTraversableHandle child;
+
+    /// The motion options for this transformation
+    /// Must have at least two motion keys.
+    OptixMotionOptions motionOptions;
+
+    /// Padding to make the SRT data 16 byte aligned
+    unsigned int pad[3];
+
+    /// The actual SRT data describing the transformation
+    OptixSRTData srtData[2];
+} OptixSRTMotionTransform;
+
+/// Traversable Handles
+///
+/// \see #optixConvertPointerToTraversableHandle()
+typedef enum OptixTraversableType
+{
+    /// Static transforms. \see #OptixStaticTransform
+    OPTIX_TRAVERSABLE_TYPE_STATIC_TRANSFORM = 0x21C1,
+    /// Matrix motion transform. \see #OptixMatrixMotionTransform
+    OPTIX_TRAVERSABLE_TYPE_MATRIX_MOTION_TRANSFORM = 0x21C2,
+    /// SRT motion transform. \see #OptixSRTMotionTransform
+    OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM = 0x21C3,
+} OptixTraversableType;
+
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+///// Cluster AS build
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+
+/// Host-side flags for all types of cluster builds
+typedef enum OptixClusterAccelBuildFlags
+{
+    OPTIX_CLUSTER_ACCEL_BUILD_FLAG_NONE                    = 0,
+    OPTIX_CLUSTER_ACCEL_BUILD_FLAG_PREFER_FAST_TRACE       = 1 << 0,
+    OPTIX_CLUSTER_ACCEL_BUILD_FLAG_PREFER_FAST_BUILD       = 1 << 1,
+    OPTIX_CLUSTER_ACCEL_BUILD_FLAG_ALLOW_OPACITY_MICROMAPS = 1 << 2
+} OptixClusterAccelBuildFlags;
+
+/// Device-side flags for clusters builds
+typedef enum OptixClusterAccelClusterFlags
+{
+    OPTIX_CLUSTER_ACCEL_CLUSTER_FLAG_NONE                            = 0,
+    /// Similar to the 'ALLOW_DISABLE_OPACITY_MICROMAPS' build flag of regular triangle GAS builds.
+    /// This flag is required if the CLAS is in an instance with the OPTIX_INSTANCE_FLAG_DISABLE_OPACITY_MICROMAPS flag set.
+    OPTIX_CLUSTER_ACCEL_CLUSTER_FLAG_ALLOW_DISABLE_OPACITY_MICROMAPS = 1 << 0,
+} OptixClusterAccelClusterFlags;
+
+/// Device-side flags that specify per-primitive specific behavior
+/// Note the packing within the 32b struct OptixClusterAccelPrimitiveInfo
+typedef enum OptixClusterAccelPrimitiveFlags
+{
+    OPTIX_CLUSTER_ACCEL_PRIMITIVE_FLAG_NONE                          = 0,
+    OPTIX_CLUSTER_ACCEL_PRIMITIVE_FLAG_DISABLE_TRIANGLE_FACE_CULLING = 1 << 0,
+    OPTIX_CLUSTER_ACCEL_PRIMITIVE_FLAG_REQUIRE_SINGLE_ANYHIT_CALL    = 1 << 1,
+    OPTIX_CLUSTER_ACCEL_PRIMITIVE_FLAG_DISABLE_ANYHIT                = 1 << 2,
+} OptixClusterAccelPrimitiveFlags;
+
+/// Build type for cluster builds - specifying the type of data input and output
+typedef enum OptixClusterAccelBuildType
+{
+    OPTIX_CLUSTER_ACCEL_BUILD_TYPE_GASES_FROM_CLUSTERS      = 0x2545,
+    OPTIX_CLUSTER_ACCEL_BUILD_TYPE_CLUSTERS_FROM_TRIANGLES  = 0x2546,
+    OPTIX_CLUSTER_ACCEL_BUILD_TYPE_TEMPLATES_FROM_TRIANGLES = 0x2547,
+    OPTIX_CLUSTER_ACCEL_BUILD_TYPE_CLUSTERS_FROM_TEMPLATES  = 0x2548,
+    OPTIX_CLUSTER_ACCEL_BUILD_TYPE_TEMPLATES_FROM_GRIDS     = 0x2549
+} OptixClusterAccelBuildType;
+
+/// Build mode for cluster builds
+typedef enum OptixClusterAccelBuildMode
+{
+    /// Fastest build, single output buffer, build outputs may have padding wrt each other
+    OPTIX_CLUSTER_ACCEL_BUILD_MODE_IMPLICIT_DESTINATIONS = 0,
+    /// Compact build, application specifies output destination per Arg; requires Get Sizes build run beforehand
+    OPTIX_CLUSTER_ACCEL_BUILD_MODE_EXPLICIT_DESTINATIONS = 1,
+    /// Size computation for future explicit build; computes output sizes for all Args
+    OPTIX_CLUSTER_ACCEL_BUILD_MODE_GET_SIZES             = 2
+} OptixClusterAccelBuildMode;
+
+/// Helper enum where values match the byte count of the corresponding index format, allowing usage of enum value when specifying byte count
+typedef enum OptixClusterAccelIndicesFormat
+{
+    OPTIX_CLUSTER_ACCEL_INDICES_FORMAT_8BIT  = 1,
+    OPTIX_CLUSTER_ACCEL_INDICES_FORMAT_16BIT = 2,
+    OPTIX_CLUSTER_ACCEL_INDICES_FORMAT_32BIT = 4,
+} OptixClusterAccelIndicesFormat;
+
+typedef struct OptixClusterAccelBuildModeDescImplicitDest
+{
+    /// alignment of outputBuffer must match result type.
+    ///    Clusters: 128 bytes
+    ///   Templates: 32 bytes
+    ///       GASes: 128 bytes, see OPTIX_ACCEL_BUFFER_BYTE_ALIGNMENT
+    CUdeviceptr  outputBuffer;
+    /// size of outputHandlesBuffer is outputHandlesStrideInBytes * number of inputs specified with either argCount or maxArgCount
+    size_t       outputBufferSizeInBytes;
+    /// 128-byte aligned, see OPTIX_ACCEL_BUFFER_BYTE_ALIGNMENT
+    CUdeviceptr  tempBuffer;
+    size_t       tempBufferSizeInBytes;
+
+    /// TraversableHandle for GAS, pointer for cluster and template outputs
+    CUdeviceptr  outputHandlesBuffer;
+    /// Minimum 8, Stride of 0 implies natural stride of 8B
+    unsigned int outputHandlesStrideInBytes;
+    /// Optional, uint32 array (4 byte aligned)
+    CUdeviceptr  outputSizesBuffer;
+    /// Minimum 4, Stride of 0 implies natural stride of 4B
+    unsigned int outputSizesStrideInBytes;
+} OptixClusterAccelBuildModeDescImplicitDest;
+
+typedef struct OptixClusterAccelBuildModeDescExplicitDest
+{
+    /// 128-byte aligned, see OPTIX_ACCEL_BUFFER_BYTE_ALIGNMENT
+    CUdeviceptr  tempBuffer;
+    size_t       tempBufferSizeInBytes;
+    /// Entries must be aligned according to the output type
+    CUdeviceptr  destAddressesBuffer;
+    /// Minimum 8, Stride of 0 implies natural stride of 8B
+    unsigned int destAddressesStrideInBytes;
+
+    /// TraversableHandle for GAS, pointer for cluster and template outputs, can be the same as destAddresses in which case they will overwrite the input
+    CUdeviceptr  outputHandlesBuffer;
+    /// Minimum 8, Stride of 0 implies natural stride of 8B
+    unsigned int outputHandlesStrideInBytes;
+    /// Optional, uint32 array (4 byte aligned)
+    CUdeviceptr  outputSizesBuffer;
+    /// Minimum 4, Stride of 0 implies natural stride of 4B
+    unsigned int outputSizesStrideInBytes;
+} OptixClusterAccelBuildModeDescExplicitDest;
+
+typedef struct OptixClusterAccelBuildModeDescGetSize
+{
+    /// Mandatory, uint32 array (4 byte aligned)
+    CUdeviceptr  outputSizesBuffer;
+    /// Minimum 4, Stride of 0 implies natural stride of 4B
+    unsigned int outputSizesStrideInBytes;
+    /// 128-byte aligned, see OPTIX_ACCEL_BUFFER_BYTE_ALIGNMENT
+    CUdeviceptr  tempBuffer;
+    size_t       tempBufferSizeInBytes;
+} OptixClusterAccelBuildModeDescGetSize;
+
+typedef struct OptixClusterAccelBuildInputTriangles
+{
+    OptixClusterAccelBuildFlags flags;
+
+    /// Max number of OptixClusterAccelBuildInputTrianglesArgs provided at build time for OPTIX_CLUSTER_ACCEL_BUILD_TYPE_CLUSTERS_FROM_TRIANGLES
+    /// and OPTIX_CLUSTER_ACCEL_BUILD_TYPE_TEMPLATES_FROM_TRIANGLES.
+    /// Max number of OptixClusterAccelBuildInputTemplatesArgs provided at build time for OPTIX_CLUSTER_ACCEL_BUILD_TYPE_CLUSTERS_FROM_TEMPLATES.
+    unsigned int maxArgCount;
+    /// OptixVertexFormat (see documentation for supported formats)
+    OptixVertexFormat vertexFormat;
+    /// The maximum used SBT index over all clusters.
+    /// This must include the base SBT offset (::basePrimitiveInfo), any potential per primitive offset (::primitiveInfoBuffer),
+    /// as well as a potential offset at template instantiation (OptixClusterAccelBuildInputTemplatesArgs::sbtIndexOffset)
+    unsigned int maxSbtIndexValue;
+    /// Number of unique SBT indices per cluster. If the cluster has the same SBT index for all its triangles, this value is 1.
+    unsigned int maxUniqueSbtIndexCountPerArg;
+
+    /// Upper bound on the number of triangles per Arg
+    unsigned int maxTriangleCountPerArg;
+    /// Upper bound on the number of vertices per Arg
+    unsigned int maxVertexCountPerArg;
+    /// Optional, upper bound on the number of triangles over all Args, maxTriangleCountPerArg * maxArgCount otherwise
+    unsigned int maxTotalTriangleCount;
+    /// Optional, upper bound on the number of vertices over all Args, maxVertexCountPerArg * maxArgCount otherwise
+    unsigned int maxTotalVertexCount;
+    /// Lower bound on the number of bits being truncated of the vertex positions.
+    unsigned int minPositionTruncateBitCount;
+} OptixClusterAccelBuildInputTriangles;
+
+typedef struct OptixClusterAccelBuildInputGrids
+{
+    OptixClusterAccelBuildFlags flags;
+    /// Max number of OptixClusterAccelBuildInputGridsArgs provided at build time for OPTIX_CLUSTER_ACCEL_BUILD_TYPE_TEMPLATES_FROM_GRIDS
+    unsigned int                maxArgCount;
+
+    /// OptixVertexFormat (see documentation for supported formats)
+    OptixVertexFormat vertexFormat;
+    /// The maximum used SBT index over all clusters.
+    /// This must include the base SBT offset (::basePrimitiveInfo), any potential per primitive offset (::primitiveInfoBuffer),
+    /// as well as a potential offset at template instantiation (OptixClusterAccelBuildInputTemplatesArgs::sbtIndexOffset)
+    unsigned int      maxSbtIndexValue;
+
+
+    /// The maximum number of edge segments along the width of any grid
+    unsigned int maxWidth;
+    /// The maximum number of edge segments along the height of any grid
+    unsigned int maxHeight;
+} OptixClusterAccelBuildInputGrids;
+
+typedef struct OptixClusterAccelBuildInputClusters
+{
+    OptixClusterAccelBuildFlags flags;
+    /// Max number of OptixClusterAccelBuildInputClustersArgs provided at build time for OPTIX_CLUSTER_ACCEL_BUILD_TYPE_GASES_FROM_CLUSTERS
+    unsigned int                maxArgCount;
+
+    unsigned int                maxTotalClusterCount;
+    unsigned int                maxClusterCountPerArg;
+} OptixClusterAccelBuildInputClusters;
+
+typedef struct OptixClusterAccelPrimitiveInfo
+{
+    unsigned int sbtIndex       : 24;
+    unsigned int reserved       :  5;
+    /// Combination of OptixClusterAccelPrimitiveFlags
+    unsigned int primitiveFlags :  3;
+} OptixClusterAccelPrimitiveInfo;
+
+/// Reserved value for cluster IDs in Args
+typedef enum OptixClusterIDValues {
+    OPTIX_CLUSTER_ID_INVALID = 0xFFFFFFFFu,
+} OptixClusterIDValues;
+
+/// Device data, args provided for OPTIX_CLUSTER_ACCEL_BUILD_TYPE_CLUSTERS_FROM_TRIANGLES builds and OPTIX_CLUSTER_ACCEL_BUILD_TYPE_TEMPLATES_FROM_TRIANGLES builds
+typedef struct OptixClusterAccelBuildInputTrianglesArgs
+{
+    /// 32-bit user-defined ID, for template creation acts as the baseClusterId and can be offset at template instantiation
+    /// (see OptixClusterAccelBuildInputTemplatesArgs::clusterIdOffset)
+    unsigned int clusterId;
+    /// Combination of OptixClusterAccelClusterFlags
+    unsigned int clusterFlags;
+
+    // Packing the following values into a single 32b value
+    /// Number of triangles for cluster / cluster template, max value can be queried
+    unsigned int triangleCount              : 9;
+    /// Number of vertices shared by triangles for cluster / cluster template, max value can be queried
+    unsigned int vertexCount                : 9;
+    /// Number of LSB in mantissa that are dropped (0 means don't drop any) for float32 positions. Other formats are first converted to float32 before dropping bits.
+    /// Builder will drop bits when building CLAS / instantiating cluster templates (no need to truncate the input before build).
+    unsigned int positionTruncateBitCount   : 6;
+    /// Can use OptixClusterAccelIndicesFormat as helper to set value: 1, 2, or 4 bytes-wide indices
+    unsigned int indexFormat                : 4;
+    /// Can use OptixClusterAccelIndicesFormat as helper to set value: 1, 2, or 4 bytes-wide indices
+    unsigned int opacityMicromapIndexFormat : 4;
+
+    /// Applied to all triangles in cluster. Additional per triangle flags can be specified in PrimitiveInfoBuffer.
+    OptixClusterAccelPrimitiveInfo basePrimitiveInfo;
+
+    /// Stride between elements in index buffer. Stride 0 -> natural stride
+    unsigned short indexBufferStrideInBytes;
+    /// Stride between elements in vertex buffer. Stride 0 -> natural stride
+    unsigned short vertexBufferStrideInBytes;
+    /// Stride between elements in primitive info buffer. Stride 0 -> natural stride
+    unsigned short primitiveInfoBufferStrideInBytes;
+    /// Stride between elements in omm index buffer. Stride 0 -> natural stride
+    unsigned short opacityMicromapIndexBufferStrideInBytes;
+
+    /// Triplets of vertex indices into vertexBuffer per triangle. Must contain 3 * triangleCount indices.
+    CUdeviceptr indexBuffer;
+    /// vertexBuffer is mandatory when using OPTIX_CLUSTER_ACCEL_BUILD_TYPE_CLUSTERS_FROM_TRIANGLES.
+    /// Optional with OPTIX_CLUSTER_ACCEL_BUILD_TYPE_TEMPLATES_FROM_TRIANGLES
+    /// and when specified provide example "hint" vertices for templates; actual vertices are specified at template instantiation.
+    /// It is typically useful to provide vertices for template creation in scenarios such as animation, where the relative locality
+    /// of vertices is expected to be similar between the template creation and instantiation.
+    CUdeviceptr vertexBuffer;
+    /// Optional, per primitive array of OptixClusterAccelPrimitiveInfo
+    CUdeviceptr primitiveInfoBuffer;
+    /// Optional, needs to be set if OMMs are used
+    CUdeviceptr opacityMicromapArray;
+    /// Optional, needs to be set if OMMs are used
+    CUdeviceptr opacityMicromapIndexBuffer;
+
+    /// Optional with OPTIX_CLUSTER_ACCEL_BUILD_TYPE_TEMPLATES_FROM_TRIANGLES, 32-byte-aligned pointer to OptixAabb, one per cluster,
+    /// limiting the extent of each cluster. Vertices provided for template instantiation must not be outside the bounding box.
+    /// Providing a bounding box may improve compression (reduced CLAS size) as well as trace performance.
+    /// Ignored for OPTIX_CLUSTER_ACCEL_BUILD_TYPE_CLUSTERS_FROM_TRIANGLES
+    CUdeviceptr instantiationBoundingBoxLimit;
+} OptixClusterAccelBuildInputTrianglesArgs;
+
+/// Device data, args provided for OPTIX_CLUSTER_ACCEL_BUILD_TYPE_TEMPLATES_FROM_GRIDS builds
+typedef struct OptixClusterAccelBuildInputGridsArgs
+{
+    /// 32-bit user-defined ID, serves as a base value for the template and can be offset at template instantiation
+    /// (see OptixClusterAccelBuildInputTemplatesArgs::clusterIdOffset)
+    unsigned int baseClusterId;
+    /// Combination of OptixClusterAccelClusterFlags
+    unsigned int clusterFlags;
+
+    /// Applied to all triangles in cluster
+    OptixClusterAccelPrimitiveInfo basePrimitiveInfo;
+
+    // Packing the following values into a single 32b value
+    /// See OptixClusterAccelBuildInputTrianglesArgs::positionTruncateBitCount
+    unsigned int positionTruncateBitCount :  6;
+    unsigned int reserved                 : 26;
+
+    // Packing the following values into a single 32b value
+    /// Resolution of the 2D grid, max value per dimension can be queried
+    unsigned char  dimensions[2];
+    unsigned short reserved2;
+} OptixClusterAccelBuildInputGridsArgs;
+
+/// Device data, args provided for OPTIX_CLUSTER_ACCEL_BUILD_TYPE_CLUSTERS_FROM_TEMPLATES builds
+typedef struct OptixClusterAccelBuildInputTemplatesArgs
+{
+    /// Offset applied to template baseClusterId, effective clusterId = clusterTemplate.baseClusterId + clusterIdOffset. Either may be 0.
+    unsigned int clusterIdOffset;
+
+    /// Offset to base sbtIndex from template creation (which may define a constant or per-triangle base sbtIndex),
+    /// final sbt index is also limited to fit into 24b
+    unsigned int sbtIndexOffset;
+
+    /// Opaque pointer to the template
+    CUdeviceptr  clusterTemplate;
+    /// The vertex data to use to instantiate the template; vertex order must match that of template creation.
+    /// For templates created from grids, see documentation.
+    CUdeviceptr  vertexBuffer;
+    /// Stride between elements in vertex buffer. Stride 0 -> natural stride
+    unsigned int vertexStrideInBytes;
+    unsigned int reserved;
+} OptixClusterAccelBuildInputTemplatesArgs;
+
+/// Device data, args provided for OPTIX_CLUSTER_ACCEL_BUILD_TYPE_GASES_FROM_CLUSTERS builds
+typedef struct OptixClusterAccelBuildInputClustersArgs
+{
+    /// Number of CLAS input to the BLAS build (size of the clusterHandles buffer)
+    unsigned int clusterHandlesCount;
+    unsigned int clusterHandlesBufferStrideInBytes;
+    /// The clusterHandlesBuffer can come directly from CLAS builds output via
+    ///  OptixClusterAccelBuildModeDescImplicitDest::outputHandlesBuffer or
+    ///  OptixClusterAccelBuildModeDescExplicitDest::outputHandlesBuffer
+    CUdeviceptr  clusterHandlesBuffer;
+} OptixClusterAccelBuildInputClustersArgs;
+
+typedef struct OptixClusterAccelBuildInput
+{
+    OptixClusterAccelBuildType type;
+
+    union
+    {
+        /// Used for OPTIX_CLUSTER_ACCEL_BUILD_TYPE_CLUSTERS_FROM_TRIANGLES, OPTIX_CLUSTER_ACCEL_BUILD_TYPE_TEMPLATES_FROM_TRIANGLES,
+        /// OPTIX_CLUSTER_ACCEL_BUILD_TYPE_CLUSTERS_FROM_TEMPLATES type builds
+        OptixClusterAccelBuildInputTriangles triangles;
+        /// Used for OPTIX_CLUSTER_ACCEL_BUILD_TYPE_GASES_FROM_CLUSTERS type builds
+        OptixClusterAccelBuildInputClusters  clusters;
+        /// Used for OPTIX_CLUSTER_ACCEL_BUILD_TYPE_TEMPLATES_FROM_GRIDS type builds
+        OptixClusterAccelBuildInputGrids     grids;
+    };
+} OptixClusterAccelBuildInput;
+
+typedef struct OptixClusterAccelBuildModeDesc
+{
+    OptixClusterAccelBuildMode mode;
+    union
+    {
+        OptixClusterAccelBuildModeDescImplicitDest implicitDest;
+        OptixClusterAccelBuildModeDescExplicitDest explicitDest;
+        OptixClusterAccelBuildModeDescGetSize      getSize;
+    };
+} OptixClusterAccelBuildModeDesc;
+
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+///// Denoiser
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+
+
+/// Pixel formats used by the denoiser.
+///
+/// \see #OptixImage2D::format
+typedef enum OptixPixelFormat
+{
+    OPTIX_PIXEL_FORMAT_HALF1  = 0x220a,               ///< one half
+    OPTIX_PIXEL_FORMAT_HALF2  = 0x2207,               ///< two halves, XY
+    OPTIX_PIXEL_FORMAT_HALF3  = 0x2201,               ///< three halves, RGB
+    OPTIX_PIXEL_FORMAT_HALF4  = 0x2202,               ///< four halves, RGBA
+    OPTIX_PIXEL_FORMAT_FLOAT1 = 0x220b,               ///< one float
+    OPTIX_PIXEL_FORMAT_FLOAT2 = 0x2208,               ///< two floats, XY
+    OPTIX_PIXEL_FORMAT_FLOAT3 = 0x2203,               ///< three floats, RGB
+    OPTIX_PIXEL_FORMAT_FLOAT4 = 0x2204,               ///< four floats, RGBA
+    OPTIX_PIXEL_FORMAT_UCHAR3 = 0x2205,               ///< three unsigned chars, RGB
+    OPTIX_PIXEL_FORMAT_UCHAR4 = 0x2206,               ///< four unsigned chars, RGBA
+    OPTIX_PIXEL_FORMAT_INTERNAL_GUIDE_LAYER = 0x2209  ///< internal format
+} OptixPixelFormat;
+
+/// Image descriptor used by the denoiser.
+///
+/// \see #optixDenoiserInvoke(), #optixDenoiserComputeIntensity()
+typedef struct OptixImage2D
+{
+    /// Pointer to the actual pixel data.
+    CUdeviceptr data;
+    /// Width of the image (in pixels)
+    unsigned int width;
+    /// Height of the image (in pixels)
+    unsigned int height;
+    /// Stride between subsequent rows of the image (in bytes).
+    unsigned int rowStrideInBytes;
+    /// Stride between subsequent pixels of the image (in bytes).
+    /// If set to 0, dense packing (no gaps) is assumed.
+    /// For pixel format OPTIX_PIXEL_FORMAT_INTERNAL_GUIDE_LAYER it must be set to
+    /// OptixDenoiserSizes::internalGuideLayerPixelSizeInBytes.
+    unsigned int pixelStrideInBytes;
+    /// Pixel format.
+    OptixPixelFormat format;
+} OptixImage2D;
+
+/// Model kind used by the denoiser.
+///
+/// \see #optixDenoiserCreate
+typedef enum OptixDenoiserModelKind
+{
+    /// Built-in model for denoising single image.
+    OPTIX_DENOISER_MODEL_KIND_AOV = 0x2324,
+
+    /// Built-in model for denoising image sequence, temporally stable.
+    OPTIX_DENOISER_MODEL_KIND_TEMPORAL_AOV = 0x2326,
+
+    /// Built-in model for denoising single image upscaling (supports AOVs).
+    OPTIX_DENOISER_MODEL_KIND_UPSCALE2X = 0x2327,
+
+    /// Built-in model for denoising image sequence upscaling, temporally stable (supports AOVs).
+    OPTIX_DENOISER_MODEL_KIND_TEMPORAL_UPSCALE2X = 0x2328,
+
+    /// Deprecated. Use OPTIX_DENOISER_MODEL_KIND_AOV.
+    /// When used, internally mapped to OPTIX_DENOISER_MODEL_KIND_AOV.
+    OPTIX_DENOISER_MODEL_KIND_LDR = 0x2322,
+    OPTIX_DENOISER_MODEL_KIND_HDR = 0x2323,
+
+    /// Deprecated. Use OPTIX_DENOISER_MODEL_KIND_TEMPORAL_AOV.
+    OPTIX_DENOISER_MODEL_KIND_TEMPORAL = 0x2325
+
+} OptixDenoiserModelKind;
+
+/// Alpha denoising mode
+///
+/// \see #optixDenoiserCreate()
+typedef enum OptixDenoiserAlphaMode
+{
+    /// Copy alpha (if present) from input layer, no denoising.
+    OPTIX_DENOISER_ALPHA_MODE_COPY = 0,
+
+    /// Denoise alpha.
+    OPTIX_DENOISER_ALPHA_MODE_DENOISE = 1
+} OptixDenoiserAlphaMode;
+
+/// Options used by the denoiser
+///
+/// \see #optixDenoiserCreate()
+typedef struct OptixDenoiserOptions
+{
+    /// If nonzero, an albedo image must be given in OptixDenoiserGuideLayer
+    unsigned int guideAlbedo;
+
+    /// If nonzero, a normal image must be given in OptixDenoiserGuideLayer
+    unsigned int guideNormal;
+
+    /// Alpha denoise mode
+    OptixDenoiserAlphaMode denoiseAlpha;
+} OptixDenoiserOptions;
+
+/// Guide layer for the denoiser
+///
+/// \see #optixDenoiserInvoke()
+typedef struct OptixDenoiserGuideLayer
+{
+    /// Image with three components: R, G, B.
+    OptixImage2D  albedo;
+
+    /// Image with two or three components: X, Y, Z.
+    /// (X, Y) camera space for OPTIX_DENOISER_MODEL_KIND_LDR, OPTIX_DENOISER_MODEL_KIND_HDR models.
+    /// (X, Y, Z) world space, all other models.
+    OptixImage2D  normal;
+
+    /// Image with two components: X, Y.
+    /// Pixel movement from previous to current frame for each pixel in screen space.
+    OptixImage2D  flow;
+
+    /// Internal images used in temporal AOV denoising modes,
+    /// pixel format OPTIX_PIXEL_FORMAT_INTERNAL_GUIDE_LAYER.
+    OptixImage2D  previousOutputInternalGuideLayer;
+    OptixImage2D  outputInternalGuideLayer;
+
+    /// Image with a single component value that specifies how trustworthy the flow vector at x,y position in
+    /// OptixDenoiserGuideLayer::flow is. Range 0..1 (low->high trustworthiness).
+    /// Ignored if data pointer in the image is zero.
+    OptixImage2D flowTrustworthiness;
+
+} OptixDenoiserGuideLayer;
+
+/// AOV type used by the denoiser
+///
+typedef enum OptixDenoiserAOVType
+{
+    /// Unspecified AOV type
+    OPTIX_DENOISER_AOV_TYPE_NONE       = 0,
+
+    OPTIX_DENOISER_AOV_TYPE_BEAUTY     = 0x7000,
+    OPTIX_DENOISER_AOV_TYPE_SPECULAR   = 0x7001,
+    OPTIX_DENOISER_AOV_TYPE_REFLECTION = 0x7002,
+    OPTIX_DENOISER_AOV_TYPE_REFRACTION = 0x7003,
+    OPTIX_DENOISER_AOV_TYPE_DIFFUSE    = 0x7004
+
+} OptixDenoiserAOVType;
+
+/// Input/Output layers for the denoiser
+///
+/// \see #optixDenoiserInvoke()
+typedef struct OptixDenoiserLayer
+{
+    /// Input image (beauty or AOV)
+    OptixImage2D  input;
+
+    /// Denoised output image from previous frame if temporal model kind selected
+    OptixImage2D  previousOutput;
+
+    /// Denoised output for given input
+    OptixImage2D  output;
+
+    /// Type of AOV, used in temporal AOV modes as a hint to improve image quality.
+    OptixDenoiserAOVType type;
+} OptixDenoiserLayer;
+
+/// Various parameters used by the denoiser
+///
+/// \see #optixDenoiserInvoke()
+/// \see #optixDenoiserComputeIntensity()
+/// \see #optixDenoiserComputeAverageColor()
+
+typedef struct OptixDenoiserParams
+{
+    /// average log intensity of input image (default null pointer). points to a single float.
+    /// if set to null, autoexposure will be calculated automatically for the input image.
+    /// Should be set to average log intensity of the entire image at least if tiling is used to
+    /// get consistent autoexposure for all tiles.
+    CUdeviceptr  hdrIntensity;
+
+    /// blend factor.
+    /// If set to 0 the output is 100% of the denoised input. If set to 1, the output is 100% of
+    /// the unmodified input. Values between 0 and 1 will linearly interpolate between the denoised
+    /// and unmodified input.
+    float        blendFactor;
+
+    /// this parameter is used when the OPTIX_DENOISER_MODEL_KIND_AOV model kind is set.
+    /// average log color of input image, separate for RGB channels (default null pointer).
+    /// points to three floats.
+    /// if set to null, average log color will be calculated automatically. See hdrIntensity for tiling,
+    /// this also applies here.
+    CUdeviceptr  hdrAverageColor;
+
+    /// In temporal modes this parameter must be set to 1 if previous layers (e.g.
+    /// previousOutputInternalGuideLayer) contain valid data. This is the case in the
+    /// second and subsequent frames of a sequence (for example after a change of camera
+    /// angle). In the first frame of such a sequence this parameter must be set to 0.
+    unsigned int temporalModeUsePreviousLayers;
+
+    /// Multiplication factors for motion vectors (flow guide layer).
+    /// When set to zero, motion vectors are not scaled.
+    float        flowMulX;
+    float        flowMulY;
+} OptixDenoiserParams;
+
+/// Various sizes related to the denoiser.
+///
+/// \see #optixDenoiserComputeMemoryResources()
+typedef struct OptixDenoiserSizes
+{
+    /// Size of state memory passed to #optixDenoiserSetup, #optixDenoiserInvoke.
+    size_t stateSizeInBytes;
+
+    /// Size of scratch memory passed to #optixDenoiserSetup, #optixDenoiserInvoke.
+    /// Overlap added to dimensions passed to #optixDenoiserComputeMemoryResources.
+    size_t withOverlapScratchSizeInBytes;
+
+    /// Size of scratch memory passed to #optixDenoiserSetup, #optixDenoiserInvoke.
+    /// No overlap added.
+    size_t withoutOverlapScratchSizeInBytes;
+
+    /// Overlap on all four tile sides.
+    unsigned int overlapWindowSizeInPixels;
+
+    /// Size of scratch memory passed to #optixDenoiserComputeAverageColor.
+    /// The size is independent of the tile/image resolution.
+    size_t computeAverageColorSizeInBytes;
+
+    /// Size of scratch memory passed to #optixDenoiserComputeIntensity.
+    /// The size is independent of the tile/image resolution.
+    size_t computeIntensitySizeInBytes;
+
+    /// Number of bytes for each pixel in internal guide layers.
+    size_t internalGuideLayerPixelSizeInBytes;
+} OptixDenoiserSizes;
+
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+///// Traversal and Module/Pipeline/SBT
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+
+
+/// Ray flags passed to the device function #optixTrace().  These affect the behavior of
+/// traversal per invocation.
+///
+/// \see #optixTrace()
+typedef enum OptixRayFlags
+{
+    /// No change from the behavior configured for the individual AS.
+    OPTIX_RAY_FLAG_NONE = 0u,
+
+    /// Disables anyhit programs for the ray.
+    /// Overrides OPTIX_INSTANCE_FLAG_ENFORCE_ANYHIT.
+    /// This flag is mutually exclusive with OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+    /// OPTIX_RAY_FLAG_CULL_DISABLED_ANYHIT, OPTIX_RAY_FLAG_CULL_ENFORCED_ANYHIT.
+    OPTIX_RAY_FLAG_DISABLE_ANYHIT = 1u << 0,
+
+    /// Forces anyhit program execution for the ray.
+    /// Overrides OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT as well as OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT.
+    /// This flag is mutually exclusive with OPTIX_RAY_FLAG_DISABLE_ANYHIT,
+    /// OPTIX_RAY_FLAG_CULL_DISABLED_ANYHIT, OPTIX_RAY_FLAG_CULL_ENFORCED_ANYHIT.
+    OPTIX_RAY_FLAG_ENFORCE_ANYHIT = 1u << 1,
+
+    /// Terminates the ray after the first hit and executes
+    /// the closesthit program of that hit.
+    OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT = 1u << 2,
+
+    /// Disables closesthit programs for the ray, but still executes miss program in case of a miss.
+    OPTIX_RAY_FLAG_DISABLE_CLOSESTHIT = 1u << 3,
+
+    /// Do not intersect triangle back faces
+    /// (respects a possible face change due to instance flag
+    /// OPTIX_INSTANCE_FLAG_FLIP_TRIANGLE_FACING).
+    /// This flag is mutually exclusive with OPTIX_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES.
+    OPTIX_RAY_FLAG_CULL_BACK_FACING_TRIANGLES = 1u << 4,
+
+    /// Do not intersect triangle front faces
+    /// (respects a possible face change due to instance flag
+    /// OPTIX_INSTANCE_FLAG_FLIP_TRIANGLE_FACING).
+    /// This flag is mutually exclusive with OPTIX_RAY_FLAG_CULL_BACK_FACING_TRIANGLES.
+    OPTIX_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES = 1u << 5,
+
+    /// Do not intersect geometry which disables anyhit programs
+    /// (due to setting geometry flag OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT or
+    /// instance flag OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT).
+    /// This flag is mutually exclusive with OPTIX_RAY_FLAG_CULL_ENFORCED_ANYHIT,
+    /// OPTIX_RAY_FLAG_ENFORCE_ANYHIT, OPTIX_RAY_FLAG_DISABLE_ANYHIT.
+    OPTIX_RAY_FLAG_CULL_DISABLED_ANYHIT = 1u << 6,
+
+    /// Do not intersect geometry which have an enabled anyhit program
+    /// (due to not setting geometry flag OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT or
+    /// setting instance flag OPTIX_INSTANCE_FLAG_ENFORCE_ANYHIT).
+    /// This flag is mutually exclusive with OPTIX_RAY_FLAG_CULL_DISABLED_ANYHIT,
+    /// OPTIX_RAY_FLAG_ENFORCE_ANYHIT, OPTIX_RAY_FLAG_DISABLE_ANYHIT.
+    OPTIX_RAY_FLAG_CULL_ENFORCED_ANYHIT = 1u << 7,
+
+    /// Force 4-state opacity micromaps to behave as 2-state opacity micromaps during traversal.
+    OPTIX_RAY_FLAG_FORCE_OPACITY_MICROMAP_2_STATE = 1u << 10,
+} OptixRayFlags;
+
+/// Transform
+///
+/// OptixTransformType is used by the device function #optixGetTransformTypeFromHandle() to
+/// determine the type of the OptixTraversableHandle returned from
+/// optixGetTransformListHandle().
+typedef enum OptixTransformType
+{
+    OPTIX_TRANSFORM_TYPE_NONE                    = 0, ///< Not a transformation
+    OPTIX_TRANSFORM_TYPE_STATIC_TRANSFORM        = 1, ///< \see #OptixStaticTransform
+    OPTIX_TRANSFORM_TYPE_MATRIX_MOTION_TRANSFORM = 2, ///< \see #OptixMatrixMotionTransform
+    OPTIX_TRANSFORM_TYPE_SRT_MOTION_TRANSFORM    = 3, ///< \see #OptixSRTMotionTransform
+    OPTIX_TRANSFORM_TYPE_INSTANCE                = 4, ///< \see #OptixInstance
+} OptixTransformType;
+
+/// Hit Object
+/// Struct to store the data collected in a hit object during traversal in an internal format
+/// using \c optixHitObjectGetTraverseData().
+/// The hit object can be reconstructed using that data at a later point with
+/// \c optixMakeHitObjectWithTraverseData().
+typedef struct OptixTraverseData
+{
+    unsigned int data[20];
+} OptixTraverseData;
+
+/// Specifies the set of valid traversable graphs that may be
+/// passed to invocation of #optixTrace(). Flags may be bitwise combined.
+typedef enum OptixTraversableGraphFlags
+{
+    ///  Used to signal that any traversable graph is valid.
+    ///  This flag is mutually exclusive with all other flags.
+    OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY = 0,
+
+    ///  Used to signal that a traversable graph of a single Geometry Acceleration
+    ///  Structure (GAS) without any transforms is valid. This flag may be combined with
+    ///  other flags except for OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY.
+    OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_GAS = 1u << 0,
+
+    ///  Used to signal that a traversable graph of a single Instance Acceleration
+    ///  Structure (IAS) directly connected to Geometry Acceleration Structure (GAS)
+    ///  traversables without transform traversables in between is valid.  This flag may
+    ///  be combined with other flags except for OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY.
+    OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING = 1u << 1,
+} OptixTraversableGraphFlags;
+
+/// Optimization levels
+///
+/// \see #OptixModuleCompileOptions::optLevel
+typedef enum OptixCompileOptimizationLevel
+{
+    /// Default is to run all optimizations
+    OPTIX_COMPILE_OPTIMIZATION_DEFAULT    = 0,
+    /// No optimizations
+    OPTIX_COMPILE_OPTIMIZATION_LEVEL_0    = 0x2340,
+    /// Some optimizations
+    OPTIX_COMPILE_OPTIMIZATION_LEVEL_1    = 0x2341,
+    /// Most optimizations
+    OPTIX_COMPILE_OPTIMIZATION_LEVEL_2    = 0x2342,
+    /// All optimizations
+    OPTIX_COMPILE_OPTIMIZATION_LEVEL_3    = 0x2343,
+} OptixCompileOptimizationLevel;
+
+/// Debug levels
+///
+/// \see #OptixModuleCompileOptions::debugLevel
+typedef enum OptixCompileDebugLevel
+{
+    /// Default currently is minimal
+    OPTIX_COMPILE_DEBUG_LEVEL_DEFAULT  = 0,
+    /// No debug information
+    OPTIX_COMPILE_DEBUG_LEVEL_NONE     = 0x2350,
+    /// Generate information that does not impact performance.
+    /// Note this replaces OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO.
+    OPTIX_COMPILE_DEBUG_LEVEL_MINIMAL  = 0x2351,
+    /// Generate some debug information with slight performance cost
+    OPTIX_COMPILE_DEBUG_LEVEL_MODERATE = 0x2353,
+    /// Generate full debug information
+    OPTIX_COMPILE_DEBUG_LEVEL_FULL     = 0x2352,
+} OptixCompileDebugLevel;
+
+/// Module compilation state.
+///
+/// \see #optixModuleGetCompilationState(), #optixModuleCreateWithTasks()
+typedef enum OptixModuleCompileState
+{
+    /// No OptixTask objects have started
+    OPTIX_MODULE_COMPILE_STATE_NOT_STARTED       = 0x2360,
+
+    /// Started, but not all OptixTask objects have completed. No detected failures.
+    OPTIX_MODULE_COMPILE_STATE_STARTED           = 0x2361,
+
+    /// Not all OptixTask objects have completed, but at least one has failed.
+    OPTIX_MODULE_COMPILE_STATE_IMPENDING_FAILURE = 0x2362,
+
+    /// All OptixTask objects have completed, and at least one has failed
+    OPTIX_MODULE_COMPILE_STATE_FAILED            = 0x2363,
+
+    /// All OptixTask objects have completed. The OptixModule is ready to be used.
+    OPTIX_MODULE_COMPILE_STATE_COMPLETED         = 0x2364,
+} OptixModuleCompileState;
+
+/// Flags for canceling the creation of an OptiX object.
+///
+/// If OPTIX_CREATION_FLAG_BLOCK_UNTIL_EFFECTIVE is set, the calling thread will block until one of these conditions is met:
+/// 1. All executing object creation threads have processed the new state
+/// 2. The creation of the object has finished, in which case the new state will be ignored
+///
+/// If OPTIX_CREATION_FLAG_BLOCK_UNTIL_EFFECTIVE is not set, any *CancelCreation* call will return without blocking.
+/// Note that the cancel request may still be ignored if all creation threads finish their tasks before they can process the new state.
+///
+/// \see #optixModuleCancelCreation(), #optixPipelineCancelCreations(), #optixDeviceContextCancelCreations()
+typedef enum OptixCreationFlags
+{
+    OPTIX_CREATION_FLAG_NONE                  = 0,
+    OPTIX_CREATION_FLAG_BLOCK_UNTIL_EFFECTIVE = 1 << 0,
+} OptixCreationFlags;
+
+
+/// Struct for specifying specializations for pipelineParams as specified in
+/// OptixPipelineCompileOptions::pipelineLaunchParamsVariableName.
+///
+/// The bound values are supposed to represent a constant value in the
+/// pipelineParams. OptiX will attempt to locate all loads from the pipelineParams and
+/// correlate them to the appropriate bound value, but there are cases where OptiX cannot
+/// safely or reliably do this. For example if the pointer to the pipelineParams is passed
+/// as an argument to a non-inline function or the offset of the load to the
+/// pipelineParams cannot be statically determined (e.g. accessed in a loop). No module
+/// should rely on the value being specialized in order to work correctly.  The values in
+/// the pipelineParams specified on optixLaunch should match the bound value. If
+/// validation mode is enabled on the context, OptiX will verify that the bound values
+/// specified matches the values in pipelineParams specified to optixLaunch.
+///
+/// These values are compiled into the module as constants. Once the constants are
+/// inserted into the code, an optimization pass will be run that will attempt to
+/// propagate the constants and remove unreachable code.
+///
+/// If caching is enabled, changes in these values will result in newly compiled modules.
+///
+/// The pipelineParamOffset and sizeInBytes must be within the bounds of the
+/// pipelineParams variable. OPTIX_ERROR_INVALID_VALUE will be returned from
+/// optixModuleCreate otherwise.
+///
+/// If more than one bound value overlaps or the size of a bound value is equal to 0,
+/// an OPTIX_ERROR_INVALID_VALUE will be returned from optixModuleCreate.
+///
+/// The same set of bound values does not need to be used for all modules in a pipeline, but
+/// overlapping values between modules must have the same value.
+/// OPTIX_ERROR_INVALID_VALUE will be returned from optixPipelineCreate otherwise.
+///
+/// \see #OptixModuleCompileOptions
+typedef struct OptixModuleCompileBoundValueEntry {
+    size_t pipelineParamOffsetInBytes;
+    size_t sizeInBytes;
+    const void* boundValuePtr;
+    const char* annotation; // optional string to display, set to 0 if unused.  If unused,
+                            // OptiX will report the annotation as "No annotation"
+} OptixModuleCompileBoundValueEntry;
+
+/// Payload type identifiers.
+typedef enum OptixPayloadTypeID {
+    OPTIX_PAYLOAD_TYPE_DEFAULT = 0,
+    OPTIX_PAYLOAD_TYPE_ID_0 = (1 << 0u),
+    OPTIX_PAYLOAD_TYPE_ID_1 = (1 << 1u),
+    OPTIX_PAYLOAD_TYPE_ID_2 = (1 << 2u),
+    OPTIX_PAYLOAD_TYPE_ID_3 = (1 << 3u),
+    OPTIX_PAYLOAD_TYPE_ID_4 = (1 << 4u),
+    OPTIX_PAYLOAD_TYPE_ID_5 = (1 << 5u),
+    OPTIX_PAYLOAD_TYPE_ID_6 = (1 << 6u),
+    OPTIX_PAYLOAD_TYPE_ID_7 = (1 << 7u)
+} OptixPayloadTypeID;
+
+/// Semantic flags for a single payload word.
+///
+/// Used to specify the semantics of a payload word per shader type.
+/// "read":  Shader of this type may read the payload word.
+/// "write": Shader of this type may write the payload word.
+///
+/// "trace_caller_write": Shaders may consume the value of the payload word passed to optixTrace by the caller.
+/// "trace_caller_read": The caller to optixTrace may read the payload word after the call to optixTrace.
+///
+/// Semantics can be bitwise combined.
+/// Combining "read" and "write" is equivalent to specifying "read_write".
+/// A payload needs to be writable by the caller or at least one shader type.
+/// A payload needs to be readable by the caller or at least one shader type after being writable.
+typedef enum OptixPayloadSemantics
+{
+    OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_NONE       = 0,
+    OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_READ       = 1u << 0,
+    OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_WRITE      = 2u << 0,
+    OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_READ_WRITE = 3u << 0,
+
+    OPTIX_PAYLOAD_SEMANTICS_CH_NONE                 = 0,
+    OPTIX_PAYLOAD_SEMANTICS_CH_READ                 = 1u << 2,
+    OPTIX_PAYLOAD_SEMANTICS_CH_WRITE                = 2u << 2,
+    OPTIX_PAYLOAD_SEMANTICS_CH_READ_WRITE           = 3u << 2,
+
+    OPTIX_PAYLOAD_SEMANTICS_MS_NONE                 = 0,
+    OPTIX_PAYLOAD_SEMANTICS_MS_READ                 = 1u << 4,
+    OPTIX_PAYLOAD_SEMANTICS_MS_WRITE                = 2u << 4,
+    OPTIX_PAYLOAD_SEMANTICS_MS_READ_WRITE           = 3u << 4,
+
+    OPTIX_PAYLOAD_SEMANTICS_AH_NONE                 = 0,
+    OPTIX_PAYLOAD_SEMANTICS_AH_READ                 = 1u << 6,
+    OPTIX_PAYLOAD_SEMANTICS_AH_WRITE                = 2u << 6,
+    OPTIX_PAYLOAD_SEMANTICS_AH_READ_WRITE           = 3u << 6,
+
+    OPTIX_PAYLOAD_SEMANTICS_IS_NONE                 = 0,
+    OPTIX_PAYLOAD_SEMANTICS_IS_READ                 = 1u << 8,
+    OPTIX_PAYLOAD_SEMANTICS_IS_WRITE                = 2u << 8,
+    OPTIX_PAYLOAD_SEMANTICS_IS_READ_WRITE           = 3u << 8,
+} OptixPayloadSemantics;
+
+/// Specifies a single payload type
+typedef struct OptixPayloadType
+{
+    /// The number of 32b words the payload of this type holds
+    unsigned int numPayloadValues;
+
+    /// Points to host array of payload word semantics, size must match numPayloadValues
+    const unsigned int *payloadSemantics;
+} OptixPayloadType;
+
+/// Compilation options for module
+///
+/// \see #optixModuleCreate()
+typedef struct OptixModuleCompileOptions
+{
+    /// Maximum number of registers allowed when compiling to SASS.
+    /// Set to 0 for no explicit limit. May vary within a pipeline.
+    int maxRegisterCount;
+
+    /// Optimization level. May vary within a pipeline.
+    OptixCompileOptimizationLevel optLevel;
+
+    /// Generate debug information.
+    OptixCompileDebugLevel debugLevel;
+
+    /// Ignored if numBoundValues is set to 0
+    const OptixModuleCompileBoundValueEntry* boundValues;
+
+    /// set to 0 if unused
+    unsigned int numBoundValues;
+
+    /// The number of different payload types available for compilation.
+    /// Must be zero if OptixPipelineCompileOptions::numPayloadValues is not zero.
+    unsigned int numPayloadTypes;
+
+    /// Points to host array of payload type definitions, size must match numPayloadTypes
+    const OptixPayloadType* payloadTypes;
+
+    /// If not \c nullptr, pointer to the base module for potential specialization.
+    OptixModule baseModule;
+} OptixModuleCompileOptions;
+
+/// Specifies the options for retrieving an intersection program for a built-in primitive type.
+/// The primitive type must not be OPTIX_PRIMITIVE_TYPE_CUSTOM.
+///
+/// \see #optixBuiltinISModuleGet()
+typedef struct OptixBuiltinISOptions
+{
+    OptixPrimitiveType        builtinISModuleType;
+    /// Boolean value indicating whether vertex motion blur is used (but not motion transform blur).
+    int                       usesMotionBlur;
+    /// Build flags, see OptixBuildFlags.
+    unsigned int              buildFlags;
+    /// End cap properties of curves, see OptixCurveEndcapFlags, 0 for non-curve types.
+    unsigned int              curveEndcapFlags;
+} OptixBuiltinISOptions;
+
+/// Distinguishes different kinds of program groups.
+typedef enum OptixProgramGroupKind
+{
+    /// Program group containing a raygen (RG) program
+    /// \see #OptixProgramGroupSingleModule, #OptixProgramGroupDesc::raygen
+    OPTIX_PROGRAM_GROUP_KIND_RAYGEN = 0x2421,
+
+    /// Program group containing a miss (MS) program
+    /// \see #OptixProgramGroupSingleModule, #OptixProgramGroupDesc::miss
+    OPTIX_PROGRAM_GROUP_KIND_MISS = 0x2422,
+
+    /// Program group containing an exception (EX) program
+    /// \see #OptixProgramGroupSingleModule, #OptixProgramGroupDesc::exception
+    OPTIX_PROGRAM_GROUP_KIND_EXCEPTION = 0x2423,
+
+    /// Program group containing an intersection (IS), any hit (AH), and/or closest hit (CH) program
+    /// \see #OptixProgramGroupSingleModule, #OptixProgramGroupDesc::hitgroup
+    OPTIX_PROGRAM_GROUP_KIND_HITGROUP = 0x2424,
+
+    /// Program group containing a direct (DC) or continuation (CC) callable program
+    /// \see OptixProgramGroupCallables, #OptixProgramGroupDesc::callables
+    OPTIX_PROGRAM_GROUP_KIND_CALLABLES = 0x2425
+} OptixProgramGroupKind;
+
+/// Flags for program groups
+typedef enum OptixProgramGroupFlags
+{
+    /// Currently there are no flags
+    OPTIX_PROGRAM_GROUP_FLAGS_NONE = 0
+} OptixProgramGroupFlags;
+
+/// Program group representing a single module.
+///
+/// Used for raygen, miss, and exception programs. In case of raygen and exception programs, module and entry
+/// function name need to be valid. For miss programs, module and entry function name might both be \c nullptr.
+///
+/// \see #OptixProgramGroupDesc::raygen, #OptixProgramGroupDesc::miss, #OptixProgramGroupDesc::exception
+typedef struct OptixProgramGroupSingleModule
+{
+    /// Module holding single program.
+    OptixModule module;
+    /// Entry function name of the single program.
+    const char* entryFunctionName;
+} OptixProgramGroupSingleModule;
+
+/// Program group representing the hitgroup.
+///
+/// For each of the three program types, module and entry function name might both be \c nullptr.
+///
+/// \see #OptixProgramGroupDesc::hitgroup
+typedef struct OptixProgramGroupHitgroup
+{
+    /// Module holding the closest hit (CH) program.
+    OptixModule moduleCH;
+    /// Entry function name of the closest hit (CH) program.
+    const char* entryFunctionNameCH;
+    /// Module holding the any hit (AH) program.
+    OptixModule moduleAH;
+    /// Entry function name of the any hit (AH) program.
+    const char* entryFunctionNameAH;
+    /// Module holding the intersection (IS) program.
+    OptixModule moduleIS;
+    /// Entry function name of the intersection (IS) program.
+    const char* entryFunctionNameIS;
+} OptixProgramGroupHitgroup;
+
+/// Program group representing callables.
+///
+/// Module and entry function name need to be valid for at least one of the two callables.
+///
+/// \see #OptixProgramGroupDesc::callables
+typedef struct OptixProgramGroupCallables
+{
+    /// Module holding the direct callable (DC) program.
+    OptixModule moduleDC;
+    /// Entry function name of the direct callable (DC) program.
+    const char* entryFunctionNameDC;
+    /// Module holding the continuation callable (CC) program.
+    OptixModule moduleCC;
+    /// Entry function name of the continuation callable (CC) program.
+    const char* entryFunctionNameCC;
+} OptixProgramGroupCallables;
+
+/// Descriptor for program groups.
+typedef struct OptixProgramGroupDesc
+{
+    /// The kind of program group.
+    OptixProgramGroupKind kind;
+
+    /// See #OptixProgramGroupFlags
+    unsigned int flags;
+
+    union
+    {
+        /// \see #OPTIX_PROGRAM_GROUP_KIND_HITGROUP
+        OptixProgramGroupHitgroup hitgroup;
+        /// \see #OPTIX_PROGRAM_GROUP_KIND_RAYGEN
+        OptixProgramGroupSingleModule raygen;
+        /// \see #OPTIX_PROGRAM_GROUP_KIND_MISS
+        OptixProgramGroupSingleModule miss;
+        /// \see #OPTIX_PROGRAM_GROUP_KIND_EXCEPTION
+        OptixProgramGroupSingleModule exception;
+        /// \see #OPTIX_PROGRAM_GROUP_KIND_CALLABLES
+        OptixProgramGroupCallables callables;
+    };
+} OptixProgramGroupDesc;
+
+/// Program group options
+///
+/// \see #optixProgramGroupCreate()
+typedef struct OptixProgramGroupOptions
+{
+    /// Specifies the payload type of this program group.
+    /// All programs in the group must support the payload type
+    /// (Program support for a type is specified by calling
+    /// \see #optixSetPayloadTypes or otherwise all types specified in
+    /// \see #OptixModuleCompileOptions are supported).
+    /// If a program is not available for the requested payload type,
+    /// optixProgramGroupCreate returns OPTIX_ERROR_PAYLOAD_TYPE_MISMATCH.
+    /// If the payloadType is left zero, a unique type is deduced.
+    /// The payload type can be uniquely deduced if there is exactly one payload type
+    /// for which all programs in the group are available.
+    /// If the payload type could not be deduced uniquely
+    /// optixProgramGroupCreate returns OPTIX_ERROR_PAYLOAD_TYPE_RESOLUTION_FAILED.
+    const OptixPayloadType* payloadType;
+} OptixProgramGroupOptions;
+
+/// The following values are used to indicate which exception was thrown.
+typedef enum OptixExceptionCodes
+{
+    /// Stack overflow of the continuation stack.
+    /// no exception details.
+    OPTIX_EXCEPTION_CODE_STACK_OVERFLOW = -1,
+
+    /// The trace depth is exceeded.
+    /// no exception details.
+    OPTIX_EXCEPTION_CODE_TRACE_DEPTH_EXCEEDED = -2,
+
+
+} OptixExceptionCodes;
+
+/// Exception flags.
+///
+/// \see #OptixPipelineCompileOptions::exceptionFlags, #OptixExceptionCodes
+typedef enum OptixExceptionFlags
+{
+    /// No exceptions are enabled.
+    OPTIX_EXCEPTION_FLAG_NONE = 0,
+
+    /// Enables exceptions check related to the continuation stack.
+    /// This flag should be used when the application handles stack overflows
+    /// in a user exception program as part of the normal flow of execution.
+    /// For catching overflows during debugging and development, the
+    /// device context validation mode should be used instead.
+    /// \see OptixDeviceContextValidationMode
+    OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW = 1u << 0,
+
+    /// Enables exceptions check related to trace depth.
+    /// This flag should be used when the application handles trace depth overflows
+    /// in a user exception program as part of the normal flow of execution.
+    /// For catching overflows during debugging and development, the
+    /// device context validation mode should be used instead.
+    /// \see OptixDeviceContextValidationMode
+    OPTIX_EXCEPTION_FLAG_TRACE_DEPTH = 1u << 1,
+
+    /// Enables user exceptions via optixThrowException(). This flag must be specified for all modules in a pipeline
+    /// if any module calls optixThrowException().
+    OPTIX_EXCEPTION_FLAG_USER = 1u << 2,
+
+} OptixExceptionFlags;
+
+/// Compilation options for all modules of a pipeline.
+///
+/// Similar to #OptixModuleCompileOptions, but these options here need to be equal for all modules of a pipeline.
+///
+/// \see #optixModuleCreate(), #optixPipelineCreate()
+typedef struct OptixPipelineCompileOptions
+{
+    /// Boolean value indicating whether motion blur could be used
+    int usesMotionBlur;
+
+    /// Traversable graph bitfield. See OptixTraversableGraphFlags
+    unsigned int traversableGraphFlags;
+
+    /// How much storage, in 32b words, to make available for the payload, [0..32]
+    /// Must be zero if numPayloadTypes is not zero.
+    int numPayloadValues;
+
+    /// How much storage, in 32b words, to make available for the attributes. The
+    /// minimum number is 2. Values below that will automatically be changed to 2. [2..8]
+    int numAttributeValues;
+
+    /// A bitmask of OptixExceptionFlags indicating which exceptions are enabled.
+    unsigned int exceptionFlags;
+
+    /// The name of the pipeline parameter variable. If 0, no pipeline parameter
+    /// will be available. This will be ignored if the launch param variable was
+    /// optimized out or was not found in the modules linked to the pipeline.
+    const char* pipelineLaunchParamsVariableName;
+
+    /// Size of the variable pointed to by pipelineLaunchParamsVariableName. It will be a
+    /// compiler error if the size of the variable pointed to by
+    /// pipelineLaunchParamsVariableName is not equal to this size.
+    size_t pipelineLaunchParamsSizeInBytes;
+
+    /// Bit field enabling primitive types. See OptixPrimitiveTypeFlags.
+    /// Setting to zero corresponds to enabling OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM and OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE.
+    unsigned int usesPrimitiveTypeFlags;
+
+    /// Boolean value indicating whether opacity micromaps may be used
+    int allowOpacityMicromaps;
+
+    /// Boolean value indicating whether clusters (cluster acceleration structures) may be used.
+    /// This value MUST be set if clusters are present in the BVH, otherwise validation will return an error.
+    int allowClusteredGeometry;
+} OptixPipelineCompileOptions;
+
+/// Link options for a pipeline
+///
+/// \see #optixPipelineCreate()
+typedef struct OptixPipelineLinkOptions
+{
+    /// Maximum trace recursion depth. 0 means a ray generation program can be
+    /// launched, but can't trace any rays. The maximum allowed value is 31.
+    unsigned int maxTraceDepth;
+
+    /// Maximum depth of continuation callable call graphs. 0 means that continuation callables
+    /// will not take part in the stack size calculation and can most likely not be called.
+    unsigned int maxContinuationCallableDepth;
+    /// Maximum depth of direct callable call graphs called from raygen, closesthit, miss or continuation callable programs.
+    /// 0 means that direct callables will not take part in the default stack size calculation for that part of the pipeline
+    /// and can not be called from the programs mentioned above if the callable needs any stack.
+    unsigned int maxDirectCallableDepthFromState;
+    /// Maximum depth of direct callable call graphs called from intersect or anyhit programs.
+    /// 0 means that direct callables will not take part in the default stack size calculation for that part of the pipeline
+    /// and can not be called from the programs mentioned above if the callable needs any stack.
+    unsigned int maxDirectCallableDepthFromTraversal;
+
+    /// The maximum depth of a traversable graph passed to trace.
+    /// 0 means to take a default value based on the traversableGraphFlags passed to
+    /// OptixPipelineCompileOptions::traversableGraphFlags:
+    /// OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_GAS means to take 1, otherwise 2 will be taken.
+    unsigned int maxTraversableGraphDepth;
+} OptixPipelineLinkOptions;
+
+/// Describes the shader binding table (SBT)
+///
+/// \see #optixLaunch()
+typedef struct OptixShaderBindingTable
+{
+    /// Device address of the SBT record of the ray gen program to start launch at. The address must be a multiple of
+    /// OPTIX_SBT_RECORD_ALIGNMENT.
+    CUdeviceptr raygenRecord;
+
+    /// Device address of the SBT record of the exception program. The address must be a multiple of
+    /// OPTIX_SBT_RECORD_ALIGNMENT.
+    CUdeviceptr exceptionRecord;
+
+    /// Arrays of SBT records for miss programs. The base address and the stride must be a multiple of
+    /// OPTIX_SBT_RECORD_ALIGNMENT.
+    /// @{
+    CUdeviceptr  missRecordBase;
+    unsigned int missRecordStrideInBytes;
+    unsigned int missRecordCount;
+    /// @}
+
+    /// Arrays of SBT records for hit groups. The base address and the stride must be a multiple of
+    /// OPTIX_SBT_RECORD_ALIGNMENT.
+    /// @{
+    CUdeviceptr  hitgroupRecordBase;
+    unsigned int hitgroupRecordStrideInBytes;
+    unsigned int hitgroupRecordCount;
+    /// @}
+
+    /// Arrays of SBT records for callable programs. If the base address is not null, the stride and count must not be
+    /// zero. If the base address is null, then the count needs to be zero. The base address and the stride must be a
+    /// multiple of OPTIX_SBT_RECORD_ALIGNMENT.
+    /// @{
+    CUdeviceptr  callablesRecordBase;
+    unsigned int callablesRecordStrideInBytes;
+    unsigned int callablesRecordCount;
+    /// @}
+
+} OptixShaderBindingTable;
+
+/// Describes the stack size requirements of a program group.
+///
+/// \see optixProgramGroupGetStackSize()
+typedef struct OptixStackSizes
+{
+    /// Continuation stack size of RG programs in bytes
+    unsigned int cssRG;
+    /// Continuation stack size of MS programs in bytes
+    unsigned int cssMS;
+    /// Continuation stack size of CH programs in bytes
+    unsigned int cssCH;
+    /// Continuation stack size of AH programs in bytes
+    unsigned int cssAH;
+    /// Continuation stack size of IS programs in bytes
+    unsigned int cssIS;
+    /// Continuation stack size of CC programs in bytes
+    unsigned int cssCC;
+    /// Direct stack size of DC programs in bytes
+    unsigned int dssDC;
+
+} OptixStackSizes;
+
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+///// Cooperative Vector
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+
+/// Flags used to interpret the result of #optixDeviceContextGetProperty() and
+/// OPTIX_DEVICE_PROPERTY_COOP_VEC
+///
+/// \see #optixDeviceContextGetProperty()
+typedef enum OptixDevicePropertyCoopVecFlags
+{
+    /// Any use of cooperative vector host APIs or device intrinsics will result in an
+    /// error.
+    OPTIX_DEVICE_PROPERTY_COOP_VEC_FLAG_NONE     = 0,
+
+    /// Standard cooperative vector features are supported
+    OPTIX_DEVICE_PROPERTY_COOP_VEC_FLAG_STANDARD = 1 << 0,
+} OptixDevicePropertyCoopVecFlags;
+
+typedef enum OptixCoopVecElemType
+{
+    OPTIX_COOP_VEC_ELEM_TYPE_UNKNOWN = 0x2A00,
+    /// 16 bit float
+    OPTIX_COOP_VEC_ELEM_TYPE_FLOAT16 = 0x2A01,
+    /// 32 bit float
+    OPTIX_COOP_VEC_ELEM_TYPE_FLOAT32 = 0x2A03,
+    /// 8 bit unsigned integer
+    OPTIX_COOP_VEC_ELEM_TYPE_UINT8   = 0x2A04,
+    /// 8 bit signed integer
+    OPTIX_COOP_VEC_ELEM_TYPE_INT8    = 0x2A05,
+    /// 32 bit unsigned integer
+    OPTIX_COOP_VEC_ELEM_TYPE_UINT32  = 0x2A08,
+    /// 32 bit signed integer
+    OPTIX_COOP_VEC_ELEM_TYPE_INT32   = 0x2A09,
+    /// FLOAT8 type with 4 bits exponent, 3 bits mantissa. Only supported as the inputInterpretation and matrixElementType.
+    OPTIX_COOP_VEC_ELEM_TYPE_FLOAT8_E4M3 = 0x2A0A,
+    /// FLOAT8 type with 5 bits exponent, 2 bits mantissa. Only supported as the inputInterpretation and matrixElementType.
+    OPTIX_COOP_VEC_ELEM_TYPE_FLOAT8_E5M2 = 0x2A0B,
+} OptixCoopVecElemType;
+
+typedef enum OptixCoopVecMatrixLayout
+{
+    OPTIX_COOP_VEC_MATRIX_LAYOUT_ROW_MAJOR    = 0x2A40,
+    OPTIX_COOP_VEC_MATRIX_LAYOUT_COLUMN_MAJOR = 0x2A41,
+    OPTIX_COOP_VEC_MATRIX_LAYOUT_INFERENCING_OPTIMAL = 0x2A42,
+    OPTIX_COOP_VEC_MATRIX_LAYOUT_TRAINING_OPTIMAL    = 0x2A43,
+} OptixCoopVecMatrixLayout;
+
+/// Each matrix's offset from the base address is expressed with offsetInBytes. This
+/// allows for non-uniform matrices to be tightly packed.
+///
+/// The rowColumnStrideInBytes is ignored if the layout is either
+/// OPTIX_COOP_VEC_MATRIX_LAYOUT_INFERENCING_OPTIMAL or
+/// OPTIX_COOP_VEC_MATRIX_LAYOUT_TRAINING_OPTIMAL
+typedef struct OptixCoopVecMatrixDescription
+{
+    unsigned int             N;
+    unsigned int             K;
+    unsigned int             offsetInBytes;
+    OptixCoopVecElemType     elementType;
+    OptixCoopVecMatrixLayout layout;
+    unsigned int             rowColumnStrideInBytes;
+    unsigned int             sizeInBytes;
+} OptixCoopVecMatrixDescription;
+
+typedef struct OptixNetworkDescription
+{
+    OptixCoopVecMatrixDescription* layers;
+    unsigned int                   numLayers;
+} OptixNetworkDescription;
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+///// Function table
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+
+
+/// Options that can be passed to \c optixQueryFunctionTable()
+typedef enum OptixQueryFunctionTableOptions
+{
+    /// Placeholder (there are no options yet)
+    OPTIX_QUERY_FUNCTION_TABLE_OPTION_DUMMY = 0
+
+} OptixQueryFunctionTableOptions;
+
+/// Type of the function \c optixQueryFunctionTable()
+typedef OptixResult( OptixQueryFunctionTable_t )( int          abiId,
+                                                  unsigned int numOptions,
+                                                  OptixQueryFunctionTableOptions* /*optionKeys*/,
+                                                  const void** /*optionValues*/,
+                                                  void*  functionTable,
+                                                  size_t sizeOfTable );
+
+
+
+/**@}*/  // end group optix_types
+
+#endif  // OPTIX_OPTIX_TYPES_H

From ef4784ecb6a9f2ffbeb9feae5101c4e2f43c6690 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 07:43:59 -0800
Subject: [PATCH 04/14] got rtxpy working with Cuda 13.1 and Optix 9.1

---
 CMakeLists.txt             |   4 +-
 MANIFEST.in                |   2 +-
 crtx/compileOptiX.sh       |   4 +-
 crtx/dllmain.cpp           | 184 ++++++++++++++++++++++++-------------
 kernel.ptx                 | 172 ++++++++++++++++++++++++++++++++++
 rtxpy/rtx.py               |  12 ++-
 rtxpy/tests/test_simple.py |   8 +-
 7 files changed, 306 insertions(+), 80 deletions(-)
 create mode 100644 kernel.ptx

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 448db92..89a35d6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,8 +33,8 @@ set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib64")
 
 target_include_directories(${PROJECT_NAME} PRIVATE
   ${SOURCE_DIR}
-  ${SOURCE_DIR}/optix_7.1/include
-  ${SOURCE_DIR}/optix_7.1
+  ${SOURCE_DIR}/optix_9.1/include
+  ${SOURCE_DIR}/optix_9.1
   ${SOURCE_DIR}/cuew
   ${CUDA_INCLUDE_DIR}
 )
diff --git a/MANIFEST.in b/MANIFEST.in
index 4feb9fb..a5e2944 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,3 @@
 # Extra files required in sdist
 include CMakeLists.txt
-recursive-include crtx *.h *.c *cpp *.cu *.sh
+recursive-include crtx *.h *.c *cpp *.cu *.sh *.ptx
diff --git a/crtx/compileOptiX.sh b/crtx/compileOptiX.sh
index f452986..19c7a7f 100644
--- a/crtx/compileOptiX.sh
+++ b/crtx/compileOptiX.sh
@@ -12,7 +12,7 @@ esac
 
 mkdir -p external/shaders
 
-OPTIX_VERSION=7.1.0
+OPTIX_VERSION=9.1.0
 
 if [ "${machine}" == "Linux" ]
 then
@@ -22,7 +22,7 @@ then
     COMPILER="g++"
 
     INCLUDES=(
-        -I"./optix_7.1"                         # <-- OptiX 7.1 headers vendored in this repo
+        -I"./optix_9.1"                         # <-- OptiX 9.1 headers vendored in this repo
         -I"../include"
         -I"/usr/local/cuda/samples/common/inc"  # For helper_math.h / math_helper.h (CUDA samples)
     )
diff --git a/crtx/dllmain.cpp b/crtx/dllmain.cpp
index 6daad23..b21ed25 100644
--- a/crtx/dllmain.cpp
+++ b/crtx/dllmain.cpp
@@ -1,7 +1,5 @@
-#define CUDA_NO_PROTOTYPES
 #include "cuew/cuew.h"          // this pulls in cuda.h safely (no prototypes)
 
-#define OPTIX_DONT_INCLUDE_CUDA
 #include <optix.h>
 #include <optix_function_table_definition.h>
 #include <optix_stack_size.h>
@@ -13,6 +11,7 @@
 #include <sstream>
 #include <vector>
 #include <fstream>
+#include <string>
 
 struct float3 { float x,y,z; };
 struct int3   { int i[3]; };
@@ -21,6 +20,37 @@ struct int3   { int i[3]; };
 #include "rtx.h"
 #include "internal.h"
 
+
+// Returns the CUDA driver version as an int (e.g. 12040 == 12.4); the cuDriverGetVersion result is ignored, so 0 comes back if v is left untouched.
+static int getCudaDriverVersion()
+{
+    int v = 0;
+    cuDriverGetVersion(&v);
+    return v;
+}
+
+#define OPTIX_CHECK_RTX(call)                                                     \
+    do {                                                                          \
+        OptixResult _res = (call);                                                \
+        if (_res != OPTIX_SUCCESS) {                                              \
+            std::ostringstream _oss;                                              \
+            _oss << "OptiX call failed: " << #call                                \
+                 << " at " << __FILE__ << ":" << __LINE__                         \
+                 << " (" << __FUNCTION__ << ")\n"                                 \
+                 << "  OptixResult: " << optixResultToString(_res)                \
+                 << " (" << static_cast<int>(_res) << ")\n"                       \
+                 << "  CUDA driver version: " << getCudaDriverVersion() << "\n"  \
+                 << "  OPTIX_ABI_VERSION (compiled): " << OPTIX_ABI_VERSION       \
+                 << "\n";                                                         \
+            if (_res == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {                    \
+                _oss << "  Hint: driver/runtime ABI mismatch. Your NVIDIA "       \
+                        "driver is likely too old for this OptiX SDK.\n";         \
+            }                                                                     \
+            setLastErrorRTX(_oss.str().c_str());                                  \
+            return static_cast<int>(_res);                                        \
+        }                                                                         \
+    } while (0)
+
 State global_state;
 
 /// Read the contents of a text file @fileName and return them in a string
@@ -45,15 +75,47 @@ std::string getTextFileContents(const char* fileName) {
     return res;
 }
 
+
+static thread_local std::string g_last_error;
+
+extern "C" void setLastErrorRTX(const char* msg)
+{
+    g_last_error = (msg ? msg : "");
+}
+
+extern "C" const char* getLastErrorRTX()
+{
+    return g_last_error.empty() ? nullptr : g_last_error.c_str();
+}
+
+
+static const char* optixResultToString(OptixResult r)
+{
+    switch (r)
+    {
+        case OPTIX_SUCCESS: return "OPTIX_SUCCESS";
+        case OPTIX_ERROR_INVALID_VALUE: return "OPTIX_ERROR_INVALID_VALUE";
+        case OPTIX_ERROR_HOST_OUT_OF_MEMORY: return "OPTIX_ERROR_HOST_OUT_OF_MEMORY";
+        case OPTIX_ERROR_INVALID_OPERATION: return "OPTIX_ERROR_INVALID_OPERATION";
+        case OPTIX_ERROR_FILE_IO_ERROR: return "OPTIX_ERROR_FILE_IO_ERROR";
+        case OPTIX_ERROR_INVALID_FILE_FORMAT: return "OPTIX_ERROR_INVALID_FILE_FORMAT";
+        case OPTIX_ERROR_DISK_CACHE_INVALID_PATH: return "OPTIX_ERROR_DISK_CACHE_INVALID_PATH";
+        case OPTIX_ERROR_UNSUPPORTED_ABI_VERSION: return "OPTIX_ERROR_UNSUPPORTED_ABI_VERSION";
+        case OPTIX_ERROR_FUNCTION_TABLE_SIZE_MISMATCH: return "OPTIX_ERROR_FUNCTION_TABLE_SIZE_MISMATCH";
+        case OPTIX_ERROR_INVALID_DEVICE_CONTEXT: return "OPTIX_ERROR_INVALID_DEVICE_CONTEXT";
+        default: return "OPTIX_ERROR_UNKNOWN";
+    }
+}
+
 int createModule(State& state)
 {
-    char   log[2048];  // For error reporting from OptiX creation functions
+    char   log[16384];
     size_t logSize = sizeof(log);
 
     OptixModuleCompileOptions module_compile_options = {};
     module_compile_options.maxRegisterCount = OPTIX_COMPILE_DEFAULT_MAX_REGISTER_COUNT;
     module_compile_options.optLevel         = OPTIX_COMPILE_OPTIMIZATION_DEFAULT;
-    module_compile_options.debugLevel       = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+    module_compile_options.debugLevel       = OPTIX_COMPILE_DEBUG_LEVEL_MINIMAL;
 
     state.pipeline_compile_options.usesMotionBlur        = false;
     state.pipeline_compile_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_GAS;
@@ -62,32 +124,28 @@ int createModule(State& state)
     state.pipeline_compile_options.exceptionFlags        = OPTIX_EXCEPTION_FLAG_NONE;
     state.pipeline_compile_options.pipelineLaunchParamsVariableName = "params";
 
-    std::string ptx;
-    try {
-        // Path is relative to the *current working directory* when your .so is loaded.
-        // If you run from repo root, this is likely "crtx/kernel.ptx".
-        // If you run from crtx/, this is "kernel.ptx".
-        //
-        // Start simple:
-        //   bash compileOptiX.sh   (from crtx)
-        //   pytest ...             (from repo root)
-        //
-        // So try "crtx/kernel.ptx" first, then fallback to "kernel.ptx".
-        try {
-            ptx = load_ptx_file("crtx/kernel.ptx");
-        } catch (...) {
-            ptx = load_ptx_file("kernel.ptx");
-        }
-    } catch (const std::exception& e) {
-        fprintf(stderr, "[RTX] Failed to load PTX: %s\n", e.what());
+    // load_ptx_file() can throw (the code this replaces caught std::exception
+    // from it), so keep the exception from escaping through the C API and
+    // record the reason where getLastErrorRTX() can return it.
+    std::string ptx;
+    try {
+        ptx = load_ptx_file("kernel.ptx");
+    } catch (const std::exception& e) {
+        setLastErrorRTX((std::string("Failed to load kernel.ptx: ") + e.what()).c_str());
+        fprintf(stderr, "Failed to load kernel.ptx: %s\n", e.what());
+        return -1;
+    }
+    if (ptx.empty()) {
+        setLastErrorRTX("Failed to load kernel.ptx");
+        fprintf(stderr, "Failed to load kernel.ptx\n");
         return -1;
     }
 
-    const char* input     = ptx.c_str();
+    const char*  input     = ptx.data();
     const size_t inputSize = ptx.size();
 
     OPTIX_CHECK_LOG(
-        optixModuleCreateFromPTX(
+        optixModuleCreate(
             state.context,
             &module_compile_options,
             &state.pipeline_compile_options,
@@ -103,6 +150,7 @@ int createModule(State& state)
 }
 
 
+
 int createProgramGroups(State& state)
 {
     char   log[2048];
@@ -165,60 +213,64 @@ int createPipelines(State& state)
 {
     OptixResult res = OPTIX_SUCCESS;
     char   log[2048];
-    size_t sizeof_log = sizeof( log );
+    size_t sizeof_log = sizeof(log);
 
     const uint32_t    max_trace_depth   = 1;
-    OptixProgramGroup program_groups[3] = {state.raygen, state.miss, state.hit};
+    OptixProgramGroup program_groups[3] = { state.raygen, state.miss, state.hit };
 
     OptixPipelineLinkOptions pipeline_link_options = {};
-    pipeline_link_options.maxTraceDepth            = max_trace_depth;
-    pipeline_link_options.debugLevel               = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+    pipeline_link_options.maxTraceDepth = max_trace_depth;
 
     res = optixPipelineCreate(
-        state.context, 
-        &state.pipeline_compile_options, 
+        state.context,
+        &state.pipeline_compile_options,
         &pipeline_link_options,
-        program_groups, 
-        sizeof(program_groups) / sizeof(program_groups[0]), 
+        program_groups,
+        sizeof(program_groups) / sizeof(program_groups[0]),
         log,
-        &sizeof_log, 
+        &sizeof_log,
         &state.pipeline
     );
     if (res != OPTIX_SUCCESS) {
-        fprintf(stderr, "Failed to create OptiX Pipeline.");
+        fprintf(stderr, "Failed to create OptiX Pipeline.\nLog:\n%s\n", log);
         return -1;
     }
+
     OptixStackSizes stack_sizes = {};
-    for(auto& prog_group : program_groups) {
-        OPTIX_CHECK(optixUtilAccumulateStackSizes(prog_group, &stack_sizes));
+    for (auto& prog_group : program_groups) {
+        OPTIX_CHECK(optixUtilAccumulateStackSizes(prog_group, &stack_sizes, state.pipeline));
     }
 
-    uint32_t direct_callable_stack_size_from_traversal;
-    uint32_t direct_callable_stack_size_from_state;
-    uint32_t continuation_stack_size;
+    uint32_t direct_callable_stack_size_from_traversal = 0;
+    uint32_t direct_callable_stack_size_from_state     = 0;
+    uint32_t continuation_stack_size                   = 0;
+
     OPTIX_CHECK(
         optixUtilComputeStackSizes(
-            &stack_sizes, 
+            &stack_sizes,
             max_trace_depth,
-            0,  // maxCCDepth
-            0,  // maxDCDEpth
+            0, // maxCCDepth
+            0, // maxDCDepth
             &direct_callable_stack_size_from_traversal,
-            &direct_callable_stack_size_from_state, 
+            &direct_callable_stack_size_from_state,
             &continuation_stack_size
         )
     );
+
     OPTIX_CHECK(
         optixPipelineSetStackSize(
             state.pipeline,
             direct_callable_stack_size_from_traversal,
             direct_callable_stack_size_from_state,
             continuation_stack_size,
-            1  // maxTraversableDepth
+            1 // maxTraversableDepth
         )
     );
+
     return 0;
 }
 
+
 int createSBT(State& state)
 {
     CUresult err = CUDA_SUCCESS;
@@ -550,36 +602,36 @@ int initRTX_internal(State& state) {
     err = cuMemAlloc(&state.d_params, sizeof(Params));
     CHECK_CUDA_LOG(err, "Failed to allocate internal state buffer");
 
-    OPTIX_CHECK(optixInit());
+    // Make the optixInit failure readable in Python:
+    OPTIX_CHECK_RTX(optixInit());
+
+    // Always zero-init options:
+    OptixDeviceContextOptions options = {};
+    options.logCallbackFunction = contextLogCallback;
+    options.logCallbackData     = nullptr;
+    options.logCallbackLevel    = 4; // max verbosity
 
-	OptixDeviceContextOptions options;
-	options.logCallbackLevel = 0; //MAX verbosity 4
-	options.logCallbackFunction = contextLogCallback;
-	options.logCallbackData = nullptr;
 #if OPTIX_VERSION >= 70300
 #if DEBUG_PRINTS
-	options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
-#endif //DEBUG_PRINTS
-#endif //OPTIX_VERSION >= 70300
-    OPTIX_CHECK(optixDeviceContextCreate(state.cuda.context, &options, &state.context));
+    options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
+#endif
+#endif
+
+    OPTIX_CHECK_RTX(optixDeviceContextCreate(state.cuda.context, &options, &state.context));
+
     if (!state.context) {
+        setLastErrorRTX("optixDeviceContextCreate returned success but state.context is null");
         return -1;
     }
-    if (createModule(state)) {
-        return -1;
-    }
-    if (createProgramGroups(state)) {
-        return -1;
-    }
-    if (createPipelines(state)) {
-        return -1;
-    }
-    if (createSBT(state)) {
-        return -1;
-    }
+
+    // TODO: each helper below should also call setLastErrorRTX before returning -1
+    if (createModule(state))         { /* setLastErrorRTX inside createModule */ return -1; }
+    if (createProgramGroups(state))  { /* ... */ return -1; }
+    if (createPipelines(state))      { /* ... */ return -1; }
+    if (createSBT(state))            { /* ... */ return -1; }
 
     state.valid = true;
-    return err;
+    return 0;
 }
 
 int initBuffers_internal(State& state, int numRays) {
diff --git a/kernel.ptx b/kernel.ptx
new file mode 100644
index 0000000..97394c7
--- /dev/null
+++ b/kernel.ptx
@@ -0,0 +1,172 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-36836380
+// Cuda compilation tools, release 13.1, V13.1.80
+// Based on NVVM 7.0.1
+//
+
+.version 9.1
+.target sm_86
+.address_size 64
+
+	// .globl	__raygen__main
+.const .align 8 .b8 params[24];
+
+.visible .entry __raygen__main()
+{
+	.reg .f32 	%f<10>;
+	.reg .b32 	%r<83>;
+	.reg .b64 	%rd<10>;
+
+
+	// begin inline asm
+	call (%r1), _optix_get_launch_index_x, ();
+	// end inline asm
+	// begin inline asm
+	call (%r2), _optix_get_launch_index_y, ();
+	// end inline asm
+	// begin inline asm
+	call (%r3), _optix_get_launch_index_z, ();
+	// end inline asm
+	// begin inline asm
+	call (%r4), _optix_get_launch_dimension_x, ();
+	// end inline asm
+	// begin inline asm
+	call (%r5), _optix_get_launch_dimension_y, ();
+	// end inline asm
+	mad.lo.s32 	%r77, %r5, %r3, %r2;
+	mad.lo.s32 	%r78, %r77, %r4, %r1;
+	ld.const.u64 	%rd2, [params+8];
+	cvta.to.global.u64 	%rd3, %rd2;
+	mul.wide.u32 	%rd4, %r78, 32;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f1, [%rd5];
+	ld.global.f32 	%f2, [%rd5+4];
+	ld.global.f32 	%f3, [%rd5+8];
+	ld.global.f32 	%f7, [%rd5+12];
+	ld.global.f32 	%f4, [%rd5+16];
+	ld.global.f32 	%f5, [%rd5+20];
+	ld.global.f32 	%f6, [%rd5+24];
+	ld.global.f32 	%f8, [%rd5+28];
+	ld.const.u64 	%rd1, [params];
+	mov.f32 	%f9, 0f00000000;
+	mov.u32 	%r42, 1;
+	mov.u32 	%r44, 4;
+	mov.u32 	%r76, 0;
+	// begin inline asm
+	call(%r6,%r7,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%r16,%r17,%r18,%r19,%r20,%r21,%r22,%r23,%r24,%r25,%r26,%r27,%r28,%r29,%r30,%r31,%r32,%r33,%r34,%r35,%r36,%r37),_optix_trace_typed_32,(%r76,%rd1,%f1,%f2,%f3,%f4,%f5,%f6,%f7,%f8,%f9,%r42,%r76,%r76,%r42,%r76,%r44,%r79,%r80,%r81,%r82,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76);
+	// end inline asm
+	ld.const.u64 	%rd6, [params+16];
+	cvta.to.global.u64 	%rd7, %rd6;
+	mul.wide.u32 	%rd8, %r78, 16;
+	add.s64 	%rd9, %rd7, %rd8;
+	st.global.u32 	[%rd9], %r6;
+	st.global.u32 	[%rd9+4], %r7;
+	st.global.u32 	[%rd9+8], %r8;
+	st.global.u32 	[%rd9+12], %r9;
+	ret;
+
+}
+	// .globl	__miss__miss
+.visible .entry __miss__miss()
+{
+	.reg .b32 	%r<9>;
+
+
+	mov.u32 	%r8, 0;
+	mov.u32 	%r2, -1082130432;
+	// begin inline asm
+	call _optix_set_payload, (%r8, %r2);
+	// end inline asm
+	mov.u32 	%r3, 1;
+	mov.u32 	%r4, 1065353216;
+	// begin inline asm
+	call _optix_set_payload, (%r3, %r4);
+	// end inline asm
+	mov.u32 	%r5, 2;
+	// begin inline asm
+	call _optix_set_payload, (%r5, %r8);
+	// end inline asm
+	mov.u32 	%r7, 3;
+	// begin inline asm
+	call _optix_set_payload, (%r7, %r8);
+	// end inline asm
+	ret;
+
+}
+	// .globl	__closesthit__chit
+.visible .entry __closesthit__chit()
+{
+	.reg .f32 	%f<37>;
+	.reg .b32 	%r<14>;
+	.reg .b64 	%rd<3>;
+
+
+	// begin inline asm
+	call (%f1), _optix_get_ray_tmax, ();
+	// end inline asm
+	cvt.rzi.ftz.u32.f32 	%r13, %f1;
+	// begin inline asm
+	call (%rd1), _optix_get_gas_traversable_handle, ();
+	// end inline asm
+	// begin inline asm
+	call (%r1), _optix_read_primitive_idx, ();
+	// end inline asm
+	// begin inline asm
+	call (%r2), _optix_read_sbt_gas_idx, ();
+	// end inline asm
+	// begin inline asm
+	call (%f2), _optix_get_ray_time, ();
+	// end inline asm
+	// begin inline asm
+	call (%f3, %f4, %f5, %f6, %f7, %f8, %f9, %f10, %f11), _optix_get_triangle_vertex_data, (%rd1, %r1, %r2, %f2);
+	// end inline asm
+	sub.ftz.f32 	%f13, %f6, %f3;
+	sub.ftz.f32 	%f14, %f7, %f4;
+	sub.ftz.f32 	%f15, %f8, %f5;
+	sub.ftz.f32 	%f16, %f9, %f3;
+	sub.ftz.f32 	%f17, %f10, %f4;
+	sub.ftz.f32 	%f18, %f11, %f5;
+	mul.ftz.f32 	%f19, %f14, %f18;
+	mul.ftz.f32 	%f20, %f15, %f17;
+	sub.ftz.f32 	%f21, %f19, %f20;
+	mul.ftz.f32 	%f22, %f13, %f18;
+	mul.ftz.f32 	%f23, %f15, %f16;
+	sub.ftz.f32 	%f24, %f22, %f23;
+	mul.ftz.f32 	%f25, %f13, %f17;
+	mul.ftz.f32 	%f26, %f14, %f16;
+	sub.ftz.f32 	%f27, %f25, %f26;
+	mul.ftz.f32 	%f28, %f24, %f24;
+	fma.rn.ftz.f32 	%f29, %f21, %f21, %f28;
+	fma.rn.ftz.f32 	%f30, %f27, %f27, %f29;
+	rsqrt.approx.ftz.f32 	%f31, %f30;
+	mul.ftz.f32 	%f32, %f31, %f21;
+	mul.ftz.f32 	%f33, %f24, %f31;
+	neg.ftz.f32 	%f34, %f33;
+	mul.ftz.f32 	%f35, %f31, %f27;
+	cvt.rn.f32.u32 	%f36, %r13;
+	mov.b32 	%r6, %f36;
+	mov.u32 	%r5, 0;
+	// begin inline asm
+	call _optix_set_payload, (%r5, %r6);
+	// end inline asm
+	mov.b32 	%r8, %f32;
+	mov.u32 	%r7, 1;
+	// begin inline asm
+	call _optix_set_payload, (%r7, %r8);
+	// end inline asm
+	mov.b32 	%r10, %f34;
+	mov.u32 	%r9, 2;
+	// begin inline asm
+	call _optix_set_payload, (%r9, %r10);
+	// end inline asm
+	mov.b32 	%r12, %f35;
+	mov.u32 	%r11, 3;
+	// begin inline asm
+	call _optix_set_payload, (%r11, %r12);
+	// end inline asm
+	ret;
+
+}
+
diff --git a/rtxpy/rtx.py b/rtxpy/rtx.py
index 1828977..ce2c232 100644
--- a/rtxpy/rtx.py
+++ b/rtxpy/rtx.py
@@ -35,17 +35,25 @@ def __init__(self):
 
         try:
             c_lib = ctypes.CDLL(dir_path)
+            c_lib = ctypes.CDLL(dir_path, use_errno=True)
+
             c_lib.initRTX.restype = ctypes.c_int
             c_lib.buildRTX.restype = ctypes.c_int
             c_lib.traceRTX.restype = ctypes.c_int
             c_lib.cleanRTX.restype = ctypes.c_int
             c_lib.getHashRTX.restype = ctypes.c_uint64
+            c_lib.getLastErrorRTX.restype = ctypes.c_char_p
         except:
             raise RuntimeError("Failed to load RTX library")
 
-        if c_lib.initRTX():
+        rc = c_lib.initRTX()
+        if rc != 0:
+            msg = c_lib.getLastErrorRTX()
             free_optix_resources()
-            raise RuntimeError("Failed to initialize RTX library")
+            raise RuntimeError(
+                f"Failed to initialize RTX library (rc={rc}): "
+                + (msg.decode("utf-8", "replace") if msg else "no details")
+            )
         else:
             atexit.register(free_optix_resources)
 
diff --git a/rtxpy/tests/test_simple.py b/rtxpy/tests/test_simple.py
index f3021ba..378c054 100644
--- a/rtxpy/tests/test_simple.py
+++ b/rtxpy/tests/test_simple.py
@@ -22,13 +22,7 @@ def test_simple(test_cupy):
     rays = backend.float32([0.33,0.33,100,0,0,0,-1,1000])
     hits =backend.float32([0,0,0,0])
 
-    try:
-        optix = RTX()
-    except RuntimeError as e:
-        # RTX fails to initialize if CUDA not available.
-        if str(e) == "Failed to initialize RTX library":
-            pytest.xfail("CUDA not available")
-        raise
+    optix = RTX()
 
     res = optix.build(0, verts, triangles)
     assert res == 0

From 8b33a5cd33474de4217c4b3d31b71025d6b798f6 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 07:46:55 -0800
Subject: [PATCH 05/14] updated python versions in tests

---
 .github/workflows/test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 55d2c87..de12505 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -17,7 +17,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest]
-        python-version: ["3.7", "3.8", "3.9", "3.10"]
+        python-version: ["3.11", "3.12", "3.13", "3.14"]
 
     steps:
       - name: Checkout source
@@ -26,7 +26,7 @@ jobs:
           fetch-depth: 0
 
       - name: Install Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.python-version }}
 

From e0d4b277429049a7dc81e1d9b60cc974218df220 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 08:11:48 -0800
Subject: [PATCH 06/14] added kernel.ptx to package

---
 MANIFEST.in      |   1 +
 crtx/internal.h  |  54 ++++++++++-----
 pyproject.toml   |   7 ++
 rtxpy/kernel.ptx | 172 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 216 insertions(+), 18 deletions(-)
 create mode 100644 rtxpy/kernel.ptx

diff --git a/MANIFEST.in b/MANIFEST.in
index a5e2944..46bb81c 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,4 @@
 # Extra files required in sdist
 include CMakeLists.txt
 recursive-include crtx *.h *.c *cpp *.cu *.sh *.ptx
+recursive-include rtxpy *.ptx *.so
diff --git a/crtx/internal.h b/crtx/internal.h
index 7ebfe30..57c97b9 100644
--- a/crtx/internal.h
+++ b/crtx/internal.h
@@ -6,6 +6,9 @@
 #include <string>
 #include <fstream>
 #include <sstream>
+#include <string>
+#include <stdexcept>
+#include <filesystem>
 
 // NOTE: dllmain.cpp must include <cuda.h> and <optix.h> BEFORE including this file.
 #ifdef WIN32
@@ -127,27 +130,42 @@ struct State {
     bool valid = false;
 };
 
-// Read a PTX file into a std::string (throws on failure)
-inline std::string load_ptx_file(const char* path)
+
+
+#include <dlfcn.h>
+
+static std::string read_file_to_string(const std::string& path)
+{
+    std::ifstream f(path.c_str(), std::ios::binary);
+    if (!f) return "";
+    std::ostringstream ss;
+    ss << f.rdbuf();
+    return ss.str();
+}
+
+static std::string shared_lib_dir()
 {
-    std::ifstream f(path, std::ios::in | std::ios::binary);
-    if (!f) {
-        throw std::runtime_error(std::string("Could not open PTX file: ") + path);
+    Dl_info info{};
+    if (dladdr((void*)&shared_lib_dir, &info) && info.dli_fname) {
+        std::string full(info.dli_fname);
+        auto pos = full.find_last_of('/');
+        return (pos == std::string::npos) ? "." : full.substr(0, pos);
     }
+    return ".";
+}
 
-    f.seekg(0, std::ios::end);
-    std::streamoff size = f.tellg();
-    f.seekg(0, std::ios::beg);
+// Loads `filename` from this library's directory, then from its data/ subdirectory; returns "" if neither yields content.
+inline std::string load_ptx_file(const std::string& filename) {  // inline: defined in a header, so it must not emit one definition per TU
+    std::string dir = shared_lib_dir();
 
-    if (size <= 0) {
-        throw std::runtime_error(std::string("PTX file is empty: ") + path);
-    }
+    std::string p1 = dir + "/" + filename;
+    std::string s = read_file_to_string(p1);
+    if (!s.empty()) return s;
 
-    std::string ptx;
-    ptx.resize(static_cast<size_t>(size));
-    f.read(&ptx[0], size);
-    if (!f) {
-        throw std::runtime_error(std::string("Failed to read PTX file: ") + path);
-    }
-    return ptx;
+    std::string p2 = dir + "/data/" + filename;
+    s = read_file_to_string(p2);
+    if (!s.empty()) return s;
+    // Neither candidate could be read (or both were empty): signal failure with "".
+    return "";
 }
+
diff --git a/pyproject.toml b/pyproject.toml
index c9cf764..05035b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,4 +5,11 @@ requires = [
     "packaging",
     "setuptools>=42",
     "wheel",
+    "pytest",
 ]
+
+[tool.setuptools]
+include-package-data = true
+
+[tool.setuptools.package-data]
+rtxpy = ["*.ptx", "*.so"]
diff --git a/rtxpy/kernel.ptx b/rtxpy/kernel.ptx
new file mode 100644
index 0000000..97394c7
--- /dev/null
+++ b/rtxpy/kernel.ptx
@@ -0,0 +1,172 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-36836380
+// Cuda compilation tools, release 13.1, V13.1.80
+// Based on NVVM 7.0.1
+//
+
+.version 9.1
+.target sm_86
+.address_size 64
+
+	// .globl	__raygen__main
+.const .align 8 .b8 params[24];
+
+.visible .entry __raygen__main()
+{
+	.reg .f32 	%f<10>;
+	.reg .b32 	%r<83>;
+	.reg .b64 	%rd<10>;
+
+
+	// begin inline asm
+	call (%r1), _optix_get_launch_index_x, ();
+	// end inline asm
+	// begin inline asm
+	call (%r2), _optix_get_launch_index_y, ();
+	// end inline asm
+	// begin inline asm
+	call (%r3), _optix_get_launch_index_z, ();
+	// end inline asm
+	// begin inline asm
+	call (%r4), _optix_get_launch_dimension_x, ();
+	// end inline asm
+	// begin inline asm
+	call (%r5), _optix_get_launch_dimension_y, ();
+	// end inline asm
+	mad.lo.s32 	%r77, %r5, %r3, %r2;
+	mad.lo.s32 	%r78, %r77, %r4, %r1;
+	ld.const.u64 	%rd2, [params+8];
+	cvta.to.global.u64 	%rd3, %rd2;
+	mul.wide.u32 	%rd4, %r78, 32;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f1, [%rd5];
+	ld.global.f32 	%f2, [%rd5+4];
+	ld.global.f32 	%f3, [%rd5+8];
+	ld.global.f32 	%f7, [%rd5+12];
+	ld.global.f32 	%f4, [%rd5+16];
+	ld.global.f32 	%f5, [%rd5+20];
+	ld.global.f32 	%f6, [%rd5+24];
+	ld.global.f32 	%f8, [%rd5+28];
+	ld.const.u64 	%rd1, [params];
+	mov.f32 	%f9, 0f00000000;
+	mov.u32 	%r42, 1;
+	mov.u32 	%r44, 4;
+	mov.u32 	%r76, 0;
+	// begin inline asm
+	call(%r6,%r7,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%r16,%r17,%r18,%r19,%r20,%r21,%r22,%r23,%r24,%r25,%r26,%r27,%r28,%r29,%r30,%r31,%r32,%r33,%r34,%r35,%r36,%r37),_optix_trace_typed_32,(%r76,%rd1,%f1,%f2,%f3,%f4,%f5,%f6,%f7,%f8,%f9,%r42,%r76,%r76,%r42,%r76,%r44,%r79,%r80,%r81,%r82,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76,%r76);
+	// end inline asm
+	ld.const.u64 	%rd6, [params+16];
+	cvta.to.global.u64 	%rd7, %rd6;
+	mul.wide.u32 	%rd8, %r78, 16;
+	add.s64 	%rd9, %rd7, %rd8;
+	st.global.u32 	[%rd9], %r6;
+	st.global.u32 	[%rd9+4], %r7;
+	st.global.u32 	[%rd9+8], %r8;
+	st.global.u32 	[%rd9+12], %r9;
+	ret;
+
+}
+	// .globl	__miss__miss
+.visible .entry __miss__miss()
+{
+	.reg .b32 	%r<9>;
+
+
+	mov.u32 	%r8, 0;
+	mov.u32 	%r2, -1082130432;
+	// begin inline asm
+	call _optix_set_payload, (%r8, %r2);
+	// end inline asm
+	mov.u32 	%r3, 1;
+	mov.u32 	%r4, 1065353216;
+	// begin inline asm
+	call _optix_set_payload, (%r3, %r4);
+	// end inline asm
+	mov.u32 	%r5, 2;
+	// begin inline asm
+	call _optix_set_payload, (%r5, %r8);
+	// end inline asm
+	mov.u32 	%r7, 3;
+	// begin inline asm
+	call _optix_set_payload, (%r7, %r8);
+	// end inline asm
+	ret;
+
+}
+	// .globl	__closesthit__chit
+.visible .entry __closesthit__chit()
+{
+	.reg .f32 	%f<37>;
+	.reg .b32 	%r<14>;
+	.reg .b64 	%rd<3>;
+
+
+	// begin inline asm
+	call (%f1), _optix_get_ray_tmax, ();
+	// end inline asm
+	cvt.rzi.ftz.u32.f32 	%r13, %f1;
+	// begin inline asm
+	call (%rd1), _optix_get_gas_traversable_handle, ();
+	// end inline asm
+	// begin inline asm
+	call (%r1), _optix_read_primitive_idx, ();
+	// end inline asm
+	// begin inline asm
+	call (%r2), _optix_read_sbt_gas_idx, ();
+	// end inline asm
+	// begin inline asm
+	call (%f2), _optix_get_ray_time, ();
+	// end inline asm
+	// begin inline asm
+	call (%f3, %f4, %f5, %f6, %f7, %f8, %f9, %f10, %f11), _optix_get_triangle_vertex_data, (%rd1, %r1, %r2, %f2);
+	// end inline asm
+	sub.ftz.f32 	%f13, %f6, %f3;
+	sub.ftz.f32 	%f14, %f7, %f4;
+	sub.ftz.f32 	%f15, %f8, %f5;
+	sub.ftz.f32 	%f16, %f9, %f3;
+	sub.ftz.f32 	%f17, %f10, %f4;
+	sub.ftz.f32 	%f18, %f11, %f5;
+	mul.ftz.f32 	%f19, %f14, %f18;
+	mul.ftz.f32 	%f20, %f15, %f17;
+	sub.ftz.f32 	%f21, %f19, %f20;
+	mul.ftz.f32 	%f22, %f13, %f18;
+	mul.ftz.f32 	%f23, %f15, %f16;
+	sub.ftz.f32 	%f24, %f22, %f23;
+	mul.ftz.f32 	%f25, %f13, %f17;
+	mul.ftz.f32 	%f26, %f14, %f16;
+	sub.ftz.f32 	%f27, %f25, %f26;
+	mul.ftz.f32 	%f28, %f24, %f24;
+	fma.rn.ftz.f32 	%f29, %f21, %f21, %f28;
+	fma.rn.ftz.f32 	%f30, %f27, %f27, %f29;
+	rsqrt.approx.ftz.f32 	%f31, %f30;
+	mul.ftz.f32 	%f32, %f31, %f21;
+	mul.ftz.f32 	%f33, %f24, %f31;
+	neg.ftz.f32 	%f34, %f33;
+	mul.ftz.f32 	%f35, %f31, %f27;
+	cvt.rn.f32.u32 	%f36, %r13;
+	mov.b32 	%r6, %f36;
+	mov.u32 	%r5, 0;
+	// begin inline asm
+	call _optix_set_payload, (%r5, %r6);
+	// end inline asm
+	mov.b32 	%r8, %f32;
+	mov.u32 	%r7, 1;
+	// begin inline asm
+	call _optix_set_payload, (%r7, %r8);
+	// end inline asm
+	mov.b32 	%r10, %f34;
+	mov.u32 	%r9, 2;
+	// begin inline asm
+	call _optix_set_payload, (%r9, %r10);
+	// end inline asm
+	mov.b32 	%r12, %f35;
+	mov.u32 	%r11, 3;
+	// begin inline asm
+	call _optix_set_payload, (%r11, %r12);
+	// end inline asm
+	ret;
+
+}
+

From b964c14c3ded0c17ce8b1e67431d30ba84540a89 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 08:18:24 -0800
Subject: [PATCH 07/14] added ptx file to package build

---
 pyproject.toml    | 24 ++++++++++++++++--------
 rtxpy/__init__.py |  3 +++
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 05035b8..d4852c6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,15 +1,23 @@
 [build-system]
+requires = ["setuptools>=68", "wheel"]
 build-backend = "setuptools.build_meta"
-requires = [
-    "cmake>=3.10",
-    "packaging",
-    "setuptools>=42",
-    "wheel",
-    "pytest",
-]
+
+[project]
+name = "rtxpy"
+version = "0.0.0"
+description = "Ray tracing using CUDA accessible from Python"
+readme = { file = "README.md", content-type = "text/markdown" }
+requires-python = ">=3.10"
+license = { text = "MIT" }
+authors = [{ name = "makepath" }]
+dependencies = ["numpy>=1.16"]
+
+[project.optional-dependencies]
+tests = ["pytest"]
 
 [tool.setuptools]
 include-package-data = true
 
 [tool.setuptools.package-data]
-rtxpy = ["*.ptx", "*.so"]
+rtxpy = ["*.ptx"]
+
diff --git a/rtxpy/__init__.py b/rtxpy/__init__.py
index 953d502..e4a474b 100644
--- a/rtxpy/__init__.py
+++ b/rtxpy/__init__.py
@@ -1 +1,4 @@
 from .rtx import RTX, has_cupy
+
+
+__version__ = "0.0.0"

From af458501ef423c4dd2b8b120fa151904d0309ea2 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 08:24:17 -0800
Subject: [PATCH 08/14] update test to pull in cuda-toolkit

---
 .github/workflows/test.yml | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index de12505..3078ffa 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,11 +2,9 @@ name: Test
 
 on:
   pull_request:
-    branches:
-      - master
+    branches: [master]
   push:
-    branches:
-      - master
+    branches: [master]
 
 jobs:
   test:
@@ -30,11 +28,25 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
+      # Install CUDA toolkit (nvcc + CUDA_PATH)
+      - name: Install CUDA Toolkit
+        uses: Jimver/cuda-toolkit@v0.2.29
+        with:
+          cuda: "12.4.1"
+
+      - name: Verify CUDA
+        shell: bash
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH"
+          nvcc --version
+
       - name: Install rtxpy
         run: |
+          python -m pip install -U pip
           python -m pip install -ve .[tests]
           python -m pip list
 
       - name: Run tests
         run: |
           python -m pytest -v rtxpy/tests
+

From f3ab0ddd557f326bdbd4065c51ba5e8dea71a4d2 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 08:24:55 -0800
Subject: [PATCH 09/14] update test to pull in cuda 13.1

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 3078ffa..8779d07 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -32,7 +32,7 @@ jobs:
       - name: Install CUDA Toolkit
         uses: Jimver/cuda-toolkit@v0.2.29
         with:
-          cuda: "12.4.1"
+          cuda: "13.1"
 
       - name: Verify CUDA
         shell: bash

From 9f7b498070106f9a53a1dad21b689f3f32a67f64 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 11:05:41 -0800
Subject: [PATCH 10/14] added some additional build instructions based on
 building on WSL2

---
 README.md | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/README.md b/README.md
index ed5d62d..9698bcf 100644
--- a/README.md
+++ b/README.md
@@ -32,3 +32,69 @@ To run tests
 
     pip install -ve .[tests]
     pytest -v rtxpy/tests
+
+
+## Building from source:
+
+### Building kernel.ptx
+```bash
+cd crtx
+bash compileOptiX.sh
+cp kernel.ptx ../rtxpy
+```
+
+### Building `librtxpy.so`
+```bash
+bash clean_build.sh
+cp build/librtxpy.so ./rtxpy
+```
+
+### Building on WSL2:
+To get the build working on WSL, I followed the post below:
+https://forums.developer.nvidia.com/t/problem-running-optix-7-6-in-wsl/239355/8
+
+---------------------
+
+Welcome @chris.schwindt,
+
+I believe we’re not yet packaging OptiX into the WSL2 driver. I believe this is hung up on a redesign of the driver packaging and delivery process, which is why it’s taking such a long time.
+
+I have heard rumors that people have been able to get OptiX to work in WSL2 via manual install. This is unofficial and subject to change, so your mileage may vary, but here are some steps that may work for you:
+
+Running OptiX Applications on WSL 2
+Install WSL 2 and enable CUDA
+Follow the canonical methods for installing WSL, display driver, and CUDA Toolkit within WSL
+
+As mentioned in the docs, do not install a Linux Display driver in WSL, this will break the mapping of libcuda.
+There are CUDA Toolkit downloads specifically for WSL that will not attempt to install a driver, only the toolkit.
+You can also deselect the driver in a normal version of the toolkit.
+Obtain OptiX / RTCore libraries for Linux
+Download and extract libraries from the linux display driver.
+You can run the driver installer in WSL using ./[driver filename].run -x which will unpack the driver but not install it.
+Copy libnvoptix.so.XXX.00, libnvidia-rtcore.so.XXX.00, and libnvidia-ptxjitcompiler.so.XXX.00 into C:/Windows/System32/lxss/lib where XXX is the driver version.
+Rename libnvoptix.so.XXX.00 to libnvoptix.so.1
+Rename libnvidia-ptxjitcompiler.so.XXX.00 to libnvidia-ptxjitcompiler.so.1
+Do not rename libnvidia-rtcore.so.XXX.00
+Be aware that future drivers may need additional libraries that will need to be copied.
+Building an OptiX Application
+You may need to add /usr/local/cuda/bin to your PATH to access NVCC, but do NOT add /usr/local/cuda/lib64 to LD_LIBRARY_PATH as you normally would when installing the CUDA toolkit. libcuda and other libraries are passed through from C:/Windows/System32/lxss/lib where you placed the OptiX and RTCore libs.
+Instead, add /usr/lib/wsl/lib to your LD_LIBRARY_PATH to pick up CUDA, OptiX, etc.
+Running an OptiX Application
+With LD_LIBRARY_PATH set per the previous step, you should be able to run an OptiX executable.
+You may need to rebuild the WSL cache. You can do so by quitting any WSL sessions and running wsl --shutdown from Powershell, then starting a new WSL session. Failing to reset the cache may lead to strange load paths.
+You may verify paths are correct using strace, e.g., strace -o trace ./bin/optixHello
+–
+David.
+
+---------------------
+
+I ended up downloading: https://uk.download.nvidia.com/XFree86/Linux-x86_64/590.44.01/NVIDIA-Linux-x86_64-590.44.01.run
+Nvidia Driver: 591.44
+
+I then extracted the files and followed the instructions above.
+
+
+I then extracted 
+```bash
+bash 
+```

From a764cbaaa77cf8cf2eb974aa8c93e0774931c980 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 11:08:12 -0800
Subject: [PATCH 11/14] trying to get tests running

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 8779d07..303ca18 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -32,7 +32,7 @@ jobs:
       - name: Install CUDA Toolkit
         uses: Jimver/cuda-toolkit@v0.2.29
         with:
-          cuda: "13.1"
+          cuda: "12.4"
 
       - name: Verify CUDA
         shell: bash

From 45c37e8eff79f58acd7b3a3f3da0e1592fc7a323 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 11:10:10 -0800
Subject: [PATCH 12/14] trying to get tests running

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 303ca18..911cdcb 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -32,7 +32,7 @@ jobs:
       - name: Install CUDA Toolkit
         uses: Jimver/cuda-toolkit@v0.2.29
         with:
-          cuda: "12.4"
+          cuda: "12.3"
 
       - name: Verify CUDA
         shell: bash

From 4c79efbc864079073bedca5e0cf3a80e6cd5c36f Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 17 Dec 2025 11:11:48 -0800
Subject: [PATCH 13/14] trying to get tests running

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 911cdcb..14d395d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -32,7 +32,7 @@ jobs:
       - name: Install CUDA Toolkit
         uses: Jimver/cuda-toolkit@v0.2.29
         with:
-          cuda: "12.3"
+          cuda: "12.3.0"
 
       - name: Verify CUDA
         shell: bash

From 464b01a8719b9f7db7ea827bc5aa6e2188b32da5 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Mon, 22 Dec 2025 08:22:18 -0800
Subject: [PATCH 14/14] cleaned up examples and tested with updated data

---
 examples/cuda_utils.py | 19 ++++++------
 examples/hillshade.py  | 22 ++++++-------
 examples/mesh_utils.py | 57 ++++++++++++++++++----------------
 examples/playground.py | 70 ++++++++++++++++++------------------------
 examples/viewshed.py   | 21 +++++++------
 5 files changed, 93 insertions(+), 96 deletions(-)

diff --git a/examples/cuda_utils.py b/examples/cuda_utils.py
index e73690a..5b60057 100644
--- a/examples/cuda_utils.py
+++ b/examples/cuda_utils.py
@@ -1,3 +1,4 @@
+from numba import cuda
 import numba as nb
 import numpy as np
 
@@ -9,38 +10,38 @@ def calc_dims(shape):
     )
     return blockspergrid, threadsperblock
 
-@nb.cuda.jit(device=True)
+@cuda.jit(device=True)
 def add(a, b):
     return float3(a[0]+b[0], a[1]+b[1], a[2]+b[2])
 
-@nb.cuda.jit(device=True)
+@cuda.jit(device=True)
 def diff(a, b):
     return float3(a[0]-b[0], a[1]-b[1], a[2]-b[2])
 
-@nb.cuda.jit(device=True)
+@cuda.jit(device=True)
 def mul(a, b):
     return float3(a[0]*b, a[1]*b, a[2]*b)
 
-@nb.cuda.jit(device=True)
+@cuda.jit(device=True)
 def multColor(a, b):
     return float3(a[0]*b[0], a[1]*b[1], a[2]*b[2])
 
-@nb.cuda.jit(device=True)
+@cuda.jit(device=True)
 def dot(a, b):
     return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]
 
-@nb.cuda.jit(device=True)
+@cuda.jit(device=True)
 def mix(a, b, k):
     return add(mul(a, k), mul(b, 1-k))
 
-@nb.cuda.jit(device=True)
+@cuda.jit(device=True)
 def make_float3(a, offset):
     return float3(a[offset], a[offset+1], a[offset+2])
 
-@nb.cuda.jit(device=True)
+@cuda.jit(device=True)
 def invert(a):
     return float3(-a[0], -a[1], -a[2])
 
-@nb.cuda.jit(device=True)
+@cuda.jit(device=True)
 def float3(a, b, c):
     return (np.float32(a), np.float32(b), np.float32(c))
diff --git a/examples/hillshade.py b/examples/hillshade.py
index 9276e1d..fa433a7 100644
--- a/examples/hillshade.py
+++ b/examples/hillshade.py
@@ -1,5 +1,5 @@
 import numpy as np
-import numba as nb
+from numba import cuda
 
 import cupy
 import xarray as xr
@@ -9,17 +9,17 @@
 
 from scipy.spatial.transform import Rotation as R
 
-from raytrace.cuda_utils import *
-from raytrace import mesh_utils
+from cuda_utils import *
+import mesh_utils
 
-@nb.cuda.jit
+@cuda.jit
 def _generatePrimaryRays(data, x_coords, y_coords, H, W):
     """
     A GPU kernel that given a set of x and y discrete coordinates on a raster terrain
     generates in @data a list of parallel rays that represent camera rays generated from an ortographic camera
     that is looking straight down at the surface from an origin height 10000
     """
-    i, j = nb.cuda.grid(2)
+    i, j = cuda.grid(2)
     if i>=0 and i < H and j>=0 and j < W:
         #data[i,j,0] = j + 1e-6 # x_coords[j] + 1e-6
         #data[i,j,1] = i + 1e-6 # y_coords[i] + 1e-6
@@ -48,7 +48,7 @@ def generatePrimaryRays(rays, x_coords, y_coords, H, W):
     return 0
 
 
-@nb.cuda.jit
+@cuda.jit
 def _generateShadowRays(rays, hits, normals, H, W, sunDir):
     """
     A GPU kernel that given a set rays and their respective intersection points,
@@ -57,7 +57,7 @@ def _generateShadowRays(rays, hits, normals, H, W, sunDir):
     The normals vectors at the point of intersection of the original rays are cached in @normals
     Thus we can later use them to do lambertian shading, after the shadow rays have been traced
     """
-    i, j = nb.cuda.grid(2)
+    i, j = cuda.grid(2)
     if i>=0 and i < H and j>=0 and j < W:
         dist = hits[i,j,0]
         norm = make_float3(hits[i,j], 1)
@@ -88,7 +88,7 @@ def generateShadowRays(rays, hits, normals, H, W, sunDir):
     return 0
 
 
-@nb.cuda.jit
+@cuda.jit
 def _shadeLambert(hits, normals, output, H, W, sunDir, castShadows):
     """
     This kernel does a simple Lambertian shading
@@ -99,7 +99,7 @@ def _shadeLambert(hits, normals, output, H, W, sunDir, castShadows):
     We then use the information for light visibility and normal to apply Lambert's cosine law
     The final result is stored in output which is an RGB array
     """
-    i, j = nb.cuda.grid(2)
+    i, j = cuda.grid(2)
     if i>=0 and i < H and j>=0 and j < W:
         # Normal at the intersection of camera ray (i,j) with the scene
         norm = make_float3(normals[i,j], 0)
@@ -202,10 +202,10 @@ def hillshade_gpu(raster: xr.DataArray,
     # Move the terrain to GPU for testing the GPU path
     if not isinstance(raster.data, cupy.ndarray):
         print("WARNING: raster.data is not a cupy array. Additional overhead will be incurred")
-    H,W = raster.shape
+    H,W = raster.data.squeeze().shape
     optix = RTX()
 
-    datahash = np.uint64(hash(str(raster.data.get())))
+    datahash = np.uint64(hash(str(raster.data.get())) % (1 << 64))
     optixhash = np.uint64(optix.getHash())
     if (optixhash != datahash):
         numTris = (H - 1) * (W - 1) * 2
diff --git a/examples/mesh_utils.py b/examples/mesh_utils.py
index e847558..599f9c9 100644
--- a/examples/mesh_utils.py
+++ b/examples/mesh_utils.py
@@ -1,32 +1,34 @@
 
 import numba as nb
+from numba import cuda
 import numpy as np
 import cupy
-from raytrace.cuda_utils import calc_dims
 
-@nb.cuda.jit
+
+@cuda.jit
 def _triangulateTerrain(verts, triangles, data, H, W, scale, stride):
-    globalId = stride + nb.cuda.grid(1)
+    globalId = stride + cuda.grid(1)
     if globalId < W*H:
         h = globalId // W
         w = globalId % W
         meshMapIndex = h * W + w
 
-        val = data[h,w]
+        val = data[h, w]
 
         offset = 3*meshMapIndex
-        verts[offset]   = w # x_coords[w] # w
-        verts[offset+1] = h # y_coords[h] # h
+        verts[offset] = w  # x_coords[w] # w
+        verts[offset+1] = h  # y_coords[h] # h
         verts[offset+2] = val * scale
 
         if w != W - 1 and h != H - 1:
             offset = 6*(h * (W-1) + w)
-            triangles[offset+0]= np.int32(meshMapIndex + W)
-            triangles[offset+1]= np.int32(meshMapIndex + W + 1)
-            triangles[offset+2]= np.int32(meshMapIndex)
-            triangles[offset+3]= np.int32(meshMapIndex + W + 1)
-            triangles[offset+4]= np.int32(meshMapIndex + 1)
-            triangles[offset+5]= np.int32(meshMapIndex)
+            triangles[offset+0] = np.int32(meshMapIndex + W)
+            triangles[offset+1] = np.int32(meshMapIndex + W + 1)
+            triangles[offset+2] = np.int32(meshMapIndex)
+            triangles[offset+3] = np.int32(meshMapIndex + W + 1)
+            triangles[offset+4] = np.int32(meshMapIndex + 1)
+            triangles[offset+5] = np.int32(meshMapIndex)
+
 
 @nb.njit(parallel=True)
 def triangulateCPU(verts, triangles, data, H, W, scale):
@@ -34,25 +36,25 @@ def triangulateCPU(verts, triangles, data, H, W, scale):
         for w in range(W):
             meshMapIndex = h * W + w
 
-            val = data[h,w]
+            val = data[h, w]
 
             offset = 3*meshMapIndex
-            verts[offset]   = w # x_coords[w] # w
-            verts[offset+1] = h # y_coords[h] # h
+            verts[offset] = w  # x_coords[w] # w
+            verts[offset+1] = h  # y_coords[h] # h
             verts[offset+2] = val * scale
 
             if w != W - 1 and h != H - 1:
                 offset = 6*(h * (W-1) + w)
-                triangles[offset+0]= np.int32(meshMapIndex + W)
-                triangles[offset+1]= np.int32(meshMapIndex + W + 1)
-                triangles[offset+2]= np.int32(meshMapIndex)
-                triangles[offset+3]= np.int32(meshMapIndex + W+1)
-                triangles[offset+4]= np.int32(meshMapIndex + 1)
-                triangles[offset+5]= np.int32(meshMapIndex)
+                triangles[offset+0] = np.int32(meshMapIndex + W)
+                triangles[offset+1] = np.int32(meshMapIndex + W + 1)
+                triangles[offset+2] = np.int32(meshMapIndex)
+                triangles[offset+3] = np.int32(meshMapIndex + W+1)
+                triangles[offset+4] = np.int32(meshMapIndex + 1)
+                triangles[offset+5] = np.int32(meshMapIndex)
 
 
 def triangulateTerrain(verts, triangles, terrain, scale=1):
-    H,W = terrain.shape
+    H, W = terrain.shape
     if isinstance(terrain.data, np.ndarray):
         triangulateCPU(verts, triangles, terrain.data, H, W, scale)
     if isinstance(terrain.data, cupy.ndarray):
@@ -61,9 +63,11 @@ def triangulateTerrain(verts, triangles, terrain, scale=1):
         griddim = (jobSize + blockdim - 1) // 1024
         d = 100
         offset = 0
-        while(jobSize>0):
+        while (jobSize > 0):
             batch = min(d, griddim)
-            _triangulateTerrain[batch, blockdim](verts, triangles, terrain.data, H, W, scale, offset)
+            _triangulateTerrain[batch, blockdim](verts, triangles,
+                                                 terrain.data, H, W,
+                                                 scale, offset)
             offset += batch*blockdim
             jobSize -= batch*blockdim
     return 0
@@ -95,6 +99,7 @@ def fillContents(content, verts, triangles, numTris):
         content[offset:offset+48] = v.view(np.uint8)
         content[offset+48:offset+50] = pad
 
+
 def write(name, verts, triangles):
     """
     Save a triangulated raster to a standard STL file.
@@ -116,7 +121,7 @@ def write(name, verts, triangles):
     nf = np.empty(1, np.uint32)
     numTris = triangles.shape[0] // 3
     nf[0] = numTris
-    f=open(name,'wb')
+    f = open(name, 'wb')
     f.write(header)
     f.write(nf)
 
@@ -126,4 +131,4 @@ def write(name, verts, triangles):
     content = np.empty(numTris*(50), np.uint8)
     fillContents(content, vb, ib, numTris)
     f.write(content)
-    f.close()
\ No newline at end of file
+    f.close()
diff --git a/examples/playground.py b/examples/playground.py
index 271ad16..ca99b3e 100644
--- a/examples/playground.py
+++ b/examples/playground.py
@@ -1,54 +1,36 @@
-from datashader.transfer_functions import shade
-from datashader.transfer_functions import stack
-from datashader.transfer_functions import dynspread
-
 import matplotlib.pyplot as plt
 
 import numpy as np
 import cupy
 
-import os, sys
-#Add the parent folder to python search space
-sys.path.append(os.path.abspath(os.getcwd())+"/..")
-sys.path.append(os.path.abspath(os.getcwd()))
-
-from raytrace import hillshade_gpu
-from raytrace import viewshed_gpu
+from hillshade import hillshade_gpu
+from viewshed import viewshed_gpu
 
 from rtxpy import RTX
 
-import xarray as xr;
-
-from raytrace.hillshade import getSunDir
-#getSunDir(angle_altitude, azimuth)
+import xarray as xr
 
-#terrain = xr.open_dataarray("olympic_national_park.nc")
-#terrain.data = terrain.data * 0.025 #scale down
-terrain = xr.open_dataarray("crater_lake_national_park.nc")
 
+terrain = xr.open_dataarray("crater_lake_national_park.tif").squeeze()
 terrain = terrain[::2, ::2]
-
-terrain.data = terrain.data * 0.2 #scale down
-
+terrain.data = terrain.data * 0.2  # scale down
 azimuth = 225
 
 
-def debug(x,y):
-    rays = cupy.float32([x,y,10000,0,0,0,-1,np.inf])
-    hits = cupy.float32([0,0,0,0])
+def debug(x, y):
+    rays = cupy.float32([x, y, 10000, 0, 0, 0, -1, np.inf])
+    hits = cupy.float32([0, 0, 0, 0])
     optix = RTX()
     res = optix.trace(rays, hits, 1)
-    norm = cupy.asnumpy(hits[1:])
-    sun = getSunDir(25, azimuth)
-
     return res
 
+
 def onclick(event):
     """
-    Simple click handler for live adjustment of the viewshed origin when running matplotlib live session
+    Click handler for live adjustment of the viewshed origin
     """
     ix, iy = event.xdata, event.ydata
-    print ('x = {}, y = {}'.format(ix, iy))
+    print('x = {}, y = {}'.format(ix, iy))
 
     nix = ix/terrain.shape[1]
     niy = iy/terrain.shape[0]
@@ -62,18 +44,19 @@ def onclick(event):
     vsw = x_coords.min() + nix*rangex
     vsh = y_coords.max() - niy*rangey
 
-    #debug(ix, iy)
+    # debug(ix, iy)
     return None
 
+
 def test():
     runs = 360
-    H,W = terrain.data.shape
+    H, W = terrain.data.shape
     if isinstance(terrain.data, np.ndarray):
         terrain.data = cupy.array(terrain.data)
 
     fig = plt.figure()
-    cid = fig.canvas.mpl_connect('button_press_event', onclick)
-    colors = np.uint8(np.zeros((H,W,3)))
+    _ = fig.canvas.mpl_connect('button_press_event', onclick)
+    colors = np.uint8(np.zeros((H, W, 3)))
     imgplot = plt.imshow(colors)
 
     x_coords = terrain.indexes.get('x').values
@@ -94,22 +77,28 @@ def test():
             azimuth -= 360
 
         beforeRT = time.time()
-        hs = hillshade_gpu(terrain, shadows=True, azimuth=azimuth, angle_altitude=25)
-        vs = viewshed_gpu(terrain, x=vsw, y=vsh, observer_elev=0.01)
+        hs = hillshade_gpu(terrain,
+                           shadows=True,
+                           azimuth=azimuth,
+                           angle_altitude=25)
+        vs = viewshed_gpu(terrain,
+                          x=vsw,
+                          y=vsh,
+                          observer_elev=0.01)
         afterRT = time.time()
         print("  RT took ", afterRT-beforeRT)
 
         img = np.uint8(hs.data.get()*225)
 
-        withViewshed = True#False
+        withViewshed = True
         if withViewshed:
             visBuf = np.uint8(vs.data.get() > 0) * 255
-            view = np.maximum(visBuf,img)
+            view = np.maximum(visBuf, img)
         else:
             view = img
-        colors[:,:,0] = view
-        colors[:,:,1] = img
-        colors[:,:,2] = img
+        colors[:, :, 0] = view
+        colors[:, :, 1] = img
+        colors[:, :, 2] = img
         imgplot.set_data(colors)
         plt.pause(0.001)
 
@@ -117,6 +106,7 @@ def test():
 
     return
 
+
 res = test()
 
 print("Done")
diff --git a/examples/viewshed.py b/examples/viewshed.py
index 502c7d9..986c1b9 100644
--- a/examples/viewshed.py
+++ b/examples/viewshed.py
@@ -1,3 +1,4 @@
+from numba import cuda
 import numpy as np
 import numba as nb
 
@@ -10,8 +11,8 @@
 
 from scipy.spatial.transform import Rotation as R
 
-from raytrace.cuda_utils import *
-from raytrace import mesh_utils
+from cuda_utils import *
+import mesh_utils
 
 # view options default values
 OBS_ELEV = 0
@@ -20,9 +21,9 @@
 # if a cell is invisible, its value is set to -1
 INVISIBLE = -1
 
-@nb.cuda.jit
+@cuda.jit
 def _generatePrimaryRays(data, x_coords, y_coords, H, W):
-    i, j = nb.cuda.grid(2)
+    i, j = cuda.grid(2)
     if i>=0 and i < H and j>=0 and j < W:
 #        if (j == W-1):
 #            data[i,j,0] = y_coords[j] - 1e-6
@@ -58,9 +59,9 @@ def generatePrimaryRays(rays, x_coords, y_coords, H, W):
     _generatePrimaryRays[griddim, blockdim](rays, d_x_coords, d_y_coords, H, W)
     return 0
 
-@nb.cuda.jit
+@cuda.jit
 def _generateViewshedRays(camRays, hits, vsrays, visibility_grid, H, W, vp):
-    i, j = nb.cuda.grid(2)
+    i, j = cuda.grid(2)
     if i>=0 and i < H and j>=0 and j < W:
         elevationOffset = vp[2]
         targetElevation = vp[3]
@@ -111,9 +112,9 @@ def generateViewshedRays(rays, hits, vsrays, visibility_grid, H, W, vp):
     _generateViewshedRays[griddim, blockdim](rays, hits, vsrays, visibility_grid, H, W, vp)
     return 0
 
-@nb.cuda.jit
+@cuda.jit
 def _calcViewshed(hits, visibility_grid, H, W):
-    i, j = nb.cuda.grid(2)
+    i, j = cuda.grid(2)
     if i>=0 and i < H and j>=0 and j < W:
         dist = hits[i,j,0]
         # We traced the viewshed rays and now hits contains the intersection data
@@ -198,7 +199,7 @@ def viewshed_gpu(raster: xr.DataArray,
     H,W = raster.shape
     optix = RTX()
 
-    datahash = np.uint64(hash(str(raster.data.get())))
+    datahash = np.uint64(hash(str(raster.data.get())) % (1 << 64))
     optixhash = np.uint64(optix.getHash())
     if (optixhash != datahash):
         numTris = (H - 1) * (W - 1) * 2
@@ -218,4 +219,4 @@ def viewshed_gpu(raster: xr.DataArray,
         cupy.get_default_memory_pool().free_all_blocks()
 
     view = viewshed_rt(raster, optix, x, y, observer_elev, target_elev)
-    return view
\ No newline at end of file
+    return view