Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@ cmake_minimum_required(VERSION 3.16)

cmake_policy(SET CMP0048 NEW)
set(CMAKE_BUILD_TYPE_INIT "Release")
project(cuBQL VERSION 1.1.0 LANGUAGES C CXX)
project(cuBQL VERSION 1.2.0 LANGUAGES C CXX)

if (CUBQL_OMP)
set(CUBQL_DISABLE_CUDA ON)
endif()
if (CUBQL_DISABLE_CUDA)
message("#cuBQL: CUDA _DISABLED_ by user request")
set(CUBQL_HAVE_CUDA OFF)
Expand Down Expand Up @@ -136,3 +139,5 @@ add_subdirectory(cuBQL)
if (NOT CUBQL_IS_SUBPROJECT)
add_subdirectory(samples)
endif()

#add_subdirectory(testing)
6 changes: 3 additions & 3 deletions cuBQL/builder/cuda/sm_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -515,10 +515,10 @@ namespace cuBQL {
while (true) {
CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes,
sizeof(numNodes),cudaMemcpyDeviceToHost,s));
CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
if (numNodes == numDone)
break;
CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
#if CUBQL_PROFILE
t_nodePass[pass].sync_start();
#endif
Expand All @@ -529,7 +529,7 @@ namespace cuBQL {
#if CUBQL_PROFILE
t_nodePass[pass].sync_stop();
t_primPass[pass].sync_start();
#endif
#endif
numDone = numNodes;

// #if 1
Expand Down
19 changes: 19 additions & 0 deletions cuBQL/builder/omp.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

// Forward declarations for the OpenMP back end; the two headers included
// right after this namespace block are pulled in once these names exist.
namespace cuBQL {
namespace omp {
// Forward declaration only - this header does not define Context.
struct Context;

// Declaration of refit(): takes a BinaryBVH<T,D> by reference, a pointer
// to box_t<T,D> and a Context pointer. No definition appears in this header.
// NOTE(review): no #include precedes this declaration in this header, so
// BinaryBVH and box_t must already be declared by whoever includes it -
// confirm that every includer guarantees this.
template<typename T, int D>
void refit(BinaryBVH<T,D> &bvh,
const box_t<T,D> *boxes,
Context *ctx);
}
}

#include "cuBQL/builder/omp/refit.h"
#include "cuBQL/builder/omp/spatialMedian.h"

168 changes: 168 additions & 0 deletions cuBQL/builder/omp/AtomicBox.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
// CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "cuBQL/builder/omp/common.h"


namespace cuBQL {
namespace omp {

/*! thin wrapper that derives from `box_t` and adds a reset helper; the
    atomic_grow() helpers in this header take their target as an
    AtomicBox<box_t> */
template<typename box_t>
struct AtomicBox : public box_t {

  /*! overwrite the box_t base sub-object of this object with a
      default-constructed box_t */
  inline void set_empty()
  {
    box_t &self = *this;
    self = box_t();
  }
};

/*! generic scalar atomic min/max, declared for any T. These two lines
    are declarations only; the definitions that follow in this part of
    the file are the non-template `float` overloads of the same names. */
template<typename T>
inline void atomic_min(T *ptr, T v);
template<typename T>
inline void atomic_max(T *ptr, T v);

/*! atomically lowers `*ptr` to `value` if `value` is smaller than what
    is stored there; otherwise leaves `*ptr` alone.

    Two implementations are selected at compile time:
    - with __NVCOMPILER defined: a loop of atomic exchanges
      (`omp atomic capture`). Each exchange stores our candidate and
      returns what was there before; if that previous content was even
      smaller we have just clobbered a better minimum, so we go around
      again to put it back.
    - otherwise: one `omp atomic compare` conditional store.

    This is certainly not the fastest way of doing an atomic min on a
    modern GPU, but it only needs OpenMP atomics.

    All comparisons are written as `!(a < b)` so that a NaN - in
    `value` or in `*ptr` - ends the call. With the `>=`/`<=` form a NaN
    got swapped back and forth between `*ptr` and `value` forever. A NaN
    `value` is never stored, same as in the `omp atomic compare` path. */
inline void atomic_min(float *ptr, float value)
{
#ifdef __NVCOMPILER
# if 1
  float &mem = *ptr;
  // plain (non-atomic) early out: nothing to do unless value is
  // strictly smaller; also taken if either side is NaN
  if (!(value < mem)) return;
  while (1) {
    float wasBefore;
#pragma omp atomic capture
    { wasBefore = mem; mem = value; }
    // done unless we overwrote something strictly smaller than what we
    // stored (a NaN previous content also ends the loop)
    if (!(wasBefore < value)) break;
    // we clobbered a smaller value: retry, writing that one back
    value = wasBefore;
  }
# else
  // disabled alternative: compare-and-swap on the float's bit pattern
  float current = *(volatile float *)ptr;
  while (current > value) {
    bool wasChanged
      = ((std::atomic<int>*)ptr)
      ->compare_exchange_weak((int&)current,(int&)value);
    if (wasChanged) break;
  }
# endif
#else
  float &x = *ptr;
#pragma omp atomic compare
  if (x > value) { x = value; }
  // float t;
  // #pragma omp atomic capture
  // { t = *ptr; *ptr = std::min(t,value); }
#endif
}

/*! atomically raises `*ptr` to `value` if `value` is larger than what
    is stored there; otherwise leaves `*ptr` alone.

    Two implementations are selected at compile time:
    - with __NVCOMPILER defined: a loop of atomic exchanges
      (`omp atomic capture`). Each exchange stores our candidate and
      returns what was there before; if that previous content was even
      larger we have just clobbered a better maximum, so we go around
      again to put it back.
    - otherwise: one `omp atomic compare` conditional store.

    This is certainly not the fastest way of doing an atomic max on a
    modern GPU, but it only needs OpenMP atomics.

    All comparisons are written as `!(a > b)` so that a NaN - in
    `value` or in `*ptr` - ends the call. With the `>=`/`<=` form a NaN
    got swapped back and forth between `*ptr` and `value` forever. A NaN
    `value` is never stored, same as in the `omp atomic compare` path. */
inline void atomic_max(float *ptr, float value)
{
#ifdef __NVCOMPILER
# if 1
  float &mem = *ptr;
  // plain (non-atomic) early out: nothing to do unless value is
  // strictly larger; also taken if either side is NaN
  if (!(value > mem)) return;
  while (1) {
    float wasBefore;
#pragma omp atomic capture
    { wasBefore = mem; mem = value; }
    // done unless we overwrote something strictly larger than what we
    // stored (a NaN previous content also ends the loop)
    if (!(wasBefore > value)) break;
    // we clobbered a larger value: retry, writing that one back
    value = wasBefore;
  }
# else
  // disabled alternative: compare-and-swap on the float's bit pattern
  float current = *(volatile float *)ptr;
  while (current < value) {
    bool wasChanged
      = ((std::atomic<int>*)ptr)
      ->compare_exchange_weak((int&)current,(int&)value);
    if (wasChanged) break;
  }
# endif
#else
  float &x = *ptr;
#pragma omp atomic compare
  if (x < value) { x = value; }
  // float t;
  // #pragma omp atomic capture
  // { t = *ptr; *ptr = std::max(t,value); }
#endif
}

/*! vector atomic min/max for a vec_t<T,D> of any dimension D. These are
    declarations only; the definitions that follow in this file are
    separate function-template overloads that fix D to 2, 3 and 4. */
template<typename T, int D>
inline void v_atomic_min(vec_t<T,D> *ptr, vec_t<T,D> v);
template<typename T, int D>
inline void v_atomic_max(vec_t<T,D> *ptr, vec_t<T,D> v);


/*! 2D variant: applies atomic_min() to the x and then the y component
    of `*ptr`, using the matching component of `v`. Each component is
    its own atomic_min() call. */
template<typename T>
inline void v_atomic_min(vec_t<T,2> *ptr, vec_t<T,2> v)
{
  vec_t<T,2> &dst = *ptr;
  atomic_min(&dst.x,v.x);
  atomic_min(&dst.y,v.y);
}

/*! 3D variant: applies atomic_min() to the x, y and z components of
    `*ptr` in that order, using the matching component of `v`. Each
    component is its own atomic_min() call. */
template<typename T>
inline void v_atomic_min(vec_t<T,3> *ptr, vec_t<T,3> v)
{
  vec_t<T,3> &dst = *ptr;
  atomic_min(&dst.x,v.x);
  atomic_min(&dst.y,v.y);
  atomic_min(&dst.z,v.z);
}

/*! 4D variant: applies atomic_min() to the x, y, z and w components of
    `*ptr` in that order, using the matching component of `v`. Each
    component is its own atomic_min() call. */
template<typename T>
inline void v_atomic_min(vec_t<T,4> *ptr, vec_t<T,4> v)
{
  vec_t<T,4> &dst = *ptr;
  atomic_min(&dst.x,v.x);
  atomic_min(&dst.y,v.y);
  atomic_min(&dst.z,v.z);
  atomic_min(&dst.w,v.w);
}

/*! 2D variant: applies atomic_max() to the x and then the y component
    of `*ptr`, using the matching component of `v`. Each component is
    its own atomic_max() call. */
template<typename T>
inline void v_atomic_max(vec_t<T,2> *ptr, vec_t<T,2> v)
{
  vec_t<T,2> &dst = *ptr;
  atomic_max(&dst.x,v.x);
  atomic_max(&dst.y,v.y);
}

/*! 3D variant: applies atomic_max() to the x, y and z components of
    `*ptr` in that order, using the matching component of `v`. Each
    component is its own atomic_max() call. */
template<typename T>
inline void v_atomic_max(vec_t<T,3> *ptr, vec_t<T,3> v)
{
  vec_t<T,3> &dst = *ptr;
  atomic_max(&dst.x,v.x);
  atomic_max(&dst.y,v.y);
  atomic_max(&dst.z,v.z);
}

/*! 4D variant: applies atomic_max() to the x, y, z and w components of
    `*ptr` in that order, using the matching component of `v`. Each
    component is its own atomic_max() call. */
template<typename T>
inline void v_atomic_max(vec_t<T,4> *ptr, vec_t<T,4> v)
{
  vec_t<T,4> &dst = *ptr;
  atomic_max(&dst.x,v.x);
  atomic_max(&dst.y,v.y);
  atomic_max(&dst.z,v.z);
  atomic_max(&dst.w,v.w);
}

/*! grow `ab` by a single point: v_atomic_min() of `P` into ab.lower,
    followed by v_atomic_max() of `P` into ab.upper */
template<typename box_t>
inline void atomic_grow(AtomicBox<box_t> &ab, typename box_t::vec_t P)
{
  box_t &box = ab;
  v_atomic_min(&box.lower,P);
  v_atomic_max(&box.upper,P);
}

/*! grow `ab` by another box: v_atomic_min() of B.lower into ab.lower,
    followed by v_atomic_max() of B.upper into ab.upper */
template<typename box_t>
inline void atomic_grow(AtomicBox<box_t> &ab, box_t B)
{
  box_t &box = ab;
  v_atomic_min(&box.lower,B.lower);
  v_atomic_max(&box.upper,B.upper);
}

}
}
162 changes: 162 additions & 0 deletions cuBQL/builder/omp/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
// CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "cuBQL/bvh.h"
#include <omp.h>
#include <atomic>

namespace cuBQL {
namespace omp {

/*! bundles the OpenMP device number used for target allocations and
    copies (`gpuID`) with the device number of the host (`hostID`), and
    offers alloc/upload/download helpers built on the omp_target_*
    routines. All member functions are defined further down in this
    header. */
struct Context {
/*! stores `gpuID`, queries omp_get_initial_device() for `hostID`,
    asserts that gpuID < omp_get_num_devices(), and prints one
    "#cuBQL:omp:Context(...)" line */
Context(int gpuID);

/*! returns omp_target_alloc(numBytes,gpuID) */
void *alloc(size_t numBytes);

/*! sets `d_data` to an omp_target_alloc() of Nelements*sizeof(T)
    bytes on device `gpuID` */
template<typename T>
void alloc(T *&d_data, size_t Nelements);

/*! allocates `Nelements` T's on device `gpuID`, stores the pointer in
    `d_data`, then upload()s `h_data` into it */
template<typename T>
void alloc_and_upload(T *&d_data, const T *h_data, size_t Nelements);

/*! omp_target_memcpy() of Nelements*sizeof(T) bytes from `h_data`
    (device hostID) to `d_data` (device gpuID); asserts `d_data` is
    non-null */
template<typename T>
void upload(T *d_data, const T *h_data, size_t Nelements);

/*! same as the pointer variant, with pointer and element count taken
    from `h_vector` */
template<typename T>
void alloc_and_upload(T *&d_data, const std::vector<T> &h_vector);

/*! returns a std::vector<T> of `N` elements filled by an
    omp_target_memcpy() from `d_data` (device gpuID) */
template<typename T>
std::vector<T> download_vector(const T *d_data, size_t N);

/*! omp_target_memcpy() of a single T from `d_value` (device gpuID)
    into `h_value` */
template<typename T>
void download(T &h_value, T *d_value);

/*! omp_target_free() of the given pointer on device `gpuID` */
void free(void *);

// device number passed to the omp_target_* calls as "the device"
int gpuID;
// value of omp_get_initial_device() at construction time
int hostID;
};

/*! plain aggregate carrying one work-item index, plus a read accessor
    for it */
struct Kernel {
  /*! returns the stored `_workIdx` */
  int workIdx() const
  {
    return _workIdx;
  }

  int _workIdx;
};

/*! atomic fetch-and-add: adds `inc` to `*ptr` and returns the value
    `*ptr` held before the addition.

    Under __NVCOMPILER the memory is accessed through a
    std::atomic<int>::fetch_add(); otherwise an `omp atomic capture`
    block does the read and the update. */
inline uint32_t atomicAdd(uint32_t *ptr, uint32_t inc)
{
#ifdef __NVCOMPILER
  std::atomic<int> *const asAtomic = (std::atomic<int> *)ptr;
  return (uint32_t)asAtomic->fetch_add((int)inc);
#else
  uint32_t previous;
#pragma omp atomic capture
  { previous = *ptr; *ptr += inc; }
  return previous;
#endif
}


// ##################################################################
// IMPLEMENTATION SECTION
// ##################################################################
/*! creates a context for OpenMP device `gpuID`; the host side is
    whatever omp_get_initial_device() reports. Asserts that `gpuID` is
    below omp_get_num_devices() and prints one line with the chosen
    device numbers.

    Defined `inline` because this definition lives in a header: without
    it, every translation unit that includes the header emits its own
    external definition and linking two of them fails with a
    multiple-definition error. */
inline Context::Context(int gpuID)
  : gpuID(gpuID),
    hostID(omp_get_initial_device())
{
  assert(gpuID < omp_get_num_devices());
  printf("#cuBQL:omp:Context(gpu=%i/%i,host=%i)\n",
         gpuID,omp_get_num_devices(),hostID);
}

/*! returns omp_target_alloc(numBytes,gpuID), i.e. `numBytes` bytes on
    this context's device (whatever that call returns, including a null
    pointer, is passed through unchanged).

    Defined `inline` because this definition lives in a header: without
    it, including the header from more than one translation unit gives
    a multiple-definition link error. */
inline void *Context::alloc(size_t numBytes)
{ return omp_target_alloc(numBytes,gpuID); }

/*! copies `N` elements of type T from `h_data` (on device hostID) to
    `d_data` (on device gpuID) with a single omp_target_memcpy().
    Asserts that `d_data` is non-null; the return value of the copy is
    not looked at. */
template<typename T> inline
void Context::upload(T *d_data,
                     const T *h_data,
                     size_t N)
{
  assert(d_data);
  const size_t numBytes = N*sizeof(T);
  const size_t dstOffset = 0;
  const size_t srcOffset = 0;
  omp_target_memcpy(d_data,h_data,numBytes,
                    dstOffset,srcOffset,
                    /* dst device */gpuID,
                    /* src device */hostID);
}

/*! allocates room for `N` elements of type T on device `gpuID`, stores
    the resulting pointer in `d_data`, and upload()s `h_data` into it.
    Prints the element count, the device number and the resulting
    pointer along the way.

    \param d_data  receives the device pointer (overwritten)
    \param h_data  host array with `N` elements to copy
    \param N       number of elements (not bytes) */
template<typename T> inline
void Context::alloc_and_upload(T *&d_data,
                               const T *h_data,
                               size_t N)
{
  // N is a size_t: that needs %zu, not %li (which expects a long and
  // only matches by accident where long and size_t have the same size)
  printf("target_alloc N %zu gpu %i\n",N,gpuID);
  d_data = (T *)omp_target_alloc(N*sizeof(T),gpuID);
  // %p is specified for void pointers, so convert explicitly
  printf("ptr %p\n",(void *)d_data);
  upload(d_data,h_data,N);
}

/*! convenience overload for std::vector: forwards the vector's data()
    pointer and size() to the pointer-based alloc_and_upload() */
template<typename T> inline
void Context::alloc_and_upload(T *&d_data,
                               const std::vector<T> &h_vector)
{
  const T *const h_data = h_vector.data();
  const size_t count = h_vector.size();
  alloc_and_upload(d_data,h_data,count);
}

/*! downloads `N` elements of type T from `d_data` (on device gpuID)
    into a freshly created std::vector<T> of size `N` and returns that
    vector. The copy is a single omp_target_memcpy() whose return value
    is not looked at. */
template<typename T>
std::vector<T> Context::download_vector(const T *d_data, size_t N)
{
// NOTE(review): PRINT is not defined in the visible source - presumably
// a debug-print macro, which would make these four lines leftover debug
// output emitted on every download; confirm and consider removing.
PRINT(N);
PRINT(d_data);

// host-side destination, sized up front so data() has room for N elements
std::vector<T> out(N);
PRINT(out.data());
PRINT(sizeof(T));
// destination is the host vector (device hostID), source is d_data
// (device gpuID); both offsets are 0
omp_target_memcpy(out.data(),d_data,N*sizeof(T),
0,0,hostID,gpuID);
return out;
}

/*! hands `ptr` to omp_target_free() for device `gpuID` */
inline void Context::free(void *ptr)
{
  omp_target_free(ptr,gpuID);
}

/*! typed allocation: requests N*sizeof(T) bytes from
    omp_target_alloc() on device `gpuID` and stores the result, cast to
    T*, in `d_data` */
template<typename T> inline
void Context::alloc(T *&d_data, size_t N)
{
  const size_t numBytes = N*sizeof(T);
  void *const raw = omp_target_alloc(numBytes,gpuID);
  d_data = (T*)raw;
}

// template<typename T> inline
// void Context::alloc_and_upload(T *&d_data,
// const T *h_data,
// size_t N)
// {
// alloc(d_data,N);
// upload(d_data,h_data,N);
// }

// template<typename T> inline
// void Context::alloc_and_upload(T *&d_data,
// const std::vector<T> &h_vector)
// {
// alloc(d_data,h_vector.size());
// upload(d_data,h_vector);
// }

// template<typename T> inline
// std::vector<T> Context::download_vector(const T *d_data,
// size_t N)
// {
// std::vector<T> vec(N);
// omp_target_memcpy(vec.data(),d_data,N*sizeof(T),
// 0,0,hostID,gpuID);
// return vec;
// }

/*! downloads a single value: one omp_target_memcpy() of sizeof(T)
    bytes from `d_value` (on device gpuID) into `h_value` (on device
    hostID). The return value of the copy is not looked at. */
template<typename T>
inline void Context::download(T &h_value, T *d_value)
{
  T *const h_dst = &h_value;
  const size_t numBytes = sizeof(T);
  omp_target_memcpy(h_dst,d_value,numBytes,
                    0,0,
                    /* dst device */hostID,
                    /* src device */gpuID);
}


} // ::cuBQL::omp
} // ::cuBQL
Loading