diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ffa826..8059a18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,8 +5,11 @@ cmake_minimum_required(VERSION 3.16) cmake_policy(SET CMP0048 NEW) set(CMAKE_BUILD_TYPE_INIT "Release") -project(cuBQL VERSION 1.1.0 LANGUAGES C CXX) +project(cuBQL VERSION 1.2.0 LANGUAGES C CXX) +if (CUBQL_OMP) + set(CUBQL_DISABLE_CUDA ON) +endif() if (CUBQL_DISABLE_CUDA) message("#cuBQL: CUDA _DISABLED_ by user request") set(CUBQL_HAVE_CUDA OFF) @@ -136,3 +139,5 @@ add_subdirectory(cuBQL) if (NOT CUBQL_IS_SUBPROJECT) add_subdirectory(samples) endif() + +#add_subdirectory(testing) diff --git a/cuBQL/builder/cuda/sm_builder.h b/cuBQL/builder/cuda/sm_builder.h index 7d8acb1..2442048 100644 --- a/cuBQL/builder/cuda/sm_builder.h +++ b/cuBQL/builder/cuda/sm_builder.h @@ -515,10 +515,10 @@ namespace cuBQL { while (true) { CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes, sizeof(numNodes),cudaMemcpyDeviceToHost,s)); - CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s)); - CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent)); if (numNodes == numDone) break; + CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s)); + CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent)); #if CUBQL_PROFILE t_nodePass[pass].sync_start(); #endif @@ -529,7 +529,7 @@ namespace cuBQL { #if CUBQL_PROFILE t_nodePass[pass].sync_stop(); t_primPass[pass].sync_start(); -#endif +#endif numDone = numNodes; // #if 1 diff --git a/cuBQL/builder/omp.h b/cuBQL/builder/omp.h new file mode 100644 index 0000000..17ba4c2 --- /dev/null +++ b/cuBQL/builder/omp.h @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace cuBQL { + namespace omp { + struct Context; + + template + void refit(BinaryBVH &bvh, + const box_t *boxes, + Context *ctx); + } +} + +#include "cuBQL/builder/omp/refit.h" +#include "cuBQL/builder/omp/spatialMedian.h" + diff --git a/cuBQL/builder/omp/AtomicBox.h b/cuBQL/builder/omp/AtomicBox.h new file mode 100644 index 0000000..9ffac06 --- /dev/null +++ b/cuBQL/builder/omp/AtomicBox.h @@ -0,0 +1,168 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cuBQL/builder/omp/common.h" + + +namespace cuBQL { + namespace omp { + + template + struct AtomicBox : public box_t { + + inline void set_empty() + { + *(box_t *)this = box_t(); + } + }; + + template + inline void atomic_min(T *ptr, T v); + template + inline void atomic_max(T *ptr, T v); + + /*! 
iw - note: this implementation of atomic min/max via atomic + compare-exchange (CAS); which is cetainly not optimal on any + sort of modern GPU - but it works in any C++-21 compliant + compiler, so it's what we do for now */ + inline void atomic_min(float *ptr, float value) + { +#ifdef __NVCOMPILER +# if 1 + float &mem = *ptr; + if (mem <= value) return; + while (1) { + float wasBefore; +#pragma omp atomic capture + { wasBefore = mem; mem = value; } + if (wasBefore >= value) break; + value = wasBefore; + } +# else + float current = *(volatile float *)ptr; + while (current > value) { + bool wasChanged + = ((std::atomic*)ptr) + ->compare_exchange_weak((int&)current,(int&)value); + if (wasChanged) break; + } +# endif +#else + float &x = *ptr; +#pragma omp atomic compare + if (x > value) { x = value; } +// float t; +// #pragma omp atomic capture +// { t = *ptr; *ptr = std::min(t,value); } +#endif + } + + /*! iw - note: this implementation of atomic min/max via atomic + compare-exchange (CAS); which is cetainly not optimal on any + sort of modern GPU - but it works in any C++-21 compliant + compiler, so it's what we do for now */ + inline void atomic_max(float *ptr, float value) + { +#ifdef __NVCOMPILER +# if 1 + float &mem = *ptr; + if (mem >= value) return; + while (1) { + float wasBefore; +#pragma omp atomic capture + { wasBefore = mem; mem = value; } + if (wasBefore <= value) break; + value = wasBefore; + } +# else + float current = *(volatile float *)ptr; + while (current < value) { + bool wasChanged + = ((std::atomic*)ptr) + ->compare_exchange_weak((int&)current,(int&)value); + if (wasChanged) break; + } +# endif +#else + float &x = *ptr; +#pragma omp atomic compare + if (x < value) { x = value; } + // float t; +// #pragma omp atomic capture +// { t = *ptr; *ptr = std::max(t,value); } +#endif + } + + template + inline void v_atomic_min(vec_t *ptr, vec_t v); + template + inline void v_atomic_max(vec_t *ptr, vec_t v); + + + template + inline void v_atomic_min(vec_t *ptr, vec_t v) + { + atomic_min(&ptr->x,v.x); + atomic_min(&ptr->y,v.y); + } + + template + inline void v_atomic_min(vec_t *ptr, vec_t v) + { + atomic_min(&ptr->x,v.x); + atomic_min(&ptr->y,v.y); + atomic_min(&ptr->z,v.z); + } + + template + inline void v_atomic_min(vec_t *ptr, vec_t v) + { + atomic_min(&ptr->x,v.x); + atomic_min(&ptr->y,v.y); + atomic_min(&ptr->z,v.z); + atomic_min(&ptr->w,v.w); + } + + template + inline void v_atomic_max(vec_t *ptr, vec_t v) + { + atomic_max(&ptr->x,v.x); + atomic_max(&ptr->y,v.y); + } + + template + inline void v_atomic_max(vec_t *ptr, vec_t v) + { + atomic_max(&ptr->x,v.x); + atomic_max(&ptr->y,v.y); + atomic_max(&ptr->z,v.z); + } + + template + inline void v_atomic_max(vec_t *ptr, vec_t v) + { + atomic_max(&ptr->x,v.x); + atomic_max(&ptr->y,v.y); + atomic_max(&ptr->z,v.z); + atomic_max(&ptr->w,v.w); + } + + template + inline void atomic_grow(AtomicBox &ab, typename box_t::vec_t P) + { + v_atomic_min(&ab.lower,P); + v_atomic_max(&ab.upper,P); + } + + template + inline void atomic_grow(AtomicBox &ab, box_t B) + { + v_atomic_min(&ab.lower,B.lower); + v_atomic_max(&ab.upper,B.upper); + } + + } +} diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h new file mode 100644 index 0000000..33b8be1 --- /dev/null +++ b/cuBQL/builder/omp/common.h @@ -0,0 +1,162 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. 
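A minimal host-side sketch of the CAS loop that the atomic_min/atomic_max comment in AtomicBox.h above describes, written against std::atomic<float>; the function name cas_atomic_min and the 'slot' argument are illustrative only, not part of the patch:

    #include <atomic>

    inline void cas_atomic_min(std::atomic<float> &slot, float value)
    {
      float current = slot.load(std::memory_order_relaxed);
      // keep retrying until either the stored value is already <= value,
      // or our smaller value was successfully swapped in; a failed
      // compare_exchange_weak refreshes 'current' with the latest value.
      while (current > value &&
             !slot.compare_exchange_weak(current, value))
        ;
    }

The atomic_max case is symmetric (flip the comparisons); the '#pragma omp atomic compare' branch above expresses the same min/max directly where the compiler supports it.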
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cuBQL/bvh.h" +#include +#include + +namespace cuBQL { + namespace omp { + + struct Context { + Context(int gpuID); + + void *alloc(size_t numBytes); + + template + void alloc(T *&d_data, size_t Nelements); + + template + void alloc_and_upload(T *&d_data, const T *h_data, size_t Nelements); + + template + void upload(T *d_data, const T *h_data, size_t Nelements); + + template + void alloc_and_upload(T *&d_data, const std::vector &h_vector); + + template + std::vector download_vector(const T *d_data, size_t N); + + template + void download(T &h_value, T *d_value); + + void free(void *); + + int gpuID; + int hostID; + }; + + struct Kernel { + inline int workIdx() const { return _workIdx; } + int _workIdx; + }; + + inline uint32_t atomicAdd(uint32_t *ptr, uint32_t inc) + { +#ifdef __NVCOMPILER + return (uint32_t)((std::atomic *)ptr)->fetch_add((int)inc); +#else + uint32_t t; +#pragma omp atomic capture + { t = *ptr; *ptr += inc; } + // return ((std::atomic *)p_value)->fetch_add(inc); + return t; +#endif + } + + + // ################################################################## + // IMPLEMENTATION SECTION + // ################################################################## + Context::Context(int gpuID) + : gpuID(gpuID), + hostID(omp_get_initial_device()) + { + assert(gpuID < omp_get_num_devices()); + printf("#cuBQL:omp:Context(gpu=%i/%i,host=%i)\n", + gpuID,omp_get_num_devices(),hostID); + } + + void *Context::alloc(size_t numBytes) + { return omp_target_alloc(numBytes,gpuID); } + + template inline + void Context::upload(T *d_data, + const T *h_data, + size_t N) + { + assert(d_data); + omp_target_memcpy(d_data,h_data,N*sizeof(T), + 0,0,gpuID,hostID); + } + + template inline + void Context::alloc_and_upload(T *&d_data, + const T *h_data, + size_t N) + { + printf("target_alloc N %li gpu %i\n",N,gpuID); + d_data = (T *)omp_target_alloc(N*sizeof(T),gpuID); + printf("ptr %p\n",d_data); + upload(d_data,h_data,N); + } + + template inline + void Context::alloc_and_upload(T *&d_data, + const std::vector &h_vector) + { alloc_and_upload(d_data,h_vector.data(),h_vector.size()); } + + template + std::vector Context::download_vector(const T *d_data, size_t N) + { + PRINT(N); + PRINT(d_data); + + std::vector out(N); + PRINT(out.data()); + PRINT(sizeof(T)); + omp_target_memcpy(out.data(),d_data,N*sizeof(T), + 0,0,hostID,gpuID); + return out; + } + + inline void Context::free(void *ptr) + { omp_target_free(ptr,gpuID); } + + template inline + void Context::alloc(T *&d_data, size_t N) + { + d_data = (T*)omp_target_alloc(N*sizeof(T),gpuID); + } + + // template inline + // void Context::alloc_and_upload(T *&d_data, + // const T *h_data, + // size_t N) + // { + // alloc(d_data,N); + // upload(d_data,h_data,N); + // } + + // template inline + // void Context::alloc_and_upload(T *&d_data, + // const std::vector &h_vector) + // { + // alloc(d_data,h_vector.size()); + // upload(d_data,h_vector); + // } + + // template inline + // std::vector Context::download_vector(const T *d_data, + // size_t N) + // { + // std::vector vec(N); + // omp_target_memcpy(vec.data(),d_data,N*sizeof(T), + // 0,0,hostID,gpuID); + // return vec; + // } + + template + inline void Context::download(T &h_value, T *d_value) + { + omp_target_memcpy(&h_value,d_value,sizeof(T), + 0,0,hostID,gpuID); + } + + + } // ::cuBQL::omp +} // ::cuBQL diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h new file mode 100644 index 0000000..2094554 --- /dev/null +++ 
b/cuBQL/builder/omp/refit.h @@ -0,0 +1,111 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cuBQL/builder/omp/common.h" +#include "cuBQL/builder/omp/AtomicBox.h" + +namespace cuBQL { + namespace omp { + + template inline + void refit_init(Kernel kernel, + typename BinaryBVH::Node *nodes, + uint32_t *refitData, + int numNodes) + { + const int nodeID = kernel.workIdx(); + if (nodeID == 1 || nodeID >= numNodes) return; + if (nodeID < 2) + refitData[0] = 0; + auto &node = nodes[nodeID]; + node.bounds = box_t(); + if (node.admin.count) return; + + refitData[node.admin.offset+0] = nodeID << 2; + refitData[node.admin.offset+1] = nodeID << 2; + } + + template inline + void refit_run(Kernel kernel, + uint32_t *bvh_primIDs, + typename BinaryBVH::Node *bvh_nodes, + uint32_t *refitData, + const box_t *boxes, + int numNodes) + { + int nodeID = kernel.workIdx(); + if (nodeID == 1 || nodeID >= numNodes) return; + + typename BinaryBVH::Node *node = bvh_nodes+nodeID; + + if (node->admin.count == 0) + // this is a inner node - exit + return; + + box_t bounds; bounds.set_empty(); + for (int i=0;iadmin.count;i++) { + const box_t primBox = boxes[bvh_primIDs[node->admin.offset+i]]; + bounds.lower = min(bounds.lower,primBox.lower); + bounds.upper = max(bounds.upper,primBox.upper); + } + + + int parentID = (refitData[nodeID] >> 2); + while (true) { + atomic_grow(*(AtomicBox> *)&node->bounds,bounds); + + if (node == bvh_nodes) + break; + + uint32_t refitBits = atomicAdd(&refitData[parentID],1u); + if ((refitBits & 1) == 0) + // we're the first one - let other one do it + break; + + nodeID = parentID; + node = &bvh_nodes[parentID]; + parentID = (refitBits >> 2); + + int ofs = node->admin.offset; + + typename BinaryBVH::Node l = bvh_nodes[ofs+0]; + typename BinaryBVH::Node r = bvh_nodes[ofs+1]; + bounds.lower = min(l.bounds.lower,r.bounds.lower); + bounds.upper = max(l.bounds.upper,r.bounds.upper); + } + } + + template + void refit(BinaryBVH &bvh, + const box_t *boxes, + Context *ctx) + { + assert(bvh.nodes); + assert(bvh.primIDs); + int numNodes = bvh.numNodes; + uint32_t *refitData + = (uint32_t*)ctx->alloc(numNodes*sizeof(uint32_t)); + auto bvh_nodes = bvh.nodes; + auto bvh_primIDs = bvh.primIDs; + { +#pragma omp target device(ctx->gpuID) is_device_ptr(refitData) is_device_ptr(bvh_nodes) +#pragma omp teams distribute parallel for + for (int i=0;i(Kernel{i},bvh_nodes,refitData,numNodes); + } + + { +#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes) +#pragma omp teams distribute parallel for + for (int i=0;ifree((void*)refitData); + } + + } +} + diff --git a/cuBQL/builder/omp/sort.h b/cuBQL/builder/omp/sort.h new file mode 100644 index 0000000..5f6cb39 --- /dev/null +++ b/cuBQL/builder/omp/sort.h @@ -0,0 +1,179 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
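The bottom-up refit in refit.h above terminates its upward walk with an atomic counter per parent node: the first child to reach a parent stops, the second one continues and merges both child boxes. A host-side sketch of that hand-off, using an illustrative std::atomic<uint32_t> in place of the refitData[] word (the name secondChildProceeds is hypothetical):

    #include <atomic>
    #include <cstdint>

    // mirrors 'atomicAdd(&refitData[parentID],1u)' followed by the
    // '(refitBits & 1) == 0' test in refit_run: low bit clear before the
    // increment means we are the first child at this parent, so we stop;
    // low bit set means the sibling already passed through, so we continue.
    inline bool secondChildProceeds(std::atomic<uint32_t> &parentWord)
    {
      uint32_t before = parentWord.fetch_add(1u);
      return (before & 1u) != 0u;
    }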
+// SPDX-License-Identifier: Apache-2.0 + +// imported from github.com:ingowald/openmp_target_sort under this license: + +// SPDX-FileCopyrightText: Copyright (c) Ingo Wald +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +namespace omp { + namespace bitonic { + +#pragma omp declare target + template + inline + void g_orderSegmentPairs(uint32_t tid, + int logSegLen, + key_t *const d_values, + int numValues) + { + uint32_t segLen = 1<>logSegLen; + uint32_t pairRank = tid-(pairIdx<= numValues) return; + + key_t lv = d_values[l]; + key_t rv = d_values[r]; + if (rv < lv) { + d_values[r] = lv; + d_values[l] = rv; + } + } +#pragma omp end declare target + + template + void g_orderSegmentPairs(uint32_t tid, + int logSegLen, + key_t *const d_keys, + value_t *const d_values, + int numValues) + { + uint32_t segLen = 1<>logSegLen; + uint32_t pairRank = tid-(pairIdx<= numValues) return; + + key_t lk = d_keys[l]; + key_t rk = d_keys[r]; + key_t lv = d_values[l]; + key_t rv = d_values[r]; + if (rk < lk) { + d_values[r] = lv; + d_values[l] = rv; + d_keys[r] = lk; + d_keys[l] = rk; + } + } + + template + void orderSegmentPairs(int logSegLen, + key_t *const d_values, + int numValues, + uint32_t deviceID) + { +#if 0 +#pragma omp target device(deviceID) teams is_device_ptr(d_values) num_teams(128) + { + int bs = + } +#else +#pragma omp target device(deviceID) is_device_ptr(d_values) nowait +#pragma omp teams distribute parallel for + for (int i=0;i + void orderSegmentPairs(int logSegLen, + key_t *const d_keys, + value_t *const d_values, + int numValues, + uint32_t deviceID) + { +// #pragma omp target nowait device(deviceID) is_device_ptr(d_keys) is_device_ptr(d_values) +// #pragma omp target teams distribute parallel for +#pragma omp target nowait device(deviceID) is_device_ptr(d_keys) is_device_ptr(d_values) + for (int i=0;i + void sortSegments(int logSegLen, + key_t *const d_keys, + int numValues, + uint32_t deviceID) + { + if (logSegLen == 0) return; + + sortSegments(logSegLen-1,d_keys,numValues,deviceID); + orderSegmentPairs(logSegLen-1,d_keys,numValues,deviceID); + sortSegments(logSegLen-1,d_keys,numValues,deviceID); + } + + template + void sortSegments(int logSegLen, + key_t *const d_keys, + value_t *const d_values, + int numValues, + uint32_t deviceID) + { + if (logSegLen == 0) return; + + sortSegments(logSegLen-1,d_keys,d_values,numValues,deviceID); + orderSegmentPairs(logSegLen-1,d_keys,d_values,numValues,deviceID); + sortSegments(logSegLen-1,d_keys,d_values,numValues,deviceID); + } + + template + void sort(key_t *const d_values, + int numValues, + uint32_t deviceID) + { + uint32_t logSegLen = 0; + while (1< + void sort(key_t *const d_keys, + value_t *const d_values, + int numValues, + uint32_t deviceID) + { + uint32_t logSegLen = 0; + while ((1< + void omp_target_sort(key_t *const d_values, + size_t numValues, + uint32_t deviceID) + { + assert(numValues < (1ull<<31)); + bitonic::sort(d_values,(int)numValues,deviceID); + } + + template + void omp_target_sort(key_t *const d_keys, + value_t *const d_values, + size_t numValues, + uint32_t deviceID) + { + assert(numValues < (1ull<<31)); + bitonic::sort(d_keys,d_values,(int)numValues,deviceID); + } + +} diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h new file mode 100644 index 0000000..5cca265 --- /dev/null +++ b/cuBQL/builder/omp/spatialMedian.h @@ -0,0 +1,439 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. 
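A usage sketch for the segment sort in sort.h above, assuming key/value arrays that already live on the device (allocated with omp_target_alloc and filled by an earlier target region); N and gpuID are placeholders, and the count must stay below 2^31 per the assert in omp_target_sort:

    uint64_t *d_keys = (uint64_t*)omp_target_alloc(N*sizeof(uint64_t), gpuID);
    uint32_t *d_vals = (uint32_t*)omp_target_alloc(N*sizeof(uint32_t), gpuID);
    /* ... fill d_keys/d_vals inside an omp target region ... */
    omp::omp_target_sort(d_keys, d_vals, N, gpuID);   // sorts the pairs by key
    omp_target_free(d_keys, gpuID);
    omp_target_free(d_vals, gpuID);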
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cuBQL/builder/omp/refit.h" +#include "cuBQL/builder/omp/sort.h" + +namespace cuBQL { + namespace omp { + + struct PrimState { + union { + /* careful with this order - this is intentionally chosen such + that all item with nodeID==-1 will end up at the end of the + list; and all others will be sorted by nodeID */ + struct { + uint64_t primID:31; //!< prim we're talking about + uint64_t done : 1; + uint64_t nodeID:32; //!< node the given prim is (currently) in. + }; + uint64_t bits; + }; + }; + + enum { OPEN_BRANCH, OPEN_NODE, DONE_NODE }; + typedef uint8_t NodeState; + + template + struct CUBQL_ALIGN(16) TempNode { + using box_t = cuBQL::box_t; + union { + struct { + uint32_t count; + AtomicBox centBounds; + } openBranch; + struct { + uint32_t offset; + int dim; + uint32_t tieBreaker; + float pos; + } openNode; + struct { + uint32_t offset; + uint32_t count; + uint32_t unused[2]; + } doneNode; + }; + }; + + template + void initState(Kernel kernel, + uint32_t *pNumNodes, + NodeState *nodeStates, + TempNode *nodes) + { + int tid = kernel.workIdx(); + if (tid > 0) return; + *pNumNodes = 2; + + nodeStates[0] = OPEN_BRANCH; + nodes[0].openBranch.count = 0; + + ((int*)&nodes[0].openBranch.centBounds)[0] = 0; + nodes[0].openBranch.centBounds.set_empty(); + + nodeStates[1] = DONE_NODE; + nodes[1].doneNode.offset = 0; + nodes[1].doneNode.count = 0; + } + + template + void initPrims(Kernel kernel, + TempNode *nodes, + PrimState *primState, + const box_t *primBoxes, + uint32_t numPrims) + { + const int primID = kernel.workIdx(); + if (primID >= numPrims) return; + + auto &me = primState[primID]; + me.primID = primID; + + const box_t box = primBoxes[primID]; + if (box.get_lower(0) <= box.get_upper(0)) { + me.nodeID = 0; + me.done = false; + // this could be made faster by block-reducing ... + atomicAdd(&nodes[0].openBranch.count,1); + auto ctr = box.center(); + atomic_grow(nodes[0].openBranch.centBounds,ctr);//centerOf(box)); + // printf("p %i ctr %f %f %f grownn box %i : (%f %f %f)(%f %f %f)\n", + // primID, + // ctr.x, + // ctr.y, + // ctr.z, + // 0, + // nodes[0].openBranch.centBounds.lower.x, + // nodes[0].openBranch.centBounds.lower.y, + // nodes[0].openBranch.centBounds.lower.z, + // nodes[0].openBranch.centBounds.upper.x, + // nodes[0].openBranch.centBounds.upper.y, + // nodes[0].openBranch.centBounds.upper.z); + } else { + me.nodeID = (uint32_t)-1; + me.done = true; + } + } + + template + void selectSplits(Kernel kernel, + uint32_t *pNumNodes, + NodeState *nodeStates, + TempNode *nodes, + uint32_t numNodes, + BuildConfig buildConfig) + { + const int nodeID = kernel.workIdx();//threadIdx.x+blockIdx.x*blockDim.x; + if (nodeID >= numNodes) return; + + NodeState &nodeState = nodeStates[nodeID]; + if (nodeState == DONE_NODE) + // this node was already closed before + return; + + if (nodeState == OPEN_NODE) { + // this node was open in the last pass, can close it. 
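        // note (not part of the patch): a node that was OPEN_NODE in the
        // previous pass already created its two children via the
        // atomicAdd(pNumNodes,2) below and had its prims redistributed by
        // updatePrims; re-encoding it with count==0 marks it as an inner
        // node, and 'offset' keeps pointing at those children.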
+ nodeState = DONE_NODE; + int offset = nodes[nodeID].openNode.offset; + auto &done = nodes[nodeID].doneNode; + done.count = 0; + done.offset = offset; + return; + } + + auto in = nodes[nodeID].openBranch; + // if (nodeID < 5) + // printf("node split %i : %f %f %f : %f %f %f\n", + // nodeID, + // in.centBounds.lower.x, + // in.centBounds.lower.y, + // in.centBounds.lower.z, + // in.centBounds.upper.x, + // in.centBounds.upper.y, + // in.centBounds.upper.z); + if (in.count <= buildConfig.makeLeafThreshold) { + auto &done = nodes[nodeID].doneNode; + done.count = in.count; + // set this to max-value, so the prims can later do atomicMin + // with their position ion the leaf list; this value is + // greater than any prim position. + done.offset = (uint32_t)-1; + nodeState = DONE_NODE; + } else { + float widestWidth = 0.f; + int widestDim = -1; + float widestLo, widestHi, widestCtr; +#pragma unroll + for (int d=0;d= 0) { + open.pos = widestCtr; + } + open.dim + = (widestDim < 0 || widestCtr == widestLo || widestCtr == widestHi) + ? -1 + : widestDim; + + open.offset = atomicAdd(pNumNodes,2); + // printf("offset %i\n",open.offset); +#pragma unroll + for (int side=0;side<2;side++) { + const int childID = open.offset+side; + auto &child = nodes[childID].openBranch; + + child.centBounds.set_empty(); + child.count = 0; + nodeStates[childID] = OPEN_BRANCH; + } + nodeState = OPEN_NODE; + } + } + + template + void updatePrims(Kernel kernel, + NodeState *nodeStates, + TempNode *nodes, + PrimState *primStates, + const box_t *primBoxes, + int numPrims) + { + const int primID = kernel.workIdx(); + if (primID >= numPrims) return; + + const auto me = primStates[primID]; + if (me.done) return; + + const auto ns = nodeStates[me.nodeID]; + if (ns == DONE_NODE) { + // node became a leaf, we're done. + primStates[primID].done = true; + return; + } + + auto &split = nodes[me.nodeID].openNode; + const box_t primBox = primBoxes[me.primID]; + int side = 0; + if (split.dim == -1) { + // could block-reduce this, but will likely not happen often, anyway + side = (atomicAdd(&split.tieBreaker,1) & 1); + // printf("TIEBREAKER node %i prim %i\n",me.nodeID,me.primID); + } else { + const float center = 0.5f*(primBox.get_lower(split.dim)+ + primBox.get_upper(split.dim)); + side = (center >= split.pos); + } + int newNodeID = split.offset+side; + auto &myBranch = nodes[newNodeID].openBranch; + atomicAdd(&myBranch.count,1); + atomic_grow(myBranch.centBounds,primBox.center()); + primStates[primID].nodeID = newNodeID; + } + + /* given a sorted list of {nodeID,primID} pairs, this kernel does + two things: a) it extracts the 'primID's and puts them into the + bvh's primIDs[] array; and b) it writes, for each leaf nod ein + the nodes[] array, the node.offset value to point to the first + of this nodes' items in that bvh.primIDs[] list. 
*/ + template + void writePrimsAndLeafOffsets(Kernel kernel, + TempNode *nodes, + uint32_t *bvhItemList, + PrimState *primStates, + int numPrims) + { + const int offset = kernel.workIdx();//threadIdx.x+blockIdx.x*blockDim.x; + if (offset >= numPrims) return; + + auto &ps = primStates[offset]; + // printf("ps %i -> %i : %i done %i\n", + // (int)offset,(int)ps.primID,(int)ps.nodeID,(int)ps.done); + bvhItemList[offset] = ps.primID; + + if ((int)ps.nodeID < 0) + /* invalid prim, just skip here */ + return; + auto &node = nodes[ps.nodeID]; + atomicMin(&node.doneNode.offset,offset); + } + + template + void writeLeafOffsets(Kernel kernel, + TempNode *nodes, + uint32_t *pNextFreeOffset, + int numNodes) + { + const int nodeID = kernel.workIdx(); + if (nodeID >= numNodes) return; + + auto &node = nodes[nodeID].doneNode; + if (node.count > 0) { + node.offset = atomicAdd(pNextFreeOffset,node.count)+node.count; + // printf("wrote leaf %i offset %i\n", + // nodeID,node.offset); + } + } + + template + void writePrims(Kernel kernel, + uint32_t *bvh_primIDs, + TempNode *nodes, + PrimState *primStates, + int numPrims) + { + const int primID = kernel.workIdx();//threadIdx.x+blockIdx.x*blockDim.x; + if (primID >= numPrims) return; + + auto &ps = primStates[primID]; + if ((int)ps.nodeID < 0) + /* invalid prim, just skip here */ + return; + auto &node = nodes[ps.nodeID].doneNode; + int myPos = atomicAdd(&node.offset,(uint32_t)-1)-1; + bvh_primIDs[myPos] = ps.primID; + } + + /* writes main phase's temp nodes into final bvh.nodes[] + layout. actual bounds of that will NOT yet bewritten */ + template + void writeNodes(Kernel kernel, + typename BinaryBVH::Node *finalNodes, + TempNode *tempNodes, + int numNodes) + { + const int nodeID = kernel.workIdx(); + if (nodeID >= numNodes) return; + + finalNodes[nodeID].admin.offset = tempNodes[nodeID].doneNode.offset; + finalNodes[nodeID].admin.count = tempNodes[nodeID].doneNode.count; + } + + + template + void spatialMedian(BinaryBVH &bvh, + /*! 
DEVICE array of boxes */ + const box_t *boxes, + uint32_t numPrims, + BuildConfig buildConfig, + Context *ctx) + { + assert(sizeof(PrimState) == sizeof(uint64_t)); + if (buildConfig.makeLeafThreshold < 1) + buildConfig.makeLeafThreshold = 1; + + // ================================================================== + // do build on temp nodes + // ================================================================== + TempNode *tempNodes = 0; + NodeState *nodeStates = 0; + PrimState *primStates = 0; + uint32_t *d_numNodes = 0; + ctx->alloc(tempNodes,2*numPrims); + ctx->alloc(nodeStates,2*numPrims); + ctx->alloc(primStates,numPrims); + ctx->alloc(d_numNodes,1); +#pragma omp target device(ctx->gpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tid<1;tid++) + initState(Kernel{tid}, + d_numNodes, + nodeStates, + tempNodes); +#pragma omp target device(ctx->gpuID) is_device_ptr(tempNodes) is_device_ptr(primStates) is_device_ptr(boxes) +#pragma omp teams distribute parallel for + for (int tid=0;tiddownload(numNodes,d_numNodes); + if (numNodes == numDone) + break; +#pragma omp target device(ctx->gpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tidgpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tidalloc(bvh.primIDs,numPrims); + auto bvh_primIDs = bvh.primIDs; + + uint32_t zero = 0u; + // set first uint there to zero; leaf writing code will + // atomically inc that value to determine leaf offset values. + ctx->upload(bvh_primIDs,&zero,1); +#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tidgpuID) is_device_ptr(bvh_primIDs) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tidalloc(bvh.nodes,numNodes); + auto bvh_nodes = bvh.nodes; +#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_nodes) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tidfree(tempNodes); + ctx->free(nodeStates); + ctx->free(primStates); + ctx->free(d_numNodes); + + cuBQL::omp::refit(bvh,boxes,ctx); + } + + } + + template + void build_omp_target(BinaryBVH &bvh, + const box_t *d_boxes, + uint32_t numBoxes, + BuildConfig buildConfig, + int gpuID) + { + omp::Context ctx(gpuID); + omp::spatialMedian(bvh,d_boxes,numBoxes,buildConfig,&ctx); + } +} + diff --git a/cuBQL/math/vec.h b/cuBQL/math/vec.h index 10ffdb7..11d7b37 100644 --- a/cuBQL/math/vec.h +++ b/cuBQL/math/vec.h @@ -356,34 +356,53 @@ namespace cuBQL { /* vec:vec */ \ template \ inline __cubql_both \ - vec_t long_op(vec_t a, vec_t b) \ + vec_t long_op(const vec_t &a, const vec_t &b) \ { \ vec_t r; \ - CUBQL_PRAGMA_UNROLL \ - for (int i=0;i \ + template \ inline __cubql_both \ - vec_t long_op(T a, vec_t b) \ + vec_t long_op(const vec_t &a, const vec_t &b) \ { \ - vec_t r; \ - CUBQL_PRAGMA_UNROLL \ - for (int i=0;i r; \ + r.x = a.x op b.x; \ + r.y = a.y op b.y; \ return r; \ } \ - /* vec:scalar */ \ - template \ - inline __cubql_both \ - vec_t long_op(vec_t a, T b) \ - { \ - vec_t r; \ - CUBQL_PRAGMA_UNROLL \ - for (int i=0;i \ + inline __cubql_both \ + vec_t long_op(const vec_t &a, const vec_t &b) \ + { \ + vec_t r; \ + r.x = a.x op b.x; \ + r.y = a.y op b.y; \ + r.z = a.z op b.z; \ + return r; \ + } \ + template \ + inline 
__cubql_both \ + vec_t long_op(const vec_t &a, const vec_t &b) \ + { \ + vec_t r; \ + r.x = a.x op b.x; \ + r.y = a.y op b.y; \ + r.z = a.z op b.z; \ + r.w = a.w op b.w; \ + return r; \ + } \ + /* scalar-vec */ \ + template \ + inline __cubql_both \ + vec_t long_op(T a, const vec_t &b) \ + { return vec_t(a) op b; } \ + /* vec:scalar */ \ + template \ + inline __cubql_both \ + vec_t long_op(const vec_t &a, T b) \ + { return a op vec_t(b); } \ + CUBQL_OPERATOR_CUDA_T(long_op,op) CUBQL_OPERATOR(operator+,+) CUBQL_OPERATOR(operator-,-)
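A usage sketch for the rewritten vec operator macros above: the per-dimension specializations replace the CUBQL_PRAGMA_UNROLL loop with explicit component-wise expressions, and the scalar overloads forward through a broadcasting vec_t constructor. Assuming the usual vec3f shorthand for vec_t<float,3>:

    cuBQL::vec3f a, b;          // vec3f assumed to be vec_t<float,3>
    /* ... fill a and b ... */
    cuBQL::vec3f s = a + b;     // {a.x+b.x, a.y+b.y, a.z+b.z}, no per-element loop
    cuBQL::vec3f t = 2.f * a;   // scalar:vec  ->  vec3f(2.f) * a
    cuBQL::vec3f u = b - 1.f;   // vec:scalar  ->  b - vec3f(1.f)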