From 6515227ec951503c561a3720c36f8df0e93d8c01 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Wed, 21 Jan 2026 13:54:30 -0700 Subject: [PATCH 01/14] refit ported to openmp --- cuBQL/builder/omp.h | 19 ++ cuBQL/builder/omp/common.h | 20 ++ cuBQL/builder/omp/refit.h | 104 +++++++ cuBQL/builder/omp/spatialMedian.h | 443 ++++++++++++++++++++++++++++++ 4 files changed, 586 insertions(+) create mode 100644 cuBQL/builder/omp.h create mode 100644 cuBQL/builder/omp/common.h create mode 100644 cuBQL/builder/omp/refit.h create mode 100644 cuBQL/builder/omp/spatialMedian.h diff --git a/cuBQL/builder/omp.h b/cuBQL/builder/omp.h new file mode 100644 index 0000000..17ba4c2 --- /dev/null +++ b/cuBQL/builder/omp.h @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace cuBQL { + namespace omp { + struct Context; + + template + void refit(BinaryBVH &bvh, + const box_t *boxes, + Context *ctx); + } +} + +#include "cuBQL/builder/omp/refit.h" +#include "cuBQL/builder/omp/spatialMedian.h" + diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h new file mode 100644 index 0000000..e516f35 --- /dev/null +++ b/cuBQL/builder/omp/common.h @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace cuBQL { + namespace omp { + + struct Context { + Context(int gpuID); + int gpuID; + int hostID; + }; + struct Kernel { + inline int workIdx() const { return _workIdx; } + int _workIdx; + }; + + } +} diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h new file mode 100644 index 0000000..420d30f --- /dev/null +++ b/cuBQL/builder/omp/refit.h @@ -0,0 +1,104 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cuBQL/builder/omp/common.h" + +namespace cuBQL { + namespace omp { + + struct Context { + Context(int gpuID); + int gpuID; + int hostID; + }; + struct Kernel { + inline int workIdx() const { return _workIdx; } + int _workIdx; + }; + + template inline + void refit_init(Kernel kernel, + const typename BinaryBVH::Node *nodes, + uint32_t *refitData, + int numNodes) + { + const int nodeID = kernel.workIdx(); + if (nodeID == 1 || nodeID >= numNodes) return; + if (nodeID < 2) + refitData[0] = 0; + const auto &node = nodes[nodeID]; + if (node.admin.count) return; + + refitData[node.admin.offset+0] = nodeID << 1; + refitData[node.admin.offset+1] = nodeID << 1; + } + + template inline + void refit_run(Kernel kernel, + BinaryBVH bvh, + uint32_t *refitData, + const box_t *boxes) + { + int nodeID = kernel.workIdx(); + if (nodeID == 1 || nodeID >= bvh.numNodes) return; + + typename BinaryBVH::Node *node = &bvh.nodes[nodeID]; + if (node->admin.count == 0) + // this is a inner node - exit + return; + + box_t bounds; bounds.set_empty(); + for (int i=0;iadmin.count;i++) { + const box_t primBox = boxes[bvh.primIDs[node->admin.offset+i]]; + bounds.lower = min(bounds.lower,primBox.lower); + bounds.upper = max(bounds.upper,primBox.upper); + } + + int parentID = (refitData[nodeID] >> 1); + while (true) { + node->bounds = bounds; + // __threadfence(); + if (node == bvh.nodes) + break; + + uint32_t refitBits = atomicAdd(&refitData[parentID],1u); + if ((refitBits & 1) == 0) + // we're the first one - let other one do it + break; + + nodeID = parentID; + node = &bvh.nodes[parentID]; + parentID = (refitBits >> 1); + + typename BinaryBVH::Node l = bvh.nodes[node->admin.offset+0]; + typename BinaryBVH::Node r = bvh.nodes[node->admin.offset+1]; + bounds.lower = min(l.bounds.lower,r.bounds.lower); + bounds.upper = max(l.bounds.upper,r.bounds.upper); + } + } + + template + void refit(BinaryBVH &bvh, + const box_t *boxes, + Context *ctx) + { + int numNodes = bvh.numNodes; + uint32_t *refitData + = (uint32_t*)ctx->malloc(numNodes*sizeof(int)); + +# pragma omp target device(context->gpuID) +# pragma omp teams distribute parallel for + for (int i=0;i(Kernel{i},bvh.nodes,refitData,bvh.numNodes); +# pragma omp target device(context->gpuID) +# pragma omp teams distribute parallel for + for (int i=0;ifree((void*)refitData); + } + + } +} + diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h new file mode 100644 index 0000000..7dea853 --- /dev/null +++ b/cuBQL/builder/omp/spatialMedian.h @@ -0,0 +1,443 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cuBQL/builder/omp/refit.h" + +namespace cuBQL { + namespace omp { + + struct PrimState { + union { + /* careful with this order - this is intentionally chosen such + that all item with nodeID==-1 will end up at the end of the + list; and all others will be sorted by nodeID */ + struct { + uint64_t primID:31; //!< prim we're talking about + uint64_t done : 1; + uint64_t nodeID:32; //!< node the given prim is (currently) in. + }; + uint64_t bits; + }; + }; + + template + struct CUBQL_ALIGN(16) TempNode { + using box_t = cuBQL::box_t; + union { + struct { + AtomicBox centBounds; + uint32_t count; + uint32_t unused; + } openBranch; + struct { + uint32_t offset; + int dim; + uint32_t tieBreaker; + float pos; + } openNode; + struct { + uint32_t offset; + uint32_t count; + uint32_t unused[2]; + } doneNode; + }; + }; + + template + __global__ + void initState(BuildState *buildState, + NodeState *nodeStates, + TempNode *nodes) + { + buildState->numNodes = 2; + + nodeStates[0] = OPEN_BRANCH; + nodes[0].openBranch.count = 0; + nodes[0].openBranch.centBounds.set_empty(); + + nodeStates[1] = DONE_NODE; + nodes[1].doneNode.offset = 0; + nodes[1].doneNode.count = 0; + } + + template + __global__ void initPrims(TempNode *nodes, + PrimState *primState, + const box_t *primBoxes, + uint32_t numPrims) + { + const int primID = threadIdx.x+blockIdx.x*blockDim.x; + if (primID >= numPrims) return; + + auto &me = primState[primID]; + me.primID = primID; + + const box_t box = primBoxes[primID]; + if (box.get_lower(0) <= box.get_upper(0)) { + me.nodeID = 0; + me.done = false; + // this could be made faster by block-reducing ... + atomicAdd(&nodes[0].openBranch.count,1); + atomic_grow(nodes[0].openBranch.centBounds,box.center());//centerOf(box)); + } else { + me.nodeID = (uint32_t)-1; + me.done = true; + } + } + + template + __global__ + void selectSplits(BuildState *buildState, + NodeState *nodeStates, + TempNode *nodes, + uint32_t numNodes, + BuildConfig buildConfig) + { +#if 1 + __shared__ int l_newNodeOfs; + if (threadIdx.x == 0) + l_newNodeOfs = 0; + __syncthreads(); + + int *t_nodeOffsetToWrite = 0; + int t_localOffsetToAdd = 0; + + while (true) { + const int nodeID = threadIdx.x+blockIdx.x*blockDim.x; + if (nodeID >= numNodes) + break; + + NodeState &nodeState = nodeStates[nodeID]; + if (nodeState == DONE_NODE) + // this node was already closed before + break; + + if (nodeState == OPEN_NODE) { + // this node was open in the last pass, can close it. + nodeState = DONE_NODE; + int offset = nodes[nodeID].openNode.offset; + auto &done = nodes[nodeID].doneNode; + done.count = 0; + done.offset = offset; + break; + } + + auto in = nodes[nodeID].openBranch; + if (in.count <= buildConfig.makeLeafThreshold) { + auto &done = nodes[nodeID].doneNode; + done.count = in.count; + // set this to max-value, so the prims can later do atomicMin + // with their position ion the leaf list; this value is + // greater than any prim position. + done.offset = (uint32_t)-1; + nodeState = DONE_NODE; + } else { + float widestWidth = 0.f; + int widestDim = -1; + float widestLo, widestHi, widestCtr; +#pragma unroll + for (int d=0;d= 0) { + open.pos = widestCtr; + } + open.dim + = (widestDim < 0 || widestCtr == widestLo || widestCtr == widestHi) + ? -1 + : widestDim; + + // this will be epensive - could make this faster by block-reducing + // open.offset = atomicAdd(&buildState->numNodes,2); + t_nodeOffsetToWrite = (int*)&open.offset; + t_localOffsetToAdd = atomicAdd(&l_newNodeOfs,2); + nodeState = OPEN_NODE; + } + break; + } + __syncthreads(); + if (threadIdx.x == 0 && l_newNodeOfs > 0) + l_newNodeOfs = atomicAdd(&buildState->numNodes,l_newNodeOfs); + __syncthreads(); + if (t_nodeOffsetToWrite) { + int openOffset = *t_nodeOffsetToWrite = l_newNodeOfs + t_localOffsetToAdd; +#pragma unroll + for (int side=0;side<2;side++) { + const int childID = openOffset+side; + auto &child = nodes[childID].openBranch; + child.centBounds.set_empty(); + child.count = 0; + nodeStates[childID] = OPEN_BRANCH; + } + } +#else + const int nodeID = threadIdx.x+blockIdx.x*blockDim.x; + if (nodeID >= numNodes) return; + + NodeState &nodeState = nodeStates[nodeID]; + if (nodeState == DONE_NODE) + // this node was already closed before + return; + + if (nodeState == OPEN_NODE) { + // this node was open in the last pass, can close it. + nodeState = DONE_NODE; + int offset = nodes[nodeID].openNode.offset; + auto &done = nodes[nodeID].doneNode; + done.count = 0; + done.offset = offset; + return; + } + + auto in = nodes[nodeID].openBranch; + if (in.count <= buildConfig.makeLeafThreshold) { + auto &done = nodes[nodeID].doneNode; + done.count = in.count; + // set this to max-value, so the prims can later do atomicMin + // with their position ion the leaf list; this value is + // greater than any prim position. + done.offset = (uint32_t)-1; + nodeState = DONE_NODE; + } else { + float widestWidth = 0.f; + int widestDim = -1; + float widestLo, widestHi, widestCtr; +#pragma unroll + for (int d=0;d= 0) { + open.pos = widestCtr; + } + open.dim + = (widestDim < 0 || widestCtr == widestLo || widestCtr == widestHi) + ? -1 + : widestDim; + + // this will be epensive - could make this faster by block-reducing + open.offset = atomicAdd(&buildState->numNodes,2); +#pragma unroll + for (int side=0;side<2;side++) { + const int childID = open.offset+side; + auto &child = nodes[childID].openBranch; + child.centBounds.set_empty(); + child.count = 0; + nodeStates[childID] = OPEN_BRANCH; + } + nodeState = OPEN_NODE; + } +#endif + } + + template + __global__ + void updatePrims(NodeState *nodeStates, + TempNode *nodes, + PrimState *primStates, + const box_t *primBoxes, + int numPrims) + { + const int primID = threadIdx.x+blockIdx.x*blockDim.x; + if (primID >= numPrims) return; + + const auto me = primStates[primID]; + if (me.done) return; + + const auto ns = nodeStates[me.nodeID]; + if (ns == DONE_NODE) { + // node became a leaf, we're done. + primStates[primID].done = true; + return; + } + + auto &split = nodes[me.nodeID].openNode; + const box_t primBox = primBoxes[me.primID]; + int side = 0; + if (split.dim == -1) { + // could block-reduce this, but will likely not happen often, anyway + side = (atomicAdd(&split.tieBreaker,1) & 1); + } else { + const float center = 0.5f*(primBox.get_lower(split.dim)+ + primBox.get_upper(split.dim)); + side = (center >= split.pos); + } + int newNodeID = split.offset+side; + auto &myBranch = nodes[newNodeID].openBranch; + atomicAdd(&myBranch.count,1); + atomic_grow(myBranch.centBounds,primBox.center()); + primStates[primID].nodeID = newNodeID; + } + + /* given a sorted list of {nodeID,primID} pairs, this kernel does + two things: a) it extracts the 'primID's and puts them into the + bvh's primIDs[] array; and b) it writes, for each leaf nod ein + the nodes[] array, the node.offset value to point to the first + of this nodes' items in that bvh.primIDs[] list. */ + template + __global__ + void writePrimsAndLeafOffsets(TempNode *nodes, + uint32_t *bvhItemList, + PrimState *primStates, + int numPrims) + { + const int offset = threadIdx.x+blockIdx.x*blockDim.x; + if (offset >= numPrims) return; + + auto &ps = primStates[offset]; + bvhItemList[offset] = ps.primID; + + if ((int)ps.nodeID < 0) + /* invalid prim, just skip here */ + return; + auto &node = nodes[ps.nodeID]; + atomicMin(&node.doneNode.offset,offset); + } + + /* writes main phase's temp nodes into final bvh.nodes[] + layout. actual bounds of that will NOT yet bewritten */ + template + __global__ + void writeNodes(typename BinaryBVH::Node *finalNodes, + TempNode *tempNodes, + int numNodes) + { + const int nodeID = threadIdx.x+blockIdx.x*blockDim.x; + if (nodeID >= numNodes) return; + + finalNodes[nodeID].admin.offset = tempNodes[nodeID].doneNode.offset; + finalNodes[nodeID].admin.count = tempNodes[nodeID].doneNode.count; + } + + + template + void build(BinaryBVH &bvh, + const box_t *boxes, + int numPrims, + BuildConfig buildConfig, + cudaStream_t s, + GpuMemoryResource &memResource) + { + assert(sizeof(PrimState) == sizeof(uint64_t)); + + // ================================================================== + // do build on temp nodes + // ================================================================== + TempNode *tempNodes = 0; + NodeState *nodeStates = 0; + PrimState *primStates = 0; + BuildState *buildState = 0; + ctx->malloc(tempNodes,2*numPrims); + ctx->malloc(nodeStates,2*numPrims); + ctx->malloc(primStates,numPrims); + ctx->malloc(buildState,1); + initState<<<1,1,0,s>>>(buildState, + nodeStates, + tempNodes); + initPrims<<>> + (tempNodes, + primStates,boxes,numPrims); + + int numDone = 0; + int numNodes; + + // ------------------------------------------------------------------ + cudaEvent_t stateDownloadedEvent; + CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent)); + + + while (true) { + CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes, + sizeof(numNodes),cudaMemcpyDeviceToHost,s)); + CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s)); + CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent)); + if (numNodes == numDone) + break; + selectSplits<<>> + (buildState, + nodeStates,tempNodes,numNodes, + buildConfig); + numDone = numNodes; + + updatePrims<<>> + (nodeStates,tempNodes, + primStates,boxes,numPrims); + } + CUBQL_CUDA_CALL(EventDestroy(stateDownloadedEvent)); + // ================================================================== + // sort {item,nodeID} list + // ================================================================== + + // set up sorting of prims + uint8_t *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + PrimState *sortedPrimStates = 0; + ctx->malloc(sortedPrimStates,numPrims); + auto rc = + cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes, + (uint64_t*)primStates, + (uint64_t*)sortedPrimStates, + numPrims,32,64,s); + ctx->malloc(d_temp_storage,temp_storage_bytes); + rc = + cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes, + (uint64_t*)primStates, + (uint64_t*)sortedPrimStates, + numPrims,32,64,s); + rc = rc; + ctx->free(d_temp_storage); + // ================================================================== + // allocate and write BVH item list, and write offsets of leaf nodes + // ================================================================== + + bvh.numPrims = numPrims; + ctx->malloc(bvh.primIDs,numPrims); + writePrimsAndLeafOffsets<<>> + (tempNodes,bvh.primIDs,sortedPrimStates,numPrims); + + // ================================================================== + // allocate and write final nodes + // ================================================================== + bvh.numNodes = numNodes; + ctx->malloc(bvh.nodes,numNodes); + writeNodes<<>> + (bvh.nodes,tempNodes,numNodes); + ctx->free(sortedPrimStates); + ctx->free(tempNodes); + ctx->free(nodeStates); + ctx->free(primStates); + ctx->free(buildState); + + refit(bvh); + } + + } +} + From 6263e92a503797d9e52ad2ca41de9f889189032f Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Fri, 23 Jan 2026 09:39:10 -0700 Subject: [PATCH 02/14] added openmp sorter --- CMakeLists.txt | 7 ++ cuBQL/builder/omp/common.h | 61 ++++++++++- cuBQL/builder/omp/sort.h | 204 +++++++++++++++++++++++++++++++++++++ testing/omp_sort.cpp | 40 ++++++++ 4 files changed, 309 insertions(+), 3 deletions(-) create mode 100644 cuBQL/builder/omp/sort.h create mode 100644 testing/omp_sort.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ffa826..a32e0da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,9 @@ cmake_policy(SET CMP0048 NEW) set(CMAKE_BUILD_TYPE_INIT "Release") project(cuBQL VERSION 1.1.0 LANGUAGES C CXX) +if (CUBQL_OMP) + set(CUBQL_DISABLE_CUDA ON) +endif() if (CUBQL_DISABLE_CUDA) message("#cuBQL: CUDA _DISABLED_ by user request") set(CUBQL_HAVE_CUDA OFF) @@ -136,3 +139,7 @@ add_subdirectory(cuBQL) if (NOT CUBQL_IS_SUBPROJECT) add_subdirectory(samples) endif() + +if (CUBQL_OMP) + add_subdirectory(testing) +endif() diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h index e516f35..df0a4ca 100644 --- a/cuBQL/builder/omp/common.h +++ b/cuBQL/builder/omp/common.h @@ -1,13 +1,27 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 #pragma once +#include "cuBQL/bvh.h" +#include + namespace cuBQL { namespace omp { struct Context { Context(int gpuID); + + template + void alloc_and_upload(T *&d_data, const T *h_data, size_t Nelements); + + template + void alloc_and_upload(T *&d_data, const std::vector &h_vector); + + template + std::vector download_vector(const T *d_data, size_t N); + int gpuID; int hostID; }; @@ -16,5 +30,46 @@ namespace cuBQL { int _workIdx; }; - } -} + + + // ################################################################## + // IMPLEMENTATION SECTION + // ################################################################## + Context::Context(int gpuID) + : gpuID(gpuID), + hostID(omp_get_initial_device()) + { + assert(gpuID < omp_get_num_devices()); + printf("#cuBQL:omp:Context(gpu=%i/%i,host=%i)\n", + gpuID,omp_get_num_devices(),hostID); + } + + template + void Context::alloc_and_upload(T *&d_data, + const T *h_data, + size_t N) + { + printf("target_alloc N %li gpu %i\n",N,gpuID); + d_data = (T *)omp_target_alloc(N*sizeof(T),gpuID); + printf("ptr %p\n",d_data); + assert(d_data); + omp_target_memcpy(d_data,h_data,N*sizeof(T), + 0,0,gpuID,hostID); + } + + template + void Context::alloc_and_upload(T *&d_data, + const std::vector &h_vector) + { alloc_and_upload(d_data,h_vector.data(),h_vector.size()); } + + template + std::vector Context::download_vector(const T *d_data, size_t N) + { + std::vector out(N); + omp_target_memcpy(out.data(),d_data,N*sizeof(T), + 0,0,hostID,gpuID); + return out; + } + + } // ::cuBQL::omp +} // ::cuBQL diff --git a/cuBQL/builder/omp/sort.h b/cuBQL/builder/omp/sort.h new file mode 100644 index 0000000..3e5ca6b --- /dev/null +++ b/cuBQL/builder/omp/sort.h @@ -0,0 +1,204 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cuBQL/builder/omp/common.h" + +namespace cuBQL { + namespace omp { + namespace bitonic { + + template + void print_all(key_t *const d_values, + int numValues, + int barInterval, int commaInterval, + Context *omp) + { + std::vector v = omp->download_vector(d_values,numValues); + for (int i=0;i + void g_orderSegmentPairs(Kernel kernel, + int logSegLen, + key_t *const d_values, + int numValues) + + { + uint32_t tid = kernel.workIdx(); + + // bool dbg = tid == 1 || tid == 0; + + uint32_t segLen = 1<>logSegLen; + uint32_t pairRank = tid-(pairIdx<= numValues) return; + + key_t lv = d_values[l]; + key_t rv = d_values[r]; + if (rv < lv) { + d_values[r] = lv; + d_values[l] = rv; + } + } + + template + void g_orderSegmentPairs(Kernel kernel, + int logSegLen, + key_t *const d_keys, + value_t *const d_values, + int numValues) + + { + uint32_t tid = kernel.workIdx(); + + // bool dbg = tid == 1 || tid == 0; + + uint32_t segLen = 1<>logSegLen; + uint32_t pairRank = tid-(pairIdx<= numValues) return; + + key_t lk = d_keys[l]; + key_t rk = d_keys[r]; + key_t lv = d_values[l]; + key_t rv = d_values[r]; + if (rk < lk) { + d_values[r] = lv; + d_values[l] = rv; + d_keys[r] = lk; + d_keys[l] = rk; + } + } + + template + void orderSegmentPairs(int logSegLen, + key_t *const d_values, + int numValues, + Context *omp) + + { +#pragma omp target device(omp->gpuID) +#pragma omp teams distribute parallel for + for (int i=0;i + void orderSegmentPairs(int logSegLen, + key_t *const d_keys, + value_t *const d_values, + int numValues, + Context *omp) + + { +#pragma omp target device(omp->gpuID) +#pragma omp teams distribute parallel for + for (int i=0;i + void sortSegments(int logSegLen, + key_t *const d_keys, + int numValues, + Context *omp) + { + if (logSegLen == 0) return; + + sortSegments(logSegLen-1,d_keys,numValues,omp); + orderSegmentPairs(logSegLen-1,d_keys,numValues,omp); + sortSegments(logSegLen-1,d_keys,numValues,omp); + } + + template + void sortSegments(int logSegLen, + key_t *const d_keys, + value_t *const d_values, + int numValues, + Context *omp) + { + if (logSegLen == 0) return; + + sortSegments(logSegLen-1,d_keys,d_values,numValues,omp); + orderSegmentPairs(logSegLen-1,d_keys,d_values,numValues,omp); + sortSegments(logSegLen-1,d_keys,d_values,numValues,omp); + } + + template + void sort(key_t *const d_values, + int numValues, + Context *omp) + + { + uint32_t logSegLen = 0; + while (1< + void sort(key_t *const d_keys, + value_t *const d_values, + int numValues, + Context *omp) + + { + uint32_t logSegLen = 0; + while (1< + void sort(key_t *const d_values, + size_t numValues, + Context *omp) + { + assert(numValues < (1ull<<31)); + bitonic::sort(d_values,(int)numValues,omp); + } + + template + void sort(key_t *const d_keys, + value_t *const d_values, + size_t numValues, + Context *omp) + { + assert(numValues < (1ull<<31)); + bitonic::sort(d_keys,d_values,(int)numValues,omp); + } + } +} diff --git a/testing/omp_sort.cpp b/testing/omp_sort.cpp new file mode 100644 index 0000000..b043581 --- /dev/null +++ b/testing/omp_sort.cpp @@ -0,0 +1,40 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "cuBQL/builder/omp/sort.h" +#include + +int main(int ac, char **av) +{ + cuBQL::omp::Context omp(0); + + int N = 13; + // int N = 123453; + std::vector inputs(N); + for (int i=0;i results + = omp.download_vector(d_data,N); + for (int i=1;i results[i]) + throw std::runtime_error("Not sorted..."); + } + std::cout << "sorted - perfect!" << std::endl; +} From 96eb64fd2d154d066b47c1c57226b5e48afd0bc0 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Fri, 23 Jan 2026 18:32:31 -0700 Subject: [PATCH 03/14] added sorting code and omp skeleton --- cuBQL/builder/omp/sort.h | 88 ++++++++++++---------------------------- testing/omp_sort.cpp | 9 +--- 2 files changed, 27 insertions(+), 70 deletions(-) diff --git a/cuBQL/builder/omp/sort.h b/cuBQL/builder/omp/sort.h index 3e5ca6b..4f1db34 100644 --- a/cuBQL/builder/omp/sort.h +++ b/cuBQL/builder/omp/sort.h @@ -4,52 +4,25 @@ #pragma once -#include "cuBQL/builder/omp/common.h" +#include namespace cuBQL { namespace omp { namespace bitonic { template - void print_all(key_t *const d_values, - int numValues, - int barInterval, int commaInterval, - Context *omp) - { - std::vector v = omp->download_vector(d_values,numValues); - for (int i=0;i - void g_orderSegmentPairs(Kernel kernel, + void g_orderSegmentPairs(uint32_t tid, int logSegLen, key_t *const d_values, int numValues) - { - uint32_t tid = kernel.workIdx(); - - // bool dbg = tid == 1 || tid == 0; - uint32_t segLen = 1<>logSegLen; uint32_t pairRank = tid-(pairIdx<= numValues) return; @@ -62,17 +35,12 @@ namespace cuBQL { } template - void g_orderSegmentPairs(Kernel kernel, + void g_orderSegmentPairs(uint32_t tid, int logSegLen, key_t *const d_keys, value_t *const d_values, int numValues) - { - uint32_t tid = kernel.workIdx(); - - // bool dbg = tid == 1 || tid == 0; - uint32_t segLen = 1<>logSegLen; uint32_t pairRank = tid-(pairIdx<= numValues) return; @@ -103,13 +68,12 @@ namespace cuBQL { void orderSegmentPairs(int logSegLen, key_t *const d_values, int numValues, - Context *omp) - + uint32_t deviceID) { -#pragma omp target device(omp->gpuID) +#pragma omp target device(deviceID) #pragma omp teams distribute parallel for for (int i=0;igpuID) +#pragma omp target device(deviceID) #pragma omp teams distribute parallel for for (int i=0;i void sort(key_t *const d_values, int numValues, - Context *omp) - + uint32_t deviceID) { uint32_t logSegLen = 0; while (1< - void sort(key_t *const d_values, - size_t numValues, - Context *omp) + void omp_target_sort(key_t *const d_values, + size_t numValues, + uint32_t deviceID) { assert(numValues < (1ull<<31)); - bitonic::sort(d_values,(int)numValues,omp); + bitonic::sort(d_values,(int)numValues,deviceID); } - + template - void sort(key_t *const d_keys, - value_t *const d_values, - size_t numValues, - Context *omp) + void omp_target_sort(key_t *const d_keys, + value_t *const d_values, + size_t numValues, + uint32_t deviceID) { assert(numValues < (1ull<<31)); - bitonic::sort(d_keys,d_values,(int)numValues,omp); + bitonic::sort(d_keys,d_values,(int)numValues,deviceID); } + } } diff --git a/testing/omp_sort.cpp b/testing/omp_sort.cpp index b043581..8227709 100644 --- a/testing/omp_sort.cpp +++ b/testing/omp_sort.cpp @@ -2,6 +2,7 @@ // CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +#include "cuBQL/bvh.h" #include "cuBQL/builder/omp/sort.h" #include @@ -21,13 +22,7 @@ int main(int ac, char **av) omp.alloc_and_upload(d_data,inputs); printf("d_data %p\n",d_data); -// #pragma omp target device(omp.gpuID) -// #pragma omp teams distribute parallel for -// for (int i=0;i<20;i++) -// if (1< results = omp.download_vector(d_data,N); From 08fb0d9cc80df855ba8d1a7d3485d3ba8e6527df Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Fri, 23 Jan 2026 20:28:23 -0700 Subject: [PATCH 04/14] added first draft of builder --- cuBQL/builder/omp/common.h | 27 +++ cuBQL/builder/omp/refit.h | 16 +- cuBQL/builder/omp/sort.h | 260 ++++++++++++++-------------- cuBQL/builder/omp/spatialMedian.h | 277 +++++++++++------------------- 4 files changed, 261 insertions(+), 319 deletions(-) diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h index df0a4ca..261d06f 100644 --- a/cuBQL/builder/omp/common.h +++ b/cuBQL/builder/omp/common.h @@ -6,6 +6,7 @@ #include "cuBQL/bvh.h" #include +#include namespace cuBQL { namespace omp { @@ -13,6 +14,11 @@ namespace cuBQL { struct Context { Context(int gpuID); + void *alloc(size_t Nelements); + + template + void alloc(T *&d_data, size_t Nelements); + template void alloc_and_upload(T *&d_data, const T *h_data, size_t Nelements); @@ -21,15 +27,36 @@ namespace cuBQL { template std::vector download_vector(const T *d_data, size_t N); + + template + void download(T &h_value, T *d_value); + + void free(void *); int gpuID; int hostID; }; + struct Kernel { inline int workIdx() const { return _workIdx; } int _workIdx; }; + inline uint32_t atomicAdd(uint32_t *p_value, uint32_t inc) + { + return ((std::atomic *)p_value)->fetch_add(inc); + } + + inline void atomicMin(uint32_t *p_value, uint32_t other) + { + uint32_t current = *(volatile uint32_t *)p_value; + while (current > other) { + bool wasChanged + = ((std::atomic*)p_value) + ->compare_exchange_weak((int&)current,(int&)other); + if (wasChanged) break; + } + } // ################################################################## diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h index 420d30f..d1c8f9a 100644 --- a/cuBQL/builder/omp/refit.h +++ b/cuBQL/builder/omp/refit.h @@ -8,16 +8,6 @@ namespace cuBQL { namespace omp { - struct Context { - Context(int gpuID); - int gpuID; - int hostID; - }; - struct Kernel { - inline int workIdx() const { return _workIdx; } - int _workIdx; - }; - template inline void refit_init(Kernel kernel, const typename BinaryBVH::Node *nodes, @@ -86,13 +76,13 @@ namespace cuBQL { { int numNodes = bvh.numNodes; uint32_t *refitData - = (uint32_t*)ctx->malloc(numNodes*sizeof(int)); + = (uint32_t*)ctx->alloc(numNodes*sizeof(int)); -# pragma omp target device(context->gpuID) +# pragma omp target device(ctx->gpuID) # pragma omp teams distribute parallel for for (int i=0;i(Kernel{i},bvh.nodes,refitData,bvh.numNodes); -# pragma omp target device(context->gpuID) +# pragma omp target device(ctx->gpuID) # pragma omp teams distribute parallel for for (int i=0;i -namespace cuBQL { - namespace omp { - namespace bitonic { +namespace omp { + namespace bitonic { - template - void g_orderSegmentPairs(uint32_t tid, - int logSegLen, - key_t *const d_values, - int numValues) - { - uint32_t segLen = 1<>logSegLen; - uint32_t pairRank = tid-(pairIdx< + void g_orderSegmentPairs(uint32_t tid, + int logSegLen, + key_t *const d_values, + int numValues) + { + uint32_t segLen = 1<>logSegLen; + uint32_t pairRank = tid-(pairIdx<= numValues) return; + if (r >= numValues) return; - key_t lv = d_values[l]; - key_t rv = d_values[r]; - if (rv < lv) { - d_values[r] = lv; - d_values[l] = rv; - } + key_t lv = d_values[l]; + key_t rv = d_values[r]; + if (rv < lv) { + d_values[r] = lv; + d_values[l] = rv; } + } - template - void g_orderSegmentPairs(uint32_t tid, - int logSegLen, - key_t *const d_keys, - value_t *const d_values, - int numValues) - { - uint32_t segLen = 1<>logSegLen; - uint32_t pairRank = tid-(pairIdx< + void g_orderSegmentPairs(uint32_t tid, + int logSegLen, + key_t *const d_keys, + value_t *const d_values, + int numValues) + { + uint32_t segLen = 1<>logSegLen; + uint32_t pairRank = tid-(pairIdx<= numValues) return; + if (r >= numValues) return; - key_t lk = d_keys[l]; - key_t rk = d_keys[r]; - key_t lv = d_values[l]; - key_t rv = d_values[r]; - if (rk < lk) { - d_values[r] = lv; - d_values[l] = rv; - d_keys[r] = lk; - d_keys[l] = rk; - } + key_t lk = d_keys[l]; + key_t rk = d_keys[r]; + key_t lv = d_values[l]; + key_t rv = d_values[r]; + if (rk < lk) { + d_values[r] = lv; + d_values[l] = rv; + d_keys[r] = lk; + d_keys[l] = rk; } + } - template - void orderSegmentPairs(int logSegLen, - key_t *const d_values, - int numValues, - uint32_t deviceID) - { + template + void orderSegmentPairs(int logSegLen, + key_t *const d_values, + int numValues, + uint32_t deviceID) + { #pragma omp target device(deviceID) #pragma omp teams distribute parallel for - for (int i=0;i - void orderSegmentPairs(int logSegLen, - key_t *const d_keys, - value_t *const d_values, - int numValues, - uint32_t deviceID) - { + template + void orderSegmentPairs(int logSegLen, + key_t *const d_keys, + value_t *const d_values, + int numValues, + uint32_t deviceID) + { #pragma omp target device(deviceID) #pragma omp teams distribute parallel for - for (int i=0;i - void sortSegments(int logSegLen, - key_t *const d_keys, - int numValues, - uint32_t deviceID) - { - if (logSegLen == 0) return; + template + void sortSegments(int logSegLen, + key_t *const d_keys, + int numValues, + uint32_t deviceID) + { + if (logSegLen == 0) return; - sortSegments(logSegLen-1,d_keys,numValues,omp); - orderSegmentPairs(logSegLen-1,d_keys,numValues,omp); - sortSegments(logSegLen-1,d_keys,numValues,omp); - } + sortSegments(logSegLen-1,d_keys,numValues,deviceID); + orderSegmentPairs(logSegLen-1,d_keys,numValues,deviceID); + sortSegments(logSegLen-1,d_keys,numValues,deviceID); + } - template - void sortSegments(int logSegLen, - key_t *const d_keys, - value_t *const d_values, - int numValues, - uint32_t deviceID) - { - if (logSegLen == 0) return; + template + void sortSegments(int logSegLen, + key_t *const d_keys, + value_t *const d_values, + int numValues, + uint32_t deviceID) + { + if (logSegLen == 0) return; - sortSegments(logSegLen-1,d_keys,d_values,numValues,omp); - orderSegmentPairs(logSegLen-1,d_keys,d_values,numValues,omp); - sortSegments(logSegLen-1,d_keys,d_values,numValues,omp); - } - - template - void sort(key_t *const d_values, - int numValues, - uint32_t deviceID) - { - uint32_t logSegLen = 0; - while (1< - void sort(key_t *const d_keys, - value_t *const d_values, - int numValues, - uint32_t deviceID) - { - uint32_t logSegLen = 0; - while (1< - void omp_target_sort(key_t *const d_values, - size_t numValues, - uint32_t deviceID) + void sort(key_t *const d_values, + int numValues, + uint32_t deviceID) { - assert(numValues < (1ull<<31)); - bitonic::sort(d_values,(int)numValues,deviceID); + uint32_t logSegLen = 0; + while (1< - void omp_target_sort(key_t *const d_keys, - value_t *const d_values, - size_t numValues, - uint32_t deviceID) + void sort(key_t *const d_keys, + value_t *const d_values, + int numValues, + uint32_t deviceID) { - assert(numValues < (1ull<<31)); - bitonic::sort(d_keys,d_values,(int)numValues,deviceID); + uint32_t logSegLen = 0; + while (1< + void omp_target_sort(key_t *const d_values, + size_t numValues, + uint32_t deviceID) + { + assert(numValues < (1ull<<31)); + bitonic::sort(d_values,(int)numValues,deviceID); + } + template + void omp_target_sort(key_t *const d_keys, + value_t *const d_values, + size_t numValues, + uint32_t deviceID) + { + assert(numValues < (1ull<<31)); + bitonic::sort(d_keys,d_values,(int)numValues,deviceID); } + } diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h index 7dea853..4c9f63a 100644 --- a/cuBQL/builder/omp/spatialMedian.h +++ b/cuBQL/builder/omp/spatialMedian.h @@ -4,10 +4,21 @@ #pragma once #include "cuBQL/builder/omp/refit.h" +#include "cuBQL/builder/omp/sort.h" namespace cuBQL { namespace omp { + template + struct AtomicBox : public box_t { + }; + + template + void atomic_grow(AtomicBox &ab, typename box_t::vec_t P); + + template + void atomic_grow(AtomicBox &ab, box_t B); + struct PrimState { union { /* careful with this order - this is intentionally chosen such @@ -22,6 +33,8 @@ namespace cuBQL { }; }; + typedef enum : int8_t { OPEN_BRANCH, OPEN_NODE, DONE_NODE } NodeState; + template struct CUBQL_ALIGN(16) TempNode { using box_t = cuBQL::box_t; @@ -46,12 +59,11 @@ namespace cuBQL { }; template - __global__ - void initState(BuildState *buildState, - NodeState *nodeStates, + void initState(uint32_t *pNumNodes, + NodeState *nodeStates, TempNode *nodes) { - buildState->numNodes = 2; + *pNumNodes = 2; nodeStates[0] = OPEN_BRANCH; nodes[0].openBranch.count = 0; @@ -63,12 +75,13 @@ namespace cuBQL { } template - __global__ void initPrims(TempNode *nodes, - PrimState *primState, - const box_t *primBoxes, - uint32_t numPrims) + void initPrims(Kernel kernel, + TempNode *nodes, + PrimState *primState, + const box_t *primBoxes, + uint32_t numPrims) { - const int primID = threadIdx.x+blockIdx.x*blockDim.x; + const int primID = kernel.workIdx(); if (primID >= numPrims) return; auto &me = primState[primID]; @@ -88,105 +101,14 @@ namespace cuBQL { } template - __global__ - void selectSplits(BuildState *buildState, + void selectSplits(Kernel kernel, + uint32_t *pNumNodes, NodeState *nodeStates, TempNode *nodes, uint32_t numNodes, BuildConfig buildConfig) { -#if 1 - __shared__ int l_newNodeOfs; - if (threadIdx.x == 0) - l_newNodeOfs = 0; - __syncthreads(); - - int *t_nodeOffsetToWrite = 0; - int t_localOffsetToAdd = 0; - - while (true) { - const int nodeID = threadIdx.x+blockIdx.x*blockDim.x; - if (nodeID >= numNodes) - break; - - NodeState &nodeState = nodeStates[nodeID]; - if (nodeState == DONE_NODE) - // this node was already closed before - break; - - if (nodeState == OPEN_NODE) { - // this node was open in the last pass, can close it. - nodeState = DONE_NODE; - int offset = nodes[nodeID].openNode.offset; - auto &done = nodes[nodeID].doneNode; - done.count = 0; - done.offset = offset; - break; - } - - auto in = nodes[nodeID].openBranch; - if (in.count <= buildConfig.makeLeafThreshold) { - auto &done = nodes[nodeID].doneNode; - done.count = in.count; - // set this to max-value, so the prims can later do atomicMin - // with their position ion the leaf list; this value is - // greater than any prim position. - done.offset = (uint32_t)-1; - nodeState = DONE_NODE; - } else { - float widestWidth = 0.f; - int widestDim = -1; - float widestLo, widestHi, widestCtr; -#pragma unroll - for (int d=0;d= 0) { - open.pos = widestCtr; - } - open.dim - = (widestDim < 0 || widestCtr == widestLo || widestCtr == widestHi) - ? -1 - : widestDim; - - // this will be epensive - could make this faster by block-reducing - // open.offset = atomicAdd(&buildState->numNodes,2); - t_nodeOffsetToWrite = (int*)&open.offset; - t_localOffsetToAdd = atomicAdd(&l_newNodeOfs,2); - nodeState = OPEN_NODE; - } - break; - } - __syncthreads(); - if (threadIdx.x == 0 && l_newNodeOfs > 0) - l_newNodeOfs = atomicAdd(&buildState->numNodes,l_newNodeOfs); - __syncthreads(); - if (t_nodeOffsetToWrite) { - int openOffset = *t_nodeOffsetToWrite = l_newNodeOfs + t_localOffsetToAdd; -#pragma unroll - for (int side=0;side<2;side++) { - const int childID = openOffset+side; - auto &child = nodes[childID].openBranch; - child.centBounds.set_empty(); - child.count = 0; - nodeStates[childID] = OPEN_BRANCH; - } - } -#else - const int nodeID = threadIdx.x+blockIdx.x*blockDim.x; + const int nodeID = kernel.workIdx();//threadIdx.x+blockIdx.x*blockDim.x; if (nodeID >= numNodes) return; NodeState &nodeState = nodeStates[nodeID]; @@ -242,8 +164,7 @@ namespace cuBQL { ? -1 : widestDim; - // this will be epensive - could make this faster by block-reducing - open.offset = atomicAdd(&buildState->numNodes,2); + open.offset = atomicAdd(pNumNodes,2); #pragma unroll for (int side=0;side<2;side++) { const int childID = open.offset+side; @@ -254,18 +175,17 @@ namespace cuBQL { } nodeState = OPEN_NODE; } -#endif } template - __global__ - void updatePrims(NodeState *nodeStates, + void updatePrims(Kernel kernel, + NodeState *nodeStates, TempNode *nodes, PrimState *primStates, const box_t *primBoxes, int numPrims) { - const int primID = threadIdx.x+blockIdx.x*blockDim.x; + const int primID = kernel.workIdx(); if (primID >= numPrims) return; const auto me = primStates[primID]; @@ -302,13 +222,13 @@ namespace cuBQL { the nodes[] array, the node.offset value to point to the first of this nodes' items in that bvh.primIDs[] list. */ template - __global__ - void writePrimsAndLeafOffsets(TempNode *nodes, + void writePrimsAndLeafOffsets(Kernel kernel, + TempNode *nodes, uint32_t *bvhItemList, PrimState *primStates, int numPrims) { - const int offset = threadIdx.x+blockIdx.x*blockDim.x; + const int offset = kernel.workIdx();//threadIdx.x+blockIdx.x*blockDim.x; if (offset >= numPrims) return; auto &ps = primStates[offset]; @@ -324,12 +244,12 @@ namespace cuBQL { /* writes main phase's temp nodes into final bvh.nodes[] layout. actual bounds of that will NOT yet bewritten */ template - __global__ - void writeNodes(typename BinaryBVH::Node *finalNodes, + void writeNodes(Kernel kernel, + typename BinaryBVH::Node *finalNodes, TempNode *tempNodes, int numNodes) { - const int nodeID = threadIdx.x+blockIdx.x*blockDim.x; + const int nodeID = kernel.workIdx(); if (nodeID >= numNodes) return; finalNodes[nodeID].admin.offset = tempNodes[nodeID].doneNode.offset; @@ -338,12 +258,12 @@ namespace cuBQL { template - void build(BinaryBVH &bvh, - const box_t *boxes, - int numPrims, - BuildConfig buildConfig, - cudaStream_t s, - GpuMemoryResource &memResource) + void spatialMedian(BinaryBVH &bvh, + /*! DEVICE array of boxes */ + const box_t *boxes, + uint32_t numPrims, + BuildConfig buildConfig, + Context *ctx) { assert(sizeof(PrimState) == sizeof(uint64_t)); @@ -353,91 +273,94 @@ namespace cuBQL { TempNode *tempNodes = 0; NodeState *nodeStates = 0; PrimState *primStates = 0; - BuildState *buildState = 0; - ctx->malloc(tempNodes,2*numPrims); - ctx->malloc(nodeStates,2*numPrims); - ctx->malloc(primStates,numPrims); - ctx->malloc(buildState,1); - initState<<<1,1,0,s>>>(buildState, - nodeStates, - tempNodes); - initPrims<<>> - (tempNodes, - primStates,boxes,numPrims); + uint32_t *d_numNodes = 0; + ctx->alloc(tempNodes,2*numPrims); + ctx->alloc(nodeStates,2*numPrims); + ctx->alloc(primStates,numPrims); + ctx->alloc(d_numNodes,1); +#pragma omp target device(ctx->gpuID) +#pragma omp teams distribute parallel for + for (int tid=0;tid<1;tid++) + initState(d_numNodes, + nodeStates, + tempNodes); +#pragma omp target device(ctx->gpuID) +#pragma omp teams distribute parallel for + for (int tid=0;tidnumNodes, - sizeof(numNodes),cudaMemcpyDeviceToHost,s)); - CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s)); - CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent)); + ctx->download(numNodes,d_numNodes); if (numNodes == numDone) break; - selectSplits<<>> - (buildState, - nodeStates,tempNodes,numNodes, - buildConfig); +#pragma omp target device(ctx->gpuID) +#pragma omp teams distribute parallel for + for (int tid=0;tid>> - (nodeStates,tempNodes, - primStates,boxes,numPrims); +#pragma omp target device(ctx->gpuID) +#pragma omp teams distribute parallel for + for (int tid=0;tidmalloc(sortedPrimStates,numPrims); - auto rc = - cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes, - (uint64_t*)primStates, - (uint64_t*)sortedPrimStates, - numPrims,32,64,s); - ctx->malloc(d_temp_storage,temp_storage_bytes); - rc = - cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes, - (uint64_t*)primStates, - (uint64_t*)sortedPrimStates, - numPrims,32,64,s); - rc = rc; - ctx->free(d_temp_storage); + ::omp::omp_target_sort((uint64_t*)primStates,numPrims,ctx->gpuID); + // ================================================================== // allocate and write BVH item list, and write offsets of leaf nodes // ================================================================== bvh.numPrims = numPrims; - ctx->malloc(bvh.primIDs,numPrims); - writePrimsAndLeafOffsets<<>> - (tempNodes,bvh.primIDs,sortedPrimStates,numPrims); - + ctx->alloc(bvh.primIDs,numPrims); +#pragma omp target device(ctx->gpuID) +#pragma omp teams distribute parallel for + for (int tid=0;tidmalloc(bvh.nodes,numNodes); - writeNodes<<>> - (bvh.nodes,tempNodes,numNodes); - ctx->free(sortedPrimStates); + ctx->alloc(bvh.nodes,numNodes); +#pragma omp target device(ctx->gpuID) +#pragma omp teams distribute parallel for + for (int tid=0;tidfree(tempNodes); ctx->free(nodeStates); ctx->free(primStates); - ctx->free(buildState); + ctx->free(d_numNodes); - refit(bvh); + cuBQL::omp::refit(bvh,boxes,ctx); } } + + template + void build_omp_target(BinaryBVH &bvh, + const box_t *d_boxes, + uint32_t numBoxes, + BuildConfig buildConfig, + int gpuID) + { + omp::Context ctx(gpuID); + omp::spatialMedian(bvh,d_boxes,numBoxes,buildConfig,&ctx); + } } From 1ffa5f1f108c510d10ab7214513d8256c766231d Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Sat, 24 Jan 2026 13:54:57 -0700 Subject: [PATCH 05/14] first draft of builder that compiles; atomics realized via CAS --- cuBQL/builder/omp/common.h | 53 +++++++++++- cuBQL/builder/omp/spatialMedian.h | 133 +++++++++++++++++++++++++++++- 2 files changed, 181 insertions(+), 5 deletions(-) diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h index 261d06f..1eae7e7 100644 --- a/cuBQL/builder/omp/common.h +++ b/cuBQL/builder/omp/common.h @@ -14,7 +14,7 @@ namespace cuBQL { struct Context { Context(int gpuID); - void *alloc(size_t Nelements); + void *alloc(size_t numBytes); template void alloc(T *&d_data, size_t Nelements); @@ -70,8 +70,11 @@ namespace cuBQL { printf("#cuBQL:omp:Context(gpu=%i/%i,host=%i)\n", gpuID,omp_get_num_devices(),hostID); } + + void *Context::alloc(size_t numBytes) + { return omp_target_alloc(numBytes,gpuID); } - template + template inline void Context::alloc_and_upload(T *&d_data, const T *h_data, size_t N) @@ -84,7 +87,7 @@ namespace cuBQL { 0,0,gpuID,hostID); } - template + template inline void Context::alloc_and_upload(T *&d_data, const std::vector &h_vector) { alloc_and_upload(d_data,h_vector.data(),h_vector.size()); } @@ -97,6 +100,50 @@ namespace cuBQL { 0,0,hostID,gpuID); return out; } + + inline void Context::free(void *ptr) + { omp_target_free(ptr,gpuID); } + + template inline + void Context::alloc(T *&d_data, size_t N) + { + d_data = (T*)omp_target_alloc(N*sizeof(T),gpuID); + } + + // template inline + // void Context::alloc_and_upload(T *&d_data, + // const T *h_data, + // size_t N) + // { + // alloc(d_data,N); + // upload(d_data,h_data,N); + // } + + // template inline + // void Context::alloc_and_upload(T *&d_data, + // const std::vector &h_vector) + // { + // alloc(d_data,h_vector.size()); + // upload(d_data,h_vector); + // } + + // template inline + // std::vector Context::download_vector(const T *d_data, + // size_t N) + // { + // std::vector vec(N); + // omp_target_memcpy(vec.data(),d_data,N*sizeof(T), + // 0,0,hostID,gpuID); + // return vec; + // } + + template + inline void Context::download(T &h_value, T *d_value) + { + omp_target_memcpy(&h_value,d_value,sizeof(T), + 0,0,hostID,gpuID); + } + } // ::cuBQL::omp } // ::cuBQL diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h index 4c9f63a..a07dd65 100644 --- a/cuBQL/builder/omp/spatialMedian.h +++ b/cuBQL/builder/omp/spatialMedian.h @@ -13,11 +13,140 @@ namespace cuBQL { struct AtomicBox : public box_t { }; + template + void atomic_min(T *ptr, T v); + template + void atomic_max(T *ptr, T v); + + /*! iw - note: this implementation of atomic min/max via atomic + compare-exchange (CAS); which is cetainly not optimal on any + sort of modern GPU - but it works in any C++-21 compliant + compiler, so it's what we do for now */ + void atomic_min(float *ptr, float value) + { + float current = *(volatile float *)ptr; + while (current > value) { + bool wasChanged + = ((std::atomic*)ptr) + ->compare_exchange_weak((int&)current,(int&)value); + if (wasChanged) break; + } + } + + /*! iw - note: this implementation of atomic min/max via atomic + compare-exchange (CAS); which is cetainly not optimal on any + sort of modern GPU - but it works in any C++-21 compliant + compiler, so it's what we do for now */ + void atomic_max(float *ptr, float value) + { + float current = *(volatile float *)ptr; + while (current > value) { + bool wasChanged + = ((std::atomic*)ptr) + ->compare_exchange_weak((int&)current,(int&)value); + if (wasChanged) break; + } + } + + /*! iw - note: this implementation of atomic min/max via atomic + compare-exchange (CAS); which is cetainly not optimal on any + sort of modern GPU - but it works in any C++-21 compliant + compiler, so it's what we do for now */ + void atomic_min(double *ptr, double value) + { + double current = *(volatile double *)ptr; + while (current > value) { + bool wasChanged + = ((std::atomic*)ptr) + ->compare_exchange_weak((long long int&)current, + (long long int&)value); + if (wasChanged) break; + } + } + + /*! iw - note: this implementation of atomic min/max via atomic + compare-exchange (CAS); which is cetainly not optimal on any + sort of modern GPU - but it works in any C++-21 compliant + compiler, so it's what we do for now */ + void atomic_max(double *ptr, double value) + { + double current = *(volatile double *)ptr; + while (current > value) { + bool wasChanged + = ((std::atomic*)ptr) + ->compare_exchange_weak((long long int&)current, + (long long int&)value); + if (wasChanged) break; + } + } + + template + void v_atomic_min(vec_t *ptr, vec_t v); + template + void v_atomic_max(vec_t *ptr, vec_t v); + + + template + void v_atomic_min(vec_t *ptr, vec_t v) + { + atomic_min(&ptr->x,v.x); + atomic_min(&ptr->y,v.y); + } + + template + void v_atomic_min(vec_t *ptr, vec_t v) + { + atomic_min(&ptr->x,v.x); + atomic_min(&ptr->y,v.y); + atomic_min(&ptr->z,v.z); + } + + template + void v_atomic_min(vec_t *ptr, vec_t v) + { + atomic_min(&ptr->x,v.x); + atomic_min(&ptr->y,v.y); + atomic_min(&ptr->z,v.z); + atomic_min(&ptr->w,v.w); + } + + template + void v_atomic_max(vec_t *ptr, vec_t v) + { + atomic_max(&ptr->x,v.x); + atomic_max(&ptr->y,v.y); + } + + template + void v_atomic_max(vec_t *ptr, vec_t v) + { + atomic_max(&ptr->x,v.x); + atomic_max(&ptr->y,v.y); + atomic_max(&ptr->z,v.z); + } + + template + void v_atomic_max(vec_t *ptr, vec_t v) + { + atomic_max(&ptr->x,v.x); + atomic_max(&ptr->y,v.y); + atomic_max(&ptr->z,v.z); + atomic_max(&ptr->w,v.w); + } + template - void atomic_grow(AtomicBox &ab, typename box_t::vec_t P); + void atomic_grow(AtomicBox &ab, typename box_t::vec_t P) + { + v_atomic_min(&ab.lower,P); + v_atomic_max(&ab.upper,P); + } template - void atomic_grow(AtomicBox &ab, box_t B); + void atomic_grow(AtomicBox &ab, box_t B) + { + v_atomic_min(&ab.lower,B.lower); + v_atomic_max(&ab.upper,B.upper); + } struct PrimState { union { From 0ac41c8339dfe3faccfd267b405b71cc29144e3c Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Mon, 26 Jan 2026 13:23:11 -0700 Subject: [PATCH 06/14] omp refit now working, but only with single thread per team; probably compiler issue --- cuBQL/builder/omp/AtomicBox.h | 157 +++++++++++++++++++ cuBQL/builder/omp/common.h | 24 ++- cuBQL/builder/omp/refit.h | 115 ++++++++++---- cuBQL/builder/omp/sort.h | 23 ++- cuBQL/builder/omp/spatialMedian.h | 250 +++++++++++------------------- cuBQL/math/vec.h | 59 ++++--- 6 files changed, 414 insertions(+), 214 deletions(-) create mode 100644 cuBQL/builder/omp/AtomicBox.h diff --git a/cuBQL/builder/omp/AtomicBox.h b/cuBQL/builder/omp/AtomicBox.h new file mode 100644 index 0000000..4ee90cd --- /dev/null +++ b/cuBQL/builder/omp/AtomicBox.h @@ -0,0 +1,157 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cuBQL/builder/omp/common.h" + +namespace cuBQL { + namespace omp { + + template + struct AtomicBox : public box_t { + + inline void set_empty() + { + *(box_t *)this = box_t(); + } + }; + + template + inline void atomic_min(T *ptr, T v); + template + inline void atomic_max(T *ptr, T v); + + /*! iw - note: this implementation of atomic min/max via atomic + compare-exchange (CAS); which is cetainly not optimal on any + sort of modern GPU - but it works in any C++-21 compliant + compiler, so it's what we do for now */ + inline void atomic_min(float *ptr, float value) + { + float current = *(volatile float *)ptr; + while (current > value) { + bool wasChanged + = ((std::atomic*)ptr) + ->compare_exchange_weak((int&)current,(int&)value); + if (wasChanged) break; + } + } + + /*! iw - note: this implementation of atomic min/max via atomic + compare-exchange (CAS); which is cetainly not optimal on any + sort of modern GPU - but it works in any C++-21 compliant + compiler, so it's what we do for now */ + inline void atomic_max(float *ptr, float value) + { + float current = *(volatile float *)ptr; + while (current < value) { + bool wasChanged + = ((std::atomic*)ptr) + ->compare_exchange_weak((int&)current,(int&)value); + if (wasChanged) break; + } + } + + /*! iw - note: this implementation of atomic min/max via atomic + compare-exchange (CAS); which is cetainly not optimal on any + sort of modern GPU - but it works in any C++-21 compliant + compiler, so it's what we do for now */ + inline void atomic_min(double *ptr, double value) + { + double current = *(volatile double *)ptr; + while (current > value) { + bool wasChanged + = ((std::atomic*)ptr) + ->compare_exchange_weak((long long int&)current, + (long long int&)value); + if (wasChanged) break; + } + } + + /*! iw - note: this implementation of atomic min/max via atomic + compare-exchange (CAS); which is cetainly not optimal on any + sort of modern GPU - but it works in any C++-21 compliant + compiler, so it's what we do for now */ + inline void atomic_max(double *ptr, double value) + { + double current = *(volatile double *)ptr; + while (current < value) { + bool wasChanged + = ((std::atomic*)ptr) + ->compare_exchange_weak((long long int&)current, + (long long int&)value); + if (wasChanged) break; + } + } + + template + inline void v_atomic_min(vec_t *ptr, vec_t v); + template + inline void v_atomic_max(vec_t *ptr, vec_t v); + + + template + inline void v_atomic_min(vec_t *ptr, vec_t v) + { + atomic_min(&ptr->x,v.x); + atomic_min(&ptr->y,v.y); + } + + template + inline void v_atomic_min(vec_t *ptr, vec_t v) + { + atomic_min(&ptr->x,v.x); + atomic_min(&ptr->y,v.y); + atomic_min(&ptr->z,v.z); + } + + template + inline void v_atomic_min(vec_t *ptr, vec_t v) + { + atomic_min(&ptr->x,v.x); + atomic_min(&ptr->y,v.y); + atomic_min(&ptr->z,v.z); + atomic_min(&ptr->w,v.w); + } + + template + inline void v_atomic_max(vec_t *ptr, vec_t v) + { + atomic_max(&ptr->x,v.x); + atomic_max(&ptr->y,v.y); + } + + template + inline void v_atomic_max(vec_t *ptr, vec_t v) + { + atomic_max(&ptr->x,v.x); + atomic_max(&ptr->y,v.y); + atomic_max(&ptr->z,v.z); + } + + template + inline void v_atomic_max(vec_t *ptr, vec_t v) + { + atomic_max(&ptr->x,v.x); + atomic_max(&ptr->y,v.y); + atomic_max(&ptr->z,v.z); + atomic_max(&ptr->w,v.w); + } + + template + inline void atomic_grow(AtomicBox &ab, typename box_t::vec_t P) + { + v_atomic_min(&ab.lower,P); + v_atomic_max(&ab.upper,P); + } + + template + inline void atomic_grow(AtomicBox &ab, box_t B) + { + v_atomic_min(&ab.lower,B.lower); + v_atomic_max(&ab.upper,B.upper); + } + + } +} diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h index 1eae7e7..1cb8178 100644 --- a/cuBQL/builder/omp/common.h +++ b/cuBQL/builder/omp/common.h @@ -22,6 +22,9 @@ namespace cuBQL { template void alloc_and_upload(T *&d_data, const T *h_data, size_t Nelements); + template + void upload(T *d_data, const T *h_data, size_t Nelements); + template void alloc_and_upload(T *&d_data, const std::vector &h_vector); @@ -42,7 +45,7 @@ namespace cuBQL { int _workIdx; }; - inline uint32_t atomicAdd(uint32_t *p_value, uint32_t inc) + inline uint32_t atomicAdd(volatile uint32_t *p_value, uint32_t inc) { return ((std::atomic *)p_value)->fetch_add(inc); } @@ -74,6 +77,16 @@ namespace cuBQL { void *Context::alloc(size_t numBytes) { return omp_target_alloc(numBytes,gpuID); } + template inline + void Context::upload(T *d_data, + const T *h_data, + size_t N) + { + assert(d_data); + omp_target_memcpy(d_data,h_data,N*sizeof(T), + 0,0,gpuID,hostID); + } + template inline void Context::alloc_and_upload(T *&d_data, const T *h_data, @@ -82,9 +95,7 @@ namespace cuBQL { printf("target_alloc N %li gpu %i\n",N,gpuID); d_data = (T *)omp_target_alloc(N*sizeof(T),gpuID); printf("ptr %p\n",d_data); - assert(d_data); - omp_target_memcpy(d_data,h_data,N*sizeof(T), - 0,0,gpuID,hostID); + upload(d_data,h_data,N); } template inline @@ -95,7 +106,12 @@ namespace cuBQL { template std::vector Context::download_vector(const T *d_data, size_t N) { + PRINT(N); + PRINT(d_data); + std::vector out(N); + PRINT(out.data()); + PRINT(sizeof(T)); omp_target_memcpy(out.data(),d_data,N*sizeof(T), 0,0,hostID,gpuID); return out; diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h index d1c8f9a..9b4fe9c 100644 --- a/cuBQL/builder/omp/refit.h +++ b/cuBQL/builder/omp/refit.h @@ -4,13 +4,14 @@ #pragma once #include "cuBQL/builder/omp/common.h" +#include "cuBQL/builder/omp/AtomicBox.h" namespace cuBQL { namespace omp { template inline - void refit_init(Kernel kernel, - const typename BinaryBVH::Node *nodes, + void refit_init_x(Kernel kernel, + typename BinaryBVH::Node *nodes, uint32_t *refitData, int numNodes) { @@ -18,52 +19,82 @@ namespace cuBQL { if (nodeID == 1 || nodeID >= numNodes) return; if (nodeID < 2) refitData[0] = 0; - const auto &node = nodes[nodeID]; + auto &node = nodes[nodeID]; + node.bounds = box_t(); if (node.admin.count) return; + if (node.admin.offset+1 >= numNodes) + printf("refit_init_overflow\n"); refitData[node.admin.offset+0] = nodeID << 1; refitData[node.admin.offset+1] = nodeID << 1; } - + template inline - void refit_run(Kernel kernel, - BinaryBVH bvh, - uint32_t *refitData, - const box_t *boxes) + void refit_run_x(Kernel kernel, + // BinaryBVH bvh, + uint32_t *bvh_primIDs, + typename BinaryBVH::Node *bvh_nodes, + uint32_t *refitData, + const box_t *boxes, + int numNodes) { int nodeID = kernel.workIdx(); - if (nodeID == 1 || nodeID >= bvh.numNodes) return; - - typename BinaryBVH::Node *node = &bvh.nodes[nodeID]; + if (nodeID == 1 || nodeID >= numNodes) return; + + typename BinaryBVH::Node *node = bvh_nodes+nodeID; + + if (nodeID < 0 || nodeID >= 601202) { + printf("BLA\n"); return; + } if (node->admin.count == 0) // this is a inner node - exit return; box_t bounds; bounds.set_empty(); for (int i=0;iadmin.count;i++) { - const box_t primBox = boxes[bvh.primIDs[node->admin.offset+i]]; + const box_t primBox = boxes[bvh_primIDs[node->admin.offset+i]]; bounds.lower = min(bounds.lower,primBox.lower); bounds.upper = max(bounds.upper,primBox.upper); } int parentID = (refitData[nodeID] >> 1); + int its = 0; while (true) { - node->bounds = bounds; - // __threadfence(); - if (node == bvh.nodes) + if (parentID == 1) { + printf("1 is parent!!!!\n"); + return; + } + + atomic_grow(*(AtomicBox> *)&node->bounds,bounds); + // node->bounds = bounds; + + if (node == bvh_nodes) break; + int it = its++; + // if (it >= 4) return; + + // if (it == 3) { printf("parentID %i\n",parentID); return; } + + if (parentID < 0 || parentID >= 601202) { + printf("BLA\n"); return; + } uint32_t refitBits = atomicAdd(&refitData[parentID],1u); if ((refitBits & 1) == 0) // we're the first one - let other one do it break; nodeID = parentID; - node = &bvh.nodes[parentID]; + node = &bvh_nodes[parentID]; parentID = (refitBits >> 1); + + int ofs = node->admin.offset; + if (ofs < 0 || ofs+1 >= 601202) { + printf("BLAB\n"); return; + } - typename BinaryBVH::Node l = bvh.nodes[node->admin.offset+0]; - typename BinaryBVH::Node r = bvh.nodes[node->admin.offset+1]; + typename BinaryBVH::Node l = bvh_nodes[ofs+0]; + typename BinaryBVH::Node r = bvh_nodes[ofs+1]; bounds.lower = min(l.bounds.lower,r.bounds.lower); bounds.upper = max(l.bounds.upper,r.bounds.upper); } @@ -74,18 +105,48 @@ namespace cuBQL { const box_t *boxes, Context *ctx) { + PING; + assert(bvh.nodes); + assert(bvh.primIDs); int numNodes = bvh.numNodes; uint32_t *refitData - = (uint32_t*)ctx->alloc(numNodes*sizeof(int)); - -# pragma omp target device(ctx->gpuID) -# pragma omp teams distribute parallel for - for (int i=0;i(Kernel{i},bvh.nodes,refitData,bvh.numNodes); -# pragma omp target device(ctx->gpuID) -# pragma omp teams distribute parallel for + = (uint32_t*)ctx->alloc(numNodes*sizeof(uint32_t)); + auto bvh_nodes = bvh.nodes; + auto bvh_primIDs = bvh.primIDs; + PING; + PRINT(numNodes); + PRINT(bvh.numNodes); + { +#pragma omp target device(ctx->gpuID) is_device_ptr(refitData) is_device_ptr(bvh_nodes) +#pragma omp teams distribute parallel for + for (int i=0;i(Kernel{i},bvh_nodes,refitData,numNodes); + } + PING; + PING; + { +#if 1 + int nb = 128; +#pragma omp target teams num_teams(nb) device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes) + { + int team = omp_get_team_num(); + int nteams = omp_get_num_teams(); + int beg = ((team+0)*numNodes)/nteams; + int end = ((team+1)*numNodes)/nteams; + for (int i=beg;igpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes) +#pragma omp teams distribute parallel for for (int i=0;ifree((void*)refitData); } diff --git a/cuBQL/builder/omp/sort.h b/cuBQL/builder/omp/sort.h index 724c0be..5f6cb39 100644 --- a/cuBQL/builder/omp/sort.h +++ b/cuBQL/builder/omp/sort.h @@ -12,8 +12,10 @@ namespace omp { namespace bitonic { - + +#pragma omp declare target template + inline void g_orderSegmentPairs(uint32_t tid, int logSegLen, key_t *const d_values, @@ -36,6 +38,7 @@ namespace omp { d_values[l] = rv; } } +#pragma omp end declare target template void g_orderSegmentPairs(uint32_t tid, @@ -73,11 +76,18 @@ namespace omp { int numValues, uint32_t deviceID) { -#pragma omp target device(deviceID) -#pragma omp teams distribute parallel for +#if 0 +#pragma omp target device(deviceID) teams is_device_ptr(d_values) num_teams(128) + { + int bs = + } +#else +#pragma omp target device(deviceID) is_device_ptr(d_values) nowait +#pragma omp teams distribute parallel for for (int i=0;i @@ -87,8 +97,9 @@ namespace omp { int numValues, uint32_t deviceID) { -#pragma omp target device(deviceID) -#pragma omp teams distribute parallel for +// #pragma omp target nowait device(deviceID) is_device_ptr(d_keys) is_device_ptr(d_values) +// #pragma omp target teams distribute parallel for +#pragma omp target nowait device(deviceID) is_device_ptr(d_keys) is_device_ptr(d_values) for (int i=0;i - struct AtomicBox : public box_t { - }; - - template - void atomic_min(T *ptr, T v); - template - void atomic_max(T *ptr, T v); - - /*! iw - note: this implementation of atomic min/max via atomic - compare-exchange (CAS); which is cetainly not optimal on any - sort of modern GPU - but it works in any C++-21 compliant - compiler, so it's what we do for now */ - void atomic_min(float *ptr, float value) - { - float current = *(volatile float *)ptr; - while (current > value) { - bool wasChanged - = ((std::atomic*)ptr) - ->compare_exchange_weak((int&)current,(int&)value); - if (wasChanged) break; - } - } - - /*! iw - note: this implementation of atomic min/max via atomic - compare-exchange (CAS); which is cetainly not optimal on any - sort of modern GPU - but it works in any C++-21 compliant - compiler, so it's what we do for now */ - void atomic_max(float *ptr, float value) - { - float current = *(volatile float *)ptr; - while (current > value) { - bool wasChanged - = ((std::atomic*)ptr) - ->compare_exchange_weak((int&)current,(int&)value); - if (wasChanged) break; - } - } - - /*! iw - note: this implementation of atomic min/max via atomic - compare-exchange (CAS); which is cetainly not optimal on any - sort of modern GPU - but it works in any C++-21 compliant - compiler, so it's what we do for now */ - void atomic_min(double *ptr, double value) - { - double current = *(volatile double *)ptr; - while (current > value) { - bool wasChanged - = ((std::atomic*)ptr) - ->compare_exchange_weak((long long int&)current, - (long long int&)value); - if (wasChanged) break; - } - } - - /*! iw - note: this implementation of atomic min/max via atomic - compare-exchange (CAS); which is cetainly not optimal on any - sort of modern GPU - but it works in any C++-21 compliant - compiler, so it's what we do for now */ - void atomic_max(double *ptr, double value) - { - double current = *(volatile double *)ptr; - while (current > value) { - bool wasChanged - = ((std::atomic*)ptr) - ->compare_exchange_weak((long long int&)current, - (long long int&)value); - if (wasChanged) break; - } - } - - template - void v_atomic_min(vec_t *ptr, vec_t v); - template - void v_atomic_max(vec_t *ptr, vec_t v); - - - template - void v_atomic_min(vec_t *ptr, vec_t v) - { - atomic_min(&ptr->x,v.x); - atomic_min(&ptr->y,v.y); - } - - template - void v_atomic_min(vec_t *ptr, vec_t v) - { - atomic_min(&ptr->x,v.x); - atomic_min(&ptr->y,v.y); - atomic_min(&ptr->z,v.z); - } - - template - void v_atomic_min(vec_t *ptr, vec_t v) - { - atomic_min(&ptr->x,v.x); - atomic_min(&ptr->y,v.y); - atomic_min(&ptr->z,v.z); - atomic_min(&ptr->w,v.w); - } - - template - void v_atomic_max(vec_t *ptr, vec_t v) - { - atomic_max(&ptr->x,v.x); - atomic_max(&ptr->y,v.y); - } - - template - void v_atomic_max(vec_t *ptr, vec_t v) - { - atomic_max(&ptr->x,v.x); - atomic_max(&ptr->y,v.y); - atomic_max(&ptr->z,v.z); - } - - template - void v_atomic_max(vec_t *ptr, vec_t v) - { - atomic_max(&ptr->x,v.x); - atomic_max(&ptr->y,v.y); - atomic_max(&ptr->z,v.z); - atomic_max(&ptr->w,v.w); - } - - template - void atomic_grow(AtomicBox &ab, typename box_t::vec_t P) - { - v_atomic_min(&ab.lower,P); - v_atomic_max(&ab.upper,P); - } - - template - void atomic_grow(AtomicBox &ab, box_t B) - { - v_atomic_min(&ab.lower,B.lower); - v_atomic_max(&ab.upper,B.upper); - } - struct PrimState { union { /* careful with this order - this is intentionally chosen such @@ -162,7 +24,8 @@ namespace cuBQL { }; }; - typedef enum : int8_t { OPEN_BRANCH, OPEN_NODE, DONE_NODE } NodeState; + enum { OPEN_BRANCH, OPEN_NODE, DONE_NODE }; + typedef uint8_t NodeState; template struct CUBQL_ALIGN(16) TempNode { @@ -186,21 +49,36 @@ namespace cuBQL { } doneNode; }; }; - + template - void initState(uint32_t *pNumNodes, + void initState(Kernel kernel, + uint32_t *pNumNodes, NodeState *nodeStates, TempNode *nodes) { + int tid = kernel.workIdx(); + if (tid > 0) return; + printf("initstate\n"); *pNumNodes = 2; + printf("initstate1\n"); nodeStates[0] = OPEN_BRANCH; + printf("initstate2\n"); nodes[0].openBranch.count = 0; + printf("initstate3\n"); + printf("initstate3 %p\n",&nodes[0].openBranch.centBounds); + + ((int*)&nodes[0].openBranch.centBounds)[0] = 0; + printf("bla\n"); nodes[0].openBranch.centBounds.set_empty(); + printf("initstate4\n"); nodeStates[1] = DONE_NODE; + printf("initstate5\n"); nodes[1].doneNode.offset = 0; + printf("initstate6\n"); nodes[1].doneNode.count = 0; + printf("initstate7\n"); } template @@ -212,7 +90,7 @@ namespace cuBQL { { const int primID = kernel.workIdx(); if (primID >= numPrims) return; - + auto &me = primState[primID]; me.primID = primID; @@ -222,7 +100,20 @@ namespace cuBQL { me.done = false; // this could be made faster by block-reducing ... atomicAdd(&nodes[0].openBranch.count,1); - atomic_grow(nodes[0].openBranch.centBounds,box.center());//centerOf(box)); + auto ctr = box.center(); + atomic_grow(nodes[0].openBranch.centBounds,ctr);//centerOf(box)); + // printf("p %i ctr %f %f %f grownn box %i : (%f %f %f)(%f %f %f)\n", + // primID, + // ctr.x, + // ctr.y, + // ctr.z, + // 0, + // nodes[0].openBranch.centBounds.lower.x, + // nodes[0].openBranch.centBounds.lower.y, + // nodes[0].openBranch.centBounds.lower.z, + // nodes[0].openBranch.centBounds.upper.x, + // nodes[0].openBranch.centBounds.upper.y, + // nodes[0].openBranch.centBounds.upper.z); } else { me.nodeID = (uint32_t)-1; me.done = true; @@ -254,7 +145,7 @@ namespace cuBQL { done.offset = offset; return; } - + auto in = nodes[nodeID].openBranch; if (in.count <= buildConfig.makeLeafThreshold) { auto &done = nodes[nodeID].doneNode; @@ -298,6 +189,7 @@ namespace cuBQL { for (int side=0;side<2;side++) { const int childID = open.offset+side; auto &child = nodes[childID].openBranch; + child.centBounds.set_empty(); child.count = 0; nodeStates[childID] = OPEN_BRANCH; @@ -361,11 +253,15 @@ namespace cuBQL { if (offset >= numPrims) return; auto &ps = primStates[offset]; + // printf("ps %i -> %i : %i done %i\n", + // (int)offset,(int)ps.primID,(int)ps.nodeID,(int)ps.done); bvhItemList[offset] = ps.primID; - + if ((int)ps.nodeID < 0) /* invalid prim, just skip here */ return; + if (ps.nodeID >= 853350) + { printf("OVERFLOW\n"); return; } auto &node = nodes[ps.nodeID]; atomicMin(&node.doneNode.offset,offset); } @@ -395,6 +291,12 @@ namespace cuBQL { Context *ctx) { assert(sizeof(PrimState) == sizeof(uint64_t)); + if (buildConfig.makeLeafThreshold < 1) + buildConfig.makeLeafThreshold = 1; + + PING; + PRINT(buildConfig.makeLeafThreshold); + PRINT(buildConfig.maxAllowedLeafSize); // ================================================================== // do build on temp nodes @@ -407,18 +309,28 @@ namespace cuBQL { ctx->alloc(nodeStates,2*numPrims); ctx->alloc(primStates,numPrims); ctx->alloc(d_numNodes,1); -#pragma omp target device(ctx->gpuID) + PING; + PRINT(numPrims); + PRINT(d_numNodes); + PRINT((int*)nodeStates); + PRINT(tempNodes); +#pragma omp target device(ctx->gpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes) #pragma omp teams distribute parallel for for (int tid=0;tid<1;tid++) - initState(d_numNodes, + initState(Kernel{tid}, + d_numNodes, nodeStates, tempNodes); -#pragma omp target device(ctx->gpuID) + PING; + PRINT(numPrims); + PING; fflush(0); +#pragma omp target device(ctx->gpuID) is_device_ptr(tempNodes) is_device_ptr(primStates) is_device_ptr(boxes) #pragma omp teams distribute parallel for for (int tid=0;tiddownload(numNodes,d_numNodes); if (numNodes == numDone) break; -#pragma omp target device(ctx->gpuID) +#pragma omp target device(ctx->gpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes) #pragma omp teams distribute parallel for for (int tid=0;tid h_primStates + = ctx->download_vector((uint64_t*)primStates,numPrims); + PING; + // ctx->download(h_primStates,primStates,numNodes*sizeof(uint64_t)); + std::sort(h_primStates.begin(),h_primStates.end()); + ctx->upload((uint64_t*)primStates,h_primStates.data(),numPrims); + PING; +#else + std::cout << "openmp sort ..." << std::endl; ::omp::omp_target_sort((uint64_t*)primStates,numPrims,ctx->gpuID); - +#endif + PING; // ================================================================== // allocate and write BVH item list, and write offsets of leaf nodes // ================================================================== + PRINT(numNodes); bvh.numPrims = numPrims; + bvh.primIDs = 0; + PING; ctx->alloc(bvh.primIDs,numPrims); -#pragma omp target device(ctx->gpuID) + PING; + auto primIDs = bvh.primIDs; +#pragma omp target device(ctx->gpuID) is_device_ptr(primStates) is_device_ptr(tempNodes) is_device_ptr(primIDs) #pragma omp teams distribute parallel for for (int tid=0;tidalloc(bvh.nodes,numNodes); -#pragma omp target device(ctx->gpuID) + PING; PRINT(numNodes); + auto bvhNodes = bvh.nodes; +#pragma omp target device(ctx->gpuID) is_device_ptr(bvhNodes) is_device_ptr(tempNodes) #pragma omp teams distribute parallel for for (int tid=0;tidfree(tempNodes); ctx->free(nodeStates); ctx->free(primStates); ctx->free(d_numNodes); + PING; cuBQL::omp::refit(bvh,boxes,ctx); + PING; } } diff --git a/cuBQL/math/vec.h b/cuBQL/math/vec.h index 10ffdb7..11d7b37 100644 --- a/cuBQL/math/vec.h +++ b/cuBQL/math/vec.h @@ -356,34 +356,53 @@ namespace cuBQL { /* vec:vec */ \ template \ inline __cubql_both \ - vec_t long_op(vec_t a, vec_t b) \ + vec_t long_op(const vec_t &a, const vec_t &b) \ { \ vec_t r; \ - CUBQL_PRAGMA_UNROLL \ - for (int i=0;i \ + template \ inline __cubql_both \ - vec_t long_op(T a, vec_t b) \ + vec_t long_op(const vec_t &a, const vec_t &b) \ { \ - vec_t r; \ - CUBQL_PRAGMA_UNROLL \ - for (int i=0;i r; \ + r.x = a.x op b.x; \ + r.y = a.y op b.y; \ return r; \ } \ - /* vec:scalar */ \ - template \ - inline __cubql_both \ - vec_t long_op(vec_t a, T b) \ - { \ - vec_t r; \ - CUBQL_PRAGMA_UNROLL \ - for (int i=0;i \ + inline __cubql_both \ + vec_t long_op(const vec_t &a, const vec_t &b) \ + { \ + vec_t r; \ + r.x = a.x op b.x; \ + r.y = a.y op b.y; \ + r.z = a.z op b.z; \ + return r; \ + } \ + template \ + inline __cubql_both \ + vec_t long_op(const vec_t &a, const vec_t &b) \ + { \ + vec_t r; \ + r.x = a.x op b.x; \ + r.y = a.y op b.y; \ + r.z = a.z op b.z; \ + r.w = a.w op b.w; \ + return r; \ + } \ + /* scalar-vec */ \ + template \ + inline __cubql_both \ + vec_t long_op(T a, const vec_t &b) \ + { return vec_t(a) op b; } \ + /* vec:scalar */ \ + template \ + inline __cubql_both \ + vec_t long_op(const vec_t &a, T b) \ + { return a op vec_t(b); } \ + CUBQL_OPERATOR_CUDA_T(long_op,op) CUBQL_OPERATOR(operator+,+) CUBQL_OPERATOR(operator-,-) From a68cb82c62a7b3cd26d12bc57406f825c62b6ee2 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Mon, 26 Jan 2026 17:54:09 -0700 Subject: [PATCH 07/14] rewritten openmp builder's atomics in openmp fashion --- cuBQL/builder/omp/AtomicBox.h | 25 +++++++++++++++++++++++++ cuBQL/builder/omp/common.h | 29 ++++++++++++++++++++--------- cuBQL/builder/omp/refit.h | 18 +++++++++--------- cuBQL/builder/omp/spatialMedian.h | 13 +++++++++++-- 4 files changed, 65 insertions(+), 20 deletions(-) diff --git a/cuBQL/builder/omp/AtomicBox.h b/cuBQL/builder/omp/AtomicBox.h index 4ee90cd..425beb9 100644 --- a/cuBQL/builder/omp/AtomicBox.h +++ b/cuBQL/builder/omp/AtomicBox.h @@ -6,6 +6,7 @@ #include "cuBQL/builder/omp/common.h" + namespace cuBQL { namespace omp { @@ -29,6 +30,7 @@ namespace cuBQL { compiler, so it's what we do for now */ inline void atomic_min(float *ptr, float value) { +#if 0 float current = *(volatile float *)ptr; while (current > value) { bool wasChanged @@ -36,6 +38,15 @@ namespace cuBQL { ->compare_exchange_weak((int&)current,(int&)value); if (wasChanged) break; } +#else +#pragma omp atomic compare + { + if (*ptr > value) *ptr = value; + } +// float t; +// #pragma omp atomic capture +// { t = *ptr; *ptr = std::min(t,value); } +#endif } /*! iw - note: this implementation of atomic min/max via atomic @@ -44,6 +55,7 @@ namespace cuBQL { compiler, so it's what we do for now */ inline void atomic_max(float *ptr, float value) { +#if 0 float current = *(volatile float *)ptr; while (current < value) { bool wasChanged @@ -51,6 +63,15 @@ namespace cuBQL { ->compare_exchange_weak((int&)current,(int&)value); if (wasChanged) break; } +#else +#pragma omp atomic compare + { + if (*ptr < value) *ptr = value; + } +// float t; +// #pragma omp atomic capture +// { t = *ptr; *ptr = std::max(t,value); } +#endif } /*! iw - note: this implementation of atomic min/max via atomic @@ -59,6 +80,7 @@ namespace cuBQL { compiler, so it's what we do for now */ inline void atomic_min(double *ptr, double value) { +#if 0 double current = *(volatile double *)ptr; while (current > value) { bool wasChanged @@ -67,6 +89,7 @@ namespace cuBQL { (long long int&)value); if (wasChanged) break; } +#endif } /*! iw - note: this implementation of atomic min/max via atomic @@ -75,6 +98,7 @@ namespace cuBQL { compiler, so it's what we do for now */ inline void atomic_max(double *ptr, double value) { +#if 0 double current = *(volatile double *)ptr; while (current < value) { bool wasChanged @@ -83,6 +107,7 @@ namespace cuBQL { (long long int&)value); if (wasChanged) break; } +#endif } template diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h index 1cb8178..2321b53 100644 --- a/cuBQL/builder/omp/common.h +++ b/cuBQL/builder/omp/common.h @@ -45,20 +45,31 @@ namespace cuBQL { int _workIdx; }; - inline uint32_t atomicAdd(volatile uint32_t *p_value, uint32_t inc) + inline uint32_t atomicAdd(volatile uint32_t *ptr, uint32_t inc) { - return ((std::atomic *)p_value)->fetch_add(inc); + uint32_t t; +#pragma omp atomic capture + { t = *ptr; *ptr += inc; } + // return ((std::atomic *)p_value)->fetch_add(inc); + return t; } - inline void atomicMin(uint32_t *p_value, uint32_t other) + inline void atomicMin(uint32_t *ptr, uint32_t value) { - uint32_t current = *(volatile uint32_t *)p_value; - while (current > other) { - bool wasChanged - = ((std::atomic*)p_value) - ->compare_exchange_weak((int&)current,(int&)other); - if (wasChanged) break; +#pragma omp atomic compare + { + if (*ptr > value) *ptr = value; } + // uint32_t t; +// #pragma omp atomic capture +// { t = *ptr; *ptr = std::min(t,value); } + // uint32_t current = *(volatile uint32_t *)p_value; + // while (current > other) { + // bool wasChanged + // = ((std::atomic*)p_value) + // ->compare_exchange_weak((int&)current,(int&)other); + // if (wasChanged) break; + // } } diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h index 9b4fe9c..ad2ea9c 100644 --- a/cuBQL/builder/omp/refit.h +++ b/cuBQL/builder/omp/refit.h @@ -43,9 +43,9 @@ namespace cuBQL { typename BinaryBVH::Node *node = bvh_nodes+nodeID; - if (nodeID < 0 || nodeID >= 601202) { - printf("BLA\n"); return; - } + // if (nodeID < 0 || nodeID >= 601202) { + // printf("BLA\n"); return; + // } if (node->admin.count == 0) // this is a inner node - exit return; @@ -76,9 +76,9 @@ namespace cuBQL { // if (it == 3) { printf("parentID %i\n",parentID); return; } - if (parentID < 0 || parentID >= 601202) { - printf("BLA\n"); return; - } + // if (parentID < 0 || parentID >= 601202) { + // printf("BLA\n"); return; + // } uint32_t refitBits = atomicAdd(&refitData[parentID],1u); if ((refitBits & 1) == 0) // we're the first one - let other one do it @@ -89,9 +89,9 @@ namespace cuBQL { parentID = (refitBits >> 1); int ofs = node->admin.offset; - if (ofs < 0 || ofs+1 >= 601202) { - printf("BLAB\n"); return; - } + // if (ofs < 0 || ofs+1 >= 601202) { + // printf("BLAB\n"); return; + // } typename BinaryBVH::Node l = bvh_nodes[ofs+0]; typename BinaryBVH::Node r = bvh_nodes[ofs+1]; diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h index 32c754c..23a3a80 100644 --- a/cuBQL/builder/omp/spatialMedian.h +++ b/cuBQL/builder/omp/spatialMedian.h @@ -147,6 +147,14 @@ namespace cuBQL { } auto in = nodes[nodeID].openBranch; + // printf("node split %i : %f %f %f : %f %f %f\n", + // nodeID, + // in.centBounds.lower.x, + // in.centBounds.lower.y, + // in.centBounds.lower.z, + // in.centBounds.upper.x, + // in.centBounds.upper.y, + // in.centBounds.upper.z); if (in.count <= buildConfig.makeLeafThreshold) { auto &done = nodes[nodeID].doneNode; done.count = in.count; @@ -185,6 +193,7 @@ namespace cuBQL { : widestDim; open.offset = atomicAdd(pNumNodes,2); + // printf("offset %i\n",open.offset); #pragma unroll for (int side=0;side<2;side++) { const int childID = open.offset+side; @@ -260,8 +269,6 @@ namespace cuBQL { if ((int)ps.nodeID < 0) /* invalid prim, just skip here */ return; - if (ps.nodeID >= 853350) - { printf("OVERFLOW\n"); return; } auto &node = nodes[ps.nodeID]; atomicMin(&node.doneNode.offset,offset); } @@ -337,6 +344,7 @@ namespace cuBQL { // ------------------------------------------------------------------ while (true) { ctx->download(numNodes,d_numNodes); + PING; PRINT(numNodes); if (numNodes == numDone) break; #pragma omp target device(ctx->gpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes) @@ -347,6 +355,7 @@ namespace cuBQL { nodeStates,tempNodes,numNodes, buildConfig); numDone = numNodes; + PRINT(numDone); #pragma omp target device(ctx->gpuID) #pragma omp teams distribute parallel for From 514e0373edfc4bf3e8e0e6550c7fb63654e4b49c Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Tue, 27 Jan 2026 11:57:51 -0700 Subject: [PATCH 08/14] added a non-sorting based final node writing stage --- cuBQL/builder/omp/refit.h | 24 +-------- cuBQL/builder/omp/spatialMedian.h | 89 ++++++++++++++++++++++++++----- 2 files changed, 76 insertions(+), 37 deletions(-) diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h index 9b4fe9c..9245e62 100644 --- a/cuBQL/builder/omp/refit.h +++ b/cuBQL/builder/omp/refit.h @@ -23,8 +23,6 @@ namespace cuBQL { node.bounds = box_t(); if (node.admin.count) return; - if (node.admin.offset+1 >= numNodes) - printf("refit_init_overflow\n"); refitData[node.admin.offset+0] = nodeID << 1; refitData[node.admin.offset+1] = nodeID << 1; } @@ -43,9 +41,6 @@ namespace cuBQL { typename BinaryBVH::Node *node = bvh_nodes+nodeID; - if (nodeID < 0 || nodeID >= 601202) { - printf("BLA\n"); return; - } if (node->admin.count == 0) // this is a inner node - exit return; @@ -58,27 +53,13 @@ namespace cuBQL { } int parentID = (refitData[nodeID] >> 1); - int its = 0; while (true) { - if (parentID == 1) { - printf("1 is parent!!!!\n"); - return; - } - atomic_grow(*(AtomicBox> *)&node->bounds,bounds); // node->bounds = bounds; if (node == bvh_nodes) break; - int it = its++; - // if (it >= 4) return; - - // if (it == 3) { printf("parentID %i\n",parentID); return; } - - if (parentID < 0 || parentID >= 601202) { - printf("BLA\n"); return; - } uint32_t refitBits = atomicAdd(&refitData[parentID],1u); if ((refitBits & 1) == 0) // we're the first one - let other one do it @@ -89,9 +70,6 @@ namespace cuBQL { parentID = (refitBits >> 1); int ofs = node->admin.offset; - if (ofs < 0 || ofs+1 >= 601202) { - printf("BLAB\n"); return; - } typename BinaryBVH::Node l = bvh_nodes[ofs+0]; typename BinaryBVH::Node r = bvh_nodes[ofs+1]; @@ -105,7 +83,6 @@ namespace cuBQL { const box_t *boxes, Context *ctx) { - PING; assert(bvh.nodes); assert(bvh.primIDs); int numNodes = bvh.numNodes; @@ -137,6 +114,7 @@ namespace cuBQL { refit_run_x(Kernel{i},//bvh, bvh_primIDs,bvh_nodes,refitData,boxes,numNodes); } + nb = nb; } #else #pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes) diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h index 32c754c..222af2d 100644 --- a/cuBQL/builder/omp/spatialMedian.h +++ b/cuBQL/builder/omp/spatialMedian.h @@ -10,6 +10,8 @@ namespace cuBQL { namespace omp { +#define NO_SORT 1 + struct PrimState { union { /* careful with this order - this is intentionally chosen such @@ -266,6 +268,41 @@ namespace cuBQL { atomicMin(&node.doneNode.offset,offset); } + template + void writeLeafOffsets(Kernel kernel, + TempNode *nodes, + uint32_t *pNextFreeOffset, + int numNodes) + { + const int nodeID = kernel.workIdx(); + if (nodeID >= numNodes) return; + + auto &node = nodes[nodeID].doneNode; + if (node.count > 0) + node.offset = atomicAdd(pNextFreeOffset,node.count)+node.count-1; + } + + template + void writePrims(Kernel kernel, + uint32_t *bvh_primIDs, + TempNode *nodes, + PrimState *primStates, + int numPrims) + { + const int primID = kernel.workIdx();//threadIdx.x+blockIdx.x*blockDim.x; + if (primID >= numPrims) return; + + auto &ps = primStates[primID]; + if ((int)ps.nodeID < 0) + /* invalid prim, just skip here */ + return; + auto &node = nodes[ps.nodeID].doneNode; + int myPos = atomicAdd(&node.offset,(uint32_t)-1); + if (myPos < 0 || myPos >= numPrims) + printf("OVERFLOW pri\n"); + bvh_primIDs[myPos] = ps.primID; + } + /* writes main phase's temp nodes into final bvh.nodes[] layout. actual bounds of that will NOT yet bewritten */ template @@ -348,18 +385,48 @@ namespace cuBQL { buildConfig); numDone = numNodes; -#pragma omp target device(ctx->gpuID) +#pragma omp target device(ctx->gpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes) #pragma omp teams distribute parallel for for (int tid=0;tidalloc(bvh.primIDs,numPrims); + PING; + auto bvh_primIDs = bvh.primIDs; + +#if NO_SORT + uint32_t zero = 0u; + // set first uint there to zero; leaf writing code will + // atomically inc that value to determine leaf offset values. + ctx->upload(bvh_primIDs,&zero,1); +#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tidgpuID) is_device_ptr(bvh_primIDs) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tidgpuID); -#endif PING; // ================================================================== // allocate and write BVH item list, and write offsets of leaf nodes // ================================================================== - +#endif PRINT(numNodes); - bvh.numPrims = numPrims; - bvh.primIDs = 0; - PING; - ctx->alloc(bvh.primIDs,numPrims); - PING; - auto primIDs = bvh.primIDs; #pragma omp target device(ctx->gpuID) is_device_ptr(primStates) is_device_ptr(tempNodes) is_device_ptr(primIDs) #pragma omp teams distribute parallel for for (int tid=0;tidalloc(bvh.nodes,numNodes); PING; PRINT(numNodes); - auto bvhNodes = bvh.nodes; -#pragma omp target device(ctx->gpuID) is_device_ptr(bvhNodes) is_device_ptr(tempNodes) + auto bvh_nodes = bvh.nodes; +#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_nodes) is_device_ptr(tempNodes) #pragma omp teams distribute parallel for for (int tid=0;tidfree(tempNodes); ctx->free(nodeStates); ctx->free(primStates); From 7f9ce41cbf1295095599f27a87d1ef3b813c187d Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Tue, 27 Jan 2026 14:27:43 -0700 Subject: [PATCH 09/14] fix for atomic on amd --- cuBQL/builder/omp/AtomicBox.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cuBQL/builder/omp/AtomicBox.h b/cuBQL/builder/omp/AtomicBox.h index 425beb9..035dafd 100644 --- a/cuBQL/builder/omp/AtomicBox.h +++ b/cuBQL/builder/omp/AtomicBox.h @@ -90,6 +90,10 @@ namespace cuBQL { if (wasChanged) break; } #endif +#pragma omp atomic compare + { + if (*ptr > value) *ptr = value; + } } /*! iw - note: this implementation of atomic min/max via atomic @@ -108,6 +112,10 @@ namespace cuBQL { if (wasChanged) break; } #endif +#pragma omp atomic compare + { + if (*ptr < value) *ptr = value; + } } template From 480890a245f364b8d7b17ed1463c783c72a83454 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Tue, 27 Jan 2026 14:30:46 -0700 Subject: [PATCH 10/14] fixes for atomics on arc --- cuBQL/builder/omp/AtomicBox.h | 12 +++++------- cuBQL/builder/omp/common.h | 5 ++--- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/cuBQL/builder/omp/AtomicBox.h b/cuBQL/builder/omp/AtomicBox.h index 425beb9..7571ce2 100644 --- a/cuBQL/builder/omp/AtomicBox.h +++ b/cuBQL/builder/omp/AtomicBox.h @@ -39,10 +39,9 @@ namespace cuBQL { if (wasChanged) break; } #else + float &x = *ptr; #pragma omp atomic compare - { - if (*ptr > value) *ptr = value; - } + if (x > value) { x = value; } // float t; // #pragma omp atomic capture // { t = *ptr; *ptr = std::min(t,value); } @@ -64,11 +63,10 @@ namespace cuBQL { if (wasChanged) break; } #else + float &x = *ptr; #pragma omp atomic compare - { - if (*ptr < value) *ptr = value; - } -// float t; + if (x < value) { x = value; } + // float t; // #pragma omp atomic capture // { t = *ptr; *ptr = std::max(t,value); } #endif diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h index 2321b53..4729e90 100644 --- a/cuBQL/builder/omp/common.h +++ b/cuBQL/builder/omp/common.h @@ -56,10 +56,9 @@ namespace cuBQL { inline void atomicMin(uint32_t *ptr, uint32_t value) { + uint32_t &x = *ptr; #pragma omp atomic compare - { - if (*ptr > value) *ptr = value; - } + if (x > value) { x = value; } // uint32_t t; // #pragma omp atomic capture // { t = *ptr; *ptr = std::min(t,value); } From a77b80a242599d6208dd5a5a19fa55c2571d77b2 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Mon, 2 Feb 2026 15:08:08 -0700 Subject: [PATCH 11/14] added openmp backend --- CMakeLists.txt | 4 +--- testing/omp_sort.cpp | 35 ----------------------------------- 2 files changed, 1 insertion(+), 38 deletions(-) delete mode 100644 testing/omp_sort.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index a32e0da..6a1653e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -140,6 +140,4 @@ if (NOT CUBQL_IS_SUBPROJECT) add_subdirectory(samples) endif() -if (CUBQL_OMP) - add_subdirectory(testing) -endif() +#add_subdirectory(testing) diff --git a/testing/omp_sort.cpp b/testing/omp_sort.cpp deleted file mode 100644 index 8227709..0000000 --- a/testing/omp_sort.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA -// CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "cuBQL/bvh.h" -#include "cuBQL/builder/omp/sort.h" -#include - -int main(int ac, char **av) -{ - cuBQL::omp::Context omp(0); - - int N = 13; - // int N = 123453; - std::vector inputs(N); - for (int i=0;i results - = omp.download_vector(d_data,N); - for (int i=1;i results[i]) - throw std::runtime_error("Not sorted..."); - } - std::cout << "sorted - perfect!" << std::endl; -} From 60cbf1ff71888d5530fadbb22acf41e21215fa3e Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Mon, 2 Feb 2026 15:10:22 -0700 Subject: [PATCH 12/14] bumped version to 1.2 to mark new openmp builder --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ffa826..40a0ca0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ cmake_minimum_required(VERSION 3.16) cmake_policy(SET CMP0048 NEW) set(CMAKE_BUILD_TYPE_INIT "Release") -project(cuBQL VERSION 1.1.0 LANGUAGES C CXX) +project(cuBQL VERSION 1.2.0 LANGUAGES C CXX) if (CUBQL_DISABLE_CUDA) message("#cuBQL: CUDA _DISABLED_ by user request") From b3bec662c583aa5ce006da3989ed43254ba3d49c Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Mon, 2 Feb 2026 15:41:08 -0700 Subject: [PATCH 13/14] various cleanups --- cuBQL/builder/omp/refit.h | 46 +++---------------------------- cuBQL/builder/omp/spatialMedian.h | 1 - 2 files changed, 4 insertions(+), 43 deletions(-) diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h index 2b6c13b..109654e 100644 --- a/cuBQL/builder/omp/refit.h +++ b/cuBQL/builder/omp/refit.h @@ -10,7 +10,7 @@ namespace cuBQL { namespace omp { template inline - void refit_init_x(Kernel kernel, + void refit_init(Kernel kernel, typename BinaryBVH::Node *nodes, uint32_t *refitData, int numNodes) @@ -23,17 +23,12 @@ namespace cuBQL { node.bounds = box_t(); if (node.admin.count) return; - // if (node.admin.offset < 0) - // { printf("BAD OFFSET IN INIT(1)\n"); return; } - // if (node.admin.offset+1 >= numNodes) - // { printf("BAD OFFSET IN INIT(2)\n"); return; } refitData[node.admin.offset+0] = nodeID << 2; refitData[node.admin.offset+1] = nodeID << 2; } template inline - void refit_run_x(Kernel kernel, - // BinaryBVH bvh, + void refit_run(Kernel kernel, uint32_t *bvh_primIDs, typename BinaryBVH::Node *bvh_nodes, uint32_t *refitData, @@ -45,12 +40,10 @@ namespace cuBQL { typename BinaryBVH::Node *node = bvh_nodes+nodeID; - // printf("begin nodeID %i\n",nodeID); if (node->admin.count == 0) // this is a inner node - exit return; - // printf(" -> leaf %i cnt %i ofs %i\n",nodeID,node->admin.count,node->admin.offset); box_t bounds; bounds.set_empty(); for (int i=0;iadmin.count;i++) { const box_t primBox = boxes[bvh_primIDs[node->admin.offset+i]]; @@ -61,9 +54,7 @@ namespace cuBQL { int parentID = (refitData[nodeID] >> 2); while (true) { - // printf("parentID %i\n",parentID); atomic_grow(*(AtomicBox> *)&node->bounds,bounds); - // node->bounds = bounds; if (node == bvh_nodes) break; @@ -98,49 +89,20 @@ namespace cuBQL { = (uint32_t*)ctx->alloc(numNodes*sizeof(uint32_t)); auto bvh_nodes = bvh.nodes; auto bvh_primIDs = bvh.primIDs; - PING; - PRINT(numNodes); - PRINT(bvh.numNodes); { #pragma omp target device(ctx->gpuID) is_device_ptr(refitData) is_device_ptr(bvh_nodes) #pragma omp teams distribute parallel for for (int i=0;i(Kernel{i},bvh_nodes,refitData,numNodes); + refit_init(Kernel{i},bvh_nodes,refitData,numNodes); } - PING; - -#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_nodes) - for (int i=0;igpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes) - { - int team = omp_get_team_num(); - int nteams = omp_get_num_teams(); - int beg = ((team+0)*numNodes)/nteams; - int end = ((team+1)*numNodes)/nteams; - for (int i=beg;igpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes) #pragma omp teams distribute parallel for for (int i=0;ifree((void*)refitData); } diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h index 075d036..5cca265 100644 --- a/cuBQL/builder/omp/spatialMedian.h +++ b/cuBQL/builder/omp/spatialMedian.h @@ -410,7 +410,6 @@ namespace cuBQL { // ================================================================== bvh.numNodes = numNodes; ctx->alloc(bvh.nodes,numNodes); - PING; PRINT(numNodes); auto bvh_nodes = bvh.nodes; #pragma omp target device(ctx->gpuID) is_device_ptr(bvh_nodes) is_device_ptr(tempNodes) #pragma omp teams distribute parallel for From 974484833d17b44426399ac5eb7943cc96040e35 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Mon, 2 Feb 2026 15:41:35 -0700 Subject: [PATCH 14/14] various cleanups --- cuBQL/builder/omp/refit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h index 109654e..2094554 100644 --- a/cuBQL/builder/omp/refit.h +++ b/cuBQL/builder/omp/refit.h @@ -100,7 +100,7 @@ namespace cuBQL { #pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes) #pragma omp teams distribute parallel for for (int i=0;ifree((void*)refitData);