From 6515227ec951503c561a3720c36f8df0e93d8c01 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Wed, 21 Jan 2026 13:54:30 -0700
Subject: [PATCH 01/14] refit ported to openmp

---
 cuBQL/builder/omp.h               |  19 ++
 cuBQL/builder/omp/common.h        |  20 ++
 cuBQL/builder/omp/refit.h         | 104 +++++++
 cuBQL/builder/omp/spatialMedian.h | 443 ++++++++++++++++++++++++++++++
 4 files changed, 586 insertions(+)
 create mode 100644 cuBQL/builder/omp.h
 create mode 100644 cuBQL/builder/omp/common.h
 create mode 100644 cuBQL/builder/omp/refit.h
 create mode 100644 cuBQL/builder/omp/spatialMedian.h
diff --git a/cuBQL/builder/omp.h b/cuBQL/builder/omp.h
new file mode 100644
index 0000000..17ba4c2
--- /dev/null
+++ b/cuBQL/builder/omp.h
@@ -0,0 +1,19 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace cuBQL {
+  namespace omp {
+    struct Context;
+    
+    template<typename T, int D>
+    void refit(BinaryBVH<T,D>    &bvh,
+               const box_t<T,D>  *boxes,
+               Context *ctx);
+  }
+}
+
+#include "cuBQL/builder/omp/refit.h"
+#include "cuBQL/builder/omp/spatialMedian.h"
+
diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h
new file mode 100644
index 0000000..e516f35
--- /dev/null
+++ b/cuBQL/builder/omp/common.h
@@ -0,0 +1,20 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace cuBQL {
+  namespace omp {
+    
+    struct Context {
+      Context(int gpuID);
+      int gpuID;
+      int hostID;
+    };
+    struct Kernel {
+      inline int workIdx() const { return _workIdx; }
+      int _workIdx;
+    };
+
+  }
+}
diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h
new file mode 100644
index 0000000..420d30f
--- /dev/null
+++ b/cuBQL/builder/omp/refit.h
@@ -0,0 +1,104 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "cuBQL/builder/omp/common.h"
+
+namespace cuBQL {
+  namespace omp {
+    
+    struct Context {
+      Context(int gpuID);
+      int gpuID;
+      int hostID;
+    };
+    struct Kernel {
+      inline int workIdx() const { return _workIdx; }
+      int _workIdx;
+    };
+
+    template<typename T, int D> inline
+    void refit_init(Kernel kernel,
+                    const typename BinaryBVH<T,D>::Node *nodes,
+                    uint32_t              *refitData,
+                    int numNodes)
+    {
+      const int nodeID = kernel.workIdx();
+      if (nodeID == 1 || nodeID >= numNodes) return;
+      if (nodeID < 2)
+        refitData[0] = 0;
+      const auto &node = nodes[nodeID];
+      if (node.admin.count) return;
+
+      refitData[node.admin.offset+0] = nodeID << 1;
+      refitData[node.admin.offset+1] = nodeID << 1;
+    }
+
+    template<typename T, int D> inline 
+    void refit_run(Kernel kernel,
+                   BinaryBVH<T,D> bvh,
+                   uint32_t *refitData,
+                   const box_t<T,D> *boxes)
+    {
+      int nodeID = kernel.workIdx();
+      if (nodeID == 1 || nodeID >= bvh.numNodes) return;
+      
+      typename BinaryBVH<T,D>::Node *node = &bvh.nodes[nodeID];
+      if (node->admin.count == 0)
+        // this is a inner node - exit
+        return;
+
+      box_t<T,D> bounds; bounds.set_empty();
+      for (int i=0;i<node->admin.count;i++) {
+        const box_t<T,D> primBox = boxes[bvh.primIDs[node->admin.offset+i]];
+        bounds.lower = min(bounds.lower,primBox.lower);
+        bounds.upper = max(bounds.upper,primBox.upper);
+      }
+
+      int parentID = (refitData[nodeID] >> 1);
+      while (true) {
+        node->bounds = bounds;
+        // __threadfence();
+        if (node == bvh.nodes)
+          break;
+
+        uint32_t refitBits = atomicAdd(&refitData[parentID],1u);
+        if ((refitBits & 1) == 0)
+          // we're the first one - let other one do it
+          break;
+
+        nodeID   = parentID;
+        node     = &bvh.nodes[parentID];
+        parentID = (refitBits >> 1);
+        
+        typename BinaryBVH<T,D>::Node l = bvh.nodes[node->admin.offset+0];
+        typename BinaryBVH<T,D>::Node r = bvh.nodes[node->admin.offset+1];
+        bounds.lower = min(l.bounds.lower,r.bounds.lower);
+        bounds.upper = max(l.bounds.upper,r.bounds.upper);
+      }
+    }
+    
+    template<typename T, int D>
+    void refit(BinaryBVH<T,D>    &bvh,
+               const box_t<T,D>  *boxes,
+               Context *ctx)
+    {
+      int numNodes = bvh.numNodes;
+      uint32_t *refitData
+        = (uint32_t*)ctx->malloc(numNodes*sizeof(int));
+      
+# pragma omp target device(context->gpuID)
+# pragma omp teams distribute parallel for
+      for (int i=0;i<numNodes;i++)
+        refit_init<T,D>(Kernel{i},bvh.nodes,refitData,bvh.numNodes);
+# pragma omp target device(context->gpuID)
+# pragma omp teams distribute parallel for
+      for (int i=0;i<numNodes;i++)
+        refit_run(Kernel{i},bvh,refitData,boxes);
+      ctx->free((void*)refitData);
+    }
+    
+  }
+}
+
diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h
new file mode 100644
index 0000000..7dea853
--- /dev/null
+++ b/cuBQL/builder/omp/spatialMedian.h
@@ -0,0 +1,443 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "cuBQL/builder/omp/refit.h"
+
+namespace cuBQL {
+  namespace omp {
+
+    struct PrimState {
+      union {
+        /* careful with this order - this is intentionally chosen such
+           that all item with nodeID==-1 will end up at the end of the
+           list; and all others will be sorted by nodeID */
+        struct {
+          uint64_t primID:31; //!< prim we're talking about
+          uint64_t done  : 1;
+          uint64_t nodeID:32; //!< node the given prim is (currently) in.
+        };
+        uint64_t bits;
+      };
+    };
+
+    template<typename T, int D>
+    struct CUBQL_ALIGN(16) TempNode {
+      using box_t = cuBQL::box_t<T,D>;
+      union {
+        struct {
+          AtomicBox<box_t> centBounds;
+          uint32_t         count;
+          uint32_t         unused;
+        } openBranch;
+        struct {
+          uint32_t offset;
+          int      dim;
+          uint32_t tieBreaker;
+          float    pos;
+        } openNode;
+        struct {
+          uint32_t offset;
+          uint32_t count;
+          uint32_t unused[2];
+        } doneNode;
+      };
+    };
+    
+    template<typename T, int D>
+    __global__
+    void initState(BuildState      *buildState,
+                   NodeState       *nodeStates,
+                   TempNode<T,D> *nodes)
+    {
+      buildState->numNodes = 2;
+      
+      nodeStates[0]             = OPEN_BRANCH;
+      nodes[0].openBranch.count = 0;
+      nodes[0].openBranch.centBounds.set_empty();
+
+      nodeStates[1]            = DONE_NODE;
+      nodes[1].doneNode.offset = 0;
+      nodes[1].doneNode.count  = 0;
+    }
+
+    template<typename T, int D>
+    __global__ void initPrims(TempNode<T,D> *nodes,
+                              PrimState       *primState,
+                              const box_t<T,D>     *primBoxes,
+                              uint32_t         numPrims)
+    {
+      const int primID = threadIdx.x+blockIdx.x*blockDim.x;
+      if (primID >= numPrims) return;
+      
+      auto &me = primState[primID];
+      me.primID = primID;
+                                                    
+      const box_t<T,D> box = primBoxes[primID];
+      if (box.get_lower(0) <= box.get_upper(0)) {
+        me.nodeID = 0;
+        me.done   = false;
+        // this could be made faster by block-reducing ...
+        atomicAdd(&nodes[0].openBranch.count,1);
+        atomic_grow(nodes[0].openBranch.centBounds,box.center());//centerOf(box));
+      } else {
+        me.nodeID = (uint32_t)-1;
+        me.done   = true;
+      }
+    }
+
+    template<typename T, int D>
+    __global__
+    void selectSplits(BuildState    *buildState,
+                      NodeState     *nodeStates,
+                      TempNode<T,D> *nodes,
+                      uint32_t       numNodes,
+                      BuildConfig    buildConfig)
+    {
+#if 1
+      __shared__ int l_newNodeOfs;
+      if (threadIdx.x == 0)
+        l_newNodeOfs = 0;
+      __syncthreads();
+
+      int *t_nodeOffsetToWrite = 0;
+      int  t_localOffsetToAdd = 0;
+
+      while (true) {
+        const int nodeID = threadIdx.x+blockIdx.x*blockDim.x;
+        if (nodeID >= numNodes)
+          break;
+        
+        NodeState &nodeState = nodeStates[nodeID];
+        if (nodeState == DONE_NODE)
+          // this node was already closed before
+          break;
+        
+        if (nodeState == OPEN_NODE) {
+          // this node was open in the last pass, can close it.
+          nodeState   = DONE_NODE;
+          int offset  = nodes[nodeID].openNode.offset;
+          auto &done  = nodes[nodeID].doneNode;
+          done.count  = 0;
+          done.offset = offset;
+          break;
+        }
+        
+        auto in = nodes[nodeID].openBranch;
+        if (in.count <= buildConfig.makeLeafThreshold) {
+          auto &done  = nodes[nodeID].doneNode;
+          done.count  = in.count;
+          // set this to max-value, so the prims can later do atomicMin
+          // with their position ion the leaf list; this value is
+          // greater than any prim position.
+          done.offset = (uint32_t)-1;
+          nodeState   = DONE_NODE;
+        } else {
+          float widestWidth = 0.f;
+          int   widestDim   = -1;
+          float widestLo, widestHi, widestCtr;
+#pragma unroll
+          for (int d=0;d<D;d++) {
+            float lo = in.centBounds.get_lower(d);
+            float hi = in.centBounds.get_upper(d);
+            float width = hi - lo;
+            if (width <= widestWidth)
+              continue;
+            float ctr = 0.5f*(hi+lo);
+            
+            widestWidth = width;
+            widestDim   = d;
+            widestLo = lo;
+            widestHi = hi;
+            widestCtr = ctr;
+          }
+          
+          auto &open = nodes[nodeID].openNode;
+          if (widestDim >= 0) {
+            open.pos = widestCtr;
+          }
+          open.dim
+            = (widestDim < 0 || widestCtr == widestLo || widestCtr == widestHi)
+            ? -1
+            : widestDim;
+          
+          // this will be epensive - could make this faster by block-reducing
+          // open.offset = atomicAdd(&buildState->numNodes,2);
+          t_nodeOffsetToWrite = (int*)&open.offset;
+          t_localOffsetToAdd = atomicAdd(&l_newNodeOfs,2);
+          nodeState = OPEN_NODE;
+        }
+        break;
+      }
+      __syncthreads();
+      if (threadIdx.x == 0 && l_newNodeOfs > 0)
+        l_newNodeOfs = atomicAdd(&buildState->numNodes,l_newNodeOfs);
+      __syncthreads();
+      if (t_nodeOffsetToWrite) {
+        int openOffset = *t_nodeOffsetToWrite = l_newNodeOfs + t_localOffsetToAdd;
+#pragma unroll
+          for (int side=0;side<2;side++) {
+            const int childID = openOffset+side;
+            auto &child = nodes[childID].openBranch;
+            child.centBounds.set_empty();
+            child.count         = 0;
+            nodeStates[childID] = OPEN_BRANCH;
+          }
+      }
+#else
+      const int nodeID = threadIdx.x+blockIdx.x*blockDim.x;
+      if (nodeID >= numNodes) return;
+
+      NodeState &nodeState = nodeStates[nodeID];
+      if (nodeState == DONE_NODE)
+        // this node was already closed before
+        return;
+      
+      if (nodeState == OPEN_NODE) {
+        // this node was open in the last pass, can close it.
+        nodeState   = DONE_NODE;
+        int offset  = nodes[nodeID].openNode.offset;
+        auto &done  = nodes[nodeID].doneNode;
+        done.count  = 0;
+        done.offset = offset;
+        return;
+      }
+      
+      auto in = nodes[nodeID].openBranch;
+      if (in.count <= buildConfig.makeLeafThreshold) {
+        auto &done  = nodes[nodeID].doneNode;
+        done.count  = in.count;
+        // set this to max-value, so the prims can later do atomicMin
+        // with their position ion the leaf list; this value is
+        // greater than any prim position.
+        done.offset = (uint32_t)-1;
+        nodeState   = DONE_NODE;
+      } else {
+        float widestWidth = 0.f;
+        int   widestDim   = -1;
+        float widestLo, widestHi, widestCtr;
+#pragma unroll
+        for (int d=0;d<D;d++) {
+          float lo = in.centBounds.get_lower(d);
+          float hi = in.centBounds.get_upper(d);
+          float width = hi - lo;
+          if (width <= widestWidth)
+            continue;
+          float ctr = 0.5f*(hi+lo);
+          
+          widestWidth = width;
+          widestDim   = d;
+          widestLo = lo;
+          widestHi = hi;
+          widestCtr = ctr;
+        }
+      
+        auto &open = nodes[nodeID].openNode;
+        if (widestDim >= 0) {
+          open.pos = widestCtr;
+        }
+        open.dim
+          = (widestDim < 0 || widestCtr == widestLo || widestCtr == widestHi)
+          ? -1
+          : widestDim;
+        
+        // this will be epensive - could make this faster by block-reducing
+        open.offset = atomicAdd(&buildState->numNodes,2);
+#pragma unroll
+        for (int side=0;side<2;side++) {
+          const int childID = open.offset+side;
+          auto &child = nodes[childID].openBranch;
+          child.centBounds.set_empty();
+          child.count         = 0;
+          nodeStates[childID] = OPEN_BRANCH;
+        }
+        nodeState = OPEN_NODE;
+      }
+#endif
+    }
+
+    template<typename T, int D>
+    __global__
+    void updatePrims(NodeState       *nodeStates,
+                     TempNode<T,D> *nodes,
+                     PrimState       *primStates,
+                     const box_t<T,D>     *primBoxes,
+                     int numPrims)
+    {
+      const int primID = threadIdx.x+blockIdx.x*blockDim.x;
+      if (primID >= numPrims) return;
+
+      const auto me = primStates[primID];
+      if (me.done) return;
+      
+      const auto ns = nodeStates[me.nodeID];
+      if (ns == DONE_NODE) {
+        // node became a leaf, we're done.
+        primStates[primID].done = true;
+        return;
+      }
+      
+      auto &split = nodes[me.nodeID].openNode;
+      const box_t<T,D> primBox = primBoxes[me.primID];
+      int side = 0;
+      if (split.dim == -1) {
+        // could block-reduce this, but will likely not happen often, anyway
+        side = (atomicAdd(&split.tieBreaker,1) & 1);
+      } else {
+        const float center = 0.5f*(primBox.get_lower(split.dim)+
+                                   primBox.get_upper(split.dim));
+        side = (center >= split.pos);
+      }
+      int newNodeID = split.offset+side;
+      auto &myBranch = nodes[newNodeID].openBranch;
+      atomicAdd(&myBranch.count,1);
+      atomic_grow(myBranch.centBounds,primBox.center());
+      primStates[primID].nodeID = newNodeID;
+    }
+    
+    /* given a sorted list of {nodeID,primID} pairs, this kernel does
+       two things: a) it extracts the 'primID's and puts them into the
+       bvh's primIDs[] array; and b) it writes, for each leaf nod ein
+       the nodes[] array, the node.offset value to point to the first
+       of this nodes' items in that bvh.primIDs[] list. */
+    template<typename T, int D>
+    __global__
+    void writePrimsAndLeafOffsets(TempNode<T,D> *nodes,
+                                  uint32_t        *bvhItemList,
+                                  PrimState       *primStates,
+                                  int              numPrims)
+    {
+      const int offset = threadIdx.x+blockIdx.x*blockDim.x;
+      if (offset >= numPrims) return;
+
+      auto &ps = primStates[offset];
+      bvhItemList[offset] = ps.primID;
+      
+      if ((int)ps.nodeID < 0)
+        /* invalid prim, just skip here */
+        return;
+      auto &node = nodes[ps.nodeID];
+      atomicMin(&node.doneNode.offset,offset);
+    }
+
+    /* writes main phase's temp nodes into final bvh.nodes[]
+       layout. actual bounds of that will NOT yet bewritten */
+    template<typename T, int D>
+    __global__
+    void writeNodes(typename BinaryBVH<T,D>::Node *finalNodes,
+                    TempNode<T,D>  *tempNodes,
+                    int        numNodes)
+    {
+      const int nodeID = threadIdx.x+blockIdx.x*blockDim.x;
+      if (nodeID >= numNodes) return;
+
+      finalNodes[nodeID].admin.offset = tempNodes[nodeID].doneNode.offset;
+      finalNodes[nodeID].admin.count  = tempNodes[nodeID].doneNode.count;
+    }
+
+    
+    template<typename T, int D>
+    void build(BinaryBVH<T,D>    &bvh,
+               const box_t<T,D>  *boxes,
+               int                numPrims,
+               BuildConfig        buildConfig,
+               cudaStream_t       s,
+               GpuMemoryResource &memResource)
+    {
+      assert(sizeof(PrimState) == sizeof(uint64_t));
+      
+      // ==================================================================
+      // do build on temp nodes
+      // ==================================================================
+      TempNode<T,D> *tempNodes = 0;
+      NodeState     *nodeStates = 0;
+      PrimState     *primStates = 0;
+      BuildState    *buildState = 0;
+      ctx->malloc(tempNodes,2*numPrims);
+      ctx->malloc(nodeStates,2*numPrims);
+      ctx->malloc(primStates,numPrims);
+      ctx->malloc(buildState,1);
+      initState<<<1,1,0,s>>>(buildState,
+                             nodeStates,
+                             tempNodes);
+      initPrims<<<divRoundUp(numPrims,1024),1024,0,s>>>
+        (tempNodes,
+         primStates,boxes,numPrims);
+
+      int numDone = 0;
+      int numNodes;
+
+      // ------------------------------------------------------------------      
+      cudaEvent_t stateDownloadedEvent;
+      CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent));
+      
+      
+      while (true) {
+        CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes,
+                                    sizeof(numNodes),cudaMemcpyDeviceToHost,s));
+        CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
+        CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
+        if (numNodes == numDone)
+          break;
+        selectSplits<<<divRoundUp(numNodes,1024),1024,0,s>>>
+          (buildState,
+           nodeStates,tempNodes,numNodes,
+           buildConfig);
+        numDone = numNodes;
+
+        updatePrims<<<divRoundUp(numPrims,1024),1024,0,s>>>
+          (nodeStates,tempNodes,
+           primStates,boxes,numPrims);
+      }
+      CUBQL_CUDA_CALL(EventDestroy(stateDownloadedEvent));
+      // ==================================================================
+      // sort {item,nodeID} list
+      // ==================================================================
+      
+      // set up sorting of prims
+      uint8_t   *d_temp_storage     = NULL;
+      size_t     temp_storage_bytes = 0;
+      PrimState *sortedPrimStates   = 0;
+      ctx->malloc(sortedPrimStates,numPrims);
+      auto rc =
+        cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes,
+                                     (uint64_t*)primStates,
+                                     (uint64_t*)sortedPrimStates,
+                                     numPrims,32,64,s);
+      ctx->malloc(d_temp_storage,temp_storage_bytes);
+      rc =
+        cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes,
+                                       (uint64_t*)primStates,
+                                       (uint64_t*)sortedPrimStates,
+                                       numPrims,32,64,s);
+      rc = rc;
+      ctx->free(d_temp_storage);
+      // ==================================================================
+      // allocate and write BVH item list, and write offsets of leaf nodes
+      // ==================================================================
+
+      bvh.numPrims = numPrims;
+      ctx->malloc(bvh.primIDs,numPrims);
+      writePrimsAndLeafOffsets<<<divRoundUp(numPrims,1024),1024,0,s>>>
+        (tempNodes,bvh.primIDs,sortedPrimStates,numPrims);
+
+      // ==================================================================
+      // allocate and write final nodes
+      // ==================================================================
+      bvh.numNodes = numNodes;
+      ctx->malloc(bvh.nodes,numNodes);
+      writeNodes<<<divRoundUp(numNodes,1024),1024,0,s>>>
+        (bvh.nodes,tempNodes,numNodes);
+      ctx->free(sortedPrimStates);
+      ctx->free(tempNodes);
+      ctx->free(nodeStates);
+      ctx->free(primStates);
+      ctx->free(buildState);
+
+      refit(bvh);
+    }
+    
+  }
+}
+

From 6263e92a503797d9e52ad2ca41de9f889189032f Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Fri, 23 Jan 2026 09:39:10 -0700
Subject: [PATCH 02/14] added openmp sorter

---
 CMakeLists.txt             |   7 ++
 cuBQL/builder/omp/common.h |  61 ++++++++++-
 cuBQL/builder/omp/sort.h   | 204 +++++++++++++++++++++++++++++++++++++
 testing/omp_sort.cpp       |  40 ++++++++
 4 files changed, 309 insertions(+), 3 deletions(-)
 create mode 100644 cuBQL/builder/omp/sort.h
 create mode 100644 testing/omp_sort.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1ffa826..a32e0da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,9 @@ cmake_policy(SET CMP0048 NEW)
 set(CMAKE_BUILD_TYPE_INIT "Release")
 project(cuBQL VERSION 1.1.0 LANGUAGES C CXX)
 
+if (CUBQL_OMP)
+  set(CUBQL_DISABLE_CUDA ON)
+endif()
 if (CUBQL_DISABLE_CUDA)
   message("#cuBQL: CUDA _DISABLED_ by user request")
   set(CUBQL_HAVE_CUDA OFF)
@@ -136,3 +139,7 @@ add_subdirectory(cuBQL)
 if (NOT CUBQL_IS_SUBPROJECT)
   add_subdirectory(samples)
 endif()
+
+if (CUBQL_OMP)
+  add_subdirectory(testing)
+endif()
diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h
index e516f35..df0a4ca 100644
--- a/cuBQL/builder/omp/common.h
+++ b/cuBQL/builder/omp/common.h
@@ -1,13 +1,27 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
 
+#include "cuBQL/bvh.h"
+#include <omp.h>
+
 namespace cuBQL {
   namespace omp {
     
     struct Context {
       Context(int gpuID);
+
+      template<typename T>
+      void alloc_and_upload(T *&d_data, const T *h_data, size_t Nelements);
+      
+      template<typename T>
+      void alloc_and_upload(T *&d_data, const std::vector<T> &h_vector);
+
+      template<typename T>
+      std::vector<T> download_vector(const T *d_data, size_t N);
+      
       int gpuID;
       int hostID;
     };
@@ -16,5 +30,46 @@ namespace cuBQL {
       int _workIdx;
     };
 
-  }
-}
+
+
+    // ##################################################################
+    // IMPLEMENTATION SECTION
+    // ##################################################################
+    Context::Context(int gpuID)
+      : gpuID(gpuID),
+        hostID(omp_get_initial_device())
+    {
+      assert(gpuID < omp_get_num_devices());
+      printf("#cuBQL:omp:Context(gpu=%i/%i,host=%i)\n",
+             gpuID,omp_get_num_devices(),hostID);
+    }
+    
+    template<typename T>
+    void Context::alloc_and_upload(T *&d_data,
+                                   const T *h_data,
+                                   size_t N)
+    {
+      printf("target_alloc N %li gpu %i\n",N,gpuID);
+      d_data = (T *)omp_target_alloc(N*sizeof(T),gpuID);
+      printf("ptr %p\n",d_data);
+      assert(d_data);
+      omp_target_memcpy(d_data,h_data,N*sizeof(T),
+                        0,0,gpuID,hostID);
+    }
+      
+    template<typename T>
+    void Context::alloc_and_upload(T *&d_data,
+                                   const std::vector<T> &h_vector)
+    { alloc_and_upload(d_data,h_vector.data(),h_vector.size()); }
+
+    template<typename T>
+    std::vector<T> Context::download_vector(const T *d_data, size_t N)
+    {
+      std::vector<T> out(N);
+      omp_target_memcpy(out.data(),d_data,N*sizeof(T),
+                        0,0,hostID,gpuID);
+      return out;
+    }
+    
+  } // ::cuBQL::omp
+} // ::cuBQL
diff --git a/cuBQL/builder/omp/sort.h b/cuBQL/builder/omp/sort.h
new file mode 100644
index 0000000..3e5ca6b
--- /dev/null
+++ b/cuBQL/builder/omp/sort.h
@@ -0,0 +1,204 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "cuBQL/builder/omp/common.h"
+
+namespace cuBQL {
+  namespace omp {
+    namespace bitonic {
+      
+      template<typename key_t>
+      void print_all(key_t *const d_values,
+                     int numValues,
+                     int barInterval, int commaInterval,
+                     Context *omp)
+      {
+        std::vector<key_t> v = omp->download_vector(d_values,numValues);
+        for (int i=0;i<numValues;i++) {
+          std::cout << v[i] << " ";
+          if (barInterval && (i+1)%barInterval==0)
+            printf("| ");
+          else if (commaInterval && (i+1)%commaInterval==0)
+            printf(", ");
+        }
+        std::cout << std::endl;
+      }
+      
+
+      template<typename key_t>
+      void g_orderSegmentPairs(Kernel kernel,
+                               int logSegLen,
+                               key_t *const d_values,
+                               int numValues)
+
+      {
+        uint32_t tid   = kernel.workIdx();
+        
+        // bool dbg = tid == 1 || tid == 0;
+        
+        uint32_t segLen    = 1<<logSegLen;
+        uint32_t pairIdx   = tid>>logSegLen;
+        uint32_t pairRank  = tid-(pairIdx<<logSegLen);
+        uint32_t pairBegin = 2*(pairIdx<<logSegLen);
+
+
+        uint32_t r = pairBegin+segLen+pairRank;
+        uint32_t l = pairBegin+segLen-1-pairRank;
+        // if (dbg) printf("(%i) pairIdx %i pairLen %i rank %i l %i r %i\n",
+        //                 tid,pairIdx,pairLen,pairRank,l,r);
+                        
+                        
+        if (r >= numValues) return;
+        
+        key_t lv = d_values[l];
+        key_t rv = d_values[r];
+        if (rv < lv) {
+          d_values[r] = lv;
+          d_values[l] = rv;
+        }
+      }
+      
+      template<typename key_t, typename value_t>
+      void g_orderSegmentPairs(Kernel kernel,
+                               int logSegLen,
+                               key_t *const d_keys,
+                               value_t *const d_values,
+                               int numValues)
+
+      {
+        uint32_t tid   = kernel.workIdx();
+        
+        // bool dbg = tid == 1 || tid == 0;
+        
+        uint32_t segLen    = 1<<logSegLen;
+        uint32_t pairIdx   = tid>>logSegLen;
+        uint32_t pairRank  = tid-(pairIdx<<logSegLen);
+        uint32_t pairBegin = 2*(pairIdx<<logSegLen);
+
+
+        uint32_t r = pairBegin+segLen+pairRank;
+        uint32_t l = pairBegin+segLen-1-pairRank;
+        // if (dbg) printf("(%i) pairIdx %i pairLen %i rank %i l %i r %i\n",
+        //                 tid,pairIdx,pairLen,pairRank,l,r);
+                        
+                        
+        if (r >= numValues) return;
+        
+        key_t lk = d_keys[l];
+        key_t rk = d_keys[r];
+        key_t lv = d_values[l];
+        key_t rv = d_values[r];
+        if (rk < lk) {
+          d_values[r] = lv;
+          d_values[l] = rv;
+          d_keys[r]   = lk;
+          d_keys[l]   = rk;
+        }
+      }
+      
+      template<typename key_t>
+      void orderSegmentPairs(int logSegLen,
+                             key_t *const d_values,
+                             int numValues,
+                             Context *omp)
+
+      {
+#pragma omp target device(omp->gpuID)
+#pragma omp teams distribute parallel for
+        for (int i=0;i<numValues;i++)
+          g_orderSegmentPairs(Kernel{i},logSegLen,
+                              d_values,numValues);
+      }
+      
+      template<typename key_t, typename value_t>
+      void orderSegmentPairs(int logSegLen,
+                             key_t *const d_keys,
+                             value_t *const d_values,
+                             int numValues,
+                             Context *omp)
+
+      {
+#pragma omp target device(omp->gpuID)
+#pragma omp teams distribute parallel for
+        for (int i=0;i<numValues;i++)
+          g_orderSegmentPairs(Kernel{i},logSegLen,
+                              d_keys,d_values,numValues);
+      }
+      
+      template<typename key_t>
+      void sortSegments(int logSegLen,
+                        key_t *const d_keys,
+                        int numValues,
+                        Context *omp)
+      {
+        if (logSegLen == 0) return;
+
+        sortSegments(logSegLen-1,d_keys,numValues,omp);
+        orderSegmentPairs(logSegLen-1,d_keys,numValues,omp);
+        sortSegments(logSegLen-1,d_keys,numValues,omp);
+      }
+      
+      template<typename key_t, typename value_t>
+      void sortSegments(int logSegLen,
+                        key_t *const d_keys,
+                        value_t *const d_values,
+                        int numValues,
+                        Context *omp)
+      {
+        if (logSegLen == 0) return;
+
+        sortSegments(logSegLen-1,d_keys,d_values,numValues,omp);
+        orderSegmentPairs(logSegLen-1,d_keys,d_values,numValues,omp);
+        sortSegments(logSegLen-1,d_keys,d_values,numValues,omp);
+      }
+      
+      template<typename key_t>
+      void sort(key_t *const d_values,
+                int numValues,
+                Context *omp)
+      
+      {
+        uint32_t logSegLen = 0;
+        while (1<<logSegLen < numValues)
+          logSegLen++;
+        sortSegments(logSegLen,d_values,numValues,omp);
+      }
+      
+      template<typename key_t, typename value_t>
+      void sort(key_t *const d_keys,
+                value_t *const d_values,
+                int numValues,
+                Context *omp)
+      
+      {
+        uint32_t logSegLen = 0;
+        while (1<<logSegLen < numValues)
+          logSegLen++;
+        sortSegments(logSegLen,d_keys,d_values,numValues,omp);
+      }
+      
+    }
+  
+    template<typename key_t>
+    void sort(key_t *const d_values,
+              size_t numValues,
+              Context *omp)
+    {
+      assert(numValues < (1ull<<31));
+      bitonic::sort(d_values,(int)numValues,omp);
+    }
+
+    template<typename key_t, typename value_t>
+    void sort(key_t *const d_keys,
+              value_t *const d_values,
+              size_t numValues,
+              Context *omp)
+    {
+      assert(numValues < (1ull<<31));
+      bitonic::sort(d_keys,d_values,(int)numValues,omp);
+    }
+  }
+}
diff --git a/testing/omp_sort.cpp b/testing/omp_sort.cpp
new file mode 100644
index 0000000..b043581
--- /dev/null
+++ b/testing/omp_sort.cpp
@@ -0,0 +1,40 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "cuBQL/builder/omp/sort.h"
+#include <iostream>
+
+int main(int ac, char **av)
+{
+  cuBQL::omp::Context omp(0);
+  
+  int N = 13;
+  // int N = 123453;
+  std::vector<int> inputs(N);
+  for (int i=0;i<N;i++) {
+    inputs[i] = 90-i;//random() % 100;
+    if (inputs[i] < 10) inputs[i] += 20;
+  }
+
+  int *d_data = 0;
+  omp.alloc_and_upload(d_data,inputs);
+  printf("d_data %p\n",d_data);
+
+// #pragma omp target device(omp.gpuID)
+// #pragma omp teams distribute parallel for
+//   for (int i=0;i<20;i++)
+//     if (1<<i < N)
+//       printf("d_data[%i] = %i\n",1<<i,d_data[1<<i]);
+
+  cuBQL::omp::sort(d_data,N,&omp);
+
+  std::vector<int> results
+    = omp.download_vector(d_data,N);
+  for (int i=1;i<results.size();i++) {
+    PRINT(results[i]);
+    if (results[i-1] > results[i])
+      throw std::runtime_error("Not sorted...");
+  }
+  std::cout << "sorted - perfect!" << std::endl;
+}

From 96eb64fd2d154d066b47c1c57226b5e48afd0bc0 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Fri, 23 Jan 2026 18:32:31 -0700
Subject: [PATCH 03/14] added sorting code and omp skeleton

---
 cuBQL/builder/omp/sort.h | 88 ++++++++++++----------------------------
 testing/omp_sort.cpp     |  9 +---
 2 files changed, 27 insertions(+), 70 deletions(-)

diff --git a/cuBQL/builder/omp/sort.h b/cuBQL/builder/omp/sort.h
index 3e5ca6b..4f1db34 100644
--- a/cuBQL/builder/omp/sort.h
+++ b/cuBQL/builder/omp/sort.h
@@ -4,52 +4,25 @@
 
 #pragma once
 
-#include "cuBQL/builder/omp/common.h"
+#include <omp.h>
 
 namespace cuBQL {
   namespace omp {
     namespace bitonic {
       
       template<typename key_t>
-      void print_all(key_t *const d_values,
-                     int numValues,
-                     int barInterval, int commaInterval,
-                     Context *omp)
-      {
-        std::vector<key_t> v = omp->download_vector(d_values,numValues);
-        for (int i=0;i<numValues;i++) {
-          std::cout << v[i] << " ";
-          if (barInterval && (i+1)%barInterval==0)
-            printf("| ");
-          else if (commaInterval && (i+1)%commaInterval==0)
-            printf(", ");
-        }
-        std::cout << std::endl;
-      }
-      
-
-      template<typename key_t>
-      void g_orderSegmentPairs(Kernel kernel,
+      void g_orderSegmentPairs(uint32_t tid,
                                int logSegLen,
                                key_t *const d_values,
                                int numValues)
-
       {
-        uint32_t tid   = kernel.workIdx();
-        
-        // bool dbg = tid == 1 || tid == 0;
-        
         uint32_t segLen    = 1<<logSegLen;
         uint32_t pairIdx   = tid>>logSegLen;
         uint32_t pairRank  = tid-(pairIdx<<logSegLen);
         uint32_t pairBegin = 2*(pairIdx<<logSegLen);
 
-
         uint32_t r = pairBegin+segLen+pairRank;
         uint32_t l = pairBegin+segLen-1-pairRank;
-        // if (dbg) printf("(%i) pairIdx %i pairLen %i rank %i l %i r %i\n",
-        //                 tid,pairIdx,pairLen,pairRank,l,r);
-                        
                         
         if (r >= numValues) return;
         
@@ -62,17 +35,12 @@ namespace cuBQL {
       }
       
       template<typename key_t, typename value_t>
-      void g_orderSegmentPairs(Kernel kernel,
+      void g_orderSegmentPairs(uint32_t tid,
                                int logSegLen,
                                key_t *const d_keys,
                                value_t *const d_values,
                                int numValues)
-
       {
-        uint32_t tid   = kernel.workIdx();
-        
-        // bool dbg = tid == 1 || tid == 0;
-        
         uint32_t segLen    = 1<<logSegLen;
         uint32_t pairIdx   = tid>>logSegLen;
         uint32_t pairRank  = tid-(pairIdx<<logSegLen);
@@ -81,9 +49,6 @@ namespace cuBQL {
 
         uint32_t r = pairBegin+segLen+pairRank;
         uint32_t l = pairBegin+segLen-1-pairRank;
-        // if (dbg) printf("(%i) pairIdx %i pairLen %i rank %i l %i r %i\n",
-        //                 tid,pairIdx,pairLen,pairRank,l,r);
-                        
                         
         if (r >= numValues) return;
         
@@ -103,13 +68,12 @@ namespace cuBQL {
       void orderSegmentPairs(int logSegLen,
                              key_t *const d_values,
                              int numValues,
-                             Context *omp)
-
+                             uint32_t deviceID)
       {
-#pragma omp target device(omp->gpuID)
+#pragma omp target device(deviceID)
 #pragma omp teams distribute parallel for
         for (int i=0;i<numValues;i++)
-          g_orderSegmentPairs(Kernel{i},logSegLen,
+          g_orderSegmentPairs(i,logSegLen,
                               d_values,numValues);
       }
       
@@ -118,13 +82,12 @@ namespace cuBQL {
                              key_t *const d_keys,
                              value_t *const d_values,
                              int numValues,
-                             Context *omp)
-
+                             uint32_t deviceID)
       {
-#pragma omp target device(omp->gpuID)
+#pragma omp target device(deviceID)
 #pragma omp teams distribute parallel for
         for (int i=0;i<numValues;i++)
-          g_orderSegmentPairs(Kernel{i},logSegLen,
+          g_orderSegmentPairs(i,logSegLen,
                               d_keys,d_values,numValues);
       }
       
@@ -132,7 +95,7 @@ namespace cuBQL {
       void sortSegments(int logSegLen,
                         key_t *const d_keys,
                         int numValues,
-                        Context *omp)
+                        uint32_t deviceID)
       {
         if (logSegLen == 0) return;
 
@@ -146,7 +109,7 @@ namespace cuBQL {
                         key_t *const d_keys,
                         value_t *const d_values,
                         int numValues,
-                        Context *omp)
+                        uint32_t deviceID)
       {
         if (logSegLen == 0) return;
 
@@ -158,8 +121,7 @@ namespace cuBQL {
       template<typename key_t>
       void sort(key_t *const d_values,
                 int numValues,
-                Context *omp)
-      
+                uint32_t deviceID)
       {
         uint32_t logSegLen = 0;
         while (1<<logSegLen < numValues)
@@ -171,34 +133,34 @@ namespace cuBQL {
       void sort(key_t *const d_keys,
                 value_t *const d_values,
                 int numValues,
-                Context *omp)
-      
+                uint32_t deviceID)
       {
         uint32_t logSegLen = 0;
         while (1<<logSegLen < numValues)
           logSegLen++;
-        sortSegments(logSegLen,d_keys,d_values,numValues,omp);
+        sortSegments(logSegLen,d_keys,d_values,numValues,deviceID);
       }
       
     }
   
     template<typename key_t>
-    void sort(key_t *const d_values,
-              size_t numValues,
-              Context *omp)
+    void omp_target_sort(key_t *const d_values,
+                         size_t numValues,
+                         uint32_t deviceID)
     {
       assert(numValues < (1ull<<31));
-      bitonic::sort(d_values,(int)numValues,omp);
+      bitonic::sort(d_values,(int)numValues,deviceID);
     }
-
+    
     template<typename key_t, typename value_t>
-    void sort(key_t *const d_keys,
-              value_t *const d_values,
-              size_t numValues,
-              Context *omp)
+    void omp_target_sort(key_t *const d_keys,
+                         value_t *const d_values,
+                         size_t numValues,
+                         uint32_t deviceID)
     {
       assert(numValues < (1ull<<31));
-      bitonic::sort(d_keys,d_values,(int)numValues,omp);
+      bitonic::sort(d_keys,d_values,(int)numValues,deviceID);
     }
+    
   }
 }
diff --git a/testing/omp_sort.cpp b/testing/omp_sort.cpp
index b043581..8227709 100644
--- a/testing/omp_sort.cpp
+++ b/testing/omp_sort.cpp
@@ -2,6 +2,7 @@
 // CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
+#include "cuBQL/bvh.h"
 #include "cuBQL/builder/omp/sort.h"
 #include <iostream>
 
@@ -21,13 +22,7 @@ int main(int ac, char **av)
   omp.alloc_and_upload(d_data,inputs);
   printf("d_data %p\n",d_data);
 
-// #pragma omp target device(omp.gpuID)
-// #pragma omp teams distribute parallel for
-//   for (int i=0;i<20;i++)
-//     if (1<<i < N)
-//       printf("d_data[%i] = %i\n",1<<i,d_data[1<<i]);
-
-  cuBQL::omp::sort(d_data,N,&omp);
+  cuBQL::omp::omp_target_sort(d_data,N,omp.deviceID);
 
   std::vector<int> results
     = omp.download_vector(d_data,N);

From 08fb0d9cc80df855ba8d1a7d3485d3ba8e6527df Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Fri, 23 Jan 2026 20:28:23 -0700
Subject: [PATCH 04/14] added first draft of builder

---
 cuBQL/builder/omp/common.h        |  27 +++
 cuBQL/builder/omp/refit.h         |  16 +-
 cuBQL/builder/omp/sort.h          | 260 ++++++++++++++--------------
 cuBQL/builder/omp/spatialMedian.h | 277 +++++++++++-------------------
 4 files changed, 261 insertions(+), 319 deletions(-)

diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h
index df0a4ca..261d06f 100644
--- a/cuBQL/builder/omp/common.h
+++ b/cuBQL/builder/omp/common.h
@@ -6,6 +6,7 @@
 
 #include "cuBQL/bvh.h"
 #include <omp.h>
+#include <atomic>
 
 namespace cuBQL {
   namespace omp {
@@ -13,6 +14,11 @@ namespace cuBQL {
     struct Context {
       Context(int gpuID);
 
+      void *alloc(size_t Nelements);
+      
+      template<typename T>
+      void alloc(T *&d_data, size_t Nelements);
+      
       template<typename T>
       void alloc_and_upload(T *&d_data, const T *h_data, size_t Nelements);
       
@@ -21,15 +27,36 @@ namespace cuBQL {
 
       template<typename T>
       std::vector<T> download_vector(const T *d_data, size_t N);
+
+      template<typename T>
+      void download(T &h_value, T *d_value);
+
+      void free(void *);
       
       int gpuID;
       int hostID;
     };
+    
     struct Kernel {
       inline int workIdx() const { return _workIdx; }
       int _workIdx;
     };
 
+    inline uint32_t atomicAdd(uint32_t *p_value, uint32_t inc)
+    {
+      return ((std::atomic<int> *)p_value)->fetch_add(inc); 
+    }
+    
+    inline void atomicMin(uint32_t *p_value, uint32_t other)
+    {
+      uint32_t current = *(volatile uint32_t *)p_value;
+      while (current > other) {
+        bool wasChanged
+          = ((std::atomic<int>*)p_value)
+          ->compare_exchange_weak((int&)current,(int&)other);
+        if (wasChanged) break;
+      }
+    }
 
 
     // ##################################################################
diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h
index 420d30f..d1c8f9a 100644
--- a/cuBQL/builder/omp/refit.h
+++ b/cuBQL/builder/omp/refit.h
@@ -8,16 +8,6 @@
 namespace cuBQL {
   namespace omp {
     
-    struct Context {
-      Context(int gpuID);
-      int gpuID;
-      int hostID;
-    };
-    struct Kernel {
-      inline int workIdx() const { return _workIdx; }
-      int _workIdx;
-    };
-
     template<typename T, int D> inline
     void refit_init(Kernel kernel,
                     const typename BinaryBVH<T,D>::Node *nodes,
@@ -86,13 +76,13 @@ namespace cuBQL {
     {
       int numNodes = bvh.numNodes;
       uint32_t *refitData
-        = (uint32_t*)ctx->malloc(numNodes*sizeof(int));
+        = (uint32_t*)ctx->alloc(numNodes*sizeof(int));
       
-# pragma omp target device(context->gpuID)
+# pragma omp target device(ctx->gpuID)
 # pragma omp teams distribute parallel for
       for (int i=0;i<numNodes;i++)
         refit_init<T,D>(Kernel{i},bvh.nodes,refitData,bvh.numNodes);
-# pragma omp target device(context->gpuID)
+# pragma omp target device(ctx->gpuID)
 # pragma omp teams distribute parallel for
       for (int i=0;i<numNodes;i++)
         refit_run(Kernel{i},bvh,refitData,boxes);
diff --git a/cuBQL/builder/omp/sort.h b/cuBQL/builder/omp/sort.h
index 4f1db34..724c0be 100644
--- a/cuBQL/builder/omp/sort.h
+++ b/cuBQL/builder/omp/sort.h
@@ -1,166 +1,168 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
-// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// imported from github.com:ingowald/openmp_target_sort under this license:
+
+// SPDX-FileCopyrightText: Copyright (c) Ingo Wald
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
 
 #include <omp.h>
 
-namespace cuBQL {
-  namespace omp {
-    namespace bitonic {
+namespace omp {
+  namespace bitonic {
       
-      template<typename key_t>
-      void g_orderSegmentPairs(uint32_t tid,
-                               int logSegLen,
-                               key_t *const d_values,
-                               int numValues)
-      {
-        uint32_t segLen    = 1<<logSegLen;
-        uint32_t pairIdx   = tid>>logSegLen;
-        uint32_t pairRank  = tid-(pairIdx<<logSegLen);
-        uint32_t pairBegin = 2*(pairIdx<<logSegLen);
+    template<typename key_t>
+    void g_orderSegmentPairs(uint32_t tid,
+                             int logSegLen,
+                             key_t *const d_values,
+                             int numValues)
+    {
+      uint32_t segLen    = 1<<logSegLen;
+      uint32_t pairIdx   = tid>>logSegLen;
+      uint32_t pairRank  = tid-(pairIdx<<logSegLen);
+      uint32_t pairBegin = 2*(pairIdx<<logSegLen);
 
-        uint32_t r = pairBegin+segLen+pairRank;
-        uint32_t l = pairBegin+segLen-1-pairRank;
+      uint32_t r = pairBegin+segLen+pairRank;
+      uint32_t l = pairBegin+segLen-1-pairRank;
                         
-        if (r >= numValues) return;
+      if (r >= numValues) return;
         
-        key_t lv = d_values[l];
-        key_t rv = d_values[r];
-        if (rv < lv) {
-          d_values[r] = lv;
-          d_values[l] = rv;
-        }
+      key_t lv = d_values[l];
+      key_t rv = d_values[r];
+      if (rv < lv) {
+        d_values[r] = lv;
+        d_values[l] = rv;
       }
+    }
       
-      template<typename key_t, typename value_t>
-      void g_orderSegmentPairs(uint32_t tid,
-                               int logSegLen,
-                               key_t *const d_keys,
-                               value_t *const d_values,
-                               int numValues)
-      {
-        uint32_t segLen    = 1<<logSegLen;
-        uint32_t pairIdx   = tid>>logSegLen;
-        uint32_t pairRank  = tid-(pairIdx<<logSegLen);
-        uint32_t pairBegin = 2*(pairIdx<<logSegLen);
+    template<typename key_t, typename value_t>
+    void g_orderSegmentPairs(uint32_t tid,
+                             int logSegLen,
+                             key_t *const d_keys,
+                             value_t *const d_values,
+                             int numValues)
+    {
+      uint32_t segLen    = 1<<logSegLen;
+      uint32_t pairIdx   = tid>>logSegLen;
+      uint32_t pairRank  = tid-(pairIdx<<logSegLen);
+      uint32_t pairBegin = 2*(pairIdx<<logSegLen);
 
 
-        uint32_t r = pairBegin+segLen+pairRank;
-        uint32_t l = pairBegin+segLen-1-pairRank;
+      uint32_t r = pairBegin+segLen+pairRank;
+      uint32_t l = pairBegin+segLen-1-pairRank;
                         
-        if (r >= numValues) return;
+      if (r >= numValues) return;
         
-        key_t lk = d_keys[l];
-        key_t rk = d_keys[r];
-        key_t lv = d_values[l];
-        key_t rv = d_values[r];
-        if (rk < lk) {
-          d_values[r] = lv;
-          d_values[l] = rv;
-          d_keys[r]   = lk;
-          d_keys[l]   = rk;
-        }
+      key_t lk = d_keys[l];
+      key_t rk = d_keys[r];
+      key_t lv = d_values[l];
+      key_t rv = d_values[r];
+      if (rk < lk) {
+        d_values[r] = lv;
+        d_values[l] = rv;
+        d_keys[r]   = lk;
+        d_keys[l]   = rk;
       }
+    }
       
-      template<typename key_t>
-      void orderSegmentPairs(int logSegLen,
-                             key_t *const d_values,
-                             int numValues,
-                             uint32_t deviceID)
-      {
+    template<typename key_t>
+    void orderSegmentPairs(int logSegLen,
+                           key_t *const d_values,
+                           int numValues,
+                           uint32_t deviceID)
+    {
 #pragma omp target device(deviceID)
 #pragma omp teams distribute parallel for
-        for (int i=0;i<numValues;i++)
-          g_orderSegmentPairs(i,logSegLen,
-                              d_values,numValues);
-      }
+      for (int i=0;i<numValues;i++)
+        g_orderSegmentPairs(i,logSegLen,
+                            d_values,numValues);
+    }
       
-      template<typename key_t, typename value_t>
-      void orderSegmentPairs(int logSegLen,
-                             key_t *const d_keys,
-                             value_t *const d_values,
-                             int numValues,
-                             uint32_t deviceID)
-      {
+    template<typename key_t, typename value_t>
+    void orderSegmentPairs(int logSegLen,
+                           key_t *const d_keys,
+                           value_t *const d_values,
+                           int numValues,
+                           uint32_t deviceID)
+    {
 #pragma omp target device(deviceID)
 #pragma omp teams distribute parallel for
-        for (int i=0;i<numValues;i++)
-          g_orderSegmentPairs(i,logSegLen,
-                              d_keys,d_values,numValues);
-      }
+      for (int i=0;i<numValues;i++)
+        g_orderSegmentPairs(i,logSegLen,
+                            d_keys,d_values,numValues);
+    }
       
-      template<typename key_t>
-      void sortSegments(int logSegLen,
-                        key_t *const d_keys,
-                        int numValues,
-                        uint32_t deviceID)
-      {
-        if (logSegLen == 0) return;
+    template<typename key_t>
+    void sortSegments(int logSegLen,
+                      key_t *const d_keys,
+                      int numValues,
+                      uint32_t deviceID)
+    {
+      if (logSegLen == 0) return;
 
-        sortSegments(logSegLen-1,d_keys,numValues,omp);
-        orderSegmentPairs(logSegLen-1,d_keys,numValues,omp);
-        sortSegments(logSegLen-1,d_keys,numValues,omp);
-      }
+      sortSegments(logSegLen-1,d_keys,numValues,deviceID);
+      orderSegmentPairs(logSegLen-1,d_keys,numValues,deviceID);
+      sortSegments(logSegLen-1,d_keys,numValues,deviceID);
+    }
       
-      template<typename key_t, typename value_t>
-      void sortSegments(int logSegLen,
-                        key_t *const d_keys,
-                        value_t *const d_values,
-                        int numValues,
-                        uint32_t deviceID)
-      {
-        if (logSegLen == 0) return;
+    template<typename key_t, typename value_t>
+    void sortSegments(int logSegLen,
+                      key_t *const d_keys,
+                      value_t *const d_values,
+                      int numValues,
+                      uint32_t deviceID)
+    {
+      if (logSegLen == 0) return;
 
-        sortSegments(logSegLen-1,d_keys,d_values,numValues,omp);
-        orderSegmentPairs(logSegLen-1,d_keys,d_values,numValues,omp);
-        sortSegments(logSegLen-1,d_keys,d_values,numValues,omp);
-      }
-      
-      template<typename key_t>
-      void sort(key_t *const d_values,
-                int numValues,
-                uint32_t deviceID)
-      {
-        uint32_t logSegLen = 0;
-        while (1<<logSegLen < numValues)
-          logSegLen++;
-        sortSegments(logSegLen,d_values,numValues,omp);
-      }
-      
-      template<typename key_t, typename value_t>
-      void sort(key_t *const d_keys,
-                value_t *const d_values,
-                int numValues,
-                uint32_t deviceID)
-      {
-        uint32_t logSegLen = 0;
-        while (1<<logSegLen < numValues)
-          logSegLen++;
-        sortSegments(logSegLen,d_keys,d_values,numValues,deviceID);
-      }
-      
+      sortSegments(logSegLen-1,d_keys,d_values,numValues,deviceID);
+      orderSegmentPairs(logSegLen-1,d_keys,d_values,numValues,deviceID);
+      sortSegments(logSegLen-1,d_keys,d_values,numValues,deviceID);
     }
-  
+      
     template<typename key_t>
-    void omp_target_sort(key_t *const d_values,
-                         size_t numValues,
-                         uint32_t deviceID)
+    void sort(key_t *const d_values,
+              int numValues,
+              uint32_t deviceID)
     {
-      assert(numValues < (1ull<<31));
-      bitonic::sort(d_values,(int)numValues,deviceID);
+      uint32_t logSegLen = 0;
+      while (1<<logSegLen < numValues)
+        logSegLen++;
+      sortSegments(logSegLen,d_values,numValues,deviceID);
     }
-    
+      
     template<typename key_t, typename value_t>
-    void omp_target_sort(key_t *const d_keys,
-                         value_t *const d_values,
-                         size_t numValues,
-                         uint32_t deviceID)
+    void sort(key_t *const d_keys,
+              value_t *const d_values,
+              int numValues,
+              uint32_t deviceID)
     {
-      assert(numValues < (1ull<<31));
-      bitonic::sort(d_keys,d_values,(int)numValues,deviceID);
+      uint32_t logSegLen = 0;
+      while (1<<logSegLen < numValues)
+        logSegLen++;
+      sortSegments(logSegLen,d_keys,d_values,numValues,deviceID);
     }
+      
+  }
+  
+  template<typename key_t>
+  void omp_target_sort(key_t *const d_values,
+                       size_t numValues,
+                       uint32_t deviceID)
+  {
+    assert(numValues < (1ull<<31));
+    bitonic::sort(d_values,(int)numValues,deviceID);
+  }
     
+  template<typename key_t, typename value_t>
+  void omp_target_sort(key_t *const d_keys,
+                       value_t *const d_values,
+                       size_t numValues,
+                       uint32_t deviceID)
+  {
+    assert(numValues < (1ull<<31));
+    bitonic::sort(d_keys,d_values,(int)numValues,deviceID);
   }
+    
 }
diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h
index 7dea853..4c9f63a 100644
--- a/cuBQL/builder/omp/spatialMedian.h
+++ b/cuBQL/builder/omp/spatialMedian.h
@@ -4,10 +4,21 @@
 #pragma once
 
 #include "cuBQL/builder/omp/refit.h"
+#include "cuBQL/builder/omp/sort.h"
 
 namespace cuBQL {
   namespace omp {
 
+    template<typename box_t>
+    struct AtomicBox : public box_t {
+    };
+
+    template<typename box_t>
+    void atomic_grow(AtomicBox<box_t> &ab, typename box_t::vec_t P);
+    
+    template<typename box_t>
+    void atomic_grow(AtomicBox<box_t> &ab, box_t B);
+    
     struct PrimState {
       union {
         /* careful with this order - this is intentionally chosen such
@@ -22,6 +33,8 @@ namespace cuBQL {
       };
     };
 
+    typedef enum : int8_t { OPEN_BRANCH, OPEN_NODE, DONE_NODE } NodeState;
+    
     template<typename T, int D>
     struct CUBQL_ALIGN(16) TempNode {
       using box_t = cuBQL::box_t<T,D>;
@@ -46,12 +59,11 @@ namespace cuBQL {
     };
     
     template<typename T, int D>
-    __global__
-    void initState(BuildState      *buildState,
-                   NodeState       *nodeStates,
+    void initState(uint32_t      *pNumNodes,
+                   NodeState     *nodeStates,
                    TempNode<T,D> *nodes)
     {
-      buildState->numNodes = 2;
+      *pNumNodes = 2;
       
       nodeStates[0]             = OPEN_BRANCH;
       nodes[0].openBranch.count = 0;
@@ -63,12 +75,13 @@ namespace cuBQL {
     }
 
     template<typename T, int D>
-    __global__ void initPrims(TempNode<T,D> *nodes,
-                              PrimState       *primState,
-                              const box_t<T,D>     *primBoxes,
-                              uint32_t         numPrims)
+    void initPrims(Kernel kernel,
+                   TempNode<T,D>    *nodes,
+                   PrimState        *primState,
+                   const box_t<T,D> *primBoxes,
+                   uint32_t          numPrims)
     {
-      const int primID = threadIdx.x+blockIdx.x*blockDim.x;
+      const int primID = kernel.workIdx();
       if (primID >= numPrims) return;
       
       auto &me = primState[primID];
@@ -88,105 +101,14 @@ namespace cuBQL {
     }
 
     template<typename T, int D>
-    __global__
-    void selectSplits(BuildState    *buildState,
+    void selectSplits(Kernel kernel,
+                      uint32_t      *pNumNodes,
                       NodeState     *nodeStates,
                       TempNode<T,D> *nodes,
                       uint32_t       numNodes,
                       BuildConfig    buildConfig)
     {
-#if 1
-      __shared__ int l_newNodeOfs;
-      if (threadIdx.x == 0)
-        l_newNodeOfs = 0;
-      __syncthreads();
-
-      int *t_nodeOffsetToWrite = 0;
-      int  t_localOffsetToAdd = 0;
-
-      while (true) {
-        const int nodeID = threadIdx.x+blockIdx.x*blockDim.x;
-        if (nodeID >= numNodes)
-          break;
-        
-        NodeState &nodeState = nodeStates[nodeID];
-        if (nodeState == DONE_NODE)
-          // this node was already closed before
-          break;
-        
-        if (nodeState == OPEN_NODE) {
-          // this node was open in the last pass, can close it.
-          nodeState   = DONE_NODE;
-          int offset  = nodes[nodeID].openNode.offset;
-          auto &done  = nodes[nodeID].doneNode;
-          done.count  = 0;
-          done.offset = offset;
-          break;
-        }
-        
-        auto in = nodes[nodeID].openBranch;
-        if (in.count <= buildConfig.makeLeafThreshold) {
-          auto &done  = nodes[nodeID].doneNode;
-          done.count  = in.count;
-          // set this to max-value, so the prims can later do atomicMin
-          // with their position ion the leaf list; this value is
-          // greater than any prim position.
-          done.offset = (uint32_t)-1;
-          nodeState   = DONE_NODE;
-        } else {
-          float widestWidth = 0.f;
-          int   widestDim   = -1;
-          float widestLo, widestHi, widestCtr;
-#pragma unroll
-          for (int d=0;d<D;d++) {
-            float lo = in.centBounds.get_lower(d);
-            float hi = in.centBounds.get_upper(d);
-            float width = hi - lo;
-            if (width <= widestWidth)
-              continue;
-            float ctr = 0.5f*(hi+lo);
-            
-            widestWidth = width;
-            widestDim   = d;
-            widestLo = lo;
-            widestHi = hi;
-            widestCtr = ctr;
-          }
-          
-          auto &open = nodes[nodeID].openNode;
-          if (widestDim >= 0) {
-            open.pos = widestCtr;
-          }
-          open.dim
-            = (widestDim < 0 || widestCtr == widestLo || widestCtr == widestHi)
-            ? -1
-            : widestDim;
-          
-          // this will be epensive - could make this faster by block-reducing
-          // open.offset = atomicAdd(&buildState->numNodes,2);
-          t_nodeOffsetToWrite = (int*)&open.offset;
-          t_localOffsetToAdd = atomicAdd(&l_newNodeOfs,2);
-          nodeState = OPEN_NODE;
-        }
-        break;
-      }
-      __syncthreads();
-      if (threadIdx.x == 0 && l_newNodeOfs > 0)
-        l_newNodeOfs = atomicAdd(&buildState->numNodes,l_newNodeOfs);
-      __syncthreads();
-      if (t_nodeOffsetToWrite) {
-        int openOffset = *t_nodeOffsetToWrite = l_newNodeOfs + t_localOffsetToAdd;
-#pragma unroll
-          for (int side=0;side<2;side++) {
-            const int childID = openOffset+side;
-            auto &child = nodes[childID].openBranch;
-            child.centBounds.set_empty();
-            child.count         = 0;
-            nodeStates[childID] = OPEN_BRANCH;
-          }
-      }
-#else
-      const int nodeID = threadIdx.x+blockIdx.x*blockDim.x;
+      const int nodeID = kernel.workIdx();//threadIdx.x+blockIdx.x*blockDim.x;
       if (nodeID >= numNodes) return;
 
       NodeState &nodeState = nodeStates[nodeID];
@@ -242,8 +164,7 @@ namespace cuBQL {
           ? -1
           : widestDim;
         
-        // this will be epensive - could make this faster by block-reducing
-        open.offset = atomicAdd(&buildState->numNodes,2);
+        open.offset = atomicAdd(pNumNodes,2);
 #pragma unroll
         for (int side=0;side<2;side++) {
           const int childID = open.offset+side;
@@ -254,18 +175,17 @@ namespace cuBQL {
         }
         nodeState = OPEN_NODE;
       }
-#endif
     }
 
     template<typename T, int D>
-    __global__
-    void updatePrims(NodeState       *nodeStates,
+    void updatePrims(Kernel kernel,
+                     NodeState       *nodeStates,
                      TempNode<T,D> *nodes,
                      PrimState       *primStates,
                      const box_t<T,D>     *primBoxes,
                      int numPrims)
     {
-      const int primID = threadIdx.x+blockIdx.x*blockDim.x;
+      const int primID = kernel.workIdx();
       if (primID >= numPrims) return;
 
       const auto me = primStates[primID];
@@ -302,13 +222,13 @@ namespace cuBQL {
        the nodes[] array, the node.offset value to point to the first
        of this nodes' items in that bvh.primIDs[] list. */
     template<typename T, int D>
-    __global__
-    void writePrimsAndLeafOffsets(TempNode<T,D> *nodes,
+    void writePrimsAndLeafOffsets(Kernel kernel,
+                                  TempNode<T,D> *nodes,
                                   uint32_t        *bvhItemList,
                                   PrimState       *primStates,
                                   int              numPrims)
     {
-      const int offset = threadIdx.x+blockIdx.x*blockDim.x;
+      const int offset = kernel.workIdx();//threadIdx.x+blockIdx.x*blockDim.x;
       if (offset >= numPrims) return;
 
       auto &ps = primStates[offset];
@@ -324,12 +244,12 @@ namespace cuBQL {
     /* writes main phase's temp nodes into final bvh.nodes[]
        layout. actual bounds of that will NOT yet bewritten */
     template<typename T, int D>
-    __global__
-    void writeNodes(typename BinaryBVH<T,D>::Node *finalNodes,
+    void writeNodes(Kernel kernel,
+                    typename BinaryBVH<T,D>::Node *finalNodes,
                     TempNode<T,D>  *tempNodes,
                     int        numNodes)
     {
-      const int nodeID = threadIdx.x+blockIdx.x*blockDim.x;
+      const int nodeID = kernel.workIdx();
       if (nodeID >= numNodes) return;
 
       finalNodes[nodeID].admin.offset = tempNodes[nodeID].doneNode.offset;
@@ -338,12 +258,12 @@ namespace cuBQL {
 
     
     template<typename T, int D>
-    void build(BinaryBVH<T,D>    &bvh,
-               const box_t<T,D>  *boxes,
-               int                numPrims,
-               BuildConfig        buildConfig,
-               cudaStream_t       s,
-               GpuMemoryResource &memResource)
+    void spatialMedian(BinaryBVH<T,D> &bvh,
+                       /*! DEVICE array of boxes */
+                       const box_t<T,D> *boxes,
+                       uint32_t          numPrims,
+                       BuildConfig       buildConfig,
+                       Context          *ctx)
     {
       assert(sizeof(PrimState) == sizeof(uint64_t));
       
@@ -353,91 +273,94 @@ namespace cuBQL {
       TempNode<T,D> *tempNodes = 0;
       NodeState     *nodeStates = 0;
       PrimState     *primStates = 0;
-      BuildState    *buildState = 0;
-      ctx->malloc(tempNodes,2*numPrims);
-      ctx->malloc(nodeStates,2*numPrims);
-      ctx->malloc(primStates,numPrims);
-      ctx->malloc(buildState,1);
-      initState<<<1,1,0,s>>>(buildState,
-                             nodeStates,
-                             tempNodes);
-      initPrims<<<divRoundUp(numPrims,1024),1024,0,s>>>
-        (tempNodes,
-         primStates,boxes,numPrims);
+      uint32_t      *d_numNodes = 0;
+      ctx->alloc(tempNodes,2*numPrims);
+      ctx->alloc(nodeStates,2*numPrims);
+      ctx->alloc(primStates,numPrims);
+      ctx->alloc(d_numNodes,1);
+#pragma omp target device(ctx->gpuID)
+#pragma omp teams distribute parallel for
+      for (int tid=0;tid<1;tid++)
+        initState(d_numNodes,
+                  nodeStates,
+                  tempNodes);
+#pragma omp target device(ctx->gpuID)
+#pragma omp teams distribute parallel for
+      for (int tid=0;tid<numPrims;tid++)
+        initPrims(Kernel{tid},tempNodes,
+                  primStates,boxes,numPrims);
 
       int numDone = 0;
-      int numNodes;
+      uint32_t numNodes;
 
       // ------------------------------------------------------------------      
-      cudaEvent_t stateDownloadedEvent;
-      CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent));
-      
-      
       while (true) {
-        CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes,
-                                    sizeof(numNodes),cudaMemcpyDeviceToHost,s));
-        CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
-        CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
+        ctx->download(numNodes,d_numNodes);
         if (numNodes == numDone)
           break;
-        selectSplits<<<divRoundUp(numNodes,1024),1024,0,s>>>
-          (buildState,
-           nodeStates,tempNodes,numNodes,
-           buildConfig);
+#pragma omp target device(ctx->gpuID)
+#pragma omp teams distribute parallel for
+        for (int tid=0;tid<numNodes;tid++)
+          selectSplits(Kernel{tid},
+                       d_numNodes,
+                       nodeStates,tempNodes,numNodes,
+                       buildConfig);
         numDone = numNodes;
 
-        updatePrims<<<divRoundUp(numPrims,1024),1024,0,s>>>
-          (nodeStates,tempNodes,
-           primStates,boxes,numPrims);
+#pragma omp target device(ctx->gpuID)
+#pragma omp teams distribute parallel for
+        for (int tid=0;tid<numPrims;tid++)
+          updatePrims(Kernel{tid},
+                      nodeStates,tempNodes,
+                      primStates,boxes,numPrims);
       }
-      CUBQL_CUDA_CALL(EventDestroy(stateDownloadedEvent));
       // ==================================================================
       // sort {item,nodeID} list
       // ==================================================================
       
-      // set up sorting of prims
-      uint8_t   *d_temp_storage     = NULL;
-      size_t     temp_storage_bytes = 0;
-      PrimState *sortedPrimStates   = 0;
-      ctx->malloc(sortedPrimStates,numPrims);
-      auto rc =
-        cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes,
-                                     (uint64_t*)primStates,
-                                     (uint64_t*)sortedPrimStates,
-                                     numPrims,32,64,s);
-      ctx->malloc(d_temp_storage,temp_storage_bytes);
-      rc =
-        cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes,
-                                       (uint64_t*)primStates,
-                                       (uint64_t*)sortedPrimStates,
-                                       numPrims,32,64,s);
-      rc = rc;
-      ctx->free(d_temp_storage);
+      ::omp::omp_target_sort((uint64_t*)primStates,numPrims,ctx->gpuID);
+      
       // ==================================================================
       // allocate and write BVH item list, and write offsets of leaf nodes
       // ==================================================================
 
       bvh.numPrims = numPrims;
-      ctx->malloc(bvh.primIDs,numPrims);
-      writePrimsAndLeafOffsets<<<divRoundUp(numPrims,1024),1024,0,s>>>
-        (tempNodes,bvh.primIDs,sortedPrimStates,numPrims);
-
+      ctx->alloc(bvh.primIDs,numPrims);
+#pragma omp target device(ctx->gpuID)
+#pragma omp teams distribute parallel for
+      for (int tid=0;tid<numPrims;tid++)
+        writePrimsAndLeafOffsets(Kernel{tid},
+                                 tempNodes,
+                                 bvh.primIDs,primStates,numPrims);
+      
       // ==================================================================
       // allocate and write final nodes
       // ==================================================================
       bvh.numNodes = numNodes;
-      ctx->malloc(bvh.nodes,numNodes);
-      writeNodes<<<divRoundUp(numNodes,1024),1024,0,s>>>
-        (bvh.nodes,tempNodes,numNodes);
-      ctx->free(sortedPrimStates);
+      ctx->alloc(bvh.nodes,numNodes);
+#pragma omp target device(ctx->gpuID)
+#pragma omp teams distribute parallel for
+      for (int tid=0;tid<numNodes;tid++)
+        writeNodes(Kernel{tid},bvh.nodes,tempNodes,numNodes);
       ctx->free(tempNodes);
       ctx->free(nodeStates);
       ctx->free(primStates);
-      ctx->free(buildState);
+      ctx->free(d_numNodes);
 
-      refit(bvh);
+      cuBQL::omp::refit(bvh,boxes,ctx);
     }
     
   }
+  
+  template<typename T, int D>
+  void build_omp_target(BinaryBVH<T,D> &bvh,
+                        const box_t<T,D> *d_boxes,
+                        uint32_t          numBoxes,
+                        BuildConfig       buildConfig,
+                        int               gpuID)
+  {
+    omp::Context ctx(gpuID);
+    omp::spatialMedian(bvh,d_boxes,numBoxes,buildConfig,&ctx);
+  }
 }
 

From 1ffa5f1f108c510d10ab7214513d8256c766231d Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sat, 24 Jan 2026 13:54:57 -0700
Subject: [PATCH 05/14] first draft of builder that compiles; atomics realized
 via CAS

---
 cuBQL/builder/omp/common.h        |  53 +++++++++++-
 cuBQL/builder/omp/spatialMedian.h | 133 +++++++++++++++++++++++++++++-
 2 files changed, 181 insertions(+), 5 deletions(-)

diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h
index 261d06f..1eae7e7 100644
--- a/cuBQL/builder/omp/common.h
+++ b/cuBQL/builder/omp/common.h
@@ -14,7 +14,7 @@ namespace cuBQL {
     struct Context {
       Context(int gpuID);
 
-      void *alloc(size_t Nelements);
+      void *alloc(size_t numBytes);
       
       template<typename T>
       void alloc(T *&d_data, size_t Nelements);
@@ -70,8 +70,11 @@ namespace cuBQL {
       printf("#cuBQL:omp:Context(gpu=%i/%i,host=%i)\n",
              gpuID,omp_get_num_devices(),hostID);
     }
+
+    void *Context::alloc(size_t numBytes)
+    { return omp_target_alloc(numBytes,gpuID); }
     
-    template<typename T>
+    template<typename T> inline
     void Context::alloc_and_upload(T *&d_data,
                                    const T *h_data,
                                    size_t N)
@@ -84,7 +87,7 @@ namespace cuBQL {
                         0,0,gpuID,hostID);
     }
       
-    template<typename T>
+    template<typename T> inline
     void Context::alloc_and_upload(T *&d_data,
                                    const std::vector<T> &h_vector)
     { alloc_and_upload(d_data,h_vector.data(),h_vector.size()); }
@@ -97,6 +100,50 @@ namespace cuBQL {
                         0,0,hostID,gpuID);
       return out;
     }
+
+    inline void Context::free(void *ptr)
+    { omp_target_free(ptr,gpuID); }
+
+    template<typename T> inline
+    void Context::alloc(T *&d_data, size_t N)
+    {
+      d_data = (T*)omp_target_alloc(N*sizeof(T),gpuID);
+    }
+      
+    // template<typename T> inline
+    // void Context::alloc_and_upload(T *&d_data,
+    //                                const T *h_data,
+    //                                size_t N)
+    // {
+    //   alloc(d_data,N);
+    //   upload(d_data,h_data,N);
+    // }
+      
+    // template<typename T> inline
+    // void Context::alloc_and_upload(T *&d_data,
+    //                                const std::vector<T> &h_vector)
+    // {
+    //   alloc(d_data,h_vector.size());
+    //   upload(d_data,h_vector);
+    // }
+
+    // template<typename T> inline
+    // std::vector<T> Context::download_vector(const T *d_data,
+    //                                         size_t N)
+    // {
+    //   std::vector<T> vec(N);
+    //   omp_target_memcpy(vec.data(),d_data,N*sizeof(T),
+    //                     0,0,hostID,gpuID);
+    //   return vec;
+    // }
+
+    template<typename T>
+    inline void Context::download(T &h_value, T *d_value)
+    {
+      omp_target_memcpy(&h_value,d_value,sizeof(T),
+                        0,0,hostID,gpuID);
+    }
+
     
   } // ::cuBQL::omp
 } // ::cuBQL
diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h
index 4c9f63a..a07dd65 100644
--- a/cuBQL/builder/omp/spatialMedian.h
+++ b/cuBQL/builder/omp/spatialMedian.h
@@ -13,11 +13,140 @@ namespace cuBQL {
     struct AtomicBox : public box_t {
     };
 
+    template<typename T>
+    void atomic_min(T *ptr, T v);
+    template<typename T>
+    void atomic_max(T *ptr, T v);
+    
+    /*! iw - note: this implementation of atomic min/max via atomic
+        compare-exchange (CAS); which is cetainly not optimal on any
+        sort of modern GPU - but it works in any C++-21 compliant
+        compiler, so it's what we do for now */
+    void atomic_min(float *ptr, float value)
+    {
+      float current = *(volatile float *)ptr;
+      while (current > value) {
+        bool wasChanged
+          = ((std::atomic<int>*)ptr)
+          ->compare_exchange_weak((int&)current,(int&)value);
+        if (wasChanged) break;
+      }
+    }
+    
+    /*! iw - note: this implementation of atomic min/max via atomic
+        compare-exchange (CAS); which is cetainly not optimal on any
+        sort of modern GPU - but it works in any C++-21 compliant
+        compiler, so it's what we do for now */
+    void atomic_max(float *ptr, float value)
+    {
+      float current = *(volatile float *)ptr;
+      while (current > value) {
+        bool wasChanged
+          = ((std::atomic<int>*)ptr)
+          ->compare_exchange_weak((int&)current,(int&)value);
+        if (wasChanged) break;
+      }
+    }
+    
+    /*! iw - note: this implementation of atomic min/max via atomic
+        compare-exchange (CAS); which is cetainly not optimal on any
+        sort of modern GPU - but it works in any C++-21 compliant
+        compiler, so it's what we do for now */
+    void atomic_min(double *ptr, double value)
+    {
+      double current = *(volatile double *)ptr;
+      while (current > value) {
+        bool wasChanged
+          = ((std::atomic<long long int>*)ptr)
+          ->compare_exchange_weak((long long int&)current,
+                                  (long long int&)value);
+        if (wasChanged) break;
+      }
+    }
+    
+    /*! iw - note: this implementation of atomic min/max via atomic
+        compare-exchange (CAS); which is cetainly not optimal on any
+        sort of modern GPU - but it works in any C++-21 compliant
+        compiler, so it's what we do for now */
+    void atomic_max(double *ptr, double value)
+    {
+      double current = *(volatile double *)ptr;
+      while (current > value) {
+        bool wasChanged
+          = ((std::atomic<long long int>*)ptr)
+          ->compare_exchange_weak((long long int&)current,
+                                  (long long int&)value);
+        if (wasChanged) break;
+      }
+    }
+    
+    template<typename T, int D>
+    void v_atomic_min(vec_t<T,D> *ptr, vec_t<T,D> v);
+    template<typename T, int D>
+    void v_atomic_max(vec_t<T,D> *ptr, vec_t<T,D> v);
+    
+
+    template<typename T>
+    void v_atomic_min(vec_t<T,2> *ptr, vec_t<T,2> v)
+    {
+      atomic_min(&ptr->x,v.x); 
+      atomic_min(&ptr->y,v.y);
+    }
+    
+    template<typename T>
+    void v_atomic_min(vec_t<T,3> *ptr, vec_t<T,3> v)
+    {
+      atomic_min(&ptr->x,v.x); 
+      atomic_min(&ptr->y,v.y);
+      atomic_min(&ptr->z,v.z);
+    }
+    
+    template<typename T>
+    void v_atomic_min(vec_t<T,4> *ptr, vec_t<T,4> v)
+    {
+      atomic_min(&ptr->x,v.x); 
+      atomic_min(&ptr->y,v.y);
+      atomic_min(&ptr->z,v.z);
+      atomic_min(&ptr->w,v.w);
+    }
+
+    template<typename T>
+    void v_atomic_max(vec_t<T,2> *ptr, vec_t<T,2> v)
+    {
+      atomic_max(&ptr->x,v.x); 
+      atomic_max(&ptr->y,v.y);
+    }
+    
+    template<typename T>
+    void v_atomic_max(vec_t<T,3> *ptr, vec_t<T,3> v)
+    {
+      atomic_max(&ptr->x,v.x); 
+      atomic_max(&ptr->y,v.y);
+      atomic_max(&ptr->z,v.z);
+    }
+    
+    template<typename T>
+    void v_atomic_max(vec_t<T,4> *ptr, vec_t<T,4> v)
+    {
+      atomic_max(&ptr->x,v.x); 
+      atomic_max(&ptr->y,v.y);
+      atomic_max(&ptr->z,v.z);
+      atomic_max(&ptr->w,v.w);
+    }
+    
     template<typename box_t>
-    void atomic_grow(AtomicBox<box_t> &ab, typename box_t::vec_t P);
+    void atomic_grow(AtomicBox<box_t> &ab, typename box_t::vec_t P)
+    {
+      v_atomic_min(&ab.lower,P);
+      v_atomic_max(&ab.upper,P);
+    }
     
     template<typename box_t>
-    void atomic_grow(AtomicBox<box_t> &ab, box_t B);
+    void atomic_grow(AtomicBox<box_t> &ab, box_t B)
+    {
+      v_atomic_min(&ab.lower,B.lower);
+      v_atomic_max(&ab.upper,B.upper);
+    }
     
     struct PrimState {
       union {

From 0ac41c8339dfe3faccfd267b405b71cc29144e3c Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Mon, 26 Jan 2026 13:23:11 -0700
Subject: [PATCH 06/14] omp refit now working, but only with single thread per
 team; probably compiler issue

---
 cuBQL/builder/omp/AtomicBox.h     | 157 +++++++++++++++++++
 cuBQL/builder/omp/common.h        |  24 ++-
 cuBQL/builder/omp/refit.h         | 115 ++++++++++----
 cuBQL/builder/omp/sort.h          |  23 ++-
 cuBQL/builder/omp/spatialMedian.h | 250 +++++++++++-------------------
 cuBQL/math/vec.h                  |  59 ++++---
 6 files changed, 414 insertions(+), 214 deletions(-)
 create mode 100644 cuBQL/builder/omp/AtomicBox.h

diff --git a/cuBQL/builder/omp/AtomicBox.h b/cuBQL/builder/omp/AtomicBox.h
new file mode 100644
index 0000000..4ee90cd
--- /dev/null
+++ b/cuBQL/builder/omp/AtomicBox.h
@@ -0,0 +1,157 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "cuBQL/builder/omp/common.h"
+
+namespace cuBQL {
+  namespace omp {
+
+    template<typename box_t>
+    struct AtomicBox : public box_t {
+      
+      inline void set_empty()
+      {
+        *(box_t *)this = box_t();
+      }
+    };
+
+    template<typename T>
+    inline void atomic_min(T *ptr, T v);
+    template<typename T>
+    inline void atomic_max(T *ptr, T v);
+    
+    /*! iw - note: this implementation of atomic min/max via atomic
+        compare-exchange (CAS); which is cetainly not optimal on any
+        sort of modern GPU - but it works in any C++-21 compliant
+        compiler, so it's what we do for now */
+    inline void atomic_min(float *ptr, float value)
+    {
+      float current = *(volatile float *)ptr;
+      while (current > value) {
+        bool wasChanged
+          = ((std::atomic<int>*)ptr)
+          ->compare_exchange_weak((int&)current,(int&)value);
+        if (wasChanged) break;
+      }
+    }
+    
+    /*! iw - note: this implementation of atomic min/max via atomic
+        compare-exchange (CAS); which is cetainly not optimal on any
+        sort of modern GPU - but it works in any C++-21 compliant
+        compiler, so it's what we do for now */
+    inline void atomic_max(float *ptr, float value)
+    {
+      float current = *(volatile float *)ptr;
+      while (current < value) {
+        bool wasChanged
+          = ((std::atomic<int>*)ptr)
+          ->compare_exchange_weak((int&)current,(int&)value);
+        if (wasChanged) break;
+      }
+    }
+    
+    /*! iw - note: this implementation of atomic min/max via atomic
+        compare-exchange (CAS); which is cetainly not optimal on any
+        sort of modern GPU - but it works in any C++-21 compliant
+        compiler, so it's what we do for now */
+    inline void atomic_min(double *ptr, double value)
+    {
+      double current = *(volatile double *)ptr;
+      while (current > value) {
+        bool wasChanged
+          = ((std::atomic<long long int>*)ptr)
+          ->compare_exchange_weak((long long int&)current,
+                                  (long long int&)value);
+        if (wasChanged) break;
+      }
+    }
+    
+    /*! iw - note: this implementation of atomic min/max via atomic
+        compare-exchange (CAS); which is cetainly not optimal on any
+        sort of modern GPU - but it works in any C++-21 compliant
+        compiler, so it's what we do for now */
+    inline void atomic_max(double *ptr, double value)
+    {
+      double current = *(volatile double *)ptr;
+      while (current < value) {
+        bool wasChanged
+          = ((std::atomic<long long int>*)ptr)
+          ->compare_exchange_weak((long long int&)current,
+                                  (long long int&)value);
+        if (wasChanged) break;
+      }
+    }
+    
+    template<typename T, int D>
+    inline void v_atomic_min(vec_t<T,D> *ptr, vec_t<T,D> v);
+    template<typename T, int D>
+    inline void v_atomic_max(vec_t<T,D> *ptr, vec_t<T,D> v);
+    
+
+    template<typename T>
+    inline void v_atomic_min(vec_t<T,2> *ptr, vec_t<T,2> v)
+    {
+      atomic_min(&ptr->x,v.x); 
+      atomic_min(&ptr->y,v.y);
+    }
+    
+    template<typename T>
+    inline void v_atomic_min(vec_t<T,3> *ptr, vec_t<T,3> v)
+    {
+      atomic_min(&ptr->x,v.x); 
+      atomic_min(&ptr->y,v.y);
+      atomic_min(&ptr->z,v.z);
+    }
+    
+    template<typename T>
+    inline void v_atomic_min(vec_t<T,4> *ptr, vec_t<T,4> v)
+    {
+      atomic_min(&ptr->x,v.x); 
+      atomic_min(&ptr->y,v.y);
+      atomic_min(&ptr->z,v.z);
+      atomic_min(&ptr->w,v.w);
+    }
+
+    template<typename T>
+    inline void v_atomic_max(vec_t<T,2> *ptr, vec_t<T,2> v)
+    {
+      atomic_max(&ptr->x,v.x); 
+      atomic_max(&ptr->y,v.y);
+    }
+    
+    template<typename T>
+    inline void v_atomic_max(vec_t<T,3> *ptr, vec_t<T,3> v)
+    {
+      atomic_max(&ptr->x,v.x); 
+      atomic_max(&ptr->y,v.y);
+      atomic_max(&ptr->z,v.z);
+    }
+    
+    template<typename T>
+    inline void v_atomic_max(vec_t<T,4> *ptr, vec_t<T,4> v)
+    {
+      atomic_max(&ptr->x,v.x); 
+      atomic_max(&ptr->y,v.y);
+      atomic_max(&ptr->z,v.z);
+      atomic_max(&ptr->w,v.w);
+    }
+    
+    template<typename box_t>
+    inline void atomic_grow(AtomicBox<box_t> &ab, typename box_t::vec_t P)
+    {
+      v_atomic_min(&ab.lower,P);
+      v_atomic_max(&ab.upper,P);
+    }
+    
+    template<typename box_t>
+    inline void atomic_grow(AtomicBox<box_t> &ab, box_t B)
+    {
+      v_atomic_min(&ab.lower,B.lower);
+      v_atomic_max(&ab.upper,B.upper);
+    }
+    
+  }
+}
diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h
index 1eae7e7..1cb8178 100644
--- a/cuBQL/builder/omp/common.h
+++ b/cuBQL/builder/omp/common.h
@@ -22,6 +22,9 @@ namespace cuBQL {
       template<typename T>
       void alloc_and_upload(T *&d_data, const T *h_data, size_t Nelements);
       
+      template<typename T>
+      void upload(T *d_data, const T *h_data, size_t Nelements);
+      
       template<typename T>
       void alloc_and_upload(T *&d_data, const std::vector<T> &h_vector);
 
@@ -42,7 +45,7 @@ namespace cuBQL {
       int _workIdx;
     };
 
-    inline uint32_t atomicAdd(uint32_t *p_value, uint32_t inc)
+    inline uint32_t atomicAdd(volatile uint32_t *p_value, uint32_t inc)
     {
       return ((std::atomic<int> *)p_value)->fetch_add(inc); 
     }
@@ -74,6 +77,16 @@ namespace cuBQL {
     void *Context::alloc(size_t numBytes)
     { return omp_target_alloc(numBytes,gpuID); }
     
+    template<typename T> inline
+    void Context::upload(T *d_data,
+                         const T *h_data,
+                         size_t N)
+    {
+      assert(d_data);
+      omp_target_memcpy(d_data,h_data,N*sizeof(T),
+                        0,0,gpuID,hostID);
+    }
+      
     template<typename T> inline
     void Context::alloc_and_upload(T *&d_data,
                                    const T *h_data,
@@ -82,9 +95,7 @@ namespace cuBQL {
       printf("target_alloc N %li gpu %i\n",N,gpuID);
       d_data = (T *)omp_target_alloc(N*sizeof(T),gpuID);
       printf("ptr %p\n",d_data);
-      assert(d_data);
-      omp_target_memcpy(d_data,h_data,N*sizeof(T),
-                        0,0,gpuID,hostID);
+      upload(d_data,h_data,N);
     }
       
     template<typename T> inline
@@ -95,7 +106,12 @@ namespace cuBQL {
     template<typename T>
     std::vector<T> Context::download_vector(const T *d_data, size_t N)
     {
+      PRINT(N);
+      PRINT(d_data);
+      
       std::vector<T> out(N);
+      PRINT(out.data());
+      PRINT(sizeof(T));
       omp_target_memcpy(out.data(),d_data,N*sizeof(T),
                         0,0,hostID,gpuID);
       return out;
diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h
index d1c8f9a..9b4fe9c 100644
--- a/cuBQL/builder/omp/refit.h
+++ b/cuBQL/builder/omp/refit.h
@@ -4,13 +4,14 @@
 #pragma once
 
 #include "cuBQL/builder/omp/common.h"
+#include "cuBQL/builder/omp/AtomicBox.h"
 
 namespace cuBQL {
   namespace omp {
     
     template<typename T, int D> inline
-    void refit_init(Kernel kernel,
-                    const typename BinaryBVH<T,D>::Node *nodes,
+    void refit_init_x(Kernel kernel,
+                    typename BinaryBVH<T,D>::Node *nodes,
                     uint32_t              *refitData,
                     int numNodes)
     {
@@ -18,52 +19,82 @@ namespace cuBQL {
       if (nodeID == 1 || nodeID >= numNodes) return;
       if (nodeID < 2)
         refitData[0] = 0;
-      const auto &node = nodes[nodeID];
+      auto &node = nodes[nodeID];
+      node.bounds = box_t<T,D>();
       if (node.admin.count) return;
 
+      if (node.admin.offset+1 >= numNodes)
+        printf("refit_init_overflow\n");
       refitData[node.admin.offset+0] = nodeID << 1;
       refitData[node.admin.offset+1] = nodeID << 1;
     }
-
+    
     template<typename T, int D> inline 
-    void refit_run(Kernel kernel,
-                   BinaryBVH<T,D> bvh,
-                   uint32_t *refitData,
-                   const box_t<T,D> *boxes)
+    void refit_run_x(Kernel kernel,
+                     // BinaryBVH<T,D> bvh,
+                     uint32_t *bvh_primIDs,
+                     typename BinaryBVH<T,D>::Node *bvh_nodes,
+                     uint32_t *refitData,
+                     const box_t<T,D> *boxes,
+                     int numNodes)
     {
       int nodeID = kernel.workIdx();
-      if (nodeID == 1 || nodeID >= bvh.numNodes) return;
-      
-      typename BinaryBVH<T,D>::Node *node = &bvh.nodes[nodeID];
+      if (nodeID == 1 || nodeID >= numNodes) return;
+
+      typename BinaryBVH<T,D>::Node *node = bvh_nodes+nodeID;
+
+      if (nodeID < 0 || nodeID >= 601202) {
+        printf("BLA\n"); return;
+      }
       if (node->admin.count == 0)
         // this is a inner node - exit
         return;
 
       box_t<T,D> bounds; bounds.set_empty();
       for (int i=0;i<node->admin.count;i++) {
-        const box_t<T,D> primBox = boxes[bvh.primIDs[node->admin.offset+i]];
+        const box_t<T,D> primBox = boxes[bvh_primIDs[node->admin.offset+i]];
         bounds.lower = min(bounds.lower,primBox.lower);
         bounds.upper = max(bounds.upper,primBox.upper);
       }
 
       int parentID = (refitData[nodeID] >> 1);
+      int its = 0;
       while (true) {
-        node->bounds = bounds;
-        // __threadfence();
-        if (node == bvh.nodes)
+        if (parentID == 1) {
+          printf("1 is parent!!!!\n");
+                 return;
+        }
+        
+        atomic_grow(*(AtomicBox<box_t<T,D>> *)&node->bounds,bounds);
+        // node->bounds = bounds;
+          
+        if (node == bvh_nodes)
           break;
 
+        int it = its++;
+        // if (it >= 4) return;
+
+        // if (it == 3) { printf("parentID %i\n",parentID); return; }
+        
+        if (parentID < 0 || parentID >= 601202) {
+          printf("BLA\n"); return;
+        }
         uint32_t refitBits = atomicAdd(&refitData[parentID],1u);
         if ((refitBits & 1) == 0)
           // we're the first one - let other one do it
           break;
 
         nodeID   = parentID;
-        node     = &bvh.nodes[parentID];
+        node     = &bvh_nodes[parentID];
         parentID = (refitBits >> 1);
+
+        int ofs = node->admin.offset;
+        if (ofs < 0 || ofs+1 >= 601202) {
+          printf("BLAB\n"); return;
+        }
         
-        typename BinaryBVH<T,D>::Node l = bvh.nodes[node->admin.offset+0];
-        typename BinaryBVH<T,D>::Node r = bvh.nodes[node->admin.offset+1];
+        typename BinaryBVH<T,D>::Node l = bvh_nodes[ofs+0];
+        typename BinaryBVH<T,D>::Node r = bvh_nodes[ofs+1];
         bounds.lower = min(l.bounds.lower,r.bounds.lower);
         bounds.upper = max(l.bounds.upper,r.bounds.upper);
       }
@@ -74,18 +105,48 @@ namespace cuBQL {
                const box_t<T,D>  *boxes,
                Context *ctx)
     {
+      PING;
+      assert(bvh.nodes);
+      assert(bvh.primIDs);
       int numNodes = bvh.numNodes;
       uint32_t *refitData
-        = (uint32_t*)ctx->alloc(numNodes*sizeof(int));
-      
-# pragma omp target device(ctx->gpuID)
-# pragma omp teams distribute parallel for
-      for (int i=0;i<numNodes;i++)
-        refit_init<T,D>(Kernel{i},bvh.nodes,refitData,bvh.numNodes);
-# pragma omp target device(ctx->gpuID)
-# pragma omp teams distribute parallel for
+        = (uint32_t*)ctx->alloc(numNodes*sizeof(uint32_t));
+      auto bvh_nodes = bvh.nodes;
+      auto bvh_primIDs = bvh.primIDs;
+      PING;
+      PRINT(numNodes);
+      PRINT(bvh.numNodes);
+      {
+#pragma omp target device(ctx->gpuID) is_device_ptr(refitData) is_device_ptr(bvh_nodes) 
+#pragma omp teams distribute parallel for
+        for (int i=0;i<numNodes;i++)
+          refit_init_x<T,D>(Kernel{i},bvh_nodes,refitData,numNodes);
+      }
+      PING;
+      PING;
+      {
+#if 1
+        int nb = 128;
+#pragma omp target  teams num_teams(nb) device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes)
+        {
+          int team = omp_get_team_num();
+          int nteams = omp_get_num_teams();
+          int beg = ((team+0)*numNodes)/nteams;
+          int end = ((team+1)*numNodes)/nteams;
+          for (int i=beg;i<end;i++)
+            refit_run_x(Kernel{i},//bvh,
+                        bvh_primIDs,bvh_nodes,refitData,boxes,numNodes);
+        }
+      }
+#else
+#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes) 
+#pragma omp teams distribute parallel for 
       for (int i=0;i<numNodes;i++)
-        refit_run(Kernel{i},bvh,refitData,boxes);
+        refit_run_x(Kernel{i},//bvh,
+                    bvh_primIDs,bvh_nodes,refitData,boxes,numNodes);
+      }
+#endif
+      PING;
       ctx->free((void*)refitData);
     }
     
diff --git a/cuBQL/builder/omp/sort.h b/cuBQL/builder/omp/sort.h
index 724c0be..5f6cb39 100644
--- a/cuBQL/builder/omp/sort.h
+++ b/cuBQL/builder/omp/sort.h
@@ -12,8 +12,10 @@
 
 namespace omp {
   namespace bitonic {
-      
+
+#pragma omp declare target
     template<typename key_t>
+    inline
     void g_orderSegmentPairs(uint32_t tid,
                              int logSegLen,
                              key_t *const d_values,
@@ -36,6 +38,7 @@ namespace omp {
         d_values[l] = rv;
       }
     }
+#pragma omp end declare target
       
     template<typename key_t, typename value_t>
     void g_orderSegmentPairs(uint32_t tid,
@@ -73,11 +76,18 @@ namespace omp {
                            int numValues,
                            uint32_t deviceID)
     {
-#pragma omp target device(deviceID)
-#pragma omp teams distribute parallel for
+#if 0
+#pragma omp target device(deviceID) teams is_device_ptr(d_values) num_teams(128)
+      {
+        int bs = 
+      }
+#else
+#pragma omp target device(deviceID) is_device_ptr(d_values) nowait
+#pragma omp teams distribute parallel for 
       for (int i=0;i<numValues;i++)
         g_orderSegmentPairs(i,logSegLen,
                             d_values,numValues);
+#endif
     }
       
     template<typename key_t, typename value_t>
@@ -87,8 +97,9 @@ namespace omp {
                            int numValues,
                            uint32_t deviceID)
     {
-#pragma omp target device(deviceID)
-#pragma omp teams distribute parallel for
+// #pragma omp target nowait device(deviceID) is_device_ptr(d_keys) is_device_ptr(d_values)
+// #pragma omp target teams distribute parallel for
+#pragma omp target nowait device(deviceID) is_device_ptr(d_keys) is_device_ptr(d_values) 
       for (int i=0;i<numValues;i++)
         g_orderSegmentPairs(i,logSegLen,
                             d_keys,d_values,numValues);
@@ -139,7 +150,7 @@ namespace omp {
               uint32_t deviceID)
     {
       uint32_t logSegLen = 0;
-      while (1<<logSegLen < numValues)
+      while ((1<<logSegLen) < numValues)
         logSegLen++;
       sortSegments(logSegLen,d_keys,d_values,numValues,deviceID);
     }
diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h
index a07dd65..32c754c 100644
--- a/cuBQL/builder/omp/spatialMedian.h
+++ b/cuBQL/builder/omp/spatialMedian.h
@@ -1,4 +1,5 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
@@ -9,145 +10,6 @@
 namespace cuBQL {
   namespace omp {
 
-    template<typename box_t>
-    struct AtomicBox : public box_t {
-    };
-
-    template<typename T>
-    void atomic_min(T *ptr, T v);
-    template<typename T>
-    void atomic_max(T *ptr, T v);
-    
-    /*! iw - note: this implementation of atomic min/max via atomic
-        compare-exchange (CAS); which is cetainly not optimal on any
-        sort of modern GPU - but it works in any C++-21 compliant
-        compiler, so it's what we do for now */
-    void atomic_min(float *ptr, float value)
-    {
-      float current = *(volatile float *)ptr;
-      while (current > value) {
-        bool wasChanged
-          = ((std::atomic<int>*)ptr)
-          ->compare_exchange_weak((int&)current,(int&)value);
-        if (wasChanged) break;
-      }
-    }
-    
-    /*! iw - note: this implementation of atomic min/max via atomic
-        compare-exchange (CAS); which is cetainly not optimal on any
-        sort of modern GPU - but it works in any C++-21 compliant
-        compiler, so it's what we do for now */
-    void atomic_max(float *ptr, float value)
-    {
-      float current = *(volatile float *)ptr;
-      while (current > value) {
-        bool wasChanged
-          = ((std::atomic<int>*)ptr)
-          ->compare_exchange_weak((int&)current,(int&)value);
-        if (wasChanged) break;
-      }
-    }
-    
-    /*! iw - note: this implementation of atomic min/max via atomic
-        compare-exchange (CAS); which is cetainly not optimal on any
-        sort of modern GPU - but it works in any C++-21 compliant
-        compiler, so it's what we do for now */
-    void atomic_min(double *ptr, double value)
-    {
-      double current = *(volatile double *)ptr;
-      while (current > value) {
-        bool wasChanged
-          = ((std::atomic<long long int>*)ptr)
-          ->compare_exchange_weak((long long int&)current,
-                                  (long long int&)value);
-        if (wasChanged) break;
-      }
-    }
-    
-    /*! iw - note: this implementation of atomic min/max via atomic
-        compare-exchange (CAS); which is cetainly not optimal on any
-        sort of modern GPU - but it works in any C++-21 compliant
-        compiler, so it's what we do for now */
-    void atomic_max(double *ptr, double value)
-    {
-      double current = *(volatile double *)ptr;
-      while (current > value) {
-        bool wasChanged
-          = ((std::atomic<long long int>*)ptr)
-          ->compare_exchange_weak((long long int&)current,
-                                  (long long int&)value);
-        if (wasChanged) break;
-      }
-    }
-    
-    template<typename T, int D>
-    void v_atomic_min(vec_t<T,D> *ptr, vec_t<T,D> v);
-    template<typename T, int D>
-    void v_atomic_max(vec_t<T,D> *ptr, vec_t<T,D> v);
-    
-
-    template<typename T>
-    void v_atomic_min(vec_t<T,2> *ptr, vec_t<T,2> v)
-    {
-      atomic_min(&ptr->x,v.x); 
-      atomic_min(&ptr->y,v.y);
-    }
-    
-    template<typename T>
-    void v_atomic_min(vec_t<T,3> *ptr, vec_t<T,3> v)
-    {
-      atomic_min(&ptr->x,v.x); 
-      atomic_min(&ptr->y,v.y);
-      atomic_min(&ptr->z,v.z);
-    }
-    
-    template<typename T>
-    void v_atomic_min(vec_t<T,4> *ptr, vec_t<T,4> v)
-    {
-      atomic_min(&ptr->x,v.x); 
-      atomic_min(&ptr->y,v.y);
-      atomic_min(&ptr->z,v.z);
-      atomic_min(&ptr->w,v.w);
-    }
-
-    template<typename T>
-    void v_atomic_max(vec_t<T,2> *ptr, vec_t<T,2> v)
-    {
-      atomic_max(&ptr->x,v.x); 
-      atomic_max(&ptr->y,v.y);
-    }
-    
-    template<typename T>
-    void v_atomic_max(vec_t<T,3> *ptr, vec_t<T,3> v)
-    {
-      atomic_max(&ptr->x,v.x); 
-      atomic_max(&ptr->y,v.y);
-      atomic_max(&ptr->z,v.z);
-    }
-    
-    template<typename T>
-    void v_atomic_max(vec_t<T,4> *ptr, vec_t<T,4> v)
-    {
-      atomic_max(&ptr->x,v.x); 
-      atomic_max(&ptr->y,v.y);
-      atomic_max(&ptr->z,v.z);
-      atomic_max(&ptr->w,v.w);
-    }
-    
-    template<typename box_t>
-    void atomic_grow(AtomicBox<box_t> &ab, typename box_t::vec_t P)
-    {
-      v_atomic_min(&ab.lower,P);
-      v_atomic_max(&ab.upper,P);
-    }
-    
-    template<typename box_t>
-    void atomic_grow(AtomicBox<box_t> &ab, box_t B)
-    {
-      v_atomic_min(&ab.lower,B.lower);
-      v_atomic_max(&ab.upper,B.upper);
-    }
-    
     struct PrimState {
       union {
         /* careful with this order - this is intentionally chosen such
@@ -162,7 +24,8 @@ namespace cuBQL {
       };
     };
 
-    typedef enum : int8_t { OPEN_BRANCH, OPEN_NODE, DONE_NODE } NodeState;
+    enum { OPEN_BRANCH, OPEN_NODE, DONE_NODE };
+    typedef uint8_t NodeState;
     
     template<typename T, int D>
     struct CUBQL_ALIGN(16) TempNode {
@@ -186,21 +49,36 @@ namespace cuBQL {
         } doneNode;
       };
     };
-    
+
     template<typename T, int D>
-    void initState(uint32_t      *pNumNodes,
+    void initState(Kernel kernel,
+                   uint32_t      *pNumNodes,
                    NodeState     *nodeStates,
                    TempNode<T,D> *nodes)
     {
+      int tid = kernel.workIdx();
+      if (tid > 0) return;
+      printf("initstate\n");
       *pNumNodes = 2;
       
+      printf("initstate1\n");
       nodeStates[0]             = OPEN_BRANCH;
+      printf("initstate2\n");
       nodes[0].openBranch.count = 0;
+      printf("initstate3\n");
+      printf("initstate3 %p\n",&nodes[0].openBranch.centBounds);
+
+      ((int*)&nodes[0].openBranch.centBounds)[0] = 0;
+      printf("bla\n");
       nodes[0].openBranch.centBounds.set_empty();
 
+      printf("initstate4\n");
       nodeStates[1]            = DONE_NODE;
+      printf("initstate5\n");
       nodes[1].doneNode.offset = 0;
+      printf("initstate6\n");
       nodes[1].doneNode.count  = 0;
+      printf("initstate7\n");
     }
 
     template<typename T, int D>
@@ -212,7 +90,7 @@ namespace cuBQL {
     {
       const int primID = kernel.workIdx();
       if (primID >= numPrims) return;
-      
+
       auto &me = primState[primID];
       me.primID = primID;
                                                     
@@ -222,7 +100,20 @@ namespace cuBQL {
         me.done   = false;
         // this could be made faster by block-reducing ...
         atomicAdd(&nodes[0].openBranch.count,1);
-        atomic_grow(nodes[0].openBranch.centBounds,box.center());//centerOf(box));
+        auto ctr = box.center();
+        atomic_grow(nodes[0].openBranch.centBounds,ctr);//centerOf(box));
+        // printf("p %i ctr %f %f %f grownn box %i : (%f %f %f)(%f %f %f)\n",
+        //        primID,
+        //        ctr.x,
+        //        ctr.y,
+        //        ctr.z,
+        //        0,
+        //        nodes[0].openBranch.centBounds.lower.x,
+        //        nodes[0].openBranch.centBounds.lower.y,
+        //        nodes[0].openBranch.centBounds.lower.z,
+        //        nodes[0].openBranch.centBounds.upper.x,
+        //        nodes[0].openBranch.centBounds.upper.y,
+        //        nodes[0].openBranch.centBounds.upper.z);
       } else {
         me.nodeID = (uint32_t)-1;
         me.done   = true;
@@ -254,7 +145,7 @@ namespace cuBQL {
         done.offset = offset;
         return;
       }
-      
+
       auto in = nodes[nodeID].openBranch;
       if (in.count <= buildConfig.makeLeafThreshold) {
         auto &done  = nodes[nodeID].doneNode;
@@ -298,6 +189,7 @@ namespace cuBQL {
         for (int side=0;side<2;side++) {
           const int childID = open.offset+side;
           auto &child = nodes[childID].openBranch;
+
           child.centBounds.set_empty();
           child.count         = 0;
           nodeStates[childID] = OPEN_BRANCH;
@@ -361,11 +253,15 @@ namespace cuBQL {
       if (offset >= numPrims) return;
 
       auto &ps = primStates[offset];
+      // printf("ps %i -> %i : %i done %i\n",
+      //        (int)offset,(int)ps.primID,(int)ps.nodeID,(int)ps.done);
       bvhItemList[offset] = ps.primID;
-      
+       
       if ((int)ps.nodeID < 0)
         /* invalid prim, just skip here */
         return;
+      if (ps.nodeID >= 853350)
+        { printf("OVERFLOW\n"); return; }
       auto &node = nodes[ps.nodeID];
       atomicMin(&node.doneNode.offset,offset);
     }
@@ -395,6 +291,12 @@ namespace cuBQL {
                        Context          *ctx)
     {
       assert(sizeof(PrimState) == sizeof(uint64_t));
+      if (buildConfig.makeLeafThreshold < 1)
+        buildConfig.makeLeafThreshold = 1;
+
+      PING;
+      PRINT(buildConfig.makeLeafThreshold);
+      PRINT(buildConfig.maxAllowedLeafSize);
       
       // ==================================================================
       // do build on temp nodes
@@ -407,18 +309,28 @@ namespace cuBQL {
       ctx->alloc(nodeStates,2*numPrims);
       ctx->alloc(primStates,numPrims);
       ctx->alloc(d_numNodes,1);
-#pragma omp target device(ctx->gpuID)
+      PING;
+      PRINT(numPrims);
+      PRINT(d_numNodes);
+      PRINT((int*)nodeStates);
+      PRINT(tempNodes);
+#pragma omp target device(ctx->gpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes)
 #pragma omp teams distribute parallel for
       for (int tid=0;tid<1;tid++)
-        initState(d_numNodes,
+        initState(Kernel{tid},
+                  d_numNodes,
                   nodeStates,
                   tempNodes);
-#pragma omp target device(ctx->gpuID)
+      PING;
+      PRINT(numPrims);
+      PING; fflush(0);
+#pragma omp target device(ctx->gpuID) is_device_ptr(tempNodes) is_device_ptr(primStates) is_device_ptr(boxes)
 #pragma omp teams distribute parallel for
       for (int tid=0;tid<numPrims;tid++)
         initPrims(Kernel{tid},tempNodes,
                   primStates,boxes,numPrims);
 
+      PING; fflush(0);
       int numDone = 0;
       uint32_t numNodes;
 
@@ -427,7 +339,7 @@ namespace cuBQL {
         ctx->download(numNodes,d_numNodes);
         if (numNodes == numDone)
           break;
-#pragma omp target device(ctx->gpuID)
+#pragma omp target device(ctx->gpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes)
 #pragma omp teams distribute parallel for
         for (int tid=0;tid<numNodes;tid++)
           selectSplits(Kernel{tid},
@@ -446,37 +358,61 @@ namespace cuBQL {
       // ==================================================================
       // sort {item,nodeID} list
       // ==================================================================
-      
+
+      PING;
+#if 1
+      std::cout << "host sort ..." << std::endl;
+      PRINT(numPrims);
+      std::vector<uint64_t> h_primStates
+        = ctx->download_vector((uint64_t*)primStates,numPrims);
+      PING;
+      // ctx->download(h_primStates,primStates,numNodes*sizeof(uint64_t));
+      std::sort(h_primStates.begin(),h_primStates.end());
+      ctx->upload((uint64_t*)primStates,h_primStates.data(),numPrims);
+      PING;
+#else
+      std::cout << "openmp sort ..." << std::endl;
       ::omp::omp_target_sort((uint64_t*)primStates,numPrims,ctx->gpuID);
-      
+#endif 
+      PING;
       // ==================================================================
       // allocate and write BVH item list, and write offsets of leaf nodes
       // ==================================================================
 
+      PRINT(numNodes);
       bvh.numPrims = numPrims;
+      bvh.primIDs = 0;
+      PING;
       ctx->alloc(bvh.primIDs,numPrims);
-#pragma omp target device(ctx->gpuID)
+      PING;
+      auto primIDs = bvh.primIDs;
+#pragma omp target device(ctx->gpuID) is_device_ptr(primStates) is_device_ptr(tempNodes) is_device_ptr(primIDs)
 #pragma omp teams distribute parallel for
       for (int tid=0;tid<numPrims;tid++)
         writePrimsAndLeafOffsets(Kernel{tid},
                                  tempNodes,
-                                 bvh.primIDs,primStates,numPrims);
+                                 primIDs,primStates,numPrims);
       
+      PING;
       // ==================================================================
       // allocate and write final nodes
       // ==================================================================
       bvh.numNodes = numNodes;
       ctx->alloc(bvh.nodes,numNodes);
-#pragma omp target device(ctx->gpuID)
+      PING; PRINT(numNodes);
+      auto bvhNodes = bvh.nodes;
+#pragma omp target device(ctx->gpuID) is_device_ptr(bvhNodes) is_device_ptr(tempNodes)
 #pragma omp teams distribute parallel for
       for (int tid=0;tid<numNodes;tid++)
-        writeNodes(Kernel{tid},bvh.nodes,tempNodes,numNodes);
+        writeNodes(Kernel{tid},bvhNodes,tempNodes,numNodes);
       ctx->free(tempNodes);
       ctx->free(nodeStates);
       ctx->free(primStates);
       ctx->free(d_numNodes);
 
+      PING; 
       cuBQL::omp::refit(bvh,boxes,ctx);
+      PING; 
     }
     
   }
diff --git a/cuBQL/math/vec.h b/cuBQL/math/vec.h
index 10ffdb7..11d7b37 100644
--- a/cuBQL/math/vec.h
+++ b/cuBQL/math/vec.h
@@ -356,34 +356,53 @@ namespace cuBQL {
   /* vec:vec */                                                         \
   template<typename T, int D>                                           \
   inline __cubql_both                                                   \
-  vec_t<T,D> long_op(vec_t<T,D> a, vec_t<T,D> b)                        \
+  vec_t<T,D> long_op(const vec_t<T,D> &a, const vec_t<T,D> &b)          \
   {                                                                     \
     vec_t<T,D> r;                                                       \
-    CUBQL_PRAGMA_UNROLL                                                 \
-      for (int i=0;i<D;i++) r[i] = a[i] op b[i];                        \
+    for (int i=0;i<D;i++) r[i] = a[i] op b[i];                          \
     return r;                                                           \
   }                                                                     \
-  /* scalar-vec */                                                      \
-  template<typename T, int D>                                           \
+  template<typename T>                                                  \
   inline __cubql_both                                                   \
-  vec_t<T,D> long_op(T a, vec_t<T,D> b)                                 \
+  vec_t<T,2> long_op(const vec_t<T,2> &a, const vec_t<T,2> &b)          \
   {                                                                     \
-    vec_t<T,D> r;                                                       \
-    CUBQL_PRAGMA_UNROLL                                                 \
-      for (int i=0;i<D;i++) r[i] = a op b[i];                           \
+    vec_t<T,2> r;                                                       \
+    r.x = a.x op b.x;                                                   \
+    r.y = a.y op b.y;                                                   \
     return r;                                                           \
   }                                                                     \
-    /* vec:scalar */                                                    \
-    template<typename T, int D>                                         \
-    inline __cubql_both                                                 \
-    vec_t<T,D> long_op(vec_t<T,D> a, T b)                               \
-    {                                                                   \
-      vec_t<T,D> r;                                                     \
-      CUBQL_PRAGMA_UNROLL                                               \
-        for (int i=0;i<D;i++) r[i] = a[i] op b;                         \
-      return r;                                                         \
-    }                                                                   \
-    CUBQL_OPERATOR_CUDA_T(long_op,op)
+  template<typename T>                                                  \
+  inline __cubql_both                                                   \
+  vec_t<T,3> long_op(const vec_t<T,3> &a, const vec_t<T,3> &b)          \
+  {                                                                     \
+    vec_t<T,3> r;                                                       \
+    r.x = a.x op b.x;                                                   \
+    r.y = a.y op b.y;                                                   \
+    r.z = a.z op b.z;                                                   \
+    return r;                                                           \
+  }                                                                     \
+  template<typename T>                                                  \
+  inline __cubql_both                                                   \
+  vec_t<T,4> long_op(const vec_t<T,4> &a, const vec_t<T,4> &b)          \
+  {                                                                     \
+    vec_t<T,4> r;                                                       \
+    r.x = a.x op b.x;                                                   \
+    r.y = a.y op b.y;                                                   \
+    r.z = a.z op b.z;                                                   \
+    r.w = a.w op b.w;                                                   \
+    return r;                                                           \
+  }                                                                     \
+  /* scalar-vec */                                                      \
+  template<typename T, int D>                                           \
+  inline __cubql_both                                                   \
+  vec_t<T,D> long_op(T a, const vec_t<T,D> &b)                          \
+  { return vec_t<T,D>(a) op b; }                                        \
+  /* vec:scalar */                                                      \
+  template<typename T, int D>                                           \
+  inline __cubql_both                                                   \
+  vec_t<T,D> long_op(const vec_t<T,D> &a, T b)                          \
+  { return a op vec_t<T,D>(b); }                                        \
+  CUBQL_OPERATOR_CUDA_T(long_op,op)
 
   CUBQL_OPERATOR(operator+,+)
   CUBQL_OPERATOR(operator-,-)

From a68cb82c62a7b3cd26d12bc57406f825c62b6ee2 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Mon, 26 Jan 2026 17:54:09 -0700
Subject: [PATCH 07/14] rewritten openmp builder's atomics in openmp fashion

---
 cuBQL/builder/omp/AtomicBox.h     | 25 +++++++++++++++++++++++++
 cuBQL/builder/omp/common.h        | 29 ++++++++++++++++++++---------
 cuBQL/builder/omp/refit.h         | 18 +++++++++---------
 cuBQL/builder/omp/spatialMedian.h | 13 +++++++++++--
 4 files changed, 65 insertions(+), 20 deletions(-)

diff --git a/cuBQL/builder/omp/AtomicBox.h b/cuBQL/builder/omp/AtomicBox.h
index 4ee90cd..425beb9 100644
--- a/cuBQL/builder/omp/AtomicBox.h
+++ b/cuBQL/builder/omp/AtomicBox.h
@@ -6,6 +6,7 @@
 
 #include "cuBQL/builder/omp/common.h"
 
+
 namespace cuBQL {
   namespace omp {
 
@@ -29,6 +30,7 @@ namespace cuBQL {
         compiler, so it's what we do for now */
     inline void atomic_min(float *ptr, float value)
     {
+#if 0
       float current = *(volatile float *)ptr;
       while (current > value) {
         bool wasChanged
@@ -36,6 +38,15 @@ namespace cuBQL {
           ->compare_exchange_weak((int&)current,(int&)value);
         if (wasChanged) break;
       }
+#else
+#pragma omp atomic compare 
+      {
+        if (*ptr > value) *ptr = value;
+      }
+//       float t;
+// #pragma omp atomic capture
+//       { t = *ptr; *ptr = std::min(t,value); }
+#endif
     }
     
     /*! iw - note: this implementation of atomic min/max via atomic
@@ -44,6 +55,7 @@ namespace cuBQL {
         compiler, so it's what we do for now */
     inline void atomic_max(float *ptr, float value)
     {
+#if 0
       float current = *(volatile float *)ptr;
       while (current < value) {
         bool wasChanged
@@ -51,6 +63,15 @@ namespace cuBQL {
           ->compare_exchange_weak((int&)current,(int&)value);
         if (wasChanged) break;
       }
+#else
+#pragma omp atomic compare 
+      {
+        if (*ptr < value) *ptr = value;
+      }
+//       float t;
+// #pragma omp atomic capture
+//       { t = *ptr; *ptr = std::max(t,value); }
+#endif
     }
     
     /*! iw - note: this implementation of atomic min/max via atomic
@@ -59,6 +80,7 @@ namespace cuBQL {
         compiler, so it's what we do for now */
     inline void atomic_min(double *ptr, double value)
     {
+#if 0
       double current = *(volatile double *)ptr;
       while (current > value) {
         bool wasChanged
@@ -67,6 +89,7 @@ namespace cuBQL {
                                   (long long int&)value);
         if (wasChanged) break;
       }
+#endif
     }
     
     /*! iw - note: this implementation of atomic min/max via atomic
@@ -75,6 +98,7 @@ namespace cuBQL {
         compiler, so it's what we do for now */
     inline void atomic_max(double *ptr, double value)
     {
+#if 0
       double current = *(volatile double *)ptr;
       while (current < value) {
         bool wasChanged
@@ -83,6 +107,7 @@ namespace cuBQL {
                                   (long long int&)value);
         if (wasChanged) break;
       }
+#endif
     }
     
     template<typename T, int D>
diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h
index 1cb8178..2321b53 100644
--- a/cuBQL/builder/omp/common.h
+++ b/cuBQL/builder/omp/common.h
@@ -45,20 +45,31 @@ namespace cuBQL {
       int _workIdx;
     };
 
-    inline uint32_t atomicAdd(volatile uint32_t *p_value, uint32_t inc)
+    inline uint32_t atomicAdd(volatile uint32_t *ptr, uint32_t inc)
     {
-      return ((std::atomic<int> *)p_value)->fetch_add(inc); 
+      uint32_t t;
+#pragma omp atomic capture
+      { t = *ptr; *ptr += inc; }
+      // return ((std::atomic<int> *)p_value)->fetch_add(inc);
+      return t;
     }
     
-    inline void atomicMin(uint32_t *p_value, uint32_t other)
+    inline void atomicMin(uint32_t *ptr, uint32_t value)
     {
-      uint32_t current = *(volatile uint32_t *)p_value;
-      while (current > other) {
-        bool wasChanged
-          = ((std::atomic<int>*)p_value)
-          ->compare_exchange_weak((int&)current,(int&)other);
-        if (wasChanged) break;
+#pragma omp atomic compare 
+      {
+        if (*ptr > value) *ptr = value;
       }
+      // uint32_t t;
+// #pragma omp atomic capture
+//       { t = *ptr; *ptr = std::min(t,value); }
+      // uint32_t current = *(volatile uint32_t *)p_value;
+      // while (current > other) {
+      //   bool wasChanged
+      //     = ((std::atomic<int>*)p_value)
+      //     ->compare_exchange_weak((int&)current,(int&)other);
+      //   if (wasChanged) break;
+      // }
     }
 
 
diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h
index 9b4fe9c..ad2ea9c 100644
--- a/cuBQL/builder/omp/refit.h
+++ b/cuBQL/builder/omp/refit.h
@@ -43,9 +43,9 @@ namespace cuBQL {
 
       typename BinaryBVH<T,D>::Node *node = bvh_nodes+nodeID;
 
-      if (nodeID < 0 || nodeID >= 601202) {
-        printf("BLA\n"); return;
-      }
+      // if (nodeID < 0 || nodeID >= 601202) {
+      //   printf("BLA\n"); return;
+      // }
       if (node->admin.count == 0)
         // this is a inner node - exit
         return;
@@ -76,9 +76,9 @@ namespace cuBQL {
 
         // if (it == 3) { printf("parentID %i\n",parentID); return; }
         
-        if (parentID < 0 || parentID >= 601202) {
-          printf("BLA\n"); return;
-        }
+        // if (parentID < 0 || parentID >= 601202) {
+        //   printf("BLA\n"); return;
+        // }
         uint32_t refitBits = atomicAdd(&refitData[parentID],1u);
         if ((refitBits & 1) == 0)
           // we're the first one - let other one do it
@@ -89,9 +89,9 @@ namespace cuBQL {
         parentID = (refitBits >> 1);
 
         int ofs = node->admin.offset;
-        if (ofs < 0 || ofs+1 >= 601202) {
-          printf("BLAB\n"); return;
-        }
+        // if (ofs < 0 || ofs+1 >= 601202) {
+        //   printf("BLAB\n"); return;
+        // }
         
         typename BinaryBVH<T,D>::Node l = bvh_nodes[ofs+0];
         typename BinaryBVH<T,D>::Node r = bvh_nodes[ofs+1];
diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h
index 32c754c..23a3a80 100644
--- a/cuBQL/builder/omp/spatialMedian.h
+++ b/cuBQL/builder/omp/spatialMedian.h
@@ -147,6 +147,14 @@ namespace cuBQL {
       }
 
       auto in = nodes[nodeID].openBranch;
+      // printf("node split %i : %f %f %f : %f %f %f\n",
+      //        nodeID,
+      //        in.centBounds.lower.x,
+      //        in.centBounds.lower.y,
+      //        in.centBounds.lower.z,
+      //        in.centBounds.upper.x,
+      //        in.centBounds.upper.y,
+      //        in.centBounds.upper.z);
       if (in.count <= buildConfig.makeLeafThreshold) {
         auto &done  = nodes[nodeID].doneNode;
         done.count  = in.count;
@@ -185,6 +193,7 @@ namespace cuBQL {
           : widestDim;
         
         open.offset = atomicAdd(pNumNodes,2);
+        // printf("offset %i\n",open.offset);
 #pragma unroll
         for (int side=0;side<2;side++) {
           const int childID = open.offset+side;
@@ -260,8 +269,6 @@ namespace cuBQL {
       if ((int)ps.nodeID < 0)
         /* invalid prim, just skip here */
         return;
-      if (ps.nodeID >= 853350)
-        { printf("OVERFLOW\n"); return; }
       auto &node = nodes[ps.nodeID];
       atomicMin(&node.doneNode.offset,offset);
     }
@@ -337,6 +344,7 @@ namespace cuBQL {
       // ------------------------------------------------------------------      
       while (true) {
         ctx->download(numNodes,d_numNodes);
+        PING; PRINT(numNodes);
         if (numNodes == numDone)
           break;
 #pragma omp target device(ctx->gpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes)
@@ -347,6 +355,7 @@ namespace cuBQL {
                        nodeStates,tempNodes,numNodes,
                        buildConfig);
         numDone = numNodes;
+        PRINT(numDone);
 
 #pragma omp target device(ctx->gpuID)
 #pragma omp teams distribute parallel for

From 514e0373edfc4bf3e8e0e6550c7fb63654e4b49c Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Tue, 27 Jan 2026 11:57:51 -0700
Subject: [PATCH 08/14] added a non-sorting based final node writing stage

---
 cuBQL/builder/omp/refit.h         | 24 +--------
 cuBQL/builder/omp/spatialMedian.h | 89 ++++++++++++++++++++++++++-----
 2 files changed, 76 insertions(+), 37 deletions(-)

diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h
index 9b4fe9c..9245e62 100644
--- a/cuBQL/builder/omp/refit.h
+++ b/cuBQL/builder/omp/refit.h
@@ -23,8 +23,6 @@ namespace cuBQL {
       node.bounds = box_t<T,D>();
       if (node.admin.count) return;
 
-      if (node.admin.offset+1 >= numNodes)
-        printf("refit_init_overflow\n");
       refitData[node.admin.offset+0] = nodeID << 1;
       refitData[node.admin.offset+1] = nodeID << 1;
     }
@@ -43,9 +41,6 @@ namespace cuBQL {
 
       typename BinaryBVH<T,D>::Node *node = bvh_nodes+nodeID;
 
-      if (nodeID < 0 || nodeID >= 601202) {
-        printf("BLA\n"); return;
-      }
       if (node->admin.count == 0)
         // this is a inner node - exit
         return;
@@ -58,27 +53,13 @@ namespace cuBQL {
       }
 
       int parentID = (refitData[nodeID] >> 1);
-      int its = 0;
       while (true) {
-        if (parentID == 1) {
-          printf("1 is parent!!!!\n");
-                 return;
-        }
-        
         atomic_grow(*(AtomicBox<box_t<T,D>> *)&node->bounds,bounds);
         // node->bounds = bounds;
           
         if (node == bvh_nodes)
           break;
 
-        int it = its++;
-        // if (it >= 4) return;
-
-        // if (it == 3) { printf("parentID %i\n",parentID); return; }
-        
-        if (parentID < 0 || parentID >= 601202) {
-          printf("BLA\n"); return;
-        }
         uint32_t refitBits = atomicAdd(&refitData[parentID],1u);
         if ((refitBits & 1) == 0)
           // we're the first one - let other one do it
@@ -89,9 +70,6 @@ namespace cuBQL {
         parentID = (refitBits >> 1);
 
         int ofs = node->admin.offset;
-        if (ofs < 0 || ofs+1 >= 601202) {
-          printf("BLAB\n"); return;
-        }
         
         typename BinaryBVH<T,D>::Node l = bvh_nodes[ofs+0];
         typename BinaryBVH<T,D>::Node r = bvh_nodes[ofs+1];
@@ -105,7 +83,6 @@ namespace cuBQL {
                const box_t<T,D>  *boxes,
                Context *ctx)
     {
-      PING;
       assert(bvh.nodes);
       assert(bvh.primIDs);
       int numNodes = bvh.numNodes;
@@ -137,6 +114,7 @@ namespace cuBQL {
             refit_run_x(Kernel{i},//bvh,
                         bvh_primIDs,bvh_nodes,refitData,boxes,numNodes);
         }
+        nb = nb;
       }
 #else
 #pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes) 
diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h
index 32c754c..222af2d 100644
--- a/cuBQL/builder/omp/spatialMedian.h
+++ b/cuBQL/builder/omp/spatialMedian.h
@@ -10,6 +10,8 @@
 namespace cuBQL {
   namespace omp {
 
+#define NO_SORT 1
+    
     struct PrimState {
       union {
         /* careful with this order - this is intentionally chosen such
@@ -266,6 +268,41 @@ namespace cuBQL {
       atomicMin(&node.doneNode.offset,offset);
     }
 
+    template<typename T, int D>
+    void writeLeafOffsets(Kernel kernel,
+                          TempNode<T,D> *nodes,
+                          uint32_t      *pNextFreeOffset,
+                          int            numNodes)
+    {
+      const int nodeID = kernel.workIdx();
+      if (nodeID >= numNodes) return;
+      
+      auto &node = nodes[nodeID].doneNode;
+      if (node.count > 0)
+        node.offset = atomicAdd(pNextFreeOffset,node.count)+node.count-1;
+    }
+
+    template<typename T, int D>
+    void writePrims(Kernel kernel,
+                    uint32_t      *bvh_primIDs,
+                    TempNode<T,D> *nodes,
+                    PrimState     *primStates,
+                    int            numPrims)
+    {
+      const int primID = kernel.workIdx();//threadIdx.x+blockIdx.x*blockDim.x;
+      if (primID >= numPrims) return;
+
+      auto &ps = primStates[primID];
+      if ((int)ps.nodeID < 0)
+        /* invalid prim, just skip here */
+        return;
+      auto &node = nodes[ps.nodeID].doneNode;
+      int myPos = atomicAdd(&node.offset,(uint32_t)-1);
+      if (myPos < 0 || myPos >= numPrims)
+        printf("OVERFLOW pri\n");
+      bvh_primIDs[myPos] = ps.primID;
+    }
+    
     /* writes main phase's temp nodes into final bvh.nodes[]
        layout. actual bounds of that will NOT yet bewritten */
     template<typename T, int D>
@@ -348,18 +385,48 @@ namespace cuBQL {
                        buildConfig);
         numDone = numNodes;
 
-#pragma omp target device(ctx->gpuID)
+#pragma omp target device(ctx->gpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes)
 #pragma omp teams distribute parallel for
         for (int tid=0;tid<numPrims;tid++)
           updatePrims(Kernel{tid},
                       nodeStates,tempNodes,
                       primStates,boxes,numPrims);
       }
+      
+      PING;
+      bvh.numPrims = numPrims;
+      bvh.primIDs = 0;
+      PING;
+      ctx->alloc(bvh.primIDs,numPrims);
+      PING;
+      auto bvh_primIDs = bvh.primIDs;
+      
+#if NO_SORT
+      uint32_t zero = 0u;
+      // set first uint there to zero; leaf writing code will
+      // atomically inc that value to determine leaf offset values.
+      ctx->upload(bvh_primIDs,&zero,1);
+#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(tempNodes)
+#pragma omp teams distribute parallel for
+      for (int tid=0;tid<numNodes;tid++)
+        writeLeafOffsets(Kernel{tid},
+                         tempNodes,
+                         bvh_primIDs,
+                         numNodes);
+      
+#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(tempNodes)
+#pragma omp teams distribute parallel for
+      for (int tid=0;tid<numPrims;tid++)
+        writePrims(Kernel{tid},
+                   bvh_primIDs,
+                   tempNodes,
+                   primStates,
+                   numPrims);
+#else
       // ==================================================================
       // sort {item,nodeID} list
       // ==================================================================
 
-      PING;
 #if 1
       std::cout << "host sort ..." << std::endl;
       PRINT(numPrims);
@@ -373,38 +440,32 @@ namespace cuBQL {
 #else
       std::cout << "openmp sort ..." << std::endl;
       ::omp::omp_target_sort((uint64_t*)primStates,numPrims,ctx->gpuID);
-#endif 
       PING;
       // ==================================================================
       // allocate and write BVH item list, and write offsets of leaf nodes
       // ==================================================================
-
+#endif
       PRINT(numNodes);
-      bvh.numPrims = numPrims;
-      bvh.primIDs = 0;
-      PING;
-      ctx->alloc(bvh.primIDs,numPrims);
-      PING;
-      auto primIDs = bvh.primIDs;
 #pragma omp target device(ctx->gpuID) is_device_ptr(primStates) is_device_ptr(tempNodes) is_device_ptr(primIDs)
 #pragma omp teams distribute parallel for
       for (int tid=0;tid<numPrims;tid++)
         writePrimsAndLeafOffsets(Kernel{tid},
                                  tempNodes,
-                                 primIDs,primStates,numPrims);
+                                 bvh_primIDs,primStates,numPrims);
       
       PING;
+#endif 
       // ==================================================================
       // allocate and write final nodes
       // ==================================================================
       bvh.numNodes = numNodes;
       ctx->alloc(bvh.nodes,numNodes);
       PING; PRINT(numNodes);
-      auto bvhNodes = bvh.nodes;
-#pragma omp target device(ctx->gpuID) is_device_ptr(bvhNodes) is_device_ptr(tempNodes)
+      auto bvh_nodes = bvh.nodes;
+#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_nodes) is_device_ptr(tempNodes)
 #pragma omp teams distribute parallel for
       for (int tid=0;tid<numNodes;tid++)
-        writeNodes(Kernel{tid},bvhNodes,tempNodes,numNodes);
+        writeNodes(Kernel{tid},bvh_nodes,tempNodes,numNodes);
       ctx->free(tempNodes);
       ctx->free(nodeStates);
       ctx->free(primStates);

From 7f9ce41cbf1295095599f27a87d1ef3b813c187d Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Tue, 27 Jan 2026 14:27:43 -0700
Subject: [PATCH 09/14] fix for atomic on amd

---
 cuBQL/builder/omp/AtomicBox.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cuBQL/builder/omp/AtomicBox.h b/cuBQL/builder/omp/AtomicBox.h
index 425beb9..035dafd 100644
--- a/cuBQL/builder/omp/AtomicBox.h
+++ b/cuBQL/builder/omp/AtomicBox.h
@@ -90,6 +90,10 @@ namespace cuBQL {
         if (wasChanged) break;
       }
 #endif
+#pragma omp atomic compare 
+      {
+        if (*ptr > value) *ptr = value;
+      }
     }
     
     /*! iw - note: this implementation of atomic min/max via atomic
@@ -108,6 +112,10 @@ namespace cuBQL {
         if (wasChanged) break;
       }
 #endif
+#pragma omp atomic compare 
+      {
+        if (*ptr < value) *ptr = value;
+      }
     }
     
     template<typename T, int D>

From 480890a245f364b8d7b17ed1463c783c72a83454 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Tue, 27 Jan 2026 14:30:46 -0700
Subject: [PATCH 10/14] fixes for atomics on arc

---
 cuBQL/builder/omp/AtomicBox.h | 12 +++++-------
 cuBQL/builder/omp/common.h    |  5 ++---
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/cuBQL/builder/omp/AtomicBox.h b/cuBQL/builder/omp/AtomicBox.h
index 425beb9..7571ce2 100644
--- a/cuBQL/builder/omp/AtomicBox.h
+++ b/cuBQL/builder/omp/AtomicBox.h
@@ -39,10 +39,9 @@ namespace cuBQL {
         if (wasChanged) break;
       }
 #else
+      float &x = *ptr;
 #pragma omp atomic compare 
-      {
-        if (*ptr > value) *ptr = value;
-      }
+      if (x > value) { x = value; }
 //       float t;
 // #pragma omp atomic capture
 //       { t = *ptr; *ptr = std::min(t,value); }
@@ -64,11 +63,10 @@ namespace cuBQL {
         if (wasChanged) break;
       }
 #else
+      float &x = *ptr;
 #pragma omp atomic compare 
-      {
-        if (*ptr < value) *ptr = value;
-      }
-//       float t;
+      if (x < value) { x = value; }
+        //       float t;
 // #pragma omp atomic capture
 //       { t = *ptr; *ptr = std::max(t,value); }
 #endif
diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h
index 2321b53..4729e90 100644
--- a/cuBQL/builder/omp/common.h
+++ b/cuBQL/builder/omp/common.h
@@ -56,10 +56,9 @@ namespace cuBQL {
     
     inline void atomicMin(uint32_t *ptr, uint32_t value)
     {
+      uint32_t &x = *ptr;
 #pragma omp atomic compare 
-      {
-        if (*ptr > value) *ptr = value;
-      }
+        if (x > value) { x = value; }
       // uint32_t t;
 // #pragma omp atomic capture
 //       { t = *ptr; *ptr = std::min(t,value); }

From a77b80a242599d6208dd5a5a19fa55c2571d77b2 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Mon, 2 Feb 2026 15:08:08 -0700
Subject: [PATCH 11/14] added openmp backend

---
 CMakeLists.txt       |  4 +---
 testing/omp_sort.cpp | 35 -----------------------------------
 2 files changed, 1 insertion(+), 38 deletions(-)
 delete mode 100644 testing/omp_sort.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a32e0da..6a1653e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -140,6 +140,4 @@ if (NOT CUBQL_IS_SUBPROJECT)
   add_subdirectory(samples)
 endif()
 
-if (CUBQL_OMP)
-  add_subdirectory(testing)
-endif()
+#add_subdirectory(testing)
diff --git a/testing/omp_sort.cpp b/testing/omp_sort.cpp
deleted file mode 100644
index 8227709..0000000
--- a/testing/omp_sort.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
-// CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-#include "cuBQL/bvh.h"
-#include "cuBQL/builder/omp/sort.h"
-#include <iostream>
-
-int main(int ac, char **av)
-{
-  cuBQL::omp::Context omp(0);
-  
-  int N = 13;
-  // int N = 123453;
-  std::vector<int> inputs(N);
-  for (int i=0;i<N;i++) {
-    inputs[i] = 90-i;//random() % 100;
-    if (inputs[i] < 10) inputs[i] += 20;
-  }
-
-  int *d_data = 0;
-  omp.alloc_and_upload(d_data,inputs);
-  printf("d_data %p\n",d_data);
-
-  cuBQL::omp::omp_target_sort(d_data,N,omp.deviceID);
-
-  std::vector<int> results
-    = omp.download_vector(d_data,N);
-  for (int i=1;i<results.size();i++) {
-    PRINT(results[i]);
-    if (results[i-1] > results[i])
-      throw std::runtime_error("Not sorted...");
-  }
-  std::cout << "sorted - perfect!" << std::endl;
-}

From 60cbf1ff71888d5530fadbb22acf41e21215fa3e Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Mon, 2 Feb 2026 15:10:22 -0700
Subject: [PATCH 12/14] bumped version to 1.2 to mark new openmp builder

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1ffa826..40a0ca0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@ cmake_minimum_required(VERSION 3.16)
 
 cmake_policy(SET CMP0048 NEW)
 set(CMAKE_BUILD_TYPE_INIT "Release")
-project(cuBQL VERSION 1.1.0 LANGUAGES C CXX)
+project(cuBQL VERSION 1.2.0 LANGUAGES C CXX)
 
 if (CUBQL_DISABLE_CUDA)
   message("#cuBQL: CUDA _DISABLED_ by user request")

From b3bec662c583aa5ce006da3989ed43254ba3d49c Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Mon, 2 Feb 2026 15:41:08 -0700
Subject: [PATCH 13/14] various cleanups

---
 cuBQL/builder/omp/refit.h         | 46 +++----------------------------
 cuBQL/builder/omp/spatialMedian.h |  1 -
 2 files changed, 4 insertions(+), 43 deletions(-)

diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h
index 2b6c13b..109654e 100644
--- a/cuBQL/builder/omp/refit.h
+++ b/cuBQL/builder/omp/refit.h
@@ -10,7 +10,7 @@ namespace cuBQL {
   namespace omp {
     
     template<typename T, int D> inline
-    void refit_init_x(Kernel kernel,
+    void refit_init(Kernel kernel,
                     typename BinaryBVH<T,D>::Node *nodes,
                     uint32_t              *refitData,
                     int numNodes)
@@ -23,17 +23,12 @@ namespace cuBQL {
       node.bounds = box_t<T,D>();
       if (node.admin.count) return;
 
-      // if (node.admin.offset < 0)
-      //   { printf("BAD OFFSET IN INIT(1)\n"); return; }
-      // if (node.admin.offset+1 >= numNodes)
-      //   { printf("BAD OFFSET IN INIT(2)\n"); return; }
       refitData[node.admin.offset+0] = nodeID << 2;
       refitData[node.admin.offset+1] = nodeID << 2;
     }
     
     template<typename T, int D> inline 
-    void refit_run_x(Kernel kernel,
-                     // BinaryBVH<T,D> bvh,
+    void refit_run(Kernel kernel,
                      uint32_t *bvh_primIDs,
                      typename BinaryBVH<T,D>::Node *bvh_nodes,
                      uint32_t *refitData,
@@ -45,12 +40,10 @@ namespace cuBQL {
 
       typename BinaryBVH<T,D>::Node *node = bvh_nodes+nodeID;
 
-      // printf("begin nodeID %i\n",nodeID);
       if (node->admin.count == 0)
         // this is a inner node - exit
         return;
 
-      // printf(" -> leaf %i cnt %i ofs %i\n",nodeID,node->admin.count,node->admin.offset);
       box_t<T,D> bounds; bounds.set_empty();
       for (int i=0;i<node->admin.count;i++) {
         const box_t<T,D> primBox = boxes[bvh_primIDs[node->admin.offset+i]];
@@ -61,9 +54,7 @@ namespace cuBQL {
       
       int parentID = (refitData[nodeID] >> 2);
       while (true) {
-        // printf("parentID %i\n",parentID);
         atomic_grow(*(AtomicBox<box_t<T,D>> *)&node->bounds,bounds);
-        // node->bounds = bounds;
           
         if (node == bvh_nodes)
           break;
@@ -98,49 +89,20 @@ namespace cuBQL {
         = (uint32_t*)ctx->alloc(numNodes*sizeof(uint32_t));
       auto bvh_nodes = bvh.nodes;
       auto bvh_primIDs = bvh.primIDs;
-      PING;
-      PRINT(numNodes);
-      PRINT(bvh.numNodes);
       {
 #pragma omp target device(ctx->gpuID) is_device_ptr(refitData) is_device_ptr(bvh_nodes) 
 #pragma omp teams distribute parallel for
         for (int i=0;i<numNodes;i++)
-          refit_init_x<T,D>(Kernel{i},bvh_nodes,refitData,numNodes);
+          refit_init<T,D>(Kernel{i},bvh_nodes,refitData,numNodes);
       }
-      PING;
 
-
-#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_nodes)
-      for (int i=0;i<numNodes;i++)
-        if (bvh_nodes[i].admin.count != 0 && bvh_nodes[i].admin.offset < 0)
-          printf("REFIT INVALID OFFSET IN FINAL NODE %i\n",i);
-
-      
-      PING;
       {
-#if 0
-        int nb = 1;//128;
-#pragma omp target  teams num_teams(nb) device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes)
-        {
-          int team = omp_get_team_num();
-          int nteams = omp_get_num_teams();
-          int beg = ((team+0)*numNodes)/nteams;
-          int end = ((team+1)*numNodes)/nteams;
-          for (int i=beg;i<end;i++)
-            refit_run_x(Kernel{i},//bvh,
-                        bvh_primIDs,bvh_nodes,refitData,boxes,numNodes);
-        }
-        nb = nb;
-      }
-#else
 #pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes) 
 #pragma omp teams distribute parallel for 
       for (int i=0;i<numNodes;i++)
-        refit_run_x(Kernel{i},//bvh,
+        refit_run(Kernel{i},bvh,
                     bvh_primIDs,bvh_nodes,refitData,boxes,numNodes);
       }
-#endif
-      PING;
       ctx->free((void*)refitData);
     }
     
diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h
index 075d036..5cca265 100644
--- a/cuBQL/builder/omp/spatialMedian.h
+++ b/cuBQL/builder/omp/spatialMedian.h
@@ -410,7 +410,6 @@ namespace cuBQL {
       // ==================================================================
       bvh.numNodes = numNodes;
       ctx->alloc(bvh.nodes,numNodes);
-      PING; PRINT(numNodes);
       auto bvh_nodes = bvh.nodes;
 #pragma omp target device(ctx->gpuID) is_device_ptr(bvh_nodes) is_device_ptr(tempNodes)
 #pragma omp teams distribute parallel for

From 974484833d17b44426399ac5eb7943cc96040e35 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Mon, 2 Feb 2026 15:41:35 -0700
Subject: [PATCH 14/14] various cleanups

---
 cuBQL/builder/omp/refit.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h
index 109654e..2094554 100644
--- a/cuBQL/builder/omp/refit.h
+++ b/cuBQL/builder/omp/refit.h
@@ -100,7 +100,7 @@ namespace cuBQL {
 #pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes) 
 #pragma omp teams distribute parallel for 
       for (int i=0;i<numNodes;i++)
-        refit_run(Kernel{i},bvh,
+        refit_run(Kernel{i},
                     bvh_primIDs,bvh_nodes,refitData,boxes,numNodes);
       }
       ctx->free((void*)refitData);