NVIDIA · ingowald · Jan 21, 2026 · Jan 21, 2026 · Jan 23, 2026 · Jan 24, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -5,8 +5,11 @@ cmake_minimum_required(VERSION 3.16)
 
 cmake_policy(SET CMP0048 NEW)
 set(CMAKE_BUILD_TYPE_INIT "Release")
-project(cuBQL VERSION 1.1.0 LANGUAGES C CXX)
+project(cuBQL VERSION 1.2.0 LANGUAGES C CXX)
 
+if (CUBQL_OMP)
+  set(CUBQL_DISABLE_CUDA ON)
+endif()
 if (CUBQL_DISABLE_CUDA)
   message("#cuBQL: CUDA _DISABLED_ by user request")
   set(CUBQL_HAVE_CUDA OFF)
@@ -136,3 +139,5 @@ add_subdirectory(cuBQL)
 if (NOT CUBQL_IS_SUBPROJECT)
   add_subdirectory(samples)
 endif()
+
+#add_subdirectory(testing)
diff --git a/cuBQL/builder/cuda/sm_builder.h b/cuBQL/builder/cuda/sm_builder.h
@@ -515,10 +515,10 @@ namespace cuBQL {
       while (true) {
         CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes,
                                     sizeof(numNodes),cudaMemcpyDeviceToHost,s));
-        CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
-        CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
         if (numNodes == numDone)
           break;
+        CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
+        CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
 #if CUBQL_PROFILE
         t_nodePass[pass].sync_start();
 #endif
@@ -529,7 +529,7 @@ namespace cuBQL {
 #if CUBQL_PROFILE
         t_nodePass[pass].sync_stop();
         t_primPass[pass].sync_start();
-#endif        
+#endif
         numDone = numNodes;
 
 // #if 1

diff --git a/cuBQL/builder/omp.h b/cuBQL/builder/omp.h
@@ -0,0 +1,19 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace cuBQL {
+  namespace omp {
+    struct Context;
+
+    template<typename T, int D>
+    void refit(BinaryBVH<T,D>    &bvh,
+               const box_t<T,D>  *boxes,
+               Context *ctx);
+  }
+}
+
+#include "cuBQL/builder/omp/refit.h"
+#include "cuBQL/builder/omp/spatialMedian.h"
+
diff --git a/cuBQL/builder/omp/AtomicBox.h b/cuBQL/builder/omp/AtomicBox.h
@@ -0,0 +1,168 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "cuBQL/builder/omp/common.h"
+
+
+namespace cuBQL {
+  namespace omp {
+
+    /*! a `box_t` that the atomic_grow() functions in this file
+        operate on; it adds no data members of its own, only a way to
+        reset the underlying box */
+    template<typename box_t>
+    struct AtomicBox : public box_t {
+
+      /*! overwrites the box_t base sub-object with a freshly
+          default-constructed box_t (presumably the 'empty' box -
+          that depends on box_t's default constructor) */
+      inline void set_empty()
+      {
+        box_t &asBox = *this;
+        asBox = box_t();
+      }
+    };
+
+    /*! generic declaration of scalar atomic min: `*ptr =
+        min(*ptr,v)`. Only declared here; this file provides a
+        (non-template) overload for float below */
+    template<typename T>
+    inline void atomic_min(T *ptr, T v);
+    /*! generic declaration of scalar atomic max: `*ptr =
+        max(*ptr,v)`. Only declared here; this file provides a
+        (non-template) overload for float below */
+    template<typename T>
+    inline void atomic_max(T *ptr, T v);
+
+    /*! atomically lowers `*ptr` to `value` if `value` is smaller,
+        i.e., `*ptr = min(*ptr,value)`.
+
+        iw - note: compilers that define __NVCOMPILER get a loop
+        around an atomic exchange (`omp atomic capture`), with a
+        disabled alternative based on std::atomic compare-exchange
+        (CAS); everybody else gets `omp atomic compare`. This is
+        certainly not optimal on any sort of modern GPU, but it's
+        what we do for now */
+    inline void atomic_min(float *ptr, float value)
+    {
+#ifdef __NVCOMPILER
+# if 1
+      float &mem = *ptr;
+      // plain (non-atomic) read as early-out: memory is already
+      // small enough, nothing to write
+      if (mem <= value) return;
+      while (1) {
+        float wasBefore;
+        // atomically swap our candidate into memory, and fetch what
+        // was stored there before
+#pragma omp atomic capture
+        { wasBefore = mem; mem = value; }
+        // what we replaced was not smaller than what we wrote: done
+        if (wasBefore >= value) break;
+        // we just overwrote a smaller value; go around again to
+        // write that one back. NOTE: until then, memory holds a
+        // value that is larger than the true minimum
+        value = wasBefore;
+      }
+# else
+      // disabled alternative: CAS loop that treats the float's bits
+      // as an int; a failed exchange refreshes `current`, and the
+      // loop condition is then re-checked against the new contents
+      float current = *(volatile float *)ptr;
+      while (current > value) {
+        bool wasChanged
+          = ((std::atomic<int>*)ptr)
+          ->compare_exchange_weak((int&)current,(int&)value);
+        if (wasChanged) break;
+      }
+# endif
+#else
+      float &x = *ptr;
+      // compare-and-conditionally-store as one atomic operation
+#pragma omp atomic compare 
+      if (x > value) { x = value; }
+      // (earlier variant, kept for reference:)
+//       float t;
+// #pragma omp atomic capture
+//       { t = *ptr; *ptr = std::min(t,value); }
+#endif
+    }
+
+    /*! atomically raises `*ptr` to `value` if `value` is larger,
+        i.e., `*ptr = max(*ptr,value)`.
+
+        iw - note: compilers that define __NVCOMPILER get a loop
+        around an atomic exchange (`omp atomic capture`), with a
+        disabled alternative based on std::atomic compare-exchange
+        (CAS); everybody else gets `omp atomic compare`. This is
+        certainly not optimal on any sort of modern GPU, but it's
+        what we do for now */
+    inline void atomic_max(float *ptr, float value)
+    { 
+#ifdef __NVCOMPILER
+# if 1
+      float &mem = *ptr;
+      // plain (non-atomic) read as early-out: memory is already
+      // large enough, nothing to write
+      if (mem >= value) return;
+      while (1) {
+        float wasBefore;
+        // atomically swap our candidate into memory, and fetch what
+        // was stored there before
+#pragma omp atomic capture
+        { wasBefore = mem; mem = value; }
+        // what we replaced was not larger than what we wrote: done
+        if (wasBefore <= value) break;
+        // we just overwrote a larger value; go around again to write
+        // that one back. NOTE: until then, memory holds a value that
+        // is smaller than the true maximum
+        value = wasBefore;
+      }
+# else
+      // disabled alternative: CAS loop that treats the float's bits
+      // as an int; a failed exchange refreshes `current`, and the
+      // loop condition is then re-checked against the new contents
+      float current = *(volatile float *)ptr;
+      while (current < value) {
+        bool wasChanged
+          = ((std::atomic<int>*)ptr)
+          ->compare_exchange_weak((int&)current,(int&)value);
+        if (wasChanged) break;
+      }
+# endif
+#else
+      float &x = *ptr;
+      // compare-and-conditionally-store as one atomic operation
+#pragma omp atomic compare 
+      if (x < value) { x = value; }
+      // (earlier variant, kept for reference:)
+//       float t;
+// #pragma omp atomic capture
+//       { t = *ptr; *ptr = std::max(t,value); }
+#endif
+    }
+
+    /*! generic declaration of the per-component atomic min on
+        vectors; this file defines overloads for D=2, D=3, and D=4
+        below */
+    template<typename T, int D>
+    inline void v_atomic_min(vec_t<T,D> *ptr, vec_t<T,D> v);
+    /*! generic declaration of the per-component atomic max on
+        vectors; this file defines overloads for D=2, D=3, and D=4
+        below */
+    template<typename T, int D>
+    inline void v_atomic_max(vec_t<T,D> *ptr, vec_t<T,D> v);
+
+
+    /*! per-component atomic min for 2D vectors: lowers `ptr->x` and
+        `ptr->y` to the matching component of `v`, with one
+        atomic_min() call per component (so the vector as a whole is
+        not updated in a single atomic step) */
+    template<typename T>
+    inline void v_atomic_min(vec_t<T,2> *ptr, vec_t<T,2> v)
+    {
+      vec_t<T,2> &dst = *ptr;
+      atomic_min(&dst.x,v.x);
+      atomic_min(&dst.y,v.y);
+    }
+
+    /*! per-component atomic min for 3D vectors: lowers `ptr->x`,
+        `ptr->y`, and `ptr->z` to the matching component of `v`, with
+        one atomic_min() call per component (so the vector as a whole
+        is not updated in a single atomic step) */
+    template<typename T>
+    inline void v_atomic_min(vec_t<T,3> *ptr, vec_t<T,3> v)
+    {
+      vec_t<T,3> &dst = *ptr;
+      atomic_min(&dst.x,v.x);
+      atomic_min(&dst.y,v.y);
+      atomic_min(&dst.z,v.z);
+    }
+
+    /*! per-component atomic min for 4D vectors: lowers `ptr->x`,
+        `ptr->y`, `ptr->z`, and `ptr->w` to the matching component of
+        `v`, with one atomic_min() call per component (so the vector
+        as a whole is not updated in a single atomic step) */
+    template<typename T>
+    inline void v_atomic_min(vec_t<T,4> *ptr, vec_t<T,4> v)
+    {
+      vec_t<T,4> &dst = *ptr;
+      atomic_min(&dst.x,v.x);
+      atomic_min(&dst.y,v.y);
+      atomic_min(&dst.z,v.z);
+      atomic_min(&dst.w,v.w);
+    }
+
+    /*! per-component atomic max for 2D vectors: raises `ptr->x` and
+        `ptr->y` to the matching component of `v`, with one
+        atomic_max() call per component (so the vector as a whole is
+        not updated in a single atomic step) */
+    template<typename T>
+    inline void v_atomic_max(vec_t<T,2> *ptr, vec_t<T,2> v)
+    {
+      vec_t<T,2> &dst = *ptr;
+      atomic_max(&dst.x,v.x);
+      atomic_max(&dst.y,v.y);
+    }
+
+    /*! per-component atomic max for 3D vectors: raises `ptr->x`,
+        `ptr->y`, and `ptr->z` to the matching component of `v`, with
+        one atomic_max() call per component (so the vector as a whole
+        is not updated in a single atomic step) */
+    template<typename T>
+    inline void v_atomic_max(vec_t<T,3> *ptr, vec_t<T,3> v)
+    {
+      vec_t<T,3> &dst = *ptr;
+      atomic_max(&dst.x,v.x);
+      atomic_max(&dst.y,v.y);
+      atomic_max(&dst.z,v.z);
+    }
+
+    /*! per-component atomic max for 4D vectors: raises `ptr->x`,
+        `ptr->y`, `ptr->z`, and `ptr->w` to the matching component of
+        `v`, with one atomic_max() call per component (so the vector
+        as a whole is not updated in a single atomic step) */
+    template<typename T>
+    inline void v_atomic_max(vec_t<T,4> *ptr, vec_t<T,4> v)
+    {
+      vec_t<T,4> &dst = *ptr;
+      atomic_max(&dst.x,v.x);
+      atomic_max(&dst.y,v.y);
+      atomic_max(&dst.z,v.z);
+      atomic_max(&dst.w,v.w);
+    }
+
+    /*! atomically extends `ab` to include point `P`: `ab.lower` is
+        min'ed with `P` and `ab.upper` is max'ed with `P`, component
+        by component */
+    template<typename box_t>
+    inline void atomic_grow(AtomicBox<box_t> &ab, typename box_t::vec_t P)
+    {
+      auto *lowerCorner = &ab.lower;
+      auto *upperCorner = &ab.upper;
+      v_atomic_min(lowerCorner,P);
+      v_atomic_max(upperCorner,P);
+    }
+
+    /*! atomically extends `ab` to include box `B`: `ab.lower` is
+        min'ed with `B.lower` and `ab.upper` is max'ed with
+        `B.upper`, component by component */
+    template<typename box_t>
+    inline void atomic_grow(AtomicBox<box_t> &ab, box_t B)
+    {
+      auto *lowerCorner = &ab.lower;
+      auto *upperCorner = &ab.upper;
+      v_atomic_min(lowerCorner,B.lower);
+      v_atomic_max(upperCorner,B.upper);
+    }
+
+  }
+}
diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h
@@ -0,0 +1,162 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "cuBQL/bvh.h"
+#include <omp.h>
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+namespace cuBQL {
+  namespace omp {
+
+    /*! bundles the two OpenMP device numbers that all target memory
+        operations below need: the device to allocate on/copy to
+        (`gpuID`), and the initial (host) device (`hostID`) */
+    struct Context {
+      /*! creates a context for OpenMP device number `gpuID`; the
+          host device number is queried via omp_get_initial_device() */
+      Context(int gpuID);
+
+      /*! allocates `numBytes` bytes on device `gpuID` (via
+          omp_target_alloc) and returns the device pointer */
+      void *alloc(size_t numBytes);
+
+      /*! allocates device memory for `Nelements` elements of type T
+          and stores the device pointer in `d_data` */
+      template<typename T>
+      void alloc(T *&d_data, size_t Nelements);
+
+      /*! allocates device memory for `Nelements` elements of type T,
+          stores the device pointer in `d_data`, and copies
+          `Nelements` elements from host array `h_data` into it */
+      template<typename T>
+      void alloc_and_upload(T *&d_data, const T *h_data, size_t Nelements);
+
+      /*! copies `Nelements` elements from host array `h_data` to the
+          (already allocated) device array `d_data` */
+      template<typename T>
+      void upload(T *d_data, const T *h_data, size_t Nelements);
+
+      /*! same as the pointer-based alloc_and_upload(), with host
+          data and element count taken from `h_vector` */
+      template<typename T>
+      void alloc_and_upload(T *&d_data, const std::vector<T> &h_vector);
+
+      /*! copies `N` elements from device array `d_data` into a newly
+          created host vector, and returns that vector */
+      template<typename T>
+      std::vector<T> download_vector(const T *d_data, size_t N);
+
+      /*! copies a single T from device address `d_value` into host
+          variable `h_value` */
+      template<typename T>
+      void download(T &h_value, T *d_value);
+
+      /*! frees device memory on device `gpuID` (via omp_target_free) */
+      void free(void *);
+
+      /*! OpenMP device number used for allocations and as the
+          device side of all copies */
+      int gpuID;
+      /*! OpenMP device number of the host, as returned by
+          omp_get_initial_device() */
+      int hostID;
+    };
+
+    /*! carries the index of the work item a kernel invocation is to
+        process; who fills in `_workIdx` is not visible in this file */
+    struct Kernel {
+      /*! the stored work index; read it through workIdx() */
+      int _workIdx;
+
+      /*! returns the work index stored in this object */
+      inline int workIdx() const
+      {
+        return _workIdx;
+      }
+    };
+
+    /*! atomically adds `inc` to `*ptr`, and returns the value that
+        `*ptr` held before the addition */
+    inline uint32_t atomicAdd(uint32_t *ptr, uint32_t inc)
+    {
+#ifdef __NVCOMPILER
+      // this path accesses the same memory through a
+      // std::atomic<int>, casting between uint32_t and int on the
+      // way in and out
+      return (uint32_t)((std::atomic<int> *)ptr)->fetch_add((int)inc);
+#else
+      uint32_t t;
+      // reading the old value and adding to it happen as one atomic
+      // operation
+#pragma omp atomic capture
+      { t = *ptr; *ptr += inc; }
+      // (std::atomic alternative, as used in the __NVCOMPILER path:)
+      // return ((std::atomic<int> *)ptr)->fetch_add(inc);
+      return t;
+#endif
+    }
+
+
+    // ##################################################################
+    // IMPLEMENTATION SECTION
+    // ##################################################################
+    /*! creates a context for OpenMP device number `gpuID`, and
+        queries the host device number via omp_get_initial_device().
+        Prints a one-line summary of the chosen devices to stdout.
+
+        Marked `inline` because this is a non-template definition in
+        a header; without it, including this header from more than
+        one translation unit produces multiple-definition link
+        errors.
+
+        \param gpuID OpenMP device number to use; checked (in builds
+                     with assertions enabled) to be less than
+                     omp_get_num_devices() */
+    inline Context::Context(int gpuID)
+      : gpuID(gpuID),
+        hostID(omp_get_initial_device())
+    {
+      assert(gpuID < omp_get_num_devices());
+      printf("#cuBQL:omp:Context(gpu=%i/%i,host=%i)\n",
+             gpuID,omp_get_num_devices(),hostID);
+    }
+
+    /*! allocates `numBytes` bytes on device `gpuID` via
+        omp_target_alloc(), and returns whatever pointer that call
+        returns (no check for allocation failure is done here).
+
+        Marked `inline` because this is a non-template definition in
+        a header; without it, including this header from more than
+        one translation unit produces multiple-definition link
+        errors. */
+    inline void *Context::alloc(size_t numBytes)
+    { return omp_target_alloc(numBytes,gpuID); }
+
+    /*! copies `N` elements of type T from host array `h_data` to
+        device array `d_data`, which must already be allocated
+        (asserted to be non-null). The return code of
+        omp_target_memcpy() is not checked. */
+    template<typename T> inline
+    void Context::upload(T *d_data,
+                         const T *h_data,
+                         size_t N)
+    {
+      assert(d_data);
+      const size_t numBytes = N*sizeof(T);
+      omp_target_memcpy(d_data,h_data,numBytes,
+                        /* dst offset */0,/* src offset */0,
+                        /* dst device */gpuID,/* src device */hostID);
+    }
+
+    /*! allocates device memory for `N` elements of type T on device
+        `gpuID`, stores the device pointer in `d_data`, and then
+        copies `N` elements from host array `h_data` into it via
+        upload(). Also prints the element count, device number, and
+        resulting pointer to stdout.
+
+        \param d_data receives the newly allocated device pointer
+        \param h_data host array to copy from
+        \param N      number of elements (not bytes) */
+    template<typename T> inline
+    void Context::alloc_and_upload(T *&d_data,
+                                   const T *h_data,
+                                   size_t N)
+    {
+      // N is a size_t, which needs %zu (%li is only correct where
+      // long and size_t happen to have the same size)
+      printf("target_alloc N %zu gpu %i\n",N,gpuID);
+      d_data = (T *)omp_target_alloc(N*sizeof(T),gpuID);
+      // %p expects a pointer to void, so convert explicitly
+      printf("ptr %p\n",static_cast<const void *>(d_data));
+      upload(d_data,h_data,N);
+    }
+
+    /*! convenience overload: allocates device memory for all
+        elements of `h_vector`, stores the device pointer in
+        `d_data`, and uploads the vector's contents - by forwarding
+        to the pointer-and-count overload */
+    template<typename T> inline
+    void Context::alloc_and_upload(T *&d_data,
+                                   const std::vector<T> &h_vector)
+    {
+      const T     *h_data = h_vector.data();
+      const size_t N      = h_vector.size();
+      alloc_and_upload(d_data,h_data,N);
+    }
+
+    /*! copies `N` elements of type T from device array `d_data`
+        into a newly created host vector, and returns that vector.
+        The return code of omp_target_memcpy() is not checked.
+
+        NOTE(review): the PRINT() calls look like left-over debug
+        output; PRINT is not defined in this file, so what exactly
+        they emit depends on that macro - confirm before removing */
+    template<typename T>
+    std::vector<T> Context::download_vector(const T *d_data, size_t N)
+    {
+      PRINT(N);
+      PRINT(d_data);
+
+      // host-side destination, sized to hold all N elements
+      std::vector<T> out(N);
+      PRINT(out.data());
+      PRINT(sizeof(T));
+      // destination is the host (hostID), source is the device (gpuID)
+      omp_target_memcpy(out.data(),d_data,N*sizeof(T),
+                        0,0,hostID,gpuID);
+      return out;
+    }
+
+    /*! releases device memory at `ptr` on device `gpuID`, via
+        omp_target_free() */
+    inline void Context::free(void *ptr)
+    {
+      const int device = gpuID;
+      omp_target_free(ptr,device);
+    }
+
+    /*! typed allocation: reserves room for `N` elements of type T
+        on device `gpuID`, and stores the resulting device pointer
+        in `d_data` (no check for allocation failure is done here) */
+    template<typename T> inline
+    void Context::alloc(T *&d_data, size_t N)
+    {
+      const size_t numBytes = N*sizeof(T);
+      void *const rawMem = omp_target_alloc(numBytes,gpuID);
+      d_data = (T*)rawMem;
+    }
+
+    // template<typename T> inline
+    // void Context::alloc_and_upload(T *&d_data,
+    //                                const T *h_data,
+    //                                size_t N)
+    // {
+    //   alloc(d_data,N);
+    //   upload(d_data,h_data,N);
+    // }
+
+    // template<typename T> inline
+    // void Context::alloc_and_upload(T *&d_data,
+    //                                const std::vector<T> &h_vector)
+    // {
+    //   alloc(d_data,h_vector.size());
+    //   upload(d_data,h_vector);
+    // }
+
+    // template<typename T> inline
+    // std::vector<T> Context::download_vector(const T *d_data,
+    //                                         size_t N)
+    // {
+    //   std::vector<T> vec(N);
+    //   omp_target_memcpy(vec.data(),d_data,N*sizeof(T),
+    //                     0,0,hostID,gpuID);
+    //   return vec;
+    // }
+
+    /*! copies a single value of type T from device address
+        `d_value` into host variable `h_value`. The return code of
+        omp_target_memcpy() is not checked. */
+    template<typename T>
+    inline void Context::download(T &h_value, T *d_value)
+    {
+      const size_t numBytes = sizeof(T);
+      omp_target_memcpy(&h_value,d_value,numBytes,
+                        /* dst offset */0,/* src offset */0,
+                        /* dst device */hostID,/* src device */gpuID);
+    }
+
+
+  } // ::cuBQL::omp
+} // ::cuBQL