diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ffa826..8059a18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,8 +5,11 @@ cmake_minimum_required(VERSION 3.16) cmake_policy(SET CMP0048 NEW) set(CMAKE_BUILD_TYPE_INIT "Release") -project(cuBQL VERSION 1.1.0 LANGUAGES C CXX) +project(cuBQL VERSION 1.2.0 LANGUAGES C CXX) +if (CUBQL_OMP) + set(CUBQL_DISABLE_CUDA ON) +endif() if (CUBQL_DISABLE_CUDA) message("#cuBQL: CUDA _DISABLED_ by user request") set(CUBQL_HAVE_CUDA OFF) @@ -136,3 +139,5 @@ add_subdirectory(cuBQL) if (NOT CUBQL_IS_SUBPROJECT) add_subdirectory(samples) endif() + +#add_subdirectory(testing) diff --git a/cuBQL/builder/cuda/sm_builder.h b/cuBQL/builder/cuda/sm_builder.h index 7d8acb1..2442048 100644 --- a/cuBQL/builder/cuda/sm_builder.h +++ b/cuBQL/builder/cuda/sm_builder.h @@ -515,10 +515,10 @@ namespace cuBQL { while (true) { CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes, sizeof(numNodes),cudaMemcpyDeviceToHost,s)); - CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s)); - CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent)); if (numNodes == numDone) break; + CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s)); + CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent)); #if CUBQL_PROFILE t_nodePass[pass].sync_start(); #endif @@ -529,7 +529,7 @@ namespace cuBQL { #if CUBQL_PROFILE t_nodePass[pass].sync_stop(); t_primPass[pass].sync_start(); -#endif +#endif numDone = numNodes; // #if 1 diff --git a/cuBQL/builder/omp.h b/cuBQL/builder/omp.h new file mode 100644 index 0000000..17ba4c2 --- /dev/null +++ b/cuBQL/builder/omp.h @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace cuBQL { + namespace omp { + struct Context; + + template + void refit(BinaryBVH &bvh, + const box_t *boxes, + Context *ctx); + } +} + +#include "cuBQL/builder/omp/refit.h" +#include "cuBQL/builder/omp/spatialMedian.h" + diff --git a/cuBQL/builder/omp/AtomicBox.h b/cuBQL/builder/omp/AtomicBox.h new file mode 100644 index 0000000..9ffac06 --- /dev/null +++ b/cuBQL/builder/omp/AtomicBox.h @@ -0,0 +1,168 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cuBQL/builder/omp/common.h" + + +namespace cuBQL { + namespace omp { + + template + struct AtomicBox : public box_t { + + inline void set_empty() + { + *(box_t *)this = box_t(); + } + }; + + template + inline void atomic_min(T *ptr, T v); + template + inline void atomic_max(T *ptr, T v); + + /*! 
iw - note: this implementation of atomic min/max via atomic + compare-exchange (CAS); which is cetainly not optimal on any + sort of modern GPU - but it works in any C++-21 compliant + compiler, so it's what we do for now */ + inline void atomic_min(float *ptr, float value) + { +#ifdef __NVCOMPILER +# if 1 + float &mem = *ptr; + if (mem <= value) return; + while (1) { + float wasBefore; +#pragma omp atomic capture + { wasBefore = mem; mem = value; } + if (wasBefore >= value) break; + value = wasBefore; + } +# else + float current = *(volatile float *)ptr; + while (current > value) { + bool wasChanged + = ((std::atomic*)ptr) + ->compare_exchange_weak((int&)current,(int&)value); + if (wasChanged) break; + } +# endif +#else + float &x = *ptr; +#pragma omp atomic compare + if (x > value) { x = value; } +// float t; +// #pragma omp atomic capture +// { t = *ptr; *ptr = std::min(t,value); } +#endif + } + + /*! iw - note: this implementation of atomic min/max via atomic + compare-exchange (CAS); which is cetainly not optimal on any + sort of modern GPU - but it works in any C++-21 compliant + compiler, so it's what we do for now */ + inline void atomic_max(float *ptr, float value) + { +#ifdef __NVCOMPILER +# if 1 + float &mem = *ptr; + if (mem >= value) return; + while (1) { + float wasBefore; +#pragma omp atomic capture + { wasBefore = mem; mem = value; } + if (wasBefore <= value) break; + value = wasBefore; + } +# else + float current = *(volatile float *)ptr; + while (current < value) { + bool wasChanged + = ((std::atomic*)ptr) + ->compare_exchange_weak((int&)current,(int&)value); + if (wasChanged) break; + } +# endif +#else + float &x = *ptr; +#pragma omp atomic compare + if (x < value) { x = value; } + // float t; +// #pragma omp atomic capture +// { t = *ptr; *ptr = std::max(t,value); } +#endif + } + + template + inline void v_atomic_min(vec_t *ptr, vec_t v); + template + inline void v_atomic_max(vec_t *ptr, vec_t v); + + + template + inline void v_atomic_min(vec_t *ptr, vec_t v) + { + atomic_min(&ptr->x,v.x); + atomic_min(&ptr->y,v.y); + } + + template + inline void v_atomic_min(vec_t *ptr, vec_t v) + { + atomic_min(&ptr->x,v.x); + atomic_min(&ptr->y,v.y); + atomic_min(&ptr->z,v.z); + } + + template + inline void v_atomic_min(vec_t *ptr, vec_t v) + { + atomic_min(&ptr->x,v.x); + atomic_min(&ptr->y,v.y); + atomic_min(&ptr->z,v.z); + atomic_min(&ptr->w,v.w); + } + + template + inline void v_atomic_max(vec_t *ptr, vec_t v) + { + atomic_max(&ptr->x,v.x); + atomic_max(&ptr->y,v.y); + } + + template + inline void v_atomic_max(vec_t *ptr, vec_t v) + { + atomic_max(&ptr->x,v.x); + atomic_max(&ptr->y,v.y); + atomic_max(&ptr->z,v.z); + } + + template + inline void v_atomic_max(vec_t *ptr, vec_t v) + { + atomic_max(&ptr->x,v.x); + atomic_max(&ptr->y,v.y); + atomic_max(&ptr->z,v.z); + atomic_max(&ptr->w,v.w); + } + + template + inline void atomic_grow(AtomicBox &ab, typename box_t::vec_t P) + { + v_atomic_min(&ab.lower,P); + v_atomic_max(&ab.upper,P); + } + + template + inline void atomic_grow(AtomicBox &ab, box_t B) + { + v_atomic_min(&ab.lower,B.lower); + v_atomic_max(&ab.upper,B.upper); + } + + } +} diff --git a/cuBQL/builder/omp/common.h b/cuBQL/builder/omp/common.h new file mode 100644 index 0000000..33b8be1 --- /dev/null +++ b/cuBQL/builder/omp/common.h @@ -0,0 +1,162 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. 
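A minimal host-side sketch of the CAS loop that the atomic_min/atomic_max comment in AtomicBox.h above describes, written against std::atomic<float>; the function name cas_atomic_min and the 'slot' argument are illustrative only, not part of the patch:

    #include <atomic>

    inline void cas_atomic_min(std::atomic<float> &slot, float value)
    {
      float current = slot.load(std::memory_order_relaxed);
      // keep retrying until either the stored value is already <= value,
      // or our smaller value was successfully swapped in; a failed
      // compare_exchange_weak refreshes 'current' with the latest value.
      while (current > value &&
             !slot.compare_exchange_weak(current, value))
        ;
    }

The atomic_max case is symmetric (flip the comparisons); the '#pragma omp atomic compare' branch above expresses the same min/max directly where the compiler supports it.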
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cuBQL/bvh.h" +#include +#include + +namespace cuBQL { + namespace omp { + + struct Context { + Context(int gpuID); + + void *alloc(size_t numBytes); + + template + void alloc(T *&d_data, size_t Nelements); + + template + void alloc_and_upload(T *&d_data, const T *h_data, size_t Nelements); + + template + void upload(T *d_data, const T *h_data, size_t Nelements); + + template + void alloc_and_upload(T *&d_data, const std::vector &h_vector); + + template + std::vector download_vector(const T *d_data, size_t N); + + template + void download(T &h_value, T *d_value); + + void free(void *); + + int gpuID; + int hostID; + }; + + struct Kernel { + inline int workIdx() const { return _workIdx; } + int _workIdx; + }; + + inline uint32_t atomicAdd(uint32_t *ptr, uint32_t inc) + { +#ifdef __NVCOMPILER + return (uint32_t)((std::atomic *)ptr)->fetch_add((int)inc); +#else + uint32_t t; +#pragma omp atomic capture + { t = *ptr; *ptr += inc; } + // return ((std::atomic *)p_value)->fetch_add(inc); + return t; +#endif + } + + + // ################################################################## + // IMPLEMENTATION SECTION + // ################################################################## + Context::Context(int gpuID) + : gpuID(gpuID), + hostID(omp_get_initial_device()) + { + assert(gpuID < omp_get_num_devices()); + printf("#cuBQL:omp:Context(gpu=%i/%i,host=%i)\n", + gpuID,omp_get_num_devices(),hostID); + } + + void *Context::alloc(size_t numBytes) + { return omp_target_alloc(numBytes,gpuID); } + + template inline + void Context::upload(T *d_data, + const T *h_data, + size_t N) + { + assert(d_data); + omp_target_memcpy(d_data,h_data,N*sizeof(T), + 0,0,gpuID,hostID); + } + + template inline + void Context::alloc_and_upload(T *&d_data, + const T *h_data, + size_t N) + { + printf("target_alloc N %li gpu %i\n",N,gpuID); + d_data = (T *)omp_target_alloc(N*sizeof(T),gpuID); + printf("ptr %p\n",d_data); + upload(d_data,h_data,N); + } + + template inline + void Context::alloc_and_upload(T *&d_data, + const std::vector &h_vector) + { alloc_and_upload(d_data,h_vector.data(),h_vector.size()); } + + template + std::vector Context::download_vector(const T *d_data, size_t N) + { + PRINT(N); + PRINT(d_data); + + std::vector out(N); + PRINT(out.data()); + PRINT(sizeof(T)); + omp_target_memcpy(out.data(),d_data,N*sizeof(T), + 0,0,hostID,gpuID); + return out; + } + + inline void Context::free(void *ptr) + { omp_target_free(ptr,gpuID); } + + template inline + void Context::alloc(T *&d_data, size_t N) + { + d_data = (T*)omp_target_alloc(N*sizeof(T),gpuID); + } + + // template inline + // void Context::alloc_and_upload(T *&d_data, + // const T *h_data, + // size_t N) + // { + // alloc(d_data,N); + // upload(d_data,h_data,N); + // } + + // template inline + // void Context::alloc_and_upload(T *&d_data, + // const std::vector &h_vector) + // { + // alloc(d_data,h_vector.size()); + // upload(d_data,h_vector); + // } + + // template inline + // std::vector Context::download_vector(const T *d_data, + // size_t N) + // { + // std::vector vec(N); + // omp_target_memcpy(vec.data(),d_data,N*sizeof(T), + // 0,0,hostID,gpuID); + // return vec; + // } + + template + inline void Context::download(T &h_value, T *d_value) + { + omp_target_memcpy(&h_value,d_value,sizeof(T), + 0,0,hostID,gpuID); + } + + + } // ::cuBQL::omp +} // ::cuBQL diff --git a/cuBQL/builder/omp/refit.h b/cuBQL/builder/omp/refit.h new file mode 100644 index 0000000..2094554 --- /dev/null +++ 
b/cuBQL/builder/omp/refit.h @@ -0,0 +1,111 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cuBQL/builder/omp/common.h" +#include "cuBQL/builder/omp/AtomicBox.h" + +namespace cuBQL { + namespace omp { + + template inline + void refit_init(Kernel kernel, + typename BinaryBVH::Node *nodes, + uint32_t *refitData, + int numNodes) + { + const int nodeID = kernel.workIdx(); + if (nodeID == 1 || nodeID >= numNodes) return; + if (nodeID < 2) + refitData[0] = 0; + auto &node = nodes[nodeID]; + node.bounds = box_t(); + if (node.admin.count) return; + + refitData[node.admin.offset+0] = nodeID << 2; + refitData[node.admin.offset+1] = nodeID << 2; + } + + template inline + void refit_run(Kernel kernel, + uint32_t *bvh_primIDs, + typename BinaryBVH::Node *bvh_nodes, + uint32_t *refitData, + const box_t *boxes, + int numNodes) + { + int nodeID = kernel.workIdx(); + if (nodeID == 1 || nodeID >= numNodes) return; + + typename BinaryBVH::Node *node = bvh_nodes+nodeID; + + if (node->admin.count == 0) + // this is a inner node - exit + return; + + box_t bounds; bounds.set_empty(); + for (int i=0;iadmin.count;i++) { + const box_t primBox = boxes[bvh_primIDs[node->admin.offset+i]]; + bounds.lower = min(bounds.lower,primBox.lower); + bounds.upper = max(bounds.upper,primBox.upper); + } + + + int parentID = (refitData[nodeID] >> 2); + while (true) { + atomic_grow(*(AtomicBox> *)&node->bounds,bounds); + + if (node == bvh_nodes) + break; + + uint32_t refitBits = atomicAdd(&refitData[parentID],1u); + if ((refitBits & 1) == 0) + // we're the first one - let other one do it + break; + + nodeID = parentID; + node = &bvh_nodes[parentID]; + parentID = (refitBits >> 2); + + int ofs = node->admin.offset; + + typename BinaryBVH::Node l = bvh_nodes[ofs+0]; + typename BinaryBVH::Node r = bvh_nodes[ofs+1]; + bounds.lower = min(l.bounds.lower,r.bounds.lower); + bounds.upper = max(l.bounds.upper,r.bounds.upper); + } + } + + template + void refit(BinaryBVH &bvh, + const box_t *boxes, + Context *ctx) + { + assert(bvh.nodes); + assert(bvh.primIDs); + int numNodes = bvh.numNodes; + uint32_t *refitData + = (uint32_t*)ctx->alloc(numNodes*sizeof(uint32_t)); + auto bvh_nodes = bvh.nodes; + auto bvh_primIDs = bvh.primIDs; + { +#pragma omp target device(ctx->gpuID) is_device_ptr(refitData) is_device_ptr(bvh_nodes) +#pragma omp teams distribute parallel for + for (int i=0;i(Kernel{i},bvh_nodes,refitData,numNodes); + } + + { +#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(bvh_nodes) is_device_ptr(refitData) is_device_ptr(boxes) +#pragma omp teams distribute parallel for + for (int i=0;ifree((void*)refitData); + } + + } +} + diff --git a/cuBQL/builder/omp/sort.h b/cuBQL/builder/omp/sort.h new file mode 100644 index 0000000..5f6cb39 --- /dev/null +++ b/cuBQL/builder/omp/sort.h @@ -0,0 +1,179 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
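The bottom-up refit in refit.h above terminates its upward walk with an atomic counter per parent node: the first child to reach a parent stops, the second one continues and merges both child boxes. A host-side sketch of that hand-off, using an illustrative std::atomic<uint32_t> in place of the refitData[] word (the name secondChildProceeds is hypothetical):

    #include <atomic>
    #include <cstdint>

    // mirrors 'atomicAdd(&refitData[parentID],1u)' followed by the
    // '(refitBits & 1) == 0' test in refit_run: low bit clear before the
    // increment means we are the first child at this parent, so we stop;
    // low bit set means the sibling already passed through, so we continue.
    inline bool secondChildProceeds(std::atomic<uint32_t> &parentWord)
    {
      uint32_t before = parentWord.fetch_add(1u);
      return (before & 1u) != 0u;
    }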
+// SPDX-License-Identifier: Apache-2.0 + +// imported from github.com:ingowald/openmp_target_sort under this license: + +// SPDX-FileCopyrightText: Copyright (c) Ingo Wald +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +namespace omp { + namespace bitonic { + +#pragma omp declare target + template + inline + void g_orderSegmentPairs(uint32_t tid, + int logSegLen, + key_t *const d_values, + int numValues) + { + uint32_t segLen = 1<>logSegLen; + uint32_t pairRank = tid-(pairIdx<= numValues) return; + + key_t lv = d_values[l]; + key_t rv = d_values[r]; + if (rv < lv) { + d_values[r] = lv; + d_values[l] = rv; + } + } +#pragma omp end declare target + + template + void g_orderSegmentPairs(uint32_t tid, + int logSegLen, + key_t *const d_keys, + value_t *const d_values, + int numValues) + { + uint32_t segLen = 1<>logSegLen; + uint32_t pairRank = tid-(pairIdx<= numValues) return; + + key_t lk = d_keys[l]; + key_t rk = d_keys[r]; + key_t lv = d_values[l]; + key_t rv = d_values[r]; + if (rk < lk) { + d_values[r] = lv; + d_values[l] = rv; + d_keys[r] = lk; + d_keys[l] = rk; + } + } + + template + void orderSegmentPairs(int logSegLen, + key_t *const d_values, + int numValues, + uint32_t deviceID) + { +#if 0 +#pragma omp target device(deviceID) teams is_device_ptr(d_values) num_teams(128) + { + int bs = + } +#else +#pragma omp target device(deviceID) is_device_ptr(d_values) nowait +#pragma omp teams distribute parallel for + for (int i=0;i + void orderSegmentPairs(int logSegLen, + key_t *const d_keys, + value_t *const d_values, + int numValues, + uint32_t deviceID) + { +// #pragma omp target nowait device(deviceID) is_device_ptr(d_keys) is_device_ptr(d_values) +// #pragma omp target teams distribute parallel for +#pragma omp target nowait device(deviceID) is_device_ptr(d_keys) is_device_ptr(d_values) + for (int i=0;i + void sortSegments(int logSegLen, + key_t *const d_keys, + int numValues, + uint32_t deviceID) + { + if (logSegLen == 0) return; + + sortSegments(logSegLen-1,d_keys,numValues,deviceID); + orderSegmentPairs(logSegLen-1,d_keys,numValues,deviceID); + sortSegments(logSegLen-1,d_keys,numValues,deviceID); + } + + template + void sortSegments(int logSegLen, + key_t *const d_keys, + value_t *const d_values, + int numValues, + uint32_t deviceID) + { + if (logSegLen == 0) return; + + sortSegments(logSegLen-1,d_keys,d_values,numValues,deviceID); + orderSegmentPairs(logSegLen-1,d_keys,d_values,numValues,deviceID); + sortSegments(logSegLen-1,d_keys,d_values,numValues,deviceID); + } + + template + void sort(key_t *const d_values, + int numValues, + uint32_t deviceID) + { + uint32_t logSegLen = 0; + while (1< + void sort(key_t *const d_keys, + value_t *const d_values, + int numValues, + uint32_t deviceID) + { + uint32_t logSegLen = 0; + while ((1< + void omp_target_sort(key_t *const d_values, + size_t numValues, + uint32_t deviceID) + { + assert(numValues < (1ull<<31)); + bitonic::sort(d_values,(int)numValues,deviceID); + } + + template + void omp_target_sort(key_t *const d_keys, + value_t *const d_values, + size_t numValues, + uint32_t deviceID) + { + assert(numValues < (1ull<<31)); + bitonic::sort(d_keys,d_values,(int)numValues,deviceID); + } + +} diff --git a/cuBQL/builder/omp/spatialMedian.h b/cuBQL/builder/omp/spatialMedian.h new file mode 100644 index 0000000..5cca265 --- /dev/null +++ b/cuBQL/builder/omp/spatialMedian.h @@ -0,0 +1,439 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. 
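A usage sketch for the segment sort in sort.h above, assuming key/value arrays that already live on the device (allocated with omp_target_alloc and filled by an earlier target region); N and gpuID are placeholders, and the count must stay below 2^31 per the assert in omp_target_sort:

    uint64_t *d_keys = (uint64_t*)omp_target_alloc(N*sizeof(uint64_t), gpuID);
    uint32_t *d_vals = (uint32_t*)omp_target_alloc(N*sizeof(uint32_t), gpuID);
    /* ... fill d_keys/d_vals inside an omp target region ... */
    omp::omp_target_sort(d_keys, d_vals, N, gpuID);   // sorts the pairs by key
    omp_target_free(d_keys, gpuID);
    omp_target_free(d_vals, gpuID);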
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cuBQL/builder/omp/refit.h" +#include "cuBQL/builder/omp/sort.h" + +namespace cuBQL { + namespace omp { + + struct PrimState { + union { + /* careful with this order - this is intentionally chosen such + that all item with nodeID==-1 will end up at the end of the + list; and all others will be sorted by nodeID */ + struct { + uint64_t primID:31; //!< prim we're talking about + uint64_t done : 1; + uint64_t nodeID:32; //!< node the given prim is (currently) in. + }; + uint64_t bits; + }; + }; + + enum { OPEN_BRANCH, OPEN_NODE, DONE_NODE }; + typedef uint8_t NodeState; + + template + struct CUBQL_ALIGN(16) TempNode { + using box_t = cuBQL::box_t; + union { + struct { + uint32_t count; + AtomicBox centBounds; + } openBranch; + struct { + uint32_t offset; + int dim; + uint32_t tieBreaker; + float pos; + } openNode; + struct { + uint32_t offset; + uint32_t count; + uint32_t unused[2]; + } doneNode; + }; + }; + + template + void initState(Kernel kernel, + uint32_t *pNumNodes, + NodeState *nodeStates, + TempNode *nodes) + { + int tid = kernel.workIdx(); + if (tid > 0) return; + *pNumNodes = 2; + + nodeStates[0] = OPEN_BRANCH; + nodes[0].openBranch.count = 0; + + ((int*)&nodes[0].openBranch.centBounds)[0] = 0; + nodes[0].openBranch.centBounds.set_empty(); + + nodeStates[1] = DONE_NODE; + nodes[1].doneNode.offset = 0; + nodes[1].doneNode.count = 0; + } + + template + void initPrims(Kernel kernel, + TempNode *nodes, + PrimState *primState, + const box_t *primBoxes, + uint32_t numPrims) + { + const int primID = kernel.workIdx(); + if (primID >= numPrims) return; + + auto &me = primState[primID]; + me.primID = primID; + + const box_t box = primBoxes[primID]; + if (box.get_lower(0) <= box.get_upper(0)) { + me.nodeID = 0; + me.done = false; + // this could be made faster by block-reducing ... + atomicAdd(&nodes[0].openBranch.count,1); + auto ctr = box.center(); + atomic_grow(nodes[0].openBranch.centBounds,ctr);//centerOf(box)); + // printf("p %i ctr %f %f %f grownn box %i : (%f %f %f)(%f %f %f)\n", + // primID, + // ctr.x, + // ctr.y, + // ctr.z, + // 0, + // nodes[0].openBranch.centBounds.lower.x, + // nodes[0].openBranch.centBounds.lower.y, + // nodes[0].openBranch.centBounds.lower.z, + // nodes[0].openBranch.centBounds.upper.x, + // nodes[0].openBranch.centBounds.upper.y, + // nodes[0].openBranch.centBounds.upper.z); + } else { + me.nodeID = (uint32_t)-1; + me.done = true; + } + } + + template + void selectSplits(Kernel kernel, + uint32_t *pNumNodes, + NodeState *nodeStates, + TempNode *nodes, + uint32_t numNodes, + BuildConfig buildConfig) + { + const int nodeID = kernel.workIdx();//threadIdx.x+blockIdx.x*blockDim.x; + if (nodeID >= numNodes) return; + + NodeState &nodeState = nodeStates[nodeID]; + if (nodeState == DONE_NODE) + // this node was already closed before + return; + + if (nodeState == OPEN_NODE) { + // this node was open in the last pass, can close it. 
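        // note (not part of the patch): a node that was OPEN_NODE in the
        // previous pass already created its two children via the
        // atomicAdd(pNumNodes,2) below and had its prims redistributed by
        // updatePrims; re-encoding it with count==0 marks it as an inner
        // node, and 'offset' keeps pointing at those children.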
+ nodeState = DONE_NODE; + int offset = nodes[nodeID].openNode.offset; + auto &done = nodes[nodeID].doneNode; + done.count = 0; + done.offset = offset; + return; + } + + auto in = nodes[nodeID].openBranch; + // if (nodeID < 5) + // printf("node split %i : %f %f %f : %f %f %f\n", + // nodeID, + // in.centBounds.lower.x, + // in.centBounds.lower.y, + // in.centBounds.lower.z, + // in.centBounds.upper.x, + // in.centBounds.upper.y, + // in.centBounds.upper.z); + if (in.count <= buildConfig.makeLeafThreshold) { + auto &done = nodes[nodeID].doneNode; + done.count = in.count; + // set this to max-value, so the prims can later do atomicMin + // with their position ion the leaf list; this value is + // greater than any prim position. + done.offset = (uint32_t)-1; + nodeState = DONE_NODE; + } else { + float widestWidth = 0.f; + int widestDim = -1; + float widestLo, widestHi, widestCtr; +#pragma unroll + for (int d=0;d= 0) { + open.pos = widestCtr; + } + open.dim + = (widestDim < 0 || widestCtr == widestLo || widestCtr == widestHi) + ? -1 + : widestDim; + + open.offset = atomicAdd(pNumNodes,2); + // printf("offset %i\n",open.offset); +#pragma unroll + for (int side=0;side<2;side++) { + const int childID = open.offset+side; + auto &child = nodes[childID].openBranch; + + child.centBounds.set_empty(); + child.count = 0; + nodeStates[childID] = OPEN_BRANCH; + } + nodeState = OPEN_NODE; + } + } + + template + void updatePrims(Kernel kernel, + NodeState *nodeStates, + TempNode *nodes, + PrimState *primStates, + const box_t *primBoxes, + int numPrims) + { + const int primID = kernel.workIdx(); + if (primID >= numPrims) return; + + const auto me = primStates[primID]; + if (me.done) return; + + const auto ns = nodeStates[me.nodeID]; + if (ns == DONE_NODE) { + // node became a leaf, we're done. + primStates[primID].done = true; + return; + } + + auto &split = nodes[me.nodeID].openNode; + const box_t primBox = primBoxes[me.primID]; + int side = 0; + if (split.dim == -1) { + // could block-reduce this, but will likely not happen often, anyway + side = (atomicAdd(&split.tieBreaker,1) & 1); + // printf("TIEBREAKER node %i prim %i\n",me.nodeID,me.primID); + } else { + const float center = 0.5f*(primBox.get_lower(split.dim)+ + primBox.get_upper(split.dim)); + side = (center >= split.pos); + } + int newNodeID = split.offset+side; + auto &myBranch = nodes[newNodeID].openBranch; + atomicAdd(&myBranch.count,1); + atomic_grow(myBranch.centBounds,primBox.center()); + primStates[primID].nodeID = newNodeID; + } + + /* given a sorted list of {nodeID,primID} pairs, this kernel does + two things: a) it extracts the 'primID's and puts them into the + bvh's primIDs[] array; and b) it writes, for each leaf nod ein + the nodes[] array, the node.offset value to point to the first + of this nodes' items in that bvh.primIDs[] list. 
*/ + template + void writePrimsAndLeafOffsets(Kernel kernel, + TempNode *nodes, + uint32_t *bvhItemList, + PrimState *primStates, + int numPrims) + { + const int offset = kernel.workIdx();//threadIdx.x+blockIdx.x*blockDim.x; + if (offset >= numPrims) return; + + auto &ps = primStates[offset]; + // printf("ps %i -> %i : %i done %i\n", + // (int)offset,(int)ps.primID,(int)ps.nodeID,(int)ps.done); + bvhItemList[offset] = ps.primID; + + if ((int)ps.nodeID < 0) + /* invalid prim, just skip here */ + return; + auto &node = nodes[ps.nodeID]; + atomicMin(&node.doneNode.offset,offset); + } + + template + void writeLeafOffsets(Kernel kernel, + TempNode *nodes, + uint32_t *pNextFreeOffset, + int numNodes) + { + const int nodeID = kernel.workIdx(); + if (nodeID >= numNodes) return; + + auto &node = nodes[nodeID].doneNode; + if (node.count > 0) { + node.offset = atomicAdd(pNextFreeOffset,node.count)+node.count; + // printf("wrote leaf %i offset %i\n", + // nodeID,node.offset); + } + } + + template + void writePrims(Kernel kernel, + uint32_t *bvh_primIDs, + TempNode *nodes, + PrimState *primStates, + int numPrims) + { + const int primID = kernel.workIdx();//threadIdx.x+blockIdx.x*blockDim.x; + if (primID >= numPrims) return; + + auto &ps = primStates[primID]; + if ((int)ps.nodeID < 0) + /* invalid prim, just skip here */ + return; + auto &node = nodes[ps.nodeID].doneNode; + int myPos = atomicAdd(&node.offset,(uint32_t)-1)-1; + bvh_primIDs[myPos] = ps.primID; + } + + /* writes main phase's temp nodes into final bvh.nodes[] + layout. actual bounds of that will NOT yet bewritten */ + template + void writeNodes(Kernel kernel, + typename BinaryBVH::Node *finalNodes, + TempNode *tempNodes, + int numNodes) + { + const int nodeID = kernel.workIdx(); + if (nodeID >= numNodes) return; + + finalNodes[nodeID].admin.offset = tempNodes[nodeID].doneNode.offset; + finalNodes[nodeID].admin.count = tempNodes[nodeID].doneNode.count; + } + + + template + void spatialMedian(BinaryBVH &bvh, + /*! 
DEVICE array of boxes */ + const box_t *boxes, + uint32_t numPrims, + BuildConfig buildConfig, + Context *ctx) + { + assert(sizeof(PrimState) == sizeof(uint64_t)); + if (buildConfig.makeLeafThreshold < 1) + buildConfig.makeLeafThreshold = 1; + + // ================================================================== + // do build on temp nodes + // ================================================================== + TempNode *tempNodes = 0; + NodeState *nodeStates = 0; + PrimState *primStates = 0; + uint32_t *d_numNodes = 0; + ctx->alloc(tempNodes,2*numPrims); + ctx->alloc(nodeStates,2*numPrims); + ctx->alloc(primStates,numPrims); + ctx->alloc(d_numNodes,1); +#pragma omp target device(ctx->gpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tid<1;tid++) + initState(Kernel{tid}, + d_numNodes, + nodeStates, + tempNodes); +#pragma omp target device(ctx->gpuID) is_device_ptr(tempNodes) is_device_ptr(primStates) is_device_ptr(boxes) +#pragma omp teams distribute parallel for + for (int tid=0;tiddownload(numNodes,d_numNodes); + if (numNodes == numDone) + break; +#pragma omp target device(ctx->gpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tidgpuID) is_device_ptr(d_numNodes) is_device_ptr(nodeStates) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tidalloc(bvh.primIDs,numPrims); + auto bvh_primIDs = bvh.primIDs; + + uint32_t zero = 0u; + // set first uint there to zero; leaf writing code will + // atomically inc that value to determine leaf offset values. + ctx->upload(bvh_primIDs,&zero,1); +#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_primIDs) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tidgpuID) is_device_ptr(bvh_primIDs) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tidalloc(bvh.nodes,numNodes); + auto bvh_nodes = bvh.nodes; +#pragma omp target device(ctx->gpuID) is_device_ptr(bvh_nodes) is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int tid=0;tidfree(tempNodes); + ctx->free(nodeStates); + ctx->free(primStates); + ctx->free(d_numNodes); + + cuBQL::omp::refit(bvh,boxes,ctx); + } + + } + + template + void build_omp_target(BinaryBVH &bvh, + const box_t *d_boxes, + uint32_t numBoxes, + BuildConfig buildConfig, + int gpuID) + { + omp::Context ctx(gpuID); + omp::spatialMedian(bvh,d_boxes,numBoxes,buildConfig,&ctx); + } +} + diff --git a/cuBQL/math/vec.h b/cuBQL/math/vec.h index 10ffdb7..11d7b37 100644 --- a/cuBQL/math/vec.h +++ b/cuBQL/math/vec.h @@ -356,34 +356,53 @@ namespace cuBQL { /* vec:vec */ \ template \ inline __cubql_both \ - vec_t long_op(vec_t a, vec_t b) \ + vec_t long_op(const vec_t &a, const vec_t &b) \ { \ vec_t r; \ - CUBQL_PRAGMA_UNROLL \ - for (int i=0;i \ + template \ inline __cubql_both \ - vec_t long_op(T a, vec_t b) \ + vec_t long_op(const vec_t &a, const vec_t &b) \ { \ - vec_t r; \ - CUBQL_PRAGMA_UNROLL \ - for (int i=0;i r; \ + r.x = a.x op b.x; \ + r.y = a.y op b.y; \ return r; \ } \ - /* vec:scalar */ \ - template \ - inline __cubql_both \ - vec_t long_op(vec_t a, T b) \ - { \ - vec_t r; \ - CUBQL_PRAGMA_UNROLL \ - for (int i=0;i \ + inline __cubql_both \ + vec_t long_op(const vec_t &a, const vec_t &b) \ + { \ + vec_t r; \ + r.x = a.x op b.x; \ + r.y = a.y op b.y; \ + r.z = a.z op b.z; \ + return r; \ + } \ + template \ + inline 
__cubql_both \ + vec_t long_op(const vec_t &a, const vec_t &b) \ + { \ + vec_t r; \ + r.x = a.x op b.x; \ + r.y = a.y op b.y; \ + r.z = a.z op b.z; \ + r.w = a.w op b.w; \ + return r; \ + } \ + /* scalar-vec */ \ + template \ + inline __cubql_both \ + vec_t long_op(T a, const vec_t &b) \ + { return vec_t(a) op b; } \ + /* vec:scalar */ \ + template \ + inline __cubql_both \ + vec_t long_op(const vec_t &a, T b) \ + { return a op vec_t(b); } \ + CUBQL_OPERATOR_CUDA_T(long_op,op) CUBQL_OPERATOR(operator+,+) CUBQL_OPERATOR(operator-,-)
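A usage sketch for the rewritten vec operator macros above: the per-dimension specializations replace the CUBQL_PRAGMA_UNROLL loop with explicit component-wise expressions, and the scalar overloads forward through a broadcasting vec_t constructor. Assuming the usual vec3f shorthand for vec_t<float,3>:

    cuBQL::vec3f a, b;          // vec3f assumed to be vec_t<float,3>
    /* ... fill a and b ... */
    cuBQL::vec3f s = a + b;     // {a.x+b.x, a.y+b.y, a.z+b.z}, no per-element loop
    cuBQL::vec3f t = 2.f * a;   // scalar:vec  ->  vec3f(2.f) * a
    cuBQL::vec3f u = b - 1.f;   // vec:scalar  ->  b - vec3f(1.f)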