NVIDIA · hodgesds · Feb 5, 2026 · xerothermic · Feb 9, 2026
diff --git a/src/include/utils.h b/src/include/utils.h
@@ -59,6 +59,20 @@ inline uint64_t clockNano() {
   return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec;
 }
 
+// Spin-wait hint: emits the target's relax instruction (x86 pause, aarch64 yield, PPC or 27,27,27), otherwise only
+// a compiler barrier. Meant to let a sibling hyperthread run; x86 pause cost varies by CPU (~43 cycles est., unverified).
+static inline void ncclCpuRelax() {
+#if defined(__aarch64__)
+  __asm__ __volatile__("yield" ::: "memory");
+#elif defined(__PPC__) || defined(__ppc__) || defined(__powerpc__)
+  __asm__ __volatile__("or 27,27,27" ::: "memory");
+#elif defined(__i386__) || defined(__x86_64__)
+  __asm__ __volatile__("pause" ::: "memory");
+#else
+  __asm__ __volatile__("" ::: "memory");
+#endif
+}
+
 /* get any bytes of random data from /dev/urandom, return ncclSuccess (0) if it succeeds. */
 inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
   ncclResult_t ret = ncclSuccess;

diff --git a/src/proxy.cc b/src/proxy.cc
@@ -16,6 +16,7 @@
 #include "cpuset.h"
 #include "compiler.h"
 #include "os.h"
+#include "utils.h"
 
 #include <sys/syscall.h>
 #include <assert.h>
@@ -27,6 +28,15 @@
 
 #define NCCL_MAX_PROXY_CONNECTIONS (NCCL_MAX_LOCAL_RANKS+1)
 
+// Busy-wait budget (ns) while ncclLocalOpAppend polls for a free op; when it elapses the thread calls sched_yield()
+// and the budget restarts. Default 1000ns (1us) - reduces sched_yield syscall overhead. 0 = always yield immediately.
+// NOTE(review): the value is cast to uint64_t when compared, so a negative setting effectively never yields.
+NCCL_PARAM(ProxySpinTimeNs, "PROXY_SPIN_TIME_NS", 1000);
+
+// Busy-wait budget (ns) for the progress loop when a pass adds no ops. Default 1000ns (1us); 0 = always yield
+// immediately. Read once per thread (static thread_local); negatives are cast to uint64_t and effectively never yield.
+NCCL_PARAM(ProxyProgressSpinTimeNs, "PROXY_PROGRESS_SPIN_TIME_NS", 1000);
+
 enum { proxyRecv=0, proxySend=1 };
 void* ncclProxyServiceUDS(void* _args);
 
@@ -496,9 +506,18 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon
   } else {
     // Read the freeOps value and wait for a value different than -1. Once not -1, read the value with acquire and reset -1
     int freeOp = -1;
+    uint64_t t0 = clockNano();
+    int64_t spinTimeNs = ncclParamProxySpinTimeNs();
     while (freeOp == -1) {
       freeOp = COMPILER_ATOMIC_EXCHANGE(&pool->freeOps[tpLocalRank], -1, std::memory_order_acquire);
-      if (freeOp == -1) sched_yield();
+      if (freeOp == -1) {
+        if (clockNano() - t0 < (uint64_t)spinTimeNs) {
+          ncclCpuRelax();
+        } else {
+          sched_yield();
+          t0 = clockNano();  // Reset timer after yield
+        }
+      }
     }
     opIndex = freeOp;
     op = pool->ops+opIndex;
@@ -994,7 +1013,18 @@ void* ncclProxyProgress(void *proxyState_) {
         INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
       }
       if (added == 0) {
-        std::this_thread::yield(); // No request progressed. Let others run.
+        // Time-based spinning before yield to reduce syscall overhead
+        static thread_local uint64_t idleStartTime = 0;
+        static thread_local int64_t progressSpinTimeNs = ncclParamProxyProgressSpinTimeNs();
+        if (idleStartTime == 0) {
+          idleStartTime = clockNano();
+        }
+        if (clockNano() - idleStartTime < (uint64_t)progressSpinTimeNs) {
+          ncclCpuRelax();
+        } else {
+          std::this_thread::yield();
+          idleStartTime = 0;  // Reset after yield to re-spin next iteration
+        }
       }
     }
     lastIdle = idle;

diff --git a/src/transport/net_ib/gdaki/doca-gpunetio/src/doca_gpunetio.cpp b/src/transport/net_ib/gdaki/doca-gpunetio/src/doca_gpunetio.cpp
@@ -57,6 +57,19 @@
 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
 #define GPU_FULL_ASYNC_STORE_RELEASE_SUPPORT_COMPUTE_CAP_MAJOR 10
 
+// Spin-wait hint for polling loops: emits the target's relax instruction, or only a compiler barrier elsewhere.
+static inline void cpuRelax() {
+#if defined(__aarch64__)
+    __asm__ __volatile__("yield" ::: "memory");
+#elif defined(__PPC__) || defined(__ppc__) || defined(__powerpc__)
+    __asm__ __volatile__("or 27,27,27" ::: "memory");
+#elif defined(__i386__) || defined(__x86_64__)
+    __asm__ __volatile__("pause" ::: "memory");
+#else
+    __asm__ __volatile__("" ::: "memory");
+#endif
+}
+
 struct doca_gpu_mtable {
     uintptr_t base_addr;
     size_t size_orig;
@@ -829,7 +842,7 @@ static void *priv_service_mainloop(void *args) {
             }
         }
         pthread_rwlock_unlock(&service->service_lock);
-        sched_yield();
+        cpuRelax();
     }
 
     return nullptr;