Proxy: spin briefly with a CPU relax hint before yielding

Adds ncclCpuRelax() and replaces unconditional yields with spin-then-yield
waits in ncclLocalOpAppend, ncclProxyProgress and the GDAKI service loop, to
reduce sched_yield syscall overhead.

Review notes for this revision:
  * Spin budgets <= 0 disable spinning. A negative setting no longer wraps to
    a huge unsigned budget that would spin without ever yielding.
  * ncclProxyProgress clears its idle timer when a pass adds work, so a new
    idle streak does not start from a stale timestamp.
  * priv_service_mainloop still yields periodically instead of spinning with
    no yield point at all.
  * Line breaks, indentation and blank context lines were reconstructed from a
    whitespace-collapsed copy of the patch, and the trailing context of the
    first proxy.cc hunk (system #include lines whose header names were lost)
    is omitted. Regenerate against the tree before applying; the index hashes
    are those of the original patch.

diff --git a/src/include/utils.h b/src/include/utils.h
index 1ccf711a7..a5816876f 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -59,6 +59,22 @@ inline uint64_t clockNano() {
   return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec;
 }
 
+// Spin-wait hint for busy-wait loops: emits the CPU's spin hint where one is
+// known ("pause" on x86, "yield" on AArch64, "or 27,27,27" on POWER) and is a
+// plain compiler barrier elsewhere. How long one call stalls depends on the
+// microarchitecture, so do not rely on a fixed cycle count.
+static inline void ncclCpuRelax() {
+#if defined(__x86_64__) || defined(__i386__)
+  __asm__ __volatile__("pause" ::: "memory");
+#elif defined(__aarch64__)
+  __asm__ __volatile__("yield" ::: "memory");
+#elif defined(__PPC__) || defined(__ppc__) || defined(__powerpc__)
+  __asm__ __volatile__("or 27,27,27" ::: "memory");
+#else
+  __asm__ __volatile__("" ::: "memory");
+#endif
+}
+
 /* get any bytes of random data from /dev/urandom, return ncclSuccess (0) if it succeeds. */
 inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
   ncclResult_t ret = ncclSuccess;
diff --git a/src/proxy.cc b/src/proxy.cc
index 1bb213f51..415a2bd5c 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -16,3 +16,4 @@
 #include "cpuset.h"
 #include "compiler.h"
 #include "os.h"
+#include "utils.h"
@@ -27,6 +28,15 @@
 
 #define NCCL_MAX_PROXY_CONNECTIONS (NCCL_MAX_LOCAL_RANKS+1)
 
+// Spin duration in nanoseconds before yielding when waiting for free ops.
+// Default 1000ns (1us) - reduces sched_yield syscall overhead.
+// Set to 0 (or any negative value) to always yield immediately.
+NCCL_PARAM(ProxySpinTimeNs, "PROXY_SPIN_TIME_NS", 1000);
+
+// Spin duration for progress loop when idle (no ops to process).
+// Default 1000ns (1us). Set to 0 (or any negative value) to always yield immediately.
+NCCL_PARAM(ProxyProgressSpinTimeNs, "PROXY_PROGRESS_SPIN_TIME_NS", 1000);
+
 enum { proxyRecv=0, proxySend=1 };
 
 void* ncclProxyServiceUDS(void* _args);
@@ -496,9 +506,20 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon
   } else {
     // Read the freeOps value and wait for a value different than -1. Once not -1, read the value with acquire and reset -1
     int freeOp = -1;
+    uint64_t t0 = clockNano();
+    // Values <= 0 disable spinning; checking the sign first keeps a negative
+    // setting from wrapping to a huge unsigned budget that would never yield.
+    int64_t spinTimeNs = ncclParamProxySpinTimeNs();
     while (freeOp == -1) {
       freeOp = COMPILER_ATOMIC_EXCHANGE(&pool->freeOps[tpLocalRank], -1, std::memory_order_acquire);
-      if (freeOp == -1) sched_yield();
+      if (freeOp == -1) {
+        if (spinTimeNs > 0 && clockNano() - t0 < (uint64_t)spinTimeNs) {
+          ncclCpuRelax();
+        } else {
+          sched_yield();
+          t0 = clockNano(); // Reset timer after yield
+        }
+      }
     }
     opIndex = freeOp;
     op = pool->ops+opIndex;
@@ -994,7 +1015,25 @@ void* ncclProxyProgress(void *proxyState_) {
         INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
       }
+      // Start of the current idle streak; 0 means "not idle". It is cleared whenever
+      // a pass adds work so the next idle streak gets a full spin window instead of
+      // inheriting a stale timestamp and yielding at once.
+      // NOTE(review): passes that never reach this check leave the timer untouched - confirm that is acceptable.
+      static thread_local uint64_t idleStartTime = 0;
+      static thread_local int64_t progressSpinTimeNs = ncclParamProxyProgressSpinTimeNs();
       if (added == 0) {
-        std::this_thread::yield(); // No request progressed. Let others run.
+        // No request progressed: spin for a bounded time before yielding to reduce syscall overhead.
+        if (idleStartTime == 0) {
+          idleStartTime = clockNano();
+        }
+        // Values <= 0 disable spinning (and keep a negative setting from wrapping in the cast).
+        if (progressSpinTimeNs > 0 && clockNano() - idleStartTime < (uint64_t)progressSpinTimeNs) {
+          ncclCpuRelax();
+        } else {
+          std::this_thread::yield(); // Let others run.
+          idleStartTime = 0; // Reset after yield to re-spin next iteration
+        }
+      } else {
+        idleStartTime = 0;
       }
     }
     lastIdle = idle;
diff --git a/src/transport/net_ib/gdaki/doca-gpunetio/src/doca_gpunetio.cpp b/src/transport/net_ib/gdaki/doca-gpunetio/src/doca_gpunetio.cpp
index 870243258..e87d8ad17 100644
--- a/src/transport/net_ib/gdaki/doca-gpunetio/src/doca_gpunetio.cpp
+++ b/src/transport/net_ib/gdaki/doca-gpunetio/src/doca_gpunetio.cpp
@@ -57,6 +57,19 @@
 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
 #define GPU_FULL_ASYNC_STORE_RELEASE_SUPPORT_COMPUTE_CAP_MAJOR 10
 
+// Low-power spin-wait hint to CPU
+static inline void cpuRelax() {
+#if defined(__x86_64__) || defined(__i386__)
+    __asm__ __volatile__("pause" ::: "memory");
+#elif defined(__aarch64__)
+    __asm__ __volatile__("yield" ::: "memory");
+#elif defined(__PPC__) || defined(__ppc__) || defined(__powerpc__)
+    __asm__ __volatile__("or 27,27,27" ::: "memory");
+#else
+    __asm__ __volatile__("" ::: "memory");
+#endif
+}
+
 struct doca_gpu_mtable {
     uintptr_t base_addr;
     size_t size_orig;
@@ -829,7 +842,18 @@ static void *priv_service_mainloop(void *args) {
             }
         }
         pthread_rwlock_unlock(&service->service_lock);
-        sched_yield();
+        // Spin with a CPU relax hint between passes, but still yield every
+        // SPINS_PER_YIELD passes so this thread never busy-waits indefinitely
+        // without giving the scheduler a chance to run something else.
+        // NOTE(review): 1024 is an untuned starting value.
+        static thread_local unsigned int spin_count = 0;
+        constexpr unsigned int SPINS_PER_YIELD = 1024;
+        if (++spin_count < SPINS_PER_YIELD) {
+            cpuRelax();
+        } else {
+            spin_count = 0;
+            sched_yield();
+        }
     }
 
     return nullptr;