Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,20 @@ inline uint64_t clockNano() {
return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec;
}

// Spin-wait hint for busy-wait loops: tells the CPU the caller is polling so it
// can reduce power and let a sibling hardware thread make progress.
// The delay introduced is architecture- and microarchitecture-dependent, so it
// must not be used as a unit of time. The "memory" clobber additionally makes
// every variant act as a compiler-level memory barrier.
static inline void ncclCpuRelax() {
  __asm__ __volatile__(
#if defined(__x86_64__) || defined(__i386__)
      "pause"        // x86 spin-loop hint
#elif defined(__aarch64__)
      "yield"        // AArch64 hint instruction
#elif defined(__PPC__) || defined(__ppc__) || defined(__powerpc__)
      "or 27,27,27"  // PowerPC: hint form of a no-op `or`
#else
      ""             // unknown target: compiler barrier only
#endif
      ::: "memory");
}

/* get any bytes of random data from /dev/urandom, return ncclSuccess (0) if it succeeds. */
inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
ncclResult_t ret = ncclSuccess;
Expand Down
34 changes: 32 additions & 2 deletions src/proxy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "cpuset.h"
#include "compiler.h"
#include "os.h"
#include "utils.h"

#include <sys/syscall.h>
#include <assert.h>
Expand All @@ -27,6 +28,15 @@

#define NCCL_MAX_PROXY_CONNECTIONS (NCCL_MAX_LOCAL_RANKS+1)

// Spin duration in nanoseconds before yielding when waiting for free ops.
// Default 1000ns (1us) - reduces sched_yield syscall overhead.
// Set to 0 to always yield immediately.
// NOTE(review): the free-op wait loop casts this value to uint64_t before comparing
// it with the elapsed time, so a negative setting turns into a huge bound and the
// loop would spin without ever reaching sched_yield() - confirm whether negative
// values should be rejected or clamped.
// NOTE(review): presumably exposed as the NCCL_PROXY_SPIN_TIME_NS environment
// variable by NCCL_PARAM - confirm against the macro definition.
NCCL_PARAM(ProxySpinTimeNs, "PROXY_SPIN_TIME_NS", 1000);

// Spin duration for progress loop when idle (no ops to process).
// Default 1000ns (1us). Set to 0 to always yield immediately.
// The progress loop reads this once into a `static thread_local` on its first idle
// pass, so the value that thread uses is fixed from then on.
// NOTE(review): the loop casts the value to uint64_t before comparing it with the
// elapsed idle time, so a negative setting becomes a huge bound and the thread
// would spin without ever yielding - confirm whether negatives should be clamped.
NCCL_PARAM(ProxyProgressSpinTimeNs, "PROXY_PROGRESS_SPIN_TIME_NS", 1000);

enum { proxyRecv=0, proxySend=1 };
void* ncclProxyServiceUDS(void* _args);

Expand Down Expand Up @@ -496,9 +506,18 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon
} else {
// Read the freeOps value and wait for a value different than -1. Once not -1, read the value with acquire and reset -1
int freeOp = -1;
uint64_t t0 = clockNano();
int64_t spinTimeNs = ncclParamProxySpinTimeNs();
while (freeOp == -1) {
freeOp = COMPILER_ATOMIC_EXCHANGE(&pool->freeOps[tpLocalRank], -1, std::memory_order_acquire);
if (freeOp == -1) sched_yield();
if (freeOp == -1) {
if (clockNano() - t0 < (uint64_t)spinTimeNs) {
ncclCpuRelax();
} else {
sched_yield();
t0 = clockNano(); // Reset timer after yield
}
}
}
opIndex = freeOp;
op = pool->ops+opIndex;
Expand Down Expand Up @@ -994,7 +1013,18 @@ void* ncclProxyProgress(void *proxyState_) {
INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
}
if (added == 0) {
std::this_thread::yield(); // No request progressed. Let others run.
// Time-based spinning before yield to reduce syscall overhead
static thread_local uint64_t idleStartTime = 0;
static thread_local int64_t progressSpinTimeNs = ncclParamProxyProgressSpinTimeNs();
if (idleStartTime == 0) {
idleStartTime = clockNano();
}
if (clockNano() - idleStartTime < (uint64_t)progressSpinTimeNs) {
ncclCpuRelax();
} else {
std::this_thread::yield();
idleStartTime = 0; // Reset after yield to re-spin next iteration
}
}
}
lastIdle = idle;
Expand Down
15 changes: 14 additions & 1 deletion src/transport/net_ib/gdaki/doca-gpunetio/src/doca_gpunetio.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,19 @@
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
#define GPU_FULL_ASYNC_STORE_RELEASE_SUPPORT_COMPUTE_CAP_MAJOR 10

// Spin-wait hint for polling loops. Emits the target's "I am busy-waiting"
// instruction where one is known, and always acts as a compiler memory barrier
// through the "memory" clobber. How long the hint stalls is CPU-specific.
static inline void cpuRelax() {
  __asm__ __volatile__(
#if defined(__x86_64__) || defined(__i386__)
      "pause"        // x86 spin-loop hint
#elif defined(__aarch64__)
      "yield"        // AArch64 hint instruction
#elif defined(__PPC__) || defined(__ppc__) || defined(__powerpc__)
      "or 27,27,27"  // PowerPC: hint form of a no-op `or`
#else
      ""             // no known hint: barrier only
#endif
      ::: "memory");
}

struct doca_gpu_mtable {
uintptr_t base_addr;
size_t size_orig;
Expand Down Expand Up @@ -829,7 +842,7 @@ static void *priv_service_mainloop(void *args) {
}
}
pthread_rwlock_unlock(&service->service_lock);
sched_yield();
cpuRelax();
Copy link

@xerothermic xerothermic Feb 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI, the latency of the pause instruction varies by microarchitecture. For example, on Intel Skylake a single pause takes roughly 10x longer than on the previous generation. A more robust approach might be to calibrate the pause latency during init and then use it to insert a delay of approximately known duration.

}

return nullptr;
Expand Down