|
// Copies one uint4 (16 bytes) per thread from `src` to `dst`, repeating that
// same copy `loopCount` times.
//
// Launch layout: 1D grid of 1D blocks. The thread with global index
// g = blockIdx.x * blockDim.x + threadIdx.x copies element g.
//
// Preconditions:
//   - There is no bounds check, so both buffers must hold at least
//     gridDim.x * blockDim.x uint4 elements.
//   - `dst` and `src` must be 16-byte aligned (required for uint4 accesses).
//
// Every iteration touches the same source and destination address; the
// pointers intentionally do not advance between iterations.
// NOTE(review): presumably `loopCount` is a repetition count used to lengthen
// the measured transfer rather than a number of distinct chunks — confirm
// against the launch site before changing the addressing.
__global__ void simpleCopyKernel(unsigned long long loopCount, uint4 *dst, uint4 *src) {
    // Compute the element index in size_t so that blockIdx.x * blockDim.x can
    // neither wrap in 32-bit arithmetic nor turn negative when narrowed to int.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // The addresses do not depend on the loop iteration, so resolve them once.
    uint4 *dst_uint4 = dst + idx;
    const uint4 *src_uint4 = src + idx;

    // The counter has the same width as loopCount; a 32-bit counter would wrap
    // and never terminate for loopCount > UINT_MAX.
    for (unsigned long long i = 0; i < loopCount; ++i) {
        // __ldcg / __stcg: load and store using the .cg cache operator.
        __stcg(dst_uint4, __ldcg(src_uint4));
    }
}
It appears that neither `offset` nor the pointers advance between loop iterations, so every iteration loads from and stores to the same locations. As a result, the kernel does not do as much copying as it is intended to.
https://godbolt.org/z/rM3dP5GsE
nvbandwidth/kernels.cu
Lines 20 to 28 in 4a49bda
It appears that `offset` or the pointers never advance during loop iterations, and we always load/store from/to the same locations. It does not do as much copying as it's intended to.
https://godbolt.org/z/rM3dP5GsE