Description
NVIDIA Open GPU Kernel Modules Version
590.44.01
Please confirm this issue does not happen with the proprietary driver (of the same version). This issue tracker is only for bugs specific to the open kernel driver.
- I confirm that this does not happen with the proprietary driver package.
Operating System and Version
Ubuntu 22.04.5 LTS
Kernel Release
Linux GPU-18 5.15.0-127-generic #137-Ubuntu SMP Fri Nov 8 15:21:01 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux
Please confirm you are running a stable release kernel (e.g. not a -rc). We do not accept bug reports for unreleased kernels.
- I am running on a stable kernel release.
Hardware: GPU
RTX5090
Describe the bug
Thanks to the community: after updating the driver, both p2ptest and nccl-tests work perfectly.
However, when trying to use GDS features (e.g. calling the cuFile API), cuFile falls back to compatibility mode (a driver-properties check sketch is included after the dmesg output below).
cufile.log shows:
cufile.log
29-12-2025 13:30:14:51 [pid=111425 tid=112507] ERROR 0:522 nvidia-fs MAP ioctl failed : ioctl_return: -22 ioctl_ret: -1
29-12-2025 13:30:14:51 [pid=111425 tid=112507] ERROR 0:555 map failed
29-12-2025 13:30:14:52 [pid=111425 tid=112507] ERROR cufio-obj:156 error allocating nvfs handle, size: 2097152
29-12-2025 13:30:14:52 [pid=111425 tid=112507] ERROR cufio_core:1648 cuFileBufRegister error, object allocation failed
29-12-2025 13:30:14:52 [pid=111425 tid=112507] ERROR cufio_core:1729 cuFileBufRegister error cufile success
29-12-2025 13:30:14:54 [pid=111425 tid=112507] ERROR 0:522 nvidia-fs MAP ioctl failed : ioctl_return: -22 ioctl_ret: -1
29-12-2025 13:30:14:54 [pid=111425 tid=112507] ERROR 0:555 map failed
29-12-2025 13:30:14:65 [pid=111425 tid=112507] ERROR 0:866 Buffer map failed for PCI-Group: 4 GPU: 4
29-12-2025 13:30:14:65 [pid=111425 tid=112507] ERROR 0:994 Failed to obtain bounce buffer from domain: 4 GPU: 4
29-12-2025 13:30:14:65 [pid=111425 tid=112507] ERROR 0:1299 failed to get bounce buffer for PCI group 4 GPU 4
29-12-2025 13:30:14:65 [pid=111425 tid=112507] ERROR cufio_core:3371 Final direct subio failed retval -5011 buf_offset: 0 file_offset: 4096 size: 528384
29-12-2025 13:30:14:65 [pid=111425 tid=112507] ERROR cufio_core:3392 Setting I/O to failed. Expected I/O Size 528384 actual: 0
29-12-2025 13:30:14:65 [pid=111425 tid=112515] ERROR cufio-obj:156 error allocating nvfs handle, size: 2097152
29-12-2025 13:30:14:65 [pid=111425 tid=112515] ERROR cufio_core:1648 cuFileBufRegister error, object allocation failed
29-12-2025 13:30:14:65 [pid=111425 tid=112515] ERROR cufio_core:1729 cuFileBufRegister error cufile success
29-12-2025 13:30:14:65 [pid=111425 tid=112525] ERROR 0:866 Buffer map failed for PCI-Group: 4 GPU: 4
29-12-2025 13:30:14:65 [pid=111425 tid=112525] ERROR 0:994 Failed to obtain bounce buffer from domain: 4 GPU: 4
29-12-2025 13:30:14:65 [pid=111425 tid=112525] ERROR 0:1299 failed to get bounce buffer for PCI group 4 GPU 4
29-12-2025 13:30:14:65 [pid=111425 tid=112525] ERROR cufio_core:3371 Final direct subio failed retval -5011 buf_offset: 0 file_offset: 4096 size: 528384
29-12-2025 13:30:14:65 [pid=111425 tid=112525] ERROR cufio_core:3392 Setting I/O to failed. Expected I/O Size 528384 actual: 0
29-12-2025 13:30:14:67 [pid=111425 tid=112515] ERROR 0:522 nvidia-fs MAP ioctl failed : ioctl_return: -22 ioctl_ret: -1
29-12-2025 13:30:14:67 [pid=111425 tid=112515] ERROR 0:555 map failed
dmesg shows:
dmesg
[ 688.044426] nvidia-fs:nvfs_pin_gpu_pages:1336 Error ret -22 invoking nvidia_p2p_get_pages_persistent
va_start=0x40fc3400000/va_end=0x40fc34fffff/rounded_size=0x100000/gpu_buf_length=0x100000
[ 688.045534] nvidia-fs:nvfs_pin_gpu_pages:1336 Error ret -22 invoking nvidia_p2p_get_pages_persistent
va_start=0x80fc2d00000/va_end=0x80fc2dfffff/rounded_size=0x100000/gpu_buf_length=0x100000
[ 688.048290] nvidia-fs:nvfs_pin_gpu_pages:1336 Error ret -22 invoking nvidia_p2p_get_pages_persistent
va_start=0x80fc2f00000/va_end=0x80fc2ffffff/rounded_size=0x100000/gpu_buf_length=0x100000
[ 688.051622] nvidia-fs:nvfs_pin_gpu_pages:1336 Error ret -22 invoking nvidia_p2p_get_pages_persistent
va_start=0x80fc2900000/va_end=0x80fc29fffff/rounded_size=0x100000/gpu_buf_length=0x100000
[ 688.054450] nvidia-fs:nvfs_pin_gpu_pages:1336 Error ret -22 invoking nvidia_p2p_get_pages_persistent
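For reference, a minimal sketch that queries the cuFile driver properties, which is one way to confirm the compatibility-mode fallback from the application side rather than only from cufile.log. Assumptions: the stock cufile.h shipped with the GDS package, and the meaning of the dstatusflags/dcontrolflags bitmasks as described in that header.
compat-mode check (sketch)
#include <cstring>
#include <iostream>
#include <cufile.h>

int main() {
    // Open the cuFile driver, as in the reproducer below.
    CUfileError_t status = cuFileDriverOpen();
    if (status.err != CU_FILE_SUCCESS) {
        std::cerr << "cuFileDriverOpen failed: " << cufileop_status_error(status.err) << std::endl;
        return 1;
    }
    CUfileDrvProps_t props;
    std::memset(&props, 0, sizeof(props));
    status = cuFileDriverGetProperties(&props);
    if (status.err != CU_FILE_SUCCESS) {
        std::cerr << "cuFileDriverGetProperties failed: " << cufileop_status_error(status.err) << std::endl;
        cuFileDriverClose();
        return 1;
    }
    // nvidia-fs (nvfs) driver version as reported by libcufile.
    std::cout << "nvfs version: " << props.nvfs.major_version << "." << props.nvfs.minor_version << std::endl;
    // ASSUMPTION: per cufile.h, these are bitmasks of CUfileDriverStatusFlags /
    // CUfileDriverControlFlags; a set CU_FILE_ALLOW_COMPAT_MODE control bit lets
    // the library silently fall back to compatibility (POSIX) mode.
    std::cout << "dstatusflags:  0x" << std::hex << props.nvfs.dstatusflags << std::endl;
    std::cout << "dcontrolflags: 0x" << std::hex << props.nvfs.dcontrolflags << std::dec << std::endl;
    cuFileDriverClose();
    return 0;
}
Built the same way as the reproducer below (typical install paths assumed): g++ with -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcufile -lcudart.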
To Reproduce
Compile nvidia-fs from source and insmod nvidia-fs.ko.
Use the cuFile API, for example:
example
#include <iostream>
#include <vector>
#include <fcntl.h>
#include <unistd.h>
#include <cstring>
#include <cassert>
#include <sys/stat.h>
// CUDA & cuFile headers
#include <cuda_runtime.h>
#include <cufile.h>
#define MB(x) ((x) * 1024 * 1024UL)
#define TEST_SIZE MB(4) // 4 MiB test buffer
#define TEST_FILE "remote directory triggering gds"
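// NOTE: placeholder path; point this at a file on a mount that supports O_DIRECT,
// here a remote filesystem that triggers the GDS path.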
#define CHECK_CUDA(call) { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line " << __LINE__ << std::endl; \
exit(1); \
} \
}
#define CHECK_CUFILE(call) { \
CUfileError_t err = call; \
if (err.err != CU_FILE_SUCCESS) { \
std::cerr << "cuFile Error: " << cufileop_status_error(err.err) << " at line " << __LINE__ << std::endl; \
if (err.err == CU_FILE_DRIVER_VERSION_MISMATCH) std::cerr << "Hint: Driver/Library mismatch." << std::endl; \
exit(1); \
} \
}
void prepare_test_file() {
std::cout << "[INFO] Preparing test file with random pattern..." << std::endl;
std::vector<uint8_t> host_data(TEST_SIZE);
for (size_t i = 0; i < TEST_SIZE; ++i) {
host_data[i] = i % 255;
}
int fd = open(TEST_FILE, O_CREAT | O_RDWR | O_TRUNC, 0664);
if (fd < 0) { perror("open for write failed"); exit(1); }
write(fd, host_data.data(), TEST_SIZE);
close(fd);
sync();
}
int main() {
prepare_test_file();
std::cout << "[INFO] Opening cuFile Driver..." << std::endl;
CHECK_CUFILE(cuFileDriverOpen());
std::cout << "[INFO] Allocating GPU memory..." << std::endl;
void* devPtr;
CHECK_CUDA(cudaSetDevice(0));
CHECK_CUDA(cudaMalloc(&devPtr, TEST_SIZE));
CHECK_CUDA(cudaMemset(devPtr, 0, TEST_SIZE));
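// No explicit cuFileBufRegister here; cuFileRead pins/maps GPU memory internally
// (bounce buffers), which is presumably where the nvidia-fs MAP ioctl failures
// in cufile.log originate.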
std::cout << "[INFO] Opening file with O_DIRECT..." << std::endl;
int fd = open(TEST_FILE, O_RDWR | O_DIRECT);
if (fd < 0) {
std::cerr << "Failed to open file with O_DIRECT. Check filesystem support." << std::endl;
perror("open");
exit(1);
}
std::cout << "[INFO] Registering cuFile handle (The Moment of Truth)..." << std::endl;
CUfileDescr_t cf_descr;
memset(&cf_descr, 0, sizeof(CUfileDescr_t));
cf_descr.handle.fd = fd;
cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
CUfileHandle_t cf_handle;
CHECK_CUFILE(cuFileHandleRegister(&cf_handle, &cf_descr));
std::cout << "[SUCCESS] Handle Registered! " << std::endl;
std::cout << "[INFO] Reading via cuFileRead..." << std::endl;
ssize_t ret = cuFileRead(cf_handle, devPtr, TEST_SIZE, 0, 0);
if (ret < 0) {
std::cerr << "cuFileRead failed with ret: " << ret << std::endl;
CUfileError_t status;
status.err = CU_FILE_IO_ERROR;
CHECK_CUFILE(status);
} else if (ret != TEST_SIZE) {
std::cerr << "Short read: " << ret << " vs expected " << TEST_SIZE << std::endl;
} else {
std::cout << "[SUCCESS] Read " << ret << " bytes direct to GPU." << std::endl;
}
std::cout << "[INFO] Verifying data integrity..." << std::endl;
std::vector<uint8_t> verify_buf(TEST_SIZE);
CHECK_CUDA(cudaMemcpy(verify_buf.data(), devPtr, TEST_SIZE, cudaMemcpyDeviceToHost));
bool pass = true;
for (size_t i = 0; i < TEST_SIZE; ++i) {
if (verify_buf[i] != (i % 255)) {
std::cerr << "Data mismatch at byte " << i << ": expected " << (i%255) << ", got " << (int)verify_buf[i] << std::endl;
pass = false;
break;
}
}
if (pass) {
std::cout << "\n[PASS] GDS Test Passed!\n" << std::endl;
} else {
std::cout << "\n[FAIL] Data corruption detected." << std::endl;
}
cuFileHandleDeregister(cf_handle);
close(fd);
cudaFree(devPtr);
cuFileDriverClose();
unlink(TEST_FILE);
return 0;
}
Bug Incidence
Always
nvidia-bug-report.log.gz
More Info
No response