support for GDS and cuFile API #17

@lyquid617

Description

NVIDIA Open GPU Kernel Modules Version

590.44.01

Please confirm this issue does not happen with the proprietary driver (of the same version). This issue tracker is only for bugs specific to the open kernel driver.

  • I confirm that this does not happen with the proprietary driver package.

Operating System and Version

Ubuntu 22.04.5 LTS

Kernel Release

Linux GPU-18 5.15.0-127-generic #137-Ubuntu SMP Fri Nov 8 15:21:01 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux

Please confirm you are running a stable release kernel (e.g. not a -rc). We do not accept bug reports for unreleased kernels.

  • I am running on a stable kernel release.

Hardware: GPU

RTX 5090

Describe the bug

Thanks to the community: after updating the driver, both p2ptest and nccl-tests work perfectly.
However, when I try to use GDS features through the cuFile API, cuFile falls back to compatibility mode.
cufile.log shows:

 29-12-2025 13:30:14:51 [pid=111425 tid=112507] ERROR  0:522 nvidia-fs MAP ioctl failed : ioctl_return: -22 ioctl_ret: -1
 29-12-2025 13:30:14:51 [pid=111425 tid=112507] ERROR  0:555 map failed

 29-12-2025 13:30:14:52 [pid=111425 tid=112507] ERROR  cufio-obj:156 error allocating nvfs handle, size: 2097152
 29-12-2025 13:30:14:52 [pid=111425 tid=112507] ERROR  cufio_core:1648 cuFileBufRegister error, object allocation failed
 29-12-2025 13:30:14:52 [pid=111425 tid=112507] ERROR  cufio_core:1729 cuFileBufRegister error cufile success
 29-12-2025 13:30:14:54 [pid=111425 tid=112507] ERROR  0:522 nvidia-fs MAP ioctl failed : ioctl_return: -22 ioctl_ret: -1
 29-12-2025 13:30:14:54 [pid=111425 tid=112507] ERROR  0:555 map failed
 29-12-2025 13:30:14:65 [pid=111425 tid=112507] ERROR  0:866 Buffer map failed for PCI-Group: 4 GPU: 4
 29-12-2025 13:30:14:65 [pid=111425 tid=112507] ERROR  0:994 Failed to obtain bounce buffer from domain: 4 GPU: 4
 29-12-2025 13:30:14:65 [pid=111425 tid=112507] ERROR  0:1299 failed to get bounce buffer for PCI group 4 GPU 4
 29-12-2025 13:30:14:65 [pid=111425 tid=112507] ERROR  cufio_core:3371 Final direct subio failed retval  -5011  buf_offset:  0  file_offset:  4096  size:  528384
 29-12-2025 13:30:14:65 [pid=111425 tid=112507] ERROR  cufio_core:3392 Setting I/O to failed. Expected I/O Size  528384  actual:  0
 29-12-2025 13:30:14:65 [pid=111425 tid=112515] ERROR  cufio-obj:156 error allocating nvfs handle, size: 2097152
 29-12-2025 13:30:14:65 [pid=111425 tid=112515] ERROR  cufio_core:1648 cuFileBufRegister error, object allocation failed
 29-12-2025 13:30:14:65 [pid=111425 tid=112515] ERROR  cufio_core:1729 cuFileBufRegister error cufile success
 29-12-2025 13:30:14:65 [pid=111425 tid=112525] ERROR  0:866 Buffer map failed for PCI-Group: 4 GPU: 4
 29-12-2025 13:30:14:65 [pid=111425 tid=112525] ERROR  0:994 Failed to obtain bounce buffer from domain: 4 GPU: 4
 29-12-2025 13:30:14:65 [pid=111425 tid=112525] ERROR  0:1299 failed to get bounce buffer for PCI group 4 GPU 4
 29-12-2025 13:30:14:65 [pid=111425 tid=112525] ERROR  cufio_core:3371 Final direct subio failed retval  -5011  buf_offset:  0  file_offset:  4096  size:  528384
 29-12-2025 13:30:14:65 [pid=111425 tid=112525] ERROR  cufio_core:3392 Setting I/O to failed. Expected I/O Size  528384  actual:  0
 29-12-2025 13:30:14:67 [pid=111425 tid=112515] ERROR  0:522 nvidia-fs MAP ioctl failed : ioctl_return: -22 ioctl_ret: -1
 29-12-2025 13:30:14:67 [pid=111425 tid=112515] ERROR  0:555 map failed

dmesg shows:

[  688.044426] nvidia-fs:nvfs_pin_gpu_pages:1336 Error ret -22 invoking nvidia_p2p_get_pages_persistent
                va_start=0x40fc3400000/va_end=0x40fc34fffff/rounded_size=0x100000/gpu_buf_length=0x100000
[  688.045534] nvidia-fs:nvfs_pin_gpu_pages:1336 Error ret -22 invoking nvidia_p2p_get_pages_persistent
                va_start=0x80fc2d00000/va_end=0x80fc2dfffff/rounded_size=0x100000/gpu_buf_length=0x100000
[  688.048290] nvidia-fs:nvfs_pin_gpu_pages:1336 Error ret -22 invoking nvidia_p2p_get_pages_persistent
                va_start=0x80fc2f00000/va_end=0x80fc2ffffff/rounded_size=0x100000/gpu_buf_length=0x100000
[  688.051622] nvidia-fs:nvfs_pin_gpu_pages:1336 Error ret -22 invoking nvidia_p2p_get_pages_persistent
                va_start=0x80fc2900000/va_end=0x80fc29fffff/rounded_size=0x100000/gpu_buf_length=0x100000
[  688.054450] nvidia-fs:nvfs_pin_gpu_pages:1336 Error ret -22 invoking nvidia_p2p_get_pages_persistent
I suspect the problem lies in nvidia_p2p_get_pages_persistent, which is returning -22 (-EINVAL).
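
The compatibility-mode fallback can also be confirmed from user space. Below is a minimal sketch (not part of the original report) that queries the cuFile driver properties via the public cuFileDriverGetProperties API; when the nvidia-fs kernel module is usable, props.nvfs.major_version/minor_version should report its version (my assumption is that they read as zero in pure compat mode):

// Hedged sketch: check whether cuFile sees a usable nvidia-fs driver.
#include <cstdio>
#include <cufile.h>

int main() {
    CUfileError_t st = cuFileDriverOpen();
    if (st.err != CU_FILE_SUCCESS) {
        std::fprintf(stderr, "cuFileDriverOpen: %s\n", cufileop_status_error(st.err));
        return 1;
    }
    CUfileDrvProps_t props{};
    st = cuFileDriverGetProperties(&props);
    if (st.err == CU_FILE_SUCCESS) {
        // Assumption: a 0.0 version here indicates cuFile could not use
        // nvidia-fs and is running in compatibility mode.
        std::printf("nvfs driver version: %u.%u\n",
                    props.nvfs.major_version, props.nvfs.minor_version);
    }
    cuFileDriverClose();
    return 0;
}

Setting "allow_compat_mode": false under "properties" in /etc/cufile.json should also turn the silent fallback into a hard error, which makes the failure easier to catch in tests (based on the documented cufile.json option).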

To Reproduce

Compile nvidia-fs from source and insmod nvidia-fs.ko.
Then use the cuFile API, for example:

#include <iostream>
#include <vector>
#include <fcntl.h>
#include <unistd.h>
#include <cstring>
#include <cassert>
#include <sys/stat.h>

// CUDA & cuFile headers
#include <cuda_runtime.h>
#include <cufile.h>

#define MB(x) ((x) * 1024 * 1024UL)
#define TEST_SIZE MB(4)
#define TEST_FILE "remote directory triggering gds" // placeholder: point at a file on a GDS-capable filesystem

#define CHECK_CUDA(call) { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line " << __LINE__ << std::endl; \
        exit(1); \
    } \
}

#define CHECK_CUFILE(call) { \
    CUfileError_t err = call; \
    if (err.err != CU_FILE_SUCCESS) { \
        std::cerr << "cuFile Error: " << cufileop_status_error(err.err) << " at line " << __LINE__ << std::endl; \
        if (err.err == CU_FILE_DRIVER_VERSION_MISMATCH) std::cerr << "Hint: Driver/Library mismatch." << std::endl; \
        exit(1); \
    } \
}

void prepare_test_file() {
    std::cout << "[INFO] Preparing test file with random pattern..." << std::endl;
    std::vector<uint8_t> host_data(TEST_SIZE);
    for (size_t i = 0; i < TEST_SIZE; ++i) {
        host_data[i] = i % 255;
    }

    int fd = open(TEST_FILE, O_CREAT | O_RDWR | O_TRUNC, 0664);
    if (fd < 0) { perror("open for write failed"); exit(1); }
    if (write(fd, host_data.data(), TEST_SIZE) != (ssize_t)TEST_SIZE) { perror("write failed"); exit(1); }
    close(fd);
    sync(); 
}

int main() {
    prepare_test_file();

    std::cout << "[INFO] Opening cuFile Driver..." << std::endl;
    CHECK_CUFILE(cuFileDriverOpen());

    std::cout << "[INFO] Allocating GPU memory..." << std::endl;
    void* devPtr;
    CHECK_CUDA(cudaSetDevice(0)); 
    CHECK_CUDA(cudaMalloc(&devPtr, TEST_SIZE));
    CHECK_CUDA(cudaMemset(devPtr, 0, TEST_SIZE)); 

    std::cout << "[INFO] Opening file with O_DIRECT..." << std::endl;
    int fd = open(TEST_FILE, O_RDWR | O_DIRECT);
    if (fd < 0) {
        std::cerr << "Failed to open file with O_DIRECT. Check filesystem support." << std::endl;
        perror("open");
        exit(1);
    }

    std::cout << "[INFO] Registering cuFile handle (The Moment of Truth)..." << std::endl;
    CUfileDescr_t cf_descr;
    memset(&cf_descr, 0, sizeof(CUfileDescr_t));
    cf_descr.handle.fd = fd;
    cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    
    CUfileHandle_t cf_handle;
    CHECK_CUFILE(cuFileHandleRegister(&cf_handle, &cf_descr));
    std::cout << "[SUCCESS] Handle Registered! " << std::endl;

    std::cout << "[INFO] Reading via cuFileRead..." << std::endl;
    ssize_t ret = cuFileRead(cf_handle, devPtr, TEST_SIZE, 0, 0);
    if (ret < 0) {
        std::cerr << "cuFileRead failed with ret: " << ret << std::endl;
        CUfileError_t status;
        status.err = CU_FILE_IO_ERROR;
        CHECK_CUFILE(status);
    } else if (ret != TEST_SIZE) {
        std::cerr << "Short read: " << ret << " vs expected " << TEST_SIZE << std::endl;
    } else {
        std::cout << "[SUCCESS] Read " << ret << " bytes direct to GPU." << std::endl;
    }

    std::cout << "[INFO] Verifying data integrity..." << std::endl;
    std::vector<uint8_t> verify_buf(TEST_SIZE);
    CHECK_CUDA(cudaMemcpy(verify_buf.data(), devPtr, TEST_SIZE, cudaMemcpyDeviceToHost));

    bool pass = true;
    for (size_t i = 0; i < TEST_SIZE; ++i) {
        if (verify_buf[i] != (i % 255)) {
            std::cerr << "Data mismatch at byte " << i << ": expected " << (i%255) << ", got " << (int)verify_buf[i] << std::endl;
            pass = false;
            break;
        }
    }

    if (pass) {
        std::cout << "\n[PASS] GDS Test Passed!\n" << std::endl;
    } else {
        std::cout << "\n[FAIL] Data corruption detected." << std::endl;
    }

    cuFileHandleDeregister(cf_handle);
    close(fd);
    cudaFree(devPtr);
    cuFileDriverClose();
    unlink(TEST_FILE);

    return 0;
}
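
For reference, the example builds with the usual cuFile link flags, e.g. nvcc gds_test.cpp -o gds_test -lcufile (assuming the CUDA toolkit provides cufile.h and libcufile).

The cufile.log errors also implicate cuFileBufRegister, which cuFile calls internally when setting up its buffers. Here is a hedged standalone sketch using explicit registration, which I would expect to hit the same nvidia-fs MAP ioctl path (assumption; the 2097152-byte size matches the failing allocation in cufile.log):

// Hedged sketch: explicit buffer registration via cuFileBufRegister.
#include <cstdio>
#include <cuda_runtime.h>
#include <cufile.h>

int main() {
    void* devPtr = nullptr;
    if (cudaMalloc(&devPtr, 2097152) != cudaSuccess) return 1; // 2 MiB
    CUfileError_t st = cuFileDriverOpen();
    if (st.err != CU_FILE_SUCCESS) { cudaFree(devPtr); return 1; }
    st = cuFileBufRegister(devPtr, 2097152, 0);
    std::printf("cuFileBufRegister: %s\n",
                st.err == CU_FILE_SUCCESS ? "OK" : cufileop_status_error(st.err));
    if (st.err == CU_FILE_SUCCESS) cuFileBufDeregister(devPtr);
    cuFileDriverClose();
    cudaFree(devPtr);
    return 0;
}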

Bug Incidence

Always

nvidia-bug-report.log.gz

More Info

No response
