Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ gpu_compute_capability = "rapids_cli.doctor.checks.gpu:check_gpu_compute_capabil
cuda = "rapids_cli.doctor.checks.cuda_driver:cuda_check"
memory_to_gpu_ratio = "rapids_cli.doctor.checks.memory:check_memory_to_gpu_ratio"
nvlink_status = "rapids_cli.doctor.checks.nvlink:check_nvlink_status"
cuda_version_mismatch = "rapids_cli.doctor.checks.cuda_version_mismatch:check_cuda_major_version_mismatch"

[project.urls]
Homepage = "https://github.com/rapidsai/rapids-cli"
Expand Down
73 changes: 73 additions & 0 deletions rapids_cli/doctor/checks/cuda_version_mismatch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Check for CUDA toolkit vs driver major version mismatch."""

import re
from pathlib import Path

import cuda.pathfinder
import pynvml


def _get_driver_cuda_major() -> int:
    """Return the CUDA major version supported by the installed driver via pynvml.

    Returns:
        The major component of the driver's maximum supported CUDA version,
        e.g. ``12`` for a driver reporting 12040 (CUDA 12.4).
    """
    # NOTE(review): nvmlInit() is never paired with nvmlShutdown(); consider
    # a try/finally if repeated initialization ever becomes a concern.
    pynvml.nvmlInit()
    # The driver reports major*1000 + minor*10; drop everything below major.
    driver_version = pynvml.nvmlSystemGetCudaDriverVersion()
    return driver_version // 1000


def _get_toolkit_cuda_major() -> int | None:
    """Return the CUDA major version of the toolkit found via cuda-pathfinder.

    Returns:
        The toolkit's CUDA major version, or None when no toolkit headers
        (or no recognizable version define) can be located.
    """
    header_dir = cuda.pathfinder.find_nvidia_header_directory("cudart")
    if header_dir is None:
        # cuda-pathfinder found no cudart headers at all.
        return None
    version_header = Path(header_dir) / "cuda_runtime_version.h"
    if not version_header.exists():
        return None
    found = re.search(r"#define\s+CUDA_VERSION\s+(\d+)", version_header.read_text())
    if found is None:
        return None
    # Version is encoded as major*1000 + minor*10, e.g. 12040 -> 12.
    return int(found.group(1)) // 1000
Comment on lines +23 to +27
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm curious if this is the best way to get the CUDA Toolkit version.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was doing some digging to see if we could pull it from cudart via python API if it was available, because cudaRuntimeGetVersion exists

https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART____VERSION.html#group__CUDART____VERSION_1g0e3952c7802fd730432180f1f4a6cdc6

but I wasn't able to do something like

from cuda import cudart
cudart.cudaRuntimeGetVersion()

But with the help of Perplexity, I was able to get the version using ctypes and accessing libcudart.

I don't know if it's cleaner though. But it would be something like this:

import ctypes
from ctypes import byref, c_int

libcudart = ctypes.cdll.LoadLibrary("libcudart.so")  # conda cuda-cudart provides this

cudaRuntimeGetVersion = libcudart.cudaRuntimeGetVersion
cudaRuntimeGetVersion.argtypes = [ctypes.POINTER(c_int)]
cudaRuntimeGetVersion.restype = c_int

ver = c_int()
err = cudaRuntimeGetVersion(byref(ver))
if err != 0:
    raise RuntimeError(f"cudaRuntimeGetVersion failed with error code {err}")

ver_int = ver.value
major = ver_int // 1000
minor = (ver_int % 1000) // 10
print("CUDA runtime version:", ver_int, f"({major}.{minor})")



def check_cuda_major_version_mismatch(
verbose=False,
get_driver_cuda_major=_get_driver_cuda_major,
get_toolkit_cuda_major=_get_toolkit_cuda_major,
Comment on lines +32 to +33
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went with a dependency injection approach here after chatting about it with @mmccarty to make testing easier.

I haven't refactored other checks to reuse this to keep this PR simpler, but we could do that in the future.

**kwargs,
) -> bool | str:
"""Check that the CUDA toolkit major version matches the driver's supported CUDA major version.

Args:
verbose: If True, return a descriptive string on success.
get_driver_cuda_major: Callable returning the driver's max CUDA major version.
get_toolkit_cuda_major: Callable returning the toolkit CUDA major version, or None.
**kwargs: Accepted for forward compatibility.

Returns:
True on success, or a descriptive string if verbose is True.

Raises:
ValueError: If the toolkit and driver major versions differ.
"""
driver_major = get_driver_cuda_major()

toolkit_major = get_toolkit_cuda_major()
if toolkit_major is None:
return True

if toolkit_major > driver_major:
raise ValueError(
f"CUDA toolkit major version ({toolkit_major}) is newer than what the installed driver supports "
f"({driver_major}). Update your NVIDIA driver to one that supports CUDA {toolkit_major} or "
f"downgrade your CUDA toolkit to CUDA {driver_major}."
Comment on lines +58 to +60
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we could improve these errors. It would be nice to detect how CUDA Toolkit has been installed (system, conda, pip, etc) and provide more nuanced advice for the user.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can do that via python, for example I'm in conda environment that has cudf and cuml and you can access that info via

>>> from cuda import pathfinder
>>> loaded = pathfinder.load_nvidia_dynamic_lib("cudart")
>>> loaded.abs_path
'/raid/myuser/conda/envs/ray-cuml/lib/libcudart.so'
>>> loaded.found_via
'conda'

and on a different conda env, that only has cuda-python, but that doesn't have cuda-runtime installed I get this

>>> from cuda import pathfinder
>>> loaded = pathfinder.load_nvidia_dynamic_lib("cudart")
>>> loaded.abs_path
'/usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.13'
>>> loaded.found_via
'system-search'

)

if toolkit_major < driver_major:
raise ValueError(
f"CUDA toolkit major version ({toolkit_major}) is older than the driver's supported CUDA major version "
f"({driver_major}). Upgrade your CUDA toolkit to CUDA {driver_major} or "
f"downgrade your NVIDIA driver to one that supports CUDA {toolkit_major}."
)
Comment on lines +63 to +68
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This shouldn't necessarily be an error, a newer driver is ok as long as the CTK major matches all the packages. The problem would be when you have driver CUDA 13, with CTK 12 but a foo-cu13 Python package. E.g rapidsai/deployment#516


if verbose:
return f"CUDA toolkit major version ({toolkit_major}) matches driver CUDA major version ({driver_major})."

return True
75 changes: 75 additions & 0 deletions rapids_cli/tests/test_cuda_version_mismatch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import patch

import pytest

from rapids_cli.doctor.checks.cuda_version_mismatch import (
check_cuda_major_version_mismatch,
_get_driver_cuda_major,
_get_toolkit_cuda_major,
)


def test_versions_match():
    """Matching driver and toolkit majors should pass the check."""
    outcome = check_cuda_major_version_mismatch(
        get_driver_cuda_major=lambda: 12,
        get_toolkit_cuda_major=lambda: 12,
    )
    assert outcome is True


def test_toolkit_older_than_driver():
    """A toolkit older than the driver's supported CUDA major raises."""
    driver, toolkit = 12, 11
    with pytest.raises(ValueError, match="older than the driver"):
        check_cuda_major_version_mismatch(
            get_driver_cuda_major=lambda: driver,
            get_toolkit_cuda_major=lambda: toolkit,
        )


def test_toolkit_newer_than_driver():
    """A toolkit newer than the driver's supported CUDA major raises.

    The match pattern is branch-specific: the generic prefix
    "CUDA toolkit major version" appears in BOTH error messages, so matching
    on it could not distinguish which branch actually fired.
    """
    with pytest.raises(ValueError, match="newer than what the installed driver supports"):
        check_cuda_major_version_mismatch(
            get_driver_cuda_major=lambda: 11,
            get_toolkit_cuda_major=lambda: 12,
        )


def test_verbose_output():
    """With verbose=True, a successful check returns a descriptive string."""
    message = check_cuda_major_version_mismatch(
        verbose=True,
        get_driver_cuda_major=lambda: 12,
        get_toolkit_cuda_major=lambda: 12,
    )
    assert isinstance(message, str)
    assert "12" in message


def test_get_driver_cuda_major():
    """The pynvml probe converts the packed driver version to a major number.

    nvmlShutdown is patched as well so the test does not depend on whether
    the helper cleans up NVML after querying (robust against that change).
    """
    with (
        patch("pynvml.nvmlInit"),
        patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040),
        patch("pynvml.nvmlShutdown"),
    ):
        # 12040 encodes CUDA 12.4 -> major 12.
        assert _get_driver_cuda_major() == 12


def test_get_toolkit_cuda_major_no_header_dir():
    """No cudart header directory means no toolkit version is reported."""
    with patch("cuda.pathfinder.find_nvidia_header_directory", return_value=None):
        result = _get_toolkit_cuda_major()
    assert result is None


def test_get_toolkit_cuda_major_file_missing(tmp_path):
    """A header directory without cuda_runtime_version.h yields None."""
    empty_dir = str(tmp_path)
    with patch("cuda.pathfinder.find_nvidia_header_directory", return_value=empty_dir):
        assert _get_toolkit_cuda_major() is None


def test_get_toolkit_cuda_major_no_match(tmp_path):
    """A version header without the CUDA_VERSION define yields None."""
    header = tmp_path / "cuda_runtime_version.h"
    header.write_text("/* no version define here */")
    with patch("cuda.pathfinder.find_nvidia_header_directory", return_value=str(tmp_path)):
        assert _get_toolkit_cuda_major() is None


def test_get_toolkit_cuda_major_success(tmp_path):
    """A well-formed CUDA_VERSION define is parsed down to its major number."""
    header = tmp_path / "cuda_runtime_version.h"
    header.write_text("#define CUDA_VERSION 12040\n")
    with patch("cuda.pathfinder.find_nvidia_header_directory", return_value=str(tmp_path)):
        # 12040 -> CUDA 12.4 -> major 12.
        assert _get_toolkit_cuda_major() == 12
Loading
Loading