Skip to content
29 changes: 13 additions & 16 deletions cuda_core/cuda/core/_utils/driver_cu_result_explanations.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

# To regenerate the dictionary below run:
# ../../../../../toolshed/reformat_cuda_enums_as_py.py /usr/local/cuda/include/cuda.h
# Replace the dictionary below with the output.
# Also update the CUDA Toolkit version number below.
from cuda.bindings import driver
from cuda.core._utils.enum_explanations_helpers import get_best_available_explanations

# CUDA Toolkit v13.2.0
DRIVER_CU_RESULT_EXPLANATIONS = {
# CUDA Toolkit v13.1.1
_FALLBACK_EXPLANATIONS = {
0: (
"The API call returned with no errors. In the case of query calls, this"
" also means that the operation being queried is complete (see"
Expand Down Expand Up @@ -334,15 +332,12 @@
" changes which violated constraints specific to instantiated graph update."
),
911: (
"This indicates that an error has occurred in a device outside of GPU. It can be a"
" synchronous error w.r.t. CUDA API or an asynchronous error from the external device."
" In case of asynchronous error, it means that if cuda was waiting for an external device's"
" signal before consuming shared data, the external device signaled an error indicating that"
" the data is not valid for consumption. This leaves the process in an inconsistent"
" state and any further CUDA work will return the same error. To continue using CUDA,"
" the process must be terminated and relaunched."
" In case of synchronous error, it means that one or more external devices"
" have encountered an error and cannot complete the operation."
"This indicates that an async error has occurred in a device outside of CUDA."
" If CUDA was waiting for an external device's signal before consuming shared data,"
" the external device signaled an error indicating that the data is not valid for"
" consumption. This leaves the process in an inconsistent state and any further CUDA"
" work will return the same error. To continue using CUDA, the process must be"
" terminated and relaunched."
),
912: "Indicates a kernel launch error due to cluster misconfiguration.",
913: ("Indiciates a function handle is not loaded when calling an API that requires a loaded function."),
Expand All @@ -356,3 +351,5 @@
),
999: "This indicates that an unknown internal error has occurred.",
}

DRIVER_CU_RESULT_EXPLANATIONS = get_best_available_explanations(driver.CUresult, _FALLBACK_EXPLANATIONS)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be great to move this one level up so this module full of pre-written results doesn't have to be imported at all if we have a new enough cuda-bindings.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done: commit 7025854

See also: comment here laying out our options for how to do this

I went with Option 1.

121 changes: 121 additions & 0 deletions cuda_core/cuda/core/_utils/enum_explanations_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

"""Internal support for error-enum explanations.

``cuda_core`` keeps frozen 13.1.1 fallback tables for older ``cuda-bindings``
releases. Driver/runtime error enums carry usable ``__doc__`` text starting in
the 12.x backport line at ``cuda-bindings`` 12.9.6, and in the mainline 13.x
series at ``cuda-bindings`` 13.2.0. This module decides which source to use
and normalizes generated docstrings so user-facing ``CUDAError`` messages stay
presentable.

The cleanup rules here were derived while validating generated enum docstrings
in PR #1805. Keep them narrow and remove them when codegen quirks or fallback
support are no longer needed.
"""

from __future__ import annotations

import importlib.metadata
import re
from typing import Any

_MIN_12X_BINDING_VERSION_FOR_ENUM_DOCSTRINGS = (12, 9, 6)
_MIN_13X_BINDING_VERSION_FOR_ENUM_DOCSTRINGS = (13, 2, 0)


# ``version.pyx`` cannot be reused here (circular import via ``cuda_utils``).
def _binding_version(version_string: str | None = None) -> tuple[int, int, int]:
    """Return the installed ``cuda-bindings`` version, or a conservative old value.

    Args:
        version_string: Optional version text to parse instead of querying the
            installed distribution. When ``None`` (the default) the version is
            looked up with ``importlib.metadata.version("cuda-bindings")``.

    Returns:
        A ``(major, minor, patch)`` tuple of ints. ``(0, 0, 0)`` is returned
        when the ``cuda-bindings`` distribution cannot be found.

    Only the leading decimal digits of each of the first three dot-separated
    components are used, so suffixed releases such as ``13.2.0rc1`` parse as
    ``(13, 2, 0)`` instead of raising ``ValueError``. Missing components are
    filled with zeros so the result always has three entries.
    """
    if version_string is None:
        try:
            version_string = importlib.metadata.version("cuda-bindings")
        except importlib.metadata.PackageNotFoundError:
            return (0, 0, 0)  # For very old versions of cuda-python

    numbers: list[int] = []
    for component in version_string.split(".")[:3]:
        leading_digits = re.match(r"[0-9]+", component)
        if leading_digits is None:
            # e.g. "13.2.dev0": nothing numeric left to read, stop here.
            break
        numbers.append(int(leading_digits.group()))

    # Pad short versions such as "13.2": a 2-tuple compares *less than*
    # (13, 2, 0), which would wrongly classify that release as too old.
    numbers.extend([0] * (3 - len(numbers)))
    return (numbers[0], numbers[1], numbers[2])


def _binding_version_has_usable_enum_docstrings(version: tuple[int, int, int]) -> bool:
    """Tell whether a ``cuda-bindings`` release ships usable error-enum ``__doc__`` text.

    Args:
        version: ``(major, minor, patch)`` of the bindings release to classify.

    Returns:
        ``True`` when ``version`` is at or above the 13.x minimum, or lies in
        the 12.x range that starts at the 12.x minimum and ends before 13.0.0.
    """
    # Mainline 13.x series (and anything newer).
    if version >= _MIN_13X_BINDING_VERSION_FOR_ENUM_DOCSTRINGS:
        return True
    # 12.x backport line: bounded above so early 13.x releases are excluded.
    return _MIN_12X_BINDING_VERSION_FOR_ENUM_DOCSTRINGS <= version < (13, 0, 0)


def _fix_hyphenation_wordwrap_spacing(s: str) -> str:
    """Remove the stray space that line wrapping leaves next to a hyphen.

    Generated ``__doc__`` text can contain wrapped forms such as ``non- linear``
    that would otherwise look awkward in user-facing messages. The cleanup is
    deliberately asymmetric: it only fires when the hyphen touches a word
    character on one side and is separated by a single space on the other.
    It is not restricted to lowercase letters, so ``GPU- Direct``,
    ``peer -GPU`` and ``L2- cache`` are repaired as well, while an intentional
    separator such as ``a - b`` is left untouched.

    Args:
        s: Text to clean. Expected to have whitespace already collapsed to
            single spaces; longer runs next to a hyphen are not rewritten.

    Returns:
        ``s`` with the wrap-induced spaces around hyphens removed.
    """
    # "word- word" -> "word-word". Lookarounds keep the neighbouring characters
    # out of the match, so chained cases ("a- b- c") are fixed in a single pass.
    s = re.sub(r"(?<=\w)- (?=\w)", "-", s)
    # "word -word" -> "word-word". A letter is required after the hyphen so
    # that negative numbers ("returns -1") are not glued to the preceding word.
    s = re.sub(r"(?<=\w) -(?=[^\W\d_])", "-", s)
    return s


def clean_enum_member_docstring(doc: str | None) -> str | None:
    """Turn an enum member ``__doc__`` into plain text.

    The generated enum docstrings are already close to user-facing prose, but
    they may contain Sphinx inline roles, line wrapping, or a small known
    codegen defect. Normalize only those differences so the text is suitable
    for error messages.

    Args:
        doc: Raw ``__doc__`` text of an enum member, or ``None``.

    Returns:
        ``None`` when ``doc`` is ``None``; otherwise the text with inline roles
        unwrapped, ``*``/``**`` emphasis markers removed, whitespace collapsed
        to single spaces, and wrap-induced spaces around hyphens removed.
    """
    if doc is None:
        return None
    s = doc
    # Known codegen bug on cudaErrorIncompatibleDriverContext. Remove once fixed
    # in cuda-bindings code generation.
    s = s.replace("\n:py:obj:`~.Interactions`", ' "Interactions ')
    # Unwrap any inline role rather than an enumerated list of ``py`` roles:
    # both ``:role:`x``` and ``:domain:role:`x``` are accepted, so ``:term:``
    # and ``:c:func:`` are handled the same way as ``:py:obj:``. A leading
    # "~." or "." on the target is dropped so only the bare name remains.
    s = re.sub(
        r":(?:[A-Za-z][\w-]*:){1,2}`([^`]+)`",
        lambda m: re.sub(r"^~?\.", "", m.group(1)),
        s,
    )
    # Strip bold first so its inner asterisks are not mistaken for italics.
    s = re.sub(r"\*\*([^*]+)\*\*", r"\1", s)
    s = re.sub(r"\*([^*]+)\*", r"\1", s)
    # Collapse wrapped lines and repeated spaces into single spaces.
    s = re.sub(r"\s+", " ", s).strip()
    s = _fix_hyphenation_wordwrap_spacing(s)
    return s
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems really brittle, but I guess I don't see a way around it. I would prefer to just depend on rst2txt or something, but that seems unmaintained.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed offline: what we have here is sufficiently simple, extensively unit tested, and in the worst case we have a cosmetic issue (unwanted sphinx markup in the error messages).

To address the "brittle" concern, I added commit 27aae00

For easy reference, copy-pasting from the commit message:

Add a small set of real enum-doc cleanup examples that assert today's exact cleaned output for representative live bindings cases. Mark unexpected drift as xfail so future upstream doc changes trigger manual review without causing a hard test failure.

I believe in combination with the rest of the extensive unit tests, there is very little wiggle room for major issues to go undetected.



class DocstringBackedExplanations:
    """Read-only, ``dict.get``-style view over an enum's member ``__doc__`` text.

    Exists so the ``.get(int(error))`` lookup shape used by ``cuda_utils.pyx``
    keeps working when explanations come from enum docstrings instead of a
    dictionary.
    """

    __slots__ = ("_enum_type",)

    def __init__(self, enum_type: Any) -> None:
        # Enum class whose members are looked up by integer value in ``get``.
        self._enum_type = enum_type

    def get(self, code: int, default: str | None = None) -> str | None:
        """Return the cleaned docstring of the member with value ``code``.

        ``default`` is returned when ``code`` does not map to a member (the
        enum constructor raises ``ValueError``) or when the member's
        ``__doc__`` is ``None``.
        """
        try:
            entry = self._enum_type(code)
        except ValueError:
            return default
        else:
            text = entry.__doc__

        return default if text is None else clean_enum_member_docstring(text)


def get_best_available_explanations(
    enum_type: Any, fallback: dict[int, str | tuple[str, ...]]
) -> DocstringBackedExplanations | dict[int, str | tuple[str, ...]]:
    """Choose the explanation source that matches the installed bindings.

    Enum-member ``__doc__`` text is used only for bindings versions known to
    expose usable per-member text (12.9.6+ in the 12.x backport line, 13.2.0+
    in the 13.x mainline). For every other version the frozen 13.1.1 fallback
    table is returned unchanged.

    Args:
        enum_type: Error enum class to wrap when docstrings are usable.
        fallback: Pre-written explanation table keyed by integer error code.

    Returns:
        A ``DocstringBackedExplanations`` wrapping ``enum_type``, or ``fallback``.
    """
    installed = _binding_version()
    if _binding_version_has_usable_enum_docstrings(installed):
        return DocstringBackedExplanations(enum_type)
    return fallback
34 changes: 13 additions & 21 deletions cuda_core/cuda/core/_utils/runtime_cuda_error_explanations.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

# To regenerate the dictionary below run:
# ../../../../../toolshed/reformat_cuda_enums_as_py.py /usr/local/cuda/include/driver_types.h
# Replace the dictionary below with the output.
# Also update the CUDA Toolkit version number below.
from cuda.bindings import runtime
from cuda.core._utils.enum_explanations_helpers import get_best_available_explanations

# CUDA Toolkit v13.2.0
RUNTIME_CUDA_ERROR_EXPLANATIONS = {
# CUDA Toolkit v13.1.1
_FALLBACK_EXPLANATIONS = {
0: (
"The API call returned with no errors. In the case of query calls, this"
" also means that the operation being queried is complete (see"
Expand Down Expand Up @@ -52,11 +50,6 @@
" requesting too many threads or blocks. See ::cudaDeviceProp for more"
" device limitations."
),
10: (
"This indicates that the driver is newer than the runtime version"
" and returned graph node parameter information that the runtime"
" does not understand and is unable to translate."
),
12: (
"This indicates that one or more of the pitch-related parameters passed"
" to the API call is not within the acceptable range for pitch."
Expand Down Expand Up @@ -523,15 +516,12 @@
" changes which violated constraints specific to instantiated graph update."
),
911: (
"This indicates that an error has occurred in a device outside of GPU. It can be a"
" synchronous error w.r.t. CUDA API or an asynchronous error from the external device."
" In case of asynchronous error, it means that if cuda was waiting for an external device's"
" signal before consuming shared data, the external device signaled an error indicating that"
" the data is not valid for consumption. This leaves the process in an inconsistent"
" state and any further CUDA work will return the same error. To continue using CUDA,"
" the process must be terminated and relaunched."
" In case of synchronous error, it means that one or more external devices"
" have encountered an error and cannot complete the operation."
"This indicates that an async error has occurred in a device outside of CUDA."
" If CUDA was waiting for an external device's signal before consuming shared data,"
" the external device signaled an error indicating that the data is not valid for"
" consumption. This leaves the process in an inconsistent state and any further CUDA"
" work will return the same error. To continue using CUDA, the process must be"
" terminated and relaunched."
),
912: ("This indicates that a kernel launch error has occurred due to cluster misconfiguration."),
913: ("Indiciates a function handle is not loaded when calling an API that requires a loaded function."),
Expand All @@ -549,3 +539,5 @@
" This error return is deprecated as of CUDA 4.1."
),
}

RUNTIME_CUDA_ERROR_EXPLANATIONS = get_best_available_explanations(runtime.cudaError_t, _FALLBACK_EXPLANATIONS)
86 changes: 54 additions & 32 deletions cuda_core/tests/test_cuda_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,40 +11,12 @@
from cuda.core._utils.clear_error_support import assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable


def test_driver_cu_result_explanations_health():
    """Every ``CUresult`` member has an explanation, and the table has no stale codes."""
    expl_dict = cuda_utils.DRIVER_CU_RESULT_EXPLANATIONS

    # Ensure all CUresult enums have an explanation. Look codes up with
    # ``.get`` instead of ``in``: the explanations object may be the
    # docstring-backed shim, which only implements ``get``.
    known_codes = {int(error) for error in driver.CUresult}
    missing = sorted(code for code in known_codes if expl_dict.get(code) is None)
    assert not missing

    from cuda.core._utils.version import binding_version

    # Only a real dictionary can carry keys that are not CUresult members.
    if isinstance(expl_dict, dict) and binding_version() >= (13, 0, 0):
        # Ensure expl_dict has no codes not known as a CUresult enum
        extra_expl = sorted(set(expl_dict.keys()) - known_codes)
        assert not extra_expl


def test_runtime_cuda_error_explanations_health():
    """Every ``cudaError_t`` member has an explanation, and the table has no stale codes."""
    expl_dict = cuda_utils.RUNTIME_CUDA_ERROR_EXPLANATIONS

    # Ensure all cudaError_t enums have an explanation. Look codes up with
    # ``.get`` instead of ``in``: the explanations object may be the
    # docstring-backed shim, which only implements ``get``.
    known_codes = {int(error) for error in runtime.cudaError_t}
    missing = sorted(code for code in known_codes if expl_dict.get(code) is None)
    assert not missing

    from cuda.core._utils.version import binding_version

    # Only a real dictionary can carry keys that are not cudaError_t members.
    if isinstance(expl_dict, dict) and binding_version() >= (13, 0, 0):
        # Ensure expl_dict has no codes not known as a cudaError_t enum
        extra_expl = sorted(set(expl_dict.keys()) - known_codes)
        assert not extra_expl


def _skip_if_bindings_pre_enum_docstrings():
    """Skip the calling test when the bindings lack usable enum ``__doc__`` text."""
    from cuda.core._utils.enum_explanations_helpers import _binding_version_has_usable_enum_docstrings
    from cuda.core._utils.version import binding_version

    if not _binding_version_has_usable_enum_docstrings(binding_version()):
        pytest.skip("cuda-bindings version does not expose usable enum __doc__ strings")


def test_check_driver_error():
Expand Down Expand Up @@ -85,6 +57,56 @@ def test_check_runtime_error():
assert num_unexpected < len(driver.CUresult) * 0.5


def test_driver_error_enum_has_non_empty_docstring():
    """A representative ``CUresult`` member exposes non-blank ``__doc__`` text."""
    _skip_if_bindings_pre_enum_docstrings()

    member_doc = driver.CUresult.CUDA_ERROR_INVALID_VALUE.__doc__
    # Check for ``None`` separately so a missing docstring fails with a clear
    # assertion instead of an AttributeError on ``.strip()``.
    assert member_doc is not None
    stripped = member_doc.strip()
    assert stripped != ""


def test_runtime_error_enum_has_non_empty_docstring():
    """A representative ``cudaError_t`` member exposes non-blank ``__doc__`` text."""
    _skip_if_bindings_pre_enum_docstrings()

    member_doc = runtime.cudaError_t.cudaErrorInvalidValue.__doc__
    # Check for ``None`` separately so a missing docstring fails with a clear
    # assertion instead of an AttributeError on ``.strip()``.
    assert member_doc is not None
    stripped = member_doc.strip()
    assert stripped != ""


def test_check_driver_error_attaches_explanation():
    """``_check_driver_error`` reports ``<name>: <explanation>``, not the driver's own string."""
    bad_value = driver.CUresult.CUDA_ERROR_INVALID_VALUE

    status, raw_name = driver.cuGetErrorName(bad_value)
    assert status == driver.CUresult.CUDA_SUCCESS
    status, raw_desc = driver.cuGetErrorString(bad_value)
    assert status == driver.CUresult.CUDA_SUCCESS

    explanation = cuda_utils.DRIVER_CU_RESULT_EXPLANATIONS.get(int(bad_value))
    assert explanation is not None
    # The explanation must differ from the driver's description, otherwise the
    # message checks below could not tell the two sources apart.
    driver_text = raw_desc.decode()
    assert explanation != driver_text

    with pytest.raises(cuda_utils.CUDAError) as raised:
        cuda_utils._check_driver_error(bad_value)

    message = str(raised.value)
    error_name = raw_name.decode()
    assert message == f"{error_name}: {explanation}"
    assert message != f"{error_name}: {driver_text}"


def test_check_runtime_error_attaches_explanation():
    """``_check_runtime_error`` reports ``<name>: <explanation>``, not the runtime's own string."""
    bad_value = runtime.cudaError_t.cudaErrorInvalidValue

    status, raw_name = runtime.cudaGetErrorName(bad_value)
    assert status == runtime.cudaError_t.cudaSuccess
    status, raw_desc = runtime.cudaGetErrorString(bad_value)
    assert status == runtime.cudaError_t.cudaSuccess

    explanation = cuda_utils.RUNTIME_CUDA_ERROR_EXPLANATIONS.get(int(bad_value))
    assert explanation is not None
    # The explanation must differ from the runtime's description, otherwise the
    # message checks below could not tell the two sources apart.
    runtime_text = raw_desc.decode()
    assert explanation != runtime_text

    with pytest.raises(cuda_utils.CUDAError) as raised:
        cuda_utils._check_runtime_error(bad_value)

    message = str(raised.value)
    error_name = raw_name.decode()
    assert message == f"{error_name}: {explanation}"
    assert message != f"{error_name}: {runtime_text}"


def test_precondition():
def checker(*args, what=""):
if args[0] < 0:
Expand Down
Loading
Loading