Skip to content

Commit 4cbe3f3

Browse files
committed
cuda_core: derive error enum explanations from bindings docstrings
Use cleaned driver/runtime enum __doc__ text from cuda-bindings 13.2.0+ as the primary source for CUDA error explanations in cuda_core, while freezing the 13.1.1 explanation tables as fallback for older bindings. Centralize the version-gated selection and docstring cleanup helpers, update the driver/runtime explanation modules to use them, add tests that verify representative enums expose __doc__ and that cuda_utils attaches the explanation text, and remove the obsolete enum-reformat toolshed helper script. Made-with: Cursor
1 parent 5064470 commit 4cbe3f3

File tree

6 files changed

+333
-181
lines changed

6 files changed

+333
-181
lines changed

cuda_core/cuda/core/_utils/driver_cu_result_explanations.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
33

4-
# To regenerate the dictionary below run:
5-
# ../../../../../toolshed/reformat_cuda_enums_as_py.py /usr/local/cuda/include/cuda.h
6-
# Replace the dictionary below with the output.
7-
# Also update the CUDA Toolkit version number below.
4+
from cuda.bindings import driver
5+
from cuda.core._utils.enum_explanations_helpers import get_best_available_explanations
86

9-
# CUDA Toolkit v13.2.0
10-
DRIVER_CU_RESULT_EXPLANATIONS = {
7+
# CUDA Toolkit v13.1.1
8+
_FALLBACK_EXPLANATIONS = {
119
0: (
1210
"The API call returned with no errors. In the case of query calls, this"
1311
" also means that the operation being queried is complete (see"
@@ -334,15 +332,12 @@
334332
" changes which violated constraints specific to instantiated graph update."
335333
),
336334
911: (
337-
"This indicates that an error has occurred in a device outside of GPU. It can be a"
338-
" synchronous error w.r.t. CUDA API or an asynchronous error from the external device."
339-
" In case of asynchronous error, it means that if cuda was waiting for an external device's"
340-
" signal before consuming shared data, the external device signaled an error indicating that"
341-
" the data is not valid for consumption. This leaves the process in an inconsistent"
342-
" state and any further CUDA work will return the same error. To continue using CUDA,"
343-
" the process must be terminated and relaunched."
344-
" In case of synchronous error, it means that one or more external devices"
345-
" have encountered an error and cannot complete the operation."
335+
"This indicates that an async error has occurred in a device outside of CUDA."
336+
" If CUDA was waiting for an external device's signal before consuming shared data,"
337+
" the external device signaled an error indicating that the data is not valid for"
338+
" consumption. This leaves the process in an inconsistent state and any further CUDA"
339+
" work will return the same error. To continue using CUDA, the process must be"
340+
" terminated and relaunched."
346341
),
347342
912: "Indicates a kernel launch error due to cluster misconfiguration.",
348343
913: ("Indiciates a function handle is not loaded when calling an API that requires a loaded function."),
@@ -356,3 +351,5 @@
356351
),
357352
999: "This indicates that an unknown internal error has occurred.",
358353
}
354+
355+
DRIVER_CU_RESULT_EXPLANATIONS = get_best_available_explanations(driver.CUresult, _FALLBACK_EXPLANATIONS)
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
3+
4+
"""Internal support for error-enum explanations.
5+
6+
``cuda_core`` keeps frozen 13.1.1 fallback tables for older ``cuda-bindings``
7+
releases. Starting with ``cuda-bindings`` 13.2.0, driver/runtime error enums
8+
carry usable ``__doc__`` text. This module decides which source to use and
9+
normalizes generated docstrings so user-facing ``CUDAError`` messages stay
10+
close to the long-form explanation prose.
11+
12+
The cleanup rules here were derived while validating docstring-vs-dict parity
13+
in PR #1805. Keep them narrow and remove them when codegen / fallback support is
14+
no longer needed.
15+
"""
16+
17+
from __future__ import annotations
18+
19+
import importlib.metadata
20+
import re
21+
from typing import Any
22+
23+
_MIN_BINDING_VERSION_FOR_ENUM_DOCSTRINGS = (13, 2, 0)
24+
25+
26+
# ``version.pyx`` cannot be reused here (circular import via ``cuda_utils``).
27+
def _binding_version() -> tuple[int, int, int]:
28+
"""Return the installed ``cuda-bindings`` version, or a conservative old value."""
29+
try:
30+
parts = importlib.metadata.version("cuda-bindings").split(".")[:3]
31+
except importlib.metadata.PackageNotFoundError:
32+
return (0, 0, 0) # For very old versions of cuda-python
33+
return tuple(int(v) for v in parts)
34+
35+
36+
def _strip_doxygen_double_colon_prefixes(s: str) -> str:
37+
"""Remove Doxygen-style ``::`` before CUDA identifiers (not C++ ``Foo::Bar`` scope).
38+
39+
The frozen fallback tables come from CUDA header comments and therefore use
40+
Doxygen ``::name`` references. Generated enum ``__doc__`` text uses Sphinx
41+
roles instead, so parity checks need a small amount of normalization.
42+
"""
43+
prev = None
44+
while prev != s:
45+
prev = s
46+
s = re.sub(r"(?<![A-Za-z0-9_])::+([A-Za-z_][A-Za-z0-9_]*)", r"\1", s)
47+
return s
48+
49+
50+
def _fix_hyphenation_wordwrap_spacing(s: str) -> str:
51+
"""Remove spaces around hyphens introduced by line wrapping in generated ``__doc__`` text.
52+
53+
This is a narrow workaround for wrapped forms such as ``non- linear`` that
54+
otherwise differ from the single-line fallback prose.
55+
"""
56+
prev = None
57+
while prev != s:
58+
prev = s
59+
s = re.sub(r"([a-z])- ([a-z])", r"\1-\2", s)
60+
s = re.sub(r"([a-z]) -([a-z])", r"\1-\2", s)
61+
return s
62+
63+
64+
def clean_enum_member_docstring(doc: str | None) -> str | None:
    """Convert an enum member ``__doc__`` into plain, single-line text.

    Generated enum docstrings are close to the fallback explanation prose but
    may carry Sphinx inline roles, ``*``/``**`` emphasis markers, wrapped
    lines, and one known codegen defect. Only those differences are
    normalized so the text can be shown in user-facing error messages.

    Returns ``None`` when *doc* is ``None``; otherwise the cleaned string.
    """
    if doc is None:
        return None

    # Known codegen bug on cudaErrorIncompatibleDriverContext. Remove once fixed
    # in cuda-bindings code generation. The needle must NOT be a raw string:
    # r"\n..." would not match the real newline present in __doc__.
    text = doc.replace("\n:py:obj:`~.Interactions`", ' "Interactions ')

    def _role_target(match):
        # Keep only the role target, minus a leading "~." or "." prefix.
        return re.sub(r"^~?\.", "", match.group(1))

    text = re.sub(
        r":(?:py:)?(?:obj|func|meth|class|mod|data|const|exc):`([^`]+)`",
        _role_target,
        text,
    )

    # Strip strong emphasis first so "**x**" is not half-consumed by the
    # single-asterisk pattern.
    for emphasis_pattern in (r"\*\*([^*]+)\*\*", r"\*([^*]+)\*"):
        text = re.sub(emphasis_pattern, r"\1", text)

    # Collapse all whitespace runs (including newlines from wrapping).
    text = re.sub(r"\s+", " ", text).strip()
    return _fix_hyphenation_wordwrap_spacing(text)
89+
90+
91+
class DocstringBackedExplanations:
    """Read-only, ``dict.get``-style view over enum-member ``__doc__`` text.

    When the bindings-version gate reports that docstrings are available they
    are the only source consulted. A code that is unknown to the enum, or a
    member without a docstring, yields ``default`` instead of silently
    mixing in frozen fallback prose.
    """

    __slots__ = ("_enum_type",)

    def __init__(self, enum_type: Any) -> None:
        # Called as ``enum_type(code)`` in ``get`` to look up a member by value.
        self._enum_type = enum_type

    def get(self, code: int, default: str | None = None) -> str | None:
        """Return the cleaned docstring for *code*, or *default* if unavailable."""
        try:
            member = self._enum_type(code)
        except ValueError:
            # The enum type rejected this value.
            return default
        else:
            raw_doc = member.__doc__

        if raw_doc is not None:
            return clean_enum_member_docstring(raw_doc)
        return default
115+
116+
117+
def get_best_available_explanations(
    enum_type: Any, fallback: dict[int, str | tuple[str, ...]]
) -> DocstringBackedExplanations | dict[int, str | tuple[str, ...]]:
    """Select the explanation source that matches the installed bindings.

    ``cuda-bindings`` >= 13.2.0: a ``DocstringBackedExplanations`` view over
    *enum_type*, i.e. enum-member ``__doc__`` is used exclusively.
    ``cuda-bindings`` < 13.2.0: the frozen 13.1.1 *fallback* table, returned
    as-is.
    """
    docstrings_available = _binding_version() >= _MIN_BINDING_VERSION_FOR_ENUM_DOCSTRINGS
    return DocstringBackedExplanations(enum_type) if docstrings_available else fallback

cuda_core/cuda/core/_utils/runtime_cuda_error_explanations.py

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
33

4-
# To regenerate the dictionary below run:
5-
# ../../../../../toolshed/reformat_cuda_enums_as_py.py /usr/local/cuda/include/driver_types.h
6-
# Replace the dictionary below with the output.
7-
# Also update the CUDA Toolkit version number below.
4+
from cuda.bindings import runtime
5+
from cuda.core._utils.enum_explanations_helpers import get_best_available_explanations
86

9-
# CUDA Toolkit v13.2.0
10-
RUNTIME_CUDA_ERROR_EXPLANATIONS = {
7+
# CUDA Toolkit v13.1.1
8+
_FALLBACK_EXPLANATIONS = {
119
0: (
1210
"The API call returned with no errors. In the case of query calls, this"
1311
" also means that the operation being queried is complete (see"
@@ -52,11 +50,6 @@
5250
" requesting too many threads or blocks. See ::cudaDeviceProp for more"
5351
" device limitations."
5452
),
55-
10: (
56-
"This indicates that the driver is newer than the runtime version"
57-
" and returned graph node parameter information that the runtime"
58-
" does not understand and is unable to translate."
59-
),
6053
12: (
6154
"This indicates that one or more of the pitch-related parameters passed"
6255
" to the API call is not within the acceptable range for pitch."
@@ -523,15 +516,12 @@
523516
" changes which violated constraints specific to instantiated graph update."
524517
),
525518
911: (
526-
"This indicates that an error has occurred in a device outside of GPU. It can be a"
527-
" synchronous error w.r.t. CUDA API or an asynchronous error from the external device."
528-
" In case of asynchronous error, it means that if cuda was waiting for an external device's"
529-
" signal before consuming shared data, the external device signaled an error indicating that"
530-
" the data is not valid for consumption. This leaves the process in an inconsistent"
531-
" state and any further CUDA work will return the same error. To continue using CUDA,"
532-
" the process must be terminated and relaunched."
533-
" In case of synchronous error, it means that one or more external devices"
534-
" have encountered an error and cannot complete the operation."
519+
"This indicates that an async error has occurred in a device outside of CUDA."
520+
" If CUDA was waiting for an external device's signal before consuming shared data,"
521+
" the external device signaled an error indicating that the data is not valid for"
522+
" consumption. This leaves the process in an inconsistent state and any further CUDA"
523+
" work will return the same error. To continue using CUDA, the process must be"
524+
" terminated and relaunched."
535525
),
536526
912: ("This indicates that a kernel launch error has occurred due to cluster misconfiguration."),
537527
913: ("Indiciates a function handle is not loaded when calling an API that requires a loaded function."),
@@ -549,3 +539,5 @@
549539
" This error return is deprecated as of CUDA 4.1."
550540
),
551541
}
542+
543+
RUNTIME_CUDA_ERROR_EXPLANATIONS = get_best_available_explanations(runtime.cudaError_t, _FALLBACK_EXPLANATIONS)

cuda_core/tests/test_cuda_utils.py

Lines changed: 53 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -11,40 +11,11 @@
1111
from cuda.core._utils.clear_error_support import assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable
1212

1313

14-
def test_driver_cu_result_explanations_health():
15-
expl_dict = cuda_utils.DRIVER_CU_RESULT_EXPLANATIONS
16-
17-
# Ensure all CUresult enums are in expl_dict
18-
known_codes = set()
19-
for error in driver.CUresult:
20-
code = int(error)
21-
assert code in expl_dict
22-
known_codes.add(code)
23-
24-
from cuda.core._utils.version import binding_version
25-
26-
if binding_version() >= (13, 0, 0):
27-
# Ensure expl_dict has no codes not known as a CUresult enum
28-
extra_expl = sorted(set(expl_dict.keys()) - known_codes)
29-
assert not extra_expl
30-
31-
32-
def test_runtime_cuda_error_explanations_health():
33-
expl_dict = cuda_utils.RUNTIME_CUDA_ERROR_EXPLANATIONS
34-
35-
# Ensure all cudaError_t enums are in expl_dict
36-
known_codes = set()
37-
for error in runtime.cudaError_t:
38-
code = int(error)
39-
assert code in expl_dict
40-
known_codes.add(code)
41-
14+
def _skip_if_bindings_pre_enum_docstrings():
4215
from cuda.core._utils.version import binding_version
4316

44-
if binding_version() >= (13, 0, 0):
45-
# Ensure expl_dict has no codes not known as a cudaError_t enum
46-
extra_expl = sorted(set(expl_dict.keys()) - known_codes)
47-
assert not extra_expl
17+
if binding_version() < (13, 2, 0):
18+
pytest.skip("cuda-bindings < 13.2.0 may not expose enum __doc__ strings")
4819

4920

5021
def test_check_driver_error():
@@ -85,6 +56,56 @@ def test_check_runtime_error():
8556
assert num_unexpected < len(driver.CUresult) * 0.5
8657

8758

59+
def test_driver_error_enum_has_non_empty_docstring():
    """A representative ``CUresult`` member exposes a non-blank ``__doc__``."""
    _skip_if_bindings_pre_enum_docstrings()

    member_doc = driver.CUresult.CUDA_ERROR_INVALID_VALUE.__doc__
    assert member_doc is not None
    assert member_doc.strip() != ""
65+
66+
67+
def test_runtime_error_enum_has_non_empty_docstring():
    """A representative ``cudaError_t`` member exposes a non-blank ``__doc__``."""
    _skip_if_bindings_pre_enum_docstrings()

    member_doc = runtime.cudaError_t.cudaErrorInvalidValue.__doc__
    assert member_doc is not None
    assert member_doc.strip() != ""
73+
74+
75+
def test_check_driver_error_attaches_explanation():
    """``_check_driver_error`` reports the long explanation, not the short driver string."""
    error = driver.CUresult.CUDA_ERROR_INVALID_VALUE

    status, name = driver.cuGetErrorName(error)
    assert status == driver.CUresult.CUDA_SUCCESS
    status, desc = driver.cuGetErrorString(error)
    assert status == driver.CUresult.CUDA_SUCCESS

    expl = cuda_utils.DRIVER_CU_RESULT_EXPLANATIONS.get(int(error))
    assert expl is not None
    # The explanation must differ from the short description, otherwise the
    # final two assertions could not tell the sources apart.
    desc_text = desc.decode()
    assert expl != desc_text

    with pytest.raises(cuda_utils.CUDAError) as excinfo:
        cuda_utils._check_driver_error(error)

    message = str(excinfo.value)
    name_text = name.decode()
    assert message == f"{name_text}: {expl}"
    assert message != f"{name_text}: {desc_text}"
90+
91+
92+
def test_check_runtime_error_attaches_explanation():
    """``_check_runtime_error`` reports the long explanation, not the short runtime string."""
    error = runtime.cudaError_t.cudaErrorInvalidValue

    status, name = runtime.cudaGetErrorName(error)
    assert status == runtime.cudaError_t.cudaSuccess
    status, desc = runtime.cudaGetErrorString(error)
    assert status == runtime.cudaError_t.cudaSuccess

    expl = cuda_utils.RUNTIME_CUDA_ERROR_EXPLANATIONS.get(int(error))
    assert expl is not None
    # The explanation must differ from the short description, otherwise the
    # final two assertions could not tell the sources apart.
    desc_text = desc.decode()
    assert expl != desc_text

    with pytest.raises(cuda_utils.CUDAError) as excinfo:
        cuda_utils._check_runtime_error(error)

    message = str(excinfo.value)
    name_text = name.decode()
    assert message == f"{name_text}: {expl}"
    assert message != f"{name_text}: {desc_text}"
107+
108+
88109
def test_precondition():
89110
def checker(*args, what=""):
90111
if args[0] < 0:

0 commit comments

Comments
 (0)