cuda_core: derive error enum explanations from bindings docstrings

rwgk · rwgk · commit 4cbe3f3b5d2b · 2026-04-04T22:01:09.000-07:00
Use cleaned driver/runtime enum __doc__ text from cuda-bindings 13.2.0+ as the primary source for CUDA error explanations in cuda_core, while freezing the 13.1.1 explanation tables as fallback for older bindings. Centralize the version-gated selection and docstring cleanup helpers, update the driver/runtime explanation modules to use them, add tests that verify representative enums expose __doc__ and that cuda_utils attaches the explanation text, and remove the obsolete enum-reformat toolshed helper script.

Made-with: Cursor
diff --git a/cuda_core/cuda/core/_utils/driver_cu_result_explanations.py b/cuda_core/cuda/core/_utils/driver_cu_result_explanations.py
@@ -1,13 +1,11 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# To regenerate the dictionary below run:
-#     ../../../../../toolshed/reformat_cuda_enums_as_py.py /usr/local/cuda/include/cuda.h
-# Replace the dictionary below with the output.
-# Also update the CUDA Toolkit version number below.
+from cuda.bindings import driver
+from cuda.core._utils.enum_explanations_helpers import get_best_available_explanations
 
-# CUDA Toolkit v13.2.0
-DRIVER_CU_RESULT_EXPLANATIONS = {
+# CUDA Toolkit v13.1.1
+_FALLBACK_EXPLANATIONS = {
     0: (
         "The API call returned with no errors. In the case of query calls, this"
         " also means that the operation being queried is complete (see"
@@ -334,15 +332,12 @@
         " changes which violated constraints specific to instantiated graph update."
     ),
     911: (
-        "This indicates that an error has occurred in a device outside of GPU. It can be a"
-        " synchronous error w.r.t. CUDA API or an asynchronous error from the external device."
-        " In case of asynchronous error, it means that if cuda was waiting for an external device's"
-        " signal before consuming shared data, the external device signaled an error indicating that"
-        " the data is not valid for consumption. This leaves the process in an inconsistent"
-        " state and any further CUDA work will return the same error. To continue using CUDA,"
-        " the process must be terminated and relaunched."
-        " In case of synchronous error, it means that one or more external devices"
-        " have encountered an error and cannot complete the operation."
+        "This indicates that an async error has occurred in a device outside of CUDA."
+        " If CUDA was waiting for an external device's signal before consuming shared data,"
+        " the external device signaled an error indicating that the data is not valid for"
+        " consumption. This leaves the process in an inconsistent state and any further CUDA"
+        " work will return the same error. To continue using CUDA, the process must be"
+        " terminated and relaunched."
     ),
     912: "Indicates a kernel launch error due to cluster misconfiguration.",
     913: ("Indiciates a function handle is not loaded when calling an API that requires a loaded function."),
@@ -356,3 +351,5 @@
     ),
     999: "This indicates that an unknown internal error has occurred.",
 }
+
+DRIVER_CU_RESULT_EXPLANATIONS = get_best_available_explanations(driver.CUresult, _FALLBACK_EXPLANATIONS)
diff --git a/cuda_core/cuda/core/_utils/enum_explanations_helpers.py b/cuda_core/cuda/core/_utils/enum_explanations_helpers.py
@@ -0,0 +1,127 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+"""Internal support for error-enum explanations.
+
+``cuda_core`` keeps frozen 13.1.1 fallback tables for older ``cuda-bindings``
+releases. Starting with ``cuda-bindings`` 13.2.0, driver/runtime error enums
+carry usable ``__doc__`` text. This module decides which source to use and
+normalizes generated docstrings so user-facing ``CUDAError`` messages stay
+close to the long-form explanation prose.
+
+The cleanup rules here were derived while validating docstring-vs-dict parity
+in PR #1805. Keep them narrow and remove them when codegen / fallback support is
+no longer needed.
+"""
+
+from __future__ import annotations
+
+import importlib.metadata
+import re
+from typing import Any
+
+_MIN_BINDING_VERSION_FOR_ENUM_DOCSTRINGS = (13, 2, 0)
+
+
+# ``version.pyx`` cannot be reused here (circular import via ``cuda_utils``).
+def _binding_version() -> tuple[int, int, int]:
+    """Return the installed ``cuda-bindings`` version as ``(major, minor, patch)``.
+
+    Returns ``(0, 0, 0)`` when no ``cuda-bindings`` distribution metadata is
+    found, so that callers comparing against a minimum version take the
+    conservative (old-bindings) path.
+    """
+    try:
+        raw_version = importlib.metadata.version("cuda-bindings")
+    except importlib.metadata.PackageNotFoundError:
+        return (0, 0, 0)  # For very old versions of cuda-python
+    return _parse_version_triple(raw_version)
+
+
+def _parse_version_triple(version: str) -> tuple[int, int, int]:
+    """Parse the leading numeric release components of a version string.
+
+    Always returns exactly three integers: missing components are padded with
+    zeros (``"13.2"`` -> ``(13, 2, 0)``), so tuple comparisons against a
+    three-component minimum behave as expected. Non-numeric suffixes such as
+    ``rc1``, ``.post1``, ``.dev3`` or ``+local`` are ignored instead of raising
+    ``ValueError``, which would otherwise break the import of the explanation
+    modules. A pre-release is therefore treated like its final release.
+    """
+    # Drop a PEP 440 epoch ("1!13.2.0") so it is not mistaken for the major.
+    release = version.split("!", 1)[-1]
+    numbers: list[int] = []
+    for part in release.split(".")[:3]:
+        leading_digits = re.match(r"\d+", part)
+        if leading_digits is None:
+            break
+        numbers.append(int(leading_digits.group()))
+        if leading_digits.end() != len(part):
+            # A suffix (e.g. "0rc1") ends the numeric release segment.
+            break
+    major, minor, patch = (numbers + [0, 0, 0])[:3]
+    return (major, minor, patch)
+
+
+def _strip_doxygen_double_colon_prefixes(s: str) -> str:
+    """Drop Doxygen ``::`` markers that introduce a CUDA identifier.
+
+    The frozen fallback tables are taken from CUDA header comments, which
+    reference names as ``::name``; generated enum ``__doc__`` text uses Sphinx
+    roles instead, so parity checks need this normalization. A ``::`` that
+    directly follows an identifier character (C++ scope such as ``Foo::Bar``)
+    is left untouched.
+    """
+    doxygen_ref = re.compile(r"(?<![A-Za-z0-9_])::+([A-Za-z_][A-Za-z0-9_]*)")
+    text, replaced = doxygen_ref.subn(r"\1", s)
+    # Keep re-scanning until a full pass performs no substitution.
+    while replaced:
+        text, replaced = doxygen_ref.subn(r"\1", text)
+    return text
+
+
+def _fix_hyphenation_wordwrap_spacing(s: str) -> str:
+    """Rejoin hyphenated words that line wrapping split with a stray space.
+
+    Generated ``__doc__`` text can contain forms such as ``non- linear`` (or
+    ``non -linear``) that differ from the single-line fallback prose. Only a
+    hyphen with a lowercase ASCII letter on both sides and a single space on
+    exactly one side is rewritten.
+    """
+    text = s
+    while True:
+        rejoined = re.sub(r"([a-z])- ([a-z])", r"\1-\2", text)
+        rejoined = re.sub(r"([a-z]) -([a-z])", r"\1-\2", rejoined)
+        if rejoined == text:
+            # Fixed point reached: nothing left to rejoin.
+            return text
+        # Chained cases such as "a- b- c" need another pass.
+        text = rejoined
+
+
+def clean_enum_member_docstring(doc: str | None) -> str | None:
+    """Convert an enum member ``__doc__`` into plain, single-line text.
+
+    Generated enum docstrings are close to the fallback explanation prose but
+    may contain Sphinx inline roles, ``*``/``**`` emphasis, wrapped lines, or
+    one known codegen defect. Only those differences are normalized so the
+    result is suitable for user-facing error messages. ``None`` is passed
+    through unchanged.
+    """
+    if doc is None:
+        return None
+
+    def _role_target(match: re.Match) -> str:
+        # Keep only the referenced name, minus a leading "~." or ".".
+        return re.sub(r"^~?\.", "", match.group(1))
+
+    # Known codegen bug on cudaErrorIncompatibleDriverContext. Remove once fixed
+    # in cuda-bindings code generation. The needle is deliberately not a raw
+    # string: r"\n..." would not match the real newline present in __doc__.
+    text = doc.replace("\n:py:obj:`~.Interactions`", ' "Interactions ')
+    # Replace Sphinx inline roles with the bare name they reference.
+    text = re.sub(
+        r":(?:py:)?(?:obj|func|meth|class|mod|data|const|exc):`([^`]+)`",
+        _role_target,
+        text,
+    )
+    # Unwrap emphasis markers: bold first, then italics.
+    for emphasis in (r"\*\*([^*]+)\*\*", r"\*([^*]+)\*"):
+        text = re.sub(emphasis, r"\1", text)
+    # Collapse line wrapping and other whitespace runs into single spaces.
+    text = re.sub(r"\s+", " ", text).strip()
+    return _fix_hyphenation_wordwrap_spacing(text)
+
+
+class DocstringBackedExplanations:
+    """Read-only, ``dict.get``-style view over enum-member ``__doc__`` text.
+
+    Used once the bindings-version gate reports that docstrings are available.
+    Docstrings are then the only source: a code with no enum member, or a
+    member without ``__doc__``, yields ``default`` instead of silently mixing
+    in frozen fallback prose.
+    """
+
+    __slots__ = ("_enum_type",)
+
+    def __init__(self, enum_type: Any) -> None:
+        # Enum class that is called with an integer code to find the member.
+        self._enum_type = enum_type
+
+    def get(self, code: int, default: str | None = None) -> str | None:
+        """Return the cleaned docstring for ``code``, or ``default`` if unavailable."""
+        try:
+            member = self._enum_type(code)
+        except ValueError:
+            # ``code`` does not correspond to any member of the enum.
+            return default
+
+        docstring = member.__doc__
+        return default if docstring is None else clean_enum_member_docstring(docstring)
+
+
+def get_best_available_explanations(
+    enum_type: Any, fallback: dict[int, str | tuple[str, ...]]
+) -> DocstringBackedExplanations | dict[int, str | tuple[str, ...]]:
+    """Select the single explanation source matching the installed bindings.
+
+    ``cuda-bindings`` >= 13.2.0: enum-member ``__doc__`` text, exclusively,
+    wrapped in a ``DocstringBackedExplanations`` for ``enum_type``.
+    ``cuda-bindings`` < 13.2.0: the frozen 13.1.1 ``fallback`` table, returned
+    as-is.
+    """
+    docstrings_available = _binding_version() >= _MIN_BINDING_VERSION_FOR_ENUM_DOCSTRINGS
+    if docstrings_available:
+        return DocstringBackedExplanations(enum_type)
+    return fallback
diff --git a/cuda_core/cuda/core/_utils/runtime_cuda_error_explanations.py b/cuda_core/cuda/core/_utils/runtime_cuda_error_explanations.py
@@ -1,13 +1,11 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# To regenerate the dictionary below run:
-#     ../../../../../toolshed/reformat_cuda_enums_as_py.py /usr/local/cuda/include/driver_types.h
-# Replace the dictionary below with the output.
-# Also update the CUDA Toolkit version number below.
+from cuda.bindings import runtime
+from cuda.core._utils.enum_explanations_helpers import get_best_available_explanations
 
-# CUDA Toolkit v13.2.0
-RUNTIME_CUDA_ERROR_EXPLANATIONS = {
+# CUDA Toolkit v13.1.1
+_FALLBACK_EXPLANATIONS = {
     0: (
         "The API call returned with no errors. In the case of query calls, this"
         " also means that the operation being queried is complete (see"
@@ -52,11 +50,6 @@
         " requesting too many threads or blocks. See ::cudaDeviceProp for more"
         " device limitations."
     ),
-    10: (
-        "This indicates that the driver is newer than the runtime version"
-        " and returned graph node parameter information that the runtime"
-        " does not understand and is unable to translate."
-    ),
     12: (
         "This indicates that one or more of the pitch-related parameters passed"
         " to the API call is not within the acceptable range for pitch."
@@ -523,15 +516,12 @@
         " changes which violated constraints specific to instantiated graph update."
     ),
     911: (
-        "This indicates that an error has occurred in a device outside of GPU. It can be a"
-        " synchronous error w.r.t. CUDA API or an asynchronous error from the external device."
-        " In case of asynchronous error, it means that if cuda was waiting for an external device's"
-        " signal before consuming shared data, the external device signaled an error indicating that"
-        " the data is not valid for consumption. This leaves the process in an inconsistent"
-        " state and any further CUDA work will return the same error. To continue using CUDA,"
-        " the process must be terminated and relaunched."
-        " In case of synchronous error, it means that one or more external devices"
-        " have encountered an error and cannot complete the operation."
+        "This indicates that an async error has occurred in a device outside of CUDA."
+        " If CUDA was waiting for an external device's signal before consuming shared data,"
+        " the external device signaled an error indicating that the data is not valid for"
+        " consumption. This leaves the process in an inconsistent state and any further CUDA"
+        " work will return the same error. To continue using CUDA, the process must be"
+        " terminated and relaunched."
     ),
     912: ("This indicates that a kernel launch error has occurred due to cluster misconfiguration."),
     913: ("Indiciates a function handle is not loaded when calling an API that requires a loaded function."),
@@ -549,3 +539,5 @@
         " This error return is deprecated as of CUDA 4.1."
     ),
 }
+
+RUNTIME_CUDA_ERROR_EXPLANATIONS = get_best_available_explanations(runtime.cudaError_t, _FALLBACK_EXPLANATIONS)
diff --git a/cuda_core/tests/test_cuda_utils.py b/cuda_core/tests/test_cuda_utils.py
@@ -11,40 +11,11 @@
 from cuda.core._utils.clear_error_support import assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable
 
 
-def test_driver_cu_result_explanations_health():
-    expl_dict = cuda_utils.DRIVER_CU_RESULT_EXPLANATIONS
-
-    # Ensure all CUresult enums are in expl_dict
-    known_codes = set()
-    for error in driver.CUresult:
-        code = int(error)
-        assert code in expl_dict
-        known_codes.add(code)
-
-    from cuda.core._utils.version import binding_version
-
-    if binding_version() >= (13, 0, 0):
-        # Ensure expl_dict has no codes not known as a CUresult enum
-        extra_expl = sorted(set(expl_dict.keys()) - known_codes)
-        assert not extra_expl
-
-
-def test_runtime_cuda_error_explanations_health():
-    expl_dict = cuda_utils.RUNTIME_CUDA_ERROR_EXPLANATIONS
-
-    # Ensure all cudaError_t enums are in expl_dict
-    known_codes = set()
-    for error in runtime.cudaError_t:
-        code = int(error)
-        assert code in expl_dict
-        known_codes.add(code)
-
+def _skip_if_bindings_pre_enum_docstrings():
     from cuda.core._utils.version import binding_version
 
-    if binding_version() >= (13, 0, 0):
-        # Ensure expl_dict has no codes not known as a cudaError_t enum
-        extra_expl = sorted(set(expl_dict.keys()) - known_codes)
-        assert not extra_expl
+    if binding_version() < (13, 2, 0):
+        pytest.skip("cuda-bindings < 13.2.0 may not expose enum __doc__ strings")
 
 
 def test_check_driver_error():
@@ -85,6 +56,56 @@ def test_check_runtime_error():
     assert num_unexpected < len(driver.CUresult) * 0.5
 
 
+def test_driver_error_enum_has_non_empty_docstring():
+    """A representative ``CUresult`` member exposes non-blank ``__doc__`` text."""
+    _skip_if_bindings_pre_enum_docstrings()
+
+    member = driver.CUresult.CUDA_ERROR_INVALID_VALUE
+    docstring = member.__doc__
+    assert docstring is not None
+    # Whitespace-only text counts as missing.
+    assert docstring.strip()
+
+
+def test_runtime_error_enum_has_non_empty_docstring():
+    """A representative ``cudaError_t`` member exposes non-blank ``__doc__`` text."""
+    _skip_if_bindings_pre_enum_docstrings()
+
+    member = runtime.cudaError_t.cudaErrorInvalidValue
+    docstring = member.__doc__
+    assert docstring is not None
+    # Whitespace-only text counts as missing.
+    assert docstring.strip()
+
+
+def test_check_driver_error_attaches_explanation():
+    # Checks that _check_driver_error raises CUDAError with the message
+    # "<error name>: <explanation>", where the explanation comes from
+    # DRIVER_CU_RESULT_EXPLANATIONS and not from cuGetErrorString.
+    error = driver.CUresult.CUDA_ERROR_INVALID_VALUE
+    name_err, name = driver.cuGetErrorName(error)
+    assert name_err == driver.CUresult.CUDA_SUCCESS
+    desc_err, desc = driver.cuGetErrorString(error)
+    assert desc_err == driver.CUresult.CUDA_SUCCESS
+    expl = cuda_utils.DRIVER_CU_RESULT_EXPLANATIONS.get(int(error))
+    assert expl is not None
+    # The explanation must differ from the driver's own description; otherwise
+    # the message assertions below could not tell the two sources apart.
+    assert expl != desc.decode()
+
+    with pytest.raises(cuda_utils.CUDAError) as e:
+        cuda_utils._check_driver_error(error)
+
+    # The raised message carries the explanation, not the driver description.
+    assert str(e.value) == f"{name.decode()}: {expl}"
+    assert str(e.value) != f"{name.decode()}: {desc.decode()}"
+
+
+def test_check_runtime_error_attaches_explanation():
+    # Checks that _check_runtime_error raises CUDAError with the message
+    # "<error name>: <explanation>", where the explanation comes from
+    # RUNTIME_CUDA_ERROR_EXPLANATIONS and not from cudaGetErrorString.
+    error = runtime.cudaError_t.cudaErrorInvalidValue
+    name_err, name = runtime.cudaGetErrorName(error)
+    assert name_err == runtime.cudaError_t.cudaSuccess
+    desc_err, desc = runtime.cudaGetErrorString(error)
+    assert desc_err == runtime.cudaError_t.cudaSuccess
+    expl = cuda_utils.RUNTIME_CUDA_ERROR_EXPLANATIONS.get(int(error))
+    assert expl is not None
+    # The explanation must differ from the runtime's own description; otherwise
+    # the message assertions below could not tell the two sources apart.
+    assert expl != desc.decode()
+
+    with pytest.raises(cuda_utils.CUDAError) as e:
+        cuda_utils._check_runtime_error(error)
+
+    # The raised message carries the explanation, not the runtime description.
+    assert str(e.value) == f"{name.decode()}: {expl}"
+    assert str(e.value) != f"{name.decode()}: {desc.decode()}"
+
+
 def test_precondition():
     def checker(*args, what=""):
         if args[0] < 0:
diff --git a/cuda_core/tests/test_utils_enum_explanations_helpers.py b/cuda_core/tests/test_utils_enum_explanations_helpers.py
diff --git a/toolshed/reformat_cuda_enums_as_py.py b/toolshed/reformat_cuda_enums_as_py.py