diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 3bbe0fafe03..df3edd908c4 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -6,8 +6,11 @@ #include "resource_handles.hpp" #include +#include #include +#include #include +#include #include #include #include @@ -70,6 +73,9 @@ decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel = nullptr; // Graph decltype(&cuGraphDestroy) p_cuGraphDestroy = nullptr; decltype(&cuGraphExecDestroy) p_cuGraphExecDestroy = nullptr; +decltype(&cuUserObjectCreate) p_cuUserObjectCreate = nullptr; +decltype(&cuUserObjectRelease) p_cuUserObjectRelease = nullptr; +decltype(&cuGraphRetainUserObject) p_cuGraphRetainUserObject = nullptr; // Linker decltype(&cuLinkDestroy) p_cuLinkDestroy = nullptr; @@ -1114,12 +1120,92 @@ LibraryHandle get_kernel_library(const KernelHandle& h) noexcept { // ============================================================================ namespace { + +// Slot table layout (internal). Each graph maps CUgraphNode -> a fixed-size +// array of type-erased owners. The width is the most any single node needs: a +// kernel node holds its kernel and its packed arguments; a host node holds its +// callback and the userData. The table is heap-allocated and retained on the +// graph as a user object, so the driver frees it -- and every owner in it -- +// when the graph is destroyed. +constexpr std::size_t SLOTS_PER_NODE = 2; +using NodeSlots = std::array; +using GraphSlotTable = std::map; + +// shared_ptr deleters for the payloads that need one. Typed handles convert to +// OpaqueHandle by assignment and reuse their own control block, so they need no +// deleter here. The Python deleter follows the owner-release pattern used by +// the stream/deviceptr handles above. 
+void py_deleter(const void* p) noexcept { /* shared_ptr deleter paired with make_opaque_py: gives back the reference that was taken on the Python object */ + GILAcquireGuard gil; + if (gil.acquired()) { /* when the guard reports the GIL was not acquired the DECREF is skipped, so that reference is never released; NOTE(review): presumably this covers interpreter shutdown -- confirm */ + Py_DECREF(const_cast(static_cast(p))); + } +} + +void free_deleter(const void* p) noexcept { /* shared_ptr deleter paired with make_opaque_malloc */ + std::free(const_cast(p)); /* const is cast away only because std::free takes a non-const pointer */ +} + +void destroy_graph_slot_table(void* table) noexcept { /* destroy callback registered through cuUserObjectCreate in ensure_slot_table */ + delete static_cast(table); /* deleting the table also releases every OpaqueHandle stored in it; NOTE(review): those deleters then run wherever the driver invokes this callback -- confirm none of them needs to call back into the CUDA driver */ +} + struct GraphBox { CUgraph resource; - GraphHandle h_parent; // Keeps parent alive for child/branch graphs + GraphHandle h_parent; // Keeps parent alive for child/branch graphs + mutable GraphSlotTable* slot_table = nullptr; // Lazily created; owned by the graph's user object }; + +const GraphBox* get_box(const GraphHandle& h) { /* maps a GraphHandle back to the GraphBox whose resource member it points at; h must be non-null because nothing is checked here */ + const CUgraph* p = h.get(); + return reinterpret_cast( + reinterpret_cast(p) - offsetof(GraphBox, resource) /* steps back from the member address to the start of the enclosing box; NOTE(review): only valid for handles that really point into a GraphBox, and offsetof is only guaranteed for standard-layout types -- confirm both */ + ); +} + +// Return box's slot table, creating it on first use. The table is retained on +// the graph as a user object (MOVE transfers our only reference into the +// graph), so it -- and every owner in it -- is freed when the graph is +// destroyed. Returns nullptr if the driver lacks user-object support or a +// driver call fails; the cached pointer is non-owning. 
+GraphSlotTable* ensure_slot_table(const GraphBox* box) { + if (box->slot_table) { /* fast path: a table was already created and cached for this graph */ + return box->slot_table; + } + if (!p_cuUserObjectCreate || !p_cuGraphRetainUserObject || !p_cuUserObjectRelease) { /* any of the three driver entry points missing means user objects are unavailable */ + return nullptr; + } + auto* table = new GraphSlotTable(); /* raw owner until the user object takes over below; NOTE(review): the cache check above and the cache store at the end are unsynchronized -- confirm callers serialize access per graph */ + CUuserObject user_obj = nullptr; + { /* scope of the guard declared next; presumably the GIL is released only around the driver calls -- confirm against GILReleaseGuard */ + GILReleaseGuard gil; + if (p_cuUserObjectCreate(&user_obj, table, + reinterpret_cast(destroy_graph_slot_table), + 1, CU_USER_OBJECT_NO_DESTRUCTOR_SYNC) != CUDA_SUCCESS) { + delete table; // no user object created; nothing else owns the table + return nullptr; + } + if (p_cuGraphRetainUserObject(box->resource, user_obj, 1, + CU_GRAPH_USER_OBJECT_MOVE) != CUDA_SUCCESS) { + p_cuUserObjectRelease(user_obj, 1); // drops refcount to 0 -> frees table + return nullptr; + } + } + box->slot_table = table; // non-owning cache; the user object owns it + return table; +} + } // namespace +OpaqueHandle make_opaque_py(PyObject* obj) { /* wraps a Python object as an OpaqueHandle that owns one reference to it */ + Py_INCREF(obj); /* this reference is given back by py_deleter when the last owner goes away */ + return OpaqueHandle(static_cast(obj), py_deleter); +} + +OpaqueHandle make_opaque_malloc(void* buf) { /* takes ownership of buf; free_deleter passes it to std::free on release */ + return OpaqueHandle(static_cast(buf), free_deleter); +} + GraphHandle create_graph_handle(CUgraph graph) { auto box = std::shared_ptr( new GraphBox{graph, {}}, @@ -1137,6 +1223,19 @@ GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent) return GraphHandle(box, &box->resource); } +CUresult graph_set_slot(const GraphHandle& h_graph, CUgraphNode node, + unsigned int slot, OpaqueHandle owner) { /* stores owner in the given slot of node's entry in the graph's slot table */ + if (!h_graph || slot >= SLOTS_PER_NODE) { /* null handle, or slot index outside the fixed per-node array */ + return CUDA_ERROR_INVALID_VALUE; + } + GraphSlotTable* table = ensure_slot_table(get_box(h_graph)); + if (!table) { /* nullptr covers both missing driver support and a failed driver call, so the specific driver error is not reported to the caller */ + return CUDA_ERROR_NOT_SUPPORTED; + } + (*table)[node][slot] = std::move(owner); /* operator[] creates the node entry on first use; the assignment drops whatever owner the slot held before */ + return CUDA_SUCCESS; +} + // ============================================================================ // Graph Exec Handles // ============================================================================ diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp 
b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 520e7f47634..686d590b6e0 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -109,6 +109,9 @@ extern decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel; // Graph extern decltype(&cuGraphDestroy) p_cuGraphDestroy; extern decltype(&cuGraphExecDestroy) p_cuGraphExecDestroy; +extern decltype(&cuUserObjectCreate) p_cuUserObjectCreate; +extern decltype(&cuUserObjectRelease) p_cuUserObjectRelease; +extern decltype(&cuGraphRetainUserObject) p_cuGraphRetainUserObject; // Linker extern decltype(&cuLinkDestroy) p_cuLinkDestroy; @@ -466,6 +469,37 @@ GraphHandle create_graph_handle(CUgraph graph); // but h_parent will be prevented from destruction while this handle exists. GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent); +// ============================================================================ +// Graph slot attachments +// +// A graph carries a side table that keeps resources used by its nodes (kernel +// arguments, host callbacks, events, ...) alive for as long as the graph can +// execute. The table is created on first use and retained on the CUgraph as a +// user object, so the driver releases it -- and everything attached through it +// -- when the graph is destroyed. The table layout is an internal detail; +// callers use the abstract API below. +// ============================================================================ + +// Type-erased shared owner of an attached resource. Typed handles such as +// EventHandle and KernelHandle convert to OpaqueHandle by assignment, reusing +// their existing control block; the helpers below build OpaqueHandles for the +// two cases that need a custom deleter. +using OpaqueHandle = std::shared_ptr; + +// Build an OpaqueHandle from a Python object: increments its refcount now and +// decrements it (under the GIL) on release. The caller must hold the GIL. 
+OpaqueHandle make_opaque_py(PyObject* obj); + +// Build an OpaqueHandle from a malloc'd buffer: std::free on release. +OpaqueHandle make_opaque_malloc(void* buf); + +// Attach owner to one of node's fixed slots on h_graph, replacing whatever was +// there. The graph's slot table is created on first use. Returns CUDA_SUCCESS, +// or an error if slot is out of range or the graph cannot hold a table (e.g. +// the driver lacks user-object support). +CUresult graph_set_slot(const GraphHandle& h_graph, CUgraphNode node, + unsigned int slot, OpaqueHandle owner); + // ============================================================================ // Graph exec handle functions // ============================================================================ diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 54b22ac6028..0ca4c98440d 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -44,6 +44,13 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const cydriver.CUlinkState] CuLinkHandle ctypedef shared_ptr[const int] FileDescriptorHandle + + # Type-erased shared owner for resources attached to graph node slots. + # Typed handles above assign directly to an OpaqueHandle (shared control + # block); make_opaque_py / make_opaque_malloc cover the two cases needing a + # custom deleter. 
+ ctypedef shared_ptr[const void] OpaqueHandle + ctypedef shared_ptr[const cydriver.CUarray] OpaqueArrayHandle ctypedef shared_ptr[const cydriver.CUmipmappedArray] MipmappedArrayHandle @@ -223,6 +230,13 @@ cdef LibraryHandle get_kernel_library(const KernelHandle& h) noexcept nogil cdef GraphHandle create_graph_handle(cydriver.CUgraph graph) except+ nogil cdef GraphHandle create_graph_handle_ref(cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil +# Graph slot attachments +cdef OpaqueHandle make_opaque_py(object obj) except+ +cdef OpaqueHandle make_opaque_malloc(void* buf) except+ +cdef cydriver.CUresult graph_set_slot( + const GraphHandle& h_graph, cydriver.CUgraphNode node, + unsigned int slot, OpaqueHandle owner) except+ + # Graph exec handles cdef GraphExecHandle create_graph_exec_handle(cydriver.CUgraphExec graph_exec) except+ nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyi b/cuda_core/cuda/core/_resource_handles.pyi index d4511ae0634..92e686813e8 100644 --- a/cuda_core/cuda/core/_resource_handles.pyi +++ b/cuda_core/cuda/core/_resource_handles.pyi @@ -21,6 +21,7 @@ NvvmProgramHandle = shared_ptr NvJitLinkHandle = shared_ptr CuLinkHandle = shared_ptr FileDescriptorHandle = shared_ptr +OpaqueHandle = shared_ptr OpaqueArrayHandle = shared_ptr MipmappedArrayHandle = shared_ptr TexObjectHandle = shared_ptr diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 4bb7156109e..8c39e747977 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -151,6 +151,12 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": GraphHandle create_graph_handle_ref "cuda_core::create_graph_handle_ref" ( cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil + OpaqueHandle make_opaque_py "cuda_core::make_opaque_py" (object obj) except+ + OpaqueHandle make_opaque_malloc "cuda_core::make_opaque_malloc" (void* buf) except+ + 
cydriver.CUresult graph_set_slot "cuda_core::graph_set_slot" ( + const GraphHandle& h_graph, cydriver.CUgraphNode node, + unsigned int slot, OpaqueHandle owner) except+ + # Graph exec handles GraphExecHandle create_graph_exec_handle "cuda_core::create_graph_exec_handle" ( cydriver.CUgraphExec graph_exec) except+ nogil @@ -304,6 +310,9 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Graph void* p_cuGraphDestroy "reinterpret_cast(cuda_core::p_cuGraphDestroy)" void* p_cuGraphExecDestroy "reinterpret_cast(cuda_core::p_cuGraphExecDestroy)" + void* p_cuUserObjectCreate "reinterpret_cast(cuda_core::p_cuUserObjectCreate)" + void* p_cuUserObjectRelease "reinterpret_cast(cuda_core::p_cuUserObjectRelease)" + void* p_cuGraphRetainUserObject "reinterpret_cast(cuda_core::p_cuGraphRetainUserObject)" # Linker void* p_cuLinkDestroy "reinterpret_cast(cuda_core::p_cuLinkDestroy)" @@ -364,6 +373,7 @@ cdef void _init_driver_fn_pointers() noexcept: global p_cuMemPoolImportPointer global p_cuLibraryLoadFromFile, p_cuLibraryLoadData, p_cuLibraryUnload, p_cuLibraryGetKernel global p_cuGraphDestroy, p_cuGraphExecDestroy + global p_cuUserObjectCreate, p_cuUserObjectRelease, p_cuGraphRetainUserObject global p_cuLinkDestroy global p_cuGraphicsUnmapResources, p_cuGraphicsUnregisterResource global p_cuDevSmResourceSplit @@ -424,6 +434,9 @@ cdef void _init_driver_fn_pointers() noexcept: # Graph p_cuGraphDestroy = _get_driver_fn("cuGraphDestroy") p_cuGraphExecDestroy = _get_driver_fn("cuGraphExecDestroy") + p_cuUserObjectCreate = _get_driver_fn("cuUserObjectCreate") + p_cuUserObjectRelease = _get_driver_fn("cuUserObjectRelease") + p_cuGraphRetainUserObject = _get_driver_fn("cuGraphRetainUserObject") # Linker p_cuLinkDestroy = _get_driver_fn("cuLinkDestroy") diff --git a/cuda_core/cuda/core/_utils/_weak_handles.pyi b/cuda_core/cuda/core/_utils/_weak_handles.pyi new file mode 100644 index 00000000000..3cf095d7b87 --- /dev/null +++ 
b/cuda_core/cuda/core/_utils/_weak_handles.pyi @@ -0,0 +1,55 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_utils/_weak_handles.pyx + +"""Test-only weak handles for resource-handle lifetime checks. + +This module is **not** part of the public ``cuda.core`` API. It is built into +the package (like other private ``_utils`` modules) purely so the test suite can +observe, deterministically, when the strong references that keep a CUDA resource +alive have all been released -- without relying on driver- or hardware-specific +side effects (for example, whether freed device memory happens to remain +readable). + +Every resource handle is owned by a C++ ``std::shared_ptr``. A **weak handle** +is a non-owning ``std::weak_ptr`` observer of that control block: truthy while +some strong owner remains, falsy once the last one is gone. Use :func:`weak_handle` +to obtain a weak handle from a supported front-end object. + +To support another type, add a ``cdef _weak_from_`` that reads its ``cdef`` +handle field (see ``*.pxd``), assigns to :ctype:`OpaqueHandle`, and extend the +``isinstance`` chain in :func:`weak_handle`. Types whose slots hold arbitrary +Python owners via ``make_opaque_py`` are not covered here -- use +:class:`weakref.ref` on a weak-referenceable owner object in tests instead. +""" +from __future__ import annotations + + +class WeakHandle: + """Non-owning weak handle for a resource's shared control block. + + Truthy while some strong owner of the underlying resource handle remains, + falsy once the last strong reference is released. Obtain instances via + :func:`weak_handle` rather than constructing directly. + """ + + def __bool__(self): + ... + + def expired(self): + """Return ``True`` once every strong owner of the handle is gone.""" + + def use_count(self): + """Number of strong owners currently sharing the handle.""" + +def weak_handle(obj): + """Return a :class:`WeakHandle` observing the resource behind ``obj``. 
+ + Currently supports :class:`~cuda.core.Buffer` (device allocation handle). + See the module docstring for how to add more types. + + Raises + ------ + ValueError + If ``obj`` is a :class:`~cuda.core.Buffer` with no active allocation. + TypeError + If ``obj`` is not a supported type. + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_utils/_weak_handles.pyx b/cuda_core/cuda/core/_utils/_weak_handles.pyx new file mode 100644 index 00000000000..65737b958a6 --- /dev/null +++ b/cuda_core/cuda/core/_utils/_weak_handles.pyx @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Test-only weak handles for resource-handle lifetime checks. + +This module is **not** part of the public ``cuda.core`` API. It is built into +the package (like other private ``_utils`` modules) purely so the test suite can +observe, deterministically, when the strong references that keep a CUDA resource +alive have all been released -- without relying on driver- or hardware-specific +side effects (for example, whether freed device memory happens to remain +readable). + +Every resource handle is owned by a C++ ``std::shared_ptr``. A **weak handle** +is a non-owning ``std::weak_ptr`` observer of that control block: truthy while +some strong owner remains, falsy once the last one is gone. Use :func:`weak_handle` +to obtain a weak handle from a supported front-end object. + +To support another type, add a ``cdef _weak_from_`` that reads its ``cdef`` +handle field (see ``*.pxd``), assigns to :ctype:`OpaqueHandle`, and extend the +``isinstance`` chain in :func:`weak_handle`. Types whose slots hold arbitrary +Python owners via ``make_opaque_py`` are not covered here -- use +:class:`weakref.ref` on a weak-referenceable owner object in tests instead. 
+""" + +from cuda.core._memory._buffer cimport Buffer +from cuda.core._resource_handles cimport OpaqueHandle + + +# Cython cannot spell ``weak_ptr[const void]`` inline (the ``const void`` +# template argument fails to parse), so the weak type and its one constructor +# are provided by a small inline C++ shim local to this test-only module. This +# keeps the production resource_handles translation units untouched. +cdef extern from *: + """ + #include + namespace cuda_core_test { + using OpaqueWeakHandle = std::weak_ptr; + static inline OpaqueWeakHandle make_weak(const std::shared_ptr& h) { + return OpaqueWeakHandle(h); + } + } // namespace cuda_core_test + """ + cppclass OpaqueWeakHandle "cuda_core_test::OpaqueWeakHandle": + OpaqueWeakHandle() + bint expired() + long use_count() + OpaqueWeakHandle make_weak "cuda_core_test::make_weak" (const OpaqueHandle& h) + + +cdef class WeakHandle: + """Non-owning weak handle for a resource's shared control block. + + Truthy while some strong owner of the underlying resource handle remains, + falsy once the last strong reference is released. Obtain instances via + :func:`weak_handle` rather than constructing directly. + """ + + cdef OpaqueWeakHandle _w + + def __bool__(self): + return not self._w.expired() + + def expired(self): + """Return ``True`` once every strong owner of the handle is gone.""" + return self._w.expired() + + def use_count(self): + """Number of strong owners currently sharing the handle.""" + return self._w.use_count() + + +cdef WeakHandle _weak_from_opaque(OpaqueHandle h): + # Build the weak handle from a (temporary) strong handle. The strong copy + # lives only for the duration of this call, so it does not perturb the + # reference count the weak handle later reports. 
+ cdef WeakHandle wh = WeakHandle.__new__(WeakHandle) + wh._w = make_weak(h) + return wh + + +cdef WeakHandle _weak_from_buffer(Buffer buf): + cdef OpaqueHandle h = buf._h_ptr + if not h: + raise ValueError("Buffer has no active allocation") + return _weak_from_opaque(h) + + +def weak_handle(obj): + """Return a :class:`WeakHandle` observing the resource behind ``obj``. + + Currently supports :class:`~cuda.core.Buffer` (device allocation handle). + See the module docstring for how to add more types. + + Raises + ------ + ValueError + If ``obj`` is a :class:`~cuda.core.Buffer` with no active allocation. + TypeError + If ``obj`` is not a supported type. + """ + if isinstance(obj, Buffer): + return _weak_from_buffer(obj) + raise TypeError( + f"weak_handle() does not support {type(obj).__name__!r}; " + "supported types: Buffer" + ) diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyi b/cuda_core/cuda/core/graph/_graph_builder.pyi index af1748ad86c..fa89d835c2a 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyi +++ b/cuda_core/cuda/core/graph/_graph_builder.pyi @@ -106,6 +106,18 @@ class GraphBuilder: to ambiguity. New graph builders should instead be created through a :obj:`~_device.Device`, or a :obj:`~_stream.stream` object. + .. note:: + + Operations recorded during capture reference your memory but do not + take ownership of it. As with ordinary stream work, you must keep the + operands alive for as long as the completed graph may execute -- for + example, the :obj:`~_memory.Buffer` objects passed to :func:`~launch` + or :meth:`~_memory.Buffer.copy_to`. Host callbacks added with + :meth:`callback` are the exception: the callable (and any copied + ``user_data``) are retained for the graph's lifetime. This differs from + building a graph explicitly with :class:`~graph.GraphDefinition`, which + retains the operands it is given. 
+ """ def __init__(self): @@ -129,7 +141,49 @@ class GraphBuilder: def is_join_required(self) -> bool: """Returns True if this graph builder must be joined before building is ended.""" - def begin_building(self, mode: str | None='relaxed') -> GraphBuilder: + @property + def graph_definition(self) -> GraphDefinition: + """The captured graph as an explicit :class:`~graph.GraphDefinition`. + + The returned :class:`~graph.GraphDefinition` is a view of the same + graph this builder is producing: nodes added through it appear in + subsequent :meth:`complete` and :meth:`debug_dot_print` calls, and + the view stays valid even after the builder is closed. + + This lets you mix the capture and explicit APIs on a single graph, + for example to inspect what was captured, augment it with extra + nodes, or build a conditional body entirely with the explicit API. + + Availability: + + - **Primary builders** (created by :meth:`Device.create_graph_builder` + or :meth:`Stream.create_graph_builder`): only after + :meth:`end_building`. + + - **Conditional-body builders** (returned by :meth:`if_then`, + :meth:`if_else`, :meth:`while_loop`, :meth:`switch`): both before + :meth:`begin_building` and after :meth:`end_building`. The body + graph already exists when the conditional is created, so you may + populate it through this view without ever calling + :meth:`begin_building` on the body builder. + + - **Forked builders** (returned by :meth:`split`): never. Forked + builders share the primary builder's graph; access it through the + primary instead. + + Returns + ------- + GraphDefinition + A view of the graph being built. + + Raises + ------ + RuntimeError + If the builder is forked, currently building, or (for primary + builders) has not started building yet. + """ + + def begin_building(self, mode='relaxed') -> GraphBuilder: """Begins the building process. 
Build `mode` for controlling interaction with other API calls must be one of the following: @@ -168,7 +222,7 @@ class GraphBuilder: """ - def debug_dot_print(self, path: str, options: GraphDebugPrintOptions | None=None) -> None: + def debug_dot_print(self, path, options: GraphDebugPrintOptions | None=None): """Generates a DOT debug file for the graph builder. Parameters @@ -200,7 +254,7 @@ class GraphBuilder: """ @staticmethod - def join(*graph_builders: GraphBuilder) -> GraphBuilder: + def join(*graph_builders) -> GraphBuilder: """Joins multiple graph builders into a single graph builder. The returned builder inherits work dependencies from the provided builders. @@ -223,7 +277,7 @@ class GraphBuilder: def _get_conditional_context(self) -> driver.CUcontext: ... - def create_condition(self, default_value: int | None=None) -> GraphCondition: + def create_condition(self, default_value=None) -> GraphCondition: """Create a condition variable for use with conditional nodes. The returned :class:`GraphCondition` object is passed to conditional-node @@ -339,7 +393,7 @@ class GraphBuilder: The child graph builder. Must have finished building. """ - def callback(self, fn, *, user_data=None) -> None: + def callback(self, fn, *, user_data=None): """Add a host callback to the graph during stream capture. The callback runs on the host CPU when the graph reaches this point @@ -382,7 +436,7 @@ class Graph: def __init__(self): ... - def close(self) -> None: + def close(self): """Destroy the graph.""" @property @@ -409,7 +463,7 @@ class Graph: """ - def upload(self, stream: Stream) -> None: + def upload(self, stream: Stream): """Uploads the graph in a stream. Parameters @@ -419,7 +473,7 @@ class Graph: """ - def launch(self, stream: Stream) -> None: + def launch(self, stream: Stream): """Launches the graph in a stream. 
Parameters diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index c7b2ba5f74d..969c1caa478 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -3,17 +3,15 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import TYPE_CHECKING from libc.stdint cimport intptr_t from cuda.bindings cimport cydriver -from cuda.core.graph._graph_definition cimport GraphCondition -from cuda.core.graph._utils cimport _attach_host_callback_to_graph +from cuda.core.graph._graph_definition cimport GraphCondition, GraphDefinition +from cuda.core.graph._host_callback cimport _attach_host_callback_owners, _resolve_host_callback from cuda.core._resource_handles cimport ( - GraphHandle, - as_cu, as_py, + OpaqueHandle, as_cu, as_py, create_graph_exec_handle, create_graph_handle, create_graph_handle_ref, ) from cuda.core._stream cimport Stream @@ -26,9 +24,6 @@ from cuda.core._utils.cuda_utils import ( handle_return, ) -if TYPE_CHECKING: - from cuda.core.graph._graph_definition import GraphDefinition - __all__ = ['Graph', 'GraphBuilder', 'GraphCompleteOptions', 'GraphDebugPrintOptions'] @@ -171,9 +166,8 @@ def _instantiate_graph(h_graph, options: GraphCompleteOptions | None = None) -> params.flags = flags py_exec = handle_return(driver.cuGraphInstantiateWithParams(h_graph, params)) - # Check result_out before wrapping the exec: on a non-SUCCESS result the exec - # may be invalid, and Graph._init's RAII deleter would call cuGraphExecDestroy - # on it during the exception unwind below. + c_exec = int(py_exec) + graph = Graph._init(c_exec) if params.result_out == driver.CUgraphInstantiateResult.CUDA_GRAPH_INSTANTIATE_ERROR: raise RuntimeError( "Instantiation failed for an unexpected reason which is described in the return value of the function." 
@@ -193,9 +187,7 @@ def _instantiate_graph(h_graph, options: GraphCompleteOptions | None = None) -> raise RuntimeError("One or more conditional handles are not associated with conditional builders.") elif params.result_out != driver.CUgraphInstantiateResult.CUDA_GRAPH_INSTANTIATE_SUCCESS: raise RuntimeError(f"Graph instantiation failed with unexpected error code: {params.result_out}") - - c_exec = int(py_exec) - return Graph._init(c_exec) + return graph # Distinguishes the three kinds of GraphBuilder, which differ in how they @@ -228,8 +220,7 @@ cdef enum _BuilderKind: cdef enum _CaptureState: CAPTURE_NOT_STARTED = 0 CAPTURING = 1 - CAPTURE_ENDED = 2 # Finished, valid handle - CLOSED = 3 # No valid handle + CAPTURE_ENDED = 2 cdef class GraphBuilder: @@ -243,16 +234,32 @@ cdef class GraphBuilder: to ambiguity. New graph builders should instead be created through a :obj:`~_device.Device`, or a :obj:`~_stream.stream` object. + .. note:: + + Operations recorded during capture reference your memory but do not + take ownership of it. As with ordinary stream work, you must keep the + operands alive for as long as the completed graph may execute -- for + example, the :obj:`~_memory.Buffer` objects passed to :func:`~launch` + or :meth:`~_memory.Buffer.copy_to`. Host callbacks added with + :meth:`callback` are the exception: the callable (and any copied + ``user_data``) are retained for the graph's lifetime. This differs from + building a graph explicitly with :class:`~graph.GraphDefinition`, which + retains the operands it is given. + """ def __init__(self): raise NotImplementedError( - "directly creating a GraphBuilder object can be ambiguous. Please either " + "directly creating a Graph object can be ambiguous. 
Please either " "call Device.create_graph_builder() or stream.create_graph_builder()" ) def __dealloc__(self): - GB_end_capture_if_needed(self, False) + # Note: _stream could be set to None by cyclic-GC tp_clear before + # __dealloc__, but _h_stream is guaranteed to be valid. + if self._h_stream and self._state == CAPTURING and self._kind != FORKED: + with nogil: + cydriver.cuStreamEndCapture(as_cu(self._h_stream), NULL) @staticmethod def _init(Stream stream): @@ -266,10 +273,12 @@ cdef class GraphBuilder: def close(self): """Destroy the graph builder.""" - GB_end_capture_if_needed(self, True) + if self._h_stream and self._state == CAPTURING and self._kind != FORKED: + with nogil: + HANDLE_RETURN(cydriver.cuStreamEndCapture(as_cu(self._h_stream), NULL)) self._h_graph.reset() self._h_stream.reset() - self._state = CLOSED + self._state = CAPTURE_ENDED self._stream = None @property @@ -282,7 +291,65 @@ cdef class GraphBuilder: """Returns True if this graph builder must be joined before building is ended.""" return self._kind == FORKED - def begin_building(self, mode: str | None = "relaxed") -> GraphBuilder: + @property + def graph_definition(self) -> GraphDefinition: + """The captured graph as an explicit :class:`~graph.GraphDefinition`. + + The returned :class:`~graph.GraphDefinition` is a view of the same + graph this builder is producing: nodes added through it appear in + subsequent :meth:`complete` and :meth:`debug_dot_print` calls, and + the view stays valid even after the builder is closed. + + This lets you mix the capture and explicit APIs on a single graph, + for example to inspect what was captured, augment it with extra + nodes, or build a conditional body entirely with the explicit API. + + Availability: + + - **Primary builders** (created by :meth:`Device.create_graph_builder` + or :meth:`Stream.create_graph_builder`): only after + :meth:`end_building`. 
+ + - **Conditional-body builders** (returned by :meth:`if_then`, + :meth:`if_else`, :meth:`while_loop`, :meth:`switch`): both before + :meth:`begin_building` and after :meth:`end_building`. The body + graph already exists when the conditional is created, so you may + populate it through this view without ever calling + :meth:`begin_building` on the body builder. + + - **Forked builders** (returned by :meth:`split`): never. Forked + builders share the primary builder's graph; access it through the + primary instead. + + Returns + ------- + GraphDefinition + A view of the graph being built. + + Raises + ------ + RuntimeError + If the builder is forked, currently building, or (for primary + builders) has not started building yet. + """ + if self._kind == FORKED: + raise RuntimeError( + "graph_definition is unavailable on forked graph builders; " + "access it through the primary builder instead." + ) + if self._state == CAPTURING: + raise RuntimeError( + "graph_definition is unavailable while capture is in " + "progress; call end_building() first." + ) + if self._kind == PRIMARY and self._state == CAPTURE_NOT_STARTED: + raise RuntimeError( + "graph_definition is unavailable before begin_building() on " + "a primary builder; no graph has been created yet." + ) + return GraphDefinition._from_handle(self._h_graph) + + def begin_building(self, mode="relaxed") -> GraphBuilder: """Begins the building process. Build `mode` for controlling interaction with other API calls must be one of the following: @@ -298,7 +365,6 @@ cdef class GraphBuilder: Default set to use relaxed. 
""" - GB_check_open(self) if self._state != CAPTURE_NOT_STARTED: if self._state == CAPTURING: raise RuntimeError("Graph builder is already building.") @@ -322,25 +388,20 @@ cdef class GraphBuilder: with nogil: HANDLE_RETURN(cydriver.cuStreamBeginCaptureToGraph( c_stream, c_graph, NULL, NULL, 0, c_mode)) - self._state = CAPTURING else: with nogil: HANDLE_RETURN(cydriver.cuStreamBeginCapture(c_stream, c_mode)) - # Capture is active now; set CAPTURING before the calls below so a - # failure in _get_capture_info/create_graph_handle still lets - # cleanup end the capture rather than leaving the stream poisoned. - self._state = CAPTURING - with nogil: - # The driver rejects a NULL captureStatus_out, so pass a - # stack-local even though we only want the graph handle. + # The driver rejects NULL captureStatus_out, so we pass a + # stack-local even though begin_capture just succeeded and we + # only care about the resulting graph handle. _get_capture_info(c_stream, &c_status, &c_graph) self._h_graph = create_graph_handle(c_graph) + self._state = CAPTURING return self @property def is_building(self) -> bool: """Returns True if the graph builder is currently building.""" - GB_check_open(self) cdef cydriver.CUstream c_stream = as_cu(self._h_stream) cdef cydriver.CUstreamCaptureStatus status with nogil: @@ -358,13 +419,11 @@ cdef class GraphBuilder: def end_building(self) -> GraphBuilder: """Ends the building process.""" - GB_check_open(self) if not self.is_building: raise RuntimeError("Graph builder is not building.") cdef cydriver.CUstream c_stream = as_cu(self._h_stream) - cdef cydriver.CUgraph c_graph with nogil: - HANDLE_RETURN(cydriver.cuStreamEndCapture(c_stream, &c_graph)) + HANDLE_RETURN(cydriver.cuStreamEndCapture(c_stream, NULL)) # TODO: Resolving https://github.com/NVIDIA/cuda-python/issues/617 would allow us to # resume the build process after the first call to end_building() @@ -385,13 +444,12 @@ cdef class GraphBuilder: The newly built graph. 
""" - GB_check_open(self) if self._state != CAPTURE_ENDED: raise RuntimeError("Graph has not finished building.") return _instantiate_graph(as_py(self._h_graph), options) - def debug_dot_print(self, path: str, options: GraphDebugPrintOptions | None = None) -> None: + def debug_dot_print(self, path, options: GraphDebugPrintOptions | None = None): """Generates a DOT debug file for the graph builder. Parameters @@ -402,12 +460,11 @@ cdef class GraphBuilder: Customizable dataclass for the debug print options. """ - GB_check_open(self) if self._state != CAPTURE_ENDED: raise RuntimeError("Graph has not finished building.") cdef unsigned int c_flags = options._to_flags() if options else 0 cdef cydriver.CUgraph c_graph = as_cu(self._h_graph) - cdef bytes b_path = path.encode('utf-8') + cdef bytes b_path = path.encode() if isinstance(path, str) else path cdef const char* c_path = b_path with nogil: HANDLE_RETURN(cydriver.cuGraphDebugDotPrint(c_graph, c_path, c_flags)) @@ -432,21 +489,18 @@ cdef class GraphBuilder: """ if count < 2: raise ValueError(f"Invalid split count: expecting >= 2, got {count}") - GB_check_open(self) - if self._state != CAPTURING: - raise RuntimeError("Graph builder must be building before it can be split.") event = self._stream.record() result = [self] for i in range(count - 1): stream = self._stream.device.create_stream() stream.wait(event) - result.append(GB_init_forked(stream, self._h_graph)) + result.append(_init_forked(stream, self._h_graph)) event.close() return tuple(result) @staticmethod - def join(*graph_builders: GraphBuilder) -> GraphBuilder: + def join(*graph_builders) -> GraphBuilder: """Joins multiple graph builders into a single graph builder. The returned builder inherits work dependencies from the provided builders. 
@@ -486,13 +540,12 @@ cdef class GraphBuilder: def __cuda_stream__(self) -> tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" - GB_check_open(self) return self.stream.__cuda_stream__() def _get_conditional_context(self) -> driver.CUcontext: return self._stream.context.handle - def create_condition(self, default_value: int | None = None) -> GraphCondition: + def create_condition(self, default_value=None) -> GraphCondition: """Create a condition variable for use with conditional nodes. The returned :class:`GraphCondition` object is passed to conditional-node @@ -511,7 +564,6 @@ cdef class GraphBuilder: GraphCondition A condition variable for controlling conditional execution. """ - GB_check_open(self) if cy_driver_version() < (12, 3, 0): raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional handles") if cy_binding_version() < (12, 3, 0): @@ -551,7 +603,6 @@ cdef class GraphBuilder: The newly created conditional graph builder. """ - GB_check_open(self) if cy_driver_version() < (12, 3, 0): raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional if") if cy_binding_version() < (12, 3, 0): @@ -566,7 +617,7 @@ cdef class GraphBuilder: node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_IF node_params.conditional.size = 1 node_params.conditional.ctx = self._get_conditional_context() - return GB_cond_with_params(self, node_params)[0] + return _cond_with_params(self, node_params)[0] def if_else(self, condition: GraphCondition) -> tuple[GraphBuilder, GraphBuilder]: """Adds an if-else condition branch and returns new graph builders for both branches. @@ -588,7 +639,6 @@ cdef class GraphBuilder: A tuple of two new graph builders, one for the if branch and one for the else branch. 
""" - GB_check_open(self) if cy_driver_version() < (12, 8, 0): raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional if-else") if cy_binding_version() < (12, 8, 0): @@ -603,7 +653,7 @@ cdef class GraphBuilder: node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_IF node_params.conditional.size = 2 node_params.conditional.ctx = self._get_conditional_context() - return GB_cond_with_params(self, node_params) + return _cond_with_params(self, node_params) def switch(self, condition: GraphCondition, count: int) -> tuple[GraphBuilder, ...]: """Adds a switch condition branch and returns new graph builders for all cases. @@ -628,7 +678,6 @@ cdef class GraphBuilder: A tuple of new graph builders, one for each branch. """ - GB_check_open(self) if cy_driver_version() < (12, 8, 0): raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional switch") if cy_binding_version() < (12, 8, 0): @@ -643,7 +692,7 @@ cdef class GraphBuilder: node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_SWITCH node_params.conditional.size = count node_params.conditional.ctx = self._get_conditional_context() - return GB_cond_with_params(self, node_params) + return _cond_with_params(self, node_params) def while_loop(self, condition: GraphCondition) -> GraphBuilder: """Adds a while loop and returns a new graph builder for it. @@ -665,7 +714,6 @@ cdef class GraphBuilder: The newly created while loop graph builder. 
""" - GB_check_open(self) if cy_driver_version() < (12, 3, 0): raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional while loop") if cy_binding_version() < (12, 3, 0): @@ -680,7 +728,7 @@ cdef class GraphBuilder: node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_WHILE node_params.conditional.size = 1 node_params.conditional.ctx = self._get_conditional_context() - return GB_cond_with_params(self, node_params)[0] + return _cond_with_params(self, node_params)[0] def embed(self, GraphBuilder child): """Embed a previously-built :obj:`~graph.GraphBuilder` as a child node. @@ -690,7 +738,6 @@ cdef class GraphBuilder: child : :obj:`~graph.GraphBuilder` The child graph builder. Must have finished building. """ - GB_check_open(self) if child._state != CAPTURE_ENDED: raise ValueError("Child graph has not finished building.") @@ -723,7 +770,7 @@ cdef class GraphBuilder: ) ) - def callback(self, fn, *, user_data=None) -> None: + def callback(self, fn, *, user_data=None): """Add a host callback to the graph during stream capture. The callback runs on the host CPU when the graph reaches this point @@ -751,68 +798,37 @@ cdef class GraphBuilder: pointer (caller manages lifetime). If bytes-like, the data is copied and its lifetime is tied to the graph. 
""" - GB_check_open(self) cdef Stream stream = self._stream cdef cydriver.CUstream c_stream = as_cu(stream._h_stream) cdef cydriver.CUstreamCaptureStatus capture_status - cdef cydriver.CUgraph c_graph = NULL with nogil: - _get_capture_info(c_stream, &capture_status, &c_graph) + _get_capture_info(c_stream, &capture_status, NULL) if capture_status != cydriver.CU_STREAM_CAPTURE_STATUS_ACTIVE: raise RuntimeError("Cannot add callback when graph is not being built") cdef cydriver.CUhostFn c_fn cdef void* c_user_data = NULL - _attach_host_callback_to_graph(c_graph, fn, user_data, &c_fn, &c_user_data) + cdef OpaqueHandle fn_owner, data_owner + _resolve_host_callback(fn, user_data, &c_fn, &c_user_data, &fn_owner, &data_owner) with nogil: HANDLE_RETURN(cydriver.cuLaunchHostFunc(c_stream, c_fn, c_user_data)) + # Capturing the host function added a node to the graph; it is now the + # stream's sole capture dependency. Key the callback's owners to it so + # they live in the graph's slot table like any explicitly-added node. + cdef cydriver.CUgraphNode host_node = _capture_tail_node(c_stream) + _attach_host_callback_owners(self._h_graph, host_node, fn_owner, data_owner) -cdef inline int GB_check_open(GraphBuilder gb) except -1: - """Reject operations on a builder that has been closed. - - A CLOSED builder has reset its stream and graph handles, so any method - that dereferences them would read a null handle (or, for the cached - Stream, a None typed as cdef Stream). Guarding here yields a clear error - instead. - """ - if gb._state == CLOSED: - raise RuntimeError("Graph builder has been closed.") - return 0 - - -cdef inline int GB_end_capture_if_needed(GraphBuilder gb, bint check_status) except -1 nogil: - """End an in-progress capture if this builder owns it. - Only a CAPTURING PRIMARY or CONDITIONAL_BODY builder owns the live - capture. A FORKED builder must not call cuStreamEndCapture: the driver - requires forked streams to be joined first. 
- - check_status=True checks the driver return (close()); False ignores it - (__dealloc__). - """ - cdef cydriver.CUgraph c_graph - cdef cydriver.CUresult err - cdef cydriver.CUstream c_stream - if gb._h_stream and gb._state == CAPTURING and gb._kind != FORKED: - c_stream = as_cu(gb._h_stream) - with nogil: - err = cydriver.cuStreamEndCapture(c_stream, &c_graph) - if check_status: - HANDLE_RETURN(err) - return 0 - - -cdef inline GraphBuilder GB_init_forked(Stream stream, GraphHandle h_primary_graph): +cdef inline GraphBuilder _init_forked(Stream stream, GraphHandle h_graph): cdef GraphBuilder gb = GraphBuilder.__new__(GraphBuilder) - # A FORKED builder captures into the primary's CUgraph. It holds the - # primary's GraphHandle so conditional bodies created on it (via - # GB_init_conditional -> create_graph_handle_ref(cond_graph, parent._h_graph)) - # have a valid parent handle to pin. - gb._h_graph = h_primary_graph + # Forked builders capture into the primary's graph. They share its handle + # so node attachments (e.g. callbacks) reach the same slot table; the FORKED + # kind still bars end_building()/graph_definition and graph destruction. 
+ gb._h_graph = h_graph gb._h_stream = stream._h_stream gb._kind = FORKED gb._state = CAPTURING @@ -820,7 +836,7 @@ cdef inline GraphBuilder GB_init_forked(Stream stream, GraphHandle h_primary_gra return gb -cdef inline GraphBuilder GB_init_conditional(Stream stream, cydriver.CUgraph cond_graph, GraphBuilder parent): +cdef inline GraphBuilder _init_conditional(Stream stream, cydriver.CUgraph cond_graph, GraphBuilder parent): cdef GraphBuilder gb = GraphBuilder.__new__(GraphBuilder) gb._h_graph = create_graph_handle_ref(cond_graph, parent._h_graph) gb._h_stream = stream._h_stream @@ -849,7 +865,29 @@ cdef inline int _get_capture_info( stream, status, NULL, graph, NULL, NULL)) -cdef inline tuple GB_cond_with_params(GraphBuilder gb, node_params): +cdef inline cydriver.CUgraphNode _capture_tail_node(cydriver.CUstream stream) except *: + """Return the node a freshly-captured single-node operation left as the + stream's sole capture dependency (e.g. the host node added by + ``cuLaunchHostFunc``). The driver advances the stream's dependency set to + the new node, so the next captured op would depend on it. 
+ """ + cdef cydriver.CUstreamCaptureStatus status + cdef const cydriver.CUgraphNode* deps = NULL + cdef size_t num_deps = 0 + with nogil: + IF CUDA_CORE_BUILD_MAJOR >= 13: + HANDLE_RETURN(cydriver.cuStreamGetCaptureInfo( + stream, &status, NULL, NULL, &deps, NULL, &num_deps)) + ELSE: + HANDLE_RETURN(cydriver.cuStreamGetCaptureInfo( + stream, &status, NULL, NULL, &deps, &num_deps)) + if num_deps != 1: + raise RuntimeError( + f"expected exactly one capture dependency after a host callback, got {num_deps}") + return deps[0] + + +cdef inline tuple _cond_with_params(GraphBuilder gb, node_params): status, _, graph, *deps_info, num_dependencies = handle_return( driver.cuStreamGetCaptureInfo(gb._stream.handle) ) @@ -870,7 +908,7 @@ cdef inline tuple GB_cond_with_params(GraphBuilder gb, node_params): ) return tuple( - GB_init_conditional( + _init_conditional( gb._stream.device.create_stream(), int(node_params.conditional.phGraph_out[i]), gb, @@ -899,7 +937,7 @@ cdef class Graph: self._h_graph_exec = create_graph_exec_handle(graph_exec) return self - def close(self) -> None: + def close(self): """Destroy the graph.""" self._h_graph_exec.reset() @@ -933,8 +971,6 @@ cdef class Graph: cdef cydriver.CUgraphExec cu_exec = as_cu(self._h_graph_exec) if isinstance(source, GraphBuilder): - if (source)._state == CLOSED: - raise ValueError("Source graph builder has been closed.") if (source)._state != CAPTURE_ENDED: raise ValueError("Graph has not finished building.") cu_graph = as_cu((source)._h_graph) @@ -954,7 +990,7 @@ cdef class Graph: raise CUDAError(msg) HANDLE_RETURN(err) - def upload(self, stream: Stream) -> None: + def upload(self, stream: Stream): """Uploads the graph in a stream. Parameters @@ -968,7 +1004,7 @@ cdef class Graph: with nogil: HANDLE_RETURN(cydriver.cuGraphUpload(c_exec, c_stream)) - def launch(self, stream: Stream) -> None: + def launch(self, stream: Stream): """Launches the graph in a stream. 
Parameters diff --git a/cuda_core/cuda/core/graph/_graph_definition.pyi b/cuda_core/cuda/core/graph/_graph_definition.pyi index 15f34cec9ab..efbf17abbad 100644 --- a/cuda_core/cuda/core/graph/_graph_definition.pyi +++ b/cuda_core/cuda/core/graph/_graph_definition.pyi @@ -85,7 +85,7 @@ class GraphDefinition: See :meth:`GraphNode.deallocate` for full documentation. """ - def memset(self, dst: int, value, width: int, height: int=1, pitch: int=0) -> MemsetNode: + def memset(self, dst: int, value, width: int, height: int=1, pitch: int=0, *, dst_owner=None) -> MemsetNode: """Add an entry-point memset node (no dependencies). See :meth:`GraphNode.memset` for full documentation. @@ -120,7 +120,7 @@ class GraphDefinition: A new EmptyNode that depends on all input nodes. """ - def memcpy(self, dst: int, src: int, size: int) -> MemcpyNode: + def memcpy(self, dst: int, src: int, size: int, *, dst_owner=None, src_owner=None) -> MemcpyNode: """Add an entry-point memcpy node (no dependencies). See :meth:`GraphNode.memcpy` for full documentation. diff --git a/cuda_core/cuda/core/graph/_graph_definition.pyx b/cuda_core/cuda/core/graph/_graph_definition.pyx index 1ec56978327..9774c9899e4 100644 --- a/cuda_core/cuda/core/graph/_graph_definition.pyx +++ b/cuda_core/cuda/core/graph/_graph_definition.pyx @@ -160,13 +160,17 @@ cdef class GraphDefinition: value, size_t width, size_t height=1, - size_t pitch=0 + size_t pitch=0, + *, + dst_owner=None, ) -> MemsetNode: """Add an entry-point memset node (no dependencies). See :meth:`GraphNode.memset` for full documentation. """ - return self._entry.memset(dst, value, width, height, pitch) + return self._entry.memset( + dst, value, width, height=height, pitch=pitch, dst_owner=dst_owner + ) def launch(self, config: LaunchConfig, kernel: Kernel, *args) -> KernelNode: """Add an entry-point kernel launch node (no dependencies). 
@@ -200,12 +204,22 @@ cdef class GraphDefinition: """ return self._entry.join(*nodes) - def memcpy(self, dst: int, src: int, size_t size) -> MemcpyNode: + def memcpy( + self, + dst: int, + src: int, + size_t size, + *, + dst_owner=None, + src_owner=None, + ) -> MemcpyNode: """Add an entry-point memcpy node (no dependencies). See :meth:`GraphNode.memcpy` for full documentation. """ - return self._entry.memcpy(dst, src, size) + return self._entry.memcpy( + dst, src, size, dst_owner=dst_owner, src_owner=src_owner + ) def embed(self, child: GraphDefinition) -> ChildGraphNode: """Add an entry-point child graph node (no dependencies). diff --git a/cuda_core/cuda/core/graph/_graph_node.pyi b/cuda_core/cuda/core/graph/_graph_node.pyi index 3e701fe3897..a923f198c49 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyi +++ b/cuda_core/cuda/core/graph/_graph_node.pyi @@ -4,20 +4,20 @@ from __future__ import annotations import weakref -from collections.abc import Iterable from cuda.core._device import Device from cuda.core._event import Event from cuda.core._launch_config import LaunchConfig +from cuda.core._memory._buffer import Buffer from cuda.core._module import Kernel from cuda.core._utils.cuda_utils import driver -from cuda.core.graph._adjacency_set_proxy import AdjacencySetProxy from cuda.core.graph._graph_definition import GraphCondition, GraphDefinition from cuda.core.graph._subclasses import (AllocNode, ChildGraphNode, EmptyNode, EventRecordNode, EventWaitNode, - FreeNode, IfElseNode, IfNode, - KernelNode, MemcpyNode, MemsetNode, - SwitchNode, WhileNode) + FreeNode, HostCallbackNode, + IfElseNode, IfNode, KernelNode, + MemcpyNode, MemsetNode, SwitchNode, + WhileNode) from cuda.core.typing import GraphMemoryType @@ -32,14 +32,14 @@ class GraphNode: def __repr__(self) -> str: ... - def __eq__(self, other: object) -> bool: + def __eq__(self, other) -> bool: ... def __hash__(self) -> int: ... 
@property - def type(self) -> driver.CUgraphNodeType | None: + def type(self): """Return the CUDA graph node type. Returns @@ -49,7 +49,7 @@ class GraphNode: """ @property - def graph(self) -> GraphDefinition: + def graph(self) -> 'GraphDefinition': """Return the GraphDefinition this node belongs to.""" @property @@ -60,13 +60,13 @@ class GraphNode: """ @property - def is_valid(self) -> bool: + def is_valid(self): """Whether this node is valid (not destroyed). Returns ``False`` after :meth:`destroy` has been called. """ - def destroy(self) -> None: + def destroy(self): """Destroy this node and remove all its edges from the parent graph. After this call, :attr:`is_valid` returns ``False`` and the node @@ -75,19 +75,19 @@ class GraphNode: """ @property - def pred(self) -> AdjacencySetProxy: + def pred(self): """A mutable set-like view of this node's predecessors.""" @pred.setter - def pred(self, value: Iterable[GraphNode]) -> None: + def pred(self, value): ... @property - def succ(self) -> AdjacencySetProxy: + def succ(self): """A mutable set-like view of this node's successors.""" @succ.setter - def succ(self, value: Iterable[GraphNode]) -> None: + def succ(self, value): ... def launch(self, config: LaunchConfig, kernel: Kernel, *args) -> KernelNode: @@ -178,13 +178,16 @@ class GraphNode: A new FreeNode representing the free operation. """ - def memset(self, dst: int, value, width: int, height: int=1, pitch: int=0) -> MemsetNode: + def memset(self, dst: Buffer | int, value, width: int, *, height: int=1, pitch: int=0, dst_owner=None) -> MemsetNode: """Add a memset node depending on this node. Parameters ---------- - dst : int - Destination device pointer. + dst : Buffer or int + Destination. When ``dst`` is a :class:`Buffer`, the underlying + allocation is retained for the graph's lifetime. A raw pointer + (``int``) is used as-is; the caller must keep the underlying memory + alive, or supply ``dst_owner`` to have the graph retain it. 
value : int or buffer-protocol object Fill value. int for 1-byte fill (range [0, 256)), or buffer-protocol object of 1, 2, or 4 bytes. @@ -194,14 +197,23 @@ class GraphNode: Number of rows (default 1). pitch : int, optional Pitch of destination in bytes (default 0, unused if height is 1). + dst_owner : object, optional + Object retained for the graph's lifetime when ``dst`` is a raw + pointer. A :class:`Buffer` owner retains its underlying allocation, + not the wrapper. Must not be passed when ``dst`` is a :class:`Buffer`. Returns ------- MemsetNode A new MemsetNode representing the memset operation. + + Raises + ------ + ValueError + If ``dst_owner`` is given together with a :class:`Buffer` ``dst``. """ - def memcpy(self, dst: int, src: int, size: int) -> MemcpyNode: + def memcpy(self, dst: Buffer | int, src: Buffer | int, size: int, *, dst_owner=None, src_owner=None) -> MemcpyNode: """Add a memcpy node depending on this node. Copies ``size`` bytes from ``src`` to ``dst``. Memory types are @@ -210,17 +222,35 @@ class GraphNode: Parameters ---------- - dst : int - Destination pointer (device or pinned host). - src : int - Source pointer (device or pinned host). + dst : Buffer or int + Destination (device or pinned host). When a :class:`Buffer` is given, + the underlying allocation is retained for the graph's lifetime. A raw + pointer (``int``) is used as-is; the caller must keep the underlying + memory alive, or supply ``dst_owner`` to have the graph retain it. + src : Buffer or int + Source (device or pinned host). Same retention rules as ``dst``; + use ``src_owner`` for a raw pointer. size : int Number of bytes to copy. + dst_owner : object, optional + Object retained for the graph's lifetime when ``dst`` is a raw + pointer. A :class:`Buffer` owner retains its underlying allocation. + Must not be passed when ``dst`` is a :class:`Buffer`. + src_owner : object, optional + Object retained for the graph's lifetime when ``src`` is a raw + pointer. 
A :class:`Buffer` owner retains its underlying allocation. + Must not be passed when ``src`` is a :class:`Buffer`. Returns ------- MemcpyNode A new MemcpyNode representing the copy operation. + + Raises + ------ + ValueError + If ``dst_owner`` or ``src_owner`` is given together with a + :class:`Buffer` ``dst`` or ``src`` respectively. """ def embed(self, child: GraphDefinition) -> ChildGraphNode: @@ -269,7 +299,7 @@ class GraphNode: A new EventWaitNode representing the event wait operation. """ - def callback(self, fn, *, user_data=None) -> object: + def callback(self, fn, *, user_data=None) -> HostCallbackNode: """Add a host callback node depending on this node. The callback runs on the host CPU when the graph reaches this node. diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index 53145dd5e2a..8fe06cee638 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -6,11 +6,8 @@ from __future__ import annotations -from collections.abc import Iterable from typing import TYPE_CHECKING -from cpython.ref cimport Py_INCREF - from libc.stddef cimport size_t from libc.stdint cimport uintptr_t from libc.string cimport memset as c_memset @@ -22,6 +19,7 @@ from cuda.bindings cimport cydriver from cuda.core._event cimport Event from cuda.core._kernel_arg_handler cimport ParamHolder from cuda.core._launch_config cimport LaunchConfig +from cuda.core._memory._buffer cimport Buffer from cuda.core._module cimport Kernel from cuda.core.graph._graph_definition cimport GraphCondition, GraphDefinition from cuda.core.graph._subclasses cimport ( @@ -42,26 +40,27 @@ from cuda.core.graph._subclasses cimport ( WhileNode, ) from cuda.core._resource_handles cimport ( - EventHandle, GraphHandle, GraphNodeHandle, - KernelHandle, + OpaqueHandle, as_cu, as_intptr, as_py, create_graph_handle_ref, create_graph_node_handle, graph_node_get_graph, + graph_set_slot, invalidate_graph_node, - 
py_object_user_object_destroy, + make_opaque_py, ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value -from cuda.core.graph._utils cimport ( - _attach_host_callback_to_graph, - _attach_user_object, +from cuda.core.graph._host_callback cimport ( + _attach_host_callback_owners, + _resolve_host_callback, ) +import ctypes as ct import weakref from cuda.core.graph._adjacency_set_proxy import AdjacencySetProxy @@ -78,7 +77,8 @@ _node_registry: weakref.WeakValueDictionary[int, GraphNode] = weakref.WeakValueD cdef inline GraphNode _registered(GraphNode n): - return _node_registry.setdefault(n._h_node.get(), n) + _node_registry[n._h_node.get()] = n + return n cdef class GraphNode: @@ -100,7 +100,7 @@ cdef class GraphNode: return "" return f"node:x}>" - def __eq__(self, other: object) -> bool: + def __eq__(self, other) -> bool: if not isinstance(other, GraphNode): return NotImplemented cdef GraphNode o = other @@ -114,7 +114,7 @@ cdef class GraphNode: return hash((as_intptr(self._h_node), as_intptr(g))) @property - def type(self) -> driver.CUgraphNodeType | None: + def type(self): """Return the CUDA graph node type. Returns @@ -131,7 +131,7 @@ cdef class GraphNode: return driver.CUgraphNodeType(node_type) @property - def graph(self) -> GraphDefinition: + def graph(self) -> "GraphDefinition": """Return the GraphDefinition this node belongs to.""" return GraphDefinition._from_handle(graph_node_get_graph(self._h_node)) @@ -144,14 +144,14 @@ cdef class GraphNode: return as_py(self._h_node) @property - def is_valid(self) -> bool: + def is_valid(self): """Whether this node is valid (not destroyed). Returns ``False`` after :meth:`destroy` has been called. """ return as_intptr(self._h_node) != 0 - def destroy(self) -> None: + def destroy(self): """Destroy this node and remove all its edges from the parent graph. 
After this call, :attr:`is_valid` returns ``False`` and the node @@ -167,23 +167,23 @@ cdef class GraphNode: invalidate_graph_node(self._h_node) @property - def pred(self) -> AdjacencySetProxy: + def pred(self): """A mutable set-like view of this node's predecessors.""" return AdjacencySetProxy(self, False) @pred.setter - def pred(self, value: Iterable[GraphNode]) -> None: + def pred(self, value): p = AdjacencySetProxy(self, False) p.clear() p.update(value) @property - def succ(self) -> AdjacencySetProxy: + def succ(self): """A mutable set-like view of this node's successors.""" return AdjacencySetProxy(self, True) @succ.setter - def succ(self, value: Iterable[GraphNode]) -> None: + def succ(self, value): s = AdjacencySetProxy(self, True) s.clear() s.update(value) @@ -282,13 +282,25 @@ cdef class GraphNode: """ return GN_free(self, dptr) - def memset(self, dst: int, value, size_t width, size_t height=1, size_t pitch=0) -> MemsetNode: + def memset( + self, + dst: Buffer | int, + value, + size_t width, + *, + size_t height=1, + size_t pitch=0, + dst_owner=None, + ) -> MemsetNode: """Add a memset node depending on this node. Parameters ---------- - dst : int - Destination device pointer. + dst : Buffer or int + Destination. When ``dst`` is a :class:`Buffer`, the underlying + allocation is retained for the graph's lifetime. A raw pointer + (``int``) is used as-is; the caller must keep the underlying memory + alive, or supply ``dst_owner`` to have the graph retain it. value : int or buffer-protocol object Fill value. int for 1-byte fill (range [0, 256)), or buffer-protocol object of 1, 2, or 4 bytes. @@ -298,18 +310,38 @@ cdef class GraphNode: Number of rows (default 1). pitch : int, optional Pitch of destination in bytes (default 0, unused if height is 1). + dst_owner : object, optional + Object retained for the graph's lifetime when ``dst`` is a raw + pointer. A :class:`Buffer` owner retains its underlying allocation, + not the wrapper. 
Must not be passed when ``dst`` is a :class:`Buffer`. Returns ------- MemsetNode A new MemsetNode representing the memset operation. + + Raises + ------ + ValueError + If ``dst_owner`` is given together with a :class:`Buffer` ``dst``. """ + cdef cydriver.CUdeviceptr c_dst cdef unsigned int val cdef unsigned int elem_size + cdef OpaqueHandle dst_slot_owner + dst_slot_owner = _resolve_memcpy_operand(dst, dst_owner, "dst", &c_dst) val, elem_size = _parse_fill_value(value) - return GN_memset(self, dst, val, elem_size, width, height, pitch) - - def memcpy(self, dst: int, src: int, size_t size) -> MemcpyNode: + return GN_memset(self, c_dst, dst_slot_owner, val, elem_size, width, height, pitch) + + def memcpy( + self, + dst: Buffer | int, + src: Buffer | int, + size_t size, + *, + dst_owner=None, + src_owner=None, + ) -> MemcpyNode: """Add a memcpy node depending on this node. Copies ``size`` bytes from ``src`` to ``dst``. Memory types are @@ -318,19 +350,42 @@ cdef class GraphNode: Parameters ---------- - dst : int - Destination pointer (device or pinned host). - src : int - Source pointer (device or pinned host). + dst : Buffer or int + Destination (device or pinned host). When a :class:`Buffer` is given, + the underlying allocation is retained for the graph's lifetime. A raw + pointer (``int``) is used as-is; the caller must keep the underlying + memory alive, or supply ``dst_owner`` to have the graph retain it. + src : Buffer or int + Source (device or pinned host). Same retention rules as ``dst``; + use ``src_owner`` for a raw pointer. size : int Number of bytes to copy. + dst_owner : object, optional + Object retained for the graph's lifetime when ``dst`` is a raw + pointer. A :class:`Buffer` owner retains its underlying allocation. + Must not be passed when ``dst`` is a :class:`Buffer`. + src_owner : object, optional + Object retained for the graph's lifetime when ``src`` is a raw + pointer. A :class:`Buffer` owner retains its underlying allocation. 
+ Must not be passed when ``src`` is a :class:`Buffer`. Returns ------- MemcpyNode A new MemcpyNode representing the copy operation. + + Raises + ------ + ValueError + If ``dst_owner`` or ``src_owner`` is given together with a + :class:`Buffer` ``dst`` or ``src`` respectively. """ - return GN_memcpy(self, dst, src, size) + cdef cydriver.CUdeviceptr c_dst + cdef cydriver.CUdeviceptr c_src + cdef OpaqueHandle dst_slot_owner, src_slot_owner + dst_slot_owner = _resolve_memcpy_operand(dst, dst_owner, "dst", &c_dst) + src_slot_owner = _resolve_memcpy_operand(src, src_owner, "src", &c_src) + return GN_memcpy(self, c_dst, dst_slot_owner, c_src, src_slot_owner, size) def embed(self, child: GraphDefinition) -> ChildGraphNode: """Add a child graph node depending on this node. @@ -381,7 +436,7 @@ cdef class GraphNode: """ return GN_wait_event(self, event) - def callback(self, fn, *, user_data=None) -> object: + def callback(self, fn, *, user_data=None) -> HostCallbackNode: """Add a host callback node depending on this node. The callback runs on the host CPU when the graph reaches this node. 
@@ -500,16 +555,6 @@ cdef class GraphNode: cydriver.CU_GRAPH_COND_TYPE_SWITCH, count, SwitchNode) -cdef void _destroy_event_handle_copy(void* ptr) noexcept nogil: - cdef EventHandle* p = ptr - del p - - -cdef void _destroy_kernel_handle_copy(void* ptr) noexcept nogil: - cdef KernelHandle* p = ptr - del p - - cdef inline ConditionalNode _make_conditional_node( GraphNode pred, GraphCondition condition, @@ -626,6 +671,7 @@ cdef inline KernelNode GN_launch(GraphNode self, LaunchConfig conf, Kernel ker, cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 + cdef OpaqueHandle owner if pred_node != NULL: deps = &pred_node @@ -648,14 +694,16 @@ cdef inline KernelNode GN_launch(GraphNode self, LaunchConfig conf, Kernel ker, HANDLE_RETURN(cydriver.cuGraphAddKernelNode( &new_node, as_cu(h_graph), deps, num_deps, &node_params)) - _attach_user_object(as_cu(h_graph), new KernelHandle(ker._h_kernel), - _destroy_kernel_handle_copy) - - cdef object kernel_args = ker_args.kernel_args + # Slot 0 keeps the kernel loaded; slot 1 keeps the Python kernel-argument + # objects (notably device Buffers) alive for the graph's lifetime. The + # driver copies argument values into the node at add time but does not own + # the device memory they reference. 
+ owner = ker._h_kernel + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, owner)) + kernel_args = ker_args.kernel_args if kernel_args is not None: - Py_INCREF(kernel_args) - _attach_user_object(as_cu(h_graph), kernel_args, - py_object_user_object_destroy) + owner = make_opaque_py(kernel_args) + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 1, owner)) return _registered(KernelNode._create_with_params( create_graph_node_handle(new_node, h_graph), @@ -784,8 +832,52 @@ cdef inline FreeNode GN_free(GraphNode self, cydriver.CUdeviceptr c_dptr): return _registered(FreeNode._create_with_params(create_graph_node_handle(new_node, h_graph), c_dptr)) +cdef inline OpaqueHandle _buffer_slot_owner(Buffer buf, str label): + """Copy a Buffer's device-pointer handle into a graph slot owner.""" + cdef OpaqueHandle slot_owner + if not buf._h_ptr: + raise ValueError(f"{label} Buffer has no active allocation") + slot_owner = buf._h_ptr + return slot_owner + + +cdef inline OpaqueHandle _resolve_memcpy_operand( + object operand, object owner, str side, cydriver.CUdeviceptr* out_ptr): + """Resolve a memcpy/memset operand to a pointer and optional slot owner. + + ``operand`` is a :class:`Buffer` or a raw integer address; its device + pointer is written to ``out_ptr``. For a :class:`Buffer` operand, returns an + owner that retains the underlying allocation (not the wrapper). For a raw + pointer, returns an owner built from ``owner`` (or an empty handle when + ``owner`` is ``None``). + + Raises + ------ + ValueError + If ``operand`` is a :class:`Buffer` and ``owner`` is not ``None``. + If a :class:`Buffer` operand or ``*_owner`` has no active allocation. 
+ """ + cdef Buffer buf + + if isinstance(operand, Buffer): + if owner is not None: + raise ValueError( + f"{side}_owner cannot be used when {side} is a Buffer" + ) + buf = operand + slot_owner = _buffer_slot_owner(buf, side) + out_ptr[0] = as_cu(buf._h_ptr) + return slot_owner + out_ptr[0] = operand + if owner is None: + return OpaqueHandle() + if isinstance(owner, Buffer): + return _buffer_slot_owner(owner, f"{side}_owner") + return make_opaque_py(owner) + + cdef inline MemsetNode GN_memset( - GraphNode self, cydriver.CUdeviceptr c_dst, + GraphNode self, cydriver.CUdeviceptr c_dst, OpaqueHandle dst_owner, unsigned int val, unsigned int elem_size, size_t width, size_t height, size_t pitch): cdef cydriver.CUDA_MEMSET_NODE_PARAMS memset_params @@ -816,14 +908,18 @@ cdef inline MemsetNode GN_memset( &new_node, as_cu(h_graph), deps, num_deps, &memset_params, ctx)) + # Retain the destination allocation for the graph's lifetime (slot 0). + if dst_owner: + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, dst_owner)) + return _registered(MemsetNode._create_with_params( create_graph_node_handle(new_node, h_graph), c_dst, val, elem_size, width, height, pitch)) cdef inline MemcpyNode GN_memcpy( - GraphNode self, cydriver.CUdeviceptr c_dst, - cydriver.CUdeviceptr c_src, size_t size): + GraphNode self, cydriver.CUdeviceptr c_dst, OpaqueHandle dst_owner, + cydriver.CUdeviceptr c_src, OpaqueHandle src_owner, size_t size): cdef unsigned int dst_mem_type = cydriver.CU_MEMORYTYPE_DEVICE cdef unsigned int src_mem_type = cydriver.CU_MEMORYTYPE_DEVICE cdef cydriver.CUresult ret @@ -877,6 +973,12 @@ cdef inline MemcpyNode GN_memcpy( HANDLE_RETURN(cydriver.cuGraphAddMemcpyNode( &new_node, as_cu(h_graph), deps, num_deps, &params, ctx)) + # Retain operand allocations for the graph's lifetime (dst -> slot 0, src -> slot 1). 
+ if dst_owner: + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, dst_owner)) + if src_owner: + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 1, src_owner)) + return _registered(MemcpyNode._create_with_params( create_graph_node_handle(new_node, h_graph), c_dst, c_src, size, c_dst_type, c_src_type)) @@ -914,6 +1016,7 @@ cdef inline EventRecordNode GN_record_event(GraphNode self, Event ev): cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 + cdef OpaqueHandle owner if pred_node != NULL: deps = &pred_node @@ -923,8 +1026,8 @@ cdef inline EventRecordNode GN_record_event(GraphNode self, Event ev): HANDLE_RETURN(cydriver.cuGraphAddEventRecordNode( &new_node, as_cu(h_graph), deps, num_deps, as_cu(ev._h_event))) - _attach_user_object(as_cu(h_graph), new EventHandle(ev._h_event), - _destroy_event_handle_copy) + owner = ev._h_event + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, owner)) return _registered(EventRecordNode._create_with_params( create_graph_node_handle(new_node, h_graph), ev._h_event)) @@ -936,6 +1039,7 @@ cdef inline EventWaitNode GN_wait_event(GraphNode self, Event ev): cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 + cdef OpaqueHandle owner if pred_node != NULL: deps = &pred_node @@ -945,35 +1049,36 @@ cdef inline EventWaitNode GN_wait_event(GraphNode self, Event ev): HANDLE_RETURN(cydriver.cuGraphAddEventWaitNode( &new_node, as_cu(h_graph), deps, num_deps, as_cu(ev._h_event))) - _attach_user_object(as_cu(h_graph), new EventHandle(ev._h_event), - _destroy_event_handle_copy) + owner = ev._h_event + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, owner)) return _registered(EventWaitNode._create_with_params( create_graph_node_handle(new_node, h_graph), ev._h_event)) cdef inline HostCallbackNode GN_callback(GraphNode self, object fn, object user_data): - import ctypes as ct - cdef 
cydriver.CUDA_HOST_NODE_PARAMS node_params cdef cydriver.CUgraphNode new_node = NULL cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 + cdef OpaqueHandle fn_owner, data_owner if pred_node != NULL: deps = &pred_node num_deps = 1 - _attach_host_callback_to_graph( - as_cu(h_graph), fn, user_data, - &node_params.fn, &node_params.userData) + _resolve_host_callback( + fn, user_data, &node_params.fn, &node_params.userData, + &fn_owner, &data_owner) with nogil: HANDLE_RETURN(cydriver.cuGraphAddHostNode( &new_node, as_cu(h_graph), deps, num_deps, &node_params)) + _attach_host_callback_owners(h_graph, new_node, fn_owner, data_owner) + cdef object callable_obj = fn if not isinstance(fn, ct._CFuncPtr) else None return _registered(HostCallbackNode._create_with_params( create_graph_node_handle(new_node, h_graph), callable_obj, diff --git a/cuda_core/cuda/core/graph/_host_callback.pxd b/cuda_core/cuda/core/graph/_host_callback.pxd new file mode 100644 index 00000000000..dac249c74ed --- /dev/null +++ b/cuda_core/cuda/core/graph/_host_callback.pxd @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + +from cuda.core._resource_handles cimport GraphHandle, OpaqueHandle + + +cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil + +cdef void _resolve_host_callback( + object fn, object user_data, + cydriver.CUhostFn* out_fn, void** out_user_data, + OpaqueHandle* out_fn_owner, OpaqueHandle* out_data_owner) except * + +cdef int _attach_host_callback_owners( + const GraphHandle& h_graph, cydriver.CUgraphNode node, + OpaqueHandle fn_owner, OpaqueHandle data_owner) except -1 diff --git a/cuda_core/cuda/core/graph/_utils.pyi b/cuda_core/cuda/core/graph/_host_callback.pyi similarity index 74% rename from cuda_core/cuda/core/graph/_utils.pyi rename to cuda_core/cuda/core/graph/_host_callback.pyi index 79072e66ebe..6c9d0ead317 100644 --- a/cuda_core/cuda/core/graph/_utils.pyi +++ b/cuda_core/cuda/core/graph/_host_callback.pyi @@ -1,3 +1,3 @@ -# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_utils.pyx +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_host_callback.pyx from __future__ import annotations \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_host_callback.pyx b/cuda_core/cuda/core/graph/_host_callback.pyx new file mode 100644 index 00000000000..bed2d8152f5 --- /dev/null +++ b/cuda_core/cuda/core/graph/_host_callback.pyx @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport uintptr_t +from libc.stdlib cimport malloc +from libc.string cimport memcpy as c_memcpy + +from cuda.bindings cimport cydriver + +from cuda.core._resource_handles cimport ( + GraphHandle, + OpaqueHandle, + graph_set_slot, + make_opaque_malloc, + make_opaque_py, +) +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN + +import ctypes as ct + + +cdef void _py_host_trampoline(void* data) noexcept with gil: + (data)() + + +cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil: + return fn == _py_host_trampoline + + +cdef void _resolve_host_callback( + object fn, object user_data, + cydriver.CUhostFn* out_fn, void** out_user_data, + OpaqueHandle* out_fn_owner, OpaqueHandle* out_data_owner) except *: + """Resolve a Python callable or ctypes CFuncPtr into a C callback pair and + the owners that keep it alive. + + On return ``*out_fn`` / ``*out_user_data`` are ready to pass to + ``cuGraphAddHostNode`` or ``cuLaunchHostFunc``. ``*out_fn_owner`` owns the + callback object; ``*out_data_owner`` owns a copied ``user_data`` buffer and + is left null otherwise. The caller attaches the owners to the node's graph + slots. 
+ """ + if isinstance(fn, ct._CFuncPtr): + out_fn[0] = ct.cast(fn, ct.c_void_p).value + if user_data is None: + out_user_data[0] = NULL + elif isinstance(user_data, int): + out_user_data[0] = user_data + else: + buf = bytes(user_data) + if len(buf): + out_user_data[0] = malloc(len(buf)) + if out_user_data[0] == NULL: + raise MemoryError("failed to allocate user_data buffer") + c_memcpy(out_user_data[0], buf, len(buf)) + out_data_owner[0] = make_opaque_malloc(out_user_data[0]) + else: + out_user_data[0] = NULL + else: + if user_data is not None: + raise ValueError( + "user_data is only supported with ctypes function pointers") + out_fn[0] = _py_host_trampoline + out_user_data[0] = fn + + out_fn_owner[0] = make_opaque_py(fn) + + +cdef int _attach_host_callback_owners( + const GraphHandle& h_graph, cydriver.CUgraphNode node, + OpaqueHandle fn_owner, OpaqueHandle data_owner) except -1: + """Attach a resolved host callback's owners to its node's graph slots: the + callback in slot 0 and any copied ``user_data`` buffer in slot 1. + """ + HANDLE_RETURN(graph_set_slot(h_graph, node, 0, fn_owner)) + if data_owner: + HANDLE_RETURN(graph_set_slot(h_graph, node, 1, data_owner)) + return 0 diff --git a/cuda_core/cuda/core/graph/_subclasses.pyi b/cuda_core/cuda/core/graph/_subclasses.pyi index 345e6417c4d..480b1b66a6f 100644 --- a/cuda_core/cuda/core/graph/_subclasses.pyi +++ b/cuda_core/cuda/core/graph/_subclasses.pyi @@ -204,7 +204,7 @@ class ChildGraphNode(GraphNode): ... @property - def child_graph(self) -> GraphDefinition: + def child_graph(self) -> 'GraphDefinition': """The embedded graph definition (non-owning wrapper).""" class EventRecordNode(GraphNode): @@ -290,7 +290,7 @@ class ConditionalNode(GraphNode): """ @property - def branches(self) -> tuple[GraphDefinition, ...]: + def branches(self) -> tuple['GraphDefinition', ...]: """The body graphs for each branch as a tuple of GraphDefinition. 
Returns an empty tuple when reconstructed from the driver @@ -304,7 +304,7 @@ class IfNode(ConditionalNode): ... @property - def then(self) -> GraphDefinition: + def then(self) -> 'GraphDefinition': """The 'then' branch graph.""" class IfElseNode(ConditionalNode): @@ -314,11 +314,11 @@ class IfElseNode(ConditionalNode): ... @property - def then(self) -> GraphDefinition: + def then(self) -> 'GraphDefinition': """The ``then`` branch graph (executed when condition is non-zero).""" @property - def else_(self) -> GraphDefinition: + def else_(self) -> 'GraphDefinition': """The ``else`` branch graph (executed when condition is zero).""" class WhileNode(ConditionalNode): @@ -328,7 +328,7 @@ class WhileNode(ConditionalNode): ... @property - def body(self) -> GraphDefinition: + def body(self) -> 'GraphDefinition': """The loop body graph.""" class SwitchNode(ConditionalNode): diff --git a/cuda_core/cuda/core/graph/_subclasses.pyx b/cuda_core/cuda/core/graph/_subclasses.pyx index 85a382197f8..df919426bc2 100644 --- a/cuda_core/cuda/core/graph/_subclasses.pyx +++ b/cuda_core/cuda/core/graph/_subclasses.pyx @@ -30,7 +30,7 @@ from cuda.core._resource_handles cimport ( ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from cuda.core.graph._utils cimport _is_py_host_trampoline +from cuda.core.graph._host_callback cimport _is_py_host_trampoline from cuda.core._utils.cuda_utils import driver, handle_return from cuda.core.typing import GraphConditionalType @@ -478,7 +478,7 @@ cdef class ChildGraphNode(GraphNode): f" child=0x{as_intptr(self._h_child_graph):x}>") @property - def child_graph(self) -> GraphDefinition: + def child_graph(self) -> "GraphDefinition": """The embedded graph definition (non-owning wrapper).""" return GraphDefinition._from_handle(self._h_child_graph) @@ -705,7 +705,7 @@ cdef class ConditionalNode(GraphNode): return GraphConditionalType("switch") @property - def branches(self) -> tuple[GraphDefinition, ...]: + def branches(self) -> tuple["GraphDefinition", 
...]: """The body graphs for each branch as a tuple of GraphDefinition. Returns an empty tuple when reconstructed from the driver @@ -722,7 +722,7 @@ cdef class IfNode(ConditionalNode): f" condition=0x{self._condition._c_handle:x}>") @property - def then(self) -> GraphDefinition: + def then(self) -> "GraphDefinition": """The 'then' branch graph.""" return self._branches[0] @@ -735,12 +735,12 @@ cdef class IfElseNode(ConditionalNode): f" condition=0x{self._condition._c_handle:x}>") @property - def then(self) -> GraphDefinition: + def then(self) -> "GraphDefinition": """The ``then`` branch graph (executed when condition is non-zero).""" return self._branches[0] @property - def else_(self) -> GraphDefinition: + def else_(self) -> "GraphDefinition": """The ``else`` branch graph (executed when condition is zero).""" return self._branches[1] @@ -753,7 +753,7 @@ cdef class WhileNode(ConditionalNode): f" condition=0x{self._condition._c_handle:x}>") @property - def body(self) -> GraphDefinition: + def body(self) -> "GraphDefinition": """The loop body graph.""" return self._branches[0] diff --git a/cuda_core/cuda/core/graph/_utils.pxd b/cuda_core/cuda/core/graph/_utils.pxd deleted file mode 100644 index 63fdb00ac4f..00000000000 --- a/cuda_core/cuda/core/graph/_utils.pxd +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# SPDX-License-Identifier: Apache-2.0 - -from cuda.bindings cimport cydriver - - -cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil - -cdef void _attach_user_object( - cydriver.CUgraph graph, void* ptr, - cydriver.CUhostFn destroy) except * - -cdef void _attach_host_callback_to_graph( - cydriver.CUgraph graph, object fn, object user_data, - cydriver.CUhostFn* out_fn, void** out_user_data) except * diff --git a/cuda_core/cuda/core/graph/_utils.pyx b/cuda_core/cuda/core/graph/_utils.pyx deleted file mode 100644 index dfc2f4f3fec..00000000000 --- a/cuda_core/cuda/core/graph/_utils.pyx +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -from cpython.ref cimport Py_INCREF - -from libc.stdint cimport uintptr_t -from libc.stdlib cimport malloc, free -from libc.string cimport memcpy as c_memcpy - -from cuda.bindings cimport cydriver - -from cuda.core._resource_handles cimport py_object_user_object_destroy -from cuda.core._utils.cuda_utils cimport HANDLE_RETURN - - -cdef void _py_host_trampoline(void* data) noexcept with gil: - (data)() - - -cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil: - return fn == _py_host_trampoline - - -cdef void _attach_user_object( - cydriver.CUgraph graph, void* ptr, - cydriver.CUhostFn destroy) except *: - """Create a CUDA user object and transfer ownership to the graph. - - On success the graph owns the resource (via MOVE semantics). - On failure the destroy callback is invoked to clean up ptr, - then a CUDAError is raised — callers need no try/except. 
- """ - cdef cydriver.CUuserObject user_obj = NULL - cdef cydriver.CUresult ret - with nogil: - ret = cydriver.cuUserObjectCreate( - &user_obj, ptr, destroy, 1, - cydriver.CU_USER_OBJECT_NO_DESTRUCTOR_SYNC) - if ret == cydriver.CUDA_SUCCESS: - ret = cydriver.cuGraphRetainUserObject( - graph, user_obj, 1, cydriver.CU_GRAPH_USER_OBJECT_MOVE) - if ret != cydriver.CUDA_SUCCESS: - cydriver.cuUserObjectRelease(user_obj, 1) - if ret != cydriver.CUDA_SUCCESS: - if user_obj == NULL: - destroy(ptr) - HANDLE_RETURN(ret) - - -cdef void _attach_host_callback_to_graph( - cydriver.CUgraph graph, object fn, object user_data, - cydriver.CUhostFn* out_fn, void** out_user_data) except *: - """Resolve a Python callable or ctypes CFuncPtr into a C callback pair. - - Handles Py_INCREF, user-object attachment for lifetime management, - and user_data copying. On return, *out_fn and *out_user_data are - ready to pass to cuGraphAddHostNode or cuLaunchHostFunc. - """ - import ctypes as ct - - cdef void* fn_pyobj = NULL - - if isinstance(fn, ct._CFuncPtr): - Py_INCREF(fn) - fn_pyobj = fn - _attach_user_object( - graph, fn_pyobj, - py_object_user_object_destroy) - out_fn[0] = ct.cast( - fn, ct.c_void_p).value - - if user_data is not None: - if isinstance(user_data, int): - out_user_data[0] = user_data - else: - buf = bytes(user_data) - out_user_data[0] = malloc(len(buf)) - if out_user_data[0] == NULL: - raise MemoryError( - "failed to allocate user_data buffer") - c_memcpy(out_user_data[0], buf, len(buf)) - _attach_user_object( - graph, out_user_data[0], - free) - else: - out_user_data[0] = NULL - else: - if user_data is not None: - raise ValueError( - "user_data is only supported with ctypes " - "function pointers") - Py_INCREF(fn) - fn_pyobj = fn - out_fn[0] = _py_host_trampoline - out_user_data[0] = fn_pyobj - _attach_user_object( - graph, fn_pyobj, - py_object_user_object_destroy) diff --git a/cuda_core/tests/graph/test_graph_builder.py b/cuda_core/tests/graph/test_graph_builder.py index 
18dfe21cc12..efb70fe75dd 100644 --- a/cuda_core/tests/graph/test_graph_builder.py +++ b/cuda_core/tests/graph/test_graph_builder.py @@ -5,11 +5,12 @@ import numpy as np import pytest -from helpers.graph_kernels import compile_common_kernels +from helpers.graph_kernels import compile_common_kernels, compile_conditional_kernels from helpers.marks import requires_module +from helpers.misc import try_create_condition from cuda.core import Device, LaunchConfig, LegacyPinnedMemoryResource, launch -from cuda.core.graph import GraphBuilder +from cuda.core.graph import GraphBuilder, GraphDefinition def test_graph_is_building(init_cuda): @@ -384,3 +385,190 @@ def test_graph_stream_lifetime(init_cuda): # Destroy the stream stream.close() + + +# --------------------------------------------------------------------------- +# GraphBuilder.graph_definition +# --------------------------------------------------------------------------- + + +def test_graph_definition_returns_graph_definition_after_end_building(init_cuda): + """Primary builder exposes its captured graph as a GraphDefinition after end_building().""" + mod = compile_common_kernels() + empty_kernel = mod.get_kernel("empty_kernel") + + gb = Device().create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), empty_kernel) + launch(gb, LaunchConfig(grid=1, block=1), empty_kernel) + gb.end_building() + + gd = gb.graph_definition + assert isinstance(gd, GraphDefinition) + # The captured graph must contain the launched kernels. 
+ assert len(gd.nodes()) == 2 + + +def test_graph_definition_raises_before_begin_building(init_cuda): + """Primary builder has no graph allocated before begin_building().""" + gb = Device().create_graph_builder() + with pytest.raises(RuntimeError, match="before begin_building"): + _ = gb.graph_definition + + +def test_graph_definition_raises_during_capture(init_cuda): + """graph_definition is unsafe while the driver is actively capturing.""" + gb = Device().create_graph_builder().begin_building() + try: + with pytest.raises(RuntimeError, match="capture is in"): + _ = gb.graph_definition + finally: + gb.end_building() + + +def test_graph_definition_raises_for_forked(init_cuda): + """Forked builders share the primary's graph; their property must raise.""" + mod = compile_common_kernels() + empty_kernel = mod.get_kernel("empty_kernel") + + gb = Device().create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), empty_kernel) + primary, sibling = gb.split(2) + try: + with pytest.raises(RuntimeError, match="forked"): + _ = sibling.graph_definition + finally: + sibling = GraphBuilder.join(primary, sibling) + sibling.end_building() + + +def test_graph_definition_shares_ownership(init_cuda): + """Closing the builder must not invalidate a held GraphDefinition.""" + mod = compile_common_kernels() + empty_kernel = mod.get_kernel("empty_kernel") + + gb = Device().create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), empty_kernel) + gb.end_building() + + gd = gb.graph_definition + gb.close() + # The shared CUgraph keeps the graph alive. 
+ assert len(gd.nodes()) == 1 + + +def test_graph_definition_round_trips_through_explicit_api(init_cuda): + """Mutating via the explicit API survives complete() and runs correctly.""" + mod = compile_common_kernels() + add_one = mod.get_kernel("add_one") + + launch_stream = Device().create_stream() + mr = LegacyPinnedMemoryResource() + b = mr.allocate(4) + arr = np.from_dlpack(b).view(np.int32) + arr[0] = 0 + + gb = launch_stream.create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + gb.end_building() + + # Add a second add_one through the explicit GraphDefinition view. + gd = gb.graph_definition + captured_node = next(iter(gd.nodes())) + captured_node.launch(LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + assert len(gd.nodes()) == 2 + + graph = gb.complete() + graph.launch(launch_stream) + launch_stream.sync() + assert arr[0] == 2 + + b.close() + + +@requires_module(np, "2.1") +def test_graph_definition_hybrid_conditional_body(init_cuda): + """Populate a conditional body entirely through the explicit API. + + This is the headline hybrid flow enabled by the new property: + ``if_then`` returns a ``GraphBuilder`` for the body, but instead of + calling ``begin_building`` and capturing into it, we reach for + ``graph_definition`` and add nodes through the explicit API. + """ + mod = compile_conditional_kernels(int) + add_one = mod.get_kernel("add_one") + set_handle = mod.get_kernel("set_handle") + + launch_stream = Device().create_stream() + mr = LegacyPinnedMemoryResource() + b = mr.allocate(4) + arr = np.from_dlpack(b).view(np.int32) + arr[0] = 0 + + gb = Device().create_graph_builder().begin_building() + condition = try_create_condition(gb) + launch(gb, LaunchConfig(grid=1, block=1), set_handle, condition, 1) + body_gb = gb.if_then(condition) + + # Skip body_gb.begin_building() entirely -- the body graph already + # exists at conditional-node creation time and is exposed here. 
+ body_def = body_gb.graph_definition + assert isinstance(body_def, GraphDefinition) + assert len(body_def.nodes()) == 0 + body_def.launch(LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + + graph = gb.end_building().complete() + graph.launch(launch_stream) + launch_stream.sync() + assert arr[0] == 1 + + b.close() + + +@requires_module(np, "2.1") +def test_graph_definition_conditional_body_after_capture(init_cuda): + """Capture into a conditional body, then augment it via the explicit API.""" + mod = compile_conditional_kernels(int) + add_one = mod.get_kernel("add_one") + set_handle = mod.get_kernel("set_handle") + + launch_stream = Device().create_stream() + mr = LegacyPinnedMemoryResource() + b = mr.allocate(4) + arr = np.from_dlpack(b).view(np.int32) + arr[0] = 0 + + gb = Device().create_graph_builder().begin_building() + condition = try_create_condition(gb) + launch(gb, LaunchConfig(grid=1, block=1), set_handle, condition, 1) + body_gb = gb.if_then(condition).begin_building() + + # Capture one increment into the body. + launch(body_gb, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + body_gb.end_building() + + # Add a second increment via the explicit API on the same body graph. 
+ body_def = body_gb.graph_definition + captured_node = next(iter(body_def.nodes())) + captured_node.launch(LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + assert len(body_def.nodes()) == 2 + + graph = gb.end_building().complete() + graph.launch(launch_stream) + launch_stream.sync() + assert arr[0] == 2 + + b.close() + + +@requires_module(np, "2.1") +def test_graph_definition_conditional_body_during_capture_raises(init_cuda): + """The CAPTURING-state guard fires for conditional bodies too.""" + gb = Device().create_graph_builder().begin_building() + condition = try_create_condition(gb) + body_gb = gb.if_then(condition).begin_building() + try: + with pytest.raises(RuntimeError, match="capture is in"): + _ = body_gb.graph_definition + finally: + body_gb.end_building() + gb.end_building() diff --git a/cuda_core/tests/graph/test_graph_definition_lifetime.py b/cuda_core/tests/graph/test_graph_definition_lifetime.py index 40bc6f3c442..d196e35f478 100644 --- a/cuda_core/tests/graph/test_graph_definition_lifetime.py +++ b/cuda_core/tests/graph/test_graph_definition_lifetime.py @@ -13,22 +13,52 @@ from helpers.misc import try_create_condition from conftest import xfail_on_graph_mempool_oom +from cuda_python_test_helpers import under_compute_sanitizer +# Resource finalization triggered by graph destruction is not strictly +# synchronous: the graph's slot table is freed through a CUDA user-object +# destructor that the driver may run on its own thread, after which each owner +# is released (a shared_ptr decrement, or Py_DECREF under the GIL). Release is +# deterministic at the reference-count level, so the predicate normally flips +# within milliseconds; this budget only bounds a slow/loaded runner. It stays a +# hard failure rather than a warning so a real leak still fails the suite. +# Compute-sanitizer slows everything down, hence the larger ceiling there. 
+_FINALIZE_TIMEOUT = 30.0 if under_compute_sanitizer() else 5.0 -def _wait_until(predicate, timeout=2.0, interval=0.01): - """Poll predicate() until True or timeout, driving gc each iteration. - Used for assertions about resource cleanup that may be delayed by CUDA's - asynchronous user-object destructor pump (DPC) or, on free-threaded - Python, by deferred reference-count processing. A bounded poll keeps the - test correct without depending on undocumented driver timing guarantees. +class _Sentinel: + """Weak-referenceable stand-in for an owner attached to a graph slot. + + Bare ``object()`` instances do not support weak references, so tests that + observe owner release through a :class:`weakref.ref` use this trivial + subclass instead. """ + + +def _wait_until(predicate, timeout=None, interval=0.02): + """Poll ``predicate()`` until true, or raise AssertionError on timeout. + + Each iteration drives ``gc.collect()`` and yields the main thread (which + releases the GIL) so the driver's asynchronous user-object destructor -- + and the ``Py_DECREF`` it triggers -- can make progress. Used for resource + cleanup that lags graph destruction; see ``_FINALIZE_TIMEOUT``. + """ + if timeout is None: + timeout = _FINALIZE_TIMEOUT deadline = time.monotonic() + timeout - while time.monotonic() < deadline: + while True: gc.collect() if predicate(): return + if time.monotonic() >= deadline: + break + time.sleep(0) # yield the GIL to the driver's finalizer thread time.sleep(interval) + # Final attempt after one more yield and collection. 
+ time.sleep(0) + gc.collect() + if predicate(): + return raise AssertionError(f"condition not satisfied within {timeout}s") @@ -594,3 +624,366 @@ def test_kernel_args_survive_graph_clone(init_cuda): out = (ctypes.c_int * 1)(0) handle_return(driver.cuMemcpyDtoH(out, dptr, ctypes.sizeof(ctypes.c_int))) assert out[0] == 1 + + +# ============================================================================= +# Memcpy/memset Buffer lifetime — operands passed as Buffer objects +# ============================================================================= + + +def test_memset_buffer_lifetime(init_cuda): + """Memset retains the Buffer allocation after the wrapper is collected.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + dptr = int(buf.handle) + + g = GraphDefinition() + g.memset(buf, 0xAB, 4) + + del buf + gc.collect() + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dptr, 4)) + assert list(out) == [0xAB] * 4 + + +def test_memset_buffer_survives_close(init_cuda): + """Memset retains the allocation when the Buffer wrapper is closed.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + dptr = int(buf.handle) + + g = GraphDefinition() + g.memset(buf, 0xAB, 4) + buf.close() + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dptr, 4)) + assert list(out) == [0xAB] * 4 + + +def test_memcpy_buffer_lifetime(init_cuda): + """Memcpy retains operand allocations after the Buffer wrappers are collected.""" + from 
cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + src = mr.allocate(4, stream=dev.default_stream) + dst = mr.allocate(4, stream=dev.default_stream) + src.fill(0xCD, stream=dev.default_stream) + dev.default_stream.sync() + dst_dptr = int(dst.handle) + + g = GraphDefinition() + g.memcpy(dst, src, 4) + + del src, dst + gc.collect() + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) + assert list(out) == [0xCD] * 4 + + +def test_memcpy_buffer_survives_close(init_cuda): + """Memcpy retains allocations when Buffer wrappers are closed.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + src = mr.allocate(4, stream=dev.default_stream) + dst = mr.allocate(4, stream=dev.default_stream) + src.fill(0xCD, stream=dev.default_stream) + dev.default_stream.sync() + dst_dptr = int(dst.handle) + + g = GraphDefinition() + g.memcpy(dst, src, 4) + src.close() + dst.close() + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) + assert list(out) == [0xCD] * 4 + + +def test_memcpy_buffer_allocations_released_after_graph_destroyed(init_cuda): + """Destroying the graph frees both memcpy operand allocations. + + Each operand's device-pointer handle is observed via a weak handle + (see ``cuda.core._utils._weak_handles``), so release is checked at the + reference-count level rather than through a driver side effect. With both + Buffer wrappers closed, the graph's slots are the only remaining owners; + destroying the graph releases them and the weak handles expire. 
+ """ + from cuda.core._utils._weak_handles import weak_handle + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + src = mr.allocate(4, stream=dev.default_stream) + dst = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + + g = GraphDefinition() + g.memcpy(dst, src, 4) + + # Observe the allocations, then drop the wrappers' strong references; the + # graph slots remain the sole owners. + src_weak = weak_handle(src) + dst_weak = weak_handle(dst) + src.close() + dst.close() + assert src_weak and dst_weak # graph slots still retain both allocations + + del g + _wait_until(lambda: not src_weak and not dst_weak) + + +def test_memcpy_buffers_survive_graph_clone(init_cuda): + """Cloned graph keeps memcpy operand allocations alive via CUDA user objects.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + src = mr.allocate(4, stream=dev.default_stream) + dst = mr.allocate(4, stream=dev.default_stream) + src.fill(0xCD, stream=dev.default_stream) + dev.default_stream.sync() + dst_dptr = int(dst.handle) + + g = GraphDefinition() + g.memcpy(dst, src, 4) + cloned_cu_graph = handle_return(driver.cuGraphClone(driver.CUgraph(g.handle))) + + del src, dst, g + gc.collect() + + graph_exec = handle_return(driver.cuGraphInstantiate(cloned_cu_graph, 0)) + stream = dev.create_stream() + handle_return(driver.cuGraphLaunch(graph_exec, driver.CUstream(int(stream.handle)))) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) + assert list(out) == [0xCD] * 4 + + +# ============================================================================= +# Explicit dst_owner / src_owner for raw pointer operands +# ============================================================================= + + +def test_memset_raw_ptr_with_dst_owner(init_cuda): + """Raw dst plus Buffer dst_owner retains the allocation after 
close.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + dptr = int(buf.handle) + + g = GraphDefinition() + g.memset(dptr, 0xAB, 4, dst_owner=buf) + buf.close() + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dptr, 4)) + assert list(out) == [0xAB] * 4 + + +def test_slot_owners_released_after_graph_destroyed(init_cuda): + """Destroying the graph releases every owner held in its slot table. + + Raw-pointer operands with explicit sentinel owners make release observable + in pure Python: the slot table holds a strong Python reference to each owner + (via ``make_opaque_py``), and graph destruction frees the table -- dropping + those references. This exercises the same teardown that releases a Buffer + operand's device-pointer handle (slot 0 for ``dst``, slot 1 for ``src``). + """ + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(8, stream=dev.default_stream) + dev.default_stream.sync() + dptr = int(buf.handle) + + dst_owner = _Sentinel() + src_owner = _Sentinel() + dst_weak = weakref.ref(dst_owner) + src_weak = weakref.ref(src_owner) + + g = GraphDefinition() + # Non-overlapping 4-byte copy within an 8-byte allocation. 
+ g.memcpy(dptr, dptr + 4, 4, dst_owner=dst_owner, src_owner=src_owner) + + del dst_owner, src_owner + gc.collect() + assert dst_weak() is not None and src_weak() is not None # graph retains owners + + del g + _wait_until(lambda: dst_weak() is None and src_weak() is None) + + buf.close() + + +def test_memcpy_raw_ptrs_with_owners(init_cuda): + """Raw src/dst plus Buffer owners retain allocations after close.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + src = mr.allocate(4, stream=dev.default_stream) + dst = mr.allocate(4, stream=dev.default_stream) + src.fill(0xCD, stream=dev.default_stream) + dev.default_stream.sync() + src_dptr = int(src.handle) + dst_dptr = int(dst.handle) + + g = GraphDefinition() + g.memcpy(dst_dptr, src_dptr, 4, dst_owner=dst, src_owner=src) + src.close() + dst.close() + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) + assert list(out) == [0xCD] * 4 + + +def test_memcpy_mixed_buffer_and_raw_owner(init_cuda): + """Buffer dst and raw src with src_owner retain allocations after close.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + src = mr.allocate(4, stream=dev.default_stream) + dst = mr.allocate(4, stream=dev.default_stream) + src.fill(0xCD, stream=dev.default_stream) + dev.default_stream.sync() + src_dptr = int(src.handle) + dst_dptr = int(dst.handle) + + g = GraphDefinition() + g.memcpy(dst, src_dptr, 4, src_owner=src) + src.close() + dst.close() + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) + assert list(out) == [0xCD] * 4 + + +def test_memset_closed_buffer_rejected(init_cuda): + """Memset rejects a 
Buffer with no active allocation.""" + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + buf.close() + + g = GraphDefinition() + with pytest.raises(ValueError, match="dst Buffer has no active allocation"): + g.memset(buf, 0xAB, 4) + + +def test_memset_closed_buffer_dst_owner_rejected(init_cuda): + """Memset rejects a closed Buffer passed as dst_owner.""" + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + dptr = int(buf.handle) + buf.close() + + g = GraphDefinition() + with pytest.raises(ValueError, match="dst_owner Buffer has no active allocation"): + g.memset(dptr, 0xAB, 4, dst_owner=buf) + + +def test_memcpy_closed_buffer_src_owner_rejected(init_cuda): + """Memcpy rejects a closed Buffer passed as src_owner.""" + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + dptr = int(buf.handle) + buf.close() + + g = GraphDefinition() + with pytest.raises(ValueError, match="src_owner Buffer has no active allocation"): + g.memcpy(dptr, dptr, 4, src_owner=buf) + + +def test_memcpy_buffer_and_dst_owner_rejected(init_cuda): + """dst_owner cannot be combined with a Buffer dst operand.""" + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + + g = GraphDefinition() + with pytest.raises(ValueError, match="dst_owner cannot be used when dst is a Buffer"): + g.memcpy(buf, buf, 4, dst_owner=object()) + + +def test_memcpy_buffer_and_src_owner_rejected(init_cuda): + """src_owner cannot be combined with a Buffer src operand.""" + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + + g = 
GraphDefinition() + with pytest.raises(ValueError, match="src_owner cannot be used when src is a Buffer"): + g.memcpy(buf, buf, 4, src_owner=object()) diff --git a/cuda_core/tests/graph/test_graph_definition_mutation.py b/cuda_core/tests/graph/test_graph_definition_mutation.py index 1db1089f825..b176503e3df 100644 --- a/cuda_core/tests/graph/test_graph_definition_mutation.py +++ b/cuda_core/tests/graph/test_graph_definition_mutation.py @@ -311,9 +311,9 @@ def test_add_wrong_type(init_cuda): """Adding a non-GraphNode raises TypeError.""" g = GraphDefinition() node = g.empty() - with pytest.raises(TypeError, match="expected .*GraphNode"): + with pytest.raises(TypeError, match="expected GraphNode"): node.succ.add("not a node") - with pytest.raises(TypeError, match="expected .*GraphNode"): + with pytest.raises(TypeError, match="expected GraphNode"): node.succ.add(42)