diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
index 3bbe0fafe03..df3edd908c4 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp
@@ -6,8 +6,11 @@
 
 #include "resource_handles.hpp"
 #include <cuda.h>
+#include <array>
 #include <cstdint>
+#include <cstdlib>
 #include <cstring>
+#include <map>
 #include <mutex>
 #include <stdexcept>
 #include <unordered_map>
@@ -70,6 +73,9 @@ decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel = nullptr;
 // Graph
 decltype(&cuGraphDestroy) p_cuGraphDestroy = nullptr;
 decltype(&cuGraphExecDestroy) p_cuGraphExecDestroy = nullptr;
+decltype(&cuUserObjectCreate) p_cuUserObjectCreate = nullptr;
+decltype(&cuUserObjectRelease) p_cuUserObjectRelease = nullptr;
+decltype(&cuGraphRetainUserObject) p_cuGraphRetainUserObject = nullptr;
 
 // Linker
 decltype(&cuLinkDestroy) p_cuLinkDestroy = nullptr;
@@ -1114,12 +1120,92 @@ LibraryHandle get_kernel_library(const KernelHandle& h) noexcept {
 // ============================================================================
 
 namespace {
+
+// Slot table layout (internal). Each graph maps CUgraphNode -> a fixed-size
+// array of type-erased owners. The width is the most any single node needs: a
+// kernel node holds its kernel and its packed arguments; a host node holds its
+// callback and the userData. The table is heap-allocated and retained on the
+// graph as a user object, so the driver frees it -- and every owner in it --
+// when the graph is destroyed.
+constexpr std::size_t SLOTS_PER_NODE = 2;
+using NodeSlots = std::array<OpaqueHandle, SLOTS_PER_NODE>;
+using GraphSlotTable = std::map<CUgraphNode, NodeSlots>;
+
+// shared_ptr deleters for the payloads that need one. Typed handles convert to
+// OpaqueHandle by assignment and reuse their own control block, so they need no
+// deleter here. The Python deleter follows the owner-release pattern used by
+// the stream/deviceptr handles above.
+void py_deleter(const void* p) noexcept {
+    // Drop the reference taken by make_opaque_py; skipped if the GIL cannot be acquired.
+    GILAcquireGuard gil_guard;
+    if (!gil_guard.acquired()) return;
+    Py_DECREF(const_cast<PyObject*>(static_cast<const PyObject*>(p)));
+}
+
+void free_deleter(const void* p) noexcept {
+    std::free(const_cast<void*>(p));  // std::free takes void*, so the handle's const is cast away
+}
+
+void destroy_graph_slot_table(void* table) noexcept {  // registered as the user object's destructor in ensure_slot_table
+    delete static_cast<GraphSlotTable*>(table);  // releases every OpaqueHandle stored in the table
+}  // NOTE(review): owners whose deleters call CUDA APIs would run inside this destructor -- confirm the driver permits that
+
 struct GraphBox {
     CUgraph resource;
-    GraphHandle h_parent;  // Keeps parent alive for child/branch graphs
+    GraphHandle h_parent;                  // Keeps parent alive for child/branch graphs
+    mutable GraphSlotTable* slot_table = nullptr;  // Lazily created; owned by the graph's user object
 };
+
+const GraphBox* get_box(const GraphHandle& h) {
+    // h.get() is the address of a GraphBox's `resource` member (handles are
+    // built as GraphHandle(box, &box->resource)); step back to the box itself.
+    const char* member_addr = reinterpret_cast<const char*>(h.get());
+    return reinterpret_cast<const GraphBox*>(member_addr - offsetof(GraphBox, resource));
+}
+
+// Return box's slot table, creating it on first use. The table is retained on
+// the graph as a user object (MOVE transfers our only reference into the
+// graph), so it -- and every owner in it -- is freed when the graph is
+// destroyed. Returns nullptr if the driver lacks user-object support or a
+// driver call fails; the cached pointer is non-owning.
+GraphSlotTable* ensure_slot_table(const GraphBox* box) {
+    if (GraphSlotTable* cached = box->slot_table) {
+        return cached;  // already created and retained on the graph
+    }
+    const bool user_objects_available =
+        p_cuUserObjectCreate && p_cuGraphRetainUserObject && p_cuUserObjectRelease;
+    if (!user_objects_available) {
+        return nullptr;
+    }
+    GraphSlotTable* const fresh = new GraphSlotTable();
+    {
+        GILReleaseGuard gil;
+        CUuserObject uobj = nullptr;
+        const auto dtor = reinterpret_cast<CUhostFn>(destroy_graph_slot_table);
+        if (CUDA_SUCCESS != p_cuUserObjectCreate(&uobj, fresh, dtor, 1, CU_USER_OBJECT_NO_DESTRUCTOR_SYNC)) {
+            delete fresh;  // creation failed, so the table is still solely ours
+            return nullptr;
+        }
+        if (CUDA_SUCCESS != p_cuGraphRetainUserObject(box->resource, uobj, 1, CU_GRAPH_USER_OBJECT_MOVE)) {
+            p_cuUserObjectRelease(uobj, 1);  // last reference: the destructor frees the table
+            return nullptr;
+        }
+    }
+    box->slot_table = fresh;  // cache only; ownership stays with the user object
+    return fresh;
+}
+
 }  // namespace
 
+OpaqueHandle make_opaque_py(PyObject* obj) {
+    Py_INCREF(obj);  // this reference belongs to the handle; py_deleter drops it on release
+    return OpaqueHandle(static_cast<const void*>(obj), py_deleter);
+}
+
+OpaqueHandle make_opaque_malloc(void* buf) {
+    return OpaqueHandle(static_cast<const void*>(buf), free_deleter);  // handle takes ownership: free_deleter calls std::free on release
+}
+
 GraphHandle create_graph_handle(CUgraph graph) {
     auto box = std::shared_ptr<const GraphBox>(
         new GraphBox{graph, {}},
@@ -1137,6 +1223,19 @@ GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent)
     return GraphHandle(box, &box->resource);
 }
 
+CUresult graph_set_slot(const GraphHandle& h_graph, CUgraphNode node,
+                        unsigned int slot, OpaqueHandle owner) {
+    const bool args_ok = h_graph && slot < SLOTS_PER_NODE;
+    if (!args_ok) {
+        return CUDA_ERROR_INVALID_VALUE;  // empty handle or slot index past the per-node width
+    }
+    if (GraphSlotTable* slots = ensure_slot_table(get_box(h_graph))) {
+        (*slots)[node][slot] = std::move(owner);  // replaces (and releases) any previous owner
+        return CUDA_SUCCESS;
+    }
+    return CUDA_ERROR_NOT_SUPPORTED;  // no table: user objects unavailable or a driver call failed
+}
+
 // ============================================================================
 // Graph Exec Handles
 // ============================================================================
diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp
index 520e7f47634..686d590b6e0 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp
@@ -109,6 +109,9 @@ extern decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel;
 // Graph
 extern decltype(&cuGraphDestroy) p_cuGraphDestroy;
 extern decltype(&cuGraphExecDestroy) p_cuGraphExecDestroy;
+extern decltype(&cuUserObjectCreate) p_cuUserObjectCreate;
+extern decltype(&cuUserObjectRelease) p_cuUserObjectRelease;
+extern decltype(&cuGraphRetainUserObject) p_cuGraphRetainUserObject;
 
 // Linker
 extern decltype(&cuLinkDestroy) p_cuLinkDestroy;
@@ -466,6 +469,37 @@ GraphHandle create_graph_handle(CUgraph graph);
 // but h_parent will be prevented from destruction while this handle exists.
 GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent);
 
+// ============================================================================
+// Graph slot attachments
+//
+// A graph carries a side table that keeps resources used by its nodes (kernel
+// arguments, host callbacks, events, ...) alive for as long as the graph can
+// execute. The table is created on first use and retained on the CUgraph as a
+// user object, so the driver releases it -- and everything attached through it
+// -- when the graph is destroyed. The table layout is an internal detail;
+// callers use the abstract API below.
+// ============================================================================
+
+// Type-erased shared owner of an attached resource. Typed handles such as
+// EventHandle and KernelHandle convert to OpaqueHandle by assignment, reusing
+// their existing control block; the helpers below build OpaqueHandles for the
+// two cases that need a custom deleter.
+using OpaqueHandle = std::shared_ptr<const void>;
+
+// Build an OpaqueHandle from a Python object: increments its refcount now and
+// decrements it (under the GIL) on release. The caller must hold the GIL.
+OpaqueHandle make_opaque_py(PyObject* obj);
+
+// Build an OpaqueHandle from a malloc'd buffer: std::free on release.
+OpaqueHandle make_opaque_malloc(void* buf);
+
+// Attach owner to one of node's fixed slots on h_graph, replacing whatever was
+// there. The graph's slot table is created on first use. Returns CUDA_SUCCESS,
+// or an error if slot is out of range or the graph cannot hold a table (e.g.
+// the driver lacks user-object support).
+CUresult graph_set_slot(const GraphHandle& h_graph, CUgraphNode node,
+                        unsigned int slot, OpaqueHandle owner);
+
 // ============================================================================
 // Graph exec handle functions
 // ============================================================================
diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd
index 54b22ac6028..0ca4c98440d 100644
--- a/cuda_core/cuda/core/_resource_handles.pxd
+++ b/cuda_core/cuda/core/_resource_handles.pxd
@@ -44,6 +44,13 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
 
     ctypedef shared_ptr[const cydriver.CUlinkState] CuLinkHandle
     ctypedef shared_ptr[const int] FileDescriptorHandle
+
+    # Type-erased shared owner for resources attached to graph node slots.
+    # Typed handles above assign directly to an OpaqueHandle (shared control
+    # block); make_opaque_py / make_opaque_malloc cover the two cases needing a
+    # custom deleter.
+    ctypedef shared_ptr[const void] OpaqueHandle
+
     ctypedef shared_ptr[const cydriver.CUarray] OpaqueArrayHandle
     ctypedef shared_ptr[const cydriver.CUmipmappedArray] MipmappedArrayHandle
 
@@ -223,6 +230,13 @@ cdef LibraryHandle get_kernel_library(const KernelHandle& h) noexcept nogil
 cdef GraphHandle create_graph_handle(cydriver.CUgraph graph) except+ nogil
 cdef GraphHandle create_graph_handle_ref(cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil
 
+# Graph slot attachments
+cdef OpaqueHandle make_opaque_py(object obj) except+
+cdef OpaqueHandle make_opaque_malloc(void* buf) except+
+cdef cydriver.CUresult graph_set_slot(
+    const GraphHandle& h_graph, cydriver.CUgraphNode node,
+    unsigned int slot, OpaqueHandle owner) except+
+
 # Graph exec handles
 cdef GraphExecHandle create_graph_exec_handle(cydriver.CUgraphExec graph_exec) except+ nogil
 
diff --git a/cuda_core/cuda/core/_resource_handles.pyi b/cuda_core/cuda/core/_resource_handles.pyi
index d4511ae0634..92e686813e8 100644
--- a/cuda_core/cuda/core/_resource_handles.pyi
+++ b/cuda_core/cuda/core/_resource_handles.pyi
@@ -21,6 +21,7 @@ NvvmProgramHandle = shared_ptr
 NvJitLinkHandle = shared_ptr
 CuLinkHandle = shared_ptr
 FileDescriptorHandle = shared_ptr
+OpaqueHandle = shared_ptr
 OpaqueArrayHandle = shared_ptr
 MipmappedArrayHandle = shared_ptr
 TexObjectHandle = shared_ptr
diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx
index 4bb7156109e..8c39e747977 100644
--- a/cuda_core/cuda/core/_resource_handles.pyx
+++ b/cuda_core/cuda/core/_resource_handles.pyx
@@ -151,6 +151,12 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     GraphHandle create_graph_handle_ref "cuda_core::create_graph_handle_ref" (
         cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil
 
+    OpaqueHandle make_opaque_py "cuda_core::make_opaque_py" (object obj) except+
+    OpaqueHandle make_opaque_malloc "cuda_core::make_opaque_malloc" (void* buf) except+
+    cydriver.CUresult graph_set_slot "cuda_core::graph_set_slot" (
+        const GraphHandle& h_graph, cydriver.CUgraphNode node,
+        unsigned int slot, OpaqueHandle owner) except+
+
     # Graph exec handles
     GraphExecHandle create_graph_exec_handle "cuda_core::create_graph_exec_handle" (
         cydriver.CUgraphExec graph_exec) except+ nogil
@@ -304,6 +310,9 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     # Graph
     void* p_cuGraphDestroy "reinterpret_cast<void*&>(cuda_core::p_cuGraphDestroy)"
     void* p_cuGraphExecDestroy "reinterpret_cast<void*&>(cuda_core::p_cuGraphExecDestroy)"
+    void* p_cuUserObjectCreate "reinterpret_cast<void*&>(cuda_core::p_cuUserObjectCreate)"
+    void* p_cuUserObjectRelease "reinterpret_cast<void*&>(cuda_core::p_cuUserObjectRelease)"
+    void* p_cuGraphRetainUserObject "reinterpret_cast<void*&>(cuda_core::p_cuGraphRetainUserObject)"
 
     # Linker
     void* p_cuLinkDestroy "reinterpret_cast<void*&>(cuda_core::p_cuLinkDestroy)"
@@ -364,6 +373,7 @@ cdef void _init_driver_fn_pointers() noexcept:
     global p_cuMemPoolImportPointer
     global p_cuLibraryLoadFromFile, p_cuLibraryLoadData, p_cuLibraryUnload, p_cuLibraryGetKernel
     global p_cuGraphDestroy, p_cuGraphExecDestroy
+    global p_cuUserObjectCreate, p_cuUserObjectRelease, p_cuGraphRetainUserObject
     global p_cuLinkDestroy
     global p_cuGraphicsUnmapResources, p_cuGraphicsUnregisterResource
     global p_cuDevSmResourceSplit
@@ -424,6 +434,9 @@ cdef void _init_driver_fn_pointers() noexcept:
     # Graph
     p_cuGraphDestroy = _get_driver_fn("cuGraphDestroy")
     p_cuGraphExecDestroy = _get_driver_fn("cuGraphExecDestroy")
+    p_cuUserObjectCreate = _get_driver_fn("cuUserObjectCreate")
+    p_cuUserObjectRelease = _get_driver_fn("cuUserObjectRelease")
+    p_cuGraphRetainUserObject = _get_driver_fn("cuGraphRetainUserObject")
 
     # Linker
     p_cuLinkDestroy = _get_driver_fn("cuLinkDestroy")
diff --git a/cuda_core/cuda/core/_utils/_weak_handles.pyi b/cuda_core/cuda/core/_utils/_weak_handles.pyi
new file mode 100644
index 00000000000..3cf095d7b87
--- /dev/null
+++ b/cuda_core/cuda/core/_utils/_weak_handles.pyi
@@ -0,0 +1,55 @@
+# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_utils/_weak_handles.pyx
+
+"""Test-only weak handles for resource-handle lifetime checks.
+
+This module is **not** part of the public ``cuda.core`` API. It is built into
+the package (like other private ``_utils`` modules) purely so the test suite can
+observe, deterministically, when the strong references that keep a CUDA resource
+alive have all been released -- without relying on driver- or hardware-specific
+side effects (for example, whether freed device memory happens to remain
+readable).
+
+Every resource handle is owned by a C++ ``std::shared_ptr``. A **weak handle**
+is a non-owning ``std::weak_ptr`` observer of that control block: truthy while
+some strong owner remains, falsy once the last one is gone. Use :func:`weak_handle`
+to obtain a weak handle from a supported front-end object.
+
+To support another type, add a ``cdef _weak_from_<type>`` that reads its ``cdef``
+handle field (see ``*.pxd``), assigns to ``OpaqueHandle``, and extend the
+``isinstance`` chain in :func:`weak_handle`. Types whose slots hold arbitrary
+Python owners via ``make_opaque_py`` are not covered here -- use
+:class:`weakref.ref` on a weak-referenceable owner object in tests instead.
+"""
+from __future__ import annotations
+
+
+class WeakHandle:
+    """Non-owning weak handle for a resource's shared control block.
+
+    Truthy while some strong owner of the underlying resource handle remains,
+    falsy once the last strong reference is released. Obtain instances via
+    :func:`weak_handle` rather than constructing directly.
+    """
+
+    def __bool__(self):
+        ...
+
+    def expired(self):
+        """Return ``True`` once every strong owner of the handle is gone."""
+
+    def use_count(self):
+        """Number of strong owners currently sharing the handle."""
+
+def weak_handle(obj):
+    """Return a :class:`WeakHandle` observing the resource behind ``obj``.
+
+    Currently supports :class:`~cuda.core.Buffer` (device allocation handle).
+    See the module docstring for how to add more types.
+
+    Raises
+    ------
+    ValueError
+        If ``obj`` is a :class:`~cuda.core.Buffer` with no active allocation.
+    TypeError
+        If ``obj`` is not a supported type.
+    """
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_utils/_weak_handles.pyx b/cuda_core/cuda/core/_utils/_weak_handles.pyx
new file mode 100644
index 00000000000..65737b958a6
--- /dev/null
+++ b/cuda_core/cuda/core/_utils/_weak_handles.pyx
@@ -0,0 +1,106 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Test-only weak handles for resource-handle lifetime checks.
+
+This module is **not** part of the public ``cuda.core`` API. It is built into
+the package (like other private ``_utils`` modules) purely so the test suite can
+observe, deterministically, when the strong references that keep a CUDA resource
+alive have all been released -- without relying on driver- or hardware-specific
+side effects (for example, whether freed device memory happens to remain
+readable).
+
+Every resource handle is owned by a C++ ``std::shared_ptr``. A **weak handle**
+is a non-owning ``std::weak_ptr`` observer of that control block: truthy while
+some strong owner remains, falsy once the last one is gone. Use :func:`weak_handle`
+to obtain a weak handle from a supported front-end object.
+
+To support another type, add a ``cdef _weak_from_<type>`` that reads its ``cdef``
+handle field (see ``*.pxd``), assigns to ``OpaqueHandle``, and extend the
+``isinstance`` chain in :func:`weak_handle`. Types whose slots hold arbitrary
+Python owners via ``make_opaque_py`` are not covered here -- use
+:class:`weakref.ref` on a weak-referenceable owner object in tests instead.
+"""
+
+from cuda.core._memory._buffer cimport Buffer
+from cuda.core._resource_handles cimport OpaqueHandle
+
+
+# Cython cannot spell ``weak_ptr[const void]`` inline (the ``const void``
+# template argument fails to parse), so the weak type and its one constructor
+# are provided by a small inline C++ shim local to this test-only module. This
+# keeps the production resource_handles translation units untouched.
+cdef extern from *:
+    """
+    #include <memory>
+    namespace cuda_core_test {
+    using OpaqueWeakHandle = std::weak_ptr<const void>;
+    static inline OpaqueWeakHandle make_weak(const std::shared_ptr<const void>& h) {
+        return OpaqueWeakHandle(h);
+    }
+    }  // namespace cuda_core_test
+    """
+    cppclass OpaqueWeakHandle "cuda_core_test::OpaqueWeakHandle":
+        OpaqueWeakHandle()
+        bint expired()
+        long use_count()
+    OpaqueWeakHandle make_weak "cuda_core_test::make_weak" (const OpaqueHandle& h)
+
+
+cdef class WeakHandle:
+    """Non-owning weak handle for a resource's shared control block.
+
+    Truthy while some strong owner of the underlying resource handle remains,
+    falsy once the last strong reference is released. Obtain instances via
+    :func:`weak_handle` rather than constructing directly.
+    """
+
+    cdef OpaqueWeakHandle _w
+
+    def __bool__(self):
+        return not self._w.expired()
+
+    def expired(self):
+        """Return ``True`` once every strong owner of the handle is gone."""
+        return self._w.expired()
+
+    def use_count(self):
+        """Number of strong owners currently sharing the handle."""
+        return self._w.use_count()
+
+
+cdef WeakHandle _weak_from_opaque(OpaqueHandle h):
+    # Build the weak handle from a (temporary) strong handle. The strong copy
+    # lives only for the duration of this call, so it does not perturb the
+    # reference count the weak handle later reports.
+    cdef WeakHandle wh = WeakHandle.__new__(WeakHandle)
+    wh._w = make_weak(h)
+    return wh
+
+
+cdef WeakHandle _weak_from_buffer(Buffer buf):
+    cdef OpaqueHandle h = buf._h_ptr  # strong copy of the buffer's handle, held only for this call
+    if not h:  # empty handle: there is no control block to observe
+        raise ValueError("Buffer has no active allocation")
+    return _weak_from_opaque(h)
+
+
+def weak_handle(obj):
+    """Return a :class:`WeakHandle` observing the resource behind ``obj``.
+
+    Currently supports :class:`~cuda.core.Buffer` (device allocation handle).
+    See the module docstring for how to add more types.
+
+    Raises
+    ------
+    ValueError
+        If ``obj`` is a :class:`~cuda.core.Buffer` with no active allocation.
+    TypeError
+        If ``obj`` is not a supported type.
+    """
+    if not isinstance(obj, Buffer):
+        raise TypeError(
+            f"weak_handle() does not support {type(obj).__name__!r}; "
+            "supported types: Buffer"
+        )
+    return _weak_from_buffer(obj)
diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyi b/cuda_core/cuda/core/graph/_graph_builder.pyi
index af1748ad86c..fa89d835c2a 100644
--- a/cuda_core/cuda/core/graph/_graph_builder.pyi
+++ b/cuda_core/cuda/core/graph/_graph_builder.pyi
@@ -106,6 +106,18 @@ class GraphBuilder:
     to ambiguity. New graph builders should instead be created through a
     :obj:`~_device.Device`, or a :obj:`~_stream.stream` object.
 
+    .. note::
+
+        Operations recorded during capture reference your memory but do not
+        take ownership of it. As with ordinary stream work, you must keep the
+        operands alive for as long as the completed graph may execute -- for
+        example, the :obj:`~_memory.Buffer` objects passed to :func:`~launch`
+        or :meth:`~_memory.Buffer.copy_to`. Host callbacks added with
+        :meth:`callback` are the exception: the callable (and any copied
+        ``user_data``) are retained for the graph's lifetime. This differs from
+        building a graph explicitly with :class:`~graph.GraphDefinition`, which
+        retains the operands it is given.
+
     """
 
     def __init__(self):
@@ -129,7 +141,49 @@ class GraphBuilder:
     def is_join_required(self) -> bool:
         """Returns True if this graph builder must be joined before building is ended."""
 
-    def begin_building(self, mode: str | None='relaxed') -> GraphBuilder:
+    @property
+    def graph_definition(self) -> GraphDefinition:
+        """The captured graph as an explicit :class:`~graph.GraphDefinition`.
+
+        The returned :class:`~graph.GraphDefinition` is a view of the same
+        graph this builder is producing: nodes added through it appear in
+        subsequent :meth:`complete` and :meth:`debug_dot_print` calls, and
+        the view stays valid even after the builder is closed.
+
+        This lets you mix the capture and explicit APIs on a single graph,
+        for example to inspect what was captured, augment it with extra
+        nodes, or build a conditional body entirely with the explicit API.
+
+        Availability:
+
+        - **Primary builders** (created by :meth:`Device.create_graph_builder`
+          or :meth:`Stream.create_graph_builder`): only after
+          :meth:`end_building`.
+
+        - **Conditional-body builders** (returned by :meth:`if_then`,
+          :meth:`if_else`, :meth:`while_loop`, :meth:`switch`): both before
+          :meth:`begin_building` and after :meth:`end_building`. The body
+          graph already exists when the conditional is created, so you may
+          populate it through this view without ever calling
+          :meth:`begin_building` on the body builder.
+
+        - **Forked builders** (returned by :meth:`split`): never. Forked
+          builders share the primary builder's graph; access it through the
+          primary instead.
+
+        Returns
+        -------
+        GraphDefinition
+            A view of the graph being built.
+
+        Raises
+        ------
+        RuntimeError
+            If the builder is forked, currently building, or (for primary
+            builders) has not started building yet.
+        """
+
+    def begin_building(self, mode='relaxed') -> GraphBuilder:
         """Begins the building process.
 
         Build `mode` for controlling interaction with other API calls must be one of the following:
@@ -168,7 +222,7 @@ class GraphBuilder:
 
         """
 
-    def debug_dot_print(self, path: str, options: GraphDebugPrintOptions | None=None) -> None:
+    def debug_dot_print(self, path, options: GraphDebugPrintOptions | None=None):
         """Generates a DOT debug file for the graph builder.
 
         Parameters
@@ -200,7 +254,7 @@ class GraphBuilder:
         """
 
     @staticmethod
-    def join(*graph_builders: GraphBuilder) -> GraphBuilder:
+    def join(*graph_builders) -> GraphBuilder:
         """Joins multiple graph builders into a single graph builder.
 
         The returned builder inherits work dependencies from the provided builders.
@@ -223,7 +277,7 @@ class GraphBuilder:
     def _get_conditional_context(self) -> driver.CUcontext:
         ...
 
-    def create_condition(self, default_value: int | None=None) -> GraphCondition:
+    def create_condition(self, default_value=None) -> GraphCondition:
         """Create a condition variable for use with conditional nodes.
 
         The returned :class:`GraphCondition` object is passed to conditional-node
@@ -339,7 +393,7 @@ class GraphBuilder:
             The child graph builder. Must have finished building.
         """
 
-    def callback(self, fn, *, user_data=None) -> None:
+    def callback(self, fn, *, user_data=None):
         """Add a host callback to the graph during stream capture.
 
         The callback runs on the host CPU when the graph reaches this point
@@ -382,7 +436,7 @@ class Graph:
     def __init__(self):
         ...
 
-    def close(self) -> None:
+    def close(self):
         """Destroy the graph."""
 
     @property
@@ -409,7 +463,7 @@ class Graph:
 
         """
 
-    def upload(self, stream: Stream) -> None:
+    def upload(self, stream: Stream):
         """Uploads the graph in a stream.
 
         Parameters
@@ -419,7 +473,7 @@ class Graph:
 
         """
 
-    def launch(self, stream: Stream) -> None:
+    def launch(self, stream: Stream):
         """Launches the graph in a stream.
 
         Parameters
diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx
index c7b2ba5f74d..969c1caa478 100644
--- a/cuda_core/cuda/core/graph/_graph_builder.pyx
+++ b/cuda_core/cuda/core/graph/_graph_builder.pyx
@@ -3,17 +3,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
 
 from libc.stdint cimport intptr_t
 
 from cuda.bindings cimport cydriver
 
-from cuda.core.graph._graph_definition cimport GraphCondition
-from cuda.core.graph._utils cimport _attach_host_callback_to_graph
+from cuda.core.graph._graph_definition cimport GraphCondition, GraphDefinition
+from cuda.core.graph._host_callback cimport _attach_host_callback_owners, _resolve_host_callback
 from cuda.core._resource_handles cimport (
-    GraphHandle,
-    as_cu, as_py,
+    OpaqueHandle, as_cu, as_py,
     create_graph_exec_handle, create_graph_handle, create_graph_handle_ref,
 )
 from cuda.core._stream cimport Stream
@@ -26,9 +24,6 @@ from cuda.core._utils.cuda_utils import (
     handle_return,
 )
 
-if TYPE_CHECKING:
-    from cuda.core.graph._graph_definition import GraphDefinition
-
 __all__ = ['Graph', 'GraphBuilder', 'GraphCompleteOptions', 'GraphDebugPrintOptions']
 
 
@@ -171,9 +166,8 @@ def _instantiate_graph(h_graph, options: GraphCompleteOptions | None = None) ->
         params.flags = flags
 
     py_exec = handle_return(driver.cuGraphInstantiateWithParams(h_graph, params))
-    # Check result_out before wrapping the exec: on a non-SUCCESS result the exec
-    # may be invalid, and Graph._init's RAII deleter would call cuGraphExecDestroy
-    # on it during the exception unwind below.
+    # Check result_out before wrapping the exec: on a non-SUCCESS result the exec may be
+    # invalid, and Graph._init's RAII deleter would call cuGraphExecDestroy on it on unwind.
     if params.result_out == driver.CUgraphInstantiateResult.CUDA_GRAPH_INSTANTIATE_ERROR:
         raise RuntimeError(
             "Instantiation failed for an unexpected reason which is described in the return value of the function."
@@ -193,9 +187,7 @@ def _instantiate_graph(h_graph, options: GraphCompleteOptions | None = None) ->
         raise RuntimeError("One or more conditional handles are not associated with conditional builders.")
     elif params.result_out != driver.CUgraphInstantiateResult.CUDA_GRAPH_INSTANTIATE_SUCCESS:
         raise RuntimeError(f"Graph instantiation failed with unexpected error code: {params.result_out}")
-
-    c_exec = <cydriver.CUgraphExec><intptr_t>int(py_exec)
-    return Graph._init(c_exec)
+    return Graph._init(<cydriver.CUgraphExec><intptr_t>int(py_exec))
 
 
 # Distinguishes the three kinds of GraphBuilder, which differ in how they
@@ -228,8 +220,7 @@ cdef enum _BuilderKind:
 cdef enum _CaptureState:
     CAPTURE_NOT_STARTED = 0
     CAPTURING = 1
-    CAPTURE_ENDED = 2  # Finished, valid handle
-    CLOSED = 3         # No valid handle
+    CAPTURE_ENDED = 2
 
 
 cdef class GraphBuilder:
@@ -243,16 +234,32 @@ cdef class GraphBuilder:
     to ambiguity. New graph builders should instead be created through a
     :obj:`~_device.Device`, or a :obj:`~_stream.stream` object.
 
+    .. note::
+
+        Operations recorded during capture reference your memory but do not
+        take ownership of it. As with ordinary stream work, you must keep the
+        operands alive for as long as the completed graph may execute -- for
+        example, the :obj:`~_memory.Buffer` objects passed to :func:`~launch`
+        or :meth:`~_memory.Buffer.copy_to`. Host callbacks added with
+        :meth:`callback` are the exception: the callable (and any copied
+        ``user_data``) are retained for the graph's lifetime. This differs from
+        building a graph explicitly with :class:`~graph.GraphDefinition`, which
+        retains the operands it is given.
+
     """
 
     def __init__(self):
         raise NotImplementedError(
             "directly creating a GraphBuilder object can be ambiguous. Please either "
             "call Device.create_graph_builder() or stream.create_graph_builder()"
         )
 
     def __dealloc__(self):
-        GB_end_capture_if_needed(self, False)
+        # Note: _stream could be set to None by cyclic-GC tp_clear before
+        # __dealloc__, but _h_stream is guaranteed to be valid.
+        if self._h_stream and self._state == CAPTURING and self._kind != FORKED:
+            with nogil:
+                cydriver.cuStreamEndCapture(as_cu(self._h_stream), NULL)
 
     @staticmethod
     def _init(Stream stream):
@@ -266,10 +273,12 @@ cdef class GraphBuilder:
 
     def close(self):
         """Destroy the graph builder."""
-        GB_end_capture_if_needed(self, True)
+        if self._h_stream and self._state == CAPTURING and self._kind != FORKED:
+            with nogil:
+                HANDLE_RETURN(cydriver.cuStreamEndCapture(as_cu(self._h_stream), NULL))
         self._h_graph.reset()
         self._h_stream.reset()
-        self._state = CLOSED
+        self._state = CAPTURE_ENDED
         self._stream = None
 
     @property
@@ -282,7 +291,65 @@ cdef class GraphBuilder:
         """Returns True if this graph builder must be joined before building is ended."""
         return self._kind == FORKED
 
-    def begin_building(self, mode: str | None = "relaxed") -> GraphBuilder:
+    @property
+    def graph_definition(self) -> GraphDefinition:
+        """Expose the builder's graph as a :class:`~graph.GraphDefinition`.
+
+        The result is a view, not a copy: it refers to the very graph this
+        builder produces. Nodes added through the view show up in later
+        :meth:`complete` and :meth:`debug_dot_print` calls, and the view
+        remains valid even after the builder is closed.
+
+        Use it to combine the capture and explicit APIs on one graph -- to
+        inspect captured work, append extra nodes, or fill in a conditional
+        body purely with the explicit API.
+
+        When the view can be obtained depends on the kind of builder:
+
+        - **Primary builders** (from :meth:`Device.create_graph_builder` or
+          :meth:`Stream.create_graph_builder`): only after :meth:`end_building`.
+
+        - **Conditional-body builders** (from :meth:`if_then`,
+          :meth:`if_else`, :meth:`while_loop`, :meth:`switch`): before
+          :meth:`begin_building` as well as after :meth:`end_building`. The
+          body graph exists as soon as the conditional is created, so it can
+          be populated through the view without capturing at all.
+
+        - **Forked builders** (from :meth:`split`): never; they share the
+          primary builder's graph, so go through the primary instead.
+
+        Returns
+        -------
+        GraphDefinition
+            A view of the graph being built.
+
+        Raises
+        ------
+        RuntimeError
+            If the builder is forked, is currently building, or is a primary
+            builder on which building has not started.
+        """
+        reason = None
+        if self._kind == FORKED:
+            reason = (
+                "graph_definition is unavailable on forked graph builders; "
+                "access it through the primary builder instead."
+            )
+        elif self._state == CAPTURING:
+            reason = (
+                "graph_definition is unavailable while capture is in "
+                "progress; call end_building() first."
+            )
+        elif self._kind == PRIMARY and self._state == CAPTURE_NOT_STARTED:
+            reason = (
+                "graph_definition is unavailable before begin_building() on "
+                "a primary builder; no graph has been created yet."
+            )
+        if reason is not None:
+            raise RuntimeError(reason)
+        return GraphDefinition._from_handle(self._h_graph)
+
+    def begin_building(self, mode="relaxed") -> GraphBuilder:
         """Begins the building process.
 
         Build `mode` for controlling interaction with other API calls must be one of the following:
@@ -298,7 +365,6 @@ cdef class GraphBuilder:
             Default set to use relaxed.
 
         """
-        GB_check_open(self)
         if self._state != CAPTURE_NOT_STARTED:
             if self._state == CAPTURING:
                 raise RuntimeError("Graph builder is already building.")
@@ -322,25 +388,20 @@ cdef class GraphBuilder:
             with nogil:
                 HANDLE_RETURN(cydriver.cuStreamBeginCaptureToGraph(
                     c_stream, c_graph, NULL, NULL, 0, c_mode))
-            self._state = CAPTURING
         else:
             with nogil:
                 HANDLE_RETURN(cydriver.cuStreamBeginCapture(c_stream, c_mode))
-            # Capture is active now; set CAPTURING before the calls below so a
-            # failure in _get_capture_info/create_graph_handle still lets
-            # cleanup end the capture rather than leaving the stream poisoned.
-            self._state = CAPTURING
-            with nogil:
-                # The driver rejects a NULL captureStatus_out, so pass a
-                # stack-local even though we only want the graph handle.
+                # The driver rejects NULL captureStatus_out, so we pass a
+                # stack-local even though begin_capture just succeeded and we
+                # only care about the resulting graph handle.
                 _get_capture_info(c_stream, &c_status, &c_graph)
             self._h_graph = create_graph_handle(c_graph)
+        self._state = CAPTURING
         return self
 
     @property
     def is_building(self) -> bool:
         """Returns True if the graph builder is currently building."""
-        GB_check_open(self)
         cdef cydriver.CUstream c_stream = as_cu(self._h_stream)
         cdef cydriver.CUstreamCaptureStatus status
         with nogil:
@@ -358,13 +419,11 @@ cdef class GraphBuilder:
 
     def end_building(self) -> GraphBuilder:
         """Ends the building process."""
-        GB_check_open(self)
         if not self.is_building:
             raise RuntimeError("Graph builder is not building.")
         cdef cydriver.CUstream c_stream = as_cu(self._h_stream)
-        cdef cydriver.CUgraph c_graph
         with nogil:
-            HANDLE_RETURN(cydriver.cuStreamEndCapture(c_stream, &c_graph))
+            HANDLE_RETURN(cydriver.cuStreamEndCapture(c_stream, NULL))
 
         # TODO: Resolving https://github.com/NVIDIA/cuda-python/issues/617 would allow us to
         #       resume the build process after the first call to end_building()
@@ -385,13 +444,12 @@ cdef class GraphBuilder:
             The newly built graph.
 
         """
-        GB_check_open(self)
         if self._state != CAPTURE_ENDED:
             raise RuntimeError("Graph has not finished building.")
 
         return _instantiate_graph(as_py(self._h_graph), options)
 
-    def debug_dot_print(self, path: str, options: GraphDebugPrintOptions | None = None) -> None:
+    def debug_dot_print(self, path, options: GraphDebugPrintOptions | None = None):
         """Generates a DOT debug file for the graph builder.
 
         Parameters
@@ -402,12 +460,11 @@ cdef class GraphBuilder:
             Customizable dataclass for the debug print options.
 
         """
-        GB_check_open(self)
         if self._state != CAPTURE_ENDED:
             raise RuntimeError("Graph has not finished building.")
         cdef unsigned int c_flags = options._to_flags() if options else 0
         cdef cydriver.CUgraph c_graph = as_cu(self._h_graph)
-        cdef bytes b_path = path.encode('utf-8')
+        cdef bytes b_path = path.encode() if isinstance(path, str) else path
         cdef const char* c_path = b_path
         with nogil:
             HANDLE_RETURN(cydriver.cuGraphDebugDotPrint(c_graph, c_path, c_flags))
@@ -432,21 +489,18 @@ cdef class GraphBuilder:
         """
         if count < 2:
             raise ValueError(f"Invalid split count: expecting >= 2, got {count}")
-        GB_check_open(self)
-        if self._state != CAPTURING:
-            raise RuntimeError("Graph builder must be building before it can be split.")
 
         event = self._stream.record()
         result = [self]
         for i in range(count - 1):
             stream = self._stream.device.create_stream()
             stream.wait(event)
-            result.append(GB_init_forked(stream, self._h_graph))
+            result.append(_init_forked(stream, self._h_graph))
         event.close()
         return tuple(result)
 
     @staticmethod
-    def join(*graph_builders: GraphBuilder) -> GraphBuilder:
+    def join(*graph_builders) -> GraphBuilder:
         """Joins multiple graph builders into a single graph builder.
 
         The returned builder inherits work dependencies from the provided builders.
@@ -486,13 +540,12 @@ cdef class GraphBuilder:
 
     def __cuda_stream__(self) -> tuple[int, int]:
         """Return an instance of a __cuda_stream__ protocol."""
-        GB_check_open(self)
         return self.stream.__cuda_stream__()
 
     def _get_conditional_context(self) -> driver.CUcontext:
         return self._stream.context.handle
 
-    def create_condition(self, default_value: int | None = None) -> GraphCondition:
+    def create_condition(self, default_value=None) -> GraphCondition:
         """Create a condition variable for use with conditional nodes.
 
         The returned :class:`GraphCondition` object is passed to conditional-node
@@ -511,7 +564,6 @@ cdef class GraphBuilder:
         GraphCondition
             A condition variable for controlling conditional execution.
         """
-        GB_check_open(self)
         if cy_driver_version() < (12, 3, 0):
             raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional handles")
         if cy_binding_version() < (12, 3, 0):
@@ -551,7 +603,6 @@ cdef class GraphBuilder:
             The newly created conditional graph builder.
 
         """
-        GB_check_open(self)
         if cy_driver_version() < (12, 3, 0):
             raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional if")
         if cy_binding_version() < (12, 3, 0):
@@ -566,7 +617,7 @@ cdef class GraphBuilder:
         node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_IF
         node_params.conditional.size = 1
         node_params.conditional.ctx = self._get_conditional_context()
-        return GB_cond_with_params(self, node_params)[0]
+        return _cond_with_params(self, node_params)[0]
 
     def if_else(self, condition: GraphCondition) -> tuple[GraphBuilder, GraphBuilder]:
         """Adds an if-else condition branch and returns new graph builders for both branches.
@@ -588,7 +639,6 @@ cdef class GraphBuilder:
             A tuple of two new graph builders, one for the if branch and one for the else branch.
 
         """
-        GB_check_open(self)
         if cy_driver_version() < (12, 8, 0):
             raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional if-else")
         if cy_binding_version() < (12, 8, 0):
@@ -603,7 +653,7 @@ cdef class GraphBuilder:
         node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_IF
         node_params.conditional.size = 2
         node_params.conditional.ctx = self._get_conditional_context()
-        return GB_cond_with_params(self, node_params)
+        return _cond_with_params(self, node_params)
 
     def switch(self, condition: GraphCondition, count: int) -> tuple[GraphBuilder, ...]:
         """Adds a switch condition branch and returns new graph builders for all cases.
@@ -628,7 +678,6 @@ cdef class GraphBuilder:
             A tuple of new graph builders, one for each branch.
 
         """
-        GB_check_open(self)
         if cy_driver_version() < (12, 8, 0):
             raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional switch")
         if cy_binding_version() < (12, 8, 0):
@@ -643,7 +692,7 @@ cdef class GraphBuilder:
         node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_SWITCH
         node_params.conditional.size = count
         node_params.conditional.ctx = self._get_conditional_context()
-        return GB_cond_with_params(self, node_params)
+        return _cond_with_params(self, node_params)
 
     def while_loop(self, condition: GraphCondition) -> GraphBuilder:
         """Adds a while loop and returns a new graph builder for it.
@@ -665,7 +714,6 @@ cdef class GraphBuilder:
             The newly created while loop graph builder.
 
         """
-        GB_check_open(self)
         if cy_driver_version() < (12, 3, 0):
             raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional while loop")
         if cy_binding_version() < (12, 3, 0):
@@ -680,7 +728,7 @@ cdef class GraphBuilder:
         node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_WHILE
         node_params.conditional.size = 1
         node_params.conditional.ctx = self._get_conditional_context()
-        return GB_cond_with_params(self, node_params)[0]
+        return _cond_with_params(self, node_params)[0]
 
     def embed(self, GraphBuilder child):
         """Embed a previously-built :obj:`~graph.GraphBuilder` as a child node.
@@ -690,7 +738,6 @@ cdef class GraphBuilder:
         child : :obj:`~graph.GraphBuilder`
             The child graph builder. Must have finished building.
         """
-        GB_check_open(self)
         if child._state != CAPTURE_ENDED:
             raise ValueError("Child graph has not finished building.")
 
@@ -723,7 +770,7 @@ cdef class GraphBuilder:
             )
         )
 
-    def callback(self, fn, *, user_data=None) -> None:
+    def callback(self, fn, *, user_data=None):
         """Add a host callback to the graph during stream capture.
 
         The callback runs on the host CPU when the graph reaches this point
@@ -751,68 +798,37 @@ cdef class GraphBuilder:
             pointer (caller manages lifetime). If bytes-like, the data is
             copied and its lifetime is tied to the graph.
         """
-        GB_check_open(self)
         cdef Stream stream = self._stream
         cdef cydriver.CUstream c_stream = as_cu(stream._h_stream)
         cdef cydriver.CUstreamCaptureStatus capture_status
-        cdef cydriver.CUgraph c_graph = NULL
 
         with nogil:
-            _get_capture_info(c_stream, &capture_status, &c_graph)
+            _get_capture_info(c_stream, &capture_status, NULL)
 
         if capture_status != cydriver.CU_STREAM_CAPTURE_STATUS_ACTIVE:
             raise RuntimeError("Cannot add callback when graph is not being built")
 
         cdef cydriver.CUhostFn c_fn
         cdef void* c_user_data = NULL
-        _attach_host_callback_to_graph(c_graph, fn, user_data, &c_fn, &c_user_data)
+        cdef OpaqueHandle fn_owner, data_owner
+        _resolve_host_callback(fn, user_data, &c_fn, &c_user_data, &fn_owner, &data_owner)
 
         with nogil:
             HANDLE_RETURN(cydriver.cuLaunchHostFunc(c_stream, c_fn, c_user_data))
 
+        # Capturing the host function added a node to the graph; it is now the
+        # stream's sole capture dependency. Key the callback's owners to it so
+        # they live in the graph's slot table like any explicitly-added node.
+        cdef cydriver.CUgraphNode host_node = _capture_tail_node(c_stream)
+        _attach_host_callback_owners(self._h_graph, host_node, fn_owner, data_owner)
 
-cdef inline int GB_check_open(GraphBuilder gb) except -1:
-    """Reject operations on a builder that has been closed.
-
-    A CLOSED builder has reset its stream and graph handles, so any method
-    that dereferences them would read a null handle (or, for the cached
-    Stream, a None typed as cdef Stream). Guarding here yields a clear error
-    instead.
-    """
-    if gb._state == CLOSED:
-        raise RuntimeError("Graph builder has been closed.")
-    return 0
-
-
-cdef inline int GB_end_capture_if_needed(GraphBuilder gb, bint check_status) except -1 nogil:
-    """End an in-progress capture if this builder owns it.
 
-    Only a CAPTURING PRIMARY or CONDITIONAL_BODY builder owns the live
-    capture. A FORKED builder must not call cuStreamEndCapture: the driver
-    requires forked streams to be joined first.
-
-    check_status=True checks the driver return (close()); False ignores it
-    (__dealloc__).
-    """
-    cdef cydriver.CUgraph c_graph
-    cdef cydriver.CUresult err
-    cdef cydriver.CUstream c_stream
-    if gb._h_stream and gb._state == CAPTURING and gb._kind != FORKED:
-        c_stream = as_cu(gb._h_stream)
-        with nogil:
-            err = cydriver.cuStreamEndCapture(c_stream, &c_graph)
-            if check_status:
-                HANDLE_RETURN(err)
-    return 0
-
-
-cdef inline GraphBuilder GB_init_forked(Stream stream, GraphHandle h_primary_graph):
+cdef inline GraphBuilder _init_forked(Stream stream, GraphHandle h_graph):
     cdef GraphBuilder gb = GraphBuilder.__new__(GraphBuilder)
-    # A FORKED builder captures into the primary's CUgraph. It holds the
-    # primary's GraphHandle so conditional bodies created on it (via
-    # GB_init_conditional -> create_graph_handle_ref(cond_graph, parent._h_graph))
-    # have a valid parent handle to pin.
-    gb._h_graph = h_primary_graph
+    # Forked builders capture into the primary's graph. They share its handle
+    # so node attachments (e.g. callbacks) reach the same slot table; the FORKED
+    # kind still bars end_building()/graph_definition and graph destruction.
+    gb._h_graph = h_graph
     gb._h_stream = stream._h_stream
     gb._kind = FORKED
     gb._state = CAPTURING
@@ -820,7 +836,7 @@ cdef inline GraphBuilder GB_init_forked(Stream stream, GraphHandle h_primary_gra
     return gb
 
 
-cdef inline GraphBuilder GB_init_conditional(Stream stream, cydriver.CUgraph cond_graph, GraphBuilder parent):
+cdef inline GraphBuilder _init_conditional(Stream stream, cydriver.CUgraph cond_graph, GraphBuilder parent):
     cdef GraphBuilder gb = GraphBuilder.__new__(GraphBuilder)
     gb._h_graph = create_graph_handle_ref(cond_graph, parent._h_graph)
     gb._h_stream = stream._h_stream
@@ -849,7 +865,29 @@ cdef inline int _get_capture_info(
             stream, status, NULL, graph, NULL, NULL))
 
 
-cdef inline tuple GB_cond_with_params(GraphBuilder gb, node_params):
+cdef inline cydriver.CUgraphNode _capture_tail_node(cydriver.CUstream stream) except *:
+    """Return the sole capture dependency of ``stream``: the node left by a
+    freshly-captured single-node operation (e.g. the host node added by
+    ``cuLaunchHostFunc``), which the next captured op would depend on.
+    Raises RuntimeError unless exactly one dependency is reported.
+    """
+    cdef cydriver.CUstreamCaptureStatus status
+    cdef const cydriver.CUgraphNode* deps = NULL
+    cdef size_t num_deps = 0
+    with nogil:
+        IF CUDA_CORE_BUILD_MAJOR >= 13:
+            HANDLE_RETURN(cydriver.cuStreamGetCaptureInfo(
+                stream, &status, NULL, NULL, &deps, NULL, &num_deps))
+        ELSE:
+            HANDLE_RETURN(cydriver.cuStreamGetCaptureInfo(
+                stream, &status, NULL, NULL, &deps, &num_deps))
+    if num_deps != 1:
+        raise RuntimeError(
+            f"expected exactly one capture dependency after a host callback, got {num_deps}")
+    return <cydriver.CUgraphNode>deps[0]
+
+
+cdef inline tuple _cond_with_params(GraphBuilder gb, node_params):
     status, _, graph, *deps_info, num_dependencies = handle_return(
         driver.cuStreamGetCaptureInfo(gb._stream.handle)
     )
@@ -870,7 +908,7 @@ cdef inline tuple GB_cond_with_params(GraphBuilder gb, node_params):
     )
 
     return tuple(
-        GB_init_conditional(
+        _init_conditional(
             gb._stream.device.create_stream(),
             <cydriver.CUgraph><intptr_t>int(node_params.conditional.phGraph_out[i]),
             gb,
@@ -899,7 +937,7 @@ cdef class Graph:
         self._h_graph_exec = create_graph_exec_handle(graph_exec)
         return self
 
-    def close(self) -> None:
+    def close(self):
         """Destroy the graph."""
         self._h_graph_exec.reset()
 
@@ -933,8 +971,6 @@ cdef class Graph:
         cdef cydriver.CUgraphExec cu_exec = as_cu(self._h_graph_exec)
 
         if isinstance(source, GraphBuilder):
-            if (<GraphBuilder>source)._state == CLOSED:
-                raise ValueError("Source graph builder has been closed.")
             if (<GraphBuilder>source)._state != CAPTURE_ENDED:
                 raise ValueError("Graph has not finished building.")
             cu_graph = as_cu((<GraphBuilder>source)._h_graph)
@@ -954,7 +990,7 @@ cdef class Graph:
             raise CUDAError(msg)
         HANDLE_RETURN(err)
 
-    def upload(self, stream: Stream) -> None:
+    def upload(self, stream: Stream):
         """Uploads the graph in a stream.
 
         Parameters
@@ -968,7 +1004,7 @@ cdef class Graph:
         with nogil:
             HANDLE_RETURN(cydriver.cuGraphUpload(c_exec, c_stream))
 
-    def launch(self, stream: Stream) -> None:
+    def launch(self, stream: Stream):
         """Launches the graph in a stream.
 
         Parameters
diff --git a/cuda_core/cuda/core/graph/_graph_definition.pyi b/cuda_core/cuda/core/graph/_graph_definition.pyi
index 15f34cec9ab..efbf17abbad 100644
--- a/cuda_core/cuda/core/graph/_graph_definition.pyi
+++ b/cuda_core/cuda/core/graph/_graph_definition.pyi
@@ -85,7 +85,7 @@ class GraphDefinition:
         See :meth:`GraphNode.deallocate` for full documentation.
         """
 
-    def memset(self, dst: int, value, width: int, height: int=1, pitch: int=0) -> MemsetNode:
+    def memset(self, dst: int, value, width: int, height: int=1, pitch: int=0, *, dst_owner=None) -> MemsetNode:
         """Add an entry-point memset node (no dependencies).
 
         See :meth:`GraphNode.memset` for full documentation.
@@ -120,7 +120,7 @@ class GraphDefinition:
             A new EmptyNode that depends on all input nodes.
         """
 
-    def memcpy(self, dst: int, src: int, size: int) -> MemcpyNode:
+    def memcpy(self, dst: int, src: int, size: int, *, dst_owner=None, src_owner=None) -> MemcpyNode:
         """Add an entry-point memcpy node (no dependencies).
 
         See :meth:`GraphNode.memcpy` for full documentation.
diff --git a/cuda_core/cuda/core/graph/_graph_definition.pyx b/cuda_core/cuda/core/graph/_graph_definition.pyx
index 1ec56978327..9774c9899e4 100644
--- a/cuda_core/cuda/core/graph/_graph_definition.pyx
+++ b/cuda_core/cuda/core/graph/_graph_definition.pyx
@@ -160,13 +160,17 @@ cdef class GraphDefinition:
         value,
         size_t width,
         size_t height=1,
-        size_t pitch=0
+        size_t pitch=0,
+        *,
+        dst_owner=None,
     ) -> MemsetNode:
         """Add an entry-point memset node (no dependencies).
 
         See :meth:`GraphNode.memset` for full documentation.
         """
-        return self._entry.memset(dst, value, width, height, pitch)
+        return self._entry.memset(
+            dst, value, width, height=height, pitch=pitch, dst_owner=dst_owner
+        )
 
     def launch(self, config: LaunchConfig, kernel: Kernel, *args) -> KernelNode:
         """Add an entry-point kernel launch node (no dependencies).
@@ -200,12 +204,22 @@ cdef class GraphDefinition:
         """
         return self._entry.join(*nodes)
 
-    def memcpy(self, dst: int, src: int, size_t size) -> MemcpyNode:
+    def memcpy(
+        self,
+        dst: int,
+        src: int,
+        size_t size,
+        *,
+        dst_owner=None,
+        src_owner=None,
+    ) -> MemcpyNode:
         """Add an entry-point memcpy node (no dependencies).
 
         See :meth:`GraphNode.memcpy` for full documentation.
         """
-        return self._entry.memcpy(dst, src, size)
+        return self._entry.memcpy(
+            dst, src, size, dst_owner=dst_owner, src_owner=src_owner
+        )
 
     def embed(self, child: GraphDefinition) -> ChildGraphNode:
         """Add an entry-point child graph node (no dependencies).
diff --git a/cuda_core/cuda/core/graph/_graph_node.pyi b/cuda_core/cuda/core/graph/_graph_node.pyi
index 3e701fe3897..a923f198c49 100644
--- a/cuda_core/cuda/core/graph/_graph_node.pyi
+++ b/cuda_core/cuda/core/graph/_graph_node.pyi
@@ -4,20 +4,20 @@
 from __future__ import annotations
 
 import weakref
-from collections.abc import Iterable
 
 from cuda.core._device import Device
 from cuda.core._event import Event
 from cuda.core._launch_config import LaunchConfig
+from cuda.core._memory._buffer import Buffer
 from cuda.core._module import Kernel
 from cuda.core._utils.cuda_utils import driver
-from cuda.core.graph._adjacency_set_proxy import AdjacencySetProxy
 from cuda.core.graph._graph_definition import GraphCondition, GraphDefinition
 from cuda.core.graph._subclasses import (AllocNode, ChildGraphNode, EmptyNode,
                                          EventRecordNode, EventWaitNode,
-                                         FreeNode, IfElseNode, IfNode,
-                                         KernelNode, MemcpyNode, MemsetNode,
-                                         SwitchNode, WhileNode)
+                                         FreeNode, HostCallbackNode,
+                                         IfElseNode, IfNode, KernelNode,
+                                         MemcpyNode, MemsetNode, SwitchNode,
+                                         WhileNode)
 from cuda.core.typing import GraphMemoryType
 
 
@@ -32,14 +32,14 @@ class GraphNode:
     def __repr__(self) -> str:
         ...
 
-    def __eq__(self, other: object) -> bool:
+    def __eq__(self, other) -> bool:
         ...
 
     def __hash__(self) -> int:
         ...
 
     @property
-    def type(self) -> driver.CUgraphNodeType | None:
+    def type(self):
         """Return the CUDA graph node type.
 
         Returns
@@ -49,7 +49,7 @@ class GraphNode:
         """
 
     @property
-    def graph(self) -> GraphDefinition:
+    def graph(self) -> 'GraphDefinition':
         """Return the GraphDefinition this node belongs to."""
 
     @property
@@ -60,13 +60,13 @@ class GraphNode:
         """
 
     @property
-    def is_valid(self) -> bool:
+    def is_valid(self):
         """Whether this node is valid (not destroyed).
 
         Returns ``False`` after :meth:`destroy` has been called.
         """
 
-    def destroy(self) -> None:
+    def destroy(self):
         """Destroy this node and remove all its edges from the parent graph.
 
         After this call, :attr:`is_valid` returns ``False`` and the node
@@ -75,19 +75,19 @@ class GraphNode:
         """
 
     @property
-    def pred(self) -> AdjacencySetProxy:
+    def pred(self):
         """A mutable set-like view of this node's predecessors."""
 
     @pred.setter
-    def pred(self, value: Iterable[GraphNode]) -> None:
+    def pred(self, value):
         ...
 
     @property
-    def succ(self) -> AdjacencySetProxy:
+    def succ(self):
         """A mutable set-like view of this node's successors."""
 
     @succ.setter
-    def succ(self, value: Iterable[GraphNode]) -> None:
+    def succ(self, value):
         ...
 
     def launch(self, config: LaunchConfig, kernel: Kernel, *args) -> KernelNode:
@@ -178,13 +178,16 @@ class GraphNode:
             A new FreeNode representing the free operation.
         """
 
-    def memset(self, dst: int, value, width: int, height: int=1, pitch: int=0) -> MemsetNode:
+    def memset(self, dst: Buffer | int, value, width: int, *, height: int=1, pitch: int=0, dst_owner=None) -> MemsetNode:
         """Add a memset node depending on this node.
 
         Parameters
         ----------
-        dst : int
-            Destination device pointer.
+        dst : Buffer or int
+            Destination. When ``dst`` is a :class:`Buffer`, the underlying
+            allocation is retained for the graph's lifetime. A raw pointer
+            (``int``) is used as-is; the caller must keep the underlying memory
+            alive, or supply ``dst_owner`` to have the graph retain it.
         value : int or buffer-protocol object
             Fill value. int for 1-byte fill (range [0, 256)),
             or buffer-protocol object of 1, 2, or 4 bytes.
@@ -194,14 +197,23 @@ class GraphNode:
             Number of rows (default 1).
         pitch : int, optional
             Pitch of destination in bytes (default 0, unused if height is 1).
+        dst_owner : object, optional
+            Object retained for the graph's lifetime when ``dst`` is a raw
+            pointer. A :class:`Buffer` owner retains its underlying allocation,
+            not the wrapper. Must not be passed when ``dst`` is a :class:`Buffer`.
 
         Returns
         -------
         MemsetNode
             A new MemsetNode representing the memset operation.
+
+        Raises
+        ------
+        ValueError
+            If ``dst_owner`` is given together with a :class:`Buffer` ``dst``.
         """
 
-    def memcpy(self, dst: int, src: int, size: int) -> MemcpyNode:
+    def memcpy(self, dst: Buffer | int, src: Buffer | int, size: int, *, dst_owner=None, src_owner=None) -> MemcpyNode:
         """Add a memcpy node depending on this node.
 
         Copies ``size`` bytes from ``src`` to ``dst``. Memory types are
@@ -210,17 +222,35 @@ class GraphNode:
 
         Parameters
         ----------
-        dst : int
-            Destination pointer (device or pinned host).
-        src : int
-            Source pointer (device or pinned host).
+        dst : Buffer or int
+            Destination (device or pinned host). When a :class:`Buffer` is given,
+            the underlying allocation is retained for the graph's lifetime. A raw
+            pointer (``int``) is used as-is; the caller must keep the underlying
+            memory alive, or supply ``dst_owner`` to have the graph retain it.
+        src : Buffer or int
+            Source (device or pinned host). Same retention rules as ``dst``;
+            use ``src_owner`` for a raw pointer.
         size : int
             Number of bytes to copy.
+        dst_owner : object, optional
+            Object retained for the graph's lifetime when ``dst`` is a raw
+            pointer. A :class:`Buffer` owner retains its underlying allocation.
+            Must not be passed when ``dst`` is a :class:`Buffer`.
+        src_owner : object, optional
+            Object retained for the graph's lifetime when ``src`` is a raw
+            pointer. A :class:`Buffer` owner retains its underlying allocation.
+            Must not be passed when ``src`` is a :class:`Buffer`.
 
         Returns
         -------
         MemcpyNode
             A new MemcpyNode representing the copy operation.
+
+        Raises
+        ------
+        ValueError
+            If ``dst_owner`` or ``src_owner`` is given together with a
+            :class:`Buffer` ``dst`` or ``src`` respectively.
         """
 
     def embed(self, child: GraphDefinition) -> ChildGraphNode:
@@ -269,7 +299,7 @@ class GraphNode:
             A new EventWaitNode representing the event wait operation.
         """
 
-    def callback(self, fn, *, user_data=None) -> object:
+    def callback(self, fn, *, user_data=None) -> HostCallbackNode:
         """Add a host callback node depending on this node.
 
         The callback runs on the host CPU when the graph reaches this node.
diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx
index 53145dd5e2a..8fe06cee638 100644
--- a/cuda_core/cuda/core/graph/_graph_node.pyx
+++ b/cuda_core/cuda/core/graph/_graph_node.pyx
@@ -6,11 +6,8 @@
 
 from __future__ import annotations
 
-from collections.abc import Iterable
 from typing import TYPE_CHECKING
 
-from cpython.ref cimport Py_INCREF
-
 from libc.stddef cimport size_t
 from libc.stdint cimport uintptr_t
 from libc.string cimport memset as c_memset
@@ -22,6 +19,7 @@ from cuda.bindings cimport cydriver
 from cuda.core._event cimport Event
 from cuda.core._kernel_arg_handler cimport ParamHolder
 from cuda.core._launch_config cimport LaunchConfig
+from cuda.core._memory._buffer cimport Buffer
 from cuda.core._module cimport Kernel
 from cuda.core.graph._graph_definition cimport GraphCondition, GraphDefinition
 from cuda.core.graph._subclasses cimport (
@@ -42,26 +40,27 @@ from cuda.core.graph._subclasses cimport (
     WhileNode,
 )
 from cuda.core._resource_handles cimport (
-    EventHandle,
     GraphHandle,
     GraphNodeHandle,
-    KernelHandle,
+    OpaqueHandle,
     as_cu,
     as_intptr,
     as_py,
     create_graph_handle_ref,
     create_graph_node_handle,
     graph_node_get_graph,
+    graph_set_slot,
     invalidate_graph_node,
-    py_object_user_object_destroy,
+    make_opaque_py,
 )
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value
 
-from cuda.core.graph._utils cimport (
-    _attach_host_callback_to_graph,
-    _attach_user_object,
+from cuda.core.graph._host_callback cimport (
+    _attach_host_callback_owners,
+    _resolve_host_callback,
 )
 
+import ctypes as ct
 import weakref
 
 from cuda.core.graph._adjacency_set_proxy import AdjacencySetProxy
@@ -78,7 +77,8 @@ _node_registry: weakref.WeakValueDictionary[int, GraphNode] = weakref.WeakValueD
 
 
 cdef inline GraphNode _registered(GraphNode n):
-    return _node_registry.setdefault(<uintptr_t>n._h_node.get(), n)
+    _node_registry[<uintptr_t>n._h_node.get()] = n  # plain store: overwrites any entry already under this key
+    return n  # always the wrapper that was passed in
 
 
 cdef class GraphNode:
@@ -100,7 +100,7 @@ cdef class GraphNode:
             return "<GraphNode entry>"
         return f"<GraphNode handle=0x{<uintptr_t>node:x}>"
 
-    def __eq__(self, other: object) -> bool:
+    def __eq__(self, other) -> bool:
         if not isinstance(other, GraphNode):
             return NotImplemented
         cdef GraphNode o = <GraphNode>other
@@ -114,7 +114,7 @@ cdef class GraphNode:
         return hash((as_intptr(self._h_node), as_intptr(g)))
 
     @property
-    def type(self) -> driver.CUgraphNodeType | None:
+    def type(self):
         """Return the CUDA graph node type.
 
         Returns
@@ -131,7 +131,7 @@ cdef class GraphNode:
         return driver.CUgraphNodeType(<int>node_type)
 
     @property
-    def graph(self) -> GraphDefinition:
+    def graph(self) -> "GraphDefinition":
         """Return the GraphDefinition this node belongs to."""
         return GraphDefinition._from_handle(graph_node_get_graph(self._h_node))
 
@@ -144,14 +144,14 @@ cdef class GraphNode:
         return as_py(self._h_node)
 
     @property
-    def is_valid(self) -> bool:
+    def is_valid(self):
         """Whether this node is valid (not destroyed).
 
         Returns ``False`` after :meth:`destroy` has been called.
         """
         return as_intptr(self._h_node) != 0
 
-    def destroy(self) -> None:
+    def destroy(self):
         """Destroy this node and remove all its edges from the parent graph.
 
         After this call, :attr:`is_valid` returns ``False`` and the node
@@ -167,23 +167,23 @@ cdef class GraphNode:
         invalidate_graph_node(self._h_node)
 
     @property
-    def pred(self) -> AdjacencySetProxy:
+    def pred(self):
         """A mutable set-like view of this node's predecessors."""
         return AdjacencySetProxy(self, False)
 
     @pred.setter
-    def pred(self, value: Iterable[GraphNode]) -> None:
+    def pred(self, value):
         p = AdjacencySetProxy(self, False)
         p.clear()
         p.update(value)
 
     @property
-    def succ(self) -> AdjacencySetProxy:
+    def succ(self):
         """A mutable set-like view of this node's successors."""
         return AdjacencySetProxy(self, True)
 
     @succ.setter
-    def succ(self, value: Iterable[GraphNode]) -> None:
+    def succ(self, value):
         s = AdjacencySetProxy(self, True)
         s.clear()
         s.update(value)
@@ -282,13 +282,25 @@ cdef class GraphNode:
         """
         return GN_free(self, <cydriver.CUdeviceptr>dptr)
 
-    def memset(self, dst: int, value, size_t width, size_t height=1, size_t pitch=0) -> MemsetNode:
+    def memset(
+        self,
+        dst: Buffer | int,
+        value,
+        size_t width,
+        *,
+        size_t height=1,
+        size_t pitch=0,
+        dst_owner=None,
+    ) -> MemsetNode:
         """Add a memset node depending on this node.
 
         Parameters
         ----------
-        dst : int
-            Destination device pointer.
+        dst : Buffer or int
+            Destination. When ``dst`` is a :class:`Buffer`, the underlying
+            allocation is retained for the graph's lifetime. A raw pointer
+            (``int``) is used as-is; the caller must keep the underlying memory
+            alive, or supply ``dst_owner`` to have the graph retain it.
         value : int or buffer-protocol object
             Fill value. int for 1-byte fill (range [0, 256)),
             or buffer-protocol object of 1, 2, or 4 bytes.
@@ -298,18 +310,38 @@ cdef class GraphNode:
             Number of rows (default 1).
         pitch : int, optional
             Pitch of destination in bytes (default 0, unused if height is 1).
+        dst_owner : object, optional
+            Object retained for the graph's lifetime when ``dst`` is a raw
+            pointer. A :class:`Buffer` owner retains its underlying allocation,
+            not the wrapper. Must not be passed when ``dst`` is a :class:`Buffer`.
 
         Returns
         -------
         MemsetNode
             A new MemsetNode representing the memset operation.
+
+        Raises
+        ------
+        ValueError
+            If ``dst_owner`` is given together with a :class:`Buffer` ``dst``.
         """
+        cdef cydriver.CUdeviceptr c_dst
         cdef unsigned int val
         cdef unsigned int elem_size
+        cdef OpaqueHandle dst_slot_owner
+        dst_slot_owner = _resolve_memcpy_operand(dst, dst_owner, "dst", &c_dst)
         val, elem_size = _parse_fill_value(value)
-        return GN_memset(self, <cydriver.CUdeviceptr>dst, val, elem_size, width, height, pitch)
-
-    def memcpy(self, dst: int, src: int, size_t size) -> MemcpyNode:
+        return GN_memset(self, c_dst, dst_slot_owner, val, elem_size, width, height, pitch)
+
+    def memcpy(
+        self,
+        dst: Buffer | int,
+        src: Buffer | int,
+        size_t size,
+        *,
+        dst_owner=None,
+        src_owner=None,
+    ) -> MemcpyNode:
         """Add a memcpy node depending on this node.
 
         Copies ``size`` bytes from ``src`` to ``dst``. Memory types are
@@ -318,19 +350,42 @@ cdef class GraphNode:
 
         Parameters
         ----------
-        dst : int
-            Destination pointer (device or pinned host).
-        src : int
-            Source pointer (device or pinned host).
+        dst : Buffer or int
+            Destination (device or pinned host). When a :class:`Buffer` is given,
+            the underlying allocation is retained for the graph's lifetime. A raw
+            pointer (``int``) is used as-is; the caller must keep the underlying
+            memory alive, or supply ``dst_owner`` to have the graph retain it.
+        src : Buffer or int
+            Source (device or pinned host). Same retention rules as ``dst``;
+            use ``src_owner`` for a raw pointer.
         size : int
             Number of bytes to copy.
+        dst_owner : object, optional
+            Object retained for the graph's lifetime when ``dst`` is a raw
+            pointer. A :class:`Buffer` owner retains its underlying allocation.
+            Must not be passed when ``dst`` is a :class:`Buffer`.
+        src_owner : object, optional
+            Object retained for the graph's lifetime when ``src`` is a raw
+            pointer. A :class:`Buffer` owner retains its underlying allocation.
+            Must not be passed when ``src`` is a :class:`Buffer`.
 
         Returns
         -------
         MemcpyNode
             A new MemcpyNode representing the copy operation.
+
+        Raises
+        ------
+        ValueError
+            If ``dst_owner`` or ``src_owner`` is given together with a
+            :class:`Buffer` ``dst`` or ``src`` respectively.
         """
-        return GN_memcpy(self, <cydriver.CUdeviceptr>dst, <cydriver.CUdeviceptr>src, size)
+        cdef cydriver.CUdeviceptr c_dst
+        cdef cydriver.CUdeviceptr c_src
+        cdef OpaqueHandle dst_slot_owner, src_slot_owner
+        dst_slot_owner = _resolve_memcpy_operand(dst, dst_owner, "dst", &c_dst)
+        src_slot_owner = _resolve_memcpy_operand(src, src_owner, "src", &c_src)
+        return GN_memcpy(self, c_dst, dst_slot_owner, c_src, src_slot_owner, size)
 
     def embed(self, child: GraphDefinition) -> ChildGraphNode:
         """Add a child graph node depending on this node.
@@ -381,7 +436,7 @@ cdef class GraphNode:
         """
         return GN_wait_event(self, <Event>event)
 
-    def callback(self, fn, *, user_data=None) -> object:
+    def callback(self, fn, *, user_data=None) -> HostCallbackNode:
         """Add a host callback node depending on this node.
 
         The callback runs on the host CPU when the graph reaches this node.
@@ -500,16 +555,6 @@ cdef class GraphNode:
             cydriver.CU_GRAPH_COND_TYPE_SWITCH, count, SwitchNode)
 
 
-cdef void _destroy_event_handle_copy(void* ptr) noexcept nogil:
-    cdef EventHandle* p = <EventHandle*>ptr
-    del p
-
-
-cdef void _destroy_kernel_handle_copy(void* ptr) noexcept nogil:
-    cdef KernelHandle* p = <KernelHandle*>ptr
-    del p
-
-
 cdef inline ConditionalNode _make_conditional_node(
         GraphNode pred,
         GraphCondition condition,
@@ -626,6 +671,7 @@ cdef inline KernelNode GN_launch(GraphNode self, LaunchConfig conf, Kernel ker,
     cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node)
     cdef cydriver.CUgraphNode* deps = NULL
     cdef size_t num_deps = 0
+    cdef OpaqueHandle owner
 
     if pred_node != NULL:
         deps = &pred_node
@@ -648,14 +694,16 @@ cdef inline KernelNode GN_launch(GraphNode self, LaunchConfig conf, Kernel ker,
         HANDLE_RETURN(cydriver.cuGraphAddKernelNode(
             &new_node, as_cu(h_graph), deps, num_deps, &node_params))
 
-    _attach_user_object(as_cu(h_graph), <void*>new KernelHandle(ker._h_kernel),
-                        <cydriver.CUhostFn>_destroy_kernel_handle_copy)
-
-    cdef object kernel_args = ker_args.kernel_args
+    # Slot 0 keeps the kernel loaded; slot 1 keeps the Python kernel-argument
+    # objects (notably device Buffers) alive for the graph's lifetime. The
+    # driver copies argument values into the node at add time but does not own
+    # the device memory they reference.
+    owner = ker._h_kernel
+    HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, owner))
+    kernel_args = ker_args.kernel_args
     if kernel_args is not None:
-        Py_INCREF(kernel_args)
-        _attach_user_object(as_cu(h_graph), <void*>kernel_args,
-                            <cydriver.CUhostFn>py_object_user_object_destroy)
+        owner = make_opaque_py(kernel_args)
+        HANDLE_RETURN(graph_set_slot(h_graph, new_node, 1, owner))
 
     return _registered(KernelNode._create_with_params(
         create_graph_node_handle(new_node, h_graph),
@@ -784,8 +832,52 @@ cdef inline FreeNode GN_free(GraphNode self, cydriver.CUdeviceptr c_dptr):
     return _registered(FreeNode._create_with_params(create_graph_node_handle(new_node, h_graph), c_dptr))
 
 
+cdef inline OpaqueHandle _buffer_slot_owner(Buffer buf, str label):
+    """Return a slot owner copied from ``buf._h_ptr``; ValueError if that handle is empty."""
+    cdef OpaqueHandle slot_owner
+    if not buf._h_ptr:  # falsy handle: there is nothing to retain
+        raise ValueError(f"{label} Buffer has no active allocation")
+    slot_owner = buf._h_ptr  # typed handle assigned into the type-erased OpaqueHandle
+    return slot_owner
+
+
+cdef inline OpaqueHandle _resolve_memcpy_operand(
+        object operand, object owner, str side, cydriver.CUdeviceptr* out_ptr):
+    """Resolve a memcpy/memset operand to a pointer and optional slot owner.
+
+    ``operand`` is a :class:`Buffer` or a raw integer address; its device
+    pointer is written to ``out_ptr``. For a :class:`Buffer` operand, returns an
+    owner that retains the underlying allocation (not the wrapper). For a raw
+    pointer, returns an owner built from ``owner`` (or an empty handle when
+    ``owner`` is ``None``); a :class:`Buffer` ``owner`` is retained the same way.
+
+    Raises
+    ------
+    ValueError
+        If ``operand`` is a :class:`Buffer` and ``owner`` is not ``None``.
+        If a :class:`Buffer` operand or ``*_owner`` has no active allocation.
+    """
+    cdef Buffer buf
+    cdef OpaqueHandle slot_owner
+
+    if isinstance(operand, Buffer):
+        if owner is not None:
+            raise ValueError(f"{side}_owner cannot be used when {side} is a Buffer")
+        buf = operand
+        # Validate the allocation first: this raises before out_ptr is written.
+        slot_owner = _buffer_slot_owner(buf, side)
+        out_ptr[0] = as_cu(buf._h_ptr)
+        return slot_owner
+    out_ptr[0] = <cydriver.CUdeviceptr><uintptr_t>operand
+    if owner is None:
+        return OpaqueHandle()
+    if isinstance(owner, Buffer):
+        return _buffer_slot_owner(owner, f"{side}_owner")
+    return make_opaque_py(owner)
+
+
 cdef inline MemsetNode GN_memset(
-        GraphNode self, cydriver.CUdeviceptr c_dst,
+        GraphNode self, cydriver.CUdeviceptr c_dst, OpaqueHandle dst_owner,
         unsigned int val, unsigned int elem_size,
         size_t width, size_t height, size_t pitch):
     cdef cydriver.CUDA_MEMSET_NODE_PARAMS memset_params
@@ -816,14 +908,18 @@ cdef inline MemsetNode GN_memset(
             &new_node, as_cu(h_graph), deps, num_deps,
             &memset_params, ctx))
 
+    # Retain the destination allocation for the graph's lifetime (slot 0).
+    if dst_owner:
+        HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, dst_owner))
+
     return _registered(MemsetNode._create_with_params(
         create_graph_node_handle(new_node, h_graph), c_dst,
         val, elem_size, width, height, pitch))
 
 
 cdef inline MemcpyNode GN_memcpy(
-        GraphNode self, cydriver.CUdeviceptr c_dst,
-        cydriver.CUdeviceptr c_src, size_t size):
+        GraphNode self, cydriver.CUdeviceptr c_dst, OpaqueHandle dst_owner,
+        cydriver.CUdeviceptr c_src, OpaqueHandle src_owner, size_t size):
     cdef unsigned int dst_mem_type = cydriver.CU_MEMORYTYPE_DEVICE
     cdef unsigned int src_mem_type = cydriver.CU_MEMORYTYPE_DEVICE
     cdef cydriver.CUresult ret
@@ -877,6 +973,12 @@ cdef inline MemcpyNode GN_memcpy(
         HANDLE_RETURN(cydriver.cuGraphAddMemcpyNode(
             &new_node, as_cu(h_graph), deps, num_deps, &params, ctx))
 
+    # Retain operand allocations for the graph's lifetime (dst -> slot 0, src -> slot 1).
+    if dst_owner:
+        HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, dst_owner))
+    if src_owner:
+        HANDLE_RETURN(graph_set_slot(h_graph, new_node, 1, src_owner))
+
     return _registered(MemcpyNode._create_with_params(
         create_graph_node_handle(new_node, h_graph), c_dst, c_src, size,
         c_dst_type, c_src_type))
@@ -914,6 +1016,7 @@ cdef inline EventRecordNode GN_record_event(GraphNode self, Event ev):
     cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node)
     cdef cydriver.CUgraphNode* deps = NULL
     cdef size_t num_deps = 0
+    cdef OpaqueHandle owner
 
     if pred_node != NULL:
         deps = &pred_node
@@ -923,8 +1026,8 @@ cdef inline EventRecordNode GN_record_event(GraphNode self, Event ev):
         HANDLE_RETURN(cydriver.cuGraphAddEventRecordNode(
             &new_node, as_cu(h_graph), deps, num_deps, as_cu(ev._h_event)))
 
-    _attach_user_object(as_cu(h_graph), <void*>new EventHandle(ev._h_event),
-                        <cydriver.CUhostFn>_destroy_event_handle_copy)
+    owner = ev._h_event
+    HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, owner))
 
     return _registered(EventRecordNode._create_with_params(
         create_graph_node_handle(new_node, h_graph), ev._h_event))
@@ -936,6 +1039,7 @@ cdef inline EventWaitNode GN_wait_event(GraphNode self, Event ev):
     cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node)
     cdef cydriver.CUgraphNode* deps = NULL
     cdef size_t num_deps = 0
+    cdef OpaqueHandle owner
 
     if pred_node != NULL:
         deps = &pred_node
@@ -945,35 +1049,36 @@ cdef inline EventWaitNode GN_wait_event(GraphNode self, Event ev):
         HANDLE_RETURN(cydriver.cuGraphAddEventWaitNode(
             &new_node, as_cu(h_graph), deps, num_deps, as_cu(ev._h_event)))
 
-    _attach_user_object(as_cu(h_graph), <void*>new EventHandle(ev._h_event),
-                        <cydriver.CUhostFn>_destroy_event_handle_copy)
+    owner = ev._h_event
+    HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, owner))
 
     return _registered(EventWaitNode._create_with_params(
         create_graph_node_handle(new_node, h_graph), ev._h_event))
 
 
 cdef inline HostCallbackNode GN_callback(GraphNode self, object fn, object user_data):
-    import ctypes as ct
-
     cdef cydriver.CUDA_HOST_NODE_PARAMS node_params
     cdef cydriver.CUgraphNode new_node = NULL
     cdef GraphHandle h_graph = graph_node_get_graph(self._h_node)
     cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node)
     cdef cydriver.CUgraphNode* deps = NULL
     cdef size_t num_deps = 0
+    cdef OpaqueHandle fn_owner, data_owner
 
     if pred_node != NULL:
         deps = &pred_node
         num_deps = 1
 
-    _attach_host_callback_to_graph(
-        as_cu(h_graph), fn, user_data,
-        &node_params.fn, &node_params.userData)
+    _resolve_host_callback(
+        fn, user_data, &node_params.fn, &node_params.userData,
+        &fn_owner, &data_owner)
 
     with nogil:
         HANDLE_RETURN(cydriver.cuGraphAddHostNode(
             &new_node, as_cu(h_graph), deps, num_deps, &node_params))
 
+    _attach_host_callback_owners(h_graph, new_node, fn_owner, data_owner)
+
     cdef object callable_obj = fn if not isinstance(fn, ct._CFuncPtr) else None
     return _registered(HostCallbackNode._create_with_params(
         create_graph_node_handle(new_node, h_graph), callable_obj,
diff --git a/cuda_core/cuda/core/graph/_host_callback.pxd b/cuda_core/cuda/core/graph/_host_callback.pxd
new file mode 100644
index 00000000000..dac249c74ed
--- /dev/null
+++ b/cuda_core/cuda/core/graph/_host_callback.pxd
@@ -0,0 +1,19 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.bindings cimport cydriver
+
+from cuda.core._resource_handles cimport GraphHandle, OpaqueHandle
+
+
+cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil
+
+cdef void _resolve_host_callback(
+    object fn, object user_data,
+    cydriver.CUhostFn* out_fn, void** out_user_data,
+    OpaqueHandle* out_fn_owner, OpaqueHandle* out_data_owner) except *
+
+cdef int _attach_host_callback_owners(
+    const GraphHandle& h_graph, cydriver.CUgraphNode node,
+    OpaqueHandle fn_owner, OpaqueHandle data_owner) except -1
diff --git a/cuda_core/cuda/core/graph/_utils.pyi b/cuda_core/cuda/core/graph/_host_callback.pyi
similarity index 74%
rename from cuda_core/cuda/core/graph/_utils.pyi
rename to cuda_core/cuda/core/graph/_host_callback.pyi
index 79072e66ebe..6c9d0ead317 100644
--- a/cuda_core/cuda/core/graph/_utils.pyi
+++ b/cuda_core/cuda/core/graph/_host_callback.pyi
@@ -1,3 +1,3 @@
-# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_utils.pyx
+# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_host_callback.pyx
 
 from __future__ import annotations
\ No newline at end of file
diff --git a/cuda_core/cuda/core/graph/_host_callback.pyx b/cuda_core/cuda/core/graph/_host_callback.pyx
new file mode 100644
index 00000000000..bed2d8152f5
--- /dev/null
+++ b/cuda_core/cuda/core/graph/_host_callback.pyx
@@ -0,0 +1,79 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from libc.stdint cimport uintptr_t
+from libc.stdlib cimport malloc
+from libc.string cimport memcpy as c_memcpy
+
+from cuda.bindings cimport cydriver
+
+from cuda.core._resource_handles cimport (
+    GraphHandle,
+    OpaqueHandle,
+    graph_set_slot,
+    make_opaque_malloc,
+    make_opaque_py,
+)
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
+
+import ctypes as ct
+
+
+cdef void _py_host_trampoline(void* data) noexcept with gil:  # C entry point; takes the GIL on entry
+    (<object>data)()  # ``data`` is the Python callable itself; invoke it with no arguments
+
+
+cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil:  # callable without the GIL
+    return fn == <cydriver.CUhostFn>_py_host_trampoline  # true only for the Python trampoline's address
+
+
+cdef void _resolve_host_callback(
+        object fn, object user_data,
+        cydriver.CUhostFn* out_fn, void** out_user_data,
+        OpaqueHandle* out_fn_owner, OpaqueHandle* out_data_owner) except *:
+    """Translate ``fn`` / ``user_data`` into the C pair a host node needs, plus
+    the owners that must outlive it.
+
+    ``*out_fn`` / ``*out_user_data`` receive values suitable for
+    ``cuGraphAddHostNode`` or ``cuLaunchHostFunc``. ``*out_fn_owner`` always
+    holds ``fn``; ``*out_data_owner`` is assigned only when a non-empty
+    ``user_data`` is copied into a malloc'd buffer, and is otherwise untouched.
+    Attaching the owners to the node's graph slots is the caller's job.
+    """
+    if not isinstance(fn, ct._CFuncPtr):
+        # Plain Python callable: the trampoline receives ``fn`` itself as data.
+        if user_data is not None:
+            raise ValueError(
+                "user_data is only supported with ctypes function pointers")
+        out_fn[0] = <cydriver.CUhostFn>_py_host_trampoline
+        out_user_data[0] = <void*>fn
+    else:
+        out_fn[0] = <cydriver.CUhostFn><uintptr_t>ct.cast(fn, ct.c_void_p).value
+        if user_data is None:
+            out_user_data[0] = NULL
+        elif isinstance(user_data, int):
+            out_user_data[0] = <void*><uintptr_t>user_data
+        else:
+            payload = bytes(user_data)
+            if not payload:
+                out_user_data[0] = NULL
+            else:
+                out_user_data[0] = malloc(len(payload))
+                if out_user_data[0] == NULL:
+                    raise MemoryError("failed to allocate user_data buffer")
+                c_memcpy(out_user_data[0], <const char*>payload, len(payload))
+                out_data_owner[0] = make_opaque_malloc(out_user_data[0])
+    out_fn_owner[0] = make_opaque_py(fn)
+
+
+cdef int _attach_host_callback_owners(
+        const GraphHandle& h_graph, cydriver.CUgraphNode node,
+        OpaqueHandle fn_owner, OpaqueHandle data_owner) except -1:
+    """Store a host callback's owners in ``node``'s slots on ``h_graph``: slot 0
+    gets ``fn_owner``; slot 1 gets ``data_owner`` only when it is non-empty.
+    """
+    HANDLE_RETURN(graph_set_slot(h_graph, node, 0, fn_owner))  # the callback owner is always stored
+    if data_owner:  # skipped when the handle is empty
+        HANDLE_RETURN(graph_set_slot(h_graph, node, 1, data_owner))
+    return 0  # success value for the ``except -1`` convention
diff --git a/cuda_core/cuda/core/graph/_subclasses.pyi b/cuda_core/cuda/core/graph/_subclasses.pyi
index 345e6417c4d..480b1b66a6f 100644
--- a/cuda_core/cuda/core/graph/_subclasses.pyi
+++ b/cuda_core/cuda/core/graph/_subclasses.pyi
@@ -204,7 +204,7 @@ class ChildGraphNode(GraphNode):
         ...
 
     @property
-    def child_graph(self) -> GraphDefinition:
+    def child_graph(self) -> 'GraphDefinition':
         """The embedded graph definition (non-owning wrapper)."""
 
 class EventRecordNode(GraphNode):
@@ -290,7 +290,7 @@ class ConditionalNode(GraphNode):
         """
 
     @property
-    def branches(self) -> tuple[GraphDefinition, ...]:
+    def branches(self) -> tuple['GraphDefinition', ...]:
         """The body graphs for each branch as a tuple of GraphDefinition.
 
         Returns an empty tuple when reconstructed from the driver
@@ -304,7 +304,7 @@ class IfNode(ConditionalNode):
         ...
 
     @property
-    def then(self) -> GraphDefinition:
+    def then(self) -> 'GraphDefinition':
         """The 'then' branch graph."""
 
 class IfElseNode(ConditionalNode):
@@ -314,11 +314,11 @@ class IfElseNode(ConditionalNode):
         ...
 
     @property
-    def then(self) -> GraphDefinition:
+    def then(self) -> 'GraphDefinition':
         """The ``then`` branch graph (executed when condition is non-zero)."""
 
     @property
-    def else_(self) -> GraphDefinition:
+    def else_(self) -> 'GraphDefinition':
         """The ``else`` branch graph (executed when condition is zero)."""
 
 class WhileNode(ConditionalNode):
@@ -328,7 +328,7 @@ class WhileNode(ConditionalNode):
         ...
 
     @property
-    def body(self) -> GraphDefinition:
+    def body(self) -> 'GraphDefinition':
         """The loop body graph."""
 
 class SwitchNode(ConditionalNode):
diff --git a/cuda_core/cuda/core/graph/_subclasses.pyx b/cuda_core/cuda/core/graph/_subclasses.pyx
index 85a382197f8..df919426bc2 100644
--- a/cuda_core/cuda/core/graph/_subclasses.pyx
+++ b/cuda_core/cuda/core/graph/_subclasses.pyx
@@ -30,7 +30,7 @@ from cuda.core._resource_handles cimport (
 )
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
 
-from cuda.core.graph._utils cimport _is_py_host_trampoline
+from cuda.core.graph._host_callback cimport _is_py_host_trampoline
 
 from cuda.core._utils.cuda_utils import driver, handle_return
 from cuda.core.typing import GraphConditionalType
@@ -478,7 +478,7 @@ cdef class ChildGraphNode(GraphNode):
                 f" child=0x{as_intptr(self._h_child_graph):x}>")
 
     @property
-    def child_graph(self) -> GraphDefinition:
+    def child_graph(self) -> "GraphDefinition":
         """The embedded graph definition (non-owning wrapper)."""
         return GraphDefinition._from_handle(self._h_child_graph)
 
@@ -705,7 +705,7 @@ cdef class ConditionalNode(GraphNode):
             return GraphConditionalType("switch")
 
     @property
-    def branches(self) -> tuple[GraphDefinition, ...]:
+    def branches(self) -> tuple["GraphDefinition", ...]:
         """The body graphs for each branch as a tuple of GraphDefinition.
 
         Returns an empty tuple when reconstructed from the driver
@@ -722,7 +722,7 @@ cdef class IfNode(ConditionalNode):
                 f" condition=0x{<unsigned long long>self._condition._c_handle:x}>")
 
     @property
-    def then(self) -> GraphDefinition:
+    def then(self) -> "GraphDefinition":
         """The 'then' branch graph."""
         return self._branches[0]
 
@@ -735,12 +735,12 @@ cdef class IfElseNode(ConditionalNode):
                 f" condition=0x{<unsigned long long>self._condition._c_handle:x}>")
 
     @property
-    def then(self) -> GraphDefinition:
+    def then(self) -> "GraphDefinition":
         """The ``then`` branch graph (executed when condition is non-zero)."""
         return self._branches[0]
 
     @property
-    def else_(self) -> GraphDefinition:
+    def else_(self) -> "GraphDefinition":
         """The ``else`` branch graph (executed when condition is zero)."""
         return self._branches[1]
 
@@ -753,7 +753,7 @@ cdef class WhileNode(ConditionalNode):
                 f" condition=0x{<unsigned long long>self._condition._c_handle:x}>")
 
     @property
-    def body(self) -> GraphDefinition:
+    def body(self) -> "GraphDefinition":
         """The loop body graph."""
         return self._branches[0]
 
diff --git a/cuda_core/cuda/core/graph/_utils.pxd b/cuda_core/cuda/core/graph/_utils.pxd
deleted file mode 100644
index 63fdb00ac4f..00000000000
--- a/cuda_core/cuda/core/graph/_utils.pxd
+++ /dev/null
@@ -1,16 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from cuda.bindings cimport cydriver
-
-
-cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil
-
-cdef void _attach_user_object(
-    cydriver.CUgraph graph, void* ptr,
-    cydriver.CUhostFn destroy) except *
-
-cdef void _attach_host_callback_to_graph(
-    cydriver.CUgraph graph, object fn, object user_data,
-    cydriver.CUhostFn* out_fn, void** out_user_data) except *
diff --git a/cuda_core/cuda/core/graph/_utils.pyx b/cuda_core/cuda/core/graph/_utils.pyx
deleted file mode 100644
index dfc2f4f3fec..00000000000
--- a/cuda_core/cuda/core/graph/_utils.pyx
+++ /dev/null
@@ -1,99 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from cpython.ref cimport Py_INCREF
-
-from libc.stdint cimport uintptr_t
-from libc.stdlib cimport malloc, free
-from libc.string cimport memcpy as c_memcpy
-
-from cuda.bindings cimport cydriver
-
-from cuda.core._resource_handles cimport py_object_user_object_destroy
-from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
-
-
-cdef void _py_host_trampoline(void* data) noexcept with gil:
-    (<object>data)()
-
-
-cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil:
-    return fn == <cydriver.CUhostFn>_py_host_trampoline
-
-
-cdef void _attach_user_object(
-        cydriver.CUgraph graph, void* ptr,
-        cydriver.CUhostFn destroy) except *:
-    """Create a CUDA user object and transfer ownership to the graph.
-
-    On success the graph owns the resource (via MOVE semantics).
-    On failure the destroy callback is invoked to clean up ptr,
-    then a CUDAError is raised — callers need no try/except.
-    """
-    cdef cydriver.CUuserObject user_obj = NULL
-    cdef cydriver.CUresult ret
-    with nogil:
-        ret = cydriver.cuUserObjectCreate(
-            &user_obj, ptr, destroy, 1,
-            cydriver.CU_USER_OBJECT_NO_DESTRUCTOR_SYNC)
-        if ret == cydriver.CUDA_SUCCESS:
-            ret = cydriver.cuGraphRetainUserObject(
-                graph, user_obj, 1, cydriver.CU_GRAPH_USER_OBJECT_MOVE)
-            if ret != cydriver.CUDA_SUCCESS:
-                cydriver.cuUserObjectRelease(user_obj, 1)
-    if ret != cydriver.CUDA_SUCCESS:
-        if user_obj == NULL:
-            destroy(ptr)
-        HANDLE_RETURN(ret)
-
-
-cdef void _attach_host_callback_to_graph(
-        cydriver.CUgraph graph, object fn, object user_data,
-        cydriver.CUhostFn* out_fn, void** out_user_data) except *:
-    """Resolve a Python callable or ctypes CFuncPtr into a C callback pair.
-
-    Handles Py_INCREF, user-object attachment for lifetime management,
-    and user_data copying.  On return, *out_fn and *out_user_data are
-    ready to pass to cuGraphAddHostNode or cuLaunchHostFunc.
-    """
-    import ctypes as ct
-
-    cdef void* fn_pyobj = NULL
-
-    if isinstance(fn, ct._CFuncPtr):
-        Py_INCREF(fn)
-        fn_pyobj = <void*>fn
-        _attach_user_object(
-            graph, fn_pyobj,
-            <cydriver.CUhostFn>py_object_user_object_destroy)
-        out_fn[0] = <cydriver.CUhostFn><uintptr_t>ct.cast(
-            fn, ct.c_void_p).value
-
-        if user_data is not None:
-            if isinstance(user_data, int):
-                out_user_data[0] = <void*><uintptr_t>user_data
-            else:
-                buf = bytes(user_data)
-                out_user_data[0] = malloc(len(buf))
-                if out_user_data[0] == NULL:
-                    raise MemoryError(
-                        "failed to allocate user_data buffer")
-                c_memcpy(out_user_data[0], <const char*>buf, len(buf))
-                _attach_user_object(
-                    graph, out_user_data[0],
-                    <cydriver.CUhostFn>free)
-        else:
-            out_user_data[0] = NULL
-    else:
-        if user_data is not None:
-            raise ValueError(
-                "user_data is only supported with ctypes "
-                "function pointers")
-        Py_INCREF(fn)
-        fn_pyobj = <void*>fn
-        out_fn[0] = <cydriver.CUhostFn>_py_host_trampoline
-        out_user_data[0] = fn_pyobj
-        _attach_user_object(
-            graph, fn_pyobj,
-            <cydriver.CUhostFn>py_object_user_object_destroy)
diff --git a/cuda_core/tests/graph/test_graph_builder.py b/cuda_core/tests/graph/test_graph_builder.py
index 18dfe21cc12..efb70fe75dd 100644
--- a/cuda_core/tests/graph/test_graph_builder.py
+++ b/cuda_core/tests/graph/test_graph_builder.py
@@ -5,11 +5,12 @@
 
 import numpy as np
 import pytest
-from helpers.graph_kernels import compile_common_kernels
+from helpers.graph_kernels import compile_common_kernels, compile_conditional_kernels
 from helpers.marks import requires_module
+from helpers.misc import try_create_condition
 
 from cuda.core import Device, LaunchConfig, LegacyPinnedMemoryResource, launch
-from cuda.core.graph import GraphBuilder
+from cuda.core.graph import GraphBuilder, GraphDefinition
 
 
 def test_graph_is_building(init_cuda):
@@ -384,3 +385,190 @@ def test_graph_stream_lifetime(init_cuda):
 
     # Destroy the stream
     stream.close()
+
+
+# ---------------------------------------------------------------------------
+# GraphBuilder.graph_definition
+# ---------------------------------------------------------------------------
+
+
+def test_graph_definition_returns_graph_definition_after_end_building(init_cuda):
+    """After end_building(), a primary builder hands out its captured graph as a GraphDefinition."""
+    noop = compile_common_kernels().get_kernel("empty_kernel")
+    config = LaunchConfig(grid=1, block=1)
+
+    builder = Device().create_graph_builder().begin_building()
+    for _ in range(2):
+        launch(builder, config, noop)
+    builder.end_building()
+
+    definition = builder.graph_definition
+    assert isinstance(definition, GraphDefinition)
+    # Both captured kernel launches must show up as nodes.
+    assert len(definition.nodes()) == 2
+
+
+def test_graph_definition_raises_before_begin_building(init_cuda):
+    """Until begin_building() runs, a primary builder has no graph to expose."""
+    builder = Device().create_graph_builder()
+    with pytest.raises(RuntimeError, match="before begin_building"):
+        _ = builder.graph_definition
+
+
+def test_graph_definition_raises_during_capture(init_cuda):
+    """Reading graph_definition is rejected while the driver is still capturing."""
+    builder = Device().create_graph_builder().begin_building()
+    try:
+        with pytest.raises(RuntimeError, match="capture is in"):
+            _ = builder.graph_definition
+    finally:
+        builder.end_building()
+
+
+def test_graph_definition_raises_for_forked(init_cuda):
+    """A forked builder borrows the primary's graph, so its property must raise."""
+    kernels = compile_common_kernels()
+    noop = kernels.get_kernel("empty_kernel")
+
+    root = Device().create_graph_builder().begin_building()
+    launch(root, LaunchConfig(grid=1, block=1), noop)
+    trunk, fork = root.split(2)
+    try:
+        with pytest.raises(RuntimeError, match="forked"):
+            _ = fork.graph_definition
+    finally:
+        fork = GraphBuilder.join(trunk, fork)
+        fork.end_building()
+
+
+def test_graph_definition_shares_ownership(init_cuda):
+    """A GraphDefinition obtained earlier stays usable after its builder is closed."""
+    kernels = compile_common_kernels()
+    noop = kernels.get_kernel("empty_kernel")
+
+    builder = Device().create_graph_builder().begin_building()
+    launch(builder, LaunchConfig(grid=1, block=1), noop)
+    builder.end_building()
+
+    definition = builder.graph_definition
+    builder.close()
+    # The definition shares the underlying graph, so it outlives the builder.
+    assert len(definition.nodes()) == 1
+
+
+def test_graph_definition_round_trips_through_explicit_api(init_cuda):
+    """Nodes added through the explicit API are kept by complete() and execute correctly."""
+    kernels = compile_common_kernels()
+    increment = kernels.get_kernel("add_one")
+
+    stream = Device().create_stream()
+    pinned_mr = LegacyPinnedMemoryResource()
+    pinned = pinned_mr.allocate(4)
+    counter = np.from_dlpack(pinned).view(np.int32)
+    counter[0] = 0
+
+    builder = stream.create_graph_builder().begin_building()
+    launch(builder, LaunchConfig(grid=1, block=1), increment, counter.ctypes.data)
+    builder.end_building()
+
+    # Chain a second add_one after the captured node using the GraphDefinition view.
+    definition = builder.graph_definition
+    first_node = next(iter(definition.nodes()))
+    first_node.launch(LaunchConfig(grid=1, block=1), increment, counter.ctypes.data)
+    assert len(definition.nodes()) == 2
+
+    executable = builder.complete()
+    executable.launch(stream)
+    stream.sync()
+    assert counter[0] == 2
+
+    pinned.close()
+
+
+@requires_module(np, "2.1")
+def test_graph_definition_hybrid_conditional_body(init_cuda):
+    """Fill a conditional body using only the explicit API.
+
+    The hybrid flow this property makes possible: ``if_then`` hands back a
+    ``GraphBuilder`` for the body, yet rather than calling
+    ``begin_building`` and capturing into it, the test takes its
+    ``graph_definition`` and adds the node through the explicit API.
+    """
+    kernels = compile_conditional_kernels(int)
+    increment = kernels.get_kernel("add_one")
+    arm_condition = kernels.get_kernel("set_handle")
+
+    stream = Device().create_stream()
+    pinned_mr = LegacyPinnedMemoryResource()
+    pinned = pinned_mr.allocate(4)
+    counter = np.from_dlpack(pinned).view(np.int32)
+    counter[0] = 0
+
+    outer = Device().create_graph_builder().begin_building()
+    handle = try_create_condition(outer)
+    launch(outer, LaunchConfig(grid=1, block=1), arm_condition, handle, 1)
+    body_builder = outer.if_then(handle)
+
+    # No body_builder.begin_building() here: the body graph was created
+    # together with the conditional node, so it can be edited right away.
+    body = body_builder.graph_definition
+    assert isinstance(body, GraphDefinition)
+    assert len(body.nodes()) == 0
+    body.launch(LaunchConfig(grid=1, block=1), increment, counter.ctypes.data)
+
+    executable = outer.end_building().complete()
+    executable.launch(stream)
+    stream.sync()
+    assert counter[0] == 1
+
+    pinned.close()
+
+
+@requires_module(np, "2.1")
+def test_graph_definition_conditional_body_after_capture(init_cuda):
+    """A conditional body recorded by capture can be extended through the explicit API."""
+    kernels = compile_conditional_kernels(int)
+    increment = kernels.get_kernel("add_one")
+    arm_condition = kernels.get_kernel("set_handle")
+
+    stream = Device().create_stream()
+    pinned_mr = LegacyPinnedMemoryResource()
+    pinned = pinned_mr.allocate(4)
+    counter = np.from_dlpack(pinned).view(np.int32)
+    counter[0] = 0
+
+    outer = Device().create_graph_builder().begin_building()
+    handle = try_create_condition(outer)
+    launch(outer, LaunchConfig(grid=1, block=1), arm_condition, handle, 1)
+    body_builder = outer.if_then(handle).begin_building()
+
+    # First increment: recorded into the body by capture.
+    launch(body_builder, LaunchConfig(grid=1, block=1), increment, counter.ctypes.data)
+    body_builder.end_building()
+
+    # Second increment: appended to that same body graph via the explicit API.
+    body = body_builder.graph_definition
+    first_node = next(iter(body.nodes()))
+    first_node.launch(LaunchConfig(grid=1, block=1), increment, counter.ctypes.data)
+    assert len(body.nodes()) == 2
+
+    executable = outer.end_building().complete()
+    executable.launch(stream)
+    stream.sync()
+    assert counter[0] == 2
+
+    pinned.close()
+
+
+@requires_module(np, "2.1")
+def test_graph_definition_conditional_body_during_capture_raises(init_cuda):
+    """Conditional-body builders are covered by the same mid-capture guard."""
+    outer = Device().create_graph_builder().begin_building()
+    handle = try_create_condition(outer)
+    body_builder = outer.if_then(handle).begin_building()
+    try:
+        with pytest.raises(RuntimeError, match="capture is in"):
+            _ = body_builder.graph_definition
+    finally:
+        body_builder.end_building()
+        outer.end_building()
diff --git a/cuda_core/tests/graph/test_graph_definition_lifetime.py b/cuda_core/tests/graph/test_graph_definition_lifetime.py
index 40bc6f3c442..d196e35f478 100644
--- a/cuda_core/tests/graph/test_graph_definition_lifetime.py
+++ b/cuda_core/tests/graph/test_graph_definition_lifetime.py
@@ -13,22 +13,52 @@
 from helpers.misc import try_create_condition
 
 from conftest import xfail_on_graph_mempool_oom
+from cuda_python_test_helpers import under_compute_sanitizer
 
+# Resource finalization triggered by graph destruction is not strictly
+# synchronous: the graph's slot table is freed through a CUDA user-object
+# destructor that the driver may run on its own thread, after which each owner
+# is released (a shared_ptr decrement, or Py_DECREF under the GIL). Release is
+# deterministic at the reference-count level, so the predicate normally flips
+# within milliseconds; this budget only bounds a slow/loaded runner. It stays a
+# hard failure rather than a warning so a real leak still fails the suite.
+# Compute-sanitizer slows everything down, hence the larger ceiling there.
+_FINALIZE_TIMEOUT = 30.0 if under_compute_sanitizer() else 5.0
 
-def _wait_until(predicate, timeout=2.0, interval=0.01):
-    """Poll predicate() until True or timeout, driving gc each iteration.
 
-    Used for assertions about resource cleanup that may be delayed by CUDA's
-    asynchronous user-object destructor pump (DPC) or, on free-threaded
-    Python, by deferred reference-count processing. A bounded poll keeps the
-    test correct without depending on undocumented driver timing guarantees.
+class _Sentinel:
+    """Weak-referenceable placeholder passed as a ``dst_owner``/``src_owner``.
+
+    A bare ``object()`` cannot be the target of a :class:`weakref.ref`, so
+    tests that watch an owner being released use this empty subclass and
+    check when the weak reference dies.
     """
+
+
+def _wait_until(predicate, timeout=None, interval=0.02):
+    """Poll ``predicate()`` until true; raise AssertionError once ``timeout`` elapses.
+
+    ``timeout`` is in seconds and defaults to ``_FINALIZE_TIMEOUT`` when None;
+    ``interval`` is the sleep between polls. Each iteration runs ``gc.collect()``
+    and then sleeps, which releases the GIL so the driver's asynchronous
+    user-object destructor -- and the ``Py_DECREF`` it triggers -- can progress.
+    """
+    if timeout is None:
+        timeout = _FINALIZE_TIMEOUT
     deadline = time.monotonic() + timeout
-    while time.monotonic() < deadline:
+    while True:
         gc.collect()
         if predicate():
             return
+        if time.monotonic() >= deadline:
+            break
+        time.sleep(0)  # yield the GIL to the driver's finalizer thread
         time.sleep(interval)
+    # Final attempt after one more yield and collection.
+    time.sleep(0)
+    gc.collect()
+    if predicate():
+        return
     raise AssertionError(f"condition not satisfied within {timeout}s")
 
 
@@ -594,3 +624,366 @@ def test_kernel_args_survive_graph_clone(init_cuda):
     out = (ctypes.c_int * 1)(0)
     handle_return(driver.cuMemcpyDtoH(out, dptr, ctypes.sizeof(ctypes.c_int)))
     assert out[0] == 1
+
+
+# =============================================================================
+# Memcpy/memset Buffer lifetime — operands passed as Buffer objects
+# =============================================================================
+
+
+def test_memset_buffer_lifetime(init_cuda):
+    """A memset node keeps its Buffer's allocation alive once the wrapper is collected."""
+    from cuda.core._utils.cuda_utils import driver, handle_return
+
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    target = pool.allocate(4, stream=device.default_stream)
+    device.default_stream.sync()
+    target_addr = int(target.handle)
+
+    graph_def = GraphDefinition()
+    graph_def.memset(target, 0xAB, 4)
+
+    del target
+    gc.collect()
+
+    run_stream = device.create_stream()
+    graph_def.instantiate().launch(run_stream)
+    run_stream.sync()
+
+    readback = (ctypes.c_uint8 * 4)(0)
+    handle_return(driver.cuMemcpyDtoH(readback, target_addr, 4))
+    assert list(readback) == [0xAB] * 4
+
+
+def test_memset_buffer_survives_close(init_cuda):
+    """Closing the Buffer wrapper does not free an allocation a memset node still uses."""
+    from cuda.core._utils.cuda_utils import driver, handle_return
+
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    target = pool.allocate(4, stream=device.default_stream)
+    device.default_stream.sync()
+    target_addr = int(target.handle)
+
+    graph_def = GraphDefinition()
+    graph_def.memset(target, 0xAB, 4)
+    target.close()
+
+    run_stream = device.create_stream()
+    graph_def.instantiate().launch(run_stream)
+    run_stream.sync()
+
+    readback = (ctypes.c_uint8 * 4)(0)
+    handle_return(driver.cuMemcpyDtoH(readback, target_addr, 4))
+    assert list(readback) == [0xAB] * 4
+
+
+def test_memcpy_buffer_lifetime(init_cuda):
+    """A memcpy node keeps both operand allocations alive once the wrappers are collected."""
+    from cuda.core._utils.cuda_utils import driver, handle_return
+
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    source = pool.allocate(4, stream=device.default_stream)
+    target = pool.allocate(4, stream=device.default_stream)
+    source.fill(0xCD, stream=device.default_stream)
+    device.default_stream.sync()
+    target_addr = int(target.handle)
+
+    graph_def = GraphDefinition()
+    graph_def.memcpy(target, source, 4)
+
+    del source, target
+    gc.collect()
+
+    run_stream = device.create_stream()
+    graph_def.instantiate().launch(run_stream)
+    run_stream.sync()
+
+    readback = (ctypes.c_uint8 * 4)(0)
+    handle_return(driver.cuMemcpyDtoH(readback, target_addr, 4))
+    assert list(readback) == [0xCD] * 4
+
+
+def test_memcpy_buffer_survives_close(init_cuda):
+    """Closing both Buffer wrappers does not free allocations a memcpy node still uses."""
+    from cuda.core._utils.cuda_utils import driver, handle_return
+
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    source = pool.allocate(4, stream=device.default_stream)
+    target = pool.allocate(4, stream=device.default_stream)
+    source.fill(0xCD, stream=device.default_stream)
+    device.default_stream.sync()
+    target_addr = int(target.handle)
+
+    graph_def = GraphDefinition()
+    graph_def.memcpy(target, source, 4)
+    source.close()
+    target.close()
+
+    run_stream = device.create_stream()
+    graph_def.instantiate().launch(run_stream)
+    run_stream.sync()
+
+    readback = (ctypes.c_uint8 * 4)(0)
+    handle_return(driver.cuMemcpyDtoH(readback, target_addr, 4))
+    assert list(readback) == [0xCD] * 4
+
+
+def test_memcpy_buffer_allocations_released_after_graph_destroyed(init_cuda):
+    """Both memcpy operand allocations are freed when the graph goes away.
+
+    A weak handle (see ``cuda.core._utils._weak_handles``) watches each
+    operand's device-pointer handle, so the check is made on reference counts
+    and not on a driver side effect. Once both Buffer wrappers are closed the
+    graph's slots are the last owners; destroying the graph drops them and
+    both weak handles expire.
+    """
+    from cuda.core._utils._weak_handles import weak_handle
+
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    source = pool.allocate(4, stream=device.default_stream)
+    target = pool.allocate(4, stream=device.default_stream)
+    device.default_stream.sync()
+
+    graph_def = GraphDefinition()
+    graph_def.memcpy(target, source, 4)
+
+    # Start watching the allocations, then close the wrappers so that only
+    # the graph slots hold them.
+    source_watch = weak_handle(source)
+    target_watch = weak_handle(target)
+    source.close()
+    target.close()
+    assert source_watch and target_watch  # both allocations still held by the graph slots
+
+    del graph_def
+    _wait_until(lambda: not (source_watch or target_watch))
+
+
+def test_memcpy_buffers_survive_graph_clone(init_cuda):
+    """Cloned graph keeps memcpy operand allocations alive via CUDA user objects."""
+    from cuda.core._utils.cuda_utils import driver, handle_return
+
+    _skip_if_no_mempool()
+    dev = Device()
+    mr = DeviceMemoryResource(dev)
+    src = mr.allocate(4, stream=dev.default_stream)
+    dst = mr.allocate(4, stream=dev.default_stream)
+    src.fill(0xCD, stream=dev.default_stream)
+    dev.default_stream.sync()
+    dst_dptr = int(dst.handle)
+
+    g = GraphDefinition()
+    g.memcpy(dst, src, 4)
+    cloned_cu_graph = handle_return(driver.cuGraphClone(driver.CUgraph(g.handle)))
+
+    del src, dst, g
+    gc.collect()
+    graph_exec = handle_return(driver.cuGraphInstantiate(cloned_cu_graph, 0))
+    stream = dev.create_stream()
+    handle_return(driver.cuGraphLaunch(graph_exec, driver.CUstream(int(stream.handle))))
+    stream.sync()
+    out = (ctypes.c_uint8 * 4)(0)
+    handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4))
+    handle_return(driver.cuGraphExecDestroy(graph_exec))  # raw handles have no owning wrapper
+    handle_return(driver.cuGraphDestroy(cloned_cu_graph))  # read dst first: this frees the operands
+    assert list(out) == [0xCD] * 4
+
+
+# =============================================================================
+# Explicit dst_owner / src_owner for raw pointer operands
+# =============================================================================
+
+
+def test_memset_raw_ptr_with_dst_owner(init_cuda):
+    """A raw dst pointer with a Buffer as dst_owner keeps the allocation alive after close."""
+    from cuda.core._utils.cuda_utils import driver, handle_return
+
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    target = pool.allocate(4, stream=device.default_stream)
+    device.default_stream.sync()
+    target_addr = int(target.handle)
+
+    graph_def = GraphDefinition()
+    graph_def.memset(target_addr, 0xAB, 4, dst_owner=target)
+    target.close()
+
+    run_stream = device.create_stream()
+    graph_def.instantiate().launch(run_stream)
+    run_stream.sync()
+
+    readback = (ctypes.c_uint8 * 4)(0)
+    handle_return(driver.cuMemcpyDtoH(readback, target_addr, 4))
+    assert list(readback) == [0xAB] * 4
+
+
+def test_slot_owners_released_after_graph_destroyed(init_cuda):
+    """Every owner stored in a graph's slot table is released with the graph.
+
+    Using raw-pointer operands with explicit sentinel owners lets pure Python
+    observe the release: the slot table keeps a strong Python reference to each
+    owner (via ``make_opaque_py``), and destroying the graph frees the table and
+    with it those references. The same teardown releases a Buffer operand's
+    device-pointer handle (slot 0 for ``dst``, slot 1 for ``src``).
+    """
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    backing = pool.allocate(8, stream=device.default_stream)
+    device.default_stream.sync()
+    base_addr = int(backing.handle)
+
+    dst_sentinel = _Sentinel()
+    src_sentinel = _Sentinel()
+    dst_alive = weakref.ref(dst_sentinel)
+    src_alive = weakref.ref(src_sentinel)
+
+    graph_def = GraphDefinition()
+    # Copy the upper 4 bytes of the 8-byte allocation onto the lower 4 (no overlap).
+    graph_def.memcpy(base_addr, base_addr + 4, 4, dst_owner=dst_sentinel, src_owner=src_sentinel)
+
+    del dst_sentinel, src_sentinel
+    gc.collect()
+    assert dst_alive() is not None and src_alive() is not None  # owners held by the graph
+
+    del graph_def
+    _wait_until(lambda: dst_alive() is None and src_alive() is None)
+
+    backing.close()
+
+
+def test_memcpy_raw_ptrs_with_owners(init_cuda):
+    """Raw src/dst pointers with Buffer owners keep both allocations alive after close."""
+    from cuda.core._utils.cuda_utils import driver, handle_return
+
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    source = pool.allocate(4, stream=device.default_stream)
+    target = pool.allocate(4, stream=device.default_stream)
+    source.fill(0xCD, stream=device.default_stream)
+    device.default_stream.sync()
+    source_addr = int(source.handle)
+    target_addr = int(target.handle)
+
+    graph_def = GraphDefinition()
+    graph_def.memcpy(target_addr, source_addr, 4, dst_owner=target, src_owner=source)
+    source.close()
+    target.close()
+
+    run_stream = device.create_stream()
+    graph_def.instantiate().launch(run_stream)
+    run_stream.sync()
+
+    readback = (ctypes.c_uint8 * 4)(0)
+    handle_return(driver.cuMemcpyDtoH(readback, target_addr, 4))
+    assert list(readback) == [0xCD] * 4
+
+
+def test_memcpy_mixed_buffer_and_raw_owner(init_cuda):
+    """A Buffer dst combined with a raw src plus src_owner keeps both allocations after close."""
+    from cuda.core._utils.cuda_utils import driver, handle_return
+
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    source = pool.allocate(4, stream=device.default_stream)
+    target = pool.allocate(4, stream=device.default_stream)
+    source.fill(0xCD, stream=device.default_stream)
+    device.default_stream.sync()
+    source_addr = int(source.handle)
+    target_addr = int(target.handle)
+
+    graph_def = GraphDefinition()
+    graph_def.memcpy(target, source_addr, 4, src_owner=source)
+    source.close()
+    target.close()
+
+    run_stream = device.create_stream()
+    graph_def.instantiate().launch(run_stream)
+    run_stream.sync()
+
+    readback = (ctypes.c_uint8 * 4)(0)
+    handle_return(driver.cuMemcpyDtoH(readback, target_addr, 4))
+    assert list(readback) == [0xCD] * 4
+
+
+def test_memset_closed_buffer_rejected(init_cuda):
+    """Passing an already-closed Buffer as the memset destination raises ValueError."""
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    closed = pool.allocate(4, stream=device.default_stream)
+    device.default_stream.sync()
+    closed.close()
+
+    graph_def = GraphDefinition()
+    with pytest.raises(ValueError, match="dst Buffer has no active allocation"):
+        graph_def.memset(closed, 0xAB, 4)
+
+
+def test_memset_closed_buffer_dst_owner_rejected(init_cuda):
+    """Passing an already-closed Buffer as dst_owner to memset raises ValueError."""
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    closed = pool.allocate(4, stream=device.default_stream)
+    device.default_stream.sync()
+    stale_addr = int(closed.handle)
+    closed.close()
+
+    graph_def = GraphDefinition()
+    with pytest.raises(ValueError, match="dst_owner Buffer has no active allocation"):
+        graph_def.memset(stale_addr, 0xAB, 4, dst_owner=closed)
+
+
+def test_memcpy_closed_buffer_src_owner_rejected(init_cuda):
+    """Passing an already-closed Buffer as src_owner to memcpy raises ValueError."""
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    closed = pool.allocate(4, stream=device.default_stream)
+    device.default_stream.sync()
+    stale_addr = int(closed.handle)
+    closed.close()
+
+    graph_def = GraphDefinition()
+    with pytest.raises(ValueError, match="src_owner Buffer has no active allocation"):
+        graph_def.memcpy(stale_addr, stale_addr, 4, src_owner=closed)
+
+
+def test_memcpy_buffer_and_dst_owner_rejected(init_cuda):
+    """Supplying dst_owner together with a Buffer dst operand raises ValueError."""
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    operand = pool.allocate(4, stream=device.default_stream)
+    device.default_stream.sync()
+
+    graph_def = GraphDefinition()
+    with pytest.raises(ValueError, match="dst_owner cannot be used when dst is a Buffer"):
+        graph_def.memcpy(operand, operand, 4, dst_owner=object())
+
+
+def test_memcpy_buffer_and_src_owner_rejected(init_cuda):
+    """Supplying src_owner together with a Buffer src operand raises ValueError."""
+    _skip_if_no_mempool()
+    device = Device()
+    pool = DeviceMemoryResource(device)
+    operand = pool.allocate(4, stream=device.default_stream)
+    device.default_stream.sync()
+
+    graph_def = GraphDefinition()
+    with pytest.raises(ValueError, match="src_owner cannot be used when src is a Buffer"):
+        graph_def.memcpy(operand, operand, 4, src_owner=object())
diff --git a/cuda_core/tests/graph/test_graph_definition_mutation.py b/cuda_core/tests/graph/test_graph_definition_mutation.py
index 1db1089f825..b176503e3df 100644
--- a/cuda_core/tests/graph/test_graph_definition_mutation.py
+++ b/cuda_core/tests/graph/test_graph_definition_mutation.py
@@ -311,9 +311,9 @@ def test_add_wrong_type(init_cuda):
     """Adding a non-GraphNode raises TypeError."""
     g = GraphDefinition()
     node = g.empty()
-    with pytest.raises(TypeError, match="expected .*GraphNode"):
+    with pytest.raises(TypeError, match="expected GraphNode"):
         node.succ.add("not a node")
-    with pytest.raises(TypeError, match="expected .*GraphNode"):
+    with pytest.raises(TypeError, match="expected GraphNode"):
         node.succ.add(42)