Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 100 additions & 1 deletion cuda_core/cuda/core/_cpp/resource_handles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@

#include "resource_handles.hpp"
#include <cuda.h>
#include <array>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <map>
#include <mutex>
#include <stdexcept>
#include <unordered_map>
Expand Down Expand Up @@ -70,6 +73,9 @@ decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel = nullptr;
// Graph
// Driver entry points, all null until they are filled in at runtime. The three
// user-object pointers back the graph slot table; ensure_slot_table() checks
// that each of them is non-null before calling through it.
decltype(&cuGraphDestroy) p_cuGraphDestroy = nullptr;
decltype(&cuGraphExecDestroy) p_cuGraphExecDestroy = nullptr;
decltype(&cuUserObjectCreate) p_cuUserObjectCreate = nullptr;
decltype(&cuUserObjectRelease) p_cuUserObjectRelease = nullptr;
decltype(&cuGraphRetainUserObject) p_cuGraphRetainUserObject = nullptr;

// Linker
decltype(&cuLinkDestroy) p_cuLinkDestroy = nullptr;
Expand Down Expand Up @@ -1114,12 +1120,92 @@ LibraryHandle get_kernel_library(const KernelHandle& h) noexcept {
// ============================================================================

namespace {

// Slot table layout (internal). Each graph maps CUgraphNode -> a fixed-size
// array of type-erased owners. The width is the most any single node needs: a
// kernel node holds its kernel and its packed arguments; a host node holds its
// callback and the userData. The table is heap-allocated and retained on the
// graph as a user object, so the driver frees it -- and every owner in it --
// when the graph is destroyed.
constexpr std::size_t SLOTS_PER_NODE = 2;  // graph_set_slot() rejects slot indices >= this
using NodeSlots = std::array<OpaqueHandle, SLOTS_PER_NODE>;  // one node's owners, indexed by slot
using GraphSlotTable = std::map<CUgraphNode, NodeSlots>;  // per-graph table, keyed by node handle

// shared_ptr deleters for the payloads that need one. Typed handles convert to
// OpaqueHandle by assignment and reuse their own control block, so they need no
// deleter here. The Python deleter follows the owner-release pattern used by
// the stream/deviceptr handles above.
void py_deleter(const void* p) noexcept {
GILAcquireGuard gil;
if (gil.acquired()) {
Py_DECREF(const_cast<PyObject*>(static_cast<const PyObject*>(p)));
}
}

// shared_ptr deleter for handles built by make_opaque_malloc: hands the buffer
// back to std::free. The const is cast away only because OpaqueHandle stores a
// pointer-to-const.
void free_deleter(const void* p) noexcept {
    void* buffer = const_cast<void*>(p);
    std::free(buffer);
}

void destroy_graph_slot_table(void* table) noexcept {
delete static_cast<GraphSlotTable*>(table);
}

// Heap-allocated record behind a GraphHandle. The handle points at `resource`
// (get_box() recovers the enclosing box from that address), so `resource` must
// remain a direct member of this struct.
struct GraphBox {
    CUgraph resource;      // The driver graph this box wraps
    GraphHandle h_parent;  // Keeps parent alive for child/branch graphs
    // Lazily created by ensure_slot_table(). The table itself is owned by the
    // graph's user object, so this is only a non-owning cache; it is mutable
    // because boxes are reached through pointers-to-const.
    mutable GraphSlotTable* slot_table = nullptr;
};

// Recover the GraphBox that contains the CUgraph a handle points at by
// stepping back from the address of its `resource` member. The handle must be
// non-empty; no null check is made here.
const GraphBox* get_box(const GraphHandle& h) {
    const auto* resource_addr = reinterpret_cast<const char*>(h.get());
    const char* box_addr = resource_addr - offsetof(GraphBox, resource);
    return reinterpret_cast<const GraphBox*>(box_addr);
}

// Return box's slot table, creating it on first use. The table is retained on
// the graph as a user object (MOVE transfers our only reference into the
// graph), so it -- and every owner in it -- is freed when the graph is
// destroyed. Returns nullptr if the driver lacks user-object support or a
// driver call fails; the cached pointer is non-owning.
//
// The driver calls run under GILReleaseGuard, so the "is there a table yet?"
// check is repeated afterwards: if another caller installed a table for the
// same graph in the meantime, that table is returned and the one created here
// is left empty (it is still retained on the graph and freed with it).
GraphSlotTable* ensure_slot_table(const GraphBox* box) {
    if (box->slot_table) {
        return box->slot_table;
    }
    if (!p_cuUserObjectCreate || !p_cuGraphRetainUserObject || !p_cuUserObjectRelease) {
        return nullptr;
    }
    auto* table = new GraphSlotTable();
    CUuserObject user_obj = nullptr;
    {
        GILReleaseGuard gil;
        if (p_cuUserObjectCreate(&user_obj, table,
                                 reinterpret_cast<CUhostFn>(destroy_graph_slot_table),
                                 1, CU_USER_OBJECT_NO_DESTRUCTOR_SYNC) != CUDA_SUCCESS) {
            delete table;  // no user object created; nothing else owns the table
            return nullptr;
        }
        if (p_cuGraphRetainUserObject(box->resource, user_obj, 1,
                                      CU_GRAPH_USER_OBJECT_MOVE) != CUDA_SUCCESS) {
            p_cuUserObjectRelease(user_obj, 1);  // drops refcount to 0 -> frees table
            return nullptr;
        }
    }
    // Re-check before publishing: overwriting a table cached by a concurrent
    // caller would hide owners already stored in it from later
    // graph_set_slot() calls, so replacing a slot would no longer release the
    // previous owner.
    // NOTE(review): this relies on callers being serialized outside the guard
    // scope above (presumably by the GIL) -- confirm for free-threaded builds.
    if (box->slot_table) {
        return box->slot_table;
    }
    box->slot_table = table;  // non-owning cache; the user object owns it
    return table;
}

} // namespace

// Wrap a Python object in an OpaqueHandle. A reference is taken immediately
// and handed back by py_deleter when the last copy of the handle goes away.
OpaqueHandle make_opaque_py(PyObject* obj) {
    Py_INCREF(obj);
    const void* erased = obj;
    OpaqueHandle owner(erased, py_deleter);
    return owner;
}

// Wrap a malloc'd buffer in an OpaqueHandle; free_deleter passes it to
// std::free when the last copy of the handle goes away.
OpaqueHandle make_opaque_malloc(void* buf) {
    const void* erased = buf;
    OpaqueHandle owner(erased, free_deleter);
    return owner;
}

GraphHandle create_graph_handle(CUgraph graph) {
auto box = std::shared_ptr<const GraphBox>(
new GraphBox{graph, {}},
Expand All @@ -1137,6 +1223,19 @@ GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent)
return GraphHandle(box, &box->resource);
}

// Store owner in the given slot of node's entry in h_graph's slot table,
// replacing any owner already held there. The table (and the node's entry) is
// created on demand.
//
// Returns CUDA_ERROR_INVALID_VALUE for an empty handle or a slot index outside
// [0, SLOTS_PER_NODE), CUDA_ERROR_NOT_SUPPORTED when no slot table could be
// obtained, and CUDA_SUCCESS otherwise.
CUresult graph_set_slot(const GraphHandle& h_graph, CUgraphNode node,
                        unsigned int slot, OpaqueHandle owner) {
    const bool slot_in_range = slot < SLOTS_PER_NODE;
    if (!(h_graph && slot_in_range)) {
        return CUDA_ERROR_INVALID_VALUE;
    }

    GraphSlotTable* const table = ensure_slot_table(get_box(h_graph));
    if (table == nullptr) {
        return CUDA_ERROR_NOT_SUPPORTED;
    }

    NodeSlots& node_slots = (*table)[node];  // inserts an all-empty entry for a new node
    node_slots[slot] = std::move(owner);
    return CUDA_SUCCESS;
}

// ============================================================================
// Graph Exec Handles
// ============================================================================
Expand Down
34 changes: 34 additions & 0 deletions cuda_core/cuda/core/_cpp/resource_handles.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ extern decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel;
// Graph
// The cuUserObject* / cuGraphRetainUserObject entries are used by the graph
// slot attachment API declared further down in this header.
extern decltype(&cuGraphDestroy) p_cuGraphDestroy;
extern decltype(&cuGraphExecDestroy) p_cuGraphExecDestroy;
extern decltype(&cuUserObjectCreate) p_cuUserObjectCreate;
extern decltype(&cuUserObjectRelease) p_cuUserObjectRelease;
extern decltype(&cuGraphRetainUserObject) p_cuGraphRetainUserObject;

// Linker
extern decltype(&cuLinkDestroy) p_cuLinkDestroy;
Expand Down Expand Up @@ -466,6 +469,37 @@ GraphHandle create_graph_handle(CUgraph graph);
// but h_parent will be prevented from destruction while this handle exists.
GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent);

// ============================================================================
// Graph slot attachments
//
// A graph carries a side table that keeps resources used by its nodes (kernel
// arguments, host callbacks, events, ...) alive for as long as the graph can
// execute. The table is created on first use and retained on the CUgraph as a
// user object, so the driver releases it -- and everything attached through it
// -- when the graph is destroyed. The table layout is an internal detail;
// callers use the abstract API below.
// ============================================================================

// Type-erased shared owner of an attached resource. Typed handles such as
// EventHandle and KernelHandle convert to OpaqueHandle by assignment, reusing
// their existing control block; the helpers below build OpaqueHandles for the
// two cases that need a custom deleter. Because the pointee type is const void,
// a handle gives no typed access to its payload -- it only keeps the payload
// alive and releases it.
using OpaqueHandle = std::shared_ptr<const void>;

// Build an OpaqueHandle from a Python object: increments its refcount now and
// decrements it (under the GIL) on release. The caller must hold the GIL.
// If the GIL cannot be acquired at release time, the decrement is skipped and
// the reference is left in place.
OpaqueHandle make_opaque_py(PyObject* obj);

// Build an OpaqueHandle from a malloc'd buffer: std::free on release. The
// handle takes ownership of buf, so the caller must not free it separately.
OpaqueHandle make_opaque_malloc(void* buf);

// Attach owner to one of node's fixed slots on h_graph, replacing whatever was
// there. The graph's slot table is created on first use. Returns:
//   CUDA_SUCCESS              - owner was stored.
//   CUDA_ERROR_INVALID_VALUE  - h_graph is empty or slot is out of range.
//   CUDA_ERROR_NOT_SUPPORTED  - the graph cannot hold a table (e.g. the driver
//                               lacks user-object support, or creating the
//                               table failed).
CUresult graph_set_slot(const GraphHandle& h_graph, CUgraphNode node,
unsigned int slot, OpaqueHandle owner);

// ============================================================================
// Graph exec handle functions
// ============================================================================
Expand Down
14 changes: 14 additions & 0 deletions cuda_core/cuda/core/_resource_handles.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,13 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":

ctypedef shared_ptr[const cydriver.CUlinkState] CuLinkHandle
ctypedef shared_ptr[const int] FileDescriptorHandle

# Type-erased shared owner for resources attached to graph node slots.
# Typed handles above assign directly to an OpaqueHandle (shared control
# block); make_opaque_py / make_opaque_malloc cover the two cases needing a
# custom deleter.
ctypedef shared_ptr[const void] OpaqueHandle

ctypedef shared_ptr[const cydriver.CUarray] OpaqueArrayHandle
ctypedef shared_ptr[const cydriver.CUmipmappedArray] MipmappedArrayHandle

Expand Down Expand Up @@ -223,6 +230,13 @@ cdef LibraryHandle get_kernel_library(const KernelHandle& h) noexcept nogil
cdef GraphHandle create_graph_handle(cydriver.CUgraph graph) except+ nogil
cdef GraphHandle create_graph_handle_ref(cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil

# Graph slot attachments
# Unlike the neighbouring handle constructors, these are declared without
# `nogil`. make_opaque_py adjusts a Python refcount and must be called with the
# GIL held.
# NOTE(review): make_opaque_malloc and graph_set_slot are also GIL-only here --
# confirm that is intentional.
cdef OpaqueHandle make_opaque_py(object obj) except+
cdef OpaqueHandle make_opaque_malloc(void* buf) except+
cdef cydriver.CUresult graph_set_slot(
const GraphHandle& h_graph, cydriver.CUgraphNode node,
unsigned int slot, OpaqueHandle owner) except+

# Graph exec handles
cdef GraphExecHandle create_graph_exec_handle(cydriver.CUgraphExec graph_exec) except+ nogil

Expand Down
1 change: 1 addition & 0 deletions cuda_core/cuda/core/_resource_handles.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ NvvmProgramHandle = shared_ptr
NvJitLinkHandle = shared_ptr
CuLinkHandle = shared_ptr
FileDescriptorHandle = shared_ptr
OpaqueHandle = shared_ptr
OpaqueArrayHandle = shared_ptr
MipmappedArrayHandle = shared_ptr
TexObjectHandle = shared_ptr
Expand Down
13 changes: 13 additions & 0 deletions cuda_core/cuda/core/_resource_handles.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,12 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
GraphHandle create_graph_handle_ref "cuda_core::create_graph_handle_ref" (
cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil

OpaqueHandle make_opaque_py "cuda_core::make_opaque_py" (object obj) except+
OpaqueHandle make_opaque_malloc "cuda_core::make_opaque_malloc" (void* buf) except+
cydriver.CUresult graph_set_slot "cuda_core::graph_set_slot" (
const GraphHandle& h_graph, cydriver.CUgraphNode node,
unsigned int slot, OpaqueHandle owner) except+

# Graph exec handles
GraphExecHandle create_graph_exec_handle "cuda_core::create_graph_exec_handle" (
cydriver.CUgraphExec graph_exec) except+ nogil
Expand Down Expand Up @@ -304,6 +310,9 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
# Graph
void* p_cuGraphDestroy "reinterpret_cast<void*&>(cuda_core::p_cuGraphDestroy)"
void* p_cuGraphExecDestroy "reinterpret_cast<void*&>(cuda_core::p_cuGraphExecDestroy)"
void* p_cuUserObjectCreate "reinterpret_cast<void*&>(cuda_core::p_cuUserObjectCreate)"
void* p_cuUserObjectRelease "reinterpret_cast<void*&>(cuda_core::p_cuUserObjectRelease)"
void* p_cuGraphRetainUserObject "reinterpret_cast<void*&>(cuda_core::p_cuGraphRetainUserObject)"

# Linker
void* p_cuLinkDestroy "reinterpret_cast<void*&>(cuda_core::p_cuLinkDestroy)"
Expand Down Expand Up @@ -364,6 +373,7 @@ cdef void _init_driver_fn_pointers() noexcept:
global p_cuMemPoolImportPointer
global p_cuLibraryLoadFromFile, p_cuLibraryLoadData, p_cuLibraryUnload, p_cuLibraryGetKernel
global p_cuGraphDestroy, p_cuGraphExecDestroy
global p_cuUserObjectCreate, p_cuUserObjectRelease, p_cuGraphRetainUserObject
global p_cuLinkDestroy
global p_cuGraphicsUnmapResources, p_cuGraphicsUnregisterResource
global p_cuDevSmResourceSplit
Expand Down Expand Up @@ -424,6 +434,9 @@ cdef void _init_driver_fn_pointers() noexcept:
# Graph
p_cuGraphDestroy = _get_driver_fn("cuGraphDestroy")
p_cuGraphExecDestroy = _get_driver_fn("cuGraphExecDestroy")
p_cuUserObjectCreate = _get_driver_fn("cuUserObjectCreate")
p_cuUserObjectRelease = _get_driver_fn("cuUserObjectRelease")
p_cuGraphRetainUserObject = _get_driver_fn("cuGraphRetainUserObject")

# Linker
p_cuLinkDestroy = _get_driver_fn("cuLinkDestroy")
Expand Down
55 changes: 55 additions & 0 deletions cuda_core/cuda/core/_utils/_weak_handles.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_utils/_weak_handles.pyx

"""Test-only weak handles for resource-handle lifetime checks.

This module is **not** part of the public ``cuda.core`` API. It is built into
the package (like other private ``_utils`` modules) purely so the test suite can
observe, deterministically, when the strong references that keep a CUDA resource
alive have all been released -- without relying on driver- or hardware-specific
side effects (for example, whether freed device memory happens to remain
readable).

Every resource handle is owned by a C++ ``std::shared_ptr``. A **weak handle**
is a non-owning ``std::weak_ptr`` observer of that control block: truthy while
some strong owner remains, falsy once the last one is gone. Use :func:`weak_handle`
to obtain a weak handle from a supported front-end object.

To support another type, add a ``cdef _weak_from_<type>`` that reads its ``cdef``
handle field (see ``*.pxd``) and assigns it to an ``OpaqueHandle``, then extend the
``isinstance`` chain in :func:`weak_handle`. Types whose slots hold arbitrary
Python owners via ``make_opaque_py`` are not covered here -- use
:class:`weakref.ref` on a weak-referenceable owner object in tests instead.
"""
from __future__ import annotations


class WeakHandle:
    """Non-owning weak handle for a resource's shared control block.

    Truthy while some strong owner of the underlying resource handle remains,
    falsy once the last strong reference is released. Obtain instances via
    :func:`weak_handle` rather than constructing directly.
    """

    # Truthiness mirrors liveness: True while a strong owner remains.
    def __bool__(self) -> bool:
        ...

    def expired(self) -> bool:
        """Return ``True`` once every strong owner of the handle is gone."""

    def use_count(self) -> int:
        """Number of strong owners currently sharing the handle."""

def weak_handle(obj: object) -> WeakHandle:
    """Return a :class:`WeakHandle` observing the resource behind ``obj``.

    Currently supports :class:`~cuda.core.Buffer` (device allocation handle).
    See the module docstring for how to add more types.

    Raises
    ------
    ValueError
        If ``obj`` is a :class:`~cuda.core.Buffer` with no active allocation.
    TypeError
        If ``obj`` is not a supported type.
    """
Loading
Loading