Skip to content

Commit 52d6f75

Browse files
committed
Add RAII infrastructure for explicit graph construction
Phase 1 groundwork for explicit CUDA graph construction (issue #1317): - Add HandleRegistry template for reverse-lookup of CUDA handles back to their owning shared_ptr (via weak_ptr), enabling reconstruction of Python objects from driver-returned handles. - Extend EventBox with metadata fields (timing_disabled, busy_waited, ipc_enabled, device_id, context) accessed via get_box() pointer arithmetic, replacing cached Python-level fields. - Add event and kernel reverse-lookup registries for handle recovery. - Add Event.from_handle() and Kernel reverse-lookup integration with library-mismatch warning. - Convert _graph.py to _graph/ package (rename only, no content changes). Closes #1317 (partial) Made-with: Cursor
1 parent 188d3d6 commit 52d6f75

File tree

9 files changed

+306
-115
lines changed

9 files changed

+306
-115
lines changed

cuda_core/cuda/core/_cpp/resource_handles.cpp

Lines changed: 136 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,6 @@ NvJitLinkDestroyFn p_nvJitLinkDestroy = nullptr;
7777

7878
namespace {
7979

80-
using cuda_core::detail::py_is_finalizing;
81-
8280
// Helper to release the GIL while calling into the CUDA driver.
8381
// This guard is *conditional*: if the caller already dropped the GIL,
8482
// we avoid calling PyEval_SaveThread (which requires holding the GIL).
@@ -148,6 +146,51 @@ class GILAcquireGuard {
148146

149147
} // namespace
150148

149+
// ============================================================================
150+
// Handle reverse-lookup registry
151+
//
152+
// Maps raw CUDA handles (CUevent, CUkernel, etc.) back to their owning
153+
// shared_ptr so that _ref constructors can recover full metadata.
154+
// Uses weak_ptr to avoid preventing destruction.
155+
// ============================================================================
156+
157+
// Mutex-protected map from a raw key to a weak reference on the shared_ptr that
// owns it. Entries never extend the lifetime of the owner: the map stores
// std::weak_ptr<Handle::element_type>, and stale entries are dropped either by
// unregister_handle() or lazily by lookup().
template<typename Key, typename Handle, typename Hash = std::hash<Key>>
class HandleRegistry {
public:
    using MapType = std::unordered_map<Key, std::weak_ptr<typename Handle::element_type>, Hash>;

    // Record `h` as the owner of `key`, replacing any previous entry for that key.
    void register_handle(const Key& key, const Handle& h) {
        const std::lock_guard<std::mutex> guard(mutex_);
        map_[key] = h;
    }

    // Drop the entry for `key`, but only when its owner is already gone.
    // A live entry is left untouched so that a newer registration under the
    // same key is not removed by an older owner's cleanup.
    // Never throws: any failure is swallowed because cleanup is best-effort.
    void unregister_handle(const Key& key) noexcept {
        try {
            const std::lock_guard<std::mutex> guard(mutex_);
            const auto entry = map_.find(key);
            if (entry == map_.end()) {
                return;
            }
            if (!entry->second.expired()) {
                return;
            }
            map_.erase(entry);
        } catch (...) {}
    }

    // Return the live owner registered for `key`, or an empty handle when the
    // key is unknown or its owner has expired (the stale entry is erased).
    Handle lookup(const Key& key) {
        const std::lock_guard<std::mutex> guard(mutex_);
        const auto entry = map_.find(key);
        if (entry == map_.end()) {
            return {};
        }
        if (auto live = entry->second.lock()) {
            return live;
        }
        map_.erase(entry);
        return {};
    }

private:
    std::mutex mutex_;  // guards map_
    MapType map_;
};
193+
151194
// ============================================================================
152195
// Thread-local error handling
153196
// ============================================================================
@@ -306,47 +349,98 @@ StreamHandle get_per_thread_stream() {
306349
namespace {
307350
struct EventBox {
308351
CUevent resource;
352+
bool timing_disabled;
353+
bool busy_waited;
354+
bool ipc_enabled;
355+
int device_id;
356+
ContextHandle h_context;
309357
};
310358
} // namespace
311359

312-
EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags) {
360+
// Recover the EventBox that contains the CUevent an EventHandle points at.
// The event constructors in this file build handles that alias &box->resource,
// so stepping back by offsetof(EventBox, resource) lands on the box itself.
// No empty-handle check is done here; callers test the handle first.
static const EventBox* get_box(const EventHandle& h) {
    const CUevent* resource_ptr = h.get();
    const char* box_bytes =
        reinterpret_cast<const char*>(resource_ptr) - offsetof(EventBox, resource);
    return reinterpret_cast<const EventBox*>(box_bytes);
}
366+
367+
bool get_event_timing_disabled(const EventHandle& h) noexcept {
368+
return h ? get_box(h)->timing_disabled : true;
369+
}
370+
371+
bool get_event_busy_waited(const EventHandle& h) noexcept {
372+
return h ? get_box(h)->busy_waited : false;
373+
}
374+
375+
bool get_event_ipc_enabled(const EventHandle& h) noexcept {
376+
return h ? get_box(h)->ipc_enabled : false;
377+
}
378+
379+
int get_event_device_id(const EventHandle& h) noexcept {
380+
return h ? get_box(h)->device_id : -1;
381+
}
382+
383+
// Copy of the context handle stored with the event; an empty event handle
// yields an empty ContextHandle.
ContextHandle get_event_context(const EventHandle& h) noexcept {
    if (!h) {
        return ContextHandle{};
    }
    return get_box(h)->h_context;
}
386+
387+
static HandleRegistry<CUevent, EventHandle> event_registry;
388+
389+
// Create an event with cuEventCreate and wrap it in an owning EventHandle.
//
// h_ctx            context handle stored in the box and captured by the deleter
// flags            passed unchanged to cuEventCreate
// timing_disabled, busy_waited, ipc_enabled, device_id
//                  metadata stored in the EventBox for the get_event_* accessors
//
// Returns an empty handle when cuEventCreate fails (the result code is left in
// `err`). On success the handle is recorded in event_registry so that
// create_event_handle_ref() can find it again from the raw CUevent.
EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags,
                                bool timing_disabled, bool busy_waited,
                                bool ipc_enabled, int device_id) {
    GILReleaseGuard gil;
    CUevent event;
    err = p_cuEventCreate(&event, flags);
    if (err != CUDA_SUCCESS) {
        return {};
    }

    std::shared_ptr<const EventBox> box(
        new EventBox{event, timing_disabled, busy_waited, ipc_enabled, device_id, h_ctx},
        // The deleter keeps its own copy of h_ctx, so the context handle is
        // still held while the event is destroyed.
        [h_ctx](const EventBox* b) {
            // Drop the registry entry before the driver handle is destroyed.
            event_registry.unregister_handle(b->resource);
            GILReleaseGuard gil;
            p_cuEventDestroy(b->resource);
            delete b;
        }
    );
    // Aliasing handle: shares ownership of the box, points at the CUevent inside it.
    EventHandle owning(box, &box->resource);
    event_registry.register_handle(event, owning);
    return owning;
}
329411

330412
// Create an owning event handle with an empty context handle.
// Metadata is fixed to the "unknown" defaults: timing_disabled=true,
// busy_waited=false, ipc_enabled=false, device_id=-1.
EventHandle create_event_handle_noctx(unsigned int flags) {
    return create_event_handle(ContextHandle{}, flags,
                               /*timing_disabled=*/true,
                               /*busy_waited=*/false,
                               /*ipc_enabled=*/false,
                               /*device_id=*/-1);
}
333415

334-
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) {
416+
// Wrap a raw CUevent in an EventHandle.
// If the event is still registered by one of the owning constructors, the
// existing handle (with its stored metadata) is returned. Otherwise a fresh box
// with default metadata is created; that box has no custom deleter, so
// releasing the handle does not call cuEventDestroy.
EventHandle create_event_handle_ref(CUevent event) {
    EventHandle managed = event_registry.lookup(event);
    if (managed) {
        return managed;
    }
    auto borrowed = std::make_shared<const EventBox>(EventBox{event, true, false, false, -1, {}});
    return EventHandle(borrowed, &borrowed->resource);
}
423+
424+
// Open an event exported by another process (cuIpcOpenEventHandle) and wrap it
// in an owning EventHandle.
//
// The box is created with timing_disabled=true, ipc_enabled=true, device_id=-1
// and an empty context handle; only busy_waited comes from the caller.
// Returns an empty handle when the driver call fails (the result code is left
// in `err`). On success the handle is recorded in event_registry.
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle,
                                    bool busy_waited) {
    GILReleaseGuard gil;
    CUevent event;
    err = p_cuIpcOpenEventHandle(&event, ipc_handle);
    if (err != CUDA_SUCCESS) {
        return {};
    }

    std::shared_ptr<const EventBox> box(
        new EventBox{event, true, busy_waited, true, -1, {}},
        [](const EventBox* b) {
            // Drop the registry entry before the driver handle is destroyed.
            event_registry.unregister_handle(b->resource);
            GILReleaseGuard gil;
            p_cuEventDestroy(b->resource);
            delete b;
        }
    );
    // Aliasing handle: shares ownership of the box, points at the CUevent inside it.
    EventHandle owning(box, &box->resource);
    event_registry.register_handle(event, owning);
    return owning;
}
351445

352446
// ============================================================================
@@ -653,61 +747,43 @@ struct ExportDataKeyHash {
653747

654748
}
655749

656-
static std::mutex ipc_ptr_cache_mutex;
657-
static std::unordered_map<ExportDataKey, std::weak_ptr<DevicePtrBox>, ExportDataKeyHash> ipc_ptr_cache;
750+
static HandleRegistry<ExportDataKey, DevicePtrHandle, ExportDataKeyHash> ipc_ptr_cache;
751+
static std::mutex ipc_import_mutex;
658752

659753
DevicePtrHandle deviceptr_import_ipc(const MemoryPoolHandle& h_pool, const void* export_data, const StreamHandle& h_stream) {
660754
auto data = const_cast<CUmemPoolPtrExportData*>(
661755
reinterpret_cast<const CUmemPoolPtrExportData*>(export_data));
662756

663757
if (use_ipc_ptr_cache()) {
664-
// Check cache before calling cuMemPoolImportPointer
665758
ExportDataKey key;
666759
std::memcpy(&key.data, data, sizeof(key.data));
667760

668-
std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
761+
std::lock_guard<std::mutex> lock(ipc_import_mutex);
669762

670-
auto it = ipc_ptr_cache.find(key);
671-
if (it != ipc_ptr_cache.end()) {
672-
if (auto box = it->second.lock()) {
673-
// Cache hit - return existing handle
674-
return DevicePtrHandle(box, &box->resource);
675-
}
676-
ipc_ptr_cache.erase(it); // Expired entry
763+
if (auto h = ipc_ptr_cache.lookup(key)) {
764+
return h;
677765
}
678766

679-
// Cache miss - import the pointer
680767
GILReleaseGuard gil;
681768
CUdeviceptr ptr;
682769
if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
683770
return {};
684771
}
685772

686-
// Create new handle with cache-clearing deleter
687773
auto box = std::shared_ptr<DevicePtrBox>(
688774
new DevicePtrBox{ptr, h_stream},
689775
[h_pool, key](DevicePtrBox* b) {
776+
ipc_ptr_cache.unregister_handle(key);
690777
GILReleaseGuard gil;
691-
try {
692-
std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
693-
// Only erase if expired - avoids race where another thread
694-
// replaced the entry with a new import before we acquired the lock.
695-
auto it = ipc_ptr_cache.find(key);
696-
if (it != ipc_ptr_cache.end() && it->second.expired()) {
697-
ipc_ptr_cache.erase(it);
698-
}
699-
} catch (...) {
700-
// Cache cleanup is best-effort - swallow exceptions in destructor context
701-
}
702778
p_cuMemFreeAsync(b->resource, as_cu(b->h_stream));
703779
delete b;
704780
}
705781
);
706-
ipc_ptr_cache[key] = box;
707-
return DevicePtrHandle(box, &box->resource);
782+
DevicePtrHandle h(box, &box->resource);
783+
ipc_ptr_cache.register_handle(key, h);
784+
return h;
708785

709786
} else {
710-
// No caching - simple handle creation
711787
GILReleaseGuard gil;
712788
CUdeviceptr ptr;
713789
if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
@@ -786,25 +862,45 @@ LibraryHandle create_library_handle_ref(CUlibrary library) {
786862
namespace {
787863
struct KernelBox {
788864
CUkernel resource;
789-
LibraryHandle h_library; // Keeps library alive
865+
LibraryHandle h_library;
790866
};
791867
} // namespace
792868

869+
// Recover the KernelBox that contains the CUkernel a KernelHandle points at.
// The kernel constructors in this file build handles that alias &box->resource,
// so stepping back by offsetof(KernelBox, resource) lands on the box itself.
// No empty-handle check is done here; callers test the handle first.
static const KernelBox* get_box(const KernelHandle& h) {
    const CUkernel* resource_ptr = h.get();
    const char* box_bytes =
        reinterpret_cast<const char*>(resource_ptr) - offsetof(KernelBox, resource);
    return reinterpret_cast<const KernelBox*>(box_bytes);
}
875+
876+
static HandleRegistry<CUkernel, KernelHandle> kernel_registry;
877+
793878
// Look up the kernel called `name` in `h_library` (cuLibraryGetKernel) and wrap
// it in a KernelHandle whose box also stores `h_library`.
//
// Returns an empty handle when the driver call fails (the result code is left
// in `err`). On success the handle is recorded in kernel_registry so that
// create_kernel_handle_ref() can recover it, including its library, from the
// raw CUkernel.
//
// The box uses a custom deleter that removes the registry entry when the last
// handle goes away, mirroring the event handles. Without it the expired
// weak_ptr would stay in the registry, and keep the box's control block
// allocated, until the same CUkernel happened to be looked up or registered
// again. unregister_handle() only erases expired entries, so a newer handle
// registered under the same CUkernel is not disturbed.
KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name) {
    GILReleaseGuard gil;
    CUkernel kernel;
    if (CUDA_SUCCESS != (err = p_cuLibraryGetKernel(&kernel, *h_library, name))) {
        return {};
    }

    auto box = std::shared_ptr<const KernelBox>(
        new KernelBox{kernel, h_library},
        [](const KernelBox* b) {
            // Unregister first, then release the box (and with it h_library).
            kernel_registry.unregister_handle(b->resource);
            delete b;
        }
    );
    KernelHandle h(box, &box->resource);
    kernel_registry.register_handle(kernel, h);
    return h;
}
}
802890

803-
KernelHandle create_kernel_handle_ref(CUkernel kernel, const LibraryHandle& h_library) {
804-
auto box = std::make_shared<const KernelBox>(KernelBox{kernel, h_library});
891+
// Wrap a raw CUkernel in a KernelHandle.
// If the kernel is still registered by create_kernel_handle(), the existing
// handle (with its library handle) is returned. Otherwise a fresh box with an
// empty library handle is created.
KernelHandle create_kernel_handle_ref(CUkernel kernel) {
    KernelHandle managed = kernel_registry.lookup(kernel);
    if (managed) {
        return managed;
    }
    auto borrowed = std::make_shared<const KernelBox>(KernelBox{kernel, {}});
    return KernelHandle(borrowed, &borrowed->resource);
}
807898

899+
// Copy of the library handle stored in the kernel's box. Returns an empty
// handle when `h` itself is empty; the stored handle may also be empty.
LibraryHandle get_kernel_library(const KernelHandle& h) noexcept {
    return h ? get_box(h)->h_library : LibraryHandle{};
}
903+
808904
// ============================================================================
809905
// Graphics Resource Handles
810906
// ============================================================================

cuda_core/cuda/core/_cpp/resource_handles.hpp

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -200,9 +200,12 @@ StreamHandle get_per_thread_stream();
200200

201201
// Create an owning event handle by calling cuEventCreate.
202202
// The event structurally depends on the provided context handle.
203+
// Metadata fields are stored in the EventBox for later retrieval.
203204
// When the last reference is released, cuEventDestroy is called automatically.
204205
// Returns empty handle on error (caller must check).
205-
EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags);
206+
EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags,
207+
bool timing_disabled, bool busy_waited,
208+
bool ipc_enabled, int device_id);
206209

207210
// Create an owning event handle without context dependency.
208211
// Use for temporary events that are created and destroyed in the same scope.
@@ -214,7 +217,21 @@ EventHandle create_event_handle_noctx(unsigned int flags);
214217
// The originating process owns the event and its context.
215218
// When the last reference is released, cuEventDestroy is called automatically.
216219
// Returns empty handle on error (caller must check).
217-
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle);
220+
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle,
221+
bool busy_waited);
222+
223+
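// Create an event handle from a raw CUevent: returns the registered owning handle (with its metadata) if the event is already managed, otherwise a non-owning reference to the existing event.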
224+
// Use for events that are managed by the CUDA graph or another owner.
225+
// A non-owning reference will NOT destroy the event when the handle is released.
226+
// Metadata of a non-owning reference defaults to unknown (timing_disabled=true, busy_waited=false, ipc_enabled=false, device_id=-1, empty context).
227+
EventHandle create_event_handle_ref(CUevent event);
228+
229+
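// Event metadata accessors (read from EventBox via pointer arithmetic). An empty handle yields timing_disabled=true, busy_waited=false, ipc_enabled=false, device_id=-1 and an empty context.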
230+
bool get_event_timing_disabled(const EventHandle& h) noexcept;
231+
bool get_event_busy_waited(const EventHandle& h) noexcept;
232+
bool get_event_ipc_enabled(const EventHandle& h) noexcept;
233+
int get_event_device_id(const EventHandle& h) noexcept;
234+
ContextHandle get_event_context(const EventHandle& h) noexcept;
218235

219236
// ============================================================================
220237
// Memory pool handle functions
@@ -345,9 +362,14 @@ LibraryHandle create_library_handle_ref(CUlibrary library);
345362
// Returns empty handle on error (caller must check).
346363
KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name);
347364

348-
// Create a non-owning kernel handle with library dependency.
349-
// Use for borrowed kernels. The library handle keeps the library alive.
350-
KernelHandle create_kernel_handle_ref(CUkernel kernel, const LibraryHandle& h_library);
365+
// Create a kernel handle from a raw CUkernel.
366+
// If the kernel is already managed (in the registry), returns the owning
367+
// handle with library dependency. Otherwise returns a non-owning ref.
368+
KernelHandle create_kernel_handle_ref(CUkernel kernel);
369+
370+
// Get the library handle associated with a kernel (from KernelBox).
371+
// Returns empty handle if the kernel handle is empty or the kernel has no library dependency.
372+
LibraryHandle get_kernel_library(const KernelHandle& h) noexcept;
351373

352374
// ============================================================================
353375
// Graphics resource handle functions
@@ -516,8 +538,6 @@ inline std::intptr_t as_intptr(const CuLinkHandle& h) noexcept {
516538
}
517539

518540
// as_py() - convert handle to Python wrapper object (returns new reference)
519-
namespace detail {
520-
521541
#if PY_VERSION_HEX < 0x030D0000
522542
extern "C" int _Py_IsFinalizing(void);
523543
#endif
@@ -530,6 +550,7 @@ inline bool py_is_finalizing() noexcept {
530550
#endif
531551
}
532552

553+
namespace detail {
533554
// n.b. class lookup is not cached to avoid deadlock hazard, see DESIGN.md
534555
inline PyObject* make_py(const char* module_name, const char* class_name, std::intptr_t value) noexcept {
535556
if (py_is_finalizing()) {

cuda_core/cuda/core/_event.pxd

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,13 @@ cdef class Event:
1010

1111
cdef:
1212
EventHandle _h_event
13-
ContextHandle _h_context
14-
bint _timing_disabled
15-
bint _busy_waited
16-
bint _ipc_enabled
1713
object _ipc_descriptor
18-
int _device_id
1914
object __weakref__
2015

2116
@staticmethod
2217
cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free)
2318

19+
@staticmethod
20+
cdef Event _from_handle(EventHandle h_event)
21+
2422
cpdef close(self)

0 commit comments

Comments
 (0)