diff --git a/warp/__init__.py b/warp/__init__.py
index e7a285189e..1d6f7b9a59 100644
--- a/warp/__init__.py
+++ b/warp/__init__.py
@@ -82,6 +82,7 @@
 from warp.context import Kernel, Function, Launch
 from warp.context import Stream, get_stream, set_stream, wait_stream, synchronize_stream
 from warp.context import Event, record_event, wait_event, synchronize_event, get_event_elapsed_time
+from warp.context import ExternalMemoryBuffer, ExternalSemaphore, signal_external_semaphore, wait_external_semaphore
 from warp.context import RegisteredGLBuffer
 from warp.context import is_mempool_supported, is_mempool_enabled, set_mempool_enabled
 from warp.context import (
diff --git a/warp/context.py b/warp/context.py
index ec4af6c9c2..989e045330 100644
--- a/warp/context.py
+++ b/warp/context.py
@@ -2669,6 +2669,36 @@ def wait_event(self, event: Event):
         """
         runtime.core.cuda_stream_wait_event(self.cuda_stream, event.cuda_event)
 
+    def signal_external_semaphore(self, semaphore: "ExternalSemaphore", value: int = 0):
+        """Enqueue a signal operation on an external semaphore in this stream.
+
+        The call is forwarded to the native ``cuda_signal_external_semaphore_async``
+        using the CUDA context of the device that owns ``semaphore``.
+
+        Args:
+            semaphore: The imported external semaphore to signal.
+            value: Fence value handed to CUDA together with the signal operation.
+        """
+
+        runtime.core.cuda_signal_external_semaphore_async(
+            semaphore.device.context, semaphore.external_semaphore, value, self.cuda_stream
+        )
+
+    def wait_external_semaphore(self, semaphore: "ExternalSemaphore", value: int = 0):
+        """Make this stream wait on an external semaphore.
+
+        The call is forwarded to the native ``cuda_wait_external_semaphore_async``
+        using the CUDA context of the device that owns ``semaphore``.
+
+        Args:
+            semaphore: The imported external semaphore to wait on.
+            value: Fence value handed to CUDA together with the wait operation.
+        """
+
+        runtime.core.cuda_wait_external_semaphore_async(
+            semaphore.device.context, semaphore.external_semaphore, value, self.cuda_stream
+        )
+
     def wait_stream(self, other_stream: "Stream", event: Optional[Event] = None):
         """Records an event on `other_stream` and makes this stream wait on it.
 
@@ -3809,6 +3823,50 @@ def __init__(self):
             self.core.cuda_graphics_unregister_resource.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
             self.core.cuda_graphics_unregister_resource.restype = None
 
+            self.core.cuda_import_external_memory.argtypes = [
+                ctypes.c_void_p,
+                ctypes.c_uint,
+                ctypes.c_void_p,
+                ctypes.c_uint64,
+                ctypes.c_uint,
+            ]
+            self.core.cuda_import_external_memory.restype = ctypes.c_void_p
+            self.core.cuda_external_memory_get_mapped_buffer.argtypes = [
+                ctypes.c_void_p,
+                ctypes.c_void_p,
+                ctypes.c_uint64,
+                ctypes.c_uint64,
+                ctypes.c_uint,
+                ctypes.POINTER(ctypes.c_uint64),
+            ]
+            self.core.cuda_external_memory_get_mapped_buffer.restype = None
+            self.core.cuda_destroy_external_memory.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
+            self.core.cuda_destroy_external_memory.restype = None
+
+            self.core.cuda_import_external_semaphore.argtypes = [
+                ctypes.c_void_p,
+                ctypes.c_uint,
+                ctypes.c_void_p,
+                ctypes.c_uint,
+            ]
+            self.core.cuda_import_external_semaphore.restype = ctypes.c_void_p
+            self.core.cuda_destroy_external_semaphore.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
+            self.core.cuda_destroy_external_semaphore.restype = None
+            self.core.cuda_signal_external_semaphore_async.argtypes = [
+                ctypes.c_void_p,
+                ctypes.c_void_p,
+                ctypes.c_uint64,
+                ctypes.c_void_p,
+            ]
+            self.core.cuda_signal_external_semaphore_async.restype = None
+            self.core.cuda_wait_external_semaphore_async.argtypes = [
+                ctypes.c_void_p,
+                ctypes.c_void_p,
+                ctypes.c_uint64,
+                ctypes.c_void_p,
+            ]
+            self.core.cuda_wait_external_semaphore_async.restype = None
+
             self.core.cuda_timing_begin.argtypes = [ctypes.c_int]
             self.core.cuda_timing_begin.restype = None
             self.core.cuda_timing_get_result_count.argtypes = []
@@ -4732,6 +4790,92 @@ def wait_event(event: Event):
     get_stream().wait_event(event)
 
 
+class ExternalSemaphore:
+    """A synchronization object imported into CUDA from an external API.
+
+    The handle is imported when the object is constructed and the resulting CUDA
+    semaphore is destroyed when the object is garbage-collected. Operations are
+    enqueued with :func:`signal_external_semaphore` and
+    :func:`wait_external_semaphore` or the :class:`Stream` methods of the same name.
+    """
+
+    # values of CUDA's ``cudaExternalSemaphoreHandleType`` enum
+    HANDLE_TYPE_OPAQUEFD = 1
+    HANDLE_TYPE_OPAQUEWIN32 = 2
+    HANDLE_TYPE_OPAQUEWIN32KMT = 3
+    HANDLE_TYPE_D3D12FENCE = 4
+    HANDLE_TYPE_D3D11FENCE = 5
+    HANDLE_TYPE_NVSCISYNC = 6
+    HANDLE_TYPE_KEYEDMUTEX = 7
+    HANDLE_TYPE_KEYEDMUTEXKMT = 8
+    HANDLE_TYPE_TIMELINESEMAPHOREFD = 9
+    HANDLE_TYPE_TIMELINESEMAPHOREWIN32 = 10
+
+    # These names were copied from the external *memory* handle types and do not
+    # describe semaphore handles; kept only as backward-compatible aliases.
+    HANDLE_TYPE_D3D12HEAP = 4
+    HANDLE_TYPE_D3D12RESOURCE = 5
+    HANDLE_TYPE_D3D11RESOURCE = 6
+    HANDLE_TYPE_D3D11RESOURCEKMT = 7
+    HANDLE_TYPE_NVSCIBUF = 8
+
+    def __init__(
+        self, handle: Union[ctypes.c_void_p, int], handle_type: int, flags: int = 0, device: Devicelike = None
+    ):
+        """Import an external semaphore handle into CUDA.
+
+        Args:
+            handle: Native handle of the semaphore exported by the other API.
+            handle_type: One of the ``HANDLE_TYPE_*`` constants.
+            flags: Flags forwarded unchanged to the CUDA import call.
+            device: Device whose CUDA context performs the import.
+
+        Raises:
+            RuntimeError: If the native import returns a null semaphore.
+        """
+
+        # defined up front so that __del__ stays safe if anything below raises
+        self.external_semaphore = None
+
+        self.device = get_device(device)
+        self.context = self.device.context
+        self.external_semaphore = runtime.core.cuda_import_external_semaphore(self.context, handle_type, handle, flags)
+        if self.external_semaphore is None:
+            raise RuntimeError(f"Failed to import external semaphore {handle} with CUDA")
+
+    def __del__(self):
+        """Destroy the imported CUDA semaphore, if the import succeeded."""
+
+        if not self.external_semaphore:
+            return
+
+        # use CUDA context guard to avoid side effects during garbage collection
+        with self.device.context_guard:
+            runtime.core.cuda_destroy_external_semaphore(self.context, self.external_semaphore)
+
+
+def signal_external_semaphore(semaphore: ExternalSemaphore, value: int = 0):
+    """Signal an external semaphore on the stream returned by ``get_stream()``.
+
+    Args:
+        semaphore: The imported external semaphore to signal.
+        value: Fence value handed to CUDA together with the signal operation.
+    """
+
+    get_stream().signal_external_semaphore(semaphore, value)
+
+
+def wait_external_semaphore(semaphore: ExternalSemaphore, value: int = 0):
+    """Make the stream returned by ``get_stream()`` wait on an external semaphore.
+
+    Args:
+        semaphore: The imported external semaphore to wait on.
+        value: Fence value handed to CUDA together with the wait operation.
+    """
+
+    get_stream().wait_external_semaphore(semaphore, value)
+
+
 def get_event_elapsed_time(start_event: Event, end_event: Event, synchronize: bool = True):
     """Get the elapsed time between two recorded events.
 
@@ -4772,6 +4876,96 @@ def wait_stream(other_stream: Stream, event: Optional[Event] = None):
     get_stream().wait_stream(other_stream, event=event)
 
 
+class ExternalMemoryBuffer:
+    """A memory allocation imported into CUDA from an external API.
+
+    The handle is imported when the object is constructed, :meth:`map` exposes
+    the memory as a Warp array, and the imported CUDA object is destroyed when
+    this object is garbage-collected.
+    """
+
+    # values of CUDA's ``cudaExternalMemoryHandleType`` enum
+    HANDLE_TYPE_OPAQUEFD = 1
+    HANDLE_TYPE_OPAQUEWIN32 = 2
+    HANDLE_TYPE_OPAQUEWIN32KMT = 3
+    HANDLE_TYPE_D3D12HEAP = 4
+    HANDLE_TYPE_D3D12RESOURCE = 5
+    HANDLE_TYPE_D3D11RESOURCE = 6
+    HANDLE_TYPE_D3D11RESOURCEKMT = 7
+    HANDLE_TYPE_NVSCIBUF = 8
+
+    # value of CUDA's ``cudaExternalMemoryDedicated`` import flag
+    FLAG_DEDICATED = 1
+
+    def __init__(
+        self,
+        handle: Union[ctypes.c_void_p, int],
+        handle_type: int,
+        size: int,
+        flags: int = 0,
+        device: Devicelike = None,
+    ):
+        """Import an external memory handle into CUDA.
+
+        Args:
+            handle: Native handle of the allocation exported by the other API.
+            handle_type: One of the ``HANDLE_TYPE_*`` constants.
+            size: Size of the external allocation in bytes.
+            flags: Import flags, e.g. :attr:`FLAG_DEDICATED`.
+            device: Device whose CUDA context performs the import.
+
+        Raises:
+            RuntimeError: If the native import returns a null object.
+        """
+
+        # defined up front so that __del__ stays safe if anything below raises
+        self.external_memory = None
+
+        self.device = get_device(device)
+        self.context = self.device.context
+        self.external_memory = runtime.core.cuda_import_external_memory(self.context, handle_type, handle, size, flags)
+        self.size = size
+        if self.external_memory is None:
+            raise RuntimeError(f"Failed to import external memory {handle} with CUDA")
+
+    def map(self, dtype: type, shape: Sequence[int]) -> warp.array:
+        """Map the whole imported allocation and wrap it in a Warp array.
+
+        The mapping always covers ``[0, size)``; ``dtype`` and ``shape`` only
+        describe how the array interprets it and are not checked against ``size`` here.
+        Keep this object alive while the array is in use, since its destructor
+        destroys the imported memory.
+
+        Args:
+            dtype: Element type of the returned array.
+            shape: Shape of the returned array.
+
+        Returns:
+            A Warp array on :attr:`device` created from the mapped device pointer.
+
+        Raises:
+            RuntimeError: If no device pointer was produced by the mapping.
+        """
+
+        ptr = ctypes.c_uint64(0)
+        runtime.core.cuda_external_memory_get_mapped_buffer(
+            self.context, self.external_memory, 0, self.size, 0, ctypes.byref(ptr)
+        )
+        if not ptr.value:
+            raise RuntimeError("Failed to map external memory buffer with CUDA")
+        return warp.array(ptr=ptr.value, dtype=dtype, shape=shape, device=self.device, capacity=self.size)
+
+    def __del__(self):
+        """Destroy the imported CUDA memory object, if the import succeeded."""
+
+        if not self.external_memory:
+            return
+
+        # use CUDA context guard to avoid side effects during garbage collection
+        with self.device.context_guard:
+            runtime.core.cuda_destroy_external_memory(self.context, self.external_memory)
+
+
 class RegisteredGLBuffer:
     """
     Helper class to register a GL buffer with CUDA so that it can be mapped to a Warp array.
diff --git a/warp/native/warp.cpp b/warp/native/warp.cpp
index 516a36db42..3b777a2dfb 100644
--- a/warp/native/warp.cpp
+++ b/warp/native/warp.cpp
@@ -1076,6 +1076,15 @@ WP_API void cuda_graphics_device_ptr_and_size(void* context, void* resource, uin
 WP_API void* cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsigned int flags) { return NULL; }
 WP_API void cuda_graphics_unregister_resource(void* context, void* resource) {}
 
+WP_API void* cuda_import_external_memory(void* context, unsigned int type, void* handle, uint64_t size, unsigned int flags) { return NULL; }
+WP_API void cuda_destroy_external_memory(void* context, void* external_memory) {}
+WP_API void cuda_external_memory_get_mapped_buffer(void* context, void* external_memory, uint64_t offset, uint64_t size, unsigned int flags, uint64_t* ptr) {}
+
+WP_API void* cuda_import_external_semaphore(void* context, unsigned int type, void* handle, unsigned int flags) { return NULL; }
+WP_API void cuda_destroy_external_semaphore(void* context, void* external_semaphore) {}
+WP_API void cuda_signal_external_semaphore_async(void* context, void* external_semaphore, uint64_t value, void* stream) {}
+WP_API void cuda_wait_external_semaphore_async(void* context, void* external_semaphore, uint64_t value, void* stream) {}
+
 WP_API void cuda_timing_begin(int flags) {}
 WP_API int cuda_timing_get_result_count() { return 0; }
 WP_API void cuda_timing_end(timing_result_t* results, int size) {}
diff --git a/warp/native/warp.cu b/warp/native/warp.cu
index 19806c25c8..246d1edf70 100644
--- a/warp/native/warp.cu
+++ b/warp/native/warp.cu
@@ -3584,6 +3584,111 @@ void cuda_graphics_unregister_resource(void* context, void* resource)
     delete res;
 }
 
+void* cuda_import_external_memory(void* context, unsigned int type, void* handle, uint64_t size, unsigned int flags)
+{
+    ContextGuard guard(context);
+
+    cudaExternalMemoryHandleDesc desc = {};
+    desc.type = (cudaExternalMemoryHandleType)type;
+    // file-descriptor handles live in the `fd` member of the handle union
+    if (desc.type == cudaExternalMemoryHandleTypeOpaqueFd)
+        desc.handle.fd = (int)(intptr_t)handle;
+    else
+        desc.handle.win32.handle = handle;
+    desc.size = size;
+    desc.flags = flags;
+
+    // heap-allocated so it can be handed back to the caller as an opaque pointer
+    cudaExternalMemory_t *external_memory = new cudaExternalMemory_t;
+    if (!check_cuda(cudaImportExternalMemory(external_memory, &desc)))
+    {
+        delete external_memory;
+        return NULL;
+    }
+
+    return external_memory;
+}
+
+void cuda_external_memory_get_mapped_buffer(void* context, void* external_memory, uint64_t offset, uint64_t size, unsigned int flags, uint64_t* ptr)
+{
+    ContextGuard guard(context);
+
+    cudaExternalMemory_t *memory = (cudaExternalMemory_t*)external_memory;
+    cudaExternalMemoryBufferDesc desc = {};
+    desc.offset = offset;
+    desc.size = size;
+    desc.flags = flags;
+
+    // start from NULL so a failed mapping reports 0 rather than an uninitialized value
+    void* device_ptr = NULL;
+    check_cuda(cudaExternalMemoryGetMappedBuffer(&device_ptr, *memory, &desc));
+    *ptr = (uint64_t)device_ptr;
+}
+
+void cuda_destroy_external_memory(void* context, void* external_memory)
+{
+    ContextGuard guard(context);
+
+    cudaExternalMemory_t *memory = (cudaExternalMemory_t*)external_memory;
+    check_cuda(cudaDestroyExternalMemory(*memory));
+    delete memory;
+}
+
+void* cuda_import_external_semaphore(void* context, unsigned int type, void* handle, unsigned int flags)
+{
+    ContextGuard guard(context);
+
+    cudaExternalSemaphoreHandleDesc desc = {};
+    desc.type = (cudaExternalSemaphoreHandleType)type;
+    // file-descriptor handles live in the `fd` member of the handle union
+    if (desc.type == cudaExternalSemaphoreHandleTypeOpaqueFd)
+        desc.handle.fd = (int)(intptr_t)handle;
+    else
+        desc.handle.win32.handle = handle;
+    desc.flags = flags;
+
+    // heap-allocated so it can be handed back to the caller as an opaque pointer
+    cudaExternalSemaphore_t *external_semaphore = new cudaExternalSemaphore_t;
+    if (!check_cuda(cudaImportExternalSemaphore(external_semaphore, &desc)))
+    {
+        delete external_semaphore;
+        return NULL;
+    }
+
+    return external_semaphore;
+}
+
+void cuda_destroy_external_semaphore(void* context, void* external_semaphore)
+{
+    ContextGuard guard(context);
+
+    cudaExternalSemaphore_t *semaphore = (cudaExternalSemaphore_t*)external_semaphore;
+    check_cuda(cudaDestroyExternalSemaphore(*semaphore));
+    delete semaphore;
+}
+
+void cuda_signal_external_semaphore_async(void* context, void* external_semaphore, uint64_t value, void* stream)
+{
+    ContextGuard guard(context);
+
+    cudaExternalSemaphore_t *semaphore = (cudaExternalSemaphore_t*)external_semaphore;
+    cudaExternalSemaphoreSignalParams params = {};
+    // the value is passed as the fence payload of the signal operation
+    params.params.fence.value = value;
+    check_cuda(cudaSignalExternalSemaphoresAsync(semaphore, &params, 1, static_cast<cudaStream_t>(stream)));
+}
+
+void cuda_wait_external_semaphore_async(void* context, void* external_semaphore, uint64_t value, void* stream)
+{
+    ContextGuard guard(context);
+
+    cudaExternalSemaphore_t *semaphore = (cudaExternalSemaphore_t*)external_semaphore;
+    cudaExternalSemaphoreWaitParams params = {};
+    // the value is passed as the fence payload of the wait operation
+    params.params.fence.value = value;
+    check_cuda(cudaWaitExternalSemaphoresAsync(semaphore, &params, 1, static_cast<cudaStream_t>(stream)));
+}
+
 void cuda_timing_begin(int flags)
 {
     g_cuda_timing_state = new CudaTimingState(flags, g_cuda_timing_state);
diff --git a/warp/native/warp.h b/warp/native/warp.h
index ec61c8267d..65a6160b48 100644
--- a/warp/native/warp.h
+++ b/warp/native/warp.h
@@ -370,6 +370,16 @@ extern "C"
     WP_API void* cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsigned int flags);
     WP_API void cuda_graphics_unregister_resource(void* context, void* resource);
 
+    // external resource interoperability (imported memory and semaphores)
+    WP_API void* cuda_import_external_memory(void* context, unsigned int type, void* handle, uint64_t size, unsigned int flags);
+    WP_API void cuda_destroy_external_memory(void* context, void* external_memory);
+    WP_API void cuda_external_memory_get_mapped_buffer(void* context, void* external_memory, uint64_t offset, uint64_t size, unsigned int flags, uint64_t* ptr);
+
+    WP_API void* cuda_import_external_semaphore(void* context, unsigned int type, void* handle, unsigned int flags);
+    WP_API void cuda_destroy_external_semaphore(void* context, void* external_semaphore);
+    WP_API void cuda_signal_external_semaphore_async(void* context, void* external_semaphore, uint64_t value, void* stream);
+    WP_API void cuda_wait_external_semaphore_async(void* context, void* external_semaphore, uint64_t value, void* stream);
+
     // CUDA timing
     WP_API void cuda_timing_begin(int flags);
     WP_API int cuda_timing_get_result_count();