Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions warp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
from warp.context import Kernel, Function, Launch
from warp.context import Stream, get_stream, set_stream, wait_stream, synchronize_stream
from warp.context import Event, record_event, wait_event, synchronize_event, get_event_elapsed_time
from warp.context import ExternalMemoryBuffer, ExternalSemaphore, signal_external_semaphore, wait_external_semaphore
from warp.context import RegisteredGLBuffer
from warp.context import is_mempool_supported, is_mempool_enabled, set_mempool_enabled
from warp.context import (
Expand Down
155 changes: 155 additions & 0 deletions warp/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -2669,6 +2669,20 @@ def wait_event(self, event: Event):
"""
runtime.core.cuda_stream_wait_event(self.cuda_stream, event.cuda_event)

def signal_external_semaphore(self, semaphore: "ExternalSemaphore", value: int = 0):
    """Enqueue a signal operation for an external semaphore on this stream.

    Args:
        semaphore: The imported external semaphore to signal.
        value: Fence value forwarded to the native signal call (defaults to 0).
    """

    # The annotation is quoted because ``ExternalSemaphore`` is defined later in
    # the module than this method; an unquoted name would be looked up when the
    # method is defined.
    runtime.core.cuda_signal_external_semaphore_async(
        semaphore.device.context, semaphore.external_semaphore, value, self.cuda_stream
    )

def wait_external_semaphore(self, semaphore: "ExternalSemaphore", value: int = 0):
    """Enqueue a wait operation on an external semaphore on this stream.

    Args:
        semaphore: The imported external semaphore to wait on.
        value: Fence value forwarded to the native wait call (defaults to 0).
    """

    # The annotation is quoted because ``ExternalSemaphore`` is defined later in
    # the module than this method; an unquoted name would be looked up when the
    # method is defined.
    runtime.core.cuda_wait_external_semaphore_async(
        semaphore.device.context, semaphore.external_semaphore, value, self.cuda_stream
    )

def wait_stream(self, other_stream: "Stream", event: Optional[Event] = None):
"""Records an event on `other_stream` and makes this stream wait on it.

Expand Down Expand Up @@ -3809,6 +3823,50 @@ def __init__(self):
self.core.cuda_graphics_unregister_resource.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
self.core.cuda_graphics_unregister_resource.restype = None

self.core.cuda_import_external_memory.argtypes = [
ctypes.c_void_p,
ctypes.c_uint,
ctypes.c_void_p,
ctypes.c_uint64,
ctypes.c_uint,
]
self.core.cuda_import_external_memory.restype = ctypes.c_void_p
self.core.cuda_external_memory_get_mapped_buffer.argtypes = [
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_uint64,
ctypes.c_uint64,
ctypes.c_uint,
ctypes.POINTER(ctypes.c_uint64),
]
self.core.cuda_external_memory_get_mapped_buffer.restype = None
self.core.cuda_destroy_external_memory.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
self.core.cuda_destroy_external_memory.restype = None

self.core.cuda_import_external_semaphore.argtypes = [
ctypes.c_void_p,
ctypes.c_uint,
ctypes.c_void_p,
ctypes.c_uint,
]
self.core.cuda_import_external_semaphore.restype = ctypes.c_void_p
self.core.cuda_destroy_external_semaphore.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
self.core.cuda_destroy_external_semaphore.restype = None
self.core.cuda_signal_external_semaphore_async.argtypes = [
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_uint64,
ctypes.c_void_p,
]
self.core.cuda_signal_external_semaphore_async.restype = None
self.core.cuda_wait_external_semaphore_async.argtypes = [
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_uint64,
ctypes.c_void_p,
]
self.core.cuda_wait_external_semaphore_async.restype = None

self.core.cuda_timing_begin.argtypes = [ctypes.c_int]
self.core.cuda_timing_begin.restype = None
self.core.cuda_timing_get_result_count.argtypes = []
Expand Down Expand Up @@ -4732,6 +4790,52 @@ def wait_event(event: Event):
get_stream().wait_event(event)


class ExternalSemaphore:
    """A semaphore created by another API and imported into CUDA.

    The semaphore is imported when the object is constructed and destroyed when
    the object is garbage collected. Instances can be passed to
    :func:`signal_external_semaphore` and :func:`wait_external_semaphore`.
    """

    # Handle types; the values mirror CUDA's ``cudaExternalSemaphoreHandleType``.
    HANDLE_TYPE_OPAQUEFD = 1
    HANDLE_TYPE_OPAQUEWIN32 = 2
    HANDLE_TYPE_OPAQUEWIN32KMT = 3
    HANDLE_TYPE_D3D12FENCE = 4
    HANDLE_TYPE_D3D11FENCE = 5
    HANDLE_TYPE_NVSCISYNC = 6
    HANDLE_TYPE_KEYEDMUTEX = 7
    HANDLE_TYPE_KEYEDMUTEXKMT = 8
    HANDLE_TYPE_TIMELINESEMAPHOREFD = 9
    HANDLE_TYPE_TIMELINESEMAPHOREWIN32 = 10

    # Legacy aliases. These names describe external *memory* handle types and do
    # not match the semaphore handle type that each value selects; they are kept
    # with their original values only so that existing callers keep working.
    HANDLE_TYPE_D3D12HEAP = 4
    HANDLE_TYPE_D3D12RESOURCE = 5
    HANDLE_TYPE_D3D11RESOURCE = 6
    HANDLE_TYPE_D3D11RESOURCEKMT = 7
    HANDLE_TYPE_NVSCIBUF = 8

    def __init__(
        self, handle: Union[ctypes.c_void_p, int], handle_type: int, flags: int = 0, device: Devicelike = None
    ):
        """Import an external semaphore on the given device.

        Args:
            handle: The exported semaphore handle, as a pointer or an integer.
            handle_type: One of the ``HANDLE_TYPE_*`` constants describing ``handle``.
            flags: Flags forwarded unchanged to the native import call.
            device: The device to import the semaphore on, resolved with ``get_device()``.

        Raises:
            RuntimeError: If the native import call does not return a semaphore.
        """

        # assign first so that __del__ is safe even if a statement below raises
        self.external_semaphore = None

        self.device = get_device(device)
        self.context = self.device.context
        self.external_semaphore = runtime.core.cuda_import_external_semaphore(self.context, handle_type, handle, flags)
        if not self.external_semaphore:
            raise RuntimeError(f"Failed to import external semaphore {handle} with CUDA")

    def __del__(self):
        """Destroy the imported semaphore, if the import succeeded."""

        # getattr() guards against a partially constructed instance
        if not getattr(self, "external_semaphore", None):
            return

        # use CUDA context guard to avoid side effects during garbage collection
        with self.device.context_guard:
            runtime.core.cuda_destroy_external_semaphore(self.context, self.external_semaphore)


def signal_external_semaphore(semaphore: ExternalSemaphore, value: int = 0):
    """Signal an external semaphore on the stream returned by ``get_stream()``.

    Args:
        semaphore: The external semaphore to signal.
        value: Value passed through to the stream's ``signal_external_semaphore()``.

    Returns:
        Whatever the stream's ``signal_external_semaphore()`` method returns.
    """

    current_stream = get_stream()
    return current_stream.signal_external_semaphore(semaphore, value)


def wait_external_semaphore(semaphore: ExternalSemaphore, value: int = 0):
    """Make the stream returned by ``get_stream()`` wait on an external semaphore.

    Args:
        semaphore: The external semaphore to wait on.
        value: Value passed through to the stream's ``wait_external_semaphore()``.
    """

    current_stream = get_stream()
    current_stream.wait_external_semaphore(semaphore, value)


def get_event_elapsed_time(start_event: Event, end_event: Event, synchronize: bool = True):
"""Get the elapsed time between two recorded events.

Expand Down Expand Up @@ -4772,6 +4876,57 @@ def wait_stream(other_stream: Stream, event: Optional[Event] = None):
get_stream().wait_stream(other_stream, event=event)


class ExternalMemoryBuffer:
    """A memory allocation created by another API and imported into CUDA.

    The allocation is imported when the object is constructed and released when
    the object is garbage collected. Use :meth:`map` to view it as a Warp array.
    """

    # Handle types; the values mirror CUDA's ``cudaExternalMemoryHandleType``.
    HANDLE_TYPE_OPAQUEFD = 1
    HANDLE_TYPE_OPAQUEWIN32 = 2
    HANDLE_TYPE_OPAQUEWIN32KMT = 3
    HANDLE_TYPE_D3D12HEAP = 4
    HANDLE_TYPE_D3D12RESOURCE = 5
    HANDLE_TYPE_D3D11RESOURCE = 6
    HANDLE_TYPE_D3D11RESOURCEKMT = 7
    HANDLE_TYPE_NVSCIBUF = 8

    # Import flag; the value mirrors CUDA's ``cudaExternalMemoryDedicated``.
    FLAG_DEDICATED = 1

    def __init__(
        self,
        handle: Union[ctypes.c_void_p, int],
        handle_type: int,
        size: int,
        flags: int = 0,
        device: Devicelike = None,
    ):
        """Import an external memory allocation on the given device.

        Args:
            handle: The exported memory handle, as a pointer or an integer.
            handle_type: One of the ``HANDLE_TYPE_*`` constants describing ``handle``.
            size: Size of the allocation in bytes.
            flags: Import flags (for example ``FLAG_DEDICATED``), forwarded unchanged.
            device: The device to import the memory on, resolved with ``get_device()``.

        Raises:
            RuntimeError: If the native import call does not return a memory object.
        """

        # assign first so that __del__ is safe even if a statement below raises
        self.external_memory = None
        self.size = size

        self.device = get_device(device)
        self.context = self.device.context
        self.external_memory = runtime.core.cuda_import_external_memory(self.context, handle_type, handle, size, flags)
        if not self.external_memory:
            raise RuntimeError(f"Failed to import external memory {handle} with CUDA")

    def map(self, dtype: type, shape: Sequence[int]) -> warp.array:
        """Map the whole imported allocation and wrap it in a Warp array.

        The mapping always starts at offset 0 and spans ``self.size`` bytes.
        ``dtype`` and ``shape`` are passed to the array constructor as given;
        they are not checked against the allocation size here.

        Args:
            dtype: Element type of the returned array.
            shape: Shape of the returned array.

        Returns:
            A :class:`warp.array` on this buffer's device that aliases the mapped memory.

        Raises:
            RuntimeError: If the native call leaves the device pointer null.
        """

        ptr = ctypes.c_uint64(0)
        runtime.core.cuda_external_memory_get_mapped_buffer(
            self.context, self.external_memory, 0, self.size, 0, ctypes.byref(ptr)
        )
        # a null pointer means the mapping failed; don't wrap it in an array
        if not ptr.value:
            raise RuntimeError("Failed to map external memory buffer with CUDA")

        return warp.array(ptr=ptr.value, dtype=dtype, shape=shape, device=self.device, capacity=self.size)

    def __del__(self):
        """Release the imported memory object, if the import succeeded."""

        # getattr() guards against a partially constructed instance
        if not getattr(self, "external_memory", None):
            return

        # use CUDA context guard to avoid side effects during garbage collection
        with self.device.context_guard:
            runtime.core.cuda_destroy_external_memory(self.context, self.external_memory)


class RegisteredGLBuffer:
"""
Helper class to register a GL buffer with CUDA so that it can be mapped to a Warp array.
Expand Down
9 changes: 9 additions & 0 deletions warp/native/warp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1076,6 +1076,15 @@ WP_API void cuda_graphics_device_ptr_and_size(void* context, void* resource, uin
// Stub: registers nothing and always returns NULL.
WP_API void* cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsigned int flags)
{
    return NULL;
}
// No-op stub: the context and resource arguments are ignored.
WP_API void cuda_graphics_unregister_resource(void* context, void* resource)
{
}

// Stub: imports nothing and always returns NULL, which callers treat as failure.
// NOTE(review): presumably the variant built when CUDA support is disabled; confirm
// against the enclosing preprocessor guard.
WP_API void* cuda_import_external_memory(void* context, unsigned int type, void* handle, uint64_t size, unsigned int flags) { return NULL; }
// No-op stub: the context and external memory arguments are ignored.
WP_API void cuda_destroy_external_memory(void* context, void* external_memory)
{
}
// No-op stub: nothing is mapped and *ptr is left untouched.
WP_API void cuda_external_memory_get_mapped_buffer(void* context, void* external_memory, uint64_t offset, uint64_t size, unsigned int flags, uint64_t* ptr)
{
}

// Stub: imports nothing and always returns NULL, which callers treat as failure.
// NOTE(review): presumably the variant built when CUDA support is disabled; confirm
// against the enclosing preprocessor guard.
WP_API void* cuda_import_external_semaphore(void* context, unsigned int type, void* handle, unsigned int flags) { return NULL; }
// No-op stub: the context and external semaphore arguments are ignored.
WP_API void cuda_destroy_external_semaphore(void* context, void* external_semaphore)
{
}
// No-op stub: no signal is issued; all arguments are ignored.
WP_API void cuda_signal_external_semaphore_async(void* context, void* external_semaphore, uint64_t value, void* stream)
{
}
// No-op stub: no wait is issued; all arguments are ignored.
WP_API void cuda_wait_external_semaphore_async(void* context, void* external_semaphore, uint64_t value, void* stream)
{
}

// No-op stub: the flags argument is ignored.
WP_API void cuda_timing_begin(int flags)
{
}
// Stub: always reports zero timing results.
WP_API int cuda_timing_get_result_count()
{
    return 0;
}
// No-op stub: the results buffer is left untouched.
WP_API void cuda_timing_end(timing_result_t* results, int size)
{
}
Expand Down
94 changes: 94 additions & 0 deletions warp/native/warp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3584,6 +3584,100 @@ void cuda_graphics_unregister_resource(void* context, void* resource)
delete res;
}

// Imports an external memory handle into CUDA.
//
// context: CUDA context made current for the duration of the call.
// type:    numeric value of a cudaExternalMemoryHandleType enumerator.
// handle:  the exported handle, passed as a pointer-sized value.
// size:    size of the external allocation in bytes.
// flags:   import flags, forwarded unchanged.
//
// Returns a heap-allocated cudaExternalMemory_t (released by
// cuda_destroy_external_memory), or NULL if the import fails.
void* cuda_import_external_memory(void* context, unsigned int type, void* handle, uint64_t size, unsigned int flags)
{
    ContextGuard guard(context);

    cudaExternalMemoryHandleDesc desc = {};
    desc.type = (cudaExternalMemoryHandleType)type;
    desc.size = size;
    desc.flags = flags;

    // desc.handle is a union: fill the member that matches the handle type
    // instead of relying on how the members happen to overlap in memory.
    switch (desc.type)
    {
    case cudaExternalMemoryHandleTypeOpaqueFd:
        // file descriptors arrive packed into the pointer-sized argument
        desc.handle.fd = (int)(intptr_t)handle;
        break;
    case cudaExternalMemoryHandleTypeNvSciBuf:
        desc.handle.nvSciBufObject = handle;
        break;
    default:
        // Win32 / D3D handle types; win32.name stays NULL from the zero-init
        desc.handle.win32.handle = handle;
        break;
    }

    cudaExternalMemory_t* external_memory = new cudaExternalMemory_t;
    if (!check_cuda(cudaImportExternalMemory(external_memory, &desc)))
    {
        delete external_memory;
        return NULL;
    }

    return external_memory;
}

// Maps a range of an imported external memory object into the device address space.
//
// context:         CUDA context made current for the duration of the call.
// external_memory: handle returned by cuda_import_external_memory.
// offset, size:    byte range of the external allocation to map.
// flags:           mapping flags, forwarded unchanged.
// ptr:             receives the mapped device address, or 0 if the mapping fails.
void cuda_external_memory_get_mapped_buffer(void* context, void* external_memory, uint64_t offset, uint64_t size, unsigned int flags, uint64_t* ptr)
{
    ContextGuard guard(context);

    cudaExternalMemory_t *memory = (cudaExternalMemory_t*)external_memory;
    cudaExternalMemoryBufferDesc desc = {};
    desc.offset = offset;
    desc.size = size;
    desc.flags = flags;

    // start from NULL so a failed call can never leak an indeterminate address
    void* device_ptr = NULL;
    bool success = check_cuda(cudaExternalMemoryGetMappedBuffer(&device_ptr, *memory, &desc));
    *ptr = success ? (uint64_t)device_ptr : 0;
}

// Destroys an imported external memory object and frees the handle storage that
// cuda_import_external_memory allocated for it.
//
// context:         CUDA context made current for the duration of the call.
// external_memory: handle returned by cuda_import_external_memory; NULL is ignored.
void cuda_destroy_external_memory(void* context, void* external_memory)
{
    // a failed import returns NULL, so tolerate it here instead of dereferencing it
    if (external_memory == NULL)
        return;

    ContextGuard guard(context);

    cudaExternalMemory_t *memory = (cudaExternalMemory_t*)external_memory;
    check_cuda(cudaDestroyExternalMemory(*memory));
    delete memory;
}

// Imports an external semaphore handle into CUDA.
//
// context: CUDA context made current for the duration of the call.
// type:    numeric value of a cudaExternalSemaphoreHandleType enumerator.
// handle:  the exported handle, passed as a pointer-sized value.
// flags:   import flags, forwarded unchanged.
//
// Returns a heap-allocated cudaExternalSemaphore_t (released by
// cuda_destroy_external_semaphore), or NULL if the import fails.
void* cuda_import_external_semaphore(void* context, unsigned int type, void* handle, unsigned int flags)
{
    ContextGuard guard(context);

    cudaExternalSemaphoreHandleDesc desc = {};
    desc.type = (cudaExternalSemaphoreHandleType)type;
    desc.flags = flags;

    // desc.handle is a union: fill the member that matches the handle type
    // instead of relying on how the members happen to overlap in memory.
    switch (desc.type)
    {
    case cudaExternalSemaphoreHandleTypeOpaqueFd:
    case cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd:
        // file descriptors arrive packed into the pointer-sized argument
        desc.handle.fd = (int)(intptr_t)handle;
        break;
    case cudaExternalSemaphoreHandleTypeNvSciSync:
        desc.handle.nvSciSyncObj = handle;
        break;
    default:
        // Win32 / D3D handle types; win32.name stays NULL from the zero-init
        desc.handle.win32.handle = handle;
        break;
    }

    cudaExternalSemaphore_t* external_semaphore = new cudaExternalSemaphore_t;
    if (!check_cuda(cudaImportExternalSemaphore(external_semaphore, &desc)))
    {
        delete external_semaphore;
        return NULL;
    }

    return external_semaphore;
}

// Destroys an imported external semaphore and frees the handle storage that
// cuda_import_external_semaphore allocated for it.
//
// context:            CUDA context made current for the duration of the call.
// external_semaphore: handle returned by cuda_import_external_semaphore; NULL is ignored.
void cuda_destroy_external_semaphore(void* context, void* external_semaphore)
{
    // a failed import returns NULL, so tolerate it here instead of dereferencing it
    if (external_semaphore == NULL)
        return;

    ContextGuard guard(context);

    cudaExternalSemaphore_t *semaphore = (cudaExternalSemaphore_t*)external_semaphore;
    check_cuda(cudaDestroyExternalSemaphore(*semaphore));
    delete semaphore;
}

// Enqueues a signal of one external semaphore on the given stream.
//
// context:            CUDA context made current for the duration of the call.
// external_semaphore: handle returned by cuda_import_external_semaphore.
// value:              stored in the fence value of the signal parameters.
// stream:             stream the signal is enqueued on.
void cuda_signal_external_semaphore_async(void* context, void* external_semaphore, uint64_t value, void* stream)
{
    ContextGuard guard(context);

    // only the fence value is set; every other field stays zero-initialized
    cudaExternalSemaphoreSignalParams signal_params = {};
    signal_params.params.fence.value = value;

    // the handle points at a single semaphore, passed as a one-element array
    cudaExternalSemaphore_t* sem_array = static_cast<cudaExternalSemaphore_t*>(external_semaphore);
    CUstream cuda_stream = static_cast<CUstream>(stream);

    check_cuda(cudaSignalExternalSemaphoresAsync(sem_array, &signal_params, 1, cuda_stream));
}

// Enqueues a wait on one external semaphore on the given stream.
//
// context:            CUDA context made current for the duration of the call.
// external_semaphore: handle returned by cuda_import_external_semaphore.
// value:              stored in the fence value of the wait parameters.
// stream:             stream the wait is enqueued on.
void cuda_wait_external_semaphore_async(void* context, void* external_semaphore, uint64_t value, void* stream)
{
    ContextGuard guard(context);

    // only the fence value is set; every other field stays zero-initialized
    cudaExternalSemaphoreWaitParams wait_params = {};
    wait_params.params.fence.value = value;

    // the handle points at a single semaphore, passed as a one-element array
    cudaExternalSemaphore_t* sem_array = static_cast<cudaExternalSemaphore_t*>(external_semaphore);
    CUstream cuda_stream = static_cast<CUstream>(stream);

    check_cuda(cudaWaitExternalSemaphoresAsync(sem_array, &wait_params, 1, cuda_stream));
}

void cuda_timing_begin(int flags)
{
g_cuda_timing_state = new CudaTimingState(flags, g_cuda_timing_state);
Expand Down
10 changes: 10 additions & 0 deletions warp/native/warp.h
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,16 @@ extern "C"
WP_API void* cuda_graphics_register_gl_buffer(void* context, uint32_t gl_buffer, unsigned int flags);
WP_API void cuda_graphics_unregister_resource(void* context, void* resource);

// external resource interoperability
// Imports an external memory handle of the given type and size (in bytes).
// Returns an opaque external memory handle, or NULL on failure.
WP_API void* cuda_import_external_memory(void* context, unsigned int type, void* handle, uint64_t size, unsigned int flags);
// Destroys an external memory handle returned by cuda_import_external_memory.
WP_API void cuda_destroy_external_memory(void* context, void* external_memory);
// Maps the byte range [offset, offset + size) of an imported external memory
// handle and writes the resulting device address to *ptr.
WP_API void cuda_external_memory_get_mapped_buffer(void* context, void* external_memory, uint64_t offset, uint64_t size, unsigned int flags, uint64_t* ptr);

// Imports an external semaphore handle of the given type.
// Returns an opaque external semaphore handle, or NULL on failure.
WP_API void* cuda_import_external_semaphore(void* context, unsigned int type, void* handle, unsigned int flags);
// Destroys an external semaphore handle returned by cuda_import_external_semaphore.
WP_API void cuda_destroy_external_semaphore(void* context, void* external_semaphore);
// Enqueues a signal of the external semaphore on the given stream; value is the fence value.
WP_API void cuda_signal_external_semaphore_async(void* context, void* external_semaphore, uint64_t value, void* stream);
// Enqueues a wait on the external semaphore on the given stream; value is the fence value.
WP_API void cuda_wait_external_semaphore_async(void* context, void* external_semaphore, uint64_t value, void* stream);

// CUDA timing
WP_API void cuda_timing_begin(int flags);
WP_API int cuda_timing_get_result_count();
Expand Down