Skip to content

Commit 7fd19cd

Browse files
authored
Merge branch 'main' into use-sccache
2 parents 3ad2235 + f3cb5a2 commit 7fd19cd

File tree

6 files changed

+108
-43
lines changed

6 files changed

+108
-43
lines changed

.github/workflows/build-wheel.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ jobs:
3333
- "3.11"
3434
- "3.12"
3535
- "3.13"
36-
- "3.13t"
3736
- "3.14"
3837
- "3.14t"
3938
name: py${{ matrix.python-version }}
@@ -373,9 +372,7 @@ jobs:
373372
cuda-path: "./cuda_toolkit_prev"
374373

375374
- name: Download cuda.bindings build artifacts from the prior branch
376-
if: ${{ matrix.python-version == '3.13t'
377-
|| matrix.python-version == '3.14'
378-
|| matrix.python-version == '3.14t' }}
375+
if: startsWith(matrix.python-version, '3.14')
379376
env:
380377
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
381378
run: |

.github/workflows/ci.yml

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,4 +202,28 @@ jobs:
202202
- doc
203203
steps:
204204
- name: Exit
205-
run: exit 0
205+
run: |
206+
# if any dependencies were cancelled, that's a failure
207+
#
208+
# see https://docs.github.com/en/actions/reference/workflows-and-actions/expressions#always
209+
# and https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/troubleshooting-required-status-checks#handling-skipped-but-required-checks
210+
# for why this cannot be encoded in the job-level `if:` field
211+
#
212+
# TL; DR: `$REASONS`
213+
#
214+
# The intersection of skipped-as-success and required status checks
215+
# creates a scenario where if you DON'T `always()` run this job, the
216+
# status check UI will block merging and if you DO `always()` run and
217+
# a dependency is _cancelled_ (due to a critical failure, which is
218+
# somehow not considered a failure ¯\_(ツ)_/¯) then the critically
219+
# failing job(s) will timeout causing a cancellation here and the
220+
# build to succeed which we don't want (originally this was just
221+
# 'exit 0')
222+
if ${{ needs.test-linux-64.result == 'cancelled' ||
223+
needs.test-linux-aarch64.result == 'cancelled' ||
224+
needs.test-windows.result == 'cancelled' ||
225+
needs.doc.result == 'cancelled' }}; then
226+
exit 1
227+
else
228+
exit 0
229+
fi

ci/test-matrix.json

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
1515
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
1616
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
17-
{ "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
1817
{ "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
1918
{ "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
2019
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
@@ -27,7 +26,6 @@
2726
{ "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
2827
{ "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
2928
{ "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
30-
{ "ARCH": "arm64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
3129
{ "ARCH": "arm64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
3230
{ "ARCH": "arm64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }
3331
],
@@ -95,8 +93,6 @@
9593
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "t4", "DRIVER": "latest" },
9694
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
9795
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
98-
{ "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
99-
{ "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
10096
{ "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
10197
{ "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
10298
{ "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
@@ -106,9 +102,7 @@
106102
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
107103
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "11.8.0", "LOCAL_CTK": "1", "GPU": "t4", "DRIVER": "latest" },
108104
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
109-
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
110-
{ "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
111-
{ "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }
105+
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }
112106
]
113107
}
114108
}

cuda_core/cuda/core/experimental/_memory.pyx

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,10 +1089,10 @@ class _SynchronousMemoryResource(MemoryResource):
10891089
return self._dev_id
10901090

10911091

1092-
VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none", "win32", "win32_kmt", "fabric"]
1092+
VirtualMemoryHandleTypeT = Union[Literal["posix_fd", "generic", "win32", "win32_kmt", "fabric"], None]
10931093
VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"]
10941094
VirtualMemoryGranularityT = Literal["minimum", "recommended"]
1095-
VirtualMemoryAccessTypeT = Literal["rw", "r", "none"]
1095+
VirtualMemoryAccessTypeT = Union[Literal["rw", "r"], None]
10961096
VirtualMemoryAllocationTypeT = Literal["pinned", "managed"]
10971097

10981098

@@ -1101,35 +1101,48 @@ class VirtualMemoryResourceOptions:
11011101
"""A configuration object for the VirtualMemoryResource
11021102
Stores configuration information which tells the resource how to use the CUDA VMM APIs
11031103

1104-
Args:
1105-
handle_type: Export handle type for the physical allocation. Use
1106-
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR on Linux if you plan to
1107-
import/export the allocation (required for cuMemRetainAllocationHandle).
1108-
Use CU_MEM_HANDLE_TYPE_NONE if you don't need an exportable handle.
1109-
gpu_direct_rdma: Hint that the allocation should be GDR-capable (if supported).
1110-
granularity: 'recommended' or 'minimum'. Controls granularity query and size rounding.
1111-
addr_hint: A (optional) virtual address hint to try to reserve at. 0 -> let CUDA choose.
1112-
addr_align: Alignment for the VA reservation. If None, use the queried granularity.
1113-
peers: Extra device IDs that should be granted access in addition to `device`.
1114-
self_access: Access flags for the owning device ('rw', 'r', or 'none').
1115-
peer_access: Access flags for peers ('rw' or 'r').
1104+
Attributes
1105+
----------
1106+
allocation_type: :obj:`~_memory.VirtualMemoryAllocationTypeT`
1107+
Controls the type of allocation.
1108+
location_type: :obj:`~_memory.VirtualMemoryLocationTypeT`
1109+
Controls the location of the allocation.
1110+
handle_type: :obj:`~_memory.VirtualMemoryHandleTypeT`
1111+
Export handle type for the physical allocation. Use
1112+
``"posix_fd"`` on Linux if you plan to
1113+
import/export the allocation (required for cuMemRetainAllocationHandle).
1114+
Use `None` if you don't need an exportable handle.
1115+
gpu_direct_rdma: bool
1116+
Hint that the allocation should be GDR-capable (if supported).
1117+
granularity: :obj:`~_memory.VirtualMemoryGranularityT`
1118+
Controls granularity query and size rounding.
1119+
addr_hint: int
1120+
An (optional) virtual address hint to try to reserve at. Setting it to 0 lets the CUDA driver decide.
1121+
addr_align: int
1122+
Alignment for the VA reservation. If `None`, use the queried granularity.
1123+
peers: Iterable[int]
1124+
Extra device IDs that should be granted access in addition to ``device``.
1125+
self_access: :obj:`~_memory.VirtualMemoryAccessTypeT`
1126+
Access flags for the owning device.
1127+
peer_access: :obj:`~_memory.VirtualMemoryAccessTypeT`
1128+
Access flags for peers.
11161129
"""
11171130
# Human-friendly strings; normalized in __post_init__
11181131
allocation_type: VirtualMemoryAllocationTypeT = "pinned"
11191132
location_type: VirtualMemoryLocationTypeT = "device"
11201133
handle_type: VirtualMemoryHandleTypeT = "posix_fd"
11211134
granularity: VirtualMemoryGranularityT = "recommended"
1122-
gpu_direct_rdma: bool = True
1135+
gpu_direct_rdma: bool = False
11231136
addr_hint: Optional[int] = 0
11241137
addr_align: Optional[int] = None
11251138
peers: Iterable[int] = field(default_factory=tuple)
11261139
self_access: VirtualMemoryAccessTypeT = "rw"
11271140
peer_access: VirtualMemoryAccessTypeT = "rw"
11281141

11291142
_a = driver.CUmemAccess_flags
1130-
_access_flags = {"rw": _a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": _a.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0}
1143+
_access_flags = {"rw": _a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": _a.CU_MEM_ACCESS_FLAGS_PROT_READ, None: 0}
11311144
_h = driver.CUmemAllocationHandleType
1132-
_handle_types = {"none": _h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC}
1145+
_handle_types = {None: _h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC}
11331146
_g = driver.CUmemAllocationGranularity_flags
11341147
_granularity = {"recommended": _g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": _g.CU_MEM_ALLOC_GRANULARITY_MINIMUM}
11351148
_l = driver.CUmemLocationType
@@ -1198,6 +1211,11 @@ class VirtualMemoryResource(MemoryResource):
11981211
if platform.system() == "Windows":
11991212
raise NotImplementedError("VirtualMemoryResource is not supported on Windows")
12001213

1214+
# Validate RDMA support if requested
1215+
if self.config.gpu_direct_rdma and self.device is not None:
1216+
if not self.device.properties.gpu_direct_rdma_supported:
1217+
raise RuntimeError("GPU Direct RDMA is not supported on this device")
1218+
12011219
@staticmethod
12021220
def _align_up(size: int, gran: int) -> int:
12031221
"""

cuda_core/docs/source/api_private.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ CUDA runtime
1818

1919
_memory.PyCapsule
2020
_memory.DevicePointerT
21+
_memory.VirtualMemoryAllocationTypeT
22+
_memory.VirtualMemoryLocationTypeT
23+
_memory.VirtualMemoryGranularityT
24+
_memory.VirtualMemoryAccessTypeT
25+
_memory.VirtualMemoryHandleTypeT
2126
_device.DeviceProperties
2227
_memory.IPCAllocationHandle
2328
_memory.IPCBufferDescriptor

cuda_core/tests/test_memory.py

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from cuda.core.experimental._utils.cuda_utils import handle_return
2929
from cuda.core.experimental.utils import StridedMemoryView
3030

31-
from cuda_python_test_helpers import IS_WSL, supports_ipc_mempool
31+
from cuda_python_test_helpers import supports_ipc_mempool
3232

3333
POOL_SIZE = 2097152 # 2MB size
3434

@@ -322,13 +322,13 @@ def test_vmm_allocator_basic_allocation():
322322
This test verifies that VirtualMemoryResource can allocate memory
323323
using CUDA VMM APIs with default configuration.
324324
"""
325-
if platform.system() == "Windows":
326-
pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
327-
if IS_WSL:
328-
pytest.skip("VirtualMemoryResource is not supported on WSL")
329-
330325
device = Device()
331326
device.set_current()
327+
328+
# Skip if virtual memory management is not supported
329+
if not device.properties.virtual_memory_management_supported:
330+
pytest.skip("Virtual memory management is not supported on this device")
331+
332332
options = VirtualMemoryResourceOptions()
333333
# Create VMM allocator with default config
334334
vmm_mr = VirtualMemoryResource(device, config=options)
@@ -361,13 +361,17 @@ def test_vmm_allocator_policy_configuration():
361361
with different allocation policies and that the configuration affects
362362
the allocation behavior.
363363
"""
364-
if platform.system() == "Windows":
365-
pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
366-
if IS_WSL:
367-
pytest.skip("VirtualMemoryResource is not supported on WSL")
368364
device = Device()
369365
device.set_current()
370366

367+
# Skip if virtual memory management is not supported
368+
if not device.properties.virtual_memory_management_supported:
369+
pytest.skip("Virtual memory management is not supported on this device")
370+
371+
# Skip if GPU Direct RDMA is not supported (the custom policy below exercises the supported path)
372+
if not device.properties.gpu_direct_rdma_supported:
373+
pytest.skip("This test requires a device that doesn't support GPU Direct RDMA")
374+
371375
# Test with custom VMM config
372376
custom_config = VirtualMemoryResourceOptions(
373377
allocation_type="pinned",
@@ -420,13 +424,13 @@ def test_vmm_allocator_grow_allocation():
420424
This test verifies that VirtualMemoryResource can grow existing
421425
allocations while preserving the base pointer when possible.
422426
"""
423-
if platform.system() == "Windows":
424-
pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
425-
if IS_WSL:
426-
pytest.skip("VirtualMemoryResource is not supported on WSL")
427427
device = Device()
428428
device.set_current()
429429

430+
# Skip if virtual memory management is not supported (we need it for VMM)
431+
if not device.properties.virtual_memory_management_supported:
432+
pytest.skip("Virtual memory management is not supported on this device")
433+
430434
options = VirtualMemoryResourceOptions()
431435

432436
vmm_mr = VirtualMemoryResource(device, config=options)
@@ -458,6 +462,29 @@ def test_vmm_allocator_grow_allocation():
458462
grown_buffer.close()
459463

460464

465+
def test_vmm_allocator_rdma_unsupported_exception():
466+
"""Test that VirtualMemoryResource throws an exception when RDMA is requested but device doesn't support it.
467+
468+
This test verifies that the VirtualMemoryResource constructor throws a RuntimeError
469+
when gpu_direct_rdma=True is requested but the device doesn't support virtual memory management.
470+
"""
471+
device = Device()
472+
device.set_current()
473+
474+
# Skip if virtual memory management is not supported (we need it for VMM)
475+
if not device.properties.virtual_memory_management_supported:
476+
pytest.skip("Virtual memory management is not supported on this device")
477+
478+
# Skip if GPU Direct RDMA is supported (we want to test the unsupported case)
479+
if device.properties.gpu_direct_rdma_supported:
480+
pytest.skip("This test requires a device that doesn't support GPU Direct RDMA")
481+
482+
# Test that requesting RDMA on an unsupported device throws an exception
483+
options = VirtualMemoryResourceOptions(gpu_direct_rdma=True)
484+
with pytest.raises(RuntimeError, match="GPU Direct RDMA is not supported on this device"):
485+
VirtualMemoryResource(device, config=options)
486+
487+
461488
def test_mempool(mempool_device):
462489
device = mempool_device
463490

0 commit comments

Comments
 (0)