Skip to content

Commit 7fd19cd

Browse files
authored
Merge branch 'main' into use-sccache
2 parents 3ad2235 + f3cb5a2 commit 7fd19cd

File tree

6 files changed

+108
-43
lines changed

6 files changed

+108
-43
lines changed

.github/workflows/build-wheel.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ jobs:
3333
- "3.11"
3434
- "3.12"
3535
- "3.13"
36-
- "3.13t"
3736
- "3.14"
3837
- "3.14t"
3938
name: py${{ matrix.python-version }}
@@ -373,9 +372,7 @@ jobs:
373372
cuda-path: "./cuda_toolkit_prev"
374373

375374
- name: Download cuda.bindings build artifacts from the prior branch
376-
if: ${{ matrix.python-version == '3.13t'
377-
|| matrix.python-version == '3.14'
378-
|| matrix.python-version == '3.14t' }}
375+
if: startsWith(matrix.python-version, '3.14')
379376
env:
380377
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
381378
run: |

.github/workflows/ci.yml

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,4 +202,28 @@ jobs:
202202
- doc
203203
steps:
204204
- name: Exit
205-
run: exit 0
205+
run: |
206+
# if any dependencies were cancelled, that's a failure
207+
#
208+
# see https://docs.github.com/en/actions/reference/workflows-and-actions/expressions#always
209+
# and https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/troubleshooting-required-status-checks#handling-skipped-but-required-checks
210+
# for why this cannot be encoded in the job-level `if:` field
211+
#
212+
# TL; DR: `$REASONS`
213+
#
214+
# The intersection of skipped-as-success and required status checks
215+
# creates a scenario where if you DON'T `always()` run this job, the
216+
# status check UI will block merging and if you DO `always()` run and
217+
# a dependency is _cancelled_ (due to a critical failure, which is
218+
# somehow not considered a failure ¯\_(ツ)_/¯) then the critically
219+
# failing job(s) will timeout causing a cancellation here and the
220+
# build to succeed which we don't want (originally this was just
221+
# 'exit 0')
222+
if ${{ needs.test-linux-64.result == 'cancelled' ||
223+
needs.test-linux-aarch64.result == 'cancelled' ||
224+
needs.test-windows.result == 'cancelled' ||
225+
needs.doc.result == 'cancelled' }}; then
226+
exit 1
227+
else
228+
exit 0
229+
fi

ci/test-matrix.json

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
1515
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
1616
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
17-
{ "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
1817
{ "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
1918
{ "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
2019
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
@@ -27,7 +26,6 @@
2726
{ "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
2827
{ "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
2928
{ "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
30-
{ "ARCH": "arm64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
3129
{ "ARCH": "arm64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
3230
{ "ARCH": "arm64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }
3331
],
@@ -95,8 +93,6 @@
9593
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "t4", "DRIVER": "latest" },
9694
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
9795
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
98-
{ "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
99-
{ "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
10096
{ "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
10197
{ "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
10298
{ "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
@@ -106,9 +102,7 @@
106102
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
107103
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "11.8.0", "LOCAL_CTK": "1", "GPU": "t4", "DRIVER": "latest" },
108104
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
109-
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
110-
{ "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
111-
{ "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }
105+
{ "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }
112106
]
113107
}
114108
}

cuda_core/cuda/core/experimental/_memory.pyx

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,10 +1089,10 @@ class _SynchronousMemoryResource(MemoryResource):
10891089
return self._dev_id
10901090

10911091

1092-
VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none", "win32", "win32_kmt", "fabric"]
1092+
VirtualMemoryHandleTypeT = Union[Literal["posix_fd", "generic", "win32", "win32_kmt", "fabric"], None]
10931093
VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"]
10941094
VirtualMemoryGranularityT = Literal["minimum", "recommended"]
1095-
VirtualMemoryAccessTypeT = Literal["rw", "r", "none"]
1095+
VirtualMemoryAccessTypeT = Union[Literal["rw", "r"], None]
10961096
VirtualMemoryAllocationTypeT = Literal["pinned", "managed"]
10971097

10981098

@@ -1101,35 +1101,48 @@ class VirtualMemoryResourceOptions:
11011101
"""A configuration object for the VirtualMemoryResource
11021102
Stores configuration information which tells the resource how to use the CUDA VMM APIs
11031103

1104-
Args:
1105-
handle_type: Export handle type for the physical allocation. Use
1106-
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR on Linux if you plan to
1107-
import/export the allocation (required for cuMemRetainAllocationHandle).
1108-
Use CU_MEM_HANDLE_TYPE_NONE if you don't need an exportable handle.
1109-
gpu_direct_rdma: Hint that the allocation should be GDR-capable (if supported).
1110-
granularity: 'recommended' or 'minimum'. Controls granularity query and size rounding.
1111-
addr_hint: A (optional) virtual address hint to try to reserve at. 0 -> let CUDA choose.
1112-
addr_align: Alignment for the VA reservation. If None, use the queried granularity.
1113-
peers: Extra device IDs that should be granted access in addition to `device`.
1114-
self_access: Access flags for the owning device ('rw', 'r', or 'none').
1115-
peer_access: Access flags for peers ('rw' or 'r').
1104+
Attributes
1105+
----------
1106+
allocation_type: :obj:`~_memory.VirtualMemoryAllocationTypeT`
1107+
Controls the type of allocation.
1108+
location_type: :obj:`~_memory.VirtualMemoryLocationTypeT`
1109+
Controls the location of the allocation.
1110+
handle_type: :obj:`~_memory.VirtualMemoryHandleTypeT`
1111+
Export handle type for the physical allocation. Use
1112+
``"posix_fd"`` on Linux if you plan to
1113+
import/export the allocation (required for cuMemRetainAllocationHandle).
1114+
Use `None` if you don't need an exportable handle.
1115+
gpu_direct_rdma: bool
1116+
Hint that the allocation should be GDR-capable (if supported).
1117+
granularity: :obj:`~_memory.VirtualMemoryGranularityT`
1118+
Controls granularity query and size rounding.
1119+
addr_hint: int
1120+
An (optional) virtual address hint to try to reserve at. Setting it to 0 lets the CUDA driver decide.
1121+
addr_align: int
1122+
Alignment for the VA reservation. If `None`, use the queried granularity.
1123+
peers: Iterable[int]
1124+
Extra device IDs that should be granted access in addition to ``device``.
1125+
self_access: :obj:`~_memory.VirtualMemoryAccessTypeT`
1126+
Access flags for the owning device.
1127+
peer_access: :obj:`~_memory.VirtualMemoryAccessTypeT`
1128+
Access flags for peers.
11161129
"""
11171130
# Human-friendly strings; normalized in __post_init__
11181131
allocation_type: VirtualMemoryAllocationTypeT = "pinned"
11191132
location_type: VirtualMemoryLocationTypeT = "device"
11201133
handle_type: VirtualMemoryHandleTypeT = "posix_fd"
11211134
granularity: VirtualMemoryGranularityT = "recommended"
1122-
gpu_direct_rdma: bool = True
1135+
gpu_direct_rdma: bool = False
11231136
addr_hint: Optional[int] = 0
11241137
addr_align: Optional[int] = None
11251138
peers: Iterable[int] = field(default_factory=tuple)
11261139
self_access: VirtualMemoryAccessTypeT = "rw"
11271140
peer_access: VirtualMemoryAccessTypeT = "rw"
11281141

11291142
_a = driver.CUmemAccess_flags
1130-
_access_flags = {"rw": _a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": _a.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0}
1143+
_access_flags = {"rw": _a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": _a.CU_MEM_ACCESS_FLAGS_PROT_READ, None: 0}
11311144
_h = driver.CUmemAllocationHandleType
1132-
_handle_types = {"none": _h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC}
1145+
_handle_types = {None: _h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC}
11331146
_g = driver.CUmemAllocationGranularity_flags
11341147
_granularity = {"recommended": _g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": _g.CU_MEM_ALLOC_GRANULARITY_MINIMUM}
11351148
_l = driver.CUmemLocationType
@@ -1198,6 +1211,11 @@ class VirtualMemoryResource(MemoryResource):
11981211
if platform.system() == "Windows":
11991212
raise NotImplementedError("VirtualMemoryResource is not supported on Windows")
12001213

1214+
# Validate RDMA support if requested
1215+
if self.config.gpu_direct_rdma and self.device is not None:
1216+
if not self.device.properties.gpu_direct_rdma_supported:
1217+
raise RuntimeError("GPU Direct RDMA is not supported on this device")
1218+
12011219
@staticmethod
12021220
def _align_up(size: int, gran: int) -> int:
12031221
"""

cuda_core/docs/source/api_private.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ CUDA runtime
1818

1919
_memory.PyCapsule
2020
_memory.DevicePointerT
21+
_memory.VirtualMemoryAllocationTypeT
22+
_memory.VirtualMemoryLocationTypeT
23+
_memory.VirtualMemoryGranularityT
24+
_memory.VirtualMemoryAccessTypeT
25+
_memory.VirtualMemoryHandleTypeT
2126
_device.DeviceProperties
2227
_memory.IPCAllocationHandle
2328
_memory.IPCBufferDescriptor

cuda_core/tests/test_memory.py

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from cuda.core.experimental._utils.cuda_utils import handle_return
2929
from cuda.core.experimental.utils import StridedMemoryView
3030

31-
from cuda_python_test_helpers import IS_WSL, supports_ipc_mempool
31+
from cuda_python_test_helpers import supports_ipc_mempool
3232

3333
POOL_SIZE = 2097152 # 2MB size
3434

@@ -322,13 +322,13 @@ def test_vmm_allocator_basic_allocation():
322322
This test verifies that VirtualMemoryResource can allocate memory
323323
using CUDA VMM APIs with default configuration.
324324
"""
325-
if platform.system() == "Windows":
326-
pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
327-
if IS_WSL:
328-
pytest.skip("VirtualMemoryResource is not supported on WSL")
329-
330325
device = Device()
331326
device.set_current()
327+
328+
# Skip if virtual memory management is not supported
329+
if not device.properties.virtual_memory_management_supported:
330+
pytest.skip("Virtual memory management is not supported on this device")
331+
332332
options = VirtualMemoryResourceOptions()
333333
# Create VMM allocator with default config
334334
vmm_mr = VirtualMemoryResource(device, config=options)
@@ -361,13 +361,17 @@ def test_vmm_allocator_policy_configuration():
361361
with different allocation policies and that the configuration affects
362362
the allocation behavior.
363363
"""
364-
if platform.system() == "Windows":
365-
pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
366-
if IS_WSL:
367-
pytest.skip("VirtualMemoryResource is not supported on WSL")
368364
device = Device()
369365
device.set_current()
370366

367+
# Skip if virtual memory management is not supported
368+
if not device.properties.virtual_memory_management_supported:
369+
pytest.skip("Virtual memory management is not supported on this device")
370+
371+
# Skip if GPU Direct RDMA is not supported (the custom policy below exercises the supported path)
372+
if not device.properties.gpu_direct_rdma_supported:
373+
pytest.skip("This test requires a device that doesn't support GPU Direct RDMA")
374+
371375
# Test with custom VMM config
372376
custom_config = VirtualMemoryResourceOptions(
373377
allocation_type="pinned",
@@ -420,13 +424,13 @@ def test_vmm_allocator_grow_allocation():
420424
This test verifies that VirtualMemoryResource can grow existing
421425
allocations while preserving the base pointer when possible.
422426
"""
423-
if platform.system() == "Windows":
424-
pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
425-
if IS_WSL:
426-
pytest.skip("VirtualMemoryResource is not supported on WSL")
427427
device = Device()
428428
device.set_current()
429429

430+
# Skip if virtual memory management is not supported (we need it for VMM)
431+
if not device.properties.virtual_memory_management_supported:
432+
pytest.skip("Virtual memory management is not supported on this device")
433+
430434
options = VirtualMemoryResourceOptions()
431435

432436
vmm_mr = VirtualMemoryResource(device, config=options)
@@ -458,6 +462,29 @@ def test_vmm_allocator_grow_allocation():
458462
grown_buffer.close()
459463

460464

465+
def test_vmm_allocator_rdma_unsupported_exception():
466+
"""Test that VirtualMemoryResource throws an exception when RDMA is requested but device doesn't support it.
467+
468+
This test verifies that the VirtualMemoryResource constructor throws a RuntimeError
469+
when gpu_direct_rdma=True is requested but the device doesn't support virtual memory management.
470+
"""
471+
device = Device()
472+
device.set_current()
473+
474+
# Skip if virtual memory management is not supported (we need it for VMM)
475+
if not device.properties.virtual_memory_management_supported:
476+
pytest.skip("Virtual memory management is not supported on this device")
477+
478+
# Skip if GPU Direct RDMA is supported (we want to test the unsupported case)
479+
if device.properties.gpu_direct_rdma_supported:
480+
pytest.skip("This test requires a device that doesn't support GPU Direct RDMA")
481+
482+
# Test that requesting RDMA on an unsupported device throws an exception
483+
options = VirtualMemoryResourceOptions(gpu_direct_rdma=True)
484+
with pytest.raises(RuntimeError, match="GPU Direct RDMA is not supported on this device"):
485+
VirtualMemoryResource(device, config=options)
486+
487+
461488
def test_mempool(mempool_device):
462489
device = mempool_device
463490

0 commit comments

Comments
 (0)