Merge branch 'main' into ipc_events2

Andy-Jost · web-flow · commit 51e90e0d5f11 · 2025-10-23T10:46:08.000-07:00
diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml
@@ -33,7 +33,6 @@ jobs:
           - "3.11"
           - "3.12"
           - "3.13"
-          - "3.13t"
           - "3.14"
           - "3.14t"
     name: py${{ matrix.python-version }}
@@ -287,9 +286,7 @@ jobs:
           cuda-path: "./cuda_toolkit_prev"
 
       - name: Download cuda.bindings build artifacts from the prior branch
-        if: ${{ matrix.python-version == '3.13t'
-                || matrix.python-version == '3.14'
-                || matrix.python-version == '3.14t' }}
+        if: startsWith(matrix.python-version, '3.14')
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -202,4 +202,28 @@ jobs:
       - doc
     steps:
       - name: Exit
-        run: exit 0
+        run: |
+          # if any dependencies were cancelled, that's a failure
+          #
+          # see https://docs.github.com/en/actions/reference/workflows-and-actions/expressions#always
+          # and https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/troubleshooting-required-status-checks#handling-skipped-but-required-checks
+          # for why this cannot be encoded in the job-level `if:` field
+          #
+          # TL; DR: `$REASONS`
+          #
+          # The intersection of skipped-as-success and required status checks
+          # creates a scenario where if you DON'T `always()` run this job, the
+          # status check UI will block merging and if you DO `always()` run and
+          # a dependency is _cancelled_ (due to a critical failure, which is
+          # somehow not considered a failure ¯\_(ツ)_/¯) then the critically
+          # failing job(s) will timeout causing a cancellation here and the
+          # build to succeed which we don't want (originally this was just
+          # 'exit 0')
+          if ${{ needs.test-linux-64.result == 'cancelled' ||
+                 needs.test-linux-aarch64.result == 'cancelled' ||
+                 needs.test-windows.result == 'cancelled' ||
+                 needs.doc.result == 'cancelled' }}; then
+            exit 1
+          else
+            exit 0
+          fi
diff --git a/ci/test-matrix.json b/ci/test-matrix.json
@@ -14,7 +14,6 @@
       { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
       { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
       { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
-      { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
       { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
       { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
       { "ARCH": "arm64", "PY_VER": "3.9",  "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
@@ -27,7 +26,6 @@
       { "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
       { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
       { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
-      { "ARCH": "arm64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
       { "ARCH": "arm64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
       { "ARCH": "arm64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }
     ],
@@ -95,8 +93,6 @@
       { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "t4", "DRIVER": "latest" },
       { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
       { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
-      { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
-      { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
       { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
       { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
       { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
@@ -106,9 +102,7 @@
       { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
       { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "11.8.0", "LOCAL_CTK": "1", "GPU": "t4", "DRIVER": "latest" },
       { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
-      { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
-      { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" },
-      { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }
+      { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }
     ]
   }
 }
diff --git a/cuda_bindings/docs/source/conf.py b/cuda_bindings/docs/source/conf.py
@@ -9,13 +9,11 @@
 
 # -- Path setup --------------------------------------------------------------
 
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
 import os
+import sys
+from pathlib import Path
 
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
+sys.path.insert(0, str((Path(__file__).parents[3] / "cuda_python" / "docs" / "exts").absolute()))
 
 
 # -- Project information -----------------------------------------------------
@@ -41,6 +39,7 @@
     "myst_nb",
     "enum_tools.autoenum",
     "sphinx_copybutton",
+    "release_toc",
 ]
 
 nb_execution_mode = "off"
diff --git a/cuda_bindings/docs/source/release.rst b/cuda_bindings/docs/source/release.rst
@@ -6,38 +6,6 @@ Release Notes
 
 .. toctree::
    :maxdepth: 3
+   :glob:
 
-   13.0.3 <release/13.0.3-notes.rst>
-   13.0.2 <release/13.0.2-notes.rst>
-   13.0.1 <release/13.0.1-notes.rst>
-   13.0.0 <release/13.0.0-notes.rst>
-   12.9.4 <release/12.9.4-notes.rst>
-   12.9.3 <release/12.9.3-notes.rst>
-   12.9.2 <release/12.9.2-notes.rst>
-   12.9.1 <release/12.9.1-notes.rst>
-   12.9.0 <release/12.9.0-notes.rst>
-   12.8.0 <release/12.8.0-notes.rst>
-   12.6.2 <release/12.6.2-notes.rst>
-   12.6.1 <release/12.6.1-notes.rst>
-   12.6.0 <release/12.6.0-notes.rst>
-   12.5.0 <release/12.5.0-notes.rst>
-   12.4.0 <release/12.4.0-notes.rst>
-   12.3.0 <release/12.3.0-notes.rst>
-   12.2.1 <release/12.2.1-notes.rst>
-   12.2.0 <release/12.2.0-notes.rst>
-   12.1.0 <release/12.1.0-notes.rst>
-   12.0.0 <release/12.0.0-notes.rst>
-   11.8.7 <release/11.8.7-notes.rst>
-   11.8.6 <release/11.8.6-notes.rst>
-   11.8.5 <release/11.8.5-notes.rst>
-   11.8.4 <release/11.8.4-notes.rst>
-   11.8.3 <release/11.8.3-notes.rst>
-   11.8.2 <release/11.8.2-notes.rst>
-   11.8.1 <release/11.8.1-notes.rst>
-   11.8.0 <release/11.8.0-notes.rst>
-   11.7.1 <release/11.7.1-notes.rst>
-   11.7.0 <release/11.7.0-notes.rst>
-   11.6.1 <release/11.6.1-notes.rst>
-   11.6.0 <release/11.6.0-notes.rst>
-   11.5.0 <release/11.5.0-notes.rst>
-   11.4.0 <release/11.4.0-notes.rst>
+   release/*[0-9]-notes
diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -1132,7 +1132,7 @@ class VirtualMemoryResourceOptions:
     location_type: VirtualMemoryLocationTypeT = "device"
     handle_type: VirtualMemoryHandleTypeT = "posix_fd"
     granularity: VirtualMemoryGranularityT = "recommended"
-    gpu_direct_rdma: bool = True
+    gpu_direct_rdma: bool = False
     addr_hint: Optional[int] = 0
     addr_align: Optional[int] = None
     peers: Iterable[int] = field(default_factory=tuple)
@@ -1211,6 +1211,11 @@ class VirtualMemoryResource(MemoryResource):
         if platform.system() == "Windows":
             raise NotImplementedError("VirtualMemoryResource is not supported on Windows")
 
+        # Validate RDMA support if requested
+        if self.config.gpu_direct_rdma and self.device is not None:
+            if not self.device.properties.gpu_direct_rdma_supported:
+                raise RuntimeError("GPU Direct RDMA is not supported on this device")
+
     @staticmethod
     def _align_up(size: int, gran: int) -> int:
         """
diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py
@@ -9,12 +9,11 @@
 
 # -- Path setup --------------------------------------------------------------
 
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
 import os
+import sys
+from pathlib import Path
 
-# sys.path.insert(0, os.path.abspath('.'))
+sys.path.insert(0, str((Path(__file__).parents[3] / "cuda_python" / "docs" / "exts").absolute()))
 
 
 # -- Project information -----------------------------------------------------
@@ -41,6 +40,7 @@
     "enum_tools.autoenum",
     "sphinx_copybutton",
     "sphinx_toolbox.more_autodoc.autoprotocol",
+    "release_toc",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/cuda_core/docs/source/release.rst b/cuda_core/docs/source/release.rst
@@ -6,11 +6,6 @@ Release Notes
 
 .. toctree::
    :maxdepth: 3
+   :glob:
 
-   0.4.0 <release/0.4.0-notes>
-   0.3.2 <release/0.3.2-notes>
-   0.3.1 <release/0.3.1-notes>
-   0.3.0 <release/0.3.0-notes>
-   0.2.0 <release/0.2.0-notes>
-   0.1.1 <release/0.1.1-notes>
-   0.1.0 <release/0.1.0-notes>
+   release/*[0-9]-notes
diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py
@@ -22,6 +22,10 @@
     launch,
 )
 
+if np.lib.NumpyVersion(np.__version__) < "2.2.5":
+    print("This example requires NumPy 2.2.5 or later", file=sys.stderr)
+    sys.exit(0)
+
 # prepare include
 cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME"))
 if cuda_path is None:
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
@@ -29,7 +29,7 @@
 from cuda.core.experimental.utils import StridedMemoryView
 from helpers.buffers import DummyUnifiedMemoryResource
 
-from cuda_python_test_helpers import IS_WSL, supports_ipc_mempool
+from cuda_python_test_helpers import supports_ipc_mempool
 
 POOL_SIZE = 2097152  # 2MB size
 
@@ -299,13 +299,13 @@ def test_vmm_allocator_basic_allocation():
     This test verifies that VirtualMemoryResource can allocate memory
     using CUDA VMM APIs with default configuration.
     """
-    if platform.system() == "Windows":
-        pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
-    if IS_WSL:
-        pytest.skip("VirtualMemoryResource is not supported on WSL")
-
     device = Device()
     device.set_current()
+
+    # Skip if virtual memory management is not supported
+    if not device.properties.virtual_memory_management_supported:
+        pytest.skip("Virtual memory management is not supported on this device")
+
     options = VirtualMemoryResourceOptions()
     # Create VMM allocator with default config
     vmm_mr = VirtualMemoryResource(device, config=options)
@@ -338,13 +338,17 @@ def test_vmm_allocator_policy_configuration():
     with different allocation policies and that the configuration affects
     the allocation behavior.
     """
-    if platform.system() == "Windows":
-        pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
-    if IS_WSL:
-        pytest.skip("VirtualMemoryResource is not supported on WSL")
     device = Device()
     device.set_current()
 
+    # Skip if virtual memory management is not supported
+    if not device.properties.virtual_memory_management_supported:
+        pytest.skip("Virtual memory management is not supported on this device")
+
+    # Skip unless GPU Direct RDMA is supported (the unsupported case has its own test)
+    if not device.properties.gpu_direct_rdma_supported:
+        pytest.skip("This test requires a device that supports GPU Direct RDMA")
+
     # Test with custom VMM config
     custom_config = VirtualMemoryResourceOptions(
         allocation_type="pinned",
@@ -397,13 +401,13 @@ def test_vmm_allocator_grow_allocation():
     This test verifies that VirtualMemoryResource can grow existing
     allocations while preserving the base pointer when possible.
     """
-    if platform.system() == "Windows":
-        pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
-    if IS_WSL:
-        pytest.skip("VirtualMemoryResource is not supported on WSL")
     device = Device()
     device.set_current()
 
+    # Skip if virtual memory management is not supported (we need it for VMM)
+    if not device.properties.virtual_memory_management_supported:
+        pytest.skip("Virtual memory management is not supported on this device")
+
     options = VirtualMemoryResourceOptions()
 
     vmm_mr = VirtualMemoryResource(device, config=options)
@@ -435,6 +439,29 @@ def test_vmm_allocator_grow_allocation():
     grown_buffer.close()
 
 
+def test_vmm_allocator_rdma_unsupported_exception():
+    """Test that VirtualMemoryResource throws an exception when RDMA is requested but device doesn't support it.
+
+    This test verifies that the VirtualMemoryResource constructor throws a RuntimeError
+    when gpu_direct_rdma=True is requested but the device doesn't support GPU Direct RDMA.
+    """
+    device = Device()
+    device.set_current()
+
+    # Skip if virtual memory management is not supported (we need it for VMM)
+    if not device.properties.virtual_memory_management_supported:
+        pytest.skip("Virtual memory management is not supported on this device")
+
+    # Skip if GPU Direct RDMA is supported (we want to test the unsupported case)
+    if device.properties.gpu_direct_rdma_supported:
+        pytest.skip("This test requires a device that doesn't support GPU Direct RDMA")
+
+    # Test that requesting RDMA on an unsupported device throws an exception
+    options = VirtualMemoryResourceOptions(gpu_direct_rdma=True)
+    with pytest.raises(RuntimeError, match="GPU Direct RDMA is not supported on this device"):
+        VirtualMemoryResource(device, config=options)
+
+
 def test_mempool(mempool_device):
     device = mempool_device
 
diff --git a/cuda_pathfinder/docs/source/conf.py b/cuda_pathfinder/docs/source/conf.py
@@ -9,13 +9,11 @@
 
 # -- Path setup --------------------------------------------------------------
 
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
 import os
+import sys
+from pathlib import Path
 
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
+sys.path.insert(0, str((Path(__file__).parents[3] / "cuda_python" / "docs" / "exts").absolute()))
 
 
 # -- Project information -----------------------------------------------------
@@ -41,6 +39,7 @@
     "myst_nb",
     "enum_tools.autoenum",
     "sphinx_copybutton",
+    "release_toc",
 ]
 
 nb_execution_mode = "off"
diff --git a/cuda_pathfinder/docs/source/release.rst b/cuda_pathfinder/docs/source/release.rst
@@ -6,12 +6,6 @@ Release Notes
 
 .. toctree::
    :maxdepth: 3
+   :glob:
 
-   1.3.1 <release/1.3.1-notes>
-   1.3.0 <release/1.3.0-notes>
-   1.2.3 <release/1.2.3-notes>
-   1.2.2 <release/1.2.2-notes>
-   1.2.1 <release/1.2.1-notes>
-   1.2.0 <release/1.2.0-notes>
-   1.1.0 <release/1.1.0-notes>
-   1.0.0 <release/1.0.0-notes>
+   release/*[0-9]-notes
diff --git a/cuda_python/docs/exts/release_toc.py b/cuda_python/docs/exts/release_toc.py
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+"""Sphinx extension that orders globbed release-note toctrees newest-first."""
+
+from pathlib import Path
+
+from packaging.version import InvalidVersion, Version
+from sphinx.directives.other import TocTree
+
+
+class TocTreeSorted(TocTree):
+    """A ``toctree`` directive that sorts globbed entries by version.
+
+    Only toctrees using ``:glob:`` are touched. Each matched document is
+    expected to be named ``<version>-notes``; entries are ordered from the
+    newest version to the oldest and titled with the version string.
+    """
+
+    def parse_content(self, toctree):
+        # Hand back whatever the base class returns so that TocTree.run()
+        # sees the same value it would get from the stock directive.
+        result = super().parse_content(toctree)
+
+        if not toctree["glob"]:
+            return result
+
+        entries = toctree.get("entries", [])
+        if not entries:
+            return result
+
+        try:
+            versioned = [(Version(Path(entry[1]).name.removesuffix("-notes")), entry[1]) for entry in entries]
+        except InvalidVersion:
+            # This directive replaces every toctree in the project, so a glob
+            # that matches documents not named after a version must be left
+            # in its original order rather than aborting the build.
+            return result
+
+        versioned.sort(key=lambda item: item[0], reverse=True)
+        toctree["entries"] = [(str(version), doc) for version, doc in versioned]
+        return result
+
+
+def setup(app):
+    """Register ``TocTreeSorted`` in place of the built-in ``toctree`` directive."""
+    app.add_directive("toctree", TocTreeSorted, override=True)
diff --git a/cuda_python/docs/source/conf.py b/cuda_python/docs/source/conf.py
diff --git a/cuda_python/docs/source/release.rst b/cuda_python/docs/source/release.rst