Skip to content

Commit 76978c1

Browse files
committed
Standardize naming conventions across cuda.core examples
Apply consistent variable names for common objects across all cuda.core example files, addressing issue #1675. Conventions applied: - Stream: `stream` (not `s`), numbered `stream0`/`stream1` - Kernel: `kernel` (not `ker`/`gpu_ker`), descriptive `add_kernel`/`sub_kernel` - Program: `prog` (not `gpu_prog`) - Kernel args: `kernel_args` (not `ker_args`) - Program options: `program_options` (not `opts`), using ProgramOptions (not dicts) - Grid/block: `grid`/`block` (not `grid_size`/`block_size`) Made-with: Cursor
1 parent 2385e5e commit 76978c1

File tree

8 files changed

+50
-50
lines changed

8 files changed

+50
-50
lines changed

cuda_core/examples/cuda_graphs.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,9 @@ def main():
8080
result3 = cp.empty_like(a)
8181

8282
# Prepare launch configuration
83-
block_size = 256
84-
grid_size = (size + block_size - 1) // block_size
85-
config = LaunchConfig(grid=grid_size, block=block_size)
83+
block = 256
84+
grid = (size + block - 1) // block
85+
config = LaunchConfig(grid=grid, block=block)
8686

8787
# Sync before graph capture
8888
dev.sync()

cuda_core/examples/gl_interop_plasma.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,8 @@ def setup_cuda(kernel_source):
9393
dev.set_current()
9494
stream = dev.create_stream()
9595

96-
opts = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
97-
prog = Program(kernel_source, code_type="c++", options=opts)
96+
program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
97+
prog = Program(kernel_source, code_type="c++", options=program_options)
9898
mod = prog.compile("cubin")
9999
kernel = mod.get_kernel("plasma")
100100

cuda_core/examples/pytorch_example.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def __cuda_stream__(self):
4747
return (0, stream_id) # Return format required by CUDA Python
4848

4949

50-
s = dev.create_stream(PyTorchStreamWrapper(pt_stream))
50+
stream = dev.create_stream(PyTorchStreamWrapper(pt_stream))
5151

5252
# prepare program
5353
program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
@@ -59,7 +59,7 @@ def __cuda_stream__(self):
5959
)
6060

6161
# Run in single precision
62-
ker = mod.get_kernel("saxpy_kernel<float>")
62+
kernel = mod.get_kernel("saxpy_kernel<float>")
6363
dtype = torch.float32
6464

6565
# prepare input/output
@@ -74,16 +74,16 @@ def __cuda_stream__(self):
7474
block = 32
7575
grid = int((size + block - 1) // block)
7676
config = LaunchConfig(grid=grid, block=block)
77-
ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
77+
kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
7878

7979
# launch kernel on our stream
80-
launch(s, config, ker, *ker_args)
80+
launch(stream, config, kernel, *kernel_args)
8181

8282
# check result
8383
assert torch.allclose(out, a.item() * x + y)
8484

8585
# let's repeat again with double precision
86-
ker = mod.get_kernel("saxpy_kernel<double>")
86+
kernel = mod.get_kernel("saxpy_kernel<double>")
8787
dtype = torch.float64
8888

8989
# prepare input
@@ -100,10 +100,10 @@ def __cuda_stream__(self):
100100
block = 64
101101
grid = int((size + block - 1) // block)
102102
config = LaunchConfig(grid=grid, block=block)
103-
ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
103+
kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
104104

105105
# launch kernel on PyTorch's stream
106-
launch(s, config, ker, *ker_args)
106+
launch(stream, config, kernel, *kernel_args)
107107

108108
# check result
109109
assert torch.allclose(out, a * x + y)

cuda_core/examples/saxpy.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434

3535
dev = Device()
3636
dev.set_current()
37-
s = dev.create_stream()
37+
stream = dev.create_stream()
3838

3939
# prepare program
4040
program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
@@ -50,7 +50,7 @@
5050
)
5151

5252
# run in single precision
53-
ker = mod.get_kernel("saxpy<float>")
53+
kernel = mod.get_kernel("saxpy<float>")
5454
dtype = cp.float32
5555

5656
# prepare input/output
@@ -60,24 +60,24 @@
6060
x = rng.random(size, dtype=dtype)
6161
y = rng.random(size, dtype=dtype)
6262
out = cp.empty_like(x)
63-
dev.sync() # cupy runs on a different stream from s, so sync before accessing
63+
dev.sync() # cupy runs on a different stream from our `stream`, so sync before accessing
6464

6565
# prepare launch
6666
block = 32
6767
grid = int((size + block - 1) // block)
6868
config = LaunchConfig(grid=grid, block=block)
69-
ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
69+
kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
7070

71-
# launch kernel on stream s
72-
launch(s, config, ker, *ker_args)
73-
s.sync()
71+
# launch kernel on stream
72+
launch(stream, config, kernel, *kernel_args)
73+
stream.sync()
7474

7575
# check result
7676
assert cp.allclose(out, a * x + y)
7777

7878
# let's repeat again, this time allocating our own out buffer instead of cupy's
7979
# run in double precision
80-
ker = mod.get_kernel("saxpy<double>")
80+
kernel = mod.get_kernel("saxpy<double>")
8181
dtype = cp.float64
8282

8383
# prepare input
@@ -90,18 +90,18 @@
9090
# prepare output
9191
buf = dev.allocate(
9292
size * 8, # = dtype.itemsize
93-
stream=s,
93+
stream=stream,
9494
)
9595

9696
# prepare launch
9797
block = 64
9898
grid = int((size + block - 1) // block)
9999
config = LaunchConfig(grid=grid, block=block)
100-
ker_args = (a, x.data.ptr, y.data.ptr, buf, size)
100+
kernel_args = (a, x.data.ptr, y.data.ptr, buf, size)
101101

102-
# launch kernel on stream s
103-
launch(s, config, ker, *ker_args)
104-
s.sync()
102+
# launch kernel on stream
103+
launch(stream, config, kernel, *kernel_args)
104+
stream.sync()
105105

106106
# check result
107107
# we wrap output buffer as a cupy array for simplicity
@@ -112,5 +112,5 @@
112112

113113
# clean up resources that we allocate
114114
# cupy cleans up automatically the rest
115-
buf.close(s)
116-
s.close()
115+
buf.close(stream)
116+
stream.close()

cuda_core/examples/simple_multi_gpu_example.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import sys
1313

1414
import cupy as cp
15-
from cuda.core import Device, LaunchConfig, Program, launch, system
15+
from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system
1616

1717
if system.get_num_devices() < 2:
1818
print("this example requires at least 2 GPUs", file=sys.stderr)
@@ -39,9 +39,9 @@
3939
}
4040
}
4141
"""
42-
prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"})
42+
prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}"))
4343
mod_add = prog_add.compile("cubin")
44-
ker_add = mod_add.get_kernel("vector_add")
44+
add_kernel = mod_add.get_kernel("vector_add")
4545

4646
# Set GPU 1
4747
dev1 = Device(1)
@@ -61,9 +61,9 @@
6161
}
6262
}
6363
"""
64-
prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"})
64+
prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}"))
6565
mod_sub = prog_sub.compile("cubin")
66-
ker_sub = mod_sub.get_kernel("vector_sub")
66+
sub_kernel = mod_sub.get_kernel("vector_sub")
6767

6868

6969
# This adaptor ensures that any foreign stream (ex: from CuPy) that have not
@@ -99,7 +99,7 @@ def __cuda_stream__(self):
9999
stream0.wait(cp_stream0)
100100

101101
# Launch the add kernel on GPU 0 / stream 0
102-
launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
102+
launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
103103

104104
# Allocate memory on GPU 1
105105
# Note: This runs on CuPy's current stream for GPU 1.
@@ -114,7 +114,7 @@ def __cuda_stream__(self):
114114
stream1.wait(cp_stream1)
115115

116116
# Launch the subtract kernel on GPU 1 / stream 1
117-
launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
117+
launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
118118

119119
# Synchronize both GPUs and validate the results
120120
dev0.set_current()

cuda_core/examples/strided_memory_view_gpu.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
5757
# of which are supported by StridedMemoryView).
5858
@args_viewable_as_strided_memory((0,))
59-
def my_func(arr, work_stream, gpu_ker):
59+
def my_func(arr, work_stream, kernel):
6060
# Create a memory view over arr (assumed to be a 1D array of int32). The stream
6161
# ordering is taken care of, so that arr can be safely accessed on our work
6262
# stream (ordered after a data stream on which arr is potentially prepared).
@@ -72,7 +72,7 @@ def my_func(arr, work_stream, gpu_ker):
7272
block = 256
7373
grid = (size + block - 1) // block
7474
config = LaunchConfig(grid=grid, block=block)
75-
launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
75+
launch(work_stream, config, kernel, view.ptr, np.uint64(size))
7676
# Here we're being conservative and synchronize over our work stream,
7777
# assuming we do not know the data stream; if we know then we could
7878
# just order the data stream after the work stream here, e.g.
@@ -100,24 +100,24 @@ def run():
100100
# To know the GPU's compute capability, we need to identify which GPU to use.
101101
dev = Device(0)
102102
dev.set_current()
103-
gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
104-
mod = gpu_prog.compile(target_type="cubin")
105-
gpu_ker = mod.get_kernel(func_name)
103+
prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
104+
mod = prog.compile(target_type="cubin")
105+
kernel = mod.get_kernel(func_name)
106106

107-
s = dev.create_stream()
107+
stream = dev.create_stream()
108108
try:
109109
# Create input array on GPU
110110
arr_gpu = cp.ones(1024, dtype=cp.int32)
111111
print(f"before: {arr_gpu[:10]=}")
112112

113113
# Run the workload
114-
my_func(arr_gpu, s, gpu_ker)
114+
my_func(arr_gpu, stream, kernel)
115115

116116
# Check the result
117117
print(f"after: {arr_gpu[:10]=}")
118118
assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
119119
finally:
120-
s.close()
120+
stream.close()
121121

122122

123123
if __name__ == "__main__":

cuda_core/examples/thread_block_cluster.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path),
9494
)
9595
mod = prog.compile(target_type="cubin")
96-
ker = mod.get_kernel("check_cluster_info")
96+
kernel = mod.get_kernel("check_cluster_info")
9797

9898
# prepare launch config
9999
grid = 4
@@ -121,7 +121,7 @@
121121
block_dims[:] = 0
122122

123123
# launch kernel on the default stream
124-
launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer)
124+
launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer)
125125
dev.sync()
126126

127127
# verify results

cuda_core/examples/vector_add.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,15 @@
2929

3030
dev = Device()
3131
dev.set_current()
32-
s = dev.create_stream()
32+
stream = dev.create_stream()
3333

3434
# prepare program
3535
program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
3636
prog = Program(code, code_type="c++", options=program_options)
3737
mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
3838

3939
# run in single precision
40-
ker = mod.get_kernel("vector_add<float>")
40+
kernel = mod.get_kernel("vector_add<float>")
4141
dtype = cp.float32
4242

4343
# prepare input/output
@@ -47,17 +47,17 @@
4747
b = rng.random(size, dtype=dtype)
4848
c = cp.empty_like(a)
4949

50-
# cupy runs on a different stream from s, so sync before accessing
50+
# cupy runs on a different stream from our `stream`, so sync before accessing
5151
dev.sync()
5252

5353
# prepare launch
5454
block = 256
5555
grid = (size + block - 1) // block
5656
config = LaunchConfig(grid=grid, block=block)
5757

58-
# launch kernel on stream s
59-
launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
60-
s.sync()
58+
# launch kernel on stream
59+
launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
60+
stream.sync()
6161

6262
# check result
6363
assert cp.allclose(c, a + b)

0 commit comments

Comments
 (0)