Skip to content

Commit 76978c1

Browse files
committed
Standardize naming conventions across cuda.core examples
Apply consistent variable names for common objects across all cuda.core example files, addressing issue #1675. Conventions applied: - Stream: `stream` (not `s`), numbered `stream0`/`stream1` - Kernel: `kernel` (not `ker`/`gpu_ker`), descriptive `add_kernel`/`sub_kernel` - Program: `prog` (not `gpu_prog`) - Kernel args: `kernel_args` (not `ker_args`) - Program options: `program_options` (not `opts`), using ProgramOptions (not dicts) - Grid/block: `grid`/`block` (not `grid_size`/`block_size`) Made-with: Cursor
1 parent 2385e5e commit 76978c1

File tree

8 files changed

+50
-50
lines changed

8 files changed

+50
-50
lines changed

cuda_core/examples/cuda_graphs.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,9 @@ def main():
8080
result3 = cp.empty_like(a)
8181

8282
# Prepare launch configuration
83-
block_size = 256
84-
grid_size = (size + block_size - 1) // block_size
85-
config = LaunchConfig(grid=grid_size, block=block_size)
83+
block = 256
84+
grid = (size + block - 1) // block
85+
config = LaunchConfig(grid=grid, block=block)
8686

8787
# Sync before graph capture
8888
dev.sync()

cuda_core/examples/gl_interop_plasma.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,8 @@ def setup_cuda(kernel_source):
9393
dev.set_current()
9494
stream = dev.create_stream()
9595

96-
opts = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
97-
prog = Program(kernel_source, code_type="c++", options=opts)
96+
program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
97+
prog = Program(kernel_source, code_type="c++", options=program_options)
9898
mod = prog.compile("cubin")
9999
kernel = mod.get_kernel("plasma")
100100

cuda_core/examples/pytorch_example.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def __cuda_stream__(self):
4747
return (0, stream_id) # Return format required by CUDA Python
4848

4949

50-
s = dev.create_stream(PyTorchStreamWrapper(pt_stream))
50+
stream = dev.create_stream(PyTorchStreamWrapper(pt_stream))
5151

5252
# prepare program
5353
program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
@@ -59,7 +59,7 @@ def __cuda_stream__(self):
5959
)
6060

6161
# Run in single precision
62-
ker = mod.get_kernel("saxpy_kernel<float>")
62+
kernel = mod.get_kernel("saxpy_kernel<float>")
6363
dtype = torch.float32
6464

6565
# prepare input/output
@@ -74,16 +74,16 @@ def __cuda_stream__(self):
7474
block = 32
7575
grid = int((size + block - 1) // block)
7676
config = LaunchConfig(grid=grid, block=block)
77-
ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
77+
kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
7878

7979
# launch kernel on our stream
80-
launch(s, config, ker, *ker_args)
80+
launch(stream, config, kernel, *kernel_args)
8181

8282
# check result
8383
assert torch.allclose(out, a.item() * x + y)
8484

8585
# let's repeat again with double precision
86-
ker = mod.get_kernel("saxpy_kernel<double>")
86+
kernel = mod.get_kernel("saxpy_kernel<double>")
8787
dtype = torch.float64
8888

8989
# prepare input
@@ -100,10 +100,10 @@ def __cuda_stream__(self):
100100
block = 64
101101
grid = int((size + block - 1) // block)
102102
config = LaunchConfig(grid=grid, block=block)
103-
ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
103+
kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
104104

105105
# launch kernel on PyTorch's stream
106-
launch(s, config, ker, *ker_args)
106+
launch(stream, config, kernel, *kernel_args)
107107

108108
# check result
109109
assert torch.allclose(out, a * x + y)

cuda_core/examples/saxpy.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434

3535
dev = Device()
3636
dev.set_current()
37-
s = dev.create_stream()
37+
stream = dev.create_stream()
3838

3939
# prepare program
4040
program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
@@ -50,7 +50,7 @@
5050
)
5151

5252
# run in single precision
53-
ker = mod.get_kernel("saxpy<float>")
53+
kernel = mod.get_kernel("saxpy<float>")
5454
dtype = cp.float32
5555

5656
# prepare input/output
@@ -60,24 +60,24 @@
6060
x = rng.random(size, dtype=dtype)
6161
y = rng.random(size, dtype=dtype)
6262
out = cp.empty_like(x)
63-
dev.sync() # cupy runs on a different stream from s, so sync before accessing
63+
dev.sync() # cupy runs on a different stream from our `stream`, so sync before accessing
6464

6565
# prepare launch
6666
block = 32
6767
grid = int((size + block - 1) // block)
6868
config = LaunchConfig(grid=grid, block=block)
69-
ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
69+
kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
7070

71-
# launch kernel on stream s
72-
launch(s, config, ker, *ker_args)
73-
s.sync()
71+
# launch kernel on stream
72+
launch(stream, config, kernel, *kernel_args)
73+
stream.sync()
7474

7575
# check result
7676
assert cp.allclose(out, a * x + y)
7777

7878
# let's repeat again, this time allocating our own out buffer instead of cupy's
7979
# run in double precision
80-
ker = mod.get_kernel("saxpy<double>")
80+
kernel = mod.get_kernel("saxpy<double>")
8181
dtype = cp.float64
8282

8383
# prepare input
@@ -90,18 +90,18 @@
9090
# prepare output
9191
buf = dev.allocate(
9292
size * 8, # = dtype.itemsize
93-
stream=s,
93+
stream=stream,
9494
)
9595

9696
# prepare launch
9797
block = 64
9898
grid = int((size + block - 1) // block)
9999
config = LaunchConfig(grid=grid, block=block)
100-
ker_args = (a, x.data.ptr, y.data.ptr, buf, size)
100+
kernel_args = (a, x.data.ptr, y.data.ptr, buf, size)
101101

102-
# launch kernel on stream s
103-
launch(s, config, ker, *ker_args)
104-
s.sync()
102+
# launch kernel on stream
103+
launch(stream, config, kernel, *kernel_args)
104+
stream.sync()
105105

106106
# check result
107107
# we wrap output buffer as a cupy array for simplicity
@@ -112,5 +112,5 @@
112112

113113
# clean up resources that we allocate
114114
# cupy cleans up automatically the rest
115-
buf.close(s)
116-
s.close()
115+
buf.close(stream)
116+
stream.close()

cuda_core/examples/simple_multi_gpu_example.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import sys
1313

1414
import cupy as cp
15-
from cuda.core import Device, LaunchConfig, Program, launch, system
15+
from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system
1616

1717
if system.get_num_devices() < 2:
1818
print("this example requires at least 2 GPUs", file=sys.stderr)
@@ -39,9 +39,9 @@
3939
}
4040
}
4141
"""
42-
prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"})
42+
prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}"))
4343
mod_add = prog_add.compile("cubin")
44-
ker_add = mod_add.get_kernel("vector_add")
44+
add_kernel = mod_add.get_kernel("vector_add")
4545

4646
# Set GPU 1
4747
dev1 = Device(1)
@@ -61,9 +61,9 @@
6161
}
6262
}
6363
"""
64-
prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"})
64+
prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}"))
6565
mod_sub = prog_sub.compile("cubin")
66-
ker_sub = mod_sub.get_kernel("vector_sub")
66+
sub_kernel = mod_sub.get_kernel("vector_sub")
6767

6868

6969
# This adaptor ensures that any foreign stream (ex: from CuPy) that have not
@@ -99,7 +99,7 @@ def __cuda_stream__(self):
9999
stream0.wait(cp_stream0)
100100

101101
# Launch the add kernel on GPU 0 / stream 0
102-
launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
102+
launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
103103

104104
# Allocate memory on GPU 1
105105
# Note: This runs on CuPy's current stream for GPU 1.
@@ -114,7 +114,7 @@ def __cuda_stream__(self):
114114
stream1.wait(cp_stream1)
115115

116116
# Launch the subtract kernel on GPU 1 / stream 1
117-
launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
117+
launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
118118

119119
# Synchronize both GPUs and validate the results
120120
dev0.set_current()

cuda_core/examples/strided_memory_view_gpu.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
5757
# of which are supported by StridedMemoryView).
5858
@args_viewable_as_strided_memory((0,))
59-
def my_func(arr, work_stream, gpu_ker):
59+
def my_func(arr, work_stream, kernel):
6060
# Create a memory view over arr (assumed to be a 1D array of int32). The stream
6161
# ordering is taken care of, so that arr can be safely accessed on our work
6262
# stream (ordered after a data stream on which arr is potentially prepared).
@@ -72,7 +72,7 @@ def my_func(arr, work_stream, gpu_ker):
7272
block = 256
7373
grid = (size + block - 1) // block
7474
config = LaunchConfig(grid=grid, block=block)
75-
launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
75+
launch(work_stream, config, kernel, view.ptr, np.uint64(size))
7676
# Here we're being conservative and synchronize over our work stream,
7777
# assuming we do not know the data stream; if we know then we could
7878
# just order the data stream after the work stream here, e.g.
@@ -100,24 +100,24 @@ def run():
100100
# To know the GPU's compute capability, we need to identify which GPU to use.
101101
dev = Device(0)
102102
dev.set_current()
103-
gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
104-
mod = gpu_prog.compile(target_type="cubin")
105-
gpu_ker = mod.get_kernel(func_name)
103+
prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
104+
mod = prog.compile(target_type="cubin")
105+
kernel = mod.get_kernel(func_name)
106106

107-
s = dev.create_stream()
107+
stream = dev.create_stream()
108108
try:
109109
# Create input array on GPU
110110
arr_gpu = cp.ones(1024, dtype=cp.int32)
111111
print(f"before: {arr_gpu[:10]=}")
112112

113113
# Run the workload
114-
my_func(arr_gpu, s, gpu_ker)
114+
my_func(arr_gpu, stream, kernel)
115115

116116
# Check the result
117117
print(f"after: {arr_gpu[:10]=}")
118118
assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
119119
finally:
120-
s.close()
120+
stream.close()
121121

122122

123123
if __name__ == "__main__":

cuda_core/examples/thread_block_cluster.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path),
9494
)
9595
mod = prog.compile(target_type="cubin")
96-
ker = mod.get_kernel("check_cluster_info")
96+
kernel = mod.get_kernel("check_cluster_info")
9797

9898
# prepare launch config
9999
grid = 4
@@ -121,7 +121,7 @@
121121
block_dims[:] = 0
122122

123123
# launch kernel on the default stream
124-
launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer)
124+
launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer)
125125
dev.sync()
126126

127127
# verify results

cuda_core/examples/vector_add.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,15 @@
2929

3030
dev = Device()
3131
dev.set_current()
32-
s = dev.create_stream()
32+
stream = dev.create_stream()
3333

3434
# prepare program
3535
program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
3636
prog = Program(code, code_type="c++", options=program_options)
3737
mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
3838

3939
# run in single precision
40-
ker = mod.get_kernel("vector_add<float>")
40+
kernel = mod.get_kernel("vector_add<float>")
4141
dtype = cp.float32
4242

4343
# prepare input/output
@@ -47,17 +47,17 @@
4747
b = rng.random(size, dtype=dtype)
4848
c = cp.empty_like(a)
4949

50-
# cupy runs on a different stream from s, so sync before accessing
50+
# cupy runs on a different stream from our `stream`, so sync before accessing
5151
dev.sync()
5252

5353
# prepare launch
5454
block = 256
5555
grid = (size + block - 1) // block
5656
config = LaunchConfig(grid=grid, block=block)
5757

58-
# launch kernel on stream s
59-
launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
60-
s.sync()
58+
# launch kernel on stream
59+
launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
60+
stream.sync()
6161

6262
# check result
6363
assert cp.allclose(c, a + b)

0 commit comments

Comments
 (0)