5656# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
5757# of which are supported by StridedMemoryView).
5858@args_viewable_as_strided_memory ((0 ,))
59- def my_func (arr , work_stream , gpu_ker ):
59+ def my_func (arr , work_stream , kernel ):
6060 # Create a memory view over arr (assumed to be a 1D array of int32). The stream
6161 # ordering is taken care of, so that arr can be safely accessed on our work
6262 # stream (ordered after a data stream on which arr is potentially prepared).
@@ -72,7 +72,7 @@ def my_func(arr, work_stream, gpu_ker):
7272 block = 256
7373 grid = (size + block - 1 ) // block
7474 config = LaunchConfig (grid = grid , block = block )
75- launch (work_stream , config , gpu_ker , view .ptr , np .uint64 (size ))
75+ launch (work_stream , config , kernel , view .ptr , np .uint64 (size ))
7676 # Here we're being conservative and synchronizing over our work stream,
7777 # assuming we do not know the data stream; if we knew it, we could
7878 # just order the data stream after the work stream here, e.g.
@@ -100,24 +100,24 @@ def run():
100100 # To know the GPU's compute capability, we need to identify which GPU to use.
101101 dev = Device (0 )
102102 dev .set_current ()
103- gpu_prog = Program (gpu_code , code_type = "c++" , options = ProgramOptions (arch = f"sm_{ dev .arch } " , std = "c++11" ))
104- mod = gpu_prog .compile (target_type = "cubin" )
105- gpu_ker = mod .get_kernel (func_name )
103+ prog = Program (gpu_code , code_type = "c++" , options = ProgramOptions (arch = f"sm_{ dev .arch } " , std = "c++11" ))
104+ mod = prog .compile (target_type = "cubin" )
105+ kernel = mod .get_kernel (func_name )
106106
107- s = dev .create_stream ()
107+ stream = dev .create_stream ()
108108 try :
109109 # Create input array on GPU
110110 arr_gpu = cp .ones (1024 , dtype = cp .int32 )
111111 print (f"before: { arr_gpu [:10 ]= } " )
112112
113113 # Run the workload
114- my_func (arr_gpu , s , gpu_ker )
114+ my_func (arr_gpu , stream , kernel )
115115
116116 # Check the result
117117 print (f"after: { arr_gpu [:10 ]= } " )
118118 assert cp .allclose (arr_gpu , 1 + cp .arange (1024 , dtype = cp .int32 ))
119119 finally :
120- s .close ()
120+ stream .close ()
121121
122122
123123if __name__ == "__main__" :
0 commit comments