The following IR does not work with the pipeline below: by the time the caller reads it, the returned pointer has already been deallocated. It works fine when generation of the gpu-dealloc op/call is omitted.
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/ptensor-gpu.pp --runner imex-cpu-runner -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck --O3
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/ptensor-gpu.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
// 2-D identity indexing map used by the linalg.generic zero-fill below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func private @printMemrefI32(tensor<*xi32>)
func.func private @printMemrefF32(tensor<*xf32>)
// Entry point: calls @ddpt_jit and prints its second result (the 16x16
// zero-filled i32 buffer). NOTE(review): per the report above, the pipeline
// below inserts a gpu dealloc for this escaping buffer, so the memref read
// here is presumably already freed — confirm which allocation receives the
// dealloc after insert-gpu-allocs runs.
func.func @main() {
%0:4 = call @ddpt_jit() : () -> (memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<2xindex>)
// View result #1 as a tensor and erase its rank/shape for the print helper.
%1 = bufferization.to_tensor %0#1 : memref<?x?xi32, strided<[?, ?], offset: ?>>
%cast = tensor.cast %1 : tensor<?x?xi32> to tensor<*xi32>
call @printMemrefI32(%cast) : (tensor<*xi32>) -> ()
// CHECK: Unranked Memref base@ = {{(0x)?[-9a-f]*}}
return
}
// Builds and returns four buffers:
//   #0 and #2: the same 0x0 (empty) i32 buffer, cast to a dynamic strided memref
//   #1:        a 16x16 i32 buffer zero-filled via linalg.generic
//   #3:        a 2-element index buffer holding {0, 0}
// Every buffer escapes through the return, so none may be deallocated in
// this function (or by any pass-inserted dealloc).
func.func @ddpt_jit() -> (memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<2xindex>) attributes {llvm.emit_c_interface} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
// Zero-fill: yield the constant 0 for every element of a fresh 16x16 tensor.
%0 = tensor.empty() : tensor<16x16xi32>
%1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]} outs(%0 : tensor<16x16xi32>) {
^bb0(%out: i32):
linalg.yield %c0_i32 : i32
} -> tensor<16x16xi32>
// Empty 0x0 placeholder, returned (twice) as results #0 and #2.
%2 = tensor.empty() : tensor<0x0xi32>
%3 = bufferization.to_memref %2 : memref<0x0xi32>
%cast = memref.cast %3 : memref<0x0xi32> to memref<?x?xi32, strided<[?, ?], offset: ?>>
// Materialize the filled tensor as a memref and erase its static shape.
%4 = bufferization.to_memref %1 : memref<16x16xi32>
%cast_0 = memref.cast %4 : memref<16x16xi32> to memref<?x?xi32, strided<[?, ?], offset: ?>>
// Result #3: a 2-element index buffer initialized to {0, 0}.
%alloc = memref.alloc() {alignment = 8 : i64} : memref<2xindex>
memref.store %c0, %alloc[%c0] : memref<2xindex>
memref.store %c0, %alloc[%c1] : memref<2xindex>
return %cast, %cast_0, %cast, %alloc : memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<2xindex>
}
}
// Pass pipeline (ptensor-gpu.pp): lowers the tensor-level IR above through
// bufferization, maps parallel loops to GPU kernels, serializes them to
// SPIR-V, and converts the host side to LLVM.
builtin.module(
// TOSA legalization down to linalg/tensor, then cleanup and fusion.
func.func(tosa-make-broadcastable)
func.func(tosa-to-linalg)
func.func(tosa-to-tensor)
canonicalize
linalg-fuse-elementwise-ops
arith-expand
memref-expand
// Bufferization: convert tensor ops to memref ops, finishing with
// finalizing-bufferize, then drop redundant temporaries.
arith-bufferize
func-bufferize
func.func(empty-tensor-to-alloc-tensor)
func.func(scf-bufferize)
func.func(tensor-bufferize)
func.func(bufferization-bufferize)
func.func(linalg-bufferize)
func.func(linalg-detensorize)
func.func(tensor-bufferize)
func.func(finalizing-bufferize)
imex-remove-temporaries
// Turn linalg ops into scf.parallel loops and fuse them.
func.func(convert-linalg-to-parallel-loops)
func.func(scf-parallel-loop-fusion)
// GPU
func.func(imex-add-outer-parallel-loop)
func.func(gpu-map-parallel-loops)
func.func(convert-parallel-loops-to-gpu)
// insert-gpu-allocs pass can have client-api = opencl or vulkan args
// NOTE(review): this is the pass the report above implicates — it appears
// to insert a gpu dealloc for a buffer that escapes via the function
// return; verify its escape analysis on @ddpt_jit's results.
func.func(insert-gpu-allocs{client-api=opencl})
canonicalize
normalize-memrefs
// Unstride memrefs does not seem to be needed.
// func.func(unstride-memrefs)
func.func(lower-affine)
// Outline GPU kernel bodies into gpu.module ops.
gpu-kernel-outlining
canonicalize
cse
// The following set-spirv-* passes can have client-api = opencl or vulkan args
set-spirv-capabilities{client-api=opencl}
gpu.module(set-spirv-abi-attrs{client-api=opencl})
canonicalize
fold-memref-alias-ops
// Convert outlined kernels to SPIR-V and serialize them into binaries.
imex-convert-gpu-to-spirv
spirv.module(spirv-lower-abi-attrs
spirv-update-vce)
func.func(llvm-request-c-wrappers)
serialize-spirv
// Host-side lowering: gpu ops to GPUX runtime calls, then everything to LLVM.
convert-gpu-to-gpux
convert-func-to-llvm
convert-math-to-llvm
convert-gpux-to-llvm
expand-strided-metadata
lower-affine
finalize-memref-to-llvm
reconcile-unrealized-casts)
(Duplicate of the description at the top.) The preceding IR does not work with the above pipeline: by the time the caller reads it, the returned pointer has already been deallocated. It works fine when generation of the gpu-dealloc op/call is omitted.