The following IR does not work with the pipeline below: by the time the caller reads it, the returned pointer has already been deallocated. It works fine when generation of the gpu-dealloc op/call is omitted.
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/ptensor-gpu.pp --runner imex-cpu-runner -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck --O3
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/ptensor-gpu.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
// 2-D identity indexing map used by the linalg.generic zero-fill below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func private @printMemrefI32(tensor<*xi32>)
func.func private @printMemrefF32(tensor<*xf32>)
// Entry point: calls @ddpt_jit and prints its second result (the 16x16
// zero-filled i32 buffer). NOTE(review): per the report above, the pipeline
// below inserts a gpu dealloc for this escaping buffer, so the memref read
// here is presumably already freed — confirm which allocation receives the
// dealloc after insert-gpu-allocs runs.
func.func @main() {
%0:4 = call @ddpt_jit() : () -> (memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<2xindex>)
// View result #1 as a tensor and erase its rank/shape for the print helper.
%1 = bufferization.to_tensor %0#1 : memref<?x?xi32, strided<[?, ?], offset: ?>>
%cast = tensor.cast %1 : tensor<?x?xi32> to tensor<*xi32>
call @printMemrefI32(%cast) : (tensor<*xi32>) -> ()
// CHECK: Unranked Memref base@ = {{(0x)?[-9a-f]*}}
return
}
// Builds and returns four buffers:
//   #0 and #2: the same 0x0 (empty) i32 buffer, cast to a dynamic strided memref
//   #1:        a 16x16 i32 buffer zero-filled via linalg.generic
//   #3:        a 2-element index buffer holding {0, 0}
// Every buffer escapes through the return, so none may be deallocated in
// this function (or by any pass-inserted dealloc).
func.func @ddpt_jit() -> (memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<2xindex>) attributes {llvm.emit_c_interface} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
// Zero-fill: yield the constant 0 for every element of a fresh 16x16 tensor.
%0 = tensor.empty() : tensor<16x16xi32>
%1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]} outs(%0 : tensor<16x16xi32>) {
^bb0(%out: i32):
linalg.yield %c0_i32 : i32
} -> tensor<16x16xi32>
// Empty 0x0 placeholder, returned (twice) as results #0 and #2.
%2 = tensor.empty() : tensor<0x0xi32>
%3 = bufferization.to_memref %2 : memref<0x0xi32>
%cast = memref.cast %3 : memref<0x0xi32> to memref<?x?xi32, strided<[?, ?], offset: ?>>
// Materialize the filled tensor as a memref and erase its static shape.
%4 = bufferization.to_memref %1 : memref<16x16xi32>
%cast_0 = memref.cast %4 : memref<16x16xi32> to memref<?x?xi32, strided<[?, ?], offset: ?>>
// Result #3: a 2-element index buffer initialized to {0, 0}.
%alloc = memref.alloc() {alignment = 8 : i64} : memref<2xindex>
memref.store %c0, %alloc[%c0] : memref<2xindex>
memref.store %c0, %alloc[%c1] : memref<2xindex>
return %cast, %cast_0, %cast, %alloc : memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<2xindex>
}
}
// Pass pipeline (ptensor-gpu.pp): lowers the tensor-level IR above through
// bufferization, maps parallel loops to GPU kernels, serializes them to
// SPIR-V, and converts the host side to LLVM.
builtin.module(
// TOSA legalization down to linalg/tensor, then cleanup and fusion.
func.func(tosa-make-broadcastable)
func.func(tosa-to-linalg)
func.func(tosa-to-tensor)
canonicalize
linalg-fuse-elementwise-ops
arith-expand
memref-expand
// Bufferization: convert tensor ops to memref ops, finishing with
// finalizing-bufferize, then drop redundant temporaries.
arith-bufferize
func-bufferize
func.func(empty-tensor-to-alloc-tensor)
func.func(scf-bufferize)
func.func(tensor-bufferize)
func.func(bufferization-bufferize)
func.func(linalg-bufferize)
func.func(linalg-detensorize)
func.func(tensor-bufferize)
func.func(finalizing-bufferize)
imex-remove-temporaries
// Turn linalg ops into scf.parallel loops and fuse them.
func.func(convert-linalg-to-parallel-loops)
func.func(scf-parallel-loop-fusion)
// GPU
func.func(imex-add-outer-parallel-loop)
func.func(gpu-map-parallel-loops)
func.func(convert-parallel-loops-to-gpu)
// insert-gpu-allocs pass can have client-api = opencl or vulkan args
// NOTE(review): this is the pass the report above implicates — it appears
// to insert a gpu dealloc for a buffer that escapes via the function
// return; verify its escape analysis on @ddpt_jit's results.
func.func(insert-gpu-allocs{client-api=opencl})
canonicalize
normalize-memrefs
// Unstride memrefs does not seem to be needed.
// func.func(unstride-memrefs)
func.func(lower-affine)
// Outline GPU kernel bodies into gpu.module ops.
gpu-kernel-outlining
canonicalize
cse
// The following set-spirv-* passes can have client-api = opencl or vulkan args
set-spirv-capabilities{client-api=opencl}
gpu.module(set-spirv-abi-attrs{client-api=opencl})
canonicalize
fold-memref-alias-ops
// Convert outlined kernels to SPIR-V and serialize them into binaries.
imex-convert-gpu-to-spirv
spirv.module(spirv-lower-abi-attrs
spirv-update-vce)
func.func(llvm-request-c-wrappers)
serialize-spirv
// Host-side lowering: gpu ops to GPUX runtime calls, then everything to LLVM.
convert-gpu-to-gpux
convert-func-to-llvm
convert-math-to-llvm
convert-gpux-to-llvm
expand-strided-metadata
lower-affine
finalize-memref-to-llvm
reconcile-unrealized-casts)
(Duplicate of the description at the top.) The preceding IR does not work with the above pipeline: by the time the caller reads it, the returned pointer has already been deallocated. It works fine when generation of the gpu-dealloc op/call is omitted.