32 changes: 23 additions & 9 deletions compute/gpu_engine.go
@@ -978,13 +978,16 @@ func (e *GPUEngine[T]) MatMul(ctx context.Context, a, b *tensor.TensorNumeric[T]
 		return nil, err
 	}
 
-	// Allocate device output.
-	devCTotal, err := e.pool.Alloc(e.deviceID, outputBytes)
-	if err != nil {
-		e.oomFallbackCount.Add(1)
-		e.logger.Warn("MatMul: GPU output alloc failed, falling back to CPU", "error", err.Error())
+	// Reuse dst's existing GPU memory when possible (#84).
+	devCTotal, reusedC := tryReuseDstPtr[T](batchSize*cMatSize, dst)
+	if !reusedC {
+		devCTotal, err = e.pool.Alloc(e.deviceID, outputBytes)
+		if err != nil {
+			e.oomFallbackCount.Add(1)
+			e.logger.Warn("MatMul: GPU output alloc failed, falling back to CPU", "error", err.Error())
 
-		return e.cpu.MatMul(ctx, a, b, dst...)
+			return e.cpu.MatMul(ctx, a, b, dst...)
+		}
 	}
 
 	// Use strided batched GEMM when available for float32 with batch > 1.
@@ -1013,9 +1016,14 @@ func (e *GPUEngine[T]) MatMul(ctx context.Context, a, b *tensor.TensorNumeric[T]
 			if err := batched.SgemmStridedBatched(m, n, k, 1.0,
 				devA, strideA, devB, strideBVal, 0.0,
 				devCTotal, strideC, batchSize); err != nil {
-				e.pool.Free(e.deviceID, devCTotal, outputBytes)
+				if !reusedC {
+					e.pool.Free(e.deviceID, devCTotal, outputBytes)
+				}
 				return nil, fmt.Errorf("MatMul: batched GEMM: %w", err)
 			}
+			if reusedC {
+				return finishReusedDst[T](dst[0], outShape), nil
+			}
 			return makeGPUResult[T](e, outShape, devCTotal, batchSize*cMatSize, dst...)
 		}
 	}
@@ -1052,12 +1060,17 @@ func (e *GPUEngine[T]) MatMul(ctx context.Context, a, b *tensor.TensorNumeric[T]
 		}
 
 		if blasErr != nil {
-			e.pool.Free(e.deviceID, devCTotal, outputBytes)
+			if !reusedC {
+				e.pool.Free(e.deviceID, devCTotal, outputBytes)
+			}
 
 			return nil, fmt.Errorf("MatMul: BLAS batch %d: %w", batch, blasErr)
 		}
 	}
 
+	if reusedC {
+		return finishReusedDst[T](dst[0], outShape), nil
+	}
 	return makeGPUResult[T](e, outShape, devCTotal, batchSize*cMatSize, dst...)
 }
 
@@ -1256,7 +1269,8 @@ func (e *GPUEngine[T]) matMulQ4(ctx context.Context, qs *tensor.Q4Storage, a, b
 // matMulQ4BWeight handles MatMul where B has Q4 storage (virtual-transposed weight).
 // B's shape after virtual transpose is [K, N], but the Q4 data is laid out as [N, K].
 // We compute C[M, N] = A[M, K] * dequant(B)^T by reformulating as:
-// C_temp[N, M] = gemm_q4(B_q4[N, K], A^T[K, M])
+//
+//	C_temp[N, M] = gemm_q4(B_q4[N, K], A^T[K, M])
 //
 // For GEMV (M=1), A^T[K,1] is just A's data as a column, and C_temp[N,1]
 // can be reshaped to [1, N] without a physical transpose.
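The GEMV remark in the matMulQ4BWeight comment above is worth seeing concretely: for M=1 the [N, 1] result already has the memory layout of the [1, N] row. A minimal CPU sketch of that case, with plain float32 standing in for the Q4 path and matVec as a hypothetical helper, not part of this package:

package main

import "fmt"

// matVec computes y[N] = W[N,K] * x[K] with W stored row-major, mirroring
// gemm_q4(B_q4[N, K], A^T[K, 1]) from the comment above (dequantization
// omitted for clarity).
func matVec(w []float32, n, k int, x []float32) []float32 {
	y := make([]float32, n)
	for i := 0; i < n; i++ {
		for j := 0; j < k; j++ {
			y[i] += w[i*k+j] * x[j]
		}
	}
	return y
}

func main() {
	// W is [N, K] = [2, 3]; x has K = 3 entries (the M=1 GEMV case).
	w := []float32{1, 2, 3, 4, 5, 6}
	x := []float32{1, 1, 1}
	// y is C_temp[N, 1]; contiguously it is already C[1, N], so the
	// final reshape to [1, N] needs no physical transpose.
	fmt.Println(matVec(w, 2, 3, x)) // [6 15]
}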
25 changes: 20 additions & 5 deletions compute/gpu_engine_memory.go
@@ -132,9 +132,14 @@ func (e *GPUEngine[T]) Transpose(ctx context.Context, a *tensor.TensorNumeric[T]
 	}
 
 	byteSize := total * f32Size
-	devOut, err := e.pool.Alloc(e.deviceID, byteSize)
-	if err != nil {
-		return e.cpu.Transpose(ctx, a, axes, dst...)
+
+	// Reuse dst's existing GPU memory when possible (#84).
+	devOut, reused := tryReuseDstPtr[T](total, dst)
+	if !reused {
+		devOut, err = e.pool.Alloc(e.deviceID, byteSize)
+		if err != nil {
+			return e.cpu.Transpose(ctx, a, axes, dst...)
+		}
 	}
 
 	// Fast path: 2D transpose.
@@ -145,9 +150,14 @@ func (e *GPUEngine[T]) Transpose(ctx context.Context, a *tensor.TensorNumeric[T]
 				"cols", fmt.Sprintf("%d", shape[1]))
 		}
 		if err := e.kernels.Transpose2D(devIn, devOut, shape[0], shape[1], e.stream); err != nil {
-			e.pool.Free(e.deviceID, devOut, byteSize)
+			if !reused {
+				e.pool.Free(e.deviceID, devOut, byteSize)
+			}
 			return nil, err
 		}
+		if reused {
+			return finishReusedDst[T](dst[0], outShape), nil
+		}
 		return makeGPUResult[T](e, outShape, devOut, total, dst...)
 	}
 
@@ -175,10 +185,15 @@ func (e *GPUEngine[T]) Transpose(ctx context.Context, a *tensor.TensorNumeric[T]
 	}
 
 	if err := e.kernels.TransposeND(devIn, devOut, inStrides32, outStrides32, perm32, rank, total, e.stream); err != nil {
-		e.pool.Free(e.deviceID, devOut, byteSize)
+		if !reused {
+			e.pool.Free(e.deviceID, devOut, byteSize)
+		}
 		return nil, err
 	}
 
+	if reused {
+		return finishReusedDst[T](dst[0], outShape), nil
+	}
 	return makeGPUResult[T](e, outShape, devOut, total, dst...)
 }
 
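These Transpose changes follow the same reuse-else-allocate shape as the helpers introduced in gpu_kernels.go below. As a CPU-side analogy of that pattern, using ordinary Go slices instead of pool-backed device buffers (reuseOrAlloc is an illustrative name, not part of the package):

package main

import "fmt"

// reuseOrAlloc mirrors the tryReuseDstPtr pattern: reuse an existing
// buffer when its capacity suffices, otherwise allocate a fresh one.
// The bool tells the caller whether the buffer was reused, i.e. whether
// a failure path would need to free a newly allocated buffer.
func reuseOrAlloc(dst []float32, needed int) ([]float32, bool) {
	if cap(dst) >= needed {
		return dst[:needed], true // reuse: no new allocation
	}
	return make([]float32, needed), false
}

func main() {
	buf := make([]float32, 16)
	out, reused := reuseOrAlloc(buf, 12)
	fmt.Println(len(out), reused) // 12 true
	out, reused = reuseOrAlloc(buf, 32)
	fmt.Println(len(out), reused) // 32 false
}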
101 changes: 83 additions & 18 deletions compute/gpu_kernels.go
@@ -115,6 +115,35 @@ func getDevicePtr[T tensor.Numeric](e *GPUEngine[T], t *tensor.TensorNumeric[T])
 	return devPtr, cleanup, nil
 }
 
+// tryReuseDstPtr checks whether dst[0] already has a GPUStorage with at least
+// neededElems capacity. If so, it returns the existing device pointer so the
+// caller can write kernel output directly into it, avoiding a pool.Alloc and
+// the resulting GC pressure from orphaned GPUStorage objects. See ztensor#84.
+func tryReuseDstPtr[T tensor.Numeric](neededElems int, dst []*tensor.TensorNumeric[T]) (unsafe.Pointer, bool) {
+	if len(dst) == 0 || dst[0] == nil {
+		return nil, false
+	}
+	gs, ok := dst[0].GetStorage().(*tensor.GPUStorage[T])
+	if !ok || gs.Len() < neededElems {
+		return nil, false
+	}
+	return gs.Ptr(), true
+}
+
+// finishReusedDst updates dst's shape and strides in place after a kernel has
+// written into dst's existing device memory. No new GPUStorage is created.
+func finishReusedDst[T tensor.Numeric](dst *tensor.TensorNumeric[T], shape []int) *tensor.TensorNumeric[T] {
+	strides := make([]int, len(shape))
+	stride := 1
+	for i := len(shape) - 1; i >= 0; i-- {
+		strides[i] = stride
+		stride *= shape[i]
+	}
+	dst.SetShape(shape)
+	dst.SetStrides(strides)
+	return dst
+}
+
 // makeGPUResult creates a tensor with pool-backed GPUStorage wrapping the given
 // device pointer. When the tensor is freed, the pointer is returned to the pool
 // for reuse instead of calling cudaFree.
@@ -522,17 +551,26 @@ func gpuBinaryOp[T tensor.Numeric](
 
 	byteSize := n * f32Size
 
-	devC, err := e.pool.Alloc(e.deviceID, byteSize)
-	if err != nil {
-		return nil, err
+	// Reuse dst's existing GPU memory when possible (#84).
+	devC, reused := tryReuseDstPtr[T](n, dst)
+	if !reused {
+		devC, err = e.pool.Alloc(e.deviceID, byteSize)
+		if err != nil {
+			return nil, err
+		}
 	}
 
 	if err := kernelFn(devA, devB, devC, n, e.stream); err != nil {
-		e.pool.Free(e.deviceID, devC, byteSize)
+		if !reused {
+			e.pool.Free(e.deviceID, devC, byteSize)
+		}
 
 		return nil, err
 	}
 
+	if reused {
+		return finishReusedDst[T](dst[0], a.Shape()), nil
+	}
 	return makeGPUResult[T](e, a.Shape(), devC, n, dst...)
 }
 
@@ -559,17 +597,26 @@ func gpuUnaryOp[T tensor.Numeric](
 
 	byteSize := n * f32Size
 
-	devC, err := e.pool.Alloc(e.deviceID, byteSize)
-	if err != nil {
-		return nil, err
+	// Reuse dst's existing GPU memory when possible (#84).
+	devC, reused := tryReuseDstPtr[T](n, dst)
+	if !reused {
+		devC, err = e.pool.Alloc(e.deviceID, byteSize)
+		if err != nil {
+			return nil, err
+		}
 	}
 
 	if err := kernelFn(devA, devC, n, e.stream); err != nil {
-		e.pool.Free(e.deviceID, devC, byteSize)
+		if !reused {
+			e.pool.Free(e.deviceID, devC, byteSize)
+		}
 
 		return nil, err
 	}
 
+	if reused {
+		return finishReusedDst[T](dst[0], a.Shape()), nil
+	}
 	return makeGPUResult[T](e, a.Shape(), devC, n, dst...)
 }
 
@@ -597,17 +644,26 @@ func gpuScalarOp[T tensor.Numeric](
 
 	byteSize := n * f32Size
 
-	devC, err := e.pool.Alloc(e.deviceID, byteSize)
-	if err != nil {
-		return nil, err
+	// Reuse dst's existing GPU memory when possible (#84).
+	devC, reused := tryReuseDstPtr[T](n, dst)
+	if !reused {
+		devC, err = e.pool.Alloc(e.deviceID, byteSize)
+		if err != nil {
+			return nil, err
+		}
 	}
 
 	if err := kernelFn(devA, scalar, devC, n, e.stream); err != nil {
-		e.pool.Free(e.deviceID, devC, byteSize)
+		if !reused {
+			e.pool.Free(e.deviceID, devC, byteSize)
+		}
 
 		return nil, err
 	}
 
+	if reused {
+		return finishReusedDst[T](dst[0], a.Shape()), nil
+	}
 	return makeGPUResult[T](e, a.Shape(), devC, n, dst...)
 }
 
@@ -957,20 +1013,29 @@ func (e *GPUEngine[T]) gpuSum(ctx context.Context, a *tensor.TensorNumeric[T], a
 
 	outByteSize := numStripes * f32Size
 
-	devOut, err := e.pool.Alloc(e.deviceID, outByteSize)
-	if err != nil {
-		e.oomFallbackCount.Add(1)
-		e.logger.Warn("Sum: GPU output alloc failed, falling back to CPU", "error", err.Error())
+	// Reuse dst's existing GPU memory when possible (#84).
+	devOut, reused := tryReuseDstPtr[T](numStripes, dst)
+	if !reused {
+		devOut, err = e.pool.Alloc(e.deviceID, outByteSize)
+		if err != nil {
+			e.oomFallbackCount.Add(1)
+			e.logger.Warn("Sum: GPU output alloc failed, falling back to CPU", "error", err.Error())
 
-		return e.cpu.Sum(ctx, a, axis, keepDims, dst...)
+			return e.cpu.Sum(ctx, a, axis, keepDims, dst...)
+		}
 	}
 
 	if err := e.kernels.SumAxis(devIn, devOut, outer, inner, axisSize, e.stream); err != nil {
-		e.pool.Free(e.deviceID, devOut, outByteSize)
+		if !reused {
+			e.pool.Free(e.deviceID, devOut, outByteSize)
+		}
 
 		return nil, err
 	}
 
+	if reused {
+		return finishReusedDst[T](dst[0], newShape), nil
+	}
 	return makeGPUResult[T](e, newShape, devOut, numStripes, dst...)
 }
 
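For reference, the stride computation in finishReusedDst is the standard row-major rule: the innermost axis gets stride 1 and each outer stride is the product of the dimensions to its right. A runnable extract (rowMajorStrides is a hypothetical standalone copy of that loop, not a function in this PR):

package main

import "fmt"

// rowMajorStrides reproduces the loop in finishReusedDst: the innermost
// stride is 1, and each outer stride is the product of all dimensions
// to its right.
func rowMajorStrides(shape []int) []int {
	strides := make([]int, len(shape))
	stride := 1
	for i := len(shape) - 1; i >= 0; i-- {
		strides[i] = stride
		stride *= shape[i]
	}
	return strides
}

func main() {
	fmt.Println(rowMajorStrides([]int{2, 3, 4})) // [12 4 1]
}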