diff --git a/compute/gpu_engine.go b/compute/gpu_engine.go index e57c847..8482fb5 100644 --- a/compute/gpu_engine.go +++ b/compute/gpu_engine.go @@ -978,13 +978,16 @@ func (e *GPUEngine[T]) MatMul(ctx context.Context, a, b *tensor.TensorNumeric[T] return nil, err } - // Allocate device output. - devCTotal, err := e.pool.Alloc(e.deviceID, outputBytes) - if err != nil { - e.oomFallbackCount.Add(1) - e.logger.Warn("MatMul: GPU output alloc failed, falling back to CPU", "error", err.Error()) + // Reuse dst's existing GPU memory when possible (#84). + devCTotal, reusedC := tryReuseDstPtr[T](batchSize*cMatSize, dst) + if !reusedC { + devCTotal, err = e.pool.Alloc(e.deviceID, outputBytes) + if err != nil { + e.oomFallbackCount.Add(1) + e.logger.Warn("MatMul: GPU output alloc failed, falling back to CPU", "error", err.Error()) - return e.cpu.MatMul(ctx, a, b, dst...) + return e.cpu.MatMul(ctx, a, b, dst...) + } } // Use strided batched GEMM when available for float32 with batch > 1. @@ -1013,9 +1016,14 @@ func (e *GPUEngine[T]) MatMul(ctx context.Context, a, b *tensor.TensorNumeric[T] if err := batched.SgemmStridedBatched(m, n, k, 1.0, devA, strideA, devB, strideBVal, 0.0, devCTotal, strideC, batchSize); err != nil { - e.pool.Free(e.deviceID, devCTotal, outputBytes) + if !reusedC { + e.pool.Free(e.deviceID, devCTotal, outputBytes) + } return nil, fmt.Errorf("MatMul: batched GEMM: %w", err) } + if reusedC { + return finishReusedDst[T](dst[0], outShape), nil + } return makeGPUResult[T](e, outShape, devCTotal, batchSize*cMatSize, dst...) 
} } @@ -1052,12 +1060,17 @@ func (e *GPUEngine[T]) MatMul(ctx context.Context, a, b *tensor.TensorNumeric[T] } if blasErr != nil { - e.pool.Free(e.deviceID, devCTotal, outputBytes) + if !reusedC { + e.pool.Free(e.deviceID, devCTotal, outputBytes) + } return nil, fmt.Errorf("MatMul: BLAS batch %d: %w", batch, blasErr) } } + if reusedC { + return finishReusedDst[T](dst[0], outShape), nil + } return makeGPUResult[T](e, outShape, devCTotal, batchSize*cMatSize, dst...) } @@ -1256,7 +1269,8 @@ func (e *GPUEngine[T]) matMulQ4(ctx context.Context, qs *tensor.Q4Storage, a, b // matMulQ4BWeight handles MatMul where B has Q4 storage (virtual-transposed weight). // B's shape after virtual transpose is [K, N], but the Q4 data is laid out as [N, K]. // We compute C[M, N] = A[M, K] * dequant(B)^T by reformulating as: -// C_temp[N, M] = gemm_q4(B_q4[N, K], A^T[K, M]) +// +// C_temp[N, M] = gemm_q4(B_q4[N, K], A^T[K, M]) // // For GEMV (M=1), A^T[K,1] is just A's data as a column, and C_temp[N,1] // can be reshaped to [1, N] without a physical transpose. diff --git a/compute/gpu_engine_memory.go b/compute/gpu_engine_memory.go index 1aef149..6f90f16 100644 --- a/compute/gpu_engine_memory.go +++ b/compute/gpu_engine_memory.go @@ -132,9 +132,14 @@ func (e *GPUEngine[T]) Transpose(ctx context.Context, a *tensor.TensorNumeric[T] } byteSize := total * f32Size - devOut, err := e.pool.Alloc(e.deviceID, byteSize) - if err != nil { - return e.cpu.Transpose(ctx, a, axes, dst...) + + // Reuse dst's existing GPU memory when possible (#84). + devOut, reused := tryReuseDstPtr[T](total, dst) + if !reused { + devOut, err = e.pool.Alloc(e.deviceID, byteSize) + if err != nil { + return e.cpu.Transpose(ctx, a, axes, dst...) + } } // Fast path: 2D transpose. 
@@ -145,9 +150,14 @@ func (e *GPUEngine[T]) Transpose(ctx context.Context, a *tensor.TensorNumeric[T] "cols", fmt.Sprintf("%d", shape[1])) } if err := e.kernels.Transpose2D(devIn, devOut, shape[0], shape[1], e.stream); err != nil { - e.pool.Free(e.deviceID, devOut, byteSize) + if !reused { + e.pool.Free(e.deviceID, devOut, byteSize) + } return nil, err } + if reused { + return finishReusedDst[T](dst[0], outShape), nil + } return makeGPUResult[T](e, outShape, devOut, total, dst...) } @@ -175,10 +185,15 @@ func (e *GPUEngine[T]) Transpose(ctx context.Context, a *tensor.TensorNumeric[T] } if err := e.kernels.TransposeND(devIn, devOut, inStrides32, outStrides32, perm32, rank, total, e.stream); err != nil { - e.pool.Free(e.deviceID, devOut, byteSize) + if !reused { + e.pool.Free(e.deviceID, devOut, byteSize) + } return nil, err } + if reused { + return finishReusedDst[T](dst[0], outShape), nil + } return makeGPUResult[T](e, outShape, devOut, total, dst...) } diff --git a/compute/gpu_kernels.go b/compute/gpu_kernels.go index 3c36743..fb06e05 100644 --- a/compute/gpu_kernels.go +++ b/compute/gpu_kernels.go @@ -115,6 +115,35 @@ func getDevicePtr[T tensor.Numeric](e *GPUEngine[T], t *tensor.TensorNumeric[T]) return devPtr, cleanup, nil } +// tryReuseDstPtr checks whether dst[0] already has a GPUStorage with at least +// neededElems capacity. If so, it returns the existing device pointer so the +// caller can write kernel output directly into it, avoiding a pool.Alloc and +// the resulting GC pressure from orphaned GPUStorage objects. See ztensor#84.
+func tryReuseDstPtr[T tensor.Numeric](neededElems int, dst []*tensor.TensorNumeric[T]) (unsafe.Pointer, bool) { + if len(dst) == 0 || dst[0] == nil { + return nil, false + } + gs, ok := dst[0].GetStorage().(*tensor.GPUStorage[T]) + if !ok || gs.Len() < neededElems { + return nil, false + } + return gs.Ptr(), true +} + +// finishReusedDst updates dst's shape and strides in place after a kernel has +// written into dst's existing device memory. No new GPUStorage is created. +func finishReusedDst[T tensor.Numeric](dst *tensor.TensorNumeric[T], shape []int) *tensor.TensorNumeric[T] { + strides := make([]int, len(shape)) + stride := 1 + for i := len(shape) - 1; i >= 0; i-- { + strides[i] = stride + stride *= shape[i] + } + dst.SetShape(shape) + dst.SetStrides(strides) + return dst +} + // makeGPUResult creates a tensor with pool-backed GPUStorage wrapping the given // device pointer. When the tensor is freed, the pointer is returned to the pool // for reuse instead of calling cudaFree. @@ -522,17 +551,26 @@ func gpuBinaryOp[T tensor.Numeric]( byteSize := n * f32Size - devC, err := e.pool.Alloc(e.deviceID, byteSize) - if err != nil { - return nil, err + // Reuse dst's existing GPU memory when possible (#84). + devC, reused := tryReuseDstPtr[T](n, dst) + if !reused { + devC, err = e.pool.Alloc(e.deviceID, byteSize) + if err != nil { + return nil, err + } } if err := kernelFn(devA, devB, devC, n, e.stream); err != nil { - e.pool.Free(e.deviceID, devC, byteSize) + if !reused { + e.pool.Free(e.deviceID, devC, byteSize) + } return nil, err } + if reused { + return finishReusedDst[T](dst[0], a.Shape()), nil + } return makeGPUResult[T](e, a.Shape(), devC, n, dst...) } @@ -559,17 +597,26 @@ func gpuUnaryOp[T tensor.Numeric]( byteSize := n * f32Size - devC, err := e.pool.Alloc(e.deviceID, byteSize) - if err != nil { - return nil, err + // Reuse dst's existing GPU memory when possible (#84). 
+ devC, reused := tryReuseDstPtr[T](n, dst) + if !reused { + devC, err = e.pool.Alloc(e.deviceID, byteSize) + if err != nil { + return nil, err + } } if err := kernelFn(devA, devC, n, e.stream); err != nil { - e.pool.Free(e.deviceID, devC, byteSize) + if !reused { + e.pool.Free(e.deviceID, devC, byteSize) + } return nil, err } + if reused { + return finishReusedDst[T](dst[0], a.Shape()), nil + } return makeGPUResult[T](e, a.Shape(), devC, n, dst...) } @@ -597,17 +644,26 @@ func gpuScalarOp[T tensor.Numeric]( byteSize := n * f32Size - devC, err := e.pool.Alloc(e.deviceID, byteSize) - if err != nil { - return nil, err + // Reuse dst's existing GPU memory when possible (#84). + devC, reused := tryReuseDstPtr[T](n, dst) + if !reused { + devC, err = e.pool.Alloc(e.deviceID, byteSize) + if err != nil { + return nil, err + } } if err := kernelFn(devA, scalar, devC, n, e.stream); err != nil { - e.pool.Free(e.deviceID, devC, byteSize) + if !reused { + e.pool.Free(e.deviceID, devC, byteSize) + } return nil, err } + if reused { + return finishReusedDst[T](dst[0], a.Shape()), nil + } return makeGPUResult[T](e, a.Shape(), devC, n, dst...) } @@ -957,20 +1013,29 @@ func (e *GPUEngine[T]) gpuSum(ctx context.Context, a *tensor.TensorNumeric[T], a outByteSize := numStripes * f32Size - devOut, err := e.pool.Alloc(e.deviceID, outByteSize) - if err != nil { - e.oomFallbackCount.Add(1) - e.logger.Warn("Sum: GPU output alloc failed, falling back to CPU", "error", err.Error()) + // Reuse dst's existing GPU memory when possible (#84). + devOut, reused := tryReuseDstPtr[T](numStripes, dst) + if !reused { + devOut, err = e.pool.Alloc(e.deviceID, outByteSize) + if err != nil { + e.oomFallbackCount.Add(1) + e.logger.Warn("Sum: GPU output alloc failed, falling back to CPU", "error", err.Error()) - return e.cpu.Sum(ctx, a, axis, keepDims, dst...) + return e.cpu.Sum(ctx, a, axis, keepDims, dst...) 
+ } } if err := e.kernels.SumAxis(devIn, devOut, outer, inner, axisSize, e.stream); err != nil { - e.pool.Free(e.deviceID, devOut, outByteSize) + if !reused { + e.pool.Free(e.deviceID, devOut, outByteSize) + } return nil, err } + if reused { + return finishReusedDst[T](dst[0], newShape), nil + } return makeGPUResult[T](e, newShape, devOut, numStripes, dst...) }