diff --git a/backends/sdaa/kernels/abs_kernel.cc b/backends/sdaa/kernels/abs_kernel.cc old mode 100755 new mode 100644 index bda0f9bec56..55badd82531 --- a/backends/sdaa/kernels/abs_kernel.cc +++ b/backends/sdaa/kernels/abs_kernel.cc @@ -22,9 +22,9 @@ namespace custom_kernel { template void AbsGrad(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { tecodnnHandle_t tecodnnHandle = GetHandleFromCTX(dev_ctx); int num = static_cast(x.numel()); std::vector dims = {1, 1, 1, num}; @@ -46,9 +46,7 @@ void AbsGrad(const Context& dev_ctx, } template -void AbsKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { +void AbsKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { VLOG(4) << "Call SDAA AbsKernel"; dev_ctx.template Alloc(out); @@ -57,9 +55,9 @@ void AbsKernel(const Context& dev_ctx, template void AbsGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA AbsGradKernel"; dev_ctx.template Alloc(dx); diff --git a/backends/sdaa/kernels/accuracy_kernel.cc b/backends/sdaa/kernels/accuracy_kernel.cc index 47ba12b4ad2..3076b7e9f15 100644 --- a/backends/sdaa/kernels/accuracy_kernel.cc +++ b/backends/sdaa/kernels/accuracy_kernel.cc @@ -21,12 +21,12 @@ namespace custom_kernel { template void AccuracyRawKernel(const Context& dev_ctx, - const phi::DenseTensor& inference, - const phi::DenseTensor& indices, - const phi::DenseTensor& label, - phi::DenseTensor* accuracy, - phi::DenseTensor* correct, - phi::DenseTensor* total) { + const DenseTensor& inference, + const DenseTensor& indices, + const DenseTensor& label, + DenseTensor* accuracy, + DenseTensor* correct, + DenseTensor* total) { VLOG(4) << "Call sdaa Accuracy kernel"; dev_ctx.template Alloc(accuracy); dev_ctx.template Alloc(correct); diff --git a/backends/sdaa/kernels/activation_kernel.cc b/backends/sdaa/kernels/activation_kernel.cc old mode 100755 new mode 100644 index b68d03a86d1..ca6a916d85a --- a/backends/sdaa/kernels/activation_kernel.cc +++ b/backends/sdaa/kernels/activation_kernel.cc @@ -21,8 +21,8 @@ namespace custom_kernel { template void ReluKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA ReluKernel"; dev_ctx.template Alloc(out); sdaa_ops::doActivationForward(dev_ctx, @@ -35,9 +35,9 @@ void ReluKernel(const Context& dev_ctx, template void ReluGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA ReluGradKernel"; dev_ctx.template Alloc(dx); sdaa_ops::doActivationBackward(dev_ctx, @@ -51,8 +51,8 @@ void ReluGradKernel(const Context& dev_ctx, template void Relu6Kernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA Relu6Kernel"; dev_ctx.template Alloc(out); sdaa_ops::doActivationForward(dev_ctx, @@ -65,9 +65,9 @@ void Relu6Kernel(const Context& dev_ctx, template void Relu6GradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& 
dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA Relu6GradKernel"; dev_ctx.template Alloc(dx); sdaa_ops::doActivationBackward(dev_ctx, @@ -81,8 +81,8 @@ void Relu6GradKernel(const Context& dev_ctx, template void SigmoidKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA SigmoidKernel"; dev_ctx.template Alloc(out); sdaa_ops::doActivationForward(dev_ctx, @@ -95,9 +95,9 @@ void SigmoidKernel(const Context& dev_ctx, template void SigmoidGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA SigmoidGradKernel"; dev_ctx.template Alloc(dx); sdaa_ops::doActivationBackward(dev_ctx, @@ -111,8 +111,8 @@ void SigmoidGradKernel(const Context& dev_ctx, template void TanhKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA TanhKernel"; dev_ctx.template Alloc(out); sdaa_ops::doActivationForward(dev_ctx, @@ -125,9 +125,9 @@ void TanhKernel(const Context& dev_ctx, template void TanhGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA TanhGradKernel"; dev_ctx.template Alloc(dx); sdaa_ops::doActivationBackward(dev_ctx, @@ -141,9 +141,9 @@ void TanhGradKernel(const Context& dev_ctx, template void EluKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float alpha, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA ELUKernel"; dev_ctx.template Alloc(out); sdaa_ops::doActivationForward(dev_ctx, @@ -156,11 +156,11 @@ void EluKernel(const Context& dev_ctx, template void EluGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x UNUSED, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, + const DenseTensor& x UNUSED, + const DenseTensor& out, + const DenseTensor& dout, float alpha, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "Call SDAA EluGradKernel"; dev_ctx.template Alloc(dx); sdaa_ops::doActivationBackward(dev_ctx, @@ -173,9 +173,7 @@ void EluGradKernel(const Context& dev_ctx, } template -void ExpKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { +void ExpKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { VLOG(4) << "Call SDAA ExpKernel"; dev_ctx.template Alloc(out); sdaa_ops::doUnaryOpTensor(dev_ctx, x, 1.0, UnaryOpMode::EXP, out); @@ -183,9 +181,9 @@ void ExpKernel(const Context& dev_ctx, template void ExpGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA ExpGradKernel"; dev_ctx.template Alloc(dx); @@ -194,9 +192,9 @@ void ExpGradKernel(const Context& dev_ctx, template void GeluKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, bool approximate, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA GeluKernel"; dev_ctx.template Alloc(out); @@ -219,10 +217,10 @@ void GeluKernel(const Context& dev_ctx, template void GeluGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& out_grad, + const 
DenseTensor& out, + const DenseTensor& out_grad, bool approximate, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { VLOG(4) << "Call SDAA GeluGradKernel"; dev_ctx.template Alloc(x_grad); if (approximate) { @@ -244,9 +242,7 @@ void GeluGradKernel(const Context& dev_ctx, } } template -void ErfKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { +void ErfKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { VLOG(4) << "CALL SDAA ErfKernel"; int64_t numel = x.numel(); std::vector dataTemp; @@ -269,9 +265,9 @@ void ErfKernel(const Context& dev_ctx, template void LeakyReluKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, double alpha, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA LeakyReluKernel"; dev_ctx.template Alloc(out); float alp = static_cast(alpha); @@ -285,10 +281,10 @@ void LeakyReluKernel(const Context& dev_ctx, template void LeakyReluGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& dout, double alpha, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "Call SDAA LeakyReluGradKernel"; dev_ctx.template Alloc(dx); float alp = static_cast(alpha); @@ -303,9 +299,9 @@ void LeakyReluGradKernel(const Context& dev_ctx, template void SqrtGrad(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { dev_ctx.template Alloc(dx); // dx = 0.5 * dout / out sdaa_ops::doElementDiv(dev_ctx, dout, out, -1, dx); @@ -316,8 +312,8 @@ void SqrtGrad(const Context& dev_ctx, template void SqrtKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA SqrtKernel"; dev_ctx.template Alloc(out); sdaa_ops::doUnaryOpTensor(dev_ctx, x, 1.0, UnaryOpMode::SQRT, out); @@ -325,18 +321,18 @@ void SqrtKernel(const Context& dev_ctx, template void SqrtGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA SqrtGradKernel"; SqrtGrad(dev_ctx, out, dout, dx); } template void RsqrtGrad(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { dev_ctx.template Alloc(dx); // dx = -0.5 * dout * out * out * out float alpha = -0.5f; @@ -348,8 +344,8 @@ void RsqrtGrad(const Context& dev_ctx, template void RsqrtKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA RsqrtKernel"; dev_ctx.template Alloc(out); sdaa_ops::doUnaryOpTensor(dev_ctx, x, 1.0, UnaryOpMode::RSQRT, out); @@ -357,18 +353,18 @@ void RsqrtKernel(const Context& dev_ctx, template void RsqrtGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA RsqrtGradKernel"; RsqrtGrad(dev_ctx, out, dout, dx); } template void PowKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& factor_scalar, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << 
"Call SDAA PowKernel"; auto factor = factor_scalar.to(); dev_ctx.template Alloc(out); @@ -377,18 +373,18 @@ void PowKernel(const Context& dev_ctx, template void PowGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& dout, const phi::Scalar& factor_scalar, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "Call SDAA PowGradKernel"; auto factor = factor_scalar.to(); auto x_dims = x.dims(); // dx = dout * factor * x.pow(factor - 1) // step 1: compute x_pow = x.pow(factor - 1) - phi::DenseTensor x_pow; - phi::DenseTensorMeta x_pow_meta = {x.dtype(), x_dims}; + DenseTensor x_pow; + DenseTensorMeta x_pow_meta = {x.dtype(), x_dims}; x_pow.set_meta(x_pow_meta); dev_ctx.template Alloc(&x_pow); float factor_x_pow = factor - static_cast(1); @@ -403,9 +399,7 @@ void PowGradKernel(const Context& dev_ctx, } template -void LogKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { +void LogKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { VLOG(4) << "Call SDAA LogKernel"; dev_ctx.template Alloc(out); sdaa_ops::doUnaryOpTensor(dev_ctx, x, 1.0, UnaryOpMode::LOG, out); @@ -413,12 +407,12 @@ void LogKernel(const Context& dev_ctx, template void LogGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA LogGradKernel"; dev_ctx.template Alloc(dx); - phi::DenseTensor dx_temp; + DenseTensor dx_temp; dx_temp.Resize(dx->dims()); dev_ctx.template Alloc(&dx_temp); @@ -429,8 +423,8 @@ void LogGradKernel(const Context& dev_ctx, template void ReciprocalKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA ReciprocalKernel"; dev_ctx.template Alloc(out); sdaa_ops::doReciprocalTensor(dev_ctx, x, out); @@ -438,13 +432,13 @@ void ReciprocalKernel(const Context& dev_ctx, template void ReciprocalGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA ReciprocalGradKernel"; dev_ctx.template Alloc(dx); - phi::DenseTensor out_temp; + DenseTensor out_temp; out_temp.Resize(out.dims()); dev_ctx.template Alloc(&out_temp); @@ -455,8 +449,8 @@ void ReciprocalGradKernel(const Context& dev_ctx, template void SiluKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "CALL SDAA SiluKernel"; dev_ctx.template Alloc(out); sdaa_ops::doActivationForward(dev_ctx, @@ -469,10 +463,10 @@ void SiluKernel(const Context& dev_ctx, template void SiluGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "CALL SDAA SiluGradKernel"; dev_ctx.template Alloc(dx); sdaa_ops::doActivationBackward(dev_ctx, @@ -486,15 +480,15 @@ void SiluGradKernel(const Context& dev_ctx, template void doHardSwish(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { float threshold = 6; float scale = 6; float offset = 3; std::vector x_dims = 
phi::vectorize(x.dims()); - phi::DenseTensor* x_ = const_cast(&x); + DenseTensor* x_ = const_cast(&x); tecodnnHandle_t tecodnnHandle = GetHandleFromCTX(dev_ctx); tecodnnTensorDescriptor_t Desc = sdaa_ops::GetTecodnnTensorDesc( @@ -512,16 +506,16 @@ void doHardSwish(const Context& dev_ctx, template void doHardSwishGrad(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { float threshold = 6; float scale = 6; float offset = 3; std::vector x_dims = phi::vectorize(x.dims()); - phi::DenseTensor* x_ = const_cast(&x); - phi::DenseTensor* dout_ = const_cast(&dout); + DenseTensor* x_ = const_cast(&x); + DenseTensor* dout_ = const_cast(&dout); tecodnnHandle_t tecodnnHandle = GetHandleFromCTX(dev_ctx); tecodnnTensorDescriptor_t Desc = sdaa_ops::GetTecodnnTensorDesc( @@ -541,8 +535,8 @@ void doHardSwishGrad(const Context& dev_ctx, template void HardSwishKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "CALL SDAA HardSwishKernel"; dev_ctx.template Alloc(out); @@ -552,9 +546,9 @@ void HardSwishKernel(const Context& dev_ctx, template void HardSwishGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "CALL SDAA HardSwishGradKernel"; dev_ctx.template Alloc(dx); @@ -563,10 +557,10 @@ void HardSwishGradKernel(const Context& dev_ctx, template void HardSigmoidKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float slope, float offset, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA HardSigmoidKernel"; dev_ctx.template Alloc(out); @@ -586,11 +580,11 @@ void HardSigmoidKernel(const Context& dev_ctx, template void HardSigmoidGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, + const DenseTensor& out, + const DenseTensor& dout, float slope, float offset, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "Call SDAA HardSigmoidGradKernel"; dev_ctx.template Alloc(dx); @@ -617,8 +611,8 @@ void HardSigmoidGradKernel(const Context& dev_ctx, template void SoftsignKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA SoftsignKernel"; int N = 1, C = x.numel(), H = 1, W = 1; @@ -640,9 +634,9 @@ void SoftsignKernel(const Context& dev_ctx, template void SoftsignGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA SoftsignGradKernel"; int N = 1, C = x.numel(), H = 1, W = 1; @@ -671,10 +665,10 @@ void SoftsignGradKernel(const Context& dev_ctx, template void SoftplusKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, double beta, double threshold, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA SoftplusKernel"; int N = 1, C = x.numel(), H = 1, W = 1; @@ -705,11 +699,11 @@ void SoftplusKernel(const Context& dev_ctx, template void SoftplusGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& dout, double beta, double threshold, - phi::DenseTensor* dx) { 
+ DenseTensor* dx) { VLOG(4) << "Call SDAA SoftplusGradKernel"; int N = 1, C = x.numel(), H = 1, W = 1; @@ -741,9 +735,7 @@ void SoftplusGradKernel(const Context& dev_ctx, } template -void SinKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { +void SinKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { VLOG(4) << "Call SDAA SinKernel"; dev_ctx.template Alloc(out); sdaa_ops::doUnaryOpTensor(dev_ctx, x, 0.0, UnaryOpMode::SIN, out); @@ -751,12 +743,12 @@ void SinKernel(const Context& dev_ctx, template void SinGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA SinGradKernel"; - phi::DenseTensor x_cos; - phi::DenseTensorMeta x_cos_meta = {x.dtype(), x.dims()}; + DenseTensor x_cos; + DenseTensorMeta x_cos_meta = {x.dtype(), x.dims()}; x_cos.set_meta(x_cos_meta); dev_ctx.template Alloc(&x_cos); @@ -767,9 +759,7 @@ void SinGradKernel(const Context& dev_ctx, } template -void CosKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { +void CosKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { VLOG(4) << "Call SDAA CosKernel"; dev_ctx.template Alloc(out); sdaa_ops::doUnaryOpTensor(dev_ctx, x, 0.0, UnaryOpMode::COS, out); @@ -777,12 +767,12 @@ void CosKernel(const Context& dev_ctx, template void CosGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA CosGradKernel"; - phi::DenseTensor x_sin; - phi::DenseTensorMeta x_sin_meta = {x.dtype(), x.dims()}; + DenseTensor x_sin; + DenseTensorMeta x_sin_meta = {x.dtype(), x.dims()}; x_sin.set_meta(x_sin_meta); dev_ctx.template Alloc(&x_sin); @@ -796,8 +786,8 @@ void CosGradKernel(const Context& dev_ctx, template void SquareKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA SquareKernel"; dev_ctx.template Alloc(out); sdaa_ops::doUnaryOpTensor(dev_ctx, x, 0.0, UnaryOpMode::SQUARE, out); @@ -805,11 +795,11 @@ void SquareKernel(const Context& dev_ctx, template void SquareGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA SquareGradKernel"; - phi::DenseTensor double_x; + DenseTensor double_x; double_x.set_meta(x.meta()); dev_ctx.template Alloc(&double_x); sdaa_ops::doUnaryOpTensor(dev_ctx, x, 2.0, UnaryOpMode::MUL_A, &double_x); @@ -820,8 +810,8 @@ void SquareGradKernel(const Context& dev_ctx, template void AtanKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA AtanKernel"; dev_ctx.template Alloc(out); @@ -837,18 +827,18 @@ void AtanKernel(const Context& dev_ctx, // dx = dout * 1 / (1 + x.pow(2)) template void AtanGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA AtanGradKernel"; // Step1: Compute x_pow = x.pow(2) - phi::DenseTensor x_pow; + DenseTensor x_pow; x_pow.Resize(x.dims()); dev_ctx.template 
Alloc(&x_pow); sdaa_ops::doUnaryOpTensor(dev_ctx, x, 0.0, UnaryOpMode::SQUARE, &x_pow); // Step2: x_pow_1 = x_pow + 1 - phi::DenseTensor x_pow_1; + DenseTensor x_pow_1; x_pow_1.Resize(x.dims()); dev_ctx.template Alloc(&x_pow_1); sdaa_ops::doUnaryOpTensor(dev_ctx, x_pow, 1.0, UnaryOpMode::ADD_A, &x_pow_1); @@ -860,8 +850,8 @@ void AtanGradKernel(const Context& dev_ctx, template void CeilKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA CeilKernel"; dev_ctx.template Alloc(out); @@ -876,8 +866,8 @@ void CeilKernel(const Context& dev_ctx, template void CeilGradKernel(const Context& dev_ctx, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "CALL SDAA CeilGradKernel."; dev_ctx.template Alloc(dx); @@ -886,9 +876,9 @@ void CeilGradKernel(const Context& dev_ctx, template void SwishRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float beta, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA SwishRawKernel"; dev_ctx.template Alloc(out); @@ -903,17 +893,17 @@ void SwishRawKernel(const Context& dev_ctx, template void SwishKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA SwishKernel"; custom_kernel::SwishRawKernel(dev_ctx, x, 1.0, out); } template void SwishGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA SwishGradKernel"; dev_ctx.template Alloc(dx); @@ -935,8 +925,8 @@ void SwishGradKernel(const Context& dev_ctx, template void FloorKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA FloorKernel"; dev_ctx.template Alloc(out); @@ -945,8 +935,8 @@ void FloorKernel(const Context& dev_ctx, template void FloorGradKernel(const Context& dev_ctx, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "CALL SDAA FloorGradKernel."; dev_ctx.template Alloc(dx); @@ -955,8 +945,8 @@ void FloorGradKernel(const Context& dev_ctx, template void Log2Kernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA Log2Kernel"; dev_ctx.template Alloc(out); @@ -970,14 +960,14 @@ void Log2Kernel(const Context& dev_ctx, template void Log2GradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "CALL SDAA Log2GradKernel."; dev_ctx.template Alloc(dx); - phi::DenseTensor x_log2; + DenseTensor x_log2; x_log2.Resize(x.dims()); dev_ctx.template Alloc(&x_log2); @@ -988,9 +978,9 @@ void Log2GradKernel(const Context& dev_ctx, template void MishKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float threshold, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA MishKernel."; dev_ctx.template Alloc(out); @@ -1006,10 +996,10 @@ void MishKernel(const Context& dev_ctx, template void MishGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, + const 
DenseTensor& x, + const DenseTensor& dout, float threshold, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "Call SDAA MishGradKernel."; dev_ctx.template Alloc(dx); @@ -1031,10 +1021,10 @@ void MishGradKernel(const Context& dev_ctx, template void HardTanhKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float min, float max, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA HardTanhKernel."; dev_ctx.template Alloc(out); @@ -1044,7 +1034,7 @@ void HardTanhKernel(const Context& dev_ctx, std::vector x_dims = phi::vectorize(x.dims()); - phi::DenseTensor x_temp(x); + DenseTensor x_temp(x); tecodnnHandle_t tecodnnHandle = GetHandleFromCTX(dev_ctx); tecodnnTensorDescriptor_t Desc = sdaa_ops::GetTecodnnTensorDesc( @@ -1063,8 +1053,8 @@ void HardTanhKernel(const Context& dev_ctx, template void LogSigmoidKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA LogSigmoidKernel"; dev_ctx.template Alloc(out); sdaa_ops::doActivationForward(dev_ctx, @@ -1077,9 +1067,9 @@ void LogSigmoidKernel(const Context& dev_ctx, template void LogSigmoidGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA LogSigmoidGradKernel"; dev_ctx.template Alloc(dx); sdaa_ops::doActivationBackward(dev_ctx, diff --git a/backends/sdaa/kernels/adam_kernel.cc b/backends/sdaa/kernels/adam_kernel.cc index c40aaec5346..32ef4b96f69 100644 --- a/backends/sdaa/kernels/adam_kernel.cc +++ b/backends/sdaa/kernels/adam_kernel.cc @@ -20,33 +20,32 @@ #include "tecodnn.h" // NOLINT namespace custom_kernel { template -void AdamKernel( - const Context& dev_ctx, - const phi::DenseTensor& param, - const phi::DenseTensor& grad, - const phi::DenseTensor& learning_rate, - const phi::DenseTensor& moment1, - const phi::DenseTensor& moment2, - const paddle::optional& moment2_max, // UNUSED - const phi::DenseTensor& beta1_pow_in, - const phi::DenseTensor& beta2_pow_in, - const paddle::optional& master_param, // fp32 - const paddle::optional& skip_update, - const phi::Scalar& beta1_in, - const phi::Scalar& beta2_in, - const phi::Scalar& epsilon_in, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - bool amsgrad, // UNUSED - phi::DenseTensor* param_out, - phi::DenseTensor* moment1_out, - phi::DenseTensor* moment2_out, - phi::DenseTensor* moment2_max_out, // UNUSED - phi::DenseTensor* beta1_pow_out, - phi::DenseTensor* beta2_pow_out, - phi::DenseTensor* master_param_out) { +void AdamKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const paddle::optional& moment2_max, // UNUSED + const DenseTensor& beta1_pow_in, + const DenseTensor& beta2_pow_in, + const paddle::optional& master_param, // fp32 + const paddle::optional& skip_update, + const phi::Scalar& beta1_in, + const phi::Scalar& beta2_in, + const phi::Scalar& epsilon_in, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + bool amsgrad, // UNUSED + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* moment2_max_out, // UNUSED + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + 
DenseTensor* master_param_out) { VLOG(4) << "call sdaa AdamKernel"; PADDLE_ENFORCE_NE( @@ -99,8 +98,8 @@ void AdamKernel( std::vector cpu_if_skip = {false}; if (skip_update.is_initialized()) { - const phi::DenseTensor& skip_update_tmp = - static_cast(*skip_update); + const DenseTensor& skip_update_tmp = + static_cast(*skip_update); TensorToVector(dev_ctx, skip_update_tmp, dev_ctx, &cpu_if_skip); PADDLE_ENFORCE_EQ(skip_update->numel(), 1, @@ -140,18 +139,18 @@ void AdamKernel( device_scale += 1; } - phi::DenseTensor* beta1_pow = const_cast(&beta1_pow_in); - phi::DenseTensor* beta2_pow = const_cast(&beta2_pow_in); - phi::DenseTensor* lr = const_cast(&learning_rate); - phi::DenseTensor* grad_in = const_cast(&grad); + DenseTensor* beta1_pow = const_cast(&beta1_pow_in); + DenseTensor* beta2_pow = const_cast(&beta2_pow_in); + DenseTensor* lr = const_cast(&learning_rate); + DenseTensor* grad_in = const_cast(&grad); float beta1 = beta1_in.to(); // cpu float beta2 = beta2_in.to(); // cpu float epsilon = epsilon_in.to(); int n_total = static_cast(param.numel()); - phi::DenseTensor param_in = multi_precision ? master_param.get() : param; - phi::DenseTensor* moment1_in = const_cast(&moment1); - phi::DenseTensor* moment2_in = const_cast(&moment2); + DenseTensor param_in = multi_precision ? master_param.get() : param; + DenseTensor* moment1_in = const_cast(&moment1); + DenseTensor* moment2_in = const_cast(&moment2); // init beta_pow_out in case beta_pow_out is NULL when use_global_beta_pow is // true @@ -173,7 +172,7 @@ void AdamKernel( } void* A[4] = { grad_in->data(), param_in.data(), moment1_in->data(), moment2_in->data()}; - phi::DenseTensor param_out_; + DenseTensor param_out_; if (multi_precision) { param_out_ = *master_param_out; } else { @@ -204,36 +203,35 @@ void AdamKernel( } template -void AdamwKernel( - const Context& dev_ctx, - const phi::DenseTensor& param, - const phi::DenseTensor& grad, - const phi::DenseTensor& learning_rate, - const phi::DenseTensor& moment1, - const phi::DenseTensor& moment2, - const paddle::optional& moment2_max, // UNUSED - const phi::DenseTensor& beta1_pow, - const phi::DenseTensor& beta2_pow, - const paddle::optional& master_param, - const paddle::optional& skip_update, - const phi::Scalar& beta1, - const phi::Scalar& beta2, - const phi::Scalar& epsilon, - float lr_ratio, - float coeff, - bool with_decay, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - bool amsgrad, // UNUSED - phi::DenseTensor* param_out, - phi::DenseTensor* moment1_out, - phi::DenseTensor* moment2_out, - phi::DenseTensor* moment2_max_out, // UNUSED - phi::DenseTensor* beta1_pow_out, - phi::DenseTensor* beta2_pow_out, - phi::DenseTensor* master_param_outs) { +void AdamwKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const paddle::optional& moment2_max, // UNUSED + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + const paddle::optional& master_param, + const paddle::optional& skip_update, + const phi::Scalar& beta1, + const phi::Scalar& beta2, + const phi::Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + bool amsgrad, // UNUSED + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* moment2_max_out, // UNUSED + 
DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { VLOG(4) << "call sdaa AdamwKernel"; PADDLE_ENFORCE_NE( amsgrad, @@ -285,8 +283,8 @@ void AdamwKernel( std::vector cpu_if_skip = {false}; if (skip_update.is_initialized()) { - const phi::DenseTensor& skip_update_tmp = - static_cast(*skip_update); + const DenseTensor& skip_update_tmp = + static_cast(*skip_update); TensorToVector(dev_ctx, skip_update_tmp, dev_ctx, &cpu_if_skip); PADDLE_ENFORCE_EQ(skip_update->numel(), 1, @@ -326,13 +324,13 @@ void AdamwKernel( float beta1_ = beta1.to(); // cpu float beta2_ = beta2.to(); // cpu float epsilon_ = epsilon.to(); - phi::DenseTensor* beta1_pow_ = const_cast(&beta1_pow); - phi::DenseTensor* beta2_pow_ = const_cast(&beta2_pow); - phi::DenseTensor* lr = const_cast(&learning_rate); - phi::DenseTensor* grad_in = const_cast(&grad); - phi::DenseTensor param_in = multi_precision ? master_param.get() : param; - phi::DenseTensor* moment1_in = const_cast(&moment1); - phi::DenseTensor* moment2_in = const_cast(&moment2); + DenseTensor* beta1_pow_ = const_cast(&beta1_pow); + DenseTensor* beta2_pow_ = const_cast(&beta2_pow); + DenseTensor* lr = const_cast(&learning_rate); + DenseTensor* grad_in = const_cast(&grad); + DenseTensor param_in = multi_precision ? master_param.get() : param; + DenseTensor* moment1_in = const_cast(&moment1); + DenseTensor* moment2_in = const_cast(&moment2); int n_total = static_cast(param.numel()); float* b1_out = beta1_pow_->data(); @@ -353,7 +351,7 @@ void AdamwKernel( } void* A[4] = { grad_in->data(), param_in.data(), moment1_in->data(), moment2_in->data()}; - phi::DenseTensor param_out_; + DenseTensor param_out_; if (multi_precision) { param_out_ = *master_param_outs; } else { diff --git a/backends/sdaa/kernels/add_n_kernel.cc b/backends/sdaa/kernels/add_n_kernel.cc index a4a5d57a457..71e652750e3 100644 --- a/backends/sdaa/kernels/add_n_kernel.cc +++ b/backends/sdaa/kernels/add_n_kernel.cc @@ -21,8 +21,8 @@ namespace custom_kernel { template void doAddNRaw(const Context& dev_ctx, - const std::vector& x, - phi::DenseTensor* out) { + const std::vector& x, + DenseTensor* out) { std::vector descs; std::vector data_ptrs; // NOTE(liaotianju): addN enforce all tensor shapes are equal @@ -33,7 +33,7 @@ void doAddNRaw(const Context& dev_ctx, descs.push_back(desc); data_ptrs.push_back(x[i]->data()); } - phi::DenseTensor ptrs; + DenseTensor ptrs; int64_t ptr_size = x.size() * sizeof(void*); ptrs.Resize({ptr_size}); dev_ctx.template Alloc(&ptrs); @@ -57,8 +57,8 @@ void doAddNRaw(const Context& dev_ctx, template void AddNKernel(const Context& dev_ctx, - const std::vector& x, - phi::DenseTensor* out) { + const std::vector& x, + DenseTensor* out) { VLOG(4) << "CALL SDAA AddNKernel"; PADDLE_ENFORCE_EQ(out->dtype() == phi::DataType::FLOAT32 || @@ -119,7 +119,7 @@ void AddNKernel(const Context& dev_ctx, } } - std::vector inputs; + std::vector inputs; std::vector> inputs_dims; for (int i = 0; i < n; i++) { if (x[i] && x[i]->numel() > 0) { diff --git a/backends/sdaa/kernels/amp/amp_funcs.h b/backends/sdaa/kernels/amp/amp_funcs.h index 40aad6c7abc..4d0beb8e455 100644 --- a/backends/sdaa/kernels/amp/amp_funcs.h +++ b/backends/sdaa/kernels/amp/amp_funcs.h @@ -81,9 +81,9 @@ inline DataTypes_t ToExtendDataType(const DataType& dtype) { template void AddOne(const Context& dev_ctx, - const phi::DenseTensor* in_tensor, - phi::DenseTensor* out_tensor) { - phi::DenseTensor factor_tensor, in_tensor_f, out_tensor_f; + const DenseTensor* in_tensor, + DenseTensor* out_tensor) 
{ + DenseTensor factor_tensor, in_tensor_f, out_tensor_f; factor_tensor.Resize({1}); dev_ctx.template Alloc(&factor_tensor); @@ -105,13 +105,13 @@ void AddOne(const Context& dev_ctx, template void AbnormCheckAndScale(const Context& dev_ctx, - const std::vector& xs, - const phi::DenseTensor& t_scale, - std::vector outs, - phi::DenseTensor* found_inf) { + const std::vector& xs, + const DenseTensor& t_scale, + std::vector outs, + DenseTensor* found_inf) { VLOG(4) << "call sdaa custom fusedVSCheckInvalid op"; - phi::DenseTensor found_inf_INT; + DenseTensor found_inf_INT; found_inf_INT.Resize(found_inf->dims()); dev_ctx.template Alloc(&found_inf_INT); @@ -134,12 +134,12 @@ void AbnormCheckAndScale(const Context& dev_ctx, for (int i = 0; i < M; i++) { int64_t tensor_num = xs[i]->numel(); every_tensor_num.push_back(tensor_num); - auto* x = const_cast(xs[i]); + auto* x = const_cast(xs[i]); input[i] = x->data(); input[i + M] = outs[i]->data(); } - phi::DenseTensor total; + DenseTensor total; int total_numel = M * sizeof(int64_t) + 2 * M * sizeof(void*); total.Resize({total_numel}); dev_ctx.template Alloc(&total); diff --git a/backends/sdaa/kernels/amp/check_finite_and_scaling_kernel.cc b/backends/sdaa/kernels/amp/check_finite_and_scaling_kernel.cc index 69e4dc1b8a8..ea126676654 100644 --- a/backends/sdaa/kernels/amp/check_finite_and_scaling_kernel.cc +++ b/backends/sdaa/kernels/amp/check_finite_and_scaling_kernel.cc @@ -33,10 +33,10 @@ namespace custom_kernel { template void CheckFiniteAndUnscale(const Context& dev_ctx, - const std::vector& xs, - const phi::DenseTensor& t_scale, - std::vector outs, - phi::DenseTensor* found_inf) { + const std::vector& xs, + const DenseTensor& t_scale, + std::vector outs, + DenseTensor* found_inf) { VLOG(4) << "Call SDAA CheckFiniteAndUnscale"; // step 1: check whether the tensor has nan or inf diff --git a/backends/sdaa/kernels/amp/update_loss_scaling_kernel.cc b/backends/sdaa/kernels/amp/update_loss_scaling_kernel.cc index 789cb918d2b..24caa8622c5 100644 --- a/backends/sdaa/kernels/amp/update_loss_scaling_kernel.cc +++ b/backends/sdaa/kernels/amp/update_loss_scaling_kernel.cc @@ -36,8 +36,8 @@ class LazyZerosSDAA { public: void operator()(const Context& dev_ctx, const std::vector found_inf_vec, - const std::vector& xs, - const std::vector& outs) const { + const std::vector& xs, + const std::vector& outs) const { if (!xs.size()) { return; } @@ -60,22 +60,22 @@ class LazyZerosSDAA { template void Update(const Context& dev_ctx, const std::vector found_inf_vec, - const phi::DenseTensor* pre_loss_scaling_tensor, - const phi::DenseTensor* good_in_tensor, - const phi::DenseTensor* bad_in_tensor, + const DenseTensor* pre_loss_scaling_tensor, + const DenseTensor* good_in_tensor, + const DenseTensor* bad_in_tensor, const int incr_every_n_steps, const int decr_every_n_nan_or_inf, const float incr_ratio, const float decr_ratio, - phi::DenseTensor* updated_loss_scaling_tensor, - phi::DenseTensor* good_out_tensor, - phi::DenseTensor* bad_out_tensor) { + DenseTensor* updated_loss_scaling_tensor, + DenseTensor* good_out_tensor, + DenseTensor* bad_out_tensor) { dev_ctx.template Alloc(updated_loss_scaling_tensor); dev_ctx.template Alloc(good_out_tensor); dev_ctx.template Alloc(bad_out_tensor); - phi::DenseTensor* pre_loss_scaling_tensor_ = - const_cast(pre_loss_scaling_tensor); + DenseTensor* pre_loss_scaling_tensor_ = + const_cast(pre_loss_scaling_tensor); if (found_inf_vec[0]) { // good_out_data = 0 @@ -163,20 +163,20 @@ void Update(const Context& dev_ctx, template void 
UpdateLossScaling(const Context& dev_ctx, - const std::vector& xs, - const phi::DenseTensor& t_found_inf, - const phi::DenseTensor& t_pre_loss_scaling, - const phi::DenseTensor& t_good_in, - const phi::DenseTensor& t_bad_in, + const std::vector& xs, + const DenseTensor& t_found_inf, + const DenseTensor& t_pre_loss_scaling, + const DenseTensor& t_good_in, + const DenseTensor& t_bad_in, int incr_every_n_steps, int decr_every_n_nan_or_inf, float incr_ratio, float decr_ratio, const phi::Scalar& stop_update, - std::vector outs, - phi::DenseTensor* updated_loss_scaling, - phi::DenseTensor* good_out, - phi::DenseTensor* bad_out) { + std::vector outs, + DenseTensor* updated_loss_scaling, + DenseTensor* good_out, + DenseTensor* bad_out) { VLOG(4) << "Call SDAA UpdateLossScaling"; auto* found_inf = &t_found_inf; diff --git a/backends/sdaa/kernels/arange_kernel.cc b/backends/sdaa/kernels/arange_kernel.cc index 90a10baa80c..08dd924c180 100644 --- a/backends/sdaa/kernels/arange_kernel.cc +++ b/backends/sdaa/kernels/arange_kernel.cc @@ -63,7 +63,7 @@ void doArangeTensor(const Context& dev_ctx, const T& start, const T& end, const T& step, - phi::DenseTensor* out) { + DenseTensor* out) { tecodnnHandle_t tecodnnHandle = GetHandleFromCTX(dev_ctx); std::vector out_dims = phi::vectorize(out->dims()); @@ -77,10 +77,10 @@ void doArangeTensor(const Context& dev_ctx, template void ArangeTensorKernel(const Context& dev_ctx, - const phi::DenseTensor& start_t, - const phi::DenseTensor& end_t, - const phi::DenseTensor& step_t, - phi::DenseTensor* out) { + const DenseTensor& start_t, + const DenseTensor& end_t, + const DenseTensor& step_t, + DenseTensor* out) { VLOG(4) << "CALL SDAA ArangeTensorKernel"; T start_value = phi::GetValue(dev_ctx, start_t); @@ -101,7 +101,7 @@ void ArangeTensorKernel(const Context& dev_ctx, // const phi::Scalar& start, // const phi::Scalar& end, // const phi::Scalar& step, -// phi::DenseTensor* out) { +// DenseTensor* out) { // T start_value = start.to(); // T end_value = end.to(); // T step_value = step.to(); diff --git a/backends/sdaa/kernels/arg_max_min_kernel.cc b/backends/sdaa/kernels/arg_max_min_kernel.cc index a9feb7da2b6..65ae97f52f6 100755 --- a/backends/sdaa/kernels/arg_max_min_kernel.cc +++ b/backends/sdaa/kernels/arg_max_min_kernel.cc @@ -31,10 +31,10 @@ namespace custom_kernel { template void doArgMaxMinTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, bool arg_max, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn argmax/argmin kernel"; std::vector x_dims = phi::vectorize(x.dims()); @@ -63,13 +63,13 @@ void doArgMaxMinTensor(const Context& dev_ctx, template void ArgMaxMin(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& axis, bool keepdims, bool flatten, phi::DataType dtype, bool arg_max, - phi::DenseTensor* out) { + DenseTensor* out) { int axis_ = axis.to(); if (x.numel() == 0) return; @@ -96,7 +96,7 @@ void ArgMaxMin(const Context& dev_ctx, } if (flatten) { - phi::DenseTensor flatten_x(x); + DenseTensor flatten_x(x); flatten_x.Resize(phi::make_ddim({x.numel()})); // if flatten, the axis_ is 0 axis_ = 0; @@ -110,12 +110,12 @@ void ArgMaxMin(const Context& dev_ctx, template void ArgMaxKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& axis, bool keepdims, bool flatten, phi::DataType dtype, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA ArgMaxKernel"; custom_kernel::ArgMaxMin( dev_ctx, x, 
axis, keepdims, flatten, dtype, true, out); @@ -123,12 +123,12 @@ void ArgMaxKernel(const Context& dev_ctx, template void ArgMinKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& axis, bool keepdims, bool flatten, phi::DataType dtype, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA ArgMinKernel"; custom_kernel::ArgMaxMin( dev_ctx, x, axis, keepdims, flatten, dtype, false, out); diff --git a/backends/sdaa/kernels/argsort_kernel.cc b/backends/sdaa/kernels/argsort_kernel.cc index 69c6754ebd7..e2c44502581 100644 --- a/backends/sdaa/kernels/argsort_kernel.cc +++ b/backends/sdaa/kernels/argsort_kernel.cc @@ -36,12 +36,12 @@ namespace custom_kernel { template void ArgsortKernel(const Context& dev_ctx, - const phi::DenseTensor& in, + const DenseTensor& in, int axis, bool descending, bool stable, - phi::DenseTensor* output, - phi::DenseTensor* indices) { + DenseTensor* output, + DenseTensor* indices) { VLOG(4) << "call sdaa ArgsortKernel"; PADDLE_ENFORCE_EQ( descending, diff --git a/backends/sdaa/kernels/assign_kernel.cc b/backends/sdaa/kernels/assign_kernel.cc index 7c2913fa330..76ec8e1b52e 100644 --- a/backends/sdaa/kernels/assign_kernel.cc +++ b/backends/sdaa/kernels/assign_kernel.cc @@ -20,8 +20,8 @@ namespace custom_kernel { template void AssignKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA AssignKernel"; dev_ctx.template Alloc(out); @@ -30,8 +30,8 @@ void AssignKernel(const Context& dev_ctx, template void AssignRawKernel(const Context& dev_ctx, - const paddle::optional& x, - phi::DenseTensor* out) { + const paddle::optional& x, + DenseTensor* out) { if (x) { if (!x->initialized()) { return; @@ -43,8 +43,8 @@ void AssignRawKernel(const Context& dev_ctx, template void AssignArrayKernel(const Context& dev_ctx, - const std::vector& x, - std::vector out) { + const std::vector& x, + std::vector out) { for (size_t i = 0; i < x.size(); ++i) { custom_kernel::AssignKernel(dev_ctx, *x[i], out.at(i)); } @@ -54,7 +54,7 @@ template typename std::enable_if::value>::type CopyVectorToTensor( const Context& dev_ctx, const std::vector& values, - phi::DenseTensor* out) { + DenseTensor* out) { // If attribute value dtype is vector, it will be converted to // vector. at the same time, we can not use vector to hold // the value, because the c++ use bit value to replace byte value. 
@@ -78,7 +78,7 @@ template typename std::enable_if::value>::type CopyVectorToTensor( const Context& dev_ctx, const std::vector& values, - phi::DenseTensor* out) { + DenseTensor* out) { std::vector assign_values; assign_values.reserve(values.size()); for (const auto& val : values) { @@ -92,7 +92,7 @@ void AssignValueKernel(const Context& dev_ctx, const std::vector& shape, phi::DataType dtype, const std::vector& values, - phi::DenseTensor* out) { + DenseTensor* out) { auto template_dtype = phi::CppTypeToDataType::Type(); PADDLE_ENFORCE_EQ( dtype, diff --git a/backends/sdaa/kernels/batch_norm_kernel.cc b/backends/sdaa/kernels/batch_norm_kernel.cc index f34fdf07745..636a6753b02 100644 --- a/backends/sdaa/kernels/batch_norm_kernel.cc +++ b/backends/sdaa/kernels/batch_norm_kernel.cc @@ -19,17 +19,17 @@ namespace custom_kernel { template void BatchNormInferKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& mean, - const phi::DenseTensor& variance, - const phi::DenseTensor& scale, - const phi::DenseTensor& bias, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, float momentum, float epsilon, const std::string& data_layout_str, - phi::DenseTensor* y, - phi::DenseTensor* mean_out, - phi::DenseTensor* variance_out) { + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out) { VLOG(4) << "Call SDAA BatchNormInferKernel"; // allocate memory for outputs @@ -59,23 +59,23 @@ void BatchNormInferKernel(const Context& dev_ctx, template void BatchNormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& running_mean, - const phi::DenseTensor& running_var, - const paddle::optional& scale, - const paddle::optional& bias, + const DenseTensor& x, + const DenseTensor& running_mean, + const DenseTensor& running_var, + const paddle::optional& scale, + const paddle::optional& bias, bool is_test, float momentum, float epsilon, const std::string& data_layout_str, bool use_global_stats, bool trainable_stats, - phi::DenseTensor* y, - phi::DenseTensor* mean_out, - phi::DenseTensor* variance_out, - phi::DenseTensor* saved_mean, - phi::DenseTensor* saved_variance, - phi::DenseTensor* reserve_space) { + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space) { // if training is False, use global mean and std bool test_mode = is_test && (!trainable_stats); // current tecodnnAPI does not support parameter use_global_stats=True @@ -84,7 +84,7 @@ void BatchNormKernel(const Context& dev_ctx, auto* Scale = scale.get_ptr(); auto* Bias = bias.get_ptr(); - phi::DenseTensor new_scale, new_bias; + DenseTensor new_scale, new_bias; const auto data_layout = common::StringToDataLayout(data_layout_str); int C; @@ -157,26 +157,25 @@ void BatchNormKernel(const Context& dev_ctx, } template -void BatchNormGradKernel( - const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& scale, - const paddle::optional& bias, - const paddle::optional& mean, - const paddle::optional& variance, - const phi::DenseTensor& saved_mean, - const phi::DenseTensor& saved_inv_variance, - const paddle::optional& reserve_space, - const phi::DenseTensor& d_y, - float momentum, - float epsilon, - const std::string& data_layout_str, - bool is_test, - bool use_global_stats, - bool trainable_statistics, - phi::DenseTensor* d_x, - phi::DenseTensor* d_scale, - phi::DenseTensor* d_bias) { 
+void BatchNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias, + const paddle::optional& mean, + const paddle::optional& variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_inv_variance, + const paddle::optional& reserve_space, + const DenseTensor& d_y, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + DenseTensor* d_x, + DenseTensor* d_scale, + DenseTensor* d_bias) { VLOG(4) << "Call SDAA BatchNormGradKernel"; use_global_stats = is_test || use_global_stats; // check arguments @@ -206,7 +205,7 @@ void BatchNormGradKernel( auto* Scale = scale.get_ptr(); auto* Bias = bias.get_ptr(); - phi::DenseTensor new_scale, new_bias; + DenseTensor new_scale, new_bias; if (Scale) { new_scale = scale.get(); @@ -226,7 +225,7 @@ void BatchNormGradKernel( // allocate memory for outputs dev_ctx.template Alloc(d_x); - phi::DenseTensor scale_grad_tmp, bias_grad_tmp; + DenseTensor scale_grad_tmp, bias_grad_tmp; scale_grad_tmp.Resize(new_scale.dims()); bias_grad_tmp.Resize(new_bias.dims()); dev_ctx.template Alloc(&scale_grad_tmp); @@ -250,7 +249,7 @@ void BatchNormGradKernel( // since the tecodnnBatchNormBackward func only supports 4-D tensor, // when tensor dims=3, a dimensional complement is required. - phi::DenseTensor x_temp(x), dy_temp(d_y), dx_temp(*d_x); + DenseTensor x_temp(x), dy_temp(d_y), dx_temp(*d_x); if (x_dims.size() < 4) { if (need_trans) { x_temp.Resize(phi::make_ddim({N, C, H, W})); @@ -263,7 +262,7 @@ void BatchNormGradKernel( } } - phi::DenseTensor x_NHWC, dy_NHWC, dx_NHWC; + DenseTensor x_NHWC, dy_NHWC, dx_NHWC; phi::DDim x_NHWC_dims, dy_NHWC_dims, dx_NHWC_dims; if (need_trans) { @@ -301,11 +300,11 @@ void BatchNormGradKernel( true, phi::errors::InvalidArgument("scale not support NULL in sdaa device.")); // 1. compuate inv var - phi::DenseTensor inv_var; - phi::DenseTensor running_mean = mean.get(); + DenseTensor inv_var; + DenseTensor running_mean = mean.get(); const auto* running_variance = variance.get_ptr(); - phi::DenseTensor add_res, sqrt_res; + DenseTensor add_res, sqrt_res; phi::DDim C_dims = {C}; add_res.Resize(C_dims); @@ -320,8 +319,8 @@ void BatchNormGradKernel( dev_ctx, add_res, 1.0, UnaryOpMode::SQRT, &sqrt_res); sdaa_ops::doReciprocalTensor(dev_ctx, sqrt_res, &inv_var); - phi::DenseTensor dy_sum, dy_mul_x_sub_mean_mul_invstd_sum, scale_inv_var; - phi::DenseTensor dy_NHWC_fp32, x_NHWC_fp32; + DenseTensor dy_sum, dy_mul_x_sub_mean_mul_invstd_sum, scale_inv_var; + DenseTensor dy_NHWC_fp32, x_NHWC_fp32; scale_inv_var.Resize(C_dims); dev_ctx.Alloc(&scale_inv_var, new_scale.dtype()); sdaa_ops::doElementMul(dev_ctx, new_scale, inv_var, -1, &scale_inv_var); @@ -350,7 +349,7 @@ void BatchNormGradKernel( sdaa_ops::doSumTensor(dev_ctx, dy_NHWC_fp32, {0, 1, 2}, &dy_sum); // 3. compute dy_mul_x_sub_mean_mul_invstd_sum - phi::DenseTensor x_sub_mean, invstd_mul_dy, intermediate_res; + DenseTensor x_sub_mean, invstd_mul_dy, intermediate_res; x_sub_mean.set_meta(x_NHWC_fp32.meta()); invstd_mul_dy.set_meta(dy_NHWC_fp32.meta()); intermediate_res.set_meta(dy_NHWC_fp32.meta()); @@ -370,7 +369,7 @@ void BatchNormGradKernel( // 4. 
compute dx if (d_x) { if (!std::is_same::value) { - phi::DenseTensor dx_NHWC_fp32; + DenseTensor dx_NHWC_fp32; dx_NHWC_fp32.Resize(dx_NHWC.dims()); dev_ctx.Alloc(&dx_NHWC_fp32, phi::DataType::FLOAT32); sdaa_ops::doElementMul( @@ -410,7 +409,7 @@ void BatchNormGradKernel( size_t workSpaceSizeInBytes = 0; TECODNN_CHECK(tecodnnGetBatchNormalizationBackwardWorkspaceSize( bnMode, sbmv_NHWC_Desc, &workSpaceSizeInBytes)); - phi::DenseTensor workspace; + DenseTensor workspace; if (workSpaceSizeInBytes != 0) workspace.Resize({static_cast(workSpaceSizeInBytes)}); dev_ctx.Alloc(&workspace, phi::DataType::INT8); diff --git a/backends/sdaa/kernels/bce_loss_kernel.cc b/backends/sdaa/kernels/bce_loss_kernel.cc index 6dc993ac517..bc19d1123a9 100644 --- a/backends/sdaa/kernels/bce_loss_kernel.cc +++ b/backends/sdaa/kernels/bce_loss_kernel.cc @@ -123,17 +123,17 @@ void bce_loss_grad(const Context& dev_ctx, template void BCELossKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& labels, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& labels, + DenseTensor* out) { VLOG(4) << "Call SDAA BCELossKernel"; dev_ctx.template Alloc(out); std::vector xdims; for (int i = 0; i < x.dims().size(); i++) { xdims.push_back(x.dims()[i]); } - phi::DenseTensor w; - phi::DenseTensorMeta w_meta = {x.dtype(), x.dims()}; + DenseTensor w; + DenseTensorMeta w_meta = {x.dtype(), x.dims()}; w.set_meta(w_meta); dev_ctx.template Alloc(&w); sdaa_ops::doFillTensor(dev_ctx, static_cast(1), x.dtype(), &w); @@ -149,18 +149,18 @@ void BCELossKernel(const Context& dev_ctx, template void BCELossGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& labels, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& labels, + const DenseTensor& dout, + DenseTensor* dx) { VLOG(4) << "Call SDAA BCELossGradKernel"; dev_ctx.template Alloc(dx); std::vector xdims; for (int i = 0; i < x.dims().size(); i++) { xdims.push_back(x.dims()[i]); } - phi::DenseTensor w; - phi::DenseTensorMeta w_meta = {x.dtype(), x.dims()}; + DenseTensor w; + DenseTensorMeta w_meta = {x.dtype(), x.dims()}; w.set_meta(w_meta); dev_ctx.template Alloc(&w); sdaa_ops::doFillTensor(dev_ctx, static_cast(1), x.dtype(), &w); diff --git a/backends/sdaa/kernels/bitwise_kernel.cc b/backends/sdaa/kernels/bitwise_kernel.cc index 60bb6602962..7e73f5deebb 100644 --- a/backends/sdaa/kernels/bitwise_kernel.cc +++ b/backends/sdaa/kernels/bitwise_kernel.cc @@ -31,13 +31,13 @@ namespace custom_kernel { template void BitwiseOrKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "CALL SDAA BitwiseOrKernel"; dev_ctx.template Alloc(out); - phi::DenseTensor x_temp(x), y_temp(y), out_temp(*out); + DenseTensor x_temp(x), y_temp(y), out_temp(*out); if (x.dims().size() == 0 && y.dims().size() == 0) { x_temp.Resize(phi::make_ddim({1})); y_temp.Resize(phi::make_ddim({1})); @@ -50,12 +50,12 @@ void BitwiseOrKernel(const Context& dev_ctx, template void BitwiseNotKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "CALL SDAA BitwiseNotKernel"; dev_ctx.template Alloc(out); - phi::DenseTensor x_temp(x), out_temp(*out); + DenseTensor x_temp(x), out_temp(*out); if (x.dims().size() == 0) { x_temp.Resize(phi::make_ddim({1})); 
out_temp.Resize(phi::make_ddim({1})); @@ -67,13 +67,13 @@ void BitwiseNotKernel(const Context& dev_ctx, template void BitwiseAndKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "CALL SDAA BitwiseAndKernel"; dev_ctx.template Alloc(out); - phi::DenseTensor x_temp(x), y_temp(y), out_temp(*out); + DenseTensor x_temp(x), y_temp(y), out_temp(*out); if (x.dims().size() == 0 && y.dims().size() == 0) { x_temp.Resize(phi::make_ddim({1})); y_temp.Resize(phi::make_ddim({1})); @@ -86,13 +86,13 @@ void BitwiseAndKernel(const Context& dev_ctx, template void BitwiseXorKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "CALL SDAA BitwiseXorKernel"; dev_ctx.template Alloc(out); - phi::DenseTensor x_temp(x), y_temp(y), out_temp(*out); + DenseTensor x_temp(x), y_temp(y), out_temp(*out); if (x.dims().size() == 0 && y.dims().size() == 0) { x_temp.Resize(phi::make_ddim({1})); y_temp.Resize(phi::make_ddim({1})); diff --git a/backends/sdaa/kernels/bmm_kernel.cc b/backends/sdaa/kernels/bmm_kernel.cc index d792112c02f..2b5eea3eb49 100644 --- a/backends/sdaa/kernels/bmm_kernel.cc +++ b/backends/sdaa/kernels/bmm_kernel.cc @@ -33,9 +33,9 @@ namespace custom_kernel { template void BmmKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "CALL SDAA BmmKernel."; dev_ctx.template Alloc(out); @@ -49,11 +49,11 @@ void BmmKernel(const Context& dev_ctx, template void BmmGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy) { VLOG(4) << "CALL SDAA BmmGradKernel."; if (dx) { diff --git a/backends/sdaa/kernels/cast_kernel.cc b/backends/sdaa/kernels/cast_kernel.cc index eb1c19d1b02..5603f9dda39 100644 --- a/backends/sdaa/kernels/cast_kernel.cc +++ b/backends/sdaa/kernels/cast_kernel.cc @@ -20,9 +20,9 @@ namespace custom_kernel { template void CastKernel(const Context &dev_ctx, - const phi::DenseTensor &x, + const DenseTensor &x, phi::DataType out_dtype, - phi::DenseTensor *out) { + DenseTensor *out) { VLOG(4) << "Call SDAA CastKernel"; dev_ctx.Alloc(out, out_dtype, 0, false, false); diff --git a/backends/sdaa/kernels/clip_kernel.cc b/backends/sdaa/kernels/clip_kernel.cc index fafa41f5e32..93e51ca78a9 100644 --- a/backends/sdaa/kernels/clip_kernel.cc +++ b/backends/sdaa/kernels/clip_kernel.cc @@ -21,10 +21,10 @@ namespace custom_kernel { template void ClipKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& min, const phi::Scalar& max, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA ClipKernel"; dev_ctx.template Alloc(out); @@ -44,11 +44,11 @@ void ClipKernel(const Context& dev_ctx, template void ClipGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& out_grad, const phi::Scalar& min, const phi::Scalar& max, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { VLOG(4) << "Call SDAA ClipGradKernel"; 
dev_ctx.template Alloc(x_grad); diff --git a/backends/sdaa/kernels/coalesce_tensor_kernel.cc b/backends/sdaa/kernels/coalesce_tensor_kernel.cc index 8285f68bc37..149ffca5ef2 100644 --- a/backends/sdaa/kernels/coalesce_tensor_kernel.cc +++ b/backends/sdaa/kernels/coalesce_tensor_kernel.cc @@ -44,13 +44,12 @@ size_t Alignment(size_t size, const phi::Place& place, int align_size) { return remaining == 0 ? size : size + (alignment - remaining); } -void GetMemSizeAndDtype( - const std::vector& dense_tensor, - size_t* numel, - const size_t& size_of_dtype, - const phi::Place& place, - const bool use_align = true, - const int align_size = -1) { +void GetMemSizeAndDtype(const std::vector& dense_tensor, + size_t* numel, + const size_t& size_of_dtype, + const phi::Place& place, + const bool use_align = true, + const int align_size = -1) { *numel = 0; std::stringstream ss; ss << "alloc_space_for_vars: "; @@ -78,7 +77,7 @@ void GetMemSizeAndDtype( template struct FillConstantVisitor { FillConstantVisitor(const Context& dev_ctx, - phi::DenseTensor* tensor, + DenseTensor* tensor, const float value, phi::DataType dtype) : dev_ctx_(dev_ctx), tensor_(tensor), value_(value), dtype_(dtype) {} @@ -101,7 +100,7 @@ struct FillConstantVisitor { } const Context& dev_ctx_; - phi::DenseTensor* tensor_; + DenseTensor* tensor_; float value_; phi::DataType dtype_; }; @@ -128,7 +127,7 @@ static void VisitDataType(phi::DataType type, Visitor visitor) { template void CoalesceTensorKernel(const Context& dev_ctx, - const std::vector& input, + const std::vector& input, phi::DataType dtype, bool copy_data, bool set_constant, @@ -139,8 +138,8 @@ void CoalesceTensorKernel(const Context& dev_ctx, int size_of_dtype, const std::vector& concated_shapes, const std::vector& concated_ranks, - std::vector output, - phi::DenseTensor* fused_output) { + std::vector output, + DenseTensor* fused_output) { VLOG(4) << "CALL SDAA CoalesceTensorKernel"; PADDLE_ENFORCE_GT(input.size(), diff --git a/backends/sdaa/kernels/compare_kernel.cc b/backends/sdaa/kernels/compare_kernel.cc index 6bc7e5a0310..328110cbfb0 100644 --- a/backends/sdaa/kernels/compare_kernel.cc +++ b/backends/sdaa/kernels/compare_kernel.cc @@ -21,10 +21,10 @@ namespace custom_kernel { template void EqualRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA EqualRawKernel"; dev_ctx.template Alloc(out); sdaa_ops::doCompareTensor(dev_ctx, x, y, CompareType::Equal, out); @@ -32,18 +32,18 @@ void EqualRawKernel(const Context& dev_ctx, template void EqualKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { custom_kernel::EqualRawKernel(dev_ctx, x, y, -1, out); } template void NotEqualRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA NotEqualRawKernel"; dev_ctx.template Alloc(out); sdaa_ops::doCompareTensor(dev_ctx, x, y, CompareType::NotEqual, out); @@ -51,18 +51,18 @@ void NotEqualRawKernel(const Context& dev_ctx, template void NotEqualKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { 
custom_kernel::NotEqualRawKernel(dev_ctx, x, y, -1, out); } template void LessEqualRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA LessEqualRawKernel"; dev_ctx.template Alloc(out); sdaa_ops::doCompareTensor(dev_ctx, x, y, CompareType::LessEqual, out); @@ -70,18 +70,18 @@ void LessEqualRawKernel(const Context& dev_ctx, template void LessEqualKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { custom_kernel::LessEqualRawKernel(dev_ctx, x, y, -1, out); } template void GreaterEqualRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA GreaterEqualRawKernel"; dev_ctx.template Alloc(out); sdaa_ops::doCompareTensor(dev_ctx, x, y, CompareType::GreaterEqual, out); @@ -89,18 +89,18 @@ void GreaterEqualRawKernel(const Context& dev_ctx, template void GreaterEqualKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { custom_kernel::GreaterEqualRawKernel(dev_ctx, x, y, -1, out); } template void LessThanRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA LessThanRawKernel"; dev_ctx.template Alloc(out); sdaa_ops::doCompareTensor(dev_ctx, x, y, CompareType::LessThan, out); @@ -108,18 +108,18 @@ void LessThanRawKernel(const Context& dev_ctx, template void LessThanKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { custom_kernel::LessThanRawKernel(dev_ctx, x, y, -1, out); } template void GreaterThanRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA GreaterThanRawKernel"; dev_ctx.template Alloc(out); sdaa_ops::doCompareTensor(dev_ctx, x, y, CompareType::GreaterThan, out); @@ -127,9 +127,9 @@ void GreaterThanRawKernel(const Context& dev_ctx, template void GreaterThanKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { custom_kernel::GreaterThanRawKernel(dev_ctx, x, y, -1, out); } } // namespace custom_kernel diff --git a/backends/sdaa/kernels/concat_kernel.cc b/backends/sdaa/kernels/concat_kernel.cc index 7910fd2c8c8..bbebb910011 100644 --- a/backends/sdaa/kernels/concat_kernel.cc +++ b/backends/sdaa/kernels/concat_kernel.cc @@ -48,9 +48,9 @@ static inline int ComputeAxis(int axis, int rank) { template void ConcatKernel(const Context& dev_ctx, - const std::vector& ins, + const std::vector& ins, const phi::Scalar& axis_scalar, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA ConcatKernel."; dev_ctx.template Alloc(out); @@ -61,13 +61,13 @@ void ConcatKernel(const Context& dev_ctx, template void 
ConcatGradKernel(const Context& dev_ctx, - const std::vector& ins, - const phi::DenseTensor& dout, + const std::vector& ins, + const DenseTensor& dout, const phi::Scalar& axis_scalar, - std::vector outs) { + std::vector outs) { VLOG(4) << "CALL SDAA ConcatGradKernel"; - std::vector outputs_vec; - std::vector tmp_outputs_vec; + std::vector outputs_vec; + std::vector tmp_outputs_vec; int axis = axis_scalar.to(); axis = ComputeAxis(axis, static_cast(ins[0]->dims().size())); for (int i = 0; i < outs.size(); ++i) { @@ -75,7 +75,7 @@ void ConcatGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(outs[i]); outputs_vec.push_back(outs[i]); } else { - phi::DenseTensor tmp_tensor; + DenseTensor tmp_tensor; tmp_tensor.Resize(ins[i]->dims()); dev_ctx.template Alloc(&tmp_tensor); tmp_outputs_vec.push_back((std::move(tmp_tensor))); diff --git a/backends/sdaa/kernels/contiguous_kernel.cc b/backends/sdaa/kernels/contiguous_kernel.cc index 4599a674730..18c8a56842d 100644 --- a/backends/sdaa/kernels/contiguous_kernel.cc +++ b/backends/sdaa/kernels/contiguous_kernel.cc @@ -32,11 +32,11 @@ namespace custom_kernel { template void ContiguousKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - phi::DenseTensor* out) { + const DenseTensor& input, + DenseTensor* out) { VLOG(4) << "CALL SDAA ContiguousKernel."; - phi::DenseTensorMeta meta = input.meta(); + DenseTensorMeta meta = input.meta(); meta.strides = meta.calc_strides(meta.dims); meta.offset = 0; out->set_meta(meta); diff --git a/backends/sdaa/kernels/conv2d_kernel.cc b/backends/sdaa/kernels/conv2d_kernel.cc index 4b27861ad25..7e07303b0ce 100644 --- a/backends/sdaa/kernels/conv2d_kernel.cc +++ b/backends/sdaa/kernels/conv2d_kernel.cc @@ -29,15 +29,15 @@ namespace custom_kernel { template void Conv2dTecodnnKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, + const DenseTensor& input, + const DenseTensor& filter, const std::vector& strides_t, const std::vector& paddings_t, const std::string& padding_algorithm, const std::vector& dilations_t, int groups, const std::string& data_format, - phi::DenseTensor* output) { + DenseTensor* output) { VLOG(4) << "CALL SDAA Conv2dTecodnnKernel"; ConvKernel(dev_ctx, @@ -56,15 +56,15 @@ void Conv2dTecodnnKernel(const Context& dev_ctx, template void DepthwiseConv2dTecodnnKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, + const DenseTensor& input, + const DenseTensor& filter, const std::vector& strides_t, const std::vector& paddings_t, const std::string& padding_algorithm, int groups, const std::vector& dilations_t, const std::string& data_format, - phi::DenseTensor* output) { + DenseTensor* output) { VLOG(4) << "CALL SDAA DepthwiseConv2dTecodnnKernel"; const bool is_NHWC = data_format == "NHWC"; phi::DDim in_dims = input.dims(); @@ -99,17 +99,17 @@ void DepthwiseConv2dTecodnnKernel(const Context& dev_ctx, template void Conv2dGradTecodnnKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, - const phi::DenseTensor& output_grad, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& output_grad, const std::vector& strides_t, const std::vector& paddings_t, const std::string& padding_algorithm, const std::vector& dilations_t, int groups, const std::string& data_format, - phi::DenseTensor* input_grad, - phi::DenseTensor* filter_grad) { + DenseTensor* input_grad, + DenseTensor* filter_grad) { VLOG(4) << "CALL SDAA Conv2dGradTecodnnKernel"; 
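// [Editor's note] Like Conv2dTransposeGradKernel further below ("if (!dx &&
// !dfilter) return;"), the grad kernels in this backend treat every gradient
// output as optional: callers pass nullptr for outputs they do not need. A
// minimal sketch of that guard pattern; the function name is illustrative:
void ConvGradGuardSketch(phi::DenseTensor* input_grad,
                         phi::DenseTensor* filter_grad) {
  if (!input_grad && !filter_grad) return;  // nothing requested: skip all work
  if (input_grad) { /* compute the data gradient only when it is asked for */ }
  if (filter_grad) { /* compute the weight gradient only when it is asked for */ }
}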
ConvBackwardKernel(dev_ctx, @@ -130,17 +130,17 @@ void Conv2dGradTecodnnKernel(const Context& dev_ctx, template void DepthwiseConv2dGradTecodnnKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, - const phi::DenseTensor& output_grad, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& output_grad, const std::vector& strides_t, const std::vector& paddings_t, const std::string& padding_algorithm, int groups, const std::vector& dilations_t, const std::string& data_format, - phi::DenseTensor* input_grad, - phi::DenseTensor* filter_grad) { + DenseTensor* input_grad, + DenseTensor* filter_grad) { VLOG(4) << "CALL SDAA DepthwiseConv2dGradTecodnnKernel"; const bool is_NHWC = data_format == "NHWC"; phi::DDim in_dims = input.dims(); diff --git a/backends/sdaa/kernels/conv_transpose_kernel.cc b/backends/sdaa/kernels/conv_transpose_kernel.cc index a4e84a26834..186760b0393 100644 --- a/backends/sdaa/kernels/conv_transpose_kernel.cc +++ b/backends/sdaa/kernels/conv_transpose_kernel.cc @@ -30,8 +30,8 @@ namespace custom_kernel { template void Conv2dTransposeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& filter, + const DenseTensor& x, + const DenseTensor& filter, const std::vector& strides, const std::vector& padding, const std::vector& output_padding, @@ -40,7 +40,7 @@ void Conv2dTransposeKernel(const Context& dev_ctx, int groups, const std::vector& dilation, const std::string& data_format, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA Conv2dTransposeKernel"; PADDLE_ENFORCE_EQ( @@ -74,9 +74,9 @@ void Conv2dTransposeKernel(const Context& dev_ctx, template void Conv2dTransposeGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& filter, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, const std::vector& strides, const std::vector& padding, const std::vector& output_padding, @@ -85,8 +85,8 @@ void Conv2dTransposeGradKernel(const Context& dev_ctx, int groups, const std::vector& dilation, const std::string& data_format, - phi::DenseTensor* dx, - phi::DenseTensor* dfilter) { + DenseTensor* dx, + DenseTensor* dfilter) { VLOG(4) << "CALL SDAA Conv2dTransposeGradKernel"; if (!dx && !dfilter) return; diff --git a/backends/sdaa/kernels/cross_entropy_kernel.cc b/backends/sdaa/kernels/cross_entropy_kernel.cc index 63d23ace09a..a10cef09ea0 100644 --- a/backends/sdaa/kernels/cross_entropy_kernel.cc +++ b/backends/sdaa/kernels/cross_entropy_kernel.cc @@ -65,12 +65,12 @@ static inline int SizeOutAxis(const int axis, phi::DDim dims) { template void crossEntropy(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& labels, + const DenseTensor& x, + const DenseTensor& labels, int ignore_index, int axis, bool soft_label, - phi::DenseTensor* loss) { + DenseTensor* loss) { auto handle = GetHandleFromCTX(dev_ctx); const int rank = x.dims().size(); const int axis_v = CanonicalAxis(axis, rank); @@ -88,12 +88,12 @@ void crossEntropy(const Context& dev_ctx, const int d = SizeFromAxis(axis_v, x.dims()); // weight is processed outside the kernel, so we can't access actual weight - phi::DenseTensor w; + DenseTensor w; w.Resize({d}); dev_ctx.template Alloc(&w); sdaa_ops::doFillTensor(dev_ctx, static_cast(1.f), w.dtype(), &w); - phi::DenseTensor labels_cast; + DenseTensor labels_cast; if (soft_label) { PADDLE_ENFORCE_EQ( labels.numel(), @@ -114,7 +114,7 @@ void crossEntropy(const 
Context& dev_ctx, "but got size of labels is %d and phi::funcs::SizeToAxis is %d.", labels.numel(), n)); - phi::DenseTensorMeta labels_meta = {phi::DataType::INT32, labels.dims()}; + DenseTensorMeta labels_meta = {phi::DataType::INT32, labels.dims()}; labels_cast.set_meta(labels_meta); dev_ctx.template Alloc(&labels_cast); sdaa_ops::doCastTensor(dev_ctx, labels, &labels_cast); @@ -154,13 +154,13 @@ void crossEntropy(const Context& dev_ctx, template void crossEntropyGrad(const Context& dev_ctx, - const phi::DenseTensor& labels, - const phi::DenseTensor& x, - const phi::DenseTensor& loss_grad, + const DenseTensor& labels, + const DenseTensor& x, + const DenseTensor& loss_grad, int ignore_index, int axis, bool soft_label, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { auto handle = GetHandleFromCTX(dev_ctx); const int rank = x.dims().size(); const int axis_v = CanonicalAxis(axis, rank); @@ -178,12 +178,12 @@ void crossEntropyGrad(const Context& dev_ctx, const int d = SizeFromAxis(axis_v, x.dims()); // weight is processed outside the kernel, so we can't access actual weight - phi::DenseTensor w; + DenseTensor w; w.Resize({1, d}); dev_ctx.template Alloc(&w); sdaa_ops::doFillTensor(dev_ctx, static_cast(1.f), w.dtype(), &w); - phi::DenseTensor labels_cast; + DenseTensor labels_cast; if (soft_label) { PADDLE_ENFORCE_EQ( labels.numel(), @@ -204,13 +204,13 @@ void crossEntropyGrad(const Context& dev_ctx, "but got size of labels is %d and phi::funcs::SizeToAxis is %d.", labels.numel(), n)); - phi::DenseTensorMeta labels_meta = {phi::DataType::INT32, labels.dims()}; + DenseTensorMeta labels_meta = {phi::DataType::INT32, labels.dims()}; labels_cast.set_meta(labels_meta); dev_ctx.template Alloc(&labels_cast); sdaa_ops::doCastTensor(dev_ctx, labels, &labels_cast); } - phi::DenseTensor x_2d(x), labels_d(labels_cast), loss_2d(loss_grad); + DenseTensor x_2d(x), labels_d(labels_cast), loss_2d(loss_grad); x_2d.Resize({n, d}); if (soft_label) { labels_d.Resize({n, d}); @@ -249,15 +249,15 @@ void crossEntropyGrad(const Context& dev_ctx, template void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, - const phi::DenseTensor& logits, - const phi::DenseTensor& labels, + const DenseTensor& logits, + const DenseTensor& labels, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis, - phi::DenseTensor* softmax, - phi::DenseTensor* loss) { + DenseTensor* softmax, + DenseTensor* loss) { VLOG(4) << "Call SDAA CrossEntropyWithSoftmaxKernel"; dev_ctx.template Alloc(loss); @@ -284,22 +284,22 @@ void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, template void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, - const phi::DenseTensor& labels, - const phi::DenseTensor& softmax, - const phi::DenseTensor& loss_grad, + const DenseTensor& labels, + const DenseTensor& softmax, + const DenseTensor& loss_grad, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis, - phi::DenseTensor* logits_grad) { + DenseTensor* logits_grad) { VLOG(4) << "Call SDAA CrossEntropyWithSoftmaxGradKernel"; dev_ctx.template Alloc(logits_grad); // input is softmax, skip softmax if (!use_softmax) { - phi::DenseTensor dlogits; + DenseTensor dlogits; // CELoss is not an inplace operator if (logits_grad->IsSharedWith(softmax)) { phi::Copy(dev_ctx, softmax, dev_ctx.GetPlace(), false, &dlogits); @@ -317,8 +317,8 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, return; } - phi::DenseTensor dlogits; - phi::DenseTensorMeta dlogits_meta = 
{softmax.dtype(), softmax.dims()}; + DenseTensor dlogits; + DenseTensorMeta dlogits_meta = {softmax.dtype(), softmax.dims()}; dlogits.set_meta(dlogits_meta); dev_ctx.template Alloc(&dlogits); diff --git a/backends/sdaa/kernels/cum_kernel.cc b/backends/sdaa/kernels/cum_kernel.cc index 7d471669c26..b94b84d3880 100644 --- a/backends/sdaa/kernels/cum_kernel.cc +++ b/backends/sdaa/kernels/cum_kernel.cc @@ -20,10 +20,10 @@ namespace custom_kernel { template void cumsum(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& x_dims, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "tecodnn cumsum tensor called"; int x_size = x_dims.size(); @@ -39,12 +39,12 @@ void cumsum(const Context& dev_ctx, template void CumsumKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& axis_scalar, bool flatten, bool exclusive, bool reverse, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA CumsumKernel"; dev_ctx.template Alloc(out); @@ -78,16 +78,16 @@ void CumsumKernel(const Context& dev_ctx, if (reverse) { std::vector reduce_dims = xdims; reduce_dims[axis] = 1; - phi::DenseTensor Sn; + DenseTensor Sn; phi::DDim Sn_dims = phi::make_ddim(reduce_dims); - phi::DenseTensorMeta Sn_meta = {x.dtype(), Sn_dims}; + DenseTensorMeta Sn_meta = {x.dtype(), Sn_dims}; Sn.set_meta(Sn_meta); dev_ctx.template Alloc(&Sn); std::vector reduce_axis = {static_cast(axis)}; sdaa_ops::doSumTensor(dev_ctx, x, reduce_axis, &Sn); - phi::DenseTensor Sm; - phi::DenseTensorMeta Sm_meta = {x.dtype(), x.dims()}; + DenseTensor Sm; + DenseTensorMeta Sm_meta = {x.dtype(), x.dims()}; Sm.set_meta(Sm_meta); dev_ctx.template Alloc(&Sm); sdaa_ops::doElementSub(dev_ctx, Sn, *out, -1, &Sm); diff --git a/backends/sdaa/kernels/distribute_fpn_proposals_kernel.cc b/backends/sdaa/kernels/distribute_fpn_proposals_kernel.cc index 99742a3796c..1754a4752ea 100644 --- a/backends/sdaa/kernels/distribute_fpn_proposals_kernel.cc +++ b/backends/sdaa/kernels/distribute_fpn_proposals_kernel.cc @@ -31,10 +31,10 @@ namespace custom_kernel { template inline std::vector GetLodFromRoisNum(const Context& dev_ctx, - const phi::DenseTensor* rois_num) { + const DenseTensor* rois_num) { std::vector rois_lod; auto* rois_num_data = rois_num->data(); - phi::DenseTensor cpu_tensor; + DenseTensor cpu_tensor; phi::Copy(dev_ctx, *rois_num, phi::CPUPlace(), true, &cpu_tensor); rois_num_data = cpu_tensor.data(); rois_lod.push_back(static_cast(0)); @@ -47,16 +47,16 @@ inline std::vector GetLodFromRoisNum(const Context& dev_ctx, template void DistributeFpnProposalsKernel( const Context& dev_ctx, - const phi::DenseTensor& fpn_rois, - const paddle::optional& rois_num, + const DenseTensor& fpn_rois, + const paddle::optional& rois_num, int min_level, int max_level, int refer_level, int refer_scale, bool pixel_offset, - std::vector multi_fpn_rois, - std::vector multi_level_rois_num, - phi::DenseTensor* restore_index) { + std::vector multi_fpn_rois, + std::vector multi_level_rois_num, + DenseTensor* restore_index) { VLOG(4) << "CALL SDAA DistributeFpnProposalsKernel"; int num_level = max_level - min_level + 1; @@ -80,7 +80,7 @@ void DistributeFpnProposalsKernel( int lod_size = fpn_rois_lod.size() - 1; int roi_num = fpn_rois_lod[lod_size]; - phi::DenseTensor rois_num_cpu, rois_num_sdaa; + DenseTensor rois_num_cpu, rois_num_sdaa; if (!rois_num.get_ptr()) { // get rois_num from fpn_rois_lod rois_num_cpu.Resize({lod_size}); @@ -160,14 +160,14 @@ void 
DistributeFpnProposalsKernel( phi::LoD lod; lod.clear(); lod.emplace_back(lod_offset[i]); - phi::DenseTensorMeta lod_meta = {multi_fpn_rois[i]->dtype(), - multi_fpn_rois[i]->dims(), - multi_fpn_rois[i]->layout(), - lod}; + DenseTensorMeta lod_meta = {multi_fpn_rois[i]->dtype(), + multi_fpn_rois[i]->dims(), + multi_fpn_rois[i]->layout(), + lod}; multi_fpn_rois[i]->set_meta(lod_meta); } - phi::DenseTensor multi_level_rois_num_ptr_tensor; + DenseTensor multi_level_rois_num_ptr_tensor; void* multi_level_rois_num_ptr_tensor_data_ptr = nullptr; if (multi_level_rois_num.size() > 0) { std::vector multi_level_rois_num_dims = {lod_size}; @@ -210,7 +210,7 @@ void DistributeFpnProposalsKernel( restore_index_dims, restore_index->dtype(), TensorFormat::Undefined); int multi_fpn_rois_ptr_size = multi_fpn_rois_ptr.size() * sizeof(void*); - phi::DenseTensor multi_fpn_rois_ptr_tensor; + DenseTensor multi_fpn_rois_ptr_tensor; multi_fpn_rois_ptr_tensor.Resize(phi::make_ddim({multi_fpn_rois_ptr_size})); dev_ctx.template Alloc(&multi_fpn_rois_ptr_tensor); AsyncMemCpyH2D(nullptr, diff --git a/backends/sdaa/kernels/dropout_kernel.cc b/backends/sdaa/kernels/dropout_kernel.cc index 15aa8f3a16a..058dd87f5b5 100644 --- a/backends/sdaa/kernels/dropout_kernel.cc +++ b/backends/sdaa/kernels/dropout_kernel.cc @@ -20,14 +20,14 @@ namespace custom_kernel { template void DropoutNVAlign(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& seed_tensor, + const DenseTensor& x, + const paddle::optional& seed_tensor, float p, int seed, bool fix_seed, const char* mode, - phi::DenseTensor* out, - phi::DenseTensor* mask) { + DenseTensor* out, + DenseTensor* mask) { // Align sdaa with NV device uint64_t seed_data; uint64_t increment; @@ -54,7 +54,7 @@ void DropoutNVAlign(const Context& dev_ctx, << ", increment=" << offset; sdaaStream_t custom_stream = GetStreamFromCTX(dev_ctx); - phi::DenseTensor x_temp, out_temp; + DenseTensor x_temp, out_temp; x_temp = x; out_temp = *out; @@ -72,15 +72,15 @@ void DropoutNVAlign(const Context& dev_ctx, template void DropoutKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& seed_tensor, + const DenseTensor& x, + const paddle::optional& seed_tensor, const phi::Scalar& p, bool is_test, const std::string& mode, int seed, bool fix_seed, - phi::DenseTensor* out, - phi::DenseTensor* mask) { + DenseTensor* out, + DenseTensor* mask) { VLOG(4) << "Call SDAA DropoutKernel"; dev_ctx.template Alloc(out); @@ -159,10 +159,10 @@ void DropoutKernel(const Context& dev_ctx, // set states size_t act_statesSize = 4 * 1024 * sizeof(int); TECODNN_CHECK(tecodnnDropoutGetStatesSize(tecodnnHandle, &act_statesSize)); - phi::DenseTensorMeta meta = {phi::DataType::INT8, - {static_cast(act_statesSize)}}; + DenseTensorMeta meta = {phi::DataType::INT8, + {static_cast(act_statesSize)}}; - phi::DenseTensor states; + DenseTensor states; states.set_meta(meta); dev_ctx.template Alloc(&states); @@ -192,12 +192,12 @@ void DropoutKernel(const Context& dev_ctx, template void DropoutGradKernel(const Context& dev_ctx, - const phi::DenseTensor& mask, - const phi::DenseTensor& dout, + const DenseTensor& mask, + const DenseTensor& dout, const phi::Scalar& p, bool is_test, const std::string& mode, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "Call SDAA DropoutGradKernel"; PADDLE_ENFORCE_EQ( @@ -222,9 +222,9 @@ void DropoutGradKernel(const Context& dev_ctx, // set states size_t act_statesSize = 4 * 1024 * sizeof(int); 
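// [Editor's note] The default size set above is immediately refined by the
// tecodnnDropoutGetStatesSize query that follows; the queried byte count is
// then backed by a 1-D INT8 tensor (INT8 standing in for raw bytes). A sketch
// of the allocation pattern shared by both dropout kernels, assuming only the
// calls visible in this diff; the helper name is hypothetical:
void AllocDropoutStatesSketch(const phi::CustomContext& dev_ctx,
                              tecodnnHandle_t handle,
                              phi::DenseTensor* states) {
  size_t bytes = 4 * 1024 * sizeof(int);  // conservative default
  TECODNN_CHECK(tecodnnDropoutGetStatesSize(handle, &bytes));  // actual need
  states->Resize({static_cast<int64_t>(bytes)});
  dev_ctx.Alloc(states, phi::DataType::INT8);  // raw byte buffer for RNG state
}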
TECODNN_CHECK(tecodnnDropoutGetStatesSize(tecodnnHandle, &act_statesSize)); - phi::DenseTensorMeta meta = {phi::DataType::INT8, - {static_cast(act_statesSize)}}; - phi::DenseTensor states; + DenseTensorMeta meta = {phi::DataType::INT8, + {static_cast(act_statesSize)}}; + DenseTensor states; states.set_meta(meta); dev_ctx.template Alloc(&states); diff --git a/backends/sdaa/kernels/element_add_kernel.cc b/backends/sdaa/kernels/element_add_kernel.cc index 511342e62e4..5910a49b176 100644 --- a/backends/sdaa/kernels/element_add_kernel.cc +++ b/backends/sdaa/kernels/element_add_kernel.cc @@ -20,10 +20,10 @@ namespace custom_kernel { template void AddRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA AddKernel"; dev_ctx.template Alloc(out); sdaa_ops::doElementAdd(dev_ctx, x, y, axis, out); @@ -31,9 +31,9 @@ void AddRawKernel(const Context& dev_ctx, template void AddKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { // In RPNFeat layer of mask-rcnn, five conv layers use the same filter // weights, so out's storage properties will be eliminated while accumulating // gradients and it must add storage properties to out. @@ -78,12 +78,12 @@ void AddKernel(const Context& dev_ctx, } template void AddGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, int axis, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + DenseTensor* dx, + DenseTensor* dy) { VLOG(4) << "Call SDAA AddGradKernel"; auto out_dims_vec = phi::vectorize(dout.dims()); @@ -115,9 +115,9 @@ void AddGradKernel(const Context& dev_ctx, template void GradAddKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "Call SDAA GradAddKernel"; custom_kernel::AddRawKernel(dev_ctx, x, y, -1, out); diff --git a/backends/sdaa/kernels/element_div_kernel.cc b/backends/sdaa/kernels/element_div_kernel.cc index 224525c0a60..e50bc0c0d1b 100644 --- a/backends/sdaa/kernels/element_div_kernel.cc +++ b/backends/sdaa/kernels/element_div_kernel.cc @@ -20,10 +20,10 @@ namespace custom_kernel { template void DivideRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA DivideRawKernel"; dev_ctx.template Alloc(out); sdaa_ops::doElementDiv(dev_ctx, x, y, axis, out); @@ -31,22 +31,22 @@ void DivideRawKernel(const Context& dev_ctx, template void DivideKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { int axis = -1; custom_kernel::DivideRawKernel(dev_ctx, x, y, axis, out); } template void DivideGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, int axis, - phi::DenseTensor* dx, - 
phi::DenseTensor* dy) { + DenseTensor* dx, + DenseTensor* dy) { VLOG(4) << "Call SDAA DivideGradKernel"; auto out_dims_vec = phi::vectorize(dout.dims()); @@ -54,8 +54,8 @@ void DivideGradKernel(const Context& dev_ctx, broadcastDims(x.dims(), y.dims(), axis, &x_dims_vec, &y_dims_vec); if (dy) { dev_ctx.template Alloc(dy); - phi::DenseTensor temp_out; - phi::DenseTensorMeta temp_out_meta = {dout.dtype(), dout.dims()}; + DenseTensor temp_out; + DenseTensorMeta temp_out_meta = {dout.dtype(), dout.dims()}; temp_out.set_meta(temp_out_meta); dev_ctx.template Alloc(&temp_out); sdaa_ops::doElementMul(dev_ctx, dout, out, -1, &temp_out); @@ -70,13 +70,13 @@ void DivideGradKernel(const Context& dev_ctx, } if (dx) { dev_ctx.template Alloc(dx); - phi::DenseTensor y_temp(y); + DenseTensor y_temp(y); y_temp.Resize(phi::make_ddim(y_dims_vec)); if (dx->dims() == dout.dims()) { sdaa_ops::doElementDiv(dev_ctx, dout, y_temp, -1, dx); } else { - phi::DenseTensor x_temp; + DenseTensor x_temp; x_temp.Resize(phi::make_ddim(out_dims_vec)); dev_ctx.template Alloc(&x_temp); sdaa_ops::doElementDiv(dev_ctx, dout, y_temp, -1, &x_temp); diff --git a/backends/sdaa/kernels/element_mul_kernel.cc b/backends/sdaa/kernels/element_mul_kernel.cc index d3aa2f09bcb..ecbc1b2c002 100644 --- a/backends/sdaa/kernels/element_mul_kernel.cc +++ b/backends/sdaa/kernels/element_mul_kernel.cc @@ -20,10 +20,10 @@ namespace custom_kernel { template void MultiplyRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA MultiplyKernel"; dev_ctx.template Alloc(out); sdaa_ops::doElementMul(dev_ctx, x, y, axis, out); @@ -31,9 +31,9 @@ void MultiplyRawKernel(const Context& dev_ctx, template void MultiplyKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { if (isEnvEnable("HIGH_PERFORMANCE_CONV") && (&x != out && x.storage_properties_initialized() && !out->storage_properties_initialized())) { @@ -58,12 +58,12 @@ void MultiplyKernel(const Context& dev_ctx, template void MultiplyGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, int axis, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + DenseTensor* dx, + DenseTensor* dy) { VLOG(4) << "Call SDAA MultiplyGradKernel"; auto out_dims_vec = phi::vectorize(dout.dims()); @@ -74,7 +74,7 @@ void MultiplyGradKernel(const Context& dev_ctx, if (dy->dims() == dout.dims()) { sdaa_ops::doElementMul(dev_ctx, dout, x, axis, dy); } else { - phi::DenseTensor y_temp; + DenseTensor y_temp; y_temp.Resize(dout.dims()); dev_ctx.template Alloc(&y_temp); sdaa_ops::doElementMul(dev_ctx, @@ -91,7 +91,7 @@ void MultiplyGradKernel(const Context& dev_ctx, if (dx->dims() == dout.dims()) { sdaa_ops::doElementMul(dev_ctx, dout, y, axis, dx); } else { - phi::DenseTensor x_temp; + DenseTensor x_temp; x_temp.Resize(dout.dims()); dev_ctx.template Alloc(&x_temp); sdaa_ops::doElementMul(dev_ctx, diff --git a/backends/sdaa/kernels/element_sub_kernel.cc b/backends/sdaa/kernels/element_sub_kernel.cc index 2d363914080..54796574d84 100644 --- a/backends/sdaa/kernels/element_sub_kernel.cc +++ b/backends/sdaa/kernels/element_sub_kernel.cc @@ -20,10 +20,10 @@ namespace custom_kernel { template void 
SubtractRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA SubtractKernel"; dev_ctx.template Alloc(out); sdaa_ops::doElementSub(dev_ctx, x, y, axis, out); @@ -31,21 +31,21 @@ void SubtractRawKernel(const Context& dev_ctx, template void SubtractKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { int axis = -1; custom_kernel::SubtractRawKernel(dev_ctx, x, y, axis, out); } template void SubtractGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, int axis, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + DenseTensor* dx, + DenseTensor* dy) { VLOG(4) << "Call SDAA SubtractGradKernel"; auto out_dims_vec = phi::vectorize(dout.dims()); std::vector x_dims_vec, y_dims_vec; diff --git a/backends/sdaa/kernels/elementwise_floordiv_kernel.cc b/backends/sdaa/kernels/elementwise_floordiv_kernel.cc index d2d1b3d3f20..b6c57563cb1 100644 --- a/backends/sdaa/kernels/elementwise_floordiv_kernel.cc +++ b/backends/sdaa/kernels/elementwise_floordiv_kernel.cc @@ -34,9 +34,9 @@ namespace custom_kernel { template void doFloorDivTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "call tecodnn floordivide"; std::vector x_dims = phi::vectorize(x.dims()); @@ -76,9 +76,9 @@ void doFloorDivTensor(const Context& dev_ctx, template void FloorDivideKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "CALL SDAA FloorDivideKernel"; dev_ctx.template Alloc(out); diff --git a/backends/sdaa/kernels/elementwise_max_kernel.cc b/backends/sdaa/kernels/elementwise_max_kernel.cc index 4b935c3e612..77b79173a7f 100644 --- a/backends/sdaa/kernels/elementwise_max_kernel.cc +++ b/backends/sdaa/kernels/elementwise_max_kernel.cc @@ -34,9 +34,9 @@ namespace custom_kernel { template void MaximumKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "Call SDAA MaximumKernel"; dev_ctx.template Alloc(out); diff --git a/backends/sdaa/kernels/elementwise_min_kernel.cc b/backends/sdaa/kernels/elementwise_min_kernel.cc index 269a9b48394..457bff80647 100644 --- a/backends/sdaa/kernels/elementwise_min_kernel.cc +++ b/backends/sdaa/kernels/elementwise_min_kernel.cc @@ -34,9 +34,9 @@ namespace custom_kernel { template void MinimumKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "Call SDAA MinimumKernel"; dev_ctx.template Alloc(out); diff --git a/backends/sdaa/kernels/elementwise_mod_kernel.cc b/backends/sdaa/kernels/elementwise_mod_kernel.cc index 16a633c3ad5..dacdb47032b 100644 --- a/backends/sdaa/kernels/elementwise_mod_kernel.cc +++ b/backends/sdaa/kernels/elementwise_mod_kernel.cc @@ -31,9 +31,9 @@ namespace custom_kernel { template void 
RemainderKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "CALL SDAA RemainderKernel"; dev_ctx.template Alloc(out); diff --git a/backends/sdaa/kernels/elementwise_pow_kernel.cc b/backends/sdaa/kernels/elementwise_pow_kernel.cc index 313d003bed2..1e646ecf646 100644 --- a/backends/sdaa/kernels/elementwise_pow_kernel.cc +++ b/backends/sdaa/kernels/elementwise_pow_kernel.cc @@ -31,10 +31,10 @@ namespace custom_kernel { template void ElementwisePowRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA ElementwisePowRawKernel."; dev_ctx.template Alloc(out); @@ -69,9 +69,9 @@ void ElementwisePowRawKernel(const Context& dev_ctx, template void ElementwisePowKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "CALL SDAA ElementwisePowKernel."; int axis = -1; custom_kernel::ElementwisePowRawKernel(dev_ctx, x, y, axis, out); diff --git a/backends/sdaa/kernels/embedding_kernel.cc b/backends/sdaa/kernels/embedding_kernel.cc index ec1f4df1b7f..a65b8077bf6 100644 --- a/backends/sdaa/kernels/embedding_kernel.cc +++ b/backends/sdaa/kernels/embedding_kernel.cc @@ -20,10 +20,10 @@ namespace custom_kernel { template void EmbeddingKernel(const Context &dev_ctx, - const phi::DenseTensor &inputx, - const phi::DenseTensor &weight, + const DenseTensor &inputx, + const DenseTensor &weight, int64_t padding_idx, - phi::DenseTensor *out) { + DenseTensor *out) { VLOG(4) << "Call SDAA EmbeddingKernel"; // basic settings dev_ctx.template Alloc(out); @@ -136,11 +136,11 @@ void EmbeddingKernel(const Context &dev_ctx, template void EmbeddingGradKernel(const Context &dev_ctx, - const phi::DenseTensor &input, - const phi::DenseTensor &weight, - const phi::DenseTensor &out_grad, + const DenseTensor &input, + const DenseTensor &weight, + const DenseTensor &out_grad, int64_t padding_idx, - phi::DenseTensor *weight_grad) { + DenseTensor *weight_grad) { VLOG(4) << "Call SDAA EmbeddingGradKernel"; // basic settings dev_ctx.template Alloc(weight_grad); @@ -162,7 +162,7 @@ void EmbeddingGradKernel(const Context &dev_ctx, } // switch input from int64 into int32 - phi::DenseTensor inputx_cast; + DenseTensor inputx_cast; if (input.dtype() == phi::DataType::INT64) { inputx_cast.Resize(input.dims()); dev_ctx.template Alloc(&inputx_cast); diff --git a/backends/sdaa/kernels/expand_as_kernel.cc b/backends/sdaa/kernels/expand_as_kernel.cc index e16462ec6f2..b70de103cb2 100644 --- a/backends/sdaa/kernels/expand_as_kernel.cc +++ b/backends/sdaa/kernels/expand_as_kernel.cc @@ -35,10 +35,10 @@ namespace custom_kernel { template void ExpandAsKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& y, + const DenseTensor& x, + const paddle::optional& y, const std::vector& target_shape_64, - phi::DenseTensor* out) { + DenseTensor* out) { std::vector target_shape = std::vector(target_shape_64.begin(), target_shape_64.end()); VLOG(4) << "CALL SDAA ExpandAsKernel"; diff --git a/backends/sdaa/kernels/expand_kernel.cc b/backends/sdaa/kernels/expand_kernel.cc index 992e35b7c27..ce120c18aa3 100644 --- a/backends/sdaa/kernels/expand_kernel.cc +++ 
b/backends/sdaa/kernels/expand_kernel.cc @@ -33,9 +33,9 @@ namespace custom_kernel { template void ExpandKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& shape, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA ExpandKernel"; auto expand_shape = shape.GetData(); @@ -120,10 +120,10 @@ void ExpandKernel(const Context& dev_ctx, template void ExpandGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& out_grad, const phi::IntArray& shape, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { VLOG(4) << "CALL SDAA ExpandGradKernel"; dev_ctx.template Alloc(x_grad); diff --git a/backends/sdaa/kernels/fill_kernel.cc b/backends/sdaa/kernels/fill_kernel.cc index 9259625d6f2..6abe9d03dae 100644 --- a/backends/sdaa/kernels/fill_kernel.cc +++ b/backends/sdaa/kernels/fill_kernel.cc @@ -31,9 +31,9 @@ namespace custom_kernel { template void FillKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& value, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA FillKernel"; double fill_var = value.to(); diff --git a/backends/sdaa/kernels/flash_attenttion_kernel.cc b/backends/sdaa/kernels/flash_attenttion_kernel.cc index fc8b8106053..58e7d3d2a8a 100644 --- a/backends/sdaa/kernels/flash_attenttion_kernel.cc +++ b/backends/sdaa/kernels/flash_attenttion_kernel.cc @@ -35,9 +35,9 @@ struct TensorStride { uint32_t stride; }; -void CheckInputs(const phi::DenseTensor& q, - const phi::DenseTensor& k, - const phi::DenseTensor& v, +void CheckInputs(const DenseTensor& q, + const DenseTensor& k, + const DenseTensor& v, float dropout) { // q,k,v [seq_len, batch_size, num_heads, head_dim] const auto& dims = q.dims(); @@ -69,7 +69,7 @@ void CheckInputs(const phi::DenseTensor& q, } void CastFP32TOFP16Raw(const Context& dev_ctx, - const phi::DenseTensor& src, + const DenseTensor& src, void* dst) { std::vector src_dims(phi::vectorize(src.dims())); tecodnnTensorDescriptor_t src_Desc = @@ -84,11 +84,11 @@ void CastFP32TOFP16Raw(const Context& dev_ctx, TECODNN_CHECK(tecodnnDestroyTensorDescriptor(dst_Desc)); } -int64_t GetFP16TensorSize(const phi::DenseTensor& t) { +int64_t GetFP16TensorSize(const DenseTensor& t) { return phi::SizeOf(phi::DataType::FLOAT16) * t.numel(); } -TensorStride GenTensorStride(const phi::DenseTensor& t) { +TensorStride GenTensorStride(const DenseTensor& t) { // t [seq_len, batch_size, num_heads, head_dim] auto dims = t.dims(); PADDLE_ENFORCE_EQ( @@ -102,22 +102,21 @@ TensorStride GenTensorStride(const phi::DenseTensor& t) { } template -void FlashAttnKernel( - const Context& dev_ctx, - const phi::DenseTensor& q, - const phi::DenseTensor& k, - const phi::DenseTensor& v, - const paddle::optional& fixed_seed_offset, - const paddle::optional& attn_mask, - float dropout, - bool causal, - bool return_softmax, - bool is_test, - const std::string& rng_name, - phi::DenseTensor* out, - phi::DenseTensor* softmax, - phi::DenseTensor* softmax_lse, - phi::DenseTensor* seed_offset) { +void FlashAttnKernel(const Context& dev_ctx, + const DenseTensor& q, + const DenseTensor& k, + const DenseTensor& v, + const paddle::optional& fixed_seed_offset, + const paddle::optional& attn_mask, + float dropout, + bool causal, + bool return_softmax, + bool is_test, + const std::string& rng_name, + DenseTensor* out, + DenseTensor* softmax, + DenseTensor* softmax_lse, + DenseTensor* seed_offset) { VLOG(4) << 
"Call SDAA FlashAttnKernel"; // q,k,v [seq_len, batch_size, num_heads, head_dim] CheckInputs(q, k, v, dropout); @@ -197,19 +196,19 @@ void FlashAttnKernel( template void FlashAttnGradKernel(const Context& dev_ctx, - const phi::DenseTensor& q, - const phi::DenseTensor& k, - const phi::DenseTensor& v, - const phi::DenseTensor& out, - const phi::DenseTensor& softmax_lse, - const phi::DenseTensor& seed_offset, - const paddle::optional& attn_mask, - const phi::DenseTensor& dout, + const DenseTensor& q, + const DenseTensor& k, + const DenseTensor& v, + const DenseTensor& out, + const DenseTensor& softmax_lse, + const DenseTensor& seed_offset, + const paddle::optional& attn_mask, + const DenseTensor& dout, float dropout, bool causal, - phi::DenseTensor* dq, - phi::DenseTensor* dk, - phi::DenseTensor* dv) { + DenseTensor* dq, + DenseTensor* dk, + DenseTensor* dv) { VLOG(4) << "Call SDAA FlashAttnGradKernel"; // q,k,v [seq_len, batch_size, num_heads, head_dim] CheckInputs(q, k, v, dropout); diff --git a/backends/sdaa/kernels/flip_kernel.cc b/backends/sdaa/kernels/flip_kernel.cc index 124b9731a63..56b3052a9cf 100644 --- a/backends/sdaa/kernels/flip_kernel.cc +++ b/backends/sdaa/kernels/flip_kernel.cc @@ -33,9 +33,9 @@ namespace custom_kernel { template void FlipKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA FlipKernel."; dev_ctx.template Alloc(out); diff --git a/backends/sdaa/kernels/full_kernel.cc b/backends/sdaa/kernels/full_kernel.cc index 370b72abea6..c0e0aee158c 100644 --- a/backends/sdaa/kernels/full_kernel.cc +++ b/backends/sdaa/kernels/full_kernel.cc @@ -63,7 +63,7 @@ void FullKernel(const Context& dev_ctx, const phi::IntArray& shape, const phi::Scalar& val, phi::DataType dtype, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA FullKernel"; CheckLimitCommon(val); @@ -80,10 +80,10 @@ void FullKernel(const Context& dev_ctx, template void FullLikeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& val, phi::DataType dtype, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA FullLikeKernel"; CheckLimitCommon(val); @@ -96,13 +96,13 @@ void FullLikeKernel(const Context& dev_ctx, template void FullBatchSizeLikeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& shape, const phi::Scalar& val, phi::DataType dtype, int x_batch_size_dim, int out_batch_size_dim, - phi::DenseTensor* out) { + DenseTensor* out) { if (x.lod().size() && x_batch_size_dim == 0) { // set the correct batch size for the DenseTensor. 
auto odims = out->dims(); diff --git a/backends/sdaa/kernels/funcs/contiguous/contiguous_register.h b/backends/sdaa/kernels/funcs/contiguous/contiguous_register.h index 6ef6895f015..7f7c4570781 100644 --- a/backends/sdaa/kernels/funcs/contiguous/contiguous_register.h +++ b/backends/sdaa/kernels/funcs/contiguous/contiguous_register.h @@ -43,11 +43,11 @@ class ContiguousOpt { ContiguousOpt() {} virtual ~ContiguousOpt() = default; virtual bool Optimize(const Context& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst) = 0; + const DenseTensor& src, + DenseTensor* dst) = 0; virtual bool CanOptimize(const Context& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst) { + const DenseTensor& src, + DenseTensor* dst) { return false; } }; @@ -80,8 +80,8 @@ class CopyOptRegister { bool CanOptimize(std::string& name, // NOLINT const Context& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst) { + const DenseTensor& src, + DenseTensor* dst) { for (int8_t level = ProfLevel::PROF_HIGH; level < ProfLevel::PROF_MAX_CNT; level++) { if (FindOptimize(registry[level], name, dev_ctx, src, dst)) { @@ -93,8 +93,8 @@ class CopyOptRegister { bool Run(const std::string& name, const Context& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst) { + const DenseTensor& src, + DenseTensor* dst) { for (int8_t level = ProfLevel::PROF_HIGH; level < ProfLevel::PROF_MAX_CNT; level++) { auto itr = registry[level].find(name); @@ -114,8 +114,8 @@ class CopyOptRegister { bool FindOptimize(OptMap& opt_map, // NOLINT std::string& name, // NOLINT const Context& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst) { + const DenseTensor& src, + DenseTensor* dst) { for (auto& opt : opt_map) { if (opt.second->CanOptimize(dev_ctx, src, dst)) { name = opt.first; diff --git a/backends/sdaa/kernels/funcs/contiguous/copy_stride_opt.cc b/backends/sdaa/kernels/funcs/contiguous/copy_stride_opt.cc index 0106ae6a9f7..faec7954a2b 100644 --- a/backends/sdaa/kernels/funcs/contiguous/copy_stride_opt.cc +++ b/backends/sdaa/kernels/funcs/contiguous/copy_stride_opt.cc @@ -35,8 +35,8 @@ namespace sdaa_copy { class CopyStrideContiguousOpt : public ContiguousOpt { public: bool Optimize(const Context& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst) override { + const DenseTensor& src, + DenseTensor* dst) override { VLOG(1) << "SDAA use CopyStride to complete the strided_copy."; auto shape = phi::vectorize(src.dims()); auto src_stride = phi::vectorize(src.strides()); @@ -49,8 +49,8 @@ class CopyStrideContiguousOpt : public ContiguousOpt { } bool CanOptimize(const Context& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst) override { + const DenseTensor& src, + DenseTensor* dst) override { if (!check_CopyStride_dtype(src) || !check_CopyStride_dtype(*dst)) { return false; } @@ -63,7 +63,7 @@ class CopyStrideContiguousOpt : public ContiguousOpt { return false; } - auto is_bad_case = [](const phi::DenseTensor& t) { + auto is_bad_case = [](const DenseTensor& t) { int64_vec stride = phi::vectorize(t.strides()); return std::find(stride.begin(), stride.end(), 0) != stride.end(); }; @@ -77,7 +77,7 @@ class CopyStrideContiguousOpt : public ContiguousOpt { } private: - bool check_CopyStride_dtype(const phi::DenseTensor& t) { + bool check_CopyStride_dtype(const DenseTensor& t) { static std::vector CopyStrideDtype = { phi::DataType::FLOAT64, phi::DataType::FLOAT32, diff --git a/backends/sdaa/kernels/funcs/contiguous/transpose_opt.cc 
b/backends/sdaa/kernels/funcs/contiguous/transpose_opt.cc index 3b533c366a1..f415afbb375 100644 --- a/backends/sdaa/kernels/funcs/contiguous/transpose_opt.cc +++ b/backends/sdaa/kernels/funcs/contiguous/transpose_opt.cc @@ -32,7 +32,7 @@ namespace custom_kernel { namespace sdaa_copy { -vec_tuple get_permute_back_order(const phi::DenseTensor& input) { +vec_tuple get_permute_back_order(const DenseTensor& input) { int64_vec input_shapes = phi::vectorize(input.dims()); int64_vec input_strides = phi::vectorize(input.strides()); int64_t rank = input.dims().size(); @@ -71,15 +71,15 @@ vec_tuple get_permute_back_order(const phi::DenseTensor& input) { class TransposeContiguousOpt : public ContiguousOpt { public: bool Optimize(const Context& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst) override { + const DenseTensor& src, + DenseTensor* dst) override { VLOG(1) << "SDAA use transpose to complete the strided_copy."; return transpose_to_contiguous(dev_ctx, src, dst); } bool CanOptimize(const Context& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst) override { + const DenseTensor& src, + DenseTensor* dst) override { if (!check_Transpose_dtype(src) || !check_Transpose_dtype(*dst)) { return false; } @@ -107,7 +107,7 @@ class TransposeContiguousOpt : public ContiguousOpt { } private: - bool check_Transpose_dtype(const phi::DenseTensor& t) { + bool check_Transpose_dtype(const DenseTensor& t) { static std::vector TransposeDtype = {phi::DataType::FLOAT64, phi::DataType::FLOAT32, phi::DataType::FLOAT16, @@ -126,14 +126,14 @@ class TransposeContiguousOpt : public ContiguousOpt { } bool transpose_to_contiguous(const Context& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst) { + const DenseTensor& src, + DenseTensor* dst) { // convert a non_overlapping_and_dense tensor to contiguous tensor. 
auto recover_contiguous = [](const Context& dev_ctx, - const phi::DenseTensor& t, - std::vector* permute_order) -> phi::DenseTensor { - phi::DenseTensor view_contiguous; + const DenseTensor& t, + std::vector* permute_order) -> DenseTensor { + DenseTensor view_contiguous; if (!t.meta().is_contiguous()) { vec_tuple order_info = sdaa_copy::get_permute_back_order(t); @@ -144,7 +144,7 @@ class TransposeContiguousOpt : public ContiguousOpt { phi::DDim new_stride = sdaa_copy::permute(t.strides(), std::get<0>(order_info)); view_contiguous = t; - phi::DenseTensorMeta meta = t.meta(); + DenseTensorMeta meta = t.meta(); meta.dims = new_dim; meta.strides = new_stride; view_contiguous.set_meta(meta); @@ -160,9 +160,9 @@ class TransposeContiguousOpt : public ContiguousOpt { int64_vec src_permute_order(src_rank); int64_vec dst_permute_order(dst_rank); - phi::DenseTensor src_view_contiguous = + DenseTensor src_view_contiguous = recover_contiguous(dev_ctx, src, &src_permute_order); - phi::DenseTensor dst_view_contiguous = + DenseTensor dst_view_contiguous = recover_contiguous(dev_ctx, *dst, &dst_permute_order); std::vector new_permute_order(dst_rank); diff --git a/backends/sdaa/kernels/funcs/nv_align.h b/backends/sdaa/kernels/funcs/nv_align.h index 584f8edba58..99586c68743 100644 --- a/backends/sdaa/kernels/funcs/nv_align.h +++ b/backends/sdaa/kernels/funcs/nv_align.h @@ -39,7 +39,7 @@ static const char* ALIGN_NV = "RANDOM_ALIGN_NV_DEVICE"; namespace custom_kernel { inline void GetSeed(const phi::CustomContext& dev_ctx, - const paddle::optional& seed_tensor, + const paddle::optional& seed_tensor, int seed, bool fix_seed, const int offset, diff --git a/backends/sdaa/kernels/funcs/sdaa_baseop.cc b/backends/sdaa/kernels/funcs/sdaa_baseop.cc index f6f28a50e6c..7989f899ef3 100644 --- a/backends/sdaa/kernels/funcs/sdaa_baseop.cc +++ b/backends/sdaa/kernels/funcs/sdaa_baseop.cc @@ -333,7 +333,7 @@ tecocustomTensorDescriptor_t GetTecocustomTensorDesc( // - ndims[0] should equal the current tensor's numel(), i.e., the total number of elements in the tensor. // This approach flattens a multi-dimensional tensor's shape into a one-dimensional array whose length equals the tensor's total element count. tecocustomTensorListDescriptor_t GetTecocustomTensorListDesc( - const std::vector& tensor_list, bool merged_optimizer) { + const std::vector& tensor_list, bool merged_optimizer) { int M = tensor_list.size(); tecocustomDataType_t dt = ToTecocustomDataType(tensor_list[0]->dtype()); tecocustomTensorListDescriptor_t CustomListDesc; @@ -363,9 +363,9 @@ tecocustomTensorListDescriptor_t GetTecocustomTensorListDesc( } void doTransformTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, Convert_TF convert_tf, - phi::DenseTensor* y) { + DenseTensor* y) { VLOG(4) << "call tecodnn transform tensor"; phi::DDim x_d; if (x.storage_properties_initialized() && @@ -404,8 +404,8 @@ void doTransformTensor(const Context& dev_ctx, } void doCastTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* y) { + const DenseTensor& x, + DenseTensor* y) { VLOG(4) << "call tecodnn cast tensor"; phi::DDim x_d; if (x.storage_properties_initialized() && @@ -463,10 +463,10 @@ void doCastTensor(const Context& dev_ctx, } void doAddTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float alpha, float beta, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn add tensor"; std::vector x_dims = phi::vectorize(x.dims()); std::vector x_dimensions(4, 1); @@ -493,11 +493,11 @@ void doAddTensor(const Context& dev_ctx, } void doActivationForward(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, 
double factor, ActivationMode activation_mode, NanPropagation nan_propagate, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn activation forward"; int N = 1, C = x.numel(), H = 1, W = 1; @@ -531,12 +531,12 @@ void doActivationForward(const Context& dev_ctx, } void doActivationBackward(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, + const DenseTensor& out, + const DenseTensor& dout, double factor, ActivationMode activation_mode, NanPropagation nan_propagate, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "call tecodnn activation backward"; int N = 1, C = out.numel(), H = 1, W = 1; @@ -574,10 +574,10 @@ void doActivationBackward(const Context& dev_ctx, } void doUnaryOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float alpha, UnaryOpMode unaryOpMode, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn unary op"; int N = 1, C = x.numel(), H = 1, W = 1; @@ -599,12 +599,12 @@ void doUnaryOpTensor(const Context& dev_ctx, } void doScaleTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float scale, float bias, bool inplace, bool bias_flag, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn scale tensor"; std::vector x_dims = phi::vectorize(x.dims()); @@ -655,8 +655,8 @@ void doScaleTensor(const Context& dev_ctx, } void doNegTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "call tecodnn neg tensor"; std::vector x_dims = phi::vectorize(x.dims()); @@ -687,10 +687,10 @@ void doNegTensor(const Context& dev_ctx, } void doCompareTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, CompareType tct, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn compare tensor"; std::vector x_dims = phi::vectorize(x.dims()); @@ -789,10 +789,10 @@ void doCompareTensor(const Context& dev_ctx, } void doOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, OpTensorMode opTensorMode, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn Op Tensor"; std::vector x_dims = phi::vectorize(x.dims()); @@ -869,13 +869,13 @@ inline void doReduceTensor(tecodnnHandle_t handle, } void doReduceTensorImpl(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& reduce_dims, tecodnnReduceTensorOp_t op, tecodnnNanPropagation_t nan_prop, tecodnnReduceTensorIndices_t indices_op, tecodnnIndicesType_t indices_type, - phi::DenseTensor* y) { + DenseTensor* y) { if (reduce_dims.size() == 0) { if (x.data() == y->data()) { return; @@ -895,7 +895,7 @@ void doReduceTensorImpl(const Context& dev_ctx, nan_prop, indices_op, indices_type)); - phi::DenseTensor workspace; + DenseTensor workspace; workspace.Resize({static_cast(sizeof(float) * x.numel())}); dev_ctx.Alloc(&workspace, DataType::INT8); std::vector x_dims = phi::vectorize(x.dims()); @@ -904,7 +904,7 @@ void doReduceTensorImpl(const Context& dev_ctx, for (auto&& i : reduce_dims) { y_dims[i] = 1; } - phi::DenseTensor y_temp; + DenseTensor y_temp; // Reduce is not an inplace op if (x.data() == y->data()) { y_temp.Resize(y->dims()); @@ -935,7 +935,7 @@ void doReduceTensorImpl(const Context& dev_ctx, std::vector ref_dims, ref_reduce_dims; foldNonReduceDims(x_dims, 
reduce_dims_int, &ref_dims, &ref_reduce_dims); - phi::DenseTensor temp_input, temp_output; + DenseTensor temp_input, temp_output; TensorCopy(dev_ctx, x, false, &temp_input); temp_output.Resize(x.dims()); @@ -1039,17 +1039,17 @@ inline tecodnnIndicesType_t ToTecodnnIndiceDataType(const DataType& dtype) { // Temporary function to mitigate the int64 problem; once int64 is adapted, change the // impl function to this one void doReduceTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& reduce_dims, tecodnnReduceTensorOp_t op, tecodnnNanPropagation_t nan_prop, tecodnnReduceTensorIndices_t indices_op, tecodnnIndicesType_t indices_type, - phi::DenseTensor* y) { + DenseTensor* y) { // The dtype cast is removed for all types except float16 // to avoid the loss of precision caused by the cast. if (x.dtype() == DataType::FLOAT16) { - phi::DenseTensor x_temp, y_temp; + DenseTensor x_temp, y_temp; x_temp.Resize(x.dims()); dev_ctx.Alloc(&x_temp, DataType::FLOAT32); sdaa_ops::doCastTensor(dev_ctx, x, &x_temp); @@ -1071,9 +1071,9 @@ void doReduceTensor(const Context& dev_ctx, } void doMeanTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& reduce_dims, - phi::DenseTensor* y) { + DenseTensor* y) { doReduceTensor(dev_ctx, x, reduce_dims, @@ -1085,9 +1085,9 @@ void doMeanTensor(const Context& dev_ctx, } void doSumTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& reduce_dims, - phi::DenseTensor* y) { + DenseTensor* y) { doReduceTensor(dev_ctx, x, reduce_dims, @@ -1099,9 +1099,9 @@ void doSumTensor(const Context& dev_ctx, } void doProdTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& reduce_dims, - phi::DenseTensor* y) { + DenseTensor* y) { doReduceTensor(dev_ctx, x, reduce_dims, @@ -1113,9 +1113,9 @@ void doProdTensor(const Context& dev_ctx, } void doMinTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& reduce_dims, - phi::DenseTensor* y) { + DenseTensor* y) { doReduceTensor(dev_ctx, x, reduce_dims, @@ -1127,9 +1127,9 @@ void doMinTensor(const Context& dev_ctx, } void doMaxTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& reduce_dims, - phi::DenseTensor* y) { + DenseTensor* y) { doReduceTensor(dev_ctx, x, reduce_dims, @@ -1142,12 +1142,12 @@ void doMaxTensor(const Context& dev_ctx, template void doElementWise(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, T mode, funcs::ElementwiseFunc tecodnnElementwiseFunctor, - phi::DenseTensor* out) { + DenseTensor* out) { std::vector x_expanded_dims, y_expanded_dims; custom_kernel::broadcastDims( x.dims(), y.dims(), axis, &x_expanded_dims, &y_expanded_dims); @@ -1179,40 +1179,40 @@ } void doElementAdd(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { doElementWise(dev_ctx, x, y, axis, BINARY_ADD, tecodnnAddTensorEx, out); } void doElementSub(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { doElementWise(dev_ctx, x, y, axis, BINARY_SUB, tecodnnSubTensorEx,
out); } void doElementMul(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { doElementWise(dev_ctx, x, y, axis, BINARY_MUL, tecodnnMulTensorEx, out); } void doElementDiv(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { doElementWise(dev_ctx, x, y, axis, BINARY_DIV, tecodnnDivTensorEx, out); } void doReciprocalTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "call tecodnn reciprocal op."; std::vector x_dims = phi::vectorize(x.dims()); @@ -1232,7 +1232,7 @@ void doReciprocalTensor(const Context& dev_ctx, tecodnnTensorDescriptor_t x_Desc = GetTecodnnTensorDesc(x_dimensions, x.dtype(), TensorFormat::NHWC); - phi::DenseTensor x_(x); + DenseTensor x_(x); TECODNN_CHECK(tecodnnReciprocalTensor( tecodnnHandle, x_Desc, x_.data(), x_Desc, out->data())); @@ -1241,10 +1241,10 @@ } void doSoftmaxForward(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, bool high_precision, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn softmaxforward op"; if (axis < 0) { @@ -1288,11 +1288,11 @@ void doSoftmaxForward(const Context& dev_ctx, } void doSoftmaxBackward(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, + const DenseTensor& out, + const DenseTensor& dout, int axis, bool high_precision, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "call tecodnn softmaxbackward op"; if (axis < 0) { @@ -1339,9 +1339,9 @@ /*This function has not been tested.*/ void doLogSoftmaxForward(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn softmaxforward op"; if (axis < 0) { @@ -1383,10 +1383,10 @@ /*This function has not been tested.*/ void doLogSoftmaxBackward(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, + const DenseTensor& out, + const DenseTensor& dout, int axis, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "call tecodnn softmaxbackward op"; if (axis < 0) { @@ -1429,7 +1429,7 @@ void doLogSoftmaxBackward(const Context& dev_ctx, } /*Only the NCHW2NHWC path of this function has been tested.*/ -phi::DDim doDimPermute(const phi::DenseTensor& x, Convert_TF convert_tf) { +phi::DDim doDimPermute(const DenseTensor& x, Convert_TF convert_tf) { std::vector dim_permute; switch (convert_tf) { case Convert_TF::NCHW2NHWC: @@ -1462,13 +1462,13 @@ phi::DDim doDimPermute(const phi::DenseTensor& x, Convert_TF convert_tf) { template void doSliceTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axes, const std::vector& starts, const std::vector& ends, const std::vector& strides, const std::vector& decrease_axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn slice tensor op"; int axes_num = axes.size(); @@ -1583,9 +1583,9 @@ void Padding(const Context& dev_ctx, } void doPaddingTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector>& Paddings, - phi::DenseTensor* out)
{ + DenseTensor* out) { VLOG(4) << "call tecodnn padding tensor"; std::vector x_dims = phi::vectorize(x.dims()); @@ -1682,7 +1682,7 @@ void doPaddingTensor(const Context& dev_ctx, return; } - phi::DenseTensor out_temp; + DenseTensor out_temp; out_temp.Resize(out->dims()); dev_ctx.Alloc(&out_temp, out->dtype()); phi::Copy(dev_ctx, x, x.place(), false, &out_temp); @@ -1837,9 +1837,9 @@ paddle::optional> TryDDimFusion( } void doTransposeTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn transpose tensor"; // FIXME(huangzhen): not sure whether the efficiency of the @@ -1904,10 +1904,10 @@ void doTransposeTensor(const Context& dev_ctx, } void doLogicTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axis_reduce, TensorLogicType TLT, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn logic tensor"; std::vector x_dims = phi::vectorize(x.dims()); std::vector out_dims = phi::vectorize(out->dims()); @@ -1919,11 +1919,11 @@ void doLogicTensor(const Context& dev_ctx, int x_ndim = x_dims.size(); - phi::DenseTensor x_int; + DenseTensor x_int; x_int.Resize(x.dims()); dev_ctx.Alloc(&x_int, DataType::INT32); - phi::DenseTensor out_int; + DenseTensor out_int; out_int.Resize(out->dims()); dev_ctx.Alloc(&out_int, DataType::INT32); @@ -1977,9 +1977,9 @@ void doLogicTensor(const Context& dev_ctx, } void doConcatTensor(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "tecodnn concat tensor called"; tecodnnHandle_t tecodnnHandle = GetHandleFromCTX(dev_ctx); @@ -2026,7 +2026,7 @@ void doConcatTensor(const Context& dev_ctx, input_ptr.data(), sizeworkspaceInBytes); - phi::DenseTensor tmp; + DenseTensor tmp; tmp.Resize({hostInputSize}); dev_ctx.Alloc(&tmp, phi::DataType::INT8); AsyncMemCpyH2D(nullptr, @@ -2062,11 +2062,11 @@ void doConcatTensor(const Context& dev_ctx, } void doScatterTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, - const phi::DenseTensor& updates, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& updates, bool overwrite, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "tecodnn scatter tensor called"; std::vector x_dims = phi::vectorize(x.dims()); @@ -2147,9 +2147,9 @@ void doScatterTensor(const Context& dev_ctx, } void doSplitTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, - std::vector outs) { + std::vector outs) { VLOG(4) << "tecodnn split tensor called"; if (outs.size() == 1) { @@ -2197,7 +2197,7 @@ void doSplitTensor(const Context& dev_ctx, outs_ptr.data(), sizeworkspaceInBytes); - phi::DenseTensor tmp; + DenseTensor tmp; tmp.Resize(phi::make_ddim({hostOutputSize})); dev_ctx.Alloc(&tmp, phi::DataType::INT8); AsyncMemCpyH2D(nullptr, @@ -2233,8 +2233,8 @@ void doSplitTensor(const Context& dev_ctx, } void doExpandTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "tecodnn expand tensor called."; std::vector x_dims = phi::vectorize(x.dims()); @@ -2253,12 +2253,12 @@ void doExpandTensor(const Context& dev_ctx, } void doNearestInterpolateForward(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const float ratio_w, const float ratio_h, const float ratio_d, const 
bool align_corners, - phi::DenseTensor* out, + DenseTensor* out) { // Note: data layout of doNearestInterpolateForward only supports NHWC VLOG(4) << "tecodnn nearest interpolate forward called"; @@ -2308,12 +2308,12 @@ void doNearestInterpolateForward(const Context& dev_ctx, } void doNearestInterpolateBackward(const Context& dev_ctx, - const phi::DenseTensor& out, + const DenseTensor& out, const float ratio_w, const float ratio_h, const float ratio_d, const bool align_corners, - phi::DenseTensor* dx) { + DenseTensor* dx) { // Note: data layout of doNearestInterpolateBackward only supports NHWC VLOG(4) << "tecodnn nearest interpolate backward called"; @@ -2363,10 +2363,10 @@ void doNearestInterpolateBackward(const Context& dev_ctx, } void doBitwiseBinaryOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, BitwiseOpType bitwiseType, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "tecodnn bitwise op tensor called."; std::vector x_dims = phi::vectorize(x.dims()); @@ -2419,9 +2419,9 @@ void doBitwiseBinaryOpTensor(const Context& dev_ctx, } void doBitwiseUnaryOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, BitwiseOpType bitwiseType, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "tecodnn bitwise op tensor called."; std::vector x_dims = phi::vectorize(x.dims()); @@ -2446,17 +2446,17 @@ void doBitwiseUnaryOpTensor(const Context& dev_ctx, TECODNN_CHECK(tecodnnDestroyTensorDescriptor(out_Desc)); } void doLogicalOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, LogicalOpType logicaltype, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "tecodnn logical op tensor called."; std::vector x_dims = phi::vectorize(x.dims()); std::vector y_dims = phi::vectorize(y.dims()); std::vector out_dims = phi::vectorize(out->dims()); - phi::DenseTensor out_int; + DenseTensor out_int; out_int.Resize(out->dims()); dev_ctx.Alloc(&out_int, DataType::INT32); @@ -2525,8 +2525,8 @@ void doLogicalOpTensor(const Context& dev_ctx, } void doLogicalNotOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "tecodnn logical not op tensor called."; if (DataType::BOOL == x.dtype()) { @@ -2550,7 +2550,7 @@ void doLogicalNotOpTensor(const Context& dev_ctx, std::vector x_dims = phi::vectorize(x.dims()); std::vector out_dims = phi::vectorize(out->dims()); - phi::DenseTensor out_int; + DenseTensor out_int; out_int.Resize(out->dims()); dev_ctx.Alloc(&out_int, DataType::INT32); @@ -2569,9 +2569,7 @@ void doLogicalNotOpTensor(const Context& dev_ctx, TECODNN_CHECK(tecodnnDestroyTensorDescriptor(out_Desc)); } -void doIsnanOp(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { +void doIsnanOp(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { VLOG(4) << "tecodnn isnan op called"; // basic settings @@ -2594,7 +2592,7 @@ void doIsnanOp(const Context& dev_ctx, int64_t doAddStorageProperties( const Context& dev_ctx, - phi::DenseTensor* tensor, + DenseTensor* tensor, SDAAStorageProperties& storage_properties) { // NOLINT PADDLE_ENFORCE( tensor->valid(), @@ -2616,7 +2614,7 @@ int64_t doAddStorageProperties( } void swapTensorData(const Context& dev_ctx, - const phi::DenseTensor& in, + const DenseTensor& in, SDAAStorageProperties& storage_properties) { // NOLINT Convert_TF tf;
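For reference, the Convert_TF::NCHW2CHWN conversion used by swapTensorData reorders dims (N, C, H, W) into (C, H, W, N). A minimal standalone sketch of that permutation, assuming the usual gather convention out[i] = in[axis[i]] (hypothetical helper, not part of this patch):

#include <cstdint>
#include <vector>

// Reorders NCHW extents into CHWN order, mirroring what
// doDimPermute(x, Convert_TF::NCHW2CHWN) computes from x.dims().
std::vector<int64_t> PermuteNCHW2CHWN(const std::vector<int64_t>& nchw) {
  const int axis[4] = {1, 2, 3, 0};  // gather map: output i <- input axis[i]
  std::vector<int64_t> chwn(4);
  for (int i = 0; i < 4; ++i) chwn[i] = nchw[axis[i]];
  return chwn;  // e.g. {8, 3, 32, 32} -> {3, 32, 32, 8}
}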
switch (storage_properties.storage_format) { @@ -2627,9 +2625,9 @@ void swapTensorData, PADDLE_THROW(phi::errors::InvalidArgument("invalid storage format")); break; } - phi::DenseTensor* temp_in = const_cast(&in); - phi::DenseTensor trans_in; - phi::DenseTensorMeta meta_in = {in.dtype(), doDimPermute(in, tf)}; + DenseTensor* temp_in = const_cast(&in); + DenseTensor trans_in; + DenseTensorMeta meta_in = {in.dtype(), doDimPermute(in, tf)}; trans_in.set_meta(meta_in); dev_ctx.Alloc(&trans_in, in.dtype()); doTransformTensor(dev_ctx, in, tf, &trans_in); // CHWN @@ -2642,15 +2640,15 @@ } void swapTensorData(const Context& dev_ctx, - const phi::DenseTensor& in, + const DenseTensor& in, SDAAStorageProperties& storage_properties, // NOLINT - phi::DenseTensor* out) { + DenseTensor* out) { Convert_TF tf = Convert_TF::NCHW2CHWN; PADDLE_ENFORCE_EQ(storage_properties.storage_format, StoragePropertiesCHWN, phi::errors::InvalidArgument("invalid storage format!")); - phi::DenseTensorMeta meta_in = {in.dtype(), doDimPermute(in, tf)}; + DenseTensorMeta meta_in = {in.dtype(), doDimPermute(in, tf)}; out->set_meta(meta_in); dev_ctx.Alloc(out, in.dtype()); doTransformTensor(dev_ctx, in, tf, out); // CHWN @@ -2679,20 +2677,20 @@ std::vector GetReduceDimAxis(const phi::DDim& in, } void BatchNormFunc(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& mean, - const phi::DenseTensor& variance, - const phi::DenseTensor& scale, - const phi::DenseTensor& bias, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, float momentum, float epsilon, bool training, const std::string& data_layout_str, - phi::DenseTensor* y, - phi::DenseTensor* mean_out, - phi::DenseTensor* variance_out, - phi::DenseTensor* saved_mean, - phi::DenseTensor* saved_variance) { + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance) { // check arguments const auto& x_dims = x.dims(); PADDLE_ENFORCE_EQ( @@ -2722,7 +2720,7 @@ void BatchNormFunc(const Context& dev_ctx, // since the tecodnnBatchNormForward func only supports 4-D tensors, // a 3-D input must first be padded to 4-D.
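To make that padding step concrete: a 3-D input such as {N, C, L} is viewed as a 4-D shape with a trailing unit extent before the 4-D-only tecodnn routine runs. A minimal sketch, assuming trailing-1 padding matches what the Resize below performs (hypothetical helper, not part of this patch):

#include <cstdint>
#include <vector>

// Pads a shape to rank 4 by appending unit dimensions,
// e.g. {N, C, L} -> {N, C, L, 1}.
std::vector<int64_t> PadTo4D(std::vector<int64_t> dims) {
  while (dims.size() < 4) dims.push_back(1);
  return dims;
}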
- phi::DenseTensor x_temp(x), y_temp(*y); + DenseTensor x_temp(x), y_temp(*y); if (x_dims.size() < 4) { if (need_trans) { x_temp.Resize(phi::make_ddim({N, C, H, W})); @@ -2733,7 +2731,7 @@ void BatchNormFunc(const Context& dev_ctx, } } - phi::DenseTensor x_NHWC, y_NHWC; + DenseTensor x_NHWC, y_NHWC; phi::DDim x_NHWC_dims, y_NHWC_dims; if (need_trans) { @@ -2810,7 +2808,7 @@ void BatchNormFunc(const Context& dev_ctx, void doMemsetTensor(const Context& dev_ctx, const int value, - phi::DenseTensor* tensor) { + DenseTensor* tensor) { tecodnnHandle_t handle = GetHandleFromCTX(dev_ctx); TECODNN_CHECK(tecodnnMemset(handle, tensor->data(), @@ -2820,9 +2818,9 @@ void doMemsetTensor(const Context& dev_ctx, template void doScatterNdAdd(const Context& ctx, - const phi::DenseTensor& index, - const phi::DenseTensor& updates, - phi::DenseTensor* out) { + const DenseTensor& index, + const DenseTensor& updates, + DenseTensor* out) { const auto& index_type = index.dtype(); bool index_type_match = @@ -2879,9 +2877,9 @@ void doScatterNdAdd(const Context& ctx, } template void doScatterNdAdd(const Context& ctx, - const phi::DenseTensor& index, - const phi::DenseTensor& updates, - phi::DenseTensor* out); + const DenseTensor& index, + const DenseTensor& updates, + DenseTensor* out); void GetReduceDimReduceAll(const std::vector& axis_dims, int input_dims_size, @@ -2906,11 +2904,11 @@ void GetReduceDimReduceAll(const std::vector& axis_dims, } void doStrideCopy(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& shape, const std::vector& x_strides, const std::vector& out_strides, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "tecodnn CopyStride op called"; // basic settings @@ -2942,10 +2940,10 @@ void doStrideCopy(const Context& dev_ctx, template void doClipTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, T min, T max, - phi::DenseTensor* out) { + DenseTensor* out) { tecodnnHandle_t tecodnnHandle = GetHandleFromCTX(dev_ctx); std::vector x_dims = phi::vectorize(x.dims()); std::vector out_dims = phi::vectorize(out->dims()); @@ -2954,7 +2952,7 @@ void doClipTensor(const Context& dev_ctx, tecodnnTensorDescriptor_t out_Desc = sdaa_ops::GetTecodnnTensorDesc( out_dims, out->dtype(), TensorFormat::Undefined); - phi::DenseTensor x_temp(x); + DenseTensor x_temp(x); TECODNN_CHECK(tecodnnClampTensor( tecodnnHandle, &min, &max, x_Desc, x_temp.data(), out_Desc, out->data())); @@ -2963,40 +2961,40 @@ void doClipTensor(const Context& dev_ctx, } template void doClipTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int min, int max, - phi::DenseTensor* out); + DenseTensor* out); template void doClipTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int64_t min, int64_t max, - phi::DenseTensor* out); + DenseTensor* out); template void doClipTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float min, float max, - phi::DenseTensor* out); + DenseTensor* out); template void doClipTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, double min, double max, - phi::DenseTensor* out); + DenseTensor* out); template void doClipTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, phi::dtype::float16 min, phi::dtype::float16 max, - phi::DenseTensor* out); + DenseTensor* out); template void doClipTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, 
phi::dtype::bfloat16 min, phi::dtype::bfloat16 max, - phi::DenseTensor* out); + DenseTensor* out); } // namespace sdaa_ops } // namespace custom_kernel diff --git a/backends/sdaa/kernels/funcs/sdaa_baseop.h b/backends/sdaa/kernels/funcs/sdaa_baseop.h index 586f7bad4e2..ed1d1ba0d02 100644 --- a/backends/sdaa/kernels/funcs/sdaa_baseop.h +++ b/backends/sdaa/kernels/funcs/sdaa_baseop.h @@ -46,6 +46,8 @@ namespace custom_kernel { using Context = phi::CustomContext; using DataType = phi::DataType; using DataLayout = phi::DataLayout; +using DenseTensor = phi::DenseTensor; +using DenseTensorMeta = phi::DenseTensorMeta; template class MPTypeTrait { @@ -465,113 +467,111 @@ const std::map, Convert_TF> TransposeModeMap = { {{3, 4, 0, 1, 2}, Convert_TF::CHWN2NCHW}}; void doMeanTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& reduce_dims, - phi::DenseTensor* y); + DenseTensor* y); void doSumTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& reduce_dims, - phi::DenseTensor* y); + DenseTensor* y); void doProdTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& reduce_dims, - phi::DenseTensor* y); + DenseTensor* y); void doMinTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& reduce_dims, - phi::DenseTensor* y); + DenseTensor* y); void doMaxTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& reduce_dims, - phi::DenseTensor* y); + DenseTensor* y); void doTransformTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, Convert_TF convert_tf, - phi::DenseTensor* y); + DenseTensor* y); -void doCastTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* y); +void doCastTensor(const Context& dev_ctx, const DenseTensor& x, DenseTensor* y); void doAddTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float alpha, float beta, - phi::DenseTensor* out); + DenseTensor* out); void doActivationForward(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, double factor, ActivationMode activation_mode, NanPropagation nan_propagate, - phi::DenseTensor* out); + DenseTensor* out); void doActivationBackward(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, + const DenseTensor& out, + const DenseTensor& dout, double factor, ActivationMode activation_mode, NanPropagation nan_propagate, - phi::DenseTensor* dx); + DenseTensor* dx); void doUnaryOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float alpha, UnaryOpMode unaryOpMode, - phi::DenseTensor* out); + DenseTensor* out); void doScaleTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float scale, float bias, bool inplace, bool bias_flag, - phi::DenseTensor* out); + DenseTensor* out); void doNegTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out); + const DenseTensor& x, + DenseTensor* out); void doCompareTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, CompareType tct, - phi::DenseTensor* out); + DenseTensor* out); void doOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, OpTensorMode opTensorMode, - phi::DenseTensor*
out); + DenseTensor* out); void doElementAdd(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out); + DenseTensor* out); void doElementSub(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out); + DenseTensor* out); void doElementMul(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out); + DenseTensor* out); void doElementDiv(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out); + DenseTensor* out); tecodnnTensorDescriptor_t GetTecodnnTensorDesc( const std::vector& dims, @@ -591,149 +591,147 @@ tecocustomTensorDescriptor_t GetTecocustomTensorDesc( const std::vector& strides = {}); tecocustomTensorListDescriptor_t GetTecocustomTensorListDesc( - const std::vector& tensor_list, + const std::vector& tensor_list, bool merged_optimizer = false); void doReciprocalTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out); + const DenseTensor& x, + DenseTensor* out); void doSoftmaxForward(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, bool high_precision, - phi::DenseTensor* out); + DenseTensor* out); void doSoftmaxBackward(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, + const DenseTensor& out, + const DenseTensor& dout, int axis, bool high_precision, - phi::DenseTensor* dx); + DenseTensor* dx); void doLogSoftmaxForward(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, - phi::DenseTensor* out); + DenseTensor* out); void doLogSoftmaxBackward(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, + const DenseTensor& out, + const DenseTensor& dout, int axis, - phi::DenseTensor* dx); + DenseTensor* dx); -phi::DDim doDimPermute(const phi::DenseTensor& x, Convert_TF convert_tf); +phi::DDim doDimPermute(const DenseTensor& x, Convert_TF convert_tf); template void doSliceTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axes, const std::vector& starts, const std::vector& ends, const std::vector& strides, const std::vector& decrease_axis, - phi::DenseTensor* out); + DenseTensor* out); void doPaddingTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector>& Paddings, - phi::DenseTensor* out); + DenseTensor* out); void doTransposeTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axis, - phi::DenseTensor* out); + DenseTensor* out); void doLogicTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axis_reduce, TensorLogicType TLT, - phi::DenseTensor* out); + DenseTensor* out); void doConcatTensor(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, int axis, - phi::DenseTensor* out); + DenseTensor* out); void doScatterTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, - const phi::DenseTensor& updates, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& updates, bool overwrite, - phi::DenseTensor* out); + DenseTensor* out); 
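The rest of the patch is mechanical once sdaa_baseop.h introduces the aliases: every phi::DenseTensor / phi::DenseTensorMeta spelling inside namespace custom_kernel can drop its phi:: prefix. A minimal sketch of the alias pattern, with toy stand-ins for the real phi types:

namespace phi {
struct DenseTensor {};      // stand-in for the real phi::DenseTensor
struct DenseTensorMeta {};  // stand-in for the real phi::DenseTensorMeta
}  // namespace phi

namespace custom_kernel {
// The alias must name the phi type explicitly; `using DenseTensor =
// DenseTensor;` does not refer to the phi type and fails to compile.
using DenseTensor = phi::DenseTensor;
using DenseTensorMeta = phi::DenseTensorMeta;

// Declarations in this namespace can now use the short spelling.
void doSomething(const DenseTensor& x, DenseTensor* out);
}  // namespace custom_kernel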
void doSplitTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, - std::vector outs); + std::vector outs); void doExpandTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out); + const DenseTensor& x, + DenseTensor* out); void doNearestInterpolateForward(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const float ratio_w, const float ratio_h, const float ratio_d, const bool align_corners, - phi::DenseTensor* out); + DenseTensor* out); void doNearestInterpolateBackward(const Context& dev_ctx, - const phi::DenseTensor& out, + const DenseTensor& out, const float ratio_w, const float ratio_h, const float ratio_d, const bool align_corners, - phi::DenseTensor* dx); + DenseTensor* dx); void doBitwiseBinaryOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, BitwiseOpType bitwiseType, - phi::DenseTensor* out); + DenseTensor* out); void doBitwiseUnaryOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, BitwiseOpType bitwiseType, - phi::DenseTensor* out); + DenseTensor* out); int64_t doAddStorageProperties( const Context& dev_ctx, - phi::DenseTensor* tensor, + DenseTensor* tensor, SDAAStorageProperties& storage_properties); // NOLINT -void doIsnanOp(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out); +void doIsnanOp(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); void doLogicalOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, LogicalOpType logicaltype, - phi::DenseTensor* out); + DenseTensor* out); void doLogicalNotOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out); + const DenseTensor& x, + DenseTensor* out); void swapTensorData(const Context& dev_ctx, - const phi::DenseTensor& in, + const DenseTensor& in, SDAAStorageProperties& storage_properties); // NOLINT void swapTensorData(const Context& dev_ctx, - const phi::DenseTensor& in, + const DenseTensor& in, SDAAStorageProperties& storage_properties, // NOLINT - phi::DenseTensor* out); + DenseTensor* out); void doAtanTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out); + const DenseTensor& x, + DenseTensor* out); template void doFillTensor(const Context& dev_ctx, T val, phi::DataType dtype, - phi::DenseTensor* out) { + DenseTensor* out) { auto handle = custom_kernel::GetHandleFromCTX(dev_ctx); std::vector out_dims = phi::vectorize(out->dims()); tecodnnTensorDescriptor_t Desc; @@ -753,24 +751,24 @@ std::vector GetReduceDimAxis(const phi::DDim& in, int axis); void BatchNormFunc(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& mean, - const phi::DenseTensor& variance, - const phi::DenseTensor& scale, - const phi::DenseTensor& bias, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, float momentum, float epsilon, bool training, const std::string& data_layout_str, - phi::DenseTensor* y, - phi::DenseTensor* mean_out, - phi::DenseTensor* variance_out, - phi::DenseTensor* saved_mean, - phi::DenseTensor* saved_variance); + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance); void doMemsetTensor(const Context& dev_ctx, const int value, - phi::DenseTensor* 
tensor); + DenseTensor* tensor); void GetReduceDimReduceAll(const std::vector& axis_dims, int input_dims_size, @@ -779,23 +777,23 @@ void GetReduceDimReduceAll(const std::vector& axis_dims, template void doScatterNdAdd(const Context& ctx, - const phi::DenseTensor& index, - const phi::DenseTensor& updates, - phi::DenseTensor* out); + const DenseTensor& index, + const DenseTensor& updates, + DenseTensor* out); void doStrideCopy(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& shape, const std::vector& x_strides, const std::vector& out_strides, - phi::DenseTensor* out); + DenseTensor* out); template void doClipTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, T min, T max, - phi::DenseTensor* out); + DenseTensor* out); } // namespace sdaa_ops } // namespace custom_kernel diff --git a/backends/sdaa/kernels/funcs/sdaa_funcs.h b/backends/sdaa/kernels/funcs/sdaa_funcs.h index 15ff99a6f5a..7554debee63 100644 --- a/backends/sdaa/kernels/funcs/sdaa_funcs.h +++ b/backends/sdaa/kernels/funcs/sdaa_funcs.h @@ -29,34 +29,34 @@ using Context = phi::CustomContext; template void ExpandKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& shape, - phi::DenseTensor* out); + DenseTensor* out); template void NonZeroKernel(const Context& dev_ctx, - const phi::DenseTensor& condition, - phi::DenseTensor* out); + const DenseTensor& condition, + DenseTensor* out); template void SplitWithNumKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int num, const phi::Scalar& axis_scalar, - std::vector outs); + std::vector outs); template void CastKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, phi::DataType out_dtype, - phi::DenseTensor* out); + DenseTensor* out); template -phi::DenseTensor GetReshapeAndExpandTensor(const Context& dev_ctx, - const phi::DenseTensor& tensor, - const phi::DDim& res_dim, - const phi::DDim& bd_dim, - int index) { +DenseTensor GetReshapeAndExpandTensor(const Context& dev_ctx, + const DenseTensor& tensor, + const phi::DDim& res_dim, + const phi::DDim& bd_dim, + int index) { std::vector before_dims = phi::vectorize(tensor.dims()); std::vector mid_dims(res_dim.size(), 1); @@ -68,14 +68,14 @@ phi::DenseTensor GetReshapeAndExpandTensor(const Context& dev_ctx, mid_dims[index] = before_dims[0]; } - phi::DenseTensor mid_tensor; - phi::DenseTensorMeta meta_mid = {tensor.dtype(), phi::make_ddim(mid_dims)}; + DenseTensor mid_tensor; + DenseTensorMeta meta_mid = {tensor.dtype(), phi::make_ddim(mid_dims)}; mid_tensor.set_meta(meta_mid); phi::ReshapeKernel( dev_ctx, tensor, phi::IntArray(mid_dims), &mid_tensor); - phi::DenseTensor res_tensor; - phi::DenseTensorMeta meta_res = {tensor.dtype(), res_dim}; + DenseTensor res_tensor; + DenseTensorMeta meta_res = {tensor.dtype(), res_dim}; res_tensor.set_meta(meta_res); custom_kernel::ExpandKernel( dev_ctx, mid_tensor, phi::IntArray(phi::vectorize(res_dim)), &res_tensor); @@ -83,11 +83,11 @@ phi::DenseTensor GetReshapeAndExpandTensor(const Context& dev_ctx, } template -std::vector DealWithBoolIndices( +std::vector DealWithBoolIndices( const Context& dev_ctx, - const std::vector& indices_v, - std::vector* tmp_indices_v) { - std::vector res; + const std::vector& indices_v, + std::vector* tmp_indices_v) { + std::vector res; bool contains_bool_tensor = false; for (size_t i = 0; i < indices_v.size(); ++i) { @@ -106,20 +106,20 @@ std::vector DealWithBoolIndices( 
phi::errors::InvalidArgument( "the only bool tensor in indices should " "have at least one dimension")); - phi::DenseTensor nonzero_indices; + DenseTensor nonzero_indices; custom_kernel::NonZeroKernel( dev_ctx, *indices_v[i], &nonzero_indices); if (nonzero_indices.numel() == 0) { - std::vector empty_indices; + std::vector empty_indices; return empty_indices; } - std::vector integer_indices(rank, nullptr); + std::vector integer_indices(rank, nullptr); const int tmp_ix = tmp_indices_v->size(); for (int i = 0; i < rank; ++i) { - phi::DenseTensor tmp_index; - phi::DenseTensorMeta meta_tmp_index = { + DenseTensor tmp_index; + DenseTensorMeta meta_tmp_index = { phi::DataType::INT64, phi::make_ddim({nonzero_indices.dims()[0], 1})}; tmp_index.set_meta(meta_tmp_index); @@ -154,7 +154,7 @@ std::vector DealWithBoolIndices( } static phi::DDim BroadCastTensorsDims( - const std::vector& tensors) { + const std::vector& tensors) { int target_rank = 0; for (const auto& tensor : tensors) { target_rank = std::max(target_rank, tensor->dims().size()); @@ -195,11 +195,11 @@ static phi::DDim BroadCastTensorsDims( template void DealWithIndices(const Context& dev_ctx, - const phi::DenseTensor& x, - const std::vector& int_indices_v, - std::vector* res_indices_v, - std::vector* tmp_res_indices_v, - const std::vector& range_tensor_v, + const DenseTensor& x, + const std::vector& int_indices_v, + std::vector* res_indices_v, + std::vector* tmp_res_indices_v, + const std::vector& range_tensor_v, const phi::DDim& bd_dim, std::vector* res_dim_v) { size_t total_dims = x.dims().size(); @@ -211,7 +211,7 @@ void DealWithIndices(const Context& dev_ctx, tmp_x_dims.end()); phi::DDim res_dim = phi::make_ddim(*res_dim_v); for (size_t i = 0; i < int_indices_v.size(); ++i) { - phi::DenseTensor index_tensor; + DenseTensor index_tensor; if (int_indices_v[i]->dtype() == phi::DataType::INT32) { index_tensor.Resize(int_indices_v[i]->dims()); custom_kernel::CastKernel( @@ -234,8 +234,8 @@ } else { for (size_t i = 0; i < int_indices_v.size(); ++i) { - phi::DenseTensor index_tensor; - phi::DenseTensor expand_index; + DenseTensor index_tensor; + DenseTensor expand_index; if (int_indices_v[i]->dtype() == phi::DataType::INT32) { index_tensor.Resize(int_indices_v[i]->dims()); custom_kernel::CastKernel( @@ -244,8 +244,8 @@ index_tensor = *int_indices_v[i]; } if (bd_dim != int_indices_v[i]->dims()) { - phi::DenseTensor expand_index; - phi::DenseTensorMeta meta_ei = {phi::DataType::INT64, bd_dim}; + DenseTensor expand_index; + DenseTensorMeta meta_ei = {phi::DataType::INT64, bd_dim}; expand_index.set_meta(meta_ei); custom_kernel::ExpandKernel( dev_ctx, @@ -270,9 +270,9 @@ */ template inline void TensorCopy(const Context& dev_ctx, - const phi::DenseTensor& src, + const DenseTensor& src, bool blocking, - phi::DenseTensor* dst, + DenseTensor* dst, const phi::Place& dst_place = phi::CustomPlace()) { auto* src_ptr = src.data(); if (src_ptr == nullptr) { @@ -291,7 +291,7 @@ inline void TensorCopy(const Context& dev_ctx, } else { VLOG(6) << "Src and dst are the same Tensor, in-place copy data(" << src_ptr << ") from " << src_place << " to " << dst_place_; - const phi::DenseTensor src_copy = src; + const DenseTensor src_copy = src; TensorCopy(dev_ctx, src_copy, blocking, dst, dst_place_); } return; @@ -372,7 +372,7 @@ void TensorFromArray(const phi::CustomContext& ctx, const T* src, const size_t& array_size, const
phi::CustomContext& dev_ctx, - phi::DenseTensor* dst) { + DenseTensor* dst) { VLOG(4) << "TensorFromArray start"; auto dst_place = dev_ctx.GetPlace(); auto src_ptr = static_cast(src); @@ -400,7 +400,7 @@ template inline void TensorFromVector(const phi::CustomContext& ctx, const std::vector& src, const phi::CustomContext& dev_ctx, - phi::DenseTensor* dst) { + DenseTensor* dst) { auto dst_place = dev_ctx.GetPlace(); auto src_ptr = static_cast(src.data()); dst->Resize({static_cast(src.size())}); @@ -425,7 +425,7 @@ inline void TensorFromVector(const phi::CustomContext& ctx, */ template inline void TensorToVector(const phi::CustomContext& ctx, - const phi::DenseTensor& src, + const DenseTensor& src, const phi::CustomContext& dev_ctx, std::vector* dst) { VLOG(4) << "MemCpyD2H start"; @@ -455,7 +455,7 @@ template <> inline void TensorToVector(const phi::CustomContext& ctx, - const phi::DenseTensor& src, + const DenseTensor& src, const phi::CustomContext& dev_ctx, std::vector* dst) { auto src_ptr = static_cast(src.data()); @@ -489,10 +489,9 @@ inline void TensorToVector(const phi::CustomContext& ctx, * only used when the number of tensors is one */ template -inline void TensorFromVectorTensor( const phi::CustomContext& dev_ctx, - const std::vector& src, - phi::DenseTensor* dst) { +inline void TensorFromVectorTensor(const phi::CustomContext& dev_ctx, + const std::vector& src, + DenseTensor* dst) { int n = src.size(); dst->Resize({static_cast(n)}); dev_ctx.template Alloc(dst); @@ -615,7 +614,7 @@ struct MatmulParam { static void setTBlasWorkspace(const Context& dev_ctx, const struct MatmulParam& param, - phi::DenseTensor* workspace) { + DenseTensor* workspace) { CustomSDAAStream_t stream = reinterpret_cast(dev_ctx.stream()); tblasHandle_t tblas_handle = stream->tblasHandle; @@ -644,8 +643,8 @@ static void setTBlasWorkspace(const Context& dev_ctx, if (kWorkspaceSize) { VLOG(4) << "start to allocate memory for tblas's workspace with size: " << kWorkspaceSize / 1024 << " KB."; - phi::DenseTensorMeta w_meta = {phi::DataType::UINT8, - {static_cast(kWorkspaceSize)}}; + DenseTensorMeta w_meta = {phi::DataType::UINT8, + {static_cast(kWorkspaceSize)}}; workspace->set_meta(w_meta); dev_ctx.template Alloc(workspace); @@ -657,7 +656,7 @@ inline static tblasHandle_t GetBlasHandleFromCTX( const Context& dev_ctx, const struct MatmulParam& param, - phi::DenseTensor* workspace) { + DenseTensor* workspace) { CustomSDAAStream_t stream = reinterpret_cast(dev_ctx.stream()); tblasHandle_t& tblas_handle = stream->tblasHandle; @@ -770,10 +769,10 @@ inline void foldNonReduceDims(const std::vector& x_dims, } } template -inline phi::DenseTensor build_dummy_tensor(const Context& dev_ctx, - phi::DataType dtype, - phi::DDim input_dims) { - phi::DenseTensor input_; +inline DenseTensor build_dummy_tensor(const Context& dev_ctx, + phi::DataType dtype, + phi::DDim input_dims) { + DenseTensor input_; input_.Resize(input_dims); dev_ctx.Alloc(&input_, dtype); return input_; diff --git a/backends/sdaa/kernels/funcs/slice_utils.h b/backends/sdaa/kernels/funcs/slice_utils.h index 46323170222..fdcfc7ebf83 100644 --- a/backends/sdaa/kernels/funcs/slice_utils.h +++ b/backends/sdaa/kernels/funcs/slice_utils.h @@ -30,9 +30,9 @@ namespace custom_kernel { -inline phi::DenseTensor Slice(const phi::DenseTensor& src, - int64_t begin_index, - int64_t end_index) { +inline DenseTensor Slice(const DenseTensor& src, + int64_t begin_index, +
int64_t end_index) { auto meta = src.meta(); PADDLE_ENFORCE_GE( begin_index, @@ -57,13 +57,12 @@ inline phi::DenseTensor Slice(const phi::DenseTensor& src, return src; } else { size_t base = src.numel() / meta.dims[0]; - phi::DenseTensor dst(src); + DenseTensor dst(src); phi::DDim dst_dims = meta.dims; dst_dims[0] = end_index - begin_index; size_t dst_offset = meta.offset + begin_index * base * phi::SizeOf(meta.dtype); - phi::DenseTensorMeta dst_meta = { - meta.dtype, dst_dims, meta.layout, dst_offset}; + DenseTensorMeta dst_meta = {meta.dtype, dst_dims, meta.layout, dst_offset}; dst.set_meta(dst_meta); return dst; } diff --git a/backends/sdaa/kernels/funcs/strided_copy_utils.cc b/backends/sdaa/kernels/funcs/strided_copy_utils.cc index fd1c42f5ac2..68c1586360c 100644 --- a/backends/sdaa/kernels/funcs/strided_copy_utils.cc +++ b/backends/sdaa/kernels/funcs/strided_copy_utils.cc @@ -47,7 +47,7 @@ phi::DDim permute(const phi::DDim& dims, const int64_vec& axis) { return out_dim; } -bool is_permute(const phi::DenseTensor& input) { +bool is_permute(const DenseTensor& input) { if (input.meta().is_contiguous()) { return true; } @@ -83,8 +83,7 @@ bool is_permute(const phi::DenseTensor& input) { return is_permute; } -bool is_same_shapes_sizes(const phi::DenseTensor& src, - const phi::DenseTensor& dst) { +bool is_same_shapes_sizes(const DenseTensor& src, const DenseTensor& dst) { if (src.dims() != dst.dims()) { return false; } @@ -97,13 +96,12 @@ bool is_same_shapes_sizes(const phi::DenseTensor& src, return true; } -inline bool is_total_same(const phi::DenseTensor& src, - const phi::DenseTensor& dst) { +inline bool is_total_same(const DenseTensor& src, const DenseTensor& dst) { return is_permute(src) && (src.dtype() == dst.dtype()) && is_same_shapes_sizes(src, dst); } -inline bool check_sdaa_align(const phi::DenseTensor& t) { +inline bool check_sdaa_align(const DenseTensor& t) { constexpr int kSDAAAlignSize = 4; bool align = reinterpret_cast(t.data()) % kSDAAAlignSize == 0; if (align) { @@ -113,8 +111,8 @@ inline bool check_sdaa_align(const phi::DenseTensor& t) { } bool strided_copy(const Context& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst) { + const DenseTensor& src, + DenseTensor* dst) { auto src_place_type = src.place().GetType(); auto dst_place_type = dst->place().GetType(); bool sdaa_place = (src_place_type == phi::AllocationType::CUSTOM) && diff --git a/backends/sdaa/kernels/funcs/strided_copy_utils.h b/backends/sdaa/kernels/funcs/strided_copy_utils.h index c730af0f0d9..5fb0797ad9f 100644 --- a/backends/sdaa/kernels/funcs/strided_copy_utils.h +++ b/backends/sdaa/kernels/funcs/strided_copy_utils.h @@ -43,14 +43,14 @@ using vec_tuple = std::tuple; phi::DDim permute(const phi::DDim& dims, const int64_vec& axis); -bool is_permute(const phi::DenseTensor& input); +bool is_permute(const DenseTensor& input); bool pair_first_down(std::pair pair1, std::pair pair2); bool strided_copy(const Context& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst); + const DenseTensor& src, + DenseTensor* dst); } // namespace sdaa_copy diff --git a/backends/sdaa/kernels/funcs/tblas_baseop.h b/backends/sdaa/kernels/funcs/tblas_baseop.h index ace5db5012a..ff5db3c05e9 100644 --- a/backends/sdaa/kernels/funcs/tblas_baseop.h +++ b/backends/sdaa/kernels/funcs/tblas_baseop.h @@ -70,7 +70,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._n = n; param_pack._Atype 
= TECOBLAS_DATA_FLOAT; @@ -102,7 +102,7 @@ struct TecoBlas { int incr) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans; param_pack._m = m; @@ -139,7 +139,7 @@ struct TecoBlas { VLOG(4) << "use SDAA high performance GEMM"; - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans_a; param_pack._transb = trans_b; @@ -194,7 +194,7 @@ struct TecoBlas { int ldc) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans_a; param_pack._transb = trans_b; @@ -252,7 +252,7 @@ struct TecoBlas { VLOG(4) << "use SDAA high performance GemmStridedBatched"; - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans_a; param_pack._transb = trans_b; @@ -319,7 +319,7 @@ struct TecoBlas { int batch) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans_a; param_pack._transb = trans_b; @@ -380,7 +380,7 @@ struct TecoBlas { VLOG(4) << "use SDAA high performance GemmBatched"; - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans_a; param_pack._transb = trans_b; @@ -473,7 +473,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._m = m; param_pack._n = n; @@ -499,7 +499,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._m = m; param_pack._n = n; @@ -527,7 +527,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._n = n; param_pack._Atype = TECOBLAS_DATA_HALF; @@ -554,7 +554,7 @@ struct TecoBlas { int incr) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans; param_pack._m = m; @@ -589,7 +589,7 @@ struct TecoBlas { int ldc) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans_a; param_pack._transb = trans_b; @@ -647,7 +647,7 @@ struct TecoBlas { VLOG(4) << "use hgemm strided batch"; - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans_a; param_pack._transb = trans_b; @@ -708,7 +708,7 @@ struct TecoBlas { std::unique_lock lock(g_blas_workspace_mutex); VLOG(4) << "use hgemm batched V2"; - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans_a; param_pack._transb = trans_b; @@ -760,7 +760,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._m = m; param_pack._n = n; @@ -785,7 +785,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._m = m; param_pack._n = n; 
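Every TecoBlas entry point in this header repeats the same prologue: take the global workspace mutex, declare a scratch workspace tensor, and fill a MatmulParam before configuring the handle. A condensed sketch of that pattern with simplified stand-in types (not part of this patch):

#include <mutex>

namespace {
std::mutex g_blas_workspace_mutex;       // guards the shared tblas workspace

struct Workspace {};                     // stand-in for the scratch DenseTensor
struct MatmulParam { int _m, _n, _k; };  // subset of the real param pack

void GemmLikeCall(int m, int n, int k) {
  // Serialize the call so concurrent GEMMs cannot race on the shared
  // workspace that setTBlasWorkspace() sizes and allocates.
  std::unique_lock<std::mutex> lock(g_blas_workspace_mutex);
  Workspace workspace;
  MatmulParam param_pack{m, n, k};
  // ... configure the tblas handle with param_pack, then launch ...
  (void)workspace;
}
}  // namespace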
@@ -813,7 +813,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._n = n; param_pack._Atype = TECOBLAS_DATA_FLOAT; @@ -845,7 +845,7 @@ struct TecoBlas { int incr) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans; param_pack._m = m; @@ -882,7 +882,7 @@ struct TecoBlas { VLOG(4) << "use BFloat16 to calculate Gemm."; - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans_a; param_pack._transb = trans_b; @@ -943,7 +943,7 @@ struct TecoBlas { VLOG(4) << "use BFloat16 to calculate GemmStridedBatched."; - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans_a; param_pack._transb = trans_b; @@ -1009,7 +1009,7 @@ struct TecoBlas { VLOG(4) << "use BFloat16 to calculate GemmBatched."; - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._transa = trans_a; param_pack._transb = trans_b; @@ -1068,7 +1068,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._m = m; param_pack._n = n; @@ -1093,7 +1093,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._m = m; param_pack._n = n; @@ -1121,7 +1121,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._m = m; param_pack._n = n; @@ -1146,7 +1146,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._m = m; param_pack._n = n; @@ -1174,7 +1174,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._m = m; param_pack._n = n; @@ -1202,7 +1202,7 @@ struct TecoBlas { void* result) { std::unique_lock lock(g_blas_workspace_mutex); - phi::DenseTensor workspace; + DenseTensor workspace; struct MatmulParam param_pack; param_pack._m = m; param_pack._n = n; @@ -1294,16 +1294,16 @@ void doBroadcastTo(const T* x_ptr, template void Dot(const Context& dev_ctx, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* out) { + const DenseTensor& X, + const DenseTensor& Y, + DenseTensor* out) { sdaa_ops::doMemsetTensor(dev_ctx, static_cast(0), out); int n = X.numel(); int incx = 1, incy = 1; if (out->dtype() == phi::DataType::FLOAT16) { - phi::DenseTensor out_float; + DenseTensor out_float; out_float.Resize(out->dims()); dev_ctx.template Alloc(&out_float); @@ -1312,7 +1312,7 @@ void Dot(const Context& dev_ctx, sdaa_ops::doCastTensor(dev_ctx, out_float, out); } else if (X.dtype() == phi::DataType::BFLOAT16) { - phi::DenseTensor x_float, y_float, out_float; + DenseTensor x_float, y_float, out_float; x_float.Resize(X.dims()); dev_ctx.template Alloc(&x_float); y_float.Resize(Y.dims()); @@ -1339,10 +1339,10 @@ void Dot(const Context& dev_ctx, template void MatVec(const Context& dev_ctx, - const phi::DenseTensor& X, - const 
phi::DenseTensor& Y, + const DenseTensor& X, + const DenseTensor& Y, const bool transpose_x, - phi::DenseTensor* out, + DenseTensor* out, float alpha = 1.0f, float beta = 0.0f) { sdaa_ops::doMemsetTensor(dev_ctx, static_cast(0), out); @@ -1361,7 +1361,7 @@ void MatVec(const Context& dev_ctx, } if (X.dtype() == phi::DataType::BFLOAT16) { - phi::DenseTensor x_float, y_float, out_float; + DenseTensor x_float, y_float, out_float; x_float.Resize(X.dims()); dev_ctx.template Alloc(&x_float); y_float.Resize(Y.dims()); @@ -1405,11 +1405,11 @@ void MatVec(const Context& dev_ctx, template void MatMul2D(const Context& dev_ctx, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, + const DenseTensor& X, + const DenseTensor& Y, const bool transpose_x, const bool transpose_y, - phi::DenseTensor* out, + DenseTensor* out, float alpha = 1.0f, float beta = 0.0f) { if (out->dtype() == phi::DataType::FLOAT32) { @@ -1445,14 +1445,14 @@ void MatMul2D(const Context& dev_ctx, } if (out->dtype() == phi::DataType::FLOAT32 && isEnableHighPerformanceGemm()) { - phi::DenseTensor temp_x; - phi::DenseTensorMeta temp_x_meta = {phi::DataType::FLOAT16, X.dims()}; + DenseTensor temp_x; + DenseTensorMeta temp_x_meta = {phi::DataType::FLOAT16, X.dims()}; temp_x.set_meta(temp_x_meta); dev_ctx.Alloc(&temp_x); sdaa_ops::doCastTensor(dev_ctx, X, &temp_x); - phi::DenseTensor temp_y; - phi::DenseTensorMeta temp_y_meta = {phi::DataType::FLOAT16, Y.dims()}; + DenseTensor temp_y; + DenseTensorMeta temp_y_meta = {phi::DataType::FLOAT16, Y.dims()}; temp_y.set_meta(temp_y_meta); dev_ctx.Alloc(&temp_y); sdaa_ops::doCastTensor(dev_ctx, Y, &temp_y); @@ -1498,7 +1498,7 @@ void MatMulND(const Context& dev_ctx, const std::vector& result, const bool transpose_x, const bool transpose_y, - phi::DenseTensor* out, + DenseTensor* out, float alpha = 1.0f, float beta = 0.0f) { if (out->dtype() == phi::DataType::FLOAT32) { @@ -1550,20 +1550,20 @@ void MatMulND(const Context& dev_ctx, template void DotGradFunction(const Context& dev_ctx, - const phi::DenseTensor& dout, - const phi::DenseTensor& x, - phi::DenseTensor* dy) { + const DenseTensor& dout, + const DenseTensor& x, + DenseTensor* dy) { sdaa_ops::doMemsetTensor(dev_ctx, static_cast(0), dy); sdaa_ops::doElementMul(dev_ctx, dout, x, -1, dy); } template void SingleMatmulWithBatchedMat(const Context& dev_ctx, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, + const DenseTensor& X, + const DenseTensor& Y, const bool transpose_x, const bool transpose_y, - phi::DenseTensor* out, + DenseTensor* out, float alpha = 1.0f, float beta = 0.0f) { VLOG(4) << "mat x batched mat"; @@ -1597,14 +1597,14 @@ void SingleMatmulWithBatchedMat(const Context& dev_ctx, int batch = phi::product(phi::slice_ddim(y_dims, 0, y_dims_size - 2)); if (out->dtype() == phi::DataType::FLOAT32 && isEnableHighPerformanceGemm()) { - phi::DenseTensor temp_x; - phi::DenseTensorMeta temp_x_meta = {phi::DataType::FLOAT16, X.dims()}; + DenseTensor temp_x; + DenseTensorMeta temp_x_meta = {phi::DataType::FLOAT16, X.dims()}; temp_x.set_meta(temp_x_meta); dev_ctx.Alloc(&temp_x); sdaa_ops::doCastTensor(dev_ctx, X, &temp_x); - phi::DenseTensor temp_y; - phi::DenseTensorMeta temp_y_meta = {phi::DataType::FLOAT16, Y.dims()}; + DenseTensor temp_y; + DenseTensorMeta temp_y_meta = {phi::DataType::FLOAT16, Y.dims()}; temp_y.set_meta(temp_y_meta); dev_ctx.Alloc(&temp_y); sdaa_ops::doCastTensor(dev_ctx, Y, &temp_y); @@ -1651,11 +1651,11 @@ void SingleMatmulWithBatchedMat(const Context& dev_ctx, template void BatchedMatmulWithSingleMat(const 
Context& dev_ctx, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, + const DenseTensor& X, + const DenseTensor& Y, const bool transpose_x, const bool transpose_y, - phi::DenseTensor* out, + DenseTensor* out, float alpha = 1.0f, float beta = 0.0f) { VLOG(4) << "batched mat x mat"; @@ -1680,7 +1680,7 @@ void BatchedMatmulWithSingleMat(const Context& dev_ctx, if (!transpose_x) { VLOG(4) << "Matrix A-dimension fusion"; int fused_dims = phi::product(phi::slice_ddim(x_dims, 0, x_dims_size - 1)); - phi::DenseTensor x_temp(X); + DenseTensor x_temp(X); x_temp.Resize({fused_dims, x_dims[x_dims_size - 1]}); tblas_ops::MatMul2D( dev_ctx, x_temp, Y, transpose_x, transpose_y, out, alpha, beta); @@ -1701,14 +1701,14 @@ void BatchedMatmulWithSingleMat(const Context& dev_ctx, int batch = phi::product(phi::slice_ddim(x_dims, 0, x_dims_size - 2)); if (out->dtype() == phi::DataType::FLOAT32 && isEnableHighPerformanceGemm()) { - phi::DenseTensor temp_x; - phi::DenseTensorMeta temp_x_meta = {phi::DataType::FLOAT16, X.dims()}; + DenseTensor temp_x; + DenseTensorMeta temp_x_meta = {phi::DataType::FLOAT16, X.dims()}; temp_x.set_meta(temp_x_meta); dev_ctx.Alloc(&temp_x); sdaa_ops::doCastTensor(dev_ctx, X, &temp_x); - phi::DenseTensor temp_y; - phi::DenseTensorMeta temp_y_meta = {phi::DataType::FLOAT16, Y.dims()}; + DenseTensor temp_y; + DenseTensorMeta temp_y_meta = {phi::DataType::FLOAT16, Y.dims()}; temp_y.set_meta(temp_y_meta); dev_ctx.Alloc(&temp_y); sdaa_ops::doCastTensor(dev_ctx, Y, &temp_y); @@ -1755,11 +1755,11 @@ void BatchedMatmulWithSingleMat(const Context& dev_ctx, template void BatchMatmul(const Context& dev_ctx, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, + const DenseTensor& X, + const DenseTensor& Y, const bool transpose_x, const bool transpose_y, - phi::DenseTensor* out, + DenseTensor* out, float alpha = 1.0f, float beta = 0.0f) { std::vector X_Dims = phi::vectorize(X.dims()); @@ -1779,14 +1779,14 @@ void BatchMatmul(const Context& dev_ctx, tblasOperation_t trans_y = transpose_y ? 
TBLAS_OP_T : TBLAS_OP_N; if (out->dtype() == phi::DataType::FLOAT32 && isEnableHighPerformanceGemm()) { - phi::DenseTensor temp_x; - phi::DenseTensorMeta temp_x_meta = {phi::DataType::FLOAT16, X.dims()}; + DenseTensor temp_x; + DenseTensorMeta temp_x_meta = {phi::DataType::FLOAT16, X.dims()}; temp_x.set_meta(temp_x_meta); dev_ctx.Alloc(&temp_x); sdaa_ops::doCastTensor(dev_ctx, X, &temp_x); - phi::DenseTensor temp_y; - phi::DenseTensorMeta temp_y_meta = {phi::DataType::FLOAT16, Y.dims()}; + DenseTensor temp_y; + DenseTensorMeta temp_y_meta = {phi::DataType::FLOAT16, Y.dims()}; temp_y.set_meta(temp_y_meta); dev_ctx.Alloc(&temp_y); sdaa_ops::doCastTensor(dev_ctx, Y, &temp_y); diff --git a/backends/sdaa/kernels/funcs/tecodnn_conv_impl.h b/backends/sdaa/kernels/funcs/tecodnn_conv_impl.h index 00118ee161d..dff91f7dcb1 100644 --- a/backends/sdaa/kernels/funcs/tecodnn_conv_impl.h +++ b/backends/sdaa/kernels/funcs/tecodnn_conv_impl.h @@ -80,12 +80,12 @@ inline void checkpadding(int* h, int* w, std::vector paddings) { } template -inline phi::DenseTensor build_dummy_tensor(const Context& dev_ctx, - phi::DataType dtype, - phi::Dim dims) { +inline DenseTensor build_dummy_tensor(const Context& dev_ctx, + phi::DataType dtype, + phi::Dim dims) { phi::DDim input_dims(dims); - phi::DenseTensor input_; - phi::DenseTensorMeta input_meta = {dtype, input_dims}; + DenseTensor input_; + DenseTensorMeta input_meta = {dtype, input_dims}; input_.set_meta(input_meta); dev_ctx.template Alloc(&input_); return input_; @@ -173,7 +173,7 @@ inline void check_paddings(int* h, int* w, const std::vector& paddings) { } } -inline void checkdims(const phi::DenseTensor* input, +inline void checkdims(const DenseTensor* input, checkformat check_f, std::string kernel_name) { switch (check_f) { @@ -203,8 +203,8 @@ inline void checkdims(const phi::DenseTensor* input, template void Gen_Tecodnn_Out(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out, + const DenseTensor& x, + DenseTensor* out, bool if_nchw) { phi::DDim out_dims; if (if_nchw) { @@ -212,15 +212,15 @@ void Gen_Tecodnn_Out(const Context& dev_ctx, } else { out_dims = x.dims(); } - phi::DenseTensorMeta out_meta = {x.dtype(), out_dims}; + DenseTensorMeta out_meta = {x.dtype(), out_dims}; out->set_meta(out_meta); dev_ctx.template Alloc(out); } template bool Trans_Xy_Tensor_in(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out, + const DenseTensor& x, + DenseTensor* out, bool is_NCHW = true, bool is_depthwise_conv = false) { phi::DDim out_dims; @@ -238,7 +238,7 @@ bool Trans_Xy_Tensor_in(const Context& dev_ctx, } return true; } - phi::DenseTensorMeta out_meta = {phi::DataType::FLOAT16, out_dims}; + DenseTensorMeta out_meta = {phi::DataType::FLOAT16, out_dims}; out->set_meta(out_meta); dev_ctx.template Alloc(out); @@ -256,8 +256,8 @@ bool Trans_Xy_Tensor_in(const Context& dev_ctx, template void Trans_Xy_Tensor_out(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out, + const DenseTensor& x, + DenseTensor* out, bool is_NCHW = true) { dev_ctx.template Alloc(out); // input -> input_nchw @@ -269,8 +269,8 @@ void Trans_Xy_Tensor_out(const Context& dev_ctx, sdaa_ops::doCastTensor(dev_ctx, x, out); } else if (std::is_same::value && is_NCHW) { // input -> input_float - phi::DenseTensor in_float; - phi::DenseTensorMeta out_meta = {phi::DataType::FLOAT32, x.dims()}; + DenseTensor in_float; + DenseTensorMeta out_meta = {phi::DataType::FLOAT32, x.dims()}; in_float.set_meta(out_meta); dev_ctx.template Alloc(&in_float); 
sdaa_ops::doCastTensor(dev_ctx, x, &in_float); @@ -282,10 +282,10 @@ void Trans_Xy_Tensor_out(const Context& dev_ctx, } template void doConv2dForward(const Context& dev_ctx, - const phi::DenseTensor& in_x_NHWC_HALF, - const phi::DenseTensor& filter_CHWN_HALF, + const DenseTensor& in_x_NHWC_HALF, + const DenseTensor& filter_CHWN_HALF, const phi::DDim& filter_dims, - phi::DenseTensor* out, + DenseTensor* out, int* padA, int* filterStrideA, int* upscaleA, @@ -297,7 +297,7 @@ void doConv2dForward(const Context& dev_ctx, phi::DDim in_dims = in_x_NHWC_HALF.dims(); phi::DDim out_dims = out->dims(); - phi::DenseTensor out_temp; + DenseTensor out_temp; if (output_need_cast_for_group_conv) { out_temp.Resize(out_dims); dev_ctx.template Alloc(&out_temp); @@ -346,7 +346,7 @@ void doConv2dForward(const Context& dev_ctx, y_Desc, algo, &workSpaceSizeInBytes)); - phi::DenseTensor workspace; + DenseTensor workspace; if (workSpaceSizeInBytes != 0) workspace.Resize({static_cast(workSpaceSizeInBytes)}); dev_ctx.Alloc(&workspace, DataType::INT8); @@ -376,8 +376,8 @@ void doConv2dForward(const Context& dev_ctx, template void ConvKernel(const Context& dev_ctx, int Nd, - const phi::DenseTensor& input, - const phi::DenseTensor& filter_t, + const DenseTensor& input, + const DenseTensor& filter_t, const std::vector& strides_t, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -385,7 +385,7 @@ void ConvKernel(const Context& dev_ctx, int groups, bool is_depthwise_conv, const std::string& data_format, - phi::DenseTensor* output) { + DenseTensor* output) { // HIGH PERFORMANCE CONV if (isEnvEnable("HIGH_PERFORMANCE_CONV") && @@ -461,7 +461,7 @@ void ConvKernel(const Context& dev_ctx, bool is_NCHW = !channel_last; phi::DDim filter_chwn_dim; - phi::DenseTensor input_nhwc_half; + DenseTensor input_nhwc_half; bool flag = Trans_Xy_Tensor_in( dev_ctx, input, &input_nhwc_half, is_NCHW, is_depthwise_conv); @@ -469,11 +469,11 @@ void ConvKernel(const Context& dev_ctx, input_nhwc_half = input; } - phi::DenseTensor filter_chwn_half; + DenseTensor filter_chwn_half; if (!filter_t.storage_properties_initialized()) { phi::DDim out_dims = sdaa_ops::doDimPermute(filter_t, Convert_TF::NCHW2CHWN); - phi::DenseTensorMeta out_meta; + DenseTensorMeta out_meta; if (is_depthwise_conv) { out_meta = {filter_t.dtype(), out_dims}; } else { @@ -493,7 +493,7 @@ void ConvKernel(const Context& dev_ctx, if (is_depthwise_conv || (std::is_same::value)) { filter_chwn_half = filter_t; } else { - phi::DenseTensorMeta out_meta = {phi::DataType::FLOAT16, filter_dims}; + DenseTensorMeta out_meta = {phi::DataType::FLOAT16, filter_dims}; filter_chwn_half.set_meta(out_meta); dev_ctx.template Alloc(&filter_chwn_half); sdaa_ops::doCastTensor(dev_ctx, filter_t, &filter_chwn_half); @@ -516,9 +516,9 @@ void ConvKernel(const Context& dev_ctx, output_need_cast_for_group_conv); } else { // NCHW - phi::DenseTensor out_nhwc; + DenseTensor out_nhwc; phi::DDim out_dims = sdaa_ops::doDimPermute(*output, Convert_TF::NCHW2NHWC); - phi::DenseTensorMeta out_meta = {output->dtype(), out_dims}; + DenseTensorMeta out_meta = {output->dtype(), out_dims}; out_nhwc.set_meta(out_meta); dev_ctx.template Alloc(&out_nhwc); doConv2dForward(dev_ctx, @@ -539,18 +539,17 @@ void ConvKernel(const Context& dev_ctx, // compute Conv2dForward } -inline void doConv2dBackwardFilter( - const Context& dev_ctx, - const phi::DenseTensor& input_NHWC_HALF, - const phi::DenseTensor& output_grad_NHWC_HALF, - phi::DenseTensor* filter_grad_CHWN, - const phi::DDim& filter_dims_chwn, - int* 
padA, - int* filterStrideA, - int* upscaleA, - int groups, - int Nd, - bool output_need_cast_for_group_conv) { +inline void doConv2dBackwardFilter(const Context& dev_ctx, + const DenseTensor& input_NHWC_HALF, + const DenseTensor& output_grad_NHWC_HALF, + DenseTensor* filter_grad_CHWN, + const phi::DDim& filter_dims_chwn, + int* padA, + int* filterStrideA, + int* upscaleA, + int groups, + int Nd, + bool output_need_cast_for_group_conv) { tecodnnHandle_t tecodnnHandle = GetHandleFromCTX(dev_ctx); tecodnnTensorDescriptor_t x_Desc, dy_Desc; tecodnnFilterDescriptor_t filterDesc; @@ -568,7 +567,7 @@ inline void doConv2dBackwardFilter( output_grad_NHWC_HALF.dtype(), TensorFormat::NHWC); - phi::DenseTensor filter_grad_CHWN_temp; + DenseTensor filter_grad_CHWN_temp; if (output_need_cast_for_group_conv) { filter_grad_CHWN_temp.Resize(filter_grad_CHWN->dims()); dev_ctx.template Alloc(&filter_grad_CHWN_temp); @@ -609,7 +608,7 @@ inline void doConv2dBackwardFilter( filterDesc, BF_algo, &workSpaceSizeInBytes)); - phi::DenseTensor workspace; + DenseTensor workspace; if (workSpaceSizeInBytes != 0) workspace.Resize({static_cast(workSpaceSizeInBytes)}); dev_ctx.Alloc(&workspace, DataType::INT8); @@ -637,10 +636,10 @@ inline void doConv2dBackwardFilter( } inline void doConv2dBackwardData(const Context& dev_ctx, - const phi::DenseTensor& filter_CHWN_HALF, + const DenseTensor& filter_CHWN_HALF, const phi::DDim& filter_dims_chwn, - const phi::DenseTensor& output_grad_NHWC_HALF, - phi::DenseTensor* input_grad_NHWC, + const DenseTensor& output_grad_NHWC_HALF, + DenseTensor* input_grad_NHWC, int* padA, int* filterStrideA, int* upscaleA, @@ -657,7 +656,7 @@ inline void doConv2dBackwardData(const Context& dev_ctx, TECODNN_CHECK(tecodnnCreateFilterDescriptor(&filterDesc)); TECODNN_CHECK(tecodnnCreateConvolutionDescriptor(&convDesc)); - phi::DenseTensor input_grad_NHWC_temp; + DenseTensor input_grad_NHWC_temp; if (output_need_cast_for_group_conv) { input_grad_NHWC_temp.Resize(input_grad_NHWC->dims()); dev_ctx.template Alloc(&input_grad_NHWC_temp); @@ -704,7 +703,7 @@ inline void doConv2dBackwardData(const Context& dev_ctx, dx_Desc, BD_algo, &workSpaceSizeInBytes)); - phi::DenseTensor workspace; + DenseTensor workspace; if (workSpaceSizeInBytes != 0) workspace.Resize({static_cast(workSpaceSizeInBytes)}); dev_ctx.Alloc(&workspace, DataType::INT8); @@ -733,9 +732,9 @@ inline void doConv2dBackwardData(const Context& dev_ctx, template void ConvBackwardKernel(const Context& dev_ctx, int Nd, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, - const phi::DenseTensor& output_grad, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& output_grad, const std::vector& strides_t, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -743,8 +742,8 @@ void ConvBackwardKernel(const Context& dev_ctx, int groups, bool is_depthwise_conv, const std::string& data_format, - phi::DenseTensor* input_grad, - phi::DenseTensor* filter_grad) { + DenseTensor* input_grad, + DenseTensor* filter_grad) { phi::DDim filter_data_dims; phi::DDim filter_dims; @@ -777,11 +776,11 @@ void ConvBackwardKernel(const Context& dev_ctx, VLOG(4) << "conv backward called" << filter_dims; VLOG(4) << "filter.storage_properties_initialized " << filter.storage_properties_initialized() << std::endl; - phi::DenseTensor filter_chwn_half; + DenseTensor filter_chwn_half; phi::DDim filter_chwn_dim; if (!filter.storage_properties_initialized()) { phi::DDim out_dims = sdaa_ops::doDimPermute(filter, 
Convert_TF::NCHW2CHWN); - phi::DenseTensorMeta out_meta; + DenseTensorMeta out_meta; if (is_depthwise_conv) { out_meta = {filter.dtype(), out_dims}; } else { @@ -800,7 +799,7 @@ void ConvBackwardKernel(const Context& dev_ctx, if (is_depthwise_conv || (std::is_same::value)) { filter_chwn_half = filter; } else { - phi::DenseTensorMeta out_meta = {phi::DataType::FLOAT16, filter_dims}; + DenseTensorMeta out_meta = {phi::DataType::FLOAT16, filter_dims}; filter_chwn_half.set_meta(out_meta); dev_ctx.template Alloc(&filter_chwn_half); sdaa_ops::doCastTensor(dev_ctx, filter, &filter_chwn_half); @@ -859,8 +858,8 @@ void ConvBackwardKernel(const Context& dev_ctx, bool is_NCHW = !channel_last; if (filter_grad) { - phi::DenseTensor input_nhwc_half; - phi::DenseTensor output_grad_nhwc_half; + DenseTensor input_nhwc_half; + DenseTensor output_grad_nhwc_half; dev_ctx.template Alloc(filter_grad); if (Trans_Xy_Tensor_in( dev_ctx, input, &input_nhwc_half, is_NCHW, is_depthwise_conv)) @@ -893,8 +892,8 @@ void ConvBackwardKernel(const Context& dev_ctx, output_need_cast_for_group_conv); } else { // output: filter_grad - phi::DenseTensor filter_grad_chwn; - phi::DenseTensorMeta out_meta = { + DenseTensor filter_grad_chwn; + DenseTensorMeta out_meta = { filter_grad->dtype(), sdaa_ops::doDimPermute(*filter_grad, Convert_TF::NCHW2CHWN)}; filter_grad_chwn.set_meta(out_meta); @@ -918,7 +917,7 @@ void ConvBackwardKernel(const Context& dev_ctx, if (input_grad) { VLOG(4) << "input_grad compute"; dev_ctx.template Alloc(input_grad); - phi::DenseTensor output_grad_nhwc_half; + DenseTensor output_grad_nhwc_half; if (Trans_Xy_Tensor_in(dev_ctx, output_grad, &output_grad_nhwc_half, @@ -939,7 +938,7 @@ void ConvBackwardKernel(const Context& dev_ctx, Nd, output_need_cast_for_group_conv); } else { // NCHW - phi::DenseTensor input_grad_nhwc; + DenseTensor input_grad_nhwc; Gen_Tecodnn_Out(dev_ctx, *input_grad, &input_grad_nhwc, is_NCHW); doConv2dBackwardData(dev_ctx, filter_chwn_half, diff --git a/backends/sdaa/kernels/gather_kernel.cc b/backends/sdaa/kernels/gather_kernel.cc index 7f4671efb6b..b5270a54746 100644 --- a/backends/sdaa/kernels/gather_kernel.cc +++ b/backends/sdaa/kernels/gather_kernel.cc @@ -33,10 +33,10 @@ namespace custom_kernel { template void doGatherTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, + const DenseTensor& x, + const DenseTensor& index, const phi::Scalar& axis, - phi::DenseTensor* out) { + DenseTensor* out) { int axis_ = axis.to(); std::vector x_dims = phi::vectorize(x.dims()); std::vector index_dims = phi::vectorize(index.dims()); @@ -66,10 +66,10 @@ void doGatherTensor(const Context& dev_ctx, template void GatherKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, + const DenseTensor& x, + const DenseTensor& index, const phi::Scalar& axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA GatherKernel"; dev_ctx.template Alloc(out); @@ -80,11 +80,11 @@ void GatherKernel(const Context& dev_ctx, template void GatherGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& dout, const phi::Scalar& axis, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "CALL SDAA GatherGradKernel"; dev_ctx.template Alloc(dx); @@ -113,8 +113,8 @@ void GatherGradKernel(const Context& dev_ctx, index_dims.size())); } - phi::DenseTensor zeroslike_x; - phi::DenseTensorMeta 
meta = {x.dtype(), x.dims()}; + DenseTensor zeroslike_x; + DenseTensorMeta meta = {x.dtype(), x.dims()}; zeroslike_x.set_meta(meta); dev_ctx.template Alloc(&zeroslike_x); diff --git a/backends/sdaa/kernels/gather_nd_kernel.cc b/backends/sdaa/kernels/gather_nd_kernel.cc index 09a30a38d06..19c98eaff54 100644 --- a/backends/sdaa/kernels/gather_nd_kernel.cc +++ b/backends/sdaa/kernels/gather_nd_kernel.cc @@ -30,16 +30,16 @@ namespace custom_kernel { template void GatherNdKernel(const Context &dev_ctx, - const phi::DenseTensor &x, - const phi::DenseTensor &index, - phi::DenseTensor *out) { + const DenseTensor &x, + const DenseTensor &index, + DenseTensor *out) { VLOG(4) << "Call SDAA GatherNdKernel"; dev_ctx.template Alloc(out); if (x.numel() == 0) return; if (index.numel() == 0) { - phi::DenseTensor x_temp(x); + DenseTensor x_temp(x); sdaa_ops::doExpandTensor(dev_ctx, x_temp, out); return; } @@ -84,10 +84,10 @@ void GatherNdKernel(const Context &dev_ctx, template void GatherNdGradKernel(const Context &ctx, - const phi::DenseTensor &x UNUSED, - const phi::DenseTensor &index, - const phi::DenseTensor &out_grad, - phi::DenseTensor *x_grad) { + const DenseTensor &x UNUSED, + const DenseTensor &index, + const DenseTensor &out_grad, + DenseTensor *x_grad) { VLOG(4) << "Call SDAA GatherNdGradKernel"; ctx.template Alloc(x_grad); @@ -131,8 +131,8 @@ void GatherNdGradKernel(const Context &ctx, tecodnnTensorDescriptor_t x_grad_desc = sdaa_ops::GetTecodnnTensorDesc( x_grad_dims, x_grad->dtype(), TensorFormat::Undefined); - phi::DenseTensor x_tmp; - phi::DenseTensorMeta temp_x_meta = {x_grad->dtype(), x_grad->dims()}; + DenseTensor x_tmp; + DenseTensorMeta temp_x_meta = {x_grad->dtype(), x_grad->dims()}; x_tmp.set_meta(temp_x_meta); ctx.template Alloc(&x_tmp); sdaa_ops::doFillTensor(ctx, static_cast(0), x_grad->dtype(), &x_tmp); diff --git a/backends/sdaa/kernels/gaussian_random_kernel.cc b/backends/sdaa/kernels/gaussian_random_kernel.cc index a80c9637563..875f18f80c4 100644 --- a/backends/sdaa/kernels/gaussian_random_kernel.cc +++ b/backends/sdaa/kernels/gaussian_random_kernel.cc @@ -26,7 +26,7 @@ void GaussianRandomAlign(const Context& dev_ctx, float mean, float stddev, const char* mode, - phi::DenseTensor* out) { + DenseTensor* out) { // Align sdaa with NV device uint64_t seed_data; int max_threads, sm_count; @@ -56,7 +56,7 @@ void GaussianRandomAlign(const Context& dev_ctx, << ", block_size=" << block_size << ", grid_size=" << grid_size << ", seed=" << seed << ", offset=" << offset; - phi::DenseTensor float_temp; + DenseTensor float_temp; if (out->dtype() == phi::DataType::FLOAT16 || out->dtype() == phi::DataType::BFLOAT16) { float_temp.Resize(out->dims()); @@ -83,7 +83,7 @@ void GaussianRandomKernel(const Context& dev_ctx, float std, int seed, phi::DataType dtype, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA GaussianRandomKernel"; auto shape_vec = shape.GetData(); @@ -100,7 +100,7 @@ void GaussianRandomKernel(const Context& dev_ctx, return; } std::normal_distribution dist(mean, std); - phi::DenseTensor host_temp; + DenseTensor host_temp; host_temp.Resize(out->dims()); float* data = dev_ctx.template HostAlloc(&host_temp); if (seed == 0) { @@ -117,7 +117,7 @@ void GaussianRandomKernel(const Context& dev_ctx, } if (out->dtype() == phi::DataType::FLOAT16 || out->dtype() == phi::DataType::BFLOAT16) { - phi::DenseTensor float_temp; + DenseTensor float_temp; float_temp.Resize(out->dims()); dev_ctx.template Alloc(&float_temp); phi::Copy(dev_ctx, host_temp, float_temp.place(), false, 
&float_temp); diff --git a/backends/sdaa/kernels/generate_proposals_kernel.cc b/backends/sdaa/kernels/generate_proposals_kernel.cc index df84e441f78..43f939a5066 100644 --- a/backends/sdaa/kernels/generate_proposals_kernel.cc +++ b/backends/sdaa/kernels/generate_proposals_kernel.cc @@ -33,20 +33,20 @@ namespace custom_kernel { template void GenerateProposalsKernel(const Context& dev_ctx, - const phi::DenseTensor& scores, - const phi::DenseTensor& bbox_deltas, - const phi::DenseTensor& im_shape, - const phi::DenseTensor& anchors, - const phi::DenseTensor& variances, + const DenseTensor& scores, + const DenseTensor& bbox_deltas, + const DenseTensor& im_shape, + const DenseTensor& anchors, + const DenseTensor& variances, int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, float eta, bool pixel_offset, - phi::DenseTensor* rpn_rois, - phi::DenseTensor* rpn_roi_probs, - phi::DenseTensor* rpn_rois_num) { + DenseTensor* rpn_rois, + DenseTensor* rpn_roi_probs, + DenseTensor* rpn_rois_num) { VLOG(4) << "Call Sdaa GenerateProposalkKernel"; PADDLE_ENFORCE_EQ( scores.dims().size(), @@ -74,7 +74,7 @@ void GenerateProposalsKernel(const Context& dev_ctx, dev_ctx.template Alloc(rpn_roi_probs); // [N,1] - phi::DenseTensor rpn_roi_num_tmp; + DenseTensor rpn_roi_num_tmp; if (rpn_rois_num) { rpn_rois_num->Resize({num}); dev_ctx.template Alloc(rpn_rois_num); @@ -86,7 +86,7 @@ void GenerateProposalsKernel(const Context& dev_ctx, tecodnnHandle_t tecodnnHandle = GetHandleFromCTX(dev_ctx); // Scores [N,C,H,W] -> [N,H,W,C] - phi::DenseTensor scores_swap, bbox_deltas_swap; + DenseTensor scores_swap, bbox_deltas_swap; std::vector dims = {num, h_score, w_score, c_score}; scores_swap.Resize(phi::make_ddim(dims)); @@ -190,7 +190,7 @@ void GenerateProposalsKernel(const Context& dev_ctx, anchor_Desc, var_Desc, &workSpaceSizeInBytes)); - phi::DenseTensor workspace; + DenseTensor workspace; int8_t* workspace_data = dev_ctx.template Alloc(&workspace, workSpaceSizeInBytes); diff --git a/backends/sdaa/kernels/grid_sample_kernel.cc b/backends/sdaa/kernels/grid_sample_kernel.cc index af86cd6567e..022d4941867 100644 --- a/backends/sdaa/kernels/grid_sample_kernel.cc +++ b/backends/sdaa/kernels/grid_sample_kernel.cc @@ -33,12 +33,12 @@ namespace custom_kernel { template void GridSampleKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& grid, + const DenseTensor& x, + const DenseTensor& grid, const std::string& mode, const std::string& padding_mode, bool align_corners, - phi::DenseTensor* out) { + DenseTensor* out) { using PaddingMode = tecodnnGridSamplePaddingMode_t; using Mode = tecodnnGridSampleInterpolationMode_t; PaddingMode enum_padding_mode; @@ -80,16 +80,15 @@ void GridSampleKernel(const Context& dev_ctx, VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " << out->dims()[2] << "; " << out->dims()[3]; - auto x_temp = phi::DenseTensor(); + auto x_temp = DenseTensor(); auto x_temp_dims = std::vector{n, in_h, in_w, c}; - auto tensor_meta = - phi::DenseTensorMeta{x.dtype(), phi::make_ddim(x_temp_dims)}; + auto tensor_meta = DenseTensorMeta{x.dtype(), phi::make_ddim(x_temp_dims)}; x_temp.set_meta(tensor_meta); dev_ctx.template Alloc(&x_temp); sdaa_ops::doTransformTensor(dev_ctx, x, Convert_TF::NCHW2NHWC, &x_temp); - auto out_temp = phi::DenseTensor{}; - auto out_meta = phi::DenseTensorMeta(x.dtype(), {n, out_h, out_w, c}); + auto out_temp = DenseTensor{}; + auto out_meta = DenseTensorMeta(x.dtype(), {n, out_h, out_w, c}); out_temp.set_meta(out_meta); 
dev_ctx.template Alloc(&out_temp); @@ -124,14 +123,14 @@ void GridSampleKernel(const Context& dev_ctx, template void GridSampleGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& grid, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grad, const std::string& mode, const std::string& padding_mode, bool align_corners, - phi::DenseTensor* x_grad, - phi::DenseTensor* grid_grad) { + DenseTensor* x_grad, + DenseTensor* grid_grad) { using PaddingMode = tecodnnGridSamplePaddingMode_t; using Mode = tecodnnGridSampleInterpolationMode_t; PaddingMode enum_padding_mode; @@ -176,11 +175,10 @@ void GridSampleGradKernel(const Context& dev_ctx, dev_ctx, static_cast(0), phi::CppTypeToDataType::Type(), x_grad); #define NCHW_TRANFORM_NHWC(tensor) \ - auto tensor##_temp = phi::DenseTensor(); \ + auto tensor##_temp = DenseTensor(); \ auto tensor##_temp_dims = \ sdaa_ops::doDimPermute(tensor, Convert_TF::NCHW2NHWC); \ - auto tensor##_meta = \ - phi::DenseTensorMeta{tensor.dtype(), tensor##_temp_dims}; \ + auto tensor##_meta = DenseTensorMeta{tensor.dtype(), tensor##_temp_dims}; \ tensor##_temp.set_meta(tensor##_meta); \ dev_ctx.template Alloc(&tensor##_temp); \ sdaa_ops::doTransformTensor( \ @@ -193,8 +191,8 @@ void GridSampleGradKernel(const Context& dev_ctx, NCHW_TRANFORM_NHWC(x); NCHW_TRANFORM_NHWC(out_grad); - auto x_grad_temp = phi::DenseTensor{}; - auto x_grad_meta = phi::DenseTensorMeta{x.dtype(), {n, in_h, in_w, c}}; + auto x_grad_temp = DenseTensor{}; + auto x_grad_meta = DenseTensorMeta{x.dtype(), {n, in_h, in_w, c}}; x_grad_temp.set_meta(x_grad_meta); dev_ctx.template Alloc(&x_grad_temp); diff --git a/backends/sdaa/kernels/group_norm_kernel.cc b/backends/sdaa/kernels/group_norm_kernel.cc index a6fbd060168..1f3b507e2cb 100644 --- a/backends/sdaa/kernels/group_norm_kernel.cc +++ b/backends/sdaa/kernels/group_norm_kernel.cc @@ -49,15 +49,15 @@ inline bool CheckDNNSupport(const phi::DataType input_dtype, template void GroupNormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& scale, - const paddle::optional& bias, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias, float epsilon, int groups, const std::string& data_layout_str, - phi::DenseTensor* y, - phi::DenseTensor* mean, - phi::DenseTensor* var) { + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var) { VLOG(4) << "CALL SDAA GroupNormKernel."; auto x_dims = x.dims(); @@ -102,7 +102,7 @@ void GroupNormKernel(const Context& dev_ctx, dev_ctx.template Alloc(y); - phi::DenseTensor scale_tensor, bias_tensor, mean_tensor, inv_var_tensor; + DenseTensor scale_tensor, bias_tensor, mean_tensor, inv_var_tensor; if (scale) { scale_tensor = scale.get(); diff --git a/backends/sdaa/kernels/huber_loss_kernel.cc b/backends/sdaa/kernels/huber_loss_kernel.cc index ca3236e0670..292d670b082 100644 --- a/backends/sdaa/kernels/huber_loss_kernel.cc +++ b/backends/sdaa/kernels/huber_loss_kernel.cc @@ -31,11 +31,11 @@ namespace custom_kernel { template void HuberLossKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& label, + const DenseTensor& input, + const DenseTensor& label, float delta, - phi::DenseTensor* out, - phi::DenseTensor* residual) { + DenseTensor* out, + DenseTensor* residual) { VLOG(4) << "CALL SDAA HuberLossKernel."; dev_ctx.template Alloc(out); @@ -71,17 +71,17 @@ void HuberLossKernel(const Context& dev_ctx, template void HuberLossGradKernel(const 
Context& dev_ctx, - const phi::DenseTensor& residual, - const phi::DenseTensor& out_grad, + const DenseTensor& residual, + const DenseTensor& out_grad, float delta, - phi::DenseTensor* input_grad, - phi::DenseTensor* label_grad) { + DenseTensor* input_grad, + DenseTensor* label_grad) { VLOG(4) << "CALL SDAA HuberLossGradKernel"; void* input_grad_ptr = nullptr; void* label_grad_ptr = nullptr; - phi::DenseTensor input_grad_temp, label_grad_temp; + DenseTensor input_grad_temp, label_grad_temp; if (input_grad) { dev_ctx.template Alloc(input_grad); diff --git a/backends/sdaa/kernels/identity_kernel.cc b/backends/sdaa/kernels/identity_kernel.cc index 9c7263accb2..ff76656e1be 100644 --- a/backends/sdaa/kernels/identity_kernel.cc +++ b/backends/sdaa/kernels/identity_kernel.cc @@ -33,9 +33,9 @@ namespace custom_kernel { template void NPUIdentityKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const int format, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_ENFORCE_EQ( format == -1 || format == 0, true, @@ -50,7 +50,7 @@ void NPUIdentityKernel(const Context& dev_ctx, } auto storages = x.storage_properties(); phi::DDim x_dims = storages.storage_dims; // CHWN - phi::DenseTensorMeta out_meta; + DenseTensorMeta out_meta; out_meta = {x.dtype(), {x_dims[3], x_dims[0], x_dims[1], x_dims[2]}}; out->set_meta(out_meta); dev_ctx.template Alloc(out, x.numel() * sizeof(T)); diff --git a/backends/sdaa/kernels/increment_kernel.cc b/backends/sdaa/kernels/increment_kernel.cc index ef00194b017..1617dd92c83 100644 --- a/backends/sdaa/kernels/increment_kernel.cc +++ b/backends/sdaa/kernels/increment_kernel.cc @@ -33,15 +33,15 @@ namespace custom_kernel { template void IncrementKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float value, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA IncrementKernel."; dev_ctx.template Alloc(out); - phi::DenseTensor value_tensor; - phi::DenseTensorMeta meta = {x.dtype(), {1}}; + DenseTensor value_tensor; + DenseTensorMeta meta = {x.dtype(), {1}}; value_tensor.set_meta(meta); dev_ctx.template Alloc(&value_tensor); diff --git a/backends/sdaa/kernels/index_put_kernel.cc b/backends/sdaa/kernels/index_put_kernel.cc index d0d94cbb3f1..5a966a9ce82 100644 --- a/backends/sdaa/kernels/index_put_kernel.cc +++ b/backends/sdaa/kernels/index_put_kernel.cc @@ -34,15 +34,15 @@ void doArangeTensor(const Context& dev_ctx, const T& start, const T& end, const T& step, - phi::DenseTensor* out); + DenseTensor* out); template void IndexPutKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const std::vector& indices, - const phi::DenseTensor& value, + const DenseTensor& x, + const std::vector& indices, + const DenseTensor& value, bool accumulate, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA IndexPutKernel."; if (!out->initialized()) { @@ -62,8 +62,8 @@ void IndexPutKernel(const Context& dev_ctx, false, phi::errors::InvalidArgument("Indices cannot be empty.")); - std::vector tmp_args; - std::vector int_indices_v = + std::vector tmp_args; + std::vector int_indices_v = custom_kernel::DealWithBoolIndices( dev_ctx, indices, &tmp_args); @@ -74,13 +74,13 @@ void IndexPutKernel(const Context& dev_ctx, auto bd_dim = custom_kernel::BroadCastTensorsDims(int_indices_v); std::vector res_dim_v(phi::vectorize(bd_dim)); - std::vector res_indices_v(x.dims().size(), nullptr); - std::vector tmp_res_indices_v; - std::vector range_tensor_v; + std::vector res_indices_v(x.dims().size(), 
nullptr); + std::vector tmp_res_indices_v; + std::vector range_tensor_v; for (int i = static_cast(int_indices_v.size()); i < x.dims().size(); ++i) { - phi::DenseTensor range_tensor; + DenseTensor range_tensor; range_tensor.Resize(phi::make_ddim({x.dims()[i]})); dev_ctx.template Alloc(&range_tensor); custom_kernel::doArangeTensor( @@ -96,9 +96,9 @@ void IndexPutKernel(const Context& dev_ctx, range_tensor_v, bd_dim, &res_dim_v); - phi::DenseTensor value_tmp; + DenseTensor value_tmp; if (value.numel() != 1) { - phi::DenseTensorMeta meta = {value.dtype(), phi::make_ddim(res_dim_v)}; + DenseTensorMeta meta = {value.dtype(), phi::make_ddim(res_dim_v)}; value_tmp.set_meta(meta); custom_kernel::ExpandKernel( @@ -119,7 +119,7 @@ void IndexPutKernel(const Context& dev_ctx, indicesDesc.emplace_back(indexDesc); } - phi::DenseTensor index; + DenseTensor index; int64_t index_size = res_indices_v.size() * sizeof(void*); index.Resize({index_size}); dev_ctx.template Alloc(&index); diff --git a/backends/sdaa/kernels/index_sample_kernel.cc b/backends/sdaa/kernels/index_sample_kernel.cc index be0311ba00a..c772cb1e99d 100644 --- a/backends/sdaa/kernels/index_sample_kernel.cc +++ b/backends/sdaa/kernels/index_sample_kernel.cc @@ -34,9 +34,9 @@ namespace custom_kernel { template void IndexSampleKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* out) { VLOG(4) << "Call SDAA IndexSampleKernel"; auto index_type = index.dtype(); diff --git a/backends/sdaa/kernels/index_select_kernel.cc b/backends/sdaa/kernels/index_select_kernel.cc index c4cbece6365..805426e129c 100644 --- a/backends/sdaa/kernels/index_select_kernel.cc +++ b/backends/sdaa/kernels/index_select_kernel.cc @@ -34,10 +34,10 @@ namespace custom_kernel { template void IndexSelectKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, + const DenseTensor& x, + const DenseTensor& index, int dim, - phi::DenseTensor* output) { + DenseTensor* output) { VLOG(4) << "Call SDAA IndexSelectKernel"; if (dim < 0) { diff --git a/backends/sdaa/kernels/instance_norm_kernel.cc b/backends/sdaa/kernels/instance_norm_kernel.cc index fa1f7f51bcc..70a85947459 100644 --- a/backends/sdaa/kernels/instance_norm_kernel.cc +++ b/backends/sdaa/kernels/instance_norm_kernel.cc @@ -33,13 +33,13 @@ namespace custom_kernel { template void InstanceNormKernel(const Context &dev_ctx, - const phi::DenseTensor &x, - const paddle::optional &scale, - const paddle::optional &bias, + const DenseTensor &x, + const paddle::optional &scale, + const paddle::optional &bias, float epsilon_f, - phi::DenseTensor *y, - phi::DenseTensor *saved_mean, - phi::DenseTensor *saved_variance) { + DenseTensor *y, + DenseTensor *saved_mean, + DenseTensor *saved_variance) { // This OP only support NCHW for paddlepaddle. 
VLOG(4) << "CALL SDAA InstanceNormKernel."; @@ -74,14 +74,13 @@ void InstanceNormKernel(const Context &dev_ctx, std::vector scale_bias_dims = {1, 1, 1, C}; std::vector mean_variance_dims = {N, 1, 1, C}; - phi::DenseTensor x_NHWC, y_NHWC; + DenseTensor x_NHWC, y_NHWC; x_NHWC.Resize(phi::make_ddim(x_NHWC_dims)); y_NHWC.Resize(phi::make_ddim(x_NHWC_dims)); dev_ctx.template Alloc(&x_NHWC); dev_ctx.template Alloc(&y_NHWC); - phi::DenseTensor scale_tensor, bias_tensor, saved_mean_tmp, - saved_variance_tmp; + DenseTensor scale_tensor, bias_tensor, saved_mean_tmp, saved_variance_tmp; if (scale) { scale_tensor = scale.get(); } else { @@ -118,7 +117,7 @@ void InstanceNormKernel(const Context &dev_ctx, // tecodnnInstanceNormalizationForwardTraining also calculate moving_mean and // moving_variance. - phi::DenseTensor moving_mean, moving_var; + DenseTensor moving_mean, moving_var; moving_mean.Resize(phi::make_ddim(scale_bias_dims)); dev_ctx.template Alloc(&moving_mean); moving_var.Resize(phi::make_ddim(scale_bias_dims)); @@ -167,17 +166,16 @@ void InstanceNormKernel(const Context &dev_ctx, template void InstanceNormGradKernel(const Context &dev_ctx, - const phi::DenseTensor &x, - const paddle::optional &scale, - const paddle::optional &bias - UNUSED, - const phi::DenseTensor &saved_mean, - const phi::DenseTensor &saved_variance, - const phi::DenseTensor &d_y, + const DenseTensor &x, + const paddle::optional &scale, + const paddle::optional &bias UNUSED, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &d_y, float epsilon_f, - phi::DenseTensor *d_x, - phi::DenseTensor *d_scale, - phi::DenseTensor *d_bias) { + DenseTensor *d_x, + DenseTensor *d_scale, + DenseTensor *d_bias) { VLOG(4) << "CALL SDAA InstanceNormGradKernel."; auto x_dims = x.dims(); @@ -208,7 +206,7 @@ void InstanceNormGradKernel(const Context &dev_ctx, std::vector scale_bias_dims = {1, 1, 1, C}; std::vector mean_variance_dims = {N, 1, 1, C}; - phi::DenseTensor x_NHWC, dy_NHWC, dx_NHWC; + DenseTensor x_NHWC, dy_NHWC, dx_NHWC; x_NHWC.Resize(phi::make_ddim(x_NHWC_dims)); dy_NHWC.Resize(phi::make_ddim(x_NHWC_dims)); dx_NHWC.Resize(phi::make_ddim(x_NHWC_dims)); @@ -216,7 +214,7 @@ void InstanceNormGradKernel(const Context &dev_ctx, dev_ctx.template Alloc(&dy_NHWC); dev_ctx.template Alloc(&dx_NHWC); - phi::DenseTensor scale_tensor, d_scale_temp, d_bias_temp; + DenseTensor scale_tensor, d_scale_temp, d_bias_temp; if (scale) { scale_tensor = scale.get(); diff --git a/backends/sdaa/kernels/interpolate_kernel.cc b/backends/sdaa/kernels/interpolate_kernel.cc index 7c48dc3c0c9..71eba358021 100644 --- a/backends/sdaa/kernels/interpolate_kernel.cc +++ b/backends/sdaa/kernels/interpolate_kernel.cc @@ -63,7 +63,7 @@ inline void ExtractNCDWH(const phi::DDim& dims, inline std::vector GetNewShape( const Context& dev_ctx, - const std::vector& list_new_shape_tensor) { + const std::vector& list_new_shape_tensor) { // get tensor from std::vector vec_new_shape; for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { @@ -76,7 +76,7 @@ inline std::vector GetNewShape( "but received d%.", tensor->dims())); if (src_place.GetType() == phi::AllocationType::CUSTOM) { - phi::DenseTensor temp; + DenseTensor temp; TensorCopy(dev_ctx, *tensor, true, &temp, phi::CPUPlace()); vec_new_shape.push_back(static_cast(*temp.data())); } else { @@ -88,11 +88,11 @@ inline std::vector GetNewShape( } template -inline std::vector GetNewDataFromTensor( - const Context& dev_ctx, const phi::DenseTensor* new_data_tensor) { +inline std::vector 
GetNewDataFromTensor(const Context& dev_ctx, + const DenseTensor* new_data_tensor) { std::vector vec_new_data; auto* new_data = new_data_tensor->data(); - phi::DenseTensor cpu_starts_tensor; + DenseTensor cpu_starts_tensor; const auto& src_place = new_data_tensor->place(); if (src_place.GetType() == phi::AllocationType::CUSTOM) { TensorCopy( @@ -109,10 +109,10 @@ inline std::vector GetNewDataFromTensor( template void NearestInterpKernel( const Context& dev_ctx, - const phi::DenseTensor& input, - const paddle::optional& out_size, - const paddle::optional>& size_tensor, - const paddle::optional& scale_tensor, + const DenseTensor& input, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -121,7 +121,7 @@ void NearestInterpKernel( const std::string& interp_method, bool align_corners, int align_mode, - phi::DenseTensor* output) { + DenseTensor* output) { VLOG(4) << "Call SDAA NearestInterpKernel"; auto* input_data = input.data(); @@ -233,9 +233,9 @@ void NearestInterpKernel( phi::DDim out_NHWC_dims = sdaa_ops::doDimPermute(*output, Convert_TF::NCHW2NHWC); - phi::DenseTensor in_x_NHWC, out_NHWC; - phi::DenseTensorMeta in_x_NHWC_meta = {input.dtype(), in_x_NHWC_dims}; - phi::DenseTensorMeta out_NHWC_meta = {output->dtype(), out_NHWC_dims}; + DenseTensor in_x_NHWC, out_NHWC; + DenseTensorMeta in_x_NHWC_meta = {input.dtype(), in_x_NHWC_dims}; + DenseTensorMeta out_NHWC_meta = {output->dtype(), out_NHWC_dims}; in_x_NHWC.set_meta(in_x_NHWC_meta); out_NHWC.set_meta(out_NHWC_meta); @@ -264,11 +264,11 @@ void NearestInterpKernel( template void NearestInterpGradKernel( const Context& dev_ctx, - const phi::DenseTensor& input, - const paddle::optional& out_size, - const paddle::optional>& size_tensor, - const paddle::optional& scale_tensor, - const phi::DenseTensor& output_grad, + const DenseTensor& input, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, + const DenseTensor& output_grad, const std::string& data_layout_str, int out_d, int out_h, @@ -277,7 +277,7 @@ void NearestInterpGradKernel( const std::string& interp_method, bool align_corners, int align_mode, - phi::DenseTensor* input_grad) { + DenseTensor* input_grad) { VLOG(4) << "Call SDAA NearestInterpGradKernel"; const DataLayout data_layout = common::StringToDataLayout(data_layout_str); @@ -378,9 +378,9 @@ void NearestInterpGradKernel( phi::DDim out_NHWC_dims = sdaa_ops::doDimPermute(*input_grad, Convert_TF::NCHW2NHWC); - phi::DenseTensor in_x_NHWC, out_NHWC; - phi::DenseTensorMeta in_x_NHWC_meta = {output_grad.dtype(), in_x_NHWC_dims}; - phi::DenseTensorMeta out_NHWC_meta = {input_grad->dtype(), out_NHWC_dims}; + DenseTensor in_x_NHWC, out_NHWC; + DenseTensorMeta in_x_NHWC_meta = {output_grad.dtype(), in_x_NHWC_dims}; + DenseTensorMeta out_NHWC_meta = {input_grad->dtype(), out_NHWC_dims}; in_x_NHWC.set_meta(in_x_NHWC_meta); out_NHWC.set_meta(out_NHWC_meta); diff --git a/backends/sdaa/kernels/is_empty_kernel.cc b/backends/sdaa/kernels/is_empty_kernel.cc index a892bb934f9..821422a9012 100644 --- a/backends/sdaa/kernels/is_empty_kernel.cc +++ b/backends/sdaa/kernels/is_empty_kernel.cc @@ -33,8 +33,8 @@ namespace custom_kernel { template void IsEmptyKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "CALL SDAA IsEmptyKernel."; // Note: is_empty is always executed on CPU 
and the output data should diff --git a/backends/sdaa/kernels/isfinite_kernel.cc b/backends/sdaa/kernels/isfinite_kernel.cc index f018c7a6631..1f6dedfadd9 100644 --- a/backends/sdaa/kernels/isfinite_kernel.cc +++ b/backends/sdaa/kernels/isfinite_kernel.cc @@ -31,8 +31,8 @@ namespace custom_kernel { template void IsnanKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA IsnanKernel"; dev_ctx.template Alloc(out); sdaa_ops::doIsnanOp(dev_ctx, x, out); diff --git a/backends/sdaa/kernels/label_smooth_kernel.cc b/backends/sdaa/kernels/label_smooth_kernel.cc index 805678c2ca7..c03d2abf13a 100644 --- a/backends/sdaa/kernels/label_smooth_kernel.cc +++ b/backends/sdaa/kernels/label_smooth_kernel.cc @@ -33,15 +33,15 @@ namespace custom_kernel { template void LabelSmoothKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& dist, + const DenseTensor& x, + const paddle::optional& dist, float epsilon, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA LabelSmoothKernel"; auto label_dim = x.dims()[x.dims().size() - 1]; dev_ctx.template Alloc(out); // (1 − epsilon) ∗ x - phi::DenseTensor x_temp; + DenseTensor x_temp; x_temp.Resize(x.dims()); dev_ctx.template Alloc(&x_temp); sdaa_ops::doUnaryOpTensor( @@ -50,7 +50,7 @@ void LabelSmoothKernel(const Context& dev_ctx, if (dist) { // epsilon * dist auto& dist_tensor = dist.get(); - phi::DenseTensor dist_temp; + DenseTensor dist_temp; dist_temp.Resize({1, label_dim}); dev_ctx.template Alloc(&dist_temp); sdaa_ops::doUnaryOpTensor( @@ -65,9 +65,9 @@ void LabelSmoothKernel(const Context& dev_ctx, template void LabelSmoothGradKernel(const Context& dev_ctx, - const phi::DenseTensor& dout, + const DenseTensor& dout, float epsilon, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "Call SDAA LabelSmoothGradKernel"; dev_ctx.template Alloc(dx); sdaa_ops::doUnaryOpTensor(dev_ctx, dout, 1 - epsilon, UnaryOpMode::MUL_A, dx); diff --git a/backends/sdaa/kernels/layer_norm_kernel.cc b/backends/sdaa/kernels/layer_norm_kernel.cc index 7109ee52656..fd82077995c 100644 --- a/backends/sdaa/kernels/layer_norm_kernel.cc +++ b/backends/sdaa/kernels/layer_norm_kernel.cc @@ -24,14 +24,14 @@ namespace custom_kernel { template void LayerNormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& scale_opt, - const paddle::optional& bias_opt, + const DenseTensor& x, + const paddle::optional& scale_opt, + const paddle::optional& bias_opt, float epsilon, int begin_norm_axis, - phi::DenseTensor* out, - phi::DenseTensor* mean, - phi::DenseTensor* variance) { + DenseTensor* out, + DenseTensor* mean, + DenseTensor* variance) { VLOG(4) << "Call SDAA LayerNormKernel"; // check argument @@ -64,29 +64,29 @@ void LayerNormKernel(const Context& dev_ctx, int right = static_cast(matrix_dim[1]); // set scale to all ones if its none - phi::DenseTensor default_scale; + DenseTensor default_scale; if (!scale) { - phi::DenseTensorMeta default_scale_meta = {x.dtype(), phi::make_ddim(axes)}; + DenseTensorMeta default_scale_meta = {x.dtype(), phi::make_ddim(axes)}; default_scale.set_meta(default_scale_meta); dev_ctx.template Alloc(&default_scale); sdaa_ops::doFillTensor( dev_ctx, static_cast(1.0), DataType::FLOAT32, &default_scale); scale = &default_scale; // shallow copy to scale } else { - const_cast(scale)->Resize(phi::make_ddim(axes)); + const_cast(scale)->Resize(phi::make_ddim(axes)); } // set bias to all zeros if its 
none - phi::DenseTensor default_bias; + DenseTensor default_bias; if (!bias) { - phi::DenseTensorMeta default_bias_meta = {x.dtype(), phi::make_ddim(axes)}; + DenseTensorMeta default_bias_meta = {x.dtype(), phi::make_ddim(axes)}; default_bias.set_meta(default_bias_meta); dev_ctx.template Alloc(&default_bias); sdaa_ops::doFillTensor( dev_ctx, static_cast(0.0), DataType::FLOAT32, &default_bias); bias = &default_bias; // shallow copy to bias } else { - const_cast(bias)->Resize(phi::make_ddim(axes)); + const_cast(bias)->Resize(phi::make_ddim(axes)); } // calculate row and col according to input's shape and axis @@ -137,8 +137,8 @@ void LayerNormKernel(const Context& dev_ctx, variance->data())); // resize scale and bias - const_cast(scale)->Resize(phi::make_ddim({right})); - const_cast(bias)->Resize(phi::make_ddim({right})); + const_cast(scale)->Resize(phi::make_ddim({right})); + const_cast(bias)->Resize(phi::make_ddim({right})); // destroy descriptors TECODNN_CHECK(tecodnnDestroyTensorDescriptor(x_Desc)); @@ -151,17 +151,17 @@ void LayerNormKernel(const Context& dev_ctx, template void LayerNormGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& scale_opt, - const paddle::optional& bias, - const phi::DenseTensor& mean, - const phi::DenseTensor& variance, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const paddle::optional& scale_opt, + const paddle::optional& bias, + const DenseTensor& mean, + const DenseTensor& variance, + const DenseTensor& out_grad, float epsilon, int begin_norm_axis, - phi::DenseTensor* x_grad, - phi::DenseTensor* scale_grad, - phi::DenseTensor* bias_grad) { + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { VLOG(4) << "Call SDAA LayerNormGradKernel"; // check argument @@ -202,24 +202,24 @@ void LayerNormGradKernel(const Context& dev_ctx, // The rank of mean should be equal to x auto mean_dims = mean.dims(); // save its original dims - const_cast(&mean)->Resize(phi::make_ddim({new_shape})); - const_cast(&variance)->Resize(phi::make_ddim({new_shape})); + const_cast(&mean)->Resize(phi::make_ddim({new_shape})); + const_cast(&variance)->Resize(phi::make_ddim({new_shape})); // set scale to all ones if its none - phi::DenseTensor default_scale; + DenseTensor default_scale; if (!scale) { - phi::DenseTensorMeta default_scale_meta = {x.dtype(), phi::make_ddim(axes)}; + DenseTensorMeta default_scale_meta = {x.dtype(), phi::make_ddim(axes)}; default_scale.set_meta(default_scale_meta); dev_ctx.template Alloc(&default_scale); sdaa_ops::doFillTensor( dev_ctx, static_cast(1.0), DataType::FLOAT32, &default_scale); scale = &default_scale; // shallow copy to scale } else { - const_cast(scale)->Resize(phi::make_ddim(axes)); + const_cast(scale)->Resize(phi::make_ddim(axes)); } // set and allocate memory for outputs - phi::DenseTensor x_grad_, scale_grad_, bias_grad_; + DenseTensor x_grad_, scale_grad_, bias_grad_; x_grad = (x_grad == nullptr) ? &x_grad_ : x_grad; scale_grad = (scale_grad == nullptr) ? &scale_grad_ : scale_grad; bias_grad = (bias_grad == nullptr) ? 
&bias_grad_ : bias_grad; @@ -281,9 +281,9 @@ void LayerNormGradKernel(const Context& dev_ctx, bias_grad->data())); // resize tensors - const_cast(&mean)->Resize(mean_dims); - const_cast(&variance)->Resize(mean_dims); - const_cast(scale)->Resize(phi::make_ddim({right})); + const_cast(&mean)->Resize(mean_dims); + const_cast(&variance)->Resize(mean_dims); + const_cast(scale)->Resize(phi::make_ddim({right})); scale_grad->Resize(phi::make_ddim({right})); bias_grad->Resize(phi::make_ddim({right})); diff --git a/backends/sdaa/kernels/linspace_kernel.cc b/backends/sdaa/kernels/linspace_kernel.cc index 869c6cbc8ac..7b100fae8ab 100644 --- a/backends/sdaa/kernels/linspace_kernel.cc +++ b/backends/sdaa/kernels/linspace_kernel.cc @@ -30,7 +30,7 @@ namespace custom_kernel { template -T GetValueOfExpectedType(const Context& ctx, const phi::DenseTensor& x) { +T GetValueOfExpectedType(const Context& ctx, const DenseTensor& x) { switch (x.dtype()) { case DataType::FLOAT32: return static_cast(phi::GetValue(ctx, x)); @@ -52,11 +52,11 @@ T GetValueOfExpectedType(const Context& ctx, const phi::DenseTensor& x) { template void LinspaceKernel(const Context& dev_ctx, - const phi::DenseTensor& start, - const phi::DenseTensor& stop, - const phi::DenseTensor& number, + const DenseTensor& start, + const DenseTensor& stop, + const DenseTensor& number, phi::DataType dtype, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA LinspaceKernel"; T start_value = GetValueOfExpectedType(dev_ctx, start); diff --git a/backends/sdaa/kernels/log_loss_kernel.cc b/backends/sdaa/kernels/log_loss_kernel.cc index d7c16599f53..bb532e19c7b 100644 --- a/backends/sdaa/kernels/log_loss_kernel.cc +++ b/backends/sdaa/kernels/log_loss_kernel.cc @@ -32,17 +32,17 @@ namespace custom_kernel { // out = -label*log(input+epsilon)-(1-label)*log(1-input+epsilon) template void LogLossKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& label, + const DenseTensor& input, + const DenseTensor& label, float epsilon, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA LogLossKernel"; dev_ctx.template Alloc(out); // compute out=-label*log(input+epsilon) - phi::DenseTensor input_temp; + DenseTensor input_temp; input_temp.Resize(input.dims()); dev_ctx.template Alloc(&input_temp); - phi::DenseTensor label_temp; + DenseTensor label_temp; label_temp.Resize(label.dims()); dev_ctx.template Alloc(&label_temp); sdaa_ops::doUnaryOpTensor( @@ -53,7 +53,7 @@ void LogLossKernel(const Context& dev_ctx, sdaa_ops::doUnaryOpTensor(dev_ctx, input_temp, 1.0, UnaryOpMode::LOG, out); sdaa_ops::doElementMul(dev_ctx, *out, label_temp, -1, out); // compute (1-label)*log(1-input+epsilon) - phi::DenseTensor out_2; + DenseTensor out_2; out_2.Resize(out->dims()); dev_ctx.template Alloc(&out_2); @@ -71,18 +71,18 @@ void LogLossKernel(const Context& dev_ctx, // dout/dx = -label*1/(input+epsilon)+(1-label)*1/(1-input+epsilon) template void LogLossGradKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& label, - const phi::DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& out_grad, float epsilon, - phi::DenseTensor* in_grad) { + DenseTensor* in_grad) { VLOG(4) << "Call SDAA LogLossGradKernel"; dev_ctx.template Alloc(in_grad); // compute out=-label*1/(input+epsilon) - phi::DenseTensor input_temp; + DenseTensor input_temp; input_temp.Resize(input.dims()); dev_ctx.template Alloc(&input_temp); - phi::DenseTensor label_temp; + 
DenseTensor label_temp; label_temp.Resize(label.dims()); dev_ctx.template Alloc(&label_temp); sdaa_ops::doUnaryOpTensor( @@ -93,7 +93,7 @@ void LogLossGradKernel(const Context& dev_ctx, dev_ctx, input_temp, 1.0, UnaryOpMode::RDIV, in_grad); sdaa_ops::doElementMul(dev_ctx, *in_grad, label_temp, -1, in_grad); // compute (1-label)*1/(1-input+epsilon) - phi::DenseTensor out_2; + DenseTensor out_2; out_2.Resize(in_grad->dims()); dev_ctx.template Alloc(&out_2); diff --git a/backends/sdaa/kernels/log_softmax_kernel.cc b/backends/sdaa/kernels/log_softmax_kernel.cc index 3df8376237c..f12c461430f 100644 --- a/backends/sdaa/kernels/log_softmax_kernel.cc +++ b/backends/sdaa/kernels/log_softmax_kernel.cc @@ -30,9 +30,9 @@ namespace custom_kernel { template void LogSoftmaxKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA LogSoftmaxKernel"; const int rank = x.dims().size(); @@ -47,8 +47,8 @@ void LogSoftmaxKernel(const Context& dev_ctx, axis += x.dims().size(); } if (axis != x.dims().size() - 1) { - phi::DenseTensor x_temp; - phi::DenseTensor out_temp; + DenseTensor x_temp; + DenseTensor out_temp; std::vector x_dims = phi::vectorize(x.dims()); std::vector axis_vec(x.dims().size()); std::iota(axis_vec.begin(), axis_vec.end(), 0); @@ -81,10 +81,10 @@ void LogSoftmaxKernel(const Context& dev_ctx, template void LogSoftmaxGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& out_grad, + const DenseTensor& out, + const DenseTensor& out_grad, int axis, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { VLOG(4) << "Call SDAA LogSoftmaxGradKernel"; const int rank = out.dims().size(); dev_ctx.template Alloc(x_grad); @@ -99,9 +99,9 @@ void LogSoftmaxGradKernel(const Context& dev_ctx, axis += out.dims().size(); } if (axis != out.dims().size() - 1) { - phi::DenseTensor out_temp; - phi::DenseTensor out_grad_temp; - phi::DenseTensor x_grad_temp; + DenseTensor out_temp; + DenseTensor out_grad_temp; + DenseTensor x_grad_temp; std::vector out_dims = phi::vectorize(out.dims()); std::vector axis_vec(out.dims().size()); diff --git a/backends/sdaa/kernels/logical_kernel.cc b/backends/sdaa/kernels/logical_kernel.cc index 19cc781aa33..920876bd19c 100644 --- a/backends/sdaa/kernels/logical_kernel.cc +++ b/backends/sdaa/kernels/logical_kernel.cc @@ -33,7 +33,7 @@ namespace custom_kernel { #define _dologicalop(type) \ if (x.dims().size() == 0 && y.dims().size() == 0) { \ - phi::DenseTensor x_temp(x), y_temp(y); \ + DenseTensor x_temp(x), y_temp(y); \ x_temp.Resize(phi::make_ddim({1})); \ y_temp.Resize(phi::make_ddim({1})); \ out->Resize(phi::make_ddim({1})); \ @@ -45,9 +45,9 @@ namespace custom_kernel { template void LogicalAndKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "CALL SDAA LogicalAndKernel"; dev_ctx.template Alloc(out); _dologicalop(And); @@ -55,9 +55,9 @@ void LogicalAndKernel(const Context& dev_ctx, template void LogicalOrKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "CALL SDAA LogicalOrKernel"; dev_ctx.template Alloc(out); _dologicalop(Or); @@ -65,9 +65,9 @@ void LogicalOrKernel(const Context& dev_ctx, template void LogicalXorKernel(const Context& dev_ctx, - const 
phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "CALL SDAA LogicalXorKernel"; dev_ctx.template Alloc(out); _dologicalop(Xor); @@ -76,12 +76,12 @@ void LogicalXorKernel(const Context& dev_ctx, template void LogicalNotKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "CALL SDAA LogicalNotKernel"; dev_ctx.template Alloc(out); if (x.dims().size() == 0) { - phi::DenseTensor x_temp(x); + DenseTensor x_temp(x); x_temp.Resize(phi::make_ddim({1})); out->Resize(phi::make_ddim({1})); sdaa_ops::doLogicalNotOpTensor(dev_ctx, x_temp, out); diff --git a/backends/sdaa/kernels/masked_select_kernel.cc b/backends/sdaa/kernels/masked_select_kernel.cc index aa45f1bd385..b363c8b63ce 100644 --- a/backends/sdaa/kernels/masked_select_kernel.cc +++ b/backends/sdaa/kernels/masked_select_kernel.cc @@ -30,9 +30,9 @@ namespace custom_kernel { void doMaskedSelectOpTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& mask, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& mask, + DenseTensor* out) { VLOG(4) << "tecodnn selectmask op called"; auto x_dim = x.dims(); @@ -42,7 +42,7 @@ void doMaskedSelectOpTensor(const Context& dev_ctx, std::vector mask_dims = phi::vectorize(mask_dim); std::vector out_dims = phi::vectorize(out->dims()); - phi::DenseTensor mask_int; + DenseTensor mask_int; mask_int.Resize(mask_dim); dev_ctx.Alloc(&mask_int, DataType::UINT8); sdaa_ops::doCastTensor(dev_ctx, mask, &mask_int); @@ -55,7 +55,7 @@ void doMaskedSelectOpTensor(const Context& dev_ctx, tecodnnTensorDescriptor_t out_Desc = sdaa_ops::GetTecodnnTensorDesc( out_dims, out->dtype(), TensorFormat::NHWC); // int selectCount = 0; - phi::DenseTensor selectCount; + DenseTensor selectCount; selectCount.Resize(phi::make_ddim({1})); dev_ctx.Alloc(&selectCount, DataType::INT32); @@ -75,9 +75,9 @@ void doMaskedSelectOpTensor(const Context& dev_ctx, template void MaskedSelectKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& mask, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& mask, + DenseTensor* out) { VLOG(4) << "CALL SDAA MaskedSelectKernel"; auto input_dim = x.dims(); auto mask_dim = mask.dims(); @@ -91,12 +91,12 @@ void MaskedSelectKernel(const Context& dev_ctx, input_dim, mask_dim)); - phi::DenseTensor mask_int; + DenseTensor mask_int; mask_int.Resize(mask_dim); dev_ctx.Alloc(&mask_int, DataType::INT64); sdaa_ops::doCastTensor(dev_ctx, mask, &mask_int); - phi::DenseTensor nonzconunt; + DenseTensor nonzconunt; nonzconunt.Resize(phi::make_ddim({1})); dev_ctx.Alloc(&nonzconunt, DataType::INT64); @@ -117,10 +117,10 @@ void MaskedSelectKernel(const Context& dev_ctx, template void MaskedSelectGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& mask, - const phi::DenseTensor& out_grad, - phi::DenseTensor* x_grad) { + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& out_grad, + DenseTensor* x_grad) { VLOG(4) << "CALL SDAA MaskedSelectGradKernel"; auto mask_size = mask.numel(); diff --git a/backends/sdaa/kernels/matmul_kernel.cc b/backends/sdaa/kernels/matmul_kernel.cc index 315cd242d13..d9e18a6577c 100644 --- a/backends/sdaa/kernels/matmul_kernel.cc +++ b/backends/sdaa/kernels/matmul_kernel.cc @@ -28,11 +28,11 @@ namespace custom_kernel { template void MatmulKernel(const Context& 
dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, bool transpose_x, bool transpose_y, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA MatmulKernel"; dev_ctx.template Alloc(out); @@ -90,7 +90,7 @@ void MatmulKernel(const Context& dev_ctx, } // Resize dim 1 to 2 - phi::DenseTensor x_temp(x), y_temp; + DenseTensor x_temp(x), y_temp; float unscale_alpha = 1.0 / FLAGS_sdaa_matmul_scale; @@ -302,13 +302,13 @@ void MatmulKernel(const Context& dev_ctx, template void MatmulGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, bool transpose_x, bool transpose_y, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + DenseTensor* dx, + DenseTensor* dy) { VLOG(4) << "Call SDAA MatmulGradKernel"; if (dx) { dev_ctx.template Alloc(dx); @@ -342,7 +342,7 @@ void MatmulGradKernel(const Context& dev_ctx, } // Resize dim 1 to 2 - phi::DenseTensor x_temp(x), y_temp(y), dout_temp, dx_temp, dy_temp; + DenseTensor x_temp(x), y_temp(y), dout_temp, dx_temp, dy_temp; float unscale_alpha = 1.0 / FLAGS_sdaa_matmul_scale; if (FLAGS_sdaa_matmul_scale != 1.0) { @@ -452,7 +452,7 @@ void MatmulGradKernel(const Context& dev_ctx, } else { // 1. [batch, K, M] x [batch, M, N] = [batch, K, N] // 2. [batch, K, N] --> [K, N] - phi::DenseTensor dy_unreduced; + DenseTensor dy_unreduced; auto batch = x_temp.numel() / x_dims[x_ndim - 1] / x_dims[x_ndim - 2]; x_temp.Resize({batch, x_dims[x_ndim - 2], x_dims[x_ndim - 1]}); dout_temp.Resize( @@ -481,7 +481,7 @@ void MatmulGradKernel(const Context& dev_ctx, // 1. [batch, M, N] x [batch, N, K] = [batch, M, K] // 2. [batch, M, K] --> [M, K] if (dx) { - phi::DenseTensor dx_unreduced; + DenseTensor dx_unreduced; auto batch = y_temp.numel() / y_dims[y_ndim - 1] / y_dims[y_ndim - 2]; y_temp.Resize({batch, y_dims[y_ndim - 2], y_dims[y_ndim - 1]}); dout_temp.Resize( @@ -603,9 +603,9 @@ void MatmulGradKernel(const Context& dev_ctx, for (int idx = 0; idx < x_broadcast_dims.size(); idx++) { dx_brd_size *= x_broadcast_dims[idx]; } - phi::DenseTensor dx_brd; + DenseTensor dx_brd; phi::DDim dx_brd_dims = phi::make_ddim(x_broadcast_dims); - phi::DenseTensorMeta dx_meta = {dout.dtype(), dx_brd_dims}; + DenseTensorMeta dx_meta = {dout.dtype(), dx_brd_dims}; dx_brd.set_meta(dx_meta); dev_ctx.template Alloc(&dx_brd); @@ -614,9 +614,9 @@ void MatmulGradKernel(const Context& dev_ctx, for (int idx = 0; idx < y_broadcast_dims.size(); idx++) { dy_brd_size *= y_broadcast_dims[idx]; } - phi::DenseTensor dy_brd; + DenseTensor dy_brd; phi::DDim dy_brd_dims = phi::make_ddim(y_broadcast_dims); - phi::DenseTensorMeta dy_meta = {dout.dtype(), dy_brd_dims}; + DenseTensorMeta dy_meta = {dout.dtype(), dy_brd_dims}; dy_brd.set_meta(dy_meta); dev_ctx.template Alloc(&dy_brd); @@ -791,11 +791,11 @@ void MatmulGradKernel(const Context& dev_ctx, template void MatmulWithFlattenKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int x_num_col_dims, int y_num_col_dims, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA MatmulWithFlattenKernel"; dev_ctx.template Alloc(out); @@ -848,9 +848,9 @@ void MatmulWithFlattenKernel(const Context& dev_ctx, x_matrix_dims[1], y_matrix_dims[0])); - phi::DenseTensor x_matrix(x); + DenseTensor x_matrix(x); x_matrix.Resize(phi::make_ddim(x_matrix_dims)); - 
phi::DenseTensor y_matrix(y); + DenseTensor y_matrix(y); y_matrix.Resize(phi::make_ddim(y_matrix_dims)); tblas_ops::MatMul2D(dev_ctx, x_matrix, y_matrix, false, false, out); @@ -859,13 +859,13 @@ void MatmulWithFlattenKernel(const Context& dev_ctx, template void MatmulWithFlattenGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, int x_num_col_dims, int y_num_col_dims, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + DenseTensor* dx, + DenseTensor* dy) { VLOG(4) << "CALL SDAA MatmulWithFlattenGradKernel"; if (dx) dev_ctx.template Alloc(dx); @@ -905,11 +905,11 @@ void MatmulWithFlattenGradKernel(const Context& dev_ctx, tblas_ops::ReshpaeToMatrix(y_num_col_dims, y_dims, &y_matrix_dims); std::vector dout_temp_dims = {x_matrix_dims[0], y_matrix_dims[1]}; - phi::DenseTensor x_matrix(x); + DenseTensor x_matrix(x); x_matrix.Resize(phi::make_ddim(x_matrix_dims)); - phi::DenseTensor y_matrix(y); + DenseTensor y_matrix(y); y_matrix.Resize(phi::make_ddim(y_matrix_dims)); - phi::DenseTensor dout_temp(dout); + DenseTensor dout_temp(dout); dout_temp.Resize(phi::make_ddim(dout_temp_dims)); if (dx) { diff --git a/backends/sdaa/kernels/memcpy_kernel.cc b/backends/sdaa/kernels/memcpy_kernel.cc index b5ddea91727..8716944b5fe 100644 --- a/backends/sdaa/kernels/memcpy_kernel.cc +++ b/backends/sdaa/kernels/memcpy_kernel.cc @@ -30,9 +30,9 @@ namespace custom_kernel { template void MemcpyKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int dst_place_type, - phi::DenseTensor* out) { + DenseTensor* out) { if (!x.initialized()) { return; } @@ -53,9 +53,9 @@ void MemcpyKernel(const Context& dev_ctx, template void MemcpyH2DKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int dst_place_type, - phi::DenseTensor* out) { + DenseTensor* out) { TensorCopy(dev_ctx, x, false, out, dev_ctx.GetPlace()); dev_ctx.Wait(); } @@ -63,9 +63,9 @@ void MemcpyH2DKernel(const Context& dev_ctx, // used in new executor, for memory copy from device to host template void MemcpyD2HKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int dst_place_type, - phi::DenseTensor* out) { + DenseTensor* out) { TensorCopy(dev_ctx, x, false, out, phi::CPUPlace()); dev_ctx.Wait(); } diff --git a/backends/sdaa/kernels/merged_adam_kernel.cc b/backends/sdaa/kernels/merged_adam_kernel.cc index 37caffecabb..2af214afd80 100644 --- a/backends/sdaa/kernels/merged_adam_kernel.cc +++ b/backends/sdaa/kernels/merged_adam_kernel.cc @@ -35,28 +35,28 @@ namespace custom_kernel { template void MergedAdamKernel( const Context& dev_ctx, - const std::vector& param, - const std::vector& grad, - const std::vector& learning_rate, - const std::vector& moment1, - const std::vector& moment2, - const paddle::optional>& moment2_max, - const std::vector& beta1_pow, - const std::vector& beta2_pow, - const paddle::optional>& master_param, + const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const paddle::optional>& moment2_max, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + const paddle::optional>& master_param, const phi::Scalar& beta1, const phi::Scalar& beta2, const phi::Scalar& epsilon, bool multi_precision, bool use_global_beta_pow, bool amsgrad, - std::vector param_out, - std::vector moment1_out, - std::vector 
moment2_out, - std::vector moment2_max_out, - std::vector beta1_pow_out, - std::vector beta2_pow_out, - std::vector master_param_out) { + std::vector param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector moment2_max_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out, + std::vector master_param_out) { VLOG(4) << "call sdaa MergedAdamKernel"; PADDLE_ENFORCE_NE( amsgrad, @@ -118,11 +118,11 @@ void MergedAdamKernel( "is %d, the size of Input(param) is %d.", beta2_pow.size(), param_num)); - phi::DenseTensor lr, b1_pow, b2_pow; + DenseTensor lr, b1_pow, b2_pow; const int M = param_num; std::vector dims = {M}; phi::DDim dim = phi::make_ddim(dims); - phi::DenseTensorMeta meta = {learning_rate[0]->dtype(), dim}; + DenseTensorMeta meta = {learning_rate[0]->dtype(), dim}; lr.set_meta(meta); b1_pow.set_meta(meta); b2_pow.set_meta(meta); @@ -132,12 +132,12 @@ void MergedAdamKernel( int input_num = 4; void* data[input_num][M]; - std::vector grad_in; + std::vector grad_in; for (int i = 0; i < param_num; ++i) { TensorCopy(dev_ctx, *param[i], false, param_out[i]); TensorCopy(dev_ctx, *moment1[i], false, moment1_out[i]); TensorCopy(dev_ctx, *moment2[i], false, moment2_out[i]); - grad_in.push_back(const_cast(grad[i])); + grad_in.push_back(const_cast(grad[i])); } for (int i = 0; i < M; i++) { @@ -148,7 +148,7 @@ void MergedAdamKernel( } void** pointer[input_num]; - std::vector pointer_data(input_num); + std::vector pointer_data(input_num); int64_t pointer_bytes = M * sizeof(void*); for (int i = 0; i < input_num; ++i) { pointer_data[i].Resize({pointer_bytes}); @@ -170,8 +170,8 @@ void MergedAdamKernel( int64_t num = param_out[i]->numel(); len.push_back(num); } - phi::DenseTensor n_total; - phi::DenseTensorMeta meta1 = {phi::DataType::INT64, dim}; + DenseTensor n_total; + DenseTensorMeta meta1 = {phi::DataType::INT64, dim}; n_total.set_meta(meta1); TensorFromVector(dev_ctx, len, dev_ctx, &n_total); diff --git a/backends/sdaa/kernels/merged_momentum_kernel.cc b/backends/sdaa/kernels/merged_momentum_kernel.cc index 7b3d4bb0206..3a4177df201 100644 --- a/backends/sdaa/kernels/merged_momentum_kernel.cc +++ b/backends/sdaa/kernels/merged_momentum_kernel.cc @@ -22,20 +22,20 @@ namespace custom_kernel { void CheckInputs( - const std::vector& param, - const std::vector& grad, - const std::vector& velocity, - const std::vector& learning_rate, - const paddle::optional>& master_param, + const std::vector& param, + const std::vector& grad, + const std::vector& velocity, + const std::vector& learning_rate, + const paddle::optional>& master_param, float mu, bool use_nesterov, const std::vector& regularization_method, const std::vector& regularization_coeff, bool multi_precision, float rescale_grad, - std::vector param_out, - std::vector velocity_out, - std::vector master_param_out) { + std::vector param_out, + std::vector velocity_out, + std::vector master_param_out) { size_t n = param.size(); PADDLE_ENFORCE_EQ(n, param_out.size(), @@ -122,20 +122,20 @@ void CheckInputs( template void MergedMomentumKernel( const Context& dev_ctx, - const std::vector& param, - const std::vector& grad, - const std::vector& velocity, - const std::vector& learning_rate, - const paddle::optional>& master_param, + const std::vector& param, + const std::vector& grad, + const std::vector& velocity, + const std::vector& learning_rate, + const paddle::optional>& master_param, float mu, bool use_nesterov, const std::vector& regularization_method, const std::vector& regularization_coeff, bool multi_precision, float 
rescale_grad, - std::vector param_out, - std::vector velocity_out, - std::vector master_param_out) { + std::vector param_out, + std::vector velocity_out, + std::vector master_param_out) { size_t param_num = param.size(); const int M = param_num; static bool is_first_time = true; @@ -169,18 +169,18 @@ void MergedMomentumKernel( param_out, velocity_out, std::move(master_param_out)); - phi::DenseTensor lr, coeff, l2_decay; + DenseTensor lr, coeff, l2_decay; phi::DDim dim{M}; - phi::DenseTensorMeta meta(learning_rate[0]->dtype(), dim); + DenseTensorMeta meta(learning_rate[0]->dtype(), dim); lr.set_meta(meta); coeff.set_meta(meta); - phi::DenseTensorMeta l2_meta(phi::DataType::INT32, dim); + DenseTensorMeta l2_meta(phi::DataType::INT32, dim); l2_decay.set_meta(l2_meta); if (learning_rate.size() != 1) { TensorFromVectorTensor(dev_ctx, learning_rate, &lr); } else { - std::vector lr_vec; + std::vector lr_vec; for (int i = 0; i < M; ++i) { lr_vec.push_back(learning_rate[0]); } @@ -203,9 +203,9 @@ void MergedMomentumKernel( TensorFromVector(dev_ctx, l2_decay_vec, dev_ctx, &l2_decay); int input_num = 3; void* data[input_num][M]; - std::vector grad_in; + std::vector grad_in; for (int i = 0; i < param_num; ++i) { - grad_in.push_back(const_cast(grad[i])); + grad_in.push_back(const_cast(grad[i])); } for (int i = 0; i < M; i++) { data[0][i] = param_out[i]->data(); @@ -214,7 +214,7 @@ void MergedMomentumKernel( } void* pointer[input_num]; - std::vector pointer_data(input_num); + std::vector pointer_data(input_num); int64_t pointer_bytes = M * sizeof(void*); for (int i = 0; i < input_num; ++i) { pointer_data[i].Resize({pointer_bytes}); @@ -231,8 +231,8 @@ void MergedMomentumKernel( for (int i = 0; i < M; i++) { len.push_back(param_out[i]->numel()); } - phi::DenseTensor n_total; - phi::DenseTensorMeta meta1(phi::DataType::INT64, dim); + DenseTensor n_total; + DenseTensorMeta meta1(phi::DataType::INT64, dim); n_total.set_meta(meta1); TensorFromVector(dev_ctx, len, dev_ctx, &n_total); sdaaStream_t custom_stream = GetStreamFromCTX(dev_ctx); diff --git a/backends/sdaa/kernels/meshgrid_kernel.cc b/backends/sdaa/kernels/meshgrid_kernel.cc index 9a408eeb577..cd1dae606aa 100644 --- a/backends/sdaa/kernels/meshgrid_kernel.cc +++ b/backends/sdaa/kernels/meshgrid_kernel.cc @@ -34,8 +34,8 @@ namespace custom_kernel { template void MeshgridKernel(const Context& dev_ctx, - const std::vector& inputs, - std::vector outputs) { + const std::vector& inputs, + std::vector outputs) { VLOG(4) << "Call SDAA MeshgridKernel"; int rank = inputs.size(); @@ -72,7 +72,7 @@ void MeshgridKernel(const Context& dev_ctx, memcpy(host_input.data(), input_ptr.data(), inputWorkspaceSize); memcpy(host_output.data(), output_ptr.data(), outputWorkspaceSize); - phi::DenseTensor input_tmp, output_tmp; + DenseTensor input_tmp, output_tmp; input_tmp.Resize(phi::make_ddim({hostInputSize})); output_tmp.Resize(phi::make_ddim({hostOutputSize})); dev_ctx.Alloc(&input_tmp, phi::DataType::INT8); diff --git a/backends/sdaa/kernels/momentum_kernel.cc b/backends/sdaa/kernels/momentum_kernel.cc index 0779698caeb..bac754efc88 100644 --- a/backends/sdaa/kernels/momentum_kernel.cc +++ b/backends/sdaa/kernels/momentum_kernel.cc @@ -22,20 +22,20 @@ namespace custom_kernel { template void MomentumKernel(const Context& dev_ctx, - const phi::DenseTensor& param, - const phi::DenseTensor& grad, - const phi::DenseTensor& velocity, - const phi::DenseTensor& learning_rate, - const paddle::optional& master_param, + const DenseTensor& param, + const DenseTensor& grad, + const 
DenseTensor& velocity,
+                    const DenseTensor& learning_rate,
+                    const paddle::optional<DenseTensor>& master_param,
                     float mu_f,
                     bool use_nesterov,
                     const std::string& regularization_method,
                     float regularization_coeff,
                     bool multi_precision,
                     float rescale_grad,
-                    phi::DenseTensor* param_out,
-                    phi::DenseTensor* velocity_out,
-                    phi::DenseTensor* master_param_out) {
+                    DenseTensor* param_out,
+                    DenseTensor* velocity_out,
+                    DenseTensor* master_param_out) {
   if (isEnvEnable("HIGH_PERFORMANCE_CONV") &&
       (grad.storage_properties_initialized())) {
     SDAAStorageProperties grad_properties =
@@ -59,7 +59,7 @@ void MomentumKernel(const Context& dev_ctx,
   VLOG(4) << "Call SDAA MomentumKernel";
   TensorCopy(dev_ctx, param, false, param_out);
   TensorCopy(dev_ctx, velocity, false, velocity_out);
-  phi::DenseTensor* grad_in = const_cast<phi::DenseTensor*>(&grad);
+  DenseTensor* grad_in = const_cast<DenseTensor*>(&grad);
   bool l2_decay = false;
   if (regularization_method == "l2_decay") {
diff --git a/backends/sdaa/kernels/multiclass_nms3_kernel.cc b/backends/sdaa/kernels/multiclass_nms3_kernel.cc
index b9e097d8298..49665f1b3c1 100644
--- a/backends/sdaa/kernels/multiclass_nms3_kernel.cc
+++ b/backends/sdaa/kernels/multiclass_nms3_kernel.cc
@@ -31,9 +31,9 @@ namespace custom_kernel {
 template <typename T, typename Context>
 void MultiClassNMSKernel(const Context& dev_ctx,
-                         const phi::DenseTensor& bboxes,
-                         const phi::DenseTensor& scores,
-                         const paddle::optional<phi::DenseTensor>& rois_num,
+                         const DenseTensor& bboxes,
+                         const DenseTensor& scores,
+                         const paddle::optional<DenseTensor>& rois_num,
                          float score_threshold,
                          int nms_top_k,
                          int keep_top_k,
@@ -41,9 +41,9 @@ void MultiClassNMSKernel(const Context& dev_ctx,
                          bool normalized,
                          float nms_eta,
                          int background_label,
-                         phi::DenseTensor* out,
-                         phi::DenseTensor* index,
-                         phi::DenseTensor* nms_rois_num) {
+                         DenseTensor* out,
+                         DenseTensor* index,
+                         DenseTensor* nms_rois_num) {
   VLOG(4) << "Call SDAA MultiClassNMSKernel";
   bool return_index = index != nullptr;
@@ -94,7 +94,7 @@ void MultiClassNMSKernel(const Context& dev_ctx,
                                              scores_desc,
                                              rois_num_desc,
                                              &workspace_size));
-  phi::DenseTensor dev_workspace;
+  DenseTensor dev_workspace;
   dev_workspace.Resize(phi::make_ddim({static_cast<int64_t>(workspace_size)}));
   dev_ctx.Alloc(&dev_workspace, phi::DataType::INT8);
@@ -134,18 +134,18 @@ void MultiClassNMSKernel(const Context& dev_ctx,
   tecodnnTensorDescriptor_t nms_rois_num_desc = sdaa_ops::GetTecodnnTensorDesc(
       nms_rois_num_dimensions, DataType::INT32, TensorFormat::Undefined);
-  phi::DenseTensor out_temp;
+  DenseTensor out_temp;
   out_temp.Resize(phi::make_ddim(out_dimensions_temp));
   dev_ctx.template Alloc<T>(&out_temp);
-  phi::DenseTensor index_temp;
+  DenseTensor index_temp;
   // if index is nullptr
   if (return_index) {
     index_temp.Resize(phi::make_ddim(index_dimensions_temp));
     dev_ctx.template Alloc<int>(&index_temp);
   }
-  phi::DenseTensor nms_rois_num_temp;
+  DenseTensor nms_rois_num_temp;
   nms_rois_num_temp.Resize(phi::make_ddim(nms_rois_num_dimensions));
   dev_ctx.template Alloc<int>(&nms_rois_num_temp);
@@ -174,7 +174,7 @@ void MultiClassNMSKernel(const Context& dev_ctx,
   // truncate output, judging the length of out by the sum of nms_rois_num
   // nms_rois_num data is on sdaa, copy from sdaa to CPU
-  phi::DenseTensor nms_rois_num_cpu;
+  DenseTensor nms_rois_num_cpu;
   auto cpu_place = phi::CPUPlace();
   auto custom_place = dev_ctx.GetPlace();
   phi::Copy(dev_ctx, nms_rois_num_temp, cpu_place, true, &nms_rois_num_cpu);
diff --git a/backends/sdaa/kernels/nll_loss_kernel.cc b/backends/sdaa/kernels/nll_loss_kernel.cc
index d1240ac6b22..f086d92f631 100644
--- a/backends/sdaa/kernels/nll_loss_kernel.cc
+++
b/backends/sdaa/kernels/nll_loss_kernel.cc @@ -31,13 +31,13 @@ namespace custom_kernel { template void NllLossRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& labels, - const paddle::optional& weight, + const DenseTensor& x, + const DenseTensor& labels, + const paddle::optional& weight, int64_t ignore_index, const std::string& reduction, - phi::DenseTensor* out, - phi::DenseTensor* total_weight) { + DenseTensor* out, + DenseTensor* total_weight) { VLOG(4) << "Call SDAA NllLossRawKernel"; std::vector x_dims = phi::vectorize(x.dims()); @@ -49,7 +49,7 @@ void NllLossRawKernel(const Context& dev_ctx, int batch_size = x_dims[0]; int n_classes = x_dims[1]; - phi::DenseTensor weight_temp; + DenseTensor weight_temp; if (weight.get_ptr() == nullptr) { std::vector temp = {n_classes}; phi::DDim weight_dims = phi::make_ddim(std::move(temp)); @@ -83,11 +83,11 @@ void NllLossRawKernel(const Context& dev_ctx, labels_dims.emplace_back(1); std::vector weight_dims = phi::vectorize(weight_temp.dims()); - phi::DenseTensor labels_cast; + DenseTensor labels_cast; if (labels.dtype() != phi::DataType::INT32) { // due to tecodnn is only support labels of INT32 -> do cast labels_cast.Resize(labels.dims()); - phi::DenseTensorMeta labels_meta = {phi::DataType::INT32, labels.dims()}; + DenseTensorMeta labels_meta = {phi::DataType::INT32, labels.dims()}; labels_cast.set_meta(labels_meta); dev_ctx.template Alloc(&labels_cast); sdaa_ops::doCastTensor(dev_ctx, labels, &labels_cast); @@ -175,7 +175,7 @@ void NllLossRawKernel(const Context& dev_ctx, total_weight_desc, total_weight->data())); } else { - phi::DenseTensor out_temp; + DenseTensor out_temp; out_temp.Resize(temp_out_dim); dev_ctx.template Alloc(&out_temp); TECODNN_CHECK(tecodnnNLLLoss2dForward(handle, @@ -217,14 +217,14 @@ void NllLossRawKernel(const Context& dev_ctx, template void NllLossGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& labels, - const paddle::optional& weight, - const phi::DenseTensor& total_weight, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& labels, + const paddle::optional& weight, + const DenseTensor& total_weight, + const DenseTensor& dout, int64_t ignore_index, const std::string& reduction, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "Call SDAA NllLossGradKernel"; std::vector x_dims = phi::vectorize(x.dims()); @@ -235,7 +235,7 @@ void NllLossGradKernel(const Context& dev_ctx, int batch_size = x_dims[0]; int n_classes = x_dims[1]; - phi::DenseTensor weight_temp; + DenseTensor weight_temp; if (weight.get_ptr() == nullptr) { std::vector temp = {n_classes}; phi::DDim weight_dims = phi::make_ddim(std::move(temp)); @@ -269,11 +269,11 @@ void NllLossGradKernel(const Context& dev_ctx, labels_dims.emplace_back(1); std::vector weight_dims = phi::vectorize(weight_temp.dims()); - phi::DenseTensor labels_cast; + DenseTensor labels_cast; if (labels.dtype() != phi::DataType::INT32) { // due to tecodnn is only support labels of INT32 -> do cast labels_cast.Resize(labels.dims()); - phi::DenseTensorMeta labels_meta = {phi::DataType::INT32, labels.dims()}; + DenseTensorMeta labels_meta = {phi::DataType::INT32, labels.dims()}; labels_cast.set_meta(labels_meta); dev_ctx.template Alloc(&labels_cast); sdaa_ops::doCastTensor(dev_ctx, labels, &labels_cast); diff --git a/backends/sdaa/kernels/nonzero_kernel.cc b/backends/sdaa/kernels/nonzero_kernel.cc index 6bd66479ba1..56daf9095ad 100644 --- a/backends/sdaa/kernels/nonzero_kernel.cc +++ 
b/backends/sdaa/kernels/nonzero_kernel.cc @@ -32,9 +32,9 @@ namespace custom_kernel { template void doNonZeroTensor(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* nonzeroCount, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* nonzeroCount, + DenseTensor* out) { std::vector x_dims = phi::vectorize(x.dims()); std::vector out_dims = phi::vectorize(out->dims()); bool as_tuple = false; @@ -65,14 +65,14 @@ void doNonZeroTensor(const Context& dev_ctx, template void NonZeroKernel(const Context& dev_ctx, - const phi::DenseTensor& condition, - phi::DenseTensor* out) { + const DenseTensor& condition, + DenseTensor* out) { VLOG(4) << "CALL SDAA NonZeroKernel"; int64_t numel = condition.numel(); int64_t rank = condition.dims().size(); - phi::DenseTensor out_temp, nonzeroCount; + DenseTensor out_temp, nonzeroCount; out_temp.Resize(phi::make_ddim({numel, rank})); dev_ctx.template Alloc(&out_temp); @@ -82,7 +82,7 @@ void NonZeroKernel(const Context& dev_ctx, custom_kernel::doNonZeroTensor( dev_ctx, condition, &nonzeroCount, &out_temp); - phi::DenseTensor nonzeroCountHost; + DenseTensor nonzeroCountHost; phi::Copy(dev_ctx, nonzeroCount, phi::CPUPlace(), true, &nonzeroCountHost); auto nonzeroNum = *nonzeroCountHost.data(); diff --git a/backends/sdaa/kernels/one_hot_kernel.cc b/backends/sdaa/kernels/one_hot_kernel.cc index 5d12f6bc38b..2ee75d63c63 100644 --- a/backends/sdaa/kernels/one_hot_kernel.cc +++ b/backends/sdaa/kernels/one_hot_kernel.cc @@ -42,9 +42,9 @@ __inline__ static bool isEnableParallelTP() { template void doOneHotTensor(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int num_classes, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "call tecodnn OneHot tensor"; std::vector x_dims = phi::vectorize(x.dims()); std::vector out_dims = phi::vectorize(out->dims()); @@ -53,26 +53,26 @@ void doOneHotTensor(const Context& dev_ctx, out_dims.insert(out_dims.begin(), 1); } if (isEnableParallelTP()) { - phi::DenseTensorMeta bound_meta = {x.dtype(), phi::make_ddim({1})}; - phi::DenseTensorMeta compare_meta = {phi::DataType::BOOL, x.dims()}; + DenseTensorMeta bound_meta = {x.dtype(), phi::make_ddim({1})}; + DenseTensorMeta compare_meta = {phi::DataType::BOOL, x.dims()}; - phi::DenseTensor upper_bound; + DenseTensor upper_bound; upper_bound.set_meta(bound_meta); dev_ctx.template Alloc(&upper_bound); sdaa_ops::doFillTensor( dev_ctx, static_cast(num_classes), x.dtype(), &upper_bound); - phi::DenseTensor lower_bound; + DenseTensor lower_bound; lower_bound.set_meta(bound_meta); dev_ctx.template Alloc(&lower_bound); sdaa_ops::doFillTensor( dev_ctx, static_cast(0), x.dtype(), &lower_bound); - phi::DenseTensor x_large; + DenseTensor x_large; x_large.set_meta(compare_meta); dev_ctx.template Alloc(&x_large); - phi::DenseTensor x_min; + DenseTensor x_min; x_min.set_meta(compare_meta); dev_ctx.template Alloc(&x_min); @@ -81,24 +81,24 @@ void doOneHotTensor(const Context& dev_ctx, sdaa_ops::doCompareTensor( dev_ctx, x, lower_bound, CompareType::GreaterEqual, &x_min); - phi::DenseTensor x_true; + DenseTensor x_true; x_true.set_meta(compare_meta); dev_ctx.template Alloc(&x_true); sdaa_ops::doBitwiseBinaryOpTensor( dev_ctx, x_large, x_min, BitwiseOpType::And, &x_true); - phi::DenseTensor x_true_cast; + DenseTensor x_true_cast; x_true_cast.Resize(x.dims()); dev_ctx.Alloc(&x_true_cast, x.dtype()); sdaa_ops::doCastTensor(dev_ctx, x_true, &x_true_cast); - phi::DenseTensor y_true_cast; + DenseTensor y_true_cast; y_true_cast.Resize(x.dims()); 
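  // Bounds-masking recap, as a minimal scalar sketch (x_min comes from the
  // GreaterEqual compare against lower_bound above; x_large is presumably the
  // complementary compare against upper_bound, i.e. x < num_classes):
  //   bool valid = (id >= 0) && (id < num_classes);  // x_min AND x_large
  //   for (int c = 0; c < num_classes; ++c)
  //     row[c] = (valid && c == id) ? 1 : 0;         // zero row if id invalid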
dev_ctx.Alloc(&y_true_cast, out->dtype()); sdaa_ops::doCastTensor(dev_ctx, x_true, &y_true_cast); - phi::DenseTensor real_x_label; + DenseTensor real_x_label; real_x_label.Resize(x.dims()); dev_ctx.Alloc(&real_x_label, x.dtype()); sdaa_ops::doElementMul(dev_ctx, x, x_true_cast, -1, &real_x_label); @@ -107,12 +107,12 @@ void doOneHotTensor(const Context& dev_ctx, size_vec.push_back(1); y_true_cast.Resize(phi::make_ddim({size_vec})); - phi::DenseTensor x_true_expand; + DenseTensor x_true_expand; x_true_expand.Resize(out->dims()); dev_ctx.Alloc(&x_true_expand, out->dtype()); sdaa_ops::doExpandTensor(dev_ctx, y_true_cast, &x_true_expand); - phi::DenseTensor out_tmp; + DenseTensor out_tmp; out_tmp.Resize(out->dims()); dev_ctx.Alloc(&out_tmp, out->dtype()); @@ -148,11 +148,11 @@ void doOneHotTensor(const Context& dev_ctx, template void OneHotRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& depth_scalar, phi::DataType dtype, bool allow_out_of_range, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA OneHotRawKernel"; int depth = depth_scalar.to(); @@ -167,9 +167,9 @@ void OneHotRawKernel(const Context& dev_ctx, template void OneHotKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& num_classes_s, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA OneHotKernel"; custom_kernel::OneHotRawKernel( diff --git a/backends/sdaa/kernels/p_norm_kernel.cc b/backends/sdaa/kernels/p_norm_kernel.cc index b08f6d70c77..586dd45e772 100644 --- a/backends/sdaa/kernels/p_norm_kernel.cc +++ b/backends/sdaa/kernels/p_norm_kernel.cc @@ -34,13 +34,13 @@ namespace custom_kernel { template void PNormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float porder, int axis, float epsilon, bool keepdim, bool asvector, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA PNormKernel"; PADDLE_ENFORCE_LT(x.dims().size(), @@ -85,15 +85,15 @@ void PNormKernel(const Context& dev_ctx, template void PNormGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dy, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dy, float porder, int axis, float epsilon, bool keepdim UNUSED, bool asvector, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "CALL SDAA PNormGradKernel"; dev_ctx.template Alloc(dx); diff --git a/backends/sdaa/kernels/pool2d_kernel.cc b/backends/sdaa/kernels/pool2d_kernel.cc index fa71c3a11ff..da8d4a349e4 100644 --- a/backends/sdaa/kernels/pool2d_kernel.cc +++ b/backends/sdaa/kernels/pool2d_kernel.cc @@ -126,13 +126,13 @@ tecodnnPoolingMode_t GetTecodnnPoolingMode(const std::string& pooling_type, /*The tensor format of this function must be NHWC*/ template void doPoolingForward(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::string& pooling_type, const std::vector& pool2dParameters, bool adaptive, bool exclusive, bool ceil_mode, - phi::DenseTensor* out) { + DenseTensor* out) { std::vector x_dims = phi::vectorize(x.dims()); std::vector out_dims = phi::vectorize(out->dims()); @@ -200,15 +200,15 @@ void doPoolingForward(const Context& dev_ctx, /*The tensor format of this function must be NHWC*/ template void doPoolingBackward(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& out, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& out, + const 
DenseTensor& out_grad, const std::string& pooling_type, const std::vector& pool2dParameters, bool adaptive, bool exclusive, bool ceil_mode, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { std::vector x_dims = phi::vectorize(x.dims()); std::vector out_dims = phi::vectorize(out.dims()); std::vector out_grad_dims = phi::vectorize(out_grad.dims()); @@ -289,7 +289,7 @@ void doPoolingBackward(const Context& dev_ctx, template void Pool2dKernel(const Context& dev_ctx, - const phi::DenseTensor& in_x, + const DenseTensor& in_x, const phi::IntArray& kernel_size, const std::vector& strides_t_64, const std::vector& paddings_t_64, @@ -300,7 +300,7 @@ void Pool2dKernel(const Context& dev_ctx, bool global_pooling, bool adaptive, const std::string& padding_algorithm, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA Pool2dKernel"; std::vector strides_t = std::vector(strides_t_64.begin(), strides_t_64.end()); @@ -400,9 +400,9 @@ void Pool2dKernel(const Context& dev_ctx, phi::DDim out_NHWC_dims = sdaa_ops::doDimPermute(*out, Convert_TF::NCHW2NHWC); - phi::DenseTensor in_x_NHWC, out_NHWC; - phi::DenseTensorMeta in_x_NHWC_meta = {in_x.dtype(), in_x_NHWC_dims}; - phi::DenseTensorMeta out_NHWC_meta = {out->dtype(), out_NHWC_dims}; + DenseTensor in_x_NHWC, out_NHWC; + DenseTensorMeta in_x_NHWC_meta = {in_x.dtype(), in_x_NHWC_dims}; + DenseTensorMeta out_NHWC_meta = {out->dtype(), out_NHWC_dims}; in_x_NHWC.set_meta(in_x_NHWC_meta); out_NHWC.set_meta(out_NHWC_meta); @@ -436,9 +436,9 @@ void Pool2dKernel(const Context& dev_ctx, template void Pool2dGradKernel(const Context& dev_ctx, - const phi::DenseTensor& in_x, - const phi::DenseTensor& out, - const phi::DenseTensor& out_grad, + const DenseTensor& in_x, + const DenseTensor& out, + const DenseTensor& out_grad, const phi::IntArray& kernel_size, const std::vector& strides_t, const std::vector& paddings_t, @@ -449,7 +449,7 @@ void Pool2dGradKernel(const Context& dev_ctx, bool global_pooling, bool adaptive, const std::string& padding_algorithm, - phi::DenseTensor* in_x_grad) { + DenseTensor* in_x_grad) { VLOG(4) << "CALL SDAA Pool2dGradKernel"; dev_ctx.template Alloc(in_x_grad); @@ -552,13 +552,12 @@ void Pool2dGradKernel(const Context& dev_ctx, phi::DDim in_x_grad_NHWC_dims = sdaa_ops::doDimPermute(*in_x_grad, Convert_TF::NCHW2NHWC); - phi::DenseTensor in_x_NHWC, in_x_grad_NHWC, out_NHWC, out_grad_NHWC; - phi::DenseTensorMeta in_x_NHWC_meta = {in_x.dtype(), in_x_NHWC_dims}; - phi::DenseTensorMeta out_NHWC_meta = {out_NHWC.dtype(), out_NHWC_dims}; - phi::DenseTensorMeta out_grad_NHWC_meta = {out_grad.dtype(), - out_grad_NHWC_dims}; - phi::DenseTensorMeta in_x_grad_NHWC_meta = {in_x_grad->dtype(), - in_x_grad_NHWC_dims}; + DenseTensor in_x_NHWC, in_x_grad_NHWC, out_NHWC, out_grad_NHWC; + DenseTensorMeta in_x_NHWC_meta = {in_x.dtype(), in_x_NHWC_dims}; + DenseTensorMeta out_NHWC_meta = {out_NHWC.dtype(), out_NHWC_dims}; + DenseTensorMeta out_grad_NHWC_meta = {out_grad.dtype(), out_grad_NHWC_dims}; + DenseTensorMeta in_x_grad_NHWC_meta = {in_x_grad->dtype(), + in_x_grad_NHWC_dims}; in_x_NHWC.set_meta(in_x_NHWC_meta); out_NHWC.set_meta(out_NHWC_meta); diff --git a/backends/sdaa/kernels/prelu_kernel.cc b/backends/sdaa/kernels/prelu_kernel.cc index d9044f44481..f9d674d28c6 100644 --- a/backends/sdaa/kernels/prelu_kernel.cc +++ b/backends/sdaa/kernels/prelu_kernel.cc @@ -31,11 +31,11 @@ namespace custom_kernel { template void PReluKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& alpha, + const DenseTensor& x, + 
const DenseTensor& alpha,
                  const std::string& data_format,
                  const std::string& mode,
-                 phi::DenseTensor* out) {
+                 DenseTensor* out) {
   VLOG(4) << "CALL SDAA PReluKernel";
   if (1 == alpha.numel()) {
diff --git a/backends/sdaa/kernels/prior_box_kernel.cc b/backends/sdaa/kernels/prior_box_kernel.cc
index 3364cded98a..cefa6b3ba1f 100644
--- a/backends/sdaa/kernels/prior_box_kernel.cc
+++ b/backends/sdaa/kernels/prior_box_kernel.cc
@@ -57,8 +57,8 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
 template <typename T, typename Context>
 void PriorBoxKernel(const Context& ctx,
-                    const phi::DenseTensor& input,
-                    const phi::DenseTensor& image,
+                    const DenseTensor& input,
+                    const DenseTensor& image,
                     const std::vector<float>& min_sizes,
                     const std::vector<float>& max_sizes,
                     const std::vector<float>& aspect_ratios,
@@ -69,8 +69,8 @@ void PriorBoxKernel(const Context& ctx,
                     float step_h,
                     float offset,
                     bool min_max_aspect_ratios_order,
-                    phi::DenseTensor* out,
-                    phi::DenseTensor* var) {
+                    DenseTensor* out,
+                    DenseTensor* var) {
   VLOG(4) << "Call SDAA PriorBoxKernel";
   std::vector<float> new_aspect_ratios;
   ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
@@ -97,28 +97,28 @@ void PriorBoxKernel(const Context& ctx,
   ctx.template Alloc<T>(out);
   ctx.template Alloc<T>(var);
-  phi::DenseTensor r;
+  DenseTensor r;
   phi::TensorFromVector(new_aspect_ratios, ctx, &r);
   auto aspect_ratios_desc = sdaa_ops::GetTecodnnTensorDesc(
       {static_cast<int>(new_aspect_ratios.size())},
       r.dtype(),
       TensorFormat::Undefined);
-  phi::DenseTensor min;
+  DenseTensor min;
   phi::TensorFromVector(min_sizes, ctx, &min);
   auto min_desc =
       sdaa_ops::GetTecodnnTensorDesc({static_cast<int>(min_sizes.size())},
                                      min.dtype(),
                                      TensorFormat::Undefined);
-  phi::DenseTensor max;
+  DenseTensor max;
   phi::TensorFromVector(max_sizes, ctx, &max);
   auto max_desc =
       sdaa_ops::GetTecodnnTensorDesc({static_cast<int>(max_sizes.size())},
                                      max.dtype(),
                                      TensorFormat::Undefined);
-  phi::DenseTensor v;
+  DenseTensor v;
   phi::TensorFromVector(variances, ctx, &v);
   auto variances_desc = sdaa_ops::GetTecodnnTensorDesc(
       {static_cast<int>(variances.size())}, v.dtype(), TensorFormat::Undefined);
diff --git a/backends/sdaa/kernels/randint_kernel.cc b/backends/sdaa/kernels/randint_kernel.cc
index 4cacc90f004..54fec24017c 100644
--- a/backends/sdaa/kernels/randint_kernel.cc
+++ b/backends/sdaa/kernels/randint_kernel.cc
@@ -37,7 +37,7 @@ void RandintKernelNVAlign(const Context& dev_ctx,
                           int high,
                           const phi::IntArray& shape,
                           phi::DataType dtype,
-                          phi::DenseTensor* out) {
+                          DenseTensor* out) {
   out->Resize(common::make_ddim(shape.GetData()));
   dev_ctx.template Alloc<T>(out);
@@ -72,9 +72,9 @@ void RandintKernelNVAlign(const Context& dev_ctx,
   //         << ", seed=" << seed_data << ", offset=" << increment
   //         << ", increment=" << offset;
-  phi::DenseTensor out_int32{};
+  DenseTensor out_int32{};
   if (need_trans) {
-    auto out_meta = phi::DenseTensorMeta{phi::DataType::INT32, out->dims()};
+    auto out_meta = DenseTensorMeta{phi::DataType::INT32, out->dims()};
     out_int32.set_meta(out_meta);
     dev_ctx.template Alloc<int>(&out_int32);
   }
 }
diff --git a/backends/sdaa/kernels/reduce_logic_kernel.cc b/backends/sdaa/kernels/reduce_logic_kernel.cc
index b9385d4c887..468c73406b8 100644
--- a/backends/sdaa/kernels/reduce_logic_kernel.cc
+++ b/backends/sdaa/kernels/reduce_logic_kernel.cc
@@ -22,9 +22,9 @@ namespace custom_kernel {
 template <typename T, typename Context>
 void logic_kernel_impl(const Context& ctx,
-                       const phi::DenseTensor& x,
+                       const DenseTensor& x,
                        const std::vector<int64_t>& dims,
-                       phi::DenseTensor* out,
+                       DenseTensor* out,
                        TensorLogicType tlt) {
   VLOG(4) << "Call SDAA LogicKernel";
@@ -42,10 +42,10 @@
void logic_kernel_impl(const Context& ctx, #define DEFINE_TECODNN_LOGIC_KERNEL(logic_kernel, logic_type) \ template \ void logic_kernel(const Context& dev_ctx, \ - const phi::DenseTensor& x, \ + const DenseTensor& x, \ const std::vector& dims, \ bool keep_dim, \ - phi::DenseTensor* out) { \ + DenseTensor* out) { \ logic_kernel_impl(dev_ctx, x, dims, out, logic_type); \ } DEFINE_TECODNN_LOGIC_KERNEL(AllKernel, TensorLogicType::all); @@ -55,11 +55,11 @@ DEFINE_TECODNN_LOGIC_KERNEL(AnyKernel, TensorLogicType::any); #define DEFINE_TECODNN_LOGIC_RAW_KERNEL(logic_raw_kernel, logic_type) \ template \ void logic_raw_kernel(const Context& dev_ctx, \ - const phi::DenseTensor& x, \ + const DenseTensor& x, \ const std::vector& dims, \ bool keep_dim, \ bool reduce_all, \ - phi::DenseTensor* out) { \ + DenseTensor* out) { \ logic_kernel_impl(dev_ctx, x, dims, out, logic_type); \ } DEFINE_TECODNN_LOGIC_RAW_KERNEL(AllRawKernel, TensorLogicType::all); diff --git a/backends/sdaa/kernels/reduce_max_kernel.cc b/backends/sdaa/kernels/reduce_max_kernel.cc index c815b66aabe..0b79f600b51 100644 --- a/backends/sdaa/kernels/reduce_max_kernel.cc +++ b/backends/sdaa/kernels/reduce_max_kernel.cc @@ -17,11 +17,11 @@ namespace custom_kernel { template void MaxRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes, bool keep_dim, bool reduce_all, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA MaxRawKernel"; auto dims = axes.GetData(); std::vector reduce_dims; @@ -46,10 +46,10 @@ void MaxRawKernel(const Context& dev_ctx, template void MaxKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& dims, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA MaxKernel"; bool reduce_all = false; if (dims.size() == 0) { diff --git a/backends/sdaa/kernels/reduce_mean_kernel.cc b/backends/sdaa/kernels/reduce_mean_kernel.cc index dbd3c0f72b4..86dd922ca99 100644 --- a/backends/sdaa/kernels/reduce_mean_kernel.cc +++ b/backends/sdaa/kernels/reduce_mean_kernel.cc @@ -18,11 +18,11 @@ namespace custom_kernel { template void MeanRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes, bool keep_dim, bool reduce_all, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA MeanRawKernel"; auto dims = axes.GetData(); std::vector reduce_dims; @@ -55,10 +55,10 @@ void MeanRawKernel(const Context& dev_ctx, template void MeanKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& dims, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA MeanKernel"; bool reduce_all = false; if (dims.size() == 0) { @@ -69,8 +69,8 @@ void MeanKernel(const Context& dev_ctx, template void MeanAllKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA MeanAllKernel"; custom_kernel::MeanRawKernel( @@ -79,9 +79,9 @@ void MeanAllKernel(const Context& dev_ctx, template void MeanAllGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& grad, - phi::DenseTensor* x_grad) { + const DenseTensor& x, + const DenseTensor& grad, + DenseTensor* x_grad) { PADDLE_ENFORCE_EQ(grad.numel(), 1, phi::errors::InvalidArgument( @@ -96,15 +96,15 @@ void MeanAllGradKernel(const Context& dev_ctx, template void MeanGradKernel(const Context& dev_ctx, - 
const phi::DenseTensor& x, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& out_grad, const phi::IntArray& dims, bool keep_dim, bool reduce_all, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { VLOG(4) << "call sdaa mean grad kernel"; dev_ctx.template Alloc(x_grad); - phi::DenseTensor out_grad_temp(out_grad); + DenseTensor out_grad_temp(out_grad); float constant = 1; if (reduce_all || dims.size() == 0) { std::vector out_dims(x.dims().size(), 1); diff --git a/backends/sdaa/kernels/reduce_min_kernel.cc b/backends/sdaa/kernels/reduce_min_kernel.cc index 5ead132015c..9bfde03cabd 100644 --- a/backends/sdaa/kernels/reduce_min_kernel.cc +++ b/backends/sdaa/kernels/reduce_min_kernel.cc @@ -17,11 +17,11 @@ namespace custom_kernel { template void MinRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes, bool keep_dim, bool reduce_all, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA MinRawKernel"; std::vector reduce_dims; auto dims = axes.GetData(); @@ -46,10 +46,10 @@ void MinRawKernel(const Context& dev_ctx, template void MinKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& dims, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA MinKernel"; bool reduce_all = false; if (dims.size() == 0) { diff --git a/backends/sdaa/kernels/reduce_prod_kernel.cc b/backends/sdaa/kernels/reduce_prod_kernel.cc index 5b19b31ce62..a88245aa8b4 100644 --- a/backends/sdaa/kernels/reduce_prod_kernel.cc +++ b/backends/sdaa/kernels/reduce_prod_kernel.cc @@ -19,11 +19,11 @@ namespace custom_kernel { template void ProdKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes, bool keep_dim, bool reduce_all, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA ProdRawKernel"; std::vector reduce_dims; auto dims = axes.GetData(); @@ -47,10 +47,10 @@ void ProdKernel(const Context& dev_ctx, template void ProdInferKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& dims, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA ProdKernel"; bool reduce_all = false; if (dims.size() == 0) { diff --git a/backends/sdaa/kernels/reduce_sum_kernel.cc b/backends/sdaa/kernels/reduce_sum_kernel.cc index 7b99e83db40..96daed359fb 100644 --- a/backends/sdaa/kernels/reduce_sum_kernel.cc +++ b/backends/sdaa/kernels/reduce_sum_kernel.cc @@ -35,12 +35,12 @@ bool CheckDtype(const phi::DataType& dt) { template void SumRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes, bool keep_dim, bool reduce_all, phi::DataType out_dtype, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA SumRawKernel"; auto tecodnn_support = CheckDtype(out->dtype()); @@ -74,7 +74,7 @@ void SumRawKernel(const Context& dev_ctx, sdaa_ops::doSumTensor(dev_ctx, x, reduce_dims, out); } else { // cast x tensor to out dtype - phi::DenseTensor in_t; + DenseTensor in_t; in_t.Resize(x.dims()); dev_ctx.Alloc(&in_t, out->dtype()); sdaa_ops::doCastTensor(dev_ctx, x, &in_t); @@ -84,11 +84,11 @@ void SumRawKernel(const Context& dev_ctx, template void SumKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& dims, phi::DataType out_dtype, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << 
"Call SDAA SumKernel"; bool reduce_all = false; if (dims.size() == 0) { @@ -100,15 +100,15 @@ void SumKernel(const Context& dev_ctx, template void SumGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& out_grad, const phi::IntArray& dims, bool keep_dim, bool reduce_all, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { VLOG(4) << "Call SDAA SumGradKernel"; dev_ctx.template Alloc(x_grad); - phi::DenseTensor out_grad_temp(out_grad); + DenseTensor out_grad_temp(out_grad); if (reduce_all || dims.size() == 0) { std::vector out_dims(x.dims().size(), 1); out_grad_temp.Resize(phi::make_ddim(out_dims)); diff --git a/backends/sdaa/kernels/rnn_kernel.cc b/backends/sdaa/kernels/rnn_kernel.cc index 2bd50cce6f3..8a50913e81b 100644 --- a/backends/sdaa/kernels/rnn_kernel.cc +++ b/backends/sdaa/kernels/rnn_kernel.cc @@ -48,8 +48,8 @@ bool IsContinuous(const Type& weight_list) { template void WeightToTensor(const Context& dev_ctx, - const std::vector& weight_list, - phi::DenseTensor* weight) { + const std::vector& weight_list, + DenseTensor* weight) { auto weight_data = weight->data(); int weight_offset = 0; for (size_t i = 0; i < weight_list.size(); ++i) { @@ -67,12 +67,12 @@ void WeightToTensor(const Context& dev_ctx, } template -size_t GetWeightNum(const std::vector& weight_list) { +size_t GetWeightNum(const std::vector& weight_list) { size_t weight_num = std::accumulate( weight_list.begin(), weight_list.end(), 0, - [](int64_t num, const phi::DenseTensor* t) { return num + t->numel(); }); + [](int64_t num, const DenseTensor* t) { return num + t->numel(); }); return weight_num; } @@ -126,9 +126,9 @@ tecodnnRNNDescriptor_t GetTecodnnRnnDesc(const Context& dev_ctx, size_t act_statesSize = 4 * 1024 * sizeof(int); // when statesSize is fixed, will use DropoutGetStatesSize() func. 
TECODNN_CHECK(tecodnnDropoutGetStatesSize(tecodnnHandle, &act_statesSize)); - phi::DenseTensorMeta meta = {phi::DataType::INT8, - {static_cast(act_statesSize)}}; - phi::DenseTensor states; + DenseTensorMeta meta = {phi::DataType::INT8, + {static_cast(act_statesSize)}}; + DenseTensor states; states.set_meta(meta); dev_ctx.template Alloc(&states); TECODNN_CHECK(tecodnnSetDropoutDescriptor(DropoutDesc, @@ -158,10 +158,10 @@ tecodnnRNNDescriptor_t GetTecodnnRnnDesc(const Context& dev_ctx, template void doRnnForward(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& hx, - const phi::DenseTensor& cx, - const std::vector& weight_list, + const DenseTensor& x, + const DenseTensor& hx, + const DenseTensor& cx, + const std::vector& weight_list, int batch_size, int input_size, int direction_num, @@ -173,11 +173,11 @@ void doRnnForward(const Context& dev_ctx, bool is_bidirec, bool is_test, const std::string& mode, - phi::DenseTensor* y, - phi::DenseTensor* hy, - phi::DenseTensor* cy, - phi::DenseTensor* dropout_state, - phi::DenseTensor* reserve) { + DenseTensor* y, + DenseTensor* hy, + DenseTensor* cy, + DenseTensor* dropout_state, + DenseTensor* reserve) { std::string input_mode = "linear"; std::string rnn_algo = "standard"; @@ -188,10 +188,10 @@ void doRnnForward(const Context& dev_ctx, int weight_num = static_cast(GetWeightNum(weight_list)); - phi::DenseTensor weight_whole; + DenseTensor weight_whole; void* w_data = nullptr; bool continuous = - IsContinuous>(weight_list); + IsContinuous>(weight_list); if (!continuous) { VLOG(2) << "If the memory space of the Input WeightList is not continuous, " "less efficient calculation will be called. Please call " @@ -265,7 +265,7 @@ void doRnnForward(const Context& dev_ctx, phi::errors::InvalidArgument( "The sdaa rnn and setting weight size should be same.")); - phi::DenseTensor workspace_data; + DenseTensor workspace_data; workspace_data.Resize(phi::make_ddim({static_cast(workspace_size)})); dev_ctx.template Alloc(&workspace_data); sdaa_ops::doFillTensor( @@ -304,15 +304,15 @@ void doRnnForward(const Context& dev_ctx, template void doRnnBackward(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& hx, - const phi::DenseTensor& cx, - const phi::DenseTensor& dhy, - const phi::DenseTensor& dcy, - const phi::DenseTensor& out, - const phi::DenseTensor& out_grad, - const phi::DenseTensor& reserve, - const std::vector& weight_list, + const DenseTensor& x, + const DenseTensor& hx, + const DenseTensor& cx, + const DenseTensor& dhy, + const DenseTensor& dcy, + const DenseTensor& out, + const DenseTensor& out_grad, + const DenseTensor& reserve, + const std::vector& weight_list, int batch_size, int input_size, int direction_num, @@ -324,10 +324,10 @@ void doRnnBackward(const Context& dev_ctx, bool is_bidirec, bool is_test, const std::string& mode, - phi::DenseTensor* x_grad, - phi::DenseTensor* dhx, - phi::DenseTensor* dcx, - std::vector weight_grad_list) { + DenseTensor* x_grad, + DenseTensor* dhx, + DenseTensor* dcx, + std::vector weight_grad_list) { std::string input_mode = "linear"; std::string rnn_algo = "standard"; @@ -338,9 +338,9 @@ void doRnnBackward(const Context& dev_ctx, int64_t weight_num = static_cast(GetWeightNum(weight_list)); bool continuous = - IsContinuous>(weight_list); + IsContinuous>(weight_list); - phi::DenseTensor weight_whole; + DenseTensor weight_whole; void* w_data = nullptr; if (!continuous) { VLOG(2) << "If the memory space of the Input WeightList is not continuous, " @@ -409,7 +409,7 @@ 
void doRnnBackward(const Context& dev_ctx, TECODNN_CHECK(tecodnnGetRNNWeightSpaceSize( tecodnnHandle, RnnDesc, x_Desc, &weightspace_size)); - phi::DenseTensor workspace_data; + DenseTensor workspace_data; workspace_data.Resize(phi::make_ddim({static_cast(workspace_size)})); dev_ctx.template Alloc(&workspace_data); sdaa_ops::doFillTensor( @@ -454,7 +454,7 @@ void doRnnBackward(const Context& dev_ctx, if (!weight_grad_list.empty()) { // 1. Allocate a contiguous block of memory space to the // tecodnnRNNBackwardWeights. - phi::DenseTensor weight_grad; + DenseTensor weight_grad; weight_grad.Resize(phi::make_ddim({weight_num})); dev_ctx.template Alloc(&weight_grad); sdaa_ops::doMemsetTensor(dev_ctx, static_cast(0), &weight_grad); @@ -504,10 +504,10 @@ void doRnnBackward(const Context& dev_ctx, template void RnnKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const std::vector& pre_state, - const std::vector& weight_list, - const paddle::optional& sequence_length, + const DenseTensor& x, + const std::vector& pre_state, + const std::vector& weight_list, + const paddle::optional& sequence_length, float dropout_prob, bool is_bidirec, int input_size, @@ -516,10 +516,10 @@ void RnnKernel(const Context& dev_ctx, const std::string& mode, int seed, bool is_test, - phi::DenseTensor* out, - phi::DenseTensor* dropout_state, - std::vector state, - phi::DenseTensor* reserve) { + DenseTensor* out, + DenseTensor* dropout_state, + std::vector state, + DenseTensor* reserve) { VLOG(4) << "CALL SDAA RnnKernel"; PADDLE_ENFORCE_EQ( @@ -617,15 +617,15 @@ void RnnKernel(const Context& dev_ctx, template void RnnGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const std::vector& pre_state, - const std::vector& weight_list, - const paddle::optional& sequence_length, - const phi::DenseTensor& out, - const phi::DenseTensor& dropout_state, - const phi::DenseTensor& reserve, - const phi::DenseTensor& out_grad, - const std::vector& state_grad, + const DenseTensor& x, + const std::vector& pre_state, + const std::vector& weight_list, + const paddle::optional& sequence_length, + const DenseTensor& out, + const DenseTensor& dropout_state, + const DenseTensor& reserve, + const DenseTensor& out_grad, + const std::vector& state_grad, float dropout_prob, bool is_bidirec, int input_size, @@ -634,9 +634,9 @@ void RnnGradKernel(const Context& dev_ctx, const std::string& mode, int seed, bool is_test, - phi::DenseTensor* x_grad, - std::vector pre_state_grad, - std::vector weight_grad_list) { + DenseTensor* x_grad, + std::vector pre_state_grad, + std::vector weight_grad_list) { VLOG(4) << "CALL SDAA RnnGradKernel"; PADDLE_ENFORCE_EQ( @@ -651,8 +651,8 @@ void RnnGradKernel(const Context& dev_ctx, auto last_h_grad = state_grad[0]; // -> dhy auto last_c_grad = state_grad[1]; // -> dcy - phi::DenseTensor* init_h_grad = nullptr; - phi::DenseTensor* init_c_grad = nullptr; + DenseTensor* init_h_grad = nullptr; + DenseTensor* init_c_grad = nullptr; if (pre_state_grad.size() > 0) { // has gradient init_h_grad = pre_state_grad[0]; // -> dhx init_c_grad = pre_state_grad[1]; // -> dcx @@ -690,7 +690,7 @@ void RnnGradKernel(const Context& dev_ctx, num_layers, init_c->dims()[0])); - phi::DenseTensor input_grad_value; + DenseTensor input_grad_value; if (!x_grad) { x_grad = &input_grad_value; x_grad->Resize(x.dims()); diff --git a/backends/sdaa/kernels/roi_align_kernel.cc b/backends/sdaa/kernels/roi_align_kernel.cc index d9d4a5f1a4b..91fc05d5a1b 100644 --- a/backends/sdaa/kernels/roi_align_kernel.cc +++ 
b/backends/sdaa/kernels/roi_align_kernel.cc @@ -33,15 +33,15 @@ namespace custom_kernel { template void RoiAlignKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& boxes, - const paddle::optional& boxes_num, + const DenseTensor& x, + const DenseTensor& boxes, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned, - phi::DenseTensor* out) { + DenseTensor* out) { if (boxes.dims()[0] == 0) { dev_ctx.template Alloc(out); return; @@ -119,12 +119,12 @@ void RoiAlignKernel(const Context& dev_ctx, } } - phi::DenseTensor boxes_num_t; + DenseTensor boxes_num_t; TensorFromVector(dev_ctx, roi_batch_id_data, dev_ctx, &boxes_num_t); boxes_num_t.Resize({boxes.dims()[0], 1}); // x and boxes must be the same dtype - phi::DenseTensor boxes_t; + DenseTensor boxes_t; if (boxes.dtype() == x.dtype()) { boxes_t = boxes; } else { @@ -133,10 +133,10 @@ void RoiAlignKernel(const Context& dev_ctx, sdaa_ops::doCastTensor(dev_ctx, boxes, &boxes_t); } - std::vector boxes_list; + std::vector boxes_list; boxes_list.emplace_back(&boxes_num_t); boxes_list.emplace_back(&boxes_t); - phi::DenseTensor boxes_N5; + DenseTensor boxes_N5; boxes_N5.Resize({boxes.dims()[0], 5}); dev_ctx.template Alloc(&boxes_N5); @@ -147,10 +147,10 @@ void RoiAlignKernel(const Context& dev_ctx, // tecodnnRoiAlignForward only support input and output of NHWC format phi::DDim x_t_dims = sdaa_ops::doDimPermute(x, Convert_TF::NCHW2NHWC); phi::DDim out_t_dims = sdaa_ops::doDimPermute(*out, Convert_TF::NCHW2NHWC); - phi::DenseTensor x_t, out_t; - phi::DenseTensorMeta x_t_meta = {x.dtype(), x_t_dims, phi::DataLayout::NHWC}, - out_t_meta = { - out->dtype(), out_t_dims, phi::DataLayout::NHWC}; + DenseTensor x_t, out_t; + DenseTensorMeta x_t_meta = {x.dtype(), x_t_dims, phi::DataLayout::NHWC}, + out_t_meta = { + out->dtype(), out_t_dims, phi::DataLayout::NHWC}; x_t.set_meta(x_t_meta); out_t.set_meta(out_t_meta); dev_ctx.template Alloc(&x_t); @@ -186,16 +186,16 @@ void RoiAlignKernel(const Context& dev_ctx, template void RoiAlignGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& boxes, - const paddle::optional& boxes_num, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& boxes, + const paddle::optional& boxes_num, + const DenseTensor& out_grad, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "Call SDAA RoiAlignGradKernel."; PADDLE_ENFORCE_LT( sampling_ratio, @@ -235,14 +235,14 @@ void RoiAlignGradKernel(const Context& dev_ctx, } } - phi::DenseTensor boxes_num_t; + DenseTensor boxes_num_t; TensorFromVector(dev_ctx, box_batch_id_data, dev_ctx, &boxes_num_t); boxes_num_t.Resize({boxes.dims()[0], 1}); - std::vector boxes_list; + std::vector boxes_list; boxes_list.emplace_back(&boxes_num_t); boxes_list.emplace_back(&boxes); - phi::DenseTensor boxes_N5; + DenseTensor boxes_N5; boxes_N5.Resize({boxes.dims()[0], 5}); dev_ctx.template Alloc(&boxes_N5); @@ -254,12 +254,11 @@ void RoiAlignGradKernel(const Context& dev_ctx, phi::DDim dout_t_dims = sdaa_ops::doDimPermute(out_grad, Convert_TF::NCHW2NHWC); phi::DDim dx_t_dims = sdaa_ops::doDimPermute(*dx, Convert_TF::NCHW2NHWC); - phi::DenseTensor dout_t, dx_t; - phi::DenseTensorMeta dout_t_meta = {out_grad.dtype(), - dout_t_dims, - phi::DataLayout::NHWC}, - dx_t_meta = { - dx->dtype(), dx_t_dims, phi::DataLayout::NHWC}; + DenseTensor dout_t, dx_t; + 
DenseTensorMeta dout_t_meta = {out_grad.dtype(), + dout_t_dims, + phi::DataLayout::NHWC}, + dx_t_meta = {dx->dtype(), dx_t_dims, phi::DataLayout::NHWC}; dout_t.set_meta(dout_t_meta); dx_t.set_meta(dx_t_meta); dev_ctx.template Alloc(&dout_t); diff --git a/backends/sdaa/kernels/scale_kernel.cc b/backends/sdaa/kernels/scale_kernel.cc index b8f79250792..7c3424a700d 100644 --- a/backends/sdaa/kernels/scale_kernel.cc +++ b/backends/sdaa/kernels/scale_kernel.cc @@ -21,11 +21,11 @@ namespace custom_kernel { template void ScaleKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& in_scale, const phi::Scalar& bias, bool bias_after_scale, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA ScaleKernel"; using MT = typename sdaa_ops::MPTypeTrait::Type; auto scale = in_scale.to(); diff --git a/backends/sdaa/kernels/scatter_kernel.cc b/backends/sdaa/kernels/scatter_kernel.cc index 9b271dbff87..9e89260d7a9 100644 --- a/backends/sdaa/kernels/scatter_kernel.cc +++ b/backends/sdaa/kernels/scatter_kernel.cc @@ -33,11 +33,11 @@ namespace custom_kernel { template void ScatterKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, - const phi::DenseTensor& updates, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& updates, bool overwirte, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA ScatterKernel"; dev_ctx.template Alloc(out); diff --git a/backends/sdaa/kernels/scatter_nd_add_kernel.cc b/backends/sdaa/kernels/scatter_nd_add_kernel.cc index 3345f5c6888..eb3ef75f1b4 100644 --- a/backends/sdaa/kernels/scatter_nd_add_kernel.cc +++ b/backends/sdaa/kernels/scatter_nd_add_kernel.cc @@ -30,10 +30,10 @@ namespace custom_kernel { template void ScatterNdAddKernel(const Context &ctx, - const phi::DenseTensor &x, - const phi::DenseTensor &index, - const phi::DenseTensor &updates, - phi::DenseTensor *out) { + const DenseTensor &x, + const DenseTensor &index, + const DenseTensor &updates, + DenseTensor *out) { VLOG(4) << "Call SDAA ScatterNdAddKernel"; // In place output: Out = X diff --git a/backends/sdaa/kernels/set_value_kernel.cc b/backends/sdaa/kernels/set_value_kernel.cc index b7378644282..284bf24108a 100644 --- a/backends/sdaa/kernels/set_value_kernel.cc +++ b/backends/sdaa/kernels/set_value_kernel.cc @@ -31,8 +31,7 @@ namespace custom_kernel { // This function is used to check if the value_dims size is less than // decrease_slice_dims size. 
-inline void CheckIsDimsMatch(const phi::DenseTensor& input, - phi::DenseTensor* output) { +inline void CheckIsDimsMatch(const DenseTensor& input, DenseTensor* output) { std::vector input_dims = phi::vectorize(input.dims()); std::vector out_dims = phi::vectorize(output->dims()); @@ -209,15 +208,15 @@ inline phi::DDim GetDecreasedDims(const phi::DDim slice_dims, template void SetTensorValueKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& value, + const DenseTensor& x, + const DenseTensor& value, const phi::IntArray& starts, const phi::IntArray& ends, const phi::IntArray& steps, const std::vector& axes, const std::vector& decrease_axes, const std::vector& none_axes, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA SetTensorValueKernel"; dev_ctx.template Alloc(out); @@ -282,7 +281,7 @@ void SetTensorValueKernel(const Context& dev_ctx, strides_indices[axis_index] = static_cast(steps_local[i]); } - phi::DenseTensor value_temp; + DenseTensor value_temp; if (slice_dims_for_assign == value.dims()) { value_temp = value; } else { @@ -298,7 +297,7 @@ void SetTensorValueKernel(const Context& dev_ctx, std::vector index_indices(stride_step); std::iota(index_indices.begin(), index_indices.end(), 0); - phi::DenseTensor in_temp, val_temp, index_out, index_temp; + DenseTensor in_temp, val_temp, index_out, index_temp; in_temp = x; val_temp = value_temp; index_temp.Resize(in_dims); @@ -354,13 +353,12 @@ void SetTensorValueKernel(const Context& dev_ctx, phi::errors::InvalidArgument( "OP(set_value) error index indices and value update not match ")); - phi::DenseTensor index_final(index_out); + DenseTensor index_final(index_out); int64_t indices_numel = phi::product(index_dims); auto new_index_dims = phi::make_ddim({indices_numel}); index_final.Resize(new_index_dims); - phi::DenseTensor in_temp_non_int, val_temp_non_int, out_non_int, - index_final_int32; + DenseTensor in_temp_non_int, val_temp_non_int, out_non_int, index_final_int32; if (x.dtype() == DataType::INT64) { index_final_int32 = index_final; diff --git a/backends/sdaa/kernels/sigmoid_cross_entropy_with_logits_kernel.cc b/backends/sdaa/kernels/sigmoid_cross_entropy_with_logits_kernel.cc index 00d4babac44..2b8aa25e358 100644 --- a/backends/sdaa/kernels/sigmoid_cross_entropy_with_logits_kernel.cc +++ b/backends/sdaa/kernels/sigmoid_cross_entropy_with_logits_kernel.cc @@ -50,12 +50,12 @@ void CheckAttrs(bool normalize, int ignore_index) { template void SigmoidCrossEntropyWithLogitsKernel( const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& label, - const paddle::optional& pos_weight, + const DenseTensor& x, + const DenseTensor& label, + const paddle::optional& pos_weight, bool normalize, int ignore_index, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA SigmoidCrossEntropyWithLogitsKernel"; CheckAttrs(normalize, ignore_index); @@ -66,14 +66,14 @@ void SigmoidCrossEntropyWithLogitsKernel( n *= x.dims()[i]; } int d = x.dims()[x_size - 1]; - phi::DenseTensor w; + DenseTensor w; std::vector w_dims = {n, d}; phi::DDim w_dim = phi::make_ddim(w_dims); - phi::DenseTensorMeta w_meta = {x.dtype(), w_dim}; + DenseTensorMeta w_meta = {x.dtype(), w_dim}; w.set_meta(w_meta); dev_ctx.template Alloc(&w); sdaa_ops::doFillTensor(dev_ctx, static_cast(1), x.dtype(), &w); - phi::DenseTensor p_w; + DenseTensor p_w; w_dims = {d}; w_dim = phi::make_ddim(w_dims); w_meta = {x.dtype(), w_dim}; @@ -116,13 +116,13 @@ void SigmoidCrossEntropyWithLogitsKernel( template void 
SigmoidCrossEntropyWithLogitsGradKernel( const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& label, - const paddle::optional& pos_weight, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& label, + const paddle::optional& pos_weight, + const DenseTensor& dout, bool normalize, int ignore_index, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "Call SDAA SigmoidCrossEntropyWithLogitsGradKernel"; CheckAttrs(normalize, ignore_index); @@ -133,14 +133,14 @@ void SigmoidCrossEntropyWithLogitsGradKernel( n *= x.dims()[i]; } int d = x.dims()[x_size - 1]; - phi::DenseTensor w; + DenseTensor w; std::vector w_dims = {n, d}; phi::DDim w_dim = phi::make_ddim(w_dims); - phi::DenseTensorMeta w_meta = {x.dtype(), w_dim}; + DenseTensorMeta w_meta = {x.dtype(), w_dim}; w.set_meta(w_meta); dev_ctx.template Alloc(&w); sdaa_ops::doFillTensor(dev_ctx, static_cast(1), x.dtype(), &w); - phi::DenseTensor p_w; + DenseTensor p_w; w_dims = {d}; w_dim = phi::make_ddim(w_dims); w_meta = {x.dtype(), w_dim}; diff --git a/backends/sdaa/kernels/slice_kernel.cc b/backends/sdaa/kernels/slice_kernel.cc index 8198e4ce69a..fef48a81c04 100644 --- a/backends/sdaa/kernels/slice_kernel.cc +++ b/backends/sdaa/kernels/slice_kernel.cc @@ -105,13 +105,13 @@ inline phi::DDim GetDecreasedDims(const phi::DDim slice_dims, template void SliceRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axes, const phi::IntArray& starts_array, const phi::IntArray& ends_array, const std::vector& infer_flags, const std::vector& decrease_axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA SliceRawKernel"; auto starts = starts_array.GetData(); @@ -157,14 +157,14 @@ void SliceRawKernel(const Context& dev_ctx, template void SliceGradRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& dout, const std::vector& axes_t, const phi::IntArray& starts_array, const phi::IntArray& ends_array, const std::vector& infer_flags, const std::vector& decrease_axis, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "Call SDAA SliceGradRawKernel"; std::vector axes(axes_t.begin(), axes_t.end()); @@ -194,7 +194,7 @@ void SliceGradRawKernel(const Context& dev_ctx, paddings[1][i] = in_dims[i] - size[i] - offset[i]; } - phi::DenseTensor tmp_dout(dout); + DenseTensor tmp_dout(dout); auto out_dims = dout.dims(); auto decrease_size = decrease_axis.size(); diff --git a/backends/sdaa/kernels/softmax_kernel.cc b/backends/sdaa/kernels/softmax_kernel.cc index f820bb67a87..0df368bf005 100644 --- a/backends/sdaa/kernels/softmax_kernel.cc +++ b/backends/sdaa/kernels/softmax_kernel.cc @@ -21,9 +21,9 @@ namespace custom_kernel { template void SoftmaxKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA SoftmaxKernel"; dev_ctx.template Alloc(out); @@ -46,8 +46,8 @@ void SoftmaxKernel(const Context& dev_ctx, axis += x.dims().size(); } if (axis != x.dims().size() - 1) { - phi::DenseTensor x_temp; - phi::DenseTensor out_temp; + DenseTensor x_temp; + DenseTensor out_temp; std::vector x_dims = phi::vectorize(x.dims()); std::vector axis_vec(x.dims().size()); std::iota(axis_vec.begin(), axis_vec.end(), 0); @@ -80,10 +80,10 @@ void SoftmaxKernel(const Context& dev_ctx, template void SoftmaxGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const 
phi::DenseTensor& out_grad, + const DenseTensor& out, + const DenseTensor& out_grad, int axis, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { VLOG(4) << "Call SDAA SoftmaxGradKernel"; dev_ctx.template Alloc(x_grad); @@ -109,9 +109,9 @@ void SoftmaxGradKernel(const Context& dev_ctx, if (is_in_high_precision_op_list("softmax_grad")) high_precision = true; if (axis != out.dims().size() - 1) { - phi::DenseTensor out_temp; - phi::DenseTensor out_grad_temp; - phi::DenseTensor x_grad_temp; + DenseTensor out_temp; + DenseTensor out_grad_temp; + DenseTensor x_grad_temp; std::vector out_dims = phi::vectorize(out.dims()); std::vector axis_vec(out.dims().size()); diff --git a/backends/sdaa/kernels/split_kernel.cc b/backends/sdaa/kernels/split_kernel.cc index 73c6fdadf0c..880b5109bc8 100644 --- a/backends/sdaa/kernels/split_kernel.cc +++ b/backends/sdaa/kernels/split_kernel.cc @@ -46,10 +46,10 @@ static inline int ComputeAxis(int axis, int rank) { template void SplitKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& num_or_sections, const phi::Scalar& axis_scalar, - std::vector outs) { + std::vector outs) { VLOG(4) << "Call SDAA SplitKernel"; int axis = axis_scalar.to(); @@ -65,10 +65,10 @@ void SplitKernel(const Context& dev_ctx, template void SplitWithNumKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int num, const phi::Scalar& axis_scalar, - std::vector outs) { + std::vector outs) { VLOG(4) << "Call SDAA SplitWithNumKernel"; int axis_value = axis_scalar.to(); diff --git a/backends/sdaa/kernels/squared_l2_norm_kernel.cc b/backends/sdaa/kernels/squared_l2_norm_kernel.cc index 27ec1bcdb52..42a615a2e54 100644 --- a/backends/sdaa/kernels/squared_l2_norm_kernel.cc +++ b/backends/sdaa/kernels/squared_l2_norm_kernel.cc @@ -19,8 +19,8 @@ namespace custom_kernel { template void SquaredL2NormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { VLOG(4) << "Call SDAA SquaredL2NormKernel"; dev_ctx.template Alloc(out); diff --git a/backends/sdaa/kernels/squeeze_kernel.cc b/backends/sdaa/kernels/squeeze_kernel.cc index 3132706c50c..920f2ee0cf7 100644 --- a/backends/sdaa/kernels/squeeze_kernel.cc +++ b/backends/sdaa/kernels/squeeze_kernel.cc @@ -31,9 +31,9 @@ namespace custom_kernel { template void SqueezeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes_int_array, - phi::DenseTensor* out) { + DenseTensor* out) { auto stream = dev_ctx.stream(); std::vector axes(axes_int_array.GetData().begin(), axes_int_array.GetData().end()); @@ -48,19 +48,19 @@ void SqueezeKernel(const Context& dev_ctx, template void SqueezeWithXShapeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes_int_array, - phi::DenseTensor* out, - phi::DenseTensor* xshape) { + DenseTensor* out, + DenseTensor* xshape) { custom_kernel::SqueezeKernel(dev_ctx, x, axes_int_array, out); } template void SqueezeGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& dout, const phi::IntArray& axes_int_array, - phi::DenseTensor* dx) { + DenseTensor* dx) { auto stream = dev_ctx.stream(); auto x_dims = dx->dims(); diff --git a/backends/sdaa/kernels/stack_kernel.cc b/backends/sdaa/kernels/stack_kernel.cc index cac488cefc5..83376ecb669 100644 --- a/backends/sdaa/kernels/stack_kernel.cc +++ 
b/backends/sdaa/kernels/stack_kernel.cc @@ -31,9 +31,9 @@ namespace custom_kernel { template void StackKernel(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA StackKernel"; dev_ctx.template Alloc(out); @@ -43,16 +43,16 @@ void StackKernel(const Context& dev_ctx, PADDLE_ENFORCE_GT( num, 0, phi::errors::InvalidArgument("number of input Tensor <= 0")); - std::vector x_; + std::vector x_; std::vector input_dims = phi::vectorize(x[0]->dims()); input_dims.insert(input_dims.begin() + axis, 1); for (int i = 0; i < num; i++) { - x_.push_back(const_cast(x[i])); + x_.push_back(const_cast(x[i])); x_[i]->Resize(phi::make_ddim(input_dims)); } - std::vector x_temp; + std::vector x_temp; for (int i = 0; i < num; i++) { x_temp.push_back(x_[i]); } @@ -62,9 +62,9 @@ void StackKernel(const Context& dev_ctx, template void StackGradKernel(const Context& dev_ctx, - const phi::DenseTensor& dy, + const DenseTensor& dy, int axis, - std::vector dx) { + std::vector dx) { VLOG(4) << "CALL SDAA StackGradKernel"; // get dx dims @@ -96,18 +96,18 @@ void StackGradKernel(const Context& dev_ctx, std::vector input_dims_origin(input_dims); input_dims.insert(input_dims.begin() + axis, 1); - std::vector tmp_outputs_vec; + std::vector tmp_outputs_vec; tmp_outputs_vec.resize(dx.size()); - std::vector dx_; + std::vector dx_; - const phi::DenseTensorMeta meta_data(dy.dtype(), phi::make_ddim(input_dims)); + const DenseTensorMeta meta_data(dy.dtype(), phi::make_ddim(input_dims)); for (int i = 0; i < dx.size(); ++i) { if (dx[i]) { dev_ctx.template Alloc(dx[i]); dx_.push_back(dx[i]); dx_[i]->Resize(phi::make_ddim(input_dims)); } else { - phi::DenseTensor tmp_tensor; + DenseTensor tmp_tensor; tmp_tensor.set_meta(meta_data); dev_ctx.template Alloc(&tmp_tensor); tmp_outputs_vec[i] = std::move(tmp_tensor); diff --git a/backends/sdaa/kernels/stride_slice_kernel.cc b/backends/sdaa/kernels/stride_slice_kernel.cc index 7e72a5cdb49..5338d65fe66 100644 --- a/backends/sdaa/kernels/stride_slice_kernel.cc +++ b/backends/sdaa/kernels/stride_slice_kernel.cc @@ -181,14 +181,14 @@ static void ProcessStridedSliceParams( template void StridedSliceRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axes, const phi::IntArray& starts, const phi::IntArray& ends, const phi::IntArray& strides, const std::vector& infer_flags, const std::vector& decrease_axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA StridedSliceRawKernel"; auto x_dims = x.dims(); @@ -274,12 +274,12 @@ void StridedSliceRawKernel(const Context& dev_ctx, template void StridedSliceKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axes, const phi::IntArray& starts, const phi::IntArray& ends, const phi::IntArray& strides, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA StridedSliceKernel"; std::vector infer_flags(axes.size(), 1); std::vector decrease_axis; diff --git a/backends/sdaa/kernels/strided_copy_kernel.cc b/backends/sdaa/kernels/strided_copy_kernel.cc index 77e97014fd8..1a8fc6d30cc 100644 --- a/backends/sdaa/kernels/strided_copy_kernel.cc +++ b/backends/sdaa/kernels/strided_copy_kernel.cc @@ -32,14 +32,14 @@ namespace custom_kernel { template void StridedCopyKernel(const Context& dev_ctx, - const phi::DenseTensor& input, + const DenseTensor& input, const std::vector& dims, const std::vector& out_stride, int64_t offset, - 
phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA StridedCopyKernel."; - phi::DenseTensorMeta meta = input.meta(); + DenseTensorMeta meta = input.meta(); meta.strides = phi::make_ddim(out_stride); meta.dims = phi::make_ddim(dims); meta.offset = offset; diff --git a/backends/sdaa/kernels/sync_batch_norm_kernel.cc b/backends/sdaa/kernels/sync_batch_norm_kernel.cc index bf16b108671..3fe82bf1787 100644 --- a/backends/sdaa/kernels/sync_batch_norm_kernel.cc +++ b/backends/sdaa/kernels/sync_batch_norm_kernel.cc @@ -32,23 +32,23 @@ namespace custom_kernel { template void SyncBatchNormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& mean, - const phi::DenseTensor& variance, - const phi::DenseTensor& scale, - const phi::DenseTensor& bias, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, bool is_test, float momentum, float epsilon_f, const std::string& data_layout_str, bool use_global_stats, bool trainable_statistics, - phi::DenseTensor* y, - phi::DenseTensor* mean_out, - phi::DenseTensor* variance_out, - phi::DenseTensor* saved_mean, - phi::DenseTensor* saved_variance, - phi::DenseTensor* reserve_space) { + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space) { VLOG(4) << "CALL SDAA SyncBatchNormKernel"; PADDLE_ENFORCE_EQ(use_global_stats, @@ -97,7 +97,7 @@ void SyncBatchNormKernel(const Context& dev_ctx, int N, H, W, C, D; sdaa_ops::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); - phi::DenseTensor trans_x, trans_y; + DenseTensor trans_x, trans_y; phi::DDim trans_x_dims, trans_y_dims; const bool need_transpose = ((layout == DataLayout::kNCHW && x_dims.size() != 2) || @@ -124,7 +124,7 @@ void SyncBatchNormKernel(const Context& dev_ctx, // that the first addr of the two output parameters be 64B aligned. int C_temp = ceil(C / 16) * 16; const int data_num = C_temp + C; - phi::DenseTensor status; + DenseTensor status; status.Resize(phi::make_ddim({data_num})); dev_ctx.template Alloc(&status); @@ -190,7 +190,7 @@ void SyncBatchNormKernel(const Context& dev_ctx, saved_mean); // 2. 
calculate saved_inv_variance - phi::DenseTensor global_square_mean, saved_mean_pow, var, sqrt_var, + DenseTensor global_square_mean, saved_mean_pow, var, sqrt_var, saved_variance_tmp; global_square_mean.Resize(mean.dims()); dev_ctx.template Alloc(&global_square_mean); @@ -268,24 +268,23 @@ void SyncBatchNormKernel(const Context& dev_ctx, } template -void SyncBatchNormGradKernel( - const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& scale, - const phi::DenseTensor& bias, - const phi::DenseTensor& saved_mean, - const phi::DenseTensor& saved_variance, - const paddle::optional& reserve_space, - const phi::DenseTensor& y_grad, - float momentum, - float epsilon_f, - const std::string& data_layout_str, - bool is_test, - bool use_global_stats, - bool trainable_statistics, - phi::DenseTensor* x_grad, - phi::DenseTensor* scale_grad, - phi::DenseTensor* bias_grad) { +void SyncBatchNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const paddle::optional& reserve_space, + const DenseTensor& y_grad, + float momentum, + float epsilon_f, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { VLOG(4) << "CALL SDAA SyncBatchNormGradKernel"; const DataLayout layout = common::StringToDataLayout(data_layout_str); @@ -316,7 +315,7 @@ void SyncBatchNormGradKernel( using MPDType = typename sdaa_ops::MPTypeTrait::Type; dev_ctx.template Alloc(x_grad); - phi::DenseTensor d_scale, d_bias; + DenseTensor d_scale, d_bias; void* scale_grad_ptr = nullptr; void* bias_grad_ptr = nullptr; if (scale_grad && bias_grad) { @@ -333,7 +332,7 @@ void SyncBatchNormGradKernel( bias_grad_ptr = d_bias.data(); } - phi::DenseTensor trans_x, trans_dy, trans_dx; + DenseTensor trans_x, trans_dy, trans_dx; phi::DDim trans_x_dims, trans_dy_dims, trans_dx_dims; const bool need_transpose = @@ -360,7 +359,7 @@ void SyncBatchNormGradKernel( trans_dx = *x_grad; } - phi::DenseTensor sum_dy_and_sum_dy_xmu; + DenseTensor sum_dy_and_sum_dy_xmu; sum_dy_and_sum_dy_xmu.Resize(phi::make_ddim({2 * C})); dev_ctx.template Alloc(&sum_dy_and_sum_dy_xmu); diff --git a/backends/sdaa/kernels/tile_kernel.cc b/backends/sdaa/kernels/tile_kernel.cc index e9554e13995..ec0326b8703 100644 --- a/backends/sdaa/kernels/tile_kernel.cc +++ b/backends/sdaa/kernels/tile_kernel.cc @@ -34,9 +34,9 @@ namespace custom_kernel { template void TileKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& repeat_times, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA TileKernel"; auto rank = x.dims().size(); auto& repeat_times_data = repeat_times.GetData(); diff --git a/backends/sdaa/kernels/top_k_kernel.cc b/backends/sdaa/kernels/top_k_kernel.cc index 5e788d6c2fe..c42206da1a0 100644 --- a/backends/sdaa/kernels/top_k_kernel.cc +++ b/backends/sdaa/kernels/top_k_kernel.cc @@ -20,13 +20,13 @@ namespace custom_kernel { template void TopkKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& k_scalar, int axis, bool largest, bool sorted, - phi::DenseTensor* out, - phi::DenseTensor* indices) { + DenseTensor* out, + DenseTensor* indices) { VLOG(4) << "Call SDAA TopkKernel"; int xDims = x.dims().size(); if (axis < 0) { @@ -64,7 +64,7 @@ void TopkKernel(const Context& dev_ctx, 
std::vector y_dimensions = phi::vectorize(output_dims); std::vector indices_dimensions = phi::vectorize(output_dims); - phi::DenseTensor* in = const_cast(&x); + DenseTensor* in = const_cast(&x); tecodnnHandle_t tecodnnHandle = GetHandleFromCTX(dev_ctx); @@ -93,7 +93,7 @@ void TopkKernel(const Context& dev_ctx, tecodnnGetTopkWorkspaceSize( topk_Desc, x_Desc, y_Desc, indices_Desc, &workspace_size); - phi::DenseTensor dev_workspace; + DenseTensor dev_workspace; dev_workspace.Resize( phi::make_ddim({static_cast(workspace_size)})); dev_ctx.Alloc(&dev_workspace, phi::DataType::INT8); @@ -117,14 +117,14 @@ void TopkKernel(const Context& dev_ctx, } // the indices param in the tecodnnTopk is int. - phi::DenseTensor indices_int; + DenseTensor indices_int; indices_int.Resize(indices->dims()); dev_ctx.template Alloc(&indices_int); std::vector x_dimensions = phi::vectorize(x.dims()); std::vector y_dimensions = phi::vectorize(output_dims); - phi::DenseTensor* in = const_cast(&x); + DenseTensor* in = const_cast(&x); tecodnnHandle_t tecodnnHandle = GetHandleFromCTX(dev_ctx); @@ -153,14 +153,14 @@ void TopkKernel(const Context& dev_ctx, template void TopkGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& indices, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, const phi::Scalar& k_scalar, int axis, bool largest UNUSED, bool sorted UNUSED, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { VLOG(4) << "Call SDAA TopkGradKernel"; const auto& in_dim_size = x.dims().size(); // axis < 0, get the real axis @@ -193,7 +193,7 @@ void TopkGradKernel(const Context& dev_ctx, std::vector out_grad_dimensions = phi::vectorize(out_grad.dims()); std::vector x_grad_dimensions = phi::vectorize(x_grad->dims()); - phi::DenseTensor* in = const_cast(&x); + DenseTensor* in = const_cast(&x); tecodnnHandle_t tecodnnHandle = GetHandleFromCTX(dev_ctx); diff --git a/backends/sdaa/kernels/transpose_kernel.cc b/backends/sdaa/kernels/transpose_kernel.cc index 1835974fb63..32790580d4a 100644 --- a/backends/sdaa/kernels/transpose_kernel.cc +++ b/backends/sdaa/kernels/transpose_kernel.cc @@ -33,9 +33,9 @@ namespace custom_kernel { template void TransposeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axis, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA TransposeKernel"; dev_ctx.template Alloc(out); @@ -44,9 +44,9 @@ void TransposeKernel(const Context& dev_ctx, template void TransposeGradKernel(const Context& dev_ctx, - const phi::DenseTensor& dout, + const DenseTensor& dout, const std::vector& axis, - phi::DenseTensor* dx) { + DenseTensor* dx) { VLOG(4) << "Call SDAA TransposeGradKernel"; dev_ctx.template Alloc(dx); diff --git a/backends/sdaa/kernels/tril_triu_kernel.cc b/backends/sdaa/kernels/tril_triu_kernel.cc index e32d057c418..b954cdf5076 100644 --- a/backends/sdaa/kernels/tril_triu_kernel.cc +++ b/backends/sdaa/kernels/tril_triu_kernel.cc @@ -31,10 +31,10 @@ namespace custom_kernel { template void TrilTriuKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int diagonal, bool lower, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA TrilTriuKernel"; dev_ctx.template Alloc(out); auto x_dims = x.dims(); @@ -52,17 +52,17 @@ void TrilTriuKernel(const Context& dev_ctx, template void TrilKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int diagonal, - 
phi::DenseTensor* out) { + DenseTensor* out) { custom_kernel::TrilTriuKernel(dev_ctx, x, diagonal, true, out); } template void TriuKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int diagonal, - phi::DenseTensor* out) { + DenseTensor* out) { custom_kernel::TrilTriuKernel(dev_ctx, x, diagonal, false, out); } diff --git a/backends/sdaa/kernels/truncated_gaussian_random_kernel.cc b/backends/sdaa/kernels/truncated_gaussian_random_kernel.cc index 6adb2d970eb..64125de1756 100644 --- a/backends/sdaa/kernels/truncated_gaussian_random_kernel.cc +++ b/backends/sdaa/kernels/truncated_gaussian_random_kernel.cc @@ -44,14 +44,14 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, float a, float b, phi::DataType dtype, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "Call SDAA TruncatedGaussianRandomKernel"; T* data = dev_ctx.template Alloc(out); auto size = out->numel(); // 1.CPU implement - phi::DenseTensor cpu_out; - phi::DenseTensorMeta cpu_out_meta = {out->dtype(), out->dims()}; + DenseTensor cpu_out; + DenseTensorMeta cpu_out_meta = {out->dtype(), out->dims()}; cpu_out.set_meta(cpu_out_meta); T* cpu_data = dev_ctx.template HostAlloc(&cpu_out); diff --git a/backends/sdaa/kernels/unbind_kernel.cc b/backends/sdaa/kernels/unbind_kernel.cc index eebd786f96e..667f5fa0fc5 100644 --- a/backends/sdaa/kernels/unbind_kernel.cc +++ b/backends/sdaa/kernels/unbind_kernel.cc @@ -33,9 +33,9 @@ namespace custom_kernel { template void UnbindKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, - std::vector outs) { + std::vector outs) { VLOG(4) << "CALL SDAA UnbindKernel."; auto x_dims = x.dims(); diff --git a/backends/sdaa/kernels/uniform_random_kernel.cc b/backends/sdaa/kernels/uniform_random_kernel.cc index ec75abd0bb5..913234b22a6 100644 --- a/backends/sdaa/kernels/uniform_random_kernel.cc +++ b/backends/sdaa/kernels/uniform_random_kernel.cc @@ -41,7 +41,7 @@ void UniformRandomRawKernel(const Context& dev_ctx, int diag_num, int diag_step, float diag_val, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA UniformRandomRawKernel"; out->Resize(phi::make_ddim(shape.GetData())); VLOG(4) << out->dims(); @@ -49,8 +49,8 @@ void UniformRandomRawKernel(const Context& dev_ctx, auto size = out->numel(); // 1.CPU implement - phi::DenseTensor cpu_out; - phi::DenseTensorMeta cpu_out_meta = {out->dtype(), out->dims()}; + DenseTensor cpu_out; + DenseTensorMeta cpu_out_meta = {out->dtype(), out->dims()}; cpu_out.set_meta(cpu_out_meta); T* cpu_data = dev_ctx.template HostAlloc(&cpu_out); @@ -74,7 +74,7 @@ void UniformRandomKernel(const Context& dev_ctx, const phi::Scalar& min, const phi::Scalar& max, int seed, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(4) << "CALL SDAA UniformRandomKernel"; custom_kernel::UniformRandomRawKernel( dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out); diff --git a/backends/sdaa/kernels/unsqueeze_kernel.cc b/backends/sdaa/kernels/unsqueeze_kernel.cc index 3adbbc2176a..efb0086a2ba 100644 --- a/backends/sdaa/kernels/unsqueeze_kernel.cc +++ b/backends/sdaa/kernels/unsqueeze_kernel.cc @@ -69,9 +69,9 @@ inline phi::DDim GetUnsqueezeShape(const std::vector unsqz_dims, template void UnsqueezeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes, - phi::DenseTensor* out) { + DenseTensor* out) { auto x_dims = x.dims(); auto out_dims = out->dims(); if (axes.FromTensor() && out->dims()[0] == -1) { @@ -86,18 +86,18 @@ void 
UnsqueezeKernel(const Context& dev_ctx, template void UnsqueezeWithXShapeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes, - phi::DenseTensor* out, - phi::DenseTensor* xshape UNUSED) { + DenseTensor* out, + DenseTensor* xshape UNUSED) { custom_kernel::UnsqueezeKernel(dev_ctx, x, axes, out); } template void UnsqueezeGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x_shape, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x_shape, + const DenseTensor& dout, + DenseTensor* dx) { auto xshape_dims = x_shape.dims(); auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); dev_ctx.template Alloc(dx); diff --git a/backends/sdaa/kernels/unstack_kernel.cc b/backends/sdaa/kernels/unstack_kernel.cc index b559c9d0531..238ef2767b2 100644 --- a/backends/sdaa/kernels/unstack_kernel.cc +++ b/backends/sdaa/kernels/unstack_kernel.cc @@ -31,10 +31,10 @@ namespace custom_kernel { template void UnStackKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, int num UNUSED, - std::vector outs) { + std::vector outs) { VLOG(4) << "Call SDAA UnStackKernel"; // get outs dims @@ -45,11 +45,11 @@ void UnStackKernel(const Context& dev_ctx, std::vector output_dims_origin(output_dims); output_dims.insert(output_dims.begin() + axis, 1); - std::vector tmp_outputs_vec; + std::vector tmp_outputs_vec; tmp_outputs_vec.resize(outs.size()); - std::vector outs_; + std::vector outs_; - const phi::DenseTensorMeta meta_data(x.dtype(), phi::make_ddim(output_dims)); + const DenseTensorMeta meta_data(x.dtype(), phi::make_ddim(output_dims)); for (int i = 0; i < outs.size(); ++i) { dev_ctx.template Alloc(outs[i]); outs_.push_back(outs[i]); @@ -65,9 +65,9 @@ void UnStackKernel(const Context& dev_ctx, template void UnStackGradKernel(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, int axis, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { VLOG(4) << "CALL SDAA UnStackGradKernel."; dev_ctx.template Alloc(x_grad); @@ -76,14 +76,14 @@ void UnStackGradKernel(const Context& dev_ctx, int num = static_cast(x.size()); - std::vector x_; + std::vector x_; std::vector input_dims = phi::vectorize(x[0]->dims()); input_dims.insert(input_dims.begin() + axis, 1); - phi::DenseTensor* temp; + DenseTensor* temp; for (int i = 0; i < num; i++) { - temp = const_cast(x[i]); + temp = const_cast(x[i]); temp->Resize(phi::make_ddim(input_dims)); x_.push_back(temp); } diff --git a/backends/sdaa/kernels/warpctc_kernel.cc b/backends/sdaa/kernels/warpctc_kernel.cc index 99e126e35cb..669adcfb5fe 100644 --- a/backends/sdaa/kernels/warpctc_kernel.cc +++ b/backends/sdaa/kernels/warpctc_kernel.cc @@ -38,14 +38,14 @@ namespace custom_kernel { template void WarpctcKernel(const Context& dev_ctx, - const phi::DenseTensor& logits, - const phi::DenseTensor& label, - const paddle::optional& logits_length, - const paddle::optional& labels_length, + const DenseTensor& logits, + const DenseTensor& label, + const paddle::optional& logits_length, + const paddle::optional& labels_length, int blank, bool norm_by_times, - phi::DenseTensor* loss, - phi::DenseTensor* warpctcgrad) { + DenseTensor* loss, + DenseTensor* warpctcgrad) { VLOG(4) << "Call SDAA WarpctcKernel"; bool has_logits_length = logits_length.is_initialized(); if (!has_logits_length) { @@ -190,7 +190,7 @@ void WarpctcKernel(const Context& dev_ctx, gradsDesc, &workSpaceSizeInBytes)); - phi::DenseTensor workspace; + DenseTensor workspace; T* 
workspace_data = dev_ctx.template Alloc(&workspace, workSpaceSizeInBytes); TECODNN_CHECK(tecodnnCTCLoss(tecodnnHandle, @@ -216,13 +216,13 @@ void WarpctcKernel(const Context& dev_ctx, template void WarpctcGradKernel(const Context& dev_ctx, - const phi::DenseTensor& logits, - const paddle::optional& logits_length, - const phi::DenseTensor& warpctcgrad, - const phi::DenseTensor& loss_grad, + const DenseTensor& logits, + const paddle::optional& logits_length, + const DenseTensor& warpctcgrad, + const DenseTensor& loss_grad, int blank, bool norm_by_times, - phi::DenseTensor* logits_grad) { + DenseTensor* logits_grad) { VLOG(4) << "Call SDAA WarpctcGradKernel"; dev_ctx.template Alloc(logits_grad); diff --git a/backends/sdaa/kernels/where_kernel.cc b/backends/sdaa/kernels/where_kernel.cc index 166c610b643..efd16ee2376 100644 --- a/backends/sdaa/kernels/where_kernel.cc +++ b/backends/sdaa/kernels/where_kernel.cc @@ -19,10 +19,10 @@ namespace custom_kernel { template void doWhereTensor(const Context& dev_ctx, - const phi::DenseTensor& condition, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "tecodnn where tensor called"; std::vector condition_dims = phi::vectorize(condition.dims()); @@ -58,10 +58,10 @@ void doWhereTensor(const Context& dev_ctx, template void WhereKernel(const Context& dev_ctx, - const phi::DenseTensor& condition, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { VLOG(4) << "Call SDAA WhereKernel"; dev_ctx.template Alloc(out); @@ -71,19 +71,19 @@ void WhereKernel(const Context& dev_ctx, template void WhereGradKernel(const Context& dev_ctx, - const phi::DenseTensor& condition, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy) { VLOG(4) << "CALL SDAA WhereGradKernel"; if (dx) dev_ctx.template Alloc(dx); if (dy) dev_ctx.template Alloc(dy); - phi::DenseTensor zero_tensor; - phi::DenseTensorMeta zero_tensor_meta = {dout.dtype(), dout.dims()}; + DenseTensor zero_tensor; + DenseTensorMeta zero_tensor_meta = {dout.dtype(), dout.dims()}; zero_tensor.set_meta(zero_tensor_meta); dev_ctx.template Alloc(&zero_tensor);