23 changes: 14 additions & 9 deletions src/ATen/native/xpu/sycl/Reduce.h
@@ -1352,15 +1352,20 @@ inline void gpu_reduce_kernel(
   // XXX: Avoid all WIs in a work group contributes on one output. If so,
   // It is inefficient to store output, each work group stores only one
   // output. It is not friendly to collapse memory request in an EU.
-  if (config.values_per_item() >= group_height * 16 ||
-      config.values_per_item() >= 512) {
-    // Divide the input across SGs in a work group, if that leaves at least
-    // 16 elements to be summed by each WI. This will require inter-SG
-    // reduction using shared memory.
-    config.input_mult[1] = config.split_input(group_height);
-  } else {
-    // Otherwise, each SG handles a separate output.
-    config.output_mult[1] = config.split_output(group_height);
+  //
+  // When input is vectorized, each work-item processes input_vec_size
+  // elements per load. Scale the threshold proportionally so that each
+  // work-item gets enough vector loads to amortize the inter-SG reduction
+  // overhead via shared local memory. For non-vectorized paths the factor
+  // is 1 and the condition is unchanged.
+  {
+    int vf = config.vectorize_input ? config.input_vec_size : 1;
+    if (config.values_per_item() >= group_height * 16 * vf ||
+        config.values_per_item() >= 512 * vf) {
+      config.input_mult[1] = config.split_input(group_height);
+    } else {
+      config.output_mult[1] = config.split_output(group_height);
+    }
Comment on lines +1361 to +1368

Copilot AI Apr 3, 2026

The new threshold scaling introduces an extra scope block solely to declare vf. This makes the control flow slightly harder to scan in a hot configuration section. Consider removing the braces and making vf a const local (or auto) declared in the surrounding scope so the split decision reads more directly.

Suggested change
-  {
-    int vf = config.vectorize_input ? config.input_vec_size : 1;
-    if (config.values_per_item() >= group_height * 16 * vf ||
-        config.values_per_item() >= 512 * vf) {
-      config.input_mult[1] = config.split_input(group_height);
-    } else {
-      config.output_mult[1] = config.split_output(group_height);
-    }
+  const auto vf = config.vectorize_input ? config.input_vec_size : 1;
+  if (config.values_per_item() >= group_height * 16 * vf ||
+      config.values_per_item() >= 512 * vf) {
+    config.input_mult[1] = config.split_input(group_height);
+  } else {
+    config.output_mult[1] = config.split_output(group_height);

   }

   // We are finding a general routine to work out target max WI number on dev
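
To see how the scaled condition behaves, here is a small standalone C++ sketch (not part of the patch; the group_height and values_per_item numbers are hypothetical) showing how the vectorization factor vf raises the bar for splitting the input across sub-groups:

// Standalone sketch: mirrors the patched split condition. The names
// values_per_item, group_height, and vf follow the patch; the inputs
// below are made-up values for illustration only.
#include <cstdint>
#include <cstdio>

static bool splits_input(int64_t values_per_item, int group_height, int vf) {
  // Split the input across SGs only if each work-item is still left with
  // at least 16 values per SG row (or 512 in absolute terms), with both
  // thresholds scaled by the vector width.
  return values_per_item >= int64_t{group_height} * 16 * vf ||
         values_per_item >= int64_t{512} * vf;
}

int main() {
  const int group_height = 8;
  const int64_t values_per_item = 256;
  // vf = 1 (non-vectorized): 256 >= 8 * 16, so the input is split.
  std::printf("vf=1: split_input=%d\n",
              splits_input(values_per_item, group_height, 1));
  // vf = 4: the threshold rises to 8 * 16 * 4 = 512, so the same
  // workload now keeps one output per SG instead.
  std::printf("vf=4: split_input=%d\n",
              splits_input(values_per_item, group_height, 4));
}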
18 changes: 7 additions & 11 deletions src/ATen/native/xpu/sycl/ReduceNormKernel.cpp
@@ -18,33 +18,31 @@

 namespace at::native::xpu {
 
-// This reduction accumulates results as the type `acc_t`. By default, when
-// `scalar_t` is complex, `acc_t` is the downgraded real number type.
-// Otherwise, `acc_t` and `scalar_t` are the same type.
 template <
     typename scalar_t,
     typename acc_t = typename scalar_value_type<scalar_t>::type,
     typename out_t = typename scalar_value_type<scalar_t>::type>
 void norm_kernel_xpu_impl(TensorIterator& iter, double p) {
+  constexpr int vt0 = 8;
   if (p == static_cast<double>(0)) {
-    gpu_reduce_kernel<scalar_t, out_t>(
+    gpu_reduce_kernel<scalar_t, out_t, vt0>(
Comment on lines 25 to +28

Copilot AI Apr 3, 2026

vt0 is hard-coded to 8 for all norm dtypes (including fp32/fp64 and complex via AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES). Since vt0 directly controls unrolling/register usage in Reduce.h (see item_reduce_impl), this can increase register pressure and potentially regress non-16-bit paths. Consider selecting vt0 based on scalar_t (e.g., keep default for fp32/complex and use 8 only for fp16/bf16), or otherwise justify why 8 is safe across all dispatched types.

         iter, NormZeroOps<scalar_t, acc_t, out_t>(), 0);
   } else if (p == static_cast<double>(1)) {
-    gpu_reduce_kernel<scalar_t, out_t>(
+    gpu_reduce_kernel<scalar_t, out_t, vt0>(
         iter, NormOneOps<scalar_t, acc_t, out_t>(), 0);
   } else if (p == static_cast<double>(2)) {
-    gpu_reduce_kernel<scalar_t, out_t>(
+    gpu_reduce_kernel<scalar_t, out_t, vt0>(
         iter, NormTwoOps<scalar_t, acc_t, out_t>(), 0);
   } else if (p == static_cast<double>(INFINITY)) {
-    gpu_reduce_kernel<scalar_t, out_t>(
+    gpu_reduce_kernel<scalar_t, out_t, vt0>(
         iter, AbsMaxOps<scalar_t, acc_t, out_t>(), 0);
   } else if (p == static_cast<double>(-INFINITY)) {
-    gpu_reduce_kernel<scalar_t, out_t>(
+    gpu_reduce_kernel<scalar_t, out_t, vt0>(
         iter,
         AbsMinOps<scalar_t, acc_t, out_t>(),
         std::numeric_limits<acc_t>::infinity());
   } else {
-    gpu_reduce_kernel<scalar_t, out_t>(
+    gpu_reduce_kernel<scalar_t, out_t, vt0>(
         iter, NormOps<scalar_t, acc_t, out_t>{acc_t(p)}, 0);
   }
 }
@@ -53,12 +51,10 @@ void norm_launch_kernel(TensorIterator& iter, double ord) {
   if (iter.dtype(0) == kHalf) {
     return norm_kernel_xpu_impl<at::Half, float>(iter, ord);
   } else if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) {
-    // type promotion that does cast and reduction in a single kernel
     return norm_kernel_xpu_impl<at::Half, float, float>(iter, ord);
   } else if (iter.dtype(0) == kBFloat16) {
     return norm_kernel_xpu_impl<at::BFloat16, float>(iter, ord);
   } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) {
-    // type promotion that does cast and reduction in a single kernel
     return norm_kernel_xpu_impl<at::BFloat16, float, float>(iter, ord);
   }
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.input_dtype(), "norm_xpu", [&] {
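
The dtype-aware selection the reviewer asks for could look like the sketch below. This is an assumption-laden illustration, not the patch's code: norm_vt0 is a hypothetical helper, and the value 4 for fp32/fp64/complex is an assumed default chosen to limit register pressure.

// Sketch only: pick the unroll factor per scalar type, keeping 8 for the
// 16-bit types and an assumed smaller factor for everything else.
#include <type_traits>

#include <c10/util/BFloat16.h>
#include <c10/util/Half.h>

template <typename scalar_t>
constexpr int norm_vt0() {
  if constexpr (
      std::is_same_v<scalar_t, c10::Half> ||
      std::is_same_v<scalar_t, c10::BFloat16>) {
    return 8; // 16-bit inputs: deeper unrolling amortizes load latency.
  }
  return 4; // fp32/fp64/complex: hypothetical default, lower register use.
}

// Inside norm_kernel_xpu_impl this would replace the hard-coded constant:
//   constexpr int vt0 = norm_vt0<scalar_t>();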