pytorch · vacu9708 · Jun 8, 2026 · Jun 8, 2026
@@ -7,6 +7,7 @@
  */
 
 #include <cmath>
+#include <type_traits>
 
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
@@ -42,8 +43,16 @@ Tensor& log_softmax_out(
   // Adjust for negative dim
   dim = dim < 0 ? dim + nonzero_dim(in) : dim;
 
+  // For Half/BFloat16 inputs, do the exp-sum in float: a low-precision running
+  // sum of ones stops growing at 256 (BFloat16) or 2048 (Half). Matches ATen's
+  // acc_type behavior. See also op_grid_sampler_2d.cpp.
   ET_SWITCH_FLOATHBF16_TYPES(
       in.scalar_type(), ctx, "_log_softmax.out", CTYPE, [&]() {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
         CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
 
@@ -61,11 +70,12 @@ Tensor& log_softmax_out(
                   size,
                   stride);
 
-              CTYPE temp_sum = apply_unary_map_reduce_fn<CTYPE, CTYPE>(
+              ACC temp_sum = apply_unary_map_reduce_fn<CTYPE, ACC>(
                   [max_in](const CTYPE val_in) {
-                    return std::exp(val_in - max_in);
+                    return std::exp(
+                        static_cast<ACC>(val_in) - static_cast<ACC>(max_in));
                   },
-                  [](const CTYPE mapped_in, CTYPE val_accum) {
+                  [](const ACC mapped_in, ACC val_accum) {
                     return val_accum + mapped_in;
                   },
                   in_data + base,
@@ -75,7 +85,9 @@ Tensor& log_softmax_out(
 
               apply_unary_map_fn(
                   [max_in, temp_sum](const CTYPE val_in) {
-                    return val_in - max_in - temp_sum;
+                    return static_cast<CTYPE>(
+                        static_cast<ACC>(val_in) - static_cast<ACC>(max_in) -
+                        temp_sum);
                   },
                   in_data + base,
                   out_data + base,

@@ -7,6 +7,8 @@
  */
 #include <c10/util/irange.h>
 
+#include <type_traits>
+
 #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
@@ -58,17 +60,24 @@ Tensor& mean_dim_out(
 
       // @lint-ignore CLANGTIDY facebook-hte-CArray
       static constexpr const char op_name[] = "mean.out";
+      // For Half/BFloat16 inputs, accumulate in float so rounding cannot stall
+      // the sum. Matches ATen's acc_type behavior.
       ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* in_data = in.const_data_ptr<CTYPE>();
         CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
-        const CTYPE denom = static_cast<CTYPE>(reduce_size);
+        const ACC denom = static_cast<ACC>(reduce_size);
         for (int64_t i = 0; i < outer_size; i++) {
           const CTYPE* row = in_data + i * reduce_size;
-          CTYPE acc = 0;
+          ACC acc = 0;
           for (int64_t j = 0; j < reduce_size; j++) {
             acc += row[j];
           }
-          out_data[i] = acc / denom;
+          out_data[i] = static_cast<CTYPE>(acc / denom);
         }
       });
       return out;
@@ -83,19 +92,25 @@ Tensor& mean_dim_out(
   static constexpr const char op_name[] = "mean.out";
   ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
     ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+      using ACC = std::conditional_t<
+          std::is_same_v<CTYPE_OUT, executorch::aten::Half> ||
+              std::is_same_v<CTYPE_OUT, executorch::aten::BFloat16>,
+          float,
+          CTYPE_OUT>;
       CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
       const size_t num = get_reduced_dim_product(in, dim_list);
       const bool success = parallel_for_each_reduce_over_dim_list_output_index(
           in, dim_list, out, [&](const auto begin, const auto end) {
             for (const auto out_ix : c10::irange(begin, end)) {
-              CTYPE_OUT sum = 0;
+              ACC sum = 0;
               if (plan.has_value()) {
-                sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
-                    [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
-                    [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                sum = plan->execute<CTYPE_IN, ACC>(
+                    [](CTYPE_IN v) { return static_cast<ACC>(v); },
+                    [](ACC outv, ACC acc) { return acc + outv; },
                     out_ix);
               }
-              out_data[out_ix] = sum / static_cast<float>(num);
+              out_data[out_ix] =
+                  static_cast<CTYPE_OUT>(sum / static_cast<float>(num));
             }
           });
       ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");

@@ -7,6 +7,7 @@
  */
 
 #include <cmath>
+#include <type_traits>
 
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
@@ -42,8 +43,16 @@ Tensor& softmax_out(
   // Adjust for negative dim
   dim = dim < 0 ? dim + nonzero_dim(in) : dim;
 
+  // For Half/BFloat16 inputs, do the exp-sum in float: a low-precision running
+  // sum of ones stops growing at 256 (BFloat16) or 2048 (Half). Matches ATen's
+  // acc_type behavior. See also op_grid_sampler_2d.cpp.
   ET_SWITCH_FLOATHBF16_TYPES(
       in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
         CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
 
@@ -61,11 +70,12 @@ Tensor& softmax_out(
                   size,
                   stride);
 
-              const CTYPE temp_sum = apply_unary_map_reduce_fn<CTYPE, CTYPE>(
+              const ACC temp_sum = apply_unary_map_reduce_fn<CTYPE, ACC>(
                   [max_in](const CTYPE val_in) {
-                    return std::exp(val_in - max_in);
+                    return std::exp(
+                        static_cast<ACC>(val_in) - static_cast<ACC>(max_in));
                   },
-                  [](const CTYPE mapped_in, CTYPE val_accum) {
+                  [](const ACC mapped_in, ACC val_accum) {
                     return val_accum + mapped_in;
                   },
                   in_data + base,
@@ -74,7 +84,11 @@ Tensor& softmax_out(
 
               apply_unary_map_fn(
                   [max_in, temp_sum](const CTYPE val_in) {
-                    return std::exp(val_in - max_in) / temp_sum;
+                    return static_cast<CTYPE>(
+                        std::exp(
+                            static_cast<ACC>(val_in) -
+                            static_cast<ACC>(max_in)) /
+                        temp_sum);
                   },
                   in_data + base,
                   out_data + base,

@@ -7,6 +7,8 @@
  */
 #include <c10/util/irange.h>
 
+#include <type_traits>
+
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
@@ -60,16 +62,23 @@ Tensor& sum_dim_out(
 
       // @lint-ignore CLANGTIDY facebook-hte-CArray
       static constexpr const char op_name[] = "sum.IntList_out";
+      // For Half/BFloat16 inputs, accumulate in float so rounding cannot stall
+      // the sum. Matches ATen's acc_type behavior. See op_grid_sampler_2d.cpp.
       ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* in_data = in.const_data_ptr<CTYPE>();
         CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
         for (int64_t i = 0; i < outer_size; i++) {
           const CTYPE* row = in_data + i * reduce_size;
-          CTYPE acc = 0;
+          ACC acc = 0;
           for (int64_t j = 0; j < reduce_size; j++) {
             acc += row[j];
           }
-          out_data[i] = acc;
+          out_data[i] = static_cast<CTYPE>(acc);
         }
       });
       return out;
@@ -108,23 +117,24 @@ Tensor& sum_dim_out(
     ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
       ET_SWITCH_REALHBBF16_TYPES(
           out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+            using ACC = std::conditional_t<
+                std::is_same_v<CTYPE_OUT, executorch::aten::Half> ||
+                    std::is_same_v<CTYPE_OUT, executorch::aten::BFloat16>,
+                float,
+                CTYPE_OUT>;
             CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
             const bool success =
                 parallel_for_each_reduce_over_dim_list_output_index(
                     in, dim_list, out, [&](const auto begin, const auto end) {
                       for (const auto out_ix : c10::irange(begin, end)) {
-                        CTYPE_OUT sum = 0;
+                        ACC sum = 0;
                         if (plan.has_value()) {
-                          sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
-                              [](CTYPE_IN v) {
-                                return static_cast<CTYPE_OUT>(v);
-                              },
-                              [](CTYPE_OUT outv, CTYPE_OUT acc) {
-                                return acc + outv;
-                              },
+                          sum = plan->execute<CTYPE_IN, ACC>(
+                              [](CTYPE_IN v) { return static_cast<ACC>(v); },
+                              [](ACC outv, ACC acc) { return acc + outv; },
                               out_ix);
                         }
-                        out_data[out_ix] = sum;
+                        out_data[out_ix] = static_cast<CTYPE_OUT>(sum);
                       }
                     });
             ET_KERNEL_CHECK_MSG(

@@ -369,6 +369,19 @@ TEST_F(OpLogSoftmaxOutTest, SimpleGeneratedCase) {
   EXPECT_TENSOR_CLOSE(out, expected_result);
 }
 
+TEST_F(OpLogSoftmaxOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // Regression: log_softmax over N=512 zeros must give -log(512) everywhere.
+  // Summing 512 exp(0)=1 terms in BFloat16 stalls at 256 (256 + 1 rounds back
+  // to 256), giving -log(256); that log(2) ~= 0.69 error exceeds atol=1e-1.
+  constexpr int N = 512;
+  Tensor x = tf.zeros({1, N});
+  Tensor out = tf.zeros({1, N});
+  op_log_softmax_out(x, /*dim=*/1, /*half_to_float=*/false, out);
+  Tensor expected = tf.full({1, N}, -std::log(static_cast<float>(N)));
+  EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, /*rtol=*/1e-5, /*atol=*/1e-1);
+}
+
 TEST_F(OpLogSoftmaxOutTest, DynamicShapeUpperBoundSameAsExpected) {
   TensorFactory<ScalarType::Float> tf;
 

@@ -263,6 +263,35 @@ void OpMeanOutTest::
   test_mean_dim_out_bool<ScalarType::Double>();
 }
 
+TEST_F(OpMeanOutTest, BFloat16GenericPathAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // Mean of 512 ones over dim=0 of a {512, 1} tensor must be 1.0.
+  // dim 0 is not the last dim, which is meant to select the generic reduction
+  // path (TODO confirm). A BFloat16 running sum stalls at 256 -> mean 0.5.
+  constexpr int N = 512;
+  Tensor x = tf.ones({N, 1});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 0;
+  op_mean_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, 1.0f);
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
+TEST_F(OpMeanOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // Mean of 512 ones over the last dim must be 1.0. A BFloat16 running sum
+  // stalls at 256 (256 + 1 rounds back to 256), which would yield 0.5.
+  constexpr int N = 512;
+  Tensor x = tf.ones({1, N});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 1;
+  op_mean_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, 1.0f);
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
 TEST_F(OpMeanOutTest, InvalidDimensionListDies) {
   ET_SKIP_IF(
       torch::executor::testing::SupportedFeatures::get()->is_aten,

@@ -251,6 +251,19 @@ TEST_F(OpSoftmaxOutTest, SimpleGeneratedCase) {
   EXPECT_TENSOR_CLOSE(out, expected_result);
 }
 
+TEST_F(OpSoftmaxOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // Regression: softmax over N=512 zeros must give 1/512 for every element.
+  // A BFloat16 exp-sum stalls at 256 (256 + 1 rounds back to 256) -> 1/256.
+  // atol=1e-3 is below that error: |1/256 - 1/512| = 1/512 ~= 0.00195.
+  constexpr int N = 512;
+  Tensor x = tf.zeros({1, N});
+  Tensor out = tf.zeros({1, N});
+  op_softmax_out(x, /*dim=*/1, /*half_to_float=*/false, out);
+  Tensor expected = tf.full({1, N}, 1.0f / N);
+  EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, /*rtol=*/1e-5, /*atol=*/1e-3);
+}
+
 TEST_F(OpSoftmaxOutTest, DynamicShapeUpperBoundSameAsExpected) {
   TensorFactory<ScalarType::Float> tf;
 

@@ -307,6 +307,35 @@ class OpSumOutTest : public OperatorTest {
   }
 };
 
+TEST_F(OpSumOutTest, BFloat16GenericPathAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // Sum of 512 ones over dim=0 of {512, 1} must be 512 (= 2^9, exact in
+  // BFloat16). dim 0 is not the last dim, which is meant to select the generic
+  // reduction path (TODO confirm). A BFloat16 running sum stalls at 256.
+  constexpr int N = 512;
+  Tensor x = tf.ones({N, 1});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 0;
+  op_sum_intlist_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, static_cast<float>(N));
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
+TEST_F(OpSumOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // Sum of 512 ones over the last dim must be 512. A BFloat16 running sum
+  // stalls at 256, because 256 + 1 rounds back to 256 (8-bit significand).
+  constexpr int N = 512;
+  Tensor x = tf.ones({1, N});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 1;
+  op_sum_intlist_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, static_cast<float>(N));
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
 TEST_F(OpSumOutTest, InvalidDimensionListDies) {
   ET_SKIP_IF(
       torch::executor::testing::SupportedFeatures::get()->is_aten,