Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions kernels/portable/cpu/op_log_softmax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*/

#include <cmath>
#include <type_traits>

#include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
#include <executorch/kernels/portable/cpu/util/functional_util.h>
Expand Down Expand Up @@ -42,8 +43,16 @@ Tensor& log_softmax_out(
// Adjust for negative dim
dim = dim < 0 ? dim + nonzero_dim(in) : dim;

// For half-precision inputs, the exp-sum is accumulated in float. Terms of
// magnitude up to 1 stop registering once a half-precision running sum
// reaches about 256 (BFloat16) or about 2048 (Half), so the total would
// stall there. Matches ATen's acc_type behavior. See also
// op_grid_sampler_2d.cpp.
ET_SWITCH_FLOATHBF16_TYPES(
in.scalar_type(), ctx, "_log_softmax.out", CTYPE, [&]() {
using ACC = std::conditional_t<
std::is_same_v<CTYPE, executorch::aten::Half> ||
std::is_same_v<CTYPE, executorch::aten::BFloat16>,
float,
CTYPE>;
const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();

Expand All @@ -61,11 +70,12 @@ Tensor& log_softmax_out(
size,
stride);

CTYPE temp_sum = apply_unary_map_reduce_fn<CTYPE, CTYPE>(
ACC temp_sum = apply_unary_map_reduce_fn<CTYPE, ACC>(
[max_in](const CTYPE val_in) {
return std::exp(val_in - max_in);
return std::exp(
static_cast<ACC>(val_in) - static_cast<ACC>(max_in));
},
[](const CTYPE mapped_in, CTYPE val_accum) {
[](const ACC mapped_in, ACC val_accum) {
return val_accum + mapped_in;
},
in_data + base,
Expand All @@ -75,7 +85,9 @@ Tensor& log_softmax_out(

apply_unary_map_fn(
[max_in, temp_sum](const CTYPE val_in) {
return val_in - max_in - temp_sum;
return static_cast<CTYPE>(
static_cast<ACC>(val_in) - static_cast<ACC>(max_in) -
temp_sum);
},
in_data + base,
out_data + base,
Expand Down
31 changes: 23 additions & 8 deletions kernels/portable/cpu/op_mean.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
*/
#include <c10/util/irange.h>

#include <type_traits>

#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
Expand Down Expand Up @@ -58,17 +60,24 @@ Tensor& mean_dim_out(

// @lint-ignore CLANGTIDY facebook-hte-CArray
static constexpr const char op_name[] = "mean.out";
// For half-precision inputs, accumulate in float to avoid saturation.
// Matches ATen's acc_type behavior.
ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
using ACC = std::conditional_t<
std::is_same_v<CTYPE, executorch::aten::Half> ||
std::is_same_v<CTYPE, executorch::aten::BFloat16>,
float,
CTYPE>;
const CTYPE* in_data = in.const_data_ptr<CTYPE>();
CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
const CTYPE denom = static_cast<CTYPE>(reduce_size);
const ACC denom = static_cast<ACC>(reduce_size);
for (int64_t i = 0; i < outer_size; i++) {
const CTYPE* row = in_data + i * reduce_size;
CTYPE acc = 0;
ACC acc = 0;
for (int64_t j = 0; j < reduce_size; j++) {
acc += row[j];
}
out_data[i] = acc / denom;
out_data[i] = static_cast<CTYPE>(acc / denom);
}
});
return out;
Expand All @@ -83,19 +92,25 @@ Tensor& mean_dim_out(
static constexpr const char op_name[] = "mean.out";
ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
using ACC = std::conditional_t<
std::is_same_v<CTYPE_OUT, executorch::aten::Half> ||
std::is_same_v<CTYPE_OUT, executorch::aten::BFloat16>,
float,
CTYPE_OUT>;
CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
const size_t num = get_reduced_dim_product(in, dim_list);
const bool success = parallel_for_each_reduce_over_dim_list_output_index(
in, dim_list, out, [&](const auto begin, const auto end) {
for (const auto out_ix : c10::irange(begin, end)) {
CTYPE_OUT sum = 0;
ACC sum = 0;
if (plan.has_value()) {
sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
[](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
[](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
sum = plan->execute<CTYPE_IN, ACC>(
[](CTYPE_IN v) { return static_cast<ACC>(v); },
[](ACC outv, ACC acc) { return acc + outv; },
out_ix);
}
out_data[out_ix] = sum / static_cast<float>(num);
out_data[out_ix] =
static_cast<CTYPE_OUT>(sum / static_cast<float>(num));
}
});
ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
Expand Down
22 changes: 18 additions & 4 deletions kernels/portable/cpu/op_softmax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*/

#include <cmath>
#include <type_traits>

#include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
#include <executorch/kernels/portable/cpu/util/functional_util.h>
Expand Down Expand Up @@ -42,8 +43,16 @@ Tensor& softmax_out(
// Adjust for negative dim
dim = dim < 0 ? dim + nonzero_dim(in) : dim;

// For half-precision inputs, the exp-sum is accumulated in float. Terms of
// magnitude up to 1 stop registering once a half-precision running sum
// reaches about 256 (BFloat16) or about 2048 (Half), so the total would
// stall there. Matches ATen's acc_type behavior. See also
// op_grid_sampler_2d.cpp.
ET_SWITCH_FLOATHBF16_TYPES(
in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() {
using ACC = std::conditional_t<
std::is_same_v<CTYPE, executorch::aten::Half> ||
std::is_same_v<CTYPE, executorch::aten::BFloat16>,
float,
CTYPE>;
const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();

Expand All @@ -61,11 +70,12 @@ Tensor& softmax_out(
size,
stride);

const CTYPE temp_sum = apply_unary_map_reduce_fn<CTYPE, CTYPE>(
const ACC temp_sum = apply_unary_map_reduce_fn<CTYPE, ACC>(
[max_in](const CTYPE val_in) {
return std::exp(val_in - max_in);
return std::exp(
static_cast<ACC>(val_in) - static_cast<ACC>(max_in));
},
[](const CTYPE mapped_in, CTYPE val_accum) {
[](const ACC mapped_in, ACC val_accum) {
return val_accum + mapped_in;
},
in_data + base,
Expand All @@ -74,7 +84,11 @@ Tensor& softmax_out(

apply_unary_map_fn(
[max_in, temp_sum](const CTYPE val_in) {
return std::exp(val_in - max_in) / temp_sum;
return static_cast<CTYPE>(
std::exp(
static_cast<ACC>(val_in) -
static_cast<ACC>(max_in)) /
temp_sum);
},
in_data + base,
out_data + base,
Expand Down
32 changes: 21 additions & 11 deletions kernels/portable/cpu/op_sum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
*/
#include <c10/util/irange.h>

#include <type_traits>

#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>
Expand Down Expand Up @@ -60,16 +62,23 @@ Tensor& sum_dim_out(

// @lint-ignore CLANGTIDY facebook-hte-CArray
static constexpr const char op_name[] = "sum.IntList_out";
// For half-precision inputs, accumulate in float to avoid saturation.
// Matches ATen's acc_type behavior. See also op_grid_sampler_2d.cpp.
ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
using ACC = std::conditional_t<
std::is_same_v<CTYPE, executorch::aten::Half> ||
std::is_same_v<CTYPE, executorch::aten::BFloat16>,
float,
CTYPE>;
const CTYPE* in_data = in.const_data_ptr<CTYPE>();
CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
for (int64_t i = 0; i < outer_size; i++) {
const CTYPE* row = in_data + i * reduce_size;
CTYPE acc = 0;
ACC acc = 0;
for (int64_t j = 0; j < reduce_size; j++) {
acc += row[j];
}
out_data[i] = acc;
out_data[i] = static_cast<CTYPE>(acc);
}
});
return out;
Expand Down Expand Up @@ -108,23 +117,24 @@ Tensor& sum_dim_out(
ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
ET_SWITCH_REALHBBF16_TYPES(
out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
using ACC = std::conditional_t<
std::is_same_v<CTYPE_OUT, executorch::aten::Half> ||
std::is_same_v<CTYPE_OUT, executorch::aten::BFloat16>,
float,
CTYPE_OUT>;
CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
const bool success =
parallel_for_each_reduce_over_dim_list_output_index(
in, dim_list, out, [&](const auto begin, const auto end) {
for (const auto out_ix : c10::irange(begin, end)) {
CTYPE_OUT sum = 0;
ACC sum = 0;
if (plan.has_value()) {
sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
[](CTYPE_IN v) {
return static_cast<CTYPE_OUT>(v);
},
[](CTYPE_OUT outv, CTYPE_OUT acc) {
return acc + outv;
},
sum = plan->execute<CTYPE_IN, ACC>(
[](CTYPE_IN v) { return static_cast<ACC>(v); },
[](ACC outv, ACC acc) { return acc + outv; },
out_ix);
}
out_data[out_ix] = sum;
out_data[out_ix] = static_cast<CTYPE_OUT>(sum);
}
});
ET_KERNEL_CHECK_MSG(
Expand Down
13 changes: 13 additions & 0 deletions kernels/test/op_log_softmax_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,19 @@ TEST_F(OpLogSoftmaxOutTest, SimpleGeneratedCase) {
EXPECT_TENSOR_CLOSE(out, expected_result);
}

TEST_F(OpLogSoftmaxOutTest, BFloat16LargeDimAccumulatesInFloat) {
  // Regression check for the accumulator width used on BFloat16 input.
  // All logits are zero, so every one of the kRowLen outputs should equal
  // -log(kRowLen). Were the exp-sum accumulated in BFloat16 it would stall
  // around 256 and the outputs would be about -log(256) instead. That error,
  // |log(512) - log(256)| = log(2), is well above the atol of 1e-1 below.
  constexpr int kRowLen = 512;
  TensorFactory<ScalarType::BFloat16> factory;

  Tensor logits = factory.zeros({1, kRowLen});
  Tensor result = factory.zeros({1, kRowLen});
  op_log_softmax_out(logits, /*dim=*/1, /*half_to_float=*/false, result);

  Tensor want =
      factory.full({1, kRowLen}, -std::log(static_cast<float>(kRowLen)));
  EXPECT_TENSOR_CLOSE_WITH_TOL(result, want, /*rtol=*/1e-5, /*atol=*/1e-1);
}

TEST_F(OpLogSoftmaxOutTest, DynamicShapeUpperBoundSameAsExpected) {
TensorFactory<ScalarType::Float> tf;

Expand Down
29 changes: 29 additions & 0 deletions kernels/test/op_mean_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,35 @@ void OpMeanOutTest::
test_mean_dim_out_bool<ScalarType::Double>();
}

TEST_F(OpMeanOutTest, BFloat16GenericPathAccumulatesInFloat) {
  // The input is a {kCount, 1} column of ones reduced over dim 0. Because
  // that is not the last dimension, this is intended to exercise the generic
  // reduction path. With a BFloat16 accumulator the sum would stall around
  // 256 and the mean would be 256/512 = 0.5; the correct mean is 1.0.
  constexpr int kCount = 512;
  TensorFactory<ScalarType::BFloat16> factory;

  Tensor ones_column = factory.ones({kCount, 1});
  Tensor result = factory.zeros({1});
  Tensor want = factory.full({1}, 1.0f);

  int64_t reduce_dim = 0;
  op_mean_out(
      ones_column,
      ArrayRef<int64_t>{&reduce_dim, 1},
      /*keepdim=*/false,
      /*dtype=*/{},
      result);

  EXPECT_TENSOR_CLOSE(result, want);
}

TEST_F(OpMeanOutTest, BFloat16LargeDimAccumulatesInFloat) {
  // The input is a {1, kCount} row of ones reduced over dim 1. With a
  // BFloat16 accumulator the sum would stall around 256 and the mean would
  // be 256/512 = 0.5; the correct mean is 1.0.
  constexpr int kCount = 512;
  TensorFactory<ScalarType::BFloat16> factory;

  Tensor ones_row = factory.ones({1, kCount});
  Tensor result = factory.zeros({1});
  Tensor want = factory.full({1}, 1.0f);

  int64_t reduce_dim = 1;
  op_mean_out(
      ones_row,
      ArrayRef<int64_t>{&reduce_dim, 1},
      /*keepdim=*/false,
      /*dtype=*/{},
      result);

  EXPECT_TENSOR_CLOSE(result, want);
}

TEST_F(OpMeanOutTest, InvalidDimensionListDies) {
ET_SKIP_IF(
torch::executor::testing::SupportedFeatures::get()->is_aten,
Expand Down
13 changes: 13 additions & 0 deletions kernels/test/op_softmax_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,19 @@ TEST_F(OpSoftmaxOutTest, SimpleGeneratedCase) {
EXPECT_TENSOR_CLOSE(out, expected_result);
}

TEST_F(OpSoftmaxOutTest, BFloat16LargeDimAccumulatesInFloat) {
  // Regression check for the accumulator width used on BFloat16 input.
  // All logits are zero, so every one of the kRowLen outputs should equal
  // 1/kRowLen. Were the exp-sum accumulated in BFloat16 it would stall
  // around 256 and the outputs would be about 1/256 instead. That error,
  // |1/256 - 1/512| ≈ 0.00195, exceeds the atol of 1e-3 below.
  constexpr int kRowLen = 512;
  TensorFactory<ScalarType::BFloat16> factory;

  Tensor logits = factory.zeros({1, kRowLen});
  Tensor result = factory.zeros({1, kRowLen});
  op_softmax_out(logits, /*dim=*/1, /*half_to_float=*/false, result);

  Tensor want = factory.full({1, kRowLen}, 1.0f / kRowLen);
  EXPECT_TENSOR_CLOSE_WITH_TOL(result, want, /*rtol=*/1e-5, /*atol=*/1e-3);
}

TEST_F(OpSoftmaxOutTest, DynamicShapeUpperBoundSameAsExpected) {
TensorFactory<ScalarType::Float> tf;

Expand Down
29 changes: 29 additions & 0 deletions kernels/test/op_sum_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,35 @@ class OpSumOutTest : public OperatorTest {
}
};

TEST_F(OpSumOutTest, BFloat16GenericPathAccumulatesInFloat) {
  // The input is a {kCount, 1} column of ones reduced over dim 0. Because
  // that is not the last dimension, this is intended to exercise the generic
  // reduction path. With a BFloat16 accumulator the sum would stall around
  // 256. The correct result, 512 = 2^9, is exactly representable in
  // BFloat16.
  constexpr int kCount = 512;
  TensorFactory<ScalarType::BFloat16> factory;

  Tensor ones_column = factory.ones({kCount, 1});
  Tensor result = factory.zeros({1});
  Tensor want = factory.full({1}, static_cast<float>(kCount));

  int64_t reduce_dim = 0;
  op_sum_intlist_out(
      ones_column,
      ArrayRef<int64_t>{&reduce_dim, 1},
      /*keepdim=*/false,
      /*dtype=*/{},
      result);

  EXPECT_TENSOR_CLOSE(result, want);
}

TEST_F(OpSumOutTest, BFloat16LargeDimAccumulatesInFloat) {
  // The input is a {1, kCount} row of ones reduced over dim 1. With a
  // BFloat16 accumulator the sum would stall around 256 rather than reaching
  // the correct total of 512.
  constexpr int kCount = 512;
  TensorFactory<ScalarType::BFloat16> factory;

  Tensor ones_row = factory.ones({1, kCount});
  Tensor result = factory.zeros({1});
  Tensor want = factory.full({1}, static_cast<float>(kCount));

  int64_t reduce_dim = 1;
  op_sum_intlist_out(
      ones_row,
      ArrayRef<int64_t>{&reduce_dim, 1},
      /*keepdim=*/false,
      /*dtype=*/{},
      result);

  EXPECT_TENSOR_CLOSE(result, want);
}

TEST_F(OpSumOutTest, InvalidDimensionListDies) {
ET_SKIP_IF(
torch::executor::testing::SupportedFeatures::get()->is_aten,
Expand Down
Loading