Skip to content

Commit c2b3284

Browse files
committed
Improve: Over-align partial sums to avoid false-sharing
1 parent 1259361 commit c2b3284

1 file changed

Lines changed: 17 additions & 5 deletions

File tree

reduce_cpu.hpp

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -687,7 +687,16 @@ class fork_union_gt {
687687
float const *const begin_ = nullptr;
688688
float const *const end_ = nullptr;
689689
pool_t pool_;
690-
std::vector<double> sums_;
690+
691+
/**
692+
* Make sure different threads never output to the same cache lines.
693+
* Over-aligning to `std::hardware_destructive_interference_size` (not `std::max_align_t`, which is typically only 16 bytes) or a fixed size of 128 bytes
694+
* should be enough to avoid false sharing.
695+
*/
696+
struct alignas(128) thread_result_t {
697+
double partial_sum = 0;
698+
};
699+
std::vector<thread_result_t> sums_;
691700

692701
public:
693702
fork_union_gt() = default;
@@ -699,11 +708,14 @@ class fork_union_gt {
699708

700709
double operator()() {
701710
auto const input_size = static_cast<std::size_t>(end_ - begin_);
702-
pool_.for_each_slice(input_size, [this](pool_t::task_t first_task, std::size_t slice_length) noexcept {
703-
auto const slice_begin = begin_ + first_task.task_index;
704-
sums_[first_task.thread_index] = serial_at {slice_begin, slice_begin + slice_length}();
711+
auto const chunk_size = scalars_per_core(input_size, sums_.size());
712+
pool_.for_each_thread([&](std::size_t thread_id) noexcept {
713+
std::size_t const start = std::min(thread_id * chunk_size, input_size);
714+
std::size_t const stop = std::min(start + chunk_size, input_size);
715+
sums_[thread_id].partial_sum = serial_at {begin_ + start, begin_ + stop}();
705716
});
706-
return std::accumulate(sums_.begin(), sums_.end(), 0.0);
717+
return std::accumulate(sums_.begin(), sums_.end(), 0.0,
718+
[](double const &a, thread_result_t const &b) { return a + b.partial_sum; });
707719
}
708720
};
709721

0 commit comments

Comments
 (0)