@@ -687,7 +687,16 @@ class fork_union_gt {
687687 float const *const begin_ = nullptr ;
688688 float const *const end_ = nullptr ;
689689 pool_t pool_;
690- std::vector<double > sums_;
690+
691+ /* *
692+ * Per-thread output slot: make sure different threads never output to the same cache lines.
693+ * Each slot is over-aligned to a fixed size of 128 bytes (see `alignas` below), which
694+ * should be enough to avoid false sharing. NOTE(review): `std::max_align_t` was also suggested here, but its alignment is typically only 8-16 bytes, presumably less than a cache line - confirm before using it instead.
695+ */
696+ struct alignas (128 ) thread_result_t { // `sizeof` is a multiple of the alignment, so consecutive vector elements start 128 bytes apart
697+ double partial_sum = 0 ; // sum produced by one thread; starts at zero
698+ };
699+ std::vector<thread_result_t > sums_;
691700
692701 public:
693702 fork_union_gt () = default;
@@ -699,11 +708,14 @@ class fork_union_gt {
699708
700709 double operator ()() { // Returns the total of the per-thread partial sums computed over [begin_, end_).
701710 auto const input_size = static_cast <std::size_t >(end_ - begin_); // number of floats in the [begin_, end_) range
702- pool_.for_each_slice (input_size, [this ](pool_t ::task_t first_task, std::size_t slice_length) noexcept {
703- auto const slice_begin = begin_ + first_task.task_index ;
704- sums_[first_task.thread_index ] = serial_at {slice_begin, slice_begin + slice_length}();
711+ auto const chunk_size = scalars_per_core (input_size, sums_.size ()); // presumably the slice length per thread; `scalars_per_core` is defined outside this view - confirm
712+ pool_.for_each_thread ([&](std::size_t thread_id) noexcept { // assumes the pool calls this once per thread with a distinct thread_id - TODO confirm against pool_t
713+ std::size_t const start = std::min (thread_id * chunk_size, input_size); // clamped: a thread whose slice would begin past the input gets start == input_size
714+ std::size_t const stop = std::min (start + chunk_size, input_size); // clamped: the last slice ends at the input end, possibly empty
715+ sums_[thread_id].partial_sum = serial_at {begin_ + start, begin_ + stop}(); // writes only this thread's slot; assumes thread_id < sums_.size () - sizing is not visible here
705716 });
706- return std::accumulate (sums_.begin (), sums_.end (), 0.0 );
717+ return std::accumulate (sums_.begin (), sums_.end (), 0.0 , // fold over all slots, seeded with 0.0 so the accumulator is a double
718+ [](double const &a, thread_result_t const &b) { return a + b.partial_sum ; }); // custom fold: elements are thread_result_t, so the field is extracted explicitly
707719 }
708720};
709721
0 commit comments