Skip to content

Commit ef15401

Browse files
[pre-commit.ci] auto code formatting
1 parent ccb621b commit ef15401

3 files changed

Lines changed: 139 additions & 93 deletions

File tree

include/cuco/detail/reduction_functor_impl.cuh

Lines changed: 38 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -29,46 +29,54 @@ namespace detail {
2929
* @warning This class should not be used directly.
3030
*
3131
*/
32-
class reduction_functor_base {};
32+
class reduction_functor_base {
33+
};
3334

3435
template <typename T, typename Enable = void>
3536
struct reduce_add_impl {
3637
template <cuda::thread_scope Scope>
37-
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
38+
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
39+
{
3840
return lhs.fetch_add(rhs) + rhs;
3941
}
4042
};
4143

4244
template <typename T, typename Enable = void>
4345
struct reduce_min_impl {
4446
template <cuda::thread_scope Scope>
45-
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
47+
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
48+
{
4649
return min(lhs.fetch_min(rhs), rhs);
4750
}
4851
};
4952

5053
template <typename T, typename Enable = void>
5154
struct reduce_max_impl {
5255
template <cuda::thread_scope Scope>
53-
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
56+
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
57+
{
5458
return max(lhs.fetch_max(rhs), rhs);
5559
}
5660
};
5761

5862
template <typename T, typename Enable = void>
5963
struct reduce_count_impl {
6064
template <cuda::thread_scope Scope>
61-
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& /* rhs */) const noexcept {
65+
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& /* rhs */) const noexcept
66+
{
6267
return ++lhs;
6368
}
6469
};
6570

6671
// remove the following WAR once libcu++ extends FP atomics support and fixes signed integer atomics
6772
// https://github.com/NVIDIA/libcudacxx/pull/286
6873
template <typename T>
69-
struct reduce_add_impl<T, typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
74+
struct reduce_add_impl<
75+
T,
76+
typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
7077
template <cuda::thread_scope Scope>
71-
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept {
78+
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
79+
{
7280
if constexpr (Scope == cuda::thread_scope_system)
7381
return atomicAdd_system(reinterpret_cast<T*>(&lhs), rhs) + rhs;
7482
else if constexpr (Scope == cuda::thread_scope_device)
@@ -79,11 +87,15 @@ struct reduce_add_impl<T, typename cuda::std::enable_if<cuda::std::is_floating_p
7987
};
8088

8189
template <typename T>
82-
struct reduce_min_impl<T, typename cuda::std::enable_if<cuda::std::is_integral<T>::value && cuda::std::is_signed<T>::value && sizeof(T) == 8>::type> {
90+
struct reduce_min_impl<
91+
T,
92+
typename cuda::std::enable_if<cuda::std::is_integral<T>::value &&
93+
cuda::std::is_signed<T>::value && sizeof(T) == 8>::type> {
8394
template <cuda::thread_scope Scope>
84-
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
95+
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
96+
{
8597
using InternalT = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
86-
InternalT * ptr = reinterpret_cast<InternalT*>(&lhs);
98+
InternalT* ptr = reinterpret_cast<InternalT*>(&lhs);
8799
InternalT value = rhs;
88100
if constexpr (Scope == cuda::thread_scope_system)
89101
return min(atomicMin_system(ptr, value), value);
@@ -95,11 +107,14 @@ struct reduce_min_impl<T, typename cuda::std::enable_if<cuda::std::is_integral<T
95107
};
96108

97109
template <typename T>
98-
struct reduce_max_impl<T, typename cuda::std::enable_if<cuda::std::is_integral<T>::value && cuda::std::is_signed<T>::value>::type> {
110+
struct reduce_max_impl<T,
111+
typename cuda::std::enable_if<cuda::std::is_integral<T>::value &&
112+
cuda::std::is_signed<T>::value>::type> {
99113
template <cuda::thread_scope Scope>
100-
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
114+
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
115+
{
101116
using InternalT = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
102-
InternalT * ptr = reinterpret_cast<InternalT*>(&lhs);
117+
InternalT* ptr = reinterpret_cast<InternalT*>(&lhs);
103118
InternalT value = rhs;
104119
if constexpr (Scope == cuda::thread_scope_system)
105120
return max(atomicMax_system(ptr, value), value);
@@ -111,18 +126,18 @@ struct reduce_max_impl<T, typename cuda::std::enable_if<cuda::std::is_integral<T
111126
};
112127

113128
template <typename T>
114-
struct reduce_min_impl<T, typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
115-
__device__ T operator()(T lhs, T rhs) const noexcept {
116-
return min(lhs, rhs);
117-
}
129+
struct reduce_min_impl<
130+
T,
131+
typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
132+
__device__ T operator()(T lhs, T rhs) const noexcept { return min(lhs, rhs); }
118133
};
119134

120135
template <typename T>
121-
struct reduce_max_impl<T, typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
122-
__device__ T operator()(T lhs, T rhs) const noexcept {
123-
return max(lhs, rhs);
124-
}
136+
struct reduce_max_impl<
137+
T,
138+
typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
139+
__device__ T operator()(T lhs, T rhs) const noexcept { return max(lhs, rhs); }
125140
};
126141

127-
} // namespace detail
128-
} // namespace cuco
142+
} // namespace detail
143+
} // namespace cuco

include/cuco/reduction_functors.cuh

Lines changed: 55 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class identity_value {
3636
using type = T;
3737
constexpr identity_value(T const& identity) noexcept : identity_(identity) {}
3838
constexpr T value() const noexcept { return identity_; }
39+
3940
private:
4041
T identity_;
4142
};
@@ -64,13 +65,16 @@ class identity_value {
6465
* };
6566
*
6667
* int main() {
67-
* cuco::identity_value<int> identity{0}; // define the identity value for the given reduction operation, i.e., op(identity, x) == x
68+
* cuco::identity_value<int> identity{0}; // define the identity value for the given reduction
69+
* operation, i.e., op(identity, x) == x
6870
*
69-
* auto f1 = cuco::reduction_functor<custom_plus<int>, int>(identity); // synchronized via CAS-loop
70-
* auto f2 = cuco::reduction_functor<custom_plus_sync<int>, int>(identity); // implicitly synchronized
71+
* auto f1 = cuco::reduction_functor<custom_plus<int>, int>(identity); // synchronized via
72+
* CAS-loop auto f2 = cuco::reduction_functor<custom_plus_sync<int>, int>(identity); // implicitly
73+
* synchronized
7174
*
7275
* auto custom_plus_lambda = [] __device__ (int lhs, int rhs) noexcept { return lhs + rhs; };
73-
* auto f3 = cuco::reduction_functor<decltype(custom_plus_lambda), int>(identity, custom_plus_lambda);
76+
* auto f3 = cuco::reduction_functor<decltype(custom_plus_lambda), int>(identity,
77+
* custom_plus_lambda);
7478
* }
7579
* \endcode
7680
*
@@ -82,42 +86,59 @@ class reduction_functor : detail::reduction_functor_base {
8286
public:
8387
using value_type = Value;
8488

85-
reduction_functor(cuco::identity_value<Value> identity, Func functor = Func{}) noexcept : identity_(identity), functor_(functor) {}
89+
reduction_functor(cuco::identity_value<Value> identity, Func functor = Func{}) noexcept
90+
: identity_(identity), functor_(functor)
91+
{
92+
}
8693

8794
template <cuda::thread_scope Scope>
88-
__device__ value_type operator()(cuda::atomic<value_type, Scope>& lhs, value_type const& rhs) const noexcept
95+
__device__ value_type operator()(cuda::atomic<value_type, Scope>& lhs,
96+
value_type const& rhs) const noexcept
8997
{
9098
if constexpr (uses_external_sync()) {
9199
value_type old = lhs.load(cuda::memory_order_relaxed);
92100
value_type desired;
93101

94102
do {
95103
desired = functor_(old, rhs);
96-
} while (!lhs.compare_exchange_weak(old, desired, cuda::memory_order_release, cuda::memory_order_relaxed));
104+
} while (!lhs.compare_exchange_weak(
105+
old, desired, cuda::memory_order_release, cuda::memory_order_relaxed));
97106

98107
return desired;
99108
} else {
100109
return functor_(lhs, rhs);
101110
}
102111
}
103112

104-
__host__ __device__ value_type identity() const noexcept {
105-
return identity_.value();
106-
}
113+
__host__ __device__ value_type identity() const noexcept { return identity_.value(); }
107114

108-
__host__ __device__ static constexpr bool uses_external_sync() noexcept {
115+
__host__ __device__ static constexpr bool uses_external_sync() noexcept
116+
{
109117
return !atomic_invocable_ || naive_invocable_;
110118
}
111119

112120
private:
113121
cuco::identity_value<value_type> identity_;
114122
Func functor_;
115-
static constexpr bool naive_invocable_ = std::is_invocable_r<value_type, Func, value_type, value_type>::value;
123+
static constexpr bool naive_invocable_ =
124+
std::is_invocable_r<value_type, Func, value_type, value_type>::value;
116125
static constexpr bool atomic_invocable_ =
117-
std::is_invocable_r<value_type, Func, cuda::atomic<value_type, cuda::thread_scope_system>&, value_type>::value ||
118-
std::is_invocable_r<value_type, Func, cuda::atomic<value_type, cuda::thread_scope_device>&, value_type>::value ||
119-
std::is_invocable_r<value_type, Func, cuda::atomic<value_type, cuda::thread_scope_block>&, value_type>::value ||
120-
std::is_invocable_r<value_type, Func, cuda::atomic<value_type, cuda::thread_scope_thread>&, value_type>::value;
126+
std::is_invocable_r<value_type,
127+
Func,
128+
cuda::atomic<value_type, cuda::thread_scope_system>&,
129+
value_type>::value ||
130+
std::is_invocable_r<value_type,
131+
Func,
132+
cuda::atomic<value_type, cuda::thread_scope_device>&,
133+
value_type>::value ||
134+
std::is_invocable_r<value_type,
135+
Func,
136+
cuda::atomic<value_type, cuda::thread_scope_block>&,
137+
value_type>::value ||
138+
std::is_invocable_r<value_type,
139+
Func,
140+
cuda::atomic<value_type, cuda::thread_scope_thread>&,
141+
value_type>::value;
121142

122143
static_assert(atomic_invocable_ || naive_invocable_, "Invalid operator signature.");
123144
};
@@ -128,30 +149,44 @@ class reduction_functor : detail::reduction_functor_base {
128149
* @tparam T The value type used for reduction
129150
*/
130151
template <typename T>
131-
auto reduce_add() { return reduction_functor(identity_value<T>{0}, detail::reduce_add_impl<T>{}); };
152+
auto reduce_add()
153+
{
154+
return reduction_functor(identity_value<T>{0}, detail::reduce_add_impl<T>{});
155+
};
132156

133157
/**
134158
* @brief Synchronized `min` reduction functor.
135159
*
136160
* @tparam T The value type used for reduction
137161
*/
138162
template <typename T>
139-
auto reduce_min() { return reduction_functor(identity_value{cuda::std::numeric_limits<T>::max()}, detail::reduce_min_impl<T>{}); };
163+
auto reduce_min()
164+
{
165+
return reduction_functor(identity_value{cuda::std::numeric_limits<T>::max()},
166+
detail::reduce_min_impl<T>{});
167+
};
140168

141169
/**
142170
* @brief Synchronized `max` reduction functor.
143171
*
144172
* @tparam T The value type used for reduction
145173
*/
146174
template <typename T>
147-
auto reduce_max() { return reduction_functor(identity_value{cuda::std::numeric_limits<T>::lowest()}, detail::reduce_max_impl<T>{}); };
175+
auto reduce_max()
176+
{
177+
return reduction_functor(identity_value{cuda::std::numeric_limits<T>::lowest()},
178+
detail::reduce_max_impl<T>{});
179+
};
148180

149181
/**
150182
* @brief Synchronized `count` reduction functor.
151183
*
152184
* @tparam T The value type used for reduction
153185
*/
154186
template <typename T>
155-
auto reduce_count() { return reduction_functor(identity_value<T>{0}, detail::reduce_count_impl<T>{}); };
187+
auto reduce_count()
188+
{
189+
return reduction_functor(identity_value<T>{0}, detail::reduce_count_impl<T>{});
190+
};
156191

157192
} // namespace cuco

0 commit comments

Comments
 (0)