Skip to content

Commit b3cdc4c

Browse files
committed
Add inline docs for public APIs
1 parent 1928649 commit b3cdc4c

7 files changed

Lines changed: 484 additions & 48 deletions

File tree

include/cuco/detail/hyperloglog/finalizer.cuh

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,29 @@
2020
#include <cuda/std/cmath>
2121

2222
namespace cuco::hyperloglog_ns::detail {
23+
24+
/**
25+
* @brief Estimate correction algorithm based on HyperLogLog++.
26+
*
27+
* @note Variable names correspond to the definitions given in the HLL++ paper:
28+
* https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf
29+
*
30+
* @tparam Precision Tuning parameter to trade accuracy for runtime/memory footprint
31+
*/
2332
template <int32_t Precision>
2433
class finalizer {
2534
// this minimum number of registers is required by HLL++
2635
static_assert(Precision >= 4, "Precision must be greater or equal to 4");
2736

2837
public:
38+
/**
39+
* @brief Compute the bias-corrected cardinality estimate.
40+
*
41+
* @param z Geometric mean of registers
42+
* @param v Number of 0 registers
43+
*
44+
* @return Bias-corrected cardinality estimate
45+
*/
2946
__host__ __device__ static double constexpr finalize(double z, int v) noexcept
3047
{
3148
auto e = alpha_mm() / z;
@@ -50,7 +67,7 @@ class finalizer {
5067
}
5168

5269
private:
53-
static auto constexpr m = (1 << Precision);
70+
static auto constexpr m = (1 << Precision); ///< Number of registers
5471

5572
__host__ __device__ static double constexpr alpha_mm() noexcept
5673
{

include/cuco/detail/hyperloglog/hyperloglog.cuh

Lines changed: 156 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -29,20 +29,47 @@
2929
#include <memory>
3030

3131
namespace cuco::detail {
32+
/**
33+
* @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.
34+
*
35+
* @note This class implements the HyperLogLog/HyperLogLog++ algorithm:
36+
* https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf.
37+
* @note The `Precision` parameter can be used to trade runtime/memory footprint for better
38+
* accuracy. A higher value corresponds to a more accurate result, however, setting the precision
39+
 * too high will result in diminishing returns.
40+
*
41+
* @tparam T Type of items to count
42+
* @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
43+
* @tparam Scope The scope in which operations will be performed by individual threads
44+
* @tparam Hash Hash function used to hash items
45+
* @tparam Allocator Type of allocator used for device storage
46+
*/
3247
template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
3348
class hyperloglog {
3449
public:
35-
static constexpr auto thread_scope = Scope; ///< CUDA thread scope
36-
static constexpr auto precision = Precision;
50+
static constexpr auto thread_scope = Scope; ///< CUDA thread scope
51+
static constexpr auto precision = Precision; ///< Precision
3752

3853
template <cuda::thread_scope NewScope = thread_scope>
39-
using ref_type = hyperloglog_ref<T, Precision, NewScope, Hash>;
40-
41-
using allocator_type = Allocator; ///< Allocator type
42-
using storage_type = typename ref_type<>::storage_type;
43-
using storage_allocator_type =
44-
typename std::allocator_traits<Allocator>::template rebind_alloc<storage_type>;
45-
54+
using ref_type = hyperloglog_ref<T, Precision, NewScope, Hash>; ///< Non-owning reference
55+
///< type
56+
57+
using allocator_type = Allocator; ///< Allocator type
58+
using storage_type = typename ref_type<>::storage_type; ///< Storage type
59+
using storage_allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
60+
storage_type>; ///< Storage allocator type
61+
62+
/**
63+
* @brief Constructs a `hyperloglog` host object.
64+
*
65+
* @note This function synchronizes the given stream.
66+
*
67+
* @param hash The hash function used to hash items
68+
* @param alloc Allocator used for allocating device storage
69+
* @param stream CUDA stream used to initialize the object
70+
*/
71+
// Doxygen cannot document unnamed parameter for scope, see
72+
// https://github.com/doxygen/doxygen/issues/6926
4673
constexpr hyperloglog(cuco::cuda_thread_scope<Scope>,
4774
Hash const& hash,
4875
Allocator const& alloc,
@@ -55,24 +82,56 @@ class hyperloglog {
5582
this->clear_async(stream); // TODO async or sync?
5683
}
5784

58-
hyperloglog(hyperloglog const&) = delete;
59-
hyperloglog& operator=(hyperloglog const&) = delete;
60-
hyperloglog(hyperloglog&&) = default;
61-
hyperloglog& operator=(hyperloglog&&) = default;
62-
~hyperloglog() = default;
85+
~hyperloglog() = default;
6386

87+
hyperloglog(hyperloglog const&) = delete;
88+
hyperloglog& operator=(hyperloglog const&) = delete;
89+
hyperloglog(hyperloglog&&) = default; ///< Move constructor
90+
91+
// TODO this is somehow required to pass the Doxygen check.
92+
/**
93+
 * @brief Move-assignment operator.
94+
*
95+
 * @return Reference to `*this` after the move-assignment
96+
*/
97+
hyperloglog& operator=(hyperloglog&&) = default;
98+
99+
/**
100+
* @brief Asynchronously resets the estimator, i.e., clears the current count estimate.
101+
*
102+
* @param stream CUDA stream this operation is executed in
103+
*/
64104
void clear_async(cuco::cuda_stream_ref stream) noexcept
65105
{
66106
auto constexpr block_size = 1024;
67107
cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(this->ref());
68108
}
69109

110+
/**
111+
* @brief Resets the estimator, i.e., clears the current count estimate.
112+
*
113+
* @note This function synchronizes the given stream. For asynchronous execution use
114+
* `clear_async`.
115+
*
116+
* @param stream CUDA stream this operation is executed in
117+
*/
70118
void clear(cuco::cuda_stream_ref stream)
71119
{
72120
this->clear_async(stream);
73121
stream.synchronize();
74122
}
75123

124+
/**
125+
 * @brief Asynchronously adds items to be counted to the estimator.
126+
*
127+
* @tparam InputIt Device accessible random access input iterator where
128+
* <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
129+
* T></tt> is `true`
130+
*
131+
* @param first Beginning of the sequence of items
132+
* @param last End of the sequence of items
133+
* @param stream CUDA stream this operation is executed in
134+
*/
76135
template <class InputIt>
77136
void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept
78137
{
@@ -83,50 +142,117 @@ class hyperloglog {
83142

84143
int grid_size = 0;
85144
int block_size = 0;
86-
// TODO check cuda error?
145+
146+
// We make use of the occupancy calculator here to get the minimum number of blocks which still
147+
// saturate the GPU. This reduces the atomic contention on the final register array during the
148+
// merge phase.
149+
// TODO check cuda error or will it sync the stream??
87150
cudaOccupancyMaxPotentialBlockSize(
88151
&grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_shmem<InputIt, ref_type<>>);
89152

90153
cuco::hyperloglog_ns::detail::add_shmem<<<grid_size, block_size, 0, stream>>>(
91154
first, num_items, this->ref());
92155
}
93156

157+
/**
158+
 * @brief Adds items to be counted to the estimator.
159+
*
160+
* @note This function synchronizes the given stream. For asynchronous execution use
161+
* `add_async`.
162+
*
163+
* @tparam InputIt Device accessible random access input iterator where
164+
* <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
165+
* T></tt> is `true`
166+
*
167+
* @param first Beginning of the sequence of items
168+
* @param last End of the sequence of items
169+
* @param stream CUDA stream this operation is executed in
170+
*/
94171
template <class InputIt>
95172
void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
96173
{
97174
this->add_async(first, last, stream);
98175
stream.synchronize();
99176
}
100177

178+
/**
179+
* @brief Asynchronously merges the result of `other` estimator into `*this` estimator.
180+
*
181+
* @tparam OtherScope Thread scope of `other` estimator
182+
* @tparam OtherAllocator Allocator type of `other` estimator
183+
*
184+
* @param other Other estimator to be merged into `*this`
185+
* @param stream CUDA stream this operation is executed in
186+
*/
101187
template <cuda::thread_scope OtherScope, class OtherAllocator>
102188
void merge_async(hyperloglog<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
103-
cuco::cuda_stream_ref stream = {}) noexcept
189+
cuco::cuda_stream_ref stream) noexcept
104190
{
105191
this->merge_async(other.ref(), stream);
106192
}
107193

194+
/**
195+
* @brief Merges the result of `other` estimator into `*this` estimator.
196+
*
197+
* @note This function synchronizes the given stream. For asynchronous execution use
198+
* `merge_async`.
199+
*
200+
* @tparam OtherScope Thread scope of `other` estimator
201+
* @tparam OtherAllocator Allocator type of `other` estimator
202+
*
203+
* @param other Other estimator to be merged into `*this`
204+
* @param stream CUDA stream this operation is executed in
205+
*/
108206
template <cuda::thread_scope OtherScope, class OtherAllocator>
109207
void merge(hyperloglog<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
110-
cuco::cuda_stream_ref stream = {})
208+
cuco::cuda_stream_ref stream)
111209
{
112210
this->merge_async(other, stream);
113211
stream.synchronize();
114212
}
115213

214+
/**
215+
* @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
216+
*
217+
* @tparam OtherScope Thread scope of `other` estimator
218+
*
219+
* @param other Other estimator reference to be merged into `*this`
220+
* @param stream CUDA stream this operation is executed in
221+
*/
116222
template <cuda::thread_scope OtherScope>
117-
void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {}) noexcept
223+
void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream) noexcept
118224
{
119225
auto constexpr block_size = 1024;
120226
cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, this->ref());
121227
}
122228

229+
/**
230+
* @brief Merges the result of `other` estimator reference into `*this` estimator.
231+
*
232+
* @note This function synchronizes the given stream. For asynchronous execution use
233+
* `merge_async`.
234+
*
235+
* @tparam OtherScope Thread scope of `other` estimator
236+
*
237+
* @param other Other estimator reference to be merged into `*this`
238+
* @param stream CUDA stream this operation is executed in
239+
*/
123240
template <cuda::thread_scope OtherScope>
124-
void merge(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {})
241+
void merge(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream)
125242
{
126243
this->merge_async(other, stream);
127244
stream.synchronize();
128245
}
129246

247+
/**
248+
* @brief Compute the estimated distinct items count.
249+
*
250+
* @note This function synchronizes the given stream.
251+
*
252+
* @param stream CUDA stream this operation is executed in
253+
*
254+
* @return Approximate distinct items count
255+
*/
130256
[[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream) const
131257
{
132258
// TODO remove test code
@@ -167,6 +293,11 @@ class hyperloglog {
167293
return cuco::hyperloglog_ns::detail::finalizer<Precision>::finalize(sum, zeroes);
168294
}
169295

296+
/**
297+
* @brief Get device ref.
298+
*
299+
 * @return Device ref object of the current `hyperloglog` host object
300+
*/
170301
[[nodiscard]] ref_type<> ref() const noexcept
171302
{
172303
return ref_type<>{*(this->storage_.get()), {}, this->hash_};
@@ -185,11 +316,13 @@ class hyperloglog {
185316
storage_allocator_type& allocator;
186317
};
187318

188-
Hash hash_;
189-
storage_allocator_type storage_allocator_;
190-
storage_deleter storage_deleter_;
191-
std::unique_ptr<storage_type, storage_deleter> storage_;
319+
Hash hash_; ///< Hash function used to hash items
320+
storage_allocator_type storage_allocator_; ///< Storage allocator
321+
storage_deleter storage_deleter_; ///< Storage deleter
322+
std::unique_ptr<storage_type, storage_deleter> storage_; ///< Storage
192323

324+
// Needs to be friends with other instantiations of this class template to have access to their
325+
// storage
193326
template <class T_, int32_t Precision_, cuda::thread_scope Scope_, class Hash_, class Allocator_>
194327
friend class hyperloglog;
195328
};

0 commit comments

Comments
 (0)