2929#include < memory>
3030
3131namespace cuco ::detail {
32+ /* *
33+ * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.
34+ *
35+ * @note This class implements the HyperLogLog/HyperLogLog++ algorithm:
36+ * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf.
37+ * @note The `Precision` parameter can be used to trade runtime/memory footprint for better
38+ * accuracy. A higher value corresponds to a more accurate result, however, setting the precision
39+ * too high will result in diminishing returns.
40+ *
41+ * @tparam T Type of items to count
42+ * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
43+ * @tparam Scope The scope in which operations will be performed by individual threads
44+ * @tparam Hash Hash function used to hash items
45+ * @tparam Allocator Type of allocator used for device storage
46+ */
3247template <class T , int32_t Precision, cuda::thread_scope Scope, class Hash , class Allocator >
3348class hyperloglog {
3449 public:
35- static constexpr auto thread_scope = Scope; // /< CUDA thread scope
36- static constexpr auto precision = Precision;
50+ static constexpr auto thread_scope = Scope; // /< CUDA thread scope
51+ static constexpr auto precision = Precision; // /< Precision
3752
3853 template <cuda::thread_scope NewScope = thread_scope>
39- using ref_type = hyperloglog_ref<T, Precision, NewScope, Hash>;
40-
41- using allocator_type = Allocator; // /< Allocator type
42- using storage_type = typename ref_type<>::storage_type;
43- using storage_allocator_type =
44- typename std::allocator_traits<Allocator>::template rebind_alloc<storage_type>;
45-
54+ using ref_type = hyperloglog_ref<T, Precision, NewScope, Hash>; // /< Non-owning reference
55+ // /< type
56+
57+ using allocator_type = Allocator; // /< Allocator type
58+ using storage_type = typename ref_type<>::storage_type; // /< Storage type
59+ using storage_allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
60+ storage_type>; // /< Storage allocator type
61+
62+ /* *
63+ * @brief Constructs a `hyperloglog` host object.
64+ *
65+ * @note This function synchronizes the given stream.
66+ *
67+ * @param hash The hash function used to hash items
68+ * @param alloc Allocator used for allocating device storage
69+ * @param stream CUDA stream used to initialize the object
70+ */
71+ // Doxygen cannot document unnamed parameter for scope, see
72+ // https://github.com/doxygen/doxygen/issues/6926
4673 constexpr hyperloglog (cuco::cuda_thread_scope<Scope>,
4774 Hash const & hash,
4875 Allocator const & alloc,
@@ -55,24 +82,56 @@ class hyperloglog {
5582 this ->clear_async (stream); // TODO async or sync?
5683 }
5784
58- hyperloglog (hyperloglog const &) = delete ;
59- hyperloglog& operator =(hyperloglog const &) = delete ;
60- hyperloglog (hyperloglog&&) = default ;
61- hyperloglog& operator =(hyperloglog&&) = default ;
62- ~hyperloglog () = default ;
85+ ~hyperloglog () = default ;
6386
87+ hyperloglog (hyperloglog const &) = delete ;
88+ hyperloglog& operator =(hyperloglog const &) = delete ;
89+ hyperloglog (hyperloglog&&) = default ; // /< Move constructor
90+
91+ // TODO this is somehow required to pass the Doxygen check.
92+ /* *
93+ * @brief Copy-assignment operator.
94+ *
95+ * @return Copy of `*this`
96+ */
97+ hyperloglog& operator =(hyperloglog&&) = default ;
98+
99+ /* *
100+ * @brief Asynchronously resets the estimator, i.e., clears the current count estimate.
101+ *
102+ * @param stream CUDA stream this operation is executed in
103+ */
64104 void clear_async (cuco::cuda_stream_ref stream) noexcept
65105 {
66106 auto constexpr block_size = 1024 ;
67107 cuco::hyperloglog_ns::detail::clear<<<1 , block_size, 0 , stream>>> (this ->ref ());
68108 }
69109
110+ /* *
111+ * @brief Resets the estimator, i.e., clears the current count estimate.
112+ *
113+ * @note This function synchronizes the given stream. For asynchronous execution use
114+ * `clear_async`.
115+ *
116+ * @param stream CUDA stream this operation is executed in
117+ */
70118 void clear (cuco::cuda_stream_ref stream)
71119 {
72120 this ->clear_async (stream);
73121 stream.synchronize ();
74122 }
75123
124+ /* *
125+ * @brief Asynchronously adds the items to be counted to the estimator.
126+ *
127+ * @tparam InputIt Device accessible random access input iterator where
128+ * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
129+ * T></tt> is `true`
130+ *
131+ * @param first Beginning of the sequence of items
132+ * @param last End of the sequence of items
133+ * @param stream CUDA stream this operation is executed in
134+ */
76135 template <class InputIt >
77136 void add_async (InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept
78137 {
@@ -83,50 +142,117 @@ class hyperloglog {
83142
84143 int grid_size = 0 ;
85144 int block_size = 0 ;
86- // TODO check cuda error?
145+
146+ // We make use of the occupancy calculator here to get the minimum number of blocks which still
147+ // saturate the GPU. This reduces the atomic contention on the final register array during the
148+ // merge phase.
149+ // TODO check cuda error or will it sync the stream??
87150 cudaOccupancyMaxPotentialBlockSize (
88151 &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_shmem<InputIt, ref_type<>>);
89152
90153 cuco::hyperloglog_ns::detail::add_shmem<<<grid_size, block_size, 0 , stream>>> (
91154 first, num_items, this ->ref ());
92155 }
93156
157+ /* *
158+ * @brief Adds the items to be counted to the estimator.
159+ *
160+ * @note This function synchronizes the given stream. For asynchronous execution use
161+ * `add_async`.
162+ *
163+ * @tparam InputIt Device accessible random access input iterator where
164+ * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
165+ * T></tt> is `true`
166+ *
167+ * @param first Beginning of the sequence of items
168+ * @param last End of the sequence of items
169+ * @param stream CUDA stream this operation is executed in
170+ */
94171 template <class InputIt >
95172 void add (InputIt first, InputIt last, cuco::cuda_stream_ref stream)
96173 {
97174 this ->add_async (first, last, stream);
98175 stream.synchronize ();
99176 }
100177
178+ /* *
179+ * @brief Asynchronously merges the result of `other` estimator into `*this` estimator.
180+ *
181+ * @tparam OtherScope Thread scope of `other` estimator
182+ * @tparam OtherAllocator Allocator type of `other` estimator
183+ *
184+ * @param other Other estimator to be merged into `*this`
185+ * @param stream CUDA stream this operation is executed in
186+ */
101187 template <cuda::thread_scope OtherScope, class OtherAllocator >
102188 void merge_async (hyperloglog<T, Precision, OtherScope, Hash, OtherAllocator> const & other,
103- cuco::cuda_stream_ref stream = {} ) noexcept
189+ cuco::cuda_stream_ref stream) noexcept
104190 {
105191 this ->merge_async (other.ref (), stream);
106192 }
107193
194+ /* *
195+ * @brief Merges the result of `other` estimator into `*this` estimator.
196+ *
197+ * @note This function synchronizes the given stream. For asynchronous execution use
198+ * `merge_async`.
199+ *
200+ * @tparam OtherScope Thread scope of `other` estimator
201+ * @tparam OtherAllocator Allocator type of `other` estimator
202+ *
203+ * @param other Other estimator to be merged into `*this`
204+ * @param stream CUDA stream this operation is executed in
205+ */
108206 template <cuda::thread_scope OtherScope, class OtherAllocator >
109207 void merge (hyperloglog<T, Precision, OtherScope, Hash, OtherAllocator> const & other,
110- cuco::cuda_stream_ref stream = {} )
208+ cuco::cuda_stream_ref stream)
111209 {
112210 this ->merge_async (other, stream);
113211 stream.synchronize ();
114212 }
115213
214+ /* *
215+ * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
216+ *
217+ * @tparam OtherScope Thread scope of `other` estimator
218+ *
219+ * @param other Other estimator reference to be merged into `*this`
220+ * @param stream CUDA stream this operation is executed in
221+ */
116222 template <cuda::thread_scope OtherScope>
117- void merge_async (ref_type<OtherScope> const & other, cuco::cuda_stream_ref stream = {} ) noexcept
223+ void merge_async (ref_type<OtherScope> const & other, cuco::cuda_stream_ref stream) noexcept
118224 {
119225 auto constexpr block_size = 1024 ;
120226 cuco::hyperloglog_ns::detail::merge<<<1 , block_size, 0 , stream>>> (other, this ->ref ());
121227 }
122228
229+ /* *
230+ * @brief Merges the result of `other` estimator reference into `*this` estimator.
231+ *
232+ * @note This function synchronizes the given stream. For asynchronous execution use
233+ * `merge_async`.
234+ *
235+ * @tparam OtherScope Thread scope of `other` estimator
236+ *
237+ * @param other Other estimator reference to be merged into `*this`
238+ * @param stream CUDA stream this operation is executed in
239+ */
123240 template <cuda::thread_scope OtherScope>
124- void merge (ref_type<OtherScope> const & other, cuco::cuda_stream_ref stream = {} )
241+ void merge (ref_type<OtherScope> const & other, cuco::cuda_stream_ref stream)
125242 {
126243 this ->merge_async (other, stream);
127244 stream.synchronize ();
128245 }
129246
247+ /* *
248+ * @brief Compute the estimated distinct items count.
249+ *
250+ * @note This function synchronizes the given stream.
251+ *
252+ * @param stream CUDA stream this operation is executed in
253+ *
254+ * @return Approximate distinct items count
255+ */
130256 [[nodiscard]] std::size_t estimate (cuco::cuda_stream_ref stream) const
131257 {
132258 // TODO remove test code
@@ -167,6 +293,11 @@ class hyperloglog {
167293 return cuco::hyperloglog_ns::detail::finalizer<Precision>::finalize (sum, zeroes);
168294 }
169295
296+ /* *
297+ * @brief Get device ref.
298+ *
299+ * @return Device ref object of the current `distinct_count_estimator` host object
300+ */
170301 [[nodiscard]] ref_type<> ref () const noexcept
171302 {
172303 return ref_type<>{*(this ->storage_ .get ()), {}, this ->hash_ };
@@ -185,11 +316,13 @@ class hyperloglog {
185316 storage_allocator_type& allocator;
186317 };
187318
188- Hash hash_;
189- storage_allocator_type storage_allocator_;
190- storage_deleter storage_deleter_;
191- std::unique_ptr<storage_type, storage_deleter> storage_;
319+ Hash hash_; // /< Hash function used to hash items
320+ storage_allocator_type storage_allocator_; // /< Storage allocator
321+ storage_deleter storage_deleter_; // /< Storage deleter
322+ std::unique_ptr<storage_type, storage_deleter> storage_; // /< Storage
192323
324+ // Needs to be friends with other instantiations of this class template to have access to their
325+ // storage
193326 template <class T_ , int32_t Precision_, cuda::thread_scope Scope_, class Hash_ , class Allocator_ >
194327 friend class hyperloglog ;
195328};
0 commit comments