Skip to content

Commit 66bdff9

Browse files
committed
Merge remote-tracking branch 'upstream/dev' into rm-old-multimap
2 parents 737ffa1 + f75390e commit 66bdff9

4 files changed

Lines changed: 53 additions & 40 deletions

File tree

include/cuco/detail/hyperloglog/finalizer.cuh

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,11 @@ class finalizer {
123123

124124
__host__ __device__ constexpr int interpolation_anchor_index(double e) const noexcept
125125
{
126-
auto estimates = raw_estimate_data(this->precision_);
127-
int const n = raw_estimate_data_size(this->precision_);
128-
int left = 0;
129-
int right = static_cast<int>(n) - 1;
130-
int mid = -1;
131-
int candidate_index = 0; // Index of the closest element found
132-
126+
auto estimates = raw_estimate_data(this->precision_);
127+
int const n = raw_estimate_data_size(this->precision_);
128+
int left = 0;
129+
int right = static_cast<int>(n) - 1;
130+
int mid = -1;
133131
while (left <= right) {
134132
mid = left + (right - left) / 2;
135133

@@ -143,19 +141,9 @@ class finalizer {
143141
}
144142
}
145143

146-
// At this point, 'left' is the insertion point. We need to compare the elements at 'left' and
147-
// 'left - 1' to find the closest one, taking care of boundary conditions.
148-
149-
// Distance from 'e' to the element at 'left', if within bounds
150-
double const dist_lhs = left < static_cast<int>(n) ? cuda::std::abs(estimates[left] - e)
151-
: cuda::std::numeric_limits<double>::max();
152-
// Distance from 'e' to the element at 'left - 1', if within bounds
153-
double const dist_rhs = left - 1 >= 0 ? cuda::std::abs(estimates[left - 1] - e)
154-
: cuda::std::numeric_limits<double>::max();
155-
156-
candidate_index = (dist_lhs < dist_rhs) ? left : left - 1;
157-
158-
return candidate_index;
144+
// At this point, `left` is the insertion point. Spark uses the insertion point as the anchor
145+
// index when the exact estimate is not present.
146+
return left;
159147
}
160148

161149
static constexpr auto k = 6; ///< Number of interpolation points to consider

include/cuco/detail/hyperloglog/hyperloglog_impl.cuh

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@
3131
#include <cuda/std/__algorithm/max.h> // TODO #include <cuda/std/algorithm> once available
3232
#include <cuda/std/bit>
3333
#include <cuda/std/cstddef>
34+
#include <cuda/std/limits>
3435
#include <cuda/std/span>
36+
#include <cuda/std/type_traits>
3537
#include <cuda/std/utility>
3638
#include <cuda/stream_ref>
3739
#include <thrust/iterator/constant_iterator.h>
@@ -63,6 +65,9 @@ class hyperloglog_impl {
6365
using fp_type = double; ///< Floating point type used for reduction
6466
using hash_value_type = cuda::std::remove_cvref_t<decltype(cuda::std::declval<Hash>()(
6567
cuda::std::declval<T>()))>; ///< Hash value type
68+
static_assert(cuda::std::is_unsigned_v<hash_value_type>,
69+
"HyperLogLog requires an unsigned hash value type");
70+
6671
public:
6772
static constexpr auto thread_scope = Scope; ///< CUDA thread scope
6873

@@ -154,13 +159,11 @@ class hyperloglog_impl {
154159
*/
155160
__device__ constexpr void add(T const& item) noexcept
156161
{
157-
auto const h = this->hash_(item);
158-
auto const reg = h & this->register_mask();
159-
auto const zeroes = cuda::std::countl_zero(h | this->register_mask()) + 1; // __clz
160-
161-
// reversed order (same one as Spark uses)
162-
// auto const reg = h >> ((sizeof(hash_value_type) * 8) - this->precision_);
163-
// auto const zeroes = cuda::std::countl_zero(h << this->precision_) + 1;
162+
constexpr auto hash_bits = cuda::std::numeric_limits<hash_value_type>::digits;
163+
auto const h = this->hash_(item);
164+
auto const reg = static_cast<int>(h >> (hash_bits - this->precision_));
165+
auto const w_padding = hash_value_type{1} << static_cast<hash_value_type>(this->precision_ - 1);
166+
auto const zeroes = cuda::std::countl_zero((h << this->precision_) | w_padding) + 1; // __clz
164167

165168
this->update_max(reg, zeroes);
166169
}
@@ -406,7 +409,7 @@ class hyperloglog_impl {
406409
int thread_zeroes = 0;
407410
for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) {
408411
auto const reg = this->sketch_[i];
409-
thread_sum += fp_type{1} / static_cast<fp_type>(1 << reg);
412+
thread_sum += fp_type{1} / static_cast<fp_type>(1ull << reg);
410413
thread_zeroes += reg == 0;
411414
}
412415

@@ -627,16 +630,6 @@ class hyperloglog_impl {
627630
}
628631
}
629632

630-
/**
631-
* @brief Gets the register mask used to separate register index from count.
632-
*
633-
* @return The register mask
634-
*/
635-
__host__ __device__ constexpr hash_value_type register_mask() const noexcept
636-
{
637-
return (1ull << this->precision_) - 1;
638-
}
639-
640633
hasher hash_; ///< Hash function used to hash items
641634
int32_t precision_; ///< HLL precision parameter
642635
cuda::std::span<register_type> sketch_; ///< HLL sketch storage

include/cuco/detail/hyperloglog/kernels.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ CUCO_KERNEL void add_if_shmem(
6161
auto idx = cuco::detail::global_thread_id();
6262
auto const block = cooperative_groups::this_thread_block();
6363

64-
local_ref_type local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}, {});
64+
local_ref_type local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}, ref.hash_function());
6565
local_ref.clear(block);
6666
block.sync();
6767

@@ -92,7 +92,7 @@ CUCO_KERNEL void add_if_shmem_vectorized(typename RefType::value_type const* fir
9292
auto const grid = cooperative_groups::this_grid();
9393
auto const block = cooperative_groups::this_thread_block();
9494

95-
local_ref_type local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}, {});
95+
local_ref_type local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}, ref.hash_function());
9696
local_ref.clear(block);
9797
block.sync();
9898

tests/hyperloglog/spark_parity_test.cu

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include <cstddef>
3333
#include <cstdint>
3434
#include <cstring> // std::memcpy
35+
#include <vector>
3536

3637
/**
3738
* @file spark_parity_test.cu
@@ -99,6 +100,37 @@ TEST_CASE("hyperloglog: Spark parity: deterministic cardinality estimation", "")
99100
REQUIRE(relative_error < expected_standard_deviation * tolerance_factor);
100101
}
101102

103+
TEST_CASE("hyperloglog: Spark parity: regression for issue #696", "")
104+
{
105+
using T = int;
106+
using estimator_type = cuco::hyperloglog<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
107+
108+
auto const standard_deviation = cuco::standard_deviation{0.3};
109+
std::vector<T> const host_items = {
110+
434971005, -1801141102, 1963272577, -493001830, -1087762159, 843441079, 959409252,
111+
252071729, -1830233271, 820808802, -1535782039, 1531475465, 1642188005, 552222160,
112+
-194998970, 2109544455, 1405026214, 1672131131, 1247840828, -180033177, -1286780806,
113+
933672832, 1401381638, -241603026, 615622263, -957425136, -276735314, -2009711680,
114+
-639722582, 974221725, 713012837, -1402812678, -546850329, -866141232, 848946484,
115+
-635203849, -1450175774, 844979905, 888971584, 1855780699, -1268565561, -1185513673,
116+
1019479409, -1333229875, -1246182436, -2147483648, 900525526, 1006079044, -698588704,
117+
-943987698, 27695788, -84695147, -1441291062, 397673504, -392707402, 1290858625,
118+
1420750585, -1178564290, 1921246226, 188935376, 6560145, -1928347973, 820364161,
119+
-401706971, -1118924186, 1759421546, -1350108963, 2097517825, -23883470, -1221269093,
120+
1264159503, 97097882, 982791723, 638708040, -349593807, 361658100, 341780548,
121+
-4171545, 1095633384, -1694321873, 1777502952, -1699998259, -1432813716, 1113816192,
122+
-966808405, 1583478695, -650293396, 35500231, -440874147, 995739986, 207692068,
123+
0, -1243401007, -1576220155, 1868986580, -87141217, 2108694405, -251958436,
124+
2028975576, 1725957984, -354115601, 888726314, 1032487345, -1968749299, 1880817790,
125+
1113480821, 789387254, -1724956749, -1201901245};
126+
thrust::device_vector<T> items = host_items;
127+
128+
estimator_type estimator{standard_deviation, cuco::xxhash_64<T>{42}};
129+
estimator.add(items.begin(), items.end());
130+
131+
REQUIRE(estimator.estimate() == 81);
132+
}
133+
102134
// the following test is omitted since we refrain from doing randomized unit tests in cuco
103135
// TEST_CASE("hyperloglog: Spark parity: random cardinality estimation", "")
104136

0 commit comments

Comments
 (0)