From 169de66802422bf362f46b15d8a4a08082414b96 Mon Sep 17 00:00:00 2001 From: Mikael Huppunen Date: Fri, 30 Jan 2026 16:02:08 +0200 Subject: [PATCH 1/5] Overloading functions to allow custom allocators --- include/hashinator/hashinator.h | 21 +++++++++++++++++ include/splitvector/split_tools.h | 39 +++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h index e2a37e4..445d65f 100644 --- a/include/hashinator/hashinator.h +++ b/include/hashinator/hashinator.h @@ -1183,6 +1183,13 @@ class Hashmap { buckets, elements, rule, nBlocks, mPool, s); return retval; } + template + void extractPatternLoop(split::SplitVector, ALLOCATOR>& elements, Rule rule, + split_gpuStream_t s = 0) { + // Extract elements matching the Pattern Rule(element)==true; + split::tools::copy_if_loop, Rule, ALLOCATOR, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( + *device_buckets, elements, rule, s); + } template void extractPatternLoop(split::SplitVector>& elements, Rule rule, split_gpuStream_t s = 0) { @@ -1226,6 +1233,12 @@ class Hashmap { defaults::WARPSIZE>(buckets, elements, rule, stack, max_size, s); return elements.size(); } + template + void extractKeysByPatternLoop(split::SplitVector& elements, Rule rule, split_gpuStream_t s = 0) { + // Extract element **keys** matching the Pattern Rule(element)==true; + split::tools::copy_if_keys_loop, KEY_TYPE, Rule, ALLOCATOR, defaults::MAX_BLOCKSIZE, + defaults::WARPSIZE>(*device_buckets, elements, rule, s); + } template void extractKeysByPatternLoop(split::SplitVector& elements, Rule rule, split_gpuStream_t s = 0) { // Extract element **keys** matching the Pattern Rule(element)==true; @@ -1250,6 +1263,14 @@ class Hashmap { }; return extractKeysByPattern(elements, rule, stack, max_size, s); } + template + void extractAllKeysLoop(split::SplitVector& elements, split_gpuStream_t s = 0) { + // Extract all keys + auto rule = [] __host__ __device__(const hash_pair& kval) -> bool { + return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE; + }; + extractKeysByPatternLoop(elements, rule, s); + } void extractAllKeysLoop(split::SplitVector& elements, split_gpuStream_t s = 0) { // Extract all keys auto rule = [] __host__ __device__(const hash_pair& kval) -> bool { diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index 045a624..838a749 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -715,9 +715,9 @@ __global__ void block_compact_keys(T* input, U* output, size_t inputSize, Rule r } } -template +template __global__ void loop_compact(split::SplitVector>& inputVec, - split::SplitVector>& outputVec, Rule rule) { + split::SplitVector& outputVec, Rule rule) { // This must be equal to at least both WARPLENGTH and MAX_BLOCKSIZE/WARPLENGTH __shared__ uint32_t warpSums[WARPLENGTH]; __shared__ uint32_t outputCount; @@ -797,9 +797,9 @@ __global__ void loop_compact(split::SplitVector +template __global__ void loop_compact_keys(split::SplitVector>& inputVec, - split::SplitVector>& outputVec, Rule rule) { + split::SplitVector& outputVec, Rule rule) { // This must be equal to at least both WARPLENGTH and MAX_BLOCKSIZE/WARPLENGTH __shared__ uint32_t warpSums[WARPLENGTH]; __shared__ uint32_t outputCount; @@ -988,6 +988,24 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, size_t nBlocks return numel; } +/** + * @brief Extraction routines using just a single block. + These methods assume splitvectors are fully allocated on UM or Device. + */ + +template +void copy_if_loop(split::SplitVector>& input, + split::SplitVector& output, Rule rule, + split_gpuStream_t s = 0) { +#ifdef HASHINATOR_DEBUG + bool input_ok = isDeviceAccessible(reinterpret_cast(&input)); + bool output_ok = isDeviceAccessible(reinterpret_cast(&output)); + assert((input_ok && output_ok) && + "This method supports splitvectors dynamically allocated on device or unified memory!"); +#endif + split::tools::loop_compact<<<1, BLOCKSIZE, 0, s>>>(input, output, rule); +} + /** * @brief Extraction routines using just a single block. These methods assume splitvectors are fully allocated on UM or Device. @@ -1006,6 +1024,19 @@ void copy_if_loop(split::SplitVector>& inpu split::tools::loop_compact<<<1, BLOCKSIZE, 0, s>>>(input, output, rule); } +template +void copy_if_keys_loop(split::SplitVector>& input, + split::SplitVector& output, Rule rule, + split_gpuStream_t s = 0) { +#ifdef HASHINATOR_DEBUG + bool input_ok = isDeviceAccessible(reinterpret_cast(&input)); + bool output_ok = isDeviceAccessible(reinterpret_cast(&output)); + assert((input_ok && output_ok) && + "This method supports splitvectors dynamically allocated on device or unified memory!"); +#endif + split::tools::loop_compact_keys<<<1, BLOCKSIZE, 0, s>>>(input, output, rule); +} + template void copy_if_keys_loop(split::SplitVector>& input, split::SplitVector>& output, Rule rule, From da7f0ece0265eb6bd29ff5dbee4017ee64b095dd Mon Sep 17 00:00:00 2001 From: Mikael Huppunen Date: Mon, 2 Feb 2026 15:04:45 +0200 Subject: [PATCH 2/5] Using default allocator as a default argument in template --- include/hashinator/hashinator.h | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h index 445d65f..2b6fa39 100644 --- a/include/hashinator/hashinator.h +++ b/include/hashinator/hashinator.h @@ -1183,20 +1183,13 @@ class Hashmap { buckets, elements, rule, nBlocks, mPool, s); return retval; } - template + template >> void extractPatternLoop(split::SplitVector, ALLOCATOR>& elements, Rule rule, split_gpuStream_t s = 0) { // Extract elements matching the Pattern Rule(element)==true; split::tools::copy_if_loop, Rule, ALLOCATOR, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( *device_buckets, elements, rule, s); } - template - void extractPatternLoop(split::SplitVector>& elements, Rule rule, - split_gpuStream_t s = 0) { - // Extract elements matching the Pattern Rule(element)==true; - split::tools::copy_if_loop, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( - *device_buckets, elements, rule, s); - } void extractLoop(split::SplitVector>& elements, split_gpuStream_t s = 0) { // Extract all valid elements auto rule = [] __host__ __device__(const hash_pair& kval) -> bool { @@ -1233,18 +1226,12 @@ class Hashmap { defaults::WARPSIZE>(buckets, elements, rule, stack, max_size, s); return elements.size(); } - template + template >> void extractKeysByPatternLoop(split::SplitVector& elements, Rule rule, split_gpuStream_t s = 0) { // Extract element **keys** matching the Pattern Rule(element)==true; split::tools::copy_if_keys_loop, KEY_TYPE, Rule, ALLOCATOR, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>(*device_buckets, elements, rule, s); } - template - void extractKeysByPatternLoop(split::SplitVector& elements, Rule rule, split_gpuStream_t s = 0) { - // Extract element **keys** matching the Pattern Rule(element)==true; - split::tools::copy_if_keys_loop, KEY_TYPE, Rule, defaults::MAX_BLOCKSIZE, - defaults::WARPSIZE>(*device_buckets, elements, rule, s); - } template size_t extractAllKeys(split::SplitVector& elements, split_gpuStream_t s = 0) { @@ -1263,7 +1250,7 @@ class Hashmap { }; return extractKeysByPattern(elements, rule, stack, max_size, s); } - template + template >> void extractAllKeysLoop(split::SplitVector& elements, split_gpuStream_t s = 0) { // Extract all keys auto rule = [] __host__ __device__(const hash_pair& kval) -> bool { @@ -1271,13 +1258,6 @@ class Hashmap { }; extractKeysByPatternLoop(elements, rule, s); } - void extractAllKeysLoop(split::SplitVector& elements, split_gpuStream_t s = 0) { - // Extract all keys - auto rule = [] __host__ __device__(const hash_pair& kval) -> bool { - return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE; - }; - extractKeysByPatternLoop(elements, rule, s); - } template void clean_tombstones(split_gpuStream_t s = 0) { From 615b04898642029ea14cc1167f1a4edb826f903f Mon Sep 17 00:00:00 2001 From: Mikael Huppunen Date: Mon, 9 Feb 2026 13:49:23 +0200 Subject: [PATCH 3/5] Added unit test for custom allocator --- unit_tests/Makefile | 5 +- unit_tests/custom_allocator/unit.cu | 237 ++++++++++++++++++++++++++++ 2 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 unit_tests/custom_allocator/unit.cu diff --git a/unit_tests/Makefile b/unit_tests/Makefile index 2cf0177..21707e5 100644 --- a/unit_tests/Makefile +++ b/unit_tests/Makefile @@ -7,7 +7,7 @@ EXTRA= --std=c++17 EXTRA+= -gencode arch=compute_60,code=sm_60 EXTRA+= -DHASHMAPDEBUG --expt-relaxed-constexpr --expt-extended-lambda -lpthread GTEST= -L/home/kstppd/libs/googletest/build/lib -I/home/kstppd/libs/googletest/googletest/include -lgtest -lgtest_main -lpthread -OBJ= gtest_vec_host.o gtest_vec_device.o gtest_hashmap.o stream_compaction.o stream_compaction2.o delete_mechanism.o insertion_mechanism.o hybrid_cpu.o hybrid_gpu.o pointer_test.o benchmark.o benchmarkLF.o tbPerf.o realistic.o preallocated.o +OBJ= gtest_vec_host.o gtest_vec_device.o gtest_hashmap.o stream_compaction.o stream_compaction2.o custom_allocator.o delete_mechanism.o insertion_mechanism.o hybrid_cpu.o hybrid_gpu.o pointer_test.o benchmark.o benchmarkLF.o tbPerf.o realistic.o preallocated.o default: tests @@ -63,6 +63,9 @@ preallocated.o: stream_compaction/preallocated.cu stream_compaction2.o: stream_compaction/unit.cu ${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o compaction2 stream_compaction/unit.cu +custom_allocator.o: custom_allocator/unit.cu + ${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o compaction2 stream_compaction/unit.cu + delete_mechanism.o: delete_by_compaction/main.cu ${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o delete_mechanism delete_by_compaction/main.cu diff --git a/unit_tests/custom_allocator/unit.cu b/unit_tests/custom_allocator/unit.cu new file mode 100644 index 0000000..3ae5439 --- /dev/null +++ b/unit_tests/custom_allocator/unit.cu @@ -0,0 +1,237 @@ +#include +#include +#include +#include +#include +#include +#include "../../include/splitvector/splitvec.h" +#include "../../include/splitvector/split_tools.h" +#include "../../include/common.h" +#include "../../include/splitvector/archMacros.h" +#define expect_true EXPECT_TRUE +#define expect_false EXPECT_FALSE +#define expect_eq EXPECT_EQ +#define TARGET 1 + +/** + * @brief Custom allocator for unified memory (GPU and CPU accessible). + * + * This class provides an allocator for unified memory, which can be accessed + * by both the GPU and the CPU. It allocates and deallocates memory using split_gpuMallocManaged + * and split_gpuFree functions, while also providing constructors and destructors for objects. + * + * @tparam T Type of the allocated objects. + */ +template +class customAllocator { +public: + typedef T value_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef ptrdiff_t difference_type; + typedef size_t size_type; + template + struct rebind { + typedef customAllocator other; + }; + /** + * @brief Default constructor. + */ + customAllocator() throw() {} + + /** + * @brief Copy constructor with different type. + */ + template + customAllocator(customAllocator const&) throw() {} + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } + + pointer allocate(size_type n, const void* /*hint*/ = 0) { + T* ret; + assert(n && "allocate 0"); + SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&ret, n * sizeof(value_type))); + if (ret == nullptr) { + throw std::bad_alloc(); + } + return ret; + } + + static void* allocate_raw(size_type n, const void* /*hint*/ = 0) { + void* ret; + SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&ret, n)); + if (ret == nullptr) { + throw std::bad_alloc(); + } + return ret; + } + + void deallocate(pointer p, size_type n) { + if (n != 0 && p != 0) { + SPLIT_CHECK_ERR(split_gpuFree(p)); + } + } + static void deallocate(void* p, size_type n) { + if (n != 0 && p != 0) { + SPLIT_CHECK_ERR(split_gpuFree(p)); + } + } + + size_type max_size() const throw() { + size_type max = static_cast(-1) / sizeof(value_type); + return (max > 0 ? max : 1); + } + + template + __host__ __device__ void construct(U* p, Args&&... args) { + ::new (p) U(std::forward(args)...); + } + + void destroy(pointer p) { p->~value_type(); } +}; + +typedef uint32_t int_type ; +typedef struct{ + int_type num; + int_type flag; +} test_t; +typedef split::SplitVector> vector; +size_t count = 0; + +void print_vector(vector& v){ + std::cout<<"-------------------"< dist(1, std::numeric_limits::max()); + v.clear(); + while (v.size() < targetSize) { + int_type val =++st;// dist(gen); + v.push_back(test_t{val,(val%2==0)}); + if (val%2 == 0){count++;}; + } +} + +void fill_vec_lin(vector& v, size_t targetSize){ + v.clear(); + int_type s=0; + while (v.size() < targetSize) { + v.push_back(test_t{s,s}); + s++; + } +} + +bool checkFlags(const vector& v,const int_type target){ + for (const auto& i:v){ + if (i.flag!=target){return false;} + } + return true; +} + +bool run_test(int power){ + //std::cout<<"Testing with vector size: "<<( 1<bool{ return element.flag == 1 ;}; + auto predicate_off =[]__host__ __device__ (test_t element)->bool{ return element.flag == 0 ;}; + vector output1(v.size()); + vector output2(v.size()); + split::tools::copy_if(v,output1,predicate_on); + split::tools::copy_if(v,output2,predicate_off); + bool sane1 = checkFlags(output1,1); + bool sane2 = checkFlags(output2,0); + bool sane3 = ((output1.size()+output2.size())==v.size()); + bool sane4 =( output1.size() ==count ); + bool sane5 = ( output2.size() ==v.size()-count ); + return sane1 && sane2 && sane3 && sane4 && sane5; +} + +bool run_test_small(size_t size){ + //std::cout<<"Testing with vector size: "<bool{ return element.flag == 1 ;}; + auto predicate_off =[]__host__ __device__ (test_t element)->bool{ return element.flag == 0 ;}; + vector output1(v.size()); + vector output2(v.size()); +#if 1 + split::tools::copy_if(v,output1,predicate_on); + split::tools::copy_if(v,output2,predicate_off); +#else + auto one = split::tools::copy_if_raw(v,output1.data(),predicate_on); + auto two = split::tools::copy_if_raw(v,output2.data(),predicate_off); + output1.erase(&output1[one] , output1.end() ); + output2.erase(&output2[two] , output2.end() ) ; +#endif + bool sane1 = checkFlags(output1,1); + bool sane2 = checkFlags(output2,0); + bool sane3 = ((output1.size()+output2.size())==v.size()); + bool sane4 =( output1.size() ==count ); + bool sane5 = ( output2.size() ==v.size()-count ); + //printf( " %d - %d - %d - %d - %d\n",sane1,sane2,sane3,sane4,sane5 ); + bool retval = sane1 && sane2 && sane3 && sane4 && sane5; + return retval; +} + +bool run_test_small_loop_variant(size_t size){ + // std::cout<<"Testing with vector size: "<bool{ return element.flag == 1 ;}; + auto predicate_off =[]__host__ __device__ (test_t element)->bool{ return element.flag == 0 ;}; + vector* output1=new vector(nextPow2(2*v->size())); + vector* output2=new vector(nextPow2(2*v->size())); + + split::tools::copy_if_loop(*v,*output1,predicate_on); + split::tools::copy_if_loop(*v,*output2,predicate_off); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + bool sane1 = checkFlags(*output1,1); + bool sane2 = checkFlags(*output2,0); + bool sane3 = ((output1->size()+output2->size())==v->size()); + bool sane4 =( output1->size() ==count ); + bool sane5 = ( output2->size() ==v->size()-count ); + // printf( " %d - %d - %d - %d - %d\n",sane1,sane2,sane3,sane4,sane5 ); + bool retval = sane1 && sane2 && sane3 && sane4 && sane5; + return retval; +} + +TEST(StremCompaction , Compaction_Tests_Linear){ + for (size_t s=32; s< 3000; s++ ){ + bool a = run_test_small(s); + expect_true(a); + } + +} + +TEST(StremCompaction , Compaction_Tests_Linear_Loop_Variant){ + for (size_t s=32; s< 1024; s++ ){ + bool a = run_test_small_loop_variant(s); + expect_true(a); + } + +} + +TEST(StremCompaction , Compaction_Tests_Power_of_2){ + for (uint32_t i =5; i< 25; i++){ + expect_true(run_test(i)); + } +} + +int main(int argc, char* argv[]){ + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From 2c677ce692f7022375794495585e1b1962beeeba Mon Sep 17 00:00:00 2001 From: Mikael Huppunen Date: Wed, 25 Feb 2026 15:31:04 +0200 Subject: [PATCH 4/5] updated unit test to correct input type --- unit_tests/custom_allocator/unit.cu | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/unit_tests/custom_allocator/unit.cu b/unit_tests/custom_allocator/unit.cu index 3ae5439..0f1d11f 100644 --- a/unit_tests/custom_allocator/unit.cu +++ b/unit_tests/custom_allocator/unit.cu @@ -97,7 +97,8 @@ typedef struct{ int_type num; int_type flag; } test_t; -typedef split::SplitVector> vector; +typedef split::SplitVector vector; +typedef split::SplitVector> customAllocatorVector; size_t count = 0; void print_vector(vector& v){ @@ -133,7 +134,7 @@ void fill_vec_lin(vector& v, size_t targetSize){ } } -bool checkFlags(const vector& v,const int_type target){ +bool checkFlags(const customAllocatorVector& v,const int_type target){ for (const auto& i:v){ if (i.flag!=target){return false;} } @@ -146,8 +147,8 @@ bool run_test(int power){ fill_vec(v,1<bool{ return element.flag == 1 ;}; auto predicate_off =[]__host__ __device__ (test_t element)->bool{ return element.flag == 0 ;}; - vector output1(v.size()); - vector output2(v.size()); + customAllocatorVector output1(v.size()); + customAllocatorVector output2(v.size()); split::tools::copy_if(v,output1,predicate_on); split::tools::copy_if(v,output2,predicate_off); bool sane1 = checkFlags(output1,1); @@ -164,8 +165,8 @@ bool run_test_small(size_t size){ fill_vec(v,size); auto predicate_on =[]__host__ __device__ (test_t element)->bool{ return element.flag == 1 ;}; auto predicate_off =[]__host__ __device__ (test_t element)->bool{ return element.flag == 0 ;}; - vector output1(v.size()); - vector output2(v.size()); + customAllocatorVector output1(v.size()); + customAllocatorVector output2(v.size()); #if 1 split::tools::copy_if(v,output1,predicate_on); split::tools::copy_if(v,output2,predicate_off); @@ -192,8 +193,8 @@ bool run_test_small_loop_variant(size_t size){ auto predicate_on =[]__host__ __device__ (test_t element)->bool{ return element.flag == 1 ;}; auto predicate_off =[]__host__ __device__ (test_t element)->bool{ return element.flag == 0 ;}; - vector* output1=new vector(nextPow2(2*v->size())); - vector* output2=new vector(nextPow2(2*v->size())); + customAllocatorVector* output1=new customAllocatorVector(nextPow2(2*v->size())); + customAllocatorVector* output2=new customAllocatorVector(nextPow2(2*v->size())); split::tools::copy_if_loop(*v,*output1,predicate_on); split::tools::copy_if_loop(*v,*output2,predicate_off); From 3b72b9bad302d630de86c0b2777a99fc52ee1aa1 Mon Sep 17 00:00:00 2001 From: Mikael Huppunen Date: Thu, 26 Feb 2026 14:49:31 +0200 Subject: [PATCH 5/5] Custom allocator test now only tests things that were actually implemented for custom allocator --- unit_tests/custom_allocator/unit.cu | 78 ----------------------------- 1 file changed, 78 deletions(-) diff --git a/unit_tests/custom_allocator/unit.cu b/unit_tests/custom_allocator/unit.cu index 0f1d11f..9c2a2b9 100644 --- a/unit_tests/custom_allocator/unit.cu +++ b/unit_tests/custom_allocator/unit.cu @@ -101,16 +101,6 @@ typedef split::SplitVector vector; typedef split::SplitVector> customAllocatorVector; size_t count = 0; -void print_vector(vector& v){ - std::cout<<"-------------------"<bool{ return element.flag == 1 ;}; - auto predicate_off =[]__host__ __device__ (test_t element)->bool{ return element.flag == 0 ;}; - customAllocatorVector output1(v.size()); - customAllocatorVector output2(v.size()); - split::tools::copy_if(v,output1,predicate_on); - split::tools::copy_if(v,output2,predicate_off); - bool sane1 = checkFlags(output1,1); - bool sane2 = checkFlags(output2,0); - bool sane3 = ((output1.size()+output2.size())==v.size()); - bool sane4 =( output1.size() ==count ); - bool sane5 = ( output2.size() ==v.size()-count ); - return sane1 && sane2 && sane3 && sane4 && sane5; -} - -bool run_test_small(size_t size){ - //std::cout<<"Testing with vector size: "<bool{ return element.flag == 1 ;}; - auto predicate_off =[]__host__ __device__ (test_t element)->bool{ return element.flag == 0 ;}; - customAllocatorVector output1(v.size()); - customAllocatorVector output2(v.size()); -#if 1 - split::tools::copy_if(v,output1,predicate_on); - split::tools::copy_if(v,output2,predicate_off); -#else - auto one = split::tools::copy_if_raw(v,output1.data(),predicate_on); - auto two = split::tools::copy_if_raw(v,output2.data(),predicate_off); - output1.erase(&output1[one] , output1.end() ); - output2.erase(&output2[two] , output2.end() ) ; -#endif - bool sane1 = checkFlags(output1,1); - bool sane2 = checkFlags(output2,0); - bool sane3 = ((output1.size()+output2.size())==v.size()); - bool sane4 =( output1.size() ==count ); - bool sane5 = ( output2.size() ==v.size()-count ); - //printf( " %d - %d - %d - %d - %d\n",sane1,sane2,sane3,sane4,sane5 ); - bool retval = sane1 && sane2 && sane3 && sane4 && sane5; - return retval; -} - bool run_test_small_loop_variant(size_t size){ // std::cout<<"Testing with vector size: "<