From b5a743ff3f5606977a2eda391c1b7f60378dea02 Mon Sep 17 00:00:00 2001 From: mperikov Date: Tue, 2 Dec 2025 21:01:26 +0300 Subject: [PATCH 01/12] comparison for load/store vs loadu/storeu --- CMakeLists.txt | 14 +++++ src/alignment_comparison.cpp | 113 +++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 src/alignment_comparison.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 92df29b..8d7ff87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ target_link_libraries(benchmarks benchmark benchmark_main pasta_bit_vector) + add_test( NAME Benchmarks COMMAND bt_benchmarks --benchmark_out=bm_report.csv --benchmark_out_format=csv @@ -101,4 +102,17 @@ target_link_libraries(test_rmm gtest gtest_main) + +add_executable(alignment_comparison + src/alignment_comparison.cpp) + +target_include_directories(alignment_comparison + PUBLIC include +) + +target_link_libraries(alignment_comparison + benchmark + benchmark_main + ) + enable_testing() \ No newline at end of file diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp new file mode 100644 index 0000000..e2249fb --- /dev/null +++ b/src/alignment_comparison.cpp @@ -0,0 +1,113 @@ +#include +#include +#include + +#include "bits.h" + +#ifdef PIXIE_AVX512_SUPPORT + +static void BM_Loadu512(benchmark::State& state) { + alignas(64) uint8_t data[128]; + + const __m512i* ptr = reinterpret_cast(data); + + for (auto _ : state) { + benchmark::DoNotOptimize(_mm256_loadu_si512(ptr)); + } +} + + +static void BM_Load512(benchmark::State& state) { + alignas(64) uint8_t data[128]; + + const __m512i* ptr = reinterpret_cast(data); + + for (auto _ : state) { + benchmark::DoNotOptimize(_mm512_load_si512(ptr)); + } +} + + +static void BM_Storeu512(benchmark::State& state) { + alignas(64) uint8_t data[128]; + __m512i value = _mm512_setzero_si512(); + __m512i* ptr = reinterpret_cast<__m512i*>(data); + + for (auto _ : state) { + _mm512_storeu_si512(ptr, value); + benchmark::DoNotOptimize(*ptr); + } +} + + +static void BM_Store512(benchmark::State& state) { + alignas(64) uint8_t data[128]; + __m512i value = _mm512_setzero_si512(); + __m512i* ptr = reinterpret_cast<__m512i*>(data); + + for (auto _ : state) { + _mm512_store_si512(ptr, value); + benchmark::DoNotOptimize(*ptr); + } +} + + +BENCHMARK(BM_Loadu512); +BENCHMARK(BM_Load512); +BENCHMARK(BM_Storeu512); +BENCHMARK(BM_Store512); + +#else + +static void BM_Loadu256(benchmark::State& state) { + alignas(32) uint8_t data[64]; + + const __m256i* ptr = reinterpret_cast(data); + + for (auto _ : state) { + benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); + } +} + + +static void BM_Load256(benchmark::State& state) { + alignas(32) uint8_t data[64]; + + const __m256i* ptr = reinterpret_cast(data); + + for (auto _ : state) { + benchmark::DoNotOptimize(_mm256_load_si256(ptr)); + } +} + + +static void BM_Storeu256(benchmark::State& state) { + alignas(32) uint8_t data[64]; + __m256i value = _mm256_setzero_si256(); + __m256i* ptr = reinterpret_cast<__m256i*>(data); + + for (auto _ : state) { + _mm256_storeu_si256(ptr, value); + benchmark::DoNotOptimize(*ptr); + } +} + + +static void BM_Store256(benchmark::State& state) { + alignas(32) uint8_t data[64]; + __m256i value = _mm256_setzero_si256(); + __m256i* ptr = reinterpret_cast<__m256i*>(data); + + for (auto _ : state) { + _mm256_store_si256(ptr, value); + benchmark::DoNotOptimize(*ptr); + } +} + + +BENCHMARK(BM_Loadu256); +BENCHMARK(BM_Load256); +BENCHMARK(BM_Storeu256); +BENCHMARK(BM_Store256); + +#endif \ No newline at end of file From 110ff2f38b9295b6a5810d5f59dc3f999fe3c076 Mon Sep 17 00:00:00 2001 From: mperikov Date: Tue, 2 Dec 2025 21:12:07 +0300 Subject: [PATCH 02/12] Format fix --- src/alignment_comparison.cpp | 120 ++++++++++++++++------------------- 1 file changed, 56 insertions(+), 64 deletions(-) diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp index e2249fb..eb44f0b 100644 --- a/src/alignment_comparison.cpp +++ b/src/alignment_comparison.cpp @@ -6,52 +6,48 @@ #ifdef PIXIE_AVX512_SUPPORT -static void BM_Loadu512(benchmark::State& state) { - alignas(64) uint8_t data[128]; +static void BM_Loadu512(benchmark::State &state) { + alignas(64) uint8_t data[128]; - const __m512i* ptr = reinterpret_cast(data); + const __m512i *ptr = reinterpret_cast(data); - for (auto _ : state) { - benchmark::DoNotOptimize(_mm256_loadu_si512(ptr)); - } + for (auto _ : state) { + benchmark::DoNotOptimize(_mm256_loadu_si512(ptr)); + } } +static void BM_Load512(benchmark::State &state) { + alignas(64) uint8_t data[128]; -static void BM_Load512(benchmark::State& state) { - alignas(64) uint8_t data[128]; + const __m512i *ptr = reinterpret_cast(data); - const __m512i* ptr = reinterpret_cast(data); - - for (auto _ : state) { - benchmark::DoNotOptimize(_mm512_load_si512(ptr)); - } + for (auto _ : state) { + benchmark::DoNotOptimize(_mm512_load_si512(ptr)); + } } +static void BM_Storeu512(benchmark::State &state) { + alignas(64) uint8_t data[128]; + __m512i value = _mm512_setzero_si512(); + __m512i *ptr = reinterpret_cast<__m512i *>(data); -static void BM_Storeu512(benchmark::State& state) { - alignas(64) uint8_t data[128]; - __m512i value = _mm512_setzero_si512(); - __m512i* ptr = reinterpret_cast<__m512i*>(data); - - for (auto _ : state) { - _mm512_storeu_si512(ptr, value); - benchmark::DoNotOptimize(*ptr); - } + for (auto _ : state) { + _mm512_storeu_si512(ptr, value); + benchmark::DoNotOptimize(*ptr); + } } +static void BM_Store512(benchmark::State &state) { + alignas(64) uint8_t data[128]; + __m512i value = _mm512_setzero_si512(); + __m512i *ptr = reinterpret_cast<__m512i *>(data); -static void BM_Store512(benchmark::State& state) { - alignas(64) uint8_t data[128]; - __m512i value = _mm512_setzero_si512(); - __m512i* ptr = reinterpret_cast<__m512i*>(data); - - for (auto _ : state) { - _mm512_store_si512(ptr, value); - benchmark::DoNotOptimize(*ptr); - } + for (auto _ : state) { + _mm512_store_si512(ptr, value); + benchmark::DoNotOptimize(*ptr); + } } - BENCHMARK(BM_Loadu512); BENCHMARK(BM_Load512); BENCHMARK(BM_Storeu512); @@ -59,52 +55,48 @@ BENCHMARK(BM_Store512); #else -static void BM_Loadu256(benchmark::State& state) { - alignas(32) uint8_t data[64]; +static void BM_Loadu256(benchmark::State &state) { + alignas(32) uint8_t data[64]; - const __m256i* ptr = reinterpret_cast(data); + const __m256i *ptr = reinterpret_cast(data); - for (auto _ : state) { - benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); - } + for (auto _ : state) { + benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); + } } +static void BM_Load256(benchmark::State &state) { + alignas(32) uint8_t data[64]; -static void BM_Load256(benchmark::State& state) { - alignas(32) uint8_t data[64]; + const __m256i *ptr = reinterpret_cast(data); - const __m256i* ptr = reinterpret_cast(data); - - for (auto _ : state) { - benchmark::DoNotOptimize(_mm256_load_si256(ptr)); - } + for (auto _ : state) { + benchmark::DoNotOptimize(_mm256_load_si256(ptr)); + } } +static void BM_Storeu256(benchmark::State &state) { + alignas(32) uint8_t data[64]; + __m256i value = _mm256_setzero_si256(); + __m256i *ptr = reinterpret_cast<__m256i *>(data); -static void BM_Storeu256(benchmark::State& state) { - alignas(32) uint8_t data[64]; - __m256i value = _mm256_setzero_si256(); - __m256i* ptr = reinterpret_cast<__m256i*>(data); - - for (auto _ : state) { - _mm256_storeu_si256(ptr, value); - benchmark::DoNotOptimize(*ptr); - } + for (auto _ : state) { + _mm256_storeu_si256(ptr, value); + benchmark::DoNotOptimize(*ptr); + } } +static void BM_Store256(benchmark::State &state) { + alignas(32) uint8_t data[64]; + __m256i value = _mm256_setzero_si256(); + __m256i *ptr = reinterpret_cast<__m256i *>(data); -static void BM_Store256(benchmark::State& state) { - alignas(32) uint8_t data[64]; - __m256i value = _mm256_setzero_si256(); - __m256i* ptr = reinterpret_cast<__m256i*>(data); - - for (auto _ : state) { - _mm256_store_si256(ptr, value); - benchmark::DoNotOptimize(*ptr); - } + for (auto _ : state) { + _mm256_store_si256(ptr, value); + benchmark::DoNotOptimize(*ptr); + } } - BENCHMARK(BM_Loadu256); BENCHMARK(BM_Load256); BENCHMARK(BM_Storeu256); From d999c8975a06defa1cdfd6348c0bdc510200ae32 Mon Sep 17 00:00:00 2001 From: mperikov Date: Tue, 2 Dec 2025 21:15:19 +0300 Subject: [PATCH 03/12] Format fix --- src/alignment_comparison.cpp | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp index eb44f0b..05e2210 100644 --- a/src/alignment_comparison.cpp +++ b/src/alignment_comparison.cpp @@ -1,35 +1,36 @@ #include #include + #include #include "bits.h" #ifdef PIXIE_AVX512_SUPPORT -static void BM_Loadu512(benchmark::State &state) { +static void BM_Loadu512(benchmark::State& state) { alignas(64) uint8_t data[128]; - const __m512i *ptr = reinterpret_cast(data); + const __m512i* ptr = reinterpret_cast(data); for (auto _ : state) { benchmark::DoNotOptimize(_mm256_loadu_si512(ptr)); } } -static void BM_Load512(benchmark::State &state) { +static void BM_Load512(benchmark::State& state) { alignas(64) uint8_t data[128]; - const __m512i *ptr = reinterpret_cast(data); + const __m512i* ptr = reinterpret_cast(data); for (auto _ : state) { benchmark::DoNotOptimize(_mm512_load_si512(ptr)); } } -static void BM_Storeu512(benchmark::State &state) { +static void BM_Storeu512(benchmark::State& state) { alignas(64) uint8_t data[128]; __m512i value = _mm512_setzero_si512(); - __m512i *ptr = reinterpret_cast<__m512i *>(data); + __m512i* ptr = reinterpret_cast<__m512i*>(data); for (auto _ : state) { _mm512_storeu_si512(ptr, value); @@ -37,10 +38,10 @@ static void BM_Storeu512(benchmark::State &state) { } } -static void BM_Store512(benchmark::State &state) { +static void BM_Store512(benchmark::State& state) { alignas(64) uint8_t data[128]; __m512i value = _mm512_setzero_si512(); - __m512i *ptr = reinterpret_cast<__m512i *>(data); + __m512i* ptr = reinterpret_cast<__m512i*>(data); for (auto _ : state) { _mm512_store_si512(ptr, value); @@ -55,30 +56,30 @@ BENCHMARK(BM_Store512); #else -static void BM_Loadu256(benchmark::State &state) { +static void BM_Loadu256(benchmark::State& state) { alignas(32) uint8_t data[64]; - const __m256i *ptr = reinterpret_cast(data); + const __m256i* ptr = reinterpret_cast(data); for (auto _ : state) { benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); } } -static void BM_Load256(benchmark::State &state) { +static void BM_Load256(benchmark::State& state) { alignas(32) uint8_t data[64]; - const __m256i *ptr = reinterpret_cast(data); + const __m256i* ptr = reinterpret_cast(data); for (auto _ : state) { benchmark::DoNotOptimize(_mm256_load_si256(ptr)); } } -static void BM_Storeu256(benchmark::State &state) { +static void BM_Storeu256(benchmark::State& state) { alignas(32) uint8_t data[64]; __m256i value = _mm256_setzero_si256(); - __m256i *ptr = reinterpret_cast<__m256i *>(data); + __m256i* ptr = reinterpret_cast<__m256i*>(data); for (auto _ : state) { _mm256_storeu_si256(ptr, value); @@ -86,10 +87,10 @@ static void BM_Storeu256(benchmark::State &state) { } } -static void BM_Store256(benchmark::State &state) { +static void BM_Store256(benchmark::State& state) { alignas(32) uint8_t data[64]; __m256i value = _mm256_setzero_si256(); - __m256i *ptr = reinterpret_cast<__m256i *>(data); + __m256i* ptr = reinterpret_cast<__m256i*>(data); for (auto _ : state) { _mm256_store_si256(ptr, value); @@ -102,4 +103,4 @@ BENCHMARK(BM_Load256); BENCHMARK(BM_Storeu256); BENCHMARK(BM_Store256); -#endif \ No newline at end of file +#endif From 770244acae0d642e9612de96dba57d716c8eca52 Mon Sep 17 00:00:00 2001 From: mperikov Date: Tue, 2 Dec 2025 21:19:42 +0300 Subject: [PATCH 04/12] typo correction --- src/alignment_comparison.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp index 05e2210..1f2b99b 100644 --- a/src/alignment_comparison.cpp +++ b/src/alignment_comparison.cpp @@ -13,7 +13,7 @@ static void BM_Loadu512(benchmark::State& state) { const __m512i* ptr = reinterpret_cast(data); for (auto _ : state) { - benchmark::DoNotOptimize(_mm256_loadu_si512(ptr)); + benchmark::DoNotOptimize(_mm512_loadu_si512(ptr)); } } From 1e4c1b4ec56ce079ed44e5c6b3a318abbf115fcb Mon Sep 17 00:00:00 2001 From: mperikov Date: Tue, 9 Dec 2025 16:00:39 +0300 Subject: [PATCH 05/12] Benhmarks update --- src/alignment_comparison.cpp | 124 +++++++++++++++++++++++++++-------- 1 file changed, 98 insertions(+), 26 deletions(-) diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp index 1f2b99b..afd9db3 100644 --- a/src/alignment_comparison.cpp +++ b/src/alignment_comparison.cpp @@ -7,19 +7,33 @@ #ifdef PIXIE_AVX512_SUPPORT -static void BM_Loadu512(benchmark::State& state) { - alignas(64) uint8_t data[128]; +alignas(64) uint8_t data[128]; - const __m512i* ptr = reinterpret_cast(data); +static void BM_Loadu512_shift63(benchmark::State& state) { + const __m512i* ptr = reinterpret_cast(data + 63); for (auto _ : state) { benchmark::DoNotOptimize(_mm512_loadu_si512(ptr)); } } -static void BM_Load512(benchmark::State& state) { - alignas(64) uint8_t data[128]; +static void BM_Loadu512_shift31(benchmark::State& state) { + const __m512i* ptr = reinterpret_cast(data + 31); + + for (auto _ : state) { + benchmark::DoNotOptimize(_mm512_loadu_si512(ptr)); + } +} +static void BM_Loadu512_shift0(benchmark::State& state) { + const __m512i* ptr = reinterpret_cast(data); + + for (auto _ : state) { + benchmark::DoNotOptimize(_mm512_loadu_si512(ptr)); + } +} + +static void BM_Load512_shift0(benchmark::State& state) { const __m512i* ptr = reinterpret_cast(data); for (auto _ : state) { @@ -27,8 +41,27 @@ static void BM_Load512(benchmark::State& state) { } } -static void BM_Storeu512(benchmark::State& state) { - alignas(64) uint8_t data[128]; +static void BM_Storeu512_shift63(benchmark::State& state) { + __m512i value = _mm512_setzero_si512(); + __m512i* ptr = reinterpret_cast<__m512i*>(data + 63); + + for (auto _ : state) { + _mm512_storeu_si512(ptr, value); + benchmark::DoNotOptimize(*ptr); + } +} + +static void BM_Storeu512_shift31(benchmark::State& state) { + __m512i value = _mm512_setzero_si512(); + __m512i* ptr = reinterpret_cast<__m512i*>(data + 31); + + for (auto _ : state) { + _mm512_storeu_si512(ptr, value); + benchmark::DoNotOptimize(*ptr); + } +} + +static void BM_Storeu512_shift0(benchmark::State& state) { __m512i value = _mm512_setzero_si512(); __m512i* ptr = reinterpret_cast<__m512i*>(data); @@ -38,8 +71,7 @@ static void BM_Storeu512(benchmark::State& state) { } } -static void BM_Store512(benchmark::State& state) { - alignas(64) uint8_t data[128]; +static void BM_Store512_shift0(benchmark::State& state) { __m512i value = _mm512_setzero_si512(); __m512i* ptr = reinterpret_cast<__m512i*>(data); @@ -49,26 +81,44 @@ static void BM_Store512(benchmark::State& state) { } } -BENCHMARK(BM_Loadu512); -BENCHMARK(BM_Load512); -BENCHMARK(BM_Storeu512); -BENCHMARK(BM_Store512); +BENCHMARK(BM_Loadu512_shift63); +BENCHMARK(BM_Loadu512_shift31); +BENCHMARK(BM_Loadu512_shift0); +BENCHMARK(BM_Load512_shift0); +BENCHMARK(BM_Storeu512_shift63); +BENCHMARK(BM_Storeu512_shift31); +BENCHMARK(BM_Storeu512_shift0); +BENCHMARK(BM_Store512_shift0); #else -static void BM_Loadu256(benchmark::State& state) { - alignas(32) uint8_t data[64]; +alignas(64) uint8_t data[128]; - const __m256i* ptr = reinterpret_cast(data); +static void BM_Loadu256_shift63(benchmark::State& state) { + const __m256i* ptr = reinterpret_cast(data + 63); for (auto _ : state) { benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); } } -static void BM_Load256(benchmark::State& state) { - alignas(32) uint8_t data[64]; +static void BM_Loadu256_shift31(benchmark::State& state) { + const __m256i* ptr = reinterpret_cast(data + 31); + + for (auto _ : state) { + benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); + } +} +static void BM_Loadu256_shift0(benchmark::State& state) { + const __m256i* ptr = reinterpret_cast(data); + + for (auto _ : state) { + benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); + } +} + +static void BM_Load256_shift0(benchmark::State& state) { const __m256i* ptr = reinterpret_cast(data); for (auto _ : state) { @@ -76,8 +126,27 @@ static void BM_Load256(benchmark::State& state) { } } -static void BM_Storeu256(benchmark::State& state) { - alignas(32) uint8_t data[64]; +static void BM_Storeu256_shift63(benchmark::State& state) { + __m256i value = _mm256_setzero_si256(); + __m256i* ptr = reinterpret_cast<__m256i*>(data + 63); + + for (auto _ : state) { + _mm256_storeu_si256(ptr, value); + benchmark::DoNotOptimize(*ptr); + } +} + +static void BM_Storeu256_shift31(benchmark::State& state) { + __m256i value = _mm256_setzero_si256(); + __m256i* ptr = reinterpret_cast<__m256i*>(data + 31); + + for (auto _ : state) { + _mm256_storeu_si256(ptr, value); + benchmark::DoNotOptimize(*ptr); + } +} + +static void BM_Storeu256_shift0(benchmark::State& state) { __m256i value = _mm256_setzero_si256(); __m256i* ptr = reinterpret_cast<__m256i*>(data); @@ -87,8 +156,7 @@ static void BM_Storeu256(benchmark::State& state) { } } -static void BM_Store256(benchmark::State& state) { - alignas(32) uint8_t data[64]; +static void BM_Store256_shift0(benchmark::State& state) { __m256i value = _mm256_setzero_si256(); __m256i* ptr = reinterpret_cast<__m256i*>(data); @@ -98,9 +166,13 @@ static void BM_Store256(benchmark::State& state) { } } -BENCHMARK(BM_Loadu256); -BENCHMARK(BM_Load256); -BENCHMARK(BM_Storeu256); -BENCHMARK(BM_Store256); +BENCHMARK(BM_Loadu256_shift63); +BENCHMARK(BM_Loadu256_shift31); +BENCHMARK(BM_Loadu256_shift0); +BENCHMARK(BM_Load256_shift0); +BENCHMARK(BM_Storeu256_shift63); +BENCHMARK(BM_Storeu256_shift31); +BENCHMARK(BM_Storeu256_shift0); +BENCHMARK(BM_Store256_shift0); #endif From 5109a8107a141ecd5041b04435c82d2d77332561 Mon Sep 17 00:00:00 2001 From: mperikov Date: Tue, 9 Dec 2025 19:17:50 +0300 Subject: [PATCH 06/12] Random pointers --- src/alignment_comparison.cpp | 297 +++++++++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp index afd9db3..be759f6 100644 --- a/src/alignment_comparison.cpp +++ b/src/alignment_comparison.cpp @@ -1,6 +1,8 @@ #include #include +#include +#include #include #include "bits.h" @@ -90,6 +92,154 @@ BENCHMARK(BM_Storeu512_shift31); BENCHMARK(BM_Storeu512_shift0); BENCHMARK(BM_Store512_shift0); + +static void BM_Loadu512_Random(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + size_t alignment = 64; + size_t size = 64 * n; + + void* data = std::aligned_alloc(alignment, size); + + for (auto _ : state) { + size_t idx = 1 + 64 * (rng() % (n - 1)); + const __m512i* ptr = reinterpret_cast( + reinterpret_cast(data) + idx); + + auto start = std::chrono::high_resolution_clock::now(); + + _mm512_loadu_si512(ptr); + + auto end = std::chrono::high_resolution_clock::now(); + + std::chrono::duration duration = end - start; + + benchmark::DoNotOptimize(ptr); + + state.SetIterationTime(duration.count()); + } + + std::free(data); +} + +static void BM_Load512_Random(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + size_t alignment = 64; + size_t size = 64 * n; + + void* data = std::aligned_alloc(alignment, size); + + for (auto _ : state) { + size_t idx = 64 * (rng() % (n - 1)); + const __m512i* ptr = reinterpret_cast( + reinterpret_cast(data) + idx); + + auto start = std::chrono::high_resolution_clock::now(); + + _mm512_load_si512(ptr); + + auto end = std::chrono::high_resolution_clock::now(); + + benchmark::DoNotOptimize(ptr); + + std::chrono::duration duration = end - start; + + state.SetIterationTime(duration.count()); + } + + std::free(data); +} + +static void BM_Storeu512_Random(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + size_t alignment = 64; + size_t size = 64 * n; + + void* data = std::aligned_alloc(alignment, size); + + for (auto _ : state) { + size_t idx = 1 + 64 * (rng() % (n - 1)); + __m512i* ptr = + reinterpret_cast<__m512i*>(reinterpret_cast(data) + idx); + __m512i value = _mm512_setzero_si512(); + + auto start = std::chrono::high_resolution_clock::now(); + + _mm512_storeu_si512(ptr, value); + + auto end = std::chrono::high_resolution_clock::now(); + + std::chrono::duration duration = end - start; + + benchmark::DoNotOptimize(ptr); + + state.SetIterationTime(duration.count()); + } + + std::free(data); +} + +static void BM_Store512_Random(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + size_t alignment = 64; + size_t size = 64 * n; + + void* data = std::aligned_alloc(alignment, size); + + for (auto _ : state) { + size_t idx = 64 * (rng() % (n - 1)); + __m512i* ptr = + reinterpret_cast<__m512i*>(reinterpret_cast(data) + idx); + __m512i value = _mm512_setzero_si512(); + + auto start = std::chrono::high_resolution_clock::now(); + + _mm512_store_si512(ptr, value); + + auto end = std::chrono::high_resolution_clock::now(); + + benchmark::DoNotOptimize(ptr); + + std::chrono::duration duration = end - start; + + state.SetIterationTime(duration.count()); + } + + std::free(data); +} + +BENCHMARK(BM_Loadu512_Random) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 20) + ->UseManualTime(); + +BENCHMARK(BM_Load512_Random) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 20) + ->UseManualTime(); + +BENCHMARK(BM_Storeu512_Random) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 20) + ->UseManualTime(); + +BENCHMARK(BM_Store512_Random) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 20) + ->UseManualTime(); + + #else alignas(64) uint8_t data[128]; @@ -175,4 +325,151 @@ BENCHMARK(BM_Storeu256_shift31); BENCHMARK(BM_Storeu256_shift0); BENCHMARK(BM_Store256_shift0); + +static void BM_Loadu256_Random(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + size_t alignment = 64; + size_t size = 64 * n; + + void* data = std::aligned_alloc(alignment, size); + + for (auto _ : state) { + size_t idx = 1 + 32 * (rng() % (n - 1)); + const __m256i* ptr = reinterpret_cast( + reinterpret_cast(data) + idx); + + auto start = std::chrono::high_resolution_clock::now(); + + _mm256_loadu_si256(ptr); + + auto end = std::chrono::high_resolution_clock::now(); + + std::chrono::duration duration = end - start; + + benchmark::DoNotOptimize(ptr); + + state.SetIterationTime(duration.count()); + } + + std::free(data); +} + +static void BM_Load256_Random(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + size_t alignment = 64; + size_t size = 64 * n; + + void* data = std::aligned_alloc(alignment, size); + + for (auto _ : state) { + size_t idx = 32 * (rng() % (n - 1)); + const __m256i* ptr = reinterpret_cast( + reinterpret_cast(data) + idx); + + auto start = std::chrono::high_resolution_clock::now(); + + _mm256_load_si256(ptr); + + auto end = std::chrono::high_resolution_clock::now(); + + benchmark::DoNotOptimize(ptr); + + std::chrono::duration duration = end - start; + + state.SetIterationTime(duration.count()); + } + + std::free(data); +} + +static void BM_Storeu256_Random(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + size_t alignment = 64; + size_t size = 64 * n; + + void* data = std::aligned_alloc(alignment, size); + + for (auto _ : state) { + size_t idx = 1 + 32 * (rng() % (n - 1)); + __m256i* ptr = + reinterpret_cast<__m256i*>(reinterpret_cast(data) + idx); + __m256i value = _mm256_setzero_si256(); + + auto start = std::chrono::high_resolution_clock::now(); + + _mm256_storeu_si256(ptr, value); + + auto end = std::chrono::high_resolution_clock::now(); + + std::chrono::duration duration = end - start; + + benchmark::DoNotOptimize(ptr); + + state.SetIterationTime(duration.count()); + } + + std::free(data); +} + +static void BM_Store256_Random(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + size_t alignment = 64; + size_t size = 64 * n; + + void* data = std::aligned_alloc(alignment, size); + + for (auto _ : state) { + size_t idx = 32 * (rng() % (n - 1)); + __m256i* ptr = + reinterpret_cast<__m256i*>(reinterpret_cast(data) + idx); + __m256i value = _mm256_setzero_si256(); + + auto start = std::chrono::high_resolution_clock::now(); + + _mm256_store_si256(ptr, value); + + auto end = std::chrono::high_resolution_clock::now(); + + benchmark::DoNotOptimize(ptr); + + std::chrono::duration duration = end - start; + + state.SetIterationTime(duration.count()); + } + + std::free(data); +} + +BENCHMARK(BM_Loadu256_Random) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 20) + ->UseManualTime(); + +BENCHMARK(BM_Load256_Random) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 20) + ->UseManualTime(); + +BENCHMARK(BM_Storeu256_Random) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 20) + ->UseManualTime(); + +BENCHMARK(BM_Store256_Random) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 20) + ->UseManualTime(); + #endif From 1f77b2bcfc52cb494b4fc043bd13304afad791d0 Mon Sep 17 00:00:00 2001 From: mperikov Date: Tue, 9 Dec 2025 19:25:01 +0300 Subject: [PATCH 07/12] Format fix --- src/alignment_comparison.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp index be759f6..35f72fd 100644 --- a/src/alignment_comparison.cpp +++ b/src/alignment_comparison.cpp @@ -92,7 +92,6 @@ BENCHMARK(BM_Storeu512_shift31); BENCHMARK(BM_Storeu512_shift0); BENCHMARK(BM_Store512_shift0); - static void BM_Loadu512_Random(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -239,7 +238,6 @@ BENCHMARK(BM_Store512_Random) ->Range(2, 1 << 20) ->UseManualTime(); - #else alignas(64) uint8_t data[128]; @@ -325,7 +323,6 @@ BENCHMARK(BM_Storeu256_shift31); BENCHMARK(BM_Storeu256_shift0); BENCHMARK(BM_Store256_shift0); - static void BM_Loadu256_Random(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); From a917da8b84b95a8bc998761649aca5378ba1df7d Mon Sep 17 00:00:00 2001 From: mperikov Date: Tue, 9 Dec 2025 22:25:56 +0300 Subject: [PATCH 08/12] Random benchmarks fix --- src/alignment_comparison.cpp | 98 ++++-------------------------------- 1 file changed, 10 insertions(+), 88 deletions(-) diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp index 35f72fd..6aadcff 100644 --- a/src/alignment_comparison.cpp +++ b/src/alignment_comparison.cpp @@ -106,17 +106,9 @@ static void BM_Loadu512_Random(benchmark::State& state) { const __m512i* ptr = reinterpret_cast( reinterpret_cast(data) + idx); - auto start = std::chrono::high_resolution_clock::now(); - _mm512_loadu_si512(ptr); - auto end = std::chrono::high_resolution_clock::now(); - - std::chrono::duration duration = end - start; - benchmark::DoNotOptimize(ptr); - - state.SetIterationTime(duration.count()); } std::free(data); @@ -136,17 +128,9 @@ static void BM_Load512_Random(benchmark::State& state) { const __m512i* ptr = reinterpret_cast( reinterpret_cast(data) + idx); - auto start = std::chrono::high_resolution_clock::now(); - _mm512_load_si512(ptr); - auto end = std::chrono::high_resolution_clock::now(); - benchmark::DoNotOptimize(ptr); - - std::chrono::duration duration = end - start; - - state.SetIterationTime(duration.count()); } std::free(data); @@ -167,17 +151,9 @@ static void BM_Storeu512_Random(benchmark::State& state) { reinterpret_cast<__m512i*>(reinterpret_cast(data) + idx); __m512i value = _mm512_setzero_si512(); - auto start = std::chrono::high_resolution_clock::now(); - _mm512_storeu_si512(ptr, value); - auto end = std::chrono::high_resolution_clock::now(); - - std::chrono::duration duration = end - start; - benchmark::DoNotOptimize(ptr); - - state.SetIterationTime(duration.count()); } std::free(data); @@ -198,17 +174,9 @@ static void BM_Store512_Random(benchmark::State& state) { reinterpret_cast<__m512i*>(reinterpret_cast(data) + idx); __m512i value = _mm512_setzero_si512(); - auto start = std::chrono::high_resolution_clock::now(); - _mm512_store_si512(ptr, value); - auto end = std::chrono::high_resolution_clock::now(); - benchmark::DoNotOptimize(ptr); - - std::chrono::duration duration = end - start; - - state.SetIterationTime(duration.count()); } std::free(data); @@ -217,26 +185,22 @@ static void BM_Store512_Random(benchmark::State& state) { BENCHMARK(BM_Loadu512_Random) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20) - ->UseManualTime(); + ->Range(2, 1 << 20); BENCHMARK(BM_Load512_Random) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20) - ->UseManualTime(); + ->Range(2, 1 << 20); BENCHMARK(BM_Storeu512_Random) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20) - ->UseManualTime(); + ->Range(2, 1 << 20); BENCHMARK(BM_Store512_Random) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20) - ->UseManualTime(); + ->Range(2, 1 << 20); #else @@ -337,17 +301,7 @@ static void BM_Loadu256_Random(benchmark::State& state) { const __m256i* ptr = reinterpret_cast( reinterpret_cast(data) + idx); - auto start = std::chrono::high_resolution_clock::now(); - - _mm256_loadu_si256(ptr); - - auto end = std::chrono::high_resolution_clock::now(); - - std::chrono::duration duration = end - start; - - benchmark::DoNotOptimize(ptr); - - state.SetIterationTime(duration.count()); + benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); } std::free(data); @@ -367,17 +321,7 @@ static void BM_Load256_Random(benchmark::State& state) { const __m256i* ptr = reinterpret_cast( reinterpret_cast(data) + idx); - auto start = std::chrono::high_resolution_clock::now(); - - _mm256_load_si256(ptr); - - auto end = std::chrono::high_resolution_clock::now(); - - benchmark::DoNotOptimize(ptr); - - std::chrono::duration duration = end - start; - - state.SetIterationTime(duration.count()); + benchmark::DoNotOptimize(_mm256_load_si256(ptr)); } std::free(data); @@ -398,20 +342,10 @@ static void BM_Storeu256_Random(benchmark::State& state) { reinterpret_cast<__m256i*>(reinterpret_cast(data) + idx); __m256i value = _mm256_setzero_si256(); - auto start = std::chrono::high_resolution_clock::now(); - _mm256_storeu_si256(ptr, value); - auto end = std::chrono::high_resolution_clock::now(); - - std::chrono::duration duration = end - start; - benchmark::DoNotOptimize(ptr); - - state.SetIterationTime(duration.count()); } - - std::free(data); } static void BM_Store256_Random(benchmark::State& state) { @@ -429,17 +363,9 @@ static void BM_Store256_Random(benchmark::State& state) { reinterpret_cast<__m256i*>(reinterpret_cast(data) + idx); __m256i value = _mm256_setzero_si256(); - auto start = std::chrono::high_resolution_clock::now(); - _mm256_store_si256(ptr, value); - auto end = std::chrono::high_resolution_clock::now(); - benchmark::DoNotOptimize(ptr); - - std::chrono::duration duration = end - start; - - state.SetIterationTime(duration.count()); } std::free(data); @@ -448,25 +374,21 @@ static void BM_Store256_Random(benchmark::State& state) { BENCHMARK(BM_Loadu256_Random) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20) - ->UseManualTime(); + ->Range(2, 1 << 20); BENCHMARK(BM_Load256_Random) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20) - ->UseManualTime(); + ->Range(2, 1 << 20); BENCHMARK(BM_Storeu256_Random) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20) - ->UseManualTime(); + ->Range(2, 1 << 20); BENCHMARK(BM_Store256_Random) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20) - ->UseManualTime(); + ->Range(2, 1 << 20); #endif From caf8ea3070da366401804fc67fe9d82cc5669a4a Mon Sep 17 00:00:00 2001 From: mperikov Date: Wed, 10 Dec 2025 15:01:21 +0300 Subject: [PATCH 09/12] 4 types of tests --- src/alignment_comparison.cpp | 348 ++++++++++++----------------------- 1 file changed, 118 insertions(+), 230 deletions(-) diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp index 6aadcff..2b1af2d 100644 --- a/src/alignment_comparison.cpp +++ b/src/alignment_comparison.cpp @@ -7,339 +7,214 @@ #include "bits.h" -#ifdef PIXIE_AVX512_SUPPORT - -alignas(64) uint8_t data[128]; - -static void BM_Loadu512_shift63(benchmark::State& state) { - const __m512i* ptr = reinterpret_cast(data + 63); +uint8_t data[1 << 29]; - for (auto _ : state) { - benchmark::DoNotOptimize(_mm512_loadu_si512(ptr)); - } -} +#ifdef PIXIE_AVX512_SUPPORT -static void BM_Loadu512_shift31(benchmark::State& state) { - const __m512i* ptr = reinterpret_cast(data + 31); +static void BM_Loadu512_aligned(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); for (auto _ : state) { - benchmark::DoNotOptimize(_mm512_loadu_si512(ptr)); - } -} - -static void BM_Loadu512_shift0(benchmark::State& state) { - const __m512i* ptr = reinterpret_cast(data); + size_t idx = 64 * (rng() % (n - 1)); + const __m512i* ptr = reinterpret_cast(data + idx); - for (auto _ : state) { benchmark::DoNotOptimize(_mm512_loadu_si512(ptr)); } } -static void BM_Load512_shift0(benchmark::State& state) { - const __m512i* ptr = reinterpret_cast(data); - - for (auto _ : state) { - benchmark::DoNotOptimize(_mm512_load_si512(ptr)); - } -} - -static void BM_Storeu512_shift63(benchmark::State& state) { - __m512i value = _mm512_setzero_si512(); - __m512i* ptr = reinterpret_cast<__m512i*>(data + 63); - - for (auto _ : state) { - _mm512_storeu_si512(ptr, value); - benchmark::DoNotOptimize(*ptr); - } -} - -static void BM_Storeu512_shift31(benchmark::State& state) { - __m512i value = _mm512_setzero_si512(); - __m512i* ptr = reinterpret_cast<__m512i*>(data + 31); - - for (auto _ : state) { - _mm512_storeu_si512(ptr, value); - benchmark::DoNotOptimize(*ptr); - } -} - -static void BM_Storeu512_shift0(benchmark::State& state) { - __m512i value = _mm512_setzero_si512(); - __m512i* ptr = reinterpret_cast<__m512i*>(data); +static void BM_Loadu512_unaligned_crossing_64byte_border(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); for (auto _ : state) { - _mm512_storeu_si512(ptr, value); - benchmark::DoNotOptimize(*ptr); - } -} + size_t idx = 48 + 64 * (rng() % (n - 1)); + const __m512i* ptr = reinterpret_cast(data + idx); -static void BM_Store512_shift0(benchmark::State& state) { - __m512i value = _mm512_setzero_si512(); - __m512i* ptr = reinterpret_cast<__m512i*>(data); - - for (auto _ : state) { - _mm512_store_si512(ptr, value); - benchmark::DoNotOptimize(*ptr); + benchmark::DoNotOptimize(_mm512_loadu_si512(ptr)); } } -BENCHMARK(BM_Loadu512_shift63); -BENCHMARK(BM_Loadu512_shift31); -BENCHMARK(BM_Loadu512_shift0); -BENCHMARK(BM_Load512_shift0); -BENCHMARK(BM_Storeu512_shift63); -BENCHMARK(BM_Storeu512_shift31); -BENCHMARK(BM_Storeu512_shift0); -BENCHMARK(BM_Store512_shift0); - -static void BM_Loadu512_Random(benchmark::State& state) { +static void BM_Load512_aligned(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); - size_t alignment = 64; - size_t size = 64 * n; - - void* data = std::aligned_alloc(alignment, size); - for (auto _ : state) { - size_t idx = 1 + 64 * (rng() % (n - 1)); - const __m512i* ptr = reinterpret_cast( - reinterpret_cast(data) + idx); - - _mm512_loadu_si512(ptr); + size_t idx = 64 * (rng() % (n - 1)); + const __m512i* ptr = reinterpret_cast(data + idx); - benchmark::DoNotOptimize(ptr); + benchmark::DoNotOptimize(_mm512_load_si512(ptr)); } - - std::free(data); } -static void BM_Load512_Random(benchmark::State& state) { +static void BM_Storeu512_aligned(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); - size_t alignment = 64; - size_t size = 64 * n; - - void* data = std::aligned_alloc(alignment, size); - for (auto _ : state) { size_t idx = 64 * (rng() % (n - 1)); - const __m512i* ptr = reinterpret_cast( - reinterpret_cast(data) + idx); + __m512i* ptr = + reinterpret_cast<__m512i*>(data + idx); + __m512i value = _mm512_setzero_si512(); - _mm512_load_si512(ptr); + _mm512_storeu_si512(ptr, value); benchmark::DoNotOptimize(ptr); } - - std::free(data); } -static void BM_Storeu512_Random(benchmark::State& state) { +static void BM_Storeu512_unaligned_crossing_64byte_border(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); - size_t alignment = 64; - size_t size = 64 * n; - - void* data = std::aligned_alloc(alignment, size); - for (auto _ : state) { - size_t idx = 1 + 64 * (rng() % (n - 1)); + size_t idx = 48 + 64 * (rng() % (n - 1)); __m512i* ptr = - reinterpret_cast<__m512i*>(reinterpret_cast(data) + idx); + reinterpret_cast<__m512i*>(data + idx); __m512i value = _mm512_setzero_si512(); _mm512_storeu_si512(ptr, value); benchmark::DoNotOptimize(ptr); } - - std::free(data); } -static void BM_Store512_Random(benchmark::State& state) { +static void BM_Store512_aligned(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); - size_t alignment = 64; - size_t size = 64 * n; - - void* data = std::aligned_alloc(alignment, size); - for (auto _ : state) { size_t idx = 64 * (rng() % (n - 1)); __m512i* ptr = - reinterpret_cast<__m512i*>(reinterpret_cast(data) + idx); + reinterpret_cast<__m512i*>(data + idx); __m512i value = _mm512_setzero_si512(); _mm512_store_si512(ptr, value); benchmark::DoNotOptimize(ptr); } - - std::free(data); } -BENCHMARK(BM_Loadu512_Random) +BENCHMARK(BM_Loadu512_aligned) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20); + ->Range(2, 1 << 23); -BENCHMARK(BM_Load512_Random) +BENCHMARK(BM_Loadu512_unaligned_crossing_64byte_border) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20); + ->Range(2, 1 << 23); -BENCHMARK(BM_Storeu512_Random) +BENCHMARK(BM_Load512_aligned) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20); + ->Range(2, 1 << 23); -BENCHMARK(BM_Store512_Random) +BENCHMARK(BM_Storeu512_aligned) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20); + ->Range(2, 1 << 23); -#else - -alignas(64) uint8_t data[128]; +BENCHMARK(BM_Storeu512_unaligned_crossing_64byte_border) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 23); -static void BM_Loadu256_shift63(benchmark::State& state) { - const __m256i* ptr = reinterpret_cast(data + 63); +BENCHMARK(BM_Store512_aligned) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 23); - for (auto _ : state) { - benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); - } -} +#else -static void BM_Loadu256_shift31(benchmark::State& state) { - const __m256i* ptr = reinterpret_cast(data + 31); +static void BM_Loadu256_aligned(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); for (auto _ : state) { - benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); - } -} - -static void BM_Loadu256_shift0(benchmark::State& state) { - const __m256i* ptr = reinterpret_cast(data); + size_t idx = 64 * (rng() % (n - 1)); + const __m256i* ptr = reinterpret_cast(data + idx); - for (auto _ : state) { benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); } } -static void BM_Load256_shift0(benchmark::State& state) { - const __m256i* ptr = reinterpret_cast(data); +static void BM_Loadu256_unaligned(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); for (auto _ : state) { - benchmark::DoNotOptimize(_mm256_load_si256(ptr)); - } -} - -static void BM_Storeu256_shift63(benchmark::State& state) { - __m256i value = _mm256_setzero_si256(); - __m256i* ptr = reinterpret_cast<__m256i*>(data + 63); + size_t idx = 16 + 64 * (rng() % (n - 1)); + const __m256i* ptr = reinterpret_cast(data + idx); - for (auto _ : state) { - _mm256_storeu_si256(ptr, value); - benchmark::DoNotOptimize(*ptr); + benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); } } -static void BM_Storeu256_shift31(benchmark::State& state) { - __m256i value = _mm256_setzero_si256(); - __m256i* ptr = reinterpret_cast<__m256i*>(data + 31); +static void BM_Loadu256_unaligned_crossing_64byte_border(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); for (auto _ : state) { - _mm256_storeu_si256(ptr, value); - benchmark::DoNotOptimize(*ptr); - } -} - -static void BM_Storeu256_shift0(benchmark::State& state) { - __m256i value = _mm256_setzero_si256(); - __m256i* ptr = reinterpret_cast<__m256i*>(data); + size_t idx = 48 + 64 * (rng() % (n - 1)); + const __m256i* ptr = reinterpret_cast(data + idx); - for (auto _ : state) { - _mm256_storeu_si256(ptr, value); - benchmark::DoNotOptimize(*ptr); + benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); } } -static void BM_Store256_shift0(benchmark::State& state) { - __m256i value = _mm256_setzero_si256(); - __m256i* ptr = reinterpret_cast<__m256i*>(data); +static void BM_Load256_aligned(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); for (auto _ : state) { - _mm256_store_si256(ptr, value); - benchmark::DoNotOptimize(*ptr); + size_t idx = 64 * (rng() % (n - 1)); + const __m256i* ptr = reinterpret_cast(data + idx); + + benchmark::DoNotOptimize(_mm256_load_si256(ptr)); } } -BENCHMARK(BM_Loadu256_shift63); -BENCHMARK(BM_Loadu256_shift31); -BENCHMARK(BM_Loadu256_shift0); -BENCHMARK(BM_Load256_shift0); -BENCHMARK(BM_Storeu256_shift63); -BENCHMARK(BM_Storeu256_shift31); -BENCHMARK(BM_Storeu256_shift0); -BENCHMARK(BM_Store256_shift0); - -static void BM_Loadu256_Random(benchmark::State& state) { +static void BM_Storeu256_aligned(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); - size_t alignment = 64; - size_t size = 64 * n; - - void* data = std::aligned_alloc(alignment, size); - for (auto _ : state) { - size_t idx = 1 + 32 * (rng() % (n - 1)); - const __m256i* ptr = reinterpret_cast( - reinterpret_cast(data) + idx); + size_t idx = 64 * (rng() % (n - 1)); + __m256i* ptr = + reinterpret_cast<__m256i*>(data + idx); + __m256i value = _mm256_setzero_si256(); - benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); - } + _mm256_storeu_si256(ptr, value); - std::free(data); + benchmark::DoNotOptimize(ptr); + } } -static void BM_Load256_Random(benchmark::State& state) { +static void BM_Storeu256_unaligned(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); - size_t alignment = 64; - size_t size = 64 * n; - - void* data = std::aligned_alloc(alignment, size); - for (auto _ : state) { - size_t idx = 32 * (rng() % (n - 1)); - const __m256i* ptr = reinterpret_cast( - reinterpret_cast(data) + idx); + size_t idx = 16 + 64 * (rng() % (n - 1)); + __m256i* ptr = + reinterpret_cast<__m256i*>(data + idx); + __m256i value = _mm256_setzero_si256(); - benchmark::DoNotOptimize(_mm256_load_si256(ptr)); - } + _mm256_storeu_si256(ptr, value); - std::free(data); + benchmark::DoNotOptimize(ptr); + } } -static void BM_Storeu256_Random(benchmark::State& state) { +static void BM_Storeu256_unaligned_crossing_64byte_border(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); - size_t alignment = 64; - size_t size = 64 * n; - - void* data = std::aligned_alloc(alignment, size); - for (auto _ : state) { - size_t idx = 1 + 32 * (rng() % (n - 1)); + size_t idx = 48 + 64 * (rng() % (n - 1)); __m256i* ptr = - reinterpret_cast<__m256i*>(reinterpret_cast(data) + idx); + reinterpret_cast<__m256i*>(data + idx); __m256i value = _mm256_setzero_si256(); _mm256_storeu_si256(ptr, value); @@ -348,47 +223,60 @@ static void BM_Storeu256_Random(benchmark::State& state) { } } -static void BM_Store256_Random(benchmark::State& state) { +static void BM_Store256_aligned(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); - size_t alignment = 64; - size_t size = 64 * n; - - void* data = std::aligned_alloc(alignment, size); - for (auto _ : state) { - size_t idx = 32 * (rng() % (n - 1)); + size_t idx = 64 * (rng() % (n - 1)); __m256i* ptr = - reinterpret_cast<__m256i*>(reinterpret_cast(data) + idx); + reinterpret_cast<__m256i*>(data + idx); __m256i value = _mm256_setzero_si256(); _mm256_store_si256(ptr, value); benchmark::DoNotOptimize(ptr); } - - std::free(data); } -BENCHMARK(BM_Loadu256_Random) +BENCHMARK(BM_Loadu256_aligned) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 23); + +BENCHMARK(BM_Loadu256_unaligned) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 23); + +BENCHMARK(BM_Loadu256_unaligned_crossing_64byte_border) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 23); + +BENCHMARK(BM_Load256_aligned) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(2, 1 << 23); + +BENCHMARK(BM_Storeu256_aligned) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20); + ->Range(2, 1 << 23); -BENCHMARK(BM_Load256_Random) +BENCHMARK(BM_Storeu256_unaligned) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20); + ->Range(2, 1 << 23); -BENCHMARK(BM_Storeu256_Random) +BENCHMARK(BM_Storeu256_unaligned_crossing_64byte_border) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20); + ->Range(2, 1 << 23); -BENCHMARK(BM_Store256_Random) +BENCHMARK(BM_Store256_aligned) ->ArgNames({"n"}) ->RangeMultiplier(4) - ->Range(2, 1 << 20); + ->Range(2, 1 << 23); #endif From 6c897d5d805420563d3912958cfbc5b0cf790a4f Mon Sep 17 00:00:00 2001 From: mperikov Date: Wed, 10 Dec 2025 15:03:07 +0300 Subject: [PATCH 10/12] Format --- src/alignment_comparison.cpp | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp index 2b1af2d..80f8659 100644 --- a/src/alignment_comparison.cpp +++ b/src/alignment_comparison.cpp @@ -23,7 +23,8 @@ static void BM_Loadu512_aligned(benchmark::State& state) { } } -static void BM_Loadu512_unaligned_crossing_64byte_border(benchmark::State& state) { +static void BM_Loadu512_unaligned_crossing_64byte_border( + benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -53,8 +54,7 @@ static void BM_Storeu512_aligned(benchmark::State& state) { for (auto _ : state) { size_t idx = 64 * (rng() % (n - 1)); - __m512i* ptr = - reinterpret_cast<__m512i*>(data + idx); + __m512i* ptr = reinterpret_cast<__m512i*>(data + idx); __m512i value = _mm512_setzero_si512(); _mm512_storeu_si512(ptr, value); @@ -63,14 +63,14 @@ static void BM_Storeu512_aligned(benchmark::State& state) { } } -static void BM_Storeu512_unaligned_crossing_64byte_border(benchmark::State& state) { +static void BM_Storeu512_unaligned_crossing_64byte_border( + benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { size_t idx = 48 + 64 * (rng() % (n - 1)); - __m512i* ptr = - reinterpret_cast<__m512i*>(data + idx); + __m512i* ptr = reinterpret_cast<__m512i*>(data + idx); __m512i value = _mm512_setzero_si512(); _mm512_storeu_si512(ptr, value); @@ -85,8 +85,7 @@ static void BM_Store512_aligned(benchmark::State& state) { for (auto _ : state) { size_t idx = 64 * (rng() % (n - 1)); - __m512i* ptr = - reinterpret_cast<__m512i*>(data + idx); + __m512i* ptr = reinterpret_cast<__m512i*>(data + idx); __m512i value = _mm512_setzero_si512(); _mm512_store_si512(ptr, value); @@ -151,7 +150,8 @@ static void BM_Loadu256_unaligned(benchmark::State& state) { } } -static void BM_Loadu256_unaligned_crossing_64byte_border(benchmark::State& state) { +static void BM_Loadu256_unaligned_crossing_64byte_border( + benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -181,8 +181,7 @@ static void BM_Storeu256_aligned(benchmark::State& state) { for (auto _ : state) { size_t idx = 64 * (rng() % (n - 1)); - __m256i* ptr = - reinterpret_cast<__m256i*>(data + idx); + __m256i* ptr = reinterpret_cast<__m256i*>(data + idx); __m256i value = _mm256_setzero_si256(); _mm256_storeu_si256(ptr, value); @@ -197,8 +196,7 @@ static void BM_Storeu256_unaligned(benchmark::State& state) { for (auto _ : state) { size_t idx = 16 + 64 * (rng() % (n - 1)); - __m256i* ptr = - reinterpret_cast<__m256i*>(data + idx); + __m256i* ptr = reinterpret_cast<__m256i*>(data + idx); __m256i value = _mm256_setzero_si256(); _mm256_storeu_si256(ptr, value); @@ -207,14 +205,14 @@ static void BM_Storeu256_unaligned(benchmark::State& state) { } } -static void BM_Storeu256_unaligned_crossing_64byte_border(benchmark::State& state) { +static void BM_Storeu256_unaligned_crossing_64byte_border( + benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { size_t idx = 48 + 64 * (rng() % (n - 1)); - __m256i* ptr = - reinterpret_cast<__m256i*>(data + idx); + __m256i* ptr = reinterpret_cast<__m256i*>(data + idx); __m256i value = _mm256_setzero_si256(); _mm256_storeu_si256(ptr, value); @@ -229,8 +227,7 @@ static void BM_Store256_aligned(benchmark::State& state) { for (auto _ : state) { size_t idx = 64 * (rng() % (n - 1)); - __m256i* ptr = - reinterpret_cast<__m256i*>(data + idx); + __m256i* ptr = reinterpret_cast<__m256i*>(data + idx); __m256i value = _mm256_setzero_si256(); _mm256_store_si256(ptr, value); From e922dc77332da22535bab48411451496e265f7d8 Mon Sep 17 00:00:00 2001 From: mperikov Date: Mon, 15 Dec 2025 19:25:26 +0300 Subject: [PATCH 11/12] array alignas and size fix --- src/alignment_comparison.cpp | 128 ++++++++++++++++------------------- 1 file changed, 57 insertions(+), 71 deletions(-) diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp index 80f8659..a039e4c 100644 --- a/src/alignment_comparison.cpp +++ b/src/alignment_comparison.cpp @@ -7,16 +7,16 @@ #include "bits.h" -uint8_t data[1 << 29]; +alignas(64) uint8_t data[(1 << 29) + 1]; #ifdef PIXIE_AVX512_SUPPORT static void BM_Loadu512_aligned(benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 64 * (rng() % (n - 1)); + size_t idx = (rng() & ((1 << k) - 1)) << 6; const __m512i* ptr = reinterpret_cast(data + idx); benchmark::DoNotOptimize(_mm512_loadu_si512(ptr)); @@ -25,11 +25,11 @@ static void BM_Loadu512_aligned(benchmark::State& state) { static void BM_Loadu512_unaligned_crossing_64byte_border( benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 48 + 64 * (rng() % (n - 1)); + size_t idx = ((rng() & ((1 << k) - 1)) << 6) + 48; const __m512i* ptr = reinterpret_cast(data + idx); benchmark::DoNotOptimize(_mm512_loadu_si512(ptr)); @@ -37,11 +37,11 @@ static void BM_Loadu512_unaligned_crossing_64byte_border( } static void BM_Load512_aligned(benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 64 * (rng() % (n - 1)); + size_t idx = (rng() & ((1 << k) - 1)) << 6; const __m512i* ptr = reinterpret_cast(data + idx); benchmark::DoNotOptimize(_mm512_load_si512(ptr)); @@ -49,11 +49,11 @@ static void BM_Load512_aligned(benchmark::State& state) { } static void BM_Storeu512_aligned(benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 64 * (rng() % (n - 1)); + size_t idx = (rng() & ((1 << k) - 1)) << 6; __m512i* ptr = reinterpret_cast<__m512i*>(data + idx); __m512i value = _mm512_setzero_si512(); @@ -65,11 +65,11 @@ static void BM_Storeu512_aligned(benchmark::State& state) { static void BM_Storeu512_unaligned_crossing_64byte_border( benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 48 + 64 * (rng() % (n - 1)); + size_t idx = ((rng() & ((1 << k) - 1)) << 6) + 48; __m512i* ptr = reinterpret_cast<__m512i*>(data + idx); __m512i value = _mm512_setzero_si512(); @@ -80,11 +80,11 @@ static void BM_Storeu512_unaligned_crossing_64byte_border( } static void BM_Store512_aligned(benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 64 * (rng() % (n - 1)); + size_t idx = (rng() & ((1 << k) - 1)) << 6; __m512i* ptr = reinterpret_cast<__m512i*>(data + idx); __m512i value = _mm512_setzero_si512(); @@ -95,43 +95,37 @@ static void BM_Store512_aligned(benchmark::State& state) { } BENCHMARK(BM_Loadu512_aligned) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); BENCHMARK(BM_Loadu512_unaligned_crossing_64byte_border) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); BENCHMARK(BM_Load512_aligned) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); BENCHMARK(BM_Storeu512_aligned) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); BENCHMARK(BM_Storeu512_unaligned_crossing_64byte_border) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); BENCHMARK(BM_Store512_aligned) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); #else static void BM_Loadu256_aligned(benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 64 * (rng() % (n - 1)); + size_t idx = (rng() & ((1 << k) - 1)) << 6; const __m256i* ptr = reinterpret_cast(data + idx); benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); @@ -139,11 +133,11 @@ static void BM_Loadu256_aligned(benchmark::State& state) { } static void BM_Loadu256_unaligned(benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 16 + 64 * (rng() % (n - 1)); + size_t idx = ((rng() & ((1 << k) - 1)) << 6) + 16; const __m256i* ptr = reinterpret_cast(data + idx); benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); @@ -152,11 +146,11 @@ static void BM_Loadu256_unaligned(benchmark::State& state) { static void BM_Loadu256_unaligned_crossing_64byte_border( benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 48 + 64 * (rng() % (n - 1)); + size_t idx = ((rng() & ((1 << k) - 1)) << 6) + 48; const __m256i* ptr = reinterpret_cast(data + idx); benchmark::DoNotOptimize(_mm256_loadu_si256(ptr)); @@ -164,11 +158,11 @@ static void BM_Loadu256_unaligned_crossing_64byte_border( } static void BM_Load256_aligned(benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 64 * (rng() % (n - 1)); + size_t idx = (rng() & ((1 << k) - 1)) << 6; const __m256i* ptr = reinterpret_cast(data + idx); benchmark::DoNotOptimize(_mm256_load_si256(ptr)); @@ -176,11 +170,11 @@ static void BM_Load256_aligned(benchmark::State& state) { } static void BM_Storeu256_aligned(benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 64 * (rng() % (n - 1)); + size_t idx = (rng() & ((1 << k) - 1)) << 6; __m256i* ptr = reinterpret_cast<__m256i*>(data + idx); __m256i value = _mm256_setzero_si256(); @@ -191,11 +185,11 @@ static void BM_Storeu256_aligned(benchmark::State& state) { } static void BM_Storeu256_unaligned(benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 16 + 64 * (rng() % (n - 1)); + size_t idx = ((rng() & ((1 << k) - 1)) << 6) + 16; __m256i* ptr = reinterpret_cast<__m256i*>(data + idx); __m256i value = _mm256_setzero_si256(); @@ -207,11 +201,11 @@ static void BM_Storeu256_unaligned(benchmark::State& state) { static void BM_Storeu256_unaligned_crossing_64byte_border( benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 48 + 64 * (rng() % (n - 1)); + size_t idx = ((rng() & ((1 << k) - 1)) << 6) + 48; __m256i* ptr = reinterpret_cast<__m256i*>(data + idx); __m256i value = _mm256_setzero_si256(); @@ -222,11 +216,11 @@ static void BM_Storeu256_unaligned_crossing_64byte_border( } static void BM_Store256_aligned(benchmark::State& state) { - size_t n = state.range(0); + size_t k = state.range(0); std::mt19937_64 rng(42); for (auto _ : state) { - size_t idx = 64 * (rng() % (n - 1)); + size_t idx = (rng() & ((1 << k) - 1)) << 6; __m256i* ptr = reinterpret_cast<__m256i*>(data + idx); __m256i value = _mm256_setzero_si256(); @@ -237,43 +231,35 @@ static void BM_Store256_aligned(benchmark::State& state) { } BENCHMARK(BM_Loadu256_aligned) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); BENCHMARK(BM_Loadu256_unaligned) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); BENCHMARK(BM_Loadu256_unaligned_crossing_64byte_border) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); BENCHMARK(BM_Load256_aligned) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); BENCHMARK(BM_Storeu256_aligned) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); BENCHMARK(BM_Storeu256_unaligned) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); BENCHMARK(BM_Storeu256_unaligned_crossing_64byte_border) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); BENCHMARK(BM_Store256_aligned) - ->ArgNames({"n"}) - ->RangeMultiplier(4) - ->Range(2, 1 << 23); + ->ArgNames({"k"}) + ->DenseRange(1, 23, 2); #endif From fb88e0b61909fbbc895261cf321bc24ee8e985e0 Mon Sep 17 00:00:00 2001 From: mperikov Date: Mon, 15 Dec 2025 19:27:21 +0300 Subject: [PATCH 12/12] Format fix --- src/alignment_comparison.cpp | 40 +++++++++--------------------------- 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp index a039e4c..ff1b024 100644 --- a/src/alignment_comparison.cpp +++ b/src/alignment_comparison.cpp @@ -94,29 +94,21 @@ static void BM_Store512_aligned(benchmark::State& state) { } } -BENCHMARK(BM_Loadu512_aligned) - ->ArgNames({"k"}) - ->DenseRange(1, 23, 2); +BENCHMARK(BM_Loadu512_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2); BENCHMARK(BM_Loadu512_unaligned_crossing_64byte_border) ->ArgNames({"k"}) ->DenseRange(1, 23, 2); -BENCHMARK(BM_Load512_aligned) - ->ArgNames({"k"}) - ->DenseRange(1, 23, 2); +BENCHMARK(BM_Load512_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2); -BENCHMARK(BM_Storeu512_aligned) - ->ArgNames({"k"}) - ->DenseRange(1, 23, 2); +BENCHMARK(BM_Storeu512_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2); BENCHMARK(BM_Storeu512_unaligned_crossing_64byte_border) ->ArgNames({"k"}) ->DenseRange(1, 23, 2); -BENCHMARK(BM_Store512_aligned) - ->ArgNames({"k"}) - ->DenseRange(1, 23, 2); +BENCHMARK(BM_Store512_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2); #else @@ -230,36 +222,24 @@ static void BM_Store256_aligned(benchmark::State& state) { } } -BENCHMARK(BM_Loadu256_aligned) - ->ArgNames({"k"}) - ->DenseRange(1, 23, 2); +BENCHMARK(BM_Loadu256_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2); -BENCHMARK(BM_Loadu256_unaligned) - ->ArgNames({"k"}) - ->DenseRange(1, 23, 2); +BENCHMARK(BM_Loadu256_unaligned)->ArgNames({"k"})->DenseRange(1, 23, 2); BENCHMARK(BM_Loadu256_unaligned_crossing_64byte_border) ->ArgNames({"k"}) ->DenseRange(1, 23, 2); -BENCHMARK(BM_Load256_aligned) - ->ArgNames({"k"}) - ->DenseRange(1, 23, 2); +BENCHMARK(BM_Load256_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2); -BENCHMARK(BM_Storeu256_aligned) - ->ArgNames({"k"}) - ->DenseRange(1, 23, 2); +BENCHMARK(BM_Storeu256_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2); -BENCHMARK(BM_Storeu256_unaligned) - ->ArgNames({"k"}) - ->DenseRange(1, 23, 2); +BENCHMARK(BM_Storeu256_unaligned)->ArgNames({"k"})->DenseRange(1, 23, 2); BENCHMARK(BM_Storeu256_unaligned_crossing_64byte_border) ->ArgNames({"k"}) ->DenseRange(1, 23, 2); -BENCHMARK(BM_Store256_aligned) - ->ArgNames({"k"}) - ->DenseRange(1, 23, 2); +BENCHMARK(BM_Store256_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2); #endif