diff --git a/CMakeLists.txt b/CMakeLists.txt
index 63b1877..487ff0c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,6 +59,7 @@ target_link_libraries(benchmarks
   benchmark
   benchmark_main
   pasta_bit_vector)
+
 add_test(
   NAME Benchmarks
   COMMAND bt_benchmarks --benchmark_out=bm_report.csv --benchmark_out_format=csv
@@ -104,6 +105,17 @@ target_link_libraries(test_rmm
   gtest_main)
 
 
+add_executable(alignment_comparison
+  src/alignment_comparison.cpp)
+
+target_include_directories(alignment_comparison
+  PUBLIC include
+)
+
+target_link_libraries(alignment_comparison
+  benchmark
+  benchmark_main
+  )
 FetchContent_Declare(
   doxygen-awesome-css
   URL https://github.com/jothepro/doxygen-awesome-css/archive/refs/heads/main.zip
diff --git a/src/alignment_comparison.cpp b/src/alignment_comparison.cpp
new file mode 100644
index 0000000..ff1b024
--- /dev/null
+++ b/src/alignment_comparison.cpp
@@ -0,0 +1,277 @@
+#include <benchmark/benchmark.h>
+#include <immintrin.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <random>
+
+#include "bits.h"
+
+// Micro-benchmarks comparing aligned and unaligned SIMD loads and stores.
+//
+// Each benchmark takes one argument `k` and, per iteration, accesses a
+// pseudo-random 64-byte block among the first 2^k blocks of `data` (block
+// index << 6). The "unaligned" variants add 16 bytes to the block start and
+// the "crossing_64byte_border" variants add 48 bytes.
+//
+// NOTE(review): the angle-bracket header names above were unreadable in the
+// reviewed patch and are reconstructed from the names used below - confirm.
+// NOTE(review): PIXIE_AVX512_SUPPORT is presumably defined by "bits.h" or the
+// build system - confirm.
+
+// Backing buffer for all benchmarks. With the largest registered k (23) the
+// last block starts at 2^29 - 64; a 64-byte access at offset +48 from there
+// ends at 2^29 + 48, so one extra 64-byte block of padding is needed to keep
+// every access inside the array.
+alignas(64) uint8_t data[(1 << 29) + 64];
+
+#ifdef PIXIE_AVX512_SUPPORT
+
+// 64-byte loadu on 64-byte-aligned addresses.
+static void BM_Loadu512_aligned(benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = (rng() & ((1 << k) - 1)) << 6;
+    const __m512i* ptr = reinterpret_cast<const __m512i*>(data + idx);
+
+    benchmark::DoNotOptimize(_mm512_loadu_si512(ptr));
+  }
+}
+
+// 64-byte loadu at block start + 48, straddling two 64-byte blocks.
+static void BM_Loadu512_unaligned_crossing_64byte_border(
+    benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = ((rng() & ((1 << k) - 1)) << 6) + 48;
+    const __m512i* ptr = reinterpret_cast<const __m512i*>(data + idx);
+
+    benchmark::DoNotOptimize(_mm512_loadu_si512(ptr));
+  }
+}
+
+// 64-byte aligned load (load) on 64-byte-aligned addresses.
+static void BM_Load512_aligned(benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = (rng() & ((1 << k) - 1)) << 6;
+    const __m512i* ptr = reinterpret_cast<const __m512i*>(data + idx);
+
+    benchmark::DoNotOptimize(_mm512_load_si512(ptr));
+  }
+}
+
+// 64-byte storeu on 64-byte-aligned addresses.
+static void BM_Storeu512_aligned(benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = (rng() & ((1 << k) - 1)) << 6;
+    __m512i* ptr = reinterpret_cast<__m512i*>(data + idx);
+    __m512i value = _mm512_setzero_si512();
+
+    _mm512_storeu_si512(ptr, value);
+
+    benchmark::DoNotOptimize(ptr);
+  }
+}
+
+// 64-byte storeu at block start + 48, straddling two 64-byte blocks.
+static void BM_Storeu512_unaligned_crossing_64byte_border(
+    benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = ((rng() & ((1 << k) - 1)) << 6) + 48;
+    __m512i* ptr = reinterpret_cast<__m512i*>(data + idx);
+    __m512i value = _mm512_setzero_si512();
+
+    _mm512_storeu_si512(ptr, value);
+
+    benchmark::DoNotOptimize(ptr);
+  }
+}
+
+// 64-byte aligned store (store) on 64-byte-aligned addresses.
+static void BM_Store512_aligned(benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = (rng() & ((1 << k) - 1)) << 6;
+    __m512i* ptr = reinterpret_cast<__m512i*>(data + idx);
+    __m512i value = _mm512_setzero_si512();
+
+    _mm512_store_si512(ptr, value);
+
+    benchmark::DoNotOptimize(ptr);
+  }
+}
+
+// Register every benchmark for k = 1, 3, 5, ..., 23.
+BENCHMARK(BM_Loadu512_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2);
+
+BENCHMARK(BM_Loadu512_unaligned_crossing_64byte_border)
+    ->ArgNames({"k"})
+    ->DenseRange(1, 23, 2);
+
+BENCHMARK(BM_Load512_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2);
+
+BENCHMARK(BM_Storeu512_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2);
+
+BENCHMARK(BM_Storeu512_unaligned_crossing_64byte_border)
+    ->ArgNames({"k"})
+    ->DenseRange(1, 23, 2);
+
+BENCHMARK(BM_Store512_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2);
+
+#else
+
+// 32-byte loadu on 64-byte-aligned addresses.
+static void BM_Loadu256_aligned(benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = (rng() & ((1 << k) - 1)) << 6;
+    const __m256i* ptr = reinterpret_cast<const __m256i*>(data + idx);
+
+    benchmark::DoNotOptimize(_mm256_loadu_si256(ptr));
+  }
+}
+
+// 32-byte loadu at block start + 16; the access stays inside one block.
+static void BM_Loadu256_unaligned(benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = ((rng() & ((1 << k) - 1)) << 6) + 16;
+    const __m256i* ptr = reinterpret_cast<const __m256i*>(data + idx);
+
+    benchmark::DoNotOptimize(_mm256_loadu_si256(ptr));
+  }
+}
+
+// 32-byte loadu at block start + 48, straddling two 64-byte blocks.
+static void BM_Loadu256_unaligned_crossing_64byte_border(
+    benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = ((rng() & ((1 << k) - 1)) << 6) + 48;
+    const __m256i* ptr = reinterpret_cast<const __m256i*>(data + idx);
+
+    benchmark::DoNotOptimize(_mm256_loadu_si256(ptr));
+  }
+}
+
+// 32-byte aligned load (load) on 64-byte-aligned addresses.
+static void BM_Load256_aligned(benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = (rng() & ((1 << k) - 1)) << 6;
+    const __m256i* ptr = reinterpret_cast<const __m256i*>(data + idx);
+
+    benchmark::DoNotOptimize(_mm256_load_si256(ptr));
+  }
+}
+
+// 32-byte storeu on 64-byte-aligned addresses.
+static void BM_Storeu256_aligned(benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = (rng() & ((1 << k) - 1)) << 6;
+    __m256i* ptr = reinterpret_cast<__m256i*>(data + idx);
+    __m256i value = _mm256_setzero_si256();
+
+    _mm256_storeu_si256(ptr, value);
+
+    benchmark::DoNotOptimize(ptr);
+  }
+}
+
+// 32-byte storeu at block start + 16; the access stays inside one block.
+static void BM_Storeu256_unaligned(benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = ((rng() & ((1 << k) - 1)) << 6) + 16;
+    __m256i* ptr = reinterpret_cast<__m256i*>(data + idx);
+    __m256i value = _mm256_setzero_si256();
+
+    _mm256_storeu_si256(ptr, value);
+
+    benchmark::DoNotOptimize(ptr);
+  }
+}
+
+// 32-byte storeu at block start + 48, straddling two 64-byte blocks.
+static void BM_Storeu256_unaligned_crossing_64byte_border(
+    benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = ((rng() & ((1 << k) - 1)) << 6) + 48;
+    __m256i* ptr = reinterpret_cast<__m256i*>(data + idx);
+    __m256i value = _mm256_setzero_si256();
+
+    _mm256_storeu_si256(ptr, value);
+
+    benchmark::DoNotOptimize(ptr);
+  }
+}
+
+// 32-byte aligned store (store) on 64-byte-aligned addresses.
+static void BM_Store256_aligned(benchmark::State& state) {
+  size_t k = state.range(0);
+  std::mt19937_64 rng(42);
+
+  for (auto _ : state) {
+    size_t idx = (rng() & ((1 << k) - 1)) << 6;
+    __m256i* ptr = reinterpret_cast<__m256i*>(data + idx);
+    __m256i value = _mm256_setzero_si256();
+
+    _mm256_store_si256(ptr, value);
+
+    benchmark::DoNotOptimize(ptr);
+  }
+}
+
+// Register every benchmark for k = 1, 3, 5, ..., 23.
+BENCHMARK(BM_Loadu256_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2);
+
+BENCHMARK(BM_Loadu256_unaligned)->ArgNames({"k"})->DenseRange(1, 23, 2);
+
+BENCHMARK(BM_Loadu256_unaligned_crossing_64byte_border)
+    ->ArgNames({"k"})
+    ->DenseRange(1, 23, 2);
+
+BENCHMARK(BM_Load256_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2);
+
+BENCHMARK(BM_Storeu256_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2);
+
+BENCHMARK(BM_Storeu256_unaligned)->ArgNames({"k"})->DenseRange(1, 23, 2);
+
+BENCHMARK(BM_Storeu256_unaligned_crossing_64byte_border)
+    ->ArgNames({"k"})
+    ->DenseRange(1, 23, 2);
+
+BENCHMARK(BM_Store256_aligned)->ArgNames({"k"})->DenseRange(1, 23, 2);
+
+#endif