diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml new file mode 100644 index 0000000000..b443e3a210 --- /dev/null +++ b/.github/actions/setup-linux/action.yml @@ -0,0 +1,19 @@ +name: Setup Linux build environment +description: Install dependencies +runs: + using: "composite" + steps: + - shell: bash + run: | + set -euo pipefail + sudo apt-get update -y -qq + sudo apt-get install libgl-dev \ + libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ + libboost-regex-dev libboost-timer-dev libsimde-dev \ + libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ + libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ + libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ + libpng-dev libalut-dev \ + qtbase5-dev libgtest-dev \ + doxygen graphviz \ + nlohmann-json3-dev diff --git a/.github/workflows/main.yml b/.github/workflows/ci-linux.yml similarity index 74% rename from .github/workflows/main.yml rename to .github/workflows/ci-linux.yml index c577f58841..785792a2b5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/ci-linux.yml @@ -1,5 +1,4 @@ - -name: CI +name: CI Linux on: push: branches: [ dev ] @@ -8,44 +7,32 @@ on: release: types: [published] jobs: - build: + ############################## + build-linux: runs-on: ubuntu-24.04 steps: - # Note: some steps require the checkout in the root directory - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-linux - name: Build REGEN workspace shell: bash run: | - sudo apt-get update -y -qq - sudo apt-get install libgl-dev \ - libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ - libboost-regex-dev libboost-timer-dev \ - libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ - libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ - libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ - libpng-dev libalut-dev \ - qtbase5-dev libgtest-dev \ - doxygen graphviz \ - nlohmann-json3-dev + set -euo pipefail mkdir build cd build - cmake ../ -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DBUILD_VIDEO_PLAYER=ON - make 2> >(tee "make-output.txt") + cmake ../ \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_UNIT_TESTS=ON \ + -DBUILD_TESTS=ON \ + -DBUILD_VIDEO_PLAYER=ON + make -j$(nproc) 2> >(tee "make-output.txt") - name: Annotate compilation warnings/errors if: ${{github.event_name == 'pull_request'}} uses: JacobDomagala/CompileResult@master # just so that in case this step fails, the workflow doesn't stop. - # this is done as it is unclear how well the action is maintained. 
continue-on-error: true with: comment_title: Compilation compile_result_file: build/make-output.txt - - name: Create debian package - if: ${{github.event_name == 'push' || github.event_name == 'release'}} - shell: bash - run: | - cd build - cpack - name: Run unit tests if: ${{github.event_name == 'push' || github.event_name == 'pull_request'}} shell: bash @@ -59,12 +46,55 @@ jobs: junit_files: "gtest-regen.xml" action_fail: true action_fail_on_inconclusive: true - ##### + - name: Upload test results + uses: actions/upload-artifact@v4 + with: + name: test results + path: ./gtest-regen.xml + ############################## + debian-package: + needs: build-linux + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-linux + - name: Create debian package + if: ${{github.event_name == 'push' || github.event_name == 'release'}} + shell: bash + run: | + set -euo pipefail + mkdir build + cd build + cmake ../ -DCMAKE_BUILD_TYPE=Release + cpack + - name: Release debian package + if: github.event_name == 'release' + shell: bash + env: + GITHUB_TOKEN: ${{ github.TOKEN }} + run: | + gh release upload ${{github.event.release.tag_name}} ./build/regen-*.deb + - name: Upload debian package + if: github.event_name == 'push' + uses: actions/upload-artifact@v4 + with: + name: debian package + path: ./build/regen-*.deb + ############################## + doxygen: + needs: build-linux + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-linux - name: Run doxygen if: ${{github.event_name == 'push' || github.event_name == 'release'}} shell: bash run: | + set -euo pipefail + mkdir build cd build + cmake ../ -DCMAKE_BUILD_TYPE=Release cmake --build . --target doc cp -r ../img regen/doc/html/ - name: Extract version tag @@ -88,26 +118,3 @@ jobs: BRANCH: gh-pages # The folder the action should deploy. 
FOLDER: build/regen/doc/html - # The folder in the target branch - TARGET_FOLDER: ${{ env.REGEN_DOCU_VERSION }} - CLEAN: true - SINGLE_COMMIT: true - ##### - - name: Release debian package - if: github.event_name == 'release' - shell: bash - env: - GITHUB_TOKEN: ${{ github.TOKEN }} - run: | - gh release upload ${{github.event.release.tag_name}} ./build/regen-*.deb - - name: Upload debian package - if: github.event_name == 'push' - uses: actions/upload-artifact@v4 - with: - name: debian package - path: ./build/regen-*.deb - - name: Upload test results - uses: actions/upload-artifact@v4 - with: - name: test results - path: ./gtest-regen.xml diff --git a/.github/workflows/ci-macos.yml b/.github/workflows/ci-macos.yml new file mode 100644 index 0000000000..409bc2394c --- /dev/null +++ b/.github/workflows/ci-macos.yml @@ -0,0 +1,29 @@ +name: CI MacOS +on: + push: + branches: [ dev ] + pull_request: + branches: [ dev ] + release: + types: [published] + +jobs: + build-macos: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - name: Install dependencies + run: | + brew install cmake gcc simde boost assimp openal-soft freetype bullet glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest + - name: Build REGEN workspace + run: | + mkdir build + cd build + cmake \ + -DCMAKE_C_COMPILER=/opt/homebrew/bin/gcc-14 -DCMAKE_CXX_COMPILER=/opt/homebrew/bin/g++-14 \ + -DCMAKE_PREFIX_PATH="/opt/homebrew/opt/qt@5" \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_TESTS=ON \ + -DREGEN_EXTRA_INCLUDE_DIRS="/opt/homebrew/opt/openal-soft/include/" \ + ../ + make diff --git a/CMakeLists.txt b/CMakeLists.txt index 9284168bcf..281b34302c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,8 +75,10 @@ endif() # given in the books "Effective C++" and "More Effective C++" # add_definitions( -Weffc++ ) -if(UNIX) # gcc options - add_definitions( -mfpmath=sse -march=native ) +if(UNIX) + # -march=native enables all instruction subsets supported by the local machine + # e.g. SSE2, SSE3, SSE4, AVX, AVX2, etc. + add_definitions( -march=native ) endif() # perform more aggressive floating-point optimizations @@ -107,6 +109,10 @@ find_package(Boost ${Boost_MIN_VERSION} find_package(Threads REQUIRED) # Font library: text rendering support find_package(Freetype REQUIRED) +# Note: On Linux, it seems needed in addition to link against brotlidec +if (UNIX AND NOT APPLE) + set(FREETYPE_LIBRARIES ${FREETYPE_LIBRARIES} -lbrotlidec) +endif() # JSON library: for serialization find_package(nlohmann_json REQUIRED) set(JSON_LIBRARIES nlohmann_json::nlohmann_json) @@ -184,8 +190,10 @@ message(STATUS " Models:${ASSIMP_INCLUDE_DIRS};") message(STATUS " Fonts:${FREETYPE_INCLUDE_DIRS};") message(STATUS " Physics:${BULLET_INCLUDE_DIRS};") -enable_testing() -find_package(GTest REQUIRED) +if (BUILD_UNIT_TESTS) + enable_testing() + find_package(GTest REQUIRED) +endif () if (HAS_AV_LIBS) # allow includes without AL/ prefix. Some openAL versions require this. @@ -199,7 +207,7 @@ set(REGEN_LIBRARIES ${Boost_LIBRARIES} ${JSON_LIBRARIES} ${ASSIMP_LIBRARIES} - ${FREETYPE_LIBRARIES} -lbrotlidec + ${FREETYPE_LIBRARIES} ${IMG_LIBRARIES} ${AV_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} @@ -239,6 +247,10 @@ endif() ########### ########### +if(REGEN_EXTRA_INCLUDE_DIRS) + include_directories(${REGEN_EXTRA_INCLUDE_DIRS}) +endif() + # allow includes like even if the engine is not installed include_directories(.) 
# allow include of generated files @@ -260,19 +272,21 @@ install(FILES img/icon-small.png DESTINATION ${SHARE_INSTALL_PATH}/img) ########## Unit Testings ############## -# add an executable target for GTest. -# but the testing code is partly in the *knowrob* library -# where gtest won't find them without using the "--no-as-needed" -# flag for the linker. -add_executable(all_gtests - tests/gtests.cpp - tests/shapes/quad-tree-test.cpp) -target_link_libraries(all_gtests - -Wl,--whole-archive,--no-as-needed - regen - -Wl,--no-whole-archive - ${Boost_Python_COMPONENT} - ${GTEST_MAIN_LIBRARIES}) +if (BUILD_UNIT_TESTS) + # add an executable target for GTest. + # but the testing code is partly in the *knowrob* library + # where gtest won't find them without using the "--no-as-needed" + # flag for the linker. + add_executable(all_gtests + tests/gtests.cpp + tests/shapes/quad-tree-test.cpp) + target_link_libraries(all_gtests + -Wl,--whole-archive,--no-as-needed + regen + -Wl,--no-whole-archive + ${Boost_Python_COMPONENT} + ${GTEST_MAIN_LIBRARIES}) +endif () ############## ########## packaging diff --git a/README.md b/README.md index 0880beb1db..6ba1d8645b 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,13 @@

-![CI](https://github.com/daniel86/regen/workflows/CI/badge.svg) +![Linux](https://github.com/daniel86/regen/actions/workflows/ci-linux.yml/badge.svg) +![MacOS](https://github.com/daniel86/regen/actions/workflows/ci-macos.yml/badge.svg) +![Warnings](https://img.shields.io/badge/compiler%20warnings-clean-brightgreen) +[![Docs](https://img.shields.io/badge/docs-online-blue)](https://daniel86.github.io/regen/) +![Debian](https://img.shields.io/badge/debian-.deb%20package-blue) +![GitHub release](https://img.shields.io/github/v/release/daniel86/regen?include_prereleases) +![License](https://img.shields.io/github/license/daniel86/regen) `regen` -- **Real-time Graphics Engine** -- is a modular OpenGL-based C++ engine designed for research and experimentation in real-time rendering, GPU compute, and virtual world simulation. diff --git a/regen/compute/radix-sort-cpu.h b/regen/compute/radix-sort-cpu.h index 3374b8ebe4..6493311f7b 100644 --- a/regen/compute/radix-sort-cpu.h +++ b/regen/compute/radix-sort-cpu.h @@ -139,23 +139,23 @@ namespace regen { if constexpr (KEY_TYPE_BITS == 16) { // Gather 8 keys manually, and promote to 32-bit for (int k = 0; k < 8; ++k) tmpKeys32[k] = static_cast(keys[src[keyIdx+k]]); - r0 = _mm256_load_si256(reinterpret_cast(tmpKeys32)); - r0 = _mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask); + r0 = simde_mm256_load_si256(reinterpret_cast(tmpKeys32)); + r0 = simde_mm256_and_si256(simde_mm256_srli_epi32(r0, SHIFT), mask); keyIdx += 8; // processed 8 keys, not 16! } else if constexpr (KEY_TYPE_BITS == 32) { - simd::Register_i idx = _mm256_loadu_si256(reinterpret_cast(&src[keyIdx])); + simd::Register_i idx = simde_mm256_loadu_si256(reinterpret_cast(&src[keyIdx])); // Gather 8 scattered keys, and apply shift and mask to get bucket ids - r0 = _mm256_i32gather_epi32(reinterpret_cast(keys), idx, 4); - r0 = _mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask); + r0 = simde_mm256_i32gather_epi32(reinterpret_cast(keys), idx, 4); + r0 = simde_mm256_and_si256(simde_mm256_srli_epi32(r0, SHIFT), mask); keyIdx += KEYS_PER_SIMD_PASS; } else if constexpr (KEY_TYPE_BITS == 64) { // note: values have 32 bits, use __m128i to load only 4 - __m128i idx32 = _mm_loadu_si128(reinterpret_cast(&src[keyIdx])); + simde__m128i idx32 = simde_mm_loadu_si128(reinterpret_cast(&src[keyIdx])); // Gather 4 scattered keys, and apply shift and mask to get bucket ids - r0 = _mm256_i32gather_epi64(reinterpret_cast(keys), idx32, 8); - r0 = _mm256_and_si256(_mm256_srli_epi64(r0, SHIFT), mask); + r0 = simde_mm256_i32gather_epi64(reinterpret_cast(keys), idx32, 8); + r0 = simde_mm256_and_si256(simde_mm256_srli_epi64(r0, SHIFT), mask); keyIdx += KEYS_PER_SIMD_PASS; } else { @@ -163,7 +163,7 @@ namespace regen { break; } // Store results into tmpBins_ and increment histogram - _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmpBins_), r0); + simde_mm256_storeu_si256(reinterpret_cast(tmpBins_), r0); for (auto x : tmpBins_) ++histogram_[x]; } } diff --git a/regen/compute/simd.h b/regen/compute/simd.h index b47af9a8e8..ed06be8761 100644 --- a/regen/compute/simd.h +++ b/regen/compute/simd.h @@ -4,21 +4,31 @@ #include #include -// NOTE: Check for REGEN_HAS_SIMD, if it is not defined, the SIMD operations will be disabled -// and the code here will fall back to scalar operations. 
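The radix-sort change above replaces raw `_mm256_*` intrinsics with their `simde_*` counterparts, so the shift-and-mask bucket extraction in the histogram pass also compiles on non-x86 hosts. Below is a minimal standalone sketch of that one step, assuming SIMDe is installed; the key values, `SHIFT`, and the 8-bit `RADIX_MASK` are illustrative constants, not the engine's actual parameters.

```cpp
#include <cstdint>
#include <cstdio>
#include <simde/x86/avx2.h>

int main() {
    constexpr int SHIFT = 8;              // digit offset for this pass (assumed)
    constexpr int32_t RADIX_MASK = 0xFF;  // 8-bit digits (assumed)
    alignas(32) uint32_t keys[8] = {0x0102u, 0xA0B0u, 0x00FFu, 0x1234u,
                                    0xDEADu, 0xBEEFu, 0x0001u, 0xFF00u};
    alignas(32) uint32_t buckets[8];

    // Load 8 keys, shift out the lower digits, mask to the current digit.
    simde__m256i k    = simde_mm256_load_si256(reinterpret_cast<const simde__m256i*>(keys));
    simde__m256i mask = simde_mm256_set1_epi32(RADIX_MASK);
    simde__m256i b    = simde_mm256_and_si256(simde_mm256_srli_epi32(k, SHIFT), mask);
    simde_mm256_store_si256(reinterpret_cast<simde__m256i*>(buckets), b);

    // In radix-sort-cpu.h these bucket ids would feed ++histogram_[x].
    for (uint32_t x : buckets) std::printf("%u\n", static_cast<unsigned>(x));
    return 0;
}
```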
-// NOLINTBEGIN(portability-simd-intrinsics) -#if defined(__AVX__) - #include // AVX - #define REGEN_SIMD_MODE AVX - #define REGEN_SIMD_WIDTH 8 - #define REGEN_HAS_SIMD -#elif defined(__SSE__) - #include // SSE - #define REGEN_SIMD_MODE SSE - #define REGEN_SIMD_WIDTH 4 - #define REGEN_HAS_SIMD +#define SIMDE_ENABLE_NATIVE_ALIASES +#include +#include +#include +#include +#include +#include + +#define REGEN_SIMD_NONE 0 +#define REGEN_SIMD_SSE 1 +#define REGEN_SIMD_AVX 2 + +#if defined(SIMDE_NATURAL_VECTOR_SIZE) + #if SIMDE_NATURAL_VECTOR_SIZE >= 32 + #define REGEN_SIMD_MODE REGEN_SIMD_AVX + #define REGEN_SIMD_WIDTH 8 + #elif SIMDE_NATURAL_VECTOR_SIZE >= 16 + #define REGEN_SIMD_MODE REGEN_SIMD_SSE + #define REGEN_SIMD_WIDTH 4 + #else + #define REGEN_SIMD_MODE REGEN_SIMD_NONE + #define REGEN_SIMD_WIDTH 1 + #endif #else - #define REGEN_SIMD_MODE NONE + #define REGEN_SIMD_MODE REGEN_SIMD_NONE #define REGEN_SIMD_WIDTH 1 #endif @@ -32,236 +42,236 @@ namespace regen::simd { return bitIndex; } -#if REGEN_SIMD_MODE == AVX +#if REGEN_SIMD_MODE == REGEN_SIMD_AVX static constexpr int8_t RegisterMask = 0xFF; // 8 bits for AVX - using Register = __m256; // 8 floats - using Register_i = __m256i; // 8 integers + using Register = simde__m256; // 8 floats + using Register_i = simde__m256i; // 8 integers - inline __m256 set1_ps(float v) { return _mm256_set1_ps(v); } - inline __m256i set1_epi32(int32_t v) { return _mm256_set1_epi32(v); } - inline __m256i set1_epi16(uint16_t v) { return _mm256_set1_epi16(v); } - inline __m256i set1_epi64(int64_t v) { return _mm256_set1_epi64x(v); } - inline __m256i set1_epi64u(uint64_t v) { return _mm256_set1_epi64x(v); } + inline Register set1_ps(float v) { return simde_mm256_set1_ps(v); } + inline Register_i set1_epi32(int32_t v) { return simde_mm256_set1_epi32(v); } + inline Register_i set1_epi16(uint16_t v) { return simde_mm256_set1_epi16(v); } + inline Register_i set1_epi64(int64_t v) { return simde_mm256_set1_epi64x(v); } + inline Register_i set1_epi64u(uint64_t v) { return simde_mm256_set1_epi64x(v); } - inline __m256 setzero_ps() { return _mm256_setzero_ps(); } - inline __m256i setzero_si256() { return _mm256_setzero_si256(); } + inline Register setzero_ps() { return simde_mm256_setzero_ps(); } + inline Register_i setzero_si256() { return simde_mm256_setzero_si256(); } - inline __m256 load_ps(const float *p) { return _mm256_load_ps(p); } - inline __m256 loadu_ps(const float *p) { return _mm256_loadu_ps(p); } + inline Register load_ps(const float *p) { return simde_mm256_load_ps(p); } + inline Register loadu_ps(const float *p) { return simde_mm256_loadu_ps(p); } - inline __m256i load_si256(const uint16_t *p) { - return _mm256_load_si256(reinterpret_cast(p)); + inline Register_i load_si256(const uint16_t *p) { + return simde_mm256_load_si256(reinterpret_cast(p)); } - inline __m256i load_si256(const uint32_t *p) { - return _mm256_load_si256(reinterpret_cast(p)); + inline Register_i load_si256(const uint32_t *p) { + return simde_mm256_load_si256(reinterpret_cast(p)); } - inline __m256i load_si256(const uint64_t *p) { - return _mm256_load_si256(reinterpret_cast(p)); + inline Register_i load_si256(const uint64_t *p) { + return simde_mm256_load_si256(reinterpret_cast(p)); } - inline __m256i load_si256(const int32_t *p) { - return _mm256_load_si256(reinterpret_cast(p)); + inline Register_i load_si256(const int32_t *p) { + return simde_mm256_load_si256(reinterpret_cast(p)); } - inline __m256i loadu_si256(const uint16_t *p) { - return _mm256_loadu_si256(reinterpret_cast(p)); + 
inline Register_i loadu_si256(const uint16_t *p) { + return simde_mm256_loadu_si256(reinterpret_cast(p)); } - inline __m256i loadu_si256(const uint32_t *p) { - return _mm256_loadu_si256(reinterpret_cast(p)); + inline Register_i loadu_si256(const uint32_t *p) { + return simde_mm256_loadu_si256(reinterpret_cast(p)); } - inline __m256i loadu_si256(const uint64_t *p) { - return _mm256_loadu_si256(reinterpret_cast(p)); + inline Register_i loadu_si256(const uint64_t *p) { + return simde_mm256_loadu_si256(reinterpret_cast(p)); } - inline __m256i loadu_si256(const int32_t *p) { - return _mm256_loadu_si256(reinterpret_cast(p)); + inline Register_i loadu_si256(const int32_t *p) { + return simde_mm256_loadu_si256(reinterpret_cast(p)); } - inline __m256 epi_to_ps(const __m256i &v) { return _mm256_castsi256_ps(v); } + inline Register epi_to_ps(const Register_i &v) { return simde_mm256_castsi256_ps(v); } - inline __m256 i32gather_ps(const float *p, const __m256i &indices) { - return _mm256_i32gather_ps(p, indices, sizeof(float)); + inline Register i32gather_ps(const float *p, const Register_i &indices) { + return simde_mm256_i32gather_ps(p, indices, sizeof(float)); } - inline void storeu_ps(float *p, const __m256 &v) { _mm256_storeu_ps(p, v); } - inline void store_ps(float *p, const __m256 &v) { _mm256_store_ps(p, v); } + inline void storeu_ps(float *p, const Register &v) { simde_mm256_storeu_ps(p, v); } + inline void store_ps(float *p, const Register &v) { simde_mm256_store_ps(p, v); } - inline void storeu_epi32(int32_t *p, const __m256i &v) { - _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v); + inline void storeu_epi32(int32_t *p, const Register_i &v) { + simde_mm256_storeu_si256(reinterpret_cast(p), v); } - inline void storeu_epi32(uint32_t *p, const __m256i &v) { - _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v); + inline void storeu_epi32(uint32_t *p, const Register_i &v) { + simde_mm256_storeu_si256(reinterpret_cast(p), v); } - inline void store_epi32(int32_t *p, const __m256i &v) { - _mm256_store_si256(reinterpret_cast<__m256i*>(p), v); + inline void store_epi32(int32_t *p, const Register_i &v) { + simde_mm256_store_si256(reinterpret_cast(p), v); } - inline void store_epi32(uint32_t *p, const __m256i &v) { - _mm256_store_si256(reinterpret_cast<__m256i*>(p), v); + inline void store_epi32(uint32_t *p, const Register_i &v) { + simde_mm256_store_si256(reinterpret_cast(p), v); } - inline __m256 add_ps(const __m256 &a, const __m256 &b) { return _mm256_add_ps(a, b); } - inline __m256 sub_ps(const __m256 &a, const __m256 &b) { return _mm256_sub_ps(a, b); } - inline __m256 mul_ps(const __m256 &a, const __m256 &b) { return _mm256_mul_ps(a, b); } - inline __m256 div_ps(const __m256 &a, const __m256 &b) { return _mm256_div_ps(a, b); } + inline Register add_ps(const Register &a, const Register &b) { return simde_mm256_add_ps(a, b); } + inline Register sub_ps(const Register &a, const Register &b) { return simde_mm256_sub_ps(a, b); } + inline Register mul_ps(const Register &a, const Register &b) { return simde_mm256_mul_ps(a, b); } + inline Register div_ps(const Register &a, const Register &b) { return simde_mm256_div_ps(a, b); } /** * Fused multiply-add: (a * b) + c */ - inline __m256 mul_add_ps(const __m256 &a, const __m256 &b, const __m256 &c) { - return _mm256_fmadd_ps(a, b, c); + inline Register mul_add_ps(const Register &a, const Register &b, const Register &c) { + return simde_mm256_fmadd_ps(a, b, c); } - inline __m256i add_epi32(const __m256i &a, const __m256i &b) { return 
_mm256_add_epi32(a, b); } - inline __m256i sub_epi32(const __m256i &a, const __m256i &b) { return _mm256_sub_epi32(a, b); } - inline __m256i mul_epi32(const __m256i &a, const __m256i &b) { return _mm256_mullo_epi32(a, b); } + inline Register_i add_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_add_epi32(a, b); } + inline Register_i sub_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_sub_epi32(a, b); } + inline Register_i mul_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_mullo_epi32(a, b); } - inline __m256 min_ps(const __m256 &a, const __m256 &b) { return _mm256_min_ps(a, b); } - inline __m256 max_ps(const __m256 &a, const __m256 &b) { return _mm256_max_ps(a, b); } - inline __m256 sqrt_ps(const __m256 &a) { return _mm256_sqrt_ps(a); } + inline Register min_ps(const Register &a, const Register &b) { return simde_mm256_min_ps(a, b); } + inline Register max_ps(const Register &a, const Register &b) { return simde_mm256_max_ps(a, b); } + inline Register sqrt_ps(const Register &a) { return simde_mm256_sqrt_ps(a); } - inline __m256i min_epi32(const __m256i &a, const __m256i &b) { return _mm256_min_epi32(a, b); } - inline __m256i max_epi32(const __m256i &a, const __m256i &b) { return _mm256_max_epi32(a, b); } + inline Register_i min_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_min_epi32(a, b); } + inline Register_i max_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_max_epi32(a, b); } /** - * Horizontal sum of all elements in an __m256 + * Horizontal sum of all elements in an Register */ - inline float hsum_ps(__m256 v) { - __m128 vlow = _mm256_castps256_ps128(v); // low 128 - __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 - __m128 sum = _mm_add_ps(vlow, vhigh); // add low and high parts - __m128 shuf = _mm_movehdup_ps(sum); // (sum.y, sum.y, sum.w, sum.w) - __m128 sums = _mm_add_ps(sum, shuf); - shuf = _mm_movehl_ps(shuf, sums); // high half of sums - sums = _mm_add_ss(sums, shuf); - return _mm_cvtss_f32(sums); + inline float hsum_ps(Register v) { + simde__m128 vlow = simde_mm256_castps256_ps128(v); // low 128 + simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128 + simde__m128 sum = simde_mm_add_ps(vlow, vhigh); // add low and high parts + simde__m128 shuf = simde_mm_movehdup_ps(sum); // (sum.y, sum.y, sum.w, sum.w) + simde__m128 sums = simde_mm_add_ps(sum, shuf); + shuf = simde_mm_movehl_ps(shuf, sums); // high half of sums + sums = simde_mm_add_ss(sums, shuf); + return simde_mm_cvtss_f32(sums); } /** - * Horizontal min of all elements in an __m256 + * Horizontal min of all elements in an Register */ - inline float hmin_ps(__m256 v) { - __m128 vlow = _mm256_castps256_ps128(v); // low 128 - __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 - __m128 min = _mm_min_ps(vlow, vhigh); // min low and high parts - __m128 shuf = _mm_movehdup_ps(min); // (min.y, min.y, min.w, min.w) - __m128 mins = _mm_min_ps(min, shuf); - shuf = _mm_movehl_ps(shuf, mins); // high half of mins - mins = _mm_min_ss(mins, shuf); - return _mm_cvtss_f32(mins); + inline float hmin_ps(Register v) { + simde__m128 vlow = simde_mm256_castps256_ps128(v); // low 128 + simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128 + simde__m128 min = simde_mm_min_ps(vlow, vhigh); // min low and high parts + simde__m128 shuf = simde_mm_movehdup_ps(min); // (min.y, min.y, min.w, min.w) + simde__m128 mins = simde_mm_min_ps(min, shuf); + shuf = simde_mm_movehl_ps(shuf, mins); // high half of mins + 
mins = simde_mm_min_ss(mins, shuf); + return simde_mm_cvtss_f32(mins); } /** - * Horizontal max of all elements in an __m256 + * Horizontal max of all elements in an Register */ - inline float hmax_ps(__m256 v) { - __m128 vlow = _mm256_castps256_ps128(v); // low 128 - __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 - __m128 max = _mm_max_ps(vlow, vhigh); // max low and high parts - __m128 shuf = _mm_movehdup_ps(max); // (max.y, max.y, max.w, max.w) - __m128 maxs = _mm_max_ps(max, shuf); - shuf = _mm_movehl_ps(shuf, maxs); // high half of maxs - maxs = _mm_max_ss(maxs, shuf); - return _mm_cvtss_f32(maxs); + inline float hmax_ps(Register v) { + simde__m128 vlow = simde_mm256_castps256_ps128(v); // low 128 + simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128 + simde__m128 max = simde_mm_max_ps(vlow, vhigh); // max low and high parts + simde__m128 shuf = simde_mm_movehdup_ps(max); // (max.y, max.y, max.w, max.w) + simde__m128 maxs = simde_mm_max_ps(max, shuf); + shuf = simde_mm_movehl_ps(shuf, maxs); // high half of maxs + maxs = simde_mm_max_ss(maxs, shuf); + return simde_mm_cvtss_f32(maxs); } - inline __m256 rcp_ps(const __m256 &a) { return _mm256_rcp_ps(a); } + inline Register rcp_ps(const Register &a) { return simde_mm256_rcp_ps(a); } - inline __m256 cmp_lt(const __m256 &a, const __m256 &b) { - return _mm256_cmp_ps(a, b, _CMP_LT_OQ); + inline Register cmp_lt(const Register &a, const Register &b) { + return simde_mm256_cmp_ps(a, b, _CMP_LT_OQ); } - inline __m256 cmp_gt(const __m256 &a, const __m256 &b) { - return _mm256_cmp_ps(a, b, _CMP_GT_OQ); + inline Register cmp_gt(const Register &a, const Register &b) { + return simde_mm256_cmp_ps(a, b, _CMP_GT_OQ); } - inline __m256 cmp_eq(const __m256 &a, const __m256 &b) { - return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); + inline Register cmp_eq(const Register &a, const Register &b) { + return simde_mm256_cmp_ps(a, b, _CMP_EQ_OQ); } - inline __m256 cmp_neq(const __m256 &a, const __m256 &b) { - return _mm256_cmp_ps(a, b, _CMP_NEQ_OQ); + inline Register cmp_neq(const Register &a, const Register &b) { + return simde_mm256_cmp_ps(a, b, _CMP_NEQ_OQ); } - inline __m256 cmp_or(const __m256 &a, const __m256 &b) { - return _mm256_or_ps(a, b); + inline Register cmp_or(const Register &a, const Register &b) { + return simde_mm256_or_ps(a, b); } - inline __m256 cmp_and(const __m256 &a, const __m256 &b) { - return _mm256_and_ps(a, b); + inline Register cmp_and(const Register &a, const Register &b) { + return simde_mm256_and_ps(a, b); } - inline __m256 cmp_and_not(const __m256 &a, const __m256 &b) { - return _mm256_andnot_ps(a, b); + inline Register cmp_and_not(const Register &a, const Register &b) { + return simde_mm256_andnot_ps(a, b); } - inline __m256i cvttps_epi32(const __m256 &a) { return _mm256_cvttps_epi32(a); } + inline Register_i cvttps_epi32(const Register &a) { return simde_mm256_cvttps_epi32(a); } - inline int movemask_ps(const __m256 &v) { return _mm256_movemask_ps(v); } + inline int movemask_ps(const Register &v) { return simde_mm256_movemask_ps(v); } - inline __m256 blendv_ps(const __m256 &a, const __m256 &b, const __m256 &mask) { - return _mm256_blendv_ps(a, b, mask); + inline Register blendv_ps(const Register &a, const Register &b, const Register &mask) { + return simde_mm256_blendv_ps(a, b, mask); } -#elif REGEN_SIMD_MODE == SSE +#elif REGEN_SIMD_MODE == REGEN_SIMD_SSE static constexpr int8_t RegisterMask = 0x0F; // 4 bits for SSE - using Register = __m128; // 4 floats - using Register_i = __m128i; // 4 integers + using Register 
= simde__m128; // 4 floats + using Register_i = simde__m128i; // 4 integers - inline __m128 set1_ps(float v) { return _mm_set1_ps(v); } - inline __m128i set1_epi32(int32_t v) { return _mm_set1_epi32(v); } + inline Register set1_ps(float v) { return simde_mm_set1_ps(v); } + inline Register_i set1_epi32(int32_t v) { return simde_mm_set1_epi32(v); } - inline __m128 setzero_ps() { return _mm_setzero_ps(); } - inline __m128i setzero_si256() { return _mm_setzero_si128(); } + inline Register setzero_ps() { return simde_mm_setzero_ps(); } + inline Register_i setzero_si256() { return simde_mm_setzero_si128(); } - inline __m128 load_ps(const float *p) { return _mm_load_ps(p); } - inline __m128 loadu_ps(const float *p) { return _mm_loadu_ps(p); } + inline Register load_ps(const float *p) { return simde_mm_load_ps(p); } + inline Register loadu_ps(const float *p) { return simde_mm_loadu_ps(p); } - inline __m128i loadu_si256(const uint32_t *p) { - return _mm_loadu_si128(reinterpret_cast(indices)); + inline Register_i loadu_si256(const uint32_t *p) { + return simde_mm_loadu_si128(reinterpret_cast(p)); } - inline __m128 epi_to_ps(const __m128i &v) { return _mm_castsi128_ps(v); } + inline Register epi_to_ps(const Register_i &v) { return simde_mm_castsi128_ps(v); } - inline __m128 i32gather_ps(const float *p, const __m128i &indices) { - return _mm_i32gather_ps(p, indices, sizeof(float)); + inline Register i32gather_ps(const float *p, const Register_i &indices) { + return simde_mm_i32gather_ps(p, indices, sizeof(float)); } - inline void storeu_ps(float *p, const __m128 &v) { _mm_storeu_ps(p, v); } + inline void storeu_ps(float *p, const Register &v) { simde_mm_storeu_ps(p, v); } - inline __m128 add_ps(const __m128 &a, const __m128 &b) { return _mm_add_ps(a, b); } - inline __m128 sub_ps(const __m128 &a, const __m128 &b) { return _mm_sub_ps(a, b); } - inline __m128 mul_ps(const __m128 &a, const __m128 &b) { return _mm_mul_ps(a, b); } - inline __m128 div_ps(const __m128 &a, const __m128 &b) { return _mm_div_ps(a, b); } + inline Register add_ps(const Register &a, const Register &b) { return simde_mm_add_ps(a, b); } + inline Register sub_ps(const Register &a, const Register &b) { return simde_mm_sub_ps(a, b); } + inline Register mul_ps(const Register &a, const Register &b) { return simde_mm_mul_ps(a, b); } + inline Register div_ps(const Register &a, const Register &b) { return simde_mm_div_ps(a, b); } - inline __m128i add_epi32(const __m128i &a, const __m128i &b) { return _mm_add_epi32(a, b); } - inline __m128i sub_epi32(const __m128i &a, const __m128i &b) { return _mm_sub_epi32(a, b); } - inline __m128i mul_epi32(const __m128i &a, const __m128i &b) { return _mm_mullo_epi32(a, b); } + inline Register_i add_epi32(const Register_i &a, const Register_i &b) { return simde_mm_add_epi32(a, b); } + inline Register_i sub_epi32(const Register_i &a, const Register_i &b) { return simde_mm_sub_epi32(a, b); } + inline Register_i mul_epi32(const Register_i &a, const Register_i &b) { return simde_mm_mullo_epi32(a, b); } - inline __m128 min_ps(const __m128 &a, const __m128 &b) { return _mm_min_ps(a, b); } - inline __m128 max_ps(const __m128 &a, const __m128 &b) { return _mm_max_ps(a, b); } - inline __m128 sqrt_ps(const __m128 &a) { return _mm_sqrt_ps(a); } + inline Register min_ps(const Register &a, const Register &b) { return simde_mm_min_ps(a, b); } + inline Register max_ps(const Register &a, const Register &b) { return simde_mm_max_ps(a, b); } + inline Register sqrt_ps(const Register &a) { return simde_mm_sqrt_ps(a); } - 
inline __m128i min_epi32(const __m128i &a, const __m128i &b) { return _mm_min_epi32(a, b); } - inline __m128i max_epi32(const __m128i &a, const __m128i &b) { return _mm_max_epi32(a, b); } + inline Register_i min_epi32(const Register_i &a, const Register_i &b) { return simde_mm_min_epi32(a, b); } + inline Register_i max_epi32(const Register_i &a, const Register_i &b) { return simde_mm_max_epi32(a, b); } - inline float hsum_ps(__m128 v) { - __m128 shuf = _mm_movehdup_ps(v); // (v1, v1, v3, v3) - __m128 sums = _mm_add_ps(v, shuf); - shuf = _mm_movehl_ps(shuf, sums); // (v2 + v3, v3, -, -) - sums = _mm_add_ss(sums, shuf); - return _mm_cvtss_f32(sums); + inline float hsum_ps(Register v) { + Register shuf = simde_mm_movehdup_ps(v); // (v1, v1, v3, v3) + Register sums = simde_mm_add_ps(v, shuf); + shuf = simde_mm_movehl_ps(shuf, sums); // (v2 + v3, v3, -, -) + sums = simde_mm_add_ss(sums, shuf); + return simde_mm_cvtss_f32(sums); } - inline __m128 rcp_ps(const __m128 &a) { return _mm_rcp_ps(a); } + inline Register rcp_ps(const Register &a) { return simde_mm_rcp_ps(a); } - inline __m128 cmp_lt(const __m128 &a, const __m128 &b) { return _mm_cmplt_ps(a, b); } - inline __m128 cmp_gt(const __m128 &a, const __m128 &b) { return _mm_cmplt_ps(b, a); } - inline __m128 cmp_eq(const __m128 &a, const __m128 &b) { return _mm_cmpeq_ps(a, b); } - inline __m128 cmp_neq(const __m128 &a, const __m128 &b) { - __m128 eq = _mm_cmpeq_ps(a, b); - return _mm_andnot_ps(eq, _mm_castsi128_ps(_mm_set1_epi32(-1))); // ~eq & all_ones + inline Register cmp_lt(const Register &a, const Register &b) { return simde_mm_cmplt_ps(a, b); } + inline Register cmp_gt(const Register &a, const Register &b) { return simde_mm_cmplt_ps(b, a); } + inline Register cmp_eq(const Register &a, const Register &b) { return simde_mm_cmpeq_ps(a, b); } + inline Register cmp_neq(const Register &a, const Register &b) { + Register eq = simde_mm_cmpeq_ps(a, b); + return simde_mm_andnot_ps(eq, simde_mm_castsi128_ps(simde_mm_set1_epi32(-1))); // ~eq & all_ones } - inline __m128 cmp_or(const __m128 &a, const __m128 &b) { return _mm_or_ps(a, b); } - inline __m128 cmp_and(const __m128 &a, const __m128 &b) { return _mm_and_ps(a, b); } + inline Register cmp_or(const Register &a, const Register &b) { return simde_mm_or_ps(a, b); } + inline Register cmp_and(const Register &a, const Register &b) { return simde_mm_and_ps(a, b); } - inline __m128i cvttps_epi32(const __m128 &a) { return _mm_cvttps_epi32(a); } + inline Register_i cvttps_epi32(const Register &a) { return simde_mm_cvttps_epi32(a); } - inline int movemask_ps(const __m128 &v) { return _mm_movemask_ps(v); } + inline int movemask_ps(const Register &v) { return simde_mm_movemask_ps(v); } - inline __m128 blendv_ps(const __m128 &a, const __m128 &b, const __m128 &mask) { - return _mm_blendv_ps(a, b, mask); + inline Register blendv_ps(const Register &a, const Register &b, const Register &mask) { + return simde_mm_blendv_ps(a, b, mask); } #else // Fallback to scalar operations @@ -752,7 +762,7 @@ namespace regen { } static BatchOf_int32 castFloatBatch(const BatchOf_float &v) { - return BatchOf_int32{_mm256_castps_si256(v.c)}; + return BatchOf_int32{simde_mm256_castps_si256(v.c)}; } /** @@ -832,7 +842,7 @@ namespace regen { } BatchOf_int32 operator&(const BatchOf_int32 &other) const { - return BatchOf_int32{_mm256_and_si256(c, other.c)}; + return BatchOf_int32{simde_mm256_and_si256(c, other.c)}; } static BatchOf_int32 allZeros() { @@ -1199,7 +1209,7 @@ namespace regen { /** * Computes the length squared of each vector 
in the batch. - * @return __m128 containing the length squared for each vector. + * @return Register containing the length squared for each vector. */ BatchOf_float lengthSquared() const { return x*x + y*y + z*z; @@ -1272,6 +1282,4 @@ namespace regen { template using vectorSIMD = std::vector>; } -// NOLINTEND(portability-simd-intrinsics) - #endif /* REGEN_SIMD_H_ */ diff --git a/regen/compute/threading.h b/regen/compute/threading.h index 250905e299..9a2d5c2082 100644 --- a/regen/compute/threading.h +++ b/regen/compute/threading.h @@ -9,8 +9,8 @@ #include "regen/memory/aligned-allocator.h" #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) - #include - #define CPU_PAUSE() _mm_pause() + #include + #define CPU_PAUSE() simde_mm_pause() #elif defined(__aarch64__) || defined(__arm__) #define CPU_PAUSE() asm volatile("yield" ::: "memory") #else @@ -373,7 +373,7 @@ namespace regen { #if 0 int spins = 0; while (numJobsRemaining_.load(std::memory_order_acquire) > 0u) { - if (++spins < 1000) _mm_pause(); + if (++spins < 1000) simde_mm_pause(); else std::this_thread::yield(); } #else diff --git a/regen/shader/includer.h b/regen/shader/includer.h index fc0ea12a88..5be3ec9a50 100644 --- a/regen/shader/includer.h +++ b/regen/shader/includer.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace regen { /**
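Taken together, the `simd.h` rewrite means engine code no longer names `__m256`/`__m128` directly; it programs against `Register`/`Register_i` and the wrapper functions above, and the width is picked at compile time from `SIMDE_NATURAL_VECTOR_SIZE`. A width-agnostic sketch of that usage pattern follows; `clampArray` is a hypothetical helper, not engine code, and it assumes the scalar `#else` branch (whose body is not shown in this diff) exposes the same function names.

```cpp
#include <cstddef>
#include "regen/compute/simd.h"

// Clamp every element of `data` to [lo, hi]; the same loop body compiles
// as 8-wide AVX, 4-wide SSE, or scalar, depending on REGEN_SIMD_WIDTH.
void clampArray(float *data, std::size_t n, float lo, float hi) {
    using namespace regen::simd;
    const Register vlo = set1_ps(lo);
    const Register vhi = set1_ps(hi);
    std::size_t i = 0;
    for (; i + REGEN_SIMD_WIDTH <= n; i += REGEN_SIMD_WIDTH) {
        Register v = loadu_ps(data + i);
        storeu_ps(data + i, min_ps(max_ps(v, vlo), vhi));  // clamp all lanes at once
    }
    for (; i < n; ++i)                                      // scalar tail
        data[i] = data[i] < lo ? lo : (data[i] > hi ? hi : data[i]);
}
```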
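The `threading.h` hunk likewise swaps `_mm_pause()` for `simde_mm_pause()` behind the `CPU_PAUSE()` macro, keeping the spin-then-yield idiom (shown under `#if 0` in the diff) portable. An illustrative standalone version of that idiom, not taken from the engine:

```cpp
#include <atomic>
#include <thread>
#include <simde/x86/sse2.h>

// Spin briefly with a pause hint, then fall back to yielding the thread.
void waitUntilZero(std::atomic<unsigned> &remaining) {
    int spins = 0;
    while (remaining.load(std::memory_order_acquire) > 0u) {
        if (++spins < 1000) simde_mm_pause();   // cheap hint to the CPU pipeline
        else std::this_thread::yield();         // hand control back to the scheduler
    }
}
```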