diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml
new file mode 100644
index 0000000000..b443e3a210
--- /dev/null
+++ b/.github/actions/setup-linux/action.yml
@@ -0,0 +1,19 @@
+name: Setup Linux build environment
+description: Install dependencies
+runs:
+ using: "composite"
+ steps:
+ - shell: bash
+ run: |
+ set -euo pipefail
+ sudo apt-get update -y -qq
+ sudo apt-get install -y libgl-dev \
+ libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
+ libboost-regex-dev libboost-timer-dev libsimde-dev \
+ libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \
+ libglew-dev libglu1-mesa-dev libgl1-mesa-dev \
+ libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \
+ libpng-dev libalut-dev \
+ qtbase5-dev libgtest-dev \
+ doxygen graphviz \
+ nlohmann-json3-dev
diff --git a/.github/workflows/main.yml b/.github/workflows/ci-linux.yml
similarity index 74%
rename from .github/workflows/main.yml
rename to .github/workflows/ci-linux.yml
index c577f58841..785792a2b5 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/ci-linux.yml
@@ -1,5 +1,4 @@
-
-name: CI
+name: CI Linux
on:
push:
branches: [ dev ]
@@ -8,44 +7,32 @@ on:
release:
types: [published]
jobs:
- build:
+ ##############################
+ build-linux:
runs-on: ubuntu-24.04
steps:
- # Note: some steps require the checkout in the root directory
- uses: actions/checkout@v3
+ - uses: ./.github/actions/setup-linux
- name: Build REGEN workspace
shell: bash
run: |
- sudo apt-get update -y -qq
- sudo apt-get install libgl-dev \
- libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
- libboost-regex-dev libboost-timer-dev \
- libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \
- libglew-dev libglu1-mesa-dev libgl1-mesa-dev \
- libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \
- libpng-dev libalut-dev \
- qtbase5-dev libgtest-dev \
- doxygen graphviz \
- nlohmann-json3-dev
+ set -euo pipefail
mkdir build
cd build
- cmake ../ -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DBUILD_VIDEO_PLAYER=ON
- make 2> >(tee "make-output.txt")
+ cmake ../ \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DBUILD_UNIT_TESTS=ON \
+ -DBUILD_TESTS=ON \
+ -DBUILD_VIDEO_PLAYER=ON
+ make -j$(nproc) 2> >(tee "make-output.txt")
- name: Annotate compilation warnings/errors
if: ${{github.event_name == 'pull_request'}}
uses: JacobDomagala/CompileResult@master
# just so that in case this step fails, the workflow doesn't stop.
- # this is done as it is unclear how well the action is maintained.
continue-on-error: true
with:
comment_title: Compilation
compile_result_file: build/make-output.txt
- - name: Create debian package
- if: ${{github.event_name == 'push' || github.event_name == 'release'}}
- shell: bash
- run: |
- cd build
- cpack
- name: Run unit tests
if: ${{github.event_name == 'push' || github.event_name == 'pull_request'}}
shell: bash
@@ -59,12 +46,55 @@ jobs:
junit_files: "gtest-regen.xml"
action_fail: true
action_fail_on_inconclusive: true
- #####
+ - name: Upload test results
+ uses: actions/upload-artifact@v4
+ with:
+ name: test results
+ path: ./gtest-regen.xml
+ ##############################
+ debian-package:
+ needs: build-linux
+ runs-on: ubuntu-24.04
+ steps:
+ - uses: actions/checkout@v4
+ - uses: ./.github/actions/setup-linux
+ - name: Create debian package
+ if: ${{github.event_name == 'push' || github.event_name == 'release'}}
+ shell: bash
+ run: |
+ set -euo pipefail
+ mkdir build
+ cd build
+ cmake ../ -DCMAKE_BUILD_TYPE=Release
+ make -j$(nproc) package
+ - name: Release debian package
+ if: github.event_name == 'release'
+ shell: bash
+ env:
+ GITHUB_TOKEN: ${{ github.TOKEN }}
+ run: |
+ gh release upload ${{github.event.release.tag_name}} ./build/regen-*.deb
+ - name: Upload debian package
+ if: github.event_name == 'push'
+ uses: actions/upload-artifact@v4
+ with:
+ name: debian package
+ path: ./build/regen-*.deb
+ ##############################
+ doxygen:
+ needs: build-linux
+ runs-on: ubuntu-24.04
+ steps:
+ - uses: actions/checkout@v4
+ - uses: ./.github/actions/setup-linux
- name: Run doxygen
if: ${{github.event_name == 'push' || github.event_name == 'release'}}
shell: bash
run: |
+ set -euo pipefail
+ mkdir build
cd build
+ cmake ../ -DCMAKE_BUILD_TYPE=Release
cmake --build . --target doc
cp -r ../img regen/doc/html/
- name: Extract version tag
@@ -88,26 +118,3 @@ jobs:
BRANCH: gh-pages
# The folder the action should deploy.
FOLDER: build/regen/doc/html
- # The folder in the target branch
- TARGET_FOLDER: ${{ env.REGEN_DOCU_VERSION }}
- CLEAN: true
- SINGLE_COMMIT: true
- #####
- - name: Release debian package
- if: github.event_name == 'release'
- shell: bash
- env:
- GITHUB_TOKEN: ${{ github.TOKEN }}
- run: |
- gh release upload ${{github.event.release.tag_name}} ./build/regen-*.deb
- - name: Upload debian package
- if: github.event_name == 'push'
- uses: actions/upload-artifact@v4
- with:
- name: debian package
- path: ./build/regen-*.deb
- - name: Upload test results
- uses: actions/upload-artifact@v4
- with:
- name: test results
- path: ./gtest-regen.xml
diff --git a/.github/workflows/ci-macos.yml b/.github/workflows/ci-macos.yml
new file mode 100644
index 0000000000..409bc2394c
--- /dev/null
+++ b/.github/workflows/ci-macos.yml
@@ -0,0 +1,29 @@
+name: CI MacOS
+on:
+ push:
+ branches: [ dev ]
+ pull_request:
+ branches: [ dev ]
+ release:
+ types: [published]
+
+jobs:
+ build-macos:
+ runs-on: macos-latest
+ steps:
+ - uses: actions/checkout@v4
+ - name: Install dependencies
+ run: |
+ brew install cmake gcc simde boost assimp openal-soft freetype bullet glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest
+ - name: Build REGEN workspace
+ run: |
+ mkdir build
+ cd build
+ cmake \
+ -DCMAKE_C_COMPILER=/opt/homebrew/bin/gcc-14 -DCMAKE_CXX_COMPILER=/opt/homebrew/bin/g++-14 \
+ -DCMAKE_PREFIX_PATH="/opt/homebrew/opt/qt@5" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DBUILD_TESTS=ON \
+ -DREGEN_EXTRA_INCLUDE_DIRS="/opt/homebrew/opt/openal-soft/include/" \
+ ../
+ make -j$(sysctl -n hw.ncpu)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9284168bcf..281b34302c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,8 +75,10 @@ endif()
# given in the books "Effective C++" and "More Effective C++"
# add_definitions( -Weffc++ )
-if(UNIX) # gcc options
- add_definitions( -mfpmath=sse -march=native )
+if(UNIX)
+ # -march=native enables all instruction subsets supported by the local machine
+ # e.g. SSE2, SSE3, SSE4, AVX, AVX2, etc.
+ add_definitions( -march=native )
endif()
# perform more aggressive floating-point optimizations
@@ -107,6 +109,10 @@ find_package(Boost ${Boost_MIN_VERSION}
find_package(Threads REQUIRED)
# Font library: text rendering support
find_package(Freetype REQUIRED)
+# Note: On Linux, it seems needed in addition to link against brotlidec
+if (UNIX AND NOT APPLE)
+ set(FREETYPE_LIBRARIES ${FREETYPE_LIBRARIES} -lbrotlidec)
+endif()
# JSON library: for serialization
find_package(nlohmann_json REQUIRED)
set(JSON_LIBRARIES nlohmann_json::nlohmann_json)
@@ -184,8 +190,10 @@ message(STATUS " Models:${ASSIMP_INCLUDE_DIRS};")
message(STATUS " Fonts:${FREETYPE_INCLUDE_DIRS};")
message(STATUS " Physics:${BULLET_INCLUDE_DIRS};")
-enable_testing()
-find_package(GTest REQUIRED)
+if (BUILD_UNIT_TESTS)
+ enable_testing()
+ find_package(GTest REQUIRED)
+endif ()
if (HAS_AV_LIBS)
# allow includes without AL/ prefix. Some openAL versions require this.
@@ -199,7 +207,7 @@ set(REGEN_LIBRARIES
${Boost_LIBRARIES}
${JSON_LIBRARIES}
${ASSIMP_LIBRARIES}
- ${FREETYPE_LIBRARIES} -lbrotlidec
+ ${FREETYPE_LIBRARIES}
${IMG_LIBRARIES}
${AV_LIBRARIES}
${CMAKE_THREAD_LIBS_INIT}
@@ -239,6 +247,10 @@ endif()
###########
###########
+if(REGEN_EXTRA_INCLUDE_DIRS)
+ include_directories(${REGEN_EXTRA_INCLUDE_DIRS})
+endif()
+
# allow includes like even if the engine is not installed
include_directories(.)
# allow include of generated files
@@ -260,19 +272,21 @@ install(FILES img/icon-small.png DESTINATION ${SHARE_INSTALL_PATH}/img)
########## Unit Testings
##############
-# add an executable target for GTest.
-# but the testing code is partly in the *knowrob* library
-# where gtest won't find them without using the "--no-as-needed"
-# flag for the linker.
-add_executable(all_gtests
- tests/gtests.cpp
- tests/shapes/quad-tree-test.cpp)
-target_link_libraries(all_gtests
- -Wl,--whole-archive,--no-as-needed
- regen
- -Wl,--no-whole-archive
- ${Boost_Python_COMPONENT}
- ${GTEST_MAIN_LIBRARIES})
+if (BUILD_UNIT_TESTS)
+ # add an executable target for GTest.
+ # but the testing code is partly in the *knowrob* library
+ # where gtest won't find them without using the "--no-as-needed"
+ # flag for the linker.
+ add_executable(all_gtests
+ tests/gtests.cpp
+ tests/shapes/quad-tree-test.cpp)
+ target_link_libraries(all_gtests
+ -Wl,--whole-archive,--no-as-needed
+ regen
+ -Wl,--no-whole-archive
+ ${Boost_Python_COMPONENT}
+ ${GTEST_MAIN_LIBRARIES})
+endif ()
##############
########## packaging
diff --git a/README.md b/README.md
index 0880beb1db..6ba1d8645b 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,13 @@
-
+
+
+
+[](https://daniel86.github.io/regen/)
+
+
+
`regen` -- **Real-time Graphics Engine** -- is a modular OpenGL-based C++ engine designed for research and experimentation in real-time rendering, GPU compute, and virtual world simulation.
diff --git a/regen/compute/radix-sort-cpu.h b/regen/compute/radix-sort-cpu.h
index 3374b8ebe4..6493311f7b 100644
--- a/regen/compute/radix-sort-cpu.h
+++ b/regen/compute/radix-sort-cpu.h
@@ -139,23 +139,23 @@ namespace regen {
if constexpr (KEY_TYPE_BITS == 16) {
// Gather 8 keys manually, and promote to 32-bit
for (int k = 0; k < 8; ++k) tmpKeys32[k] = static_cast(keys[src[keyIdx+k]]);
- r0 = _mm256_load_si256(reinterpret_cast(tmpKeys32));
- r0 = _mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask);
+ r0 = simde_mm256_load_si256(reinterpret_cast(tmpKeys32));
+ r0 = simde_mm256_and_si256(simde_mm256_srli_epi32(r0, SHIFT), mask);
keyIdx += 8; // processed 8 keys, not 16!
}
else if constexpr (KEY_TYPE_BITS == 32) {
- simd::Register_i idx = _mm256_loadu_si256(reinterpret_cast(&src[keyIdx]));
+ simd::Register_i idx = simde_mm256_loadu_si256(reinterpret_cast(&src[keyIdx]));
// Gather 8 scattered keys, and apply shift and mask to get bucket ids
- r0 = _mm256_i32gather_epi32(reinterpret_cast(keys), idx, 4);
- r0 = _mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask);
+ r0 = simde_mm256_i32gather_epi32(reinterpret_cast(keys), idx, 4);
+ r0 = simde_mm256_and_si256(simde_mm256_srli_epi32(r0, SHIFT), mask);
keyIdx += KEYS_PER_SIMD_PASS;
}
else if constexpr (KEY_TYPE_BITS == 64) {
// note: values have 32 bits, use __m128i to load only 4
- __m128i idx32 = _mm_loadu_si128(reinterpret_cast(&src[keyIdx]));
+ simde__m128i idx32 = simde_mm_loadu_si128(reinterpret_cast(&src[keyIdx]));
// Gather 4 scattered keys, and apply shift and mask to get bucket ids
- r0 = _mm256_i32gather_epi64(reinterpret_cast(keys), idx32, 8);
- r0 = _mm256_and_si256(_mm256_srli_epi64(r0, SHIFT), mask);
+ r0 = simde_mm256_i32gather_epi64(reinterpret_cast(keys), idx32, 8);
+ r0 = simde_mm256_and_si256(simde_mm256_srli_epi64(r0, SHIFT), mask);
keyIdx += KEYS_PER_SIMD_PASS;
}
else {
@@ -163,7 +163,7 @@ namespace regen {
break;
}
// Store results into tmpBins_ and increment histogram
- _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmpBins_), r0);
+ simde_mm256_storeu_si256(reinterpret_cast(tmpBins_), r0);
for (auto x : tmpBins_) ++histogram_[x];
}
}
diff --git a/regen/compute/simd.h b/regen/compute/simd.h
index b47af9a8e8..ed06be8761 100644
--- a/regen/compute/simd.h
+++ b/regen/compute/simd.h
@@ -4,21 +4,31 @@
#include
#include
-// NOTE: Check for REGEN_HAS_SIMD, if it is not defined, the SIMD operations will be disabled
-// and the code here will fall back to scalar operations.
-// NOLINTBEGIN(portability-simd-intrinsics)
-#if defined(__AVX__)
- #include // AVX
- #define REGEN_SIMD_MODE AVX
- #define REGEN_SIMD_WIDTH 8
- #define REGEN_HAS_SIMD
-#elif defined(__SSE__)
- #include // SSE
- #define REGEN_SIMD_MODE SSE
- #define REGEN_SIMD_WIDTH 4
- #define REGEN_HAS_SIMD
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define REGEN_SIMD_NONE 0
+#define REGEN_SIMD_SSE 1
+#define REGEN_SIMD_AVX 2
+
+#if defined(SIMDE_NATURAL_VECTOR_SIZE)
+ #if SIMDE_NATURAL_VECTOR_SIZE >= 32
+ #define REGEN_SIMD_MODE REGEN_SIMD_AVX
+ #define REGEN_SIMD_WIDTH 8
+ #elif SIMDE_NATURAL_VECTOR_SIZE >= 16
+ #define REGEN_SIMD_MODE REGEN_SIMD_SSE
+ #define REGEN_SIMD_WIDTH 4
+ #else
+ #define REGEN_SIMD_MODE REGEN_SIMD_NONE
+ #define REGEN_SIMD_WIDTH 1
+ #endif
#else
- #define REGEN_SIMD_MODE NONE
+ #define REGEN_SIMD_MODE REGEN_SIMD_NONE
#define REGEN_SIMD_WIDTH 1
#endif
@@ -32,236 +42,236 @@ namespace regen::simd {
return bitIndex;
}
-#if REGEN_SIMD_MODE == AVX
+#if REGEN_SIMD_MODE == REGEN_SIMD_AVX
static constexpr int8_t RegisterMask = 0xFF; // 8 bits for AVX
- using Register = __m256; // 8 floats
- using Register_i = __m256i; // 8 integers
+ using Register = simde__m256; // 8 floats
+ using Register_i = simde__m256i; // 8 integers
- inline __m256 set1_ps(float v) { return _mm256_set1_ps(v); }
- inline __m256i set1_epi32(int32_t v) { return _mm256_set1_epi32(v); }
- inline __m256i set1_epi16(uint16_t v) { return _mm256_set1_epi16(v); }
- inline __m256i set1_epi64(int64_t v) { return _mm256_set1_epi64x(v); }
- inline __m256i set1_epi64u(uint64_t v) { return _mm256_set1_epi64x(v); }
+ inline Register set1_ps(float v) { return simde_mm256_set1_ps(v); }
+ inline Register_i set1_epi32(int32_t v) { return simde_mm256_set1_epi32(v); }
+ inline Register_i set1_epi16(uint16_t v) { return simde_mm256_set1_epi16(v); }
+ inline Register_i set1_epi64(int64_t v) { return simde_mm256_set1_epi64x(v); }
+ inline Register_i set1_epi64u(uint64_t v) { return simde_mm256_set1_epi64x(v); }
- inline __m256 setzero_ps() { return _mm256_setzero_ps(); }
- inline __m256i setzero_si256() { return _mm256_setzero_si256(); }
+ inline Register setzero_ps() { return simde_mm256_setzero_ps(); }
+ inline Register_i setzero_si256() { return simde_mm256_setzero_si256(); }
- inline __m256 load_ps(const float *p) { return _mm256_load_ps(p); }
- inline __m256 loadu_ps(const float *p) { return _mm256_loadu_ps(p); }
+ inline Register load_ps(const float *p) { return simde_mm256_load_ps(p); }
+ inline Register loadu_ps(const float *p) { return simde_mm256_loadu_ps(p); }
- inline __m256i load_si256(const uint16_t *p) {
- return _mm256_load_si256(reinterpret_cast(p));
+ inline Register_i load_si256(const uint16_t *p) {
+ return simde_mm256_load_si256(reinterpret_cast(p));
}
- inline __m256i load_si256(const uint32_t *p) {
- return _mm256_load_si256(reinterpret_cast(p));
+ inline Register_i load_si256(const uint32_t *p) {
+ return simde_mm256_load_si256(reinterpret_cast(p));
}
- inline __m256i load_si256(const uint64_t *p) {
- return _mm256_load_si256(reinterpret_cast(p));
+ inline Register_i load_si256(const uint64_t *p) {
+ return simde_mm256_load_si256(reinterpret_cast(p));
}
- inline __m256i load_si256(const int32_t *p) {
- return _mm256_load_si256(reinterpret_cast(p));
+ inline Register_i load_si256(const int32_t *p) {
+ return simde_mm256_load_si256(reinterpret_cast(p));
}
- inline __m256i loadu_si256(const uint16_t *p) {
- return _mm256_loadu_si256(reinterpret_cast(p));
+ inline Register_i loadu_si256(const uint16_t *p) {
+ return simde_mm256_loadu_si256(reinterpret_cast(p));
}
- inline __m256i loadu_si256(const uint32_t *p) {
- return _mm256_loadu_si256(reinterpret_cast(p));
+ inline Register_i loadu_si256(const uint32_t *p) {
+ return simde_mm256_loadu_si256(reinterpret_cast(p));
}
- inline __m256i loadu_si256(const uint64_t *p) {
- return _mm256_loadu_si256(reinterpret_cast(p));
+ inline Register_i loadu_si256(const uint64_t *p) {
+ return simde_mm256_loadu_si256(reinterpret_cast(p));
}
- inline __m256i loadu_si256(const int32_t *p) {
- return _mm256_loadu_si256(reinterpret_cast(p));
+ inline Register_i loadu_si256(const int32_t *p) {
+ return simde_mm256_loadu_si256(reinterpret_cast(p));
}
- inline __m256 epi_to_ps(const __m256i &v) { return _mm256_castsi256_ps(v); }
+ inline Register epi_to_ps(const Register_i &v) { return simde_mm256_castsi256_ps(v); }
- inline __m256 i32gather_ps(const float *p, const __m256i &indices) {
- return _mm256_i32gather_ps(p, indices, sizeof(float));
+ inline Register i32gather_ps(const float *p, const Register_i &indices) {
+ return simde_mm256_i32gather_ps(p, indices, sizeof(float));
}
- inline void storeu_ps(float *p, const __m256 &v) { _mm256_storeu_ps(p, v); }
- inline void store_ps(float *p, const __m256 &v) { _mm256_store_ps(p, v); }
+ inline void storeu_ps(float *p, const Register &v) { simde_mm256_storeu_ps(p, v); }
+ inline void store_ps(float *p, const Register &v) { simde_mm256_store_ps(p, v); }
- inline void storeu_epi32(int32_t *p, const __m256i &v) {
- _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v);
+ inline void storeu_epi32(int32_t *p, const Register_i &v) {
+ simde_mm256_storeu_si256(reinterpret_cast(p), v);
}
- inline void storeu_epi32(uint32_t *p, const __m256i &v) {
- _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v);
+ inline void storeu_epi32(uint32_t *p, const Register_i &v) {
+ simde_mm256_storeu_si256(reinterpret_cast(p), v);
}
- inline void store_epi32(int32_t *p, const __m256i &v) {
- _mm256_store_si256(reinterpret_cast<__m256i*>(p), v);
+ inline void store_epi32(int32_t *p, const Register_i &v) {
+ simde_mm256_store_si256(reinterpret_cast(p), v);
}
- inline void store_epi32(uint32_t *p, const __m256i &v) {
- _mm256_store_si256(reinterpret_cast<__m256i*>(p), v);
+ inline void store_epi32(uint32_t *p, const Register_i &v) {
+ simde_mm256_store_si256(reinterpret_cast(p), v);
}
- inline __m256 add_ps(const __m256 &a, const __m256 &b) { return _mm256_add_ps(a, b); }
- inline __m256 sub_ps(const __m256 &a, const __m256 &b) { return _mm256_sub_ps(a, b); }
- inline __m256 mul_ps(const __m256 &a, const __m256 &b) { return _mm256_mul_ps(a, b); }
- inline __m256 div_ps(const __m256 &a, const __m256 &b) { return _mm256_div_ps(a, b); }
+ inline Register add_ps(const Register &a, const Register &b) { return simde_mm256_add_ps(a, b); }
+ inline Register sub_ps(const Register &a, const Register &b) { return simde_mm256_sub_ps(a, b); }
+ inline Register mul_ps(const Register &a, const Register &b) { return simde_mm256_mul_ps(a, b); }
+ inline Register div_ps(const Register &a, const Register &b) { return simde_mm256_div_ps(a, b); }
/**
* Fused multiply-add: (a * b) + c
*/
- inline __m256 mul_add_ps(const __m256 &a, const __m256 &b, const __m256 &c) {
- return _mm256_fmadd_ps(a, b, c);
+ inline Register mul_add_ps(const Register &a, const Register &b, const Register &c) {
+ return simde_mm256_fmadd_ps(a, b, c);
}
- inline __m256i add_epi32(const __m256i &a, const __m256i &b) { return _mm256_add_epi32(a, b); }
- inline __m256i sub_epi32(const __m256i &a, const __m256i &b) { return _mm256_sub_epi32(a, b); }
- inline __m256i mul_epi32(const __m256i &a, const __m256i &b) { return _mm256_mullo_epi32(a, b); }
+ inline Register_i add_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_add_epi32(a, b); }
+ inline Register_i sub_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_sub_epi32(a, b); }
+ inline Register_i mul_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_mullo_epi32(a, b); }
- inline __m256 min_ps(const __m256 &a, const __m256 &b) { return _mm256_min_ps(a, b); }
- inline __m256 max_ps(const __m256 &a, const __m256 &b) { return _mm256_max_ps(a, b); }
- inline __m256 sqrt_ps(const __m256 &a) { return _mm256_sqrt_ps(a); }
+ inline Register min_ps(const Register &a, const Register &b) { return simde_mm256_min_ps(a, b); }
+ inline Register max_ps(const Register &a, const Register &b) { return simde_mm256_max_ps(a, b); }
+ inline Register sqrt_ps(const Register &a) { return simde_mm256_sqrt_ps(a); }
- inline __m256i min_epi32(const __m256i &a, const __m256i &b) { return _mm256_min_epi32(a, b); }
- inline __m256i max_epi32(const __m256i &a, const __m256i &b) { return _mm256_max_epi32(a, b); }
+ inline Register_i min_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_min_epi32(a, b); }
+ inline Register_i max_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_max_epi32(a, b); }
/**
- * Horizontal sum of all elements in an __m256
+ * Horizontal sum of all elements in an Register
*/
- inline float hsum_ps(__m256 v) {
- __m128 vlow = _mm256_castps256_ps128(v); // low 128
- __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128
- __m128 sum = _mm_add_ps(vlow, vhigh); // add low and high parts
- __m128 shuf = _mm_movehdup_ps(sum); // (sum.y, sum.y, sum.w, sum.w)
- __m128 sums = _mm_add_ps(sum, shuf);
- shuf = _mm_movehl_ps(shuf, sums); // high half of sums
- sums = _mm_add_ss(sums, shuf);
- return _mm_cvtss_f32(sums);
+ inline float hsum_ps(Register v) {
+ simde__m128 vlow = simde_mm256_castps256_ps128(v); // low 128
+ simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128
+ simde__m128 sum = simde_mm_add_ps(vlow, vhigh); // add low and high parts
+ simde__m128 shuf = simde_mm_movehdup_ps(sum); // (sum.y, sum.y, sum.w, sum.w)
+ simde__m128 sums = simde_mm_add_ps(sum, shuf);
+ shuf = simde_mm_movehl_ps(shuf, sums); // high half of sums
+ sums = simde_mm_add_ss(sums, shuf);
+ return simde_mm_cvtss_f32(sums);
}
/**
- * Horizontal min of all elements in an __m256
+ * Horizontal min of all elements in an Register
*/
- inline float hmin_ps(__m256 v) {
- __m128 vlow = _mm256_castps256_ps128(v); // low 128
- __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128
- __m128 min = _mm_min_ps(vlow, vhigh); // min low and high parts
- __m128 shuf = _mm_movehdup_ps(min); // (min.y, min.y, min.w, min.w)
- __m128 mins = _mm_min_ps(min, shuf);
- shuf = _mm_movehl_ps(shuf, mins); // high half of mins
- mins = _mm_min_ss(mins, shuf);
- return _mm_cvtss_f32(mins);
+ inline float hmin_ps(Register v) {
+ simde__m128 vlow = simde_mm256_castps256_ps128(v); // low 128
+ simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128
+ simde__m128 min = simde_mm_min_ps(vlow, vhigh); // min low and high parts
+ simde__m128 shuf = simde_mm_movehdup_ps(min); // (min.y, min.y, min.w, min.w)
+ simde__m128 mins = simde_mm_min_ps(min, shuf);
+ shuf = simde_mm_movehl_ps(shuf, mins); // high half of mins
+ mins = simde_mm_min_ss(mins, shuf);
+ return simde_mm_cvtss_f32(mins);
}
/**
- * Horizontal max of all elements in an __m256
+ * Horizontal max of all elements in an Register
*/
- inline float hmax_ps(__m256 v) {
- __m128 vlow = _mm256_castps256_ps128(v); // low 128
- __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128
- __m128 max = _mm_max_ps(vlow, vhigh); // max low and high parts
- __m128 shuf = _mm_movehdup_ps(max); // (max.y, max.y, max.w, max.w)
- __m128 maxs = _mm_max_ps(max, shuf);
- shuf = _mm_movehl_ps(shuf, maxs); // high half of maxs
- maxs = _mm_max_ss(maxs, shuf);
- return _mm_cvtss_f32(maxs);
+ inline float hmax_ps(Register v) {
+ simde__m128 vlow = simde_mm256_castps256_ps128(v); // low 128
+ simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128
+ simde__m128 max = simde_mm_max_ps(vlow, vhigh); // max low and high parts
+ simde__m128 shuf = simde_mm_movehdup_ps(max); // (max.y, max.y, max.w, max.w)
+ simde__m128 maxs = simde_mm_max_ps(max, shuf);
+ shuf = simde_mm_movehl_ps(shuf, maxs); // high half of maxs
+ maxs = simde_mm_max_ss(maxs, shuf);
+ return simde_mm_cvtss_f32(maxs);
}
- inline __m256 rcp_ps(const __m256 &a) { return _mm256_rcp_ps(a); }
+ inline Register rcp_ps(const Register &a) { return simde_mm256_rcp_ps(a); }
- inline __m256 cmp_lt(const __m256 &a, const __m256 &b) {
- return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
+ inline Register cmp_lt(const Register &a, const Register &b) {
+ return simde_mm256_cmp_ps(a, b, _CMP_LT_OQ);
}
- inline __m256 cmp_gt(const __m256 &a, const __m256 &b) {
- return _mm256_cmp_ps(a, b, _CMP_GT_OQ);
+ inline Register cmp_gt(const Register &a, const Register &b) {
+ return simde_mm256_cmp_ps(a, b, _CMP_GT_OQ);
}
- inline __m256 cmp_eq(const __m256 &a, const __m256 &b) {
- return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
+ inline Register cmp_eq(const Register &a, const Register &b) {
+ return simde_mm256_cmp_ps(a, b, _CMP_EQ_OQ);
}
- inline __m256 cmp_neq(const __m256 &a, const __m256 &b) {
- return _mm256_cmp_ps(a, b, _CMP_NEQ_OQ);
+ inline Register cmp_neq(const Register &a, const Register &b) {
+ return simde_mm256_cmp_ps(a, b, _CMP_NEQ_OQ);
}
- inline __m256 cmp_or(const __m256 &a, const __m256 &b) {
- return _mm256_or_ps(a, b);
+ inline Register cmp_or(const Register &a, const Register &b) {
+ return simde_mm256_or_ps(a, b);
}
- inline __m256 cmp_and(const __m256 &a, const __m256 &b) {
- return _mm256_and_ps(a, b);
+ inline Register cmp_and(const Register &a, const Register &b) {
+ return simde_mm256_and_ps(a, b);
}
- inline __m256 cmp_and_not(const __m256 &a, const __m256 &b) {
- return _mm256_andnot_ps(a, b);
+ inline Register cmp_and_not(const Register &a, const Register &b) {
+ return simde_mm256_andnot_ps(a, b);
}
- inline __m256i cvttps_epi32(const __m256 &a) { return _mm256_cvttps_epi32(a); }
+ inline Register_i cvttps_epi32(const Register &a) { return simde_mm256_cvttps_epi32(a); }
- inline int movemask_ps(const __m256 &v) { return _mm256_movemask_ps(v); }
+ inline int movemask_ps(const Register &v) { return simde_mm256_movemask_ps(v); }
- inline __m256 blendv_ps(const __m256 &a, const __m256 &b, const __m256 &mask) {
- return _mm256_blendv_ps(a, b, mask);
+ inline Register blendv_ps(const Register &a, const Register &b, const Register &mask) {
+ return simde_mm256_blendv_ps(a, b, mask);
}
-#elif REGEN_SIMD_MODE == SSE
+#elif REGEN_SIMD_MODE == REGEN_SIMD_SSE
static constexpr int8_t RegisterMask = 0x0F; // 4 bits for SSE
- using Register = __m128; // 4 floats
- using Register_i = __m128i; // 4 integers
+ using Register = simde__m128; // 4 floats
+ using Register_i = simde__m128i; // 4 integers
- inline __m128 set1_ps(float v) { return _mm_set1_ps(v); }
- inline __m128i set1_epi32(int32_t v) { return _mm_set1_epi32(v); }
+ inline Register set1_ps(float v) { return simde_mm_set1_ps(v); }
+ inline Register_i set1_epi32(int32_t v) { return simde_mm_set1_epi32(v); }
- inline __m128 setzero_ps() { return _mm_setzero_ps(); }
- inline __m128i setzero_si256() { return _mm_setzero_si128(); }
+ inline Register setzero_ps() { return simde_mm_setzero_ps(); }
+ inline Register_i setzero_si256() { return simde_mm_setzero_si128(); }
- inline __m128 load_ps(const float *p) { return _mm_load_ps(p); }
- inline __m128 loadu_ps(const float *p) { return _mm_loadu_ps(p); }
+ inline Register load_ps(const float *p) { return simde_mm_load_ps(p); }
+ inline Register loadu_ps(const float *p) { return simde_mm_loadu_ps(p); }
- inline __m128i loadu_si256(const uint32_t *p) {
- return _mm_loadu_si128(reinterpret_cast(indices));
+ inline Register_i loadu_si256(const uint32_t *p) {
+ return simde_mm_loadu_si128(reinterpret_cast(p));
}
- inline __m128 epi_to_ps(const __m128i &v) { return _mm_castsi128_ps(v); }
+ inline Register epi_to_ps(const Register_i &v) { return simde_mm_castsi128_ps(v); }
- inline __m128 i32gather_ps(const float *p, const __m128i &indices) {
- return _mm_i32gather_ps(p, indices, sizeof(float));
+ inline Register i32gather_ps(const float *p, const Register_i &indices) {
+ return simde_mm_i32gather_ps(p, indices, sizeof(float));
}
- inline void storeu_ps(float *p, const __m128 &v) { _mm_storeu_ps(p, v); }
+ inline void storeu_ps(float *p, const Register &v) { simde_mm_storeu_ps(p, v); }
- inline __m128 add_ps(const __m128 &a, const __m128 &b) { return _mm_add_ps(a, b); }
- inline __m128 sub_ps(const __m128 &a, const __m128 &b) { return _mm_sub_ps(a, b); }
- inline __m128 mul_ps(const __m128 &a, const __m128 &b) { return _mm_mul_ps(a, b); }
- inline __m128 div_ps(const __m128 &a, const __m128 &b) { return _mm_div_ps(a, b); }
+ inline Register add_ps(const Register &a, const Register &b) { return simde_mm_add_ps(a, b); }
+ inline Register sub_ps(const Register &a, const Register &b) { return simde_mm_sub_ps(a, b); }
+ inline Register mul_ps(const Register &a, const Register &b) { return simde_mm_mul_ps(a, b); }
+ inline Register div_ps(const Register &a, const Register &b) { return simde_mm_div_ps(a, b); }
- inline __m128i add_epi32(const __m128i &a, const __m128i &b) { return _mm_add_epi32(a, b); }
- inline __m128i sub_epi32(const __m128i &a, const __m128i &b) { return _mm_sub_epi32(a, b); }
- inline __m128i mul_epi32(const __m128i &a, const __m128i &b) { return _mm_mullo_epi32(a, b); }
+ inline Register_i add_epi32(const Register_i &a, const Register_i &b) { return simde_mm_add_epi32(a, b); }
+ inline Register_i sub_epi32(const Register_i &a, const Register_i &b) { return simde_mm_sub_epi32(a, b); }
+ inline Register_i mul_epi32(const Register_i &a, const Register_i &b) { return simde_mm_mullo_epi32(a, b); }
- inline __m128 min_ps(const __m128 &a, const __m128 &b) { return _mm_min_ps(a, b); }
- inline __m128 max_ps(const __m128 &a, const __m128 &b) { return _mm_max_ps(a, b); }
- inline __m128 sqrt_ps(const __m128 &a) { return _mm_sqrt_ps(a); }
+ inline Register min_ps(const Register &a, const Register &b) { return simde_mm_min_ps(a, b); }
+ inline Register max_ps(const Register &a, const Register &b) { return simde_mm_max_ps(a, b); }
+ inline Register sqrt_ps(const Register &a) { return simde_mm_sqrt_ps(a); }
- inline __m128i min_epi32(const __m128i &a, const __m128i &b) { return _mm_min_epi32(a, b); }
- inline __m128i max_epi32(const __m128i &a, const __m128i &b) { return _mm_max_epi32(a, b); }
+ inline Register_i min_epi32(const Register_i &a, const Register_i &b) { return simde_mm_min_epi32(a, b); }
+ inline Register_i max_epi32(const Register_i &a, const Register_i &b) { return simde_mm_max_epi32(a, b); }
- inline float hsum_ps(__m128 v) {
- __m128 shuf = _mm_movehdup_ps(v); // (v1, v1, v3, v3)
- __m128 sums = _mm_add_ps(v, shuf);
- shuf = _mm_movehl_ps(shuf, sums); // (v2 + v3, v3, -, -)
- sums = _mm_add_ss(sums, shuf);
- return _mm_cvtss_f32(sums);
+ inline float hsum_ps(Register v) {
+ Register shuf = simde_mm_movehdup_ps(v); // (v1, v1, v3, v3)
+ Register sums = simde_mm_add_ps(v, shuf);
+ shuf = simde_mm_movehl_ps(shuf, sums); // (v2 + v3, v3, -, -)
+ sums = simde_mm_add_ss(sums, shuf);
+ return simde_mm_cvtss_f32(sums);
}
- inline __m128 rcp_ps(const __m128 &a) { return _mm_rcp_ps(a); }
+ inline Register rcp_ps(const Register &a) { return simde_mm_rcp_ps(a); }
- inline __m128 cmp_lt(const __m128 &a, const __m128 &b) { return _mm_cmplt_ps(a, b); }
- inline __m128 cmp_gt(const __m128 &a, const __m128 &b) { return _mm_cmplt_ps(b, a); }
- inline __m128 cmp_eq(const __m128 &a, const __m128 &b) { return _mm_cmpeq_ps(a, b); }
- inline __m128 cmp_neq(const __m128 &a, const __m128 &b) {
- __m128 eq = _mm_cmpeq_ps(a, b);
- return _mm_andnot_ps(eq, _mm_castsi128_ps(_mm_set1_epi32(-1))); // ~eq & all_ones
+ inline Register cmp_lt(const Register &a, const Register &b) { return simde_mm_cmplt_ps(a, b); }
+ inline Register cmp_gt(const Register &a, const Register &b) { return simde_mm_cmplt_ps(b, a); }
+ inline Register cmp_eq(const Register &a, const Register &b) { return simde_mm_cmpeq_ps(a, b); }
+ inline Register cmp_neq(const Register &a, const Register &b) {
+ Register eq = simde_mm_cmpeq_ps(a, b);
+ return simde_mm_andnot_ps(eq, simde_mm_castsi128_ps(simde_mm_set1_epi32(-1))); // ~eq & all_ones
}
- inline __m128 cmp_or(const __m128 &a, const __m128 &b) { return _mm_or_ps(a, b); }
- inline __m128 cmp_and(const __m128 &a, const __m128 &b) { return _mm_and_ps(a, b); }
+ inline Register cmp_or(const Register &a, const Register &b) { return simde_mm_or_ps(a, b); }
+ inline Register cmp_and(const Register &a, const Register &b) { return simde_mm_and_ps(a, b); }
- inline __m128i cvttps_epi32(const __m128 &a) { return _mm_cvttps_epi32(a); }
+ inline Register_i cvttps_epi32(const Register &a) { return simde_mm_cvttps_epi32(a); }
- inline int movemask_ps(const __m128 &v) { return _mm_movemask_ps(v); }
+ inline int movemask_ps(const Register &v) { return simde_mm_movemask_ps(v); }
- inline __m128 blendv_ps(const __m128 &a, const __m128 &b, const __m128 &mask) {
- return _mm_blendv_ps(a, b, mask);
+ inline Register blendv_ps(const Register &a, const Register &b, const Register &mask) {
+ return simde_mm_blendv_ps(a, b, mask);
}
#else // Fallback to scalar operations
@@ -752,7 +762,7 @@ namespace regen {
}
static BatchOf_int32 castFloatBatch(const BatchOf_float &v) {
- return BatchOf_int32{_mm256_castps_si256(v.c)};
+ return BatchOf_int32{simde_mm256_castps_si256(v.c)};
}
/**
@@ -832,7 +842,7 @@ namespace regen {
}
BatchOf_int32 operator&(const BatchOf_int32 &other) const {
- return BatchOf_int32{_mm256_and_si256(c, other.c)};
+ return BatchOf_int32{simde_mm256_and_si256(c, other.c)};
}
static BatchOf_int32 allZeros() {
@@ -1199,7 +1209,7 @@ namespace regen {
/**
* Computes the length squared of each vector in the batch.
- * @return __m128 containing the length squared for each vector.
+ * @return Register containing the length squared for each vector.
*/
BatchOf_float lengthSquared() const {
return x*x + y*y + z*z;
@@ -1272,6 +1282,4 @@ namespace regen {
template using vectorSIMD = std::vector>;
}
-// NOLINTEND(portability-simd-intrinsics)
-
#endif /* REGEN_SIMD_H_ */
diff --git a/regen/compute/threading.h b/regen/compute/threading.h
index 250905e299..9a2d5c2082 100644
--- a/regen/compute/threading.h
+++ b/regen/compute/threading.h
@@ -9,8 +9,8 @@
#include "regen/memory/aligned-allocator.h"
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
- #include
- #define CPU_PAUSE() _mm_pause()
+ #include
+ #define CPU_PAUSE() simde_mm_pause()
#elif defined(__aarch64__) || defined(__arm__)
#define CPU_PAUSE() asm volatile("yield" ::: "memory")
#else
@@ -373,7 +373,7 @@ namespace regen {
#if 0
int spins = 0;
while (numJobsRemaining_.load(std::memory_order_acquire) > 0u) {
- if (++spins < 1000) _mm_pause();
+ if (++spins < 1000) simde_mm_pause();
else std::this_thread::yield();
}
#else
diff --git a/regen/shader/includer.h b/regen/shader/includer.h
index fc0ea12a88..5be3ec9a50 100644
--- a/regen/shader/includer.h
+++ b/regen/shader/includer.h
@@ -6,6 +6,7 @@
#include
#include