diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml new file mode 100644 index 0000000000..b443e3a210 --- /dev/null +++ b/.github/actions/setup-linux/action.yml @@ -0,0 +1,19 @@ +name: Setup Linux build environment +description: Install dependencies +runs: + using: "composite" + steps: + - shell: bash + run: | + set -euo pipefail + sudo apt-get update -y -qq + sudo apt-get install libgl-dev \ + libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ + libboost-regex-dev libboost-timer-dev libsimde-dev \ + libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ + libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ + libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ + libpng-dev libalut-dev \ + qtbase5-dev libgtest-dev \ + doxygen graphviz \ + nlohmann-json3-dev diff --git a/.github/workflows/main.yml b/.github/workflows/ci-linux.yml similarity index 74% rename from .github/workflows/main.yml rename to .github/workflows/ci-linux.yml index c577f58841..785792a2b5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/ci-linux.yml @@ -1,5 +1,4 @@ - -name: CI +name: CI Linux on: push: branches: [ dev ] @@ -8,44 +7,32 @@ on: release: types: [published] jobs: - build: + ############################## + build-linux: runs-on: ubuntu-24.04 steps: - # Note: some steps require the checkout in the root directory - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-linux - name: Build REGEN workspace shell: bash run: | - sudo apt-get update -y -qq - sudo apt-get install libgl-dev \ - libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ - libboost-regex-dev libboost-timer-dev \ - libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ - libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ - libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ - libpng-dev libalut-dev \ - qtbase5-dev libgtest-dev \ - doxygen graphviz \ - nlohmann-json3-dev + set -euo pipefail mkdir build cd build - cmake ../ -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DBUILD_VIDEO_PLAYER=ON - make 2> >(tee "make-output.txt") + cmake ../ \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_UNIT_TESTS=ON \ + -DBUILD_TESTS=ON \ + -DBUILD_VIDEO_PLAYER=ON + make -j$(nproc) 2> >(tee "make-output.txt") - name: Annotate compilation warnings/errors if: ${{github.event_name == 'pull_request'}} uses: JacobDomagala/CompileResult@master # just so that in case this step fails, the workflow doesn't stop. - # this is done as it is unclear how well the action is maintained. 
continue-on-error: true with: comment_title: Compilation compile_result_file: build/make-output.txt - - name: Create debian package - if: ${{github.event_name == 'push' || github.event_name == 'release'}} - shell: bash - run: | - cd build - cpack - name: Run unit tests if: ${{github.event_name == 'push' || github.event_name == 'pull_request'}} shell: bash @@ -59,12 +46,55 @@ jobs: junit_files: "gtest-regen.xml" action_fail: true action_fail_on_inconclusive: true - ##### + - name: Upload test results + uses: actions/upload-artifact@v4 + with: + name: test results + path: ./gtest-regen.xml + ############################## + debian-package: + needs: build-linux + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-linux + - name: Create debian package + if: ${{github.event_name == 'push' || github.event_name == 'release'}} + shell: bash + run: | + set -euo pipefail + mkdir build + cd build + cmake ../ -DCMAKE_BUILD_TYPE=Release + cpack + - name: Release debian package + if: github.event_name == 'release' + shell: bash + env: + GITHUB_TOKEN: ${{ github.TOKEN }} + run: | + gh release upload ${{github.event.release.tag_name}} ./build/regen-*.deb + - name: Upload debian package + if: github.event_name == 'push' + uses: actions/upload-artifact@v4 + with: + name: debian package + path: ./build/regen-*.deb + ############################## + doxygen: + needs: build-linux + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-linux - name: Run doxygen if: ${{github.event_name == 'push' || github.event_name == 'release'}} shell: bash run: | + set -euo pipefail + mkdir build cd build + cmake ../ -DCMAKE_BUILD_TYPE=Release cmake --build . --target doc cp -r ../img regen/doc/html/ - name: Extract version tag @@ -88,26 +118,3 @@ jobs: BRANCH: gh-pages # The folder the action should deploy. 
FOLDER: build/regen/doc/html - # The folder in the target branch - TARGET_FOLDER: ${{ env.REGEN_DOCU_VERSION }} - CLEAN: true - SINGLE_COMMIT: true - ##### - - name: Release debian package - if: github.event_name == 'release' - shell: bash - env: - GITHUB_TOKEN: ${{ github.TOKEN }} - run: | - gh release upload ${{github.event.release.tag_name}} ./build/regen-*.deb - - name: Upload debian package - if: github.event_name == 'push' - uses: actions/upload-artifact@v4 - with: - name: debian package - path: ./build/regen-*.deb - - name: Upload test results - uses: actions/upload-artifact@v4 - with: - name: test results - path: ./gtest-regen.xml diff --git a/.github/workflows/ci-macos.yml b/.github/workflows/ci-macos.yml new file mode 100644 index 0000000000..409bc2394c --- /dev/null +++ b/.github/workflows/ci-macos.yml @@ -0,0 +1,29 @@ +name: CI MacOS +on: + push: + branches: [ dev ] + pull_request: + branches: [ dev ] + release: + types: [published] + +jobs: + build-macos: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - name: Install dependencies + run: | + brew install cmake gcc simde boost assimp openal-soft freetype bullet glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest + - name: Build REGEN workspace + run: | + mkdir build + cd build + cmake \ + -DCMAKE_C_COMPILER=/opt/homebrew/bin/gcc-14 -DCMAKE_CXX_COMPILER=/opt/homebrew/bin/g++-14 \ + -DCMAKE_PREFIX_PATH="/opt/homebrew/opt/qt@5" \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_TESTS=ON \ + -DREGEN_EXTRA_INCLUDE_DIRS="/opt/homebrew/opt/openal-soft/include/" \ + ../ + make diff --git a/CMakeLists.txt b/CMakeLists.txt index 9284168bcf..281b34302c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,8 +75,10 @@ endif() # given in the books "Effective C++" and "More Effective C++" # add_definitions( -Weffc++ ) -if(UNIX) # gcc options - add_definitions( -mfpmath=sse -march=native ) +if(UNIX) + # -march=native enables all instruction subsets supported by the local machine + # e.g. SSE2, SSE3, SSE4, AVX, AVX2, etc. + add_definitions( -march=native ) endif() # perform more aggressive floating-point optimizations @@ -107,6 +109,10 @@ find_package(Boost ${Boost_MIN_VERSION} find_package(Threads REQUIRED) # Font library: text rendering support find_package(Freetype REQUIRED) +# Note: On Linux, it seems needed in addition to link against brotlidec +if (UNIX AND NOT APPLE) + set(FREETYPE_LIBRARIES ${FREETYPE_LIBRARIES} -lbrotlidec) +endif() # JSON library: for serialization find_package(nlohmann_json REQUIRED) set(JSON_LIBRARIES nlohmann_json::nlohmann_json) @@ -184,8 +190,10 @@ message(STATUS " Models:${ASSIMP_INCLUDE_DIRS};") message(STATUS " Fonts:${FREETYPE_INCLUDE_DIRS};") message(STATUS " Physics:${BULLET_INCLUDE_DIRS};") -enable_testing() -find_package(GTest REQUIRED) +if (BUILD_UNIT_TESTS) + enable_testing() + find_package(GTest REQUIRED) +endif () if (HAS_AV_LIBS) # allow includes without AL/ prefix. Some openAL versions require this. @@ -199,7 +207,7 @@ set(REGEN_LIBRARIES ${Boost_LIBRARIES} ${JSON_LIBRARIES} ${ASSIMP_LIBRARIES} - ${FREETYPE_LIBRARIES} -lbrotlidec + ${FREETYPE_LIBRARIES} ${IMG_LIBRARIES} ${AV_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} @@ -239,6 +247,10 @@ endif() ########### ########### +if(REGEN_EXTRA_INCLUDE_DIRS) + include_directories(${REGEN_EXTRA_INCLUDE_DIRS}) +endif() + # allow includes like even if the engine is not installed include_directories(.) 
# allow include of generated files @@ -260,19 +272,21 @@ install(FILES img/icon-small.png DESTINATION ${SHARE_INSTALL_PATH}/img) ########## Unit Testings ############## -# add an executable target for GTest. -# but the testing code is partly in the *knowrob* library -# where gtest won't find them without using the "--no-as-needed" -# flag for the linker. -add_executable(all_gtests - tests/gtests.cpp - tests/shapes/quad-tree-test.cpp) -target_link_libraries(all_gtests - -Wl,--whole-archive,--no-as-needed - regen - -Wl,--no-whole-archive - ${Boost_Python_COMPONENT} - ${GTEST_MAIN_LIBRARIES}) +if (BUILD_UNIT_TESTS) + # add an executable target for GTest. + # but the testing code is partly in the *knowrob* library + # where gtest won't find them without using the "--no-as-needed" + # flag for the linker. + add_executable(all_gtests + tests/gtests.cpp + tests/shapes/quad-tree-test.cpp) + target_link_libraries(all_gtests + -Wl,--whole-archive,--no-as-needed + regen + -Wl,--no-whole-archive + ${Boost_Python_COMPONENT} + ${GTEST_MAIN_LIBRARIES}) +endif () ############## ########## packaging diff --git a/README.md b/README.md index 0880beb1db..6ba1d8645b 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,13 @@

-![CI](https://github.com/daniel86/regen/workflows/CI/badge.svg) +![Linux](https://github.com/daniel86/regen/actions/workflows/ci-linux.yml/badge.svg) +![MacOS](https://github.com/daniel86/regen/actions/workflows/ci-macos.yml/badge.svg) +![Warnings](https://img.shields.io/badge/compiler%20warnings-clean-brightgreen) +[![Docs](https://img.shields.io/badge/docs-online-blue)](https://daniel86.github.io/regen/) +![Debian](https://img.shields.io/badge/debian-.deb%20package-blue) +![GitHub release](https://img.shields.io/github/v/release/daniel86/regen?include_prereleases) +![License](https://img.shields.io/github/license/daniel86/regen) `regen` -- **Real-time Graphics Engine** -- is a modular OpenGL-based C++ engine designed for research and experimentation in real-time rendering, GPU compute, and virtual world simulation. diff --git a/regen/compute/radix-sort-cpu.h b/regen/compute/radix-sort-cpu.h index 3374b8ebe4..6493311f7b 100644 --- a/regen/compute/radix-sort-cpu.h +++ b/regen/compute/radix-sort-cpu.h @@ -139,23 +139,23 @@ namespace regen { if constexpr (KEY_TYPE_BITS == 16) { // Gather 8 keys manually, and promote to 32-bit for (int k = 0; k < 8; ++k) tmpKeys32[k] = static_cast(keys[src[keyIdx+k]]); - r0 = _mm256_load_si256(reinterpret_cast(tmpKeys32)); - r0 = _mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask); + r0 = simde_mm256_load_si256(reinterpret_cast(tmpKeys32)); + r0 = simde_mm256_and_si256(simde_mm256_srli_epi32(r0, SHIFT), mask); keyIdx += 8; // processed 8 keys, not 16! } else if constexpr (KEY_TYPE_BITS == 32) { - simd::Register_i idx = _mm256_loadu_si256(reinterpret_cast(&src[keyIdx])); + simd::Register_i idx = simde_mm256_loadu_si256(reinterpret_cast(&src[keyIdx])); // Gather 8 scattered keys, and apply shift and mask to get bucket ids - r0 = _mm256_i32gather_epi32(reinterpret_cast(keys), idx, 4); - r0 = _mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask); + r0 = simde_mm256_i32gather_epi32(reinterpret_cast(keys), idx, 4); + r0 = simde_mm256_and_si256(simde_mm256_srli_epi32(r0, SHIFT), mask); keyIdx += KEYS_PER_SIMD_PASS; } else if constexpr (KEY_TYPE_BITS == 64) { // note: values have 32 bits, use __m128i to load only 4 - __m128i idx32 = _mm_loadu_si128(reinterpret_cast(&src[keyIdx])); + simde__m128i idx32 = simde_mm_loadu_si128(reinterpret_cast(&src[keyIdx])); // Gather 4 scattered keys, and apply shift and mask to get bucket ids - r0 = _mm256_i32gather_epi64(reinterpret_cast(keys), idx32, 8); - r0 = _mm256_and_si256(_mm256_srli_epi64(r0, SHIFT), mask); + r0 = simde_mm256_i32gather_epi64(reinterpret_cast(keys), idx32, 8); + r0 = simde_mm256_and_si256(simde_mm256_srli_epi64(r0, SHIFT), mask); keyIdx += KEYS_PER_SIMD_PASS; } else { @@ -163,7 +163,7 @@ namespace regen { break; } // Store results into tmpBins_ and increment histogram - _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmpBins_), r0); + simde_mm256_storeu_si256(reinterpret_cast(tmpBins_), r0); for (auto x : tmpBins_) ++histogram_[x]; } } diff --git a/regen/compute/simd.h b/regen/compute/simd.h index b47af9a8e8..ed06be8761 100644 --- a/regen/compute/simd.h +++ b/regen/compute/simd.h @@ -4,21 +4,31 @@ #include #include -// NOTE: Check for REGEN_HAS_SIMD, if it is not defined, the SIMD operations will be disabled -// and the code here will fall back to scalar operations. 
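The radix-sort change above replaces raw `_mm256_*` intrinsics with their `simde_*` counterparts, so the shift-and-mask bucket extraction in the histogram pass also compiles on non-x86 hosts. Below is a minimal standalone sketch of that one step, assuming SIMDe is installed; the key values, `SHIFT`, and the 8-bit `RADIX_MASK` are illustrative constants, not the engine's actual parameters.

```cpp
#include <cstdint>
#include <cstdio>
#include <simde/x86/avx2.h>

int main() {
    constexpr int SHIFT = 8;              // digit offset for this pass (assumed)
    constexpr int32_t RADIX_MASK = 0xFF;  // 8-bit digits (assumed)
    alignas(32) uint32_t keys[8] = {0x0102u, 0xA0B0u, 0x00FFu, 0x1234u,
                                    0xDEADu, 0xBEEFu, 0x0001u, 0xFF00u};
    alignas(32) uint32_t buckets[8];

    // Load 8 keys, shift out the lower digits, mask to the current digit.
    simde__m256i k    = simde_mm256_load_si256(reinterpret_cast<const simde__m256i*>(keys));
    simde__m256i mask = simde_mm256_set1_epi32(RADIX_MASK);
    simde__m256i b    = simde_mm256_and_si256(simde_mm256_srli_epi32(k, SHIFT), mask);
    simde_mm256_store_si256(reinterpret_cast<simde__m256i*>(buckets), b);

    // In radix-sort-cpu.h these bucket ids would feed ++histogram_[x].
    for (uint32_t x : buckets) std::printf("%u\n", static_cast<unsigned>(x));
    return 0;
}
```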
-// NOLINTBEGIN(portability-simd-intrinsics) -#if defined(__AVX__) - #include // AVX - #define REGEN_SIMD_MODE AVX - #define REGEN_SIMD_WIDTH 8 - #define REGEN_HAS_SIMD -#elif defined(__SSE__) - #include // SSE - #define REGEN_SIMD_MODE SSE - #define REGEN_SIMD_WIDTH 4 - #define REGEN_HAS_SIMD +#define SIMDE_ENABLE_NATIVE_ALIASES +#include +#include +#include +#include +#include +#include + +#define REGEN_SIMD_NONE 0 +#define REGEN_SIMD_SSE 1 +#define REGEN_SIMD_AVX 2 + +#if defined(SIMDE_NATURAL_VECTOR_SIZE) + #if SIMDE_NATURAL_VECTOR_SIZE >= 32 + #define REGEN_SIMD_MODE REGEN_SIMD_AVX + #define REGEN_SIMD_WIDTH 8 + #elif SIMDE_NATURAL_VECTOR_SIZE >= 16 + #define REGEN_SIMD_MODE REGEN_SIMD_SSE + #define REGEN_SIMD_WIDTH 4 + #else + #define REGEN_SIMD_MODE REGEN_SIMD_NONE + #define REGEN_SIMD_WIDTH 1 + #endif #else - #define REGEN_SIMD_MODE NONE + #define REGEN_SIMD_MODE REGEN_SIMD_NONE #define REGEN_SIMD_WIDTH 1 #endif @@ -32,236 +42,236 @@ namespace regen::simd { return bitIndex; } -#if REGEN_SIMD_MODE == AVX +#if REGEN_SIMD_MODE == REGEN_SIMD_AVX static constexpr int8_t RegisterMask = 0xFF; // 8 bits for AVX - using Register = __m256; // 8 floats - using Register_i = __m256i; // 8 integers + using Register = simde__m256; // 8 floats + using Register_i = simde__m256i; // 8 integers - inline __m256 set1_ps(float v) { return _mm256_set1_ps(v); } - inline __m256i set1_epi32(int32_t v) { return _mm256_set1_epi32(v); } - inline __m256i set1_epi16(uint16_t v) { return _mm256_set1_epi16(v); } - inline __m256i set1_epi64(int64_t v) { return _mm256_set1_epi64x(v); } - inline __m256i set1_epi64u(uint64_t v) { return _mm256_set1_epi64x(v); } + inline Register set1_ps(float v) { return simde_mm256_set1_ps(v); } + inline Register_i set1_epi32(int32_t v) { return simde_mm256_set1_epi32(v); } + inline Register_i set1_epi16(uint16_t v) { return simde_mm256_set1_epi16(v); } + inline Register_i set1_epi64(int64_t v) { return simde_mm256_set1_epi64x(v); } + inline Register_i set1_epi64u(uint64_t v) { return simde_mm256_set1_epi64x(v); } - inline __m256 setzero_ps() { return _mm256_setzero_ps(); } - inline __m256i setzero_si256() { return _mm256_setzero_si256(); } + inline Register setzero_ps() { return simde_mm256_setzero_ps(); } + inline Register_i setzero_si256() { return simde_mm256_setzero_si256(); } - inline __m256 load_ps(const float *p) { return _mm256_load_ps(p); } - inline __m256 loadu_ps(const float *p) { return _mm256_loadu_ps(p); } + inline Register load_ps(const float *p) { return simde_mm256_load_ps(p); } + inline Register loadu_ps(const float *p) { return simde_mm256_loadu_ps(p); } - inline __m256i load_si256(const uint16_t *p) { - return _mm256_load_si256(reinterpret_cast(p)); + inline Register_i load_si256(const uint16_t *p) { + return simde_mm256_load_si256(reinterpret_cast(p)); } - inline __m256i load_si256(const uint32_t *p) { - return _mm256_load_si256(reinterpret_cast(p)); + inline Register_i load_si256(const uint32_t *p) { + return simde_mm256_load_si256(reinterpret_cast(p)); } - inline __m256i load_si256(const uint64_t *p) { - return _mm256_load_si256(reinterpret_cast(p)); + inline Register_i load_si256(const uint64_t *p) { + return simde_mm256_load_si256(reinterpret_cast(p)); } - inline __m256i load_si256(const int32_t *p) { - return _mm256_load_si256(reinterpret_cast(p)); + inline Register_i load_si256(const int32_t *p) { + return simde_mm256_load_si256(reinterpret_cast(p)); } - inline __m256i loadu_si256(const uint16_t *p) { - return _mm256_loadu_si256(reinterpret_cast(p)); + 
inline Register_i loadu_si256(const uint16_t *p) { + return simde_mm256_loadu_si256(reinterpret_cast(p)); } - inline __m256i loadu_si256(const uint32_t *p) { - return _mm256_loadu_si256(reinterpret_cast(p)); + inline Register_i loadu_si256(const uint32_t *p) { + return simde_mm256_loadu_si256(reinterpret_cast(p)); } - inline __m256i loadu_si256(const uint64_t *p) { - return _mm256_loadu_si256(reinterpret_cast(p)); + inline Register_i loadu_si256(const uint64_t *p) { + return simde_mm256_loadu_si256(reinterpret_cast(p)); } - inline __m256i loadu_si256(const int32_t *p) { - return _mm256_loadu_si256(reinterpret_cast(p)); + inline Register_i loadu_si256(const int32_t *p) { + return simde_mm256_loadu_si256(reinterpret_cast(p)); } - inline __m256 epi_to_ps(const __m256i &v) { return _mm256_castsi256_ps(v); } + inline Register epi_to_ps(const Register_i &v) { return simde_mm256_castsi256_ps(v); } - inline __m256 i32gather_ps(const float *p, const __m256i &indices) { - return _mm256_i32gather_ps(p, indices, sizeof(float)); + inline Register i32gather_ps(const float *p, const Register_i &indices) { + return simde_mm256_i32gather_ps(p, indices, sizeof(float)); } - inline void storeu_ps(float *p, const __m256 &v) { _mm256_storeu_ps(p, v); } - inline void store_ps(float *p, const __m256 &v) { _mm256_store_ps(p, v); } + inline void storeu_ps(float *p, const Register &v) { simde_mm256_storeu_ps(p, v); } + inline void store_ps(float *p, const Register &v) { simde_mm256_store_ps(p, v); } - inline void storeu_epi32(int32_t *p, const __m256i &v) { - _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v); + inline void storeu_epi32(int32_t *p, const Register_i &v) { + simde_mm256_storeu_si256(reinterpret_cast(p), v); } - inline void storeu_epi32(uint32_t *p, const __m256i &v) { - _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v); + inline void storeu_epi32(uint32_t *p, const Register_i &v) { + simde_mm256_storeu_si256(reinterpret_cast(p), v); } - inline void store_epi32(int32_t *p, const __m256i &v) { - _mm256_store_si256(reinterpret_cast<__m256i*>(p), v); + inline void store_epi32(int32_t *p, const Register_i &v) { + simde_mm256_store_si256(reinterpret_cast(p), v); } - inline void store_epi32(uint32_t *p, const __m256i &v) { - _mm256_store_si256(reinterpret_cast<__m256i*>(p), v); + inline void store_epi32(uint32_t *p, const Register_i &v) { + simde_mm256_store_si256(reinterpret_cast(p), v); } - inline __m256 add_ps(const __m256 &a, const __m256 &b) { return _mm256_add_ps(a, b); } - inline __m256 sub_ps(const __m256 &a, const __m256 &b) { return _mm256_sub_ps(a, b); } - inline __m256 mul_ps(const __m256 &a, const __m256 &b) { return _mm256_mul_ps(a, b); } - inline __m256 div_ps(const __m256 &a, const __m256 &b) { return _mm256_div_ps(a, b); } + inline Register add_ps(const Register &a, const Register &b) { return simde_mm256_add_ps(a, b); } + inline Register sub_ps(const Register &a, const Register &b) { return simde_mm256_sub_ps(a, b); } + inline Register mul_ps(const Register &a, const Register &b) { return simde_mm256_mul_ps(a, b); } + inline Register div_ps(const Register &a, const Register &b) { return simde_mm256_div_ps(a, b); } /** * Fused multiply-add: (a * b) + c */ - inline __m256 mul_add_ps(const __m256 &a, const __m256 &b, const __m256 &c) { - return _mm256_fmadd_ps(a, b, c); + inline Register mul_add_ps(const Register &a, const Register &b, const Register &c) { + return simde_mm256_fmadd_ps(a, b, c); } - inline __m256i add_epi32(const __m256i &a, const __m256i &b) { return 
_mm256_add_epi32(a, b); } - inline __m256i sub_epi32(const __m256i &a, const __m256i &b) { return _mm256_sub_epi32(a, b); } - inline __m256i mul_epi32(const __m256i &a, const __m256i &b) { return _mm256_mullo_epi32(a, b); } + inline Register_i add_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_add_epi32(a, b); } + inline Register_i sub_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_sub_epi32(a, b); } + inline Register_i mul_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_mullo_epi32(a, b); } - inline __m256 min_ps(const __m256 &a, const __m256 &b) { return _mm256_min_ps(a, b); } - inline __m256 max_ps(const __m256 &a, const __m256 &b) { return _mm256_max_ps(a, b); } - inline __m256 sqrt_ps(const __m256 &a) { return _mm256_sqrt_ps(a); } + inline Register min_ps(const Register &a, const Register &b) { return simde_mm256_min_ps(a, b); } + inline Register max_ps(const Register &a, const Register &b) { return simde_mm256_max_ps(a, b); } + inline Register sqrt_ps(const Register &a) { return simde_mm256_sqrt_ps(a); } - inline __m256i min_epi32(const __m256i &a, const __m256i &b) { return _mm256_min_epi32(a, b); } - inline __m256i max_epi32(const __m256i &a, const __m256i &b) { return _mm256_max_epi32(a, b); } + inline Register_i min_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_min_epi32(a, b); } + inline Register_i max_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_max_epi32(a, b); } /** - * Horizontal sum of all elements in an __m256 + * Horizontal sum of all elements in an Register */ - inline float hsum_ps(__m256 v) { - __m128 vlow = _mm256_castps256_ps128(v); // low 128 - __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 - __m128 sum = _mm_add_ps(vlow, vhigh); // add low and high parts - __m128 shuf = _mm_movehdup_ps(sum); // (sum.y, sum.y, sum.w, sum.w) - __m128 sums = _mm_add_ps(sum, shuf); - shuf = _mm_movehl_ps(shuf, sums); // high half of sums - sums = _mm_add_ss(sums, shuf); - return _mm_cvtss_f32(sums); + inline float hsum_ps(Register v) { + simde__m128 vlow = simde_mm256_castps256_ps128(v); // low 128 + simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128 + simde__m128 sum = simde_mm_add_ps(vlow, vhigh); // add low and high parts + simde__m128 shuf = simde_mm_movehdup_ps(sum); // (sum.y, sum.y, sum.w, sum.w) + simde__m128 sums = simde_mm_add_ps(sum, shuf); + shuf = simde_mm_movehl_ps(shuf, sums); // high half of sums + sums = simde_mm_add_ss(sums, shuf); + return simde_mm_cvtss_f32(sums); } /** - * Horizontal min of all elements in an __m256 + * Horizontal min of all elements in an Register */ - inline float hmin_ps(__m256 v) { - __m128 vlow = _mm256_castps256_ps128(v); // low 128 - __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 - __m128 min = _mm_min_ps(vlow, vhigh); // min low and high parts - __m128 shuf = _mm_movehdup_ps(min); // (min.y, min.y, min.w, min.w) - __m128 mins = _mm_min_ps(min, shuf); - shuf = _mm_movehl_ps(shuf, mins); // high half of mins - mins = _mm_min_ss(mins, shuf); - return _mm_cvtss_f32(mins); + inline float hmin_ps(Register v) { + simde__m128 vlow = simde_mm256_castps256_ps128(v); // low 128 + simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128 + simde__m128 min = simde_mm_min_ps(vlow, vhigh); // min low and high parts + simde__m128 shuf = simde_mm_movehdup_ps(min); // (min.y, min.y, min.w, min.w) + simde__m128 mins = simde_mm_min_ps(min, shuf); + shuf = simde_mm_movehl_ps(shuf, mins); // high half of mins + 
mins = simde_mm_min_ss(mins, shuf); + return simde_mm_cvtss_f32(mins); } /** - * Horizontal max of all elements in an __m256 + * Horizontal max of all elements in an Register */ - inline float hmax_ps(__m256 v) { - __m128 vlow = _mm256_castps256_ps128(v); // low 128 - __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 - __m128 max = _mm_max_ps(vlow, vhigh); // max low and high parts - __m128 shuf = _mm_movehdup_ps(max); // (max.y, max.y, max.w, max.w) - __m128 maxs = _mm_max_ps(max, shuf); - shuf = _mm_movehl_ps(shuf, maxs); // high half of maxs - maxs = _mm_max_ss(maxs, shuf); - return _mm_cvtss_f32(maxs); + inline float hmax_ps(Register v) { + simde__m128 vlow = simde_mm256_castps256_ps128(v); // low 128 + simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128 + simde__m128 max = simde_mm_max_ps(vlow, vhigh); // max low and high parts + simde__m128 shuf = simde_mm_movehdup_ps(max); // (max.y, max.y, max.w, max.w) + simde__m128 maxs = simde_mm_max_ps(max, shuf); + shuf = simde_mm_movehl_ps(shuf, maxs); // high half of maxs + maxs = simde_mm_max_ss(maxs, shuf); + return simde_mm_cvtss_f32(maxs); } - inline __m256 rcp_ps(const __m256 &a) { return _mm256_rcp_ps(a); } + inline Register rcp_ps(const Register &a) { return simde_mm256_rcp_ps(a); } - inline __m256 cmp_lt(const __m256 &a, const __m256 &b) { - return _mm256_cmp_ps(a, b, _CMP_LT_OQ); + inline Register cmp_lt(const Register &a, const Register &b) { + return simde_mm256_cmp_ps(a, b, _CMP_LT_OQ); } - inline __m256 cmp_gt(const __m256 &a, const __m256 &b) { - return _mm256_cmp_ps(a, b, _CMP_GT_OQ); + inline Register cmp_gt(const Register &a, const Register &b) { + return simde_mm256_cmp_ps(a, b, _CMP_GT_OQ); } - inline __m256 cmp_eq(const __m256 &a, const __m256 &b) { - return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); + inline Register cmp_eq(const Register &a, const Register &b) { + return simde_mm256_cmp_ps(a, b, _CMP_EQ_OQ); } - inline __m256 cmp_neq(const __m256 &a, const __m256 &b) { - return _mm256_cmp_ps(a, b, _CMP_NEQ_OQ); + inline Register cmp_neq(const Register &a, const Register &b) { + return simde_mm256_cmp_ps(a, b, _CMP_NEQ_OQ); } - inline __m256 cmp_or(const __m256 &a, const __m256 &b) { - return _mm256_or_ps(a, b); + inline Register cmp_or(const Register &a, const Register &b) { + return simde_mm256_or_ps(a, b); } - inline __m256 cmp_and(const __m256 &a, const __m256 &b) { - return _mm256_and_ps(a, b); + inline Register cmp_and(const Register &a, const Register &b) { + return simde_mm256_and_ps(a, b); } - inline __m256 cmp_and_not(const __m256 &a, const __m256 &b) { - return _mm256_andnot_ps(a, b); + inline Register cmp_and_not(const Register &a, const Register &b) { + return simde_mm256_andnot_ps(a, b); } - inline __m256i cvttps_epi32(const __m256 &a) { return _mm256_cvttps_epi32(a); } + inline Register_i cvttps_epi32(const Register &a) { return simde_mm256_cvttps_epi32(a); } - inline int movemask_ps(const __m256 &v) { return _mm256_movemask_ps(v); } + inline int movemask_ps(const Register &v) { return simde_mm256_movemask_ps(v); } - inline __m256 blendv_ps(const __m256 &a, const __m256 &b, const __m256 &mask) { - return _mm256_blendv_ps(a, b, mask); + inline Register blendv_ps(const Register &a, const Register &b, const Register &mask) { + return simde_mm256_blendv_ps(a, b, mask); } -#elif REGEN_SIMD_MODE == SSE +#elif REGEN_SIMD_MODE == REGEN_SIMD_SSE static constexpr int8_t RegisterMask = 0x0F; // 4 bits for SSE - using Register = __m128; // 4 floats - using Register_i = __m128i; // 4 integers + using Register 
= simde__m128; // 4 floats + using Register_i = simde__m128i; // 4 integers - inline __m128 set1_ps(float v) { return _mm_set1_ps(v); } - inline __m128i set1_epi32(int32_t v) { return _mm_set1_epi32(v); } + inline Register set1_ps(float v) { return simde_mm_set1_ps(v); } + inline Register_i set1_epi32(int32_t v) { return simde_mm_set1_epi32(v); } - inline __m128 setzero_ps() { return _mm_setzero_ps(); } - inline __m128i setzero_si256() { return _mm_setzero_si128(); } + inline Register setzero_ps() { return simde_mm_setzero_ps(); } + inline Register_i setzero_si256() { return simde_mm_setzero_si128(); } - inline __m128 load_ps(const float *p) { return _mm_load_ps(p); } - inline __m128 loadu_ps(const float *p) { return _mm_loadu_ps(p); } + inline Register load_ps(const float *p) { return simde_mm_load_ps(p); } + inline Register loadu_ps(const float *p) { return simde_mm_loadu_ps(p); } - inline __m128i loadu_si256(const uint32_t *p) { - return _mm_loadu_si128(reinterpret_cast(indices)); + inline Register_i loadu_si256(const uint32_t *p) { + return simde_mm_loadu_si128(reinterpret_cast(p)); } - inline __m128 epi_to_ps(const __m128i &v) { return _mm_castsi128_ps(v); } + inline Register epi_to_ps(const Register_i &v) { return simde_mm_castsi128_ps(v); } - inline __m128 i32gather_ps(const float *p, const __m128i &indices) { - return _mm_i32gather_ps(p, indices, sizeof(float)); + inline Register i32gather_ps(const float *p, const Register_i &indices) { + return simde_mm_i32gather_ps(p, indices, sizeof(float)); } - inline void storeu_ps(float *p, const __m128 &v) { _mm_storeu_ps(p, v); } + inline void storeu_ps(float *p, const Register &v) { simde_mm_storeu_ps(p, v); } - inline __m128 add_ps(const __m128 &a, const __m128 &b) { return _mm_add_ps(a, b); } - inline __m128 sub_ps(const __m128 &a, const __m128 &b) { return _mm_sub_ps(a, b); } - inline __m128 mul_ps(const __m128 &a, const __m128 &b) { return _mm_mul_ps(a, b); } - inline __m128 div_ps(const __m128 &a, const __m128 &b) { return _mm_div_ps(a, b); } + inline Register add_ps(const Register &a, const Register &b) { return simde_mm_add_ps(a, b); } + inline Register sub_ps(const Register &a, const Register &b) { return simde_mm_sub_ps(a, b); } + inline Register mul_ps(const Register &a, const Register &b) { return simde_mm_mul_ps(a, b); } + inline Register div_ps(const Register &a, const Register &b) { return simde_mm_div_ps(a, b); } - inline __m128i add_epi32(const __m128i &a, const __m128i &b) { return _mm_add_epi32(a, b); } - inline __m128i sub_epi32(const __m128i &a, const __m128i &b) { return _mm_sub_epi32(a, b); } - inline __m128i mul_epi32(const __m128i &a, const __m128i &b) { return _mm_mullo_epi32(a, b); } + inline Register_i add_epi32(const Register_i &a, const Register_i &b) { return simde_mm_add_epi32(a, b); } + inline Register_i sub_epi32(const Register_i &a, const Register_i &b) { return simde_mm_sub_epi32(a, b); } + inline Register_i mul_epi32(const Register_i &a, const Register_i &b) { return simde_mm_mullo_epi32(a, b); } - inline __m128 min_ps(const __m128 &a, const __m128 &b) { return _mm_min_ps(a, b); } - inline __m128 max_ps(const __m128 &a, const __m128 &b) { return _mm_max_ps(a, b); } - inline __m128 sqrt_ps(const __m128 &a) { return _mm_sqrt_ps(a); } + inline Register min_ps(const Register &a, const Register &b) { return simde_mm_min_ps(a, b); } + inline Register max_ps(const Register &a, const Register &b) { return simde_mm_max_ps(a, b); } + inline Register sqrt_ps(const Register &a) { return simde_mm_sqrt_ps(a); } - 
inline __m128i min_epi32(const __m128i &a, const __m128i &b) { return _mm_min_epi32(a, b); } - inline __m128i max_epi32(const __m128i &a, const __m128i &b) { return _mm_max_epi32(a, b); } + inline Register_i min_epi32(const Register_i &a, const Register_i &b) { return simde_mm_min_epi32(a, b); } + inline Register_i max_epi32(const Register_i &a, const Register_i &b) { return simde_mm_max_epi32(a, b); } - inline float hsum_ps(__m128 v) { - __m128 shuf = _mm_movehdup_ps(v); // (v1, v1, v3, v3) - __m128 sums = _mm_add_ps(v, shuf); - shuf = _mm_movehl_ps(shuf, sums); // (v2 + v3, v3, -, -) - sums = _mm_add_ss(sums, shuf); - return _mm_cvtss_f32(sums); + inline float hsum_ps(Register v) { + Register shuf = simde_mm_movehdup_ps(v); // (v1, v1, v3, v3) + Register sums = simde_mm_add_ps(v, shuf); + shuf = simde_mm_movehl_ps(shuf, sums); // (v2 + v3, v3, -, -) + sums = simde_mm_add_ss(sums, shuf); + return simde_mm_cvtss_f32(sums); } - inline __m128 rcp_ps(const __m128 &a) { return _mm_rcp_ps(a); } + inline Register rcp_ps(const Register &a) { return simde_mm_rcp_ps(a); } - inline __m128 cmp_lt(const __m128 &a, const __m128 &b) { return _mm_cmplt_ps(a, b); } - inline __m128 cmp_gt(const __m128 &a, const __m128 &b) { return _mm_cmplt_ps(b, a); } - inline __m128 cmp_eq(const __m128 &a, const __m128 &b) { return _mm_cmpeq_ps(a, b); } - inline __m128 cmp_neq(const __m128 &a, const __m128 &b) { - __m128 eq = _mm_cmpeq_ps(a, b); - return _mm_andnot_ps(eq, _mm_castsi128_ps(_mm_set1_epi32(-1))); // ~eq & all_ones + inline Register cmp_lt(const Register &a, const Register &b) { return simde_mm_cmplt_ps(a, b); } + inline Register cmp_gt(const Register &a, const Register &b) { return simde_mm_cmplt_ps(b, a); } + inline Register cmp_eq(const Register &a, const Register &b) { return simde_mm_cmpeq_ps(a, b); } + inline Register cmp_neq(const Register &a, const Register &b) { + Register eq = simde_mm_cmpeq_ps(a, b); + return simde_mm_andnot_ps(eq, simde_mm_castsi128_ps(simde_mm_set1_epi32(-1))); // ~eq & all_ones } - inline __m128 cmp_or(const __m128 &a, const __m128 &b) { return _mm_or_ps(a, b); } - inline __m128 cmp_and(const __m128 &a, const __m128 &b) { return _mm_and_ps(a, b); } + inline Register cmp_or(const Register &a, const Register &b) { return simde_mm_or_ps(a, b); } + inline Register cmp_and(const Register &a, const Register &b) { return simde_mm_and_ps(a, b); } - inline __m128i cvttps_epi32(const __m128 &a) { return _mm_cvttps_epi32(a); } + inline Register_i cvttps_epi32(const Register &a) { return simde_mm_cvttps_epi32(a); } - inline int movemask_ps(const __m128 &v) { return _mm_movemask_ps(v); } + inline int movemask_ps(const Register &v) { return simde_mm_movemask_ps(v); } - inline __m128 blendv_ps(const __m128 &a, const __m128 &b, const __m128 &mask) { - return _mm_blendv_ps(a, b, mask); + inline Register blendv_ps(const Register &a, const Register &b, const Register &mask) { + return simde_mm_blendv_ps(a, b, mask); } #else // Fallback to scalar operations @@ -752,7 +762,7 @@ namespace regen { } static BatchOf_int32 castFloatBatch(const BatchOf_float &v) { - return BatchOf_int32{_mm256_castps_si256(v.c)}; + return BatchOf_int32{simde_mm256_castps_si256(v.c)}; } /** @@ -832,7 +842,7 @@ namespace regen { } BatchOf_int32 operator&(const BatchOf_int32 &other) const { - return BatchOf_int32{_mm256_and_si256(c, other.c)}; + return BatchOf_int32{simde_mm256_and_si256(c, other.c)}; } static BatchOf_int32 allZeros() { @@ -1199,7 +1209,7 @@ namespace regen { /** * Computes the length squared of each vector 
in the batch. - * @return __m128 containing the length squared for each vector. + * @return Register containing the length squared for each vector. */ BatchOf_float lengthSquared() const { return x*x + y*y + z*z; @@ -1272,6 +1282,4 @@ namespace regen { template using vectorSIMD = std::vector>; } -// NOLINTEND(portability-simd-intrinsics) - #endif /* REGEN_SIMD_H_ */ diff --git a/regen/compute/threading.h b/regen/compute/threading.h index 250905e299..9a2d5c2082 100644 --- a/regen/compute/threading.h +++ b/regen/compute/threading.h @@ -9,8 +9,8 @@ #include "regen/memory/aligned-allocator.h" #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) - #include - #define CPU_PAUSE() _mm_pause() + #include + #define CPU_PAUSE() simde_mm_pause() #elif defined(__aarch64__) || defined(__arm__) #define CPU_PAUSE() asm volatile("yield" ::: "memory") #else @@ -373,7 +373,7 @@ namespace regen { #if 0 int spins = 0; while (numJobsRemaining_.load(std::memory_order_acquire) > 0u) { - if (++spins < 1000) _mm_pause(); + if (++spins < 1000) simde_mm_pause(); else std::this_thread::yield(); } #else diff --git a/regen/shader/includer.h b/regen/shader/includer.h index fc0ea12a88..5be3ec9a50 100644 --- a/regen/shader/includer.h +++ b/regen/shader/includer.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace regen { /**
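Taken together, the `simd.h` rewrite means engine code no longer names `__m256`/`__m128` directly; it programs against `Register`/`Register_i` and the wrapper functions above, and the width is picked at compile time from `SIMDE_NATURAL_VECTOR_SIZE`. A width-agnostic sketch of that usage pattern follows; `clampArray` is a hypothetical helper, not engine code, and it assumes the scalar `#else` branch (whose body is not shown in this diff) exposes the same function names.

```cpp
#include <cstddef>
#include "regen/compute/simd.h"

// Clamp every element of `data` to [lo, hi]; the same loop body compiles
// as 8-wide AVX, 4-wide SSE, or scalar, depending on REGEN_SIMD_WIDTH.
void clampArray(float *data, std::size_t n, float lo, float hi) {
    using namespace regen::simd;
    const Register vlo = set1_ps(lo);
    const Register vhi = set1_ps(hi);
    std::size_t i = 0;
    for (; i + REGEN_SIMD_WIDTH <= n; i += REGEN_SIMD_WIDTH) {
        Register v = loadu_ps(data + i);
        storeu_ps(data + i, min_ps(max_ps(v, vlo), vhi));  // clamp all lanes at once
    }
    for (; i < n; ++i)                                      // scalar tail
        data[i] = data[i] < lo ? lo : (data[i] > hi ? hi : data[i]);
}
```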
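The `threading.h` hunk likewise swaps `_mm_pause()` for `simde_mm_pause()` behind the `CPU_PAUSE()` macro, keeping the spin-then-yield idiom (shown under `#if 0` in the diff) portable. An illustrative standalone version of that idiom, not taken from the engine:

```cpp
#include <atomic>
#include <thread>
#include <simde/x86/sse2.h>

// Spin briefly with a pause hint, then fall back to yielding the thread.
void waitUntilZero(std::atomic<unsigned> &remaining) {
    int spins = 0;
    while (remaining.load(std::memory_order_acquire) > 0u) {
        if (++spins < 1000) simde_mm_pause();   // cheap hint to the CPU pipeline
        else std::this_thread::yield();         // hand control back to the scheduler
    }
}
```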