From 65edde95533c18b12d623fcfde7fffb2992913d0 Mon Sep 17 00:00:00 2001
From: kranks-uga <krepin1001@gmail.com>
Date: Mon, 25 May 2026 18:08:59 +0300
Subject: [PATCH 1/3] =?UTF-8?q?perf:=20=D0=BF=D0=B0=D1=80=D0=B0=D0=BB?=
 =?UTF-8?q?=D0=BB=D0=B5=D0=BB=D1=8C=D0=BD=D1=8B=D0=B9=20=D1=80=D0=B0=D1=81?=
 =?UTF-8?q?=D1=87=D1=91=D1=82=20=D1=84=D0=B8=D0=B7=D0=B8=D0=BA=D0=B8=20?=
 =?UTF-8?q?=D1=87=D0=B5=D1=80=D0=B5=D0=B7=20OpenMP?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CMakeLists.txt                              |  11 +++
 Engine/NeighborSearch/NeighborList.cpp      |  22 +++--
 Engine/physics/ForceField.cpp               | 104 +++++++++++++++++---
 Engine/physics/integrators/VerletScheme.cpp |   4 +-
 4 files changed, 120 insertions(+), 21 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c680d83..6e3c2ad8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,7 @@ set(APP_VERSION_STRING "${PROJECT_VERSION}")
 option(OPTIMIZE_FOR_NATIVE "Enable native CPU optimizations outside Debug builds" OFF)
 option(BUILD_BENCHMARKS "Build benchmarks" OFF)
 option(ENABLE_IPO "Enable link-time optimization for non-Debug builds when supported" ON)
+option(ENABLE_OPENMP "Enable OpenMP multithreading for physics simulation" ON)
 
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -65,6 +66,16 @@ add_dependencies(latticelab_lib embedded_resources)
 include(shaders)
 add_dependencies(latticelab_lib wgsl_shaders)
 
+if(ENABLE_OPENMP)
+    find_package(OpenMP)
+    if(OpenMP_CXX_FOUND)
+        target_link_libraries(latticelab_lib PUBLIC OpenMP::OpenMP_CXX)
+        message(STATUS "OpenMP: enabled (${OpenMP_CXX_VERSION})")
+    else()
+        message(WARNING "OpenMP not found — building without multithreading")
+    endif()
+endif()
+
 target_compile_options(latticelab_lib PRIVATE
     $<$<AND:$<BOOL:${OPTIMIZE_FOR_NATIVE}>,$<NOT:$<CONFIG:Debug>>,$<CXX_COMPILER_ID:GNU,Clang,AppleClang>>:-march=native>
 )
diff --git a/Engine/NeighborSearch/NeighborList.cpp b/Engine/NeighborSearch/NeighborList.cpp
index e4b13a26..dbaac4af 100644
--- a/Engine/NeighborSearch/NeighborList.cpp
+++ b/Engine/NeighborSearch/NeighborList.cpp
@@ -69,14 +69,24 @@ void NeighborList::build(const AtomStorage& atoms, SimBox& box) {
 
     reserveListBuffers(atoms);
 
+    // Фаза 1: каждый поток строит список соседей своих атомов независимо
+    std::vector<std::vector<uint32_t>> perAtom(atomCount);
+#pragma omp parallel for schedule(dynamic, 64)
+    for (uint32_t i = 0; i < atomCount; ++i) {
+        writeAtomNeighbors(grid, x, y, z, i, x[i], y[i], z[i], perAtom[i]);
+    }
+
+    // Фаза 2: вычисляем смещения (prefix sum) — последовательно
     offsets_[0] = 0;
     for (uint32_t i = 0; i < atomCount; ++i) {
-        const float xi = x[i];
-        const float yi = y[i];
-        const float zi = z[i];
-        // запись всех соседей атома в массив
-        writeAtomNeighbors(grid, x, y, z, i, xi, yi, zi, neighbors_);
-        offsets_[i + 1] = neighbors_.size();
+        offsets_[i + 1] = offsets_[i] + static_cast<uint32_t>(perAtom[i].size());
+    }
+    neighbors_.resize(offsets_[atomCount]);
+
+    // Фаза 3: копируем в плоский массив — снова параллельно
+#pragma omp parallel for schedule(static)
+    for (uint32_t i = 0; i < atomCount; ++i) {
+        std::copy(perAtom[i].begin(), perAtom[i].end(), neighbors_.begin() + offsets_[i]);
     }
 
     std::copy(x, x + atoms.mobileCount(), refPosX_.data());
diff --git a/Engine/physics/ForceField.cpp b/Engine/physics/ForceField.cpp
index b2316cc0..d452afe0 100644
--- a/Engine/physics/ForceField.cpp
+++ b/Engine/physics/ForceField.cpp
@@ -1,18 +1,53 @@
 #include "ForceField.h"
 
+#include <cmath>
+#include <vector>
+
+#include "Engine/Consts.h"
 #include "Engine/NeighborSearch/NeighborList.h"
 #include "Engine/metrics/Profiler.h"
+#include "Engine/physics/ForceFields/CoulombForceField.h"
+#include "Engine/restrict.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 
 namespace {
     template <bool UseLJ, bool UseCoulomb>
     void computePairInteractionsImpl(AtomStorage& atoms, NeighborList& neighborList, const LJForceField& ljForceField,
                                      const CoulombForceField& coulombForceField) {
-        const auto& offsets = neighborList.offsets();
+        const size_t totalN  = atoms.size();
+        const size_t mobileN = atoms.mobileCount();
+        const auto& offsets    = neighborList.offsets();
         const auto& neighbours = neighborList.neighbors();
 
-        for (size_t atomIndex = 0; atomIndex < atoms.mobileCount(); ++atomIndex) {
+        int numThreads = 1;
+#ifdef _OPENMP
+        numThreads = omp_get_max_threads();
+#endif
+
+        // Каждый поток накапливает силы в свой срез массива (thread t, atom i → base + i).
+        // Это исключает гонки данных при использовании третьего закона Ньютона.
+        std::vector<float> tFx(numThreads * totalN, 0.0f);
+        std::vector<float> tFy(numThreads * totalN, 0.0f);
+        std::vector<float> tFz(numThreads * totalN, 0.0f);
+        std::vector<float> tE (numThreads * totalN, 0.0f);
+
+#pragma omp parallel for schedule(dynamic, 64)
+        for (size_t atomIndex = 0; atomIndex < mobileN; ++atomIndex) {
+#ifdef _OPENMP
+            const int tid = omp_get_thread_num();
+#else
+            const int tid = 0;
+#endif
+            float* RESTRICT localFx = tFx.data() + tid * totalN;
+            float* RESTRICT localFy = tFy.data() + tid * totalN;
+            float* RESTRICT localFz = tFz.data() + tid * totalN;
+            float* RESTRICT localE  = tE .data() + tid * totalN;
+
             const uint32_t begin = offsets[atomIndex];
-            const uint32_t end = offsets[atomIndex + 1];
+            const uint32_t end   = offsets[atomIndex + 1];
             if (begin > end || static_cast<size_t>(end) > neighbours.size()) {
                 continue;
             }
@@ -20,10 +55,7 @@ namespace {
             const float posX = atoms.posX(atomIndex);
             const float posY = atoms.posY(atomIndex);
             const float posZ = atoms.posZ(atomIndex);
-            float forceX = atoms.forceX(atomIndex);
-            float forceY = atoms.forceY(atomIndex);
-            float forceZ = atoms.forceZ(atomIndex);
-            float potentialEnergy = atoms.energy(atomIndex);
+            float myFx = 0.0f, myFy = 0.0f, myFz = 0.0f, myE = 0.0f;
 
             const LJForceField::LJPairRow* ljPairRow = nullptr;
             if constexpr (UseLJ) {
@@ -48,19 +80,65 @@ namespace {
                 const float d2 = dx * dx + dy * dy + dz * dz;
 
                 if constexpr (UseLJ) {
-                    ljForceField.pairInteraction(atoms, bIndex, dx, dy, dz, d2, *ljPairRow, forceX, forceY, forceZ, potentialEnergy);
+                    if (d2 > Consts::Epsilon) {
+                        const auto& params = (*ljPairRow)[static_cast<size_t>(atoms.type(bIndex))];
+                        const float invD2  = 1.0f / d2;
+                        const float invD6  = invD2 * invD2 * invD2;
+                        const float invD12 = invD6 * invD6;
+                        const float term6  = params.potentialC6  * invD6;
+                        const float term12 = params.potentialC12 * invD12;
+                        const float fScale = (12.0f * term12 - 6.0f * term6) * invD2;
+                        const float potential = term12 - term6;
+                        const float pfx = dx * fScale;
+                        const float pfy = dy * fScale;
+                        const float pfz = dz * fScale;
+
+                        myFx -= pfx;  myFy -= pfy;  myFz -= pfz;
+                        myE  += 0.5f * potential;
+                        localFx[bIndex] += pfx;
+                        localFy[bIndex] += pfy;
+                        localFz[bIndex] += pfz;
+                        localE [bIndex] += 0.5f * potential;
+                    }
                 }
                 if constexpr (UseCoulomb) {
                     if (charge != 0.0f) {
-                        coulombForceField.pairInteraction(atoms, bIndex, dx, dy, dz, d2, charge, forceX, forceY, forceZ, potentialEnergy);
+                        const float chargeB = atoms.charge(bIndex);
+                        if (chargeB != 0.0f && d2 > Consts::Epsilon) {
+                            const float qqScale  = CoulombForceField::kCoulombEvAngstrom * charge * chargeB;
+                            const float invR     = 1.0f / std::sqrt(d2);
+                            const float fScale   = qqScale * invR / d2;
+                            const float potential = qqScale * invR;
+                            const float pfx = dx * fScale;
+                            const float pfy = dy * fScale;
+                            const float pfz = dz * fScale;
+
+                            myFx -= pfx;  myFy -= pfy;  myFz -= pfz;
+                            myE  += 0.5f * potential;
+                            localFx[bIndex] += pfx;
+                            localFy[bIndex] += pfy;
+                            localFz[bIndex] += pfz;
+                            localE [bIndex] += 0.5f * potential;
+                        }
                     }
                 }
             }
 
-            atoms.forceX(atomIndex) = forceX;
-            atoms.forceY(atomIndex) = forceY;
-            atoms.forceZ(atomIndex) = forceZ;
-            atoms.energy(atomIndex) = potentialEnergy;
+            localFx[atomIndex] += myFx;
+            localFy[atomIndex] += myFy;
+            localFz[atomIndex] += myFz;
+            localE [atomIndex] += myE;
+        }
+
+        // Суммируем вклады всех потоков в атомы
+        for (int t = 0; t < numThreads; ++t) {
+            const size_t base = static_cast<size_t>(t) * totalN;
+            for (size_t i = 0; i < mobileN; ++i) {
+                atoms.forceX(i) += tFx[base + i];
+                atoms.forceY(i) += tFy[base + i];
+                atoms.forceZ(i) += tFz[base + i];
+                atoms.energy(i) += tE [base + i];
+            }
         }
     }
 }
diff --git a/Engine/physics/integrators/VerletScheme.cpp b/Engine/physics/integrators/VerletScheme.cpp
index ef80c977..5cba99eb 100644
--- a/Engine/physics/integrators/VerletScheme.cpp
+++ b/Engine/physics/integrators/VerletScheme.cpp
@@ -30,7 +30,7 @@ void VerletScheme::predict(AtomStorage& atomStorage, float dt) {
 
     const float* RESTRICT invMass = atomStorage.invMassData();
 
-#pragma GCC ivdep
+#pragma omp parallel for schedule(static)
     for (size_t i = 0; i < n; ++i) {
         x[i] += (vx[i] + fx[i] * invMass[i] * 0.5f * dt) * dt;
         y[i] += (vy[i] + fy[i] * invMass[i] * 0.5f * dt) * dt;
@@ -56,7 +56,7 @@ void VerletScheme::correct(AtomStorage& atomStorage, float accelDamping, float d
 
     const float* RESTRICT invMass = atomStorage.invMassData();
 
-#pragma GCC ivdep
+#pragma omp parallel for schedule(static)
     for (size_t i = 0; i < n; ++i) {
         const float halfDtInvMass = 0.5f * accelDamping * dt * invMass[i];
 

From 4df58ea7bac87158b34484fe44401fd900f7272b Mon Sep 17 00:00:00 2001
From: kranks-uga <krepin1001@gmail.com>
Date: Thu, 28 May 2026 13:34:00 +0300
Subject: [PATCH 2/3] perf: replace OpenMP with TBB, adapt to World
 architecture

---
 CMakeLists.txt                              |  37 +++--
 Engine/NeighborSearch/NeighborList.cpp      |  35 +++--
 Engine/physics/ForceField.cpp               | 145 ++++++--------------
 Engine/physics/integrators/VerletScheme.cpp |  32 ++++-
 4 files changed, 128 insertions(+), 121 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6e3c2ad8..f881972a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,7 +11,7 @@ set(APP_VERSION_STRING "${PROJECT_VERSION}")
 option(OPTIMIZE_FOR_NATIVE "Enable native CPU optimizations outside Debug builds" OFF)
 option(BUILD_BENCHMARKS "Build benchmarks" OFF)
 option(ENABLE_IPO "Enable link-time optimization for non-Debug builds when supported" ON)
-option(ENABLE_OPENMP "Enable OpenMP multithreading for physics simulation" ON)
+option(ENABLE_TBB "Enable Intel oneAPI TBB multithreading for physics simulation" ON)
 
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -66,14 +66,14 @@ add_dependencies(latticelab_lib embedded_resources)
 include(shaders)
 add_dependencies(latticelab_lib wgsl_shaders)
 
-if(ENABLE_OPENMP)
-    find_package(OpenMP)
-    if(OpenMP_CXX_FOUND)
-        target_link_libraries(latticelab_lib PUBLIC OpenMP::OpenMP_CXX)
-        message(STATUS "OpenMP: enabled (${OpenMP_CXX_VERSION})")
-    else()
-        message(WARNING "OpenMP not found — building without multithreading")
-    endif()
+include(embedded_translate)
+add_dependencies(latticelab_lib embedded_translate)
+
+if(ENABLE_TBB)
+    find_package(TBB REQUIRED)
+    target_link_libraries(latticelab_lib PUBLIC TBB::tbb)
+    target_compile_definitions(latticelab_lib PUBLIC ENABLE_TBB)
+    message(STATUS "TBB: enabled")
 endif()
 
 target_compile_options(latticelab_lib PRIVATE
@@ -94,6 +94,25 @@ target_include_directories(latticelab_lib PUBLIC
     ${webgpu_cpp_SOURCE_DIR}/wgpu-native
 )
 
+if(APPLE)
+    find_library(COCOA_FRAMEWORK Cocoa)
+    find_library(METAL_FRAMEWORK Metal)
+    find_library(QUARTZCORE_FRAMEWORK QuartzCore)
+
+    target_sources(latticelab_lib PRIVATE
+            Rendering/MetalBackend.mm
+    )
+
+    target_link_libraries(latticelab_lib PUBLIC
+        "-framework Cocoa"
+        "-framework QuartzCore"
+        "-framework Metal"
+        ${COCOA_FRAMEWORK}
+        ${METAL_FRAMEWORK}
+        ${QUARTZCORE_FRAMEWORK}
+    )
+endif()
+
 target_link_libraries(latticelab_lib PUBLIC
     glfw
     imgui ImGuiFileDialog_lib
diff --git a/Engine/NeighborSearch/NeighborList.cpp b/Engine/NeighborSearch/NeighborList.cpp
index dbaac4af..8f0b396b 100644
--- a/Engine/NeighborSearch/NeighborList.cpp
+++ b/Engine/NeighborSearch/NeighborList.cpp
@@ -6,11 +6,16 @@
 #include <vector>
 
 #include "Engine/NeighborSearch/SpatialGrid.h"
-#include "Engine/SimBox.h"
+#include "Engine/World.h"
 #include "Engine/metrics/Profiler.h"
 #include "Engine/physics/AtomStorage.h"
 #include "Engine/restrict.h"
 
+#ifdef ENABLE_TBB
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#endif
+
 void NeighborList::setCutoff(float cutoff) {
     cutoff_ = cutoff;
     listRadius_ = cutoff_ + skin_;
@@ -48,20 +53,20 @@ void NeighborList::clear() {
     resetStats();
 }
 
-void NeighborList::rebuildPipeline(const AtomStorage& atoms, SimBox& box, int simStep) {
+void NeighborList::rebuildPipeline(const AtomStorage& atoms, World& world, int simStep) {
     // перестройка пространственной сетки
-    box.grid.rebuild(atoms.xDataSpan(), atoms.yDataSpan(), atoms.zDataSpan());
+    world.getGrid().rebuild(atoms.xDataSpan(), atoms.yDataSpan(), atoms.zDataSpan());
     // перестройка списка соседей
-    build(atoms, box);
+    build(atoms, world);
     // обновление метрик
     const float rebuildTimeMs = static_cast<float>(Profiler::instance().lastMs("NeighborList::build"));
     stats_.recordRebuild(simStep, rebuildTimeMs);
 }
 
-void NeighborList::build(const AtomStorage& atoms, SimBox& box) {
+void NeighborList::build(const AtomStorage& atoms, World& box) {
     PROFILE_SCOPE("NeighborList::build");
 
-    const SpatialGrid& grid = box.grid;
+    const SpatialGrid& grid = box.getGrid();
     const uint32_t atomCount = static_cast<uint32_t>(atoms.size());
     const float* RESTRICT x = atoms.xData();
     const float* RESTRICT y = atoms.yData();
@@ -71,10 +76,17 @@ void NeighborList::build(const AtomStorage& atoms, SimBox& box) {
 
     // Фаза 1: каждый поток строит список соседей своих атомов независимо
     std::vector<std::vector<uint32_t>> perAtom(atomCount);
-#pragma omp parallel for schedule(dynamic, 64)
+#ifdef ENABLE_TBB
+    tbb::parallel_for(tbb::blocked_range<uint32_t>(0, atomCount, 64),
+        [&](const tbb::blocked_range<uint32_t>& r) {
+            for (uint32_t i = r.begin(); i != r.end(); ++i)
+                writeAtomNeighbors(grid, x, y, z, i, x[i], y[i], z[i], perAtom[i]);
+        });
+#else
     for (uint32_t i = 0; i < atomCount; ++i) {
         writeAtomNeighbors(grid, x, y, z, i, x[i], y[i], z[i], perAtom[i]);
     }
+#endif
 
     // Фаза 2: вычисляем смещения (prefix sum) — последовательно
     offsets_[0] = 0;
@@ -84,10 +96,17 @@ void NeighborList::build(const AtomStorage& atoms, SimBox& box) {
     neighbors_.resize(offsets_[atomCount]);
 
     // Фаза 3: копируем в плоский массив — снова параллельно
-#pragma omp parallel for schedule(static)
+#ifdef ENABLE_TBB
+    tbb::parallel_for(tbb::blocked_range<uint32_t>(0, atomCount),
+        [&](const tbb::blocked_range<uint32_t>& r) {
+            for (uint32_t i = r.begin(); i != r.end(); ++i)
+                std::copy(perAtom[i].begin(), perAtom[i].end(), neighbors_.begin() + offsets_[i]);
+        });
+#else
     for (uint32_t i = 0; i < atomCount; ++i) {
         std::copy(perAtom[i].begin(), perAtom[i].end(), neighbors_.begin() + offsets_[i]);
     }
+#endif
 
     std::copy(x, x + atoms.mobileCount(), refPosX_.data());
     std::copy(y, y + atoms.mobileCount(), refPosY_.data());
diff --git a/Engine/physics/ForceField.cpp b/Engine/physics/ForceField.cpp
index d452afe0..4669b361 100644
--- a/Engine/physics/ForceField.cpp
+++ b/Engine/physics/ForceField.cpp
@@ -1,61 +1,37 @@
 #include "ForceField.h"
 
-#include <cmath>
-#include <vector>
-
-#include "Engine/Consts.h"
 #include "Engine/NeighborSearch/NeighborList.h"
+#include "Engine/World.h"
 #include "Engine/metrics/Profiler.h"
-#include "Engine/physics/ForceFields/CoulombForceField.h"
-#include "Engine/restrict.h"
+#include "Engine/physics/AtomStorage.h"
 
-#ifdef _OPENMP
-#include <omp.h>
+#ifdef ENABLE_TBB
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
 #endif
 
 namespace {
     template <bool UseLJ, bool UseCoulomb>
-    void computePairInteractionsImpl(AtomStorage& atoms, NeighborList& neighborList, const LJForceField& ljForceField,
+    void computePairInteractionsImpl(AtomStorage& atoms, const NeighborList& neighborList, const LJForceField& ljForceField,
                                      const CoulombForceField& coulombForceField) {
-        const size_t totalN  = atoms.size();
-        const size_t mobileN = atoms.mobileCount();
         const auto& offsets    = neighborList.offsets();
         const auto& neighbours = neighborList.neighbors();
+        const size_t mobileN   = atoms.mobileCount();
 
-        int numThreads = 1;
-#ifdef _OPENMP
-        numThreads = omp_get_max_threads();
-#endif
-
-        // Каждый поток накапливает силы в свой срез массива (thread t, atom i → base + i).
-        // Это исключает гонки данных при использовании третьего закона Ньютона.
-        std::vector<float> tFx(numThreads * totalN, 0.0f);
-        std::vector<float> tFy(numThreads * totalN, 0.0f);
-        std::vector<float> tFz(numThreads * totalN, 0.0f);
-        std::vector<float> tE (numThreads * totalN, 0.0f);
-
-#pragma omp parallel for schedule(dynamic, 64)
-        for (size_t atomIndex = 0; atomIndex < mobileN; ++atomIndex) {
-#ifdef _OPENMP
-            const int tid = omp_get_thread_num();
-#else
-            const int tid = 0;
-#endif
-            float* RESTRICT localFx = tFx.data() + tid * totalN;
-            float* RESTRICT localFy = tFy.data() + tid * totalN;
-            float* RESTRICT localFz = tFz.data() + tid * totalN;
-            float* RESTRICT localE  = tE .data() + tid * totalN;
-
+        auto processAtom = [&](size_t atomIndex) {
             const uint32_t begin = offsets[atomIndex];
             const uint32_t end   = offsets[atomIndex + 1];
             if (begin > end || static_cast<size_t>(end) > neighbours.size()) {
-                continue;
+                return;
             }
 
             const float posX = atoms.posX(atomIndex);
             const float posY = atoms.posY(atomIndex);
             const float posZ = atoms.posZ(atomIndex);
-            float myFx = 0.0f, myFy = 0.0f, myFz = 0.0f, myE = 0.0f;
+            float forceX         = atoms.forceX(atomIndex);
+            float forceY         = atoms.forceY(atomIndex);
+            float forceZ         = atoms.forceZ(atomIndex);
+            float potentialEnergy = atoms.energy(atomIndex);
 
             const LJForceField::LJPairRow* ljPairRow = nullptr;
             if constexpr (UseLJ) {
@@ -67,7 +43,7 @@ namespace {
                 charge = atoms.charge(atomIndex);
                 if (charge == 0.0f) {
                     if constexpr (!UseLJ) {
-                        continue;
+                        return;
                     }
                 }
             }
@@ -80,92 +56,59 @@ namespace {
                 const float d2 = dx * dx + dy * dy + dz * dz;
 
                 if constexpr (UseLJ) {
-                    if (d2 > Consts::Epsilon) {
-                        const auto& params = (*ljPairRow)[static_cast<size_t>(atoms.type(bIndex))];
-                        const float invD2  = 1.0f / d2;
-                        const float invD6  = invD2 * invD2 * invD2;
-                        const float invD12 = invD6 * invD6;
-                        const float term6  = params.potentialC6  * invD6;
-                        const float term12 = params.potentialC12 * invD12;
-                        const float fScale = (12.0f * term12 - 6.0f * term6) * invD2;
-                        const float potential = term12 - term6;
-                        const float pfx = dx * fScale;
-                        const float pfy = dy * fScale;
-                        const float pfz = dz * fScale;
-
-                        myFx -= pfx;  myFy -= pfy;  myFz -= pfz;
-                        myE  += 0.5f * potential;
-                        localFx[bIndex] += pfx;
-                        localFy[bIndex] += pfy;
-                        localFz[bIndex] += pfz;
-                        localE [bIndex] += 0.5f * potential;
-                    }
+                    ljForceField.pairInteraction(atoms, bIndex, dx, dy, dz, d2, *ljPairRow, forceX, forceY, forceZ, potentialEnergy);
                 }
                 if constexpr (UseCoulomb) {
                     if (charge != 0.0f) {
-                        const float chargeB = atoms.charge(bIndex);
-                        if (chargeB != 0.0f && d2 > Consts::Epsilon) {
-                            const float qqScale  = CoulombForceField::kCoulombEvAngstrom * charge * chargeB;
-                            const float invR     = 1.0f / std::sqrt(d2);
-                            const float fScale   = qqScale * invR / d2;
-                            const float potential = qqScale * invR;
-                            const float pfx = dx * fScale;
-                            const float pfy = dy * fScale;
-                            const float pfz = dz * fScale;
-
-                            myFx -= pfx;  myFy -= pfy;  myFz -= pfz;
-                            myE  += 0.5f * potential;
-                            localFx[bIndex] += pfx;
-                            localFy[bIndex] += pfy;
-                            localFz[bIndex] += pfz;
-                            localE [bIndex] += 0.5f * potential;
-                        }
+                        coulombForceField.pairInteraction(atoms, bIndex, dx, dy, dz, d2, charge, forceX, forceY, forceZ, potentialEnergy);
                     }
                 }
             }
 
-            localFx[atomIndex] += myFx;
-            localFy[atomIndex] += myFy;
-            localFz[atomIndex] += myFz;
-            localE [atomIndex] += myE;
-        }
-
-        // Суммируем вклады всех потоков в атомы
-        for (int t = 0; t < numThreads; ++t) {
-            const size_t base = static_cast<size_t>(t) * totalN;
-            for (size_t i = 0; i < mobileN; ++i) {
-                atoms.forceX(i) += tFx[base + i];
-                atoms.forceY(i) += tFy[base + i];
-                atoms.forceZ(i) += tFz[base + i];
-                atoms.energy(i) += tE [base + i];
-            }
-        }
+            atoms.forceX(atomIndex) = forceX;
+            atoms.forceY(atomIndex) = forceY;
+            atoms.forceZ(atomIndex) = forceZ;
+            atoms.energy(atomIndex) = potentialEnergy;
+        };
+
+#ifdef ENABLE_TBB
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, mobileN, 64),
+            [&](const tbb::blocked_range<size_t>& r) {
+                for (size_t i = r.begin(); i != r.end(); ++i) processAtom(i);
+            });
+#else
+        for (size_t i = 0; i < mobileN; ++i) processAtom(i);
+#endif
     }
 }
 
 ForceField::ForceField() = default;
 
-void ForceField::syncWalls(const SimBox& box) { wallForceField_.syncWalls(box); }
-
-void ForceField::compute(AtomStorage& atoms, Bond::List& bonds, SimBox& box, NeighborList& neighborList, bool allowBondFormation,
-                         float dt) const {
+void ForceField::compute(World& world, bool allowBondFormation, float dt) const {
     PROFILE_SCOPE("ForceField::compute");
 
-    wallForceField_.compute(atoms, static_force_);
-    computePairInteractions(atoms, neighborList);
+    AtomStorage& atoms        = world.getAtomStorage();
+    Bond::List& bonds         = world.getBonds();
+    NeighborList& neighborList = world.getNeighborList();
+
+    wallForceField_.compute(world);
+    computePairInteractions(world);
     bondForceField_.compute(atoms, bonds, neighborList, allowBondFormation, dt);
 }
 
-void ForceField::computePairInteractions(AtomStorage& atoms, NeighborList& neighborList) const {
+void ForceField::computePairInteractions(World& world) const {
     PROFILE_SCOPE("ForceField::pairInteractions");
 
-    if (enableLJ_ && enableCoulomb_) {
+    AtomStorage& atoms              = world.getAtomStorage();
+    const NeighborList& neighborList = world.getNeighborList();
+
+    if (world.isLJEnabled() && world.isCoulombEnabled()) {
         computePairInteractionsImpl<true, true>(atoms, neighborList, ljForceField_, coulombForceField_);
     }
-    else if (enableLJ_) {
+    else if (world.isLJEnabled()) {
         computePairInteractionsImpl<true, false>(atoms, neighborList, ljForceField_, coulombForceField_);
     }
-    else if (enableCoulomb_) {
+    else if (world.isCoulombEnabled()) {
         computePairInteractionsImpl<false, true>(atoms, neighborList, ljForceField_, coulombForceField_);
     }
 }
diff --git a/Engine/physics/integrators/VerletScheme.cpp b/Engine/physics/integrators/VerletScheme.cpp
index 5cba99eb..dbbc6f7a 100644
--- a/Engine/physics/integrators/VerletScheme.cpp
+++ b/Engine/physics/integrators/VerletScheme.cpp
@@ -3,6 +3,11 @@
 #include "Engine/metrics/Profiler.h"
 #include "Engine/physics/integrators/StepOps.h"
 
+#ifdef ENABLE_TBB
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#endif
+
 void VerletScheme::pipeline(StepData& stepData) const {
     PROFILE_SCOPE("VerletScheme::pipeline");
     // Расчет новых позиций
@@ -10,7 +15,7 @@ void VerletScheme::pipeline(StepData& stepData) const {
     // Расчет сил
     StepOps::computeForces(stepData);
     // Корректировка скоростей
-    correct(stepData.atomStorage, stepData.accelDamping, stepData.dt);
+    correct(stepData.world.getAtomStorage(), stepData.accelDamping, stepData.dt);
 }
 
 void VerletScheme::predict(AtomStorage& atomStorage, float dt) {
@@ -30,12 +35,22 @@ void VerletScheme::predict(AtomStorage& atomStorage, float dt) {
 
     const float* RESTRICT invMass = atomStorage.invMassData();
 
-#pragma omp parallel for schedule(static)
+#ifdef ENABLE_TBB
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, n),
+        [&](const tbb::blocked_range<size_t>& r) {
+            for (size_t i = r.begin(); i != r.end(); ++i) {
+                x[i] += (vx[i] + fx[i] * invMass[i] * 0.5f * dt) * dt;
+                y[i] += (vy[i] + fy[i] * invMass[i] * 0.5f * dt) * dt;
+                z[i] += (vz[i] + fz[i] * invMass[i] * 0.5f * dt) * dt;
+            }
+        });
+#else
     for (size_t i = 0; i < n; ++i) {
         x[i] += (vx[i] + fx[i] * invMass[i] * 0.5f * dt) * dt;
         y[i] += (vy[i] + fy[i] * invMass[i] * 0.5f * dt) * dt;
         z[i] += (vz[i] + fz[i] * invMass[i] * 0.5f * dt) * dt;
     }
+#endif
 }
 
 void VerletScheme::correct(AtomStorage& atomStorage, float accelDamping, float dt) {
@@ -56,7 +71,17 @@ void VerletScheme::correct(AtomStorage& atomStorage, float accelDamping, float d
 
     const float* RESTRICT invMass = atomStorage.invMassData();
 
-#pragma omp parallel for schedule(static)
+#ifdef ENABLE_TBB
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, n),
+        [&](const tbb::blocked_range<size_t>& r) {
+            for (size_t i = r.begin(); i != r.end(); ++i) {
+                const float halfDtInvMass = 0.5f * accelDamping * dt * invMass[i];
+                vx[i] += (pfx[i] + fx[i]) * halfDtInvMass;
+                vy[i] += (pfy[i] + fy[i]) * halfDtInvMass;
+                vz[i] += (pfz[i] + fz[i]) * halfDtInvMass;
+            }
+        });
+#else
     for (size_t i = 0; i < n; ++i) {
         const float halfDtInvMass = 0.5f * accelDamping * dt * invMass[i];
 
@@ -64,4 +89,5 @@ void VerletScheme::correct(AtomStorage& atomStorage, float accelDamping, float d
         vy[i] += (pfy[i] + fy[i]) * halfDtInvMass;
         vz[i] += (pfz[i] + fz[i]) * halfDtInvMass;
     }
+#endif
 }

From 0daf2aaa55fea2853c6407feecc593c18c13ac9d Mon Sep 17 00:00:00 2001
From: kranks-uga <krepin1001@gmail.com>
Date: Sun, 31 May 2026 16:27:20 +0300
Subject: [PATCH 3/3] perf: restore vectorization in TBB integrator lambdas

---
 Engine/physics/integrators/KDKScheme.cpp    | 10 ++--
 Engine/physics/integrators/VerletScheme.cpp | 58 ++++++++++++++++++++-
 2 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/Engine/physics/integrators/KDKScheme.cpp b/Engine/physics/integrators/KDKScheme.cpp
index ab459c01..fc02064a 100644
--- a/Engine/physics/integrators/KDKScheme.cpp
+++ b/Engine/physics/integrators/KDKScheme.cpp
@@ -29,10 +29,13 @@ void KDKScheme::halfKick(AtomStorage& atomStorage, float accelDamping, float dt)
 
     const size_t mobileCount = atomStorage.mobileCount();
 
+    const float halfDt = 0.5f * accelDamping * dt;
+#pragma GCC ivdep
     for (size_t i = 0; i < mobileCount; ++i) {
-        vx[i] += 0.5f * fx[i] * invMass[i] * accelDamping * dt;
-        vy[i] += 0.5f * fy[i] * invMass[i] * accelDamping * dt;
-        vz[i] += 0.5f * fz[i] * invMass[i] * accelDamping * dt;
+        const float halfDtInvMass = halfDt * invMass[i];
+        vx[i] += fx[i] * halfDtInvMass;
+        vy[i] += fy[i] * halfDtInvMass;
+        vz[i] += fz[i] * halfDtInvMass;
     }
 }
 
@@ -47,6 +50,7 @@ void KDKScheme::drift(AtomStorage& atomStorage, float dt) {
     const float* RESTRICT vz = atomStorage.vzData();
 
     const size_t mobileCount = atomStorage.mobileCount();
+#pragma GCC ivdep
     for (size_t i = 0; i < mobileCount; ++i) {
         x[i] += vx[i] * dt;
         y[i] += vy[i] * dt;
diff --git a/Engine/physics/integrators/VerletScheme.cpp b/Engine/physics/integrators/VerletScheme.cpp
index 2e35cf7c..7d2949df 100644
--- a/Engine/physics/integrators/VerletScheme.cpp
+++ b/Engine/physics/integrators/VerletScheme.cpp
@@ -3,6 +3,11 @@
 #include "Engine/metrics/Profiler.h"
 #include "Engine/physics/integrators/StepOps.h"
 
+#ifdef ENABLE_TBB
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#endif
+
 void VerletScheme::pipeline(StepData& stepData) const {
     PROFILE_SCOPE("VerletScheme::pipeline");
     // Расчет новых позиций
@@ -30,12 +35,36 @@ void VerletScheme::predict(AtomStorage& atomStorage, float dt) {
 
     const float* RESTRICT invMass = atomStorage.invMassData();
 
+#ifdef ENABLE_TBB
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, n),
+        [&](const tbb::blocked_range<size_t>& r) {
+            const size_t begin = r.begin();
+            float* __restrict__ lx = x + begin;
+            float* __restrict__ ly = y + begin;
+            float* __restrict__ lz = z + begin;
+            const float* __restrict__ lvx = vx + begin;
+            const float* __restrict__ lvy = vy + begin;
+            const float* __restrict__ lvz = vz + begin;
+            const float* __restrict__ lfx = fx + begin;
+            const float* __restrict__ lfy = fy + begin;
+            const float* __restrict__ lfz = fz + begin;
+            const float* __restrict__ lim = invMass + begin;
+            const size_t len = r.end() - begin;
+#pragma GCC ivdep
+            for (size_t i = 0; i < len; ++i) {
+                lx[i] += (lvx[i] + lfx[i] * lim[i] * 0.5f * dt) * dt;
+                ly[i] += (lvy[i] + lfy[i] * lim[i] * 0.5f * dt) * dt;
+                lz[i] += (lvz[i] + lfz[i] * lim[i] * 0.5f * dt) * dt;
+            }
+        });
+#else
 #pragma GCC ivdep
     for (size_t i = 0; i < n; ++i) {
         x[i] += (vx[i] + fx[i] * invMass[i] * 0.5f * dt) * dt;
         y[i] += (vy[i] + fy[i] * invMass[i] * 0.5f * dt) * dt;
         z[i] += (vz[i] + fz[i] * invMass[i] * 0.5f * dt) * dt;
     }
+#endif
 }
 
 void VerletScheme::correct(AtomStorage& atomStorage, float accelDamping, float dt) {
@@ -56,12 +85,37 @@ void VerletScheme::correct(AtomStorage& atomStorage, float accelDamping, float d
 
     const float* RESTRICT invMass = atomStorage.invMassData();
 
+    const float halfDt = 0.5f * accelDamping * dt;
+#ifdef ENABLE_TBB
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, n),
+        [&](const tbb::blocked_range<size_t>& r) {
+            const size_t begin = r.begin();
+            const float* __restrict__ lfx = fx + begin;
+            const float* __restrict__ lfy = fy + begin;
+            const float* __restrict__ lfz = fz + begin;
+            const float* __restrict__ lpfx = pfx + begin;
+            const float* __restrict__ lpfy = pfy + begin;
+            const float* __restrict__ lpfz = pfz + begin;
+            float* __restrict__ lvx = vx + begin;
+            float* __restrict__ lvy = vy + begin;
+            float* __restrict__ lvz = vz + begin;
+            const float* __restrict__ lim = invMass + begin;
+            const size_t len = r.end() - begin;
+#pragma GCC ivdep
+            for (size_t i = 0; i < len; ++i) {
+                const float halfDtInvMass = halfDt * lim[i];
+                lvx[i] += (lpfx[i] + lfx[i]) * halfDtInvMass;
+                lvy[i] += (lpfy[i] + lfy[i]) * halfDtInvMass;
+                lvz[i] += (lpfz[i] + lfz[i]) * halfDtInvMass;
+            }
+        });
+#else
 #pragma GCC ivdep
     for (size_t i = 0; i < n; ++i) {
-        const float halfDtInvMass = 0.5f * accelDamping * dt * invMass[i];
-
+        const float halfDtInvMass = halfDt * invMass[i];
         vx[i] += (pfx[i] + fx[i]) * halfDtInvMass;
         vy[i] += (pfy[i] + fy[i]) * halfDtInvMass;
         vz[i] += (pfz[i] + fz[i]) * halfDtInvMass;
     }
+#endif
 }