From 65edde95533c18b12d623fcfde7fffb2992913d0 Mon Sep 17 00:00:00 2001 From: kranks-uga Date: Mon, 25 May 2026 18:08:59 +0300 Subject: [PATCH 1/3] =?UTF-8?q?perf:=20=D0=BF=D0=B0=D1=80=D0=B0=D0=BB?= =?UTF-8?q?=D0=BB=D0=B5=D0=BB=D1=8C=D0=BD=D1=8B=D0=B9=20=D1=80=D0=B0=D1=81?= =?UTF-8?q?=D1=87=D1=91=D1=82=20=D1=84=D0=B8=D0=B7=D0=B8=D0=BA=D0=B8=20?= =?UTF-8?q?=D1=87=D0=B5=D1=80=D0=B5=D0=B7=20OpenMP?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 11 +++ Engine/NeighborSearch/NeighborList.cpp | 22 +++-- Engine/physics/ForceField.cpp | 104 +++++++++++++++++--- Engine/physics/integrators/VerletScheme.cpp | 4 +- 4 files changed, 120 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c680d83..6e3c2ad8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,7 @@ set(APP_VERSION_STRING "${PROJECT_VERSION}") option(OPTIMIZE_FOR_NATIVE "Enable native CPU optimizations outside Debug builds" OFF) option(BUILD_BENCHMARKS "Build benchmarks" OFF) option(ENABLE_IPO "Enable link-time optimization for non-Debug builds when supported" ON) +option(ENABLE_OPENMP "Enable OpenMP multithreading for physics simulation" ON) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -65,6 +66,16 @@ add_dependencies(latticelab_lib embedded_resources) include(shaders) add_dependencies(latticelab_lib wgsl_shaders) +if(ENABLE_OPENMP) + find_package(OpenMP) + if(OpenMP_CXX_FOUND) + target_link_libraries(latticelab_lib PUBLIC OpenMP::OpenMP_CXX) + message(STATUS "OpenMP: enabled (${OpenMP_CXX_VERSION})") + else() + message(WARNING "OpenMP not found — building without multithreading") + endif() +endif() + target_compile_options(latticelab_lib PRIVATE $<$,$>,$>:-march=native> ) diff --git a/Engine/NeighborSearch/NeighborList.cpp b/Engine/NeighborSearch/NeighborList.cpp index e4b13a26..dbaac4af 100644 --- a/Engine/NeighborSearch/NeighborList.cpp +++ b/Engine/NeighborSearch/NeighborList.cpp @@ -69,14 +69,24 @@ void NeighborList::build(const AtomStorage& atoms, SimBox& box) { reserveListBuffers(atoms); + // Фаза 1: каждый поток строит список соседей своих атомов независимо + std::vector> perAtom(atomCount); +#pragma omp parallel for schedule(dynamic, 64) + for (uint32_t i = 0; i < atomCount; ++i) { + writeAtomNeighbors(grid, x, y, z, i, x[i], y[i], z[i], perAtom[i]); + } + + // Фаза 2: вычисляем смещения (prefix sum) — последовательно offsets_[0] = 0; for (uint32_t i = 0; i < atomCount; ++i) { - const float xi = x[i]; - const float yi = y[i]; - const float zi = z[i]; - // запись всех соседей атома в массив - writeAtomNeighbors(grid, x, y, z, i, xi, yi, zi, neighbors_); - offsets_[i + 1] = neighbors_.size(); + offsets_[i + 1] = offsets_[i] + static_cast(perAtom[i].size()); + } + neighbors_.resize(offsets_[atomCount]); + + // Фаза 3: копируем в плоский массив — снова параллельно +#pragma omp parallel for schedule(static) + for (uint32_t i = 0; i < atomCount; ++i) { + std::copy(perAtom[i].begin(), perAtom[i].end(), neighbors_.begin() + offsets_[i]); } std::copy(x, x + atoms.mobileCount(), refPosX_.data()); diff --git a/Engine/physics/ForceField.cpp b/Engine/physics/ForceField.cpp index b2316cc0..d452afe0 100644 --- a/Engine/physics/ForceField.cpp +++ b/Engine/physics/ForceField.cpp @@ -1,18 +1,53 @@ #include "ForceField.h" +#include +#include + +#include "Engine/Consts.h" #include "Engine/NeighborSearch/NeighborList.h" #include "Engine/metrics/Profiler.h" +#include "Engine/physics/ForceFields/CoulombForceField.h" +#include "Engine/restrict.h" + +#ifdef _OPENMP +#include +#endif namespace { template void computePairInteractionsImpl(AtomStorage& atoms, NeighborList& neighborList, const LJForceField& ljForceField, const CoulombForceField& coulombForceField) { - const auto& offsets = neighborList.offsets(); + const size_t totalN = atoms.size(); + const size_t mobileN = atoms.mobileCount(); + const auto& offsets = neighborList.offsets(); const auto& neighbours = neighborList.neighbors(); - for (size_t atomIndex = 0; atomIndex < atoms.mobileCount(); ++atomIndex) { + int numThreads = 1; +#ifdef _OPENMP + numThreads = omp_get_max_threads(); +#endif + + // Каждый поток накапливает силы в свой срез массива (thread t, atom i → base + i). + // Это исключает гонки данных при использовании третьего закона Ньютона. + std::vector tFx(numThreads * totalN, 0.0f); + std::vector tFy(numThreads * totalN, 0.0f); + std::vector tFz(numThreads * totalN, 0.0f); + std::vector tE (numThreads * totalN, 0.0f); + +#pragma omp parallel for schedule(dynamic, 64) + for (size_t atomIndex = 0; atomIndex < mobileN; ++atomIndex) { +#ifdef _OPENMP + const int tid = omp_get_thread_num(); +#else + const int tid = 0; +#endif + float* RESTRICT localFx = tFx.data() + tid * totalN; + float* RESTRICT localFy = tFy.data() + tid * totalN; + float* RESTRICT localFz = tFz.data() + tid * totalN; + float* RESTRICT localE = tE .data() + tid * totalN; + const uint32_t begin = offsets[atomIndex]; - const uint32_t end = offsets[atomIndex + 1]; + const uint32_t end = offsets[atomIndex + 1]; if (begin > end || static_cast(end) > neighbours.size()) { continue; } @@ -20,10 +55,7 @@ namespace { const float posX = atoms.posX(atomIndex); const float posY = atoms.posY(atomIndex); const float posZ = atoms.posZ(atomIndex); - float forceX = atoms.forceX(atomIndex); - float forceY = atoms.forceY(atomIndex); - float forceZ = atoms.forceZ(atomIndex); - float potentialEnergy = atoms.energy(atomIndex); + float myFx = 0.0f, myFy = 0.0f, myFz = 0.0f, myE = 0.0f; const LJForceField::LJPairRow* ljPairRow = nullptr; if constexpr (UseLJ) { @@ -48,19 +80,65 @@ namespace { const float d2 = dx * dx + dy * dy + dz * dz; if constexpr (UseLJ) { - ljForceField.pairInteraction(atoms, bIndex, dx, dy, dz, d2, *ljPairRow, forceX, forceY, forceZ, potentialEnergy); + if (d2 > Consts::Epsilon) { + const auto& params = (*ljPairRow)[static_cast(atoms.type(bIndex))]; + const float invD2 = 1.0f / d2; + const float invD6 = invD2 * invD2 * invD2; + const float invD12 = invD6 * invD6; + const float term6 = params.potentialC6 * invD6; + const float term12 = params.potentialC12 * invD12; + const float fScale = (12.0f * term12 - 6.0f * term6) * invD2; + const float potential = term12 - term6; + const float pfx = dx * fScale; + const float pfy = dy * fScale; + const float pfz = dz * fScale; + + myFx -= pfx; myFy -= pfy; myFz -= pfz; + myE += 0.5f * potential; + localFx[bIndex] += pfx; + localFy[bIndex] += pfy; + localFz[bIndex] += pfz; + localE [bIndex] += 0.5f * potential; + } } if constexpr (UseCoulomb) { if (charge != 0.0f) { - coulombForceField.pairInteraction(atoms, bIndex, dx, dy, dz, d2, charge, forceX, forceY, forceZ, potentialEnergy); + const float chargeB = atoms.charge(bIndex); + if (chargeB != 0.0f && d2 > Consts::Epsilon) { + const float qqScale = CoulombForceField::kCoulombEvAngstrom * charge * chargeB; + const float invR = 1.0f / std::sqrt(d2); + const float fScale = qqScale * invR / d2; + const float potential = qqScale * invR; + const float pfx = dx * fScale; + const float pfy = dy * fScale; + const float pfz = dz * fScale; + + myFx -= pfx; myFy -= pfy; myFz -= pfz; + myE += 0.5f * potential; + localFx[bIndex] += pfx; + localFy[bIndex] += pfy; + localFz[bIndex] += pfz; + localE [bIndex] += 0.5f * potential; + } } } } - atoms.forceX(atomIndex) = forceX; - atoms.forceY(atomIndex) = forceY; - atoms.forceZ(atomIndex) = forceZ; - atoms.energy(atomIndex) = potentialEnergy; + localFx[atomIndex] += myFx; + localFy[atomIndex] += myFy; + localFz[atomIndex] += myFz; + localE [atomIndex] += myE; + } + + // Суммируем вклады всех потоков в атомы + for (int t = 0; t < numThreads; ++t) { + const size_t base = static_cast(t) * totalN; + for (size_t i = 0; i < mobileN; ++i) { + atoms.forceX(i) += tFx[base + i]; + atoms.forceY(i) += tFy[base + i]; + atoms.forceZ(i) += tFz[base + i]; + atoms.energy(i) += tE [base + i]; + } } } } diff --git a/Engine/physics/integrators/VerletScheme.cpp b/Engine/physics/integrators/VerletScheme.cpp index ef80c977..5cba99eb 100644 --- a/Engine/physics/integrators/VerletScheme.cpp +++ b/Engine/physics/integrators/VerletScheme.cpp @@ -30,7 +30,7 @@ void VerletScheme::predict(AtomStorage& atomStorage, float dt) { const float* RESTRICT invMass = atomStorage.invMassData(); -#pragma GCC ivdep +#pragma omp parallel for schedule(static) for (size_t i = 0; i < n; ++i) { x[i] += (vx[i] + fx[i] * invMass[i] * 0.5f * dt) * dt; y[i] += (vy[i] + fy[i] * invMass[i] * 0.5f * dt) * dt; @@ -56,7 +56,7 @@ void VerletScheme::correct(AtomStorage& atomStorage, float accelDamping, float d const float* RESTRICT invMass = atomStorage.invMassData(); -#pragma GCC ivdep +#pragma omp parallel for schedule(static) for (size_t i = 0; i < n; ++i) { const float halfDtInvMass = 0.5f * accelDamping * dt * invMass[i]; From 4df58ea7bac87158b34484fe44401fd900f7272b Mon Sep 17 00:00:00 2001 From: kranks-uga Date: Thu, 28 May 2026 13:34:00 +0300 Subject: [PATCH 2/3] perf: replace OpenMP with TBB, adapt to World architecture --- CMakeLists.txt | 37 +++-- Engine/NeighborSearch/NeighborList.cpp | 35 +++-- Engine/physics/ForceField.cpp | 145 ++++++-------------- Engine/physics/integrators/VerletScheme.cpp | 32 ++++- 4 files changed, 128 insertions(+), 121 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e3c2ad8..f881972a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ set(APP_VERSION_STRING "${PROJECT_VERSION}") option(OPTIMIZE_FOR_NATIVE "Enable native CPU optimizations outside Debug builds" OFF) option(BUILD_BENCHMARKS "Build benchmarks" OFF) option(ENABLE_IPO "Enable link-time optimization for non-Debug builds when supported" ON) -option(ENABLE_OPENMP "Enable OpenMP multithreading for physics simulation" ON) +option(ENABLE_TBB "Enable Intel oneAPI TBB multithreading for physics simulation" ON) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -66,14 +66,14 @@ add_dependencies(latticelab_lib embedded_resources) include(shaders) add_dependencies(latticelab_lib wgsl_shaders) -if(ENABLE_OPENMP) - find_package(OpenMP) - if(OpenMP_CXX_FOUND) - target_link_libraries(latticelab_lib PUBLIC OpenMP::OpenMP_CXX) - message(STATUS "OpenMP: enabled (${OpenMP_CXX_VERSION})") - else() - message(WARNING "OpenMP not found — building without multithreading") - endif() +include(embedded_translate) +add_dependencies(latticelab_lib embedded_translate) + +if(ENABLE_TBB) + find_package(TBB REQUIRED) + target_link_libraries(latticelab_lib PUBLIC TBB::tbb) + target_compile_definitions(latticelab_lib PUBLIC ENABLE_TBB) + message(STATUS "TBB: enabled") endif() target_compile_options(latticelab_lib PRIVATE @@ -94,6 +94,25 @@ target_include_directories(latticelab_lib PUBLIC ${webgpu_cpp_SOURCE_DIR}/wgpu-native ) +if(APPLE) + find_library(COCOA_FRAMEWORK Cocoa) + find_library(METAL_FRAMEWORK Metal) + find_library(QUARTZCORE_FRAMEWORK QuartzCore) + + target_sources(latticelab_lib PRIVATE + Rendering/MetalBackend.mm + ) + + target_link_libraries(latticelab_lib PUBLIC + "-framework Cocoa" + "-framework QuartzCore" + "-framework Metal" + ${COCOA_FRAMEWORK} + ${METAL_FRAMEWORK} + ${QUARTZCORE_FRAMEWORK} + ) +endif() + target_link_libraries(latticelab_lib PUBLIC glfw imgui ImGuiFileDialog_lib diff --git a/Engine/NeighborSearch/NeighborList.cpp b/Engine/NeighborSearch/NeighborList.cpp index dbaac4af..8f0b396b 100644 --- a/Engine/NeighborSearch/NeighborList.cpp +++ b/Engine/NeighborSearch/NeighborList.cpp @@ -6,11 +6,16 @@ #include #include "Engine/NeighborSearch/SpatialGrid.h" -#include "Engine/SimBox.h" +#include "Engine/World.h" #include "Engine/metrics/Profiler.h" #include "Engine/physics/AtomStorage.h" #include "Engine/restrict.h" +#ifdef ENABLE_TBB +#include +#include +#endif + void NeighborList::setCutoff(float cutoff) { cutoff_ = cutoff; listRadius_ = cutoff_ + skin_; @@ -48,20 +53,20 @@ void NeighborList::clear() { resetStats(); } -void NeighborList::rebuildPipeline(const AtomStorage& atoms, SimBox& box, int simStep) { +void NeighborList::rebuildPipeline(const AtomStorage& atoms, World& world, int simStep) { // перестройка пространственной сетки - box.grid.rebuild(atoms.xDataSpan(), atoms.yDataSpan(), atoms.zDataSpan()); + world.getGrid().rebuild(atoms.xDataSpan(), atoms.yDataSpan(), atoms.zDataSpan()); // перестройка списка соседей - build(atoms, box); + build(atoms, world); // обновление метрик const float rebuildTimeMs = static_cast(Profiler::instance().lastMs("NeighborList::build")); stats_.recordRebuild(simStep, rebuildTimeMs); } -void NeighborList::build(const AtomStorage& atoms, SimBox& box) { +void NeighborList::build(const AtomStorage& atoms, World& box) { PROFILE_SCOPE("NeighborList::build"); - const SpatialGrid& grid = box.grid; + const SpatialGrid& grid = box.getGrid(); const uint32_t atomCount = static_cast(atoms.size()); const float* RESTRICT x = atoms.xData(); const float* RESTRICT y = atoms.yData(); @@ -71,10 +76,17 @@ void NeighborList::build(const AtomStorage& atoms, SimBox& box) { // Фаза 1: каждый поток строит список соседей своих атомов независимо std::vector> perAtom(atomCount); -#pragma omp parallel for schedule(dynamic, 64) +#ifdef ENABLE_TBB + tbb::parallel_for(tbb::blocked_range(0, atomCount, 64), + [&](const tbb::blocked_range& r) { + for (uint32_t i = r.begin(); i != r.end(); ++i) + writeAtomNeighbors(grid, x, y, z, i, x[i], y[i], z[i], perAtom[i]); + }); +#else for (uint32_t i = 0; i < atomCount; ++i) { writeAtomNeighbors(grid, x, y, z, i, x[i], y[i], z[i], perAtom[i]); } +#endif // Фаза 2: вычисляем смещения (prefix sum) — последовательно offsets_[0] = 0; @@ -84,10 +96,17 @@ void NeighborList::build(const AtomStorage& atoms, SimBox& box) { neighbors_.resize(offsets_[atomCount]); // Фаза 3: копируем в плоский массив — снова параллельно -#pragma omp parallel for schedule(static) +#ifdef ENABLE_TBB + tbb::parallel_for(tbb::blocked_range(0, atomCount), + [&](const tbb::blocked_range& r) { + for (uint32_t i = r.begin(); i != r.end(); ++i) + std::copy(perAtom[i].begin(), perAtom[i].end(), neighbors_.begin() + offsets_[i]); + }); +#else for (uint32_t i = 0; i < atomCount; ++i) { std::copy(perAtom[i].begin(), perAtom[i].end(), neighbors_.begin() + offsets_[i]); } +#endif std::copy(x, x + atoms.mobileCount(), refPosX_.data()); std::copy(y, y + atoms.mobileCount(), refPosY_.data()); diff --git a/Engine/physics/ForceField.cpp b/Engine/physics/ForceField.cpp index d452afe0..4669b361 100644 --- a/Engine/physics/ForceField.cpp +++ b/Engine/physics/ForceField.cpp @@ -1,61 +1,37 @@ #include "ForceField.h" -#include -#include - -#include "Engine/Consts.h" #include "Engine/NeighborSearch/NeighborList.h" +#include "Engine/World.h" #include "Engine/metrics/Profiler.h" -#include "Engine/physics/ForceFields/CoulombForceField.h" -#include "Engine/restrict.h" +#include "Engine/physics/AtomStorage.h" -#ifdef _OPENMP -#include +#ifdef ENABLE_TBB +#include +#include #endif namespace { template - void computePairInteractionsImpl(AtomStorage& atoms, NeighborList& neighborList, const LJForceField& ljForceField, + void computePairInteractionsImpl(AtomStorage& atoms, const NeighborList& neighborList, const LJForceField& ljForceField, const CoulombForceField& coulombForceField) { - const size_t totalN = atoms.size(); - const size_t mobileN = atoms.mobileCount(); const auto& offsets = neighborList.offsets(); const auto& neighbours = neighborList.neighbors(); + const size_t mobileN = atoms.mobileCount(); - int numThreads = 1; -#ifdef _OPENMP - numThreads = omp_get_max_threads(); -#endif - - // Каждый поток накапливает силы в свой срез массива (thread t, atom i → base + i). - // Это исключает гонки данных при использовании третьего закона Ньютона. - std::vector tFx(numThreads * totalN, 0.0f); - std::vector tFy(numThreads * totalN, 0.0f); - std::vector tFz(numThreads * totalN, 0.0f); - std::vector tE (numThreads * totalN, 0.0f); - -#pragma omp parallel for schedule(dynamic, 64) - for (size_t atomIndex = 0; atomIndex < mobileN; ++atomIndex) { -#ifdef _OPENMP - const int tid = omp_get_thread_num(); -#else - const int tid = 0; -#endif - float* RESTRICT localFx = tFx.data() + tid * totalN; - float* RESTRICT localFy = tFy.data() + tid * totalN; - float* RESTRICT localFz = tFz.data() + tid * totalN; - float* RESTRICT localE = tE .data() + tid * totalN; - + auto processAtom = [&](size_t atomIndex) { const uint32_t begin = offsets[atomIndex]; const uint32_t end = offsets[atomIndex + 1]; if (begin > end || static_cast(end) > neighbours.size()) { - continue; + return; } const float posX = atoms.posX(atomIndex); const float posY = atoms.posY(atomIndex); const float posZ = atoms.posZ(atomIndex); - float myFx = 0.0f, myFy = 0.0f, myFz = 0.0f, myE = 0.0f; + float forceX = atoms.forceX(atomIndex); + float forceY = atoms.forceY(atomIndex); + float forceZ = atoms.forceZ(atomIndex); + float potentialEnergy = atoms.energy(atomIndex); const LJForceField::LJPairRow* ljPairRow = nullptr; if constexpr (UseLJ) { @@ -67,7 +43,7 @@ namespace { charge = atoms.charge(atomIndex); if (charge == 0.0f) { if constexpr (!UseLJ) { - continue; + return; } } } @@ -80,92 +56,59 @@ namespace { const float d2 = dx * dx + dy * dy + dz * dz; if constexpr (UseLJ) { - if (d2 > Consts::Epsilon) { - const auto& params = (*ljPairRow)[static_cast(atoms.type(bIndex))]; - const float invD2 = 1.0f / d2; - const float invD6 = invD2 * invD2 * invD2; - const float invD12 = invD6 * invD6; - const float term6 = params.potentialC6 * invD6; - const float term12 = params.potentialC12 * invD12; - const float fScale = (12.0f * term12 - 6.0f * term6) * invD2; - const float potential = term12 - term6; - const float pfx = dx * fScale; - const float pfy = dy * fScale; - const float pfz = dz * fScale; - - myFx -= pfx; myFy -= pfy; myFz -= pfz; - myE += 0.5f * potential; - localFx[bIndex] += pfx; - localFy[bIndex] += pfy; - localFz[bIndex] += pfz; - localE [bIndex] += 0.5f * potential; - } + ljForceField.pairInteraction(atoms, bIndex, dx, dy, dz, d2, *ljPairRow, forceX, forceY, forceZ, potentialEnergy); } if constexpr (UseCoulomb) { if (charge != 0.0f) { - const float chargeB = atoms.charge(bIndex); - if (chargeB != 0.0f && d2 > Consts::Epsilon) { - const float qqScale = CoulombForceField::kCoulombEvAngstrom * charge * chargeB; - const float invR = 1.0f / std::sqrt(d2); - const float fScale = qqScale * invR / d2; - const float potential = qqScale * invR; - const float pfx = dx * fScale; - const float pfy = dy * fScale; - const float pfz = dz * fScale; - - myFx -= pfx; myFy -= pfy; myFz -= pfz; - myE += 0.5f * potential; - localFx[bIndex] += pfx; - localFy[bIndex] += pfy; - localFz[bIndex] += pfz; - localE [bIndex] += 0.5f * potential; - } + coulombForceField.pairInteraction(atoms, bIndex, dx, dy, dz, d2, charge, forceX, forceY, forceZ, potentialEnergy); } } } - localFx[atomIndex] += myFx; - localFy[atomIndex] += myFy; - localFz[atomIndex] += myFz; - localE [atomIndex] += myE; - } - - // Суммируем вклады всех потоков в атомы - for (int t = 0; t < numThreads; ++t) { - const size_t base = static_cast(t) * totalN; - for (size_t i = 0; i < mobileN; ++i) { - atoms.forceX(i) += tFx[base + i]; - atoms.forceY(i) += tFy[base + i]; - atoms.forceZ(i) += tFz[base + i]; - atoms.energy(i) += tE [base + i]; - } - } + atoms.forceX(atomIndex) = forceX; + atoms.forceY(atomIndex) = forceY; + atoms.forceZ(atomIndex) = forceZ; + atoms.energy(atomIndex) = potentialEnergy; + }; + +#ifdef ENABLE_TBB + tbb::parallel_for(tbb::blocked_range(0, mobileN, 64), + [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i != r.end(); ++i) processAtom(i); + }); +#else + for (size_t i = 0; i < mobileN; ++i) processAtom(i); +#endif } } ForceField::ForceField() = default; -void ForceField::syncWalls(const SimBox& box) { wallForceField_.syncWalls(box); } - -void ForceField::compute(AtomStorage& atoms, Bond::List& bonds, SimBox& box, NeighborList& neighborList, bool allowBondFormation, - float dt) const { +void ForceField::compute(World& world, bool allowBondFormation, float dt) const { PROFILE_SCOPE("ForceField::compute"); - wallForceField_.compute(atoms, static_force_); - computePairInteractions(atoms, neighborList); + AtomStorage& atoms = world.getAtomStorage(); + Bond::List& bonds = world.getBonds(); + NeighborList& neighborList = world.getNeighborList(); + + wallForceField_.compute(world); + computePairInteractions(world); bondForceField_.compute(atoms, bonds, neighborList, allowBondFormation, dt); } -void ForceField::computePairInteractions(AtomStorage& atoms, NeighborList& neighborList) const { +void ForceField::computePairInteractions(World& world) const { PROFILE_SCOPE("ForceField::pairInteractions"); - if (enableLJ_ && enableCoulomb_) { + AtomStorage& atoms = world.getAtomStorage(); + const NeighborList& neighborList = world.getNeighborList(); + + if (world.isLJEnabled() && world.isCoulombEnabled()) { computePairInteractionsImpl(atoms, neighborList, ljForceField_, coulombForceField_); } - else if (enableLJ_) { + else if (world.isLJEnabled()) { computePairInteractionsImpl(atoms, neighborList, ljForceField_, coulombForceField_); } - else if (enableCoulomb_) { + else if (world.isCoulombEnabled()) { computePairInteractionsImpl(atoms, neighborList, ljForceField_, coulombForceField_); } } diff --git a/Engine/physics/integrators/VerletScheme.cpp b/Engine/physics/integrators/VerletScheme.cpp index 5cba99eb..dbbc6f7a 100644 --- a/Engine/physics/integrators/VerletScheme.cpp +++ b/Engine/physics/integrators/VerletScheme.cpp @@ -3,6 +3,11 @@ #include "Engine/metrics/Profiler.h" #include "Engine/physics/integrators/StepOps.h" +#ifdef ENABLE_TBB +#include +#include +#endif + void VerletScheme::pipeline(StepData& stepData) const { PROFILE_SCOPE("VerletScheme::pipeline"); // Расчет новых позиций @@ -10,7 +15,7 @@ void VerletScheme::pipeline(StepData& stepData) const { // Расчет сил StepOps::computeForces(stepData); // Корректировка скоростей - correct(stepData.atomStorage, stepData.accelDamping, stepData.dt); + correct(stepData.world.getAtomStorage(), stepData.accelDamping, stepData.dt); } void VerletScheme::predict(AtomStorage& atomStorage, float dt) { @@ -30,12 +35,22 @@ void VerletScheme::predict(AtomStorage& atomStorage, float dt) { const float* RESTRICT invMass = atomStorage.invMassData(); -#pragma omp parallel for schedule(static) +#ifdef ENABLE_TBB + tbb::parallel_for(tbb::blocked_range(0, n), + [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i != r.end(); ++i) { + x[i] += (vx[i] + fx[i] * invMass[i] * 0.5f * dt) * dt; + y[i] += (vy[i] + fy[i] * invMass[i] * 0.5f * dt) * dt; + z[i] += (vz[i] + fz[i] * invMass[i] * 0.5f * dt) * dt; + } + }); +#else for (size_t i = 0; i < n; ++i) { x[i] += (vx[i] + fx[i] * invMass[i] * 0.5f * dt) * dt; y[i] += (vy[i] + fy[i] * invMass[i] * 0.5f * dt) * dt; z[i] += (vz[i] + fz[i] * invMass[i] * 0.5f * dt) * dt; } +#endif } void VerletScheme::correct(AtomStorage& atomStorage, float accelDamping, float dt) { @@ -56,7 +71,17 @@ void VerletScheme::correct(AtomStorage& atomStorage, float accelDamping, float d const float* RESTRICT invMass = atomStorage.invMassData(); -#pragma omp parallel for schedule(static) +#ifdef ENABLE_TBB + tbb::parallel_for(tbb::blocked_range(0, n), + [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i != r.end(); ++i) { + const float halfDtInvMass = 0.5f * accelDamping * dt * invMass[i]; + vx[i] += (pfx[i] + fx[i]) * halfDtInvMass; + vy[i] += (pfy[i] + fy[i]) * halfDtInvMass; + vz[i] += (pfz[i] + fz[i]) * halfDtInvMass; + } + }); +#else for (size_t i = 0; i < n; ++i) { const float halfDtInvMass = 0.5f * accelDamping * dt * invMass[i]; @@ -64,4 +89,5 @@ void VerletScheme::correct(AtomStorage& atomStorage, float accelDamping, float d vy[i] += (pfy[i] + fy[i]) * halfDtInvMass; vz[i] += (pfz[i] + fz[i]) * halfDtInvMass; } +#endif } From 0daf2aaa55fea2853c6407feecc593c18c13ac9d Mon Sep 17 00:00:00 2001 From: kranks-uga Date: Sun, 31 May 2026 16:27:20 +0300 Subject: [PATCH 3/3] perf: restore vectorization in TBB integrator lambdas --- Engine/physics/integrators/KDKScheme.cpp | 10 ++-- Engine/physics/integrators/VerletScheme.cpp | 58 ++++++++++++++++++++- 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/Engine/physics/integrators/KDKScheme.cpp b/Engine/physics/integrators/KDKScheme.cpp index ab459c01..fc02064a 100644 --- a/Engine/physics/integrators/KDKScheme.cpp +++ b/Engine/physics/integrators/KDKScheme.cpp @@ -29,10 +29,13 @@ void KDKScheme::halfKick(AtomStorage& atomStorage, float accelDamping, float dt) const size_t mobileCount = atomStorage.mobileCount(); + const float halfDt = 0.5f * accelDamping * dt; +#pragma GCC ivdep for (size_t i = 0; i < mobileCount; ++i) { - vx[i] += 0.5f * fx[i] * invMass[i] * accelDamping * dt; - vy[i] += 0.5f * fy[i] * invMass[i] * accelDamping * dt; - vz[i] += 0.5f * fz[i] * invMass[i] * accelDamping * dt; + const float halfDtInvMass = halfDt * invMass[i]; + vx[i] += fx[i] * halfDtInvMass; + vy[i] += fy[i] * halfDtInvMass; + vz[i] += fz[i] * halfDtInvMass; } } @@ -47,6 +50,7 @@ void KDKScheme::drift(AtomStorage& atomStorage, float dt) { const float* RESTRICT vz = atomStorage.vzData(); const size_t mobileCount = atomStorage.mobileCount(); +#pragma GCC ivdep for (size_t i = 0; i < mobileCount; ++i) { x[i] += vx[i] * dt; y[i] += vy[i] * dt; diff --git a/Engine/physics/integrators/VerletScheme.cpp b/Engine/physics/integrators/VerletScheme.cpp index 2e35cf7c..7d2949df 100644 --- a/Engine/physics/integrators/VerletScheme.cpp +++ b/Engine/physics/integrators/VerletScheme.cpp @@ -3,6 +3,11 @@ #include "Engine/metrics/Profiler.h" #include "Engine/physics/integrators/StepOps.h" +#ifdef ENABLE_TBB +#include +#include +#endif + void VerletScheme::pipeline(StepData& stepData) const { PROFILE_SCOPE("VerletScheme::pipeline"); // Расчет новых позиций @@ -30,12 +35,36 @@ void VerletScheme::predict(AtomStorage& atomStorage, float dt) { const float* RESTRICT invMass = atomStorage.invMassData(); +#ifdef ENABLE_TBB + tbb::parallel_for(tbb::blocked_range(0, n), + [&](const tbb::blocked_range& r) { + const size_t begin = r.begin(); + float* __restrict__ lx = x + begin; + float* __restrict__ ly = y + begin; + float* __restrict__ lz = z + begin; + const float* __restrict__ lvx = vx + begin; + const float* __restrict__ lvy = vy + begin; + const float* __restrict__ lvz = vz + begin; + const float* __restrict__ lfx = fx + begin; + const float* __restrict__ lfy = fy + begin; + const float* __restrict__ lfz = fz + begin; + const float* __restrict__ lim = invMass + begin; + const size_t len = r.end() - begin; +#pragma GCC ivdep + for (size_t i = 0; i < len; ++i) { + lx[i] += (lvx[i] + lfx[i] * lim[i] * 0.5f * dt) * dt; + ly[i] += (lvy[i] + lfy[i] * lim[i] * 0.5f * dt) * dt; + lz[i] += (lvz[i] + lfz[i] * lim[i] * 0.5f * dt) * dt; + } + }); +#else #pragma GCC ivdep for (size_t i = 0; i < n; ++i) { x[i] += (vx[i] + fx[i] * invMass[i] * 0.5f * dt) * dt; y[i] += (vy[i] + fy[i] * invMass[i] * 0.5f * dt) * dt; z[i] += (vz[i] + fz[i] * invMass[i] * 0.5f * dt) * dt; } +#endif } void VerletScheme::correct(AtomStorage& atomStorage, float accelDamping, float dt) { @@ -56,12 +85,37 @@ void VerletScheme::correct(AtomStorage& atomStorage, float accelDamping, float d const float* RESTRICT invMass = atomStorage.invMassData(); + const float halfDt = 0.5f * accelDamping * dt; +#ifdef ENABLE_TBB + tbb::parallel_for(tbb::blocked_range(0, n), + [&](const tbb::blocked_range& r) { + const size_t begin = r.begin(); + const float* __restrict__ lfx = fx + begin; + const float* __restrict__ lfy = fy + begin; + const float* __restrict__ lfz = fz + begin; + const float* __restrict__ lpfx = pfx + begin; + const float* __restrict__ lpfy = pfy + begin; + const float* __restrict__ lpfz = pfz + begin; + float* __restrict__ lvx = vx + begin; + float* __restrict__ lvy = vy + begin; + float* __restrict__ lvz = vz + begin; + const float* __restrict__ lim = invMass + begin; + const size_t len = r.end() - begin; +#pragma GCC ivdep + for (size_t i = 0; i < len; ++i) { + const float halfDtInvMass = halfDt * lim[i]; + lvx[i] += (lpfx[i] + lfx[i]) * halfDtInvMass; + lvy[i] += (lpfy[i] + lfy[i]) * halfDtInvMass; + lvz[i] += (lpfz[i] + lfz[i]) * halfDtInvMass; + } + }); +#else #pragma GCC ivdep for (size_t i = 0; i < n; ++i) { - const float halfDtInvMass = 0.5f * accelDamping * dt * invMass[i]; - + const float halfDtInvMass = halfDt * invMass[i]; vx[i] += (pfx[i] + fx[i]) * halfDtInvMass; vy[i] += (pfy[i] + fy[i]) * halfDtInvMass; vz[i] += (pfz[i] + fz[i]) * halfDtInvMass; } +#endif }