From a510ff64fb0ba7cad579317a105346cadad1f862 Mon Sep 17 00:00:00 2001
From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com>
Date: Wed, 29 Oct 2025 18:16:30 +0100
Subject: [PATCH 1/2] Improve TLSPH performance on GPUs

---
 src/general/abstract_system.jl                | 24 ++++---
 .../fluid/weakly_compressible_sph/system.jl   | 12 ++--
 .../total_lagrangian_sph/penalty_force.jl     | 17 ++---
 .../structure/total_lagrangian_sph/rhs.jl     | 27 ++++----
 .../structure/total_lagrangian_sph/system.jl  | 62 ++++++++++---------
 .../structure/total_lagrangian_sph/rhs.jl     | 15 +++--
 test/systems/tlsph_system.jl                  |  5 +-
 7 files changed, 91 insertions(+), 71 deletions(-)

diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl
index 1d0bc2813f..91fb139470 100644
--- a/src/general/abstract_system.jl
+++ b/src/general/abstract_system.jl
@@ -53,26 +53,30 @@ end
 initialize!(system, semi) = system
 
 # This should not be dispatched by system type. We always expect to get a column of `A`.
-@propagate_inbounds function extract_svector(A, system, i)
-    extract_svector(A, Val(ndims(system)), i)
+@propagate_inbounds function extract_svector(A, system, i...)
+    extract_svector(A, Val(ndims(system)), i...)
 end
 
 # Return the `i`-th column of the array `A` as an `SVector`.
-@inline function extract_svector(A, ::Val{NDIMS}, i) where {NDIMS}
+@inline function extract_svector(A, ::Val{NDIMS}, i...) where {NDIMS}
     # Explicit bounds check, which can be removed by calling this function with `@inbounds`
-    @boundscheck checkbounds(A, NDIMS, i)
+    @boundscheck checkbounds(A, NDIMS, i...)
 
     # Assume inbounds access now
-    return SVector(ntuple(@inline(dim->@inbounds A[dim, i]), NDIMS))
+    return SVector(ntuple(@inline(dim->@inbounds A[dim, i...]), NDIMS))
 end
 
 # Return `A[:, :, i]` as an `SMatrix`.
 @inline function extract_smatrix(A, system, particle)
+    @boundscheck checkbounds(A, ndims(system), ndims(system), particle)
+
     # Extract the matrix elements for this particle as a tuple to pass to SMatrix
     return SMatrix{ndims(system),
-                   ndims(system)}(ntuple(@inline(i->A[mod(i - 1, ndims(system)) + 1,
-                                                      div(i - 1, ndims(system)) + 1,
-                                                      particle]),
+                   ndims(system)}(ntuple(@inline(i->@inbounds A[mod(i - 1,
+                                                                    ndims(system)) + 1,
+                                                                div(i - 1,
+                                                                    ndims(system)) + 1,
+                                                                particle]),
                                          Val(ndims(system)^2)))
 end
 
@@ -86,7 +90,7 @@ end
 @inline current_coordinates(u, system) = u
 
 # Specifically get the initial coordinates of a particle for all system types
-@inline function initial_coords(system, particle)
+@propagate_inbounds function initial_coords(system, particle)
     return extract_svector(initial_coordinates(system), system, particle)
 end
 
@@ -102,7 +106,7 @@ end
 # By default, try to extract it from `v`.
 @inline current_velocity(v, system) = v
 
-@inline function current_density(v, system::AbstractSystem, particle)
+@propagate_inbounds function current_density(v, system::AbstractSystem, particle)
     return current_density(v, system)[particle]
 end
 
diff --git a/src/schemes/fluid/weakly_compressible_sph/system.jl b/src/schemes/fluid/weakly_compressible_sph/system.jl
index 5462a663aa..c03c1ecd75 100644
--- a/src/schemes/fluid/weakly_compressible_sph/system.jl
+++ b/src/schemes/fluid/weakly_compressible_sph/system.jl
@@ -244,7 +244,7 @@ end
 
 system_correction(system::WeaklyCompressibleSPHSystem) = system.correction
 
-@inline function current_velocity(v, system::WeaklyCompressibleSPHSystem)
+@propagate_inbounds function current_velocity(v, system::WeaklyCompressibleSPHSystem)
     return current_velocity(v, system.density_calculator, system)
 end
 
@@ -254,14 +254,14 @@ end
     return v
 end
 
-@inline function current_velocity(v, ::ContinuityDensity,
-                                  system::WeaklyCompressibleSPHSystem)
+@propagate_inbounds function current_velocity(v, ::ContinuityDensity,
+                                              system::WeaklyCompressibleSPHSystem)
     # When using `ContinuityDensity`, the velocity is stored
     # in the first `ndims(system)` rows of `v`.
     return view(v, 1:ndims(system), :)
 end
 
-@inline function current_density(v, system::WeaklyCompressibleSPHSystem)
+@propagate_inbounds function current_density(v, system::WeaklyCompressibleSPHSystem)
     return current_density(v, system.density_calculator, system)
 end
 
@@ -271,8 +271,8 @@ end
     return system.cache.density
 end
 
-@inline function current_density(v, ::ContinuityDensity,
-                                 system::WeaklyCompressibleSPHSystem)
+@propagate_inbounds function current_density(v, ::ContinuityDensity,
+                                             system::WeaklyCompressibleSPHSystem)
     # When using `ContinuityDensity`, the density is stored in the last row of `v`
     return view(v, size(v, 1), :)
 end
diff --git a/src/schemes/structure/total_lagrangian_sph/penalty_force.jl b/src/schemes/structure/total_lagrangian_sph/penalty_force.jl
index 94dbd70aa5..025f1ab660 100644
--- a/src/schemes/structure/total_lagrangian_sph/penalty_force.jl
+++ b/src/schemes/structure/total_lagrangian_sph/penalty_force.jl
@@ -21,10 +21,11 @@ end
     return zero(initial_pos_diff)
 end
 
-@inline function dv_penalty_force(penalty_force::PenaltyForceGanzenmueller,
-                                  particle, neighbor, initial_pos_diff, initial_distance,
-                                  current_pos_diff, current_distance,
-                                  system, m_a, m_b, rho_a, rho_b)
+@propagate_inbounds function dv_penalty_force(penalty_force::PenaltyForceGanzenmueller,
+                                              particle, neighbor, initial_pos_diff,
+                                              initial_distance,
+                                              current_pos_diff, current_distance,
+                                              system, m_a, m_b, rho_a, rho_b)
     volume_a = m_a / rho_a
     volume_b = m_b / rho_b
 
@@ -33,15 +34,17 @@ end
     F_a = deformation_gradient(system, particle)
     F_b = deformation_gradient(system, neighbor)
 
+    inv_current_distance = 1 / current_distance
+
     # Use the symmetry of epsilon to simplify computations
     eps_sum = (F_a + F_b) * initial_pos_diff - 2 * current_pos_diff
-    delta_sum = dot(eps_sum, current_pos_diff) / current_distance
+    delta_sum = dot(eps_sum, current_pos_diff) * inv_current_distance
 
     E = young_modulus(system, particle)
 
     f = (penalty_force.alpha / 2) * volume_a * volume_b *
-        kernel_weight / initial_distance^2 * E * delta_sum * current_pos_diff /
-        current_distance
+        kernel_weight / initial_distance^2 * E * delta_sum * current_pos_diff *
+        inv_current_distance
 
     # Divide force by mass to obtain acceleration
     return f / m_a
diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl
index 3edad47a55..a1b6e63a84 100644
--- a/src/schemes/structure/total_lagrangian_sph/rhs.jl
+++ b/src/schemes/structure/total_lagrangian_sph/rhs.jl
@@ -26,7 +26,8 @@ end
                            points=each_integrated_particle(system)) do particle, neighbor,
                                                                        initial_pos_diff,
                                                                        initial_distance
-        # Only consider particles with a distance > 0. See `src/general/smoothing_kernels.jl` for more details.
+        # Only consider particles with a distance > 0.
+        # See `src/general/smoothing_kernels.jl` for more details.
         initial_distance^2 < eps(initial_smoothing_length(system)^2) && return
 
         rho_a = @inbounds system.material_density[particle]
@@ -38,22 +39,24 @@ end
         m_a = @inbounds system.mass[particle]
         m_b = @inbounds system.mass[neighbor]
 
+        # PK1 / rho^2
+        pk1_rho2_a = @inbounds pk1_rho2(system, particle)
+        pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)
+
         current_pos_diff = @inbounds current_coords(system, particle) -
                                      current_coords(system, neighbor)
-        current_distance = norm(current_pos_diff)
+        current_distance = sqrt(dot(current_pos_diff, current_pos_diff))
 
-        dv_stress = m_b *
-                    (pk1_corrected(system, particle) / rho_a^2 +
-                     pk1_corrected(system, neighbor) / rho_b^2) * grad_kernel
+        dv_stress = m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel
 
-        dv_penalty_force_ = dv_penalty_force(penalty_force, particle, neighbor,
-                                             initial_pos_diff, initial_distance,
-                                             current_pos_diff, current_distance,
-                                             system, m_a, m_b, rho_a, rho_b)
+        dv_penalty_force_ = @inbounds dv_penalty_force(penalty_force, particle, neighbor,
+                                                       initial_pos_diff, initial_distance,
+                                                       current_pos_diff, current_distance,
+                                                       system, m_a, m_b, rho_a, rho_b)
 
-        dv_viscosity = dv_viscosity_tlsph(system, v_system, particle, neighbor,
-                                          current_pos_diff, current_distance,
-                                          m_a, m_b, rho_a, rho_b, grad_kernel)
+        dv_viscosity = @inbounds dv_viscosity_tlsph(system, v_system, particle, neighbor,
+                                                    current_pos_diff, current_distance,
+                                                    m_a, m_b, rho_a, rho_b, grad_kernel)
 
         dv_particle = dv_stress + dv_penalty_force_ + dv_viscosity
 
diff --git a/src/schemes/structure/total_lagrangian_sph/system.jl b/src/schemes/structure/total_lagrangian_sph/system.jl
index 4bcf567874..83fcfac1ae 100644
--- a/src/schemes/structure/total_lagrangian_sph/system.jl
+++ b/src/schemes/structure/total_lagrangian_sph/system.jl
@@ -68,7 +68,7 @@ struct TotalLagrangianSPHSystem{BM, NDIMS, ELTYPE <: Real, IC, ARRAY1D, ARRAY2D,
     current_coordinates      :: ARRAY2D # Array{ELTYPE, 2}: [dimension, particle]
     mass                     :: ARRAY1D # Array{ELTYPE, 1}: [particle]
     correction_matrix        :: ARRAY3D # Array{ELTYPE, 3}: [i, j, particle]
-    pk1_corrected            :: ARRAY3D # Array{ELTYPE, 3}: [i, j, particle]
+    pk1_rho2                 :: ARRAY3D # PK1 corrected divided by rho^2: [i, j, particle]
     deformation_grad         :: ARRAY3D # Array{ELTYPE, 3}: [i, j, particle]
     material_density         :: ARRAY1D # Array{ELTYPE, 1}: [particle]
     n_integrated_particles   :: Int64
@@ -115,7 +115,7 @@ function TotalLagrangianSPHSystem(initial_condition, smoothing_kernel, smoothing
     mass = copy(initial_condition.mass)
     material_density = copy(initial_condition.density)
     correction_matrix = Array{ELTYPE, 3}(undef, NDIMS, NDIMS, n_particles)
-    pk1_corrected = Array{ELTYPE, 3}(undef, NDIMS, NDIMS, n_particles)
+    pk1_rho2 = Array{ELTYPE, 3}(undef, NDIMS, NDIMS, n_particles)
     deformation_grad = Array{ELTYPE, 3}(undef, NDIMS, NDIMS, n_particles)
 
     n_integrated_particles = n_particles - n_clamped_particles
@@ -132,7 +132,7 @@ function TotalLagrangianSPHSystem(initial_condition, smoothing_kernel, smoothing
 
     return TotalLagrangianSPHSystem(initial_condition, initial_coordinates,
                                     current_coordinates, mass, correction_matrix,
-                                    pk1_corrected, deformation_grad, material_density,
+                                    pk1_rho2, deformation_grad, material_density,
                                     n_integrated_particles, young_modulus, poisson_ratio,
                                     lame_lambda, lame_mu, smoothing_kernel,
                                     smoothing_length, acceleration_, boundary_model,
@@ -171,14 +171,14 @@ end
     return system.current_coordinates
 end
 
-@inline function current_coords(system::TotalLagrangianSPHSystem, particle)
+@propagate_inbounds function current_coords(system::TotalLagrangianSPHSystem, particle)
     # For this system, the current coordinates are stored in the system directly,
     # so we don't need a `u` array. This function is only to be used in this file
     # when no `u` is available.
     current_coords(nothing, system, particle)
 end
 
-@inline function current_velocity(v, system::TotalLagrangianSPHSystem, particle)
+@propagate_inbounds function current_velocity(v, system::TotalLagrangianSPHSystem, particle)
     if particle <= system.n_integrated_particles
         return extract_svector(v, system, particle)
     end
@@ -204,58 +204,58 @@ end
     error("`current_velocity(v, system)` is not implemented for `TotalLagrangianSPHSystem`")
 end
 
-@inline function viscous_velocity(v, system::TotalLagrangianSPHSystem, particle)
+@propagate_inbounds function viscous_velocity(v, system::TotalLagrangianSPHSystem, particle)
     return extract_svector(system.boundary_model.cache.wall_velocity, system, particle)
 end
 
-@inline function current_density(v, system::TotalLagrangianSPHSystem)
+@propagate_inbounds function current_density(v, system::TotalLagrangianSPHSystem)
     return current_density(v, system.boundary_model, system)
 end
 
 # In fluid-structure interaction, use the "hydrodynamic pressure" of the structure particles
 # corresponding to the chosen boundary model.
-@inline function current_pressure(v, system::TotalLagrangianSPHSystem)
+@propagate_inbounds function current_pressure(v, system::TotalLagrangianSPHSystem)
     return current_pressure(v, system.boundary_model, system)
 end
 
-@inline function hydrodynamic_mass(system::TotalLagrangianSPHSystem, particle)
+@propagate_inbounds function hydrodynamic_mass(system::TotalLagrangianSPHSystem, particle)
     return system.boundary_model.hydrodynamic_mass[particle]
 end
 
-@inline function correction_matrix(system, particle)
+@propagate_inbounds function correction_matrix(system, particle)
     extract_smatrix(system.correction_matrix, system, particle)
 end
 
-@inline function deformation_gradient(system, particle)
+@propagate_inbounds function deformation_gradient(system, particle)
     extract_smatrix(system.deformation_grad, system, particle)
 end
-@inline function pk1_corrected(system, particle)
-    extract_smatrix(system.pk1_corrected, system, particle)
+@propagate_inbounds function pk1_rho2(system, particle)
+    extract_smatrix(system.pk1_rho2, system, particle)
 end
 
-function young_modulus(system::TotalLagrangianSPHSystem, particle)
+@propagate_inbounds function young_modulus(system::TotalLagrangianSPHSystem, particle)
     return young_modulus(system, system.young_modulus, particle)
 end
 
-function young_modulus(::TotalLagrangianSPHSystem, young_modulus, particle)
+@inline function young_modulus(::TotalLagrangianSPHSystem, young_modulus, particle)
     return young_modulus
 end
 
-function young_modulus(::TotalLagrangianSPHSystem,
-                       young_modulus::AbstractVector, particle)
+@propagate_inbounds function young_modulus(::TotalLagrangianSPHSystem,
+                                           young_modulus::AbstractVector, particle)
     return young_modulus[particle]
 end
 
-function poisson_ratio(system::TotalLagrangianSPHSystem, particle)
+@propagate_inbounds function poisson_ratio(system::TotalLagrangianSPHSystem, particle)
     return poisson_ratio(system, system.poisson_ratio, particle)
 end
 
-function poisson_ratio(::TotalLagrangianSPHSystem, poisson_ratio, particle)
+@inline function poisson_ratio(::TotalLagrangianSPHSystem, poisson_ratio, particle)
     return poisson_ratio
 end
 
-function poisson_ratio(::TotalLagrangianSPHSystem,
-                       poisson_ratio::AbstractVector, particle)
+@inline function poisson_ratio(::TotalLagrangianSPHSystem,
+                               poisson_ratio::AbstractVector, particle)
     return poisson_ratio[particle]
 end
 
@@ -316,16 +316,18 @@ function update_boundary_interpolation!(system::TotalLagrangianSPHSystem, v, u,
 end
 
 @inline function compute_pk1_corrected!(system, semi)
-    (; deformation_grad) = system
+    (; deformation_grad, pk1_rho2, material_density) = system
 
     calc_deformation_grad!(deformation_grad, system, semi)
 
     @threaded semi for particle in eachparticle(system)
         pk1_particle = pk1_stress_tensor(system, particle)
         pk1_particle_corrected = pk1_particle * correction_matrix(system, particle)
+        rho2_inv = 1 / material_density[particle]^2
 
-        @inbounds for j in 1:ndims(system), i in 1:ndims(system)
-            system.pk1_corrected[i, j, particle] = pk1_particle_corrected[i, j]
+        for j in 1:ndims(system), i in 1:ndims(system)
+            # Precompute PK1 / rho^2 to avoid repeated divisions in the interaction loop
+            @inbounds pk1_rho2[i, j, particle] = pk1_particle_corrected[i, j] * rho2_inv
         end
     end
 end
@@ -446,7 +448,7 @@ end
 # The von-Mises stress is one form of equivalent stress, where sigma is the deviatoric stress.
 # See pages 32 and 123.
 function von_mises_stress(system)
-    von_mises_stress_vector = zeros(eltype(system.pk1_corrected), nparticles(system))
+    von_mises_stress_vector = zeros(eltype(system.pk1_rho2), nparticles(system))
 
     @threaded default_backend(von_mises_stress_vector) for particle in
                                                            each_integrated_particle(system)
@@ -463,7 +465,7 @@ end
 @inline function von_mises_stress(system, particle::Integer)
     F = deformation_gradient(system, particle)
     J = det(F)
-    P = pk1_corrected(system, particle)
+    P = pk1_rho2(system, particle) * system.material_density[particle]^2
     sigma = (1.0 / J) * P * F'
 
     # Calculate deviatoric stress tensor
@@ -479,14 +481,14 @@ end
 function cauchy_stress(system::TotalLagrangianSPHSystem)
     NDIMS = ndims(system)
 
-    cauchy_stress_tensors = zeros(eltype(system.pk1_corrected), NDIMS, NDIMS,
+    cauchy_stress_tensors = zeros(eltype(system.pk1_rho2), NDIMS, NDIMS,
                                   nparticles(system))
 
     @threaded default_backend(cauchy_stress_tensors) for particle in
                                                          each_integrated_particle(system)
         F = deformation_gradient(system, particle)
         J = det(F)
-        P = pk1_corrected(system, particle)
+        P = pk1_rho2(system, particle) * system.material_density[particle]^2
         sigma = (1.0 / J) * P * F'
         cauchy_stress_tensors[:, :, particle] = sigma
     end
@@ -507,7 +509,7 @@ end
 end
 
 function system_data(system::TotalLagrangianSPHSystem, dv_ode, du_ode, v_ode, u_ode, semi)
-    (; mass, material_density, deformation_grad, pk1_corrected, young_modulus,
+    (; mass, material_density, deformation_grad, young_modulus,
      poisson_ratio, lame_lambda, lame_mu) = system
 
     dv = wrap_v(dv_ode, system, semi)
@@ -518,6 +520,8 @@ function system_data(system::TotalLagrangianSPHSystem, dv_ode, du_ode, v_ode, u_
     initial_coordinates_ = initial_coordinates(system)
     velocity = [current_velocity(v, system, particle) for particle in eachparticle(system)]
     acceleration = system_data_acceleration(dv, system, system.clamped_particles_motion)
+    pk1_corrected = [pk1_rho2(system, particle) * system.material_density[particle]^2
+                     for particle in eachparticle(system)]
 
     return (; coordinates, initial_coordinates=initial_coordinates_, velocity, mass,
             material_density, deformation_grad, pk1_corrected, young_modulus, poisson_ratio,
diff --git a/test/schemes/structure/total_lagrangian_sph/rhs.jl b/test/schemes/structure/total_lagrangian_sph/rhs.jl
index 555338187e..47f6467e8d 100644
--- a/test/schemes/structure/total_lagrangian_sph/rhs.jl
+++ b/test/schemes/structure/total_lagrangian_sph/rhs.jl
@@ -41,13 +41,18 @@
             initial_coordinates[:, neighbor[i]] = initial_coordinates_neighbor[i]
             current_coordinates = zeros(2, 10)
             v_system = zeros(2, 10)
-            pk1_corrected = 2000 * ones(2, 2, 10) # Just something that's not zero to catch errors
-            pk1_corrected[:, :, particle[i]] = pk1_particle_corrected[i]
-            pk1_corrected[:, :, neighbor[i]] = pk1_neighbor_corrected[i]
 
             # Density equals the ID of the particle
             material_density = 1:10
 
+            pk1_rho2 = 2000 * ones(2, 2, 10) # Just something that's not zero to catch errors
+            pk1_rho2[:, :,
+                     particle[i]] = pk1_particle_corrected[i] /
+                                    material_density[particle[i]]^2
+            pk1_rho2[:, :,
+                     neighbor[i]] = pk1_neighbor_corrected[i] /
+                                    material_density[neighbor[i]]^2
+
             # Use the same setup as in the unit test above for calc_dv!
             mass = ones(Float64, 10)
             kernel_deriv = 1.0
@@ -67,8 +72,8 @@
                     return current_coordinates
                 elseif f === :material_density
                     return material_density
-                elseif f === :pk1_corrected
-                    return pk1_corrected
+                elseif f === :pk1_rho2
+                    return pk1_rho2
                 elseif f === :mass
                     return mass
                 elseif f === :penalty_force
diff --git a/test/systems/tlsph_system.jl b/test/systems/tlsph_system.jl
index f8c587e3fa..370e0e187c 100644
--- a/test/systems/tlsph_system.jl
+++ b/test/systems/tlsph_system.jl
@@ -379,10 +379,11 @@
         system = TotalLagrangianSPHSystem(initial_condition, smoothing_kernel,
                                           smoothing_length, E, nu)
 
-        # Initialize deformation_grad and pk1_corrected with arbitrary values
+        # Initialize deformation_grad and pk1_rho2 with arbitrary values
         for particle in TrixiParticles.eachparticle(system)
             system.deformation_grad[:, :, particle] = [1.0 0.2; 0.2 1.0]
-            system.pk1_corrected[:, :, particle] = [1.0 0.5; 0.5 1.0]
+            system.pk1_rho2[:, :,
+                            particle] = [1.0 0.5; 0.5 1.0] / material_densities[particle]^2
         end
 
         von_mises_stress = TrixiParticles.von_mises_stress(system)

From dd28efcc43e11f503b1692a5ff990977d4d9fd77 Mon Sep 17 00:00:00 2001
From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com>
Date: Mon, 3 Nov 2025 11:13:36 +0100
Subject: [PATCH 2/2] Undo varargs change

---
 src/general/abstract_system.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl
index 91fb139470..f854d80d6b 100644
--- a/src/general/abstract_system.jl
+++ b/src/general/abstract_system.jl
@@ -53,17 +53,17 @@ end
 initialize!(system, semi) = system
 
 # This should not be dispatched by system type. We always expect to get a column of `A`.
-@propagate_inbounds function extract_svector(A, system, i...)
-    extract_svector(A, Val(ndims(system)), i...)
+@propagate_inbounds function extract_svector(A, system, i)
+    extract_svector(A, Val(ndims(system)), i)
 end
 
 # Return the `i`-th column of the array `A` as an `SVector`.
-@inline function extract_svector(A, ::Val{NDIMS}, i...) where {NDIMS}
+@inline function extract_svector(A, ::Val{NDIMS}, i) where {NDIMS}
     # Explicit bounds check, which can be removed by calling this function with `@inbounds`
-    @boundscheck checkbounds(A, NDIMS, i...)
+    @boundscheck checkbounds(A, NDIMS, i)
 
     # Assume inbounds access now
-    return SVector(ntuple(@inline(dim->@inbounds A[dim, i...]), NDIMS))
+    return SVector(ntuple(@inline(dim->@inbounds A[dim, i]), NDIMS))
 end
 
 # Return `A[:, :, i]` as an `SMatrix`.