From 7a91cfa355e25094f62220141f0d6ef66281858a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= <jgrad@icp.uni-stuttgart.de>
Date: Fri, 14 Jun 2024 23:02:13 +0200
Subject: [PATCH] Redesign P3M grid operations

Encapsulate P3M hot loops into lambdas executed via for_each_3d().
Regroup P3M-specific math functions into a single file.
---
 src/core/electrostatics/p3m.cpp               | 243 ++++++------
 src/core/electrostatics/p3m_gpu_cuda.cu       |   8 +-
 src/core/electrostatics/p3m_gpu_error_cuda.cu |  55 +--
 src/core/magnetostatics/dp3m.cpp              | 358 ++++++++----------
 src/core/p3m/common.cpp                       |  44 ---
 src/core/p3m/common.hpp                       |  12 +-
 src/core/p3m/for_each_3d.hpp                  |  70 ++++
 src/core/p3m/influence_function.hpp           | 115 +++---
 src/core/p3m/influence_function_dipolar.hpp   | 219 ++++++-----
 src/core/p3m/math.hpp                         | 151 ++++++++
 src/core/unit_tests/CMakeLists.txt            |   4 +-
 src/core/unit_tests/fft_test.cpp              |  38 ++
 src/core/unit_tests/math_test.cpp             | 105 +++++
 src/core/unit_tests/p3m_test.cpp              |  30 +-
 src/utils/include/utils/Vector.hpp            |  34 +-
 src/utils/include/utils/math/abs.hpp          |  40 --
 src/utils/include/utils/math/sinc.hpp         |  61 ---
 src/utils/include/utils/math/sqr.hpp          |   2 +
 src/utils/tests/CMakeLists.txt                |   2 -
 src/utils/tests/Vector_test.cpp               |  32 +-
 src/utils/tests/abs_test.cpp                  |  38 --
 src/utils/tests/sinc_test.cpp                 |  39 --
 22 files changed, 877 insertions(+), 823 deletions(-)
 create mode 100644 src/core/p3m/for_each_3d.hpp
 create mode 100644 src/core/p3m/math.hpp
 create mode 100644 src/core/unit_tests/math_test.cpp
 delete mode 100644 src/utils/include/utils/math/abs.hpp
 delete mode 100644 src/utils/include/utils/math/sinc.hpp
 delete mode 100644 src/utils/tests/abs_test.cpp
 delete mode 100644 src/utils/tests/sinc_test.cpp
diff --git a/src/core/electrostatics/p3m.cpp b/src/core/electrostatics/p3m.cpp
index 15cb7028cbc..b16cb96f141 100644
--- a/src/core/electrostatics/p3m.cpp
+++ b/src/core/electrostatics/p3m.cpp
@@ -36,7 +36,9 @@
 #include "p3m/TuningAlgorithm.hpp"
 #include "p3m/TuningLogger.hpp"
 #include "p3m/fft.hpp"
+#include "p3m/for_each_3d.hpp"
 #include "p3m/influence_function.hpp"
+#include "p3m/math.hpp"
 
 #include "BoxGeometry.hpp"
 #include "LocalBox.hpp"
@@ -57,7 +59,6 @@
 #include <utils/Vector.hpp>
 #include <utils/integral_parameter.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
@@ -76,9 +77,11 @@
 #include <initializer_list>
 #include <numbers>
 #include <optional>
+#include <span>
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include <tuple>
 #include <utility>
 
 void CoulombP3M::count_charged_particles() {
@@ -115,7 +118,7 @@ void CoulombP3M::calc_influence_function_force() {
   auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh};
 
   p3m.g_force = grid_influence_function<1>(p3m.params, start, start + size,
-                                           get_system().box_geo->length());
+                                           get_system().box_geo->length_inv());
 }
 
 /** Calculate the influence function optimized for the energy and the
@@ -126,41 +129,38 @@ void CoulombP3M::calc_influence_function_energy() {
   auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh};
 
   p3m.g_energy = grid_influence_function<0>(p3m.params, start, start + size,
-                                            get_system().box_geo->length());
+                                            get_system().box_geo->length_inv());
 }
 
 /** Aliasing sum used by @ref p3m_k_space_error. */
-static auto p3m_tune_aliasing_sums(int nx, int ny, int nz,
+static auto p3m_tune_aliasing_sums(Utils::Vector3i const &shift,
                                    Utils::Vector3i const &mesh,
                                    Utils::Vector3d const &mesh_i, int cao,
                                    double alpha_L_i) {
-  using Utils::sinc;
 
+  auto constexpr m_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
   auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
-
   auto alias1 = 0.;
   auto alias2 = 0.;
-  for (int mx = -P3M_BRILLOUIN; mx <= P3M_BRILLOUIN; mx++) {
-    auto const nmx = nx + mx * mesh[0];
-    auto const fnmx = mesh_i[0] * nmx;
-    for (int my = -P3M_BRILLOUIN; my <= P3M_BRILLOUIN; my++) {
-      auto const nmy = ny + my * mesh[1];
-      auto const fnmy = mesh_i[1] * nmy;
-      for (int mz = -P3M_BRILLOUIN; mz <= P3M_BRILLOUIN; mz++) {
-        auto const nmz = nz + mz * mesh[2];
-        auto const fnmz = mesh_i[2] * nmz;
-
-        auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz);
-        auto const ex = exp(-factor1 * nm2);
-        auto const ex2 = Utils::sqr(ex);
-
-        auto const U2 = pow(sinc(fnmx) * sinc(fnmy) * sinc(fnmz), 2.0 * cao);
-
-        alias1 += ex2 / nm2;
-        alias2 += U2 * ex * (nx * nmx + ny * nmy + nz * nmz) / nm2;
-      }
-    }
-  }
+
+  Utils::Vector3i indices{};
+  Utils::Vector3i nm{};
+  Utils::Vector3d fnm{};
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        auto const norm_sq = nm.norm2();
+        auto const ex = exp(-factor1 * norm_sq);
+        auto const energy = std::pow(Utils::product(fnm), 2 * cao);
+        alias1 += Utils::sqr(ex) / norm_sq;
+        alias2 += energy * ex * (shift * nm) / norm_sq;
+      },
+      [&](unsigned dim, int n) {
+        nm[dim] = shift[dim] + n * mesh[dim];
+        fnm[dim] = math::sinc(nm[dim] * mesh_i[dim]);
+      });
+
   return std::make_pair(alias1, alias2);
 }
 
@@ -198,51 +198,40 @@ static double p3m_real_space_error(double pref, double r_cut_iL, int n_c_part,
 static double p3m_k_space_error(double pref, Utils::Vector3i const &mesh,
                                 int cao, int n_c_part, double sum_q2,
                                 double alpha_L, Utils::Vector3d const &box_l) {
-  auto const mesh_i =
-      Utils::hadamard_division(Utils::Vector3d::broadcast(1.), mesh);
+
+  auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao);
+  auto const mesh_i = 1. / Utils::Vector3d(mesh);
   auto const alpha_L_i = 1. / alpha_L;
+  auto const m_stop = mesh / 2;
+  auto const m_start = -m_stop;
+  auto indices = Utils::Vector3i{};
+  auto values = Utils::Vector3d{};
   auto he_q = 0.;
 
-  for (int nx = -mesh[0] / 2; nx < mesh[0] / 2; nx++) {
-    auto const ctan_x = p3m_analytic_cotangent_sum(nx, mesh_i[0], cao);
-    for (int ny = -mesh[1] / 2; ny < mesh[1] / 2; ny++) {
-      auto const ctan_y =
-          ctan_x * p3m_analytic_cotangent_sum(ny, mesh_i[1], cao);
-      for (int nz = -mesh[2] / 2; nz < mesh[2] / 2; nz++) {
-        if ((nx != 0) || (ny != 0) || (nz != 0)) {
-          auto const n2 = Utils::sqr(nx) + Utils::sqr(ny) + Utils::sqr(nz);
-          auto const cs =
-              p3m_analytic_cotangent_sum(nz, mesh_i[2], cao) * ctan_y;
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
+          auto const n2 = indices.norm2();
+          auto const cs = Utils::product(values);
           auto const [alias1, alias2] =
-              p3m_tune_aliasing_sums(nx, ny, nz, mesh, mesh_i, cao, alpha_L_i);
+              p3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i);
           auto const d = alias1 - Utils::sqr(alias2 / cs) / n2;
           /* at high precision, d can become negative due to extinction;
              also, don't take values that have no significant digits left*/
-          if (d > 0 && (fabs(d / alias1) > ROUND_ERROR_PREC))
+          if (d > 0. and std::fabs(d / alias1) > ROUND_ERROR_PREC) {
             he_q += d;
+          }
         }
-      }
-    }
-  }
+      },
+      [&values, &mesh_i, cotangent_sum](unsigned dim, int n) {
+        values[dim] = cotangent_sum(n, mesh_i[dim]);
+      });
+
   return 2. * pref * sum_q2 * sqrt(he_q / static_cast<double>(n_c_part)) /
          (box_l[1] * box_l[2]);
 }
 
-#ifdef CUDA
-static double p3mgpu_k_space_error(double prefactor,
-                                   Utils::Vector3i const &mesh, int cao,
-                                   int npart, double sum_q2, double alpha_L,
-                                   Utils::Vector3d const &box_l) {
-  auto ks_err = 0.;
-  if (this_node == 0) {
-    ks_err = p3m_k_space_error_gpu(prefactor, mesh.data(), cao, npart, sum_q2,
-                                   alpha_L, box_l.data());
-  }
-  boost::mpi::broadcast(comm_cart, ks_err, 0);
-  return ks_err;
-}
-#endif
-
 void CoulombP3M::init() {
   assert(p3m.params.mesh >= Utils::Vector3i::broadcast(1));
   assert(p3m.params.cao >= 1 and p3m.params.cao <= 7);
@@ -422,48 +411,44 @@ CoulombP3M::long_range_pressure(ParticleRange const &particles) {
     p3m.sm.gather_grid(p3m.rs_mesh.data(), comm_cart, p3m.local_mesh.dim);
     fft_perform_forw(p3m.rs_mesh.data(), p3m.fft, comm_cart);
 
-    auto diagonal = 0.;
-    int ind = 0;
-    int j[3];
+    auto constexpr m_start = Utils::Vector3i::broadcast(0);
     auto const half_alpha_inv_sq = Utils::sqr(1. / 2. / p3m.params.alpha);
-    for (j[0] = 0; j[0] < p3m.fft.plan[3].new_mesh[RX]; j[0]++) {
-      for (j[1] = 0; j[1] < p3m.fft.plan[3].new_mesh[RY]; j[1]++) {
-        for (j[2] = 0; j[2] < p3m.fft.plan[3].new_mesh[RZ]; j[2]++) {
-          auto const kx = 2. * std::numbers::pi *
-                          p3m.d_op[RX][j[KX] + p3m.fft.plan[3].start[KX]] *
-                          box_geo.length_inv()[RX];
-          auto const ky = 2. * std::numbers::pi *
-                          p3m.d_op[RY][j[KY] + p3m.fft.plan[3].start[KY]] *
-                          box_geo.length_inv()[RY];
-          auto const kz = 2. * std::numbers::pi *
-                          p3m.d_op[RZ][j[KZ] + p3m.fft.plan[3].start[KZ]] *
-                          box_geo.length_inv()[RZ];
-          auto const sqk = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz);
-
-          if (sqk != 0.) {
-            auto const node_k_space_energy =
-                p3m.g_energy[ind] * (Utils::sqr(p3m.rs_mesh[2 * ind]) +
-                                     Utils::sqr(p3m.rs_mesh[2 * ind + 1]));
-            auto const vterm = -2. * (1. / sqk + half_alpha_inv_sq);
-            auto const pref = node_k_space_energy * vterm;
-            node_k_space_pressure_tensor[0] += pref * kx * kx; /* sigma_xx */
-            node_k_space_pressure_tensor[1] += pref * kx * ky; /* sigma_xy */
-            node_k_space_pressure_tensor[2] += pref * kx * kz; /* sigma_xz */
-            node_k_space_pressure_tensor[3] += pref * ky * kx; /* sigma_yx */
-            node_k_space_pressure_tensor[4] += pref * ky * ky; /* sigma_yy */
-            node_k_space_pressure_tensor[5] += pref * ky * kz; /* sigma_yz */
-            node_k_space_pressure_tensor[6] += pref * kz * kx; /* sigma_zx */
-            node_k_space_pressure_tensor[7] += pref * kz * ky; /* sigma_zy */
-            node_k_space_pressure_tensor[8] += pref * kz * kz; /* sigma_zz */
-            diagonal += node_k_space_energy;
-          }
-          ind++;
-        }
+    auto const m_stop = std::span(p3m.fft.plan[3].new_mesh);
+    auto const offset = std::span(p3m.fft.plan[3].start);
+    auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv();
+    auto indices = Utils::Vector3i{};
+    auto index = std::size_t(0u);
+    auto diagonal = 0.;
+
+    for_each_3d(m_start, m_stop, indices, [&]() {
+      auto const kx = p3m.d_op[RX][indices[KX] + offset[KX]] * wavevector[RX];
+      auto const ky = p3m.d_op[RY][indices[KY] + offset[KY]] * wavevector[RY];
+      auto const kz = p3m.d_op[RZ][indices[KZ] + offset[KZ]] * wavevector[RZ];
+      auto const norm_sq = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz);
+
+      if (norm_sq != 0.) {
+        auto const node_k_space_energy =
+            p3m.g_energy[index] * (Utils::sqr(p3m.rs_mesh[2u * index + 0u]) +
+                                   Utils::sqr(p3m.rs_mesh[2u * index + 1u]));
+        auto const vterm = -2. * (1. / norm_sq + half_alpha_inv_sq);
+        auto const pref = node_k_space_energy * vterm;
+        node_k_space_pressure_tensor[0u] += pref * kx * kx; /* sigma_xx */
+        node_k_space_pressure_tensor[1u] += pref * kx * ky; /* sigma_xy */
+        node_k_space_pressure_tensor[2u] += pref * kx * kz; /* sigma_xz */
+        node_k_space_pressure_tensor[4u] += pref * ky * ky; /* sigma_yy */
+        node_k_space_pressure_tensor[5u] += pref * ky * kz; /* sigma_yz */
+        node_k_space_pressure_tensor[8u] += pref * kz * kz; /* sigma_zz */
+        diagonal += node_k_space_energy;
       }
-    }
-    node_k_space_pressure_tensor[0] += diagonal;
-    node_k_space_pressure_tensor[4] += diagonal;
-    node_k_space_pressure_tensor[8] += diagonal;
+      ++index;
+    });
+
+    node_k_space_pressure_tensor[0u] += diagonal;
+    node_k_space_pressure_tensor[4u] += diagonal;
+    node_k_space_pressure_tensor[8u] += diagonal;
+    node_k_space_pressure_tensor[3u] = node_k_space_pressure_tensor[1u];
+    node_k_space_pressure_tensor[6u] = node_k_space_pressure_tensor[2u];
+    node_k_space_pressure_tensor[7u] = node_k_space_pressure_tensor[5u];
   }
 
   return node_k_space_pressure_tensor * prefactor / (2. * box_geo.volume());
@@ -510,35 +495,35 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag,
 
   /* === k-space force calculation  === */
   if (force_flag) {
-    /* sqrt(-1)*k differentiation */
-    int j[3];
-    int ind = 0;
+    /* i*k differentiation */
+    auto constexpr m_start = Utils::Vector3i::broadcast(0);
+    auto const m_stop = std::span(p3m.fft.plan[3].new_mesh);
+    auto const offset = std::span(p3m.fft.plan[3].start);
+    auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv();
+    auto indices = Utils::Vector3i{};
+    auto index = std::size_t(0u);
+
     /* compute electric field */
     // Eq. (3.49) @cite deserno00b
-    for (j[0] = 0; j[0] < p3m.fft.plan[3].new_mesh[0]; j[0]++) {
-      for (j[1] = 0; j[1] < p3m.fft.plan[3].new_mesh[1]; j[1]++) {
-        for (j[2] = 0; j[2] < p3m.fft.plan[3].new_mesh[2]; j[2]++) {
-          auto const rho_hat = std::complex<double>(p3m.rs_mesh[2 * ind + 0],
-                                                    p3m.rs_mesh[2 * ind + 1]);
-          auto const phi_hat = p3m.g_force[ind] * rho_hat;
-
-          for (int d = 0; d < 3; d++) {
-            /* direction in r-space: */
-            int d_rs = (d + p3m.ks_pnum) % 3;
-            /* directions */
-            auto const k = 2. * std::numbers::pi *
-                           p3m.d_op[d_rs][j[d] + p3m.fft.plan[3].start[d]] *
-                           box_geo.length_inv()[d_rs];
-
-            /* i*k*(Re+i*Im) = - Im*k + i*Re*k     (i=sqrt(-1)) */
-            p3m.E_mesh[d_rs][2 * ind + 0] = -k * phi_hat.imag();
-            p3m.E_mesh[d_rs][2 * ind + 1] = +k * phi_hat.real();
-          }
-
-          ind++;
-        }
+    for_each_3d(m_start, m_stop, indices, [&]() {
+      auto const rho_hat = std::complex<double>(p3m.rs_mesh[2u * index + 0u],
+                                                p3m.rs_mesh[2u * index + 1u]);
+      auto const phi_hat = p3m.g_force[index] * rho_hat;
+
+      for (int d = 0; d < 3; d++) {
+        /* direction in r-space: */
+        int d_rs = (d + p3m.ks_pnum) % 3;
+        /* directions */
+        auto const k =
+            p3m.d_op[d_rs][indices[d] + offset[d]] * wavevector[d_rs];
+
+        /* i*k*(Re+i*Im) = - Im*k + i*Re*k     (i=sqrt(-1)) */
+        p3m.E_mesh[d_rs][2u * index + 0u] = -k * phi_hat.imag();
+        p3m.E_mesh[d_rs][2u * index + 1u] = +k * phi_hat.real();
       }
-    }
+
+      ++index;
+    });
 
     /* Back FFT force component mesh */
     auto const check_complex = !p3m.params.tuning and check_complex_residuals;
@@ -679,8 +664,12 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
 #ifdef CUDA
     auto const &solver = m_system.coulomb.impl->solver;
     if (has_actor_of_type<CoulombP3MGPU>(solver)) {
-      ks_err = p3mgpu_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart,
-                                    p3m.sum_q2, alpha_L, box_geo.length());
+      if (this_node == 0) {
+        ks_err =
+            p3m_k_space_error_gpu(m_prefactor, mesh.data(), cao, p3m.sum_qpart,
+                                  p3m.sum_q2, alpha_L, box_geo.length().data());
+      }
+      boost::mpi::broadcast(comm_cart, ks_err, 0);
     } else
 #endif
       ks_err = p3m_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart,
diff --git a/src/core/electrostatics/p3m_gpu_cuda.cu b/src/core/electrostatics/p3m_gpu_cuda.cu
index a01b64b1496..c734ba0cccd 100644
--- a/src/core/electrostatics/p3m_gpu_cuda.cu
+++ b/src/core/electrostatics/p3m_gpu_cuda.cu
@@ -53,11 +53,11 @@
 #include "electrostatics/p3m_gpu_cuda.cuh"
 
 #include "cuda/utils.cuh"
+#include "p3m/math.hpp"
 #include "system/System.hpp"
 
 #include <utils/math/bspline.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <cuda.h>
@@ -185,15 +185,15 @@ __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY,
   for (MX = -P3M_BRILLOUIN; MX <= P3M_BRILLOUIN; MX++) {
     NMX = static_cast<REAL_TYPE>(((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX) +
                                  p.mesh[0] * MX);
-    S1 = int_pow<2 * cao>(Utils::sinc(Meshi[0] * NMX));
+    S1 = int_pow<2 * cao>(math::sinc(Meshi[0] * NMX));
     for (MY = -P3M_BRILLOUIN; MY <= P3M_BRILLOUIN; MY++) {
       NMY = static_cast<REAL_TYPE>(
           ((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY) + p.mesh[1] * MY);
-      S2 = S1 * int_pow<2 * cao>(Utils::sinc(Meshi[1] * NMY));
+      S2 = S1 * int_pow<2 * cao>(math::sinc(Meshi[1] * NMY));
       for (MZ = -P3M_BRILLOUIN; MZ <= P3M_BRILLOUIN; MZ++) {
         NMZ = static_cast<REAL_TYPE>(
             ((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ) + p.mesh[2] * MZ);
-        S3 = S2 * int_pow<2 * cao>(Utils::sinc(Meshi[2] * NMZ));
+        S3 = S2 * int_pow<2 * cao>(math::sinc(Meshi[2] * NMZ));
 
         NM2 = sqr(NMX * Leni[0]) + sqr(NMY * Leni[1]) + sqr(NMZ * Leni[2]);
         *Nenner += S3;
diff --git a/src/core/electrostatics/p3m_gpu_error_cuda.cu b/src/core/electrostatics/p3m_gpu_error_cuda.cu
index 6e54e2dfe7e..fbc51fecf4a 100644
--- a/src/core/electrostatics/p3m_gpu_error_cuda.cu
+++ b/src/core/electrostatics/p3m_gpu_error_cuda.cu
@@ -23,12 +23,12 @@
  *  The corresponding header file is p3m_gpu_error.hpp.
  */
 
+#include "p3m/math.hpp"
 #include "p3m_gpu_error.hpp"
 
 #include "cuda/utils.cuh"
 
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <thrust/device_vector.h>
@@ -44,49 +44,12 @@
 #error CU-file includes mpi.h! This should not happen!
 #endif
 
-using Utils::int_pow;
-using Utils::sqr;
-
-/** @todo Extend to higher order. This comes from some 1/sin expansion in
- *  @cite hockney88a
- */
-
-template <int cao>
-__device__ static double p3m_analytic_cotangent_sum(int n, double mesh_i) {
-  const double c = sqr(cos(std::numbers::pi * mesh_i * n));
-
-  switch (cao) {
-  case 1:
-    return 1;
-  case 2:
-    return (1.0 + c * 2.0) / 3.0;
-  case 3:
-    return (2.0 + c * (11.0 + c * 2.0)) / 15.0;
-  case 4:
-    return (17.0 + c * (180.0 + c * (114.0 + c * 4.0))) / 315.0;
-  case 5:
-    return (62.0 + c * (1072.0 + c * (1452.0 + c * (247.0 + c * 2.0)))) /
-           2835.0;
-  case 6:
-    return (1382.0 +
-            c * (35396.0 +
-                 c * (83021.0 + c * (34096.0 + c * (2026.0 + c * 4.0))))) /
-           155925.0;
-  case 7:
-    return (21844.0 +
-            c * (776661.0 +
-                 c * (2801040.0 +
-                      c * (2123860.0 +
-                           c * (349500.0 + c * (8166.0 + c * 4.0)))))) /
-           6081075.0;
-  }
-
-  return 0.0;
-}
-
 template <int cao>
 __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi,
                                                 double alpha_L, double *he_q) {
+  using Utils::int_pow;
+  using Utils::sqr;
+
   const int nx =
       -mesh.x / 2 + static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
   const int ny =
@@ -103,14 +66,14 @@ __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi,
   if ((nx != 0) || (ny != 0) || (nz != 0)) {
     const double alpha_L_i = 1. / alpha_L;
     const double n2 = sqr(nx) + sqr(ny) + sqr(nz);
-    const double cs = p3m_analytic_cotangent_sum<cao>(nz, meshi.z) *
-                      p3m_analytic_cotangent_sum<cao>(nx, meshi.x) *
-                      p3m_analytic_cotangent_sum<cao>(ny, meshi.y);
+    const double cs = math::analytic_cotangent_sum<cao>(nz, meshi.z) *
+                      math::analytic_cotangent_sum<cao>(nx, meshi.x) *
+                      math::analytic_cotangent_sum<cao>(ny, meshi.y);
     const double ex = exp(-sqr(std::numbers::pi * alpha_L_i) * n2);
     const double ex2 = sqr(ex);
     const double U2 =
-        int_pow<2 * cao>(Utils::sinc(meshi.x * nx) * Utils::sinc(meshi.y * ny) *
-                         Utils::sinc(meshi.z * nz));
+        int_pow<2 * cao>(math::sinc(meshi.x * nx) * math::sinc(meshi.y * ny) *
+                         math::sinc(meshi.z * nz));
     auto const alias1 = ex2 / n2;
     auto const d = alias1 - sqr(U2 * ex / cs) / n2;
 
diff --git a/src/core/magnetostatics/dp3m.cpp b/src/core/magnetostatics/dp3m.cpp
index c5485a2ca1c..4214900fb52 100644
--- a/src/core/magnetostatics/dp3m.cpp
+++ b/src/core/magnetostatics/dp3m.cpp
@@ -37,6 +37,7 @@
 #include "p3m/fft.hpp"
 #include "p3m/influence_function_dipolar.hpp"
 #include "p3m/interpolation.hpp"
+#include "p3m/math.hpp"
 #include "p3m/send_mesh.hpp"
 
 #include "BoxGeometry.hpp"
@@ -56,7 +57,6 @@
 #include <utils/Vector.hpp>
 #include <utils/integral_parameter.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
@@ -67,10 +67,13 @@
 #include <cstddef>
 #include <cstdio>
 #include <functional>
+#include <iterator>
 #include <numbers>
 #include <optional>
+#include <span>
 #include <sstream>
 #include <stdexcept>
+#include <tuple>
 #include <vector>
 
 void DipolarP3M::count_magnetic_particles() {
@@ -272,15 +275,13 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
     dipole_assign(particles);
     /* Gather information for FFT grid inside the nodes domain (inner local
      * mesh) and perform forward 3D FFT (Charge Assignment Mesh). */
-    std::array<double *, 3> meshes = {{dp3m.rs_mesh_dip[0].data(),
-                                       dp3m.rs_mesh_dip[1].data(),
-                                       dp3m.rs_mesh_dip[2].data()}};
-
+    std::array<double *, 3> meshes = {{dp3m.rs_mesh_dip[0u].data(),
+                                       dp3m.rs_mesh_dip[1u].data(),
+                                       dp3m.rs_mesh_dip[2u].data()}};
     dp3m.sm.gather_grid(meshes, comm_cart, dp3m.local_mesh.dim);
-
-    fft_perform_forw(dp3m.rs_mesh_dip[0].data(), dp3m.fft, comm_cart);
-    fft_perform_forw(dp3m.rs_mesh_dip[1].data(), dp3m.fft, comm_cart);
-    fft_perform_forw(dp3m.rs_mesh_dip[2].data(), dp3m.fft, comm_cart);
+    for (auto i = 0u; i < 3u; ++i) {
+      fft_perform_forw(dp3m.rs_mesh_dip[i].data(), dp3m.fft, comm_cart);
+    }
     // Note: after these calls, the grids are in the order yzx and not xyz
     // anymore!!!
   }
@@ -293,34 +294,32 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
     if (dp3m.sum_mu2 > 0.) {
       /* i*k differentiation for dipolar gradients:
        * |(\Fourier{\vect{mu}}(k)\cdot \vect{k})|^2 */
-      int ind = 0;
-      int i = 0;
-      int j[3];
-      double node_energy = 0.0;
-      for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) {
-        for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; j[1]++) {
-          for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; j[2]++) {
-            node_energy +=
-                dp3m.g_energy[i] *
-                (Utils::sqr(
-                     dp3m.rs_mesh_dip[0][ind] *
-                         dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                     dp3m.rs_mesh_dip[1][ind] *
-                         dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                     dp3m.rs_mesh_dip[2][ind] *
-                         dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]) +
-                 Utils::sqr(
-                     dp3m.rs_mesh_dip[0][ind + 1] *
-                         dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                     dp3m.rs_mesh_dip[1][ind + 1] *
-                         dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                     dp3m.rs_mesh_dip[2][ind + 1] *
-                         dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]));
-            ind += 2;
-            i++;
-          }
-        }
-      }
+
+      auto constexpr m_start = Utils::Vector3i::broadcast(0);
+      auto const m_stop = std::span(dp3m.fft.plan[3].new_mesh);
+      auto const offset = std::span(dp3m.fft.plan[3].start);
+      auto const &d_op = dp3m.d_op[0u];
+      auto const &mesh_dip = dp3m.rs_mesh_dip;
+      auto indices = Utils::Vector3i{};
+      auto index = std::size_t(0u);
+      auto it_energy = dp3m.g_energy.begin();
+      auto node_energy = 0.;
+      for_each_3d(m_start, m_stop, indices, [&]() {
+        using namespace detail::FFT_indexing;
+        // Re(mu)*k
+        auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
+                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
+                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        ++index;
+        // Im(mu)*k
+        auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
+                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
+                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        ++index;
+        node_energy += *it_energy * (Utils::sqr(re) + Utils::sqr(im));
+        std::advance(it_energy, 1);
+      });
+
       node_energy *= dipole_prefac * std::numbers::pi * box_geo.length_inv()[0];
       boost::mpi::reduce(comm_cart, node_energy, energy, std::plus<>(), 0);
 
@@ -344,62 +343,45 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
      * DIPOLAR TORQUES (k-space)
      ****************************/
     if (dp3m.sum_mu2 > 0.) {
-      auto const two_pi_L_i = 2. * std::numbers::pi * box_geo.length_inv()[0];
+      auto const wavenumber = 2. * std::numbers::pi * box_geo.length_inv()[0u];
+      auto constexpr m_start = Utils::Vector3i::broadcast(0);
+      auto const m_stop = std::span(dp3m.fft.plan[3].new_mesh);
+      auto const offset = std::span(dp3m.fft.plan[3].start);
+      auto const &d_op = dp3m.d_op[0u];
+      auto const &mesh_dip = dp3m.rs_mesh_dip;
+      auto indices = Utils::Vector3i{};
+      auto index = std::size_t(0u);
+
       /* fill in ks_mesh array for torque calculation */
-      int ind = 0;
-      int i = 0;
-      int j[3];
-      double tmp0, tmp1;
-
-      for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { // j[0]=n_y
-        for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1];
-             j[1]++) { // j[1]=n_z
-          for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2];
-               j[2]++) { // j[2]=n_x
-            // tmp0 = Re(mu)*k,   tmp1 = Im(mu)*k
-
-            tmp0 = dp3m.rs_mesh_dip[0][ind] *
-                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                   dp3m.rs_mesh_dip[1][ind] *
-                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                   dp3m.rs_mesh_dip[2][ind] *
-                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
-
-            tmp1 = dp3m.rs_mesh_dip[0][ind + 1] *
-                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                   dp3m.rs_mesh_dip[1][ind + 1] *
-                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                   dp3m.rs_mesh_dip[2][ind + 1] *
-                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
-
-            /* the optimal influence function is the same for torques
-               and energy */
-            dp3m.ks_mesh[ind] = tmp0 * dp3m.g_energy[i];
-            dp3m.ks_mesh[ind + 1] = tmp1 * dp3m.g_energy[i];
-            ind += 2;
-            i++;
-          }
-        }
-      }
+      auto it_energy = dp3m.g_energy.begin();
+      index = 0u;
+      for_each_3d(m_start, m_stop, indices, [&]() {
+        using namespace detail::FFT_indexing;
+        // Re(mu)*k
+        auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
+                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
+                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        dp3m.ks_mesh[index] = *it_energy * re;
+        ++index;
+        // Im(mu)*k
+        auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
+                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
+                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        dp3m.ks_mesh[index] = *it_energy * im;
+        ++index;
+        std::advance(it_energy, 1);
+      });
 
       /* Force component loop */
       for (int d = 0; d < 3; d++) {
-        auto const d_rs = (d + dp3m.ks_pnum) % 3;
-        ind = 0;
-        for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) {
-          for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; j[1]++) {
-            for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; j[2]++) {
-              dp3m.rs_mesh[ind] =
-                  dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
-                  dp3m.ks_mesh[ind];
-              ind++;
-              dp3m.rs_mesh[ind] =
-                  dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
-                  dp3m.ks_mesh[ind];
-              ind++;
-            }
-          }
-        }
+        index = 0u;
+        for_each_3d(m_start, m_stop, indices, [&]() {
+          auto const pos = indices[d] + offset[d];
+          dp3m.rs_mesh[index] = d_op[pos] * dp3m.ks_mesh[index];
+          ++index;
+          dp3m.rs_mesh[index] = d_op[pos] * dp3m.ks_mesh[index];
+          ++index;
+        });
 
         /* Back FFT force component mesh */
         fft_perform_back(dp3m.rs_mesh.data(), false, dp3m.fft, comm_cart);
@@ -407,94 +389,68 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
         dp3m.sm.spread_grid(dp3m.rs_mesh.data(), comm_cart,
                             dp3m.local_mesh.dim);
         /* Assign force component from mesh to particle */
+        auto const d_rs = (d + dp3m.ks_pnum) % 3;
         Utils::integral_parameter<int, AssignTorques, 1, 7>(
-            dp3m.params.cao, dp3m, dipole_prefac * two_pi_L_i, d_rs, particles);
+            dp3m.params.cao, dp3m, dipole_prefac * wavenumber, d_rs, particles);
       }
 
       /***************************
          DIPOLAR FORCES (k-space)
       ****************************/
-
       // Compute forces after torques because the algorithm below overwrites the
       // grids dp3m.rs_mesh_dip !
       // Note: I'll do here 9 inverse FFTs. By symmetry, we can reduce this
       // number to 6 !
       /* fill in ks_mesh array for force calculation */
-      ind = 0;
-      i = 0;
-      for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { // j[0]=n_y
-        for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1];
-             j[1]++) { // j[1]=n_z
-          for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2];
-               j[2]++) { // j[2]=n_x
-            // tmp0 = Im(mu)*k,   tmp1 = -Re(mu)*k
-            tmp0 = dp3m.rs_mesh_dip[0][ind + 1] *
-                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                   dp3m.rs_mesh_dip[1][ind + 1] *
-                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                   dp3m.rs_mesh_dip[2][ind + 1] *
-                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
-            tmp1 = dp3m.rs_mesh_dip[0][ind] *
-                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                   dp3m.rs_mesh_dip[1][ind] *
-                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                   dp3m.rs_mesh_dip[2][ind] *
-                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
-            dp3m.ks_mesh[ind] = tmp0 * dp3m.g_force[i];
-            dp3m.ks_mesh[ind + 1] = -tmp1 * dp3m.g_force[i];
-            ind += 2;
-            i++;
-          }
-        }
-      }
+      auto it_force = dp3m.g_force.begin();
+      index = 0u;
+      for_each_3d(m_start, m_stop, indices, [&]() {
+        using namespace detail::FFT_indexing;
+        // Re(mu)*k
+        auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
+                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
+                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        ++index;
+        // Im(mu)*k
+        auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
+                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
+                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        ++index;
+        dp3m.ks_mesh[index - 2] = *it_force * im;
+        dp3m.ks_mesh[index - 1] = *it_force * (-re);
+        std::advance(it_force, 1);
+      });
 
       /* Force component loop */
-      for (int d = 0; d < 3; d++) { /* direction in k-space: */
-        auto const d_rs = (d + dp3m.ks_pnum) % 3;
-        ind = 0;
-        for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0];
-             j[0]++) { // j[0]=n_y
-          for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1];
-               j[1]++) { // j[1]=n_z
-            for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2];
-                 j[2]++) { // j[2]=n_x
-              tmp0 = dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
-                     dp3m.ks_mesh[ind];
-              dp3m.rs_mesh_dip[0][ind] =
-                  dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] * tmp0;
-              dp3m.rs_mesh_dip[1][ind] =
-                  dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] * tmp0;
-              dp3m.rs_mesh_dip[2][ind] =
-                  dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]] * tmp0;
-              ind++;
-              tmp0 = dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
-                     dp3m.ks_mesh[ind];
-              dp3m.rs_mesh_dip[0][ind] =
-                  dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] * tmp0;
-              dp3m.rs_mesh_dip[1][ind] =
-                  dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] * tmp0;
-              dp3m.rs_mesh_dip[2][ind] =
-                  dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]] * tmp0;
-              ind++;
-            }
-          }
-        }
+      for (int d = 0; d < 3; d++) {
+        auto &rs_mesh_dip = dp3m.rs_mesh_dip;
+        index = 0u;
+        for_each_3d(m_start, m_stop, indices, [&]() {
+          using namespace detail::FFT_indexing;
+          auto const f1 = d_op[indices[d] + offset[d]] * dp3m.ks_mesh[index];
+          rs_mesh_dip[0][index] = d_op[indices[KX] + offset[KX]] * f1;
+          rs_mesh_dip[1][index] = d_op[indices[KY] + offset[KY]] * f1;
+          rs_mesh_dip[2][index] = d_op[indices[KZ] + offset[KZ]] * f1;
+          ++index;
+          auto const f2 = d_op[indices[d] + offset[d]] * dp3m.ks_mesh[index];
+          rs_mesh_dip[0][index] = d_op[indices[KX] + offset[KX]] * f2;
+          rs_mesh_dip[1][index] = d_op[indices[KY] + offset[KY]] * f2;
+          rs_mesh_dip[2][index] = d_op[indices[KZ] + offset[KZ]] * f2;
+          ++index;
+        });
         /* Back FFT force component mesh */
-        fft_perform_back(dp3m.rs_mesh_dip[0].data(), false, dp3m.fft,
-                         comm_cart);
-        fft_perform_back(dp3m.rs_mesh_dip[1].data(), false, dp3m.fft,
-                         comm_cart);
-        fft_perform_back(dp3m.rs_mesh_dip[2].data(), false, dp3m.fft,
-                         comm_cart);
+        for (auto i = 0u; i < 3u; ++i) {
+          fft_perform_back(rs_mesh_dip[i].data(), false, dp3m.fft, comm_cart);
+        }
         /* redistribute force component mesh */
-        std::array<double *, 3> meshes = {{dp3m.rs_mesh_dip[0].data(),
-                                           dp3m.rs_mesh_dip[1].data(),
-                                           dp3m.rs_mesh_dip[2].data()}};
-
+        std::array<double *, 3> meshes = {{rs_mesh_dip[0].data(),
+                                           rs_mesh_dip[1].data(),
+                                           rs_mesh_dip[2].data()}};
         dp3m.sm.spread_grid(meshes, comm_cart, dp3m.local_mesh.dim);
         /* Assign force component from mesh to particle */
+        auto const d_rs = (d + dp3m.ks_pnum) % 3;
         Utils::integral_parameter<int, AssignForces, 1, 7>(
-            dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(two_pi_L_i), d_rs,
+            dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(wavenumber), d_rs,
             particles);
       }
     } /* if (dp3m.sum_mu2 > 0) */
@@ -590,15 +546,15 @@ void DipolarP3M::calc_influence_function_force() {
   auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh};
 
   dp3m.g_force = grid_influence_function<3>(dp3m.params, start, start + size,
-                                            get_system().box_geo->length());
+                                            get_system().box_geo->length_inv());
 }
 
 void DipolarP3M::calc_influence_function_energy() {
   auto const start = Utils::Vector3i{dp3m.fft.plan[3].start};
   auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh};
 
-  dp3m.g_energy = grid_influence_function<2>(dp3m.params, start, start + size,
-                                             get_system().box_geo->length());
+  dp3m.g_energy = grid_influence_function<2>(
+      dp3m.params, start, start + size, get_system().box_geo->length_inv());
 }
 
 class DipolarTuningAlgorithm : public TuningAlgorithm {
@@ -749,62 +705,68 @@ void DipolarP3M::tune() {
 }
 
 /** Tuning dipolar-P3M */
-static auto dp3m_tune_aliasing_sums(int nx, int ny, int nz, int mesh,
+static auto dp3m_tune_aliasing_sums(Utils::Vector3i const &shift, int mesh,
                                     double mesh_i, int cao, double alpha_L_i) {
-  using Utils::sinc;
 
+  auto constexpr m_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
   auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
-
   auto alias1 = 0.;
   auto alias2 = 0.;
-  for (int mx = -P3M_BRILLOUIN; mx <= P3M_BRILLOUIN; mx++) {
-    auto const nmx = nx + mx * mesh;
-    auto const fnmx = mesh_i * nmx;
-    for (int my = -P3M_BRILLOUIN; my <= P3M_BRILLOUIN; my++) {
-      auto const nmy = ny + my * mesh;
-      auto const fnmy = mesh_i * nmy;
-      for (int mz = -P3M_BRILLOUIN; mz <= P3M_BRILLOUIN; mz++) {
-        auto const nmz = nz + mz * mesh;
-        auto const fnmz = mesh_i * nmz;
-
-        auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz);
-        auto const ex = std::exp(-factor1 * nm2);
-
-        auto const U2 = pow(sinc(fnmx) * sinc(fnmy) * sinc(fnmz), 2. * cao);
-
-        alias1 += Utils::sqr(ex) * nm2;
-        alias2 += U2 * ex * pow((nx * nmx + ny * nmy + nz * nmz), 3.) / nm2;
-      }
-    }
-  }
+
+  Utils::Vector3i indices{};
+  Utils::Vector3i nm{};
+  Utils::Vector3d fnm{};
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        auto const norm_sq = nm.norm2();
+        auto const ex = std::exp(-factor1 * norm_sq);
+        auto const U2 = std::pow(Utils::product(fnm), 2 * cao);
+        alias1 += Utils::sqr(ex) * norm_sq;
+        alias2 += U2 * ex * std::pow(shift * nm, 3) / norm_sq;
+      },
+      [&](unsigned dim, int n) {
+        nm[dim] = shift[dim] + n * mesh;
+        fnm[dim] = math::sinc(nm[dim] * mesh_i);
+      });
+
   return std::make_pair(alias1, alias2);
 }
 
 /** Calculate the k-space error of dipolar-P3M */
 static double dp3m_k_space_error(double box_size, int mesh, int cao,
                                  int n_c_part, double sum_q2, double alpha_L) {
-  double he_q = 0.;
-  auto const mesh_i = 1. / mesh;
-  auto const alpha_L_i = 1. / alpha_L;
 
-  for (int nx = -mesh / 2; nx < mesh / 2; nx++)
-    for (int ny = -mesh / 2; ny < mesh / 2; ny++)
-      for (int nz = -mesh / 2; nz < mesh / 2; nz++)
-        if ((nx != 0) || (ny != 0) || (nz != 0)) {
-          auto const n2 = Utils::sqr(nx) + Utils::sqr(ny) + Utils::sqr(nz);
-          auto const cs = p3m_analytic_cotangent_sum(nx, mesh_i, cao) *
-                          p3m_analytic_cotangent_sum(ny, mesh_i, cao) *
-                          p3m_analytic_cotangent_sum(nz, mesh_i, cao);
+  auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao);
+  auto const mesh_i = 1. / static_cast<double>(mesh);
+  auto const alpha_L_i = 1. / alpha_L;
+  auto const m_stop = Utils::Vector3i::broadcast(mesh / 2);
+  auto const m_start = -m_stop;
+  auto indices = Utils::Vector3i{};
+  auto values = Utils::Vector3d{};
+  auto he_q = 0.;
+
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
+          auto const n2 = indices.norm2();
+          auto const cs = Utils::product(values);
           auto const [alias1, alias2] =
-              dp3m_tune_aliasing_sums(nx, ny, nz, mesh, mesh_i, cao, alpha_L_i);
+              dp3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i);
           auto const d =
               alias1 - Utils::sqr(alias2 / cs) /
                            Utils::int_pow<3>(static_cast<double>(n2));
           /* at high precision, d can become negative due to extinction;
              also, don't take values that have no significant digits left*/
-          if (d > 0 && (fabs(d / alias1) > ROUND_ERROR_PREC))
+          if (d > 0. and std::fabs(d / alias1) > ROUND_ERROR_PREC)
             he_q += d;
         }
+      },
+      [&values, &mesh_i, cotangent_sum](unsigned dim, int n) {
+        values[dim] = cotangent_sum(n, mesh_i);
+      });
 
   return 8. * Utils::sqr(std::numbers::pi) / 3. * sum_q2 *
          sqrt(he_q / n_c_part) / Utils::int_pow<4>(box_size);
diff --git a/src/core/p3m/common.cpp b/src/core/p3m/common.cpp
index bc87e99a6a8..fb8023eba74 100644
--- a/src/core/p3m/common.cpp
+++ b/src/core/p3m/common.cpp
@@ -28,52 +28,8 @@
 #include "LocalBox.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/math/sqr.hpp>
 
 #include <cmath>
-#include <numbers>
-#include <stdexcept>
-
-double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao) {
-  auto const c =
-      Utils::sqr(std::cos(std::numbers::pi * mesh_i * static_cast<double>(n)));
-
-  switch (cao) {
-  case 1: {
-    return 1.0;
-  }
-  case 2: {
-    return (1.0 + c * 2.0) / 3.0;
-  }
-  case 3: {
-    return (2.0 + c * (11.0 + c * 2.0)) / 15.0;
-  }
-  case 4: {
-    return (17.0 + c * (180.0 + c * (114.0 + c * 4.0))) / 315.0;
-  }
-  case 5: {
-    return (62.0 + c * (1072.0 + c * (1452.0 + c * (247.0 + c * 2.0)))) /
-           2835.0;
-  }
-  case 6: {
-    return (1382.0 +
-            c * (35396.0 +
-                 c * (83021.0 + c * (34096.0 + c * (2026.0 + c * 4.0))))) /
-           155925.0;
-  }
-  case 7: {
-    return (21844.0 +
-            c * (776661.0 +
-                 c * (2801040.0 +
-                      c * (2123860.0 +
-                           c * (349500.0 + c * (8166.0 + c * 4.0)))))) /
-           6081075.0;
-  }
-  default: {
-    throw std::logic_error("Invalid value cao=" + std::to_string(cao));
-  }
-  }
-}
 
 void P3MLocalMesh::calc_local_ca_mesh(P3MParameters const &params,
                                       LocalBox const &local_geo, double skin,
diff --git a/src/core/p3m/common.hpp b/src/core/p3m/common.hpp
index e5a42993755..6bc30828ee5 100644
--- a/src/core/p3m/common.hpp
+++ b/src/core/p3m/common.hpp
@@ -38,6 +38,7 @@
 
 #include <utils/Vector.hpp>
 
+#include <algorithm>
 #include <array>
 #include <vector>
 
@@ -49,6 +50,7 @@ auto constexpr P3M_EPSILON_METALLIC = 0.0;
 #include "LocalBox.hpp"
 
 #include <array>
+#include <cstddef>
 #include <stdexcept>
 #include <vector>
 
@@ -226,16 +228,6 @@ struct P3MLocalMesh {
                           double space_layer);
 };
 
-/** One of the aliasing sums used to compute k-space errors.
- *  Fortunately the one which is most important (because it converges
- *  most slowly, since it is not damped exponentially) can be
- *  calculated analytically. The result (which depends on the order of
- *  the spline interpolation) can be written as an even trigonometric
- *  polynomial. The results are tabulated here (the employed formula
- *  is eq. (7.66) in @cite hockney88a).
- */
-double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao);
-
 #endif /* P3M || DP3M */
 
 namespace detail {
diff --git a/src/core/p3m/for_each_3d.hpp b/src/core/p3m/for_each_3d.hpp
new file mode 100644
index 00000000000..9f44871bd54
--- /dev/null
+++ b/src/core/p3m/for_each_3d.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2024 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <concepts>
+
+namespace detail {
+
+constexpr inline void noop_projector(unsigned, int) {}
+
+template <typename T>
+concept IndexVectorConcept = requires(T vector) {
+  { vector[0] } -> std::convertible_to<std::size_t>;
+};
+
+} // namespace detail
+
+/**
+ * @brief Repeat an operation on every element of a 3D grid.
+ *
+ * Intermediate values that depend on the iterated coordinates
+ * are calculated and stored once per iteration. This is useful
+ * when the operation is costly.
+ *
+ * @param start       Initial values for the loop counters.
+ * @param stop        Final values (one-past-the-end) for the loop counters.
+ * @param counters    Loop counters.
+ * @param kernel      Functor to execute.
+ * @param projector   Projection of the current loop counter.
+ * @tparam Kernel     Nullary function.
+ * @tparam Projector  Binary function that takes a nesting depth and a loop
+ *                    counter as arguments and projects a value.
+ */
+template <class Kernel, class Projector = decltype(detail::noop_projector)>
+  requires std::invocable<Kernel> and std::invocable<Projector, unsigned, int>
+void for_each_3d(detail::IndexVectorConcept auto &&start,
+                 detail::IndexVectorConcept auto &&stop,
+                 detail::IndexVectorConcept auto &&counters, Kernel &&kernel,
+                 Projector &&projector = detail::noop_projector) {
+  auto &nx = counters[0u];
+  auto &ny = counters[1u];
+  auto &nz = counters[2u];
+  for (nx = start[0u]; nx < stop[0u]; ++nx) {
+    projector(0u, nx);
+    for (ny = start[1u]; ny < stop[1u]; ++ny) {
+      projector(1u, ny);
+      for (nz = start[2u]; nz < stop[2u]; ++nz) {
+        projector(2u, nz);
+        kernel();
+      }
+    }
+  }
+}
diff --git a/src/core/p3m/influence_function.hpp b/src/core/p3m/influence_function.hpp
index 9c322c23506..24853d43e25 100644
--- a/src/core/p3m/influence_function.hpp
+++ b/src/core/p3m/influence_function.hpp
@@ -16,15 +16,16 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_P3M_INFLUENCE_FUNCTION_HPP
-#define ESPRESSO_P3M_INFLUENCE_FUNCTION_HPP
+
+#pragma once
 
 #include "p3m/common.hpp"
+#include "p3m/for_each_3d.hpp"
+#include "p3m/math.hpp"
 
 #include <utils/Vector.hpp>
 #include <utils/index.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <cmath>
@@ -52,44 +53,43 @@
 template <std::size_t S, std::size_t m>
 double G_opt(int cao, double alpha, Utils::Vector3d const &k,
              Utils::Vector3d const &h) {
-  using namespace detail::FFT_indexing;
-  using Utils::int_pow;
-  using Utils::sinc;
-
-  auto constexpr two_pi = 2. * std::numbers::pi;
-  auto constexpr two_pi_i = 1. / two_pi;
-  auto constexpr limit = 30.;
 
   auto const k2 = k.norm2();
-  if (k2 == 0.0) {
-    return 0.0;
+  if (k2 == 0.) {
+    return 0.;
   }
 
-  double numerator = 0.0;
-  double denominator = 0.0;
-
-  for (int mx = -m; mx <= m; mx++) {
-    for (int my = -m; my <= m; my++) {
-      for (int mz = -m; mz <= m; mz++) {
-        auto const km =
-            k + two_pi * Utils::Vector3d{mx / h[RX], my / h[RY], mz / h[RZ]};
-        auto const U2 = std::pow(sinc(km[RX] * h[RX] * two_pi_i) *
-                                     sinc(km[RY] * h[RY] * two_pi_i) *
-                                     sinc(km[RZ] * h[RZ] * two_pi_i),
-                                 2 * cao);
-
+  auto constexpr limit = 30.;
+  auto constexpr m_start = Utils::Vector3i::broadcast(-m);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(m + 1);
+  auto const exponent_prefactor = Utils::sqr(1. / (2. * alpha));
+  auto const wavevector = (2. * std::numbers::pi) / h;
+  auto const wavevector_i = 1. / wavevector;
+  auto indices = Utils::Vector3i{};
+  auto km = Utils::Vector3d{};
+  auto fnm = Utils::Vector3d{};
+  auto numerator = 0.;
+  auto denominator = 0.;
+
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        using namespace detail::FFT_indexing;
+        auto const U2 = std::pow(Utils::product(fnm), 2 * cao);
         auto const km2 = km.norm2();
-        auto const exponent = Utils::sqr(1. / (2. * alpha)) * km2;
+        auto const exponent = exponent_prefactor * km2;
         if (exponent < limit) {
           auto const f3 = std::exp(-exponent) * (4. * std::numbers::pi / km2);
-          numerator += U2 * f3 * int_pow<S>(k * km);
+          numerator += U2 * f3 * Utils::int_pow<S>(k * km);
         }
         denominator += U2;
-      }
-    }
-  }
+      },
+      [&](unsigned dim, int n) {
+        km[dim] = k[dim] + n * wavevector[dim];
+        fnm[dim] = math::sinc(km[dim] * wavevector_i[dim]);
+      });
 
-  return numerator / (int_pow<S>(k2) * Utils::sqr(denominator));
+  return numerator / (Utils::int_pow<S>(k2) * Utils::sqr(denominator));
 }
 
 /**
@@ -104,20 +104,18 @@ double G_opt(int cao, double alpha, Utils::Vector3d const &k,
  *
  * @param params P3M parameters
  * @param n_start Lower left corner of the grid
- * @param n_end Upper right corner of the grid.
- * @param box_l Box size
+ * @param n_stop Upper right corner of the grid.
+ * @param inv_box_l Inverse box length
  * @return Values of G_opt at regular grid points.
  */
 template <std::size_t S, std::size_t m = 0>
 std::vector<double> grid_influence_function(const P3MParameters &params,
                                             const Utils::Vector3i &n_start,
-                                            const Utils::Vector3i &n_end,
-                                            const Utils::Vector3d &box_l) {
-  using namespace detail::FFT_indexing;
+                                            const Utils::Vector3i &n_stop,
+                                            const Utils::Vector3d &inv_box_l) {
 
   auto const shifts = detail::calc_meshift(params.mesh);
-
-  auto const size = n_end - n_start;
+  auto const size = n_stop - n_start;
 
   /* The influence function grid */
   auto g = std::vector<double>(Utils::product(size), 0.);
@@ -128,31 +126,24 @@ std::vector<double> grid_influence_function(const P3MParameters &params,
     return g;
   }
 
-  auto const h = Utils::Vector3d{params.a};
-
-  Utils::Vector3i n{};
-  for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) {
-    for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) {
-      for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) {
-        auto const ind = Utils::get_linear_index(n - n_start, size,
-                                                 Utils::MemoryOrder::ROW_MAJOR);
-        if ((n[KX] % (params.mesh[RX] / 2) == 0) &&
-            (n[KY] % (params.mesh[RY] / 2) == 0) &&
-            (n[KZ] % (params.mesh[RZ] / 2) == 0)) {
-          g[ind] = 0.0;
-        } else {
-          auto const k = 2. * std::numbers::pi *
-                         Utils::Vector3d{shifts[RX][n[KX]] / box_l[RX],
-                                         shifts[RY][n[KY]] / box_l[RY],
-                                         shifts[RZ][n[KZ]] / box_l[RZ]};
-
-          g[ind] = G_opt<S, m>(params.cao, params.alpha, k, h);
-        }
-      }
+  auto const wavevector = (2. * std::numbers::pi) * inv_box_l;
+  auto const half_mesh = params.mesh / 2;
+  auto indices = Utils::Vector3i{};
+  auto index = std::size_t(0u);
+
+  for_each_3d(n_start, n_stop, indices, [&]() {
+    using namespace detail::FFT_indexing;
+    if ((indices[KX] % half_mesh[RX] != 0) or
+        (indices[KY] % half_mesh[RY] != 0) or
+        (indices[KZ] % half_mesh[RZ] != 0)) {
+      auto const k =
+          Utils::Vector3d{{shifts[RX][indices[KX]] * wavevector[RX],
+                           shifts[RY][indices[KY]] * wavevector[RY],
+                           shifts[RZ][indices[KZ]] * wavevector[RZ]}};
+      g[index] = G_opt<S, m>(params.cao, params.alpha, k, params.a);
     }
-  }
+    ++index;
+  });
 
   return g;
 }
-
-#endif
diff --git a/src/core/p3m/influence_function_dipolar.hpp b/src/core/p3m/influence_function_dipolar.hpp
index e327846d20e..d52a1a6d675 100644
--- a/src/core/p3m/influence_function_dipolar.hpp
+++ b/src/core/p3m/influence_function_dipolar.hpp
@@ -18,19 +18,19 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_CORE_P3M_INFLUENCE_FUNCTION_DIPOLAR_HPP
-#define ESPRESSO_CORE_P3M_INFLUENCE_FUNCTION_DIPOLAR_HPP
+
+#pragma once
 
 #include "config/config.hpp"
 
 #if defined(DP3M)
 
 #include "p3m/common.hpp"
+#include "p3m/for_each_3d.hpp"
+#include "p3m/math.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/index.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <cmath>
@@ -54,40 +54,41 @@
 template <std::size_t S>
 double G_opt_dipolar(P3MParameters const &params, Utils::Vector3i const &shift,
                      Utils::Vector3i const &d_op) {
-  using Utils::int_pow;
-  using Utils::sinc;
+
   auto constexpr limit = P3M_BRILLOUIN;
   auto constexpr exp_limit = 30.;
-  auto const exponent = 2. * params.cao;
-
+  auto constexpr m_start = Utils::Vector3i::broadcast(-limit);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(limit + 1);
+  auto const cao = params.cao;
+  auto const mesh = params.mesh[0];
+  auto const offset =
+      static_cast<Utils::Vector3d>(shift) / static_cast<double>(mesh);
+  auto const f2 = Utils::sqr(std::numbers::pi / params.alpha_L);
+  auto indices = Utils::Vector3i{};
+  auto nm = Utils::Vector3i{};
+  auto fnm = Utils::Vector3d{};
   auto numerator = 0.;
   auto denominator = 0.;
 
-  auto const f1 = 1. / static_cast<double>(params.mesh[0]);
-  auto const f2 = Utils::sqr(std::numbers::pi / params.alpha_L);
-
-  for (int mx = -limit; mx <= limit; mx++) {
-    auto const nmx = shift[0] + params.mesh[0] * mx;
-    auto const sx = std::pow(sinc(f1 * nmx), exponent);
-    for (int my = -limit; my <= limit; my++) {
-      auto const nmy = shift[1] + params.mesh[0] * my;
-      auto const sy = sx * std::pow(sinc(f1 * nmy), exponent);
-      for (int mz = -limit; mz <= limit; mz++) {
-        auto const nmz = shift[2] + params.mesh[0] * mz;
-        auto const sz = sy * std::pow(sinc(f1 * nmz), exponent);
-        auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz);
-        auto const exp_term = f2 * nm2;
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        auto const norm_sq = nm.norm2();
+        auto const sz = std::pow(Utils::product(fnm), 2 * cao);
+        auto const exp_term = f2 * norm_sq;
         if (exp_term < exp_limit) {
-          auto const f3 = sz * std::exp(-exp_term) / nm2;
-          auto const n_nm = d_op[0] * nmx + d_op[1] * nmy + d_op[2] * nmz;
-          numerator += f3 * int_pow<S>(n_nm);
+          auto const f3 = sz * std::exp(-exp_term) / norm_sq;
+          numerator += f3 * Utils::int_pow<S>(d_op * nm);
         }
         denominator += sz;
-      }
-    }
-  }
-  return numerator / (int_pow<S>(static_cast<double>(d_op.norm2())) *
-                      Utils::sqr(denominator));
+      },
+      [&](unsigned dim, int n) {
+        nm[dim] = shift[dim] + n * mesh;
+        fnm[dim] = math::sinc(offset[dim] + n * mesh);
+      });
+
+  return numerator / (Utils::int_pow<S>(static_cast<double>(d_op.norm2())) *
+                      Utils::int_pow<2>(denominator));
 }
 
 /**
@@ -100,17 +101,17 @@ double G_opt_dipolar(P3MParameters const &params, Utils::Vector3i const &shift,
  *
  * @param params DP3M parameters
  * @param n_start Lower left corner of the grid
- * @param n_end Upper right corner of the grid.
- * @param box_l Box size
+ * @param n_stop Upper right corner of the grid.
+ * @param inv_box_l Inverse box length
  * @return Values of the influence function at regular grid points.
  */
 template <std::size_t S>
 std::vector<double> grid_influence_function(P3MParameters const &params,
                                             Utils::Vector3i const &n_start,
-                                            Utils::Vector3i const &n_end,
-                                            Utils::Vector3d const &box_l) {
+                                            Utils::Vector3i const &n_stop,
+                                            Utils::Vector3d const &inv_box_l) {
 
-  auto const size = n_end - n_start;
+  auto const size = n_stop - n_start;
 
   /* The influence function grid */
   auto g = std::vector<double>(Utils::product(size), 0.);
@@ -121,61 +122,56 @@ std::vector<double> grid_influence_function(P3MParameters const &params,
     return g;
   }
 
-  double fak1 = Utils::int_pow<3>(static_cast<double>(params.mesh[0])) * 2.0 /
-                Utils::sqr(box_l[0]);
-
-  auto const shifts = detail::calc_meshift(params.mesh, false);
-  auto const d_ops = detail::calc_meshift(params.mesh, true);
-
-  Utils::Vector3i n{};
-  for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) {
-    for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) {
-      for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) {
-        auto const ind = Utils::get_linear_index(n - n_start, size,
-                                                 Utils::MemoryOrder::ROW_MAJOR);
-
-        if (((n[0] % (params.mesh[0] / 2) == 0) &&
-             (n[1] % (params.mesh[0] / 2) == 0) &&
-             (n[2] % (params.mesh[0] / 2) == 0))) {
-          g[ind] = 0.0;
-        } else {
-          auto const shift = Utils::Vector3i{shifts[0][n[0]], shifts[0][n[1]],
-                                             shifts[0][n[2]]};
-          auto const d_op =
-              Utils::Vector3i{d_ops[0][n[0]], d_ops[0][n[1]], d_ops[0][n[2]]};
-          auto const fak2 = G_opt_dipolar<S>(params, shift, d_op);
-          g[ind] = fak1 * fak2;
+  auto prefactor = Utils::int_pow<3>(static_cast<double>(params.mesh[0])) * 2. *
+                   Utils::int_pow<2>(inv_box_l[0]);
+
+  auto const offset = detail::calc_meshift(params.mesh, false)[0];
+  auto const d_op = detail::calc_meshift(params.mesh, true)[0];
+  auto const half_mesh = params.mesh[0] / 2;
+  auto indices = Utils::Vector3i{};
+  auto shift_off = Utils::Vector3i{};
+  auto d_op_off = Utils::Vector3i{};
+  auto index = std::size_t(0u);
+
+  for_each_3d(
+      n_start, n_stop, indices,
+      [&]() {
+        if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or
+             (indices[2] % half_mesh != 0))) {
+          g[index] = prefactor * G_opt_dipolar<S>(params, shift_off, d_op_off);
         }
-      }
-    }
-  }
+        ++index;
+      },
+      [&](unsigned dim, int n) {
+        d_op_off[dim] = d_op[n];
+        shift_off[dim] = offset[n];
+      });
+
   return g;
 }
 
 inline double G_opt_dipolar_self_energy(P3MParameters const &params,
                                         Utils::Vector3i const &shift) {
-  using Utils::sinc;
+
   auto constexpr limit = P3M_BRILLOUIN + 1;
-  auto const exponent = 2. * params.cao;
-
-  auto u_sum = 0.0;
-
-  auto const f1 = 1.0 / static_cast<double>(params.mesh[0]);
-
-  for (int mx = -limit; mx <= limit; mx++) {
-    auto const nmx = shift[0] + params.mesh[0] * mx;
-    auto const sx = std::pow(sinc(f1 * nmx), exponent);
-    for (int my = -limit; my <= limit; my++) {
-      auto const nmy = shift[1] + params.mesh[0] * my;
-      auto const sy = sx * std::pow(sinc(f1 * nmy), exponent);
-      for (int mz = -limit; mz <= limit; mz++) {
-        auto const nmz = shift[2] + params.mesh[0] * mz;
-        auto const sz = sy * std::pow(sinc(f1 * nmz), exponent);
-        u_sum += sz;
-      }
-    }
-  }
-  return u_sum;
+  auto constexpr m_start = Utils::Vector3i::broadcast(-limit);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(limit + 1);
+  auto const cao = params.cao;
+  auto const mesh = params.mesh[0];
+  auto const offset =
+      static_cast<Utils::Vector3d>(shift) / static_cast<double>(mesh);
+  auto indices = Utils::Vector3i{};
+  auto fnm = Utils::Vector3d{};
+  auto energy = 0.;
+
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() { energy += std::pow(Utils::product(fnm), 2 * cao); },
+      [&](unsigned dim, int n) {
+        fnm[dim] = math::sinc(offset[dim] + n * mesh);
+      });
+
+  return energy;
 }
 
 /**
@@ -183,42 +179,39 @@ inline double G_opt_dipolar_self_energy(P3MParameters const &params,
  *
  * @param params DP3M parameters
  * @param n_start Lower left corner of the grid
- * @param n_end Upper right corner of the grid.
+ * @param n_stop Upper right corner of the grid.
  * @param g Energies on the grid.
  * @return Total self-energy.
  */
 inline double grid_influence_function_self_energy(
     P3MParameters const &params, Utils::Vector3i const &n_start,
-    Utils::Vector3i const &n_end, std::vector<double> const &g) {
-  auto const size = n_end - n_start;
-
-  auto const shifts = detail::calc_meshift(params.mesh, false);
-  auto const d_ops = detail::calc_meshift(params.mesh, true);
-
-  double energy = 0.0;
-  Utils::Vector3i n{};
-  for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) {
-    for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) {
-      for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) {
-        if (((n[0] % (params.mesh[0] / 2) == 0) &&
-             (n[1] % (params.mesh[0] / 2) == 0) &&
-             (n[2] % (params.mesh[0] / 2) == 0))) {
-          energy += 0.0;
-        } else {
-          auto const ind = Utils::get_linear_index(
-              n - n_start, size, Utils::MemoryOrder::ROW_MAJOR);
-          auto const shift = Utils::Vector3i{shifts[0][n[0]], shifts[0][n[1]],
-                                             shifts[0][n[2]]};
-          auto const d_op =
-              Utils::Vector3i{d_ops[0][n[0]], d_ops[0][n[1]], d_ops[0][n[2]]};
-          auto const U2 = G_opt_dipolar_self_energy(params, shift);
-          energy += g[ind] * U2 * d_op.norm2();
+    Utils::Vector3i const &n_stop, std::vector<double> const &g) {
+
+  auto const offset = detail::calc_meshift(params.mesh, false)[0];
+  auto const d_op = detail::calc_meshift(params.mesh, true)[0];
+  auto const half_mesh = params.mesh[0] / 2;
+  auto indices = Utils::Vector3i{};
+  auto shift_off = Utils::Vector3i{};
+  auto d_op_off = Utils::Vector3i{};
+  auto index = std::size_t(0u);
+  auto energy = 0.;
+
+  for_each_3d(
+      n_start, n_stop, indices,
+      [&]() {
+        if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or
+             (indices[2] % half_mesh != 0))) {
+          auto const U2 = G_opt_dipolar_self_energy(params, shift_off);
+          energy += g[index] * U2 * d_op_off.norm2();
         }
-      }
-    }
-  }
+        ++index;
+      },
+      [&](unsigned dim, int n) {
+        d_op_off[dim] = d_op[n];
+        shift_off[dim] = offset[n];
+      });
+
   return energy;
 }
 
-#endif
-#endif
+#endif // defined(DP3M)
diff --git a/src/core/p3m/math.hpp b/src/core/p3m/math.hpp
new file mode 100644
index 00000000000..2da9b38dd19
--- /dev/null
+++ b/src/core/p3m/math.hpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <utils/device_qualifier.hpp>
+#include <utils/math/sqr.hpp>
+
+#ifndef __CUDACC__
+#include <cmath>
+#endif
+#include <numbers>
+#include <stdexcept>
+#include <string>
+
+namespace math {
+
+/** @brief Return the absolute value of x. */
+inline DEVICE_QUALIFIER auto abs(double x) { return fabs(x); }
+
+/** @brief Return the absolute value of x. */
+inline DEVICE_QUALIFIER auto abs(float x) { return fabsf(x); }
+
+/**
+ * @brief Calculate the function @f$ \mathrm{sinc}(x) = \sin(\pi x)/(\pi x) @f$.
+ *
+ * (same convention as in @cite hockney88a). In order to avoid divisions
+ * by 0, arguments whose modulus is smaller than @f$ \epsilon = 0.1 @f$
+ * are evaluated by an 8th order Taylor expansion.
+ * Note that the difference between sinc(x) and this expansion
+ * is smaller than 0.235e-12, if x is smaller than @f$ \epsilon @f$.
+ * (The next term in the expansion is the 10th order contribution, i.e.
+ * @f$ \pi^{10} x^{10}/39916800 \approx 0.2346 \cdot x^{12} @f$).
+ */
+template <typename T> DEVICE_QUALIFIER auto sinc(T x) {
+  auto constexpr epsilon = T(0.1);
+#if not defined(__CUDACC__)
+  using std::sin;
+#endif
+
+  auto const pix = std::numbers::pi_v<T> * x;
+
+  if (::math::abs(x) > epsilon)
+    return sin(pix) / pix;
+
+  auto constexpr factorial = [](int n) consteval {
+    int acc{1}, c{1};
+    while (c < n) {
+      acc *= ++c;
+    }
+    return acc;
+  };
+
+  /* Coefficients of the Taylor expansion of sinc */
+  auto constexpr c0 = T(+1) / T(factorial(1));
+  auto constexpr c2 = T(-1) / T(factorial(3));
+  auto constexpr c4 = T(+1) / T(factorial(5));
+  auto constexpr c6 = T(-1) / T(factorial(7));
+  auto constexpr c8 = T(+1) / T(factorial(9));
+
+  auto const pix2 = pix * pix;
+  return c0 + pix2 * (c2 + pix2 * (c4 + pix2 * (c6 + pix2 * c8)));
+}
+
+/**
+ * @brief One of the aliasing sums used to compute k-space errors.
+ * Fortunately the one which is most important (because it converges
+ * most slowly, since it is not damped exponentially) can be calculated
+ * analytically. The result (which depends on the order of the spline
+ * interpolation) can be written as an even trigonometric polynomial.
+ * The results are tabulated here (the employed formula is eq. 7-66
+ * p. 233 in @cite hockney88a).
+ */
+template <int cao>
+DEVICE_QUALIFIER auto analytic_cotangent_sum(int n, double mesh_i) {
+  static_assert(cao >= 1 and cao <= 7);
+#if not defined(__CUDACC__)
+  using std::cos;
+#endif
+  auto const theta = static_cast<double>(n) * mesh_i * std::numbers::pi;
+  auto const c = Utils::sqr(cos(theta));
+
+  if constexpr (cao == 1) {
+    return 1.;
+  }
+  if constexpr (cao == 2) {
+    return (1. + c * 2.) / 3.;
+  }
+  if constexpr (cao == 3) {
+    return (2. + c * (11. + c * 2.)) / 15.;
+  }
+  if constexpr (cao == 4) {
+    return (17. + c * (180. + c * (114. + c * 4.))) / 315.;
+  }
+  if constexpr (cao == 5) {
+    return (62. + c * (1072. + c * (1452. + c * (247. + c * 2.)))) / 2835.;
+  }
+  if constexpr (cao == 6) {
+    return (1382. +
+            c * (35396. + c * (83021. + c * (34096. + c * (2026. + c * 4.))))) /
+           155925.;
+  }
+  return (21844. +
+          c * (776661. +
+               c * (2801040. +
+                    c * (2123860. + c * (349500. + c * (8166. + c * 4.)))))) /
+         6081075.;
+}
+
+inline auto get_analytic_cotangent_sum_kernel(int cao) {
+  decltype(&analytic_cotangent_sum<1>) ptr = nullptr;
+  if (cao == 1) {
+    ptr = &analytic_cotangent_sum<1>;
+  } else if (cao == 2) {
+    ptr = &analytic_cotangent_sum<2>;
+  } else if (cao == 3) {
+    ptr = &analytic_cotangent_sum<3>;
+  } else if (cao == 4) {
+    ptr = &analytic_cotangent_sum<4>;
+  } else if (cao == 5) {
+    ptr = &analytic_cotangent_sum<5>;
+  } else if (cao == 6) {
+    ptr = &analytic_cotangent_sum<6>;
+  } else if (cao == 7) {
+    ptr = &analytic_cotangent_sum<7>;
+  }
+  if (ptr == nullptr) {
+    throw std::logic_error("Invalid value cao=" + std::to_string(cao));
+  }
+  return ptr;
+}
+
+} // namespace math
diff --git a/src/core/unit_tests/CMakeLists.txt b/src/core/unit_tests/CMakeLists.txt
index d843da0a636..60fcadae764 100644
--- a/src/core/unit_tests/CMakeLists.txt
+++ b/src/core/unit_tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2010-2022 The ESPResSo project
+# Copyright (C) 2010-2024 The ESPResSo project
 # Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
 #   Max-Planck-Institute for Polymer Research, Theory Group
 #
@@ -94,6 +94,8 @@ if(ESPRESSO_BUILD_WITH_FFTW)
             espresso::core)
   unit_test(NAME fft_test SRC fft_test.cpp DEPENDS espresso::utils
             espresso::core)
+  unit_test(NAME math_test SRC math_test.cpp DEPENDS espresso::utils
+            espresso::core)
 endif()
 
 if(ESPRESSO_BUILD_WITH_CUDA)
diff --git a/src/core/unit_tests/fft_test.cpp b/src/core/unit_tests/fft_test.cpp
index 74ce5445867..6e301a48da2 100644
--- a/src/core/unit_tests/fft_test.cpp
+++ b/src/core/unit_tests/fft_test.cpp
@@ -27,6 +27,7 @@
 #include <boost/test/unit_test.hpp>
 
 #include "p3m/fft.hpp"
+#include "p3m/for_each_3d.hpp"
 
 #include <utils/Vector.hpp>
 
@@ -165,6 +166,43 @@ BOOST_AUTO_TEST_CASE(fft_exceptions) {
   BOOST_CHECK_THROW(allocator.allocate(bad_size), std::bad_array_new_length);
 }
 
+BOOST_AUTO_TEST_CASE(for_each_3d_test) {
+  auto const m_start = Utils::Vector3i{{0, -1, 3}};
+  auto const m_stop = Utils::Vector3i{{2, 2, 5}};
+  auto ref_loop_counters = m_start;
+  auto indices = Utils::Vector3i{};
+
+  auto const kernel = [&]() {
+    BOOST_REQUIRE_EQUAL(indices, ref_loop_counters);
+    if (++ref_loop_counters[2u] == m_stop[2u]) {
+      ref_loop_counters[2u] = m_start[2u];
+      if (++ref_loop_counters[1u] == m_stop[1u]) {
+        ref_loop_counters[1u] = m_start[1u];
+        if (++ref_loop_counters[0u] == m_stop[0u]) {
+          ref_loop_counters[0u] = m_start[0u];
+        }
+      }
+    }
+  };
+
+  {
+    for_each_3d(m_start, m_stop, indices, kernel, [&](unsigned dim, int n) {
+      BOOST_REQUIRE_GE(dim, 0);
+      BOOST_REQUIRE_LE(dim, 2);
+      BOOST_REQUIRE_EQUAL(n, ref_loop_counters[dim]);
+    });
+
+    BOOST_REQUIRE_EQUAL(indices, m_stop);
+    BOOST_REQUIRE_EQUAL(ref_loop_counters, m_start);
+  }
+  {
+    for_each_3d(m_start, m_stop, indices, kernel);
+
+    BOOST_REQUIRE_EQUAL(indices, m_stop);
+    BOOST_REQUIRE_EQUAL(ref_loop_counters, m_start);
+  }
+}
+
 #else  // defined(P3M) || defined(DP3M)
 int main(int argc, char **argv) {}
 #endif // defined(P3M) || defined(DP3M)
diff --git a/src/core/unit_tests/math_test.cpp b/src/core/unit_tests/math_test.cpp
new file mode 100644
index 00000000000..46a638800f1
--- /dev/null
+++ b/src/core/unit_tests/math_test.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (C) 2017-2024 The ESPResSo project
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define BOOST_TEST_MODULE "Special mathematical functions"
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+
+#include "p3m/math.hpp"
+
+#include <utils/math/sqr.hpp>
+
+#include <boost/math/special_functions/bernoulli.hpp>
+#include <boost/math/special_functions/factorials.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <numbers>
+#include <numeric>
+#include <ranges>
+#include <stdexcept>
+
+/** @brief Compute the n-th term in the Taylor series of @c tan(x). */
+static auto taylor_series_tangent(int n) {
+  auto const two_power_2n = std::pow(2., 2 * n);
+  auto const b2n = boost::math::bernoulli_b2n<double>(n);
+  auto const f2n = boost::math::factorial<double>(2 * n);
+  return std::pow(-1., n - 1) * two_power_2n * (two_power_2n - 1.) * b2n / f2n;
+}
+
+/** @brief Compute the n-th term in the Taylor series of @c sinc(x). */
+static auto taylor_series_sinc(int n) {
+  return std::pow(-1., n) / std::tgamma(2 * n + 2);
+}
+
+BOOST_AUTO_TEST_CASE(abs_test) {
+  static_assert(std::is_same_v<float, decltype(math::abs(1.f))>);
+  static_assert(std::is_same_v<double, decltype(math::abs(1.))>);
+
+  BOOST_CHECK_EQUAL(math::abs(+3.1415), std::abs(+3.1415));
+  BOOST_CHECK_EQUAL(math::abs(-3.1415), std::abs(-3.1415));
+  BOOST_CHECK_EQUAL(math::abs(+3.1415f), std::abs(+3.1415f));
+  BOOST_CHECK_EQUAL(math::abs(-3.1415f), std::abs(-3.1415f));
+}
+
+BOOST_AUTO_TEST_CASE(sinc_test) {
+  // edge case
+  BOOST_CHECK_EQUAL(math::sinc(0.0), 1.0);
+
+  // check against standard math functions
+  auto x = 0.001;
+  while (x <= 0.11) {
+    auto const approx = math::sinc(x);
+    auto const pi_x = std::numbers::pi * x;
+    auto const exact = std::sin(pi_x) / (pi_x);
+    BOOST_CHECK_SMALL(approx - exact, 1e-13);
+    x += 0.01;
+  }
+
+  // check Taylor expansion
+  auto const series = std::views::iota(0, 5) | std::views::reverse |
+                      std::views::transform(taylor_series_sinc);
+  for (auto const x : {1e-6, 1e-5, 1e-4, 1e-3, 1e-2}) {
+    auto const pix2 = Utils::sqr(std::numbers::pi * x);
+    auto const ref = std::accumulate(
+        series.begin(), series.end(), 0.,
+        [pix2](auto const &acc, auto const &c) { return acc * pix2 + c; });
+    BOOST_CHECK_SMALL(math::sinc(x) - ref, 1e-13);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(analytic_cotangent_sum_test) {
+  auto constexpr tol = 8. * 100. * std::numeric_limits<double>::epsilon();
+  auto const kernel = [](int n, double mesh_i, int cao) {
+    return math::get_analytic_cotangent_sum_kernel(cao)(n, mesh_i);
+  };
+
+  // edge case: at theta = 0, aliasing sums are unity
+  for (auto const cao : std::ranges::iota_view{1, 8}) {
+    BOOST_CHECK_CLOSE(kernel(0, 0., cao), 1., tol);
+  }
+  // edge case: at theta = pi / 2, aliasing sums are equal to the cao-th term
+  // in the tangent Taylor series
+  for (auto const cao : std::ranges::iota_view{1, 8}) {
+    BOOST_CHECK_CLOSE(kernel(1, 0.5, cao), taylor_series_tangent(cao), tol);
+  }
+
+  // check assertion
+  for (auto const invalid_cao : {-1, 0, 8}) {
+    BOOST_CHECK_THROW(kernel(1, 0., invalid_cao), std::logic_error);
+  }
+}
diff --git a/src/core/unit_tests/p3m_test.cpp b/src/core/unit_tests/p3m_test.cpp
index 5fd3a83915b..3f091d565ce 100644
--- a/src/core/unit_tests/p3m_test.cpp
+++ b/src/core/unit_tests/p3m_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2020-2022 The ESPResSo project
  *
  * This file is part of ESPResSo.
  *
@@ -17,7 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#define BOOST_TEST_MODULE p3m test
+#define BOOST_TEST_MODULE "P3M utility functions"
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
@@ -27,8 +27,6 @@
 
 #include <array>
 #include <cstddef>
-#include <limits>
-#include <stdexcept>
 #include <vector>
 
 BOOST_AUTO_TEST_CASE(calc_meshift_false) {
@@ -60,27 +58,3 @@ BOOST_AUTO_TEST_CASE(calc_meshift_true) {
     }
   }
 }
-
-#if defined(P3M) || defined(DP3M)
-BOOST_AUTO_TEST_CASE(analytic_cotangent_sum) {
-  auto constexpr kernel = p3m_analytic_cotangent_sum;
-  auto constexpr tol = 8. * 100. * std::numeric_limits<double>::epsilon();
-
-  // check only trivial cases
-  for (auto const cao : {1, 2, 3, 4, 5, 6, 7}) {
-    BOOST_CHECK_CLOSE(kernel(0, 0., cao), 1., tol);
-  }
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 1), 1., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 2), 1. / 3., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 3), 2. / 15., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 4), 17. / 315., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 5), 62. / 2835., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 6), 1382. / 155925., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 7), 21844. / 6081075., tol);
-
-  // check assertion
-  for (auto const invalid_cao : {-1, 0, 8}) {
-    BOOST_CHECK_THROW(kernel(1, 0., invalid_cao), std::logic_error);
-  }
-}
-#endif // defined(P3M) || defined(DP3M)
diff --git a/src/utils/include/utils/Vector.hpp b/src/utils/include/utils/Vector.hpp
index 4553741af6a..06167c901dc 100644
--- a/src/utils/include/utils/Vector.hpp
+++ b/src/utils/include/utils/Vector.hpp
@@ -39,6 +39,7 @@
 #include <initializer_list>
 #include <iterator>
 #include <numeric>
+#include <span>
 #include <type_traits>
 #include <vector>
 
@@ -79,7 +80,8 @@ template <typename T, std::size_t N> class Vector : public Array<T, N> {
 
 public:
   template <class Range>
-  explicit Vector(Range const &rng) : Vector(std::begin(rng), std::end(rng)) {}
+  explicit constexpr Vector(Range const &rng)
+      : Vector(std::begin(rng), std::end(rng)) {}
   explicit constexpr Vector(T const (&v)[N]) : Base() {
     copy_init(std::begin(v), std::end(v));
   }
@@ -103,14 +105,13 @@ template <typename T, std::size_t N> class Vector : public Array<T, N> {
     }
   }
 
-  /**
-   * @brief Create a vector that has all entries set to
-   *         one value.
-   */
-  static Vector<T, N> broadcast(T const &s) {
-    Vector<T, N> ret;
-    std::fill(ret.begin(), ret.end(), s);
-
+  /** @brief Create a vector that has all entries set to the same value. */
+  DEVICE_QUALIFIER static constexpr Vector<T, N>
+  broadcast(typename Base::value_type const &value) {
+    Vector<T, N> ret{};
+    for (std::size_t i = 0u; i != N; ++i) {
+      ret[i] = value;
+    }
     return ret;
   }
 
@@ -118,6 +119,12 @@ template <typename T, std::size_t N> class Vector : public Array<T, N> {
 
   operator std::vector<T>() const { return as_vector(); }
 
+  constexpr std::span<T, N> as_span() const {
+    return std::span<T, N>(const_cast<T *>(begin()), size());
+  }
+
+  constexpr operator std::span<T, N>() const { return as_span(); }
+
   template <class U> explicit operator Vector<U, N>() const {
     Vector<U, N> ret;
 
@@ -299,6 +306,15 @@ Vector<T, N> operator/(Vector<T, N> const &a, T const &b) {
   return ret;
 }
 
+template <std::size_t N, typename T>
+Vector<T, N> operator/(T const &a, Vector<T, N> const &b) {
+  Vector<T, N> ret;
+
+  std::transform(std::begin(b), std::end(b), ret.begin(),
+                 [a](T const &val) { return a / val; });
+  return ret;
+}
+
 template <std::size_t N, typename T>
 Vector<T, N> &operator/=(Vector<T, N> &a, T const &b) {
   std::transform(std::begin(a), std::end(a), std::begin(a),
diff --git a/src/utils/include/utils/math/abs.hpp b/src/utils/include/utils/math/abs.hpp
deleted file mode 100644
index 4576f2c9964..00000000000
--- a/src/utils/include/utils/math/abs.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2010-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef ESPRESSO_ABS_HPP
-#define ESPRESSO_ABS_HPP
-
-#include "utils/device_qualifier.hpp"
-
-#ifndef __CUDACC__
-#include <cmath>
-#endif
-
-namespace Utils {
-/**
- * @brief Return the absolute value of x.
- */
-inline DEVICE_QUALIFIER double abs(double x) { return fabs(x); }
-
-/**
- * @brief Return the absolute value of x.
- */
-inline DEVICE_QUALIFIER float abs(float x) { return fabsf(x); }
-} // namespace Utils
-
-#endif // ESPRESSO_DEVICE_MATH_HPP
diff --git a/src/utils/include/utils/math/sinc.hpp b/src/utils/include/utils/math/sinc.hpp
deleted file mode 100644
index 0a2cf4f47c7..00000000000
--- a/src/utils/include/utils/math/sinc.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2010-2022 The ESPResSo project
- * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
- *   Max-Planck-Institute for Polymer Research, Theory Group
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "utils/device_qualifier.hpp"
-#include "utils/math/abs.hpp"
-
-#include <cmath>
-#include <numbers>
-
-namespace Utils {
-/**
- * @brief Calculates the sinc-function as sin(PI*x)/(PI*x).
- *
- * (same convention as in @cite hockney88a). In order to avoid
- * divisions by 0, arguments, whose modulus is smaller than epsi, will
- * be evaluated by an 8th order Taylor expansion of the sinc
- * function. Note that the difference between sinc(x) and this
- * expansion is smaller than 0.235e-12, if x is smaller than 0.1. (The
- * next term in the expansion is the 10th order contribution
- * PI^10/39916800 * x^10 = 0.2346...*x^12). This expansion should
- * also save time, since it reduces the number of function calls to
- * sin().
- */
-template <typename T> DEVICE_QUALIFIER T sinc(T d) {
-  const constexpr T epsi = T(0.1);
-
-  const auto PId = std::numbers::pi_v<T> * d;
-
-  if (::Utils::abs(d) > epsi)
-    return sin(PId) / PId;
-
-  /* Coefficients of the Taylor expansion of sinc */
-  const constexpr T c2 = T(-0.1666666666667e-0);
-  const constexpr T c4 = T(0.8333333333333e-2);
-  const constexpr T c6 = T(-0.1984126984127e-3);
-  const constexpr T c8 = T(0.2755731922399e-5);
-
-  const auto PId2 = PId * PId;
-  return T(1) + PId2 * (c2 + PId2 * (c4 + PId2 * (c6 + PId2 * c8)));
-}
-} // namespace Utils
diff --git a/src/utils/include/utils/math/sqr.hpp b/src/utils/include/utils/math/sqr.hpp
index dae6ee6d8af..a5ac0f25969 100644
--- a/src/utils/include/utils/math/sqr.hpp
+++ b/src/utils/include/utils/math/sqr.hpp
@@ -1,5 +1,7 @@
 /*
  * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
  *
  * This file is part of ESPResSo.
  *
diff --git a/src/utils/tests/CMakeLists.txt b/src/utils/tests/CMakeLists.txt
index 2dd87d9f775..89755749fab 100644
--- a/src/utils/tests/CMakeLists.txt
+++ b/src/utils/tests/CMakeLists.txt
@@ -21,7 +21,6 @@
 
 include(unit_test)
 
-unit_test(NAME abs_test SRC abs_test.cpp DEPENDS espresso::utils)
 unit_test(NAME Vector_test SRC Vector_test.cpp DEPENDS espresso::utils)
 unit_test(NAME Factory_test SRC Factory_test.cpp DEPENDS espresso::utils)
 unit_test(NAME NumeratedContainer_test SRC NumeratedContainer_test.cpp DEPENDS
@@ -34,7 +33,6 @@ unit_test(NAME accumulator SRC accumulator.cpp DEPENDS espresso::utils
 unit_test(NAME int_pow SRC int_pow_test.cpp DEPENDS espresso::utils)
 unit_test(NAME sgn SRC sgn_test.cpp DEPENDS espresso::utils)
 unit_test(NAME AS_erfc_part SRC AS_erfc_part_test.cpp DEPENDS espresso::utils)
-unit_test(NAME sinc SRC sinc_test.cpp DEPENDS espresso::utils)
 unit_test(NAME permute_ifield_test SRC permute_ifield_test.cpp DEPENDS
           espresso::utils)
 unit_test(NAME vec_rotate SRC vec_rotate_test.cpp DEPENDS espresso::utils)
diff --git a/src/utils/tests/Vector_test.cpp b/src/utils/tests/Vector_test.cpp
index 4079f4f5d15..64463077fdf 100644
--- a/src/utils/tests/Vector_test.cpp
+++ b/src/utils/tests/Vector_test.cpp
@@ -36,6 +36,7 @@
 #include <iterator>
 #include <limits>
 #include <numeric>
+#include <span>
 #include <stdexcept>
 #include <type_traits>
 #include <vector>
@@ -43,7 +44,8 @@
 using Utils::Vector;
 
 /* Number of nontrivial Baxter permutations of length 2n-1. (A001185) */
-#define TEST_NUMBERS {0, 1, 1, 7, 21, 112, 456, 2603, 13203}
+#define TEST_NUMBERS                                                           \
+  { 0, 1, 1, 7, 21, 112, 456, 2603, 13203 }
 
 constexpr int test_numbers[] = TEST_NUMBERS;
 constexpr std::size_t n_test_numbers = sizeof(test_numbers) / sizeof(int);
@@ -187,6 +189,13 @@ BOOST_AUTO_TEST_CASE(algebraic_operators) {
     BOOST_CHECK(v4 == (v3 /= 2));
   }
 
+  {
+    Utils::Vector3i v3{2, 12, 91};
+    Utils::Vector3i v4{180, 30, 3};
+    auto v5 = 360 / v3;
+    BOOST_CHECK(v5 == v4);
+  }
+
   BOOST_CHECK((sqrt(Utils::Vector3d{1., 2., 3.}) ==
                Utils::Vector3d{sqrt(1.), sqrt(2.), sqrt(3.)}));
 
@@ -308,6 +317,27 @@ BOOST_AUTO_TEST_CASE(conversion) {
     auto const result = Utils::Vector3d{orig.as_vector()};
     BOOST_TEST(result == orig);
   }
+
+  // check span conversion
+  {
+    auto const view = static_cast<std::span<double, 3>>(orig);
+    BOOST_TEST(view.data() == orig.data());
+    BOOST_TEST(view.size() == orig.size());
+  }
+
+  // check span conversion
+  {
+    auto const view = std::span(orig);
+    BOOST_TEST(view.data() == orig.data());
+    BOOST_TEST(view.size() == orig.size());
+  }
+
+  // check span conversion
+  {
+    auto const view = orig.as_span();
+    BOOST_TEST(view.data() == orig.data());
+    BOOST_TEST(view.size() == orig.size());
+  }
 }
 
 BOOST_AUTO_TEST_CASE(vector_product_test) {
diff --git a/src/utils/tests/abs_test.cpp b/src/utils/tests/abs_test.cpp
deleted file mode 100644
index 94e43728e7b..00000000000
--- a/src/utils/tests/abs_test.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2019-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#define BOOST_TEST_MODULE abs test
-#define BOOST_TEST_DYN_LINK
-#include <boost/test/unit_test.hpp>
-
-#include <utils/math/abs.hpp>
-
-#include <cmath>
-#include <type_traits>
-
-BOOST_AUTO_TEST_CASE(abs_test) {
-  using Utils::abs;
-
-  static_assert(std::is_same_v<float, decltype(abs(1.f))>);
-  static_assert(std::is_same_v<double, decltype(abs(1.))>);
-
-  BOOST_CHECK_EQUAL(std::abs(3.1415), abs(3.1415));
-  BOOST_CHECK_EQUAL(std::abs(-3.1415), abs(-3.1415));
-  BOOST_CHECK_EQUAL(std::abs(3.1415f), abs(3.1415f));
-  BOOST_CHECK_EQUAL(std::abs(-3.1415f), abs(-3.1415f));
-}
diff --git a/src/utils/tests/sinc_test.cpp b/src/utils/tests/sinc_test.cpp
deleted file mode 100644
index 063120ca971..00000000000
--- a/src/utils/tests/sinc_test.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2017-2022 The ESPResSo project
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#define BOOST_TEST_MODULE Utils::sinc test
-#define BOOST_TEST_DYN_LINK
-#include <boost/test/unit_test.hpp>
-
-#include "utils/math/sinc.hpp"
-
-#include <cmath>
-#include <cstdlib>
-#include <numbers>
-
-BOOST_AUTO_TEST_CASE(zero) { BOOST_CHECK_EQUAL(Utils::sinc(0.0), 1.0); }
-
-BOOST_AUTO_TEST_CASE(approx) {
-  auto x = 0.001;
-  while (x <= 0.11) {
-    auto const approx = Utils::sinc(x);
-    auto const pi_x = std::numbers::pi * x;
-    auto const exact = std::sin(pi_x) / (pi_x);
-    BOOST_CHECK_SMALL(approx - exact, 1e-13);
-    x += 0.01;
-  }
-}