From 77eb93a0d1dad5afcb18f6c4ed7a3c5ae2e2c0bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= <jgrad@icp.uni-stuttgart.de>
Date: Fri, 14 Jun 2024 19:08:23 +0200
Subject: [PATCH] fft

---
 src/core/electrostatics/p3m.cpp               | 245 ++++++------
 src/core/electrostatics/p3m_gpu_cuda.cu       |  94 ++---
 src/core/electrostatics/p3m_gpu_error_cuda.cu |  55 +--
 src/core/magnetostatics/dp3m.cpp              | 360 ++++++++----------
 src/core/p3m/common.cpp                       |  44 ---
 src/core/p3m/common.hpp                       |  16 +-
 src/core/p3m/fft.cpp                          |   2 +-
 src/core/p3m/fft.hpp                          |  43 +--
 src/core/p3m/influence_function.hpp           | 115 +++---
 src/core/p3m/influence_function_dipolar.hpp   | 219 ++++++-----
 src/core/unit_tests/CMakeLists.txt            |   4 +-
 src/core/unit_tests/cuda_test.cu              | 329 ++++++++--------
 src/core/unit_tests/fft_test.cpp              |  58 ++-
 src/core/unit_tests/p3m_test.cpp              |  26 --
 src/utils/include/utils/Vector.hpp            |   2 +-
 src/utils/include/utils/math/abs.hpp          |  40 --
 src/utils/include/utils/math/sinc.hpp         |  61 ---
 src/utils/tests/CMakeLists.txt                |   2 -
 src/utils/tests/abs_test.cpp                  |  38 --
 src/utils/tests/sinc_test.cpp                 |  39 --
 20 files changed, 741 insertions(+), 1051 deletions(-)
 delete mode 100644 src/utils/include/utils/math/abs.hpp
 delete mode 100644 src/utils/include/utils/math/sinc.hpp
 delete mode 100644 src/utils/tests/abs_test.cpp
 delete mode 100644 src/utils/tests/sinc_test.cpp
diff --git a/src/core/electrostatics/p3m.cpp b/src/core/electrostatics/p3m.cpp
index e247b88418d..32667d67966 100644
--- a/src/core/electrostatics/p3m.cpp
+++ b/src/core/electrostatics/p3m.cpp
@@ -36,7 +36,9 @@
 #include "p3m/TuningAlgorithm.hpp"
 #include "p3m/TuningLogger.hpp"
 #include "p3m/fft.hpp"
+#include "p3m/for_each_3d.hpp"
 #include "p3m/influence_function.hpp"
+#include "p3m/math.hpp"
 
 #include "BoxGeometry.hpp"
 #include "LocalBox.hpp"
@@ -57,7 +59,6 @@
 #include <utils/Vector.hpp>
 #include <utils/integral_parameter.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
@@ -75,9 +76,11 @@
 #include <functional>
 #include <numbers>
 #include <optional>
+#include <span>
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include <tuple>
 #include <utility>
 
 void CoulombP3M::count_charged_particles() {
@@ -114,7 +117,7 @@ void CoulombP3M::calc_influence_function_force() {
   auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh};
 
   p3m.g_force = grid_influence_function<1>(p3m.params, start, start + size,
-                                           get_system().box_geo->length());
+                                           get_system().box_geo->length_inv());
 }
 
 /** Calculate the influence function optimized for the energy and the
@@ -125,41 +128,38 @@ void CoulombP3M::calc_influence_function_energy() {
   auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh};
 
   p3m.g_energy = grid_influence_function<0>(p3m.params, start, start + size,
-                                            get_system().box_geo->length());
+                                            get_system().box_geo->length_inv());
 }
 
 /** Aliasing sum used by @ref p3m_k_space_error. */
-static auto p3m_tune_aliasing_sums(int nx, int ny, int nz,
+static auto p3m_tune_aliasing_sums(Utils::Vector3i const &shift,
                                    Utils::Vector3i const &mesh,
                                    Utils::Vector3d const &mesh_i, int cao,
                                    double alpha_L_i) {
-  using Utils::sinc;
 
+  auto constexpr m_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
   auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
-
   auto alias1 = 0.;
   auto alias2 = 0.;
-  for (int mx = -P3M_BRILLOUIN; mx <= P3M_BRILLOUIN; mx++) {
-    auto const nmx = nx + mx * mesh[0];
-    auto const fnmx = mesh_i[0] * nmx;
-    for (int my = -P3M_BRILLOUIN; my <= P3M_BRILLOUIN; my++) {
-      auto const nmy = ny + my * mesh[1];
-      auto const fnmy = mesh_i[1] * nmy;
-      for (int mz = -P3M_BRILLOUIN; mz <= P3M_BRILLOUIN; mz++) {
-        auto const nmz = nz + mz * mesh[2];
-        auto const fnmz = mesh_i[2] * nmz;
-
-        auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz);
-        auto const ex = exp(-factor1 * nm2);
-        auto const ex2 = Utils::sqr(ex);
-
-        auto const U2 = pow(sinc(fnmx) * sinc(fnmy) * sinc(fnmz), 2.0 * cao);
-
-        alias1 += ex2 / nm2;
-        alias2 += U2 * ex * (nx * nmx + ny * nmy + nz * nmz) / nm2;
-      }
-    }
-  }
+
+  Utils::Vector3i indices{};
+  Utils::Vector3i nm{};
+  Utils::Vector3d fnm{};
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        auto const norm_sq = nm.norm2();
+        auto const ex = exp(-factor1 * norm_sq);
+        auto const energy = std::pow(Utils::product(fnm), 2 * cao);
+        alias1 += Utils::sqr(ex) / norm_sq;
+        alias2 += energy * ex * (shift * nm) / norm_sq;
+      },
+      [&](unsigned dim, int n) {
+        nm[dim] = shift[dim] + n * mesh[dim];
+        fnm[dim] = math::sinc(nm[dim] * mesh_i[dim]);
+      });
+
   return std::make_pair(alias1, alias2);
 }
 
@@ -197,51 +197,40 @@ static double p3m_real_space_error(double pref, double r_cut_iL, int n_c_part,
 static double p3m_k_space_error(double pref, Utils::Vector3i const &mesh,
                                 int cao, int n_c_part, double sum_q2,
                                 double alpha_L, Utils::Vector3d const &box_l) {
-  auto const mesh_i =
-      Utils::hadamard_division(Utils::Vector3d::broadcast(1.), mesh);
+
+  auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao);
+  auto const mesh_i = 1. / Utils::Vector3d(mesh);
   auto const alpha_L_i = 1. / alpha_L;
+  auto const m_stop = mesh / 2;
+  auto const m_start = -m_stop;
+  auto indices = Utils::Vector3i{};
+  auto values = Utils::Vector3d{};
   auto he_q = 0.;
 
-  for (int nx = -mesh[0] / 2; nx < mesh[0] / 2; nx++) {
-    auto const ctan_x = p3m_analytic_cotangent_sum(nx, mesh_i[0], cao);
-    for (int ny = -mesh[1] / 2; ny < mesh[1] / 2; ny++) {
-      auto const ctan_y =
-          ctan_x * p3m_analytic_cotangent_sum(ny, mesh_i[1], cao);
-      for (int nz = -mesh[2] / 2; nz < mesh[2] / 2; nz++) {
-        if ((nx != 0) || (ny != 0) || (nz != 0)) {
-          auto const n2 = Utils::sqr(nx) + Utils::sqr(ny) + Utils::sqr(nz);
-          auto const cs =
-              p3m_analytic_cotangent_sum(nz, mesh_i[2], cao) * ctan_y;
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
+          auto const n2 = indices.norm2();
+          auto const cs = Utils::product(values);
           auto const [alias1, alias2] =
-              p3m_tune_aliasing_sums(nx, ny, nz, mesh, mesh_i, cao, alpha_L_i);
+              p3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i);
           auto const d = alias1 - Utils::sqr(alias2 / cs) / n2;
           /* at high precision, d can become negative due to extinction;
              also, don't take values that have no significant digits left*/
-          if (d > 0 && (fabs(d / alias1) > ROUND_ERROR_PREC))
+          if (d > 0. and std::fabs(d / alias1) > ROUND_ERROR_PREC) {
             he_q += d;
+          }
         }
-      }
-    }
-  }
+      },
+      [&values, &mesh_i, cotangent_sum](unsigned dim, int n) {
+        values[dim] = cotangent_sum(n, mesh_i[dim]);
+      });
+
   return 2. * pref * sum_q2 * sqrt(he_q / static_cast<double>(n_c_part)) /
          (box_l[1] * box_l[2]);
 }
 
-#ifdef CUDA
-static double p3mgpu_k_space_error(double prefactor,
-                                   Utils::Vector3i const &mesh, int cao,
-                                   int npart, double sum_q2, double alpha_L,
-                                   Utils::Vector3d const &box_l) {
-  auto ks_err = 0.;
-  if (this_node == 0) {
-    ks_err = p3m_k_space_error_gpu(prefactor, mesh.data(), cao, npart, sum_q2,
-                                   alpha_L, box_l.data());
-  }
-  boost::mpi::broadcast(comm_cart, ks_err, 0);
-  return ks_err;
-}
-#endif
-
 void CoulombP3M::init() {
   assert(p3m.params.mesh >= Utils::Vector3i::broadcast(1));
   assert(p3m.params.cao >= 1 and p3m.params.cao <= 7);
@@ -421,48 +410,44 @@ CoulombP3M::long_range_pressure(ParticleRange const &particles) {
     p3m.sm.gather_grid(p3m.rs_mesh.data(), comm_cart, p3m.local_mesh.dim);
     fft_perform_forw(p3m.rs_mesh.data(), p3m.fft, comm_cart);
 
-    auto diagonal = 0.;
-    int ind = 0;
-    int j[3];
+    auto constexpr m_start = Utils::Vector3i::broadcast(0);
     auto const half_alpha_inv_sq = Utils::sqr(1. / 2. / p3m.params.alpha);
-    for (j[0] = 0; j[0] < p3m.fft.plan[3].new_mesh[RX]; j[0]++) {
-      for (j[1] = 0; j[1] < p3m.fft.plan[3].new_mesh[RY]; j[1]++) {
-        for (j[2] = 0; j[2] < p3m.fft.plan[3].new_mesh[RZ]; j[2]++) {
-          auto const kx = 2. * std::numbers::pi *
-                          p3m.d_op[RX][j[KX] + p3m.fft.plan[3].start[KX]] *
-                          box_geo.length_inv()[RX];
-          auto const ky = 2. * std::numbers::pi *
-                          p3m.d_op[RY][j[KY] + p3m.fft.plan[3].start[KY]] *
-                          box_geo.length_inv()[RY];
-          auto const kz = 2. * std::numbers::pi *
-                          p3m.d_op[RZ][j[KZ] + p3m.fft.plan[3].start[KZ]] *
-                          box_geo.length_inv()[RZ];
-          auto const sqk = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz);
-
-          if (sqk != 0.) {
-            auto const node_k_space_energy =
-                p3m.g_energy[ind] * (Utils::sqr(p3m.rs_mesh[2 * ind]) +
-                                     Utils::sqr(p3m.rs_mesh[2 * ind + 1]));
-            auto const vterm = -2. * (1. / sqk + half_alpha_inv_sq);
-            auto const pref = node_k_space_energy * vterm;
-            node_k_space_pressure_tensor[0] += pref * kx * kx; /* sigma_xx */
-            node_k_space_pressure_tensor[1] += pref * kx * ky; /* sigma_xy */
-            node_k_space_pressure_tensor[2] += pref * kx * kz; /* sigma_xz */
-            node_k_space_pressure_tensor[3] += pref * ky * kx; /* sigma_yx */
-            node_k_space_pressure_tensor[4] += pref * ky * ky; /* sigma_yy */
-            node_k_space_pressure_tensor[5] += pref * ky * kz; /* sigma_yz */
-            node_k_space_pressure_tensor[6] += pref * kz * kx; /* sigma_zx */
-            node_k_space_pressure_tensor[7] += pref * kz * ky; /* sigma_zy */
-            node_k_space_pressure_tensor[8] += pref * kz * kz; /* sigma_zz */
-            diagonal += node_k_space_energy;
-          }
-          ind++;
-        }
+    auto const m_stop = std::span(p3m.fft.plan[3].new_mesh);
+    auto const offset = std::span(p3m.fft.plan[3].start);
+    auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv();
+    auto indices = Utils::Vector3i{};
+    auto index = std::size_t(0u);
+    auto diagonal = 0.;
+
+    for_each_3d(m_start, m_stop, indices, [&]() {
+      auto const kx = p3m.d_op[RX][indices[KX] + offset[KX]] * wavevector[RX];
+      auto const ky = p3m.d_op[RY][indices[KY] + offset[KY]] * wavevector[RY];
+      auto const kz = p3m.d_op[RZ][indices[KZ] + offset[KZ]] * wavevector[RZ];
+      auto const norm_sq = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz);
+
+      if (norm_sq != 0.) {
+        auto const node_k_space_energy =
+            p3m.g_energy[index] * (Utils::sqr(p3m.rs_mesh[2u * index + 0u]) +
+                                   Utils::sqr(p3m.rs_mesh[2u * index + 1u]));
+        auto const vterm = -2. * (1. / norm_sq + half_alpha_inv_sq);
+        auto const pref = node_k_space_energy * vterm;
+        node_k_space_pressure_tensor[0u] += pref * kx * kx; /* sigma_xx */
+        node_k_space_pressure_tensor[1u] += pref * kx * ky; /* sigma_xy */
+        node_k_space_pressure_tensor[2u] += pref * kx * kz; /* sigma_xz */
+        node_k_space_pressure_tensor[4u] += pref * ky * ky; /* sigma_yy */
+        node_k_space_pressure_tensor[5u] += pref * ky * kz; /* sigma_yz */
+        node_k_space_pressure_tensor[8u] += pref * kz * kz; /* sigma_zz */
+        diagonal += node_k_space_energy;
       }
-    }
-    node_k_space_pressure_tensor[0] += diagonal;
-    node_k_space_pressure_tensor[4] += diagonal;
-    node_k_space_pressure_tensor[8] += diagonal;
+      ++index;
+    });
+
+    node_k_space_pressure_tensor[0u] += diagonal;
+    node_k_space_pressure_tensor[4u] += diagonal;
+    node_k_space_pressure_tensor[8u] += diagonal;
+    node_k_space_pressure_tensor[3u] = node_k_space_pressure_tensor[1u];
+    node_k_space_pressure_tensor[6u] = node_k_space_pressure_tensor[2u];
+    node_k_space_pressure_tensor[7u] = node_k_space_pressure_tensor[5u];
   }
 
   return node_k_space_pressure_tensor * prefactor / (2. * box_geo.volume());
@@ -509,35 +494,35 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag,
 
   /* === k-space force calculation  === */
   if (force_flag) {
-    /* sqrt(-1)*k differentiation */
-    int j[3];
-    int ind = 0;
+    /* i*k differentiation */
+    auto constexpr m_start = Utils::Vector3i::broadcast(0);
+    auto const m_stop = std::span(p3m.fft.plan[3].new_mesh);
+    auto const offset = std::span(p3m.fft.plan[3].start);
+    auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv();
+    auto indices = Utils::Vector3i{};
+    auto index = std::size_t(0u);
+
     /* compute electric field */
     // Eq. (3.49) @cite deserno00b
-    for (j[0] = 0; j[0] < p3m.fft.plan[3].new_mesh[0]; j[0]++) {
-      for (j[1] = 0; j[1] < p3m.fft.plan[3].new_mesh[1]; j[1]++) {
-        for (j[2] = 0; j[2] < p3m.fft.plan[3].new_mesh[2]; j[2]++) {
-          auto const rho_hat = std::complex<double>(p3m.rs_mesh[2 * ind + 0],
-                                                    p3m.rs_mesh[2 * ind + 1]);
-          auto const phi_hat = p3m.g_force[ind] * rho_hat;
-
-          for (int d = 0; d < 3; d++) {
-            /* direction in r-space: */
-            int d_rs = (d + p3m.ks_pnum) % 3;
-            /* directions */
-            auto const k = 2. * std::numbers::pi *
-                           p3m.d_op[d_rs][j[d] + p3m.fft.plan[3].start[d]] *
-                           box_geo.length_inv()[d_rs];
-
-            /* i*k*(Re+i*Im) = - Im*k + i*Re*k     (i=sqrt(-1)) */
-            p3m.E_mesh[d_rs][2 * ind + 0] = -k * phi_hat.imag();
-            p3m.E_mesh[d_rs][2 * ind + 1] = +k * phi_hat.real();
-          }
-
-          ind++;
-        }
+    for_each_3d(m_start, m_stop, indices, [&]() {
+      auto const rho_hat = std::complex<double>(p3m.rs_mesh[2u * index + 0u],
+                                                p3m.rs_mesh[2u * index + 1u]);
+      auto const phi_hat = p3m.g_force[index] * rho_hat;
+
+      for (int d = 0; d < 3; d++) {
+        /* direction in r-space: */
+        int d_rs = (d + p3m.ks_pnum) % 3;
+        /* directions */
+        auto const k =
+            p3m.d_op[d_rs][indices[d] + offset[d]] * wavevector[d_rs];
+
+        /* i*k*(Re+i*Im) = - Im*k + i*Re*k     (i=sqrt(-1)) */
+        p3m.E_mesh[d_rs][2u * index + 0u] = -k * phi_hat.imag();
+        p3m.E_mesh[d_rs][2u * index + 1u] = +k * phi_hat.real();
       }
-    }
+
+      ++index;
+    });
 
     /* Back FFT force component mesh */
     auto const check_complex = !p3m.params.tuning and check_complex_residuals;
@@ -678,8 +663,12 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
 #ifdef CUDA
     auto const &solver = m_system.coulomb.impl->solver;
     if (has_actor_of_type<CoulombP3MGPU>(solver)) {
-      ks_err = p3mgpu_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart,
-                                    p3m.sum_q2, alpha_L, box_geo.length());
+      if (this_node == 0) {
+        ks_err =
+            p3m_k_space_error_gpu(m_prefactor, mesh.data(), cao, p3m.sum_qpart,
+                                  p3m.sum_q2, alpha_L, box_geo.length().data());
+      }
+      boost::mpi::broadcast(comm_cart, ks_err, 0);
     } else
 #endif
       ks_err = p3m_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart,
@@ -807,7 +796,7 @@ void CoulombP3M::sanity_checks_boxl() const {
   auto const &system = get_system();
   auto const &box_geo = *system.box_geo;
   auto const &local_geo = *system.local_geo;
-  for (unsigned int i = 0u; i < 3u; i++) {
+  for (auto i = 0u; i < 3u; i++) {
     /* check k-space cutoff */
     if (p3m.params.cao_cut[i] >= box_geo.length_half()[i]) {
       std::stringstream msg;
diff --git a/src/core/electrostatics/p3m_gpu_cuda.cu b/src/core/electrostatics/p3m_gpu_cuda.cu
index 6429c8afeed..c734ba0cccd 100644
--- a/src/core/electrostatics/p3m_gpu_cuda.cu
+++ b/src/core/electrostatics/p3m_gpu_cuda.cu
@@ -53,20 +53,20 @@
 #include "electrostatics/p3m_gpu_cuda.cuh"
 
 #include "cuda/utils.cuh"
+#include "p3m/math.hpp"
 #include "system/System.hpp"
 
 #include <utils/math/bspline.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <cuda.h>
 #include <cufft.h>
 
 #include <algorithm>
+#include <cassert>
 #include <cstddef>
-#include <cstdio>
-#include <cstdlib>
+#include <limits>
 #include <numbers>
 #include <stdexcept>
 
@@ -93,7 +93,7 @@ struct P3MGpuData {
   /** Ewald parameter */
   REAL_TYPE alpha;
   /** Number of particles */
-  std::size_t n_part;
+  unsigned int n_part; // oddity: size_t causes UB with GCC 11.4 in Debug mode
   /** Box size */
   REAL_TYPE box[3];
   /** Mesh dimensions */
@@ -138,6 +138,31 @@ struct P3MGpuParams {
   }
 };
 
+static auto p3m_calc_blocks(unsigned int cao, std::size_t n_part) {
+  auto const cao3 = Utils::int_pow<3>(cao);
+  auto parts_per_block = 1u;
+  while ((parts_per_block + 1u) * cao3 <= 1024u) {
+    ++parts_per_block;
+  }
+  assert((n_part / parts_per_block) <= std::numeric_limits<unsigned>::max());
+  auto n = static_cast<unsigned int>(n_part / parts_per_block);
+  auto n_blocks = ((n_part % parts_per_block) == 0u) ? std::max(1u, n) : n + 1u;
+  assert(n_blocks <= std::numeric_limits<unsigned>::max());
+  return std::make_pair(parts_per_block, static_cast<unsigned>(n_blocks));
+}
+
+dim3 p3m_make_grid(unsigned int n_blocks) {
+  dim3 grid(n_blocks, 1u, 1u);
+  while (grid.x > 65536u) {
+    grid.y++;
+    if ((n_blocks % grid.y) == 0u)
+      grid.x = std::max(1u, n_blocks / grid.y);
+    else
+      grid.x = n_blocks / grid.y + 1u;
+  }
+  return grid;
+}
+
 template <int cao>
 __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY,
                                         int NZ, REAL_TYPE *Zaehler,
@@ -160,15 +185,15 @@ __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY,
   for (MX = -P3M_BRILLOUIN; MX <= P3M_BRILLOUIN; MX++) {
     NMX = static_cast<REAL_TYPE>(((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX) +
                                  p.mesh[0] * MX);
-    S1 = int_pow<2 * cao>(Utils::sinc(Meshi[0] * NMX));
+    S1 = int_pow<2 * cao>(math::sinc(Meshi[0] * NMX));
     for (MY = -P3M_BRILLOUIN; MY <= P3M_BRILLOUIN; MY++) {
       NMY = static_cast<REAL_TYPE>(
           ((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY) + p.mesh[1] * MY);
-      S2 = S1 * int_pow<2 * cao>(Utils::sinc(Meshi[1] * NMY));
+      S2 = S1 * int_pow<2 * cao>(math::sinc(Meshi[1] * NMY));
       for (MZ = -P3M_BRILLOUIN; MZ <= P3M_BRILLOUIN; MZ++) {
         NMZ = static_cast<REAL_TYPE>(
             ((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ) + p.mesh[2] * MZ);
-        S3 = S2 * int_pow<2 * cao>(Utils::sinc(Meshi[2] * NMZ));
+        S3 = S2 * int_pow<2 * cao>(math::sinc(Meshi[2] * NMZ));
 
         NM2 = sqr(NMX * Leni[0]) + sqr(NMY * Leni[1]) + sqr(NMZ * Leni[2]);
         *Nenner += S3;
@@ -366,26 +391,9 @@ void assign_charges(P3MGpuData const &params,
                     float const *const __restrict__ part_pos,
                     float const *const __restrict__ part_q) {
   auto const cao = static_cast<unsigned int>(params.cao);
-  auto const cao3 = int_pow<3>(cao);
-  unsigned int parts_per_block = 1u, n_blocks = 1u;
-
-  while ((parts_per_block + 1u) * cao3 <= 1024u) {
-    parts_per_block++;
-  }
-  if ((params.n_part % parts_per_block) == 0u)
-    n_blocks = std::max<unsigned>(1u, params.n_part / parts_per_block);
-  else
-    n_blocks = params.n_part / parts_per_block + 1u;
-
+  auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part);
   dim3 block(parts_per_block * cao, cao, cao);
-  dim3 grid(n_blocks, 1u, 1u);
-  while (grid.x > 65536u) {
-    grid.y++;
-    if ((n_blocks % grid.y) == 0u)
-      grid.x = std::max<unsigned>(1u, n_blocks / grid.y);
-    else
-      grid.x = n_blocks / grid.y + 1u;
-  }
+  dim3 grid = p3m_make_grid(n_blocks);
 
   auto const data_length = std::size_t(3u) *
                            static_cast<std::size_t>(parts_per_block) *
@@ -438,7 +446,7 @@ __global__ void assign_forces_kernel(P3MGpuData const params,
   /* id of the particle */
   auto const id =
       parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
-  if (id >= params.n_part)
+  if (id >= static_cast<unsigned>(params.n_part))
     return;
   /* position relative to the closest grid point */
   REAL_TYPE m_pos[3];
@@ -500,27 +508,9 @@ void assign_forces(P3MGpuData const &params,
                    float *const __restrict__ part_f,
                    REAL_TYPE const prefactor) {
   auto const cao = static_cast<unsigned int>(params.cao);
-  auto const cao3 = int_pow<3>(cao);
-  unsigned int parts_per_block = 1u, n_blocks = 1u;
-
-  while ((parts_per_block + 1u) * cao3 <= 1024u) {
-    parts_per_block++;
-  }
-
-  if ((params.n_part % parts_per_block) == 0u)
-    n_blocks = std::max<unsigned>(1u, params.n_part / parts_per_block);
-  else
-    n_blocks = params.n_part / parts_per_block + 1u;
-
+  auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part);
   dim3 block(parts_per_block * cao, cao, cao);
-  dim3 grid(n_blocks, 1u, 1u);
-  while (grid.x > 65536u) {
-    grid.y++;
-    if (n_blocks % grid.y == 0u)
-      grid.x = std::max<unsigned>(1u, n_blocks / grid.y);
-    else
-      grid.x = n_blocks / grid.y + 1u;
-  }
+  dim3 grid = p3m_make_grid(n_blocks);
 
   /* Switch for assignment templates, the shared version only is faster for cao
    * > 2 */
@@ -581,7 +571,8 @@ void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
 
   auto &p3m_gpu_data = data->p3m_gpu_data;
   bool do_reinit = false, mesh_changed = false;
-  p3m_gpu_data.n_part = n_part;
+  assert(n_part <= std::numeric_limits<unsigned int>::max());
+  p3m_gpu_data.n_part = static_cast<unsigned>(n_part);
 
   if (not data->is_initialized or p3m_gpu_data.alpha != alpha) {
     p3m_gpu_data.alpha = static_cast<REAL_TYPE>(alpha);
@@ -694,9 +685,9 @@ void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
 void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu,
                                 double prefactor, std::size_t n_part) {
   auto &p3m_gpu_data = data.p3m_gpu_data;
-  p3m_gpu_data.n_part = n_part;
+  p3m_gpu_data.n_part = static_cast<unsigned>(n_part);
 
-  if (p3m_gpu_data.n_part == 0u)
+  if (n_part == 0u)
     return;
 
   auto const positions_device = gpu.get_particle_positions_device();
@@ -722,8 +713,7 @@ void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu,
   if (FFT_FORW_FFT(data.p3m_fft.forw_plan,
                    (REAL_TYPE *)p3m_gpu_data.charge_mesh,
                    p3m_gpu_data.charge_mesh) != CUFFT_SUCCESS) {
-    fprintf(stderr, "CUFFT error: Forward FFT failed\n");
-    return;
+    throw std::runtime_error("CUFFT error: Forward FFT failed");
   }
 
   /* Do convolution */
diff --git a/src/core/electrostatics/p3m_gpu_error_cuda.cu b/src/core/electrostatics/p3m_gpu_error_cuda.cu
index 6e54e2dfe7e..fbc51fecf4a 100644
--- a/src/core/electrostatics/p3m_gpu_error_cuda.cu
+++ b/src/core/electrostatics/p3m_gpu_error_cuda.cu
@@ -23,12 +23,12 @@
  *  The corresponding header file is p3m_gpu_error.hpp.
  */
 
+#include "p3m/math.hpp"
 #include "p3m_gpu_error.hpp"
 
 #include "cuda/utils.cuh"
 
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <thrust/device_vector.h>
@@ -44,49 +44,12 @@
 #error CU-file includes mpi.h! This should not happen!
 #endif
 
-using Utils::int_pow;
-using Utils::sqr;
-
-/** @todo Extend to higher order. This comes from some 1/sin expansion in
- *  @cite hockney88a
- */
-
-template <int cao>
-__device__ static double p3m_analytic_cotangent_sum(int n, double mesh_i) {
-  const double c = sqr(cos(std::numbers::pi * mesh_i * n));
-
-  switch (cao) {
-  case 1:
-    return 1;
-  case 2:
-    return (1.0 + c * 2.0) / 3.0;
-  case 3:
-    return (2.0 + c * (11.0 + c * 2.0)) / 15.0;
-  case 4:
-    return (17.0 + c * (180.0 + c * (114.0 + c * 4.0))) / 315.0;
-  case 5:
-    return (62.0 + c * (1072.0 + c * (1452.0 + c * (247.0 + c * 2.0)))) /
-           2835.0;
-  case 6:
-    return (1382.0 +
-            c * (35396.0 +
-                 c * (83021.0 + c * (34096.0 + c * (2026.0 + c * 4.0))))) /
-           155925.0;
-  case 7:
-    return (21844.0 +
-            c * (776661.0 +
-                 c * (2801040.0 +
-                      c * (2123860.0 +
-                           c * (349500.0 + c * (8166.0 + c * 4.0)))))) /
-           6081075.0;
-  }
-
-  return 0.0;
-}
-
 template <int cao>
 __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi,
                                                 double alpha_L, double *he_q) {
+  using Utils::int_pow;
+  using Utils::sqr;
+
   const int nx =
       -mesh.x / 2 + static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
   const int ny =
@@ -103,14 +66,14 @@ __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi,
   if ((nx != 0) || (ny != 0) || (nz != 0)) {
     const double alpha_L_i = 1. / alpha_L;
     const double n2 = sqr(nx) + sqr(ny) + sqr(nz);
-    const double cs = p3m_analytic_cotangent_sum<cao>(nz, meshi.z) *
-                      p3m_analytic_cotangent_sum<cao>(nx, meshi.x) *
-                      p3m_analytic_cotangent_sum<cao>(ny, meshi.y);
+    const double cs = math::analytic_cotangent_sum<cao>(nz, meshi.z) *
+                      math::analytic_cotangent_sum<cao>(nx, meshi.x) *
+                      math::analytic_cotangent_sum<cao>(ny, meshi.y);
     const double ex = exp(-sqr(std::numbers::pi * alpha_L_i) * n2);
     const double ex2 = sqr(ex);
     const double U2 =
-        int_pow<2 * cao>(Utils::sinc(meshi.x * nx) * Utils::sinc(meshi.y * ny) *
-                         Utils::sinc(meshi.z * nz));
+        int_pow<2 * cao>(math::sinc(meshi.x * nx) * math::sinc(meshi.y * ny) *
+                         math::sinc(meshi.z * nz));
     auto const alias1 = ex2 / n2;
     auto const d = alias1 - sqr(U2 * ex / cs) / n2;
 
diff --git a/src/core/magnetostatics/dp3m.cpp b/src/core/magnetostatics/dp3m.cpp
index 4fcad22318e..4214900fb52 100644
--- a/src/core/magnetostatics/dp3m.cpp
+++ b/src/core/magnetostatics/dp3m.cpp
@@ -37,6 +37,7 @@
 #include "p3m/fft.hpp"
 #include "p3m/influence_function_dipolar.hpp"
 #include "p3m/interpolation.hpp"
+#include "p3m/math.hpp"
 #include "p3m/send_mesh.hpp"
 
 #include "BoxGeometry.hpp"
@@ -56,7 +57,6 @@
 #include <utils/Vector.hpp>
 #include <utils/integral_parameter.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
@@ -67,10 +67,13 @@
 #include <cstddef>
 #include <cstdio>
 #include <functional>
+#include <iterator>
 #include <numbers>
 #include <optional>
+#include <span>
 #include <sstream>
 #include <stdexcept>
+#include <tuple>
 #include <vector>
 
 void DipolarP3M::count_magnetic_particles() {
@@ -272,15 +275,13 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
     dipole_assign(particles);
     /* Gather information for FFT grid inside the nodes domain (inner local
      * mesh) and perform forward 3D FFT (Charge Assignment Mesh). */
-    std::array<double *, 3> meshes = {{dp3m.rs_mesh_dip[0].data(),
-                                       dp3m.rs_mesh_dip[1].data(),
-                                       dp3m.rs_mesh_dip[2].data()}};
-
+    std::array<double *, 3> meshes = {{dp3m.rs_mesh_dip[0u].data(),
+                                       dp3m.rs_mesh_dip[1u].data(),
+                                       dp3m.rs_mesh_dip[2u].data()}};
     dp3m.sm.gather_grid(meshes, comm_cart, dp3m.local_mesh.dim);
-
-    fft_perform_forw(dp3m.rs_mesh_dip[0].data(), dp3m.fft, comm_cart);
-    fft_perform_forw(dp3m.rs_mesh_dip[1].data(), dp3m.fft, comm_cart);
-    fft_perform_forw(dp3m.rs_mesh_dip[2].data(), dp3m.fft, comm_cart);
+    for (auto i = 0u; i < 3u; ++i) {
+      fft_perform_forw(dp3m.rs_mesh_dip[i].data(), dp3m.fft, comm_cart);
+    }
     // Note: after these calls, the grids are in the order yzx and not xyz
     // anymore!!!
   }
@@ -293,34 +294,32 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
     if (dp3m.sum_mu2 > 0.) {
       /* i*k differentiation for dipolar gradients:
        * |(\Fourier{\vect{mu}}(k)\cdot \vect{k})|^2 */
-      int ind = 0;
-      int i = 0;
-      int j[3];
-      double node_energy = 0.0;
-      for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) {
-        for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; j[1]++) {
-          for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; j[2]++) {
-            node_energy +=
-                dp3m.g_energy[i] *
-                (Utils::sqr(
-                     dp3m.rs_mesh_dip[0][ind] *
-                         dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                     dp3m.rs_mesh_dip[1][ind] *
-                         dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                     dp3m.rs_mesh_dip[2][ind] *
-                         dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]) +
-                 Utils::sqr(
-                     dp3m.rs_mesh_dip[0][ind + 1] *
-                         dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                     dp3m.rs_mesh_dip[1][ind + 1] *
-                         dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                     dp3m.rs_mesh_dip[2][ind + 1] *
-                         dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]));
-            ind += 2;
-            i++;
-          }
-        }
-      }
+
+      auto constexpr m_start = Utils::Vector3i::broadcast(0);
+      auto const m_stop = std::span(dp3m.fft.plan[3].new_mesh);
+      auto const offset = std::span(dp3m.fft.plan[3].start);
+      auto const &d_op = dp3m.d_op[0u];
+      auto const &mesh_dip = dp3m.rs_mesh_dip;
+      auto indices = Utils::Vector3i{};
+      auto index = std::size_t(0u);
+      auto it_energy = dp3m.g_energy.begin();
+      auto node_energy = 0.;
+      for_each_3d(m_start, m_stop, indices, [&]() {
+        using namespace detail::FFT_indexing;
+        // Re(mu)*k
+        auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
+                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
+                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        ++index;
+        // Im(mu)*k
+        auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
+                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
+                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        ++index;
+        node_energy += *it_energy * (Utils::sqr(re) + Utils::sqr(im));
+        std::advance(it_energy, 1);
+      });
+
       node_energy *= dipole_prefac * std::numbers::pi * box_geo.length_inv()[0];
       boost::mpi::reduce(comm_cart, node_energy, energy, std::plus<>(), 0);
 
@@ -344,62 +343,45 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
      * DIPOLAR TORQUES (k-space)
      ****************************/
     if (dp3m.sum_mu2 > 0.) {
-      auto const two_pi_L_i = 2. * std::numbers::pi * box_geo.length_inv()[0];
+      auto const wavenumber = 2. * std::numbers::pi * box_geo.length_inv()[0u];
+      auto constexpr m_start = Utils::Vector3i::broadcast(0);
+      auto const m_stop = std::span(dp3m.fft.plan[3].new_mesh);
+      auto const offset = std::span(dp3m.fft.plan[3].start);
+      auto const &d_op = dp3m.d_op[0u];
+      auto const &mesh_dip = dp3m.rs_mesh_dip;
+      auto indices = Utils::Vector3i{};
+      auto index = std::size_t(0u);
+
       /* fill in ks_mesh array for torque calculation */
-      int ind = 0;
-      int i = 0;
-      int j[3];
-      double tmp0, tmp1;
-
-      for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { // j[0]=n_y
-        for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1];
-             j[1]++) { // j[1]=n_z
-          for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2];
-               j[2]++) { // j[2]=n_x
-            // tmp0 = Re(mu)*k,   tmp1 = Im(mu)*k
-
-            tmp0 = dp3m.rs_mesh_dip[0][ind] *
-                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                   dp3m.rs_mesh_dip[1][ind] *
-                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                   dp3m.rs_mesh_dip[2][ind] *
-                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
-
-            tmp1 = dp3m.rs_mesh_dip[0][ind + 1] *
-                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                   dp3m.rs_mesh_dip[1][ind + 1] *
-                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                   dp3m.rs_mesh_dip[2][ind + 1] *
-                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
-
-            /* the optimal influence function is the same for torques
-               and energy */
-            dp3m.ks_mesh[ind] = tmp0 * dp3m.g_energy[i];
-            dp3m.ks_mesh[ind + 1] = tmp1 * dp3m.g_energy[i];
-            ind += 2;
-            i++;
-          }
-        }
-      }
+      auto it_energy = dp3m.g_energy.begin();
+      index = 0u;
+      for_each_3d(m_start, m_stop, indices, [&]() {
+        using namespace detail::FFT_indexing;
+        // Re(mu)*k
+        auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
+                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
+                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        dp3m.ks_mesh[index] = *it_energy * re;
+        ++index;
+        // Im(mu)*k
+        auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
+                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
+                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        dp3m.ks_mesh[index] = *it_energy * im;
+        ++index;
+        std::advance(it_energy, 1);
+      });
 
       /* Force component loop */
       for (int d = 0; d < 3; d++) {
-        auto const d_rs = (d + dp3m.ks_pnum) % 3;
-        ind = 0;
-        for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) {
-          for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; j[1]++) {
-            for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; j[2]++) {
-              dp3m.rs_mesh[ind] =
-                  dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
-                  dp3m.ks_mesh[ind];
-              ind++;
-              dp3m.rs_mesh[ind] =
-                  dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
-                  dp3m.ks_mesh[ind];
-              ind++;
-            }
-          }
-        }
+        index = 0u;
+        for_each_3d(m_start, m_stop, indices, [&]() {
+          auto const pos = indices[d] + offset[d];
+          dp3m.rs_mesh[index] = d_op[pos] * dp3m.ks_mesh[index];
+          ++index;
+          dp3m.rs_mesh[index] = d_op[pos] * dp3m.ks_mesh[index];
+          ++index;
+        });
 
         /* Back FFT force component mesh */
         fft_perform_back(dp3m.rs_mesh.data(), false, dp3m.fft, comm_cart);
@@ -407,94 +389,68 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
         dp3m.sm.spread_grid(dp3m.rs_mesh.data(), comm_cart,
                             dp3m.local_mesh.dim);
         /* Assign force component from mesh to particle */
+        auto const d_rs = (d + dp3m.ks_pnum) % 3;
         Utils::integral_parameter<int, AssignTorques, 1, 7>(
-            dp3m.params.cao, dp3m, dipole_prefac * two_pi_L_i, d_rs, particles);
+            dp3m.params.cao, dp3m, dipole_prefac * wavenumber, d_rs, particles);
       }
 
       /***************************
          DIPOLAR FORCES (k-space)
       ****************************/
-
       // Compute forces after torques because the algorithm below overwrites the
       // grids dp3m.rs_mesh_dip !
       // Note: I'll do here 9 inverse FFTs. By symmetry, we can reduce this
       // number to 6 !
       /* fill in ks_mesh array for force calculation */
-      ind = 0;
-      i = 0;
-      for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { // j[0]=n_y
-        for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1];
-             j[1]++) { // j[1]=n_z
-          for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2];
-               j[2]++) { // j[2]=n_x
-            // tmp0 = Im(mu)*k,   tmp1 = -Re(mu)*k
-            tmp0 = dp3m.rs_mesh_dip[0][ind + 1] *
-                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                   dp3m.rs_mesh_dip[1][ind + 1] *
-                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                   dp3m.rs_mesh_dip[2][ind + 1] *
-                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
-            tmp1 = dp3m.rs_mesh_dip[0][ind] *
-                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                   dp3m.rs_mesh_dip[1][ind] *
-                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                   dp3m.rs_mesh_dip[2][ind] *
-                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
-            dp3m.ks_mesh[ind] = tmp0 * dp3m.g_force[i];
-            dp3m.ks_mesh[ind + 1] = -tmp1 * dp3m.g_force[i];
-            ind += 2;
-            i++;
-          }
-        }
-      }
+      auto it_force = dp3m.g_force.begin();
+      index = 0u;
+      for_each_3d(m_start, m_stop, indices, [&]() {
+        using namespace detail::FFT_indexing;
+        // Re(mu)*k
+        auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
+                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
+                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        ++index;
+        // Im(mu)*k
+        auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
+                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
+                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        ++index;
+        dp3m.ks_mesh[index - 2] = *it_force * im;
+        dp3m.ks_mesh[index - 1] = *it_force * (-re);
+        std::advance(it_force, 1);
+      });
 
       /* Force component loop */
-      for (int d = 0; d < 3; d++) { /* direction in k-space: */
-        auto const d_rs = (d + dp3m.ks_pnum) % 3;
-        ind = 0;
-        for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0];
-             j[0]++) { // j[0]=n_y
-          for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1];
-               j[1]++) { // j[1]=n_z
-            for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2];
-                 j[2]++) { // j[2]=n_x
-              tmp0 = dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
-                     dp3m.ks_mesh[ind];
-              dp3m.rs_mesh_dip[0][ind] =
-                  dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] * tmp0;
-              dp3m.rs_mesh_dip[1][ind] =
-                  dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] * tmp0;
-              dp3m.rs_mesh_dip[2][ind] =
-                  dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]] * tmp0;
-              ind++;
-              tmp0 = dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
-                     dp3m.ks_mesh[ind];
-              dp3m.rs_mesh_dip[0][ind] =
-                  dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] * tmp0;
-              dp3m.rs_mesh_dip[1][ind] =
-                  dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] * tmp0;
-              dp3m.rs_mesh_dip[2][ind] =
-                  dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]] * tmp0;
-              ind++;
-            }
-          }
-        }
+      for (int d = 0; d < 3; d++) {
+        auto &rs_mesh_dip = dp3m.rs_mesh_dip;
+        index = 0u;
+        for_each_3d(m_start, m_stop, indices, [&]() {
+          using namespace detail::FFT_indexing;
+          auto const f1 = d_op[indices[d] + offset[d]] * dp3m.ks_mesh[index];
+          rs_mesh_dip[0][index] = d_op[indices[KX] + offset[KX]] * f1;
+          rs_mesh_dip[1][index] = d_op[indices[KY] + offset[KY]] * f1;
+          rs_mesh_dip[2][index] = d_op[indices[KZ] + offset[KZ]] * f1;
+          ++index;
+          auto const f2 = d_op[indices[d] + offset[d]] * dp3m.ks_mesh[index];
+          rs_mesh_dip[0][index] = d_op[indices[KX] + offset[KX]] * f2;
+          rs_mesh_dip[1][index] = d_op[indices[KY] + offset[KY]] * f2;
+          rs_mesh_dip[2][index] = d_op[indices[KZ] + offset[KZ]] * f2;
+          ++index;
+        });
         /* Back FFT force component mesh */
-        fft_perform_back(dp3m.rs_mesh_dip[0].data(), false, dp3m.fft,
-                         comm_cart);
-        fft_perform_back(dp3m.rs_mesh_dip[1].data(), false, dp3m.fft,
-                         comm_cart);
-        fft_perform_back(dp3m.rs_mesh_dip[2].data(), false, dp3m.fft,
-                         comm_cart);
+        for (auto i = 0u; i < 3u; ++i) {
+          fft_perform_back(rs_mesh_dip[i].data(), false, dp3m.fft, comm_cart);
+        }
         /* redistribute force component mesh */
-        std::array<double *, 3> meshes = {{dp3m.rs_mesh_dip[0].data(),
-                                           dp3m.rs_mesh_dip[1].data(),
-                                           dp3m.rs_mesh_dip[2].data()}};
-
+        std::array<double *, 3> meshes = {{rs_mesh_dip[0].data(),
+                                           rs_mesh_dip[1].data(),
+                                           rs_mesh_dip[2].data()}};
         dp3m.sm.spread_grid(meshes, comm_cart, dp3m.local_mesh.dim);
         /* Assign force component from mesh to particle */
+        auto const d_rs = (d + dp3m.ks_pnum) % 3;
         Utils::integral_parameter<int, AssignForces, 1, 7>(
-            dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(two_pi_L_i), d_rs,
+            dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(wavenumber), d_rs,
             particles);
       }
     } /* if (dp3m.sum_mu2 > 0) */
@@ -590,15 +546,15 @@ void DipolarP3M::calc_influence_function_force() {
   auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh};
 
   dp3m.g_force = grid_influence_function<3>(dp3m.params, start, start + size,
-                                            get_system().box_geo->length());
+                                            get_system().box_geo->length_inv());
 }
 
 void DipolarP3M::calc_influence_function_energy() {
   auto const start = Utils::Vector3i{dp3m.fft.plan[3].start};
   auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh};
 
-  dp3m.g_energy = grid_influence_function<2>(dp3m.params, start, start + size,
-                                             get_system().box_geo->length());
+  dp3m.g_energy = grid_influence_function<2>(
+      dp3m.params, start, start + size, get_system().box_geo->length_inv());
 }
 
 class DipolarTuningAlgorithm : public TuningAlgorithm {
@@ -749,62 +705,68 @@ void DipolarP3M::tune() {
 }
 
 /** Tuning dipolar-P3M */
-static auto dp3m_tune_aliasing_sums(int nx, int ny, int nz, int mesh,
+static auto dp3m_tune_aliasing_sums(Utils::Vector3i const &shift, int mesh,
                                     double mesh_i, int cao, double alpha_L_i) {
-  using Utils::sinc;
 
+  auto constexpr m_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
   auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
-
   auto alias1 = 0.;
   auto alias2 = 0.;
-  for (int mx = -P3M_BRILLOUIN; mx <= P3M_BRILLOUIN; mx++) {
-    auto const nmx = nx + mx * mesh;
-    auto const fnmx = mesh_i * nmx;
-    for (int my = -P3M_BRILLOUIN; my <= P3M_BRILLOUIN; my++) {
-      auto const nmy = ny + my * mesh;
-      auto const fnmy = mesh_i * nmy;
-      for (int mz = -P3M_BRILLOUIN; mz <= P3M_BRILLOUIN; mz++) {
-        auto const nmz = nz + mz * mesh;
-        auto const fnmz = mesh_i * nmz;
-
-        auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz);
-        auto const ex = std::exp(-factor1 * nm2);
-
-        auto const U2 = pow(sinc(fnmx) * sinc(fnmy) * sinc(fnmz), 2. * cao);
-
-        alias1 += Utils::sqr(ex) * nm2;
-        alias2 += U2 * ex * pow((nx * nmx + ny * nmy + nz * nmz), 3.) / nm2;
-      }
-    }
-  }
+
+  Utils::Vector3i indices{};
+  Utils::Vector3i nm{};
+  Utils::Vector3d fnm{};
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        auto const norm_sq = nm.norm2();
+        auto const ex = std::exp(-factor1 * norm_sq);
+        auto const U2 = std::pow(Utils::product(fnm), 2 * cao);
+        alias1 += Utils::sqr(ex) * norm_sq;
+        alias2 += U2 * ex * std::pow(shift * nm, 3) / norm_sq;
+      },
+      [&](unsigned dim, int n) {
+        nm[dim] = shift[dim] + n * mesh;
+        fnm[dim] = math::sinc(nm[dim] * mesh_i);
+      });
+
   return std::make_pair(alias1, alias2);
 }
 
 /** Calculate the k-space error of dipolar-P3M */
 static double dp3m_k_space_error(double box_size, int mesh, int cao,
                                  int n_c_part, double sum_q2, double alpha_L) {
-  double he_q = 0.;
-  auto const mesh_i = 1. / mesh;
-  auto const alpha_L_i = 1. / alpha_L;
 
-  for (int nx = -mesh / 2; nx < mesh / 2; nx++)
-    for (int ny = -mesh / 2; ny < mesh / 2; ny++)
-      for (int nz = -mesh / 2; nz < mesh / 2; nz++)
-        if ((nx != 0) || (ny != 0) || (nz != 0)) {
-          auto const n2 = Utils::sqr(nx) + Utils::sqr(ny) + Utils::sqr(nz);
-          auto const cs = p3m_analytic_cotangent_sum(nx, mesh_i, cao) *
-                          p3m_analytic_cotangent_sum(ny, mesh_i, cao) *
-                          p3m_analytic_cotangent_sum(nz, mesh_i, cao);
+  auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao);
+  auto const mesh_i = 1. / static_cast<double>(mesh);
+  auto const alpha_L_i = 1. / alpha_L;
+  auto const m_stop = Utils::Vector3i::broadcast(mesh / 2);
+  auto const m_start = -m_stop;
+  auto indices = Utils::Vector3i{};
+  auto values = Utils::Vector3d{};
+  auto he_q = 0.;
+
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
+          auto const n2 = indices.norm2();
+          auto const cs = Utils::product(values);
           auto const [alias1, alias2] =
-              dp3m_tune_aliasing_sums(nx, ny, nz, mesh, mesh_i, cao, alpha_L_i);
+              dp3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i);
           auto const d =
               alias1 - Utils::sqr(alias2 / cs) /
                            Utils::int_pow<3>(static_cast<double>(n2));
           /* at high precision, d can become negative due to extinction;
              also, don't take values that have no significant digits left*/
-          if (d > 0 && (fabs(d / alias1) > ROUND_ERROR_PREC))
+          if (d > 0. and std::fabs(d / alias1) > ROUND_ERROR_PREC)
             he_q += d;
         }
+      },
+      [&values, &mesh_i, cotangent_sum](unsigned dim, int n) {
+        values[dim] = cotangent_sum(n, mesh_i);
+      });
 
   return 8. * Utils::sqr(std::numbers::pi) / 3. * sum_q2 *
          sqrt(he_q / n_c_part) / Utils::int_pow<4>(box_size);
@@ -885,7 +847,7 @@ void DipolarP3M::sanity_checks_boxl() const {
   auto const &system = get_system();
   auto const &box_geo = *system.box_geo;
   auto const &local_geo = *system.local_geo;
-  for (unsigned int i = 0u; i < 3u; i++) {
+  for (auto i = 0u; i < 3u; i++) {
     /* check k-space cutoff */
     if (dp3m.params.cao_cut[i] >= box_geo.length_half()[i]) {
       std::stringstream msg;
diff --git a/src/core/p3m/common.cpp b/src/core/p3m/common.cpp
index bc87e99a6a8..fb8023eba74 100644
--- a/src/core/p3m/common.cpp
+++ b/src/core/p3m/common.cpp
@@ -28,52 +28,8 @@
 #include "LocalBox.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/math/sqr.hpp>
 
 #include <cmath>
-#include <numbers>
-#include <stdexcept>
-
-double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao) {
-  auto const c =
-      Utils::sqr(std::cos(std::numbers::pi * mesh_i * static_cast<double>(n)));
-
-  switch (cao) {
-  case 1: {
-    return 1.0;
-  }
-  case 2: {
-    return (1.0 + c * 2.0) / 3.0;
-  }
-  case 3: {
-    return (2.0 + c * (11.0 + c * 2.0)) / 15.0;
-  }
-  case 4: {
-    return (17.0 + c * (180.0 + c * (114.0 + c * 4.0))) / 315.0;
-  }
-  case 5: {
-    return (62.0 + c * (1072.0 + c * (1452.0 + c * (247.0 + c * 2.0)))) /
-           2835.0;
-  }
-  case 6: {
-    return (1382.0 +
-            c * (35396.0 +
-                 c * (83021.0 + c * (34096.0 + c * (2026.0 + c * 4.0))))) /
-           155925.0;
-  }
-  case 7: {
-    return (21844.0 +
-            c * (776661.0 +
-                 c * (2801040.0 +
-                      c * (2123860.0 +
-                           c * (349500.0 + c * (8166.0 + c * 4.0)))))) /
-           6081075.0;
-  }
-  default: {
-    throw std::logic_error("Invalid value cao=" + std::to_string(cao));
-  }
-  }
-}
 
 void P3MLocalMesh::calc_local_ca_mesh(P3MParameters const &params,
                                       LocalBox const &local_geo, double skin,
diff --git a/src/core/p3m/common.hpp b/src/core/p3m/common.hpp
index 38a7553428a..6bc30828ee5 100644
--- a/src/core/p3m/common.hpp
+++ b/src/core/p3m/common.hpp
@@ -38,6 +38,7 @@
 
 #include <utils/Vector.hpp>
 
+#include <algorithm>
 #include <array>
 #include <vector>
 
@@ -49,6 +50,7 @@ auto constexpr P3M_EPSILON_METALLIC = 0.0;
 #include "LocalBox.hpp"
 
 #include <array>
+#include <cstddef>
 #include <stdexcept>
 #include <vector>
 
@@ -212,7 +214,7 @@ struct P3MLocalMesh {
    */
   void recalc_ld_pos(P3MParameters const &params) {
     // spatial position of left down mesh point
-    for (unsigned int i = 0; i < 3; i++) {
+    for (auto i = 0u; i < 3u; i++) {
       ld_pos[i] = (ld_ind[i] + params.mesh_off[i]) * params.a[i];
     }
   }
@@ -226,16 +228,6 @@ struct P3MLocalMesh {
                           double space_layer);
 };
 
-/** One of the aliasing sums used to compute k-space errors.
- *  Fortunately the one which is most important (because it converges
- *  most slowly, since it is not damped exponentially) can be
- *  calculated analytically. The result (which depends on the order of
- *  the spline interpolation) can be written as an even trigonometric
- *  polynomial. The results are tabulated here (the employed formula
- *  is eq. (7.66) in @cite hockney88a).
- */
-double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao);
-
 #endif /* P3M || DP3M */
 
 namespace detail {
@@ -250,7 +242,7 @@ std::array<std::vector<int>, 3> inline calc_meshift(
     Utils::Vector3i const &mesh_size, bool zero_out_midpoint = false) {
   std::array<std::vector<int>, 3> ret{};
 
-  for (unsigned int i = 0; i < 3; i++) {
+  for (auto i = 0u; i < 3u; ++i) {
     ret[i] = std::vector<int>(static_cast<std::size_t>(mesh_size[i]));
 
     for (int j = 1; j <= mesh_size[i] / 2; j++) {
diff --git a/src/core/p3m/fft.cpp b/src/core/p3m/fft.cpp
index 57b56fbaa6c..a14c9a2d00b 100644
--- a/src/core/p3m/fft.cpp
+++ b/src/core/p3m/fft.cpp
@@ -84,7 +84,7 @@ using Utils::permute_ifield;
  */
 std::optional<std::vector<int>>
 find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2,
-                 std::span<const int> node_list1, std::span<int> node_list2,
+                 std::span<int const> node_list1, std::span<int> node_list2,
                  std::span<int> pos, std::span<int> my_pos, int rank) {
   int i;
   /* communication group cell size on grid1 and grid2 */
diff --git a/src/core/p3m/fft.hpp b/src/core/p3m/fft.hpp
index 58ea6c7e692..c87f11a14cb 100644
--- a/src/core/p3m/fft.hpp
+++ b/src/core/p3m/fft.hpp
@@ -33,12 +33,7 @@
  *  redistributed.
  *
  *  For simplicity at the moment I have implemented a full complex to
- *  complex FFT (even though a real to complex FFT would be
- *  sufficient)
- *
- *  \todo Combine the forward and backward structures.
- *  \todo The packing routines could be moved to utils.hpp when they are needed
- * elsewhere.
+ *  complex FFT (even though a real to complex FFT would be sufficient).
  *
  *  For more information about FFT usage, see \ref fft.cpp "fft.cpp".
  */
@@ -88,22 +83,24 @@ template <class T> struct fft_allocator {
 
 template <class T> using fft_vector = std::vector<T, fft_allocator<T>>;
 
-/** Structure for performing a 1D FFT.
- *
- *  This includes the information about the redistribution of the 3D
- *  FFT *grid before the actual FFT.
- */
-struct fft_forw_plan {
-  /** plan direction: 0 = Forward FFT, 1 = Backward FFT. */
+struct fft_plan {
+  /** plan direction: forward or backward FFT (enum value from FFTW). */
   int dir;
+  /** plan for the FFT. */
+  fftw_plan our_fftw_plan;
+  /** packing function for send blocks. */
+  void (*pack_function)(double const *const, double *const, int const *,
+                        int const *, int const *, int);
+};
+
+/** @brief Plan for a forward 1D FFT of a flattened 3D array. */
+struct fft_forw_plan : public fft_plan {
   /** row direction of that FFT. */
   int row_dir;
   /** permutations from normal coordinate system. */
   int n_permute;
   /** number of 1D FFTs. */
   int n_ffts;
-  /** plan for fft. */
-  fftw_plan our_fftw_plan;
 
   /** size of local mesh before communication. */
   int old_mesh[3];
@@ -117,9 +114,6 @@ struct fft_forw_plan {
   /** group of nodes which have to communicate with each other. */
   std::vector<int> group;
 
-  /** packing function for send blocks. */
-  void (*pack_function)(double const *const, double *const, int const *,
-                        int const *, int const *, int);
   /** Send block specification. 6 integers for each node: start[3], size[3]. */
   std::vector<int> send_block;
   /** Send block communication sizes. */
@@ -132,17 +126,8 @@ struct fft_forw_plan {
   int element;
 };
 
-/** Additional information for backwards FFT. */
-struct fft_back_plan {
-  /** plan direction. (e.g. fftw macro) */
-  int dir;
-  /** plan for fft. */
-  fftw_plan our_fftw_plan;
-
-  /** packing function for send blocks. */
-  void (*pack_function)(double const *const, double *const, int const *,
-                        int const *, int const *, int);
-};
+/** @brief Plan for a backward 1D FFT of a flattened 3D array. */
+struct fft_back_plan : public fft_plan {};
 
 /** Information about the three one dimensional FFTs and how the nodes
  *  have to communicate inbetween.
diff --git a/src/core/p3m/influence_function.hpp b/src/core/p3m/influence_function.hpp
index 9c322c23506..24853d43e25 100644
--- a/src/core/p3m/influence_function.hpp
+++ b/src/core/p3m/influence_function.hpp
@@ -16,15 +16,16 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_P3M_INFLUENCE_FUNCTION_HPP
-#define ESPRESSO_P3M_INFLUENCE_FUNCTION_HPP
+
+#pragma once
 
 #include "p3m/common.hpp"
+#include "p3m/for_each_3d.hpp"
+#include "p3m/math.hpp"
 
 #include <utils/Vector.hpp>
 #include <utils/index.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <cmath>
@@ -52,44 +53,43 @@
 template <std::size_t S, std::size_t m>
 double G_opt(int cao, double alpha, Utils::Vector3d const &k,
              Utils::Vector3d const &h) {
-  using namespace detail::FFT_indexing;
-  using Utils::int_pow;
-  using Utils::sinc;
-
-  auto constexpr two_pi = 2. * std::numbers::pi;
-  auto constexpr two_pi_i = 1. / two_pi;
-  auto constexpr limit = 30.;
 
   auto const k2 = k.norm2();
-  if (k2 == 0.0) {
-    return 0.0;
+  if (k2 == 0.) {
+    return 0.;
   }
 
-  double numerator = 0.0;
-  double denominator = 0.0;
-
-  for (int mx = -m; mx <= m; mx++) {
-    for (int my = -m; my <= m; my++) {
-      for (int mz = -m; mz <= m; mz++) {
-        auto const km =
-            k + two_pi * Utils::Vector3d{mx / h[RX], my / h[RY], mz / h[RZ]};
-        auto const U2 = std::pow(sinc(km[RX] * h[RX] * two_pi_i) *
-                                     sinc(km[RY] * h[RY] * two_pi_i) *
-                                     sinc(km[RZ] * h[RZ] * two_pi_i),
-                                 2 * cao);
-
+  auto constexpr limit = 30.;
+  auto constexpr m_start = Utils::Vector3i::broadcast(-m);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(m + 1);
+  auto const exponent_prefactor = Utils::sqr(1. / (2. * alpha));
+  auto const wavevector = (2. * std::numbers::pi) / h;
+  auto const wavevector_i = 1. / wavevector;
+  auto indices = Utils::Vector3i{};
+  auto km = Utils::Vector3d{};
+  auto fnm = Utils::Vector3d{};
+  auto numerator = 0.;
+  auto denominator = 0.;
+
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        using namespace detail::FFT_indexing;
+        auto const U2 = std::pow(Utils::product(fnm), 2 * cao);
         auto const km2 = km.norm2();
-        auto const exponent = Utils::sqr(1. / (2. * alpha)) * km2;
+        auto const exponent = exponent_prefactor * km2;
         if (exponent < limit) {
           auto const f3 = std::exp(-exponent) * (4. * std::numbers::pi / km2);
-          numerator += U2 * f3 * int_pow<S>(k * km);
+          numerator += U2 * f3 * Utils::int_pow<S>(k * km);
         }
         denominator += U2;
-      }
-    }
-  }
+      },
+      [&](unsigned dim, int n) {
+        km[dim] = k[dim] + n * wavevector[dim];
+        fnm[dim] = math::sinc(km[dim] * wavevector_i[dim]);
+      });
 
-  return numerator / (int_pow<S>(k2) * Utils::sqr(denominator));
+  return numerator / (Utils::int_pow<S>(k2) * Utils::sqr(denominator));
 }
 
 /**
@@ -104,20 +104,18 @@ double G_opt(int cao, double alpha, Utils::Vector3d const &k,
  *
  * @param params P3M parameters
  * @param n_start Lower left corner of the grid
- * @param n_end Upper right corner of the grid.
- * @param box_l Box size
+ * @param n_stop Upper right corner of the grid.
+ * @param inv_box_l Inverse box length
  * @return Values of G_opt at regular grid points.
  */
 template <std::size_t S, std::size_t m = 0>
 std::vector<double> grid_influence_function(const P3MParameters &params,
                                             const Utils::Vector3i &n_start,
-                                            const Utils::Vector3i &n_end,
-                                            const Utils::Vector3d &box_l) {
-  using namespace detail::FFT_indexing;
+                                            const Utils::Vector3i &n_stop,
+                                            const Utils::Vector3d &inv_box_l) {
 
   auto const shifts = detail::calc_meshift(params.mesh);
-
-  auto const size = n_end - n_start;
+  auto const size = n_stop - n_start;
 
   /* The influence function grid */
   auto g = std::vector<double>(Utils::product(size), 0.);
@@ -128,31 +126,24 @@ std::vector<double> grid_influence_function(const P3MParameters &params,
     return g;
   }
 
-  auto const h = Utils::Vector3d{params.a};
-
-  Utils::Vector3i n{};
-  for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) {
-    for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) {
-      for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) {
-        auto const ind = Utils::get_linear_index(n - n_start, size,
-                                                 Utils::MemoryOrder::ROW_MAJOR);
-        if ((n[KX] % (params.mesh[RX] / 2) == 0) &&
-            (n[KY] % (params.mesh[RY] / 2) == 0) &&
-            (n[KZ] % (params.mesh[RZ] / 2) == 0)) {
-          g[ind] = 0.0;
-        } else {
-          auto const k = 2. * std::numbers::pi *
-                         Utils::Vector3d{shifts[RX][n[KX]] / box_l[RX],
-                                         shifts[RY][n[KY]] / box_l[RY],
-                                         shifts[RZ][n[KZ]] / box_l[RZ]};
-
-          g[ind] = G_opt<S, m>(params.cao, params.alpha, k, h);
-        }
-      }
+  auto const wavevector = (2. * std::numbers::pi) * inv_box_l;
+  auto const half_mesh = params.mesh / 2;
+  auto indices = Utils::Vector3i{};
+  auto index = std::size_t(0u);
+
+  for_each_3d(n_start, n_stop, indices, [&]() {
+    using namespace detail::FFT_indexing;
+    if ((indices[KX] % half_mesh[RX] != 0) or
+        (indices[KY] % half_mesh[RY] != 0) or
+        (indices[KZ] % half_mesh[RZ] != 0)) {
+      auto const k =
+          Utils::Vector3d{{shifts[RX][indices[KX]] * wavevector[RX],
+                           shifts[RY][indices[KY]] * wavevector[RY],
+                           shifts[RZ][indices[KZ]] * wavevector[RZ]}};
+      g[index] = G_opt<S, m>(params.cao, params.alpha, k, params.a);
     }
-  }
+    ++index;
+  });
 
   return g;
 }
-
-#endif
diff --git a/src/core/p3m/influence_function_dipolar.hpp b/src/core/p3m/influence_function_dipolar.hpp
index e327846d20e..d52a1a6d675 100644
--- a/src/core/p3m/influence_function_dipolar.hpp
+++ b/src/core/p3m/influence_function_dipolar.hpp
@@ -18,19 +18,19 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_CORE_P3M_INFLUENCE_FUNCTION_DIPOLAR_HPP
-#define ESPRESSO_CORE_P3M_INFLUENCE_FUNCTION_DIPOLAR_HPP
+
+#pragma once
 
 #include "config/config.hpp"
 
 #if defined(DP3M)
 
 #include "p3m/common.hpp"
+#include "p3m/for_each_3d.hpp"
+#include "p3m/math.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/index.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <cmath>
@@ -54,40 +54,41 @@
 template <std::size_t S>
 double G_opt_dipolar(P3MParameters const &params, Utils::Vector3i const &shift,
                      Utils::Vector3i const &d_op) {
-  using Utils::int_pow;
-  using Utils::sinc;
+
   auto constexpr limit = P3M_BRILLOUIN;
   auto constexpr exp_limit = 30.;
-  auto const exponent = 2. * params.cao;
-
+  auto constexpr m_start = Utils::Vector3i::broadcast(-limit);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(limit + 1);
+  auto const cao = params.cao;
+  auto const mesh = params.mesh[0];
+  auto const offset =
+      static_cast<Utils::Vector3d>(shift) / static_cast<double>(mesh);
+  auto const f2 = Utils::sqr(std::numbers::pi / params.alpha_L);
+  auto indices = Utils::Vector3i{};
+  auto nm = Utils::Vector3i{};
+  auto fnm = Utils::Vector3d{};
   auto numerator = 0.;
   auto denominator = 0.;
 
-  auto const f1 = 1. / static_cast<double>(params.mesh[0]);
-  auto const f2 = Utils::sqr(std::numbers::pi / params.alpha_L);
-
-  for (int mx = -limit; mx <= limit; mx++) {
-    auto const nmx = shift[0] + params.mesh[0] * mx;
-    auto const sx = std::pow(sinc(f1 * nmx), exponent);
-    for (int my = -limit; my <= limit; my++) {
-      auto const nmy = shift[1] + params.mesh[0] * my;
-      auto const sy = sx * std::pow(sinc(f1 * nmy), exponent);
-      for (int mz = -limit; mz <= limit; mz++) {
-        auto const nmz = shift[2] + params.mesh[0] * mz;
-        auto const sz = sy * std::pow(sinc(f1 * nmz), exponent);
-        auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz);
-        auto const exp_term = f2 * nm2;
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        auto const norm_sq = nm.norm2();
+        auto const sz = std::pow(Utils::product(fnm), 2 * cao);
+        auto const exp_term = f2 * norm_sq;
         if (exp_term < exp_limit) {
-          auto const f3 = sz * std::exp(-exp_term) / nm2;
-          auto const n_nm = d_op[0] * nmx + d_op[1] * nmy + d_op[2] * nmz;
-          numerator += f3 * int_pow<S>(n_nm);
+          auto const f3 = sz * std::exp(-exp_term) / norm_sq;
+          numerator += f3 * Utils::int_pow<S>(d_op * nm);
         }
         denominator += sz;
-      }
-    }
-  }
-  return numerator / (int_pow<S>(static_cast<double>(d_op.norm2())) *
-                      Utils::sqr(denominator));
+      },
+      [&](unsigned dim, int n) {
+        nm[dim] = shift[dim] + n * mesh;
+        fnm[dim] = math::sinc(offset[dim] + n * mesh);
+      });
+
+  return numerator / (Utils::int_pow<S>(static_cast<double>(d_op.norm2())) *
+                      Utils::int_pow<2>(denominator));
 }
 
 /**
@@ -100,17 +101,17 @@ double G_opt_dipolar(P3MParameters const &params, Utils::Vector3i const &shift,
  *
  * @param params DP3M parameters
  * @param n_start Lower left corner of the grid
- * @param n_end Upper right corner of the grid.
- * @param box_l Box size
+ * @param n_stop Upper right corner of the grid.
+ * @param inv_box_l Inverse box length
  * @return Values of the influence function at regular grid points.
  */
 template <std::size_t S>
 std::vector<double> grid_influence_function(P3MParameters const &params,
                                             Utils::Vector3i const &n_start,
-                                            Utils::Vector3i const &n_end,
-                                            Utils::Vector3d const &box_l) {
+                                            Utils::Vector3i const &n_stop,
+                                            Utils::Vector3d const &inv_box_l) {
 
-  auto const size = n_end - n_start;
+  auto const size = n_stop - n_start;
 
   /* The influence function grid */
   auto g = std::vector<double>(Utils::product(size), 0.);
@@ -121,61 +122,56 @@ std::vector<double> grid_influence_function(P3MParameters const &params,
     return g;
   }
 
-  double fak1 = Utils::int_pow<3>(static_cast<double>(params.mesh[0])) * 2.0 /
-                Utils::sqr(box_l[0]);
-
-  auto const shifts = detail::calc_meshift(params.mesh, false);
-  auto const d_ops = detail::calc_meshift(params.mesh, true);
-
-  Utils::Vector3i n{};
-  for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) {
-    for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) {
-      for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) {
-        auto const ind = Utils::get_linear_index(n - n_start, size,
-                                                 Utils::MemoryOrder::ROW_MAJOR);
-
-        if (((n[0] % (params.mesh[0] / 2) == 0) &&
-             (n[1] % (params.mesh[0] / 2) == 0) &&
-             (n[2] % (params.mesh[0] / 2) == 0))) {
-          g[ind] = 0.0;
-        } else {
-          auto const shift = Utils::Vector3i{shifts[0][n[0]], shifts[0][n[1]],
-                                             shifts[0][n[2]]};
-          auto const d_op =
-              Utils::Vector3i{d_ops[0][n[0]], d_ops[0][n[1]], d_ops[0][n[2]]};
-          auto const fak2 = G_opt_dipolar<S>(params, shift, d_op);
-          g[ind] = fak1 * fak2;
+  auto prefactor = Utils::int_pow<3>(static_cast<double>(params.mesh[0])) * 2. *
+                   Utils::int_pow<2>(inv_box_l[0]);
+
+  auto const offset = detail::calc_meshift(params.mesh, false)[0];
+  auto const d_op = detail::calc_meshift(params.mesh, true)[0];
+  auto const half_mesh = params.mesh[0] / 2;
+  auto indices = Utils::Vector3i{};
+  auto shift_off = Utils::Vector3i{};
+  auto d_op_off = Utils::Vector3i{};
+  auto index = std::size_t(0u);
+
+  for_each_3d(
+      n_start, n_stop, indices,
+      [&]() {
+        if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or
+             (indices[2] % half_mesh != 0))) {
+          g[index] = prefactor * G_opt_dipolar<S>(params, shift_off, d_op_off);
         }
-      }
-    }
-  }
+        ++index;
+      },
+      [&](unsigned dim, int n) {
+        d_op_off[dim] = d_op[n];
+        shift_off[dim] = offset[n];
+      });
+
   return g;
 }
 
 inline double G_opt_dipolar_self_energy(P3MParameters const &params,
                                         Utils::Vector3i const &shift) {
-  using Utils::sinc;
+
   auto constexpr limit = P3M_BRILLOUIN + 1;
-  auto const exponent = 2. * params.cao;
-
-  auto u_sum = 0.0;
-
-  auto const f1 = 1.0 / static_cast<double>(params.mesh[0]);
-
-  for (int mx = -limit; mx <= limit; mx++) {
-    auto const nmx = shift[0] + params.mesh[0] * mx;
-    auto const sx = std::pow(sinc(f1 * nmx), exponent);
-    for (int my = -limit; my <= limit; my++) {
-      auto const nmy = shift[1] + params.mesh[0] * my;
-      auto const sy = sx * std::pow(sinc(f1 * nmy), exponent);
-      for (int mz = -limit; mz <= limit; mz++) {
-        auto const nmz = shift[2] + params.mesh[0] * mz;
-        auto const sz = sy * std::pow(sinc(f1 * nmz), exponent);
-        u_sum += sz;
-      }
-    }
-  }
-  return u_sum;
+  auto constexpr m_start = Utils::Vector3i::broadcast(-limit);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(limit + 1);
+  auto const cao = params.cao;
+  auto const mesh = params.mesh[0];
+  auto const offset =
+      static_cast<Utils::Vector3d>(shift) / static_cast<double>(mesh);
+  auto indices = Utils::Vector3i{};
+  auto fnm = Utils::Vector3d{};
+  auto energy = 0.;
+
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() { energy += std::pow(Utils::product(fnm), 2 * cao); },
+      [&](unsigned dim, int n) {
+        fnm[dim] = math::sinc(offset[dim] + n * mesh);
+      });
+
+  return energy;
 }
 
 /**
@@ -183,42 +179,39 @@ inline double G_opt_dipolar_self_energy(P3MParameters const &params,
  *
  * @param params DP3M parameters
  * @param n_start Lower left corner of the grid
- * @param n_end Upper right corner of the grid.
+ * @param n_stop Upper right corner of the grid.
  * @param g Energies on the grid.
  * @return Total self-energy.
  */
 inline double grid_influence_function_self_energy(
     P3MParameters const &params, Utils::Vector3i const &n_start,
-    Utils::Vector3i const &n_end, std::vector<double> const &g) {
-  auto const size = n_end - n_start;
-
-  auto const shifts = detail::calc_meshift(params.mesh, false);
-  auto const d_ops = detail::calc_meshift(params.mesh, true);
-
-  double energy = 0.0;
-  Utils::Vector3i n{};
-  for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) {
-    for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) {
-      for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) {
-        if (((n[0] % (params.mesh[0] / 2) == 0) &&
-             (n[1] % (params.mesh[0] / 2) == 0) &&
-             (n[2] % (params.mesh[0] / 2) == 0))) {
-          energy += 0.0;
-        } else {
-          auto const ind = Utils::get_linear_index(
-              n - n_start, size, Utils::MemoryOrder::ROW_MAJOR);
-          auto const shift = Utils::Vector3i{shifts[0][n[0]], shifts[0][n[1]],
-                                             shifts[0][n[2]]};
-          auto const d_op =
-              Utils::Vector3i{d_ops[0][n[0]], d_ops[0][n[1]], d_ops[0][n[2]]};
-          auto const U2 = G_opt_dipolar_self_energy(params, shift);
-          energy += g[ind] * U2 * d_op.norm2();
+    Utils::Vector3i const &n_stop, std::vector<double> const &g) {
+
+  auto const offset = detail::calc_meshift(params.mesh, false)[0];
+  auto const d_op = detail::calc_meshift(params.mesh, true)[0];
+  auto const half_mesh = params.mesh[0] / 2;
+  auto indices = Utils::Vector3i{};
+  auto shift_off = Utils::Vector3i{};
+  auto d_op_off = Utils::Vector3i{};
+  auto index = std::size_t(0u);
+  auto energy = 0.;
+
+  for_each_3d(
+      n_start, n_stop, indices,
+      [&]() {
+        if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or
+             (indices[2] % half_mesh != 0))) {
+          auto const U2 = G_opt_dipolar_self_energy(params, shift_off);
+          energy += g[index] * U2 * d_op_off.norm2();
         }
-      }
-    }
-  }
+        ++index;
+      },
+      [&](unsigned dim, int n) {
+        d_op_off[dim] = d_op[n];
+        shift_off[dim] = offset[n];
+      });
+
   return energy;
 }
 
-#endif
-#endif
+#endif // defined(DP3M)
diff --git a/src/core/unit_tests/CMakeLists.txt b/src/core/unit_tests/CMakeLists.txt
index d843da0a636..60fcadae764 100644
--- a/src/core/unit_tests/CMakeLists.txt
+++ b/src/core/unit_tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2010-2022 The ESPResSo project
+# Copyright (C) 2010-2024 The ESPResSo project
 # Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
 #   Max-Planck-Institute for Polymer Research, Theory Group
 #
@@ -94,6 +94,8 @@ if(ESPRESSO_BUILD_WITH_FFTW)
             espresso::core)
   unit_test(NAME fft_test SRC fft_test.cpp DEPENDS espresso::utils
             espresso::core)
+  unit_test(NAME math_test SRC math_test.cpp DEPENDS espresso::utils
+            espresso::core)
 endif()
 
 if(ESPRESSO_BUILD_WITH_CUDA)
diff --git a/src/core/unit_tests/cuda_test.cu b/src/core/unit_tests/cuda_test.cu
index f5057fe7a75..812638cfa96 100644
--- a/src/core/unit_tests/cuda_test.cu
+++ b/src/core/unit_tests/cuda_test.cu
@@ -17,10 +17,12 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#define BOOST_TEST_MODULE cuda test
+#define BOOST_TEST_MODULE "CUDA interface tests"
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
+#include "config/config.hpp"
+
 #include "cuda/init.hpp"
 #include "cuda/utils.cuh"
 #include "cuda/utils.hpp"
@@ -33,20 +35,22 @@
 #include <boost/test/unit_test.hpp>
 
 #include <cstddef>
+#include <initializer_list>
 #include <optional>
 #include <string>
 
 boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) {
+  bool has_compatible_device = false;
   int n_devices = 0;
   cudaGetDeviceCount(&n_devices);
   if (n_devices > 0) {
     cudaDeviceProp prop;
     cudaGetDeviceProperties(&prop, 0);
     if (prop.major >= 3) {
-      return true;
+      has_compatible_device = true;
     }
   }
-  return false;
+  return has_compatible_device;
 }
 
 std::optional<std::string> read_pending_cuda_errors() {
@@ -79,44 +83,43 @@ void clear() {
 
 } // namespace Testing::non_sticky_cuda_error
 
+#ifdef P3M
+dim3 p3m_make_grid(unsigned int n_blocks);
+#endif
+
 static auto fixture = boost::unit_test::fixture(&setup, &teardown);
 
 BOOST_AUTO_TEST_SUITE(suite, *boost::unit_test::precondition(has_gpu))
 
 BOOST_AUTO_TEST_CASE(gpu_fixture, *fixture) {
-  {
-    auto error = read_pending_cuda_errors();
-    BOOST_REQUIRE(not error.has_value());
-  }
-  {
-    // check we can raise and clear non-sticky CUDA errors
-    Testing::non_sticky_cuda_error::trigger();
-    Testing::non_sticky_cuda_error::clear();
-    auto error = read_pending_cuda_errors();
-    BOOST_REQUIRE(not error.has_value());
-  }
-  {
-    // check fixture can handle the default non-sticky CUDA error
-    Testing::non_sticky_cuda_error::trigger();
-    auto ref_what{"There is a pending CUDA error: \"invalid device ordinal\""};
-    auto error = read_pending_cuda_errors();
-    BOOST_REQUIRE(error.has_value());
-    BOOST_REQUIRE_EQUAL(error.value(), ref_what);
-    // sticky error should have been cleared
-    error = read_pending_cuda_errors();
-    BOOST_REQUIRE(not error.has_value());
-  }
-  {
-    // check fixture can handle a custom non-sticky CUDA error
-    cudaMallocHost(nullptr, std::size_t(0u));
-    auto ref_what{"There is a pending CUDA error: \"invalid argument\""};
-    auto error = read_pending_cuda_errors();
-    BOOST_REQUIRE(error.has_value());
-    BOOST_REQUIRE_EQUAL(error.value(), ref_what);
-    // sticky error should have been cleared
-    error = read_pending_cuda_errors();
-    BOOST_REQUIRE(not error.has_value());
-  }
+  auto error1 = read_pending_cuda_errors();
+  BOOST_REQUIRE(not error1.has_value());
+
+  // check we can raise and clear non-sticky CUDA errors
+  Testing::non_sticky_cuda_error::trigger();
+  Testing::non_sticky_cuda_error::clear();
+  auto error2 = read_pending_cuda_errors();
+  BOOST_REQUIRE(not error2.has_value());
+
+  // check fixture can handle the default non-sticky CUDA error
+  Testing::non_sticky_cuda_error::trigger();
+  auto ref_what3{"There is a pending CUDA error: \"invalid device ordinal\""};
+  auto error3 = read_pending_cuda_errors();
+  BOOST_REQUIRE(error3.has_value());
+  BOOST_REQUIRE_EQUAL(error3.value(), ref_what3);
+  // sticky error should have been cleared
+  error3 = read_pending_cuda_errors();
+  BOOST_REQUIRE(not error3.has_value());
+
+  // check fixture can handle a custom non-sticky CUDA error
+  cudaMallocHost(nullptr, std::size_t(0u));
+  auto ref_what4{"There is a pending CUDA error: \"invalid argument\""};
+  auto error4 = read_pending_cuda_errors();
+  BOOST_REQUIRE(error4.has_value());
+  BOOST_REQUIRE_EQUAL(error4.value(), ref_what4);
+  // sticky error should have been cleared
+  error4 = read_pending_cuda_errors();
+  BOOST_REQUIRE(not error4.has_value());
 }
 
 static int fatal_error_counter = 0;
@@ -125,127 +128,153 @@ static void increment_counter() noexcept { ++fatal_error_counter; }
 BOOST_AUTO_TEST_CASE(gpu_interface, *fixture) {
   fatal_error_counter = 0;
   auto local_error_counter = 0;
-  {
-    std::string const what = "message 1";
-    try {
-      throw cuda_fatal_error(what);
-    } catch (cuda_fatal_error &err) {
-      BOOST_CHECK_EQUAL(err.what(), what);
-      BOOST_CHECK_EQUAL(err.get_terminate(), &errexit);
-      err.set_terminate(nullptr);
-      BOOST_CHECK_EQUAL(err.get_terminate(), nullptr);
-      err.set_terminate(increment_counter);
-      BOOST_CHECK_EQUAL(err.get_terminate(), &increment_counter);
-      BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
-    }
-    ++local_error_counter;
-    BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+
+  try {
+    throw cuda_fatal_error("message");
+  } catch (cuda_fatal_error &err) {
+    std::string const what = "message";
+    BOOST_CHECK_EQUAL(err.what(), what);
+    BOOST_CHECK_EQUAL(err.get_terminate(), &errexit);
+    err.set_terminate(nullptr);
+    BOOST_CHECK_EQUAL(err.get_terminate(), nullptr);
+    err.set_terminate(increment_counter);
+    BOOST_CHECK_EQUAL(err.get_terminate(), &increment_counter);
+    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
   }
-  {
-    auto error_caught = false;
-    auto const block = dim3{1, 2, 3};
-    auto const grid = dim3{4, 5, 6};
-    // should not throw
-    cuda_check_errors_exit(block, grid, "", "", 0u);
-    try {
-      Testing::non_sticky_cuda_error::trigger();
-      // should clear the CUDA error flag and throw a fatal error
-      cuda_check_errors_exit(block, grid, "cudaSetDevice()", "filename.cu", 4u);
-    } catch (cuda_fatal_error &err) {
-      error_caught = true;
-      err.set_terminate(increment_counter);
-      std::string const what =
-          "CUDA error: \"invalid device ordinal\" while calling "
-          "cudaSetDevice() with block: <1,2,3>, grid: <4,5,6> in filename.cu:4";
-      BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
-      BOOST_CHECK_EQUAL(err.what(), what);
-      BOOST_CHECK_EQUAL(cudaGetLastError(), cudaSuccess);
-    }
-    ++local_error_counter;
-    BOOST_REQUIRE(error_caught);
-    BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+  ++local_error_counter;
+  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+
+  // -----------------------
+
+  auto error_caught = false;
+  auto const block = dim3{1, 2, 3};
+  auto const grid = dim3{4, 5, 6};
+  cuda_check_errors_exit(block, grid, "", "", 0u); // should not throw
+  try {
+    Testing::non_sticky_cuda_error::trigger();
+    // should clear the CUDA error flag and throw a fatal error
+    cuda_check_errors_exit(block, grid, "cudaSetDevice()", "filename.cu", 4u);
+  } catch (cuda_fatal_error &err) {
+    error_caught = true;
+    err.set_terminate(increment_counter);
+    std::string const what =
+        "CUDA error: \"invalid device ordinal\" while calling "
+        "cudaSetDevice() with block: <1,2,3>, grid: <4,5,6> in filename.cu:4";
+    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+    BOOST_CHECK_EQUAL(err.what(), what);
+    BOOST_CHECK_EQUAL(cudaGetLastError(), cudaSuccess);
   }
-  {
-    auto error_caught = false;
-    // should not throw
-    cuda_safe_mem_exit(cudaSuccess, "", 0u);
-    try {
-      Testing::non_sticky_cuda_error::trigger();
-      // should throw
-      cuda_safe_mem_exit(cudaSuccess, "filename.cu", 4u);
-    } catch (cuda_fatal_error &err) {
-      error_caught = true;
-      err.set_terminate(increment_counter);
-      std::string const what =
-          "CUDA error: \"invalid device ordinal\" in filename.cu:4. Error "
-          "found during memory operation. Possibly however from a failed "
-          "operation before the memory operation";
-      BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
-      BOOST_CHECK_EQUAL(err.what(), what);
-    }
-    ++local_error_counter;
-    BOOST_REQUIRE(error_caught);
-    BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+  ++local_error_counter;
+  BOOST_REQUIRE(error_caught);
+  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+
+  // -----------------------
+
+  error_caught = false;
+  cuda_safe_mem_exit(cudaSuccess, "", 0u); // should not throw
+  try {
+    Testing::non_sticky_cuda_error::trigger();
+    cuda_safe_mem_exit(cudaSuccess, "filename.cu", 4u); // should throw
+  } catch (cuda_fatal_error &err) {
+    error_caught = true;
+    err.set_terminate(increment_counter);
+    std::string const what =
+        "CUDA error: \"invalid device ordinal\" in filename.cu:4. Error "
+        "found during memory operation. Possibly however from a failed "
+        "operation before the memory operation";
+    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+    BOOST_CHECK_EQUAL(err.what(), what);
   }
-  {
-    auto error_caught = false;
-    try {
-      cuda_safe_mem_exit(cudaErrorNotPermitted, "filename.cu", 4u);
-    } catch (cuda_fatal_error &err) {
-      error_caught = true;
-      err.set_terminate(increment_counter);
-      std::string const what = "CUDA error: \"operation not permitted\" during "
-                               "memory operation in filename.cu:4";
-      BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
-      BOOST_CHECK_EQUAL(err.what(), what);
-    }
-    ++local_error_counter;
-    BOOST_REQUIRE(error_caught);
-    BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+  ++local_error_counter;
+  BOOST_REQUIRE(error_caught);
+  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+
+  // -----------------------
+
+  error_caught = false;
+  try {
+    cuda_safe_mem_exit(cudaErrorNotPermitted, "filename.cu", 4u);
+  } catch (cuda_fatal_error &err) {
+    error_caught = true;
+    err.set_terminate(increment_counter);
+    std::string const what = "CUDA error: \"operation not permitted\" during "
+                             "memory operation in filename.cu:4";
+    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+    BOOST_CHECK_EQUAL(err.what(), what);
   }
-  {
-    auto error_caught = false;
-    try {
-      cuda_safe_mem_exit(cudaErrorInvalidValue, "function_name()", 4u);
-    } catch (cuda_fatal_error &err) {
-      error_caught = true;
-      err.set_terminate(increment_counter);
-      std::string const what =
-          "CUDA error: \"invalid argument\" during memory operation in "
-          "function_name():4. You may have tried to allocate zero memory";
-      BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
-      BOOST_CHECK_EQUAL(err.what(), what);
-    }
-    ++local_error_counter;
-    BOOST_REQUIRE(error_caught);
-    BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+  ++local_error_counter;
+  BOOST_REQUIRE(error_caught);
+  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+
+  // -----------------------
+
+  error_caught = false;
+  try {
+    cuda_safe_mem_exit(cudaErrorInvalidValue, "function_name()", 4u);
+  } catch (cuda_fatal_error &err) {
+    error_caught = true;
+    err.set_terminate(increment_counter);
+    std::string const what =
+        "CUDA error: \"invalid argument\" during memory operation in "
+        "function_name():4. You may have tried to allocate zero memory";
+    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+    BOOST_CHECK_EQUAL(err.what(), what);
   }
-  {
-    BOOST_REQUIRE_EQUAL(stream[0], nullptr);
-    auto error_caught = false;
-    cuda_init(); // allocate
-    BOOST_REQUIRE_NE(stream[0], nullptr);
-    cuda_set_device(0); // reallocate, may or may not result in the same pointer
-    BOOST_REQUIRE_NE(stream[0], nullptr);
-    auto const old_stream = stream[0];
-    try {
-      cuda_set_device(-1); // fail to reallocate, pointer remains the same
-    } catch (cuda_runtime_error_cuda const &err) {
-      error_caught = true;
-      std::string const what = "CUDA error: invalid device ordinal";
-      BOOST_CHECK_EQUAL(err.what(), what);
-    }
-    BOOST_REQUIRE(error_caught);
-    BOOST_REQUIRE_EQUAL(stream[0], old_stream);
+  ++local_error_counter;
+  BOOST_REQUIRE(error_caught);
+  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+
+  // -----------------------
+
+  error_caught = false;
+  BOOST_REQUIRE_EQUAL(stream[0], nullptr);
+  cuda_init(); // allocate
+  BOOST_REQUIRE_NE(stream[0], nullptr);
+  cuda_set_device(0); // reallocate, may or may not result in the same pointer
+  BOOST_REQUIRE_NE(stream[0], nullptr);
+  auto const old_stream = stream[0];
+  try {
+    cuda_set_device(-1); // fail to reallocate, pointer remains the same
+  } catch (cuda_runtime_error_cuda const &err) {
+    error_caught = true;
+    std::string const what = "CUDA error: invalid device ordinal";
+    BOOST_CHECK_EQUAL(err.what(), what);
   }
-  {
-    BOOST_REQUIRE_GE(cuda_get_n_gpus(), 1);
-    char gpu_name_buffer[260] = {'\0'};
-    cuda_get_gpu_name(0, gpu_name_buffer);
-    for (int i = 255; i < 260; ++i) {
-      BOOST_REQUIRE_EQUAL(gpu_name_buffer[i], '\0');
-    }
+  BOOST_REQUIRE(error_caught);
+  BOOST_REQUIRE_EQUAL(stream[0], old_stream);
+
+  // -----------------------
+
+  BOOST_REQUIRE_GE(cuda_get_n_gpus(), 1);
+  char gpu_name_buffer[260] = {'\0'};
+  cuda_get_gpu_name(0, gpu_name_buffer);
+  for (int i = 255; i < 260; ++i) {
+    BOOST_REQUIRE_EQUAL(gpu_name_buffer[i], '\0');
+  }
+}
+
+#ifdef P3M
+
+BOOST_AUTO_TEST_CASE(p3m_reshape_grid_test, *fixture) {
+  auto constexpr optimal_size = 65536u;
+
+  for (auto cao = 1u; cao <= 3u; ++cao) {
+    auto const n_blocks = cao * optimal_size;
+    auto const grid = p3m_make_grid(n_blocks);
+    BOOST_CHECK_EQUAL(grid.x, optimal_size);
+    BOOST_CHECK_EQUAL(grid.y, cao);
+    BOOST_CHECK_EQUAL(grid.z, 1u);
+  }
+
+  for (auto mul : {2u, 3u, 6u, 12u}) {
+    auto const n_blocks = mul * optimal_size + 1u;
+    auto const grid = p3m_make_grid(n_blocks);
+    BOOST_CHECK_EQUAL(grid.x, n_blocks / (mul + 1u) + ((mul == 2u) ? 0u : 1u));
+    BOOST_CHECK_EQUAL(grid.y, (mul + 1u));
+    BOOST_CHECK_EQUAL(grid.z, 1u);
   }
 }
 
+#endif
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/core/unit_tests/fft_test.cpp b/src/core/unit_tests/fft_test.cpp
index 1506e01f20d..bb27335279b 100644
--- a/src/core/unit_tests/fft_test.cpp
+++ b/src/core/unit_tests/fft_test.cpp
@@ -17,11 +17,17 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#define BOOST_TEST_MODULE fft test
+#define BOOST_TEST_MODULE "FFT utility functions"
 #define BOOST_TEST_DYN_LINK
+
+#include "config/config.hpp"
+
+#if defined(P3M) || defined(DP3M)
+
 #include <boost/test/unit_test.hpp>
 
 #include "p3m/fft.hpp"
+#include "p3m/for_each_3d.hpp"
 
 #include <utils/Vector.hpp>
 
@@ -33,10 +39,9 @@
 #include <stdexcept>
 #include <vector>
 
-#if defined(P3M) || defined(DP3M)
 std::optional<std::vector<int>> find_comm_groups(Utils::Vector3i const &,
                                                  Utils::Vector3i const &,
-                                                 std::span<const int>,
+                                                 std::span<int const>,
                                                  std::span<int>, std::span<int>,
                                                  std::span<int>, int);
 
@@ -63,11 +68,50 @@ BOOST_AUTO_TEST_CASE(fft_find_comm_groups_mismatch) {
 }
 
 BOOST_AUTO_TEST_CASE(fft_exceptions) {
+  auto constexpr size_max = std::numeric_limits<std::size_t>::max();
+  auto constexpr bad_size = size_max / sizeof(int) + 1ul;
   fft_allocator<int> allocator{};
   BOOST_CHECK_EQUAL(allocator.allocate(0ul), nullptr);
-  BOOST_CHECK_THROW(
-      allocator.allocate(std::numeric_limits<std::size_t>::max() / sizeof(int) +
-                         1ul),
-      std::bad_array_new_length);
+  BOOST_CHECK_THROW(allocator.allocate(bad_size), std::bad_array_new_length);
+}
+
+BOOST_AUTO_TEST_CASE(for_each_3d_test) {
+  auto const m_start = Utils::Vector3i{{0, -1, 3}};
+  auto const m_stop = Utils::Vector3i{{2, 2, 5}};
+  auto ref_loop_counters = m_start;
+  auto indices = Utils::Vector3i{};
+
+  auto const kernel = [&]() {
+    BOOST_REQUIRE_EQUAL(indices, ref_loop_counters);
+    if (++ref_loop_counters[2u] == m_stop[2u]) {
+      ref_loop_counters[2u] = m_start[2u];
+      if (++ref_loop_counters[1u] == m_stop[1u]) {
+        ref_loop_counters[1u] = m_start[1u];
+        if (++ref_loop_counters[0u] == m_stop[0u]) {
+          ref_loop_counters[0u] = m_start[0u];
+        }
+      }
+    }
+  };
+
+  {
+    for_each_3d(m_start, m_stop, indices, kernel, [&](unsigned dim, int n) {
+      BOOST_REQUIRE_GE(dim, 0);
+      BOOST_REQUIRE_LE(dim, 2);
+      BOOST_REQUIRE_EQUAL(n, ref_loop_counters[dim]);
+    });
+
+    BOOST_REQUIRE_EQUAL(indices, m_stop);
+    BOOST_REQUIRE_EQUAL(ref_loop_counters, m_start);
+  }
+  {
+    for_each_3d(m_start, m_stop, indices, kernel);
+
+    BOOST_REQUIRE_EQUAL(indices, m_stop);
+    BOOST_REQUIRE_EQUAL(ref_loop_counters, m_start);
+  }
 }
+
+#else  // defined(P3M) || defined(DP3M)
+int main(int argc, char **argv) {}
 #endif // defined(P3M) || defined(DP3M)
diff --git a/src/core/unit_tests/p3m_test.cpp b/src/core/unit_tests/p3m_test.cpp
index 9a0ea01bee2..3f091d565ce 100644
--- a/src/core/unit_tests/p3m_test.cpp
+++ b/src/core/unit_tests/p3m_test.cpp
@@ -27,8 +27,6 @@
 
 #include <array>
 #include <cstddef>
-#include <limits>
-#include <stdexcept>
 #include <vector>
 
 BOOST_AUTO_TEST_CASE(calc_meshift_false) {
@@ -60,27 +58,3 @@ BOOST_AUTO_TEST_CASE(calc_meshift_true) {
     }
   }
 }
-
-#if defined(P3M) || defined(DP3M)
-BOOST_AUTO_TEST_CASE(analytic_cotangent_sum) {
-  auto constexpr kernel = p3m_analytic_cotangent_sum;
-  auto constexpr tol = 8. * 100. * std::numeric_limits<double>::epsilon();
-
-  // check only trivial cases
-  for (auto const cao : {1, 2, 3, 4, 5, 6, 7}) {
-    BOOST_CHECK_CLOSE(kernel(0, 0., cao), 1., tol);
-  }
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 1), 1., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 2), 1. / 3., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 3), 2. / 15., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 4), 17. / 315., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 5), 62. / 2835., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 6), 1382. / 155925., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 7), 21844. / 6081075., tol);
-
-  // check assertion
-  for (auto const invalid_cao : {-1, 0, 8}) {
-    BOOST_CHECK_THROW(kernel(1, 0., invalid_cao), std::logic_error);
-  }
-}
-#endif // defined(P3M) || defined(DP3M)
diff --git a/src/utils/include/utils/Vector.hpp b/src/utils/include/utils/Vector.hpp
index 6ecf0f4fc4a..f1361fa9d6e 100644
--- a/src/utils/include/utils/Vector.hpp
+++ b/src/utils/include/utils/Vector.hpp
@@ -80,7 +80,7 @@ template <typename T, std::size_t N> class Vector : public Array<T, N> {
 
 public:
   template <class Range>
-  explicit Vector(Range const &rng)
+  explicit constexpr Vector(Range const &rng)
       : Vector(std::begin(rng), std::end(rng)) {}
   explicit constexpr Vector(T const (&v)[N]) : Base() {
     copy_init(std::begin(v), std::end(v));
diff --git a/src/utils/include/utils/math/abs.hpp b/src/utils/include/utils/math/abs.hpp
deleted file mode 100644
index 4576f2c9964..00000000000
--- a/src/utils/include/utils/math/abs.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2010-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef ESPRESSO_ABS_HPP
-#define ESPRESSO_ABS_HPP
-
-#include "utils/device_qualifier.hpp"
-
-#ifndef __CUDACC__
-#include <cmath>
-#endif
-
-namespace Utils {
-/**
- * @brief Return the absolute value of x.
- */
-inline DEVICE_QUALIFIER double abs(double x) { return fabs(x); }
-
-/**
- * @brief Return the absolute value of x.
- */
-inline DEVICE_QUALIFIER float abs(float x) { return fabsf(x); }
-} // namespace Utils
-
-#endif // ESPRESSO_DEVICE_MATH_HPP
diff --git a/src/utils/include/utils/math/sinc.hpp b/src/utils/include/utils/math/sinc.hpp
deleted file mode 100644
index 0a2cf4f47c7..00000000000
--- a/src/utils/include/utils/math/sinc.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2010-2022 The ESPResSo project
- * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
- *   Max-Planck-Institute for Polymer Research, Theory Group
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "utils/device_qualifier.hpp"
-#include "utils/math/abs.hpp"
-
-#include <cmath>
-#include <numbers>
-
-namespace Utils {
-/**
- * @brief Calculates the sinc-function as sin(PI*x)/(PI*x).
- *
- * (same convention as in @cite hockney88a). In order to avoid
- * divisions by 0, arguments, whose modulus is smaller than epsi, will
- * be evaluated by an 8th order Taylor expansion of the sinc
- * function. Note that the difference between sinc(x) and this
- * expansion is smaller than 0.235e-12, if x is smaller than 0.1. (The
- * next term in the expansion is the 10th order contribution
- * PI^10/39916800 * x^10 = 0.2346...*x^12). This expansion should
- * also save time, since it reduces the number of function calls to
- * sin().
- */
-template <typename T> DEVICE_QUALIFIER T sinc(T d) {
-  const constexpr T epsi = T(0.1);
-
-  const auto PId = std::numbers::pi_v<T> * d;
-
-  if (::Utils::abs(d) > epsi)
-    return sin(PId) / PId;
-
-  /* Coefficients of the Taylor expansion of sinc */
-  const constexpr T c2 = T(-0.1666666666667e-0);
-  const constexpr T c4 = T(0.8333333333333e-2);
-  const constexpr T c6 = T(-0.1984126984127e-3);
-  const constexpr T c8 = T(0.2755731922399e-5);
-
-  const auto PId2 = PId * PId;
-  return T(1) + PId2 * (c2 + PId2 * (c4 + PId2 * (c6 + PId2 * c8)));
-}
-} // namespace Utils
diff --git a/src/utils/tests/CMakeLists.txt b/src/utils/tests/CMakeLists.txt
index 2dd87d9f775..89755749fab 100644
--- a/src/utils/tests/CMakeLists.txt
+++ b/src/utils/tests/CMakeLists.txt
@@ -21,7 +21,6 @@
 
 include(unit_test)
 
-unit_test(NAME abs_test SRC abs_test.cpp DEPENDS espresso::utils)
 unit_test(NAME Vector_test SRC Vector_test.cpp DEPENDS espresso::utils)
 unit_test(NAME Factory_test SRC Factory_test.cpp DEPENDS espresso::utils)
 unit_test(NAME NumeratedContainer_test SRC NumeratedContainer_test.cpp DEPENDS
@@ -34,7 +33,6 @@ unit_test(NAME accumulator SRC accumulator.cpp DEPENDS espresso::utils
 unit_test(NAME int_pow SRC int_pow_test.cpp DEPENDS espresso::utils)
 unit_test(NAME sgn SRC sgn_test.cpp DEPENDS espresso::utils)
 unit_test(NAME AS_erfc_part SRC AS_erfc_part_test.cpp DEPENDS espresso::utils)
-unit_test(NAME sinc SRC sinc_test.cpp DEPENDS espresso::utils)
 unit_test(NAME permute_ifield_test SRC permute_ifield_test.cpp DEPENDS
           espresso::utils)
 unit_test(NAME vec_rotate SRC vec_rotate_test.cpp DEPENDS espresso::utils)
diff --git a/src/utils/tests/abs_test.cpp b/src/utils/tests/abs_test.cpp
deleted file mode 100644
index 94e43728e7b..00000000000
--- a/src/utils/tests/abs_test.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2019-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#define BOOST_TEST_MODULE abs test
-#define BOOST_TEST_DYN_LINK
-#include <boost/test/unit_test.hpp>
-
-#include <utils/math/abs.hpp>
-
-#include <cmath>
-#include <type_traits>
-
-BOOST_AUTO_TEST_CASE(abs_test) {
-  using Utils::abs;
-
-  static_assert(std::is_same_v<float, decltype(abs(1.f))>);
-  static_assert(std::is_same_v<double, decltype(abs(1.))>);
-
-  BOOST_CHECK_EQUAL(std::abs(3.1415), abs(3.1415));
-  BOOST_CHECK_EQUAL(std::abs(-3.1415), abs(-3.1415));
-  BOOST_CHECK_EQUAL(std::abs(3.1415f), abs(3.1415f));
-  BOOST_CHECK_EQUAL(std::abs(-3.1415f), abs(-3.1415f));
-}
diff --git a/src/utils/tests/sinc_test.cpp b/src/utils/tests/sinc_test.cpp
deleted file mode 100644
index 063120ca971..00000000000
--- a/src/utils/tests/sinc_test.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2017-2022 The ESPResSo project
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#define BOOST_TEST_MODULE Utils::sinc test
-#define BOOST_TEST_DYN_LINK
-#include <boost/test/unit_test.hpp>
-
-#include "utils/math/sinc.hpp"
-
-#include <cmath>
-#include <cstdlib>
-#include <numbers>
-
-BOOST_AUTO_TEST_CASE(zero) { BOOST_CHECK_EQUAL(Utils::sinc(0.0), 1.0); }
-
-BOOST_AUTO_TEST_CASE(approx) {
-  auto x = 0.001;
-  while (x <= 0.11) {
-    auto const approx = Utils::sinc(x);
-    auto const pi_x = std::numbers::pi * x;
-    auto const exact = std::sin(pi_x) / (pi_x);
-    BOOST_CHECK_SMALL(approx - exact, 1e-13);
-    x += 0.01;
-  }
-}