From d4e34a6f9c2787a4ef8e3e4446e5829b56e47595 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= <jgrad@icp.uni-stuttgart.de>
Date: Fri, 14 Jun 2024 18:47:58 +0200
Subject: [PATCH] revert_to_working

---
 .gitlab-ci.yml                                |  76 ----
 CMakeLists.txt                                |   9 -
 .../code_generation_context.py                |   1 +
 .../walberla_kernels/generate_lb_kernels.py   |   4 +-
 maintainer/walberla_kernels/lbmpy_espresso.py |  10 +-
 src/core/CMakeLists.txt                       |   4 +-
 src/core/electrostatics/p3m.cpp               | 245 ++++++------
 src/core/electrostatics/p3m_gpu_cuda.cu       |  94 +++--
 src/core/electrostatics/p3m_gpu_error_cuda.cu |  55 ++-
 src/core/magnetostatics/dp3m.cpp              | 360 ++++++++++--------
 src/core/p3m/common.cpp                       |  44 +++
 src/core/p3m/common.hpp                       |  16 +-
 src/core/p3m/fft.cpp                          |   2 +-
 src/core/p3m/fft.hpp                          |  43 ++-
 src/core/p3m/for_each_3d.hpp                  |  70 ----
 src/core/p3m/influence_function.hpp           | 115 +++---
 src/core/p3m/influence_function_dipolar.hpp   | 219 +++++------
 src/core/p3m/math.hpp                         | 151 --------
 src/core/unit_tests/CMakeLists.txt            |   4 +-
 src/core/unit_tests/cuda_test.cu              | 329 ++++++++--------
 src/core/unit_tests/fft_test.cpp              |  58 +--
 src/core/unit_tests/math_test.cpp             | 105 -----
 src/core/unit_tests/p3m_test.cpp              |  30 +-
 src/shapes/include/shapes/Ellipsoid.hpp       |   2 +-
 src/shapes/src/HollowConicalFrustum.cpp       |   7 +-
 src/utils/include/utils/Vector.hpp            |  60 +--
 .../cylindrical_transformation_parameters.hpp |  23 +-
 src/utils/include/utils/math/sqr.hpp          |   2 -
 src/utils/include/utils/quaternion.hpp        |   1 -
 src/utils/tests/CMakeLists.txt                |   2 +
 src/utils/tests/Vector_test.cpp               |  25 +-
 .../src/lattice_boltzmann/LBWalberlaImpl.hpp  |   6 +-
 .../CollideSweepDoublePrecisionLeesEdwards.h  |   2 +-
 ...ollideSweepDoublePrecisionLeesEdwardsAVX.h |   2 +-
 ...llideSweepDoublePrecisionLeesEdwardsCUDA.h |   2 +-
 .../CollideSweepDoublePrecisionThermalized.h  |   2 +-
 ...ollideSweepDoublePrecisionThermalizedAVX.h |   2 +-
 ...llideSweepDoublePrecisionThermalizedCUDA.h |   2 +-
 .../CollideSweepSinglePrecisionLeesEdwards.h  |   2 +-
 ...ollideSweepSinglePrecisionLeesEdwardsAVX.h |   2 +-
 ...llideSweepSinglePrecisionLeesEdwardsCUDA.h |   2 +-
 .../CollideSweepSinglePrecisionThermalized.h  |   2 +-
 ...ollideSweepSinglePrecisionThermalizedAVX.h |   2 +-
 ...llideSweepSinglePrecisionThermalizedCUDA.h |   2 +-
 .../InitialPDFsSetterDoublePrecision.h        |   2 +-
 .../InitialPDFsSetterDoublePrecisionCUDA.h    |   2 +-
 .../InitialPDFsSetterSinglePrecision.h        |   2 +-
 .../InitialPDFsSetterSinglePrecisionCUDA.h    |   2 +-
 .../StreamSweepDoublePrecision.h              |   2 +-
 .../StreamSweepDoublePrecisionAVX.h           |   2 +-
 .../StreamSweepDoublePrecisionCUDA.h          |   2 +-
 .../StreamSweepSinglePrecision.h              |   2 +-
 .../StreamSweepSinglePrecisionAVX.h           |   2 +-
 .../StreamSweepSinglePrecisionCUDA.h          |   2 +-
 testsuite/python/lb.py                        |   4 +-
 testsuite/python/lb_boundary_ghost_layer.py   |   2 +-
 56 files changed, 948 insertions(+), 1274 deletions(-)
 delete mode 100644 src/core/p3m/for_each_3d.hpp
 delete mode 100644 src/core/p3m/math.hpp
 delete mode 100644 src/core/unit_tests/math_test.cpp

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d891c721212..cfff83fb571 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -41,40 +41,6 @@ status_pending:
   stage: prepare
   script: sh maintainer/gh_post_status.sh pending
 
-style:
-  <<: *global_job_definition
-  stage: build
-  dependencies: []
-  before_script:
-    - git config --global --add safe.directory ${CI_PROJECT_DIR}
-    - git submodule deinit .
-  script:
-    - sh maintainer/CI/fix_style.sh
-  tags:
-    - espresso
-    - no-cuda
-  variables:
-    GIT_SUBMODULE_STRATEGY: none
-  artifacts:
-    paths:
-    - style.patch
-    expire_in: 1 week
-    when: on_failure
-
-style_doxygen:
-  <<: *global_job_definition
-  stage: build
-  dependencies: []
-  script:
-    - mkdir build
-    - cd build
-    - cp ../maintainer/configs/maxset.hpp myconfig.hpp
-    - cmake .. -D ESPRESSO_BUILD_WITH_CUDA=ON -D ESPRESSO_BUILD_WITH_GSL=ON -D ESPRESSO_BUILD_WITH_HDF5=ON -D ESPRESSO_BUILD_WITH_SCAFACOS=ON -D ESPRESSO_BUILD_WITH_WALBERLA=ON -D ESPRESSO_BUILD_WITH_WALBERLA_FFT=ON -D ESPRESSO_BUILD_WITH_STOKESIAN_DYNAMICS=ON -D ESPRESSO_BUILD_WITH_CALIPER=ON
-    - sh ../maintainer/CI/dox_warnings.sh
-  tags:
-    - espresso
-    - no-cuda
-
 ### Builds without CUDA
 
 default:
@@ -436,29 +402,6 @@ empty:
     - cuda
     - numa
 
-check_sphinx:
-  <<: *global_job_definition
-  stage: additional_checks
-  needs:
-    - cuda12-maxset
-  when: on_success
-  script:
-    - cd ${CI_PROJECT_DIR}/build
-    - make -t && make sphinx
-    - make -j2 tutorials
-    - make check_utils
-    - bash ${CI_PROJECT_DIR}/maintainer/CI/doc_warnings.sh
-    - python3 ${CI_PROJECT_DIR}/maintainer/CI/jupyter_warnings.py
-  artifacts:
-    paths:
-    - build/doc/sphinx
-    expire_in: 1 week
-  tags:
-    - espresso
-    - cuda
-    - numa
-    - reuse-artifacts-same-arch
-
 run_tutorials:
   <<: *global_job_definition
   stage: additional_checks
@@ -487,25 +430,6 @@ run_tutorials:
   only:
     - schedules
 
-run_doxygen:
-  <<: *global_job_definition
-  stage: additional_checks
-  needs:
-    - cuda12-maxset
-  when: on_success
-  script:
-    - cd ${CI_PROJECT_DIR}/build
-    - make -t && make doxygen
-  artifacts:
-    paths:
-    - build/doc/doxygen
-    expire_in: 1 week
-  tags:
-    - espresso
-    - no-cuda
-    - numa
-    - reuse-artifacts-same-arch
-
 maxset_no_gpu:
   <<: *global_job_definition
   stage: additional_checks
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6f084f6fc80..cd9c42814c7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -523,25 +523,16 @@ if(ESPRESSO_BUILD_WITH_ASAN)
   target_compile_options(espresso_cpp_flags INTERFACE -fsanitize=address
                                                       -fno-omit-frame-pointer)
   target_link_libraries(espresso_cpp_flags INTERFACE -fsanitize=address)
-  if(ESPRESSO_BUILD_WITH_CUDA)
-    target_link_libraries(espresso_cuda_flags INTERFACE -fsanitize=address)
-  endif()
 endif()
 if(ESPRESSO_BUILD_WITH_MSAN)
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g -O1")
   target_compile_options(espresso_cpp_flags INTERFACE -fsanitize=memory
                                                       -fno-omit-frame-pointer)
   target_link_libraries(espresso_cpp_flags INTERFACE -fsanitize=memory)
-  if(ESPRESSO_BUILD_WITH_CUDA)
-    target_link_libraries(espresso_cuda_flags INTERFACE -fsanitize=memory)
-  endif()
 endif()
 if(ESPRESSO_BUILD_WITH_UBSAN)
   target_compile_options(espresso_cpp_flags INTERFACE -fsanitize=undefined)
   target_link_libraries(espresso_cpp_flags INTERFACE -fsanitize=undefined)
-  if(ESPRESSO_BUILD_WITH_CUDA)
-    target_link_libraries(espresso_cuda_flags INTERFACE -fsanitize=undefined)
-  endif()
 endif()
 
 target_link_libraries(espresso_cpp_flags INTERFACE espresso::coverage_flags)
diff --git a/maintainer/walberla_kernels/code_generation_context.py b/maintainer/walberla_kernels/code_generation_context.py
index 77f86183e0e..153822b0144 100644
--- a/maintainer/walberla_kernels/code_generation_context.py
+++ b/maintainer/walberla_kernels/code_generation_context.py
@@ -26,6 +26,7 @@
 import pystencils_walberla
 
 
+
 def earmark_generated_kernels():
     """
     Add an earmark at the beginning of generated kernels to document the
diff --git a/maintainer/walberla_kernels/generate_lb_kernels.py b/maintainer/walberla_kernels/generate_lb_kernels.py
index 9afd75925c1..4a7cbc9f402 100644
--- a/maintainer/walberla_kernels/generate_lb_kernels.py
+++ b/maintainer/walberla_kernels/generate_lb_kernels.py
@@ -149,9 +149,7 @@ def paramlist(parameters, keys):
             params
         )
 
-    block_offsets = tuple(
-        ps.TypedSymbol(f"block_offset_{i}", np.uint32)
-        for i in range(3))
+    block_offsets = tuple(ps.TypedSymbol(f"block_offset_{i}", np.uint32) for i in range(3))
 
     # generate thermalized LB
     collision_rule_thermalized = lbmpy.creationfunctions.create_lb_collision_rule(
diff --git a/maintainer/walberla_kernels/lbmpy_espresso.py b/maintainer/walberla_kernels/lbmpy_espresso.py
index 8a755d347b0..6142713f5fa 100644
--- a/maintainer/walberla_kernels/lbmpy_espresso.py
+++ b/maintainer/walberla_kernels/lbmpy_espresso.py
@@ -46,11 +46,11 @@ def data_initialisation(self, direction):
         cell_args = [f"it.{direction}() + {bb_vec[i]}".replace('+ -', '-')
                      for i, direction in enumerate("xyz")]
         code = [
-            "auto const InitialisationAdditionalData = elementInitialiser(",
-            f"Cell({', '.join(cell_args)}), blocks, *block);",
-            "element.vel_0 = InitialisationAdditionalData[0];",
-            "element.vel_1 = InitialisationAdditionalData[1];",
-            "element.vel_2 = InitialisationAdditionalData[2];",
+          "auto const InitialisationAdditionalData = elementInitialiser(",
+          f"Cell({', '.join(cell_args)}), blocks, *block);",
+          "element.vel_0 = InitialisationAdditionalData[0];",
+          "element.vel_1 = InitialisationAdditionalData[1];",
+          "element.vel_2 = InitialisationAdditionalData[2];",
         ]
         return "\n".join(code)
 
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 6153849cc7a..7335db2a010 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -82,9 +82,9 @@ install(TARGETS espresso_core
 
 target_link_libraries(
   espresso_core PRIVATE espresso::config espresso::utils::mpi espresso::shapes
-                        espresso::cpp_flags
+                        espresso::profiler espresso::cpp_flags
   PUBLIC espresso::utils MPI::MPI_CXX Random123 espresso::particle_observables
-         Boost::serialization Boost::mpi espresso::profiler)
+         Boost::serialization Boost::mpi)
 
 target_include_directories(espresso_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 
diff --git a/src/core/electrostatics/p3m.cpp b/src/core/electrostatics/p3m.cpp
index 32667d67966..e247b88418d 100644
--- a/src/core/electrostatics/p3m.cpp
+++ b/src/core/electrostatics/p3m.cpp
@@ -36,9 +36,7 @@
 #include "p3m/TuningAlgorithm.hpp"
 #include "p3m/TuningLogger.hpp"
 #include "p3m/fft.hpp"
-#include "p3m/for_each_3d.hpp"
 #include "p3m/influence_function.hpp"
-#include "p3m/math.hpp"
 
 #include "BoxGeometry.hpp"
 #include "LocalBox.hpp"
@@ -59,6 +57,7 @@
 #include <utils/Vector.hpp>
 #include <utils/integral_parameter.hpp>
 #include <utils/math/int_pow.hpp>
+#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
@@ -76,11 +75,9 @@
 #include <functional>
 #include <numbers>
 #include <optional>
-#include <span>
 #include <sstream>
 #include <stdexcept>
 #include <string>
-#include <tuple>
 #include <utility>
 
 void CoulombP3M::count_charged_particles() {
@@ -117,7 +114,7 @@ void CoulombP3M::calc_influence_function_force() {
   auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh};
 
   p3m.g_force = grid_influence_function<1>(p3m.params, start, start + size,
-                                           get_system().box_geo->length_inv());
+                                           get_system().box_geo->length());
 }
 
 /** Calculate the influence function optimized for the energy and the
@@ -128,38 +125,41 @@ void CoulombP3M::calc_influence_function_energy() {
   auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh};
 
   p3m.g_energy = grid_influence_function<0>(p3m.params, start, start + size,
-                                            get_system().box_geo->length_inv());
+                                            get_system().box_geo->length());
 }
 
 /** Aliasing sum used by @ref p3m_k_space_error. */
-static auto p3m_tune_aliasing_sums(Utils::Vector3i const &shift,
+static auto p3m_tune_aliasing_sums(int nx, int ny, int nz,
                                    Utils::Vector3i const &mesh,
                                    Utils::Vector3d const &mesh_i, int cao,
                                    double alpha_L_i) {
+  using Utils::sinc;
 
-  auto constexpr m_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
-  auto constexpr m_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
   auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
+
   auto alias1 = 0.;
   auto alias2 = 0.;
-
-  Utils::Vector3i indices{};
-  Utils::Vector3i nm{};
-  Utils::Vector3d fnm{};
-  for_each_3d(
-      m_start, m_stop, indices,
-      [&]() {
-        auto const norm_sq = nm.norm2();
-        auto const ex = exp(-factor1 * norm_sq);
-        auto const energy = std::pow(Utils::product(fnm), 2 * cao);
-        alias1 += Utils::sqr(ex) / norm_sq;
-        alias2 += energy * ex * (shift * nm) / norm_sq;
-      },
-      [&](unsigned dim, int n) {
-        nm[dim] = shift[dim] + n * mesh[dim];
-        fnm[dim] = math::sinc(nm[dim] * mesh_i[dim]);
-      });
-
+  for (int mx = -P3M_BRILLOUIN; mx <= P3M_BRILLOUIN; mx++) {
+    auto const nmx = nx + mx * mesh[0];
+    auto const fnmx = mesh_i[0] * nmx;
+    for (int my = -P3M_BRILLOUIN; my <= P3M_BRILLOUIN; my++) {
+      auto const nmy = ny + my * mesh[1];
+      auto const fnmy = mesh_i[1] * nmy;
+      for (int mz = -P3M_BRILLOUIN; mz <= P3M_BRILLOUIN; mz++) {
+        auto const nmz = nz + mz * mesh[2];
+        auto const fnmz = mesh_i[2] * nmz;
+
+        auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz);
+        auto const ex = exp(-factor1 * nm2);
+        auto const ex2 = Utils::sqr(ex);
+
+        auto const U2 = pow(sinc(fnmx) * sinc(fnmy) * sinc(fnmz), 2.0 * cao);
+
+        alias1 += ex2 / nm2;
+        alias2 += U2 * ex * (nx * nmx + ny * nmy + nz * nmz) / nm2;
+      }
+    }
+  }
   return std::make_pair(alias1, alias2);
 }
 
@@ -197,40 +197,51 @@ static double p3m_real_space_error(double pref, double r_cut_iL, int n_c_part,
 static double p3m_k_space_error(double pref, Utils::Vector3i const &mesh,
                                 int cao, int n_c_part, double sum_q2,
                                 double alpha_L, Utils::Vector3d const &box_l) {
-
-  auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao);
-  auto const mesh_i = 1. / Utils::Vector3d(mesh);
+  auto const mesh_i =
+      Utils::hadamard_division(Utils::Vector3d::broadcast(1.), mesh);
   auto const alpha_L_i = 1. / alpha_L;
-  auto const m_stop = mesh / 2;
-  auto const m_start = -m_stop;
-  auto indices = Utils::Vector3i{};
-  auto values = Utils::Vector3d{};
   auto he_q = 0.;
 
-  for_each_3d(
-      m_start, m_stop, indices,
-      [&]() {
-        if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
-          auto const n2 = indices.norm2();
-          auto const cs = Utils::product(values);
+  for (int nx = -mesh[0] / 2; nx < mesh[0] / 2; nx++) {
+    auto const ctan_x = p3m_analytic_cotangent_sum(nx, mesh_i[0], cao);
+    for (int ny = -mesh[1] / 2; ny < mesh[1] / 2; ny++) {
+      auto const ctan_y =
+          ctan_x * p3m_analytic_cotangent_sum(ny, mesh_i[1], cao);
+      for (int nz = -mesh[2] / 2; nz < mesh[2] / 2; nz++) {
+        if ((nx != 0) || (ny != 0) || (nz != 0)) {
+          auto const n2 = Utils::sqr(nx) + Utils::sqr(ny) + Utils::sqr(nz);
+          auto const cs =
+              p3m_analytic_cotangent_sum(nz, mesh_i[2], cao) * ctan_y;
           auto const [alias1, alias2] =
-              p3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i);
+              p3m_tune_aliasing_sums(nx, ny, nz, mesh, mesh_i, cao, alpha_L_i);
           auto const d = alias1 - Utils::sqr(alias2 / cs) / n2;
           /* at high precision, d can become negative due to extinction;
              also, don't take values that have no significant digits left*/
-          if (d > 0. and std::fabs(d / alias1) > ROUND_ERROR_PREC) {
+          if (d > 0 && (fabs(d / alias1) > ROUND_ERROR_PREC))
             he_q += d;
-          }
         }
-      },
-      [&values, &mesh_i, cotangent_sum](unsigned dim, int n) {
-        values[dim] = cotangent_sum(n, mesh_i[dim]);
-      });
-
+      }
+    }
+  }
   return 2. * pref * sum_q2 * sqrt(he_q / static_cast<double>(n_c_part)) /
          (box_l[1] * box_l[2]);
 }
 
+#ifdef CUDA
+static double p3mgpu_k_space_error(double prefactor,
+                                   Utils::Vector3i const &mesh, int cao,
+                                   int npart, double sum_q2, double alpha_L,
+                                   Utils::Vector3d const &box_l) {
+  auto ks_err = 0.;
+  if (this_node == 0) {
+    ks_err = p3m_k_space_error_gpu(prefactor, mesh.data(), cao, npart, sum_q2,
+                                   alpha_L, box_l.data());
+  }
+  boost::mpi::broadcast(comm_cart, ks_err, 0);
+  return ks_err;
+}
+#endif
+
 void CoulombP3M::init() {
   assert(p3m.params.mesh >= Utils::Vector3i::broadcast(1));
   assert(p3m.params.cao >= 1 and p3m.params.cao <= 7);
@@ -410,44 +421,48 @@ CoulombP3M::long_range_pressure(ParticleRange const &particles) {
     p3m.sm.gather_grid(p3m.rs_mesh.data(), comm_cart, p3m.local_mesh.dim);
     fft_perform_forw(p3m.rs_mesh.data(), p3m.fft, comm_cart);
 
-    auto constexpr m_start = Utils::Vector3i::broadcast(0);
-    auto const half_alpha_inv_sq = Utils::sqr(1. / 2. / p3m.params.alpha);
-    auto const m_stop = std::span(p3m.fft.plan[3].new_mesh);
-    auto const offset = std::span(p3m.fft.plan[3].start);
-    auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv();
-    auto indices = Utils::Vector3i{};
-    auto index = std::size_t(0u);
     auto diagonal = 0.;
-
-    for_each_3d(m_start, m_stop, indices, [&]() {
-      auto const kx = p3m.d_op[RX][indices[KX] + offset[KX]] * wavevector[RX];
-      auto const ky = p3m.d_op[RY][indices[KY] + offset[KY]] * wavevector[RY];
-      auto const kz = p3m.d_op[RZ][indices[KZ] + offset[KZ]] * wavevector[RZ];
-      auto const norm_sq = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz);
-
-      if (norm_sq != 0.) {
-        auto const node_k_space_energy =
-            p3m.g_energy[index] * (Utils::sqr(p3m.rs_mesh[2u * index + 0u]) +
-                                   Utils::sqr(p3m.rs_mesh[2u * index + 1u]));
-        auto const vterm = -2. * (1. / norm_sq + half_alpha_inv_sq);
-        auto const pref = node_k_space_energy * vterm;
-        node_k_space_pressure_tensor[0u] += pref * kx * kx; /* sigma_xx */
-        node_k_space_pressure_tensor[1u] += pref * kx * ky; /* sigma_xy */
-        node_k_space_pressure_tensor[2u] += pref * kx * kz; /* sigma_xz */
-        node_k_space_pressure_tensor[4u] += pref * ky * ky; /* sigma_yy */
-        node_k_space_pressure_tensor[5u] += pref * ky * kz; /* sigma_yz */
-        node_k_space_pressure_tensor[8u] += pref * kz * kz; /* sigma_zz */
-        diagonal += node_k_space_energy;
+    int ind = 0;
+    int j[3];
+    auto const half_alpha_inv_sq = Utils::sqr(1. / 2. / p3m.params.alpha);
+    for (j[0] = 0; j[0] < p3m.fft.plan[3].new_mesh[RX]; j[0]++) {
+      for (j[1] = 0; j[1] < p3m.fft.plan[3].new_mesh[RY]; j[1]++) {
+        for (j[2] = 0; j[2] < p3m.fft.plan[3].new_mesh[RZ]; j[2]++) {
+          auto const kx = 2. * std::numbers::pi *
+                          p3m.d_op[RX][j[KX] + p3m.fft.plan[3].start[KX]] *
+                          box_geo.length_inv()[RX];
+          auto const ky = 2. * std::numbers::pi *
+                          p3m.d_op[RY][j[KY] + p3m.fft.plan[3].start[KY]] *
+                          box_geo.length_inv()[RY];
+          auto const kz = 2. * std::numbers::pi *
+                          p3m.d_op[RZ][j[KZ] + p3m.fft.plan[3].start[KZ]] *
+                          box_geo.length_inv()[RZ];
+          auto const sqk = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz);
+
+          if (sqk != 0.) {
+            auto const node_k_space_energy =
+                p3m.g_energy[ind] * (Utils::sqr(p3m.rs_mesh[2 * ind]) +
+                                     Utils::sqr(p3m.rs_mesh[2 * ind + 1]));
+            auto const vterm = -2. * (1. / sqk + half_alpha_inv_sq);
+            auto const pref = node_k_space_energy * vterm;
+            node_k_space_pressure_tensor[0] += pref * kx * kx; /* sigma_xx */
+            node_k_space_pressure_tensor[1] += pref * kx * ky; /* sigma_xy */
+            node_k_space_pressure_tensor[2] += pref * kx * kz; /* sigma_xz */
+            node_k_space_pressure_tensor[3] += pref * ky * kx; /* sigma_yx */
+            node_k_space_pressure_tensor[4] += pref * ky * ky; /* sigma_yy */
+            node_k_space_pressure_tensor[5] += pref * ky * kz; /* sigma_yz */
+            node_k_space_pressure_tensor[6] += pref * kz * kx; /* sigma_zx */
+            node_k_space_pressure_tensor[7] += pref * kz * ky; /* sigma_zy */
+            node_k_space_pressure_tensor[8] += pref * kz * kz; /* sigma_zz */
+            diagonal += node_k_space_energy;
+          }
+          ind++;
+        }
       }
-      ++index;
-    });
-
-    node_k_space_pressure_tensor[0u] += diagonal;
-    node_k_space_pressure_tensor[4u] += diagonal;
-    node_k_space_pressure_tensor[8u] += diagonal;
-    node_k_space_pressure_tensor[3u] = node_k_space_pressure_tensor[1u];
-    node_k_space_pressure_tensor[6u] = node_k_space_pressure_tensor[2u];
-    node_k_space_pressure_tensor[7u] = node_k_space_pressure_tensor[5u];
+    }
+    node_k_space_pressure_tensor[0] += diagonal;
+    node_k_space_pressure_tensor[4] += diagonal;
+    node_k_space_pressure_tensor[8] += diagonal;
   }
 
   return node_k_space_pressure_tensor * prefactor / (2. * box_geo.volume());
@@ -494,35 +509,35 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag,
 
   /* === k-space force calculation  === */
   if (force_flag) {
-    /* i*k differentiation */
-    auto constexpr m_start = Utils::Vector3i::broadcast(0);
-    auto const m_stop = std::span(p3m.fft.plan[3].new_mesh);
-    auto const offset = std::span(p3m.fft.plan[3].start);
-    auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv();
-    auto indices = Utils::Vector3i{};
-    auto index = std::size_t(0u);
-
+    /* sqrt(-1)*k differentiation */
+    int j[3];
+    int ind = 0;
     /* compute electric field */
     // Eq. (3.49) @cite deserno00b
-    for_each_3d(m_start, m_stop, indices, [&]() {
-      auto const rho_hat = std::complex<double>(p3m.rs_mesh[2u * index + 0u],
-                                                p3m.rs_mesh[2u * index + 1u]);
-      auto const phi_hat = p3m.g_force[index] * rho_hat;
-
-      for (int d = 0; d < 3; d++) {
-        /* direction in r-space: */
-        int d_rs = (d + p3m.ks_pnum) % 3;
-        /* directions */
-        auto const k =
-            p3m.d_op[d_rs][indices[d] + offset[d]] * wavevector[d_rs];
-
-        /* i*k*(Re+i*Im) = - Im*k + i*Re*k     (i=sqrt(-1)) */
-        p3m.E_mesh[d_rs][2u * index + 0u] = -k * phi_hat.imag();
-        p3m.E_mesh[d_rs][2u * index + 1u] = +k * phi_hat.real();
-      }
+    for (j[0] = 0; j[0] < p3m.fft.plan[3].new_mesh[0]; j[0]++) {
+      for (j[1] = 0; j[1] < p3m.fft.plan[3].new_mesh[1]; j[1]++) {
+        for (j[2] = 0; j[2] < p3m.fft.plan[3].new_mesh[2]; j[2]++) {
+          auto const rho_hat = std::complex<double>(p3m.rs_mesh[2 * ind + 0],
+                                                    p3m.rs_mesh[2 * ind + 1]);
+          auto const phi_hat = p3m.g_force[ind] * rho_hat;
+
+          for (int d = 0; d < 3; d++) {
+            /* direction in r-space: */
+            int d_rs = (d + p3m.ks_pnum) % 3;
+            /* directions */
+            auto const k = 2. * std::numbers::pi *
+                           p3m.d_op[d_rs][j[d] + p3m.fft.plan[3].start[d]] *
+                           box_geo.length_inv()[d_rs];
+
+            /* i*k*(Re+i*Im) = - Im*k + i*Re*k     (i=sqrt(-1)) */
+            p3m.E_mesh[d_rs][2 * ind + 0] = -k * phi_hat.imag();
+            p3m.E_mesh[d_rs][2 * ind + 1] = +k * phi_hat.real();
+          }
 
-      ++index;
-    });
+          ind++;
+        }
+      }
+    }
 
     /* Back FFT force component mesh */
     auto const check_complex = !p3m.params.tuning and check_complex_residuals;
@@ -663,12 +678,8 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
 #ifdef CUDA
     auto const &solver = m_system.coulomb.impl->solver;
     if (has_actor_of_type<CoulombP3MGPU>(solver)) {
-      if (this_node == 0) {
-        ks_err =
-            p3m_k_space_error_gpu(m_prefactor, mesh.data(), cao, p3m.sum_qpart,
-                                  p3m.sum_q2, alpha_L, box_geo.length().data());
-      }
-      boost::mpi::broadcast(comm_cart, ks_err, 0);
+      ks_err = p3mgpu_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart,
+                                    p3m.sum_q2, alpha_L, box_geo.length());
     } else
 #endif
       ks_err = p3m_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart,
@@ -796,7 +807,7 @@ void CoulombP3M::sanity_checks_boxl() const {
   auto const &system = get_system();
   auto const &box_geo = *system.box_geo;
   auto const &local_geo = *system.local_geo;
-  for (auto i = 0u; i < 3u; i++) {
+  for (unsigned int i = 0u; i < 3u; i++) {
     /* check k-space cutoff */
     if (p3m.params.cao_cut[i] >= box_geo.length_half()[i]) {
       std::stringstream msg;
diff --git a/src/core/electrostatics/p3m_gpu_cuda.cu b/src/core/electrostatics/p3m_gpu_cuda.cu
index c734ba0cccd..6429c8afeed 100644
--- a/src/core/electrostatics/p3m_gpu_cuda.cu
+++ b/src/core/electrostatics/p3m_gpu_cuda.cu
@@ -53,20 +53,20 @@
 #include "electrostatics/p3m_gpu_cuda.cuh"
 
 #include "cuda/utils.cuh"
-#include "p3m/math.hpp"
 #include "system/System.hpp"
 
 #include <utils/math/bspline.hpp>
 #include <utils/math/int_pow.hpp>
+#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <cuda.h>
 #include <cufft.h>
 
 #include <algorithm>
-#include <cassert>
 #include <cstddef>
-#include <limits>
+#include <cstdio>
+#include <cstdlib>
 #include <numbers>
 #include <stdexcept>
 
@@ -93,7 +93,7 @@ struct P3MGpuData {
   /** Ewald parameter */
   REAL_TYPE alpha;
   /** Number of particles */
-  unsigned int n_part; // oddity: size_t causes UB with GCC 11.4 in Debug mode
+  std::size_t n_part;
   /** Box size */
   REAL_TYPE box[3];
   /** Mesh dimensions */
@@ -138,31 +138,6 @@ struct P3MGpuParams {
   }
 };
 
-static auto p3m_calc_blocks(unsigned int cao, std::size_t n_part) {
-  auto const cao3 = Utils::int_pow<3>(cao);
-  auto parts_per_block = 1u;
-  while ((parts_per_block + 1u) * cao3 <= 1024u) {
-    ++parts_per_block;
-  }
-  assert((n_part / parts_per_block) <= std::numeric_limits<unsigned>::max());
-  auto n = static_cast<unsigned int>(n_part / parts_per_block);
-  auto n_blocks = ((n_part % parts_per_block) == 0u) ? std::max(1u, n) : n + 1u;
-  assert(n_blocks <= std::numeric_limits<unsigned>::max());
-  return std::make_pair(parts_per_block, static_cast<unsigned>(n_blocks));
-}
-
-dim3 p3m_make_grid(unsigned int n_blocks) {
-  dim3 grid(n_blocks, 1u, 1u);
-  while (grid.x > 65536u) {
-    grid.y++;
-    if ((n_blocks % grid.y) == 0u)
-      grid.x = std::max(1u, n_blocks / grid.y);
-    else
-      grid.x = n_blocks / grid.y + 1u;
-  }
-  return grid;
-}
-
 template <int cao>
 __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY,
                                         int NZ, REAL_TYPE *Zaehler,
@@ -185,15 +160,15 @@ __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY,
   for (MX = -P3M_BRILLOUIN; MX <= P3M_BRILLOUIN; MX++) {
     NMX = static_cast<REAL_TYPE>(((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX) +
                                  p.mesh[0] * MX);
-    S1 = int_pow<2 * cao>(math::sinc(Meshi[0] * NMX));
+    S1 = int_pow<2 * cao>(Utils::sinc(Meshi[0] * NMX));
     for (MY = -P3M_BRILLOUIN; MY <= P3M_BRILLOUIN; MY++) {
       NMY = static_cast<REAL_TYPE>(
           ((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY) + p.mesh[1] * MY);
-      S2 = S1 * int_pow<2 * cao>(math::sinc(Meshi[1] * NMY));
+      S2 = S1 * int_pow<2 * cao>(Utils::sinc(Meshi[1] * NMY));
       for (MZ = -P3M_BRILLOUIN; MZ <= P3M_BRILLOUIN; MZ++) {
         NMZ = static_cast<REAL_TYPE>(
             ((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ) + p.mesh[2] * MZ);
-        S3 = S2 * int_pow<2 * cao>(math::sinc(Meshi[2] * NMZ));
+        S3 = S2 * int_pow<2 * cao>(Utils::sinc(Meshi[2] * NMZ));
 
         NM2 = sqr(NMX * Leni[0]) + sqr(NMY * Leni[1]) + sqr(NMZ * Leni[2]);
         *Nenner += S3;
@@ -391,9 +366,26 @@ void assign_charges(P3MGpuData const &params,
                     float const *const __restrict__ part_pos,
                     float const *const __restrict__ part_q) {
   auto const cao = static_cast<unsigned int>(params.cao);
-  auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part);
+  auto const cao3 = int_pow<3>(cao);
+  unsigned int parts_per_block = 1u, n_blocks = 1u;
+
+  while ((parts_per_block + 1u) * cao3 <= 1024u) {
+    parts_per_block++;
+  }
+  if ((params.n_part % parts_per_block) == 0u)
+    n_blocks = std::max<unsigned>(1u, params.n_part / parts_per_block);
+  else
+    n_blocks = params.n_part / parts_per_block + 1u;
+
   dim3 block(parts_per_block * cao, cao, cao);
-  dim3 grid = p3m_make_grid(n_blocks);
+  dim3 grid(n_blocks, 1u, 1u);
+  while (grid.x > 65536u) {
+    grid.y++;
+    if ((n_blocks % grid.y) == 0u)
+      grid.x = std::max<unsigned>(1u, n_blocks / grid.y);
+    else
+      grid.x = n_blocks / grid.y + 1u;
+  }
 
   auto const data_length = std::size_t(3u) *
                            static_cast<std::size_t>(parts_per_block) *
@@ -446,7 +438,7 @@ __global__ void assign_forces_kernel(P3MGpuData const params,
   /* id of the particle */
   auto const id =
       parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
-  if (id >= static_cast<unsigned>(params.n_part))
+  if (id >= params.n_part)
     return;
   /* position relative to the closest grid point */
   REAL_TYPE m_pos[3];
@@ -508,9 +500,27 @@ void assign_forces(P3MGpuData const &params,
                    float *const __restrict__ part_f,
                    REAL_TYPE const prefactor) {
   auto const cao = static_cast<unsigned int>(params.cao);
-  auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part);
+  auto const cao3 = int_pow<3>(cao);
+  unsigned int parts_per_block = 1u, n_blocks = 1u;
+
+  while ((parts_per_block + 1u) * cao3 <= 1024u) {
+    parts_per_block++;
+  }
+
+  if ((params.n_part % parts_per_block) == 0u)
+    n_blocks = std::max<unsigned>(1u, params.n_part / parts_per_block);
+  else
+    n_blocks = params.n_part / parts_per_block + 1u;
+
   dim3 block(parts_per_block * cao, cao, cao);
-  dim3 grid = p3m_make_grid(n_blocks);
+  dim3 grid(n_blocks, 1u, 1u);
+  while (grid.x > 65536u) {
+    grid.y++;
+    if (n_blocks % grid.y == 0u)
+      grid.x = std::max<unsigned>(1u, n_blocks / grid.y);
+    else
+      grid.x = n_blocks / grid.y + 1u;
+  }
 
   /* Switch for assignment templates, the shared version only is faster for cao
    * > 2 */
@@ -571,8 +581,7 @@ void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
 
   auto &p3m_gpu_data = data->p3m_gpu_data;
   bool do_reinit = false, mesh_changed = false;
-  assert(n_part <= std::numeric_limits<unsigned int>::max());
-  p3m_gpu_data.n_part = static_cast<unsigned>(n_part);
+  p3m_gpu_data.n_part = n_part;
 
   if (not data->is_initialized or p3m_gpu_data.alpha != alpha) {
     p3m_gpu_data.alpha = static_cast<REAL_TYPE>(alpha);
@@ -685,9 +694,9 @@ void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
 void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu,
                                 double prefactor, std::size_t n_part) {
   auto &p3m_gpu_data = data.p3m_gpu_data;
-  p3m_gpu_data.n_part = static_cast<unsigned>(n_part);
+  p3m_gpu_data.n_part = n_part;
 
-  if (n_part == 0u)
+  if (p3m_gpu_data.n_part == 0u)
     return;
 
   auto const positions_device = gpu.get_particle_positions_device();
@@ -713,7 +722,8 @@ void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu,
   if (FFT_FORW_FFT(data.p3m_fft.forw_plan,
                    (REAL_TYPE *)p3m_gpu_data.charge_mesh,
                    p3m_gpu_data.charge_mesh) != CUFFT_SUCCESS) {
-    throw std::runtime_error("CUFFT error: Forward FFT failed");
+    fprintf(stderr, "CUFFT error: Forward FFT failed\n");
+    return;
   }
 
   /* Do convolution */
diff --git a/src/core/electrostatics/p3m_gpu_error_cuda.cu b/src/core/electrostatics/p3m_gpu_error_cuda.cu
index fbc51fecf4a..6e54e2dfe7e 100644
--- a/src/core/electrostatics/p3m_gpu_error_cuda.cu
+++ b/src/core/electrostatics/p3m_gpu_error_cuda.cu
@@ -23,12 +23,12 @@
  *  The corresponding header file is p3m_gpu_error.hpp.
  */
 
-#include "p3m/math.hpp"
 #include "p3m_gpu_error.hpp"
 
 #include "cuda/utils.cuh"
 
 #include <utils/math/int_pow.hpp>
+#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <thrust/device_vector.h>
@@ -44,12 +44,49 @@
 #error CU-file includes mpi.h! This should not happen!
 #endif
 
+using Utils::int_pow;
+using Utils::sqr;
+
+/** @todo Extend to higher order. This comes from some 1/sin expansion in
+ *  @cite hockney88a
+ */
+
+template <int cao>
+__device__ static double p3m_analytic_cotangent_sum(int n, double mesh_i) {
+  const double c = sqr(cos(std::numbers::pi * mesh_i * n));
+
+  switch (cao) {
+  case 1:
+    return 1;
+  case 2:
+    return (1.0 + c * 2.0) / 3.0;
+  case 3:
+    return (2.0 + c * (11.0 + c * 2.0)) / 15.0;
+  case 4:
+    return (17.0 + c * (180.0 + c * (114.0 + c * 4.0))) / 315.0;
+  case 5:
+    return (62.0 + c * (1072.0 + c * (1452.0 + c * (247.0 + c * 2.0)))) /
+           2835.0;
+  case 6:
+    return (1382.0 +
+            c * (35396.0 +
+                 c * (83021.0 + c * (34096.0 + c * (2026.0 + c * 4.0))))) /
+           155925.0;
+  case 7:
+    return (21844.0 +
+            c * (776661.0 +
+                 c * (2801040.0 +
+                      c * (2123860.0 +
+                           c * (349500.0 + c * (8166.0 + c * 4.0)))))) /
+           6081075.0;
+  }
+
+  return 0.0;
+}
+
 template <int cao>
 __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi,
                                                 double alpha_L, double *he_q) {
-  using Utils::int_pow;
-  using Utils::sqr;
-
   const int nx =
       -mesh.x / 2 + static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
   const int ny =
@@ -66,14 +103,14 @@ __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi,
   if ((nx != 0) || (ny != 0) || (nz != 0)) {
     const double alpha_L_i = 1. / alpha_L;
     const double n2 = sqr(nx) + sqr(ny) + sqr(nz);
-    const double cs = math::analytic_cotangent_sum<cao>(nz, meshi.z) *
-                      math::analytic_cotangent_sum<cao>(nx, meshi.x) *
-                      math::analytic_cotangent_sum<cao>(ny, meshi.y);
+    const double cs = p3m_analytic_cotangent_sum<cao>(nz, meshi.z) *
+                      p3m_analytic_cotangent_sum<cao>(nx, meshi.x) *
+                      p3m_analytic_cotangent_sum<cao>(ny, meshi.y);
     const double ex = exp(-sqr(std::numbers::pi * alpha_L_i) * n2);
     const double ex2 = sqr(ex);
     const double U2 =
-        int_pow<2 * cao>(math::sinc(meshi.x * nx) * math::sinc(meshi.y * ny) *
-                         math::sinc(meshi.z * nz));
+        int_pow<2 * cao>(Utils::sinc(meshi.x * nx) * Utils::sinc(meshi.y * ny) *
+                         Utils::sinc(meshi.z * nz));
     auto const alias1 = ex2 / n2;
     auto const d = alias1 - sqr(U2 * ex / cs) / n2;
 
diff --git a/src/core/magnetostatics/dp3m.cpp b/src/core/magnetostatics/dp3m.cpp
index 4214900fb52..4fcad22318e 100644
--- a/src/core/magnetostatics/dp3m.cpp
+++ b/src/core/magnetostatics/dp3m.cpp
@@ -37,7 +37,6 @@
 #include "p3m/fft.hpp"
 #include "p3m/influence_function_dipolar.hpp"
 #include "p3m/interpolation.hpp"
-#include "p3m/math.hpp"
 #include "p3m/send_mesh.hpp"
 
 #include "BoxGeometry.hpp"
@@ -57,6 +56,7 @@
 #include <utils/Vector.hpp>
 #include <utils/integral_parameter.hpp>
 #include <utils/math/int_pow.hpp>
+#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
@@ -67,13 +67,10 @@
 #include <cstddef>
 #include <cstdio>
 #include <functional>
-#include <iterator>
 #include <numbers>
 #include <optional>
-#include <span>
 #include <sstream>
 #include <stdexcept>
-#include <tuple>
 #include <vector>
 
 void DipolarP3M::count_magnetic_particles() {
@@ -275,13 +272,15 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
     dipole_assign(particles);
     /* Gather information for FFT grid inside the nodes domain (inner local
      * mesh) and perform forward 3D FFT (Charge Assignment Mesh). */
-    std::array<double *, 3> meshes = {{dp3m.rs_mesh_dip[0u].data(),
-                                       dp3m.rs_mesh_dip[1u].data(),
-                                       dp3m.rs_mesh_dip[2u].data()}};
+    std::array<double *, 3> meshes = {{dp3m.rs_mesh_dip[0].data(),
+                                       dp3m.rs_mesh_dip[1].data(),
+                                       dp3m.rs_mesh_dip[2].data()}};
+
     dp3m.sm.gather_grid(meshes, comm_cart, dp3m.local_mesh.dim);
-    for (auto i = 0u; i < 3u; ++i) {
-      fft_perform_forw(dp3m.rs_mesh_dip[i].data(), dp3m.fft, comm_cart);
-    }
+
+    fft_perform_forw(dp3m.rs_mesh_dip[0].data(), dp3m.fft, comm_cart);
+    fft_perform_forw(dp3m.rs_mesh_dip[1].data(), dp3m.fft, comm_cart);
+    fft_perform_forw(dp3m.rs_mesh_dip[2].data(), dp3m.fft, comm_cart);
     // Note: after these calls, the grids are in the order yzx and not xyz
     // anymore!!!
   }
@@ -294,32 +293,34 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
     if (dp3m.sum_mu2 > 0.) {
       /* i*k differentiation for dipolar gradients:
        * |(\Fourier{\vect{mu}}(k)\cdot \vect{k})|^2 */
-
-      auto constexpr m_start = Utils::Vector3i::broadcast(0);
-      auto const m_stop = std::span(dp3m.fft.plan[3].new_mesh);
-      auto const offset = std::span(dp3m.fft.plan[3].start);
-      auto const &d_op = dp3m.d_op[0u];
-      auto const &mesh_dip = dp3m.rs_mesh_dip;
-      auto indices = Utils::Vector3i{};
-      auto index = std::size_t(0u);
-      auto it_energy = dp3m.g_energy.begin();
-      auto node_energy = 0.;
-      for_each_3d(m_start, m_stop, indices, [&]() {
-        using namespace detail::FFT_indexing;
-        // Re(mu)*k
-        auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
-                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
-                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
-        ++index;
-        // Im(mu)*k
-        auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
-                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
-                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
-        ++index;
-        node_energy += *it_energy * (Utils::sqr(re) + Utils::sqr(im));
-        std::advance(it_energy, 1);
-      });
-
+      int ind = 0;
+      int i = 0;
+      int j[3];
+      double node_energy = 0.0;
+      for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) {
+        for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; j[1]++) {
+          for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; j[2]++) {
+            node_energy +=
+                dp3m.g_energy[i] *
+                (Utils::sqr(
+                     dp3m.rs_mesh_dip[0][ind] *
+                         dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
+                     dp3m.rs_mesh_dip[1][ind] *
+                         dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
+                     dp3m.rs_mesh_dip[2][ind] *
+                         dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]) +
+                 Utils::sqr(
+                     dp3m.rs_mesh_dip[0][ind + 1] *
+                         dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
+                     dp3m.rs_mesh_dip[1][ind + 1] *
+                         dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
+                     dp3m.rs_mesh_dip[2][ind + 1] *
+                         dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]));
+            ind += 2;
+            i++;
+          }
+        }
+      }
       node_energy *= dipole_prefac * std::numbers::pi * box_geo.length_inv()[0];
       boost::mpi::reduce(comm_cart, node_energy, energy, std::plus<>(), 0);
 
@@ -343,45 +344,62 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
      * DIPOLAR TORQUES (k-space)
      ****************************/
     if (dp3m.sum_mu2 > 0.) {
-      auto const wavenumber = 2. * std::numbers::pi * box_geo.length_inv()[0u];
-      auto constexpr m_start = Utils::Vector3i::broadcast(0);
-      auto const m_stop = std::span(dp3m.fft.plan[3].new_mesh);
-      auto const offset = std::span(dp3m.fft.plan[3].start);
-      auto const &d_op = dp3m.d_op[0u];
-      auto const &mesh_dip = dp3m.rs_mesh_dip;
-      auto indices = Utils::Vector3i{};
-      auto index = std::size_t(0u);
-
+      auto const two_pi_L_i = 2. * std::numbers::pi * box_geo.length_inv()[0];
       /* fill in ks_mesh array for torque calculation */
-      auto it_energy = dp3m.g_energy.begin();
-      index = 0u;
-      for_each_3d(m_start, m_stop, indices, [&]() {
-        using namespace detail::FFT_indexing;
-        // Re(mu)*k
-        auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
-                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
-                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
-        dp3m.ks_mesh[index] = *it_energy * re;
-        ++index;
-        // Im(mu)*k
-        auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
-                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
-                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
-        dp3m.ks_mesh[index] = *it_energy * im;
-        ++index;
-        std::advance(it_energy, 1);
-      });
+      int ind = 0;
+      int i = 0;
+      int j[3];
+      double tmp0, tmp1;
+
+      for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { // j[0]=n_y
+        for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1];
+             j[1]++) { // j[1]=n_z
+          for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2];
+               j[2]++) { // j[2]=n_x
+            // tmp0 = Re(mu)*k,   tmp1 = Im(mu)*k
+
+            tmp0 = dp3m.rs_mesh_dip[0][ind] *
+                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
+                   dp3m.rs_mesh_dip[1][ind] *
+                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
+                   dp3m.rs_mesh_dip[2][ind] *
+                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
+
+            tmp1 = dp3m.rs_mesh_dip[0][ind + 1] *
+                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
+                   dp3m.rs_mesh_dip[1][ind + 1] *
+                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
+                   dp3m.rs_mesh_dip[2][ind + 1] *
+                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
+
+            /* the optimal influence function is the same for torques
+               and energy */
+            dp3m.ks_mesh[ind] = tmp0 * dp3m.g_energy[i];
+            dp3m.ks_mesh[ind + 1] = tmp1 * dp3m.g_energy[i];
+            ind += 2;
+            i++;
+          }
+        }
+      }
 
       /* Force component loop */
       for (int d = 0; d < 3; d++) {
-        index = 0u;
-        for_each_3d(m_start, m_stop, indices, [&]() {
-          auto const pos = indices[d] + offset[d];
-          dp3m.rs_mesh[index] = d_op[pos] * dp3m.ks_mesh[index];
-          ++index;
-          dp3m.rs_mesh[index] = d_op[pos] * dp3m.ks_mesh[index];
-          ++index;
-        });
+        auto const d_rs = (d + dp3m.ks_pnum) % 3;
+        ind = 0;
+        for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) {
+          for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; j[1]++) {
+            for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; j[2]++) {
+              dp3m.rs_mesh[ind] =
+                  dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
+                  dp3m.ks_mesh[ind];
+              ind++;
+              dp3m.rs_mesh[ind] =
+                  dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
+                  dp3m.ks_mesh[ind];
+              ind++;
+            }
+          }
+        }
 
         /* Back FFT force component mesh */
         fft_perform_back(dp3m.rs_mesh.data(), false, dp3m.fft, comm_cart);
@@ -389,68 +407,94 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
         dp3m.sm.spread_grid(dp3m.rs_mesh.data(), comm_cart,
                             dp3m.local_mesh.dim);
         /* Assign force component from mesh to particle */
-        auto const d_rs = (d + dp3m.ks_pnum) % 3;
         Utils::integral_parameter<int, AssignTorques, 1, 7>(
-            dp3m.params.cao, dp3m, dipole_prefac * wavenumber, d_rs, particles);
+            dp3m.params.cao, dp3m, dipole_prefac * two_pi_L_i, d_rs, particles);
       }
 
       /***************************
          DIPOLAR FORCES (k-space)
       ****************************/
+
       // Compute forces after torques because the algorithm below overwrites the
       // grids dp3m.rs_mesh_dip !
       // Note: I'll do here 9 inverse FFTs. By symmetry, we can reduce this
       // number to 6 !
       /* fill in ks_mesh array for force calculation */
-      auto it_force = dp3m.g_force.begin();
-      index = 0u;
-      for_each_3d(m_start, m_stop, indices, [&]() {
-        using namespace detail::FFT_indexing;
-        // Re(mu)*k
-        auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
-                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
-                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
-        ++index;
-        // Im(mu)*k
-        auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
-                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
-                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
-        ++index;
-        dp3m.ks_mesh[index - 2] = *it_force * im;
-        dp3m.ks_mesh[index - 1] = *it_force * (-re);
-        std::advance(it_force, 1);
-      });
+      ind = 0;
+      i = 0;
+      for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { // j[0]=n_y
+        for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1];
+             j[1]++) { // j[1]=n_z
+          for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2];
+               j[2]++) { // j[2]=n_x
+            // tmp0 = Im(mu)*k,   tmp1 = -Re(mu)*k
+            tmp0 = dp3m.rs_mesh_dip[0][ind + 1] *
+                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
+                   dp3m.rs_mesh_dip[1][ind + 1] *
+                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
+                   dp3m.rs_mesh_dip[2][ind + 1] *
+                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
+            tmp1 = dp3m.rs_mesh_dip[0][ind] *
+                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
+                   dp3m.rs_mesh_dip[1][ind] *
+                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
+                   dp3m.rs_mesh_dip[2][ind] *
+                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
+            dp3m.ks_mesh[ind] = tmp0 * dp3m.g_force[i];
+            dp3m.ks_mesh[ind + 1] = -tmp1 * dp3m.g_force[i];
+            ind += 2;
+            i++;
+          }
+        }
+      }
 
       /* Force component loop */
-      for (int d = 0; d < 3; d++) {
-        auto &rs_mesh_dip = dp3m.rs_mesh_dip;
-        index = 0u;
-        for_each_3d(m_start, m_stop, indices, [&]() {
-          using namespace detail::FFT_indexing;
-          auto const f1 = d_op[indices[d] + offset[d]] * dp3m.ks_mesh[index];
-          rs_mesh_dip[0][index] = d_op[indices[KX] + offset[KX]] * f1;
-          rs_mesh_dip[1][index] = d_op[indices[KY] + offset[KY]] * f1;
-          rs_mesh_dip[2][index] = d_op[indices[KZ] + offset[KZ]] * f1;
-          ++index;
-          auto const f2 = d_op[indices[d] + offset[d]] * dp3m.ks_mesh[index];
-          rs_mesh_dip[0][index] = d_op[indices[KX] + offset[KX]] * f2;
-          rs_mesh_dip[1][index] = d_op[indices[KY] + offset[KY]] * f2;
-          rs_mesh_dip[2][index] = d_op[indices[KZ] + offset[KZ]] * f2;
-          ++index;
-        });
-        /* Back FFT force component mesh */
-        for (auto i = 0u; i < 3u; ++i) {
-          fft_perform_back(rs_mesh_dip[i].data(), false, dp3m.fft, comm_cart);
+      for (int d = 0; d < 3; d++) { /* direction in k-space: */
+        auto const d_rs = (d + dp3m.ks_pnum) % 3;
+        ind = 0;
+        for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0];
+             j[0]++) { // j[0]=n_y
+          for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1];
+               j[1]++) { // j[1]=n_z
+            for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2];
+                 j[2]++) { // j[2]=n_x
+              tmp0 = dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
+                     dp3m.ks_mesh[ind];
+              dp3m.rs_mesh_dip[0][ind] =
+                  dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] * tmp0;
+              dp3m.rs_mesh_dip[1][ind] =
+                  dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] * tmp0;
+              dp3m.rs_mesh_dip[2][ind] =
+                  dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]] * tmp0;
+              ind++;
+              tmp0 = dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
+                     dp3m.ks_mesh[ind];
+              dp3m.rs_mesh_dip[0][ind] =
+                  dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] * tmp0;
+              dp3m.rs_mesh_dip[1][ind] =
+                  dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] * tmp0;
+              dp3m.rs_mesh_dip[2][ind] =
+                  dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]] * tmp0;
+              ind++;
+            }
+          }
         }
+        /* Back FFT force component mesh */
+        fft_perform_back(dp3m.rs_mesh_dip[0].data(), false, dp3m.fft,
+                         comm_cart);
+        fft_perform_back(dp3m.rs_mesh_dip[1].data(), false, dp3m.fft,
+                         comm_cart);
+        fft_perform_back(dp3m.rs_mesh_dip[2].data(), false, dp3m.fft,
+                         comm_cart);
         /* redistribute force component mesh */
-        std::array<double *, 3> meshes = {{rs_mesh_dip[0].data(),
-                                           rs_mesh_dip[1].data(),
-                                           rs_mesh_dip[2].data()}};
+        std::array<double *, 3> meshes = {{dp3m.rs_mesh_dip[0].data(),
+                                           dp3m.rs_mesh_dip[1].data(),
+                                           dp3m.rs_mesh_dip[2].data()}};
+
         dp3m.sm.spread_grid(meshes, comm_cart, dp3m.local_mesh.dim);
         /* Assign force component from mesh to particle */
-        auto const d_rs = (d + dp3m.ks_pnum) % 3;
         Utils::integral_parameter<int, AssignForces, 1, 7>(
-            dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(wavenumber), d_rs,
+            dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(two_pi_L_i), d_rs,
             particles);
       }
     } /* if (dp3m.sum_mu2 > 0) */
@@ -546,15 +590,15 @@ void DipolarP3M::calc_influence_function_force() {
   auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh};
 
   dp3m.g_force = grid_influence_function<3>(dp3m.params, start, start + size,
-                                            get_system().box_geo->length_inv());
+                                            get_system().box_geo->length());
 }
 
 void DipolarP3M::calc_influence_function_energy() {
   auto const start = Utils::Vector3i{dp3m.fft.plan[3].start};
   auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh};
 
-  dp3m.g_energy = grid_influence_function<2>(
-      dp3m.params, start, start + size, get_system().box_geo->length_inv());
+  dp3m.g_energy = grid_influence_function<2>(dp3m.params, start, start + size,
+                                             get_system().box_geo->length());
 }
 
 class DipolarTuningAlgorithm : public TuningAlgorithm {
@@ -705,68 +749,62 @@ void DipolarP3M::tune() {
 }
 
 /** Tuning dipolar-P3M */
-static auto dp3m_tune_aliasing_sums(Utils::Vector3i const &shift, int mesh,
+static auto dp3m_tune_aliasing_sums(int nx, int ny, int nz, int mesh,
                                     double mesh_i, int cao, double alpha_L_i) {
+  using Utils::sinc;
 
-  auto constexpr m_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
-  auto constexpr m_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
   auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
+
   auto alias1 = 0.;
   auto alias2 = 0.;
-
-  Utils::Vector3i indices{};
-  Utils::Vector3i nm{};
-  Utils::Vector3d fnm{};
-  for_each_3d(
-      m_start, m_stop, indices,
-      [&]() {
-        auto const norm_sq = nm.norm2();
-        auto const ex = std::exp(-factor1 * norm_sq);
-        auto const U2 = std::pow(Utils::product(fnm), 2 * cao);
-        alias1 += Utils::sqr(ex) * norm_sq;
-        alias2 += U2 * ex * std::pow(shift * nm, 3) / norm_sq;
-      },
-      [&](unsigned dim, int n) {
-        nm[dim] = shift[dim] + n * mesh;
-        fnm[dim] = math::sinc(nm[dim] * mesh_i);
-      });
-
+  for (int mx = -P3M_BRILLOUIN; mx <= P3M_BRILLOUIN; mx++) {
+    auto const nmx = nx + mx * mesh;
+    auto const fnmx = mesh_i * nmx;
+    for (int my = -P3M_BRILLOUIN; my <= P3M_BRILLOUIN; my++) {
+      auto const nmy = ny + my * mesh;
+      auto const fnmy = mesh_i * nmy;
+      for (int mz = -P3M_BRILLOUIN; mz <= P3M_BRILLOUIN; mz++) {
+        auto const nmz = nz + mz * mesh;
+        auto const fnmz = mesh_i * nmz;
+
+        auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz);
+        auto const ex = std::exp(-factor1 * nm2);
+
+        auto const U2 = pow(sinc(fnmx) * sinc(fnmy) * sinc(fnmz), 2. * cao);
+
+        alias1 += Utils::sqr(ex) * nm2;
+        alias2 += U2 * ex * pow((nx * nmx + ny * nmy + nz * nmz), 3.) / nm2;
+      }
+    }
+  }
   return std::make_pair(alias1, alias2);
 }
 
 /** Calculate the k-space error of dipolar-P3M */
 static double dp3m_k_space_error(double box_size, int mesh, int cao,
                                  int n_c_part, double sum_q2, double alpha_L) {
-
-  auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao);
-  auto const mesh_i = 1. / static_cast<double>(mesh);
+  double he_q = 0.;
+  auto const mesh_i = 1. / mesh;
   auto const alpha_L_i = 1. / alpha_L;
-  auto const m_stop = Utils::Vector3i::broadcast(mesh / 2);
-  auto const m_start = -m_stop;
-  auto indices = Utils::Vector3i{};
-  auto values = Utils::Vector3d{};
-  auto he_q = 0.;
-
-  for_each_3d(
-      m_start, m_stop, indices,
-      [&]() {
-        if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
-          auto const n2 = indices.norm2();
-          auto const cs = Utils::product(values);
+
+  for (int nx = -mesh / 2; nx < mesh / 2; nx++)
+    for (int ny = -mesh / 2; ny < mesh / 2; ny++)
+      for (int nz = -mesh / 2; nz < mesh / 2; nz++)
+        if ((nx != 0) || (ny != 0) || (nz != 0)) {
+          auto const n2 = Utils::sqr(nx) + Utils::sqr(ny) + Utils::sqr(nz);
+          auto const cs = p3m_analytic_cotangent_sum(nx, mesh_i, cao) *
+                          p3m_analytic_cotangent_sum(ny, mesh_i, cao) *
+                          p3m_analytic_cotangent_sum(nz, mesh_i, cao);
           auto const [alias1, alias2] =
-              dp3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i);
+              dp3m_tune_aliasing_sums(nx, ny, nz, mesh, mesh_i, cao, alpha_L_i);
           auto const d =
               alias1 - Utils::sqr(alias2 / cs) /
                            Utils::int_pow<3>(static_cast<double>(n2));
           /* at high precision, d can become negative due to extinction;
              also, don't take values that have no significant digits left*/
-          if (d > 0. and std::fabs(d / alias1) > ROUND_ERROR_PREC)
+          if (d > 0 && (fabs(d / alias1) > ROUND_ERROR_PREC))
             he_q += d;
         }
-      },
-      [&values, &mesh_i, cotangent_sum](unsigned dim, int n) {
-        values[dim] = cotangent_sum(n, mesh_i);
-      });
 
   return 8. * Utils::sqr(std::numbers::pi) / 3. * sum_q2 *
          sqrt(he_q / n_c_part) / Utils::int_pow<4>(box_size);
@@ -847,7 +885,7 @@ void DipolarP3M::sanity_checks_boxl() const {
   auto const &system = get_system();
   auto const &box_geo = *system.box_geo;
   auto const &local_geo = *system.local_geo;
-  for (auto i = 0u; i < 3u; i++) {
+  for (unsigned int i = 0u; i < 3u; i++) {
     /* check k-space cutoff */
     if (dp3m.params.cao_cut[i] >= box_geo.length_half()[i]) {
       std::stringstream msg;
diff --git a/src/core/p3m/common.cpp b/src/core/p3m/common.cpp
index fb8023eba74..bc87e99a6a8 100644
--- a/src/core/p3m/common.cpp
+++ b/src/core/p3m/common.cpp
@@ -28,8 +28,52 @@
 #include "LocalBox.hpp"
 
 #include <utils/Vector.hpp>
+#include <utils/math/sqr.hpp>
 
 #include <cmath>
+#include <numbers>
+#include <stdexcept>
+
+double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao) {
+  auto const c =
+      Utils::sqr(std::cos(std::numbers::pi * mesh_i * static_cast<double>(n)));
+
+  switch (cao) {
+  case 1: {
+    return 1.0;
+  }
+  case 2: {
+    return (1.0 + c * 2.0) / 3.0;
+  }
+  case 3: {
+    return (2.0 + c * (11.0 + c * 2.0)) / 15.0;
+  }
+  case 4: {
+    return (17.0 + c * (180.0 + c * (114.0 + c * 4.0))) / 315.0;
+  }
+  case 5: {
+    return (62.0 + c * (1072.0 + c * (1452.0 + c * (247.0 + c * 2.0)))) /
+           2835.0;
+  }
+  case 6: {
+    return (1382.0 +
+            c * (35396.0 +
+                 c * (83021.0 + c * (34096.0 + c * (2026.0 + c * 4.0))))) /
+           155925.0;
+  }
+  case 7: {
+    return (21844.0 +
+            c * (776661.0 +
+                 c * (2801040.0 +
+                      c * (2123860.0 +
+                           c * (349500.0 + c * (8166.0 + c * 4.0)))))) /
+           6081075.0;
+  }
+  default: {
+    throw std::logic_error("Invalid value cao=" + std::to_string(cao));
+  }
+  }
+}
 
 void P3MLocalMesh::calc_local_ca_mesh(P3MParameters const &params,
                                       LocalBox const &local_geo, double skin,
diff --git a/src/core/p3m/common.hpp b/src/core/p3m/common.hpp
index 6bc30828ee5..38a7553428a 100644
--- a/src/core/p3m/common.hpp
+++ b/src/core/p3m/common.hpp
@@ -38,7 +38,6 @@
 
 #include <utils/Vector.hpp>
 
-#include <algorithm>
 #include <array>
 #include <vector>
 
@@ -50,7 +49,6 @@ auto constexpr P3M_EPSILON_METALLIC = 0.0;
 #include "LocalBox.hpp"
 
 #include <array>
-#include <cstddef>
 #include <stdexcept>
 #include <vector>
 
@@ -214,7 +212,7 @@ struct P3MLocalMesh {
    */
   void recalc_ld_pos(P3MParameters const &params) {
     // spatial position of left down mesh point
-    for (auto i = 0u; i < 3u; i++) {
+    for (unsigned int i = 0; i < 3; i++) {
       ld_pos[i] = (ld_ind[i] + params.mesh_off[i]) * params.a[i];
     }
   }
@@ -228,6 +226,16 @@ struct P3MLocalMesh {
                           double space_layer);
 };
 
+/** One of the aliasing sums used to compute k-space errors.
+ *  Fortunately the one which is most important (because it converges
+ *  most slowly, since it is not damped exponentially) can be
+ *  calculated analytically. The result (which depends on the order of
+ *  the spline interpolation) can be written as an even trigonometric
+ *  polynomial. The results are tabulated here (the employed formula
+ *  is eq. (7.66) in @cite hockney88a).
+ */
+double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao);
+
 #endif /* P3M || DP3M */
 
 namespace detail {
@@ -242,7 +250,7 @@ std::array<std::vector<int>, 3> inline calc_meshift(
     Utils::Vector3i const &mesh_size, bool zero_out_midpoint = false) {
   std::array<std::vector<int>, 3> ret{};
 
-  for (auto i = 0u; i < 3u; ++i) {
+  for (unsigned int i = 0; i < 3; i++) {
     ret[i] = std::vector<int>(static_cast<std::size_t>(mesh_size[i]));
 
     for (int j = 1; j <= mesh_size[i] / 2; j++) {
diff --git a/src/core/p3m/fft.cpp b/src/core/p3m/fft.cpp
index a14c9a2d00b..57b56fbaa6c 100644
--- a/src/core/p3m/fft.cpp
+++ b/src/core/p3m/fft.cpp
@@ -84,7 +84,7 @@ using Utils::permute_ifield;
  */
 std::optional<std::vector<int>>
 find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2,
-                 std::span<int const> node_list1, std::span<int> node_list2,
+                 std::span<const int> node_list1, std::span<int> node_list2,
                  std::span<int> pos, std::span<int> my_pos, int rank) {
   int i;
   /* communication group cell size on grid1 and grid2 */
diff --git a/src/core/p3m/fft.hpp b/src/core/p3m/fft.hpp
index c87f11a14cb..58ea6c7e692 100644
--- a/src/core/p3m/fft.hpp
+++ b/src/core/p3m/fft.hpp
@@ -33,7 +33,12 @@
  *  redistributed.
  *
  *  For simplicity at the moment I have implemented a full complex to
- *  complex FFT (even though a real to complex FFT would be sufficient).
+ *  complex FFT (even though a real to complex FFT would be
+ *  sufficient)
+ *
+ *  \todo Combine the forward and backward structures.
+ *  \todo The packing routines could be moved to utils.hpp when they are needed
+ * elsewhere.
  *
  *  For more information about FFT usage, see \ref fft.cpp "fft.cpp".
  */
@@ -83,24 +88,22 @@ template <class T> struct fft_allocator {
 
 template <class T> using fft_vector = std::vector<T, fft_allocator<T>>;
 
-struct fft_plan {
-  /** plan direction: forward or backward FFT (enum value from FFTW). */
+/** Structure for performing a 1D FFT.
+ *
+ *  This includes the information about the redistribution of the 3D
+ *  FFT *grid before the actual FFT.
+ */
+struct fft_forw_plan {
+  /** plan direction: 0 = Forward FFT, 1 = Backward FFT. */
   int dir;
-  /** plan for the FFT. */
-  fftw_plan our_fftw_plan;
-  /** packing function for send blocks. */
-  void (*pack_function)(double const *const, double *const, int const *,
-                        int const *, int const *, int);
-};
-
-/** @brief Plan for a forward 1D FFT of a flattened 3D array. */
-struct fft_forw_plan : public fft_plan {
   /** row direction of that FFT. */
   int row_dir;
   /** permutations from normal coordinate system. */
   int n_permute;
   /** number of 1D FFTs. */
   int n_ffts;
+  /** plan for fft. */
+  fftw_plan our_fftw_plan;
 
   /** size of local mesh before communication. */
   int old_mesh[3];
@@ -114,6 +117,9 @@ struct fft_forw_plan : public fft_plan {
   /** group of nodes which have to communicate with each other. */
   std::vector<int> group;
 
+  /** packing function for send blocks. */
+  void (*pack_function)(double const *const, double *const, int const *,
+                        int const *, int const *, int);
   /** Send block specification. 6 integers for each node: start[3], size[3]. */
   std::vector<int> send_block;
   /** Send block communication sizes. */
@@ -126,8 +132,17 @@ struct fft_forw_plan : public fft_plan {
   int element;
 };
 
-/** @brief Plan for a backward 1D FFT of a flattened 3D array. */
-struct fft_back_plan : public fft_plan {};
+/** Additional information for backwards FFT. */
+struct fft_back_plan {
+  /** plan direction. (e.g. fftw macro) */
+  int dir;
+  /** plan for fft. */
+  fftw_plan our_fftw_plan;
+
+  /** packing function for send blocks. */
+  void (*pack_function)(double const *const, double *const, int const *,
+                        int const *, int const *, int);
+};
 
 /** Information about the three one dimensional FFTs and how the nodes
  *  have to communicate inbetween.
diff --git a/src/core/p3m/for_each_3d.hpp b/src/core/p3m/for_each_3d.hpp
deleted file mode 100644
index 9f44871bd54..00000000000
--- a/src/core/p3m/for_each_3d.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2024 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <concepts>
-
-namespace detail {
-
-constexpr inline void noop_projector(unsigned, int) {}
-
-template <typename T>
-concept IndexVectorConcept = requires(T vector) {
-  { vector[0] } -> std::convertible_to<std::size_t>;
-};
-
-} // namespace detail
-
-/**
- * @brief Repeat an operation on every element of a 3D grid.
- *
- * Intermediate values that depend on the iterated coordinates
- * are calculated and stored once per iteration. This is useful
- * when the operation is costly.
- *
- * @param start       Initial values for the loop counters.
- * @param stop        Final values (one-past-the-end) for the loop counters.
- * @param counters    Loop counters.
- * @param kernel      Functor to execute.
- * @param projector   Projection of the current loop counter.
- * @tparam Kernel     Nullary function.
- * @tparam Projector  Binary function that takes a nesting depth and a loop
- *                    counter as arguments and projects a value.
- */
-template <class Kernel, class Projector = decltype(detail::noop_projector)>
-  requires std::invocable<Kernel> and std::invocable<Projector, unsigned, int>
-void for_each_3d(detail::IndexVectorConcept auto &&start,
-                 detail::IndexVectorConcept auto &&stop,
-                 detail::IndexVectorConcept auto &&counters, Kernel &&kernel,
-                 Projector &&projector = detail::noop_projector) {
-  auto &nx = counters[0u];
-  auto &ny = counters[1u];
-  auto &nz = counters[2u];
-  for (nx = start[0u]; nx < stop[0u]; ++nx) {
-    projector(0u, nx);
-    for (ny = start[1u]; ny < stop[1u]; ++ny) {
-      projector(1u, ny);
-      for (nz = start[2u]; nz < stop[2u]; ++nz) {
-        projector(2u, nz);
-        kernel();
-      }
-    }
-  }
-}
diff --git a/src/core/p3m/influence_function.hpp b/src/core/p3m/influence_function.hpp
index 24853d43e25..9c322c23506 100644
--- a/src/core/p3m/influence_function.hpp
+++ b/src/core/p3m/influence_function.hpp
@@ -16,16 +16,15 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-
-#pragma once
+#ifndef ESPRESSO_P3M_INFLUENCE_FUNCTION_HPP
+#define ESPRESSO_P3M_INFLUENCE_FUNCTION_HPP
 
 #include "p3m/common.hpp"
-#include "p3m/for_each_3d.hpp"
-#include "p3m/math.hpp"
 
 #include <utils/Vector.hpp>
 #include <utils/index.hpp>
 #include <utils/math/int_pow.hpp>
+#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <cmath>
@@ -53,43 +52,44 @@
 template <std::size_t S, std::size_t m>
 double G_opt(int cao, double alpha, Utils::Vector3d const &k,
              Utils::Vector3d const &h) {
+  using namespace detail::FFT_indexing;
+  using Utils::int_pow;
+  using Utils::sinc;
+
+  auto constexpr two_pi = 2. * std::numbers::pi;
+  auto constexpr two_pi_i = 1. / two_pi;
+  auto constexpr limit = 30.;
 
   auto const k2 = k.norm2();
-  if (k2 == 0.) {
-    return 0.;
+  if (k2 == 0.0) {
+    return 0.0;
   }
 
-  auto constexpr limit = 30.;
-  auto constexpr m_start = Utils::Vector3i::broadcast(-m);
-  auto constexpr m_stop = Utils::Vector3i::broadcast(m + 1);
-  auto const exponent_prefactor = Utils::sqr(1. / (2. * alpha));
-  auto const wavevector = (2. * std::numbers::pi) / h;
-  auto const wavevector_i = 1. / wavevector;
-  auto indices = Utils::Vector3i{};
-  auto km = Utils::Vector3d{};
-  auto fnm = Utils::Vector3d{};
-  auto numerator = 0.;
-  auto denominator = 0.;
-
-  for_each_3d(
-      m_start, m_stop, indices,
-      [&]() {
-        using namespace detail::FFT_indexing;
-        auto const U2 = std::pow(Utils::product(fnm), 2 * cao);
+  double numerator = 0.0;
+  double denominator = 0.0;
+
+  for (int mx = -m; mx <= m; mx++) {
+    for (int my = -m; my <= m; my++) {
+      for (int mz = -m; mz <= m; mz++) {
+        auto const km =
+            k + two_pi * Utils::Vector3d{mx / h[RX], my / h[RY], mz / h[RZ]};
+        auto const U2 = std::pow(sinc(km[RX] * h[RX] * two_pi_i) *
+                                     sinc(km[RY] * h[RY] * two_pi_i) *
+                                     sinc(km[RZ] * h[RZ] * two_pi_i),
+                                 2 * cao);
+
         auto const km2 = km.norm2();
-        auto const exponent = exponent_prefactor * km2;
+        auto const exponent = Utils::sqr(1. / (2. * alpha)) * km2;
         if (exponent < limit) {
           auto const f3 = std::exp(-exponent) * (4. * std::numbers::pi / km2);
-          numerator += U2 * f3 * Utils::int_pow<S>(k * km);
+          numerator += U2 * f3 * int_pow<S>(k * km);
         }
         denominator += U2;
-      },
-      [&](unsigned dim, int n) {
-        km[dim] = k[dim] + n * wavevector[dim];
-        fnm[dim] = math::sinc(km[dim] * wavevector_i[dim]);
-      });
+      }
+    }
+  }
 
-  return numerator / (Utils::int_pow<S>(k2) * Utils::sqr(denominator));
+  return numerator / (int_pow<S>(k2) * Utils::sqr(denominator));
 }
 
 /**
@@ -104,18 +104,20 @@ double G_opt(int cao, double alpha, Utils::Vector3d const &k,
  *
  * @param params P3M parameters
  * @param n_start Lower left corner of the grid
- * @param n_stop Upper right corner of the grid.
- * @param inv_box_l Inverse box length
+ * @param n_end Upper right corner of the grid.
+ * @param box_l Box size
  * @return Values of G_opt at regular grid points.
  */
 template <std::size_t S, std::size_t m = 0>
 std::vector<double> grid_influence_function(const P3MParameters &params,
                                             const Utils::Vector3i &n_start,
-                                            const Utils::Vector3i &n_stop,
-                                            const Utils::Vector3d &inv_box_l) {
+                                            const Utils::Vector3i &n_end,
+                                            const Utils::Vector3d &box_l) {
+  using namespace detail::FFT_indexing;
 
   auto const shifts = detail::calc_meshift(params.mesh);
-  auto const size = n_stop - n_start;
+
+  auto const size = n_end - n_start;
 
   /* The influence function grid */
   auto g = std::vector<double>(Utils::product(size), 0.);
@@ -126,24 +128,31 @@ std::vector<double> grid_influence_function(const P3MParameters &params,
     return g;
   }
 
-  auto const wavevector = (2. * std::numbers::pi) * inv_box_l;
-  auto const half_mesh = params.mesh / 2;
-  auto indices = Utils::Vector3i{};
-  auto index = std::size_t(0u);
-
-  for_each_3d(n_start, n_stop, indices, [&]() {
-    using namespace detail::FFT_indexing;
-    if ((indices[KX] % half_mesh[RX] != 0) or
-        (indices[KY] % half_mesh[RY] != 0) or
-        (indices[KZ] % half_mesh[RZ] != 0)) {
-      auto const k =
-          Utils::Vector3d{{shifts[RX][indices[KX]] * wavevector[RX],
-                           shifts[RY][indices[KY]] * wavevector[RY],
-                           shifts[RZ][indices[KZ]] * wavevector[RZ]}};
-      g[index] = G_opt<S, m>(params.cao, params.alpha, k, params.a);
+  auto const h = Utils::Vector3d{params.a};
+
+  Utils::Vector3i n{};
+  for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) {
+    for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) {
+      for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) {
+        auto const ind = Utils::get_linear_index(n - n_start, size,
+                                                 Utils::MemoryOrder::ROW_MAJOR);
+        if ((n[KX] % (params.mesh[RX] / 2) == 0) &&
+            (n[KY] % (params.mesh[RY] / 2) == 0) &&
+            (n[KZ] % (params.mesh[RZ] / 2) == 0)) {
+          g[ind] = 0.0;
+        } else {
+          auto const k = 2. * std::numbers::pi *
+                         Utils::Vector3d{shifts[RX][n[KX]] / box_l[RX],
+                                         shifts[RY][n[KY]] / box_l[RY],
+                                         shifts[RZ][n[KZ]] / box_l[RZ]};
+
+          g[ind] = G_opt<S, m>(params.cao, params.alpha, k, h);
+        }
+      }
     }
-    ++index;
-  });
+  }
 
   return g;
 }
+
+#endif
diff --git a/src/core/p3m/influence_function_dipolar.hpp b/src/core/p3m/influence_function_dipolar.hpp
index d52a1a6d675..e327846d20e 100644
--- a/src/core/p3m/influence_function_dipolar.hpp
+++ b/src/core/p3m/influence_function_dipolar.hpp
@@ -18,19 +18,19 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-
-#pragma once
+#ifndef ESPRESSO_CORE_P3M_INFLUENCE_FUNCTION_DIPOLAR_HPP
+#define ESPRESSO_CORE_P3M_INFLUENCE_FUNCTION_DIPOLAR_HPP
 
 #include "config/config.hpp"
 
 #if defined(DP3M)
 
 #include "p3m/common.hpp"
-#include "p3m/for_each_3d.hpp"
-#include "p3m/math.hpp"
 
 #include <utils/Vector.hpp>
+#include <utils/index.hpp>
 #include <utils/math/int_pow.hpp>
+#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <cmath>
@@ -54,41 +54,40 @@
 template <std::size_t S>
 double G_opt_dipolar(P3MParameters const &params, Utils::Vector3i const &shift,
                      Utils::Vector3i const &d_op) {
-
+  using Utils::int_pow;
+  using Utils::sinc;
   auto constexpr limit = P3M_BRILLOUIN;
   auto constexpr exp_limit = 30.;
-  auto constexpr m_start = Utils::Vector3i::broadcast(-limit);
-  auto constexpr m_stop = Utils::Vector3i::broadcast(limit + 1);
-  auto const cao = params.cao;
-  auto const mesh = params.mesh[0];
-  auto const offset =
-      static_cast<Utils::Vector3d>(shift) / static_cast<double>(mesh);
-  auto const f2 = Utils::sqr(std::numbers::pi / params.alpha_L);
-  auto indices = Utils::Vector3i{};
-  auto nm = Utils::Vector3i{};
-  auto fnm = Utils::Vector3d{};
+  auto const exponent = 2. * params.cao;
+
   auto numerator = 0.;
   auto denominator = 0.;
 
-  for_each_3d(
-      m_start, m_stop, indices,
-      [&]() {
-        auto const norm_sq = nm.norm2();
-        auto const sz = std::pow(Utils::product(fnm), 2 * cao);
-        auto const exp_term = f2 * norm_sq;
+  auto const f1 = 1. / static_cast<double>(params.mesh[0]);
+  auto const f2 = Utils::sqr(std::numbers::pi / params.alpha_L);
+
+  for (int mx = -limit; mx <= limit; mx++) {
+    auto const nmx = shift[0] + params.mesh[0] * mx;
+    auto const sx = std::pow(sinc(f1 * nmx), exponent);
+    for (int my = -limit; my <= limit; my++) {
+      auto const nmy = shift[1] + params.mesh[0] * my;
+      auto const sy = sx * std::pow(sinc(f1 * nmy), exponent);
+      for (int mz = -limit; mz <= limit; mz++) {
+        auto const nmz = shift[2] + params.mesh[0] * mz;
+        auto const sz = sy * std::pow(sinc(f1 * nmz), exponent);
+        auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz);
+        auto const exp_term = f2 * nm2;
         if (exp_term < exp_limit) {
-          auto const f3 = sz * std::exp(-exp_term) / norm_sq;
-          numerator += f3 * Utils::int_pow<S>(d_op * nm);
+          auto const f3 = sz * std::exp(-exp_term) / nm2;
+          auto const n_nm = d_op[0] * nmx + d_op[1] * nmy + d_op[2] * nmz;
+          numerator += f3 * int_pow<S>(n_nm);
         }
         denominator += sz;
-      },
-      [&](unsigned dim, int n) {
-        nm[dim] = shift[dim] + n * mesh;
-        fnm[dim] = math::sinc(offset[dim] + n * mesh);
-      });
-
-  return numerator / (Utils::int_pow<S>(static_cast<double>(d_op.norm2())) *
-                      Utils::int_pow<2>(denominator));
+      }
+    }
+  }
+  return numerator / (int_pow<S>(static_cast<double>(d_op.norm2())) *
+                      Utils::sqr(denominator));
 }
 
 /**
@@ -101,17 +100,17 @@ double G_opt_dipolar(P3MParameters const &params, Utils::Vector3i const &shift,
  *
  * @param params DP3M parameters
  * @param n_start Lower left corner of the grid
- * @param n_stop Upper right corner of the grid.
- * @param inv_box_l Inverse box length
+ * @param n_end Upper right corner of the grid.
+ * @param box_l Box size
  * @return Values of the influence function at regular grid points.
  */
 template <std::size_t S>
 std::vector<double> grid_influence_function(P3MParameters const &params,
                                             Utils::Vector3i const &n_start,
-                                            Utils::Vector3i const &n_stop,
-                                            Utils::Vector3d const &inv_box_l) {
+                                            Utils::Vector3i const &n_end,
+                                            Utils::Vector3d const &box_l) {
 
-  auto const size = n_stop - n_start;
+  auto const size = n_end - n_start;
 
   /* The influence function grid */
   auto g = std::vector<double>(Utils::product(size), 0.);
@@ -122,56 +121,61 @@ std::vector<double> grid_influence_function(P3MParameters const &params,
     return g;
   }
 
-  auto prefactor = Utils::int_pow<3>(static_cast<double>(params.mesh[0])) * 2. *
-                   Utils::int_pow<2>(inv_box_l[0]);
-
-  auto const offset = detail::calc_meshift(params.mesh, false)[0];
-  auto const d_op = detail::calc_meshift(params.mesh, true)[0];
-  auto const half_mesh = params.mesh[0] / 2;
-  auto indices = Utils::Vector3i{};
-  auto shift_off = Utils::Vector3i{};
-  auto d_op_off = Utils::Vector3i{};
-  auto index = std::size_t(0u);
-
-  for_each_3d(
-      n_start, n_stop, indices,
-      [&]() {
-        if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or
-             (indices[2] % half_mesh != 0))) {
-          g[index] = prefactor * G_opt_dipolar<S>(params, shift_off, d_op_off);
+  double fak1 = Utils::int_pow<3>(static_cast<double>(params.mesh[0])) * 2.0 /
+                Utils::sqr(box_l[0]);
+
+  auto const shifts = detail::calc_meshift(params.mesh, false);
+  auto const d_ops = detail::calc_meshift(params.mesh, true);
+
+  Utils::Vector3i n{};
+  for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) {
+    for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) {
+      for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) {
+        auto const ind = Utils::get_linear_index(n - n_start, size,
+                                                 Utils::MemoryOrder::ROW_MAJOR);
+
+        if (((n[0] % (params.mesh[0] / 2) == 0) &&
+             (n[1] % (params.mesh[0] / 2) == 0) &&
+             (n[2] % (params.mesh[0] / 2) == 0))) {
+          g[ind] = 0.0;
+        } else {
+          auto const shift = Utils::Vector3i{shifts[0][n[0]], shifts[0][n[1]],
+                                             shifts[0][n[2]]};
+          auto const d_op =
+              Utils::Vector3i{d_ops[0][n[0]], d_ops[0][n[1]], d_ops[0][n[2]]};
+          auto const fak2 = G_opt_dipolar<S>(params, shift, d_op);
+          g[ind] = fak1 * fak2;
         }
-        ++index;
-      },
-      [&](unsigned dim, int n) {
-        d_op_off[dim] = d_op[n];
-        shift_off[dim] = offset[n];
-      });
-
+      }
+    }
+  }
   return g;
 }
 
 inline double G_opt_dipolar_self_energy(P3MParameters const &params,
                                         Utils::Vector3i const &shift) {
-
+  using Utils::sinc;
   auto constexpr limit = P3M_BRILLOUIN + 1;
-  auto constexpr m_start = Utils::Vector3i::broadcast(-limit);
-  auto constexpr m_stop = Utils::Vector3i::broadcast(limit + 1);
-  auto const cao = params.cao;
-  auto const mesh = params.mesh[0];
-  auto const offset =
-      static_cast<Utils::Vector3d>(shift) / static_cast<double>(mesh);
-  auto indices = Utils::Vector3i{};
-  auto fnm = Utils::Vector3d{};
-  auto energy = 0.;
-
-  for_each_3d(
-      m_start, m_stop, indices,
-      [&]() { energy += std::pow(Utils::product(fnm), 2 * cao); },
-      [&](unsigned dim, int n) {
-        fnm[dim] = math::sinc(offset[dim] + n * mesh);
-      });
-
-  return energy;
+  auto const exponent = 2. * params.cao;
+
+  auto u_sum = 0.0;
+
+  auto const f1 = 1.0 / static_cast<double>(params.mesh[0]);
+
+  for (int mx = -limit; mx <= limit; mx++) {
+    auto const nmx = shift[0] + params.mesh[0] * mx;
+    auto const sx = std::pow(sinc(f1 * nmx), exponent);
+    for (int my = -limit; my <= limit; my++) {
+      auto const nmy = shift[1] + params.mesh[0] * my;
+      auto const sy = sx * std::pow(sinc(f1 * nmy), exponent);
+      for (int mz = -limit; mz <= limit; mz++) {
+        auto const nmz = shift[2] + params.mesh[0] * mz;
+        auto const sz = sy * std::pow(sinc(f1 * nmz), exponent);
+        u_sum += sz;
+      }
+    }
+  }
+  return u_sum;
 }
 
 /**
@@ -179,39 +183,42 @@ inline double G_opt_dipolar_self_energy(P3MParameters const &params,
  *
  * @param params DP3M parameters
  * @param n_start Lower left corner of the grid
- * @param n_stop Upper right corner of the grid.
+ * @param n_end Upper right corner of the grid.
  * @param g Energies on the grid.
  * @return Total self-energy.
  */
 inline double grid_influence_function_self_energy(
     P3MParameters const &params, Utils::Vector3i const &n_start,
-    Utils::Vector3i const &n_stop, std::vector<double> const &g) {
-
-  auto const offset = detail::calc_meshift(params.mesh, false)[0];
-  auto const d_op = detail::calc_meshift(params.mesh, true)[0];
-  auto const half_mesh = params.mesh[0] / 2;
-  auto indices = Utils::Vector3i{};
-  auto shift_off = Utils::Vector3i{};
-  auto d_op_off = Utils::Vector3i{};
-  auto index = std::size_t(0u);
-  auto energy = 0.;
-
-  for_each_3d(
-      n_start, n_stop, indices,
-      [&]() {
-        if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or
-             (indices[2] % half_mesh != 0))) {
-          auto const U2 = G_opt_dipolar_self_energy(params, shift_off);
-          energy += g[index] * U2 * d_op_off.norm2();
+    Utils::Vector3i const &n_end, std::vector<double> const &g) {
+  auto const size = n_end - n_start;
+
+  auto const shifts = detail::calc_meshift(params.mesh, false);
+  auto const d_ops = detail::calc_meshift(params.mesh, true);
+
+  double energy = 0.0;
+  Utils::Vector3i n{};
+  for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) {
+    for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) {
+      for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) {
+        if (((n[0] % (params.mesh[0] / 2) == 0) &&
+             (n[1] % (params.mesh[0] / 2) == 0) &&
+             (n[2] % (params.mesh[0] / 2) == 0))) {
+          energy += 0.0;
+        } else {
+          auto const ind = Utils::get_linear_index(
+              n - n_start, size, Utils::MemoryOrder::ROW_MAJOR);
+          auto const shift = Utils::Vector3i{shifts[0][n[0]], shifts[0][n[1]],
+                                             shifts[0][n[2]]};
+          auto const d_op =
+              Utils::Vector3i{d_ops[0][n[0]], d_ops[0][n[1]], d_ops[0][n[2]]};
+          auto const U2 = G_opt_dipolar_self_energy(params, shift);
+          energy += g[ind] * U2 * d_op.norm2();
         }
-        ++index;
-      },
-      [&](unsigned dim, int n) {
-        d_op_off[dim] = d_op[n];
-        shift_off[dim] = offset[n];
-      });
-
+      }
+    }
+  }
   return energy;
 }
 
-#endif // defined(DP3M)
+#endif
+#endif
diff --git a/src/core/p3m/math.hpp b/src/core/p3m/math.hpp
deleted file mode 100644
index a9f2bdf7fd5..00000000000
--- a/src/core/p3m/math.hpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Copyright (C) 2010-2024 The ESPResSo project
- * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
- *   Max-Planck-Institute for Polymer Research, Theory Group
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <utils/device_qualifier.hpp>
-#include <utils/math/sqr.hpp>
-
-#ifndef __CUDACC__
-#include <cmath>
-#endif
-#include <numbers>
-#include <stdexcept>
-#include <string>
-
-namespace math {
-
-/** @brief Return the absolute value of x. */
-inline DEVICE_QUALIFIER auto abs(double x) { return fabs(x); }
-
-/** @brief Return the absolute value of x. */
-inline DEVICE_QUALIFIER auto abs(float x) { return fabsf(x); }
-
-/**
- * @brief Calculate the function @f$ \mathrm{sinc}(x) = \sin(\pi x)/(\pi x) @f$.
- *
- * (same convention as in @cite hockney88a). In order to avoid divisions
- * by 0, arguments whose modulus is smaller than @f$ \epsilon = 0.1 @f$
- * are evaluated by an 8th order Taylor expansion.
- * Note that the difference between sinc(x) and this expansion
- * is smaller than 0.235e-12, if x is smaller than @f$ \epsilon @f$.
- * (The next term in the expansion is the 10th order contribution, i.e.
- * @f$ \pi^{10} x^{10}/39916800 \approx 0.2346 \cdot x^{12} @f$).
- */
-template <typename T> DEVICE_QUALIFIER auto sinc(T x) {
-  auto constexpr epsilon = T(0.1);
-#if not defined(__CUDACC__)
-  using std::sin;
-#endif
-
-  auto const pix = std::numbers::pi_v<T> * x;
-
-  if (::math::abs(x) > epsilon)
-    return sin(pix) / pix;
-
-  auto constexpr factorial = [](int n) {
-    int acc{1}, c{1};
-    while (c < n) {
-      acc *= ++c;
-    }
-    return acc;
-  };
-
-  /* Coefficients of the Taylor expansion of sinc */
-  auto constexpr c0 = T(+1) / T(factorial(1));
-  auto constexpr c2 = T(-1) / T(factorial(3));
-  auto constexpr c4 = T(+1) / T(factorial(5));
-  auto constexpr c6 = T(-1) / T(factorial(7));
-  auto constexpr c8 = T(+1) / T(factorial(9));
-
-  auto const pix2 = pix * pix;
-  return c0 + pix2 * (c2 + pix2 * (c4 + pix2 * (c6 + pix2 * c8)));
-}
-
-/**
- * @brief One of the aliasing sums used to compute k-space errors.
- * Fortunately the one which is most important (because it converges
- * most slowly, since it is not damped exponentially) can be calculated
- * analytically. The result (which depends on the order of the spline
- * interpolation) can be written as an even trigonometric polynomial.
- * The results are tabulated here (the employed formula is eq. 7-66
- * p. 233 in @cite hockney88a).
- */
-template <int cao>
-DEVICE_QUALIFIER auto analytic_cotangent_sum(int n, double mesh_i) {
-  static_assert(cao >= 1 and cao <= 7);
-#if not defined(__CUDACC__)
-  using std::cos;
-#endif
-  auto const theta = static_cast<double>(n) * mesh_i * std::numbers::pi;
-  auto const c = Utils::sqr(cos(theta));
-
-  if constexpr (cao == 1) {
-    return 1.;
-  }
-  if constexpr (cao == 2) {
-    return (1. + c * 2.) / 3.;
-  }
-  if constexpr (cao == 3) {
-    return (2. + c * (11. + c * 2.)) / 15.;
-  }
-  if constexpr (cao == 4) {
-    return (17. + c * (180. + c * (114. + c * 4.))) / 315.;
-  }
-  if constexpr (cao == 5) {
-    return (62. + c * (1072. + c * (1452. + c * (247. + c * 2.)))) / 2835.;
-  }
-  if constexpr (cao == 6) {
-    return (1382. +
-            c * (35396. + c * (83021. + c * (34096. + c * (2026. + c * 4.))))) /
-           155925.;
-  }
-  return (21844. +
-          c * (776661. +
-               c * (2801040. +
-                    c * (2123860. + c * (349500. + c * (8166. + c * 4.)))))) /
-         6081075.;
-}
-
-inline auto get_analytic_cotangent_sum_kernel(int cao) {
-  decltype(&analytic_cotangent_sum<1>) ptr = nullptr;
-  if (cao == 1) {
-    ptr = &analytic_cotangent_sum<1>;
-  } else if (cao == 2) {
-    ptr = &analytic_cotangent_sum<2>;
-  } else if (cao == 3) {
-    ptr = &analytic_cotangent_sum<3>;
-  } else if (cao == 4) {
-    ptr = &analytic_cotangent_sum<4>;
-  } else if (cao == 5) {
-    ptr = &analytic_cotangent_sum<5>;
-  } else if (cao == 6) {
-    ptr = &analytic_cotangent_sum<6>;
-  } else if (cao == 7) {
-    ptr = &analytic_cotangent_sum<7>;
-  }
-  if (ptr == nullptr) {
-    throw std::logic_error("Invalid value cao=" + std::to_string(cao));
-  }
-  return ptr;
-}
-
-} // namespace math
diff --git a/src/core/unit_tests/CMakeLists.txt b/src/core/unit_tests/CMakeLists.txt
index 60fcadae764..d843da0a636 100644
--- a/src/core/unit_tests/CMakeLists.txt
+++ b/src/core/unit_tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2010-2024 The ESPResSo project
+# Copyright (C) 2010-2022 The ESPResSo project
 # Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
 #   Max-Planck-Institute for Polymer Research, Theory Group
 #
@@ -94,8 +94,6 @@ if(ESPRESSO_BUILD_WITH_FFTW)
             espresso::core)
   unit_test(NAME fft_test SRC fft_test.cpp DEPENDS espresso::utils
             espresso::core)
-  unit_test(NAME math_test SRC math_test.cpp DEPENDS espresso::utils
-            espresso::core)
 endif()
 
 if(ESPRESSO_BUILD_WITH_CUDA)
diff --git a/src/core/unit_tests/cuda_test.cu b/src/core/unit_tests/cuda_test.cu
index 812638cfa96..f5057fe7a75 100644
--- a/src/core/unit_tests/cuda_test.cu
+++ b/src/core/unit_tests/cuda_test.cu
@@ -17,12 +17,10 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#define BOOST_TEST_MODULE "CUDA interface tests"
+#define BOOST_TEST_MODULE cuda test
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
-#include "config/config.hpp"
-
 #include "cuda/init.hpp"
 #include "cuda/utils.cuh"
 #include "cuda/utils.hpp"
@@ -35,22 +33,20 @@
 #include <boost/test/unit_test.hpp>
 
 #include <cstddef>
-#include <initializer_list>
 #include <optional>
 #include <string>
 
 boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) {
-  bool has_compatible_device = false;
   int n_devices = 0;
   cudaGetDeviceCount(&n_devices);
   if (n_devices > 0) {
     cudaDeviceProp prop;
     cudaGetDeviceProperties(&prop, 0);
     if (prop.major >= 3) {
-      has_compatible_device = true;
+      return true;
     }
   }
-  return has_compatible_device;
+  return false;
 }
 
 std::optional<std::string> read_pending_cuda_errors() {
@@ -83,43 +79,44 @@ void clear() {
 
 } // namespace Testing::non_sticky_cuda_error
 
-#ifdef P3M
-dim3 p3m_make_grid(unsigned int n_blocks);
-#endif
-
 static auto fixture = boost::unit_test::fixture(&setup, &teardown);
 
 BOOST_AUTO_TEST_SUITE(suite, *boost::unit_test::precondition(has_gpu))
 
 BOOST_AUTO_TEST_CASE(gpu_fixture, *fixture) {
-  auto error1 = read_pending_cuda_errors();
-  BOOST_REQUIRE(not error1.has_value());
-
-  // check we can raise and clear non-sticky CUDA errors
-  Testing::non_sticky_cuda_error::trigger();
-  Testing::non_sticky_cuda_error::clear();
-  auto error2 = read_pending_cuda_errors();
-  BOOST_REQUIRE(not error2.has_value());
-
-  // check fixture can handle the default non-sticky CUDA error
-  Testing::non_sticky_cuda_error::trigger();
-  auto ref_what3{"There is a pending CUDA error: \"invalid device ordinal\""};
-  auto error3 = read_pending_cuda_errors();
-  BOOST_REQUIRE(error3.has_value());
-  BOOST_REQUIRE_EQUAL(error3.value(), ref_what3);
-  // sticky error should have been cleared
-  error3 = read_pending_cuda_errors();
-  BOOST_REQUIRE(not error3.has_value());
-
-  // check fixture can handle a custom non-sticky CUDA error
-  cudaMallocHost(nullptr, std::size_t(0u));
-  auto ref_what4{"There is a pending CUDA error: \"invalid argument\""};
-  auto error4 = read_pending_cuda_errors();
-  BOOST_REQUIRE(error4.has_value());
-  BOOST_REQUIRE_EQUAL(error4.value(), ref_what4);
-  // sticky error should have been cleared
-  error4 = read_pending_cuda_errors();
-  BOOST_REQUIRE(not error4.has_value());
+  {
+    auto error = read_pending_cuda_errors();
+    BOOST_REQUIRE(not error.has_value());
+  }
+  {
+    // check we can raise and clear non-sticky CUDA errors
+    Testing::non_sticky_cuda_error::trigger();
+    Testing::non_sticky_cuda_error::clear();
+    auto error = read_pending_cuda_errors();
+    BOOST_REQUIRE(not error.has_value());
+  }
+  {
+    // check fixture can handle the default non-sticky CUDA error
+    Testing::non_sticky_cuda_error::trigger();
+    auto ref_what{"There is a pending CUDA error: \"invalid device ordinal\""};
+    auto error = read_pending_cuda_errors();
+    BOOST_REQUIRE(error.has_value());
+    BOOST_REQUIRE_EQUAL(error.value(), ref_what);
+    // sticky error should have been cleared
+    error = read_pending_cuda_errors();
+    BOOST_REQUIRE(not error.has_value());
+  }
+  {
+    // check fixture can handle a custom non-sticky CUDA error
+    cudaMallocHost(nullptr, std::size_t(0u));
+    auto ref_what{"There is a pending CUDA error: \"invalid argument\""};
+    auto error = read_pending_cuda_errors();
+    BOOST_REQUIRE(error.has_value());
+    BOOST_REQUIRE_EQUAL(error.value(), ref_what);
+    // sticky error should have been cleared
+    error = read_pending_cuda_errors();
+    BOOST_REQUIRE(not error.has_value());
+  }
 }
 
 static int fatal_error_counter = 0;
@@ -128,153 +125,127 @@ static void increment_counter() noexcept { ++fatal_error_counter; }
 BOOST_AUTO_TEST_CASE(gpu_interface, *fixture) {
   fatal_error_counter = 0;
   auto local_error_counter = 0;
-
-  try {
-    throw cuda_fatal_error("message");
-  } catch (cuda_fatal_error &err) {
-    std::string const what = "message";
-    BOOST_CHECK_EQUAL(err.what(), what);
-    BOOST_CHECK_EQUAL(err.get_terminate(), &errexit);
-    err.set_terminate(nullptr);
-    BOOST_CHECK_EQUAL(err.get_terminate(), nullptr);
-    err.set_terminate(increment_counter);
-    BOOST_CHECK_EQUAL(err.get_terminate(), &increment_counter);
-    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
-  }
-  ++local_error_counter;
-  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
-
-  // -----------------------
-
-  auto error_caught = false;
-  auto const block = dim3{1, 2, 3};
-  auto const grid = dim3{4, 5, 6};
-  cuda_check_errors_exit(block, grid, "", "", 0u); // should not throw
-  try {
-    Testing::non_sticky_cuda_error::trigger();
-    // should clear the CUDA error flag and throw a fatal error
-    cuda_check_errors_exit(block, grid, "cudaSetDevice()", "filename.cu", 4u);
-  } catch (cuda_fatal_error &err) {
-    error_caught = true;
-    err.set_terminate(increment_counter);
-    std::string const what =
-        "CUDA error: \"invalid device ordinal\" while calling "
-        "cudaSetDevice() with block: <1,2,3>, grid: <4,5,6> in filename.cu:4";
-    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
-    BOOST_CHECK_EQUAL(err.what(), what);
-    BOOST_CHECK_EQUAL(cudaGetLastError(), cudaSuccess);
-  }
-  ++local_error_counter;
-  BOOST_REQUIRE(error_caught);
-  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
-
-  // -----------------------
-
-  error_caught = false;
-  cuda_safe_mem_exit(cudaSuccess, "", 0u); // should not throw
-  try {
-    Testing::non_sticky_cuda_error::trigger();
-    cuda_safe_mem_exit(cudaSuccess, "filename.cu", 4u); // should throw
-  } catch (cuda_fatal_error &err) {
-    error_caught = true;
-    err.set_terminate(increment_counter);
-    std::string const what =
-        "CUDA error: \"invalid device ordinal\" in filename.cu:4. Error "
-        "found during memory operation. Possibly however from a failed "
-        "operation before the memory operation";
-    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
-    BOOST_CHECK_EQUAL(err.what(), what);
+  {
+    std::string const what = "message 1";
+    try {
+      throw cuda_fatal_error(what);
+    } catch (cuda_fatal_error &err) {
+      BOOST_CHECK_EQUAL(err.what(), what);
+      BOOST_CHECK_EQUAL(err.get_terminate(), &errexit);
+      err.set_terminate(nullptr);
+      BOOST_CHECK_EQUAL(err.get_terminate(), nullptr);
+      err.set_terminate(increment_counter);
+      BOOST_CHECK_EQUAL(err.get_terminate(), &increment_counter);
+      BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+    }
+    ++local_error_counter;
+    BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
   }
-  ++local_error_counter;
-  BOOST_REQUIRE(error_caught);
-  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
-
-  // -----------------------
-
-  error_caught = false;
-  try {
-    cuda_safe_mem_exit(cudaErrorNotPermitted, "filename.cu", 4u);
-  } catch (cuda_fatal_error &err) {
-    error_caught = true;
-    err.set_terminate(increment_counter);
-    std::string const what = "CUDA error: \"operation not permitted\" during "
-                             "memory operation in filename.cu:4";
-    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
-    BOOST_CHECK_EQUAL(err.what(), what);
+  {
+    auto error_caught = false;
+    auto const block = dim3{1, 2, 3};
+    auto const grid = dim3{4, 5, 6};
+    // should not throw
+    cuda_check_errors_exit(block, grid, "", "", 0u);
+    try {
+      Testing::non_sticky_cuda_error::trigger();
+      // should clear the CUDA error flag and throw a fatal error
+      cuda_check_errors_exit(block, grid, "cudaSetDevice()", "filename.cu", 4u);
+    } catch (cuda_fatal_error &err) {
+      error_caught = true;
+      err.set_terminate(increment_counter);
+      std::string const what =
+          "CUDA error: \"invalid device ordinal\" while calling "
+          "cudaSetDevice() with block: <1,2,3>, grid: <4,5,6> in filename.cu:4";
+      BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+      BOOST_CHECK_EQUAL(err.what(), what);
+      BOOST_CHECK_EQUAL(cudaGetLastError(), cudaSuccess);
+    }
+    ++local_error_counter;
+    BOOST_REQUIRE(error_caught);
+    BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
   }
-  ++local_error_counter;
-  BOOST_REQUIRE(error_caught);
-  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
-
-  // -----------------------
-
-  error_caught = false;
-  try {
-    cuda_safe_mem_exit(cudaErrorInvalidValue, "function_name()", 4u);
-  } catch (cuda_fatal_error &err) {
-    error_caught = true;
-    err.set_terminate(increment_counter);
-    std::string const what =
-        "CUDA error: \"invalid argument\" during memory operation in "
-        "function_name():4. You may have tried to allocate zero memory";
-    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
-    BOOST_CHECK_EQUAL(err.what(), what);
+  {
+    auto error_caught = false;
+    // should not throw
+    cuda_safe_mem_exit(cudaSuccess, "", 0u);
+    try {
+      Testing::non_sticky_cuda_error::trigger();
+      // should throw
+      cuda_safe_mem_exit(cudaSuccess, "filename.cu", 4u);
+    } catch (cuda_fatal_error &err) {
+      error_caught = true;
+      err.set_terminate(increment_counter);
+      std::string const what =
+          "CUDA error: \"invalid device ordinal\" in filename.cu:4. Error "
+          "found during memory operation. Possibly however from a failed "
+          "operation before the memory operation";
+      BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+      BOOST_CHECK_EQUAL(err.what(), what);
+    }
+    ++local_error_counter;
+    BOOST_REQUIRE(error_caught);
+    BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
   }
-  ++local_error_counter;
-  BOOST_REQUIRE(error_caught);
-  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
-
-  // -----------------------
-
-  error_caught = false;
-  BOOST_REQUIRE_EQUAL(stream[0], nullptr);
-  cuda_init(); // allocate
-  BOOST_REQUIRE_NE(stream[0], nullptr);
-  cuda_set_device(0); // reallocate, may or may not result in the same pointer
-  BOOST_REQUIRE_NE(stream[0], nullptr);
-  auto const old_stream = stream[0];
-  try {
-    cuda_set_device(-1); // fail to reallocate, pointer remains the same
-  } catch (cuda_runtime_error_cuda const &err) {
-    error_caught = true;
-    std::string const what = "CUDA error: invalid device ordinal";
-    BOOST_CHECK_EQUAL(err.what(), what);
+  {
+    auto error_caught = false;
+    try {
+      cuda_safe_mem_exit(cudaErrorNotPermitted, "filename.cu", 4u);
+    } catch (cuda_fatal_error &err) {
+      error_caught = true;
+      err.set_terminate(increment_counter);
+      std::string const what = "CUDA error: \"operation not permitted\" during "
+                               "memory operation in filename.cu:4";
+      BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+      BOOST_CHECK_EQUAL(err.what(), what);
+    }
+    ++local_error_counter;
+    BOOST_REQUIRE(error_caught);
+    BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
   }
-  BOOST_REQUIRE(error_caught);
-  BOOST_REQUIRE_EQUAL(stream[0], old_stream);
-
-  // -----------------------
-
-  BOOST_REQUIRE_GE(cuda_get_n_gpus(), 1);
-  char gpu_name_buffer[260] = {'\0'};
-  cuda_get_gpu_name(0, gpu_name_buffer);
-  for (int i = 255; i < 260; ++i) {
-    BOOST_REQUIRE_EQUAL(gpu_name_buffer[i], '\0');
+  {
+    auto error_caught = false;
+    try {
+      cuda_safe_mem_exit(cudaErrorInvalidValue, "function_name()", 4u);
+    } catch (cuda_fatal_error &err) {
+      error_caught = true;
+      err.set_terminate(increment_counter);
+      std::string const what =
+          "CUDA error: \"invalid argument\" during memory operation in "
+          "function_name():4. You may have tried to allocate zero memory";
+      BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+      BOOST_CHECK_EQUAL(err.what(), what);
+    }
+    ++local_error_counter;
+    BOOST_REQUIRE(error_caught);
+    BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
   }
-}
-
-#ifdef P3M
-
-BOOST_AUTO_TEST_CASE(p3m_reshape_grid_test, *fixture) {
-  auto constexpr optimal_size = 65536u;
-
-  for (auto cao = 1u; cao <= 3u; ++cao) {
-    auto const n_blocks = cao * optimal_size;
-    auto const grid = p3m_make_grid(n_blocks);
-    BOOST_CHECK_EQUAL(grid.x, optimal_size);
-    BOOST_CHECK_EQUAL(grid.y, cao);
-    BOOST_CHECK_EQUAL(grid.z, 1u);
+  {
+    BOOST_REQUIRE_EQUAL(stream[0], nullptr);
+    auto error_caught = false;
+    cuda_init(); // allocate
+    BOOST_REQUIRE_NE(stream[0], nullptr);
+    cuda_set_device(0); // reallocate, may or may not result in the same pointer
+    BOOST_REQUIRE_NE(stream[0], nullptr);
+    auto const old_stream = stream[0];
+    try {
+      cuda_set_device(-1); // fail to reallocate, pointer remains the same
+    } catch (cuda_runtime_error_cuda const &err) {
+      error_caught = true;
+      std::string const what = "CUDA error: invalid device ordinal";
+      BOOST_CHECK_EQUAL(err.what(), what);
+    }
+    BOOST_REQUIRE(error_caught);
+    BOOST_REQUIRE_EQUAL(stream[0], old_stream);
   }
-
-  for (auto mul : {2u, 3u, 6u, 12u}) {
-    auto const n_blocks = mul * optimal_size + 1u;
-    auto const grid = p3m_make_grid(n_blocks);
-    BOOST_CHECK_EQUAL(grid.x, n_blocks / (mul + 1u) + ((mul == 2u) ? 0u : 1u));
-    BOOST_CHECK_EQUAL(grid.y, (mul + 1u));
-    BOOST_CHECK_EQUAL(grid.z, 1u);
+  {
+    BOOST_REQUIRE_GE(cuda_get_n_gpus(), 1);
+    char gpu_name_buffer[260] = {'\0'};
+    cuda_get_gpu_name(0, gpu_name_buffer);
+    for (int i = 255; i < 260; ++i) {
+      BOOST_REQUIRE_EQUAL(gpu_name_buffer[i], '\0');
+    }
   }
 }
 
-#endif
-
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/core/unit_tests/fft_test.cpp b/src/core/unit_tests/fft_test.cpp
index bb27335279b..1506e01f20d 100644
--- a/src/core/unit_tests/fft_test.cpp
+++ b/src/core/unit_tests/fft_test.cpp
@@ -17,17 +17,11 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#define BOOST_TEST_MODULE "FFT utility functions"
+#define BOOST_TEST_MODULE fft test
 #define BOOST_TEST_DYN_LINK
-
-#include "config/config.hpp"
-
-#if defined(P3M) || defined(DP3M)
-
 #include <boost/test/unit_test.hpp>
 
 #include "p3m/fft.hpp"
-#include "p3m/for_each_3d.hpp"
 
 #include <utils/Vector.hpp>
 
@@ -39,9 +33,10 @@
 #include <stdexcept>
 #include <vector>
 
+#if defined(P3M) || defined(DP3M)
 std::optional<std::vector<int>> find_comm_groups(Utils::Vector3i const &,
                                                  Utils::Vector3i const &,
-                                                 std::span<int const>,
+                                                 std::span<const int>,
                                                  std::span<int>, std::span<int>,
                                                  std::span<int>, int);
 
@@ -68,50 +63,11 @@ BOOST_AUTO_TEST_CASE(fft_find_comm_groups_mismatch) {
 }
 
 BOOST_AUTO_TEST_CASE(fft_exceptions) {
-  auto constexpr size_max = std::numeric_limits<std::size_t>::max();
-  auto constexpr bad_size = size_max / sizeof(int) + 1ul;
   fft_allocator<int> allocator{};
   BOOST_CHECK_EQUAL(allocator.allocate(0ul), nullptr);
-  BOOST_CHECK_THROW(allocator.allocate(bad_size), std::bad_array_new_length);
-}
-
-BOOST_AUTO_TEST_CASE(for_each_3d_test) {
-  auto const m_start = Utils::Vector3i{{0, -1, 3}};
-  auto const m_stop = Utils::Vector3i{{2, 2, 5}};
-  auto ref_loop_counters = m_start;
-  auto indices = Utils::Vector3i{};
-
-  auto const kernel = [&]() {
-    BOOST_REQUIRE_EQUAL(indices, ref_loop_counters);
-    if (++ref_loop_counters[2u] == m_stop[2u]) {
-      ref_loop_counters[2u] = m_start[2u];
-      if (++ref_loop_counters[1u] == m_stop[1u]) {
-        ref_loop_counters[1u] = m_start[1u];
-        if (++ref_loop_counters[0u] == m_stop[0u]) {
-          ref_loop_counters[0u] = m_start[0u];
-        }
-      }
-    }
-  };
-
-  {
-    for_each_3d(m_start, m_stop, indices, kernel, [&](unsigned dim, int n) {
-      BOOST_REQUIRE_GE(dim, 0);
-      BOOST_REQUIRE_LE(dim, 2);
-      BOOST_REQUIRE_EQUAL(n, ref_loop_counters[dim]);
-    });
-
-    BOOST_REQUIRE_EQUAL(indices, m_stop);
-    BOOST_REQUIRE_EQUAL(ref_loop_counters, m_start);
-  }
-  {
-    for_each_3d(m_start, m_stop, indices, kernel);
-
-    BOOST_REQUIRE_EQUAL(indices, m_stop);
-    BOOST_REQUIRE_EQUAL(ref_loop_counters, m_start);
-  }
+  BOOST_CHECK_THROW(
+      allocator.allocate(std::numeric_limits<std::size_t>::max() / sizeof(int) +
+                         1ul),
+      std::bad_array_new_length);
 }
-
-#else  // defined(P3M) || defined(DP3M)
-int main(int argc, char **argv) {}
 #endif // defined(P3M) || defined(DP3M)
diff --git a/src/core/unit_tests/math_test.cpp b/src/core/unit_tests/math_test.cpp
deleted file mode 100644
index 46a638800f1..00000000000
--- a/src/core/unit_tests/math_test.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (C) 2017-2024 The ESPResSo project
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#define BOOST_TEST_MODULE "Special mathematical functions"
-#define BOOST_TEST_DYN_LINK
-#include <boost/test/unit_test.hpp>
-
-#include "p3m/math.hpp"
-
-#include <utils/math/sqr.hpp>
-
-#include <boost/math/special_functions/bernoulli.hpp>
-#include <boost/math/special_functions/factorials.hpp>
-
-#include <algorithm>
-#include <cmath>
-#include <numbers>
-#include <numeric>
-#include <ranges>
-#include <stdexcept>
-
-/** @brief Compute the n-th term in the Taylor series of @c tan(x). */
-static auto taylor_series_tangent(int n) {
-  auto const two_power_2n = std::pow(2., 2 * n);
-  auto const b2n = boost::math::bernoulli_b2n<double>(n);
-  auto const f2n = boost::math::factorial<double>(2 * n);
-  return std::pow(-1., n - 1) * two_power_2n * (two_power_2n - 1.) * b2n / f2n;
-}
-
-/** @brief Compute the n-th term in the Taylor series of @c sinc(x). */
-static auto taylor_series_sinc(int n) {
-  return std::pow(-1., n) / std::tgamma(2 * n + 2);
-}
-
-BOOST_AUTO_TEST_CASE(abs_test) {
-  static_assert(std::is_same_v<float, decltype(math::abs(1.f))>);
-  static_assert(std::is_same_v<double, decltype(math::abs(1.))>);
-
-  BOOST_CHECK_EQUAL(math::abs(+3.1415), std::abs(+3.1415));
-  BOOST_CHECK_EQUAL(math::abs(-3.1415), std::abs(-3.1415));
-  BOOST_CHECK_EQUAL(math::abs(+3.1415f), std::abs(+3.1415f));
-  BOOST_CHECK_EQUAL(math::abs(-3.1415f), std::abs(-3.1415f));
-}
-
-BOOST_AUTO_TEST_CASE(sinc_test) {
-  // edge case
-  BOOST_CHECK_EQUAL(math::sinc(0.0), 1.0);
-
-  // check against standard math functions
-  auto x = 0.001;
-  while (x <= 0.11) {
-    auto const approx = math::sinc(x);
-    auto const pi_x = std::numbers::pi * x;
-    auto const exact = std::sin(pi_x) / (pi_x);
-    BOOST_CHECK_SMALL(approx - exact, 1e-13);
-    x += 0.01;
-  }
-
-  // check Taylor expansion
-  auto const series = std::views::iota(0, 5) | std::views::reverse |
-                      std::views::transform(taylor_series_sinc);
-  for (auto const x : {1e-6, 1e-5, 1e-4, 1e-3, 1e-2}) {
-    auto const pix2 = Utils::sqr(std::numbers::pi * x);
-    auto const ref = std::accumulate(
-        series.begin(), series.end(), 0.,
-        [pix2](auto const &acc, auto const &c) { return acc * pix2 + c; });
-    BOOST_CHECK_SMALL(math::sinc(x) - ref, 1e-13);
-  }
-}
-
-BOOST_AUTO_TEST_CASE(analytic_cotangent_sum_test) {
-  auto constexpr tol = 8. * 100. * std::numeric_limits<double>::epsilon();
-  auto const kernel = [](int n, double mesh_i, int cao) {
-    return math::get_analytic_cotangent_sum_kernel(cao)(n, mesh_i);
-  };
-
-  // edge case: at theta = 0, aliasing sums are unity
-  for (auto const cao : std::ranges::iota_view{1, 8}) {
-    BOOST_CHECK_CLOSE(kernel(0, 0., cao), 1., tol);
-  }
-  // edge case: at theta = pi / 2, aliasing sums are equal to the cao-th term
-  // in the tangent Taylor series
-  for (auto const cao : std::ranges::iota_view{1, 8}) {
-    BOOST_CHECK_CLOSE(kernel(1, 0.5, cao), taylor_series_tangent(cao), tol);
-  }
-
-  // check assertion
-  for (auto const invalid_cao : {-1, 0, 8}) {
-    BOOST_CHECK_THROW(kernel(1, 0., invalid_cao), std::logic_error);
-  }
-}
diff --git a/src/core/unit_tests/p3m_test.cpp b/src/core/unit_tests/p3m_test.cpp
index 3f091d565ce..5fd3a83915b 100644
--- a/src/core/unit_tests/p3m_test.cpp
+++ b/src/core/unit_tests/p3m_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2022 The ESPResSo project
+ * Copyright (C) 2010-2022 The ESPResSo project
  *
  * This file is part of ESPResSo.
  *
@@ -17,7 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#define BOOST_TEST_MODULE "P3M utility functions"
+#define BOOST_TEST_MODULE p3m test
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
@@ -27,6 +27,8 @@
 
 #include <array>
 #include <cstddef>
+#include <limits>
+#include <stdexcept>
 #include <vector>
 
 BOOST_AUTO_TEST_CASE(calc_meshift_false) {
@@ -58,3 +60,27 @@ BOOST_AUTO_TEST_CASE(calc_meshift_true) {
     }
   }
 }
+
+#if defined(P3M) || defined(DP3M)
+BOOST_AUTO_TEST_CASE(analytic_cotangent_sum) {
+  auto constexpr kernel = p3m_analytic_cotangent_sum;
+  auto constexpr tol = 8. * 100. * std::numeric_limits<double>::epsilon();
+
+  // check only trivial cases
+  for (auto const cao : {1, 2, 3, 4, 5, 6, 7}) {
+    BOOST_CHECK_CLOSE(kernel(0, 0., cao), 1., tol);
+  }
+  BOOST_CHECK_CLOSE(kernel(1, 0.5, 1), 1., tol);
+  BOOST_CHECK_CLOSE(kernel(1, 0.5, 2), 1. / 3., tol);
+  BOOST_CHECK_CLOSE(kernel(1, 0.5, 3), 2. / 15., tol);
+  BOOST_CHECK_CLOSE(kernel(1, 0.5, 4), 17. / 315., tol);
+  BOOST_CHECK_CLOSE(kernel(1, 0.5, 5), 62. / 2835., tol);
+  BOOST_CHECK_CLOSE(kernel(1, 0.5, 6), 1382. / 155925., tol);
+  BOOST_CHECK_CLOSE(kernel(1, 0.5, 7), 21844. / 6081075., tol);
+
+  // check assertion
+  for (auto const invalid_cao : {-1, 0, 8}) {
+    BOOST_CHECK_THROW(kernel(1, 0., invalid_cao), std::logic_error);
+  }
+}
+#endif // defined(P3M) || defined(DP3M)
diff --git a/src/shapes/include/shapes/Ellipsoid.hpp b/src/shapes/include/shapes/Ellipsoid.hpp
index e65149fb93c..39fc53bb293 100644
--- a/src/shapes/include/shapes/Ellipsoid.hpp
+++ b/src/shapes/include/shapes/Ellipsoid.hpp
@@ -23,7 +23,7 @@
 #define SHAPES_ELLIPSOID_HPP
 
 #include "Shape.hpp"
-
+#include <utils/Array.hpp>
 #include <utils/Vector.hpp>
 
 namespace Shapes {
diff --git a/src/shapes/src/HollowConicalFrustum.cpp b/src/shapes/src/HollowConicalFrustum.cpp
index 51ceb807dc2..c605fc20aa3 100644
--- a/src/shapes/src/HollowConicalFrustum.cpp
+++ b/src/shapes/src/HollowConicalFrustum.cpp
@@ -20,10 +20,9 @@
 #include <shapes/HollowConicalFrustum.hpp>
 
 #include <utils/Vector.hpp>
+#include <utils/math/abs.hpp>
 #include <utils/math/coordinate_transformation.hpp>
 
-#include <cmath>
-
 namespace Shapes {
 void HollowConicalFrustum::calculate_dist(const Utils::Vector3d &pos,
                                           double &dist,
@@ -71,7 +70,7 @@ void HollowConicalFrustum::calculate_dist(const Utils::Vector3d &pos,
    */
 
   auto endpoint_angle = pos_phi;
-  if (std::fabs(pos_phi) < m_central_angle / 2.) {
+  if (Utils::abs(pos_phi) < m_central_angle / 2.) {
     // Cannot use Utils::sgn because of pos_phi==0 corner case
     endpoint_angle =
         pos_phi > 0. ? m_central_angle / 2. : -m_central_angle / 2.;
@@ -94,7 +93,7 @@ void HollowConicalFrustum::calculate_dist(const Utils::Vector3d &pos,
   /* It can be that the projection onto the (infinite) line is outside the
    * frustum. In that case, the closest point is actually one of the endpoints.
    */
-  if (std::fabs(pos_closest_hcf_frame[2]) > m_length / 2.) {
+  if (Utils::abs(pos_closest_hcf_frame[2]) > m_length / 2.) {
     pos_closest_hcf_frame =
         pos_closest_hcf_frame[2] > 0. ? r1_endpoint : r2_endpoint;
   }
diff --git a/src/utils/include/utils/Vector.hpp b/src/utils/include/utils/Vector.hpp
index f1361fa9d6e..c4932207f26 100644
--- a/src/utils/include/utils/Vector.hpp
+++ b/src/utils/include/utils/Vector.hpp
@@ -39,7 +39,6 @@
 #include <initializer_list>
 #include <iterator>
 #include <numeric>
-#include <span>
 #include <type_traits>
 #include <vector>
 
@@ -80,21 +79,11 @@ template <typename T, std::size_t N> class Vector : public Array<T, N> {
 
 public:
   template <class Range>
-  explicit constexpr Vector(Range const &rng)
-      : Vector(std::begin(rng), std::end(rng)) {}
+  explicit Vector(Range const &rng) : Vector(std::begin(rng), std::end(rng)) {}
   explicit constexpr Vector(T const (&v)[N]) : Base() {
     copy_init(std::begin(v), std::end(v));
   }
 
-  explicit constexpr Vector(std::span<T> span) : Base() {
-    if (span.size() != N) {
-      throw std::length_error(
-          "Construction of Vector from Container of wrong length.");
-    }
-
-    copy_init(span.begin(), span.end());
-  }
-
   constexpr Vector(std::initializer_list<T> v) : Base() {
     if (N != v.size()) {
       throw std::length_error(
@@ -114,13 +103,14 @@ template <typename T, std::size_t N> class Vector : public Array<T, N> {
     }
   }
 
-  /** @brief Create a vector that has all entries set to the same value. */
-  DEVICE_QUALIFIER static constexpr Vector<T, N>
-  broadcast(typename Base::value_type const &value) {
-    Vector<T, N> ret{};
-    for (std::size_t i = 0u; i != N; ++i) {
-      ret[i] = value;
-    }
+  /**
+   * @brief Create a vector that has all entries set to
+   *         one value.
+   */
+  static Vector<T, N> broadcast(T const &s) {
+    Vector<T, N> ret;
+    std::fill(ret.begin(), ret.end(), s);
+
     return ret;
   }
 
@@ -128,17 +118,11 @@ template <typename T, std::size_t N> class Vector : public Array<T, N> {
 
   operator std::vector<T>() const { return as_vector(); }
 
-  constexpr std::span<T, N> as_span() const {
-    return std::span<T, N>(const_cast<T *>(begin()), size());
-  }
-
-  constexpr operator std::span<T, N>() const { return as_span(); }
-
   template <class U> explicit operator Vector<U, N>() const {
     Vector<U, N> ret;
 
     std::transform(begin(), end(), ret.begin(),
-                   [](auto const &e) { return static_cast<U>(e); });
+                   [](auto e) { return static_cast<U>(e); });
 
     return ret;
   }
@@ -156,7 +140,7 @@ template <typename T, std::size_t N> class Vector : public Array<T, N> {
   Vector &normalize() {
     auto const l = norm();
     if (l > T(0)) {
-      for (std::size_t i = 0u; i < N; ++i)
+      for (std::size_t i = 0; i < N; i++)
         this->operator[](i) /= l;
     }
 
@@ -263,7 +247,8 @@ template <std::size_t N, typename T>
 Vector<T, N> operator-(Vector<T, N> const &a) {
   Vector<T, N> ret;
 
-  std::transform(std::begin(a), std::end(a), std::begin(ret), std::negate<T>());
+  std::transform(std::begin(a), std::end(a), std::begin(ret),
+                 [](T const &v) { return -v; });
 
   return ret;
 }
@@ -315,15 +300,6 @@ Vector<T, N> operator/(Vector<T, N> const &a, T const &b) {
   return ret;
 }
 
-template <std::size_t N, typename T>
-Vector<T, N> operator/(T const &a, Vector<T, N> const &b) {
-  Vector<T, N> ret;
-
-  std::transform(std::begin(b), std::end(b), ret.begin(),
-                 [a](T const &val) { return a / val; });
-  return ret;
-}
-
 template <std::size_t N, typename T>
 Vector<T, N> &operator/=(Vector<T, N> &a, T const &b) {
   std::transform(std::begin(a), std::end(a), std::begin(a),
@@ -391,7 +367,7 @@ auto hadamard_product(Vector<T, N> const &a, Vector<U, N> const &b) {
 
   Vector<R, N> ret;
   std::transform(a.cbegin(), a.cend(), b.cbegin(), ret.begin(),
-                 [](auto const &ai, auto const &bi) { return ai * bi; });
+                 [](auto ai, auto bi) { return ai * bi; });
 
   return ret;
 }
@@ -434,7 +410,7 @@ auto hadamard_division(Vector<T, N> const &a, Vector<U, N> const &b) {
 
   Vector<R, N> ret;
   std::transform(a.cbegin(), a.cend(), b.cbegin(), ret.begin(),
-                 [](auto const &ai, auto const &bi) { return ai / bi; });
+                 [](auto ai, auto bi) { return ai / bi; });
 
   return ret;
 }
@@ -472,11 +448,11 @@ auto hadamard_division(T const &a, U const &b) {
 }
 
 template <typename T> Vector<T, 3> unit_vector(unsigned int i) {
-  if (i == 0u)
+  if (i == 0)
     return {T{1}, T{0}, T{0}};
-  if (i == 1u)
+  if (i == 1)
     return {T{0}, T{1}, T{0}};
-  if (i == 2u)
+  if (i == 2)
     return {T{0}, T{0}, T{1}};
   throw std::domain_error("coordinate out of range");
 }
diff --git a/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp b/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp
index 6893f04d58e..f3a055556bf 100644
--- a/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp
+++ b/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp
@@ -16,14 +16,13 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
+#ifndef ESPRESSO_CYLINDER_TRANSFORMATION_PARAMETERS_HPP
+#define ESPRESSO_CYLINDER_TRANSFORMATION_PARAMETERS_HPP
 
-#pragma once
-
-#include <cmath>
-#include <limits>
 #include <stdexcept>
 #include <string>
 
+#include <utils/math/abs.hpp>
 #include <utils/math/orthonormal_vec.hpp>
 
 namespace Utils {
@@ -64,28 +63,30 @@ class CylindricalTransformationParameters {
 
 private:
   void validate() const {
-    auto constexpr eps = 10. * std::numeric_limits<double>::epsilon();
-    if (std::fabs(m_orientation * m_axis) > eps) {
+    auto constexpr eps = 10 * std::numeric_limits<double>::epsilon();
+    if (Utils::abs(m_orientation * m_axis) > eps) {
       throw std::runtime_error(
           "CylindricalTransformationParameters: Axis and orientation must be "
           "orthogonal. Scalar product is " +
           std::to_string(m_orientation * m_axis));
     }
-    if (std::fabs(m_axis.norm() - 1.) > eps) {
+    if (Utils::abs(m_axis.norm() - 1) > eps) {
       throw std::runtime_error("CylindricalTransformationParameters: Axis must "
                                "be normalized. Norm is " +
                                std::to_string(m_axis.norm()));
     }
-    if (std::fabs(m_orientation.norm() - 1.) > eps) {
+    if (Utils::abs(m_orientation.norm() - 1) > eps) {
       throw std::runtime_error("CylindricalTransformationParameters: "
                                "orientation must be normalized. Norm is " +
                                std::to_string(m_orientation.norm()));
     }
   }
 
-  Utils::Vector3d const m_center{};
-  Utils::Vector3d const m_axis{0., 0., 1.};
-  Utils::Vector3d const m_orientation{1., 0., 0.};
+  const Utils::Vector3d m_center{};
+  const Utils::Vector3d m_axis{0, 0, 1};
+  const Utils::Vector3d m_orientation{1, 0, 0};
 };
 
 } // namespace Utils
+
+#endif // ESPRESSO_CYLINDER_TRANSFORMATION_PARAMETERS_HPP
diff --git a/src/utils/include/utils/math/sqr.hpp b/src/utils/include/utils/math/sqr.hpp
index a5ac0f25969..dae6ee6d8af 100644
--- a/src/utils/include/utils/math/sqr.hpp
+++ b/src/utils/include/utils/math/sqr.hpp
@@ -1,7 +1,5 @@
 /*
  * Copyright (C) 2010-2022 The ESPResSo project
- * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
- *   Max-Planck-Institute for Polymer Research, Theory Group
  *
  * This file is part of ESPResSo.
  *
diff --git a/src/utils/include/utils/quaternion.hpp b/src/utils/include/utils/quaternion.hpp
index d74b2161ec5..5a8c03b3251 100644
--- a/src/utils/include/utils/quaternion.hpp
+++ b/src/utils/include/utils/quaternion.hpp
@@ -42,7 +42,6 @@
 #include "utils/Array.hpp"
 #include "utils/Vector.hpp"
 #include "utils/matrix.hpp"
-#include "utils/serialization/array.hpp"
 
 #include <cassert>
 #include <cstddef>
diff --git a/src/utils/tests/CMakeLists.txt b/src/utils/tests/CMakeLists.txt
index 89755749fab..2dd87d9f775 100644
--- a/src/utils/tests/CMakeLists.txt
+++ b/src/utils/tests/CMakeLists.txt
@@ -21,6 +21,7 @@
 
 include(unit_test)
 
+unit_test(NAME abs_test SRC abs_test.cpp DEPENDS espresso::utils)
 unit_test(NAME Vector_test SRC Vector_test.cpp DEPENDS espresso::utils)
 unit_test(NAME Factory_test SRC Factory_test.cpp DEPENDS espresso::utils)
 unit_test(NAME NumeratedContainer_test SRC NumeratedContainer_test.cpp DEPENDS
@@ -33,6 +34,7 @@ unit_test(NAME accumulator SRC accumulator.cpp DEPENDS espresso::utils
 unit_test(NAME int_pow SRC int_pow_test.cpp DEPENDS espresso::utils)
 unit_test(NAME sgn SRC sgn_test.cpp DEPENDS espresso::utils)
 unit_test(NAME AS_erfc_part SRC AS_erfc_part_test.cpp DEPENDS espresso::utils)
+unit_test(NAME sinc SRC sinc_test.cpp DEPENDS espresso::utils)
 unit_test(NAME permute_ifield_test SRC permute_ifield_test.cpp DEPENDS
           espresso::utils)
 unit_test(NAME vec_rotate SRC vec_rotate_test.cpp DEPENDS espresso::utils)
diff --git a/src/utils/tests/Vector_test.cpp b/src/utils/tests/Vector_test.cpp
index b32fa4e9095..716c5276c01 100644
--- a/src/utils/tests/Vector_test.cpp
+++ b/src/utils/tests/Vector_test.cpp
@@ -43,8 +43,7 @@
 using Utils::Vector;
 
 /* Number of nontrivial Baxter permutations of length 2n-1. (A001185) */
-#define TEST_NUMBERS                                                           \
-  { 0, 1, 1, 7, 21, 112, 456, 2603, 13203 }
+#define TEST_NUMBERS {0, 1, 1, 7, 21, 112, 456, 2603, 13203}
 
 constexpr int test_numbers[] = TEST_NUMBERS;
 constexpr std::size_t n_test_numbers = sizeof(test_numbers) / sizeof(int);
@@ -63,17 +62,6 @@ BOOST_AUTO_TEST_CASE(initializer_list_constructor) {
   BOOST_CHECK_THROW(Utils::Vector2d({1., 2., 3.}), std::length_error);
 }
 
-BOOST_AUTO_TEST_CASE(span_constructor) {
-  constexpr static std::array<int, n_test_numbers> values(TEST_NUMBERS);
-  constexpr auto view = std::span(values);
-  Vector<int, n_test_numbers> v(view);
-
-  BOOST_CHECK(std::equal(v.begin(), v.end(), test_numbers));
-
-  BOOST_CHECK_THROW(Utils::Vector3i{view}, std::length_error);
-  BOOST_CHECK_THROW(Utils::Vector3i{view.subspan(0u, 2u)}, std::length_error);
-}
-
 BOOST_AUTO_TEST_CASE(iterator_constructor) {
   Vector<int, n_test_numbers> v(std::begin(test_numbers),
                                 std::end(test_numbers));
@@ -117,8 +105,8 @@ BOOST_AUTO_TEST_CASE(range_constructor_test) {
 }
 
 BOOST_AUTO_TEST_CASE(unit_vector_test) {
-  BOOST_CHECK((Utils::unit_vector<int>(2u) == Utils::Vector3i{0, 0, 1}));
-  BOOST_CHECK_THROW(Utils::unit_vector<double>(3u), std::domain_error);
+  BOOST_CHECK((Utils::unit_vector<int>(2) == Utils::Vector3i{0, 0, 1}));
+  BOOST_CHECK_THROW(Utils::unit_vector<double>(3), std::domain_error);
 }
 
 BOOST_AUTO_TEST_CASE(test_norm2) {
@@ -199,13 +187,6 @@ BOOST_AUTO_TEST_CASE(algebraic_operators) {
     BOOST_CHECK(v4 == (v3 /= 2));
   }
 
-  {
-    Utils::Vector3i v3{2, 12, 91};
-    Utils::Vector3i v4{180, 30, 3};
-    auto v5 = 360 / v3;
-    BOOST_CHECK(v5 == v4);
-  }
-
   BOOST_CHECK((sqrt(Utils::Vector3d{1., 2., 3.}) ==
                Utils::Vector3d{sqrt(1.), sqrt(2.), sqrt(3.)}));
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
index 5c1e4962691..5f618e22985 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
@@ -585,9 +585,9 @@ class LBWalberlaImpl : public LBWalberlaBase {
     auto const omega_odd = odd_mode_relaxation_rate(omega);
     auto const blocks = get_lattice().get_blocks();
     m_kT = FloatType_c(kT);
-    auto obj = CollisionModelThermalized(m_last_applied_force_field_id,
-                                         m_pdf_field_id, m_kT, omega, omega,
-                                         omega_odd, omega, seed, uint32_t{0u});
+    auto obj = CollisionModelThermalized(
+        m_last_applied_force_field_id, m_pdf_field_id, m_kT, omega, omega,
+        omega_odd, omega, seed, uint32_t{0u});
     m_collision_model = std::make_shared<CollisionModel>(std::move(obj));
     m_run_collide_sweep = CollideSweepVisitor(blocks);
   }
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.h
index fb144478e13..4535e2fc718 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.h
@@ -56,7 +56,7 @@ class CollideSweepDoublePrecisionLeesEdwards {
                                          BlockDataID pdfsID_, double grid_size,
                                          double omega_shear, double v_s)
       : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size),
-        omega_shear_(omega_shear), v_s_(v_s){};
+        omega_shear_(omega_shear), v_s_(v_s) {};
 
   void run(IBlock *block);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.h
index dfd7bcdde63..2e2e5bd1bd3 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.h
@@ -57,7 +57,7 @@ class CollideSweepDoublePrecisionLeesEdwardsAVX {
                                             double grid_size,
                                             double omega_shear, double v_s)
       : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size),
-        omega_shear_(omega_shear), v_s_(v_s){};
+        omega_shear_(omega_shear), v_s_(v_s) {};
 
   void run(IBlock *block);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.h
index 276592774e1..00d995d043a 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.h
@@ -59,7 +59,7 @@ class CollideSweepDoublePrecisionLeesEdwardsCUDA {
                                              double grid_size,
                                              double omega_shear, double v_s)
       : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size),
-        omega_shear_(omega_shear), v_s_(v_s){};
+        omega_shear_(omega_shear), v_s_(v_s) {};
 
   void run(IBlock *block, gpuStream_t stream = nullptr);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.h
index 192b2fcf077..08e9d8c04ff 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.h
@@ -60,7 +60,7 @@ class CollideSweepDoublePrecisionThermalized {
       : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk),
         omega_even_(omega_even), omega_odd_(omega_odd),
         omega_shear_(omega_shear), seed_(seed), time_step_(time_step),
-        configured_(false){};
+        configured_(false) {};
 
   void run(IBlock *block);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.h
index 00c0575773c..505cb1a7479 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.h
@@ -61,7 +61,7 @@ class CollideSweepDoublePrecisionThermalizedAVX {
       : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk),
         omega_even_(omega_even), omega_odd_(omega_odd),
         omega_shear_(omega_shear), seed_(seed), time_step_(time_step),
-        configured_(false){};
+        configured_(false) {};
 
   void run(IBlock *block);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.h
index bc7ac5f69e7..4683db87f80 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.h
@@ -61,7 +61,7 @@ class CollideSweepDoublePrecisionThermalizedCUDA {
       : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk),
         omega_even_(omega_even), omega_odd_(omega_odd),
         omega_shear_(omega_shear), seed_(seed), time_step_(time_step),
-        configured_(false){};
+        configured_(false) {};
 
   void run(IBlock *block, gpuStream_t stream = nullptr);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.h
index 58c30c1bf4e..3d6cbc09ef8 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.h
@@ -56,7 +56,7 @@ class CollideSweepSinglePrecisionLeesEdwards {
                                          BlockDataID pdfsID_, float grid_size,
                                          float omega_shear, float v_s)
       : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size),
-        omega_shear_(omega_shear), v_s_(v_s){};
+        omega_shear_(omega_shear), v_s_(v_s) {};
 
   void run(IBlock *block);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.h
index 2819ac83a73..39703996832 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.h
@@ -57,7 +57,7 @@ class CollideSweepSinglePrecisionLeesEdwardsAVX {
                                             float grid_size, float omega_shear,
                                             float v_s)
       : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size),
-        omega_shear_(omega_shear), v_s_(v_s){};
+        omega_shear_(omega_shear), v_s_(v_s) {};
 
   void run(IBlock *block);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.h
index bf9a807a67f..57e7980e194 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.h
@@ -59,7 +59,7 @@ class CollideSweepSinglePrecisionLeesEdwardsCUDA {
                                              float grid_size, float omega_shear,
                                              float v_s)
       : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size),
-        omega_shear_(omega_shear), v_s_(v_s){};
+        omega_shear_(omega_shear), v_s_(v_s) {};
 
   void run(IBlock *block, gpuStream_t stream = nullptr);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.h
index 5a9aa9eb3fc..44fef35df24 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.h
@@ -60,7 +60,7 @@ class CollideSweepSinglePrecisionThermalized {
       : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk),
         omega_even_(omega_even), omega_odd_(omega_odd),
         omega_shear_(omega_shear), seed_(seed), time_step_(time_step),
-        configured_(false){};
+        configured_(false) {};
 
   void run(IBlock *block);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.h
index b5955f045db..8cd829be96c 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.h
@@ -60,7 +60,7 @@ class CollideSweepSinglePrecisionThermalizedAVX {
       : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk),
         omega_even_(omega_even), omega_odd_(omega_odd),
         omega_shear_(omega_shear), seed_(seed), time_step_(time_step),
-        configured_(false){};
+        configured_(false) {};
 
   void run(IBlock *block);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.h
index cf55997b2b5..09e1dd04acc 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.h
@@ -62,7 +62,7 @@ class CollideSweepSinglePrecisionThermalizedCUDA {
       : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk),
         omega_even_(omega_even), omega_odd_(omega_odd),
         omega_shear_(omega_shear), seed_(seed), time_step_(time_step),
-        configured_(false){};
+        configured_(false) {};
 
   void run(IBlock *block, gpuStream_t stream = nullptr);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.h
index c472f1f5112..012a4bcc140 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.h
@@ -55,7 +55,7 @@ class InitialPDFsSetterDoublePrecision {
   InitialPDFsSetterDoublePrecision(BlockDataID forceID_, BlockDataID pdfsID_,
                                    BlockDataID velocityID_, double rho_0)
       : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_),
-        rho_0_(rho_0){};
+        rho_0_(rho_0) {};
 
   void run(IBlock *block);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.h
index 6f4ab75275a..6d493acdd84 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.h
@@ -58,7 +58,7 @@ class InitialPDFsSetterDoublePrecisionCUDA {
                                        BlockDataID pdfsID_,
                                        BlockDataID velocityID_, double rho_0)
       : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_),
-        rho_0_(rho_0){};
+        rho_0_(rho_0) {};
 
   void run(IBlock *block, gpuStream_t stream = nullptr);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.h
index e1c3260bab9..4a8ea571625 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.h
@@ -55,7 +55,7 @@ class InitialPDFsSetterSinglePrecision {
   InitialPDFsSetterSinglePrecision(BlockDataID forceID_, BlockDataID pdfsID_,
                                    BlockDataID velocityID_, float rho_0)
       : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_),
-        rho_0_(rho_0){};
+        rho_0_(rho_0) {};
 
   void run(IBlock *block);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.h
index a3faac64d68..c56b68680ac 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.h
@@ -58,7 +58,7 @@ class InitialPDFsSetterSinglePrecisionCUDA {
                                        BlockDataID pdfsID_,
                                        BlockDataID velocityID_, float rho_0)
       : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_),
-        rho_0_(rho_0){};
+        rho_0_(rho_0) {};
 
   void run(IBlock *block, gpuStream_t stream = nullptr);
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.h
index 87fa2aa0336..73819b58e6f 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.h
@@ -54,7 +54,7 @@ class StreamSweepDoublePrecision {
 public:
   StreamSweepDoublePrecision(BlockDataID forceID_, BlockDataID pdfsID_,
                              BlockDataID velocityID_)
-      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){};
+      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_) {};
 
   ~StreamSweepDoublePrecision() {
     for (auto p : cache_pdfs_) {
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.h
index 52288021ad8..059f786e6b1 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.h
@@ -54,7 +54,7 @@ class StreamSweepDoublePrecisionAVX {
 public:
   StreamSweepDoublePrecisionAVX(BlockDataID forceID_, BlockDataID pdfsID_,
                                 BlockDataID velocityID_)
-      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){};
+      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_) {};
 
   ~StreamSweepDoublePrecisionAVX() {
     for (auto p : cache_pdfs_) {
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.h
index 217f90854e2..d6dd319394b 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.h
@@ -56,7 +56,7 @@ class StreamSweepDoublePrecisionCUDA {
 public:
   StreamSweepDoublePrecisionCUDA(BlockDataID forceID_, BlockDataID pdfsID_,
                                  BlockDataID velocityID_)
-      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){};
+      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_) {};
 
   ~StreamSweepDoublePrecisionCUDA() {
     for (auto p : cache_pdfs_) {
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.h
index 4425265fc6c..f921333044a 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.h
@@ -54,7 +54,7 @@ class StreamSweepSinglePrecision {
 public:
   StreamSweepSinglePrecision(BlockDataID forceID_, BlockDataID pdfsID_,
                              BlockDataID velocityID_)
-      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){};
+      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_) {};
 
   ~StreamSweepSinglePrecision() {
     for (auto p : cache_pdfs_) {
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.h
index 095c86c6cb9..81cc4fff038 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.h
@@ -54,7 +54,7 @@ class StreamSweepSinglePrecisionAVX {
 public:
   StreamSweepSinglePrecisionAVX(BlockDataID forceID_, BlockDataID pdfsID_,
                                 BlockDataID velocityID_)
-      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){};
+      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_) {};
 
   ~StreamSweepSinglePrecisionAVX() {
     for (auto p : cache_pdfs_) {
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.h
index 302b62816b7..16ea5b0ed2c 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.h
@@ -56,7 +56,7 @@ class StreamSweepSinglePrecisionCUDA {
 public:
   StreamSweepSinglePrecisionCUDA(BlockDataID forceID_, BlockDataID pdfsID_,
                                  BlockDataID velocityID_)
-      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){};
+      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_) {};
 
   ~StreamSweepSinglePrecisionCUDA() {
     for (auto p : cache_pdfs_) {
diff --git a/testsuite/python/lb.py b/testsuite/python/lb.py
index e5a5ebe9ab3..66acb528814 100644
--- a/testsuite/python/lb.py
+++ b/testsuite/python/lb.py
@@ -635,10 +635,10 @@ def test_rng(self):
         system = self.system
         system.lb = self.lb_class(kT=15., **self.params, **self.lb_params)
         system.integrator.run(1)
-        diff = system.lb[0, :, :].population - system.lb[6, :, :].population
+        difference = system.lb[0,:,:].population - system.lb[6,:,:].population
         # if the RNG uses the local cell index instead of the global cell index,
         # the noise will be identical in all blocks, and the RMS is zero
-        rms = np.sqrt(np.mean(np.square(diff)))
+        rms = np.sqrt(np.mean(np.square(difference)))
         self.assertGreater(rms, 0.01, msg="thermal noise might be correlated!")
 
     def test_thermalization_force_balance(self):
diff --git a/testsuite/python/lb_boundary_ghost_layer.py b/testsuite/python/lb_boundary_ghost_layer.py
index 84ce9180f01..7170f2505f5 100644
--- a/testsuite/python/lb_boundary_ghost_layer.py
+++ b/testsuite/python/lb_boundary_ghost_layer.py
@@ -66,7 +66,7 @@ def quadratic(x, a, b, c):
         popt_ref = (4e-8, -1e-6, 1e-5)
         popt, _ = scipy.optimize.curve_fit(
             quadratic, xdata, ydata, p0=popt_ref)
-        rtol = 0.33 if self.lbf.single_precision else 0.1
+        rtol = 0.3 if self.lbf.single_precision else 0.1
         np.testing.assert_allclose(popt, popt_ref, rtol=0.5, atol=0.)
         np.testing.assert_allclose(ydata, quadratic(xdata, *popt),
                                    rtol=rtol, atol=0.)