From 77eb93a0d1dad5afcb18f6c4ed7a3c5ae2e2c0bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 14 Jun 2024 19:08:23 +0200 Subject: [PATCH] fft --- src/core/electrostatics/p3m.cpp | 245 ++++++------ src/core/electrostatics/p3m_gpu_cuda.cu | 94 ++--- src/core/electrostatics/p3m_gpu_error_cuda.cu | 55 +-- src/core/magnetostatics/dp3m.cpp | 360 ++++++++---------- src/core/p3m/common.cpp | 44 --- src/core/p3m/common.hpp | 16 +- src/core/p3m/fft.cpp | 2 +- src/core/p3m/fft.hpp | 43 +-- src/core/p3m/influence_function.hpp | 115 +++--- src/core/p3m/influence_function_dipolar.hpp | 219 ++++++----- src/core/unit_tests/CMakeLists.txt | 4 +- src/core/unit_tests/cuda_test.cu | 329 ++++++++-------- src/core/unit_tests/fft_test.cpp | 58 ++- src/core/unit_tests/p3m_test.cpp | 26 -- src/utils/include/utils/Vector.hpp | 2 +- src/utils/include/utils/math/abs.hpp | 40 -- src/utils/include/utils/math/sinc.hpp | 61 --- src/utils/tests/CMakeLists.txt | 2 - src/utils/tests/abs_test.cpp | 38 -- src/utils/tests/sinc_test.cpp | 39 -- 20 files changed, 741 insertions(+), 1051 deletions(-) delete mode 100644 src/utils/include/utils/math/abs.hpp delete mode 100644 src/utils/include/utils/math/sinc.hpp delete mode 100644 src/utils/tests/abs_test.cpp delete mode 100644 src/utils/tests/sinc_test.cpp diff --git a/src/core/electrostatics/p3m.cpp b/src/core/electrostatics/p3m.cpp index e247b88418d..32667d67966 100644 --- a/src/core/electrostatics/p3m.cpp +++ b/src/core/electrostatics/p3m.cpp @@ -36,7 +36,9 @@ #include "p3m/TuningAlgorithm.hpp" #include "p3m/TuningLogger.hpp" #include "p3m/fft.hpp" +#include "p3m/for_each_3d.hpp" #include "p3m/influence_function.hpp" +#include "p3m/math.hpp" #include "BoxGeometry.hpp" #include "LocalBox.hpp" @@ -57,7 +59,6 @@ #include #include #include -#include #include #include @@ -75,9 +76,11 @@ #include #include #include +#include #include #include #include +#include #include void CoulombP3M::count_charged_particles() { @@ -114,7 +117,7 @@ void CoulombP3M::calc_influence_function_force() { auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh}; p3m.g_force = grid_influence_function<1>(p3m.params, start, start + size, - get_system().box_geo->length()); + get_system().box_geo->length_inv()); } /** Calculate the influence function optimized for the energy and the @@ -125,41 +128,38 @@ void CoulombP3M::calc_influence_function_energy() { auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh}; p3m.g_energy = grid_influence_function<0>(p3m.params, start, start + size, - get_system().box_geo->length()); + get_system().box_geo->length_inv()); } /** Aliasing sum used by @ref p3m_k_space_error. */ -static auto p3m_tune_aliasing_sums(int nx, int ny, int nz, +static auto p3m_tune_aliasing_sums(Utils::Vector3i const &shift, Utils::Vector3i const &mesh, Utils::Vector3d const &mesh_i, int cao, double alpha_L_i) { - using Utils::sinc; + auto constexpr m_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN); + auto constexpr m_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1); auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i); - auto alias1 = 0.; auto alias2 = 0.; - for (int mx = -P3M_BRILLOUIN; mx <= P3M_BRILLOUIN; mx++) { - auto const nmx = nx + mx * mesh[0]; - auto const fnmx = mesh_i[0] * nmx; - for (int my = -P3M_BRILLOUIN; my <= P3M_BRILLOUIN; my++) { - auto const nmy = ny + my * mesh[1]; - auto const fnmy = mesh_i[1] * nmy; - for (int mz = -P3M_BRILLOUIN; mz <= P3M_BRILLOUIN; mz++) { - auto const nmz = nz + mz * mesh[2]; - auto const fnmz = mesh_i[2] * nmz; - - auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz); - auto const ex = exp(-factor1 * nm2); - auto const ex2 = Utils::sqr(ex); - - auto const U2 = pow(sinc(fnmx) * sinc(fnmy) * sinc(fnmz), 2.0 * cao); - - alias1 += ex2 / nm2; - alias2 += U2 * ex * (nx * nmx + ny * nmy + nz * nmz) / nm2; - } - } - } + + Utils::Vector3i indices{}; + Utils::Vector3i nm{}; + Utils::Vector3d fnm{}; + for_each_3d( + m_start, m_stop, indices, + [&]() { + auto const norm_sq = nm.norm2(); + auto const ex = exp(-factor1 * norm_sq); + auto const energy = std::pow(Utils::product(fnm), 2 * cao); + alias1 += Utils::sqr(ex) / norm_sq; + alias2 += energy * ex * (shift * nm) / norm_sq; + }, + [&](unsigned dim, int n) { + nm[dim] = shift[dim] + n * mesh[dim]; + fnm[dim] = math::sinc(nm[dim] * mesh_i[dim]); + }); + return std::make_pair(alias1, alias2); } @@ -197,51 +197,40 @@ static double p3m_real_space_error(double pref, double r_cut_iL, int n_c_part, static double p3m_k_space_error(double pref, Utils::Vector3i const &mesh, int cao, int n_c_part, double sum_q2, double alpha_L, Utils::Vector3d const &box_l) { - auto const mesh_i = - Utils::hadamard_division(Utils::Vector3d::broadcast(1.), mesh); + + auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao); + auto const mesh_i = 1. / Utils::Vector3d(mesh); auto const alpha_L_i = 1. / alpha_L; + auto const m_stop = mesh / 2; + auto const m_start = -m_stop; + auto indices = Utils::Vector3i{}; + auto values = Utils::Vector3d{}; auto he_q = 0.; - for (int nx = -mesh[0] / 2; nx < mesh[0] / 2; nx++) { - auto const ctan_x = p3m_analytic_cotangent_sum(nx, mesh_i[0], cao); - for (int ny = -mesh[1] / 2; ny < mesh[1] / 2; ny++) { - auto const ctan_y = - ctan_x * p3m_analytic_cotangent_sum(ny, mesh_i[1], cao); - for (int nz = -mesh[2] / 2; nz < mesh[2] / 2; nz++) { - if ((nx != 0) || (ny != 0) || (nz != 0)) { - auto const n2 = Utils::sqr(nx) + Utils::sqr(ny) + Utils::sqr(nz); - auto const cs = - p3m_analytic_cotangent_sum(nz, mesh_i[2], cao) * ctan_y; + for_each_3d( + m_start, m_stop, indices, + [&]() { + if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) { + auto const n2 = indices.norm2(); + auto const cs = Utils::product(values); auto const [alias1, alias2] = - p3m_tune_aliasing_sums(nx, ny, nz, mesh, mesh_i, cao, alpha_L_i); + p3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i); auto const d = alias1 - Utils::sqr(alias2 / cs) / n2; /* at high precision, d can become negative due to extinction; also, don't take values that have no significant digits left*/ - if (d > 0 && (fabs(d / alias1) > ROUND_ERROR_PREC)) + if (d > 0. and std::fabs(d / alias1) > ROUND_ERROR_PREC) { he_q += d; + } } - } - } - } + }, + [&values, &mesh_i, cotangent_sum](unsigned dim, int n) { + values[dim] = cotangent_sum(n, mesh_i[dim]); + }); + return 2. * pref * sum_q2 * sqrt(he_q / static_cast(n_c_part)) / (box_l[1] * box_l[2]); } -#ifdef CUDA -static double p3mgpu_k_space_error(double prefactor, - Utils::Vector3i const &mesh, int cao, - int npart, double sum_q2, double alpha_L, - Utils::Vector3d const &box_l) { - auto ks_err = 0.; - if (this_node == 0) { - ks_err = p3m_k_space_error_gpu(prefactor, mesh.data(), cao, npart, sum_q2, - alpha_L, box_l.data()); - } - boost::mpi::broadcast(comm_cart, ks_err, 0); - return ks_err; -} -#endif - void CoulombP3M::init() { assert(p3m.params.mesh >= Utils::Vector3i::broadcast(1)); assert(p3m.params.cao >= 1 and p3m.params.cao <= 7); @@ -421,48 +410,44 @@ CoulombP3M::long_range_pressure(ParticleRange const &particles) { p3m.sm.gather_grid(p3m.rs_mesh.data(), comm_cart, p3m.local_mesh.dim); fft_perform_forw(p3m.rs_mesh.data(), p3m.fft, comm_cart); - auto diagonal = 0.; - int ind = 0; - int j[3]; + auto constexpr m_start = Utils::Vector3i::broadcast(0); auto const half_alpha_inv_sq = Utils::sqr(1. / 2. / p3m.params.alpha); - for (j[0] = 0; j[0] < p3m.fft.plan[3].new_mesh[RX]; j[0]++) { - for (j[1] = 0; j[1] < p3m.fft.plan[3].new_mesh[RY]; j[1]++) { - for (j[2] = 0; j[2] < p3m.fft.plan[3].new_mesh[RZ]; j[2]++) { - auto const kx = 2. * std::numbers::pi * - p3m.d_op[RX][j[KX] + p3m.fft.plan[3].start[KX]] * - box_geo.length_inv()[RX]; - auto const ky = 2. * std::numbers::pi * - p3m.d_op[RY][j[KY] + p3m.fft.plan[3].start[KY]] * - box_geo.length_inv()[RY]; - auto const kz = 2. * std::numbers::pi * - p3m.d_op[RZ][j[KZ] + p3m.fft.plan[3].start[KZ]] * - box_geo.length_inv()[RZ]; - auto const sqk = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz); - - if (sqk != 0.) { - auto const node_k_space_energy = - p3m.g_energy[ind] * (Utils::sqr(p3m.rs_mesh[2 * ind]) + - Utils::sqr(p3m.rs_mesh[2 * ind + 1])); - auto const vterm = -2. * (1. / sqk + half_alpha_inv_sq); - auto const pref = node_k_space_energy * vterm; - node_k_space_pressure_tensor[0] += pref * kx * kx; /* sigma_xx */ - node_k_space_pressure_tensor[1] += pref * kx * ky; /* sigma_xy */ - node_k_space_pressure_tensor[2] += pref * kx * kz; /* sigma_xz */ - node_k_space_pressure_tensor[3] += pref * ky * kx; /* sigma_yx */ - node_k_space_pressure_tensor[4] += pref * ky * ky; /* sigma_yy */ - node_k_space_pressure_tensor[5] += pref * ky * kz; /* sigma_yz */ - node_k_space_pressure_tensor[6] += pref * kz * kx; /* sigma_zx */ - node_k_space_pressure_tensor[7] += pref * kz * ky; /* sigma_zy */ - node_k_space_pressure_tensor[8] += pref * kz * kz; /* sigma_zz */ - diagonal += node_k_space_energy; - } - ind++; - } + auto const m_stop = std::span(p3m.fft.plan[3].new_mesh); + auto const offset = std::span(p3m.fft.plan[3].start); + auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv(); + auto indices = Utils::Vector3i{}; + auto index = std::size_t(0u); + auto diagonal = 0.; + + for_each_3d(m_start, m_stop, indices, [&]() { + auto const kx = p3m.d_op[RX][indices[KX] + offset[KX]] * wavevector[RX]; + auto const ky = p3m.d_op[RY][indices[KY] + offset[KY]] * wavevector[RY]; + auto const kz = p3m.d_op[RZ][indices[KZ] + offset[KZ]] * wavevector[RZ]; + auto const norm_sq = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz); + + if (norm_sq != 0.) { + auto const node_k_space_energy = + p3m.g_energy[index] * (Utils::sqr(p3m.rs_mesh[2u * index + 0u]) + + Utils::sqr(p3m.rs_mesh[2u * index + 1u])); + auto const vterm = -2. * (1. / norm_sq + half_alpha_inv_sq); + auto const pref = node_k_space_energy * vterm; + node_k_space_pressure_tensor[0u] += pref * kx * kx; /* sigma_xx */ + node_k_space_pressure_tensor[1u] += pref * kx * ky; /* sigma_xy */ + node_k_space_pressure_tensor[2u] += pref * kx * kz; /* sigma_xz */ + node_k_space_pressure_tensor[4u] += pref * ky * ky; /* sigma_yy */ + node_k_space_pressure_tensor[5u] += pref * ky * kz; /* sigma_yz */ + node_k_space_pressure_tensor[8u] += pref * kz * kz; /* sigma_zz */ + diagonal += node_k_space_energy; } - } - node_k_space_pressure_tensor[0] += diagonal; - node_k_space_pressure_tensor[4] += diagonal; - node_k_space_pressure_tensor[8] += diagonal; + ++index; + }); + + node_k_space_pressure_tensor[0u] += diagonal; + node_k_space_pressure_tensor[4u] += diagonal; + node_k_space_pressure_tensor[8u] += diagonal; + node_k_space_pressure_tensor[3u] = node_k_space_pressure_tensor[1u]; + node_k_space_pressure_tensor[6u] = node_k_space_pressure_tensor[2u]; + node_k_space_pressure_tensor[7u] = node_k_space_pressure_tensor[5u]; } return node_k_space_pressure_tensor * prefactor / (2. * box_geo.volume()); @@ -509,35 +494,35 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, /* === k-space force calculation === */ if (force_flag) { - /* sqrt(-1)*k differentiation */ - int j[3]; - int ind = 0; + /* i*k differentiation */ + auto constexpr m_start = Utils::Vector3i::broadcast(0); + auto const m_stop = std::span(p3m.fft.plan[3].new_mesh); + auto const offset = std::span(p3m.fft.plan[3].start); + auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv(); + auto indices = Utils::Vector3i{}; + auto index = std::size_t(0u); + /* compute electric field */ // Eq. (3.49) @cite deserno00b - for (j[0] = 0; j[0] < p3m.fft.plan[3].new_mesh[0]; j[0]++) { - for (j[1] = 0; j[1] < p3m.fft.plan[3].new_mesh[1]; j[1]++) { - for (j[2] = 0; j[2] < p3m.fft.plan[3].new_mesh[2]; j[2]++) { - auto const rho_hat = std::complex(p3m.rs_mesh[2 * ind + 0], - p3m.rs_mesh[2 * ind + 1]); - auto const phi_hat = p3m.g_force[ind] * rho_hat; - - for (int d = 0; d < 3; d++) { - /* direction in r-space: */ - int d_rs = (d + p3m.ks_pnum) % 3; - /* directions */ - auto const k = 2. * std::numbers::pi * - p3m.d_op[d_rs][j[d] + p3m.fft.plan[3].start[d]] * - box_geo.length_inv()[d_rs]; - - /* i*k*(Re+i*Im) = - Im*k + i*Re*k (i=sqrt(-1)) */ - p3m.E_mesh[d_rs][2 * ind + 0] = -k * phi_hat.imag(); - p3m.E_mesh[d_rs][2 * ind + 1] = +k * phi_hat.real(); - } - - ind++; - } + for_each_3d(m_start, m_stop, indices, [&]() { + auto const rho_hat = std::complex(p3m.rs_mesh[2u * index + 0u], + p3m.rs_mesh[2u * index + 1u]); + auto const phi_hat = p3m.g_force[index] * rho_hat; + + for (int d = 0; d < 3; d++) { + /* direction in r-space: */ + int d_rs = (d + p3m.ks_pnum) % 3; + /* directions */ + auto const k = + p3m.d_op[d_rs][indices[d] + offset[d]] * wavevector[d_rs]; + + /* i*k*(Re+i*Im) = - Im*k + i*Re*k (i=sqrt(-1)) */ + p3m.E_mesh[d_rs][2u * index + 0u] = -k * phi_hat.imag(); + p3m.E_mesh[d_rs][2u * index + 1u] = +k * phi_hat.real(); } - } + + ++index; + }); /* Back FFT force component mesh */ auto const check_complex = !p3m.params.tuning and check_complex_residuals; @@ -678,8 +663,12 @@ class CoulombTuningAlgorithm : public TuningAlgorithm { #ifdef CUDA auto const &solver = m_system.coulomb.impl->solver; if (has_actor_of_type(solver)) { - ks_err = p3mgpu_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart, - p3m.sum_q2, alpha_L, box_geo.length()); + if (this_node == 0) { + ks_err = + p3m_k_space_error_gpu(m_prefactor, mesh.data(), cao, p3m.sum_qpart, + p3m.sum_q2, alpha_L, box_geo.length().data()); + } + boost::mpi::broadcast(comm_cart, ks_err, 0); } else #endif ks_err = p3m_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart, @@ -807,7 +796,7 @@ void CoulombP3M::sanity_checks_boxl() const { auto const &system = get_system(); auto const &box_geo = *system.box_geo; auto const &local_geo = *system.local_geo; - for (unsigned int i = 0u; i < 3u; i++) { + for (auto i = 0u; i < 3u; i++) { /* check k-space cutoff */ if (p3m.params.cao_cut[i] >= box_geo.length_half()[i]) { std::stringstream msg; diff --git a/src/core/electrostatics/p3m_gpu_cuda.cu b/src/core/electrostatics/p3m_gpu_cuda.cu index 6429c8afeed..c734ba0cccd 100644 --- a/src/core/electrostatics/p3m_gpu_cuda.cu +++ b/src/core/electrostatics/p3m_gpu_cuda.cu @@ -53,20 +53,20 @@ #include "electrostatics/p3m_gpu_cuda.cuh" #include "cuda/utils.cuh" +#include "p3m/math.hpp" #include "system/System.hpp" #include #include -#include #include #include #include #include +#include #include -#include -#include +#include #include #include @@ -93,7 +93,7 @@ struct P3MGpuData { /** Ewald parameter */ REAL_TYPE alpha; /** Number of particles */ - std::size_t n_part; + unsigned int n_part; // oddity: size_t causes UB with GCC 11.4 in Debug mode /** Box size */ REAL_TYPE box[3]; /** Mesh dimensions */ @@ -138,6 +138,31 @@ struct P3MGpuParams { } }; +static auto p3m_calc_blocks(unsigned int cao, std::size_t n_part) { + auto const cao3 = Utils::int_pow<3>(cao); + auto parts_per_block = 1u; + while ((parts_per_block + 1u) * cao3 <= 1024u) { + ++parts_per_block; + } + assert((n_part / parts_per_block) <= std::numeric_limits::max()); + auto n = static_cast(n_part / parts_per_block); + auto n_blocks = ((n_part % parts_per_block) == 0u) ? std::max(1u, n) : n + 1u; + assert(n_blocks <= std::numeric_limits::max()); + return std::make_pair(parts_per_block, static_cast(n_blocks)); +} + +dim3 p3m_make_grid(unsigned int n_blocks) { + dim3 grid(n_blocks, 1u, 1u); + while (grid.x > 65536u) { + grid.y++; + if ((n_blocks % grid.y) == 0u) + grid.x = std::max(1u, n_blocks / grid.y); + else + grid.x = n_blocks / grid.y + 1u; + } + return grid; +} + template __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY, int NZ, REAL_TYPE *Zaehler, @@ -160,15 +185,15 @@ __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY, for (MX = -P3M_BRILLOUIN; MX <= P3M_BRILLOUIN; MX++) { NMX = static_cast(((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX) + p.mesh[0] * MX); - S1 = int_pow<2 * cao>(Utils::sinc(Meshi[0] * NMX)); + S1 = int_pow<2 * cao>(math::sinc(Meshi[0] * NMX)); for (MY = -P3M_BRILLOUIN; MY <= P3M_BRILLOUIN; MY++) { NMY = static_cast( ((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY) + p.mesh[1] * MY); - S2 = S1 * int_pow<2 * cao>(Utils::sinc(Meshi[1] * NMY)); + S2 = S1 * int_pow<2 * cao>(math::sinc(Meshi[1] * NMY)); for (MZ = -P3M_BRILLOUIN; MZ <= P3M_BRILLOUIN; MZ++) { NMZ = static_cast( ((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ) + p.mesh[2] * MZ); - S3 = S2 * int_pow<2 * cao>(Utils::sinc(Meshi[2] * NMZ)); + S3 = S2 * int_pow<2 * cao>(math::sinc(Meshi[2] * NMZ)); NM2 = sqr(NMX * Leni[0]) + sqr(NMY * Leni[1]) + sqr(NMZ * Leni[2]); *Nenner += S3; @@ -366,26 +391,9 @@ void assign_charges(P3MGpuData const ¶ms, float const *const __restrict__ part_pos, float const *const __restrict__ part_q) { auto const cao = static_cast(params.cao); - auto const cao3 = int_pow<3>(cao); - unsigned int parts_per_block = 1u, n_blocks = 1u; - - while ((parts_per_block + 1u) * cao3 <= 1024u) { - parts_per_block++; - } - if ((params.n_part % parts_per_block) == 0u) - n_blocks = std::max(1u, params.n_part / parts_per_block); - else - n_blocks = params.n_part / parts_per_block + 1u; - + auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part); dim3 block(parts_per_block * cao, cao, cao); - dim3 grid(n_blocks, 1u, 1u); - while (grid.x > 65536u) { - grid.y++; - if ((n_blocks % grid.y) == 0u) - grid.x = std::max(1u, n_blocks / grid.y); - else - grid.x = n_blocks / grid.y + 1u; - } + dim3 grid = p3m_make_grid(n_blocks); auto const data_length = std::size_t(3u) * static_cast(parts_per_block) * @@ -438,7 +446,7 @@ __global__ void assign_forces_kernel(P3MGpuData const params, /* id of the particle */ auto const id = parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block; - if (id >= params.n_part) + if (id >= static_cast(params.n_part)) return; /* position relative to the closest grid point */ REAL_TYPE m_pos[3]; @@ -500,27 +508,9 @@ void assign_forces(P3MGpuData const ¶ms, float *const __restrict__ part_f, REAL_TYPE const prefactor) { auto const cao = static_cast(params.cao); - auto const cao3 = int_pow<3>(cao); - unsigned int parts_per_block = 1u, n_blocks = 1u; - - while ((parts_per_block + 1u) * cao3 <= 1024u) { - parts_per_block++; - } - - if ((params.n_part % parts_per_block) == 0u) - n_blocks = std::max(1u, params.n_part / parts_per_block); - else - n_blocks = params.n_part / parts_per_block + 1u; - + auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part); dim3 block(parts_per_block * cao, cao, cao); - dim3 grid(n_blocks, 1u, 1u); - while (grid.x > 65536u) { - grid.y++; - if (n_blocks % grid.y == 0u) - grid.x = std::max(1u, n_blocks / grid.y); - else - grid.x = n_blocks / grid.y + 1u; - } + dim3 grid = p3m_make_grid(n_blocks); /* Switch for assignment templates, the shared version only is faster for cao * > 2 */ @@ -581,7 +571,8 @@ void p3m_gpu_init(std::shared_ptr &data, int cao, auto &p3m_gpu_data = data->p3m_gpu_data; bool do_reinit = false, mesh_changed = false; - p3m_gpu_data.n_part = n_part; + assert(n_part <= std::numeric_limits::max()); + p3m_gpu_data.n_part = static_cast(n_part); if (not data->is_initialized or p3m_gpu_data.alpha != alpha) { p3m_gpu_data.alpha = static_cast(alpha); @@ -694,9 +685,9 @@ void p3m_gpu_init(std::shared_ptr &data, int cao, void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu, double prefactor, std::size_t n_part) { auto &p3m_gpu_data = data.p3m_gpu_data; - p3m_gpu_data.n_part = n_part; + p3m_gpu_data.n_part = static_cast(n_part); - if (p3m_gpu_data.n_part == 0u) + if (n_part == 0u) return; auto const positions_device = gpu.get_particle_positions_device(); @@ -722,8 +713,7 @@ void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu, if (FFT_FORW_FFT(data.p3m_fft.forw_plan, (REAL_TYPE *)p3m_gpu_data.charge_mesh, p3m_gpu_data.charge_mesh) != CUFFT_SUCCESS) { - fprintf(stderr, "CUFFT error: Forward FFT failed\n"); - return; + throw std::runtime_error("CUFFT error: Forward FFT failed"); } /* Do convolution */ diff --git a/src/core/electrostatics/p3m_gpu_error_cuda.cu b/src/core/electrostatics/p3m_gpu_error_cuda.cu index 6e54e2dfe7e..fbc51fecf4a 100644 --- a/src/core/electrostatics/p3m_gpu_error_cuda.cu +++ b/src/core/electrostatics/p3m_gpu_error_cuda.cu @@ -23,12 +23,12 @@ * The corresponding header file is p3m_gpu_error.hpp. */ +#include "p3m/math.hpp" #include "p3m_gpu_error.hpp" #include "cuda/utils.cuh" #include -#include #include #include @@ -44,49 +44,12 @@ #error CU-file includes mpi.h! This should not happen! #endif -using Utils::int_pow; -using Utils::sqr; - -/** @todo Extend to higher order. This comes from some 1/sin expansion in - * @cite hockney88a - */ - -template -__device__ static double p3m_analytic_cotangent_sum(int n, double mesh_i) { - const double c = sqr(cos(std::numbers::pi * mesh_i * n)); - - switch (cao) { - case 1: - return 1; - case 2: - return (1.0 + c * 2.0) / 3.0; - case 3: - return (2.0 + c * (11.0 + c * 2.0)) / 15.0; - case 4: - return (17.0 + c * (180.0 + c * (114.0 + c * 4.0))) / 315.0; - case 5: - return (62.0 + c * (1072.0 + c * (1452.0 + c * (247.0 + c * 2.0)))) / - 2835.0; - case 6: - return (1382.0 + - c * (35396.0 + - c * (83021.0 + c * (34096.0 + c * (2026.0 + c * 4.0))))) / - 155925.0; - case 7: - return (21844.0 + - c * (776661.0 + - c * (2801040.0 + - c * (2123860.0 + - c * (349500.0 + c * (8166.0 + c * 4.0)))))) / - 6081075.0; - } - - return 0.0; -} - template __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi, double alpha_L, double *he_q) { + using Utils::int_pow; + using Utils::sqr; + const int nx = -mesh.x / 2 + static_cast(blockDim.x * blockIdx.x + threadIdx.x); const int ny = @@ -103,14 +66,14 @@ __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi, if ((nx != 0) || (ny != 0) || (nz != 0)) { const double alpha_L_i = 1. / alpha_L; const double n2 = sqr(nx) + sqr(ny) + sqr(nz); - const double cs = p3m_analytic_cotangent_sum(nz, meshi.z) * - p3m_analytic_cotangent_sum(nx, meshi.x) * - p3m_analytic_cotangent_sum(ny, meshi.y); + const double cs = math::analytic_cotangent_sum(nz, meshi.z) * + math::analytic_cotangent_sum(nx, meshi.x) * + math::analytic_cotangent_sum(ny, meshi.y); const double ex = exp(-sqr(std::numbers::pi * alpha_L_i) * n2); const double ex2 = sqr(ex); const double U2 = - int_pow<2 * cao>(Utils::sinc(meshi.x * nx) * Utils::sinc(meshi.y * ny) * - Utils::sinc(meshi.z * nz)); + int_pow<2 * cao>(math::sinc(meshi.x * nx) * math::sinc(meshi.y * ny) * + math::sinc(meshi.z * nz)); auto const alias1 = ex2 / n2; auto const d = alias1 - sqr(U2 * ex / cs) / n2; diff --git a/src/core/magnetostatics/dp3m.cpp b/src/core/magnetostatics/dp3m.cpp index 4fcad22318e..4214900fb52 100644 --- a/src/core/magnetostatics/dp3m.cpp +++ b/src/core/magnetostatics/dp3m.cpp @@ -37,6 +37,7 @@ #include "p3m/fft.hpp" #include "p3m/influence_function_dipolar.hpp" #include "p3m/interpolation.hpp" +#include "p3m/math.hpp" #include "p3m/send_mesh.hpp" #include "BoxGeometry.hpp" @@ -56,7 +57,6 @@ #include #include #include -#include #include #include @@ -67,10 +67,13 @@ #include #include #include +#include #include #include +#include #include #include +#include #include void DipolarP3M::count_magnetic_particles() { @@ -272,15 +275,13 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, dipole_assign(particles); /* Gather information for FFT grid inside the nodes domain (inner local * mesh) and perform forward 3D FFT (Charge Assignment Mesh). */ - std::array meshes = {{dp3m.rs_mesh_dip[0].data(), - dp3m.rs_mesh_dip[1].data(), - dp3m.rs_mesh_dip[2].data()}}; - + std::array meshes = {{dp3m.rs_mesh_dip[0u].data(), + dp3m.rs_mesh_dip[1u].data(), + dp3m.rs_mesh_dip[2u].data()}}; dp3m.sm.gather_grid(meshes, comm_cart, dp3m.local_mesh.dim); - - fft_perform_forw(dp3m.rs_mesh_dip[0].data(), dp3m.fft, comm_cart); - fft_perform_forw(dp3m.rs_mesh_dip[1].data(), dp3m.fft, comm_cart); - fft_perform_forw(dp3m.rs_mesh_dip[2].data(), dp3m.fft, comm_cart); + for (auto i = 0u; i < 3u; ++i) { + fft_perform_forw(dp3m.rs_mesh_dip[i].data(), dp3m.fft, comm_cart); + } // Note: after these calls, the grids are in the order yzx and not xyz // anymore!!! } @@ -293,34 +294,32 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, if (dp3m.sum_mu2 > 0.) { /* i*k differentiation for dipolar gradients: * |(\Fourier{\vect{mu}}(k)\cdot \vect{k})|^2 */ - int ind = 0; - int i = 0; - int j[3]; - double node_energy = 0.0; - for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { - for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; j[1]++) { - for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; j[2]++) { - node_energy += - dp3m.g_energy[i] * - (Utils::sqr( - dp3m.rs_mesh_dip[0][ind] * - dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] + - dp3m.rs_mesh_dip[1][ind] * - dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] + - dp3m.rs_mesh_dip[2][ind] * - dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]) + - Utils::sqr( - dp3m.rs_mesh_dip[0][ind + 1] * - dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] + - dp3m.rs_mesh_dip[1][ind + 1] * - dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] + - dp3m.rs_mesh_dip[2][ind + 1] * - dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]])); - ind += 2; - i++; - } - } - } + + auto constexpr m_start = Utils::Vector3i::broadcast(0); + auto const m_stop = std::span(dp3m.fft.plan[3].new_mesh); + auto const offset = std::span(dp3m.fft.plan[3].start); + auto const &d_op = dp3m.d_op[0u]; + auto const &mesh_dip = dp3m.rs_mesh_dip; + auto indices = Utils::Vector3i{}; + auto index = std::size_t(0u); + auto it_energy = dp3m.g_energy.begin(); + auto node_energy = 0.; + for_each_3d(m_start, m_stop, indices, [&]() { + using namespace detail::FFT_indexing; + // Re(mu)*k + auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] + + mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] + + mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]]; + ++index; + // Im(mu)*k + auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] + + mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] + + mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]]; + ++index; + node_energy += *it_energy * (Utils::sqr(re) + Utils::sqr(im)); + std::advance(it_energy, 1); + }); + node_energy *= dipole_prefac * std::numbers::pi * box_geo.length_inv()[0]; boost::mpi::reduce(comm_cart, node_energy, energy, std::plus<>(), 0); @@ -344,62 +343,45 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, * DIPOLAR TORQUES (k-space) ****************************/ if (dp3m.sum_mu2 > 0.) { - auto const two_pi_L_i = 2. * std::numbers::pi * box_geo.length_inv()[0]; + auto const wavenumber = 2. * std::numbers::pi * box_geo.length_inv()[0u]; + auto constexpr m_start = Utils::Vector3i::broadcast(0); + auto const m_stop = std::span(dp3m.fft.plan[3].new_mesh); + auto const offset = std::span(dp3m.fft.plan[3].start); + auto const &d_op = dp3m.d_op[0u]; + auto const &mesh_dip = dp3m.rs_mesh_dip; + auto indices = Utils::Vector3i{}; + auto index = std::size_t(0u); + /* fill in ks_mesh array for torque calculation */ - int ind = 0; - int i = 0; - int j[3]; - double tmp0, tmp1; - - for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { // j[0]=n_y - for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; - j[1]++) { // j[1]=n_z - for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; - j[2]++) { // j[2]=n_x - // tmp0 = Re(mu)*k, tmp1 = Im(mu)*k - - tmp0 = dp3m.rs_mesh_dip[0][ind] * - dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] + - dp3m.rs_mesh_dip[1][ind] * - dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] + - dp3m.rs_mesh_dip[2][ind] * - dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]; - - tmp1 = dp3m.rs_mesh_dip[0][ind + 1] * - dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] + - dp3m.rs_mesh_dip[1][ind + 1] * - dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] + - dp3m.rs_mesh_dip[2][ind + 1] * - dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]; - - /* the optimal influence function is the same for torques - and energy */ - dp3m.ks_mesh[ind] = tmp0 * dp3m.g_energy[i]; - dp3m.ks_mesh[ind + 1] = tmp1 * dp3m.g_energy[i]; - ind += 2; - i++; - } - } - } + auto it_energy = dp3m.g_energy.begin(); + index = 0u; + for_each_3d(m_start, m_stop, indices, [&]() { + using namespace detail::FFT_indexing; + // Re(mu)*k + auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] + + mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] + + mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]]; + dp3m.ks_mesh[index] = *it_energy * re; + ++index; + // Im(mu)*k + auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] + + mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] + + mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]]; + dp3m.ks_mesh[index] = *it_energy * im; + ++index; + std::advance(it_energy, 1); + }); /* Force component loop */ for (int d = 0; d < 3; d++) { - auto const d_rs = (d + dp3m.ks_pnum) % 3; - ind = 0; - for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { - for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; j[1]++) { - for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; j[2]++) { - dp3m.rs_mesh[ind] = - dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] * - dp3m.ks_mesh[ind]; - ind++; - dp3m.rs_mesh[ind] = - dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] * - dp3m.ks_mesh[ind]; - ind++; - } - } - } + index = 0u; + for_each_3d(m_start, m_stop, indices, [&]() { + auto const pos = indices[d] + offset[d]; + dp3m.rs_mesh[index] = d_op[pos] * dp3m.ks_mesh[index]; + ++index; + dp3m.rs_mesh[index] = d_op[pos] * dp3m.ks_mesh[index]; + ++index; + }); /* Back FFT force component mesh */ fft_perform_back(dp3m.rs_mesh.data(), false, dp3m.fft, comm_cart); @@ -407,94 +389,68 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, dp3m.sm.spread_grid(dp3m.rs_mesh.data(), comm_cart, dp3m.local_mesh.dim); /* Assign force component from mesh to particle */ + auto const d_rs = (d + dp3m.ks_pnum) % 3; Utils::integral_parameter( - dp3m.params.cao, dp3m, dipole_prefac * two_pi_L_i, d_rs, particles); + dp3m.params.cao, dp3m, dipole_prefac * wavenumber, d_rs, particles); } /*************************** DIPOLAR FORCES (k-space) ****************************/ - // Compute forces after torques because the algorithm below overwrites the // grids dp3m.rs_mesh_dip ! // Note: I'll do here 9 inverse FFTs. By symmetry, we can reduce this // number to 6 ! /* fill in ks_mesh array for force calculation */ - ind = 0; - i = 0; - for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { // j[0]=n_y - for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; - j[1]++) { // j[1]=n_z - for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; - j[2]++) { // j[2]=n_x - // tmp0 = Im(mu)*k, tmp1 = -Re(mu)*k - tmp0 = dp3m.rs_mesh_dip[0][ind + 1] * - dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] + - dp3m.rs_mesh_dip[1][ind + 1] * - dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] + - dp3m.rs_mesh_dip[2][ind + 1] * - dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]; - tmp1 = dp3m.rs_mesh_dip[0][ind] * - dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] + - dp3m.rs_mesh_dip[1][ind] * - dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] + - dp3m.rs_mesh_dip[2][ind] * - dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]; - dp3m.ks_mesh[ind] = tmp0 * dp3m.g_force[i]; - dp3m.ks_mesh[ind + 1] = -tmp1 * dp3m.g_force[i]; - ind += 2; - i++; - } - } - } + auto it_force = dp3m.g_force.begin(); + index = 0u; + for_each_3d(m_start, m_stop, indices, [&]() { + using namespace detail::FFT_indexing; + // Re(mu)*k + auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] + + mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] + + mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]]; + ++index; + // Im(mu)*k + auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] + + mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] + + mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]]; + ++index; + dp3m.ks_mesh[index - 2] = *it_force * im; + dp3m.ks_mesh[index - 1] = *it_force * (-re); + std::advance(it_force, 1); + }); /* Force component loop */ - for (int d = 0; d < 3; d++) { /* direction in k-space: */ - auto const d_rs = (d + dp3m.ks_pnum) % 3; - ind = 0; - for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; - j[0]++) { // j[0]=n_y - for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; - j[1]++) { // j[1]=n_z - for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; - j[2]++) { // j[2]=n_x - tmp0 = dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] * - dp3m.ks_mesh[ind]; - dp3m.rs_mesh_dip[0][ind] = - dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] * tmp0; - dp3m.rs_mesh_dip[1][ind] = - dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] * tmp0; - dp3m.rs_mesh_dip[2][ind] = - dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]] * tmp0; - ind++; - tmp0 = dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] * - dp3m.ks_mesh[ind]; - dp3m.rs_mesh_dip[0][ind] = - dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] * tmp0; - dp3m.rs_mesh_dip[1][ind] = - dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] * tmp0; - dp3m.rs_mesh_dip[2][ind] = - dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]] * tmp0; - ind++; - } - } - } + for (int d = 0; d < 3; d++) { + auto &rs_mesh_dip = dp3m.rs_mesh_dip; + index = 0u; + for_each_3d(m_start, m_stop, indices, [&]() { + using namespace detail::FFT_indexing; + auto const f1 = d_op[indices[d] + offset[d]] * dp3m.ks_mesh[index]; + rs_mesh_dip[0][index] = d_op[indices[KX] + offset[KX]] * f1; + rs_mesh_dip[1][index] = d_op[indices[KY] + offset[KY]] * f1; + rs_mesh_dip[2][index] = d_op[indices[KZ] + offset[KZ]] * f1; + ++index; + auto const f2 = d_op[indices[d] + offset[d]] * dp3m.ks_mesh[index]; + rs_mesh_dip[0][index] = d_op[indices[KX] + offset[KX]] * f2; + rs_mesh_dip[1][index] = d_op[indices[KY] + offset[KY]] * f2; + rs_mesh_dip[2][index] = d_op[indices[KZ] + offset[KZ]] * f2; + ++index; + }); /* Back FFT force component mesh */ - fft_perform_back(dp3m.rs_mesh_dip[0].data(), false, dp3m.fft, - comm_cart); - fft_perform_back(dp3m.rs_mesh_dip[1].data(), false, dp3m.fft, - comm_cart); - fft_perform_back(dp3m.rs_mesh_dip[2].data(), false, dp3m.fft, - comm_cart); + for (auto i = 0u; i < 3u; ++i) { + fft_perform_back(rs_mesh_dip[i].data(), false, dp3m.fft, comm_cart); + } /* redistribute force component mesh */ - std::array meshes = {{dp3m.rs_mesh_dip[0].data(), - dp3m.rs_mesh_dip[1].data(), - dp3m.rs_mesh_dip[2].data()}}; - + std::array meshes = {{rs_mesh_dip[0].data(), + rs_mesh_dip[1].data(), + rs_mesh_dip[2].data()}}; dp3m.sm.spread_grid(meshes, comm_cart, dp3m.local_mesh.dim); /* Assign force component from mesh to particle */ + auto const d_rs = (d + dp3m.ks_pnum) % 3; Utils::integral_parameter( - dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(two_pi_L_i), d_rs, + dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(wavenumber), d_rs, particles); } } /* if (dp3m.sum_mu2 > 0) */ @@ -590,15 +546,15 @@ void DipolarP3M::calc_influence_function_force() { auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh}; dp3m.g_force = grid_influence_function<3>(dp3m.params, start, start + size, - get_system().box_geo->length()); + get_system().box_geo->length_inv()); } void DipolarP3M::calc_influence_function_energy() { auto const start = Utils::Vector3i{dp3m.fft.plan[3].start}; auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh}; - dp3m.g_energy = grid_influence_function<2>(dp3m.params, start, start + size, - get_system().box_geo->length()); + dp3m.g_energy = grid_influence_function<2>( + dp3m.params, start, start + size, get_system().box_geo->length_inv()); } class DipolarTuningAlgorithm : public TuningAlgorithm { @@ -749,62 +705,68 @@ void DipolarP3M::tune() { } /** Tuning dipolar-P3M */ -static auto dp3m_tune_aliasing_sums(int nx, int ny, int nz, int mesh, +static auto dp3m_tune_aliasing_sums(Utils::Vector3i const &shift, int mesh, double mesh_i, int cao, double alpha_L_i) { - using Utils::sinc; + auto constexpr m_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN); + auto constexpr m_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1); auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i); - auto alias1 = 0.; auto alias2 = 0.; - for (int mx = -P3M_BRILLOUIN; mx <= P3M_BRILLOUIN; mx++) { - auto const nmx = nx + mx * mesh; - auto const fnmx = mesh_i * nmx; - for (int my = -P3M_BRILLOUIN; my <= P3M_BRILLOUIN; my++) { - auto const nmy = ny + my * mesh; - auto const fnmy = mesh_i * nmy; - for (int mz = -P3M_BRILLOUIN; mz <= P3M_BRILLOUIN; mz++) { - auto const nmz = nz + mz * mesh; - auto const fnmz = mesh_i * nmz; - - auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz); - auto const ex = std::exp(-factor1 * nm2); - - auto const U2 = pow(sinc(fnmx) * sinc(fnmy) * sinc(fnmz), 2. * cao); - - alias1 += Utils::sqr(ex) * nm2; - alias2 += U2 * ex * pow((nx * nmx + ny * nmy + nz * nmz), 3.) / nm2; - } - } - } + + Utils::Vector3i indices{}; + Utils::Vector3i nm{}; + Utils::Vector3d fnm{}; + for_each_3d( + m_start, m_stop, indices, + [&]() { + auto const norm_sq = nm.norm2(); + auto const ex = std::exp(-factor1 * norm_sq); + auto const U2 = std::pow(Utils::product(fnm), 2 * cao); + alias1 += Utils::sqr(ex) * norm_sq; + alias2 += U2 * ex * std::pow(shift * nm, 3) / norm_sq; + }, + [&](unsigned dim, int n) { + nm[dim] = shift[dim] + n * mesh; + fnm[dim] = math::sinc(nm[dim] * mesh_i); + }); + return std::make_pair(alias1, alias2); } /** Calculate the k-space error of dipolar-P3M */ static double dp3m_k_space_error(double box_size, int mesh, int cao, int n_c_part, double sum_q2, double alpha_L) { - double he_q = 0.; - auto const mesh_i = 1. / mesh; - auto const alpha_L_i = 1. / alpha_L; - for (int nx = -mesh / 2; nx < mesh / 2; nx++) - for (int ny = -mesh / 2; ny < mesh / 2; ny++) - for (int nz = -mesh / 2; nz < mesh / 2; nz++) - if ((nx != 0) || (ny != 0) || (nz != 0)) { - auto const n2 = Utils::sqr(nx) + Utils::sqr(ny) + Utils::sqr(nz); - auto const cs = p3m_analytic_cotangent_sum(nx, mesh_i, cao) * - p3m_analytic_cotangent_sum(ny, mesh_i, cao) * - p3m_analytic_cotangent_sum(nz, mesh_i, cao); + auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao); + auto const mesh_i = 1. / static_cast(mesh); + auto const alpha_L_i = 1. / alpha_L; + auto const m_stop = Utils::Vector3i::broadcast(mesh / 2); + auto const m_start = -m_stop; + auto indices = Utils::Vector3i{}; + auto values = Utils::Vector3d{}; + auto he_q = 0.; + + for_each_3d( + m_start, m_stop, indices, + [&]() { + if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) { + auto const n2 = indices.norm2(); + auto const cs = Utils::product(values); auto const [alias1, alias2] = - dp3m_tune_aliasing_sums(nx, ny, nz, mesh, mesh_i, cao, alpha_L_i); + dp3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i); auto const d = alias1 - Utils::sqr(alias2 / cs) / Utils::int_pow<3>(static_cast(n2)); /* at high precision, d can become negative due to extinction; also, don't take values that have no significant digits left*/ - if (d > 0 && (fabs(d / alias1) > ROUND_ERROR_PREC)) + if (d > 0. and std::fabs(d / alias1) > ROUND_ERROR_PREC) he_q += d; } + }, + [&values, &mesh_i, cotangent_sum](unsigned dim, int n) { + values[dim] = cotangent_sum(n, mesh_i); + }); return 8. * Utils::sqr(std::numbers::pi) / 3. * sum_q2 * sqrt(he_q / n_c_part) / Utils::int_pow<4>(box_size); @@ -885,7 +847,7 @@ void DipolarP3M::sanity_checks_boxl() const { auto const &system = get_system(); auto const &box_geo = *system.box_geo; auto const &local_geo = *system.local_geo; - for (unsigned int i = 0u; i < 3u; i++) { + for (auto i = 0u; i < 3u; i++) { /* check k-space cutoff */ if (dp3m.params.cao_cut[i] >= box_geo.length_half()[i]) { std::stringstream msg; diff --git a/src/core/p3m/common.cpp b/src/core/p3m/common.cpp index bc87e99a6a8..fb8023eba74 100644 --- a/src/core/p3m/common.cpp +++ b/src/core/p3m/common.cpp @@ -28,52 +28,8 @@ #include "LocalBox.hpp" #include -#include #include -#include -#include - -double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao) { - auto const c = - Utils::sqr(std::cos(std::numbers::pi * mesh_i * static_cast(n))); - - switch (cao) { - case 1: { - return 1.0; - } - case 2: { - return (1.0 + c * 2.0) / 3.0; - } - case 3: { - return (2.0 + c * (11.0 + c * 2.0)) / 15.0; - } - case 4: { - return (17.0 + c * (180.0 + c * (114.0 + c * 4.0))) / 315.0; - } - case 5: { - return (62.0 + c * (1072.0 + c * (1452.0 + c * (247.0 + c * 2.0)))) / - 2835.0; - } - case 6: { - return (1382.0 + - c * (35396.0 + - c * (83021.0 + c * (34096.0 + c * (2026.0 + c * 4.0))))) / - 155925.0; - } - case 7: { - return (21844.0 + - c * (776661.0 + - c * (2801040.0 + - c * (2123860.0 + - c * (349500.0 + c * (8166.0 + c * 4.0)))))) / - 6081075.0; - } - default: { - throw std::logic_error("Invalid value cao=" + std::to_string(cao)); - } - } -} void P3MLocalMesh::calc_local_ca_mesh(P3MParameters const ¶ms, LocalBox const &local_geo, double skin, diff --git a/src/core/p3m/common.hpp b/src/core/p3m/common.hpp index 38a7553428a..6bc30828ee5 100644 --- a/src/core/p3m/common.hpp +++ b/src/core/p3m/common.hpp @@ -38,6 +38,7 @@ #include +#include #include #include @@ -49,6 +50,7 @@ auto constexpr P3M_EPSILON_METALLIC = 0.0; #include "LocalBox.hpp" #include +#include #include #include @@ -212,7 +214,7 @@ struct P3MLocalMesh { */ void recalc_ld_pos(P3MParameters const ¶ms) { // spatial position of left down mesh point - for (unsigned int i = 0; i < 3; i++) { + for (auto i = 0u; i < 3u; i++) { ld_pos[i] = (ld_ind[i] + params.mesh_off[i]) * params.a[i]; } } @@ -226,16 +228,6 @@ struct P3MLocalMesh { double space_layer); }; -/** One of the aliasing sums used to compute k-space errors. - * Fortunately the one which is most important (because it converges - * most slowly, since it is not damped exponentially) can be - * calculated analytically. The result (which depends on the order of - * the spline interpolation) can be written as an even trigonometric - * polynomial. The results are tabulated here (the employed formula - * is eq. (7.66) in @cite hockney88a). - */ -double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao); - #endif /* P3M || DP3M */ namespace detail { @@ -250,7 +242,7 @@ std::array, 3> inline calc_meshift( Utils::Vector3i const &mesh_size, bool zero_out_midpoint = false) { std::array, 3> ret{}; - for (unsigned int i = 0; i < 3; i++) { + for (auto i = 0u; i < 3u; ++i) { ret[i] = std::vector(static_cast(mesh_size[i])); for (int j = 1; j <= mesh_size[i] / 2; j++) { diff --git a/src/core/p3m/fft.cpp b/src/core/p3m/fft.cpp index 57b56fbaa6c..a14c9a2d00b 100644 --- a/src/core/p3m/fft.cpp +++ b/src/core/p3m/fft.cpp @@ -84,7 +84,7 @@ using Utils::permute_ifield; */ std::optional> find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2, - std::span node_list1, std::span node_list2, + std::span node_list1, std::span node_list2, std::span pos, std::span my_pos, int rank) { int i; /* communication group cell size on grid1 and grid2 */ diff --git a/src/core/p3m/fft.hpp b/src/core/p3m/fft.hpp index 58ea6c7e692..c87f11a14cb 100644 --- a/src/core/p3m/fft.hpp +++ b/src/core/p3m/fft.hpp @@ -33,12 +33,7 @@ * redistributed. * * For simplicity at the moment I have implemented a full complex to - * complex FFT (even though a real to complex FFT would be - * sufficient) - * - * \todo Combine the forward and backward structures. - * \todo The packing routines could be moved to utils.hpp when they are needed - * elsewhere. + * complex FFT (even though a real to complex FFT would be sufficient). * * For more information about FFT usage, see \ref fft.cpp "fft.cpp". */ @@ -88,22 +83,24 @@ template struct fft_allocator { template using fft_vector = std::vector>; -/** Structure for performing a 1D FFT. - * - * This includes the information about the redistribution of the 3D - * FFT *grid before the actual FFT. - */ -struct fft_forw_plan { - /** plan direction: 0 = Forward FFT, 1 = Backward FFT. */ +struct fft_plan { + /** plan direction: forward or backward FFT (enum value from FFTW). */ int dir; + /** plan for the FFT. */ + fftw_plan our_fftw_plan; + /** packing function for send blocks. */ + void (*pack_function)(double const *const, double *const, int const *, + int const *, int const *, int); +}; + +/** @brief Plan for a forward 1D FFT of a flattened 3D array. */ +struct fft_forw_plan : public fft_plan { /** row direction of that FFT. */ int row_dir; /** permutations from normal coordinate system. */ int n_permute; /** number of 1D FFTs. */ int n_ffts; - /** plan for fft. */ - fftw_plan our_fftw_plan; /** size of local mesh before communication. */ int old_mesh[3]; @@ -117,9 +114,6 @@ struct fft_forw_plan { /** group of nodes which have to communicate with each other. */ std::vector group; - /** packing function for send blocks. */ - void (*pack_function)(double const *const, double *const, int const *, - int const *, int const *, int); /** Send block specification. 6 integers for each node: start[3], size[3]. */ std::vector send_block; /** Send block communication sizes. */ @@ -132,17 +126,8 @@ struct fft_forw_plan { int element; }; -/** Additional information for backwards FFT. */ -struct fft_back_plan { - /** plan direction. (e.g. fftw macro) */ - int dir; - /** plan for fft. */ - fftw_plan our_fftw_plan; - - /** packing function for send blocks. */ - void (*pack_function)(double const *const, double *const, int const *, - int const *, int const *, int); -}; +/** @brief Plan for a backward 1D FFT of a flattened 3D array. */ +struct fft_back_plan : public fft_plan {}; /** Information about the three one dimensional FFTs and how the nodes * have to communicate inbetween. diff --git a/src/core/p3m/influence_function.hpp b/src/core/p3m/influence_function.hpp index 9c322c23506..24853d43e25 100644 --- a/src/core/p3m/influence_function.hpp +++ b/src/core/p3m/influence_function.hpp @@ -16,15 +16,16 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -#ifndef ESPRESSO_P3M_INFLUENCE_FUNCTION_HPP -#define ESPRESSO_P3M_INFLUENCE_FUNCTION_HPP + +#pragma once #include "p3m/common.hpp" +#include "p3m/for_each_3d.hpp" +#include "p3m/math.hpp" #include #include #include -#include #include #include @@ -52,44 +53,43 @@ template double G_opt(int cao, double alpha, Utils::Vector3d const &k, Utils::Vector3d const &h) { - using namespace detail::FFT_indexing; - using Utils::int_pow; - using Utils::sinc; - - auto constexpr two_pi = 2. * std::numbers::pi; - auto constexpr two_pi_i = 1. / two_pi; - auto constexpr limit = 30.; auto const k2 = k.norm2(); - if (k2 == 0.0) { - return 0.0; + if (k2 == 0.) { + return 0.; } - double numerator = 0.0; - double denominator = 0.0; - - for (int mx = -m; mx <= m; mx++) { - for (int my = -m; my <= m; my++) { - for (int mz = -m; mz <= m; mz++) { - auto const km = - k + two_pi * Utils::Vector3d{mx / h[RX], my / h[RY], mz / h[RZ]}; - auto const U2 = std::pow(sinc(km[RX] * h[RX] * two_pi_i) * - sinc(km[RY] * h[RY] * two_pi_i) * - sinc(km[RZ] * h[RZ] * two_pi_i), - 2 * cao); - + auto constexpr limit = 30.; + auto constexpr m_start = Utils::Vector3i::broadcast(-m); + auto constexpr m_stop = Utils::Vector3i::broadcast(m + 1); + auto const exponent_prefactor = Utils::sqr(1. / (2. * alpha)); + auto const wavevector = (2. * std::numbers::pi) / h; + auto const wavevector_i = 1. / wavevector; + auto indices = Utils::Vector3i{}; + auto km = Utils::Vector3d{}; + auto fnm = Utils::Vector3d{}; + auto numerator = 0.; + auto denominator = 0.; + + for_each_3d( + m_start, m_stop, indices, + [&]() { + using namespace detail::FFT_indexing; + auto const U2 = std::pow(Utils::product(fnm), 2 * cao); auto const km2 = km.norm2(); - auto const exponent = Utils::sqr(1. / (2. * alpha)) * km2; + auto const exponent = exponent_prefactor * km2; if (exponent < limit) { auto const f3 = std::exp(-exponent) * (4. * std::numbers::pi / km2); - numerator += U2 * f3 * int_pow(k * km); + numerator += U2 * f3 * Utils::int_pow(k * km); } denominator += U2; - } - } - } + }, + [&](unsigned dim, int n) { + km[dim] = k[dim] + n * wavevector[dim]; + fnm[dim] = math::sinc(km[dim] * wavevector_i[dim]); + }); - return numerator / (int_pow(k2) * Utils::sqr(denominator)); + return numerator / (Utils::int_pow(k2) * Utils::sqr(denominator)); } /** @@ -104,20 +104,18 @@ double G_opt(int cao, double alpha, Utils::Vector3d const &k, * * @param params P3M parameters * @param n_start Lower left corner of the grid - * @param n_end Upper right corner of the grid. - * @param box_l Box size + * @param n_stop Upper right corner of the grid. + * @param inv_box_l Inverse box length * @return Values of G_opt at regular grid points. */ template std::vector grid_influence_function(const P3MParameters ¶ms, const Utils::Vector3i &n_start, - const Utils::Vector3i &n_end, - const Utils::Vector3d &box_l) { - using namespace detail::FFT_indexing; + const Utils::Vector3i &n_stop, + const Utils::Vector3d &inv_box_l) { auto const shifts = detail::calc_meshift(params.mesh); - - auto const size = n_end - n_start; + auto const size = n_stop - n_start; /* The influence function grid */ auto g = std::vector(Utils::product(size), 0.); @@ -128,31 +126,24 @@ std::vector grid_influence_function(const P3MParameters ¶ms, return g; } - auto const h = Utils::Vector3d{params.a}; - - Utils::Vector3i n{}; - for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) { - for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) { - for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) { - auto const ind = Utils::get_linear_index(n - n_start, size, - Utils::MemoryOrder::ROW_MAJOR); - if ((n[KX] % (params.mesh[RX] / 2) == 0) && - (n[KY] % (params.mesh[RY] / 2) == 0) && - (n[KZ] % (params.mesh[RZ] / 2) == 0)) { - g[ind] = 0.0; - } else { - auto const k = 2. * std::numbers::pi * - Utils::Vector3d{shifts[RX][n[KX]] / box_l[RX], - shifts[RY][n[KY]] / box_l[RY], - shifts[RZ][n[KZ]] / box_l[RZ]}; - - g[ind] = G_opt(params.cao, params.alpha, k, h); - } - } + auto const wavevector = (2. * std::numbers::pi) * inv_box_l; + auto const half_mesh = params.mesh / 2; + auto indices = Utils::Vector3i{}; + auto index = std::size_t(0u); + + for_each_3d(n_start, n_stop, indices, [&]() { + using namespace detail::FFT_indexing; + if ((indices[KX] % half_mesh[RX] != 0) or + (indices[KY] % half_mesh[RY] != 0) or + (indices[KZ] % half_mesh[RZ] != 0)) { + auto const k = + Utils::Vector3d{{shifts[RX][indices[KX]] * wavevector[RX], + shifts[RY][indices[KY]] * wavevector[RY], + shifts[RZ][indices[KZ]] * wavevector[RZ]}}; + g[index] = G_opt(params.cao, params.alpha, k, params.a); } - } + ++index; + }); return g; } - -#endif diff --git a/src/core/p3m/influence_function_dipolar.hpp b/src/core/p3m/influence_function_dipolar.hpp index e327846d20e..d52a1a6d675 100644 --- a/src/core/p3m/influence_function_dipolar.hpp +++ b/src/core/p3m/influence_function_dipolar.hpp @@ -18,19 +18,19 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -#ifndef ESPRESSO_CORE_P3M_INFLUENCE_FUNCTION_DIPOLAR_HPP -#define ESPRESSO_CORE_P3M_INFLUENCE_FUNCTION_DIPOLAR_HPP + +#pragma once #include "config/config.hpp" #if defined(DP3M) #include "p3m/common.hpp" +#include "p3m/for_each_3d.hpp" +#include "p3m/math.hpp" #include -#include #include -#include #include #include @@ -54,40 +54,41 @@ template double G_opt_dipolar(P3MParameters const ¶ms, Utils::Vector3i const &shift, Utils::Vector3i const &d_op) { - using Utils::int_pow; - using Utils::sinc; + auto constexpr limit = P3M_BRILLOUIN; auto constexpr exp_limit = 30.; - auto const exponent = 2. * params.cao; - + auto constexpr m_start = Utils::Vector3i::broadcast(-limit); + auto constexpr m_stop = Utils::Vector3i::broadcast(limit + 1); + auto const cao = params.cao; + auto const mesh = params.mesh[0]; + auto const offset = + static_cast(shift) / static_cast(mesh); + auto const f2 = Utils::sqr(std::numbers::pi / params.alpha_L); + auto indices = Utils::Vector3i{}; + auto nm = Utils::Vector3i{}; + auto fnm = Utils::Vector3d{}; auto numerator = 0.; auto denominator = 0.; - auto const f1 = 1. / static_cast(params.mesh[0]); - auto const f2 = Utils::sqr(std::numbers::pi / params.alpha_L); - - for (int mx = -limit; mx <= limit; mx++) { - auto const nmx = shift[0] + params.mesh[0] * mx; - auto const sx = std::pow(sinc(f1 * nmx), exponent); - for (int my = -limit; my <= limit; my++) { - auto const nmy = shift[1] + params.mesh[0] * my; - auto const sy = sx * std::pow(sinc(f1 * nmy), exponent); - for (int mz = -limit; mz <= limit; mz++) { - auto const nmz = shift[2] + params.mesh[0] * mz; - auto const sz = sy * std::pow(sinc(f1 * nmz), exponent); - auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz); - auto const exp_term = f2 * nm2; + for_each_3d( + m_start, m_stop, indices, + [&]() { + auto const norm_sq = nm.norm2(); + auto const sz = std::pow(Utils::product(fnm), 2 * cao); + auto const exp_term = f2 * norm_sq; if (exp_term < exp_limit) { - auto const f3 = sz * std::exp(-exp_term) / nm2; - auto const n_nm = d_op[0] * nmx + d_op[1] * nmy + d_op[2] * nmz; - numerator += f3 * int_pow(n_nm); + auto const f3 = sz * std::exp(-exp_term) / norm_sq; + numerator += f3 * Utils::int_pow(d_op * nm); } denominator += sz; - } - } - } - return numerator / (int_pow(static_cast(d_op.norm2())) * - Utils::sqr(denominator)); + }, + [&](unsigned dim, int n) { + nm[dim] = shift[dim] + n * mesh; + fnm[dim] = math::sinc(offset[dim] + n * mesh); + }); + + return numerator / (Utils::int_pow(static_cast(d_op.norm2())) * + Utils::int_pow<2>(denominator)); } /** @@ -100,17 +101,17 @@ double G_opt_dipolar(P3MParameters const ¶ms, Utils::Vector3i const &shift, * * @param params DP3M parameters * @param n_start Lower left corner of the grid - * @param n_end Upper right corner of the grid. - * @param box_l Box size + * @param n_stop Upper right corner of the grid. + * @param inv_box_l Inverse box length * @return Values of the influence function at regular grid points. */ template std::vector grid_influence_function(P3MParameters const ¶ms, Utils::Vector3i const &n_start, - Utils::Vector3i const &n_end, - Utils::Vector3d const &box_l) { + Utils::Vector3i const &n_stop, + Utils::Vector3d const &inv_box_l) { - auto const size = n_end - n_start; + auto const size = n_stop - n_start; /* The influence function grid */ auto g = std::vector(Utils::product(size), 0.); @@ -121,61 +122,56 @@ std::vector grid_influence_function(P3MParameters const ¶ms, return g; } - double fak1 = Utils::int_pow<3>(static_cast(params.mesh[0])) * 2.0 / - Utils::sqr(box_l[0]); - - auto const shifts = detail::calc_meshift(params.mesh, false); - auto const d_ops = detail::calc_meshift(params.mesh, true); - - Utils::Vector3i n{}; - for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) { - for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) { - for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) { - auto const ind = Utils::get_linear_index(n - n_start, size, - Utils::MemoryOrder::ROW_MAJOR); - - if (((n[0] % (params.mesh[0] / 2) == 0) && - (n[1] % (params.mesh[0] / 2) == 0) && - (n[2] % (params.mesh[0] / 2) == 0))) { - g[ind] = 0.0; - } else { - auto const shift = Utils::Vector3i{shifts[0][n[0]], shifts[0][n[1]], - shifts[0][n[2]]}; - auto const d_op = - Utils::Vector3i{d_ops[0][n[0]], d_ops[0][n[1]], d_ops[0][n[2]]}; - auto const fak2 = G_opt_dipolar(params, shift, d_op); - g[ind] = fak1 * fak2; + auto prefactor = Utils::int_pow<3>(static_cast(params.mesh[0])) * 2. * + Utils::int_pow<2>(inv_box_l[0]); + + auto const offset = detail::calc_meshift(params.mesh, false)[0]; + auto const d_op = detail::calc_meshift(params.mesh, true)[0]; + auto const half_mesh = params.mesh[0] / 2; + auto indices = Utils::Vector3i{}; + auto shift_off = Utils::Vector3i{}; + auto d_op_off = Utils::Vector3i{}; + auto index = std::size_t(0u); + + for_each_3d( + n_start, n_stop, indices, + [&]() { + if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or + (indices[2] % half_mesh != 0))) { + g[index] = prefactor * G_opt_dipolar(params, shift_off, d_op_off); } - } - } - } + ++index; + }, + [&](unsigned dim, int n) { + d_op_off[dim] = d_op[n]; + shift_off[dim] = offset[n]; + }); + return g; } inline double G_opt_dipolar_self_energy(P3MParameters const ¶ms, Utils::Vector3i const &shift) { - using Utils::sinc; + auto constexpr limit = P3M_BRILLOUIN + 1; - auto const exponent = 2. * params.cao; - - auto u_sum = 0.0; - - auto const f1 = 1.0 / static_cast(params.mesh[0]); - - for (int mx = -limit; mx <= limit; mx++) { - auto const nmx = shift[0] + params.mesh[0] * mx; - auto const sx = std::pow(sinc(f1 * nmx), exponent); - for (int my = -limit; my <= limit; my++) { - auto const nmy = shift[1] + params.mesh[0] * my; - auto const sy = sx * std::pow(sinc(f1 * nmy), exponent); - for (int mz = -limit; mz <= limit; mz++) { - auto const nmz = shift[2] + params.mesh[0] * mz; - auto const sz = sy * std::pow(sinc(f1 * nmz), exponent); - u_sum += sz; - } - } - } - return u_sum; + auto constexpr m_start = Utils::Vector3i::broadcast(-limit); + auto constexpr m_stop = Utils::Vector3i::broadcast(limit + 1); + auto const cao = params.cao; + auto const mesh = params.mesh[0]; + auto const offset = + static_cast(shift) / static_cast(mesh); + auto indices = Utils::Vector3i{}; + auto fnm = Utils::Vector3d{}; + auto energy = 0.; + + for_each_3d( + m_start, m_stop, indices, + [&]() { energy += std::pow(Utils::product(fnm), 2 * cao); }, + [&](unsigned dim, int n) { + fnm[dim] = math::sinc(offset[dim] + n * mesh); + }); + + return energy; } /** @@ -183,42 +179,39 @@ inline double G_opt_dipolar_self_energy(P3MParameters const ¶ms, * * @param params DP3M parameters * @param n_start Lower left corner of the grid - * @param n_end Upper right corner of the grid. + * @param n_stop Upper right corner of the grid. * @param g Energies on the grid. * @return Total self-energy. */ inline double grid_influence_function_self_energy( P3MParameters const ¶ms, Utils::Vector3i const &n_start, - Utils::Vector3i const &n_end, std::vector const &g) { - auto const size = n_end - n_start; - - auto const shifts = detail::calc_meshift(params.mesh, false); - auto const d_ops = detail::calc_meshift(params.mesh, true); - - double energy = 0.0; - Utils::Vector3i n{}; - for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) { - for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) { - for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) { - if (((n[0] % (params.mesh[0] / 2) == 0) && - (n[1] % (params.mesh[0] / 2) == 0) && - (n[2] % (params.mesh[0] / 2) == 0))) { - energy += 0.0; - } else { - auto const ind = Utils::get_linear_index( - n - n_start, size, Utils::MemoryOrder::ROW_MAJOR); - auto const shift = Utils::Vector3i{shifts[0][n[0]], shifts[0][n[1]], - shifts[0][n[2]]}; - auto const d_op = - Utils::Vector3i{d_ops[0][n[0]], d_ops[0][n[1]], d_ops[0][n[2]]}; - auto const U2 = G_opt_dipolar_self_energy(params, shift); - energy += g[ind] * U2 * d_op.norm2(); + Utils::Vector3i const &n_stop, std::vector const &g) { + + auto const offset = detail::calc_meshift(params.mesh, false)[0]; + auto const d_op = detail::calc_meshift(params.mesh, true)[0]; + auto const half_mesh = params.mesh[0] / 2; + auto indices = Utils::Vector3i{}; + auto shift_off = Utils::Vector3i{}; + auto d_op_off = Utils::Vector3i{}; + auto index = std::size_t(0u); + auto energy = 0.; + + for_each_3d( + n_start, n_stop, indices, + [&]() { + if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or + (indices[2] % half_mesh != 0))) { + auto const U2 = G_opt_dipolar_self_energy(params, shift_off); + energy += g[index] * U2 * d_op_off.norm2(); } - } - } - } + ++index; + }, + [&](unsigned dim, int n) { + d_op_off[dim] = d_op[n]; + shift_off[dim] = offset[n]; + }); + return energy; } -#endif -#endif +#endif // defined(DP3M) diff --git a/src/core/unit_tests/CMakeLists.txt b/src/core/unit_tests/CMakeLists.txt index d843da0a636..60fcadae764 100644 --- a/src/core/unit_tests/CMakeLists.txt +++ b/src/core/unit_tests/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2010-2022 The ESPResSo project +# Copyright (C) 2010-2024 The ESPResSo project # Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 # Max-Planck-Institute for Polymer Research, Theory Group # @@ -94,6 +94,8 @@ if(ESPRESSO_BUILD_WITH_FFTW) espresso::core) unit_test(NAME fft_test SRC fft_test.cpp DEPENDS espresso::utils espresso::core) + unit_test(NAME math_test SRC math_test.cpp DEPENDS espresso::utils + espresso::core) endif() if(ESPRESSO_BUILD_WITH_CUDA) diff --git a/src/core/unit_tests/cuda_test.cu b/src/core/unit_tests/cuda_test.cu index f5057fe7a75..812638cfa96 100644 --- a/src/core/unit_tests/cuda_test.cu +++ b/src/core/unit_tests/cuda_test.cu @@ -17,10 +17,12 @@ * along with this program. If not, see . */ -#define BOOST_TEST_MODULE cuda test +#define BOOST_TEST_MODULE "CUDA interface tests" #define BOOST_TEST_DYN_LINK #include +#include "config/config.hpp" + #include "cuda/init.hpp" #include "cuda/utils.cuh" #include "cuda/utils.hpp" @@ -33,20 +35,22 @@ #include #include +#include #include #include boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) { + bool has_compatible_device = false; int n_devices = 0; cudaGetDeviceCount(&n_devices); if (n_devices > 0) { cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); if (prop.major >= 3) { - return true; + has_compatible_device = true; } } - return false; + return has_compatible_device; } std::optional read_pending_cuda_errors() { @@ -79,44 +83,43 @@ void clear() { } // namespace Testing::non_sticky_cuda_error +#ifdef P3M +dim3 p3m_make_grid(unsigned int n_blocks); +#endif + static auto fixture = boost::unit_test::fixture(&setup, &teardown); BOOST_AUTO_TEST_SUITE(suite, *boost::unit_test::precondition(has_gpu)) BOOST_AUTO_TEST_CASE(gpu_fixture, *fixture) { - { - auto error = read_pending_cuda_errors(); - BOOST_REQUIRE(not error.has_value()); - } - { - // check we can raise and clear non-sticky CUDA errors - Testing::non_sticky_cuda_error::trigger(); - Testing::non_sticky_cuda_error::clear(); - auto error = read_pending_cuda_errors(); - BOOST_REQUIRE(not error.has_value()); - } - { - // check fixture can handle the default non-sticky CUDA error - Testing::non_sticky_cuda_error::trigger(); - auto ref_what{"There is a pending CUDA error: \"invalid device ordinal\""}; - auto error = read_pending_cuda_errors(); - BOOST_REQUIRE(error.has_value()); - BOOST_REQUIRE_EQUAL(error.value(), ref_what); - // sticky error should have been cleared - error = read_pending_cuda_errors(); - BOOST_REQUIRE(not error.has_value()); - } - { - // check fixture can handle a custom non-sticky CUDA error - cudaMallocHost(nullptr, std::size_t(0u)); - auto ref_what{"There is a pending CUDA error: \"invalid argument\""}; - auto error = read_pending_cuda_errors(); - BOOST_REQUIRE(error.has_value()); - BOOST_REQUIRE_EQUAL(error.value(), ref_what); - // sticky error should have been cleared - error = read_pending_cuda_errors(); - BOOST_REQUIRE(not error.has_value()); - } + auto error1 = read_pending_cuda_errors(); + BOOST_REQUIRE(not error1.has_value()); + + // check we can raise and clear non-sticky CUDA errors + Testing::non_sticky_cuda_error::trigger(); + Testing::non_sticky_cuda_error::clear(); + auto error2 = read_pending_cuda_errors(); + BOOST_REQUIRE(not error2.has_value()); + + // check fixture can handle the default non-sticky CUDA error + Testing::non_sticky_cuda_error::trigger(); + auto ref_what3{"There is a pending CUDA error: \"invalid device ordinal\""}; + auto error3 = read_pending_cuda_errors(); + BOOST_REQUIRE(error3.has_value()); + BOOST_REQUIRE_EQUAL(error3.value(), ref_what3); + // sticky error should have been cleared + error3 = read_pending_cuda_errors(); + BOOST_REQUIRE(not error3.has_value()); + + // check fixture can handle a custom non-sticky CUDA error + cudaMallocHost(nullptr, std::size_t(0u)); + auto ref_what4{"There is a pending CUDA error: \"invalid argument\""}; + auto error4 = read_pending_cuda_errors(); + BOOST_REQUIRE(error4.has_value()); + BOOST_REQUIRE_EQUAL(error4.value(), ref_what4); + // sticky error should have been cleared + error4 = read_pending_cuda_errors(); + BOOST_REQUIRE(not error4.has_value()); } static int fatal_error_counter = 0; @@ -125,127 +128,153 @@ static void increment_counter() noexcept { ++fatal_error_counter; } BOOST_AUTO_TEST_CASE(gpu_interface, *fixture) { fatal_error_counter = 0; auto local_error_counter = 0; - { - std::string const what = "message 1"; - try { - throw cuda_fatal_error(what); - } catch (cuda_fatal_error &err) { - BOOST_CHECK_EQUAL(err.what(), what); - BOOST_CHECK_EQUAL(err.get_terminate(), &errexit); - err.set_terminate(nullptr); - BOOST_CHECK_EQUAL(err.get_terminate(), nullptr); - err.set_terminate(increment_counter); - BOOST_CHECK_EQUAL(err.get_terminate(), &increment_counter); - BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); - } - ++local_error_counter; - BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); + + try { + throw cuda_fatal_error("message"); + } catch (cuda_fatal_error &err) { + std::string const what = "message"; + BOOST_CHECK_EQUAL(err.what(), what); + BOOST_CHECK_EQUAL(err.get_terminate(), &errexit); + err.set_terminate(nullptr); + BOOST_CHECK_EQUAL(err.get_terminate(), nullptr); + err.set_terminate(increment_counter); + BOOST_CHECK_EQUAL(err.get_terminate(), &increment_counter); + BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); } - { - auto error_caught = false; - auto const block = dim3{1, 2, 3}; - auto const grid = dim3{4, 5, 6}; - // should not throw - cuda_check_errors_exit(block, grid, "", "", 0u); - try { - Testing::non_sticky_cuda_error::trigger(); - // should clear the CUDA error flag and throw a fatal error - cuda_check_errors_exit(block, grid, "cudaSetDevice()", "filename.cu", 4u); - } catch (cuda_fatal_error &err) { - error_caught = true; - err.set_terminate(increment_counter); - std::string const what = - "CUDA error: \"invalid device ordinal\" while calling " - "cudaSetDevice() with block: <1,2,3>, grid: <4,5,6> in filename.cu:4"; - BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); - BOOST_CHECK_EQUAL(err.what(), what); - BOOST_CHECK_EQUAL(cudaGetLastError(), cudaSuccess); - } - ++local_error_counter; - BOOST_REQUIRE(error_caught); - BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); + ++local_error_counter; + BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); + + // ----------------------- + + auto error_caught = false; + auto const block = dim3{1, 2, 3}; + auto const grid = dim3{4, 5, 6}; + cuda_check_errors_exit(block, grid, "", "", 0u); // should not throw + try { + Testing::non_sticky_cuda_error::trigger(); + // should clear the CUDA error flag and throw a fatal error + cuda_check_errors_exit(block, grid, "cudaSetDevice()", "filename.cu", 4u); + } catch (cuda_fatal_error &err) { + error_caught = true; + err.set_terminate(increment_counter); + std::string const what = + "CUDA error: \"invalid device ordinal\" while calling " + "cudaSetDevice() with block: <1,2,3>, grid: <4,5,6> in filename.cu:4"; + BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); + BOOST_CHECK_EQUAL(err.what(), what); + BOOST_CHECK_EQUAL(cudaGetLastError(), cudaSuccess); } - { - auto error_caught = false; - // should not throw - cuda_safe_mem_exit(cudaSuccess, "", 0u); - try { - Testing::non_sticky_cuda_error::trigger(); - // should throw - cuda_safe_mem_exit(cudaSuccess, "filename.cu", 4u); - } catch (cuda_fatal_error &err) { - error_caught = true; - err.set_terminate(increment_counter); - std::string const what = - "CUDA error: \"invalid device ordinal\" in filename.cu:4. Error " - "found during memory operation. Possibly however from a failed " - "operation before the memory operation"; - BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); - BOOST_CHECK_EQUAL(err.what(), what); - } - ++local_error_counter; - BOOST_REQUIRE(error_caught); - BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); + ++local_error_counter; + BOOST_REQUIRE(error_caught); + BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); + + // ----------------------- + + error_caught = false; + cuda_safe_mem_exit(cudaSuccess, "", 0u); // should not throw + try { + Testing::non_sticky_cuda_error::trigger(); + cuda_safe_mem_exit(cudaSuccess, "filename.cu", 4u); // should throw + } catch (cuda_fatal_error &err) { + error_caught = true; + err.set_terminate(increment_counter); + std::string const what = + "CUDA error: \"invalid device ordinal\" in filename.cu:4. Error " + "found during memory operation. Possibly however from a failed " + "operation before the memory operation"; + BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); + BOOST_CHECK_EQUAL(err.what(), what); } - { - auto error_caught = false; - try { - cuda_safe_mem_exit(cudaErrorNotPermitted, "filename.cu", 4u); - } catch (cuda_fatal_error &err) { - error_caught = true; - err.set_terminate(increment_counter); - std::string const what = "CUDA error: \"operation not permitted\" during " - "memory operation in filename.cu:4"; - BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); - BOOST_CHECK_EQUAL(err.what(), what); - } - ++local_error_counter; - BOOST_REQUIRE(error_caught); - BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); + ++local_error_counter; + BOOST_REQUIRE(error_caught); + BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); + + // ----------------------- + + error_caught = false; + try { + cuda_safe_mem_exit(cudaErrorNotPermitted, "filename.cu", 4u); + } catch (cuda_fatal_error &err) { + error_caught = true; + err.set_terminate(increment_counter); + std::string const what = "CUDA error: \"operation not permitted\" during " + "memory operation in filename.cu:4"; + BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); + BOOST_CHECK_EQUAL(err.what(), what); } - { - auto error_caught = false; - try { - cuda_safe_mem_exit(cudaErrorInvalidValue, "function_name()", 4u); - } catch (cuda_fatal_error &err) { - error_caught = true; - err.set_terminate(increment_counter); - std::string const what = - "CUDA error: \"invalid argument\" during memory operation in " - "function_name():4. You may have tried to allocate zero memory"; - BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); - BOOST_CHECK_EQUAL(err.what(), what); - } - ++local_error_counter; - BOOST_REQUIRE(error_caught); - BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); + ++local_error_counter; + BOOST_REQUIRE(error_caught); + BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); + + // ----------------------- + + error_caught = false; + try { + cuda_safe_mem_exit(cudaErrorInvalidValue, "function_name()", 4u); + } catch (cuda_fatal_error &err) { + error_caught = true; + err.set_terminate(increment_counter); + std::string const what = + "CUDA error: \"invalid argument\" during memory operation in " + "function_name():4. You may have tried to allocate zero memory"; + BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); + BOOST_CHECK_EQUAL(err.what(), what); } - { - BOOST_REQUIRE_EQUAL(stream[0], nullptr); - auto error_caught = false; - cuda_init(); // allocate - BOOST_REQUIRE_NE(stream[0], nullptr); - cuda_set_device(0); // reallocate, may or may not result in the same pointer - BOOST_REQUIRE_NE(stream[0], nullptr); - auto const old_stream = stream[0]; - try { - cuda_set_device(-1); // fail to reallocate, pointer remains the same - } catch (cuda_runtime_error_cuda const &err) { - error_caught = true; - std::string const what = "CUDA error: invalid device ordinal"; - BOOST_CHECK_EQUAL(err.what(), what); - } - BOOST_REQUIRE(error_caught); - BOOST_REQUIRE_EQUAL(stream[0], old_stream); + ++local_error_counter; + BOOST_REQUIRE(error_caught); + BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); + + // ----------------------- + + error_caught = false; + BOOST_REQUIRE_EQUAL(stream[0], nullptr); + cuda_init(); // allocate + BOOST_REQUIRE_NE(stream[0], nullptr); + cuda_set_device(0); // reallocate, may or may not result in the same pointer + BOOST_REQUIRE_NE(stream[0], nullptr); + auto const old_stream = stream[0]; + try { + cuda_set_device(-1); // fail to reallocate, pointer remains the same + } catch (cuda_runtime_error_cuda const &err) { + error_caught = true; + std::string const what = "CUDA error: invalid device ordinal"; + BOOST_CHECK_EQUAL(err.what(), what); } - { - BOOST_REQUIRE_GE(cuda_get_n_gpus(), 1); - char gpu_name_buffer[260] = {'\0'}; - cuda_get_gpu_name(0, gpu_name_buffer); - for (int i = 255; i < 260; ++i) { - BOOST_REQUIRE_EQUAL(gpu_name_buffer[i], '\0'); - } + BOOST_REQUIRE(error_caught); + BOOST_REQUIRE_EQUAL(stream[0], old_stream); + + // ----------------------- + + BOOST_REQUIRE_GE(cuda_get_n_gpus(), 1); + char gpu_name_buffer[260] = {'\0'}; + cuda_get_gpu_name(0, gpu_name_buffer); + for (int i = 255; i < 260; ++i) { + BOOST_REQUIRE_EQUAL(gpu_name_buffer[i], '\0'); + } +} + +#ifdef P3M + +BOOST_AUTO_TEST_CASE(p3m_reshape_grid_test, *fixture) { + auto constexpr optimal_size = 65536u; + + for (auto cao = 1u; cao <= 3u; ++cao) { + auto const n_blocks = cao * optimal_size; + auto const grid = p3m_make_grid(n_blocks); + BOOST_CHECK_EQUAL(grid.x, optimal_size); + BOOST_CHECK_EQUAL(grid.y, cao); + BOOST_CHECK_EQUAL(grid.z, 1u); + } + + for (auto mul : {2u, 3u, 6u, 12u}) { + auto const n_blocks = mul * optimal_size + 1u; + auto const grid = p3m_make_grid(n_blocks); + BOOST_CHECK_EQUAL(grid.x, n_blocks / (mul + 1u) + ((mul == 2u) ? 0u : 1u)); + BOOST_CHECK_EQUAL(grid.y, (mul + 1u)); + BOOST_CHECK_EQUAL(grid.z, 1u); } } +#endif + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/core/unit_tests/fft_test.cpp b/src/core/unit_tests/fft_test.cpp index 1506e01f20d..bb27335279b 100644 --- a/src/core/unit_tests/fft_test.cpp +++ b/src/core/unit_tests/fft_test.cpp @@ -17,11 +17,17 @@ * along with this program. If not, see . */ -#define BOOST_TEST_MODULE fft test +#define BOOST_TEST_MODULE "FFT utility functions" #define BOOST_TEST_DYN_LINK + +#include "config/config.hpp" + +#if defined(P3M) || defined(DP3M) + #include #include "p3m/fft.hpp" +#include "p3m/for_each_3d.hpp" #include @@ -33,10 +39,9 @@ #include #include -#if defined(P3M) || defined(DP3M) std::optional> find_comm_groups(Utils::Vector3i const &, Utils::Vector3i const &, - std::span, + std::span, std::span, std::span, std::span, int); @@ -63,11 +68,50 @@ BOOST_AUTO_TEST_CASE(fft_find_comm_groups_mismatch) { } BOOST_AUTO_TEST_CASE(fft_exceptions) { + auto constexpr size_max = std::numeric_limits::max(); + auto constexpr bad_size = size_max / sizeof(int) + 1ul; fft_allocator allocator{}; BOOST_CHECK_EQUAL(allocator.allocate(0ul), nullptr); - BOOST_CHECK_THROW( - allocator.allocate(std::numeric_limits::max() / sizeof(int) + - 1ul), - std::bad_array_new_length); + BOOST_CHECK_THROW(allocator.allocate(bad_size), std::bad_array_new_length); +} + +BOOST_AUTO_TEST_CASE(for_each_3d_test) { + auto const m_start = Utils::Vector3i{{0, -1, 3}}; + auto const m_stop = Utils::Vector3i{{2, 2, 5}}; + auto ref_loop_counters = m_start; + auto indices = Utils::Vector3i{}; + + auto const kernel = [&]() { + BOOST_REQUIRE_EQUAL(indices, ref_loop_counters); + if (++ref_loop_counters[2u] == m_stop[2u]) { + ref_loop_counters[2u] = m_start[2u]; + if (++ref_loop_counters[1u] == m_stop[1u]) { + ref_loop_counters[1u] = m_start[1u]; + if (++ref_loop_counters[0u] == m_stop[0u]) { + ref_loop_counters[0u] = m_start[0u]; + } + } + } + }; + + { + for_each_3d(m_start, m_stop, indices, kernel, [&](unsigned dim, int n) { + BOOST_REQUIRE_GE(dim, 0); + BOOST_REQUIRE_LE(dim, 2); + BOOST_REQUIRE_EQUAL(n, ref_loop_counters[dim]); + }); + + BOOST_REQUIRE_EQUAL(indices, m_stop); + BOOST_REQUIRE_EQUAL(ref_loop_counters, m_start); + } + { + for_each_3d(m_start, m_stop, indices, kernel); + + BOOST_REQUIRE_EQUAL(indices, m_stop); + BOOST_REQUIRE_EQUAL(ref_loop_counters, m_start); + } } + +#else // defined(P3M) || defined(DP3M) +int main(int argc, char **argv) {} #endif // defined(P3M) || defined(DP3M) diff --git a/src/core/unit_tests/p3m_test.cpp b/src/core/unit_tests/p3m_test.cpp index 9a0ea01bee2..3f091d565ce 100644 --- a/src/core/unit_tests/p3m_test.cpp +++ b/src/core/unit_tests/p3m_test.cpp @@ -27,8 +27,6 @@ #include #include -#include -#include #include BOOST_AUTO_TEST_CASE(calc_meshift_false) { @@ -60,27 +58,3 @@ BOOST_AUTO_TEST_CASE(calc_meshift_true) { } } } - -#if defined(P3M) || defined(DP3M) -BOOST_AUTO_TEST_CASE(analytic_cotangent_sum) { - auto constexpr kernel = p3m_analytic_cotangent_sum; - auto constexpr tol = 8. * 100. * std::numeric_limits::epsilon(); - - // check only trivial cases - for (auto const cao : {1, 2, 3, 4, 5, 6, 7}) { - BOOST_CHECK_CLOSE(kernel(0, 0., cao), 1., tol); - } - BOOST_CHECK_CLOSE(kernel(1, 0.5, 1), 1., tol); - BOOST_CHECK_CLOSE(kernel(1, 0.5, 2), 1. / 3., tol); - BOOST_CHECK_CLOSE(kernel(1, 0.5, 3), 2. / 15., tol); - BOOST_CHECK_CLOSE(kernel(1, 0.5, 4), 17. / 315., tol); - BOOST_CHECK_CLOSE(kernel(1, 0.5, 5), 62. / 2835., tol); - BOOST_CHECK_CLOSE(kernel(1, 0.5, 6), 1382. / 155925., tol); - BOOST_CHECK_CLOSE(kernel(1, 0.5, 7), 21844. / 6081075., tol); - - // check assertion - for (auto const invalid_cao : {-1, 0, 8}) { - BOOST_CHECK_THROW(kernel(1, 0., invalid_cao), std::logic_error); - } -} -#endif // defined(P3M) || defined(DP3M) diff --git a/src/utils/include/utils/Vector.hpp b/src/utils/include/utils/Vector.hpp index 6ecf0f4fc4a..f1361fa9d6e 100644 --- a/src/utils/include/utils/Vector.hpp +++ b/src/utils/include/utils/Vector.hpp @@ -80,7 +80,7 @@ template class Vector : public Array { public: template - explicit Vector(Range const &rng) + explicit constexpr Vector(Range const &rng) : Vector(std::begin(rng), std::end(rng)) {} explicit constexpr Vector(T const (&v)[N]) : Base() { copy_init(std::begin(v), std::end(v)); diff --git a/src/utils/include/utils/math/abs.hpp b/src/utils/include/utils/math/abs.hpp deleted file mode 100644 index 4576f2c9964..00000000000 --- a/src/utils/include/utils/math/abs.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (C) 2010-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ -#ifndef ESPRESSO_ABS_HPP -#define ESPRESSO_ABS_HPP - -#include "utils/device_qualifier.hpp" - -#ifndef __CUDACC__ -#include -#endif - -namespace Utils { -/** - * @brief Return the absolute value of x. - */ -inline DEVICE_QUALIFIER double abs(double x) { return fabs(x); } - -/** - * @brief Return the absolute value of x. - */ -inline DEVICE_QUALIFIER float abs(float x) { return fabsf(x); } -} // namespace Utils - -#endif // ESPRESSO_DEVICE_MATH_HPP diff --git a/src/utils/include/utils/math/sinc.hpp b/src/utils/include/utils/math/sinc.hpp deleted file mode 100644 index 0a2cf4f47c7..00000000000 --- a/src/utils/include/utils/math/sinc.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (C) 2010-2022 The ESPResSo project - * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 - * Max-Planck-Institute for Polymer Research, Theory Group - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#pragma once - -#include "utils/device_qualifier.hpp" -#include "utils/math/abs.hpp" - -#include -#include - -namespace Utils { -/** - * @brief Calculates the sinc-function as sin(PI*x)/(PI*x). - * - * (same convention as in @cite hockney88a). In order to avoid - * divisions by 0, arguments, whose modulus is smaller than epsi, will - * be evaluated by an 8th order Taylor expansion of the sinc - * function. Note that the difference between sinc(x) and this - * expansion is smaller than 0.235e-12, if x is smaller than 0.1. (The - * next term in the expansion is the 10th order contribution - * PI^10/39916800 * x^10 = 0.2346...*x^12). This expansion should - * also save time, since it reduces the number of function calls to - * sin(). - */ -template DEVICE_QUALIFIER T sinc(T d) { - const constexpr T epsi = T(0.1); - - const auto PId = std::numbers::pi_v * d; - - if (::Utils::abs(d) > epsi) - return sin(PId) / PId; - - /* Coefficients of the Taylor expansion of sinc */ - const constexpr T c2 = T(-0.1666666666667e-0); - const constexpr T c4 = T(0.8333333333333e-2); - const constexpr T c6 = T(-0.1984126984127e-3); - const constexpr T c8 = T(0.2755731922399e-5); - - const auto PId2 = PId * PId; - return T(1) + PId2 * (c2 + PId2 * (c4 + PId2 * (c6 + PId2 * c8))); -} -} // namespace Utils diff --git a/src/utils/tests/CMakeLists.txt b/src/utils/tests/CMakeLists.txt index 2dd87d9f775..89755749fab 100644 --- a/src/utils/tests/CMakeLists.txt +++ b/src/utils/tests/CMakeLists.txt @@ -21,7 +21,6 @@ include(unit_test) -unit_test(NAME abs_test SRC abs_test.cpp DEPENDS espresso::utils) unit_test(NAME Vector_test SRC Vector_test.cpp DEPENDS espresso::utils) unit_test(NAME Factory_test SRC Factory_test.cpp DEPENDS espresso::utils) unit_test(NAME NumeratedContainer_test SRC NumeratedContainer_test.cpp DEPENDS @@ -34,7 +33,6 @@ unit_test(NAME accumulator SRC accumulator.cpp DEPENDS espresso::utils unit_test(NAME int_pow SRC int_pow_test.cpp DEPENDS espresso::utils) unit_test(NAME sgn SRC sgn_test.cpp DEPENDS espresso::utils) unit_test(NAME AS_erfc_part SRC AS_erfc_part_test.cpp DEPENDS espresso::utils) -unit_test(NAME sinc SRC sinc_test.cpp DEPENDS espresso::utils) unit_test(NAME permute_ifield_test SRC permute_ifield_test.cpp DEPENDS espresso::utils) unit_test(NAME vec_rotate SRC vec_rotate_test.cpp DEPENDS espresso::utils) diff --git a/src/utils/tests/abs_test.cpp b/src/utils/tests/abs_test.cpp deleted file mode 100644 index 94e43728e7b..00000000000 --- a/src/utils/tests/abs_test.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (C) 2019-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ -#define BOOST_TEST_MODULE abs test -#define BOOST_TEST_DYN_LINK -#include - -#include - -#include -#include - -BOOST_AUTO_TEST_CASE(abs_test) { - using Utils::abs; - - static_assert(std::is_same_v); - static_assert(std::is_same_v); - - BOOST_CHECK_EQUAL(std::abs(3.1415), abs(3.1415)); - BOOST_CHECK_EQUAL(std::abs(-3.1415), abs(-3.1415)); - BOOST_CHECK_EQUAL(std::abs(3.1415f), abs(3.1415f)); - BOOST_CHECK_EQUAL(std::abs(-3.1415f), abs(-3.1415f)); -} diff --git a/src/utils/tests/sinc_test.cpp b/src/utils/tests/sinc_test.cpp deleted file mode 100644 index 063120ca971..00000000000 --- a/src/utils/tests/sinc_test.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (C) 2017-2022 The ESPResSo project - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#define BOOST_TEST_MODULE Utils::sinc test -#define BOOST_TEST_DYN_LINK -#include - -#include "utils/math/sinc.hpp" - -#include -#include -#include - -BOOST_AUTO_TEST_CASE(zero) { BOOST_CHECK_EQUAL(Utils::sinc(0.0), 1.0); } - -BOOST_AUTO_TEST_CASE(approx) { - auto x = 0.001; - while (x <= 0.11) { - auto const approx = Utils::sinc(x); - auto const pi_x = std::numbers::pi * x; - auto const exact = std::sin(pi_x) / (pi_x); - BOOST_CHECK_SMALL(approx - exact, 1e-13); - x += 0.01; - } -}