From d4e34a6f9c2787a4ef8e3e4446e5829b56e47595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 14 Jun 2024 18:47:58 +0200 Subject: [PATCH] revert_to_working --- .gitlab-ci.yml | 76 ---- CMakeLists.txt | 9 - .../code_generation_context.py | 1 + .../walberla_kernels/generate_lb_kernels.py | 4 +- maintainer/walberla_kernels/lbmpy_espresso.py | 10 +- src/core/CMakeLists.txt | 4 +- src/core/electrostatics/p3m.cpp | 245 ++++++------ src/core/electrostatics/p3m_gpu_cuda.cu | 94 +++-- src/core/electrostatics/p3m_gpu_error_cuda.cu | 55 ++- src/core/magnetostatics/dp3m.cpp | 360 ++++++++++-------- src/core/p3m/common.cpp | 44 +++ src/core/p3m/common.hpp | 16 +- src/core/p3m/fft.cpp | 2 +- src/core/p3m/fft.hpp | 43 ++- src/core/p3m/for_each_3d.hpp | 70 ---- src/core/p3m/influence_function.hpp | 115 +++--- src/core/p3m/influence_function_dipolar.hpp | 219 +++++------ src/core/p3m/math.hpp | 151 -------- src/core/unit_tests/CMakeLists.txt | 4 +- src/core/unit_tests/cuda_test.cu | 329 ++++++++-------- src/core/unit_tests/fft_test.cpp | 58 +-- src/core/unit_tests/math_test.cpp | 105 ----- src/core/unit_tests/p3m_test.cpp | 30 +- src/shapes/include/shapes/Ellipsoid.hpp | 2 +- src/shapes/src/HollowConicalFrustum.cpp | 7 +- src/utils/include/utils/Vector.hpp | 60 +-- .../cylindrical_transformation_parameters.hpp | 23 +- src/utils/include/utils/math/sqr.hpp | 2 - src/utils/include/utils/quaternion.hpp | 1 - src/utils/tests/CMakeLists.txt | 2 + src/utils/tests/Vector_test.cpp | 25 +- .../src/lattice_boltzmann/LBWalberlaImpl.hpp | 6 +- .../CollideSweepDoublePrecisionLeesEdwards.h | 2 +- ...ollideSweepDoublePrecisionLeesEdwardsAVX.h | 2 +- ...llideSweepDoublePrecisionLeesEdwardsCUDA.h | 2 +- .../CollideSweepDoublePrecisionThermalized.h | 2 +- ...ollideSweepDoublePrecisionThermalizedAVX.h | 2 +- ...llideSweepDoublePrecisionThermalizedCUDA.h | 2 +- .../CollideSweepSinglePrecisionLeesEdwards.h | 2 +- ...ollideSweepSinglePrecisionLeesEdwardsAVX.h | 2 +- ...llideSweepSinglePrecisionLeesEdwardsCUDA.h | 2 +- .../CollideSweepSinglePrecisionThermalized.h | 2 +- ...ollideSweepSinglePrecisionThermalizedAVX.h | 2 +- ...llideSweepSinglePrecisionThermalizedCUDA.h | 2 +- .../InitialPDFsSetterDoublePrecision.h | 2 +- .../InitialPDFsSetterDoublePrecisionCUDA.h | 2 +- .../InitialPDFsSetterSinglePrecision.h | 2 +- .../InitialPDFsSetterSinglePrecisionCUDA.h | 2 +- .../StreamSweepDoublePrecision.h | 2 +- .../StreamSweepDoublePrecisionAVX.h | 2 +- .../StreamSweepDoublePrecisionCUDA.h | 2 +- .../StreamSweepSinglePrecision.h | 2 +- .../StreamSweepSinglePrecisionAVX.h | 2 +- .../StreamSweepSinglePrecisionCUDA.h | 2 +- testsuite/python/lb.py | 4 +- testsuite/python/lb_boundary_ghost_layer.py | 2 +- 56 files changed, 948 insertions(+), 1274 deletions(-) delete mode 100644 src/core/p3m/for_each_3d.hpp delete mode 100644 src/core/p3m/math.hpp delete mode 100644 src/core/unit_tests/math_test.cpp diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d891c721212..cfff83fb571 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -41,40 +41,6 @@ status_pending: stage: prepare script: sh maintainer/gh_post_status.sh pending -style: - <<: *global_job_definition - stage: build - dependencies: [] - before_script: - - git config --global --add safe.directory ${CI_PROJECT_DIR} - - git submodule deinit . - script: - - sh maintainer/CI/fix_style.sh - tags: - - espresso - - no-cuda - variables: - GIT_SUBMODULE_STRATEGY: none - artifacts: - paths: - - style.patch - expire_in: 1 week - when: on_failure - -style_doxygen: - <<: *global_job_definition - stage: build - dependencies: [] - script: - - mkdir build - - cd build - - cp ../maintainer/configs/maxset.hpp myconfig.hpp - - cmake .. -D ESPRESSO_BUILD_WITH_CUDA=ON -D ESPRESSO_BUILD_WITH_GSL=ON -D ESPRESSO_BUILD_WITH_HDF5=ON -D ESPRESSO_BUILD_WITH_SCAFACOS=ON -D ESPRESSO_BUILD_WITH_WALBERLA=ON -D ESPRESSO_BUILD_WITH_WALBERLA_FFT=ON -D ESPRESSO_BUILD_WITH_STOKESIAN_DYNAMICS=ON -D ESPRESSO_BUILD_WITH_CALIPER=ON - - sh ../maintainer/CI/dox_warnings.sh - tags: - - espresso - - no-cuda - ### Builds without CUDA default: @@ -436,29 +402,6 @@ empty: - cuda - numa -check_sphinx: - <<: *global_job_definition - stage: additional_checks - needs: - - cuda12-maxset - when: on_success - script: - - cd ${CI_PROJECT_DIR}/build - - make -t && make sphinx - - make -j2 tutorials - - make check_utils - - bash ${CI_PROJECT_DIR}/maintainer/CI/doc_warnings.sh - - python3 ${CI_PROJECT_DIR}/maintainer/CI/jupyter_warnings.py - artifacts: - paths: - - build/doc/sphinx - expire_in: 1 week - tags: - - espresso - - cuda - - numa - - reuse-artifacts-same-arch - run_tutorials: <<: *global_job_definition stage: additional_checks @@ -487,25 +430,6 @@ run_tutorials: only: - schedules -run_doxygen: - <<: *global_job_definition - stage: additional_checks - needs: - - cuda12-maxset - when: on_success - script: - - cd ${CI_PROJECT_DIR}/build - - make -t && make doxygen - artifacts: - paths: - - build/doc/doxygen - expire_in: 1 week - tags: - - espresso - - no-cuda - - numa - - reuse-artifacts-same-arch - maxset_no_gpu: <<: *global_job_definition stage: additional_checks diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f084f6fc80..cd9c42814c7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -523,25 +523,16 @@ if(ESPRESSO_BUILD_WITH_ASAN) target_compile_options(espresso_cpp_flags INTERFACE -fsanitize=address -fno-omit-frame-pointer) target_link_libraries(espresso_cpp_flags INTERFACE -fsanitize=address) - if(ESPRESSO_BUILD_WITH_CUDA) - target_link_libraries(espresso_cuda_flags INTERFACE -fsanitize=address) - endif() endif() if(ESPRESSO_BUILD_WITH_MSAN) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g -O1") target_compile_options(espresso_cpp_flags INTERFACE -fsanitize=memory -fno-omit-frame-pointer) target_link_libraries(espresso_cpp_flags INTERFACE -fsanitize=memory) - if(ESPRESSO_BUILD_WITH_CUDA) - target_link_libraries(espresso_cuda_flags INTERFACE -fsanitize=memory) - endif() endif() if(ESPRESSO_BUILD_WITH_UBSAN) target_compile_options(espresso_cpp_flags INTERFACE -fsanitize=undefined) target_link_libraries(espresso_cpp_flags INTERFACE -fsanitize=undefined) - if(ESPRESSO_BUILD_WITH_CUDA) - target_link_libraries(espresso_cuda_flags INTERFACE -fsanitize=undefined) - endif() endif() target_link_libraries(espresso_cpp_flags INTERFACE espresso::coverage_flags) diff --git a/maintainer/walberla_kernels/code_generation_context.py b/maintainer/walberla_kernels/code_generation_context.py index 77f86183e0e..153822b0144 100644 --- a/maintainer/walberla_kernels/code_generation_context.py +++ b/maintainer/walberla_kernels/code_generation_context.py @@ -26,6 +26,7 @@ import pystencils_walberla + def earmark_generated_kernels(): """ Add an earmark at the beginning of generated kernels to document the diff --git a/maintainer/walberla_kernels/generate_lb_kernels.py b/maintainer/walberla_kernels/generate_lb_kernels.py index 9afd75925c1..4a7cbc9f402 100644 --- a/maintainer/walberla_kernels/generate_lb_kernels.py +++ b/maintainer/walberla_kernels/generate_lb_kernels.py @@ -149,9 +149,7 @@ def paramlist(parameters, keys): params ) - block_offsets = tuple( - ps.TypedSymbol(f"block_offset_{i}", np.uint32) - for i in range(3)) + block_offsets = tuple(ps.TypedSymbol(f"block_offset_{i}", np.uint32) for i in range(3)) # generate thermalized LB collision_rule_thermalized = lbmpy.creationfunctions.create_lb_collision_rule( diff --git a/maintainer/walberla_kernels/lbmpy_espresso.py b/maintainer/walberla_kernels/lbmpy_espresso.py index 8a755d347b0..6142713f5fa 100644 --- a/maintainer/walberla_kernels/lbmpy_espresso.py +++ b/maintainer/walberla_kernels/lbmpy_espresso.py @@ -46,11 +46,11 @@ def data_initialisation(self, direction): cell_args = [f"it.{direction}() + {bb_vec[i]}".replace('+ -', '-') for i, direction in enumerate("xyz")] code = [ - "auto const InitialisationAdditionalData = elementInitialiser(", - f"Cell({', '.join(cell_args)}), blocks, *block);", - "element.vel_0 = InitialisationAdditionalData[0];", - "element.vel_1 = InitialisationAdditionalData[1];", - "element.vel_2 = InitialisationAdditionalData[2];", + "auto const InitialisationAdditionalData = elementInitialiser(", + f"Cell({', '.join(cell_args)}), blocks, *block);", + "element.vel_0 = InitialisationAdditionalData[0];", + "element.vel_1 = InitialisationAdditionalData[1];", + "element.vel_2 = InitialisationAdditionalData[2];", ] return "\n".join(code) diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 6153849cc7a..7335db2a010 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -82,9 +82,9 @@ install(TARGETS espresso_core target_link_libraries( espresso_core PRIVATE espresso::config espresso::utils::mpi espresso::shapes - espresso::cpp_flags + espresso::profiler espresso::cpp_flags PUBLIC espresso::utils MPI::MPI_CXX Random123 espresso::particle_observables - Boost::serialization Boost::mpi espresso::profiler) + Boost::serialization Boost::mpi) target_include_directories(espresso_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/src/core/electrostatics/p3m.cpp b/src/core/electrostatics/p3m.cpp index 32667d67966..e247b88418d 100644 --- a/src/core/electrostatics/p3m.cpp +++ b/src/core/electrostatics/p3m.cpp @@ -36,9 +36,7 @@ #include "p3m/TuningAlgorithm.hpp" #include "p3m/TuningLogger.hpp" #include "p3m/fft.hpp" -#include "p3m/for_each_3d.hpp" #include "p3m/influence_function.hpp" -#include "p3m/math.hpp" #include "BoxGeometry.hpp" #include "LocalBox.hpp" @@ -59,6 +57,7 @@ #include #include #include +#include #include #include @@ -76,11 +75,9 @@ #include #include #include -#include #include #include #include -#include #include void CoulombP3M::count_charged_particles() { @@ -117,7 +114,7 @@ void CoulombP3M::calc_influence_function_force() { auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh}; p3m.g_force = grid_influence_function<1>(p3m.params, start, start + size, - get_system().box_geo->length_inv()); + get_system().box_geo->length()); } /** Calculate the influence function optimized for the energy and the @@ -128,38 +125,41 @@ void CoulombP3M::calc_influence_function_energy() { auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh}; p3m.g_energy = grid_influence_function<0>(p3m.params, start, start + size, - get_system().box_geo->length_inv()); + get_system().box_geo->length()); } /** Aliasing sum used by @ref p3m_k_space_error. */ -static auto p3m_tune_aliasing_sums(Utils::Vector3i const &shift, +static auto p3m_tune_aliasing_sums(int nx, int ny, int nz, Utils::Vector3i const &mesh, Utils::Vector3d const &mesh_i, int cao, double alpha_L_i) { + using Utils::sinc; - auto constexpr m_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN); - auto constexpr m_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1); auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i); + auto alias1 = 0.; auto alias2 = 0.; - - Utils::Vector3i indices{}; - Utils::Vector3i nm{}; - Utils::Vector3d fnm{}; - for_each_3d( - m_start, m_stop, indices, - [&]() { - auto const norm_sq = nm.norm2(); - auto const ex = exp(-factor1 * norm_sq); - auto const energy = std::pow(Utils::product(fnm), 2 * cao); - alias1 += Utils::sqr(ex) / norm_sq; - alias2 += energy * ex * (shift * nm) / norm_sq; - }, - [&](unsigned dim, int n) { - nm[dim] = shift[dim] + n * mesh[dim]; - fnm[dim] = math::sinc(nm[dim] * mesh_i[dim]); - }); - + for (int mx = -P3M_BRILLOUIN; mx <= P3M_BRILLOUIN; mx++) { + auto const nmx = nx + mx * mesh[0]; + auto const fnmx = mesh_i[0] * nmx; + for (int my = -P3M_BRILLOUIN; my <= P3M_BRILLOUIN; my++) { + auto const nmy = ny + my * mesh[1]; + auto const fnmy = mesh_i[1] * nmy; + for (int mz = -P3M_BRILLOUIN; mz <= P3M_BRILLOUIN; mz++) { + auto const nmz = nz + mz * mesh[2]; + auto const fnmz = mesh_i[2] * nmz; + + auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz); + auto const ex = exp(-factor1 * nm2); + auto const ex2 = Utils::sqr(ex); + + auto const U2 = pow(sinc(fnmx) * sinc(fnmy) * sinc(fnmz), 2.0 * cao); + + alias1 += ex2 / nm2; + alias2 += U2 * ex * (nx * nmx + ny * nmy + nz * nmz) / nm2; + } + } + } return std::make_pair(alias1, alias2); } @@ -197,40 +197,51 @@ static double p3m_real_space_error(double pref, double r_cut_iL, int n_c_part, static double p3m_k_space_error(double pref, Utils::Vector3i const &mesh, int cao, int n_c_part, double sum_q2, double alpha_L, Utils::Vector3d const &box_l) { - - auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao); - auto const mesh_i = 1. / Utils::Vector3d(mesh); + auto const mesh_i = + Utils::hadamard_division(Utils::Vector3d::broadcast(1.), mesh); auto const alpha_L_i = 1. / alpha_L; - auto const m_stop = mesh / 2; - auto const m_start = -m_stop; - auto indices = Utils::Vector3i{}; - auto values = Utils::Vector3d{}; auto he_q = 0.; - for_each_3d( - m_start, m_stop, indices, - [&]() { - if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) { - auto const n2 = indices.norm2(); - auto const cs = Utils::product(values); + for (int nx = -mesh[0] / 2; nx < mesh[0] / 2; nx++) { + auto const ctan_x = p3m_analytic_cotangent_sum(nx, mesh_i[0], cao); + for (int ny = -mesh[1] / 2; ny < mesh[1] / 2; ny++) { + auto const ctan_y = + ctan_x * p3m_analytic_cotangent_sum(ny, mesh_i[1], cao); + for (int nz = -mesh[2] / 2; nz < mesh[2] / 2; nz++) { + if ((nx != 0) || (ny != 0) || (nz != 0)) { + auto const n2 = Utils::sqr(nx) + Utils::sqr(ny) + Utils::sqr(nz); + auto const cs = + p3m_analytic_cotangent_sum(nz, mesh_i[2], cao) * ctan_y; auto const [alias1, alias2] = - p3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i); + p3m_tune_aliasing_sums(nx, ny, nz, mesh, mesh_i, cao, alpha_L_i); auto const d = alias1 - Utils::sqr(alias2 / cs) / n2; /* at high precision, d can become negative due to extinction; also, don't take values that have no significant digits left*/ - if (d > 0. and std::fabs(d / alias1) > ROUND_ERROR_PREC) { + if (d > 0 && (fabs(d / alias1) > ROUND_ERROR_PREC)) he_q += d; - } } - }, - [&values, &mesh_i, cotangent_sum](unsigned dim, int n) { - values[dim] = cotangent_sum(n, mesh_i[dim]); - }); - + } + } + } return 2. * pref * sum_q2 * sqrt(he_q / static_cast(n_c_part)) / (box_l[1] * box_l[2]); } +#ifdef CUDA +static double p3mgpu_k_space_error(double prefactor, + Utils::Vector3i const &mesh, int cao, + int npart, double sum_q2, double alpha_L, + Utils::Vector3d const &box_l) { + auto ks_err = 0.; + if (this_node == 0) { + ks_err = p3m_k_space_error_gpu(prefactor, mesh.data(), cao, npart, sum_q2, + alpha_L, box_l.data()); + } + boost::mpi::broadcast(comm_cart, ks_err, 0); + return ks_err; +} +#endif + void CoulombP3M::init() { assert(p3m.params.mesh >= Utils::Vector3i::broadcast(1)); assert(p3m.params.cao >= 1 and p3m.params.cao <= 7); @@ -410,44 +421,48 @@ CoulombP3M::long_range_pressure(ParticleRange const &particles) { p3m.sm.gather_grid(p3m.rs_mesh.data(), comm_cart, p3m.local_mesh.dim); fft_perform_forw(p3m.rs_mesh.data(), p3m.fft, comm_cart); - auto constexpr m_start = Utils::Vector3i::broadcast(0); - auto const half_alpha_inv_sq = Utils::sqr(1. / 2. / p3m.params.alpha); - auto const m_stop = std::span(p3m.fft.plan[3].new_mesh); - auto const offset = std::span(p3m.fft.plan[3].start); - auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv(); - auto indices = Utils::Vector3i{}; - auto index = std::size_t(0u); auto diagonal = 0.; - - for_each_3d(m_start, m_stop, indices, [&]() { - auto const kx = p3m.d_op[RX][indices[KX] + offset[KX]] * wavevector[RX]; - auto const ky = p3m.d_op[RY][indices[KY] + offset[KY]] * wavevector[RY]; - auto const kz = p3m.d_op[RZ][indices[KZ] + offset[KZ]] * wavevector[RZ]; - auto const norm_sq = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz); - - if (norm_sq != 0.) { - auto const node_k_space_energy = - p3m.g_energy[index] * (Utils::sqr(p3m.rs_mesh[2u * index + 0u]) + - Utils::sqr(p3m.rs_mesh[2u * index + 1u])); - auto const vterm = -2. * (1. / norm_sq + half_alpha_inv_sq); - auto const pref = node_k_space_energy * vterm; - node_k_space_pressure_tensor[0u] += pref * kx * kx; /* sigma_xx */ - node_k_space_pressure_tensor[1u] += pref * kx * ky; /* sigma_xy */ - node_k_space_pressure_tensor[2u] += pref * kx * kz; /* sigma_xz */ - node_k_space_pressure_tensor[4u] += pref * ky * ky; /* sigma_yy */ - node_k_space_pressure_tensor[5u] += pref * ky * kz; /* sigma_yz */ - node_k_space_pressure_tensor[8u] += pref * kz * kz; /* sigma_zz */ - diagonal += node_k_space_energy; + int ind = 0; + int j[3]; + auto const half_alpha_inv_sq = Utils::sqr(1. / 2. / p3m.params.alpha); + for (j[0] = 0; j[0] < p3m.fft.plan[3].new_mesh[RX]; j[0]++) { + for (j[1] = 0; j[1] < p3m.fft.plan[3].new_mesh[RY]; j[1]++) { + for (j[2] = 0; j[2] < p3m.fft.plan[3].new_mesh[RZ]; j[2]++) { + auto const kx = 2. * std::numbers::pi * + p3m.d_op[RX][j[KX] + p3m.fft.plan[3].start[KX]] * + box_geo.length_inv()[RX]; + auto const ky = 2. * std::numbers::pi * + p3m.d_op[RY][j[KY] + p3m.fft.plan[3].start[KY]] * + box_geo.length_inv()[RY]; + auto const kz = 2. * std::numbers::pi * + p3m.d_op[RZ][j[KZ] + p3m.fft.plan[3].start[KZ]] * + box_geo.length_inv()[RZ]; + auto const sqk = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz); + + if (sqk != 0.) { + auto const node_k_space_energy = + p3m.g_energy[ind] * (Utils::sqr(p3m.rs_mesh[2 * ind]) + + Utils::sqr(p3m.rs_mesh[2 * ind + 1])); + auto const vterm = -2. * (1. / sqk + half_alpha_inv_sq); + auto const pref = node_k_space_energy * vterm; + node_k_space_pressure_tensor[0] += pref * kx * kx; /* sigma_xx */ + node_k_space_pressure_tensor[1] += pref * kx * ky; /* sigma_xy */ + node_k_space_pressure_tensor[2] += pref * kx * kz; /* sigma_xz */ + node_k_space_pressure_tensor[3] += pref * ky * kx; /* sigma_yx */ + node_k_space_pressure_tensor[4] += pref * ky * ky; /* sigma_yy */ + node_k_space_pressure_tensor[5] += pref * ky * kz; /* sigma_yz */ + node_k_space_pressure_tensor[6] += pref * kz * kx; /* sigma_zx */ + node_k_space_pressure_tensor[7] += pref * kz * ky; /* sigma_zy */ + node_k_space_pressure_tensor[8] += pref * kz * kz; /* sigma_zz */ + diagonal += node_k_space_energy; + } + ind++; + } } - ++index; - }); - - node_k_space_pressure_tensor[0u] += diagonal; - node_k_space_pressure_tensor[4u] += diagonal; - node_k_space_pressure_tensor[8u] += diagonal; - node_k_space_pressure_tensor[3u] = node_k_space_pressure_tensor[1u]; - node_k_space_pressure_tensor[6u] = node_k_space_pressure_tensor[2u]; - node_k_space_pressure_tensor[7u] = node_k_space_pressure_tensor[5u]; + } + node_k_space_pressure_tensor[0] += diagonal; + node_k_space_pressure_tensor[4] += diagonal; + node_k_space_pressure_tensor[8] += diagonal; } return node_k_space_pressure_tensor * prefactor / (2. * box_geo.volume()); @@ -494,35 +509,35 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, /* === k-space force calculation === */ if (force_flag) { - /* i*k differentiation */ - auto constexpr m_start = Utils::Vector3i::broadcast(0); - auto const m_stop = std::span(p3m.fft.plan[3].new_mesh); - auto const offset = std::span(p3m.fft.plan[3].start); - auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv(); - auto indices = Utils::Vector3i{}; - auto index = std::size_t(0u); - + /* sqrt(-1)*k differentiation */ + int j[3]; + int ind = 0; /* compute electric field */ // Eq. (3.49) @cite deserno00b - for_each_3d(m_start, m_stop, indices, [&]() { - auto const rho_hat = std::complex(p3m.rs_mesh[2u * index + 0u], - p3m.rs_mesh[2u * index + 1u]); - auto const phi_hat = p3m.g_force[index] * rho_hat; - - for (int d = 0; d < 3; d++) { - /* direction in r-space: */ - int d_rs = (d + p3m.ks_pnum) % 3; - /* directions */ - auto const k = - p3m.d_op[d_rs][indices[d] + offset[d]] * wavevector[d_rs]; - - /* i*k*(Re+i*Im) = - Im*k + i*Re*k (i=sqrt(-1)) */ - p3m.E_mesh[d_rs][2u * index + 0u] = -k * phi_hat.imag(); - p3m.E_mesh[d_rs][2u * index + 1u] = +k * phi_hat.real(); - } + for (j[0] = 0; j[0] < p3m.fft.plan[3].new_mesh[0]; j[0]++) { + for (j[1] = 0; j[1] < p3m.fft.plan[3].new_mesh[1]; j[1]++) { + for (j[2] = 0; j[2] < p3m.fft.plan[3].new_mesh[2]; j[2]++) { + auto const rho_hat = std::complex(p3m.rs_mesh[2 * ind + 0], + p3m.rs_mesh[2 * ind + 1]); + auto const phi_hat = p3m.g_force[ind] * rho_hat; + + for (int d = 0; d < 3; d++) { + /* direction in r-space: */ + int d_rs = (d + p3m.ks_pnum) % 3; + /* directions */ + auto const k = 2. * std::numbers::pi * + p3m.d_op[d_rs][j[d] + p3m.fft.plan[3].start[d]] * + box_geo.length_inv()[d_rs]; + + /* i*k*(Re+i*Im) = - Im*k + i*Re*k (i=sqrt(-1)) */ + p3m.E_mesh[d_rs][2 * ind + 0] = -k * phi_hat.imag(); + p3m.E_mesh[d_rs][2 * ind + 1] = +k * phi_hat.real(); + } - ++index; - }); + ind++; + } + } + } /* Back FFT force component mesh */ auto const check_complex = !p3m.params.tuning and check_complex_residuals; @@ -663,12 +678,8 @@ class CoulombTuningAlgorithm : public TuningAlgorithm { #ifdef CUDA auto const &solver = m_system.coulomb.impl->solver; if (has_actor_of_type(solver)) { - if (this_node == 0) { - ks_err = - p3m_k_space_error_gpu(m_prefactor, mesh.data(), cao, p3m.sum_qpart, - p3m.sum_q2, alpha_L, box_geo.length().data()); - } - boost::mpi::broadcast(comm_cart, ks_err, 0); + ks_err = p3mgpu_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart, + p3m.sum_q2, alpha_L, box_geo.length()); } else #endif ks_err = p3m_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart, @@ -796,7 +807,7 @@ void CoulombP3M::sanity_checks_boxl() const { auto const &system = get_system(); auto const &box_geo = *system.box_geo; auto const &local_geo = *system.local_geo; - for (auto i = 0u; i < 3u; i++) { + for (unsigned int i = 0u; i < 3u; i++) { /* check k-space cutoff */ if (p3m.params.cao_cut[i] >= box_geo.length_half()[i]) { std::stringstream msg; diff --git a/src/core/electrostatics/p3m_gpu_cuda.cu b/src/core/electrostatics/p3m_gpu_cuda.cu index c734ba0cccd..6429c8afeed 100644 --- a/src/core/electrostatics/p3m_gpu_cuda.cu +++ b/src/core/electrostatics/p3m_gpu_cuda.cu @@ -53,20 +53,20 @@ #include "electrostatics/p3m_gpu_cuda.cuh" #include "cuda/utils.cuh" -#include "p3m/math.hpp" #include "system/System.hpp" #include #include +#include #include #include #include #include -#include #include -#include +#include +#include #include #include @@ -93,7 +93,7 @@ struct P3MGpuData { /** Ewald parameter */ REAL_TYPE alpha; /** Number of particles */ - unsigned int n_part; // oddity: size_t causes UB with GCC 11.4 in Debug mode + std::size_t n_part; /** Box size */ REAL_TYPE box[3]; /** Mesh dimensions */ @@ -138,31 +138,6 @@ struct P3MGpuParams { } }; -static auto p3m_calc_blocks(unsigned int cao, std::size_t n_part) { - auto const cao3 = Utils::int_pow<3>(cao); - auto parts_per_block = 1u; - while ((parts_per_block + 1u) * cao3 <= 1024u) { - ++parts_per_block; - } - assert((n_part / parts_per_block) <= std::numeric_limits::max()); - auto n = static_cast(n_part / parts_per_block); - auto n_blocks = ((n_part % parts_per_block) == 0u) ? std::max(1u, n) : n + 1u; - assert(n_blocks <= std::numeric_limits::max()); - return std::make_pair(parts_per_block, static_cast(n_blocks)); -} - -dim3 p3m_make_grid(unsigned int n_blocks) { - dim3 grid(n_blocks, 1u, 1u); - while (grid.x > 65536u) { - grid.y++; - if ((n_blocks % grid.y) == 0u) - grid.x = std::max(1u, n_blocks / grid.y); - else - grid.x = n_blocks / grid.y + 1u; - } - return grid; -} - template __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY, int NZ, REAL_TYPE *Zaehler, @@ -185,15 +160,15 @@ __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY, for (MX = -P3M_BRILLOUIN; MX <= P3M_BRILLOUIN; MX++) { NMX = static_cast(((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX) + p.mesh[0] * MX); - S1 = int_pow<2 * cao>(math::sinc(Meshi[0] * NMX)); + S1 = int_pow<2 * cao>(Utils::sinc(Meshi[0] * NMX)); for (MY = -P3M_BRILLOUIN; MY <= P3M_BRILLOUIN; MY++) { NMY = static_cast( ((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY) + p.mesh[1] * MY); - S2 = S1 * int_pow<2 * cao>(math::sinc(Meshi[1] * NMY)); + S2 = S1 * int_pow<2 * cao>(Utils::sinc(Meshi[1] * NMY)); for (MZ = -P3M_BRILLOUIN; MZ <= P3M_BRILLOUIN; MZ++) { NMZ = static_cast( ((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ) + p.mesh[2] * MZ); - S3 = S2 * int_pow<2 * cao>(math::sinc(Meshi[2] * NMZ)); + S3 = S2 * int_pow<2 * cao>(Utils::sinc(Meshi[2] * NMZ)); NM2 = sqr(NMX * Leni[0]) + sqr(NMY * Leni[1]) + sqr(NMZ * Leni[2]); *Nenner += S3; @@ -391,9 +366,26 @@ void assign_charges(P3MGpuData const ¶ms, float const *const __restrict__ part_pos, float const *const __restrict__ part_q) { auto const cao = static_cast(params.cao); - auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part); + auto const cao3 = int_pow<3>(cao); + unsigned int parts_per_block = 1u, n_blocks = 1u; + + while ((parts_per_block + 1u) * cao3 <= 1024u) { + parts_per_block++; + } + if ((params.n_part % parts_per_block) == 0u) + n_blocks = std::max(1u, params.n_part / parts_per_block); + else + n_blocks = params.n_part / parts_per_block + 1u; + dim3 block(parts_per_block * cao, cao, cao); - dim3 grid = p3m_make_grid(n_blocks); + dim3 grid(n_blocks, 1u, 1u); + while (grid.x > 65536u) { + grid.y++; + if ((n_blocks % grid.y) == 0u) + grid.x = std::max(1u, n_blocks / grid.y); + else + grid.x = n_blocks / grid.y + 1u; + } auto const data_length = std::size_t(3u) * static_cast(parts_per_block) * @@ -446,7 +438,7 @@ __global__ void assign_forces_kernel(P3MGpuData const params, /* id of the particle */ auto const id = parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block; - if (id >= static_cast(params.n_part)) + if (id >= params.n_part) return; /* position relative to the closest grid point */ REAL_TYPE m_pos[3]; @@ -508,9 +500,27 @@ void assign_forces(P3MGpuData const ¶ms, float *const __restrict__ part_f, REAL_TYPE const prefactor) { auto const cao = static_cast(params.cao); - auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part); + auto const cao3 = int_pow<3>(cao); + unsigned int parts_per_block = 1u, n_blocks = 1u; + + while ((parts_per_block + 1u) * cao3 <= 1024u) { + parts_per_block++; + } + + if ((params.n_part % parts_per_block) == 0u) + n_blocks = std::max(1u, params.n_part / parts_per_block); + else + n_blocks = params.n_part / parts_per_block + 1u; + dim3 block(parts_per_block * cao, cao, cao); - dim3 grid = p3m_make_grid(n_blocks); + dim3 grid(n_blocks, 1u, 1u); + while (grid.x > 65536u) { + grid.y++; + if (n_blocks % grid.y == 0u) + grid.x = std::max(1u, n_blocks / grid.y); + else + grid.x = n_blocks / grid.y + 1u; + } /* Switch for assignment templates, the shared version only is faster for cao * > 2 */ @@ -571,8 +581,7 @@ void p3m_gpu_init(std::shared_ptr &data, int cao, auto &p3m_gpu_data = data->p3m_gpu_data; bool do_reinit = false, mesh_changed = false; - assert(n_part <= std::numeric_limits::max()); - p3m_gpu_data.n_part = static_cast(n_part); + p3m_gpu_data.n_part = n_part; if (not data->is_initialized or p3m_gpu_data.alpha != alpha) { p3m_gpu_data.alpha = static_cast(alpha); @@ -685,9 +694,9 @@ void p3m_gpu_init(std::shared_ptr &data, int cao, void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu, double prefactor, std::size_t n_part) { auto &p3m_gpu_data = data.p3m_gpu_data; - p3m_gpu_data.n_part = static_cast(n_part); + p3m_gpu_data.n_part = n_part; - if (n_part == 0u) + if (p3m_gpu_data.n_part == 0u) return; auto const positions_device = gpu.get_particle_positions_device(); @@ -713,7 +722,8 @@ void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu, if (FFT_FORW_FFT(data.p3m_fft.forw_plan, (REAL_TYPE *)p3m_gpu_data.charge_mesh, p3m_gpu_data.charge_mesh) != CUFFT_SUCCESS) { - throw std::runtime_error("CUFFT error: Forward FFT failed"); + fprintf(stderr, "CUFFT error: Forward FFT failed\n"); + return; } /* Do convolution */ diff --git a/src/core/electrostatics/p3m_gpu_error_cuda.cu b/src/core/electrostatics/p3m_gpu_error_cuda.cu index fbc51fecf4a..6e54e2dfe7e 100644 --- a/src/core/electrostatics/p3m_gpu_error_cuda.cu +++ b/src/core/electrostatics/p3m_gpu_error_cuda.cu @@ -23,12 +23,12 @@ * The corresponding header file is p3m_gpu_error.hpp. */ -#include "p3m/math.hpp" #include "p3m_gpu_error.hpp" #include "cuda/utils.cuh" #include +#include #include #include @@ -44,12 +44,49 @@ #error CU-file includes mpi.h! This should not happen! #endif +using Utils::int_pow; +using Utils::sqr; + +/** @todo Extend to higher order. This comes from some 1/sin expansion in + * @cite hockney88a + */ + +template +__device__ static double p3m_analytic_cotangent_sum(int n, double mesh_i) { + const double c = sqr(cos(std::numbers::pi * mesh_i * n)); + + switch (cao) { + case 1: + return 1; + case 2: + return (1.0 + c * 2.0) / 3.0; + case 3: + return (2.0 + c * (11.0 + c * 2.0)) / 15.0; + case 4: + return (17.0 + c * (180.0 + c * (114.0 + c * 4.0))) / 315.0; + case 5: + return (62.0 + c * (1072.0 + c * (1452.0 + c * (247.0 + c * 2.0)))) / + 2835.0; + case 6: + return (1382.0 + + c * (35396.0 + + c * (83021.0 + c * (34096.0 + c * (2026.0 + c * 4.0))))) / + 155925.0; + case 7: + return (21844.0 + + c * (776661.0 + + c * (2801040.0 + + c * (2123860.0 + + c * (349500.0 + c * (8166.0 + c * 4.0)))))) / + 6081075.0; + } + + return 0.0; +} + template __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi, double alpha_L, double *he_q) { - using Utils::int_pow; - using Utils::sqr; - const int nx = -mesh.x / 2 + static_cast(blockDim.x * blockIdx.x + threadIdx.x); const int ny = @@ -66,14 +103,14 @@ __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi, if ((nx != 0) || (ny != 0) || (nz != 0)) { const double alpha_L_i = 1. / alpha_L; const double n2 = sqr(nx) + sqr(ny) + sqr(nz); - const double cs = math::analytic_cotangent_sum(nz, meshi.z) * - math::analytic_cotangent_sum(nx, meshi.x) * - math::analytic_cotangent_sum(ny, meshi.y); + const double cs = p3m_analytic_cotangent_sum(nz, meshi.z) * + p3m_analytic_cotangent_sum(nx, meshi.x) * + p3m_analytic_cotangent_sum(ny, meshi.y); const double ex = exp(-sqr(std::numbers::pi * alpha_L_i) * n2); const double ex2 = sqr(ex); const double U2 = - int_pow<2 * cao>(math::sinc(meshi.x * nx) * math::sinc(meshi.y * ny) * - math::sinc(meshi.z * nz)); + int_pow<2 * cao>(Utils::sinc(meshi.x * nx) * Utils::sinc(meshi.y * ny) * + Utils::sinc(meshi.z * nz)); auto const alias1 = ex2 / n2; auto const d = alias1 - sqr(U2 * ex / cs) / n2; diff --git a/src/core/magnetostatics/dp3m.cpp b/src/core/magnetostatics/dp3m.cpp index 4214900fb52..4fcad22318e 100644 --- a/src/core/magnetostatics/dp3m.cpp +++ b/src/core/magnetostatics/dp3m.cpp @@ -37,7 +37,6 @@ #include "p3m/fft.hpp" #include "p3m/influence_function_dipolar.hpp" #include "p3m/interpolation.hpp" -#include "p3m/math.hpp" #include "p3m/send_mesh.hpp" #include "BoxGeometry.hpp" @@ -57,6 +56,7 @@ #include #include #include +#include #include #include @@ -67,13 +67,10 @@ #include #include #include -#include #include #include -#include #include #include -#include #include void DipolarP3M::count_magnetic_particles() { @@ -275,13 +272,15 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, dipole_assign(particles); /* Gather information for FFT grid inside the nodes domain (inner local * mesh) and perform forward 3D FFT (Charge Assignment Mesh). */ - std::array meshes = {{dp3m.rs_mesh_dip[0u].data(), - dp3m.rs_mesh_dip[1u].data(), - dp3m.rs_mesh_dip[2u].data()}}; + std::array meshes = {{dp3m.rs_mesh_dip[0].data(), + dp3m.rs_mesh_dip[1].data(), + dp3m.rs_mesh_dip[2].data()}}; + dp3m.sm.gather_grid(meshes, comm_cart, dp3m.local_mesh.dim); - for (auto i = 0u; i < 3u; ++i) { - fft_perform_forw(dp3m.rs_mesh_dip[i].data(), dp3m.fft, comm_cart); - } + + fft_perform_forw(dp3m.rs_mesh_dip[0].data(), dp3m.fft, comm_cart); + fft_perform_forw(dp3m.rs_mesh_dip[1].data(), dp3m.fft, comm_cart); + fft_perform_forw(dp3m.rs_mesh_dip[2].data(), dp3m.fft, comm_cart); // Note: after these calls, the grids are in the order yzx and not xyz // anymore!!! } @@ -294,32 +293,34 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, if (dp3m.sum_mu2 > 0.) { /* i*k differentiation for dipolar gradients: * |(\Fourier{\vect{mu}}(k)\cdot \vect{k})|^2 */ - - auto constexpr m_start = Utils::Vector3i::broadcast(0); - auto const m_stop = std::span(dp3m.fft.plan[3].new_mesh); - auto const offset = std::span(dp3m.fft.plan[3].start); - auto const &d_op = dp3m.d_op[0u]; - auto const &mesh_dip = dp3m.rs_mesh_dip; - auto indices = Utils::Vector3i{}; - auto index = std::size_t(0u); - auto it_energy = dp3m.g_energy.begin(); - auto node_energy = 0.; - for_each_3d(m_start, m_stop, indices, [&]() { - using namespace detail::FFT_indexing; - // Re(mu)*k - auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] + - mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] + - mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]]; - ++index; - // Im(mu)*k - auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] + - mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] + - mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]]; - ++index; - node_energy += *it_energy * (Utils::sqr(re) + Utils::sqr(im)); - std::advance(it_energy, 1); - }); - + int ind = 0; + int i = 0; + int j[3]; + double node_energy = 0.0; + for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { + for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; j[1]++) { + for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; j[2]++) { + node_energy += + dp3m.g_energy[i] * + (Utils::sqr( + dp3m.rs_mesh_dip[0][ind] * + dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] + + dp3m.rs_mesh_dip[1][ind] * + dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] + + dp3m.rs_mesh_dip[2][ind] * + dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]) + + Utils::sqr( + dp3m.rs_mesh_dip[0][ind + 1] * + dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] + + dp3m.rs_mesh_dip[1][ind + 1] * + dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] + + dp3m.rs_mesh_dip[2][ind + 1] * + dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]])); + ind += 2; + i++; + } + } + } node_energy *= dipole_prefac * std::numbers::pi * box_geo.length_inv()[0]; boost::mpi::reduce(comm_cart, node_energy, energy, std::plus<>(), 0); @@ -343,45 +344,62 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, * DIPOLAR TORQUES (k-space) ****************************/ if (dp3m.sum_mu2 > 0.) { - auto const wavenumber = 2. * std::numbers::pi * box_geo.length_inv()[0u]; - auto constexpr m_start = Utils::Vector3i::broadcast(0); - auto const m_stop = std::span(dp3m.fft.plan[3].new_mesh); - auto const offset = std::span(dp3m.fft.plan[3].start); - auto const &d_op = dp3m.d_op[0u]; - auto const &mesh_dip = dp3m.rs_mesh_dip; - auto indices = Utils::Vector3i{}; - auto index = std::size_t(0u); - + auto const two_pi_L_i = 2. * std::numbers::pi * box_geo.length_inv()[0]; /* fill in ks_mesh array for torque calculation */ - auto it_energy = dp3m.g_energy.begin(); - index = 0u; - for_each_3d(m_start, m_stop, indices, [&]() { - using namespace detail::FFT_indexing; - // Re(mu)*k - auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] + - mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] + - mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]]; - dp3m.ks_mesh[index] = *it_energy * re; - ++index; - // Im(mu)*k - auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] + - mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] + - mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]]; - dp3m.ks_mesh[index] = *it_energy * im; - ++index; - std::advance(it_energy, 1); - }); + int ind = 0; + int i = 0; + int j[3]; + double tmp0, tmp1; + + for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { // j[0]=n_y + for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; + j[1]++) { // j[1]=n_z + for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; + j[2]++) { // j[2]=n_x + // tmp0 = Re(mu)*k, tmp1 = Im(mu)*k + + tmp0 = dp3m.rs_mesh_dip[0][ind] * + dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] + + dp3m.rs_mesh_dip[1][ind] * + dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] + + dp3m.rs_mesh_dip[2][ind] * + dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]; + + tmp1 = dp3m.rs_mesh_dip[0][ind + 1] * + dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] + + dp3m.rs_mesh_dip[1][ind + 1] * + dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] + + dp3m.rs_mesh_dip[2][ind + 1] * + dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]; + + /* the optimal influence function is the same for torques + and energy */ + dp3m.ks_mesh[ind] = tmp0 * dp3m.g_energy[i]; + dp3m.ks_mesh[ind + 1] = tmp1 * dp3m.g_energy[i]; + ind += 2; + i++; + } + } + } /* Force component loop */ for (int d = 0; d < 3; d++) { - index = 0u; - for_each_3d(m_start, m_stop, indices, [&]() { - auto const pos = indices[d] + offset[d]; - dp3m.rs_mesh[index] = d_op[pos] * dp3m.ks_mesh[index]; - ++index; - dp3m.rs_mesh[index] = d_op[pos] * dp3m.ks_mesh[index]; - ++index; - }); + auto const d_rs = (d + dp3m.ks_pnum) % 3; + ind = 0; + for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { + for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; j[1]++) { + for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; j[2]++) { + dp3m.rs_mesh[ind] = + dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] * + dp3m.ks_mesh[ind]; + ind++; + dp3m.rs_mesh[ind] = + dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] * + dp3m.ks_mesh[ind]; + ind++; + } + } + } /* Back FFT force component mesh */ fft_perform_back(dp3m.rs_mesh.data(), false, dp3m.fft, comm_cart); @@ -389,68 +407,94 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, dp3m.sm.spread_grid(dp3m.rs_mesh.data(), comm_cart, dp3m.local_mesh.dim); /* Assign force component from mesh to particle */ - auto const d_rs = (d + dp3m.ks_pnum) % 3; Utils::integral_parameter( - dp3m.params.cao, dp3m, dipole_prefac * wavenumber, d_rs, particles); + dp3m.params.cao, dp3m, dipole_prefac * two_pi_L_i, d_rs, particles); } /*************************** DIPOLAR FORCES (k-space) ****************************/ + // Compute forces after torques because the algorithm below overwrites the // grids dp3m.rs_mesh_dip ! // Note: I'll do here 9 inverse FFTs. By symmetry, we can reduce this // number to 6 ! /* fill in ks_mesh array for force calculation */ - auto it_force = dp3m.g_force.begin(); - index = 0u; - for_each_3d(m_start, m_stop, indices, [&]() { - using namespace detail::FFT_indexing; - // Re(mu)*k - auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] + - mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] + - mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]]; - ++index; - // Im(mu)*k - auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] + - mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] + - mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]]; - ++index; - dp3m.ks_mesh[index - 2] = *it_force * im; - dp3m.ks_mesh[index - 1] = *it_force * (-re); - std::advance(it_force, 1); - }); + ind = 0; + i = 0; + for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { // j[0]=n_y + for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; + j[1]++) { // j[1]=n_z + for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; + j[2]++) { // j[2]=n_x + // tmp0 = Im(mu)*k, tmp1 = -Re(mu)*k + tmp0 = dp3m.rs_mesh_dip[0][ind + 1] * + dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] + + dp3m.rs_mesh_dip[1][ind + 1] * + dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] + + dp3m.rs_mesh_dip[2][ind + 1] * + dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]; + tmp1 = dp3m.rs_mesh_dip[0][ind] * + dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] + + dp3m.rs_mesh_dip[1][ind] * + dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] + + dp3m.rs_mesh_dip[2][ind] * + dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]; + dp3m.ks_mesh[ind] = tmp0 * dp3m.g_force[i]; + dp3m.ks_mesh[ind + 1] = -tmp1 * dp3m.g_force[i]; + ind += 2; + i++; + } + } + } /* Force component loop */ - for (int d = 0; d < 3; d++) { - auto &rs_mesh_dip = dp3m.rs_mesh_dip; - index = 0u; - for_each_3d(m_start, m_stop, indices, [&]() { - using namespace detail::FFT_indexing; - auto const f1 = d_op[indices[d] + offset[d]] * dp3m.ks_mesh[index]; - rs_mesh_dip[0][index] = d_op[indices[KX] + offset[KX]] * f1; - rs_mesh_dip[1][index] = d_op[indices[KY] + offset[KY]] * f1; - rs_mesh_dip[2][index] = d_op[indices[KZ] + offset[KZ]] * f1; - ++index; - auto const f2 = d_op[indices[d] + offset[d]] * dp3m.ks_mesh[index]; - rs_mesh_dip[0][index] = d_op[indices[KX] + offset[KX]] * f2; - rs_mesh_dip[1][index] = d_op[indices[KY] + offset[KY]] * f2; - rs_mesh_dip[2][index] = d_op[indices[KZ] + offset[KZ]] * f2; - ++index; - }); - /* Back FFT force component mesh */ - for (auto i = 0u; i < 3u; ++i) { - fft_perform_back(rs_mesh_dip[i].data(), false, dp3m.fft, comm_cart); + for (int d = 0; d < 3; d++) { /* direction in k-space: */ + auto const d_rs = (d + dp3m.ks_pnum) % 3; + ind = 0; + for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; + j[0]++) { // j[0]=n_y + for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; + j[1]++) { // j[1]=n_z + for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; + j[2]++) { // j[2]=n_x + tmp0 = dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] * + dp3m.ks_mesh[ind]; + dp3m.rs_mesh_dip[0][ind] = + dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] * tmp0; + dp3m.rs_mesh_dip[1][ind] = + dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] * tmp0; + dp3m.rs_mesh_dip[2][ind] = + dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]] * tmp0; + ind++; + tmp0 = dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] * + dp3m.ks_mesh[ind]; + dp3m.rs_mesh_dip[0][ind] = + dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] * tmp0; + dp3m.rs_mesh_dip[1][ind] = + dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] * tmp0; + dp3m.rs_mesh_dip[2][ind] = + dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]] * tmp0; + ind++; + } + } } + /* Back FFT force component mesh */ + fft_perform_back(dp3m.rs_mesh_dip[0].data(), false, dp3m.fft, + comm_cart); + fft_perform_back(dp3m.rs_mesh_dip[1].data(), false, dp3m.fft, + comm_cart); + fft_perform_back(dp3m.rs_mesh_dip[2].data(), false, dp3m.fft, + comm_cart); /* redistribute force component mesh */ - std::array meshes = {{rs_mesh_dip[0].data(), - rs_mesh_dip[1].data(), - rs_mesh_dip[2].data()}}; + std::array meshes = {{dp3m.rs_mesh_dip[0].data(), + dp3m.rs_mesh_dip[1].data(), + dp3m.rs_mesh_dip[2].data()}}; + dp3m.sm.spread_grid(meshes, comm_cart, dp3m.local_mesh.dim); /* Assign force component from mesh to particle */ - auto const d_rs = (d + dp3m.ks_pnum) % 3; Utils::integral_parameter( - dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(wavenumber), d_rs, + dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(two_pi_L_i), d_rs, particles); } } /* if (dp3m.sum_mu2 > 0) */ @@ -546,15 +590,15 @@ void DipolarP3M::calc_influence_function_force() { auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh}; dp3m.g_force = grid_influence_function<3>(dp3m.params, start, start + size, - get_system().box_geo->length_inv()); + get_system().box_geo->length()); } void DipolarP3M::calc_influence_function_energy() { auto const start = Utils::Vector3i{dp3m.fft.plan[3].start}; auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh}; - dp3m.g_energy = grid_influence_function<2>( - dp3m.params, start, start + size, get_system().box_geo->length_inv()); + dp3m.g_energy = grid_influence_function<2>(dp3m.params, start, start + size, + get_system().box_geo->length()); } class DipolarTuningAlgorithm : public TuningAlgorithm { @@ -705,68 +749,62 @@ void DipolarP3M::tune() { } /** Tuning dipolar-P3M */ -static auto dp3m_tune_aliasing_sums(Utils::Vector3i const &shift, int mesh, +static auto dp3m_tune_aliasing_sums(int nx, int ny, int nz, int mesh, double mesh_i, int cao, double alpha_L_i) { + using Utils::sinc; - auto constexpr m_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN); - auto constexpr m_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1); auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i); + auto alias1 = 0.; auto alias2 = 0.; - - Utils::Vector3i indices{}; - Utils::Vector3i nm{}; - Utils::Vector3d fnm{}; - for_each_3d( - m_start, m_stop, indices, - [&]() { - auto const norm_sq = nm.norm2(); - auto const ex = std::exp(-factor1 * norm_sq); - auto const U2 = std::pow(Utils::product(fnm), 2 * cao); - alias1 += Utils::sqr(ex) * norm_sq; - alias2 += U2 * ex * std::pow(shift * nm, 3) / norm_sq; - }, - [&](unsigned dim, int n) { - nm[dim] = shift[dim] + n * mesh; - fnm[dim] = math::sinc(nm[dim] * mesh_i); - }); - + for (int mx = -P3M_BRILLOUIN; mx <= P3M_BRILLOUIN; mx++) { + auto const nmx = nx + mx * mesh; + auto const fnmx = mesh_i * nmx; + for (int my = -P3M_BRILLOUIN; my <= P3M_BRILLOUIN; my++) { + auto const nmy = ny + my * mesh; + auto const fnmy = mesh_i * nmy; + for (int mz = -P3M_BRILLOUIN; mz <= P3M_BRILLOUIN; mz++) { + auto const nmz = nz + mz * mesh; + auto const fnmz = mesh_i * nmz; + + auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz); + auto const ex = std::exp(-factor1 * nm2); + + auto const U2 = pow(sinc(fnmx) * sinc(fnmy) * sinc(fnmz), 2. * cao); + + alias1 += Utils::sqr(ex) * nm2; + alias2 += U2 * ex * pow((nx * nmx + ny * nmy + nz * nmz), 3.) / nm2; + } + } + } return std::make_pair(alias1, alias2); } /** Calculate the k-space error of dipolar-P3M */ static double dp3m_k_space_error(double box_size, int mesh, int cao, int n_c_part, double sum_q2, double alpha_L) { - - auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao); - auto const mesh_i = 1. / static_cast(mesh); + double he_q = 0.; + auto const mesh_i = 1. / mesh; auto const alpha_L_i = 1. / alpha_L; - auto const m_stop = Utils::Vector3i::broadcast(mesh / 2); - auto const m_start = -m_stop; - auto indices = Utils::Vector3i{}; - auto values = Utils::Vector3d{}; - auto he_q = 0.; - - for_each_3d( - m_start, m_stop, indices, - [&]() { - if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) { - auto const n2 = indices.norm2(); - auto const cs = Utils::product(values); + + for (int nx = -mesh / 2; nx < mesh / 2; nx++) + for (int ny = -mesh / 2; ny < mesh / 2; ny++) + for (int nz = -mesh / 2; nz < mesh / 2; nz++) + if ((nx != 0) || (ny != 0) || (nz != 0)) { + auto const n2 = Utils::sqr(nx) + Utils::sqr(ny) + Utils::sqr(nz); + auto const cs = p3m_analytic_cotangent_sum(nx, mesh_i, cao) * + p3m_analytic_cotangent_sum(ny, mesh_i, cao) * + p3m_analytic_cotangent_sum(nz, mesh_i, cao); auto const [alias1, alias2] = - dp3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i); + dp3m_tune_aliasing_sums(nx, ny, nz, mesh, mesh_i, cao, alpha_L_i); auto const d = alias1 - Utils::sqr(alias2 / cs) / Utils::int_pow<3>(static_cast(n2)); /* at high precision, d can become negative due to extinction; also, don't take values that have no significant digits left*/ - if (d > 0. and std::fabs(d / alias1) > ROUND_ERROR_PREC) + if (d > 0 && (fabs(d / alias1) > ROUND_ERROR_PREC)) he_q += d; } - }, - [&values, &mesh_i, cotangent_sum](unsigned dim, int n) { - values[dim] = cotangent_sum(n, mesh_i); - }); return 8. * Utils::sqr(std::numbers::pi) / 3. * sum_q2 * sqrt(he_q / n_c_part) / Utils::int_pow<4>(box_size); @@ -847,7 +885,7 @@ void DipolarP3M::sanity_checks_boxl() const { auto const &system = get_system(); auto const &box_geo = *system.box_geo; auto const &local_geo = *system.local_geo; - for (auto i = 0u; i < 3u; i++) { + for (unsigned int i = 0u; i < 3u; i++) { /* check k-space cutoff */ if (dp3m.params.cao_cut[i] >= box_geo.length_half()[i]) { std::stringstream msg; diff --git a/src/core/p3m/common.cpp b/src/core/p3m/common.cpp index fb8023eba74..bc87e99a6a8 100644 --- a/src/core/p3m/common.cpp +++ b/src/core/p3m/common.cpp @@ -28,8 +28,52 @@ #include "LocalBox.hpp" #include +#include #include +#include +#include + +double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao) { + auto const c = + Utils::sqr(std::cos(std::numbers::pi * mesh_i * static_cast(n))); + + switch (cao) { + case 1: { + return 1.0; + } + case 2: { + return (1.0 + c * 2.0) / 3.0; + } + case 3: { + return (2.0 + c * (11.0 + c * 2.0)) / 15.0; + } + case 4: { + return (17.0 + c * (180.0 + c * (114.0 + c * 4.0))) / 315.0; + } + case 5: { + return (62.0 + c * (1072.0 + c * (1452.0 + c * (247.0 + c * 2.0)))) / + 2835.0; + } + case 6: { + return (1382.0 + + c * (35396.0 + + c * (83021.0 + c * (34096.0 + c * (2026.0 + c * 4.0))))) / + 155925.0; + } + case 7: { + return (21844.0 + + c * (776661.0 + + c * (2801040.0 + + c * (2123860.0 + + c * (349500.0 + c * (8166.0 + c * 4.0)))))) / + 6081075.0; + } + default: { + throw std::logic_error("Invalid value cao=" + std::to_string(cao)); + } + } +} void P3MLocalMesh::calc_local_ca_mesh(P3MParameters const ¶ms, LocalBox const &local_geo, double skin, diff --git a/src/core/p3m/common.hpp b/src/core/p3m/common.hpp index 6bc30828ee5..38a7553428a 100644 --- a/src/core/p3m/common.hpp +++ b/src/core/p3m/common.hpp @@ -38,7 +38,6 @@ #include -#include #include #include @@ -50,7 +49,6 @@ auto constexpr P3M_EPSILON_METALLIC = 0.0; #include "LocalBox.hpp" #include -#include #include #include @@ -214,7 +212,7 @@ struct P3MLocalMesh { */ void recalc_ld_pos(P3MParameters const ¶ms) { // spatial position of left down mesh point - for (auto i = 0u; i < 3u; i++) { + for (unsigned int i = 0; i < 3; i++) { ld_pos[i] = (ld_ind[i] + params.mesh_off[i]) * params.a[i]; } } @@ -228,6 +226,16 @@ struct P3MLocalMesh { double space_layer); }; +/** One of the aliasing sums used to compute k-space errors. + * Fortunately the one which is most important (because it converges + * most slowly, since it is not damped exponentially) can be + * calculated analytically. The result (which depends on the order of + * the spline interpolation) can be written as an even trigonometric + * polynomial. The results are tabulated here (the employed formula + * is eq. (7.66) in @cite hockney88a). + */ +double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao); + #endif /* P3M || DP3M */ namespace detail { @@ -242,7 +250,7 @@ std::array, 3> inline calc_meshift( Utils::Vector3i const &mesh_size, bool zero_out_midpoint = false) { std::array, 3> ret{}; - for (auto i = 0u; i < 3u; ++i) { + for (unsigned int i = 0; i < 3; i++) { ret[i] = std::vector(static_cast(mesh_size[i])); for (int j = 1; j <= mesh_size[i] / 2; j++) { diff --git a/src/core/p3m/fft.cpp b/src/core/p3m/fft.cpp index a14c9a2d00b..57b56fbaa6c 100644 --- a/src/core/p3m/fft.cpp +++ b/src/core/p3m/fft.cpp @@ -84,7 +84,7 @@ using Utils::permute_ifield; */ std::optional> find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2, - std::span node_list1, std::span node_list2, + std::span node_list1, std::span node_list2, std::span pos, std::span my_pos, int rank) { int i; /* communication group cell size on grid1 and grid2 */ diff --git a/src/core/p3m/fft.hpp b/src/core/p3m/fft.hpp index c87f11a14cb..58ea6c7e692 100644 --- a/src/core/p3m/fft.hpp +++ b/src/core/p3m/fft.hpp @@ -33,7 +33,12 @@ * redistributed. * * For simplicity at the moment I have implemented a full complex to - * complex FFT (even though a real to complex FFT would be sufficient). + * complex FFT (even though a real to complex FFT would be + * sufficient) + * + * \todo Combine the forward and backward structures. + * \todo The packing routines could be moved to utils.hpp when they are needed + * elsewhere. * * For more information about FFT usage, see \ref fft.cpp "fft.cpp". */ @@ -83,24 +88,22 @@ template struct fft_allocator { template using fft_vector = std::vector>; -struct fft_plan { - /** plan direction: forward or backward FFT (enum value from FFTW). */ +/** Structure for performing a 1D FFT. + * + * This includes the information about the redistribution of the 3D + * FFT *grid before the actual FFT. + */ +struct fft_forw_plan { + /** plan direction: 0 = Forward FFT, 1 = Backward FFT. */ int dir; - /** plan for the FFT. */ - fftw_plan our_fftw_plan; - /** packing function for send blocks. */ - void (*pack_function)(double const *const, double *const, int const *, - int const *, int const *, int); -}; - -/** @brief Plan for a forward 1D FFT of a flattened 3D array. */ -struct fft_forw_plan : public fft_plan { /** row direction of that FFT. */ int row_dir; /** permutations from normal coordinate system. */ int n_permute; /** number of 1D FFTs. */ int n_ffts; + /** plan for fft. */ + fftw_plan our_fftw_plan; /** size of local mesh before communication. */ int old_mesh[3]; @@ -114,6 +117,9 @@ struct fft_forw_plan : public fft_plan { /** group of nodes which have to communicate with each other. */ std::vector group; + /** packing function for send blocks. */ + void (*pack_function)(double const *const, double *const, int const *, + int const *, int const *, int); /** Send block specification. 6 integers for each node: start[3], size[3]. */ std::vector send_block; /** Send block communication sizes. */ @@ -126,8 +132,17 @@ struct fft_forw_plan : public fft_plan { int element; }; -/** @brief Plan for a backward 1D FFT of a flattened 3D array. */ -struct fft_back_plan : public fft_plan {}; +/** Additional information for backwards FFT. */ +struct fft_back_plan { + /** plan direction. (e.g. fftw macro) */ + int dir; + /** plan for fft. */ + fftw_plan our_fftw_plan; + + /** packing function for send blocks. */ + void (*pack_function)(double const *const, double *const, int const *, + int const *, int const *, int); +}; /** Information about the three one dimensional FFTs and how the nodes * have to communicate inbetween. diff --git a/src/core/p3m/for_each_3d.hpp b/src/core/p3m/for_each_3d.hpp deleted file mode 100644 index 9f44871bd54..00000000000 --- a/src/core/p3m/for_each_3d.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2024 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#pragma once - -#include - -namespace detail { - -constexpr inline void noop_projector(unsigned, int) {} - -template -concept IndexVectorConcept = requires(T vector) { - { vector[0] } -> std::convertible_to; -}; - -} // namespace detail - -/** - * @brief Repeat an operation on every element of a 3D grid. - * - * Intermediate values that depend on the iterated coordinates - * are calculated and stored once per iteration. This is useful - * when the operation is costly. - * - * @param start Initial values for the loop counters. - * @param stop Final values (one-past-the-end) for the loop counters. - * @param counters Loop counters. - * @param kernel Functor to execute. - * @param projector Projection of the current loop counter. - * @tparam Kernel Nullary function. - * @tparam Projector Binary function that takes a nesting depth and a loop - * counter as arguments and projects a value. - */ -template - requires std::invocable and std::invocable -void for_each_3d(detail::IndexVectorConcept auto &&start, - detail::IndexVectorConcept auto &&stop, - detail::IndexVectorConcept auto &&counters, Kernel &&kernel, - Projector &&projector = detail::noop_projector) { - auto &nx = counters[0u]; - auto &ny = counters[1u]; - auto &nz = counters[2u]; - for (nx = start[0u]; nx < stop[0u]; ++nx) { - projector(0u, nx); - for (ny = start[1u]; ny < stop[1u]; ++ny) { - projector(1u, ny); - for (nz = start[2u]; nz < stop[2u]; ++nz) { - projector(2u, nz); - kernel(); - } - } - } -} diff --git a/src/core/p3m/influence_function.hpp b/src/core/p3m/influence_function.hpp index 24853d43e25..9c322c23506 100644 --- a/src/core/p3m/influence_function.hpp +++ b/src/core/p3m/influence_function.hpp @@ -16,16 +16,15 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ - -#pragma once +#ifndef ESPRESSO_P3M_INFLUENCE_FUNCTION_HPP +#define ESPRESSO_P3M_INFLUENCE_FUNCTION_HPP #include "p3m/common.hpp" -#include "p3m/for_each_3d.hpp" -#include "p3m/math.hpp" #include #include #include +#include #include #include @@ -53,43 +52,44 @@ template double G_opt(int cao, double alpha, Utils::Vector3d const &k, Utils::Vector3d const &h) { + using namespace detail::FFT_indexing; + using Utils::int_pow; + using Utils::sinc; + + auto constexpr two_pi = 2. * std::numbers::pi; + auto constexpr two_pi_i = 1. / two_pi; + auto constexpr limit = 30.; auto const k2 = k.norm2(); - if (k2 == 0.) { - return 0.; + if (k2 == 0.0) { + return 0.0; } - auto constexpr limit = 30.; - auto constexpr m_start = Utils::Vector3i::broadcast(-m); - auto constexpr m_stop = Utils::Vector3i::broadcast(m + 1); - auto const exponent_prefactor = Utils::sqr(1. / (2. * alpha)); - auto const wavevector = (2. * std::numbers::pi) / h; - auto const wavevector_i = 1. / wavevector; - auto indices = Utils::Vector3i{}; - auto km = Utils::Vector3d{}; - auto fnm = Utils::Vector3d{}; - auto numerator = 0.; - auto denominator = 0.; - - for_each_3d( - m_start, m_stop, indices, - [&]() { - using namespace detail::FFT_indexing; - auto const U2 = std::pow(Utils::product(fnm), 2 * cao); + double numerator = 0.0; + double denominator = 0.0; + + for (int mx = -m; mx <= m; mx++) { + for (int my = -m; my <= m; my++) { + for (int mz = -m; mz <= m; mz++) { + auto const km = + k + two_pi * Utils::Vector3d{mx / h[RX], my / h[RY], mz / h[RZ]}; + auto const U2 = std::pow(sinc(km[RX] * h[RX] * two_pi_i) * + sinc(km[RY] * h[RY] * two_pi_i) * + sinc(km[RZ] * h[RZ] * two_pi_i), + 2 * cao); + auto const km2 = km.norm2(); - auto const exponent = exponent_prefactor * km2; + auto const exponent = Utils::sqr(1. / (2. * alpha)) * km2; if (exponent < limit) { auto const f3 = std::exp(-exponent) * (4. * std::numbers::pi / km2); - numerator += U2 * f3 * Utils::int_pow(k * km); + numerator += U2 * f3 * int_pow(k * km); } denominator += U2; - }, - [&](unsigned dim, int n) { - km[dim] = k[dim] + n * wavevector[dim]; - fnm[dim] = math::sinc(km[dim] * wavevector_i[dim]); - }); + } + } + } - return numerator / (Utils::int_pow(k2) * Utils::sqr(denominator)); + return numerator / (int_pow(k2) * Utils::sqr(denominator)); } /** @@ -104,18 +104,20 @@ double G_opt(int cao, double alpha, Utils::Vector3d const &k, * * @param params P3M parameters * @param n_start Lower left corner of the grid - * @param n_stop Upper right corner of the grid. - * @param inv_box_l Inverse box length + * @param n_end Upper right corner of the grid. + * @param box_l Box size * @return Values of G_opt at regular grid points. */ template std::vector grid_influence_function(const P3MParameters ¶ms, const Utils::Vector3i &n_start, - const Utils::Vector3i &n_stop, - const Utils::Vector3d &inv_box_l) { + const Utils::Vector3i &n_end, + const Utils::Vector3d &box_l) { + using namespace detail::FFT_indexing; auto const shifts = detail::calc_meshift(params.mesh); - auto const size = n_stop - n_start; + + auto const size = n_end - n_start; /* The influence function grid */ auto g = std::vector(Utils::product(size), 0.); @@ -126,24 +128,31 @@ std::vector grid_influence_function(const P3MParameters ¶ms, return g; } - auto const wavevector = (2. * std::numbers::pi) * inv_box_l; - auto const half_mesh = params.mesh / 2; - auto indices = Utils::Vector3i{}; - auto index = std::size_t(0u); - - for_each_3d(n_start, n_stop, indices, [&]() { - using namespace detail::FFT_indexing; - if ((indices[KX] % half_mesh[RX] != 0) or - (indices[KY] % half_mesh[RY] != 0) or - (indices[KZ] % half_mesh[RZ] != 0)) { - auto const k = - Utils::Vector3d{{shifts[RX][indices[KX]] * wavevector[RX], - shifts[RY][indices[KY]] * wavevector[RY], - shifts[RZ][indices[KZ]] * wavevector[RZ]}}; - g[index] = G_opt(params.cao, params.alpha, k, params.a); + auto const h = Utils::Vector3d{params.a}; + + Utils::Vector3i n{}; + for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) { + for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) { + for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) { + auto const ind = Utils::get_linear_index(n - n_start, size, + Utils::MemoryOrder::ROW_MAJOR); + if ((n[KX] % (params.mesh[RX] / 2) == 0) && + (n[KY] % (params.mesh[RY] / 2) == 0) && + (n[KZ] % (params.mesh[RZ] / 2) == 0)) { + g[ind] = 0.0; + } else { + auto const k = 2. * std::numbers::pi * + Utils::Vector3d{shifts[RX][n[KX]] / box_l[RX], + shifts[RY][n[KY]] / box_l[RY], + shifts[RZ][n[KZ]] / box_l[RZ]}; + + g[ind] = G_opt(params.cao, params.alpha, k, h); + } + } } - ++index; - }); + } return g; } + +#endif diff --git a/src/core/p3m/influence_function_dipolar.hpp b/src/core/p3m/influence_function_dipolar.hpp index d52a1a6d675..e327846d20e 100644 --- a/src/core/p3m/influence_function_dipolar.hpp +++ b/src/core/p3m/influence_function_dipolar.hpp @@ -18,19 +18,19 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ - -#pragma once +#ifndef ESPRESSO_CORE_P3M_INFLUENCE_FUNCTION_DIPOLAR_HPP +#define ESPRESSO_CORE_P3M_INFLUENCE_FUNCTION_DIPOLAR_HPP #include "config/config.hpp" #if defined(DP3M) #include "p3m/common.hpp" -#include "p3m/for_each_3d.hpp" -#include "p3m/math.hpp" #include +#include #include +#include #include #include @@ -54,41 +54,40 @@ template double G_opt_dipolar(P3MParameters const ¶ms, Utils::Vector3i const &shift, Utils::Vector3i const &d_op) { - + using Utils::int_pow; + using Utils::sinc; auto constexpr limit = P3M_BRILLOUIN; auto constexpr exp_limit = 30.; - auto constexpr m_start = Utils::Vector3i::broadcast(-limit); - auto constexpr m_stop = Utils::Vector3i::broadcast(limit + 1); - auto const cao = params.cao; - auto const mesh = params.mesh[0]; - auto const offset = - static_cast(shift) / static_cast(mesh); - auto const f2 = Utils::sqr(std::numbers::pi / params.alpha_L); - auto indices = Utils::Vector3i{}; - auto nm = Utils::Vector3i{}; - auto fnm = Utils::Vector3d{}; + auto const exponent = 2. * params.cao; + auto numerator = 0.; auto denominator = 0.; - for_each_3d( - m_start, m_stop, indices, - [&]() { - auto const norm_sq = nm.norm2(); - auto const sz = std::pow(Utils::product(fnm), 2 * cao); - auto const exp_term = f2 * norm_sq; + auto const f1 = 1. / static_cast(params.mesh[0]); + auto const f2 = Utils::sqr(std::numbers::pi / params.alpha_L); + + for (int mx = -limit; mx <= limit; mx++) { + auto const nmx = shift[0] + params.mesh[0] * mx; + auto const sx = std::pow(sinc(f1 * nmx), exponent); + for (int my = -limit; my <= limit; my++) { + auto const nmy = shift[1] + params.mesh[0] * my; + auto const sy = sx * std::pow(sinc(f1 * nmy), exponent); + for (int mz = -limit; mz <= limit; mz++) { + auto const nmz = shift[2] + params.mesh[0] * mz; + auto const sz = sy * std::pow(sinc(f1 * nmz), exponent); + auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz); + auto const exp_term = f2 * nm2; if (exp_term < exp_limit) { - auto const f3 = sz * std::exp(-exp_term) / norm_sq; - numerator += f3 * Utils::int_pow(d_op * nm); + auto const f3 = sz * std::exp(-exp_term) / nm2; + auto const n_nm = d_op[0] * nmx + d_op[1] * nmy + d_op[2] * nmz; + numerator += f3 * int_pow(n_nm); } denominator += sz; - }, - [&](unsigned dim, int n) { - nm[dim] = shift[dim] + n * mesh; - fnm[dim] = math::sinc(offset[dim] + n * mesh); - }); - - return numerator / (Utils::int_pow(static_cast(d_op.norm2())) * - Utils::int_pow<2>(denominator)); + } + } + } + return numerator / (int_pow(static_cast(d_op.norm2())) * + Utils::sqr(denominator)); } /** @@ -101,17 +100,17 @@ double G_opt_dipolar(P3MParameters const ¶ms, Utils::Vector3i const &shift, * * @param params DP3M parameters * @param n_start Lower left corner of the grid - * @param n_stop Upper right corner of the grid. - * @param inv_box_l Inverse box length + * @param n_end Upper right corner of the grid. + * @param box_l Box size * @return Values of the influence function at regular grid points. */ template std::vector grid_influence_function(P3MParameters const ¶ms, Utils::Vector3i const &n_start, - Utils::Vector3i const &n_stop, - Utils::Vector3d const &inv_box_l) { + Utils::Vector3i const &n_end, + Utils::Vector3d const &box_l) { - auto const size = n_stop - n_start; + auto const size = n_end - n_start; /* The influence function grid */ auto g = std::vector(Utils::product(size), 0.); @@ -122,56 +121,61 @@ std::vector grid_influence_function(P3MParameters const ¶ms, return g; } - auto prefactor = Utils::int_pow<3>(static_cast(params.mesh[0])) * 2. * - Utils::int_pow<2>(inv_box_l[0]); - - auto const offset = detail::calc_meshift(params.mesh, false)[0]; - auto const d_op = detail::calc_meshift(params.mesh, true)[0]; - auto const half_mesh = params.mesh[0] / 2; - auto indices = Utils::Vector3i{}; - auto shift_off = Utils::Vector3i{}; - auto d_op_off = Utils::Vector3i{}; - auto index = std::size_t(0u); - - for_each_3d( - n_start, n_stop, indices, - [&]() { - if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or - (indices[2] % half_mesh != 0))) { - g[index] = prefactor * G_opt_dipolar(params, shift_off, d_op_off); + double fak1 = Utils::int_pow<3>(static_cast(params.mesh[0])) * 2.0 / + Utils::sqr(box_l[0]); + + auto const shifts = detail::calc_meshift(params.mesh, false); + auto const d_ops = detail::calc_meshift(params.mesh, true); + + Utils::Vector3i n{}; + for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) { + for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) { + for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) { + auto const ind = Utils::get_linear_index(n - n_start, size, + Utils::MemoryOrder::ROW_MAJOR); + + if (((n[0] % (params.mesh[0] / 2) == 0) && + (n[1] % (params.mesh[0] / 2) == 0) && + (n[2] % (params.mesh[0] / 2) == 0))) { + g[ind] = 0.0; + } else { + auto const shift = Utils::Vector3i{shifts[0][n[0]], shifts[0][n[1]], + shifts[0][n[2]]}; + auto const d_op = + Utils::Vector3i{d_ops[0][n[0]], d_ops[0][n[1]], d_ops[0][n[2]]}; + auto const fak2 = G_opt_dipolar(params, shift, d_op); + g[ind] = fak1 * fak2; } - ++index; - }, - [&](unsigned dim, int n) { - d_op_off[dim] = d_op[n]; - shift_off[dim] = offset[n]; - }); - + } + } + } return g; } inline double G_opt_dipolar_self_energy(P3MParameters const ¶ms, Utils::Vector3i const &shift) { - + using Utils::sinc; auto constexpr limit = P3M_BRILLOUIN + 1; - auto constexpr m_start = Utils::Vector3i::broadcast(-limit); - auto constexpr m_stop = Utils::Vector3i::broadcast(limit + 1); - auto const cao = params.cao; - auto const mesh = params.mesh[0]; - auto const offset = - static_cast(shift) / static_cast(mesh); - auto indices = Utils::Vector3i{}; - auto fnm = Utils::Vector3d{}; - auto energy = 0.; - - for_each_3d( - m_start, m_stop, indices, - [&]() { energy += std::pow(Utils::product(fnm), 2 * cao); }, - [&](unsigned dim, int n) { - fnm[dim] = math::sinc(offset[dim] + n * mesh); - }); - - return energy; + auto const exponent = 2. * params.cao; + + auto u_sum = 0.0; + + auto const f1 = 1.0 / static_cast(params.mesh[0]); + + for (int mx = -limit; mx <= limit; mx++) { + auto const nmx = shift[0] + params.mesh[0] * mx; + auto const sx = std::pow(sinc(f1 * nmx), exponent); + for (int my = -limit; my <= limit; my++) { + auto const nmy = shift[1] + params.mesh[0] * my; + auto const sy = sx * std::pow(sinc(f1 * nmy), exponent); + for (int mz = -limit; mz <= limit; mz++) { + auto const nmz = shift[2] + params.mesh[0] * mz; + auto const sz = sy * std::pow(sinc(f1 * nmz), exponent); + u_sum += sz; + } + } + } + return u_sum; } /** @@ -179,39 +183,42 @@ inline double G_opt_dipolar_self_energy(P3MParameters const ¶ms, * * @param params DP3M parameters * @param n_start Lower left corner of the grid - * @param n_stop Upper right corner of the grid. + * @param n_end Upper right corner of the grid. * @param g Energies on the grid. * @return Total self-energy. */ inline double grid_influence_function_self_energy( P3MParameters const ¶ms, Utils::Vector3i const &n_start, - Utils::Vector3i const &n_stop, std::vector const &g) { - - auto const offset = detail::calc_meshift(params.mesh, false)[0]; - auto const d_op = detail::calc_meshift(params.mesh, true)[0]; - auto const half_mesh = params.mesh[0] / 2; - auto indices = Utils::Vector3i{}; - auto shift_off = Utils::Vector3i{}; - auto d_op_off = Utils::Vector3i{}; - auto index = std::size_t(0u); - auto energy = 0.; - - for_each_3d( - n_start, n_stop, indices, - [&]() { - if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or - (indices[2] % half_mesh != 0))) { - auto const U2 = G_opt_dipolar_self_energy(params, shift_off); - energy += g[index] * U2 * d_op_off.norm2(); + Utils::Vector3i const &n_end, std::vector const &g) { + auto const size = n_end - n_start; + + auto const shifts = detail::calc_meshift(params.mesh, false); + auto const d_ops = detail::calc_meshift(params.mesh, true); + + double energy = 0.0; + Utils::Vector3i n{}; + for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) { + for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) { + for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) { + if (((n[0] % (params.mesh[0] / 2) == 0) && + (n[1] % (params.mesh[0] / 2) == 0) && + (n[2] % (params.mesh[0] / 2) == 0))) { + energy += 0.0; + } else { + auto const ind = Utils::get_linear_index( + n - n_start, size, Utils::MemoryOrder::ROW_MAJOR); + auto const shift = Utils::Vector3i{shifts[0][n[0]], shifts[0][n[1]], + shifts[0][n[2]]}; + auto const d_op = + Utils::Vector3i{d_ops[0][n[0]], d_ops[0][n[1]], d_ops[0][n[2]]}; + auto const U2 = G_opt_dipolar_self_energy(params, shift); + energy += g[ind] * U2 * d_op.norm2(); } - ++index; - }, - [&](unsigned dim, int n) { - d_op_off[dim] = d_op[n]; - shift_off[dim] = offset[n]; - }); - + } + } + } return energy; } -#endif // defined(DP3M) +#endif +#endif diff --git a/src/core/p3m/math.hpp b/src/core/p3m/math.hpp deleted file mode 100644 index a9f2bdf7fd5..00000000000 --- a/src/core/p3m/math.hpp +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (C) 2010-2024 The ESPResSo project - * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 - * Max-Planck-Institute for Polymer Research, Theory Group - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#pragma once - -#include -#include - -#ifndef __CUDACC__ -#include -#endif -#include -#include -#include - -namespace math { - -/** @brief Return the absolute value of x. */ -inline DEVICE_QUALIFIER auto abs(double x) { return fabs(x); } - -/** @brief Return the absolute value of x. */ -inline DEVICE_QUALIFIER auto abs(float x) { return fabsf(x); } - -/** - * @brief Calculate the function @f$ \mathrm{sinc}(x) = \sin(\pi x)/(\pi x) @f$. - * - * (same convention as in @cite hockney88a). In order to avoid divisions - * by 0, arguments whose modulus is smaller than @f$ \epsilon = 0.1 @f$ - * are evaluated by an 8th order Taylor expansion. - * Note that the difference between sinc(x) and this expansion - * is smaller than 0.235e-12, if x is smaller than @f$ \epsilon @f$. - * (The next term in the expansion is the 10th order contribution, i.e. - * @f$ \pi^{10} x^{10}/39916800 \approx 0.2346 \cdot x^{12} @f$). - */ -template DEVICE_QUALIFIER auto sinc(T x) { - auto constexpr epsilon = T(0.1); -#if not defined(__CUDACC__) - using std::sin; -#endif - - auto const pix = std::numbers::pi_v * x; - - if (::math::abs(x) > epsilon) - return sin(pix) / pix; - - auto constexpr factorial = [](int n) { - int acc{1}, c{1}; - while (c < n) { - acc *= ++c; - } - return acc; - }; - - /* Coefficients of the Taylor expansion of sinc */ - auto constexpr c0 = T(+1) / T(factorial(1)); - auto constexpr c2 = T(-1) / T(factorial(3)); - auto constexpr c4 = T(+1) / T(factorial(5)); - auto constexpr c6 = T(-1) / T(factorial(7)); - auto constexpr c8 = T(+1) / T(factorial(9)); - - auto const pix2 = pix * pix; - return c0 + pix2 * (c2 + pix2 * (c4 + pix2 * (c6 + pix2 * c8))); -} - -/** - * @brief One of the aliasing sums used to compute k-space errors. - * Fortunately the one which is most important (because it converges - * most slowly, since it is not damped exponentially) can be calculated - * analytically. The result (which depends on the order of the spline - * interpolation) can be written as an even trigonometric polynomial. - * The results are tabulated here (the employed formula is eq. 7-66 - * p. 233 in @cite hockney88a). - */ -template -DEVICE_QUALIFIER auto analytic_cotangent_sum(int n, double mesh_i) { - static_assert(cao >= 1 and cao <= 7); -#if not defined(__CUDACC__) - using std::cos; -#endif - auto const theta = static_cast(n) * mesh_i * std::numbers::pi; - auto const c = Utils::sqr(cos(theta)); - - if constexpr (cao == 1) { - return 1.; - } - if constexpr (cao == 2) { - return (1. + c * 2.) / 3.; - } - if constexpr (cao == 3) { - return (2. + c * (11. + c * 2.)) / 15.; - } - if constexpr (cao == 4) { - return (17. + c * (180. + c * (114. + c * 4.))) / 315.; - } - if constexpr (cao == 5) { - return (62. + c * (1072. + c * (1452. + c * (247. + c * 2.)))) / 2835.; - } - if constexpr (cao == 6) { - return (1382. + - c * (35396. + c * (83021. + c * (34096. + c * (2026. + c * 4.))))) / - 155925.; - } - return (21844. + - c * (776661. + - c * (2801040. + - c * (2123860. + c * (349500. + c * (8166. + c * 4.)))))) / - 6081075.; -} - -inline auto get_analytic_cotangent_sum_kernel(int cao) { - decltype(&analytic_cotangent_sum<1>) ptr = nullptr; - if (cao == 1) { - ptr = &analytic_cotangent_sum<1>; - } else if (cao == 2) { - ptr = &analytic_cotangent_sum<2>; - } else if (cao == 3) { - ptr = &analytic_cotangent_sum<3>; - } else if (cao == 4) { - ptr = &analytic_cotangent_sum<4>; - } else if (cao == 5) { - ptr = &analytic_cotangent_sum<5>; - } else if (cao == 6) { - ptr = &analytic_cotangent_sum<6>; - } else if (cao == 7) { - ptr = &analytic_cotangent_sum<7>; - } - if (ptr == nullptr) { - throw std::logic_error("Invalid value cao=" + std::to_string(cao)); - } - return ptr; -} - -} // namespace math diff --git a/src/core/unit_tests/CMakeLists.txt b/src/core/unit_tests/CMakeLists.txt index 60fcadae764..d843da0a636 100644 --- a/src/core/unit_tests/CMakeLists.txt +++ b/src/core/unit_tests/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2010-2024 The ESPResSo project +# Copyright (C) 2010-2022 The ESPResSo project # Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 # Max-Planck-Institute for Polymer Research, Theory Group # @@ -94,8 +94,6 @@ if(ESPRESSO_BUILD_WITH_FFTW) espresso::core) unit_test(NAME fft_test SRC fft_test.cpp DEPENDS espresso::utils espresso::core) - unit_test(NAME math_test SRC math_test.cpp DEPENDS espresso::utils - espresso::core) endif() if(ESPRESSO_BUILD_WITH_CUDA) diff --git a/src/core/unit_tests/cuda_test.cu b/src/core/unit_tests/cuda_test.cu index 812638cfa96..f5057fe7a75 100644 --- a/src/core/unit_tests/cuda_test.cu +++ b/src/core/unit_tests/cuda_test.cu @@ -17,12 +17,10 @@ * along with this program. If not, see . */ -#define BOOST_TEST_MODULE "CUDA interface tests" +#define BOOST_TEST_MODULE cuda test #define BOOST_TEST_DYN_LINK #include -#include "config/config.hpp" - #include "cuda/init.hpp" #include "cuda/utils.cuh" #include "cuda/utils.hpp" @@ -35,22 +33,20 @@ #include #include -#include #include #include boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) { - bool has_compatible_device = false; int n_devices = 0; cudaGetDeviceCount(&n_devices); if (n_devices > 0) { cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); if (prop.major >= 3) { - has_compatible_device = true; + return true; } } - return has_compatible_device; + return false; } std::optional read_pending_cuda_errors() { @@ -83,43 +79,44 @@ void clear() { } // namespace Testing::non_sticky_cuda_error -#ifdef P3M -dim3 p3m_make_grid(unsigned int n_blocks); -#endif - static auto fixture = boost::unit_test::fixture(&setup, &teardown); BOOST_AUTO_TEST_SUITE(suite, *boost::unit_test::precondition(has_gpu)) BOOST_AUTO_TEST_CASE(gpu_fixture, *fixture) { - auto error1 = read_pending_cuda_errors(); - BOOST_REQUIRE(not error1.has_value()); - - // check we can raise and clear non-sticky CUDA errors - Testing::non_sticky_cuda_error::trigger(); - Testing::non_sticky_cuda_error::clear(); - auto error2 = read_pending_cuda_errors(); - BOOST_REQUIRE(not error2.has_value()); - - // check fixture can handle the default non-sticky CUDA error - Testing::non_sticky_cuda_error::trigger(); - auto ref_what3{"There is a pending CUDA error: \"invalid device ordinal\""}; - auto error3 = read_pending_cuda_errors(); - BOOST_REQUIRE(error3.has_value()); - BOOST_REQUIRE_EQUAL(error3.value(), ref_what3); - // sticky error should have been cleared - error3 = read_pending_cuda_errors(); - BOOST_REQUIRE(not error3.has_value()); - - // check fixture can handle a custom non-sticky CUDA error - cudaMallocHost(nullptr, std::size_t(0u)); - auto ref_what4{"There is a pending CUDA error: \"invalid argument\""}; - auto error4 = read_pending_cuda_errors(); - BOOST_REQUIRE(error4.has_value()); - BOOST_REQUIRE_EQUAL(error4.value(), ref_what4); - // sticky error should have been cleared - error4 = read_pending_cuda_errors(); - BOOST_REQUIRE(not error4.has_value()); + { + auto error = read_pending_cuda_errors(); + BOOST_REQUIRE(not error.has_value()); + } + { + // check we can raise and clear non-sticky CUDA errors + Testing::non_sticky_cuda_error::trigger(); + Testing::non_sticky_cuda_error::clear(); + auto error = read_pending_cuda_errors(); + BOOST_REQUIRE(not error.has_value()); + } + { + // check fixture can handle the default non-sticky CUDA error + Testing::non_sticky_cuda_error::trigger(); + auto ref_what{"There is a pending CUDA error: \"invalid device ordinal\""}; + auto error = read_pending_cuda_errors(); + BOOST_REQUIRE(error.has_value()); + BOOST_REQUIRE_EQUAL(error.value(), ref_what); + // sticky error should have been cleared + error = read_pending_cuda_errors(); + BOOST_REQUIRE(not error.has_value()); + } + { + // check fixture can handle a custom non-sticky CUDA error + cudaMallocHost(nullptr, std::size_t(0u)); + auto ref_what{"There is a pending CUDA error: \"invalid argument\""}; + auto error = read_pending_cuda_errors(); + BOOST_REQUIRE(error.has_value()); + BOOST_REQUIRE_EQUAL(error.value(), ref_what); + // sticky error should have been cleared + error = read_pending_cuda_errors(); + BOOST_REQUIRE(not error.has_value()); + } } static int fatal_error_counter = 0; @@ -128,153 +125,127 @@ static void increment_counter() noexcept { ++fatal_error_counter; } BOOST_AUTO_TEST_CASE(gpu_interface, *fixture) { fatal_error_counter = 0; auto local_error_counter = 0; - - try { - throw cuda_fatal_error("message"); - } catch (cuda_fatal_error &err) { - std::string const what = "message"; - BOOST_CHECK_EQUAL(err.what(), what); - BOOST_CHECK_EQUAL(err.get_terminate(), &errexit); - err.set_terminate(nullptr); - BOOST_CHECK_EQUAL(err.get_terminate(), nullptr); - err.set_terminate(increment_counter); - BOOST_CHECK_EQUAL(err.get_terminate(), &increment_counter); - BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); - } - ++local_error_counter; - BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); - - // ----------------------- - - auto error_caught = false; - auto const block = dim3{1, 2, 3}; - auto const grid = dim3{4, 5, 6}; - cuda_check_errors_exit(block, grid, "", "", 0u); // should not throw - try { - Testing::non_sticky_cuda_error::trigger(); - // should clear the CUDA error flag and throw a fatal error - cuda_check_errors_exit(block, grid, "cudaSetDevice()", "filename.cu", 4u); - } catch (cuda_fatal_error &err) { - error_caught = true; - err.set_terminate(increment_counter); - std::string const what = - "CUDA error: \"invalid device ordinal\" while calling " - "cudaSetDevice() with block: <1,2,3>, grid: <4,5,6> in filename.cu:4"; - BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); - BOOST_CHECK_EQUAL(err.what(), what); - BOOST_CHECK_EQUAL(cudaGetLastError(), cudaSuccess); - } - ++local_error_counter; - BOOST_REQUIRE(error_caught); - BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); - - // ----------------------- - - error_caught = false; - cuda_safe_mem_exit(cudaSuccess, "", 0u); // should not throw - try { - Testing::non_sticky_cuda_error::trigger(); - cuda_safe_mem_exit(cudaSuccess, "filename.cu", 4u); // should throw - } catch (cuda_fatal_error &err) { - error_caught = true; - err.set_terminate(increment_counter); - std::string const what = - "CUDA error: \"invalid device ordinal\" in filename.cu:4. Error " - "found during memory operation. Possibly however from a failed " - "operation before the memory operation"; - BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); - BOOST_CHECK_EQUAL(err.what(), what); + { + std::string const what = "message 1"; + try { + throw cuda_fatal_error(what); + } catch (cuda_fatal_error &err) { + BOOST_CHECK_EQUAL(err.what(), what); + BOOST_CHECK_EQUAL(err.get_terminate(), &errexit); + err.set_terminate(nullptr); + BOOST_CHECK_EQUAL(err.get_terminate(), nullptr); + err.set_terminate(increment_counter); + BOOST_CHECK_EQUAL(err.get_terminate(), &increment_counter); + BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); + } + ++local_error_counter; + BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); } - ++local_error_counter; - BOOST_REQUIRE(error_caught); - BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); - - // ----------------------- - - error_caught = false; - try { - cuda_safe_mem_exit(cudaErrorNotPermitted, "filename.cu", 4u); - } catch (cuda_fatal_error &err) { - error_caught = true; - err.set_terminate(increment_counter); - std::string const what = "CUDA error: \"operation not permitted\" during " - "memory operation in filename.cu:4"; - BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); - BOOST_CHECK_EQUAL(err.what(), what); + { + auto error_caught = false; + auto const block = dim3{1, 2, 3}; + auto const grid = dim3{4, 5, 6}; + // should not throw + cuda_check_errors_exit(block, grid, "", "", 0u); + try { + Testing::non_sticky_cuda_error::trigger(); + // should clear the CUDA error flag and throw a fatal error + cuda_check_errors_exit(block, grid, "cudaSetDevice()", "filename.cu", 4u); + } catch (cuda_fatal_error &err) { + error_caught = true; + err.set_terminate(increment_counter); + std::string const what = + "CUDA error: \"invalid device ordinal\" while calling " + "cudaSetDevice() with block: <1,2,3>, grid: <4,5,6> in filename.cu:4"; + BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); + BOOST_CHECK_EQUAL(err.what(), what); + BOOST_CHECK_EQUAL(cudaGetLastError(), cudaSuccess); + } + ++local_error_counter; + BOOST_REQUIRE(error_caught); + BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); } - ++local_error_counter; - BOOST_REQUIRE(error_caught); - BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); - - // ----------------------- - - error_caught = false; - try { - cuda_safe_mem_exit(cudaErrorInvalidValue, "function_name()", 4u); - } catch (cuda_fatal_error &err) { - error_caught = true; - err.set_terminate(increment_counter); - std::string const what = - "CUDA error: \"invalid argument\" during memory operation in " - "function_name():4. You may have tried to allocate zero memory"; - BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); - BOOST_CHECK_EQUAL(err.what(), what); + { + auto error_caught = false; + // should not throw + cuda_safe_mem_exit(cudaSuccess, "", 0u); + try { + Testing::non_sticky_cuda_error::trigger(); + // should throw + cuda_safe_mem_exit(cudaSuccess, "filename.cu", 4u); + } catch (cuda_fatal_error &err) { + error_caught = true; + err.set_terminate(increment_counter); + std::string const what = + "CUDA error: \"invalid device ordinal\" in filename.cu:4. Error " + "found during memory operation. Possibly however from a failed " + "operation before the memory operation"; + BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); + BOOST_CHECK_EQUAL(err.what(), what); + } + ++local_error_counter; + BOOST_REQUIRE(error_caught); + BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); } - ++local_error_counter; - BOOST_REQUIRE(error_caught); - BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); - - // ----------------------- - - error_caught = false; - BOOST_REQUIRE_EQUAL(stream[0], nullptr); - cuda_init(); // allocate - BOOST_REQUIRE_NE(stream[0], nullptr); - cuda_set_device(0); // reallocate, may or may not result in the same pointer - BOOST_REQUIRE_NE(stream[0], nullptr); - auto const old_stream = stream[0]; - try { - cuda_set_device(-1); // fail to reallocate, pointer remains the same - } catch (cuda_runtime_error_cuda const &err) { - error_caught = true; - std::string const what = "CUDA error: invalid device ordinal"; - BOOST_CHECK_EQUAL(err.what(), what); + { + auto error_caught = false; + try { + cuda_safe_mem_exit(cudaErrorNotPermitted, "filename.cu", 4u); + } catch (cuda_fatal_error &err) { + error_caught = true; + err.set_terminate(increment_counter); + std::string const what = "CUDA error: \"operation not permitted\" during " + "memory operation in filename.cu:4"; + BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); + BOOST_CHECK_EQUAL(err.what(), what); + } + ++local_error_counter; + BOOST_REQUIRE(error_caught); + BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); } - BOOST_REQUIRE(error_caught); - BOOST_REQUIRE_EQUAL(stream[0], old_stream); - - // ----------------------- - - BOOST_REQUIRE_GE(cuda_get_n_gpus(), 1); - char gpu_name_buffer[260] = {'\0'}; - cuda_get_gpu_name(0, gpu_name_buffer); - for (int i = 255; i < 260; ++i) { - BOOST_REQUIRE_EQUAL(gpu_name_buffer[i], '\0'); + { + auto error_caught = false; + try { + cuda_safe_mem_exit(cudaErrorInvalidValue, "function_name()", 4u); + } catch (cuda_fatal_error &err) { + error_caught = true; + err.set_terminate(increment_counter); + std::string const what = + "CUDA error: \"invalid argument\" during memory operation in " + "function_name():4. You may have tried to allocate zero memory"; + BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter); + BOOST_CHECK_EQUAL(err.what(), what); + } + ++local_error_counter; + BOOST_REQUIRE(error_caught); + BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter); } -} - -#ifdef P3M - -BOOST_AUTO_TEST_CASE(p3m_reshape_grid_test, *fixture) { - auto constexpr optimal_size = 65536u; - - for (auto cao = 1u; cao <= 3u; ++cao) { - auto const n_blocks = cao * optimal_size; - auto const grid = p3m_make_grid(n_blocks); - BOOST_CHECK_EQUAL(grid.x, optimal_size); - BOOST_CHECK_EQUAL(grid.y, cao); - BOOST_CHECK_EQUAL(grid.z, 1u); + { + BOOST_REQUIRE_EQUAL(stream[0], nullptr); + auto error_caught = false; + cuda_init(); // allocate + BOOST_REQUIRE_NE(stream[0], nullptr); + cuda_set_device(0); // reallocate, may or may not result in the same pointer + BOOST_REQUIRE_NE(stream[0], nullptr); + auto const old_stream = stream[0]; + try { + cuda_set_device(-1); // fail to reallocate, pointer remains the same + } catch (cuda_runtime_error_cuda const &err) { + error_caught = true; + std::string const what = "CUDA error: invalid device ordinal"; + BOOST_CHECK_EQUAL(err.what(), what); + } + BOOST_REQUIRE(error_caught); + BOOST_REQUIRE_EQUAL(stream[0], old_stream); } - - for (auto mul : {2u, 3u, 6u, 12u}) { - auto const n_blocks = mul * optimal_size + 1u; - auto const grid = p3m_make_grid(n_blocks); - BOOST_CHECK_EQUAL(grid.x, n_blocks / (mul + 1u) + ((mul == 2u) ? 0u : 1u)); - BOOST_CHECK_EQUAL(grid.y, (mul + 1u)); - BOOST_CHECK_EQUAL(grid.z, 1u); + { + BOOST_REQUIRE_GE(cuda_get_n_gpus(), 1); + char gpu_name_buffer[260] = {'\0'}; + cuda_get_gpu_name(0, gpu_name_buffer); + for (int i = 255; i < 260; ++i) { + BOOST_REQUIRE_EQUAL(gpu_name_buffer[i], '\0'); + } } } -#endif - BOOST_AUTO_TEST_SUITE_END() diff --git a/src/core/unit_tests/fft_test.cpp b/src/core/unit_tests/fft_test.cpp index bb27335279b..1506e01f20d 100644 --- a/src/core/unit_tests/fft_test.cpp +++ b/src/core/unit_tests/fft_test.cpp @@ -17,17 +17,11 @@ * along with this program. If not, see . */ -#define BOOST_TEST_MODULE "FFT utility functions" +#define BOOST_TEST_MODULE fft test #define BOOST_TEST_DYN_LINK - -#include "config/config.hpp" - -#if defined(P3M) || defined(DP3M) - #include #include "p3m/fft.hpp" -#include "p3m/for_each_3d.hpp" #include @@ -39,9 +33,10 @@ #include #include +#if defined(P3M) || defined(DP3M) std::optional> find_comm_groups(Utils::Vector3i const &, Utils::Vector3i const &, - std::span, + std::span, std::span, std::span, std::span, int); @@ -68,50 +63,11 @@ BOOST_AUTO_TEST_CASE(fft_find_comm_groups_mismatch) { } BOOST_AUTO_TEST_CASE(fft_exceptions) { - auto constexpr size_max = std::numeric_limits::max(); - auto constexpr bad_size = size_max / sizeof(int) + 1ul; fft_allocator allocator{}; BOOST_CHECK_EQUAL(allocator.allocate(0ul), nullptr); - BOOST_CHECK_THROW(allocator.allocate(bad_size), std::bad_array_new_length); -} - -BOOST_AUTO_TEST_CASE(for_each_3d_test) { - auto const m_start = Utils::Vector3i{{0, -1, 3}}; - auto const m_stop = Utils::Vector3i{{2, 2, 5}}; - auto ref_loop_counters = m_start; - auto indices = Utils::Vector3i{}; - - auto const kernel = [&]() { - BOOST_REQUIRE_EQUAL(indices, ref_loop_counters); - if (++ref_loop_counters[2u] == m_stop[2u]) { - ref_loop_counters[2u] = m_start[2u]; - if (++ref_loop_counters[1u] == m_stop[1u]) { - ref_loop_counters[1u] = m_start[1u]; - if (++ref_loop_counters[0u] == m_stop[0u]) { - ref_loop_counters[0u] = m_start[0u]; - } - } - } - }; - - { - for_each_3d(m_start, m_stop, indices, kernel, [&](unsigned dim, int n) { - BOOST_REQUIRE_GE(dim, 0); - BOOST_REQUIRE_LE(dim, 2); - BOOST_REQUIRE_EQUAL(n, ref_loop_counters[dim]); - }); - - BOOST_REQUIRE_EQUAL(indices, m_stop); - BOOST_REQUIRE_EQUAL(ref_loop_counters, m_start); - } - { - for_each_3d(m_start, m_stop, indices, kernel); - - BOOST_REQUIRE_EQUAL(indices, m_stop); - BOOST_REQUIRE_EQUAL(ref_loop_counters, m_start); - } + BOOST_CHECK_THROW( + allocator.allocate(std::numeric_limits::max() / sizeof(int) + + 1ul), + std::bad_array_new_length); } - -#else // defined(P3M) || defined(DP3M) -int main(int argc, char **argv) {} #endif // defined(P3M) || defined(DP3M) diff --git a/src/core/unit_tests/math_test.cpp b/src/core/unit_tests/math_test.cpp deleted file mode 100644 index 46a638800f1..00000000000 --- a/src/core/unit_tests/math_test.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (C) 2017-2024 The ESPResSo project - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#define BOOST_TEST_MODULE "Special mathematical functions" -#define BOOST_TEST_DYN_LINK -#include - -#include "p3m/math.hpp" - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include - -/** @brief Compute the n-th term in the Taylor series of @c tan(x). */ -static auto taylor_series_tangent(int n) { - auto const two_power_2n = std::pow(2., 2 * n); - auto const b2n = boost::math::bernoulli_b2n(n); - auto const f2n = boost::math::factorial(2 * n); - return std::pow(-1., n - 1) * two_power_2n * (two_power_2n - 1.) * b2n / f2n; -} - -/** @brief Compute the n-th term in the Taylor series of @c sinc(x). */ -static auto taylor_series_sinc(int n) { - return std::pow(-1., n) / std::tgamma(2 * n + 2); -} - -BOOST_AUTO_TEST_CASE(abs_test) { - static_assert(std::is_same_v); - static_assert(std::is_same_v); - - BOOST_CHECK_EQUAL(math::abs(+3.1415), std::abs(+3.1415)); - BOOST_CHECK_EQUAL(math::abs(-3.1415), std::abs(-3.1415)); - BOOST_CHECK_EQUAL(math::abs(+3.1415f), std::abs(+3.1415f)); - BOOST_CHECK_EQUAL(math::abs(-3.1415f), std::abs(-3.1415f)); -} - -BOOST_AUTO_TEST_CASE(sinc_test) { - // edge case - BOOST_CHECK_EQUAL(math::sinc(0.0), 1.0); - - // check against standard math functions - auto x = 0.001; - while (x <= 0.11) { - auto const approx = math::sinc(x); - auto const pi_x = std::numbers::pi * x; - auto const exact = std::sin(pi_x) / (pi_x); - BOOST_CHECK_SMALL(approx - exact, 1e-13); - x += 0.01; - } - - // check Taylor expansion - auto const series = std::views::iota(0, 5) | std::views::reverse | - std::views::transform(taylor_series_sinc); - for (auto const x : {1e-6, 1e-5, 1e-4, 1e-3, 1e-2}) { - auto const pix2 = Utils::sqr(std::numbers::pi * x); - auto const ref = std::accumulate( - series.begin(), series.end(), 0., - [pix2](auto const &acc, auto const &c) { return acc * pix2 + c; }); - BOOST_CHECK_SMALL(math::sinc(x) - ref, 1e-13); - } -} - -BOOST_AUTO_TEST_CASE(analytic_cotangent_sum_test) { - auto constexpr tol = 8. * 100. * std::numeric_limits::epsilon(); - auto const kernel = [](int n, double mesh_i, int cao) { - return math::get_analytic_cotangent_sum_kernel(cao)(n, mesh_i); - }; - - // edge case: at theta = 0, aliasing sums are unity - for (auto const cao : std::ranges::iota_view{1, 8}) { - BOOST_CHECK_CLOSE(kernel(0, 0., cao), 1., tol); - } - // edge case: at theta = pi / 2, aliasing sums are equal to the cao-th term - // in the tangent Taylor series - for (auto const cao : std::ranges::iota_view{1, 8}) { - BOOST_CHECK_CLOSE(kernel(1, 0.5, cao), taylor_series_tangent(cao), tol); - } - - // check assertion - for (auto const invalid_cao : {-1, 0, 8}) { - BOOST_CHECK_THROW(kernel(1, 0., invalid_cao), std::logic_error); - } -} diff --git a/src/core/unit_tests/p3m_test.cpp b/src/core/unit_tests/p3m_test.cpp index 3f091d565ce..5fd3a83915b 100644 --- a/src/core/unit_tests/p3m_test.cpp +++ b/src/core/unit_tests/p3m_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 The ESPResSo project + * Copyright (C) 2010-2022 The ESPResSo project * * This file is part of ESPResSo. * @@ -17,7 +17,7 @@ * along with this program. If not, see . */ -#define BOOST_TEST_MODULE "P3M utility functions" +#define BOOST_TEST_MODULE p3m test #define BOOST_TEST_DYN_LINK #include @@ -27,6 +27,8 @@ #include #include +#include +#include #include BOOST_AUTO_TEST_CASE(calc_meshift_false) { @@ -58,3 +60,27 @@ BOOST_AUTO_TEST_CASE(calc_meshift_true) { } } } + +#if defined(P3M) || defined(DP3M) +BOOST_AUTO_TEST_CASE(analytic_cotangent_sum) { + auto constexpr kernel = p3m_analytic_cotangent_sum; + auto constexpr tol = 8. * 100. * std::numeric_limits::epsilon(); + + // check only trivial cases + for (auto const cao : {1, 2, 3, 4, 5, 6, 7}) { + BOOST_CHECK_CLOSE(kernel(0, 0., cao), 1., tol); + } + BOOST_CHECK_CLOSE(kernel(1, 0.5, 1), 1., tol); + BOOST_CHECK_CLOSE(kernel(1, 0.5, 2), 1. / 3., tol); + BOOST_CHECK_CLOSE(kernel(1, 0.5, 3), 2. / 15., tol); + BOOST_CHECK_CLOSE(kernel(1, 0.5, 4), 17. / 315., tol); + BOOST_CHECK_CLOSE(kernel(1, 0.5, 5), 62. / 2835., tol); + BOOST_CHECK_CLOSE(kernel(1, 0.5, 6), 1382. / 155925., tol); + BOOST_CHECK_CLOSE(kernel(1, 0.5, 7), 21844. / 6081075., tol); + + // check assertion + for (auto const invalid_cao : {-1, 0, 8}) { + BOOST_CHECK_THROW(kernel(1, 0., invalid_cao), std::logic_error); + } +} +#endif // defined(P3M) || defined(DP3M) diff --git a/src/shapes/include/shapes/Ellipsoid.hpp b/src/shapes/include/shapes/Ellipsoid.hpp index e65149fb93c..39fc53bb293 100644 --- a/src/shapes/include/shapes/Ellipsoid.hpp +++ b/src/shapes/include/shapes/Ellipsoid.hpp @@ -23,7 +23,7 @@ #define SHAPES_ELLIPSOID_HPP #include "Shape.hpp" - +#include #include namespace Shapes { diff --git a/src/shapes/src/HollowConicalFrustum.cpp b/src/shapes/src/HollowConicalFrustum.cpp index 51ceb807dc2..c605fc20aa3 100644 --- a/src/shapes/src/HollowConicalFrustum.cpp +++ b/src/shapes/src/HollowConicalFrustum.cpp @@ -20,10 +20,9 @@ #include #include +#include #include -#include - namespace Shapes { void HollowConicalFrustum::calculate_dist(const Utils::Vector3d &pos, double &dist, @@ -71,7 +70,7 @@ void HollowConicalFrustum::calculate_dist(const Utils::Vector3d &pos, */ auto endpoint_angle = pos_phi; - if (std::fabs(pos_phi) < m_central_angle / 2.) { + if (Utils::abs(pos_phi) < m_central_angle / 2.) { // Cannot use Utils::sgn because of pos_phi==0 corner case endpoint_angle = pos_phi > 0. ? m_central_angle / 2. : -m_central_angle / 2.; @@ -94,7 +93,7 @@ void HollowConicalFrustum::calculate_dist(const Utils::Vector3d &pos, /* It can be that the projection onto the (infinite) line is outside the * frustum. In that case, the closest point is actually one of the endpoints. */ - if (std::fabs(pos_closest_hcf_frame[2]) > m_length / 2.) { + if (Utils::abs(pos_closest_hcf_frame[2]) > m_length / 2.) { pos_closest_hcf_frame = pos_closest_hcf_frame[2] > 0. ? r1_endpoint : r2_endpoint; } diff --git a/src/utils/include/utils/Vector.hpp b/src/utils/include/utils/Vector.hpp index f1361fa9d6e..c4932207f26 100644 --- a/src/utils/include/utils/Vector.hpp +++ b/src/utils/include/utils/Vector.hpp @@ -39,7 +39,6 @@ #include #include #include -#include #include #include @@ -80,21 +79,11 @@ template class Vector : public Array { public: template - explicit constexpr Vector(Range const &rng) - : Vector(std::begin(rng), std::end(rng)) {} + explicit Vector(Range const &rng) : Vector(std::begin(rng), std::end(rng)) {} explicit constexpr Vector(T const (&v)[N]) : Base() { copy_init(std::begin(v), std::end(v)); } - explicit constexpr Vector(std::span span) : Base() { - if (span.size() != N) { - throw std::length_error( - "Construction of Vector from Container of wrong length."); - } - - copy_init(span.begin(), span.end()); - } - constexpr Vector(std::initializer_list v) : Base() { if (N != v.size()) { throw std::length_error( @@ -114,13 +103,14 @@ template class Vector : public Array { } } - /** @brief Create a vector that has all entries set to the same value. */ - DEVICE_QUALIFIER static constexpr Vector - broadcast(typename Base::value_type const &value) { - Vector ret{}; - for (std::size_t i = 0u; i != N; ++i) { - ret[i] = value; - } + /** + * @brief Create a vector that has all entries set to + * one value. + */ + static Vector broadcast(T const &s) { + Vector ret; + std::fill(ret.begin(), ret.end(), s); + return ret; } @@ -128,17 +118,11 @@ template class Vector : public Array { operator std::vector() const { return as_vector(); } - constexpr std::span as_span() const { - return std::span(const_cast(begin()), size()); - } - - constexpr operator std::span() const { return as_span(); } - template explicit operator Vector() const { Vector ret; std::transform(begin(), end(), ret.begin(), - [](auto const &e) { return static_cast(e); }); + [](auto e) { return static_cast(e); }); return ret; } @@ -156,7 +140,7 @@ template class Vector : public Array { Vector &normalize() { auto const l = norm(); if (l > T(0)) { - for (std::size_t i = 0u; i < N; ++i) + for (std::size_t i = 0; i < N; i++) this->operator[](i) /= l; } @@ -263,7 +247,8 @@ template Vector operator-(Vector const &a) { Vector ret; - std::transform(std::begin(a), std::end(a), std::begin(ret), std::negate()); + std::transform(std::begin(a), std::end(a), std::begin(ret), + [](T const &v) { return -v; }); return ret; } @@ -315,15 +300,6 @@ Vector operator/(Vector const &a, T const &b) { return ret; } -template -Vector operator/(T const &a, Vector const &b) { - Vector ret; - - std::transform(std::begin(b), std::end(b), ret.begin(), - [a](T const &val) { return a / val; }); - return ret; -} - template Vector &operator/=(Vector &a, T const &b) { std::transform(std::begin(a), std::end(a), std::begin(a), @@ -391,7 +367,7 @@ auto hadamard_product(Vector const &a, Vector const &b) { Vector ret; std::transform(a.cbegin(), a.cend(), b.cbegin(), ret.begin(), - [](auto const &ai, auto const &bi) { return ai * bi; }); + [](auto ai, auto bi) { return ai * bi; }); return ret; } @@ -434,7 +410,7 @@ auto hadamard_division(Vector const &a, Vector const &b) { Vector ret; std::transform(a.cbegin(), a.cend(), b.cbegin(), ret.begin(), - [](auto const &ai, auto const &bi) { return ai / bi; }); + [](auto ai, auto bi) { return ai / bi; }); return ret; } @@ -472,11 +448,11 @@ auto hadamard_division(T const &a, U const &b) { } template Vector unit_vector(unsigned int i) { - if (i == 0u) + if (i == 0) return {T{1}, T{0}, T{0}}; - if (i == 1u) + if (i == 1) return {T{0}, T{1}, T{0}}; - if (i == 2u) + if (i == 2) return {T{0}, T{0}, T{1}}; throw std::domain_error("coordinate out of range"); } diff --git a/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp b/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp index 6893f04d58e..f3a055556bf 100644 --- a/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp +++ b/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp @@ -16,14 +16,13 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ +#ifndef ESPRESSO_CYLINDER_TRANSFORMATION_PARAMETERS_HPP +#define ESPRESSO_CYLINDER_TRANSFORMATION_PARAMETERS_HPP -#pragma once - -#include -#include #include #include +#include #include namespace Utils { @@ -64,28 +63,30 @@ class CylindricalTransformationParameters { private: void validate() const { - auto constexpr eps = 10. * std::numeric_limits::epsilon(); - if (std::fabs(m_orientation * m_axis) > eps) { + auto constexpr eps = 10 * std::numeric_limits::epsilon(); + if (Utils::abs(m_orientation * m_axis) > eps) { throw std::runtime_error( "CylindricalTransformationParameters: Axis and orientation must be " "orthogonal. Scalar product is " + std::to_string(m_orientation * m_axis)); } - if (std::fabs(m_axis.norm() - 1.) > eps) { + if (Utils::abs(m_axis.norm() - 1) > eps) { throw std::runtime_error("CylindricalTransformationParameters: Axis must " "be normalized. Norm is " + std::to_string(m_axis.norm())); } - if (std::fabs(m_orientation.norm() - 1.) > eps) { + if (Utils::abs(m_orientation.norm() - 1) > eps) { throw std::runtime_error("CylindricalTransformationParameters: " "orientation must be normalized. Norm is " + std::to_string(m_orientation.norm())); } } - Utils::Vector3d const m_center{}; - Utils::Vector3d const m_axis{0., 0., 1.}; - Utils::Vector3d const m_orientation{1., 0., 0.}; + const Utils::Vector3d m_center{}; + const Utils::Vector3d m_axis{0, 0, 1}; + const Utils::Vector3d m_orientation{1, 0, 0}; }; } // namespace Utils + +#endif // ESPRESSO_CYLINDER_TRANSFORMATION_PARAMETERS_HPP diff --git a/src/utils/include/utils/math/sqr.hpp b/src/utils/include/utils/math/sqr.hpp index a5ac0f25969..dae6ee6d8af 100644 --- a/src/utils/include/utils/math/sqr.hpp +++ b/src/utils/include/utils/math/sqr.hpp @@ -1,7 +1,5 @@ /* * Copyright (C) 2010-2022 The ESPResSo project - * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 - * Max-Planck-Institute for Polymer Research, Theory Group * * This file is part of ESPResSo. * diff --git a/src/utils/include/utils/quaternion.hpp b/src/utils/include/utils/quaternion.hpp index d74b2161ec5..5a8c03b3251 100644 --- a/src/utils/include/utils/quaternion.hpp +++ b/src/utils/include/utils/quaternion.hpp @@ -42,7 +42,6 @@ #include "utils/Array.hpp" #include "utils/Vector.hpp" #include "utils/matrix.hpp" -#include "utils/serialization/array.hpp" #include #include diff --git a/src/utils/tests/CMakeLists.txt b/src/utils/tests/CMakeLists.txt index 89755749fab..2dd87d9f775 100644 --- a/src/utils/tests/CMakeLists.txt +++ b/src/utils/tests/CMakeLists.txt @@ -21,6 +21,7 @@ include(unit_test) +unit_test(NAME abs_test SRC abs_test.cpp DEPENDS espresso::utils) unit_test(NAME Vector_test SRC Vector_test.cpp DEPENDS espresso::utils) unit_test(NAME Factory_test SRC Factory_test.cpp DEPENDS espresso::utils) unit_test(NAME NumeratedContainer_test SRC NumeratedContainer_test.cpp DEPENDS @@ -33,6 +34,7 @@ unit_test(NAME accumulator SRC accumulator.cpp DEPENDS espresso::utils unit_test(NAME int_pow SRC int_pow_test.cpp DEPENDS espresso::utils) unit_test(NAME sgn SRC sgn_test.cpp DEPENDS espresso::utils) unit_test(NAME AS_erfc_part SRC AS_erfc_part_test.cpp DEPENDS espresso::utils) +unit_test(NAME sinc SRC sinc_test.cpp DEPENDS espresso::utils) unit_test(NAME permute_ifield_test SRC permute_ifield_test.cpp DEPENDS espresso::utils) unit_test(NAME vec_rotate SRC vec_rotate_test.cpp DEPENDS espresso::utils) diff --git a/src/utils/tests/Vector_test.cpp b/src/utils/tests/Vector_test.cpp index b32fa4e9095..716c5276c01 100644 --- a/src/utils/tests/Vector_test.cpp +++ b/src/utils/tests/Vector_test.cpp @@ -43,8 +43,7 @@ using Utils::Vector; /* Number of nontrivial Baxter permutations of length 2n-1. (A001185) */ -#define TEST_NUMBERS \ - { 0, 1, 1, 7, 21, 112, 456, 2603, 13203 } +#define TEST_NUMBERS {0, 1, 1, 7, 21, 112, 456, 2603, 13203} constexpr int test_numbers[] = TEST_NUMBERS; constexpr std::size_t n_test_numbers = sizeof(test_numbers) / sizeof(int); @@ -63,17 +62,6 @@ BOOST_AUTO_TEST_CASE(initializer_list_constructor) { BOOST_CHECK_THROW(Utils::Vector2d({1., 2., 3.}), std::length_error); } -BOOST_AUTO_TEST_CASE(span_constructor) { - constexpr static std::array values(TEST_NUMBERS); - constexpr auto view = std::span(values); - Vector v(view); - - BOOST_CHECK(std::equal(v.begin(), v.end(), test_numbers)); - - BOOST_CHECK_THROW(Utils::Vector3i{view}, std::length_error); - BOOST_CHECK_THROW(Utils::Vector3i{view.subspan(0u, 2u)}, std::length_error); -} - BOOST_AUTO_TEST_CASE(iterator_constructor) { Vector v(std::begin(test_numbers), std::end(test_numbers)); @@ -117,8 +105,8 @@ BOOST_AUTO_TEST_CASE(range_constructor_test) { } BOOST_AUTO_TEST_CASE(unit_vector_test) { - BOOST_CHECK((Utils::unit_vector(2u) == Utils::Vector3i{0, 0, 1})); - BOOST_CHECK_THROW(Utils::unit_vector(3u), std::domain_error); + BOOST_CHECK((Utils::unit_vector(2) == Utils::Vector3i{0, 0, 1})); + BOOST_CHECK_THROW(Utils::unit_vector(3), std::domain_error); } BOOST_AUTO_TEST_CASE(test_norm2) { @@ -199,13 +187,6 @@ BOOST_AUTO_TEST_CASE(algebraic_operators) { BOOST_CHECK(v4 == (v3 /= 2)); } - { - Utils::Vector3i v3{2, 12, 91}; - Utils::Vector3i v4{180, 30, 3}; - auto v5 = 360 / v3; - BOOST_CHECK(v5 == v4); - } - BOOST_CHECK((sqrt(Utils::Vector3d{1., 2., 3.}) == Utils::Vector3d{sqrt(1.), sqrt(2.), sqrt(3.)})); diff --git a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp index 5c1e4962691..5f618e22985 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp +++ b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp @@ -585,9 +585,9 @@ class LBWalberlaImpl : public LBWalberlaBase { auto const omega_odd = odd_mode_relaxation_rate(omega); auto const blocks = get_lattice().get_blocks(); m_kT = FloatType_c(kT); - auto obj = CollisionModelThermalized(m_last_applied_force_field_id, - m_pdf_field_id, m_kT, omega, omega, - omega_odd, omega, seed, uint32_t{0u}); + auto obj = CollisionModelThermalized( + m_last_applied_force_field_id, m_pdf_field_id, m_kT, omega, omega, + omega_odd, omega, seed, uint32_t{0u}); m_collision_model = std::make_shared(std::move(obj)); m_run_collide_sweep = CollideSweepVisitor(blocks); } diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.h index fb144478e13..4535e2fc718 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.h @@ -56,7 +56,7 @@ class CollideSweepDoublePrecisionLeesEdwards { BlockDataID pdfsID_, double grid_size, double omega_shear, double v_s) : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size), - omega_shear_(omega_shear), v_s_(v_s){}; + omega_shear_(omega_shear), v_s_(v_s) {}; void run(IBlock *block); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.h index dfd7bcdde63..2e2e5bd1bd3 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.h @@ -57,7 +57,7 @@ class CollideSweepDoublePrecisionLeesEdwardsAVX { double grid_size, double omega_shear, double v_s) : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size), - omega_shear_(omega_shear), v_s_(v_s){}; + omega_shear_(omega_shear), v_s_(v_s) {}; void run(IBlock *block); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.h index 276592774e1..00d995d043a 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.h @@ -59,7 +59,7 @@ class CollideSweepDoublePrecisionLeesEdwardsCUDA { double grid_size, double omega_shear, double v_s) : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size), - omega_shear_(omega_shear), v_s_(v_s){}; + omega_shear_(omega_shear), v_s_(v_s) {}; void run(IBlock *block, gpuStream_t stream = nullptr); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.h index 192b2fcf077..08e9d8c04ff 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.h @@ -60,7 +60,7 @@ class CollideSweepDoublePrecisionThermalized { : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk), omega_even_(omega_even), omega_odd_(omega_odd), omega_shear_(omega_shear), seed_(seed), time_step_(time_step), - configured_(false){}; + configured_(false) {}; void run(IBlock *block); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.h index 00c0575773c..505cb1a7479 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.h @@ -61,7 +61,7 @@ class CollideSweepDoublePrecisionThermalizedAVX { : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk), omega_even_(omega_even), omega_odd_(omega_odd), omega_shear_(omega_shear), seed_(seed), time_step_(time_step), - configured_(false){}; + configured_(false) {}; void run(IBlock *block); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.h index bc7ac5f69e7..4683db87f80 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.h @@ -61,7 +61,7 @@ class CollideSweepDoublePrecisionThermalizedCUDA { : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk), omega_even_(omega_even), omega_odd_(omega_odd), omega_shear_(omega_shear), seed_(seed), time_step_(time_step), - configured_(false){}; + configured_(false) {}; void run(IBlock *block, gpuStream_t stream = nullptr); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.h index 58c30c1bf4e..3d6cbc09ef8 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.h @@ -56,7 +56,7 @@ class CollideSweepSinglePrecisionLeesEdwards { BlockDataID pdfsID_, float grid_size, float omega_shear, float v_s) : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size), - omega_shear_(omega_shear), v_s_(v_s){}; + omega_shear_(omega_shear), v_s_(v_s) {}; void run(IBlock *block); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.h index 2819ac83a73..39703996832 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.h @@ -57,7 +57,7 @@ class CollideSweepSinglePrecisionLeesEdwardsAVX { float grid_size, float omega_shear, float v_s) : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size), - omega_shear_(omega_shear), v_s_(v_s){}; + omega_shear_(omega_shear), v_s_(v_s) {}; void run(IBlock *block); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.h index bf9a807a67f..57e7980e194 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.h @@ -59,7 +59,7 @@ class CollideSweepSinglePrecisionLeesEdwardsCUDA { float grid_size, float omega_shear, float v_s) : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size), - omega_shear_(omega_shear), v_s_(v_s){}; + omega_shear_(omega_shear), v_s_(v_s) {}; void run(IBlock *block, gpuStream_t stream = nullptr); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.h index 5a9aa9eb3fc..44fef35df24 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.h @@ -60,7 +60,7 @@ class CollideSweepSinglePrecisionThermalized { : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk), omega_even_(omega_even), omega_odd_(omega_odd), omega_shear_(omega_shear), seed_(seed), time_step_(time_step), - configured_(false){}; + configured_(false) {}; void run(IBlock *block); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.h index b5955f045db..8cd829be96c 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.h @@ -60,7 +60,7 @@ class CollideSweepSinglePrecisionThermalizedAVX { : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk), omega_even_(omega_even), omega_odd_(omega_odd), omega_shear_(omega_shear), seed_(seed), time_step_(time_step), - configured_(false){}; + configured_(false) {}; void run(IBlock *block); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.h index cf55997b2b5..09e1dd04acc 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.h @@ -62,7 +62,7 @@ class CollideSweepSinglePrecisionThermalizedCUDA { : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk), omega_even_(omega_even), omega_odd_(omega_odd), omega_shear_(omega_shear), seed_(seed), time_step_(time_step), - configured_(false){}; + configured_(false) {}; void run(IBlock *block, gpuStream_t stream = nullptr); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.h index c472f1f5112..012a4bcc140 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.h @@ -55,7 +55,7 @@ class InitialPDFsSetterDoublePrecision { InitialPDFsSetterDoublePrecision(BlockDataID forceID_, BlockDataID pdfsID_, BlockDataID velocityID_, double rho_0) : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_), - rho_0_(rho_0){}; + rho_0_(rho_0) {}; void run(IBlock *block); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.h index 6f4ab75275a..6d493acdd84 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.h @@ -58,7 +58,7 @@ class InitialPDFsSetterDoublePrecisionCUDA { BlockDataID pdfsID_, BlockDataID velocityID_, double rho_0) : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_), - rho_0_(rho_0){}; + rho_0_(rho_0) {}; void run(IBlock *block, gpuStream_t stream = nullptr); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.h index e1c3260bab9..4a8ea571625 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.h @@ -55,7 +55,7 @@ class InitialPDFsSetterSinglePrecision { InitialPDFsSetterSinglePrecision(BlockDataID forceID_, BlockDataID pdfsID_, BlockDataID velocityID_, float rho_0) : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_), - rho_0_(rho_0){}; + rho_0_(rho_0) {}; void run(IBlock *block); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.h index a3faac64d68..c56b68680ac 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.h @@ -58,7 +58,7 @@ class InitialPDFsSetterSinglePrecisionCUDA { BlockDataID pdfsID_, BlockDataID velocityID_, float rho_0) : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_), - rho_0_(rho_0){}; + rho_0_(rho_0) {}; void run(IBlock *block, gpuStream_t stream = nullptr); diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.h index 87fa2aa0336..73819b58e6f 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.h @@ -54,7 +54,7 @@ class StreamSweepDoublePrecision { public: StreamSweepDoublePrecision(BlockDataID forceID_, BlockDataID pdfsID_, BlockDataID velocityID_) - : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){}; + : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_) {}; ~StreamSweepDoublePrecision() { for (auto p : cache_pdfs_) { diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.h index 52288021ad8..059f786e6b1 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.h @@ -54,7 +54,7 @@ class StreamSweepDoublePrecisionAVX { public: StreamSweepDoublePrecisionAVX(BlockDataID forceID_, BlockDataID pdfsID_, BlockDataID velocityID_) - : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){}; + : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_) {}; ~StreamSweepDoublePrecisionAVX() { for (auto p : cache_pdfs_) { diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.h index 217f90854e2..d6dd319394b 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.h @@ -56,7 +56,7 @@ class StreamSweepDoublePrecisionCUDA { public: StreamSweepDoublePrecisionCUDA(BlockDataID forceID_, BlockDataID pdfsID_, BlockDataID velocityID_) - : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){}; + : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_) {}; ~StreamSweepDoublePrecisionCUDA() { for (auto p : cache_pdfs_) { diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.h index 4425265fc6c..f921333044a 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.h @@ -54,7 +54,7 @@ class StreamSweepSinglePrecision { public: StreamSweepSinglePrecision(BlockDataID forceID_, BlockDataID pdfsID_, BlockDataID velocityID_) - : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){}; + : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_) {}; ~StreamSweepSinglePrecision() { for (auto p : cache_pdfs_) { diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.h index 095c86c6cb9..81cc4fff038 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.h @@ -54,7 +54,7 @@ class StreamSweepSinglePrecisionAVX { public: StreamSweepSinglePrecisionAVX(BlockDataID forceID_, BlockDataID pdfsID_, BlockDataID velocityID_) - : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){}; + : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_) {}; ~StreamSweepSinglePrecisionAVX() { for (auto p : cache_pdfs_) { diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.h index 302b62816b7..16ea5b0ed2c 100644 --- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.h +++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.h @@ -56,7 +56,7 @@ class StreamSweepSinglePrecisionCUDA { public: StreamSweepSinglePrecisionCUDA(BlockDataID forceID_, BlockDataID pdfsID_, BlockDataID velocityID_) - : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){}; + : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_) {}; ~StreamSweepSinglePrecisionCUDA() { for (auto p : cache_pdfs_) { diff --git a/testsuite/python/lb.py b/testsuite/python/lb.py index e5a5ebe9ab3..66acb528814 100644 --- a/testsuite/python/lb.py +++ b/testsuite/python/lb.py @@ -635,10 +635,10 @@ def test_rng(self): system = self.system system.lb = self.lb_class(kT=15., **self.params, **self.lb_params) system.integrator.run(1) - diff = system.lb[0, :, :].population - system.lb[6, :, :].population + difference = system.lb[0,:,:].population - system.lb[6,:,:].population # if the RNG uses the local cell index instead of the global cell index, # the noise will be identical in all blocks, and the RMS is zero - rms = np.sqrt(np.mean(np.square(diff))) + rms = np.sqrt(np.mean(np.square(difference))) self.assertGreater(rms, 0.01, msg="thermal noise might be correlated!") def test_thermalization_force_balance(self): diff --git a/testsuite/python/lb_boundary_ghost_layer.py b/testsuite/python/lb_boundary_ghost_layer.py index 84ce9180f01..7170f2505f5 100644 --- a/testsuite/python/lb_boundary_ghost_layer.py +++ b/testsuite/python/lb_boundary_ghost_layer.py @@ -66,7 +66,7 @@ def quadratic(x, a, b, c): popt_ref = (4e-8, -1e-6, 1e-5) popt, _ = scipy.optimize.curve_fit( quadratic, xdata, ydata, p0=popt_ref) - rtol = 0.33 if self.lbf.single_precision else 0.1 + rtol = 0.3 if self.lbf.single_precision else 0.1 np.testing.assert_allclose(popt, popt_ref, rtol=0.5, atol=0.) np.testing.assert_allclose(ydata, quadratic(xdata, *popt), rtol=rtol, atol=0.)