diff --git a/cmake/FindCUDACompilerNVCC.cmake b/cmake/FindCUDACompilerNVCC.cmake
index 030dc0f312..d9600ecf6d 100644
--- a/cmake/FindCUDACompilerNVCC.cmake
+++ b/cmake/FindCUDACompilerNVCC.cmake
@@ -50,6 +50,8 @@ target_compile_options(
   $<$<CONFIG:Coverage>:-Xptxas=-O3 -Xcompiler=-Og,-g,--coverage,-fprofile-abs-path>
   $<$<CONFIG:RelWithAssert>:-Xptxas=-O3 -Xcompiler=-O3,-g>
   $<$<BOOL:${CMAKE_OSX_SYSROOT}>:-Xcompiler=-isysroot;-Xcompiler=${CMAKE_OSX_SYSROOT}>
+  # workaround for https://github.com/espressomd/espresso/issues/4943
+  $<$<BOOL:${ESPRESSO_BUILD_WITH_CCACHE}>:$<$<CONFIG:Coverage>:--coverage -fprofile-abs-path>>
 )
 
 function(espresso_add_gpu_library)
diff --git a/cmake/unit_test.cmake b/cmake/espresso_unit_test.cmake
similarity index 91%
rename from cmake/unit_test.cmake
rename to cmake/espresso_unit_test.cmake
index 8b87986176..c857d9bed1 100644
--- a/cmake/unit_test.cmake
+++ b/cmake/espresso_unit_test.cmake
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2016-2022 The ESPResSo project
+# Copyright (C) 2016-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,9 +17,12 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-# unit_test function
-function(UNIT_TEST)
-  cmake_parse_arguments(TEST "" "NAME;NUM_PROC" "SRC;DEPENDS" ${ARGN})
+function(ESPRESSO_UNIT_TEST)
+  cmake_parse_arguments(TEST "" "SRC;NAME;NUM_PROC" "DEPENDS" ${ARGN})
+  if(NOT DEFINED TEST_NAME)
+    cmake_path(GET TEST_SRC STEM TEST_NAME)
+    set(TEST_NAME ${TEST_NAME} PARENT_SCOPE)
+  endif()
   if(${TEST_SRC} MATCHES ".*\.cu$")
     espresso_add_gpu_executable(${TEST_NAME} ${TEST_SRC})
   else()
@@ -72,4 +75,4 @@ function(UNIT_TEST)
     ${TEST_NAME} PROPERTIES ENVIRONMENT "${TEST_ENV_VARIABLES}")
 
   add_dependencies(check_unit_tests ${TEST_NAME})
-endfunction(UNIT_TEST)
+endfunction()
diff --git a/maintainer/CI/build_cmake.sh b/maintainer/CI/build_cmake.sh
index 5cd18df07c..6d043c8b59 100755
--- a/maintainer/CI/build_cmake.sh
+++ b/maintainer/CI/build_cmake.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright (C) 2016-2022 The ESPResSo project
+# Copyright (C) 2016-2024 The ESPResSo project
 # Copyright (C) 2014 Olaf Lenz
 #
 # Copying and distribution of this file, with or without modification,
@@ -151,50 +151,18 @@ cmake_params="-D CMAKE_BUILD_TYPE=${build_type} -D ESPRESSO_WARNINGS_ARE_ERRORS=
 cmake_params="${cmake_params} -D CMAKE_INSTALL_PREFIX=/tmp/espresso-unit-tests -D ESPRESSO_INSIDE_DOCKER=ON"
 cmake_params="${cmake_params} -D ESPRESSO_CTEST_ARGS:STRING=-j${check_procs} -D ESPRESSO_TEST_TIMEOUT=${test_timeout}"
 
-if [ "${make_check_benchmarks}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_BENCHMARKS=ON"
-fi
-
-if [ "${with_ccache}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_CCACHE=ON"
-fi
-
-if [ "${with_caliper}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_CALIPER=ON"
-fi
-
-if [ "${with_hdf5}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_HDF5=ON"
-else
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_HDF5=OFF"
-fi
-
-if [ "${with_fftw}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_FFTW=ON"
-else
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_FFTW=OFF"
-fi
-
-if [ "${with_gsl}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_GSL=ON"
-else
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_GSL=OFF"
-fi
-
-if [ "${with_scafacos}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_SCAFACOS=ON"
-else
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_SCAFACOS=OFF"
-fi
-
-if [ "${with_stokesian_dynamics}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_STOKESIAN_DYNAMICS=ON"
-else
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_STOKESIAN_DYNAMICS=OFF"
-fi
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_BENCHMARKS=${make_check_benchmarks}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_CCACHE=${with_ccache}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_CALIPER=${with_caliper}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_HDF5=${with_hdf5}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_FFTW=${with_fftw}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_GSL=${with_gsl}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_SCAFACOS=${with_scafacos}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_STOKESIAN_DYNAMICS=${with_stokesian_dynamics}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_WALBERLA=${with_walberla}"
 
 if [ "${with_walberla}" = true ]; then
-  cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_WALBERLA=ON -D ESPRESSO_BUILD_WITH_WALBERLA_FFT=ON"
+  cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_WALBERLA_FFT=ON"
   if [ "${with_walberla_avx}" = true ]; then
     cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_WALBERLA_AVX=ON"
   fi
@@ -203,39 +171,18 @@ if [ "${with_walberla}" = true ]; then
   mpiexec_preflags="${mpiexec_preflags:+$mpiexec_preflags;}--bind-to;none"
 fi
 
-if [ "${with_coverage}" = true ]; then
-    cmake_params="-D ESPRESSO_BUILD_WITH_COVERAGE=ON ${cmake_params}"
-fi
-
-if [ "${with_coverage_python}" = true ]; then
-    cmake_params="-D ESPRESSO_BUILD_WITH_COVERAGE_PYTHON=ON ${cmake_params}"
-fi
-
-if [ "${with_asan}" = true ]; then
-    cmake_params="-D ESPRESSO_BUILD_WITH_ASAN=ON ${cmake_params}"
-fi
-
-if [ "${with_ubsan}" = true ]; then
-    cmake_params="-D ESPRESSO_BUILD_WITH_UBSAN=ON ${cmake_params}"
-fi
-
-if [ "${with_static_analysis}" = true ]; then
-    cmake_params="-D ESPRESSO_BUILD_WITH_CLANG_TIDY=ON ${cmake_params}"
-fi
-
-if [ "${run_checks}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_TESTS=ON"
-else
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_TESTS=OFF"
-fi
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_COVERAGE=${with_coverage}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_COVERAGE_PYTHON=${with_coverage_python}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_ASAN=${with_asan}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_UBSAN=${with_ubsan}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_CLANG_TIDY=${with_static_analysis}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_CUDA=${with_cuda}"
 
 if [ "${with_cuda}" = true ]; then
-    cmake_params="-D ESPRESSO_BUILD_WITH_CUDA=ON -D CUDAToolkit_ROOT=/usr/lib/cuda ${cmake_params}"
+    cmake_params="${cmake_params} -D CUDAToolkit_ROOT=/usr/lib/cuda"
     if [ "${CUDACXX}" = "" ] && [ "${CXX}" != "" ]; then
-        cmake_params="-D CMAKE_CUDA_FLAGS='--compiler-bindir=$(which "${CXX}")' ${cmake_params}"
+        cmake_params="${cmake_params} -D CMAKE_CUDA_FLAGS='--compiler-bindir=$(which "${CXX}")'"
     fi
-else
-    cmake_params="-D ESPRESSO_BUILD_WITH_CUDA=OFF ${cmake_params}"
 fi
 
 command -v nvidia-smi && nvidia-smi || true
@@ -281,11 +228,10 @@ fi
 # CONFIGURE
 start "CONFIGURE"
 
-MYCONFIG_DIR="${srcdir}/maintainer/configs"
 if [ "${myconfig}" = "default" ]; then
     echo "Using default myconfig."
 else
-    myconfig_file="${MYCONFIG_DIR}/${myconfig}.hpp"
+    myconfig_file="${srcdir}/maintainer/configs/${myconfig}.hpp"
     if [ ! -e "${myconfig_file}" ]; then
         echo "${myconfig_file} does not exist!"
         exit 1
@@ -315,9 +261,8 @@ end "BUILD"
 # library. See details in https://github.com/espressomd/espresso/issues/2249
 # Can't do this check on CUDA though because nvcc creates a host function
 # that just calls exit() for each device function, and can't do this with
-# coverage because gcov 9.0 adds code that calls exit(), and can't do this
 # with walberla because the library calls exit() in assertions.
-if [[ "${with_coverage}" == false && ( "${with_cuda}" == false || "${with_cuda_compiler}" != "nvcc" ) && "${with_walberla}" != "true" ]]; then
+if [[ ( "${with_cuda}" == false || "${with_cuda_compiler}" != "nvcc" ) && "${with_walberla}" != "true" ]]; then
     if nm -o -C $(find . -name '*.so') | grep '[^a-z]exit@@GLIBC'; then
         echo "Found calls to exit() function in shared libraries."
         exit 1
@@ -409,20 +354,7 @@ if [ "${with_coverage}" = true ] || [ "${with_coverage_python}" = true ]; then
     if [ "${with_coverage}" = true ]; then
         echo "Running lcov and gcov..."
         codecov_opts="${codecov_opts} --gcov"
-        lcov --gcov-tool "${GCOV:-gcov}" \
-             --quiet \
-             --ignore-errors graph,mismatch,mismatch,gcov,unused \
-             --directory . \
-             --filter brace,blank,range,region \
-             --capture \
-             --rc lcov_json_module="JSON::XS" \
-             --exclude "/usr/*" \
-             --exclude "$(realpath .)/_deps/*" \
-             --exclude "$(realpath .)/src/python/espressomd/*" \
-             --exclude "$(realpath "${srcdir}")/src/walberla_bridge/src/*/generated_kernels/*" \
-             --exclude "$(realpath "${srcdir}")/libs/*" \
-             --exclude "*/tmpxft_*cudafe1.stub.*" \
-             --output-file coverage.info
+        ./run_lcov.sh coverage.info
     fi
     if [ "${with_coverage_python}" = true ]; then
         echo "Running python3-coverage..."
diff --git a/maintainer/CI/run_lcov.sh b/maintainer/CI/run_lcov.sh
new file mode 100755
index 0000000000..22052acf45
--- /dev/null
+++ b/maintainer/CI/run_lcov.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env sh
+#
+# Copyright (C) 2017-2024 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+set -e
+
+output="${1:-coverage.info}"
+bindir="$(realpath .)"
+srcdir="$(sed -nr "s/^ESPResSo_SOURCE_DIR:STATIC=(.+)/\1/p" "${bindir}/CMakeCache.txt")"
+
+if [ "${srcdir}" = "" ]; then
+  echo "Cannot extract ESPResSo_SOURCE_DIR variable from the CMake cache" >&2
+  exit 2
+fi
+
+lcov --gcov-tool "${GCOV:-gcov}" \
+     --quiet \
+     --ignore-errors graph,mismatch,mismatch,gcov,unused \
+     --directory . \
+     --filter brace,blank,range,region \
+     --capture \
+     --rc lcov_json_module="JSON::XS" \
+     --exclude "/usr/*" \
+     --exclude "*/tmpxft_*cudafe1.stub.*" \
+     --exclude "${bindir}/_deps/*" \
+     --exclude "${bindir}/src/python/espressomd/*" \
+     --exclude "${srcdir}/src/walberla_bridge/src/*/generated_kernels/*" \
+     --exclude "${srcdir}/libs/*" \
+     --output-file "${output}"
diff --git a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cu b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cu
index 7cca223490..c9f8ae4dc9 100644
--- a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cu
+++ b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cu
@@ -46,13 +46,8 @@
 
 #if defined(__NVCC__)
 #define RESTRICT __restrict__
-#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
 #pragma nv_diagnostic push
 #pragma nv_diag_suppress 177 // unused variable
-#else
-#pragma push
-#pragma diag_suppress 177 // unused variable
-#endif
 #elif defined(__clang__)
 #if defined(__CUDA__)
 #if defined(__CUDA_ARCH__)
@@ -60,20 +55,17 @@
 #define RESTRICT __restrict__
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
 #else
 // clang compiling CUDA code in host mode
 #define RESTRICT __restrict__
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
 #endif
 #endif
 #elif defined(__GNUC__) or defined(__GNUG__)
 #define RESTRICT __restrict__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
 #elif defined(_MSC_VER)
 #define RESTRICT __restrict
 #else
@@ -101,62 +93,77 @@ namespace accessor {
 
 namespace Population
 {
-    __global__ void kernel_get_interval(
+// LCOV_EXCL_START
+    __global__ void kernel_get(
         gpu::FieldAccessor< {{dtype}} > pdf,
-        {{dtype}} * RESTRICT const pop )
+        {{dtype}} * RESTRICT pop )
     {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{Q}}u);
         pdf.set( blockIdx, threadIdx );
+        pop += offset;
         if (pdf.isValidPosition()) {
-            uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{Q}}u);
             {% for i in range(Q) -%}
-                pop[offset + {{i}}u] = pdf.get({{i}}u);
+                pop[{{i}}u] = pdf.get({{i}}u);
             {% endfor -%}
         }
     }
 
-    __global__ void kernel_get(
+    __global__ void kernel_set(
         gpu::FieldAccessor< {{dtype}} > pdf,
-        {{dtype}} * RESTRICT const pop )
+        {{dtype}} const * RESTRICT pop )
     {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{Q}}u);
         pdf.set( blockIdx, threadIdx );
+        pop += offset;
         if (pdf.isValidPosition()) {
             {% for i in range(Q) -%}
-                pop[{{i}}u] = pdf.get({{i}}u);
+                pdf.get({{i}}u) = pop[{{i}}u];
             {% endfor -%}
         }
     }
 
-    __global__ void kernel_set_interval(
+    __global__ void kernel_broadcast(
         gpu::FieldAccessor< {{dtype}} > pdf,
-        {{dtype}} const * RESTRICT const pop )
+        {{dtype}} const * RESTRICT pop )
     {
         pdf.set( blockIdx, threadIdx );
         if (pdf.isValidPosition()) {
-            uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{Q}}u);
             {% for i in range(Q) -%}
-                pdf.get({{i}}u) = pop[offset + {{i}}u];
+                pdf.get({{i}}u) = pop[{{i}}u];
             {% endfor -%}
         }
     }
 
-    __global__ void kernel_set(
+    __global__ void kernel_set_vel(
         gpu::FieldAccessor< {{dtype}} > pdf,
-        {{dtype}} const * RESTRICT const pop )
+        gpu::FieldAccessor< {{dtype}} > velocity,
+        gpu::FieldAccessor< {{dtype}} > force,
+        {{dtype}} const * RESTRICT pop )
     {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{Q}}u);
         pdf.set( blockIdx, threadIdx );
+        velocity.set( blockIdx, threadIdx );
+        force.set( blockIdx, threadIdx );
+        pop += offset;
         if (pdf.isValidPosition()) {
             {% for i in range(Q) -%}
-                pdf.get({{i}}u) = pop[{{i}}u];
+                const {{dtype}} f_{{i}} = pdf.get({{i}}u) = pop[{{i}}u];
             {% endfor -%}
+            {{momentum_density_getter | substitute_force_getter_cu | indent(8) }}
+            const {{dtype}} rho_inv = {{dtype}} {1} / rho;
+            {% for i in range(D) -%}
+                velocity.get({{i}}u) = md_{{i}} * rho_inv;
+            {% endfor %}
         }
     }
+// LCOV_EXCL_STOP
 
     std::array<{{dtype}}, {{Q}}u> get(
         gpu::GPUField< {{dtype}} > const * pdf_field,
         Cell const & cell )
     {
         CellInterval ci ( cell, cell );
-        thrust::device_vector< {{dtype}} > dev_data({{Q}}u, {{dtype}} {0});
+        thrust::device_vector< {{dtype}} > dev_data({{Q}}u);
         auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
         auto kernel = gpu::make_kernel( kernel_get );
         kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
@@ -172,7 +179,7 @@ namespace Population
         std::array< {{dtype}}, {{Q}}u > const & pop,
         Cell const & cell )
     {
-        thrust::device_vector< {{dtype}} > dev_data(pop.data(), pop.data() + {{Q}}u);
+        thrust::device_vector< {{dtype}} > dev_data(pop.begin(), pop.end());
         auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
         CellInterval ci ( cell, cell );
         auto kernel = gpu::make_kernel( kernel_set );
@@ -181,14 +188,32 @@ namespace Population
         kernel();
     }
 
+    void set(
+        gpu::GPUField< {{dtype}} > * pdf_field,
+        gpu::GPUField< {{dtype}} > * velocity_field,
+        gpu::GPUField< {{dtype}} > const * force_field,
+        std::array< {{dtype}}, {{Q}}u > const & pop,
+        Cell const & cell )
+    {
+        thrust::device_vector< {{dtype}} > dev_data(pop.begin(), pop.end());
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        CellInterval ci ( cell, cell );
+        auto kernel = gpu::make_kernel( kernel_set_vel );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *velocity_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
+        kernel();
+    }
+
     void initialize(
         gpu::GPUField< {{dtype}} > * pdf_field,
         std::array< {{dtype}}, {{Q}}u > const & pop )
     {
         CellInterval ci = pdf_field->xyzSizeWithGhostLayer();
-        thrust::device_vector< {{dtype}} > dev_data(pop.data(), pop.data() + {{Q}}u);
+        thrust::device_vector< {{dtype}} > dev_data(pop.begin(), pop.end());
         auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-        auto kernel = gpu::make_kernel( kernel_set );
+        auto kernel = gpu::make_kernel( kernel_broadcast );
         kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
         kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
         kernel();
@@ -200,7 +225,7 @@ namespace Population
     {
         thrust::device_vector< {{dtype}} > dev_data(ci.numCells() * {{Q}}u);
         auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-        auto kernel = gpu::make_kernel( kernel_get_interval );
+        auto kernel = gpu::make_kernel( kernel_get );
         kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
         kernel.addParam( dev_data_ptr );
         kernel();
@@ -216,89 +241,99 @@ namespace Population
     {
         thrust::device_vector< {{dtype}} > dev_data(values.begin(), values.end());
         auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-        auto kernel = gpu::make_kernel( kernel_set_interval );
+        auto kernel = gpu::make_kernel( kernel_set );
         kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
         kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
         kernel();
     }
-} // namespace Population
 
-namespace Vector
-{
-    __global__ void kernel_get_interval(
-        gpu::FieldAccessor< {{dtype}} > vec,
-        {{dtype}} * const out )
+    void set(
+        gpu::GPUField< {{dtype}} > * pdf_field,
+        gpu::GPUField< {{dtype}} > * velocity_field,
+        gpu::GPUField< {{dtype}} > const * force_field,
+        std::vector< {{dtype}} > const & values,
+        CellInterval const & ci )
     {
-        vec.set( blockIdx, threadIdx );
-        if (vec.isValidPosition()) {
-            uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
-            {% for i in range(D) -%}
-                out[offset + {{i}}u] = vec.get({{i}}u);
-            {% endfor %}
-        }
+        thrust::device_vector< {{dtype}} > dev_data(values.begin(), values.end());
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_set_vel );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *velocity_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
+        kernel();
     }
+} // namespace Population
 
+namespace Vector
+{
+// LCOV_EXCL_START
     __global__ void kernel_get(
         gpu::FieldAccessor< {{dtype}} > vec,
-        {{dtype}} * const out )
+        {{dtype}} * u_out )
     {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
         vec.set( blockIdx, threadIdx );
+        u_out += offset;
         if (vec.isValidPosition()) {
             {% for i in range(D) -%}
-                out[{{i}}u] = vec.get({{i}}u);
+                u_out[{{i}}u] = vec.get({{i}}u);
             {% endfor %}
         }
     }
 
-    __global__ void kernel_set_interval(
+    __global__ void kernel_set(
         gpu::FieldAccessor< {{dtype}} > vec,
-        {{dtype}} const * RESTRICT const u )
+        {{dtype}} const * RESTRICT u_in )
     {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
         vec.set( blockIdx, threadIdx );
+        u_in += offset;
         if (vec.isValidPosition()) {
-            uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
             {% for i in range(D) -%}
-                vec.get({{i}}u) = u[offset + {{i}}u];
+                vec.get({{i}}u) = u_in[{{i}}u];
             {% endfor %}
         }
     }
 
-    __global__ void kernel_set(
+    __global__ void kernel_broadcast(
         gpu::FieldAccessor< {{dtype}} > vec,
-        const {{dtype}} * RESTRICT const u )
+        {{dtype}} const * RESTRICT u_in )
     {
         vec.set( blockIdx, threadIdx );
         if (vec.isValidPosition()) {
             {% for i in range(D) -%}
-                vec.get({{i}}u) = u[{{i}}u];
+                vec.get({{i}}u) = u_in[{{i}}u];
             {% endfor %}
         }
     }
 
-    __global__ void kernel_add_interval(
+    __global__ void kernel_add(
         gpu::FieldAccessor< {{dtype}} > vec,
-        {{dtype}} const * RESTRICT const u )
+        {{dtype}} const * RESTRICT u_in )
     {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
         vec.set( blockIdx, threadIdx );
+        u_in += offset;
         if (vec.isValidPosition()) {
-            uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
             {% for i in range(D) -%}
-                vec.get({{i}}u) += u[offset + {{i}}u];
+                vec.get({{i}}u) += u_in[{{i}}u];
             {% endfor %}
         }
     }
 
-    __global__ void kernel_add(
+    __global__ void kernel_broadcast_add(
         gpu::FieldAccessor< {{dtype}} > vec,
-        {{dtype}} const * RESTRICT const u )
+        {{dtype}} const * RESTRICT u_in )
     {
         vec.set( blockIdx, threadIdx );
         if (vec.isValidPosition()) {
             {% for i in range(D) -%}
-                vec.get({{i}}u) += u[{{i}}u];
+                vec.get({{i}}u) += u_in[{{i}}u];
             {% endfor %}
         }
     }
+// LCOV_EXCL_STOP
 
     Vector{{D}}< {{dtype}} > get(
         gpu::GPUField< {{dtype}} > const * vec_field,
@@ -351,7 +386,7 @@ namespace Vector
         CellInterval ci = vec_field->xyzSizeWithGhostLayer();
         thrust::device_vector< {{dtype}} > dev_data(vec.data(), vec.data() + {{D}}u);
         auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-        auto kernel = gpu::make_kernel( kernel_set );
+        auto kernel = gpu::make_kernel( kernel_broadcast );
         kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *vec_field, ci ) );
         kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
         kernel();
@@ -364,7 +399,7 @@ namespace Vector
         CellInterval ci = vec_field->xyzSizeWithGhostLayer();
         thrust::device_vector< {{dtype}} > dev_data(vec.data(), vec.data() + {{D}}u);
         auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-        auto kernel = gpu::make_kernel( kernel_add );
+        auto kernel = gpu::make_kernel( kernel_broadcast_add );
         kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *vec_field, ci ) );
         kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
         kernel();
@@ -376,7 +411,7 @@ namespace Vector
     {
         thrust::device_vector< {{dtype}} > dev_data(ci.numCells() * {{D}}u);
         auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-        auto kernel = gpu::make_kernel( kernel_get_interval );
+        auto kernel = gpu::make_kernel( kernel_get );
         kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *vec_field, ci ) );
         kernel.addParam( dev_data_ptr );
         kernel();
@@ -392,7 +427,7 @@ namespace Vector
     {
         thrust::device_vector< {{dtype}} > dev_data(values.begin(), values.end());
         auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-        auto kernel = gpu::make_kernel( kernel_set_interval );
+        auto kernel = gpu::make_kernel( kernel_set );
         kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *vec_field, ci ) );
         kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
         kernel();
@@ -401,6 +436,7 @@ namespace Vector
 
 namespace Interpolation
 {
+// LCOV_EXCL_START
     /** @brief Calculate interpolation weights. */
     static __forceinline__ __device__ void calculate_weights(
         {{dtype}} const *RESTRICT const pos,
@@ -495,6 +531,7 @@ namespace Interpolation
         }
       }
     }
+// LCOV_EXCL_STOP
 
     static dim3 calculate_dim_grid(uint const threads_x,
                                    uint const blocks_per_grid_y,
@@ -552,6 +589,7 @@ namespace Interpolation
 
 namespace Equilibrium
 {
+// LCOV_EXCL_START
     __device__ void kernel_set_device(
         gpu::FieldAccessor< {{dtype}} > pdf,
         {{dtype}} const * RESTRICT const u,
@@ -565,44 +603,49 @@ namespace Equilibrium
             pdf.get({{loop.index0 }}u) = {{eqTerm}};
         {% endfor -%}
     }
+// LCOV_EXCL_STOP
 } // namespace Equilibrium
 
 namespace Density
 {
+// LCOV_EXCL_START
     __global__ void kernel_get(
         gpu::FieldAccessor< {{dtype}} > pdf,
-        {{dtype}} * RESTRICT const out )
+        {{dtype}} * RESTRICT rho_out )
     {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
         pdf.set( blockIdx, threadIdx );
+        rho_out += offset;
         if (pdf.isValidPosition()) {
-            uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, uint(1u));
             {% for i in range(Q) -%}
                 {{dtype}} const f_{{i}} = pdf.get({{i}}u);
             {% endfor -%}
             {{density_getters | indent(12)}}
-            out[offset] = rho;
+            rho_out[0u] = rho;
         }
     }
 
     __global__ void kernel_set(
         gpu::FieldAccessor< {{dtype}} > pdf,
-        {{dtype}} const * RESTRICT const rho_in )
+        {{dtype}} const * RESTRICT rho_in )
     {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
         pdf.set( blockIdx, threadIdx );
+        rho_in += offset;
         if (pdf.isValidPosition()) {
-            uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, uint(1u));
             {% for i in range(Q) -%}
                 {{dtype}} const f_{{i}} = pdf.get({{i}}u);
             {% endfor -%}
             {{unshifted_momentum_density_getter | indent(12)}}
 
             // calculate current velocity (before density change)
-            {{dtype}} const conversion = {{dtype}}(1) / rho;
-            {{dtype}} const u_old[{{D}}] = { {% for i in range(D) %}momdensity_{{i}} * conversion{% if not loop.last %}, {% endif %}{% endfor %} };
+            {{dtype}} const rho_inv = {{dtype}} {1} / rho;
+            {{dtype}} const u_old[{{D}}] = { {% for i in range(D) %}momdensity_{{i}} * rho_inv{% if not loop.last %}, {% endif %}{% endfor %} };
 
-            Equilibrium::kernel_set_device(pdf, u_old, rho_in[offset] {%if not compressible %} + {{dtype}}(1) {%endif%});
+            Equilibrium::kernel_set_device(pdf, u_old, rho_in[0u] {%if not compressible %} + {{dtype}} {1} {%endif%});
         }
     }
+// LCOV_EXCL_STOP
 
     {{dtype}} get(
         gpu::GPUField< {{dtype}} > const * pdf_field,
@@ -619,20 +662,6 @@ namespace Density
         return rho;
     }
 
-    void set(
-        gpu::GPUField< {{dtype}} > * pdf_field,
-        const {{dtype}} rho,
-        Cell const & cell )
-    {
-        CellInterval ci ( cell, cell );
-        thrust::device_vector< {{dtype}} > dev_data(1u, rho);
-        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-        auto kernel = gpu::make_kernel( kernel_set );
-        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
-        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
-        kernel();
-    }
-
     std::vector< {{dtype}} > get(
         gpu::GPUField< {{dtype}} > const * pdf_field,
         CellInterval const & ci )
@@ -643,11 +672,25 @@ namespace Density
         kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
         kernel.addParam( dev_data_ptr );
         kernel();
-        std::vector< {{dtype}} > out(ci.numCells());
+        std::vector< {{dtype}} > out(dev_data.size());
         thrust::copy(dev_data.begin(), dev_data.end(), out.begin());
         return out;
     }
 
+    void set(
+        gpu::GPUField< {{dtype}} > * pdf_field,
+        const {{dtype}} rho,
+        Cell const & cell )
+    {
+        CellInterval ci ( cell, cell );
+        thrust::device_vector< {{dtype}} > dev_data(1u, rho);
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_set );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
+        kernel();
+    }
+
     void set(
         gpu::GPUField< {{dtype}} > * pdf_field,
         std::vector< {{dtype}} > const & values,
@@ -664,31 +707,95 @@ namespace Density
 
 namespace Velocity
 {
+// LCOV_EXCL_START
+    __global__ void kernel_get(
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        gpu::FieldAccessor< {{dtype}} > force,
+        {{dtype}} * RESTRICT u_out )
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
+        pdf.set( blockIdx, threadIdx );
+        force.set( blockIdx, threadIdx );
+        u_out += offset;
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                {{dtype}} const f_{{i}} = pdf.get({{i}}u);
+            {% endfor -%}
+            {{momentum_density_getter | substitute_force_getter_cu | indent(8) }}
+            auto const rho_inv = {{dtype}} {1} / rho;
+            {% for i in range(D) -%}
+                u_out[{{i}}u] = md_{{i}} * rho_inv;
+            {% endfor %}
+        }
+    }
+
     __global__ void kernel_set(
         gpu::FieldAccessor< {{dtype}} > pdf,
+        gpu::FieldAccessor< {{dtype}} > velocity,
         gpu::FieldAccessor< {{dtype}} > force,
-        {{dtype}} const * RESTRICT const u_in )
+        {{dtype}} const * RESTRICT u_in )
     {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
         pdf.set( blockIdx, threadIdx );
+        velocity.set( blockIdx, threadIdx );
         force.set( blockIdx, threadIdx );
+        u_in += offset;
         if (pdf.isValidPosition()) {
-            uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, uint({{D}}u));
-            uint const bufsize = {{D}}u;
-            {{dtype}} const * RESTRICT const u = u_in + bufsize * offset;
             {% for i in range(Q) -%}
                 {{dtype}} const f_{{i}} = pdf.get({{i}}u);
             {% endfor -%}
+            {{dtype}} const * RESTRICT const u = u_in;
             {{density_getters | indent(8)}}
             {{density_velocity_setter_macroscopic_values | substitute_force_getter_cu | indent(8)}}
+            {% for i in range(D) -%}
+                velocity.get({{i}}u) = u_in[{{i}}u];
+            {% endfor %}
             {{dtype}} u_new[{{D}}] = { {% for i in range(D) %}u_{{i}}{% if not loop.last %}, {% endif %}{% endfor %} };
 
             Equilibrium::kernel_set_device(pdf, u_new, rho {%if not compressible %} + {{dtype}}(1) {%endif%});
         }
     }
+// LCOV_EXCL_STOP
+
+    Vector{{D}}< {{dtype}} > get(
+        gpu::GPUField< {{dtype}} > const * pdf_field,
+        gpu::GPUField< {{dtype}} > const * force_field,
+        Cell const & cell )
+    {
+        CellInterval ci ( cell, cell );
+        thrust::device_vector< {{dtype}} > dev_data({{D}}u);
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_get );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( dev_data_ptr );
+        kernel();
+        Vector{{D}}< {{dtype}} > vec;
+        thrust::copy(dev_data.begin(), dev_data.end(), vec.data());
+        return vec;
+    }
+
+    std::vector< {{dtype}} > get(
+        gpu::GPUField< {{dtype}} > const * pdf_field,
+        gpu::GPUField< {{dtype}} > const * force_field,
+        CellInterval const & ci )
+    {
+        thrust::device_vector< {{dtype}} > dev_data({{D}}u);
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_get );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( dev_data_ptr );
+        kernel();
+        std::vector< {{dtype}} > out(dev_data.size());
+        thrust::copy(dev_data.begin(), dev_data.end(), out.data());
+        return out;
+    }
 
     void set(
         gpu::GPUField< {{dtype}} > * pdf_field,
-        gpu::GPUField< {{dtype}} > * force_field,
+        gpu::GPUField< {{dtype}} > * velocity_field,
+        gpu::GPUField< {{dtype}} > const * force_field,
         Vector{{D}}< {{dtype}} > const & u,
         Cell const & cell )
     {
@@ -697,33 +804,121 @@ namespace Velocity
         auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
         auto kernel = gpu::make_kernel( kernel_set );
         kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *velocity_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
+        kernel();
+    }
+
+    void set(
+        gpu::GPUField< {{dtype}} > * pdf_field,
+        gpu::GPUField< {{dtype}} > * velocity_field,
+        gpu::GPUField< {{dtype}} > const * force_field,
+        std::vector< {{dtype}} > const & values,
+        CellInterval const & ci )
+    {
+        thrust::device_vector< {{dtype}} > dev_data(values.begin(), values.end());
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_set );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *velocity_field, ci ) );
         kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
         kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
         kernel();
     }
 } // namespace Velocity
 
+namespace Force {
+// LCOV_EXCL_START
+    __global__ void kernel_set(
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        gpu::FieldAccessor< {{dtype}} > velocity,
+        gpu::FieldAccessor< {{dtype}} > force,
+        {{dtype}} const * RESTRICT f_in )
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
+        pdf.set( blockIdx, threadIdx );
+        velocity.set( blockIdx, threadIdx );
+        force.set( blockIdx, threadIdx );
+        f_in += offset;
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                {{dtype}} const f_{{i}} = pdf.get({{i}}u);
+            {% endfor -%}
+
+            {{momentum_density_getter | substitute_force_getter_pattern("force->get\(x, ?y, ?z, ?([0-9])u?\)", "f_in[\g<1>u]") | indent(8) }}
+            auto const rho_inv = {{dtype}} {1} / rho;
+
+            {% for i in range(D) -%}
+                force.get({{i}}u) = f_in[{{i}}u];
+            {% endfor %}
+
+            {% for i in range(D) -%}
+                velocity.get({{i}}u) = md_{{i}} * rho_inv;
+            {% endfor %}
+        }
+    }
+// LCOV_EXCL_STOP
+
+    void
+    set( gpu::GPUField< {{dtype}} > const * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > * force_field,
+         Vector{{D}}< {{dtype}} > const & u,
+         Cell const & cell )
+    {
+        CellInterval ci ( cell, cell );
+        thrust::device_vector< {{dtype}} > dev_data(u.data(), u.data() + {{D}}u);
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_set );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *velocity_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
+        kernel();
+    }
+
+    void
+    set( gpu::GPUField< {{dtype}} > const * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > * force_field,
+         std::vector< {{dtype}} > const & values,
+         CellInterval const & ci )
+    {
+        thrust::device_vector< {{dtype}} > dev_data(values.begin(), values.end());
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_set );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *velocity_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
+        kernel();
+    }
+} // namespace Force
+
 namespace MomentumDensity
 {
+// LCOV_EXCL_START
     __global__ void kernel_sum(
         gpu::FieldAccessor< {{dtype}} > pdf,
         gpu::FieldAccessor< {{dtype}} > force,
-        {{dtype}} * RESTRICT const out )
+        {{dtype}} * RESTRICT out )
     {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
         pdf.set( blockIdx, threadIdx );
         force.set( blockIdx, threadIdx );
+        out += offset;
         if (pdf.isValidPosition()) {
-            uint const bufsize = {{D}}u;
-            uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, bufsize);
             {% for i in range(Q) -%}
                 {{dtype}} const f_{{i}} = pdf.get({{i}}u);
             {% endfor -%}
             {{momentum_density_getter | substitute_force_getter_cu | indent(8) }}
             {% for i in range(D) -%}
-                out[bufsize * offset + {{i}}u] += md_{{i}};
+                out[{{i}}u] += md_{{i}};
             {% endfor %}
         }
     }
+// LCOV_EXCL_STOP
 
     Vector{{D}}< {{dtype}} > reduce(
         gpu::GPUField< {{dtype}} > const * pdf_field,
@@ -748,25 +943,25 @@ namespace MomentumDensity
 
 namespace PressureTensor
 {
+// LCOV_EXCL_START
     __global__ void kernel_get(
         gpu::FieldAccessor< {{dtype}} > pdf,
-        {{dtype}} * RESTRICT const out )
+        {{dtype}} * RESTRICT p_out )
     {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D**2}}u);
         pdf.set( blockIdx, threadIdx );
+        p_out += offset;
         if (pdf.isValidPosition()) {
-            uint const bufsize = {{D**2}}u;
-            uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, bufsize);
             {% for i in range(Q) -%}
                 {{dtype}} const f_{{i}} = pdf.get({{i}}u);
             {% endfor -%}
             {{second_momentum_getter | indent(12) }}
-            {% for i in range(D) -%}
-                {% for j in range(D) -%}
-                    out[bufsize * offset + {{i*D+j}}u] = p_{{i*D+j}};
-                {% endfor %}
+            {% for i in range(D**2) -%}
+                p_out[{{i}}u] = p_{{i}};
             {% endfor %}
         }
     }
+// LCOV_EXCL_STOP
 
     Matrix{{D}}< {{dtype}} > get(
         gpu::GPUField< {{dtype}} > const * pdf_field,
@@ -780,7 +975,22 @@ namespace PressureTensor
         kernel.addParam( dev_data_ptr );
         kernel();
         Matrix{{D}}< {{dtype}} > out;
-        thrust::copy(dev_data.begin(), dev_data.begin() + {{D**2}}u, out.data());
+        thrust::copy(dev_data.begin(), dev_data.end(), out.data());
+        return out;
+   }
+
+    std::vector< {{dtype}} > get(
+        gpu::GPUField< {{dtype}} > const * pdf_field,
+        CellInterval const & ci )
+    {
+        thrust::device_vector< {{dtype}} > dev_data({{D**2}}u * ci.numCells());
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_get );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addParam( dev_data_ptr );
+        kernel();
+        std::vector< {{dtype}} > out(dev_data.size());
+        thrust::copy(dev_data.begin(), dev_data.end(), out.data());
         return out;
    }
 } // namespace PressureTensor
diff --git a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cuh b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cuh
index e75586057b..65f776abee 100644
--- a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cuh
+++ b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cuh
@@ -39,18 +39,6 @@
 #include <tuple>
 #include <vector>
 
-#ifdef WALBERLA_CXX_COMPILER_IS_GNU
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#endif
-
-#ifdef WALBERLA_CXX_COMPILER_IS_CLANG
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
-#endif
-
 namespace walberla {
 namespace {{namespace}} {
 namespace accessor {
@@ -65,6 +53,13 @@ namespace Population {
     set( gpu::GPUField< {{dtype}} > * pdf_field,
          std::array< {{dtype}}, {{Q}}u > const & pop,
          Cell const & cell );
+    /** @brief Set populations and recalculate velocities on a single cell. */
+    void
+    set( gpu::GPUField< {{dtype}} > * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         std::array< {{dtype}}, {{Q}}u > const & pop,
+         Cell const & cell );
     /** @brief Initialize all cells with the same value. */
     void initialize(
          gpu::GPUField< {{dtype}} > * pdf_field,
@@ -78,6 +73,13 @@ namespace Population {
     set( gpu::GPUField< {{dtype}} > * pdf_field,
          std::vector< {{dtype}} > const & values,
          CellInterval const & ci );
+    /** @brief Set populations and recalculate velocities on a cell interval. */
+    void
+    set( gpu::GPUField< {{dtype}} > * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         std::vector< {{dtype}} > const & values,
+         CellInterval const & ci );
 } // namespace Population
 
 namespace Vector {
@@ -141,13 +143,43 @@ namespace Density {
 } // namespace Density
 
 namespace Velocity {
+    Vector{{D}}< {{dtype}} >
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         Cell const & cell );
+    std::vector< {{dtype}} >
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         CellInterval const & ci );
     void
     set( gpu::GPUField< {{dtype}} > * pdf_field,
-         gpu::GPUField< {{dtype}} > * force_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
          Vector{{D}}< {{dtype}} > const & u,
          Cell const & cell );
+    void
+    set( gpu::GPUField< {{dtype}} > * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         std::vector< {{dtype}} > const & values,
+         CellInterval const & ci );
 } // namespace Velocity
 
+namespace Force {
+    void
+    set( gpu::GPUField< {{dtype}} > const * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > * force_field,
+         Vector{{D}}< {{dtype}} > const & u,
+         Cell const & cell );
+    void
+    set( gpu::GPUField< {{dtype}} > const * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > * force_field,
+         std::vector< {{dtype}} > const & values,
+         CellInterval const & ci );
+} // namespace Force
+
 namespace DensityAndVelocity {
     std::tuple< {{dtype}} , Vector{{D}}< {{dtype}} > >
     get( gpu::GPUField< {{dtype}} > const * pdf_field,
@@ -178,16 +210,11 @@ namespace PressureTensor {
     Matrix{{D}}< {{dtype}} >
     get( gpu::GPUField< {{dtype}} > const * pdf_field,
          Cell const & cell );
+    std::vector< {{dtype}} >
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         CellInterval const & ci );
 } // namespace PressureTensor
 
 } // namespace accessor
 } // namespace {{namespace}}
 } // namespace walberla
-
-#ifdef WALBERLA_CXX_COMPILER_IS_GNU
-#pragma GCC diagnostic pop
-#endif
-
-#ifdef WALBERLA_CXX_COMPILER_IS_CLANG
-#pragma clang diagnostic pop
-#endif
diff --git a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.h b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.h
index 95f8992555..d443243bba 100644
--- a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.h
+++ b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.h
@@ -37,19 +37,18 @@
 
 #include <array>
 #include <cassert>
+#include <iterator>
 #include <tuple>
 #include <vector>
 
 #ifdef WALBERLA_CXX_COMPILER_IS_GNU
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 #ifdef WALBERLA_CXX_COMPILER_IS_CLANG
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
 #endif
 
 namespace walberla {
@@ -58,7 +57,7 @@ namespace accessor {
 
 namespace Population
 {
-    inline std::array<{{dtype}}, {{Q}}u>
+    inline auto
     get( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
          Cell const & cell )
     {
@@ -81,6 +80,28 @@ namespace Population
         {% endfor -%}
     }
 
+    inline void
+    set( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * velocity_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field,
+         std::array<{{dtype}}, {{Q}}u> const & pop,
+         Cell const & cell )
+    {
+        auto & xyz0 = pdf_field->get(cell, uint_t{ 0u });
+        {% for i in range(Q) -%}
+            const {{dtype}} f_{{i}} = pdf_field->getF( &xyz0, uint_t{ {{i}}u }) = pop[{{i}}u];
+        {% endfor -%}
+
+        {% for c in "xyz" -%}
+            const auto {{c}} = cell.{{c}}();
+        {% endfor -%}
+        {{momentum_density_getter | substitute_force_getter_cpp | indent(8) }}
+        const auto rho_inv = {{dtype}} {1} / rho;
+        {% for i in range(D) -%}
+            velocity_field->get(cell, uint_t{ {{i}}u }) = md_{{i}} * rho_inv;
+        {% endfor -%}
+    }
+
     inline void
     initialize( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > * pdf_field,
                 std::array<{{dtype}}, {{Q}}u> const & pop)
@@ -93,7 +114,7 @@ namespace Population
          });
      }
 
-    inline std::vector< {{dtype}} >
+    inline auto
     get( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
          CellInterval const & ci )
     {
@@ -118,15 +139,42 @@ namespace Population
          CellInterval const & ci )
     {
         assert(uint_c(values.size()) == ci.numCells() * uint_t({{Q}}u));
-        auto values_ptr = values.data();
+        auto pop = values.data();
+        for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+            for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+                for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+                    {{dtype}} & xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
+                    {% for i in range(Q) -%}
+                        pdf_field->getF( &xyz0, uint_t{ {{i}}u }) = pop[{{i}}u];
+                    {% endfor -%}
+                    std::advance(pop, {{Q}});
+                }
+            }
+        }
+    }
+
+    inline void
+    set( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * velocity_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field,
+         std::vector< {{dtype}} > const & values,
+         CellInterval const & ci )
+    {
+        assert(uint_c(values.size()) == ci.numCells() * uint_t({{Q}}u));
+        auto pop = values.data();
         for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
             for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
                 for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
                     {{dtype}} & xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
                     {% for i in range(Q) -%}
-                        pdf_field->getF( &xyz0, uint_t{ {{i}}u }) = values_ptr[{{i}}u];
+                        const {{dtype}} f_{{i}} = pdf_field->getF( &xyz0, uint_t{ {{i}}u }) = pop[{{i}}u];
+                    {% endfor -%}
+                    {{momentum_density_getter | substitute_force_getter_cpp | indent(12) }}
+                    const auto rho_inv = {{dtype}} {1} / rho;
+                    {% for i in range(D) -%}
+                        velocity_field->get(x, y, z, uint_t{ {{i}}u }) = md_{{i}} * rho_inv;
                     {% endfor -%}
-                    values_ptr += {{Q}}u;
+                    std::advance(pop, {{Q}});
                 }
             }
         }
@@ -135,7 +183,7 @@ namespace Population
 
 namespace Vector
 {
-    inline Vector{{D}}< {{dtype}} >
+    inline auto
     get( GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * vec_field,
          Cell const & cell )
     {
@@ -193,7 +241,7 @@ namespace Vector
          });
      }
 
-    inline std::vector< {{dtype}} >
+    inline auto
     get( GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * vec_field,
          CellInterval const & ci )
     {
@@ -226,7 +274,7 @@ namespace Vector
                     {% for i in range(D) -%}
                         vec_field->getF( &xyz0, uint_t{ {{i}}u }) = values_ptr[{{i}}u];
                     {% endfor -%}
-                    values_ptr += {{D}}u;
+                    std::advance(values_ptr, {{D}});
                 }
             }
         }
@@ -237,11 +285,11 @@ namespace EquilibriumDistribution
 {
     inline {{dtype}}
     get( stencil::Direction const direction,
-         Vector{{D}}< {{dtype}} > const & u = Vector{{D}}< {{dtype}} >( {{dtype}}(0.0) ),
-         {{dtype}} rho = {{dtype}}(1.0) )
+         Vector{{D}}< {{dtype}} > const & u = Vector{{D}}< {{dtype}} >( {{dtype}} {0} ),
+         {{dtype}} rho = {{dtype}} {1} )
     {
         {% if not compressible %}
-        rho -= {{dtype}}(1.0);
+        rho -= {{dtype}} {1};
         {% endif %}
         {{equilibrium_from_direction}}
     }
@@ -256,7 +304,7 @@ namespace Equilibrium
          Cell const & cell )
     {
         {%if not compressible %}
-        rho -= {{dtype}}(1.0);
+        rho -= {{dtype}} {1};
         {%endif %}
 
         {{dtype}} & xyz0 = pdf_field->get(cell, uint_t{ 0u });
@@ -293,13 +341,13 @@ namespace Density
         {{unshifted_momentum_density_getter | indent(8)}}
 
         // calculate current velocity (before density change)
-        const {{dtype}} conversion = {{dtype}}(1) / rho;
+        const {{dtype}} conversion = {{dtype}} {1} / rho;
         Vector{{D}}< {{dtype}} > velocity;
         {% for i in range(D) -%}
             velocity[{{i}}u] = momdensity_{{i}} * conversion;
         {% endfor %}
 
-        Equilibrium::set(pdf_field, velocity, rho_in {%if not compressible %} + {{dtype}}(1) {%endif%}, cell);
+        Equilibrium::set(pdf_field, velocity, rho_in {%if not compressible %} + {{dtype}} {1} {%endif%}, cell);
     }
 
     inline std::vector< {{dtype}} >
@@ -341,13 +389,13 @@ namespace Density
                     {{unshifted_momentum_density_getter | indent(12)}}
 
                     // calculate current velocity (before density change)
-                    const {{dtype}} conversion = {{dtype}}(1) / rho;
+                    const {{dtype}} conversion = {{dtype}} {1} / rho;
                     Vector{{D}}< {{dtype}} > velocity;
                     {% for i in range(D) -%}
                         velocity[{{i}}u] = momdensity_{{i}} * conversion;
                     {% endfor %}
 
-                    Equilibrium::set(pdf_field, velocity, *values_it {%if not compressible %} + {{dtype}}(1) {%endif%}, Cell{x, y, z});
+                    Equilibrium::set(pdf_field, velocity, *values_it {%if not compressible %} + {{dtype}} {1} {%endif%}, Cell{x, y, z});
                     ++values_it;
                 }
             }
@@ -357,8 +405,53 @@ namespace Density
 
 namespace Velocity
 {
+    inline auto
+    get( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field,
+         Cell const & cell )
+    {
+        const {{dtype}} & xyz0 = pdf_field->get(cell, uint_t{ 0u });
+        {% for i in range(Q) -%}
+            const {{dtype}} f_{{i}} = pdf_field->getF( &xyz0, uint_t{ {{i}}u });
+        {% endfor -%}
+
+        {% for c in "xyz" -%}
+            const auto {{c}} = cell.{{c}}();
+        {% endfor -%}
+        {{momentum_density_getter | substitute_force_getter_cpp | indent(8) }}
+        const {{dtype}} rho_inv = {{dtype}} {1} / rho;
+
+        return Vector3<{{dtype}}>(md_0 * rho_inv, md_1 * rho_inv, md_2 * rho_inv);
+    }
+
+    inline auto
+    get( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field,
+         CellInterval const & ci )
+    {
+        std::vector< {{dtype}} > out;
+        out.reserve(ci.numCells() * uint_t({{D}}u));
+        for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+            for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+                for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+                    const {{dtype}} & xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
+                    {% for i in range(Q) -%}
+                        const {{dtype}} f_{{i}} = pdf_field->getF( &xyz0, uint_t{ {{i}}u });
+                    {% endfor -%}
+                    {{momentum_density_getter | substitute_force_getter_cpp | indent(12) }}
+                    const {{dtype}} rho_inv = {{dtype}} {1} / rho;
+                    {% for i in range(D) -%}
+                        out.emplace_back(md_{{i}} * rho_inv);
+                    {% endfor -%}
+                }
+            }
+        }
+        return out;
+    }
+
     inline void
     set( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * velocity_field,
          GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field,
          Vector{{D}}< {{dtype}} > const & u,
          Cell const & cell )
@@ -373,14 +466,113 @@ namespace Velocity
             const auto {{c}} = cell.{{c}}();
         {% endfor -%}
         {{density_velocity_setter_macroscopic_values | substitute_force_getter_cpp | indent(8)}}
+        {% for i in range(D) -%}
+            velocity_field->get(x, y, z, uint_t{ {{i}}u }) = u[{{i}}u];
+        {% endfor %}
 
-        Equilibrium::set(pdf_field, Vector{{D}}<{{dtype}}>({% for i in range(D) %}u_{{i}}{% if not loop.last %}, {% endif %}{% endfor %}), rho {%if not compressible %} + {{dtype}}(1) {%endif%}, cell);
+        Equilibrium::set(pdf_field, Vector{{D}}<{{dtype}}>({% for i in range(D) %}u_{{i}}{% if not loop.last %}, {% endif %}{% endfor %}), rho {%if not compressible %} + {{dtype}} {1} {%endif%}, cell);
+    }
+
+    inline void
+    set( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * velocity_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field,
+         std::vector< {{dtype}} > const & values,
+         CellInterval const & ci )
+    {
+        assert(uint_c(values.size()) == ci.numCells() * uint_t({{D}}u));
+        auto u = values.data();
+        for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+            for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+                for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+                    {{dtype}} & pdf_xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
+                    {{dtype}} & vel_xyz0 = velocity_field->get(x, y, z, uint_t{ 0u });
+                    {% for i in range(Q) -%}
+                        const {{dtype}} f_{{i}} = pdf_field->getF( &pdf_xyz0, uint_t{ {{i}}u });
+                    {% endfor -%}
+                    {{density_getters | indent(8)}}
+
+                    {{density_velocity_setter_macroscopic_values | substitute_force_getter_cpp | indent(8)}}
+                    {% for i in range(D) -%}
+                        velocity_field->getF( &vel_xyz0, uint_t{ {{i}}u }) = u[{{i}}u];
+                    {% endfor %}
+                    std::advance(u, {{D}});
+
+                    Equilibrium::set(pdf_field, Vector{{D}}<{{dtype}}>({% for i in range(D) %}u_{{i}}{% if not loop.last %}, {% endif %}{% endfor %}), rho {%if not compressible %} + {{dtype}} {1} {%endif%}, Cell{x, y, z});
+                }
+            }
+        }
     }
 } // namespace Velocity
 
+namespace Force
+{
+    inline void
+    set( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * velocity_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * force_field,
+         Vector{{D}}< {{dtype}} > const & force,
+         Cell const & cell )
+    {
+        {{dtype}} const & pdf_xyz0 = pdf_field->get(cell, uint_t{ 0u });
+        {{dtype}} & vel_xyz0 = velocity_field->get(cell, uint_t{ 0u });
+        {{dtype}} & laf_xyz0 = force_field->get(cell, uint_t{ 0u });
+        {% for i in range(Q) -%}
+            const {{dtype}} f_{{i}} = pdf_field->getF( &pdf_xyz0, uint_t{ {{i}}u });
+        {% endfor -%}
+
+        {{momentum_density_getter | substitute_force_getter_pattern("force->get\(x, ?y, ?z, ?([0-9])u?\)", "force[\g<1>u]") | indent(8) }}
+        auto const rho_inv = {{dtype}} {1} / rho;
+
+        {% for i in range(D) -%}
+            force_field->getF( &laf_xyz0, uint_t{ {{i}}u }) = force[{{i}}u];
+        {% endfor %}
+
+        {% for i in range(D) -%}
+            velocity_field->getF( &vel_xyz0, uint_t{ {{i}}u }) = md_{{i}} * rho_inv;
+        {% endfor %}
+    }
+
+    inline void
+    set( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * velocity_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * force_field,
+         std::vector< {{dtype}} > const & values,
+         CellInterval const & ci )
+    {
+        assert(uint_c(values.size()) == ci.numCells() * uint_t({{D}}u));
+        auto force = values.data();
+        for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+            for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+                for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+                    {{dtype}} const & pdf_xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
+                    {{dtype}} & vel_xyz0 = velocity_field->get(x, y, z, uint_t{ 0u });
+                    {{dtype}} & laf_xyz0 = force_field->get(x, y, z, uint_t{ 0u });
+                    {% for i in range(Q) -%}
+                        const {{dtype}} f_{{i}} = pdf_field->getF( &pdf_xyz0, uint_t{ {{i}}u });
+                    {% endfor -%}
+
+                    {{momentum_density_getter | substitute_force_getter_pattern("force->get\(x, ?y, ?z, ?([0-9])u?\)", "force[\g<1>u]") | indent(12) }}
+                    auto const rho_inv = {{dtype}} {1} / rho;
+
+                    {% for i in range(D) -%}
+                        force_field->getF( &laf_xyz0, uint_t{ {{i}}u }) = force[{{i}}u];
+                    {% endfor %}
+
+                    {% for i in range(D) -%}
+                        velocity_field->getF( &vel_xyz0, uint_t{ {{i}}u }) = md_{{i}} * rho_inv;
+                    {% endfor %}
+
+                    std::advance(force, {{D}});
+                }
+            }
+        }
+    }
+} // namespace Force
+
 namespace MomentumDensity
 {
-    inline Vector{{D}}< {{dtype}} >
+    inline auto
     reduce( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
             GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field )
     {
@@ -403,7 +595,7 @@ namespace MomentumDensity
 
 namespace PressureTensor
 {
-    inline Matrix{{D}}< {{dtype}} >
+    inline auto
     get( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
          Cell const & cell )
    {
@@ -422,6 +614,33 @@ namespace PressureTensor
         {% endfor %}
         return pressureTensor;
    }
+
+    inline auto
+    get( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
+         CellInterval const & ci )
+    {
+        std::vector< {{dtype}} > out;
+        out.reserve(ci.numCells() * uint_t({{D**2}}u));
+        for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+            for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+                for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+                    const {{dtype}} & xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
+                    {% for i in range(Q) -%}
+                        const {{dtype}} f_{{i}} = pdf_field->getF( &xyz0, uint_t{ {{i}}u });
+                    {% endfor -%}
+
+                    {{second_momentum_getter | indent(12) }}
+
+                    {% for i in range(D) -%}
+                        {% for j in range(D) -%}
+                            out.emplace_back(p_{{i*D+j}});
+                        {% endfor %}
+                    {% endfor %}
+                }
+            }
+        }
+        return out;
+    }
 } // namespace PressureTensor
 
 } // namespace accessor
diff --git a/maintainer/walberla_kernels/walberla_lbm_generation.py b/maintainer/walberla_kernels/walberla_lbm_generation.py
index f746fe2f8b..6aec095662 100644
--- a/maintainer/walberla_kernels/walberla_lbm_generation.py
+++ b/maintainer/walberla_kernels/walberla_lbm_generation.py
@@ -19,6 +19,7 @@
 #
 
 import os
+import re
 import sympy as sp
 import pystencils as ps
 import lbmpy_walberla
@@ -104,6 +105,12 @@ def equations_to_code(equations, variable_prefix="",
     return "\n".join(result)
 
 
+def substitute_force_getter_pattern(code, pattern, subst):
+    re_pat = re.compile(pattern)
+    assert re_pat.search(code) is not None, f"pattern '{pattern} not found in '''\n{code}\n'''"  # nopep8
+    return re_pat.sub(subst, code)
+
+
 def substitute_force_getter_cpp(code):
     field_getter = "force->"
     assert field_getter in code is not None, f"pattern '{field_getter} not found in '''\n{code}\n'''"  # nopep8
@@ -120,6 +127,7 @@ def substitute_force_getter_cu(code):
 def add_espresso_filters_to_jinja_env(jinja_env):
     jinja_env.filters["substitute_force_getter_cpp"] = substitute_force_getter_cpp
     jinja_env.filters["substitute_force_getter_cu"] = substitute_force_getter_cu
+    jinja_env.filters["substitute_force_getter_pattern"] = substitute_force_getter_pattern
 
 
 def generate_macroscopic_values_accessors(ctx, config, lb_method, templates):
diff --git a/src/core/reaction_methods/tests/CMakeLists.txt b/src/core/reaction_methods/tests/CMakeLists.txt
index 7dd67a9e63..e63fa9630a 100644
--- a/src/core/reaction_methods/tests/CMakeLists.txt
+++ b/src/core/reaction_methods/tests/CMakeLists.txt
@@ -1,4 +1,5 @@
-# Copyright (C) 2021-2022 The ESPResSo project
+#
+# Copyright (C) 2021-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -14,14 +15,13 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
 
-include(unit_test)
+include(espresso_unit_test)
 
-unit_test(NAME SingleReaction_test SRC SingleReaction_test.cpp DEPENDS
-          espresso::core)
-unit_test(NAME ReactionAlgorithm_test SRC ReactionAlgorithm_test.cpp DEPENDS
-          espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
-unit_test(NAME particle_tracking_test SRC particle_tracking_test.cpp DEPENDS
-          espresso::core Boost::mpi MPI::MPI_CXX)
-unit_test(NAME reaction_methods_utils_test SRC reaction_methods_utils_test.cpp
-          DEPENDS espresso::core)
+espresso_unit_test(SRC SingleReaction_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC ReactionAlgorithm_test.cpp DEPENDS espresso::core
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC particle_tracking_test.cpp DEPENDS espresso::core
+                   Boost::mpi MPI::MPI_CXX)
+espresso_unit_test(SRC reaction_methods_utils_test.cpp DEPENDS espresso::core)
diff --git a/src/core/unit_tests/CMakeLists.txt b/src/core/unit_tests/CMakeLists.txt
index 60fcadae76..92d56587c4 100644
--- a/src/core/unit_tests/CMakeLists.txt
+++ b/src/core/unit_tests/CMakeLists.txt
@@ -19,68 +19,59 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
+include(espresso_unit_test)
 
-# Add tests here
-unit_test(NAME RuntimeError_test SRC RuntimeError_test.cpp DEPENDS
-          Boost::serialization)
-unit_test(NAME RuntimeErrorCollector_test SRC RuntimeErrorCollector_test.cpp
-          DEPENDS espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
-unit_test(NAME EspressoSystemStandAlone_parallel_test SRC
-          EspressoSystemStandAlone_test.cpp DEPENDS espresso::core Boost::mpi
-          MPI::MPI_CXX NUM_PROC 4)
-unit_test(NAME EspressoSystemStandAlone_serial_test SRC
-          EspressoSystemStandAlone_test.cpp DEPENDS espresso::core Boost::mpi
-          MPI::MPI_CXX NUM_PROC 1)
-unit_test(NAME EspressoSystem_test SRC EspressoSystem_test.cpp DEPENDS
-          espresso::core Boost::mpi NUM_PROC 2)
-unit_test(NAME ResourceCleanup_test SRC ResourceCleanup_test.cpp DEPENDS
-          espresso::core Boost::mpi NUM_PROC 2)
-unit_test(NAME MpiCallbacks_test SRC MpiCallbacks_test.cpp DEPENDS
-          espresso::utils Boost::mpi MPI::MPI_CXX NUM_PROC 2)
-unit_test(NAME ParticleIterator_test SRC ParticleIterator_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME link_cell_test SRC link_cell_test.cpp DEPENDS espresso::utils)
-unit_test(NAME Particle_test SRC Particle_test.cpp DEPENDS espresso::utils
-          Boost::serialization)
-unit_test(NAME Particle_serialization_test SRC Particle_serialization_test.cpp
-          DEPENDS espresso::utils Boost::serialization)
-unit_test(NAME rotation_test SRC rotation_test.cpp DEPENDS espresso::utils
-          espresso::core)
-unit_test(NAME field_coupling_couplings SRC field_coupling_couplings_test.cpp
-          DEPENDS espresso::utils)
-unit_test(NAME field_coupling_fields SRC field_coupling_fields_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME field_coupling_force_field SRC
-          field_coupling_force_field_test.cpp DEPENDS espresso::utils)
-unit_test(NAME periodic_fold_test SRC periodic_fold_test.cpp)
-unit_test(NAME grid_test SRC grid_test.cpp DEPENDS espresso::core)
-unit_test(NAME lees_edwards_test SRC lees_edwards_test.cpp DEPENDS
-          espresso::core)
-unit_test(NAME BoxGeometry_test SRC BoxGeometry_test.cpp DEPENDS espresso::core)
-unit_test(NAME LocalBox_test SRC LocalBox_test.cpp DEPENDS espresso::core)
-unit_test(NAME Verlet_list_test SRC Verlet_list_test.cpp DEPENDS espresso::core
-          NUM_PROC 4)
-unit_test(NAME VerletCriterion_test SRC VerletCriterion_test.cpp DEPENDS
-          espresso::core)
-unit_test(NAME thermostats_test SRC thermostats_test.cpp DEPENDS espresso::core)
-unit_test(NAME random_test SRC random_test.cpp DEPENDS espresso::utils
-          Random123)
-unit_test(NAME BondList_test SRC BondList_test.cpp DEPENDS espresso::core)
-unit_test(NAME energy_test SRC energy_test.cpp DEPENDS espresso::core)
-unit_test(NAME bonded_interactions_map_test SRC
-          bonded_interactions_map_test.cpp DEPENDS espresso::core)
-unit_test(NAME bond_breakage_test SRC bond_breakage_test.cpp DEPENDS
-          espresso::core)
+espresso_unit_test(SRC RuntimeError_test.cpp DEPENDS Boost::serialization)
+espresso_unit_test(SRC RuntimeErrorCollector_test.cpp DEPENDS espresso::core
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(
+  NAME EspressoSystemStandAlone_parallel_test SRC
+  EspressoSystemStandAlone_test.cpp DEPENDS espresso::core Boost::mpi
+  MPI::MPI_CXX NUM_PROC 4)
+espresso_unit_test(
+  NAME EspressoSystemStandAlone_serial_test SRC
+  EspressoSystemStandAlone_test.cpp DEPENDS espresso::core Boost::mpi
+  MPI::MPI_CXX NUM_PROC 1)
+espresso_unit_test(SRC EspressoSystem_test.cpp DEPENDS espresso::core
+                   Boost::mpi NUM_PROC 2)
+espresso_unit_test(SRC ResourceCleanup_test.cpp DEPENDS espresso::core
+                   Boost::mpi NUM_PROC 2)
+espresso_unit_test(SRC MpiCallbacks_test.cpp DEPENDS espresso::utils Boost::mpi
+                   MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC ParticleIterator_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC link_cell_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC Particle_test.cpp DEPENDS espresso::utils
+                   Boost::serialization)
+espresso_unit_test(SRC Particle_serialization_test.cpp DEPENDS espresso::utils
+                   Boost::serialization)
+espresso_unit_test(SRC rotation_test.cpp DEPENDS espresso::utils espresso::core)
+espresso_unit_test(SRC field_coupling_couplings_test.cpp DEPENDS
+                   espresso::utils)
+espresso_unit_test(SRC field_coupling_fields_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC field_coupling_force_field_test.cpp DEPENDS
+                   espresso::utils)
+espresso_unit_test(SRC periodic_fold_test.cpp)
+espresso_unit_test(SRC grid_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC lees_edwards_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC BoxGeometry_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC LocalBox_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC Verlet_list_test.cpp DEPENDS espresso::core NUM_PROC 4)
+espresso_unit_test(SRC VerletCriterion_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC thermostats_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC random_test.cpp DEPENDS espresso::utils Random123)
+espresso_unit_test(SRC BondList_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC energy_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC bonded_interactions_map_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC bond_breakage_test.cpp DEPENDS espresso::core)
 if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
   # AppleClang doesn't implement C++17's mathematical special functions
-  unit_test(NAME specfunc_test SRC specfunc_test.cpp DEPENDS espresso::utils
-            espresso::core)
+  espresso_unit_test(SRC specfunc_test.cpp DEPENDS espresso::utils
+                     espresso::core)
 endif()
-unit_test(NAME lb_particle_coupling_test SRC lb_particle_coupling_test.cpp
-          DEPENDS espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
-unit_test(NAME ek_interface_test SRC ek_interface_test.cpp DEPENDS
-          espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC lb_particle_coupling_test.cpp DEPENDS espresso::core
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC ek_interface_test.cpp DEPENDS espresso::core Boost::mpi
+                   MPI::MPI_CXX NUM_PROC 2)
 if(ESPRESSO_BUILD_WITH_WALBERLA)
   target_link_libraries(
     lb_particle_coupling_test PRIVATE espresso::walberla
@@ -90,15 +81,12 @@ if(ESPRESSO_BUILD_WITH_WALBERLA)
 endif()
 
 if(ESPRESSO_BUILD_WITH_FFTW)
-  unit_test(NAME p3m_test SRC p3m_test.cpp DEPENDS espresso::utils
-            espresso::core)
-  unit_test(NAME fft_test SRC fft_test.cpp DEPENDS espresso::utils
-            espresso::core)
-  unit_test(NAME math_test SRC math_test.cpp DEPENDS espresso::utils
-            espresso::core)
+  espresso_unit_test(SRC p3m_test.cpp DEPENDS espresso::utils espresso::core)
+  espresso_unit_test(SRC fft_test.cpp DEPENDS espresso::utils espresso::core)
+  espresso_unit_test(SRC math_test.cpp DEPENDS espresso::utils espresso::core)
 endif()
 
 if(ESPRESSO_BUILD_WITH_CUDA)
-  unit_test(NAME cuda_test SRC cuda_test.cu DEPENDS espresso::utils
-            espresso::core espresso::cuda)
+  espresso_unit_test(SRC cuda_test.cu DEPENDS espresso::utils espresso::core
+                     espresso::cuda)
 endif()
diff --git a/src/particle_observables/tests/CMakeLists.txt b/src/particle_observables/tests/CMakeLists.txt
index 244c47bfab..241c6d12a6 100644
--- a/src/particle_observables/tests/CMakeLists.txt
+++ b/src/particle_observables/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2020-2022 The ESPResSo project
+# Copyright (C) 2020-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,11 +17,11 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
+include(espresso_unit_test)
 
-unit_test(NAME properties_test SRC properties.cpp DEPENDS
-          espresso::particle_observables)
-unit_test(NAME algorithms_test SRC algorithms.cpp DEPENDS
-          espresso::particle_observables)
-unit_test(NAME observables_test SRC observables.cpp DEPENDS
-          espresso::particle_observables)
+espresso_unit_test(SRC properties_test.cpp DEPENDS
+                   espresso::particle_observables)
+espresso_unit_test(SRC algorithms_test.cpp DEPENDS
+                   espresso::particle_observables)
+espresso_unit_test(SRC observables_test.cpp DEPENDS
+                   espresso::particle_observables)
diff --git a/src/particle_observables/tests/algorithms.cpp b/src/particle_observables/tests/algorithms_test.cpp
similarity index 100%
rename from src/particle_observables/tests/algorithms.cpp
rename to src/particle_observables/tests/algorithms_test.cpp
diff --git a/src/particle_observables/tests/observables.cpp b/src/particle_observables/tests/observables_test.cpp
similarity index 100%
rename from src/particle_observables/tests/observables.cpp
rename to src/particle_observables/tests/observables_test.cpp
diff --git a/src/particle_observables/tests/properties.cpp b/src/particle_observables/tests/properties_test.cpp
similarity index 100%
rename from src/particle_observables/tests/properties.cpp
rename to src/particle_observables/tests/properties_test.cpp
diff --git a/src/script_interface/tests/CMakeLists.txt b/src/script_interface/tests/CMakeLists.txt
index 399e44255a..dc34f91e1b 100644
--- a/src/script_interface/tests/CMakeLists.txt
+++ b/src/script_interface/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2020-2022 The ESPResSo project
+# Copyright (C) 2020-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,46 +17,42 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
+include(espresso_unit_test)
 
-unit_test(NAME ObjectHandle_test SRC ObjectHandle_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME AutoParameters_test SRC AutoParameters_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME AutoParameter_test SRC AutoParameter_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME Variant_test SRC Variant_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME get_value_test SRC get_value_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME None_test SRC None_test.cpp DEPENDS espresso::script_interface)
-unit_test(NAME reduction_test SRC reduction_test.cpp DEPENDS
-          espresso::script_interface Boost::mpi MPI::MPI_CXX NUM_PROC 4)
-unit_test(NAME LocalContext_test SRC LocalContext_test.cpp DEPENDS
-          espresso::script_interface Boost::mpi MPI::MPI_CXX NUM_PROC 1)
-unit_test(NAME GlobalContext_test SRC GlobalContext_test.cpp DEPENDS
-          espresso::script_interface Boost::mpi MPI::MPI_CXX NUM_PROC 2)
-unit_test(NAME Exception_test SRC Exception_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME ParallelExceptionHandler_test SRC
-          ParallelExceptionHandler_test.cpp DEPENDS espresso::script_interface
-          espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
-unit_test(NAME packed_variant_test SRC packed_variant_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME ObjectList_test SRC ObjectList_test.cpp DEPENDS
-          espresso::script_interface espresso::core Boost::mpi)
-unit_test(NAME ObjectMap_test SRC ObjectMap_test.cpp DEPENDS
-          espresso::script_interface espresso::core Boost::mpi)
-unit_test(NAME Accumulators_test SRC Accumulators_test.cpp DEPENDS
-          espresso::script_interface espresso::core Boost::mpi MPI::MPI_CXX
-          NUM_PROC 2)
-unit_test(NAME Constraints_test SRC Constraints_test.cpp DEPENDS
-          espresso::script_interface espresso::core)
-unit_test(NAME Actors_test SRC Actors_test.cpp DEPENDS
-          espresso::script_interface espresso::core)
-unit_test(NAME ConstantpHEnsemble_test SRC ConstantpHEnsemble_test.cpp DEPENDS
-          espresso::core espresso::script_interface Boost::mpi MPI::MPI_CXX
-          NUM_PROC 2)
-unit_test(NAME ReactionEnsemble_test SRC ReactionEnsemble_test.cpp DEPENDS
-          espresso::core espresso::script_interface Boost::mpi MPI::MPI_CXX
-          NUM_PROC 2)
+espresso_unit_test(SRC ObjectHandle_test.cpp DEPENDS espresso::script_interface)
+espresso_unit_test(SRC AutoParameters_test.cpp DEPENDS
+                   espresso::script_interface)
+espresso_unit_test(SRC AutoParameter_test.cpp DEPENDS
+                   espresso::script_interface)
+espresso_unit_test(SRC Variant_test.cpp DEPENDS espresso::script_interface)
+espresso_unit_test(SRC get_value_test.cpp DEPENDS espresso::script_interface)
+espresso_unit_test(SRC None_test.cpp DEPENDS espresso::script_interface)
+espresso_unit_test(SRC reduction_test.cpp DEPENDS espresso::script_interface
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 4)
+espresso_unit_test(SRC LocalContext_test.cpp DEPENDS espresso::script_interface
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 1)
+espresso_unit_test(
+  SRC GlobalContext_test.cpp DEPENDS espresso::script_interface Boost::mpi
+  MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC Exception_test.cpp DEPENDS espresso::script_interface)
+espresso_unit_test(
+  SRC ParallelExceptionHandler_test.cpp DEPENDS espresso::script_interface
+  espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC packed_variant_test.cpp DEPENDS
+                   espresso::script_interface)
+espresso_unit_test(SRC ObjectList_test.cpp DEPENDS espresso::script_interface
+                   espresso::core Boost::mpi)
+espresso_unit_test(SRC ObjectMap_test.cpp DEPENDS espresso::script_interface
+                   espresso::core Boost::mpi)
+espresso_unit_test(SRC Accumulators_test.cpp DEPENDS espresso::script_interface
+                   espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC Constraints_test.cpp DEPENDS espresso::script_interface
+                   espresso::core)
+espresso_unit_test(SRC Actors_test.cpp DEPENDS espresso::script_interface
+                   espresso::core)
+espresso_unit_test(
+  SRC ConstantpHEnsemble_test.cpp DEPENDS espresso::core
+  espresso::script_interface Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(
+  SRC ReactionEnsemble_test.cpp DEPENDS espresso::core
+  espresso::script_interface Boost::mpi MPI::MPI_CXX NUM_PROC 2)
diff --git a/src/script_interface/thermostat/thermostat.hpp b/src/script_interface/thermostat/thermostat.hpp
index 1f5c223f01..57318778f1 100644
--- a/src/script_interface/thermostat/thermostat.hpp
+++ b/src/script_interface/thermostat/thermostat.hpp
@@ -612,6 +612,12 @@ class Thermostat : public AutoParameters<Thermostat, System::Leaf> {
 
   Variant do_call_method(std::string const &name,
                          VariantMap const &params) override {
+    if (params.contains("act_on_virtual")) {
+      context()->parallel_try_catch([&]() {
+        throw std::runtime_error(
+            name + "() got an unexpected keyword argument 'act_on_virtual'");
+      });
+    }
     if (name == "set_langevin") {
       setup_thermostat(langevin, params);
       return {};
diff --git a/src/shapes/unit_tests/CMakeLists.txt b/src/shapes/unit_tests/CMakeLists.txt
index 61480e1a23..5123a75d2c 100644
--- a/src/shapes/unit_tests/CMakeLists.txt
+++ b/src/shapes/unit_tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2016-2022 The ESPResSo project
+# Copyright (C) 2016-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,16 +17,14 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
-unit_test(NAME Wall_test SRC Wall_test.cpp DEPENDS espresso::shapes
-          espresso::utils)
-unit_test(NAME HollowConicalFrustum_test SRC HollowConicalFrustum_test.cpp
-          DEPENDS espresso::shapes espresso::utils)
-unit_test(NAME Union_test SRC Union_test.cpp DEPENDS espresso::shapes
-          espresso::utils)
-unit_test(NAME Ellipsoid_test SRC Ellipsoid_test.cpp DEPENDS espresso::shapes
-          espresso::utils)
-unit_test(NAME Sphere_test SRC Sphere_test.cpp DEPENDS espresso::shapes
-          espresso::utils)
-unit_test(NAME NoWhere_test SRC NoWhere_test.cpp DEPENDS espresso::shapes
-          espresso::utils)
+include(espresso_unit_test)
+
+espresso_unit_test(SRC Wall_test.cpp DEPENDS espresso::shapes espresso::utils)
+espresso_unit_test(SRC HollowConicalFrustum_test.cpp DEPENDS espresso::shapes
+                   espresso::utils)
+espresso_unit_test(SRC Union_test.cpp DEPENDS espresso::shapes espresso::utils)
+espresso_unit_test(SRC Ellipsoid_test.cpp DEPENDS espresso::shapes
+                   espresso::utils)
+espresso_unit_test(SRC Sphere_test.cpp DEPENDS espresso::shapes espresso::utils)
+espresso_unit_test(SRC NoWhere_test.cpp DEPENDS espresso::shapes
+                   espresso::utils)
diff --git a/src/utils/tests/CMakeLists.txt b/src/utils/tests/CMakeLists.txt
index 89755749fa..c7f954e78b 100644
--- a/src/utils/tests/CMakeLists.txt
+++ b/src/utils/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2010-2022 The ESPResSo project
+# Copyright (C) 2010-2024 The ESPResSo project
 # Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
 #   Max-Planck-Institute for Polymer Research, Theory Group
 #
@@ -19,92 +19,76 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
+include(espresso_unit_test)
 
-unit_test(NAME Vector_test SRC Vector_test.cpp DEPENDS espresso::utils)
-unit_test(NAME Factory_test SRC Factory_test.cpp DEPENDS espresso::utils)
-unit_test(NAME NumeratedContainer_test SRC NumeratedContainer_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME keys_test SRC keys_test.cpp DEPENDS espresso::utils)
-unit_test(NAME Cache_test SRC Cache_test.cpp DEPENDS espresso::utils)
-unit_test(NAME histogram SRC histogram.cpp DEPENDS espresso::utils)
-unit_test(NAME accumulator SRC accumulator.cpp DEPENDS espresso::utils
-          Boost::serialization)
-unit_test(NAME int_pow SRC int_pow_test.cpp DEPENDS espresso::utils)
-unit_test(NAME sgn SRC sgn_test.cpp DEPENDS espresso::utils)
-unit_test(NAME AS_erfc_part SRC AS_erfc_part_test.cpp DEPENDS espresso::utils)
-unit_test(NAME permute_ifield_test SRC permute_ifield_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME vec_rotate SRC vec_rotate_test.cpp DEPENDS espresso::utils)
-unit_test(NAME tensor_product SRC tensor_product_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME linear_interpolation SRC linear_interpolation_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME interpolation_gradient SRC interpolation_gradient_test.cpp
-          DEPENDS espresso::utils)
-unit_test(NAME interpolation SRC interpolation_test.cpp DEPENDS espresso::utils)
-unit_test(NAME bspline_test SRC bspline_test.cpp DEPENDS espresso::utils)
-unit_test(NAME matrix_vector_product SRC matrix_vector_product.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME index_test SRC index_test.cpp DEPENDS espresso::utils)
-unit_test(NAME tuple_test SRC tuple_test.cpp DEPENDS espresso::utils)
-unit_test(NAME Array_test SRC Array_test.cpp DEPENDS Boost::serialization
-          espresso::utils)
-unit_test(NAME contains_test SRC contains_test.cpp DEPENDS espresso::utils)
-unit_test(NAME Counter_test SRC Counter_test.cpp DEPENDS espresso::utils
-          Boost::serialization)
-unit_test(NAME RunningAverage_test SRC RunningAverage_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME for_each_pair_test SRC for_each_pair_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME raster_test SRC raster_test.cpp DEPENDS espresso::utils)
-unit_test(NAME make_lin_space_test SRC make_lin_space_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME sampling_test SRC sampling_test.cpp DEPENDS espresso::utils)
-unit_test(NAME coordinate_transformation_test SRC coordinate_transformation.cpp
-          DEPENDS espresso::utils)
-unit_test(NAME cylindrical_transformation_test SRC
-          cylindrical_transformation.cpp DEPENDS espresso::utils)
-unit_test(NAME rotation_matrix_test SRC rotation_matrix_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME quaternion_test SRC quaternion_test.cpp DEPENDS espresso::utils)
-unit_test(NAME mask_test SRC mask_test.cpp DEPENDS espresso::utils)
-unit_test(NAME type_traits_test SRC type_traits_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME uniform_test SRC uniform_test.cpp DEPENDS espresso::utils)
-unit_test(NAME memcpy_archive_test SRC memcpy_archive_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME triangle_functions_test SRC triangle_functions_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME Bag_test SRC Bag_test.cpp DEPENDS espresso::utils
-          Boost::serialization)
-unit_test(NAME integral_parameter_test SRC integral_parameter_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME flatten_test SRC flatten_test.cpp DEPENDS espresso::utils)
-unit_test(NAME pack_test SRC pack_test.cpp DEPENDS Boost::serialization
-          espresso::utils)
-unit_test(NAME unordered_map_test SRC unordered_map_test.cpp DEPENDS
-          Boost::serialization espresso::utils)
-unit_test(NAME u32_to_u64_test SRC u32_to_u64_test.cpp DEPENDS espresso::utils
-          NUM_PROC 1)
-unit_test(NAME gather_buffer_test SRC gather_buffer_test.cpp DEPENDS
-          espresso::utils::mpi Boost::mpi MPI::MPI_CXX NUM_PROC 4)
-unit_test(NAME scatter_buffer_test SRC scatter_buffer_test.cpp DEPENDS
-          espresso::utils::mpi Boost::mpi MPI::MPI_CXX NUM_PROC 4)
-unit_test(NAME all_compare_test SRC all_compare_test.cpp DEPENDS
-          espresso::utils::mpi Boost::mpi MPI::MPI_CXX NUM_PROC 3)
-unit_test(NAME gatherv_test SRC gatherv_test.cpp DEPENDS espresso::utils::mpi
-          Boost::mpi MPI::MPI_CXX NUM_PROC 3)
-unit_test(NAME iall_gatherv_test SRC iall_gatherv_test.cpp DEPENDS
-          espresso::utils::mpi Boost::mpi MPI::MPI_CXX NUM_PROC 3)
-unit_test(NAME sendrecv_test SRC sendrecv_test.cpp DEPENDS espresso::utils::mpi
-          Boost::mpi MPI::MPI_CXX espresso::utils NUM_PROC 3)
-unit_test(NAME serialization_test SRC serialization_test.cpp DEPENDS
-          espresso::utils Boost::serialization Boost::mpi MPI::MPI_CXX NUM_PROC
-          1)
-unit_test(NAME matrix_test SRC matrix_test.cpp DEPENDS espresso::utils
-          Boost::serialization NUM_PROC 1)
-unit_test(NAME orthonormal_vec_test SRC orthonormal_vec_test.cpp DEPENDS
-          espresso::utils Boost::serialization NUM_PROC 1)
-unit_test(NAME reduce_optional_test SRC reduce_optional_test.cpp DEPENDS
-          espresso::utils::mpi Boost::mpi MPI::MPI_CXX NUM_PROC 4)
+espresso_unit_test(SRC Vector_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC Factory_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC NumeratedContainer_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC keys_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC Cache_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC histogram_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC accumulator_test.cpp DEPENDS espresso::utils
+                   Boost::serialization)
+espresso_unit_test(SRC int_pow_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC sgn_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC AS_erfc_part_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC permute_ifield_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC vec_rotate_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC tensor_product_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC linear_interpolation_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC interpolation_gradient_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC interpolation_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC bspline_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC matrix_vector_product_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC index_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC tuple_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC Array_test.cpp DEPENDS Boost::serialization
+                   espresso::utils)
+espresso_unit_test(SRC contains_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC Counter_test.cpp DEPENDS espresso::utils
+                   Boost::serialization)
+espresso_unit_test(SRC RunningAverage_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC for_each_pair_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC raster_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC make_lin_space_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC sampling_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC coordinate_transformation_test.cpp DEPENDS
+                   espresso::utils)
+espresso_unit_test(SRC cylindrical_transformation_test.cpp DEPENDS
+                   espresso::utils)
+espresso_unit_test(SRC rotation_matrix_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC quaternion_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC mask_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC type_traits_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC uniform_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC memcpy_archive_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC triangle_functions_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC Bag_test.cpp DEPENDS espresso::utils
+                   Boost::serialization)
+espresso_unit_test(SRC integral_parameter_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC flatten_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC pack_test.cpp DEPENDS Boost::serialization
+                   espresso::utils)
+espresso_unit_test(SRC unordered_map_test.cpp DEPENDS Boost::serialization
+                   espresso::utils)
+espresso_unit_test(SRC u32_to_u64_test.cpp DEPENDS espresso::utils NUM_PROC 1)
+espresso_unit_test(SRC gather_buffer_test.cpp DEPENDS espresso::utils::mpi
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 4)
+espresso_unit_test(SRC scatter_buffer_test.cpp DEPENDS espresso::utils::mpi
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 4)
+espresso_unit_test(SRC all_compare_test.cpp DEPENDS espresso::utils::mpi
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 3)
+espresso_unit_test(SRC gatherv_test.cpp DEPENDS espresso::utils::mpi Boost::mpi
+                   MPI::MPI_CXX NUM_PROC 3)
+espresso_unit_test(SRC iall_gatherv_test.cpp DEPENDS espresso::utils::mpi
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 3)
+espresso_unit_test(SRC sendrecv_test.cpp DEPENDS espresso::utils::mpi
+                   Boost::mpi MPI::MPI_CXX espresso::utils NUM_PROC 3)
+espresso_unit_test(SRC serialization_test.cpp DEPENDS espresso::utils
+                   Boost::serialization Boost::mpi MPI::MPI_CXX NUM_PROC 1)
+espresso_unit_test(SRC matrix_test.cpp DEPENDS espresso::utils
+                   Boost::serialization NUM_PROC 1)
+espresso_unit_test(SRC orthonormal_vec_test.cpp DEPENDS espresso::utils
+                   Boost::serialization NUM_PROC 1)
+espresso_unit_test(SRC reduce_optional_test.cpp DEPENDS espresso::utils::mpi
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 4)
diff --git a/src/utils/tests/accumulator.cpp b/src/utils/tests/accumulator_test.cpp
similarity index 100%
rename from src/utils/tests/accumulator.cpp
rename to src/utils/tests/accumulator_test.cpp
diff --git a/src/utils/tests/coordinate_transformation.cpp b/src/utils/tests/coordinate_transformation_test.cpp
similarity index 100%
rename from src/utils/tests/coordinate_transformation.cpp
rename to src/utils/tests/coordinate_transformation_test.cpp
diff --git a/src/utils/tests/cylindrical_transformation.cpp b/src/utils/tests/cylindrical_transformation_test.cpp
similarity index 100%
rename from src/utils/tests/cylindrical_transformation.cpp
rename to src/utils/tests/cylindrical_transformation_test.cpp
diff --git a/src/utils/tests/histogram.cpp b/src/utils/tests/histogram_test.cpp
similarity index 100%
rename from src/utils/tests/histogram.cpp
rename to src/utils/tests/histogram_test.cpp
diff --git a/src/utils/tests/matrix_vector_product.cpp b/src/utils/tests/matrix_vector_product_test.cpp
similarity index 100%
rename from src/utils/tests/matrix_vector_product.cpp
rename to src/utils/tests/matrix_vector_product_test.cpp
diff --git a/src/walberla_bridge/CMakeLists.txt b/src/walberla_bridge/CMakeLists.txt
index 090f0c7337..6b2da504a0 100644
--- a/src/walberla_bridge/CMakeLists.txt
+++ b/src/walberla_bridge/CMakeLists.txt
@@ -23,6 +23,12 @@ target_link_libraries(
   espresso_walberla_cpp_flags
   INTERFACE espresso::cpp_flags
             $<$<BOOL:${ESPRESSO_BUILD_WITH_WALBERLA_AVX}>:espresso::avx_flags>)
+add_library(espresso_walberla_cuda_flags INTERFACE)
+add_library(espresso::walberla::cuda_flags ALIAS espresso_walberla_cuda_flags)
+target_link_libraries(
+  espresso_walberla_cuda_flags
+  INTERFACE espresso::cuda_flags
+            $<$<BOOL:${ESPRESSO_BUILD_WITH_WALBERLA_AVX}>:espresso::avx_flags>)
 
 function(espresso_configure_walberla_target)
   set(TARGET_NAME ${ARGV0})
diff --git a/src/walberla_bridge/src/BoundaryHandling.hpp b/src/walberla_bridge/src/BoundaryHandling.hpp
index bf82044cbd..d8b2082460 100644
--- a/src/walberla_bridge/src/BoundaryHandling.hpp
+++ b/src/walberla_bridge/src/BoundaryHandling.hpp
@@ -83,7 +83,7 @@ template <typename T, typename BoundaryClass> class BoundaryHandling {
       m_value_boundary->erase(global);
     }
 
-    [[nodiscard]] auto
+    [[nodiscard]] auto &
     get_node_boundary_value(Utils::Vector3i const &node) const {
       auto const global = Cell(node[0], node[1], node[2]);
       return get_value(global);
@@ -98,7 +98,7 @@ template <typename T, typename BoundaryClass> class BoundaryHandling {
     std::shared_ptr<std::unordered_map<Cell, T>> m_value_boundary;
     static constexpr T default_value{};
 
-    [[nodiscard]] T get_value(Cell const &cell) const {
+    [[nodiscard]] T const &get_value(Cell const &cell) const {
       if (m_value_boundary->count(cell) == 0) {
         return default_value;
       }
@@ -122,8 +122,8 @@ template <typename T, typename BoundaryClass> class BoundaryHandling {
       : m_blocks(std::move(blocks)), m_flag_field_id(flag_field_id),
         m_callback(DynamicValueCallback()), m_pending_changes(false) {
     // reinitialize the flag field
-    for (auto b = m_blocks->begin(); b != m_blocks->end(); ++b) {
-      flag_reset_kernel(&*b);
+    for (auto block = m_blocks->begin(); block != m_blocks->end(); ++block) {
+      flag_reset_kernel(block->template getData<FlagField>(m_flag_field_id));
     }
     // instantiate the boundary sweep
     std::function callback = m_callback;
@@ -137,7 +137,7 @@ template <typename T, typename BoundaryClass> class BoundaryHandling {
     return m_callback.node_is_boundary(node);
   }
 
-  [[nodiscard]] auto
+  [[nodiscard]] auto &
   get_node_value_at_boundary(Utils::Vector3i const &node) const {
     return m_callback.get_node_boundary_value(node);
   }
@@ -185,8 +185,7 @@ template <typename T, typename BoundaryClass> class BoundaryHandling {
   bool m_pending_changes;
 
   /** Register flags and reset all cells. */
-  void flag_reset_kernel(IBlock *const block) {
-    auto flag_field = block->template getData<FlagField>(m_flag_field_id);
+  void flag_reset_kernel(FlagField *flag_field) {
     // register flags
     if (!flag_field->flagExists(Domain_flag))
       flag_field->registerFlag(Domain_flag);
diff --git a/src/walberla_bridge/src/lattice_boltzmann/CMakeLists.txt b/src/walberla_bridge/src/lattice_boltzmann/CMakeLists.txt
index 3a2c214c47..891cc7aa2b 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/CMakeLists.txt
+++ b/src/walberla_bridge/src/lattice_boltzmann/CMakeLists.txt
@@ -19,10 +19,8 @@
 
 add_subdirectory(generated_kernels)
 
-target_sources(espresso_walberla
-               PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/lb_walberla_init.cpp)
+target_sources(espresso_walberla PRIVATE lb_walberla_init.cpp)
 
 if(ESPRESSO_BUILD_WITH_CUDA AND WALBERLA_BUILD_WITH_CUDA)
-  target_sources(espresso_walberla_cuda
-                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/lb_walberla_init.cu)
+  target_sources(espresso_walberla_cuda PRIVATE lb_walberla_init.cu)
 endif()
diff --git a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
index 5c1e496269..98b0679f4b 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2023 The ESPResSo project
+ * Copyright (C) 2019-2024 The ESPResSo project
  *
  * This file is part of ESPResSo.
  *
@@ -226,6 +226,13 @@ class LBWalberlaImpl : public LBWalberlaBase {
     }
   }
 
+  void pressure_tensor_correction(std::span<FloatType, 9ul> tensor) const {
+    auto const revert_factor = pressure_tensor_correction_factor();
+    for (auto const i : {1u, 2u, 3u, 5u, 6u, 7u}) {
+      tensor[i] *= revert_factor;
+    }
+  }
+
   class interpolation_illegal_access : public std::runtime_error {
   public:
     explicit interpolation_illegal_access(std::string const &field,
@@ -349,11 +356,11 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if constexpr (Architecture == lbmpy::Arch::CPU) {
 #ifdef ESPRESSO_BUILD_WITH_AVX_KERNELS
 #if defined(__AVX512F__)
-      constexpr uint_t alignment = 64;
+      constexpr uint_t alignment = 64u;
 #elif defined(__AVX__)
-      constexpr uint_t alignment = 32;
+      constexpr uint_t alignment = 32u;
 #elif defined(__SSE__)
-      constexpr uint_t alignment = 16;
+      constexpr uint_t alignment = 16u;
 #else
 #error "Unsupported arch, check walberla src/field/allocation/FieldAllocator.h"
 #endif
@@ -690,8 +697,8 @@ class LBWalberlaImpl : public LBWalberlaBase {
     auto force_field =
         bc->block->template getData<VectorField>(m_last_applied_force_field_id);
     auto const vel = to_vector3<FloatType>(v);
-    lbm::accessor::Velocity::set(pdf_field, force_field, vel, bc->cell);
-    lbm::accessor::Vector::set(vel_field, vel, bc->cell);
+    lbm::accessor::Velocity::set(pdf_field, vel_field, force_field, vel,
+                                 bc->cell);
 
     return true;
   }
@@ -705,25 +712,31 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto const &block = *(lattice.get_blocks()->begin());
       auto const field =
           block.template getData<VectorField>(m_velocity_field_id);
+      auto const values = lbm::accessor::Vector::get(field, *ci);
+      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
+      assert(values.size() == 3u * ci->numCells());
+      if constexpr (std::is_same_v<typename decltype(values)::value_type,
+                                   double>) {
+        out = std::move(values);
+      } else {
+        out = std::vector<double>(values.begin(), values.end());
+      }
       auto const local_offset = std::get<0>(lattice.get_local_grid_range());
       auto const lower_cell = ci->min();
       auto const upper_cell = ci->max();
-      out.reserve(ci->numCells());
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
+      auto it = out.begin();
       for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
         for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
           for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
             auto const node = local_offset + Utils::Vector3i{{x, y, z}};
             if (m_boundary->node_is_boundary(node)) {
-              auto const vec = m_boundary->get_node_value_at_boundary(node);
+              auto const &vec = m_boundary->get_node_value_at_boundary(node);
               for (uint_t f = 0u; f < 3u; ++f) {
-                out.emplace_back(double_c(vec[f]));
+                (*it) = double_c(vec[f]);
+                std::advance(it, 1l);
               }
             } else {
-              auto const vec = lbm::accessor::Vector::get(field, Cell{x, y, z});
-              for (uint_t f = 0u; f < 3u; ++f) {
-                out.emplace_back(double_c(vec[f]));
-              }
+              std::advance(it, 3l);
             }
           }
         }
@@ -738,30 +751,15 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
       auto const &lattice = get_lattice();
       auto &block = *(lattice.get_blocks()->begin());
-      // We have to set both, the pdf and the stored velocity field
       auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
       auto force_field =
           block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto it = velocity.begin();
-      assert(velocity.size() == 3u * ci->numCells());
+      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
       assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const cell = Cell{x, y, z};
-            Vector3<FloatType> vec;
-            for (uint_t f = 0u; f < 3u; ++f) {
-              vec[f] = FloatType_c(*it);
-              ++it;
-            }
-            lbm::accessor::Velocity::set(pdf_field, force_field, vec, cell);
-            lbm::accessor::Vector::set(vel_field, vec, cell);
-          }
-        }
-      }
+      assert(velocity.size() == 3u * ci->numCells());
+      std::vector<FloatType> const values(velocity.begin(), velocity.end());
+      lbm::accessor::Velocity::set(pdf_field, vel_field, force_field, values,
+                                   *ci);
     }
   }
 
@@ -954,10 +952,13 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if (!bc)
       return false;
 
-    auto field =
+    auto pdf_field = bc->block->template getData<PdfField>(m_pdf_field_id);
+    auto force_field =
         bc->block->template getData<VectorField>(m_last_applied_force_field_id);
+    auto vel_field =
+        bc->block->template getData<VectorField>(m_velocity_field_id);
     auto const vec = to_vector3<FloatType>(force);
-    lbm::accessor::Vector::set(field, vec, bc->cell);
+    lbm::accessor::Force::set(pdf_field, vel_field, force_field, vec, bc->cell);
 
     return true;
   }
@@ -971,22 +972,15 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto const &block = *(lattice.get_blocks()->begin());
       auto const field =
           block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto const n_values = 3u * ci->numCells();
-      out.reserve(n_values);
+      auto const values = lbm::accessor::Vector::get(field, *ci);
       assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const vec = lbm::accessor::Vector::get(field, Cell{x, y, z});
-            for (uint_t f = 0u; f < 3u; ++f) {
-              out.emplace_back(double_c(vec[f]));
-            }
-          }
-        }
+      assert(values.size() == 3u * ci->numCells());
+      if constexpr (std::is_same_v<typename decltype(values)::value_type,
+                                   double>) {
+        out = std::move(values);
+      } else {
+        out = std::vector<double>(values.begin(), values.end());
       }
-      assert(out.size() == n_values);
     }
     return out;
   }
@@ -997,25 +991,14 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
       auto const &lattice = get_lattice();
       auto &block = *(lattice.get_blocks()->begin());
-      auto field =
+      auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
+      auto force_field =
           block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto it = force.begin();
-      assert(force.size() == 3u * ci->numCells());
+      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
       assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            Vector3<FloatType> vec;
-            for (uint_t f = 0u; f < 3u; ++f) {
-              vec[f] = FloatType_c(*it);
-              ++it;
-            }
-            lbm::accessor::Vector::set(field, vec, Cell{x, y, z});
-          }
-        }
-      }
+      assert(force.size() == 3u * ci->numCells());
+      std::vector<FloatType> const values(force.begin(), force.end());
+      lbm::accessor::Force::set(pdf_field, vel_field, force_field, values, *ci);
     }
   }
 
@@ -1044,11 +1027,16 @@ class LBWalberlaImpl : public LBWalberlaBase {
       return false;
 
     auto pdf_field = bc->block->template getData<PdfField>(m_pdf_field_id);
+    auto force_field =
+        bc->block->template getData<VectorField>(m_last_applied_force_field_id);
+    auto vel_field =
+        bc->block->template getData<VectorField>(m_velocity_field_id);
     std::array<FloatType, Stencil::Size> pop;
     for (uint_t f = 0u; f < Stencil::Size; ++f) {
       pop[f] = FloatType_c(population[f]);
     }
-    lbm::accessor::Population::set(pdf_field, pop, bc->cell);
+    lbm::accessor::Population::set(pdf_field, vel_field, force_field, pop,
+                                   bc->cell);
 
     return true;
   }
@@ -1063,13 +1051,13 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto const pdf_field = block.template getData<PdfField>(m_pdf_field_id);
       auto const values = lbm::accessor::Population::get(pdf_field, *ci);
       assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
+      assert(values.size() == stencil_size() * ci->numCells());
       if constexpr (std::is_same_v<typename decltype(values)::value_type,
                                    double>) {
         out = std::move(values);
       } else {
         out = std::vector<double>(values.begin(), values.end());
       }
-      assert(out.size() == stencil_size() * ci->numCells());
     }
     return out;
   }
@@ -1081,10 +1069,14 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto const &lattice = get_lattice();
       auto &block = *(lattice.get_blocks()->begin());
       auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
+      auto force_field =
+          block.template getData<VectorField>(m_last_applied_force_field_id);
+      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
       assert(population.size() == stencil_size() * ci->numCells());
       assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
       std::vector<FloatType> const values(population.begin(), population.end());
-      lbm::accessor::Population::set(pdf_field, values, *ci);
+      lbm::accessor::Population::set(pdf_field, vel_field, force_field, values,
+                                     *ci);
     }
   }
 
@@ -1123,13 +1115,13 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto const pdf_field = block.template getData<PdfField>(m_pdf_field_id);
       auto const values = lbm::accessor::Density::get(pdf_field, *ci);
       assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
+      assert(values.size() == ci->numCells());
       if constexpr (std::is_same_v<typename decltype(values)::value_type,
                                    double>) {
         out = std::move(values);
       } else {
         out = std::vector<double>(values.begin(), values.end());
       }
-      assert(out.size() == ci->numCells());
     }
     return out;
   }
@@ -1316,24 +1308,18 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto const &lattice = get_lattice();
       auto const &block = *(lattice.get_blocks()->begin());
       auto const pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto const n_values = 9u * ci->numCells();
-      out.reserve(n_values);
+      auto values = lbm::accessor::PressureTensor::get(pdf_field, *ci);
       assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const cell = Cell{x, y, z};
-            auto tensor = lbm::accessor::PressureTensor::get(pdf_field, cell);
-            pressure_tensor_correction(tensor);
-            for (auto i = 0u; i < 9u; ++i) {
-              out.emplace_back(tensor[i]);
-            }
-          }
-        }
+      assert(values.size() == 9u * ci->numCells());
+      for (auto it = values.begin(); it != values.end(); std::advance(it, 9l)) {
+        pressure_tensor_correction(std::span<FloatType, 9ul>(it, 9ul));
+      }
+      if constexpr (std::is_same_v<typename decltype(values)::value_type,
+                                   double>) {
+        out = std::move(values);
+      } else {
+        out = std::vector<double>(values.begin(), values.end());
       }
-      assert(out.size() == n_values);
     }
     return out;
   }
@@ -1444,7 +1430,9 @@ class LBWalberlaImpl : public LBWalberlaBase {
   template <typename OutputType = float>
   class DensityVTKWriter : public VTKWriter<PdfField, 1u, OutputType> {
   public:
-    using VTKWriter<PdfField, 1u, OutputType>::VTKWriter;
+    using Base = VTKWriter<PdfField, 1u, OutputType>;
+    using Base::Base;
+    using Base::evaluate;
 
   protected:
     OutputType evaluate(cell_idx_t const x, cell_idx_t const y,
@@ -1459,7 +1447,9 @@ class LBWalberlaImpl : public LBWalberlaBase {
   template <typename OutputType = float>
   class VelocityVTKWriter : public VTKWriter<VectorField, 3u, OutputType> {
   public:
-    using VTKWriter<VectorField, 3u, OutputType>::VTKWriter;
+    using Base = VTKWriter<VectorField, 3u, OutputType>;
+    using Base::Base;
+    using Base::evaluate;
 
   protected:
     OutputType evaluate(cell_idx_t const x, cell_idx_t const y,
@@ -1474,11 +1464,14 @@ class LBWalberlaImpl : public LBWalberlaBase {
   template <typename OutputType = float>
   class PressureTensorVTKWriter : public VTKWriter<PdfField, 9u, OutputType> {
   public:
+    using Base = VTKWriter<PdfField, 9u, OutputType>;
+    using Base::Base;
+    using Base::evaluate;
+
     PressureTensorVTKWriter(ConstBlockDataID const &block_id,
                             std::string const &id, FloatType unit_conversion,
                             FloatType off_diag_factor)
-        : VTKWriter<PdfField, 9u, OutputType>::VTKWriter(block_id, id,
-                                                         unit_conversion),
+        : Base(block_id, id, unit_conversion),
           m_off_diag_factor(off_diag_factor) {}
 
   protected:
@@ -1504,19 +1497,20 @@ class LBWalberlaImpl : public LBWalberlaBase {
     } else {
       if (flag_observables & static_cast<int>(OutputVTK::density)) {
         auto const unit_conversion = FloatType_c(units.at("density"));
-        vtk_obj.addCellDataWriter(make_shared<DensityVTKWriter<float>>(
+        vtk_obj.addCellDataWriter(std::make_shared<DensityVTKWriter<float>>(
             m_pdf_field_id, "density", unit_conversion));
       }
       if (flag_observables & static_cast<int>(OutputVTK::velocity_vector)) {
         auto const unit_conversion = FloatType_c(units.at("velocity"));
-        vtk_obj.addCellDataWriter(make_shared<VelocityVTKWriter<float>>(
+        vtk_obj.addCellDataWriter(std::make_shared<VelocityVTKWriter<float>>(
             m_velocity_field_id, "velocity_vector", unit_conversion));
       }
       if (flag_observables & static_cast<int>(OutputVTK::pressure_tensor)) {
         auto const unit_conversion = FloatType_c(units.at("pressure"));
-        vtk_obj.addCellDataWriter(make_shared<PressureTensorVTKWriter<float>>(
-            m_pdf_field_id, "pressure_tensor", unit_conversion,
-            pressure_tensor_correction_factor()));
+        vtk_obj.addCellDataWriter(
+            std::make_shared<PressureTensorVTKWriter<float>>(
+                m_pdf_field_id, "pressure_tensor", unit_conversion,
+                pressure_tensor_correction_factor()));
       }
     }
   }
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h
index 32650c3e3c..c73cb58c14 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h
@@ -41,19 +41,18 @@
 
 #include <array>
 #include <cassert>
+#include <iterator>
 #include <tuple>
 #include <vector>
 
 #ifdef WALBERLA_CXX_COMPILER_IS_GNU
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 #ifdef WALBERLA_CXX_COMPILER_IS_CLANG
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
 #endif
 
 namespace walberla {
@@ -61,8 +60,8 @@ namespace lbm {
 namespace accessor {
 
 namespace Population {
-inline std::array<double, 19u>
-get(GhostLayerField<double, uint_t{19u}> const *pdf_field, Cell const &cell) {
+inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                Cell const &cell) {
   double const &xyz0 = pdf_field->get(cell, uint_t{0u});
   std::array<double, 19u> pop;
   pop[0u] = pdf_field->getF(&xyz0, uint_t{0u});
@@ -111,6 +110,54 @@ inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
   pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
 }
 
+inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
+                GhostLayerField<double, uint_t{3u}> *velocity_field,
+                GhostLayerField<double, uint_t{3u}> const *force_field,
+                std::array<double, 19u> const &pop, Cell const &cell) {
+  auto &xyz0 = pdf_field->get(cell, uint_t{0u});
+  const double f_0 = pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];
+  const double f_1 = pdf_field->getF(&xyz0, uint_t{1u}) = pop[1u];
+  const double f_2 = pdf_field->getF(&xyz0, uint_t{2u}) = pop[2u];
+  const double f_3 = pdf_field->getF(&xyz0, uint_t{3u}) = pop[3u];
+  const double f_4 = pdf_field->getF(&xyz0, uint_t{4u}) = pop[4u];
+  const double f_5 = pdf_field->getF(&xyz0, uint_t{5u}) = pop[5u];
+  const double f_6 = pdf_field->getF(&xyz0, uint_t{6u}) = pop[6u];
+  const double f_7 = pdf_field->getF(&xyz0, uint_t{7u}) = pop[7u];
+  const double f_8 = pdf_field->getF(&xyz0, uint_t{8u}) = pop[8u];
+  const double f_9 = pdf_field->getF(&xyz0, uint_t{9u}) = pop[9u];
+  const double f_10 = pdf_field->getF(&xyz0, uint_t{10u}) = pop[10u];
+  const double f_11 = pdf_field->getF(&xyz0, uint_t{11u}) = pop[11u];
+  const double f_12 = pdf_field->getF(&xyz0, uint_t{12u}) = pop[12u];
+  const double f_13 = pdf_field->getF(&xyz0, uint_t{13u}) = pop[13u];
+  const double f_14 = pdf_field->getF(&xyz0, uint_t{14u}) = pop[14u];
+  const double f_15 = pdf_field->getF(&xyz0, uint_t{15u}) = pop[15u];
+  const double f_16 = pdf_field->getF(&xyz0, uint_t{16u}) = pop[16u];
+  const double f_17 = pdf_field->getF(&xyz0, uint_t{17u}) = pop[17u];
+  const double f_18 = pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
+  const auto x = cell.x();
+  const auto y = cell.y();
+  const auto z = cell.z();
+  const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+  const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+  const double vel1Term = f_1 + f_11 + f_15 + f_7;
+  const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+  const double vel2Term = f_12 + f_13 + f_5;
+  const double momdensity_2 =
+      f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+  const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                     vel1Term + vel2Term;
+  const double md_0 =
+      force_field->get(x, y, z, 0) * 0.50000000000000000 + momdensity_0;
+  const double md_1 =
+      force_field->get(x, y, z, 1) * 0.50000000000000000 + momdensity_1;
+  const double md_2 =
+      force_field->get(x, y, z, 2) * 0.50000000000000000 + momdensity_2;
+  const auto rho_inv = double{1} / rho;
+  velocity_field->get(cell, uint_t{0u}) = md_0 * rho_inv;
+  velocity_field->get(cell, uint_t{1u}) = md_1 * rho_inv;
+  velocity_field->get(cell, uint_t{2u}) = md_2 * rho_inv;
+}
+
 inline void initialize(GhostLayerField<double, uint_t{19u}> *pdf_field,
                        std::array<double, 19u> const &pop) {
   WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdf_field, {
@@ -137,9 +184,8 @@ inline void initialize(GhostLayerField<double, uint_t{19u}> *pdf_field,
   });
 }
 
-inline std::vector<double>
-get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
-    CellInterval const &ci) {
+inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                CellInterval const &ci) {
   std::vector<double> out;
   out.reserve(ci.numCells() * uint_t(19u));
   for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
@@ -174,31 +220,86 @@ get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
 inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
                 std::vector<double> const &values, CellInterval const &ci) {
   assert(uint_c(values.size()) == ci.numCells() * uint_t(19u));
-  auto values_ptr = values.data();
+  auto pop = values.data();
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        double &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];
+        pdf_field->getF(&xyz0, uint_t{1u}) = pop[1u];
+        pdf_field->getF(&xyz0, uint_t{2u}) = pop[2u];
+        pdf_field->getF(&xyz0, uint_t{3u}) = pop[3u];
+        pdf_field->getF(&xyz0, uint_t{4u}) = pop[4u];
+        pdf_field->getF(&xyz0, uint_t{5u}) = pop[5u];
+        pdf_field->getF(&xyz0, uint_t{6u}) = pop[6u];
+        pdf_field->getF(&xyz0, uint_t{7u}) = pop[7u];
+        pdf_field->getF(&xyz0, uint_t{8u}) = pop[8u];
+        pdf_field->getF(&xyz0, uint_t{9u}) = pop[9u];
+        pdf_field->getF(&xyz0, uint_t{10u}) = pop[10u];
+        pdf_field->getF(&xyz0, uint_t{11u}) = pop[11u];
+        pdf_field->getF(&xyz0, uint_t{12u}) = pop[12u];
+        pdf_field->getF(&xyz0, uint_t{13u}) = pop[13u];
+        pdf_field->getF(&xyz0, uint_t{14u}) = pop[14u];
+        pdf_field->getF(&xyz0, uint_t{15u}) = pop[15u];
+        pdf_field->getF(&xyz0, uint_t{16u}) = pop[16u];
+        pdf_field->getF(&xyz0, uint_t{17u}) = pop[17u];
+        pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
+        std::advance(pop, 19);
+      }
+    }
+  }
+}
+
+inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
+                GhostLayerField<double, uint_t{3u}> *velocity_field,
+                GhostLayerField<double, uint_t{3u}> const *force_field,
+                std::vector<double> const &values, CellInterval const &ci) {
+  assert(uint_c(values.size()) == ci.numCells() * uint_t(19u));
+  auto pop = values.data();
   for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
     for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
       for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
         double &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
-        pdf_field->getF(&xyz0, uint_t{0u}) = values_ptr[0u];
-        pdf_field->getF(&xyz0, uint_t{1u}) = values_ptr[1u];
-        pdf_field->getF(&xyz0, uint_t{2u}) = values_ptr[2u];
-        pdf_field->getF(&xyz0, uint_t{3u}) = values_ptr[3u];
-        pdf_field->getF(&xyz0, uint_t{4u}) = values_ptr[4u];
-        pdf_field->getF(&xyz0, uint_t{5u}) = values_ptr[5u];
-        pdf_field->getF(&xyz0, uint_t{6u}) = values_ptr[6u];
-        pdf_field->getF(&xyz0, uint_t{7u}) = values_ptr[7u];
-        pdf_field->getF(&xyz0, uint_t{8u}) = values_ptr[8u];
-        pdf_field->getF(&xyz0, uint_t{9u}) = values_ptr[9u];
-        pdf_field->getF(&xyz0, uint_t{10u}) = values_ptr[10u];
-        pdf_field->getF(&xyz0, uint_t{11u}) = values_ptr[11u];
-        pdf_field->getF(&xyz0, uint_t{12u}) = values_ptr[12u];
-        pdf_field->getF(&xyz0, uint_t{13u}) = values_ptr[13u];
-        pdf_field->getF(&xyz0, uint_t{14u}) = values_ptr[14u];
-        pdf_field->getF(&xyz0, uint_t{15u}) = values_ptr[15u];
-        pdf_field->getF(&xyz0, uint_t{16u}) = values_ptr[16u];
-        pdf_field->getF(&xyz0, uint_t{17u}) = values_ptr[17u];
-        pdf_field->getF(&xyz0, uint_t{18u}) = values_ptr[18u];
-        values_ptr += 19u;
+        const double f_0 = pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];
+        const double f_1 = pdf_field->getF(&xyz0, uint_t{1u}) = pop[1u];
+        const double f_2 = pdf_field->getF(&xyz0, uint_t{2u}) = pop[2u];
+        const double f_3 = pdf_field->getF(&xyz0, uint_t{3u}) = pop[3u];
+        const double f_4 = pdf_field->getF(&xyz0, uint_t{4u}) = pop[4u];
+        const double f_5 = pdf_field->getF(&xyz0, uint_t{5u}) = pop[5u];
+        const double f_6 = pdf_field->getF(&xyz0, uint_t{6u}) = pop[6u];
+        const double f_7 = pdf_field->getF(&xyz0, uint_t{7u}) = pop[7u];
+        const double f_8 = pdf_field->getF(&xyz0, uint_t{8u}) = pop[8u];
+        const double f_9 = pdf_field->getF(&xyz0, uint_t{9u}) = pop[9u];
+        const double f_10 = pdf_field->getF(&xyz0, uint_t{10u}) = pop[10u];
+        const double f_11 = pdf_field->getF(&xyz0, uint_t{11u}) = pop[11u];
+        const double f_12 = pdf_field->getF(&xyz0, uint_t{12u}) = pop[12u];
+        const double f_13 = pdf_field->getF(&xyz0, uint_t{13u}) = pop[13u];
+        const double f_14 = pdf_field->getF(&xyz0, uint_t{14u}) = pop[14u];
+        const double f_15 = pdf_field->getF(&xyz0, uint_t{15u}) = pop[15u];
+        const double f_16 = pdf_field->getF(&xyz0, uint_t{16u}) = pop[16u];
+        const double f_17 = pdf_field->getF(&xyz0, uint_t{17u}) = pop[17u];
+        const double f_18 = pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
+        const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+        const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+        const double vel1Term = f_1 + f_11 + f_15 + f_7;
+        const double momdensity_1 =
+            -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+        const double vel2Term = f_12 + f_13 + f_5;
+        const double momdensity_2 =
+            f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+        const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 +
+                           vel0Term + vel1Term + vel2Term;
+        const double md_0 =
+            force_field->get(x, y, z, 0) * 0.50000000000000000 + momdensity_0;
+        const double md_1 =
+            force_field->get(x, y, z, 1) * 0.50000000000000000 + momdensity_1;
+        const double md_2 =
+            force_field->get(x, y, z, 2) * 0.50000000000000000 + momdensity_2;
+        const auto rho_inv = double{1} / rho;
+        velocity_field->get(x, y, z, uint_t{0u}) = md_0 * rho_inv;
+        velocity_field->get(x, y, z, uint_t{1u}) = md_1 * rho_inv;
+        velocity_field->get(x, y, z, uint_t{2u}) = md_2 * rho_inv;
+        std::advance(pop, 19);
       }
     }
   }
@@ -206,8 +307,8 @@ inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
 } // namespace Population
 
 namespace Vector {
-inline Vector3<double> get(GhostLayerField<double, uint_t{3u}> const *vec_field,
-                           Cell const &cell) {
+inline auto get(GhostLayerField<double, uint_t{3u}> const *vec_field,
+                Cell const &cell) {
   const double &xyz0 = vec_field->get(cell, uint_t{0u});
   Vector3<double> vec;
   vec[0] = vec_field->getF(&xyz0, uint_t{0u});
@@ -252,9 +353,8 @@ inline void add_to_all(GhostLayerField<double, uint_t{3u}> *vec_field,
   });
 }
 
-inline std::vector<double>
-get(GhostLayerField<double, uint_t{3u}> const *vec_field,
-    CellInterval const &ci) {
+inline auto get(GhostLayerField<double, uint_t{3u}> const *vec_field,
+                CellInterval const &ci) {
   std::vector<double> out;
   out.reserve(ci.numCells() * uint_t(3u));
   for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
@@ -281,7 +381,7 @@ inline void set(GhostLayerField<double, uint_t{3u}> *vec_field,
         vec_field->getF(&xyz0, uint_t{0u}) = values_ptr[0u];
         vec_field->getF(&xyz0, uint_t{1u}) = values_ptr[1u];
         vec_field->getF(&xyz0, uint_t{2u}) = values_ptr[2u];
-        values_ptr += 3u;
+        std::advance(values_ptr, 3);
       }
     }
   }
@@ -290,8 +390,8 @@ inline void set(GhostLayerField<double, uint_t{3u}> *vec_field,
 
 namespace EquilibriumDistribution {
 inline double get(stencil::Direction const direction,
-                  Vector3<double> const &u = Vector3<double>(double(0.0)),
-                  double rho = double(1.0)) {
+                  Vector3<double> const &u = Vector3<double>(double{0}),
+                  double rho = double{1}) {
 
   using namespace stencil;
   switch (direction) {
@@ -568,7 +668,7 @@ inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
                      vel1Term + vel2Term;
 
   // calculate current velocity (before density change)
-  const double conversion = double(1) / rho;
+  const double conversion = double{1} / rho;
   Vector3<double> velocity;
   velocity[0u] = momdensity_0 * conversion;
   velocity[1u] = momdensity_1 * conversion;
@@ -656,7 +756,7 @@ inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
                            vel0Term + vel1Term + vel2Term;
 
         // calculate current velocity (before density change)
-        const double conversion = double(1) / rho;
+        const double conversion = double{1} / rho;
         Vector3<double> velocity;
         velocity[0u] = momdensity_0 * conversion;
         velocity[1u] = momdensity_1 * conversion;
@@ -671,7 +771,108 @@ inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
 } // namespace Density
 
 namespace Velocity {
+inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                GhostLayerField<double, uint_t{3u}> const *force_field,
+                Cell const &cell) {
+  const double &xyz0 = pdf_field->get(cell, uint_t{0u});
+  const double f_0 = pdf_field->getF(&xyz0, uint_t{0u});
+  const double f_1 = pdf_field->getF(&xyz0, uint_t{1u});
+  const double f_2 = pdf_field->getF(&xyz0, uint_t{2u});
+  const double f_3 = pdf_field->getF(&xyz0, uint_t{3u});
+  const double f_4 = pdf_field->getF(&xyz0, uint_t{4u});
+  const double f_5 = pdf_field->getF(&xyz0, uint_t{5u});
+  const double f_6 = pdf_field->getF(&xyz0, uint_t{6u});
+  const double f_7 = pdf_field->getF(&xyz0, uint_t{7u});
+  const double f_8 = pdf_field->getF(&xyz0, uint_t{8u});
+  const double f_9 = pdf_field->getF(&xyz0, uint_t{9u});
+  const double f_10 = pdf_field->getF(&xyz0, uint_t{10u});
+  const double f_11 = pdf_field->getF(&xyz0, uint_t{11u});
+  const double f_12 = pdf_field->getF(&xyz0, uint_t{12u});
+  const double f_13 = pdf_field->getF(&xyz0, uint_t{13u});
+  const double f_14 = pdf_field->getF(&xyz0, uint_t{14u});
+  const double f_15 = pdf_field->getF(&xyz0, uint_t{15u});
+  const double f_16 = pdf_field->getF(&xyz0, uint_t{16u});
+  const double f_17 = pdf_field->getF(&xyz0, uint_t{17u});
+  const double f_18 = pdf_field->getF(&xyz0, uint_t{18u});
+  const auto x = cell.x();
+  const auto y = cell.y();
+  const auto z = cell.z();
+  const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+  const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+  const double vel1Term = f_1 + f_11 + f_15 + f_7;
+  const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+  const double vel2Term = f_12 + f_13 + f_5;
+  const double momdensity_2 =
+      f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+  const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                     vel1Term + vel2Term;
+  const double md_0 =
+      force_field->get(x, y, z, 0) * 0.50000000000000000 + momdensity_0;
+  const double md_1 =
+      force_field->get(x, y, z, 1) * 0.50000000000000000 + momdensity_1;
+  const double md_2 =
+      force_field->get(x, y, z, 2) * 0.50000000000000000 + momdensity_2;
+  const double rho_inv = double{1} / rho;
+
+  return Vector3<double>(md_0 * rho_inv, md_1 * rho_inv, md_2 * rho_inv);
+}
+
+inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                GhostLayerField<double, uint_t{3u}> const *force_field,
+                CellInterval const &ci) {
+  std::vector<double> out;
+  out.reserve(ci.numCells() * uint_t(3u));
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        const double &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        const double f_0 = pdf_field->getF(&xyz0, uint_t{0u});
+        const double f_1 = pdf_field->getF(&xyz0, uint_t{1u});
+        const double f_2 = pdf_field->getF(&xyz0, uint_t{2u});
+        const double f_3 = pdf_field->getF(&xyz0, uint_t{3u});
+        const double f_4 = pdf_field->getF(&xyz0, uint_t{4u});
+        const double f_5 = pdf_field->getF(&xyz0, uint_t{5u});
+        const double f_6 = pdf_field->getF(&xyz0, uint_t{6u});
+        const double f_7 = pdf_field->getF(&xyz0, uint_t{7u});
+        const double f_8 = pdf_field->getF(&xyz0, uint_t{8u});
+        const double f_9 = pdf_field->getF(&xyz0, uint_t{9u});
+        const double f_10 = pdf_field->getF(&xyz0, uint_t{10u});
+        const double f_11 = pdf_field->getF(&xyz0, uint_t{11u});
+        const double f_12 = pdf_field->getF(&xyz0, uint_t{12u});
+        const double f_13 = pdf_field->getF(&xyz0, uint_t{13u});
+        const double f_14 = pdf_field->getF(&xyz0, uint_t{14u});
+        const double f_15 = pdf_field->getF(&xyz0, uint_t{15u});
+        const double f_16 = pdf_field->getF(&xyz0, uint_t{16u});
+        const double f_17 = pdf_field->getF(&xyz0, uint_t{17u});
+        const double f_18 = pdf_field->getF(&xyz0, uint_t{18u});
+        const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+        const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+        const double vel1Term = f_1 + f_11 + f_15 + f_7;
+        const double momdensity_1 =
+            -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+        const double vel2Term = f_12 + f_13 + f_5;
+        const double momdensity_2 =
+            f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+        const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 +
+                           vel0Term + vel1Term + vel2Term;
+        const double md_0 =
+            force_field->get(x, y, z, 0) * 0.50000000000000000 + momdensity_0;
+        const double md_1 =
+            force_field->get(x, y, z, 1) * 0.50000000000000000 + momdensity_1;
+        const double md_2 =
+            force_field->get(x, y, z, 2) * 0.50000000000000000 + momdensity_2;
+        const double rho_inv = double{1} / rho;
+        out.emplace_back(md_0 * rho_inv);
+        out.emplace_back(md_1 * rho_inv);
+        out.emplace_back(md_2 * rho_inv);
+      }
+    }
+  }
+  return out;
+}
+
 inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
+                GhostLayerField<double, uint_t{3u}> *velocity_field,
                 GhostLayerField<double, uint_t{3u}> const *force_field,
                 Vector3<double> const &u, Cell const &cell) {
   const double &xyz0 = pdf_field->get(cell, uint_t{0u});
@@ -709,15 +910,183 @@ inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
       -force_field->get(x, y, z, 1) * 0.50000000000000000 / rho + u[1];
   const double u_2 =
       -force_field->get(x, y, z, 2) * 0.50000000000000000 / rho + u[2];
+  velocity_field->get(x, y, z, uint_t{0u}) = u[0u];
+  velocity_field->get(x, y, z, uint_t{1u}) = u[1u];
+  velocity_field->get(x, y, z, uint_t{2u}) = u[2u];
 
   Equilibrium::set(pdf_field, Vector3<double>(u_0, u_1, u_2), rho, cell);
 }
+
+inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
+                GhostLayerField<double, uint_t{3u}> *velocity_field,
+                GhostLayerField<double, uint_t{3u}> const *force_field,
+                std::vector<double> const &values, CellInterval const &ci) {
+  assert(uint_c(values.size()) == ci.numCells() * uint_t(3u));
+  auto u = values.data();
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        double &pdf_xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        double &vel_xyz0 = velocity_field->get(x, y, z, uint_t{0u});
+        const double f_0 = pdf_field->getF(&pdf_xyz0, uint_t{0u});
+        const double f_1 = pdf_field->getF(&pdf_xyz0, uint_t{1u});
+        const double f_2 = pdf_field->getF(&pdf_xyz0, uint_t{2u});
+        const double f_3 = pdf_field->getF(&pdf_xyz0, uint_t{3u});
+        const double f_4 = pdf_field->getF(&pdf_xyz0, uint_t{4u});
+        const double f_5 = pdf_field->getF(&pdf_xyz0, uint_t{5u});
+        const double f_6 = pdf_field->getF(&pdf_xyz0, uint_t{6u});
+        const double f_7 = pdf_field->getF(&pdf_xyz0, uint_t{7u});
+        const double f_8 = pdf_field->getF(&pdf_xyz0, uint_t{8u});
+        const double f_9 = pdf_field->getF(&pdf_xyz0, uint_t{9u});
+        const double f_10 = pdf_field->getF(&pdf_xyz0, uint_t{10u});
+        const double f_11 = pdf_field->getF(&pdf_xyz0, uint_t{11u});
+        const double f_12 = pdf_field->getF(&pdf_xyz0, uint_t{12u});
+        const double f_13 = pdf_field->getF(&pdf_xyz0, uint_t{13u});
+        const double f_14 = pdf_field->getF(&pdf_xyz0, uint_t{14u});
+        const double f_15 = pdf_field->getF(&pdf_xyz0, uint_t{15u});
+        const double f_16 = pdf_field->getF(&pdf_xyz0, uint_t{16u});
+        const double f_17 = pdf_field->getF(&pdf_xyz0, uint_t{17u});
+        const double f_18 = pdf_field->getF(&pdf_xyz0, uint_t{18u});
+        const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+        const double vel1Term = f_1 + f_11 + f_15 + f_7;
+        const double vel2Term = f_12 + f_13 + f_5;
+        const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 +
+                           vel0Term + vel1Term + vel2Term;
+
+        const double u_0 =
+            -force_field->get(x, y, z, 0) * 0.50000000000000000 / rho + u[0];
+        const double u_1 =
+            -force_field->get(x, y, z, 1) * 0.50000000000000000 / rho + u[1];
+        const double u_2 =
+            -force_field->get(x, y, z, 2) * 0.50000000000000000 / rho + u[2];
+        velocity_field->getF(&vel_xyz0, uint_t{0u}) = u[0u];
+        velocity_field->getF(&vel_xyz0, uint_t{1u}) = u[1u];
+        velocity_field->getF(&vel_xyz0, uint_t{2u}) = u[2u];
+
+        std::advance(u, 3);
+
+        Equilibrium::set(pdf_field, Vector3<double>(u_0, u_1, u_2), rho,
+                         Cell{x, y, z});
+      }
+    }
+  }
+}
 } // namespace Velocity
 
+namespace Force {
+inline void set(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                GhostLayerField<double, uint_t{3u}> *velocity_field,
+                GhostLayerField<double, uint_t{3u}> *force_field,
+                Vector3<double> const &force, Cell const &cell) {
+  double const &pdf_xyz0 = pdf_field->get(cell, uint_t{0u});
+  double &vel_xyz0 = velocity_field->get(cell, uint_t{0u});
+  double &laf_xyz0 = force_field->get(cell, uint_t{0u});
+  const double f_0 = pdf_field->getF(&pdf_xyz0, uint_t{0u});
+  const double f_1 = pdf_field->getF(&pdf_xyz0, uint_t{1u});
+  const double f_2 = pdf_field->getF(&pdf_xyz0, uint_t{2u});
+  const double f_3 = pdf_field->getF(&pdf_xyz0, uint_t{3u});
+  const double f_4 = pdf_field->getF(&pdf_xyz0, uint_t{4u});
+  const double f_5 = pdf_field->getF(&pdf_xyz0, uint_t{5u});
+  const double f_6 = pdf_field->getF(&pdf_xyz0, uint_t{6u});
+  const double f_7 = pdf_field->getF(&pdf_xyz0, uint_t{7u});
+  const double f_8 = pdf_field->getF(&pdf_xyz0, uint_t{8u});
+  const double f_9 = pdf_field->getF(&pdf_xyz0, uint_t{9u});
+  const double f_10 = pdf_field->getF(&pdf_xyz0, uint_t{10u});
+  const double f_11 = pdf_field->getF(&pdf_xyz0, uint_t{11u});
+  const double f_12 = pdf_field->getF(&pdf_xyz0, uint_t{12u});
+  const double f_13 = pdf_field->getF(&pdf_xyz0, uint_t{13u});
+  const double f_14 = pdf_field->getF(&pdf_xyz0, uint_t{14u});
+  const double f_15 = pdf_field->getF(&pdf_xyz0, uint_t{15u});
+  const double f_16 = pdf_field->getF(&pdf_xyz0, uint_t{16u});
+  const double f_17 = pdf_field->getF(&pdf_xyz0, uint_t{17u});
+  const double f_18 = pdf_field->getF(&pdf_xyz0, uint_t{18u});
+  const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+  const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+  const double vel1Term = f_1 + f_11 + f_15 + f_7;
+  const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+  const double vel2Term = f_12 + f_13 + f_5;
+  const double momdensity_2 =
+      f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+  const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                     vel1Term + vel2Term;
+  const double md_0 = force[0u] * 0.50000000000000000 + momdensity_0;
+  const double md_1 = force[1u] * 0.50000000000000000 + momdensity_1;
+  const double md_2 = force[2u] * 0.50000000000000000 + momdensity_2;
+  auto const rho_inv = double{1} / rho;
+
+  force_field->getF(&laf_xyz0, uint_t{0u}) = force[0u];
+  force_field->getF(&laf_xyz0, uint_t{1u}) = force[1u];
+  force_field->getF(&laf_xyz0, uint_t{2u}) = force[2u];
+
+  velocity_field->getF(&vel_xyz0, uint_t{0u}) = md_0 * rho_inv;
+  velocity_field->getF(&vel_xyz0, uint_t{1u}) = md_1 * rho_inv;
+  velocity_field->getF(&vel_xyz0, uint_t{2u}) = md_2 * rho_inv;
+}
+
+inline void set(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                GhostLayerField<double, uint_t{3u}> *velocity_field,
+                GhostLayerField<double, uint_t{3u}> *force_field,
+                std::vector<double> const &values, CellInterval const &ci) {
+  assert(uint_c(values.size()) == ci.numCells() * uint_t(3u));
+  auto force = values.data();
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        double const &pdf_xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        double &vel_xyz0 = velocity_field->get(x, y, z, uint_t{0u});
+        double &laf_xyz0 = force_field->get(x, y, z, uint_t{0u});
+        const double f_0 = pdf_field->getF(&pdf_xyz0, uint_t{0u});
+        const double f_1 = pdf_field->getF(&pdf_xyz0, uint_t{1u});
+        const double f_2 = pdf_field->getF(&pdf_xyz0, uint_t{2u});
+        const double f_3 = pdf_field->getF(&pdf_xyz0, uint_t{3u});
+        const double f_4 = pdf_field->getF(&pdf_xyz0, uint_t{4u});
+        const double f_5 = pdf_field->getF(&pdf_xyz0, uint_t{5u});
+        const double f_6 = pdf_field->getF(&pdf_xyz0, uint_t{6u});
+        const double f_7 = pdf_field->getF(&pdf_xyz0, uint_t{7u});
+        const double f_8 = pdf_field->getF(&pdf_xyz0, uint_t{8u});
+        const double f_9 = pdf_field->getF(&pdf_xyz0, uint_t{9u});
+        const double f_10 = pdf_field->getF(&pdf_xyz0, uint_t{10u});
+        const double f_11 = pdf_field->getF(&pdf_xyz0, uint_t{11u});
+        const double f_12 = pdf_field->getF(&pdf_xyz0, uint_t{12u});
+        const double f_13 = pdf_field->getF(&pdf_xyz0, uint_t{13u});
+        const double f_14 = pdf_field->getF(&pdf_xyz0, uint_t{14u});
+        const double f_15 = pdf_field->getF(&pdf_xyz0, uint_t{15u});
+        const double f_16 = pdf_field->getF(&pdf_xyz0, uint_t{16u});
+        const double f_17 = pdf_field->getF(&pdf_xyz0, uint_t{17u});
+        const double f_18 = pdf_field->getF(&pdf_xyz0, uint_t{18u});
+        const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+        const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+        const double vel1Term = f_1 + f_11 + f_15 + f_7;
+        const double momdensity_1 =
+            -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+        const double vel2Term = f_12 + f_13 + f_5;
+        const double momdensity_2 =
+            f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+        const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 +
+                           vel0Term + vel1Term + vel2Term;
+        const double md_0 = force[0u] * 0.50000000000000000 + momdensity_0;
+        const double md_1 = force[1u] * 0.50000000000000000 + momdensity_1;
+        const double md_2 = force[2u] * 0.50000000000000000 + momdensity_2;
+        auto const rho_inv = double{1} / rho;
+
+        force_field->getF(&laf_xyz0, uint_t{0u}) = force[0u];
+        force_field->getF(&laf_xyz0, uint_t{1u}) = force[1u];
+        force_field->getF(&laf_xyz0, uint_t{2u}) = force[2u];
+
+        velocity_field->getF(&vel_xyz0, uint_t{0u}) = md_0 * rho_inv;
+        velocity_field->getF(&vel_xyz0, uint_t{1u}) = md_1 * rho_inv;
+        velocity_field->getF(&vel_xyz0, uint_t{2u}) = md_2 * rho_inv;
+
+        std::advance(force, 3);
+      }
+    }
+  }
+}
+} // namespace Force
+
 namespace MomentumDensity {
-inline Vector3<double>
-reduce(GhostLayerField<double, uint_t{19u}> const *pdf_field,
-       GhostLayerField<double, uint_t{3u}> const *force_field) {
+inline auto reduce(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                   GhostLayerField<double, uint_t{3u}> const *force_field) {
   Vector3<double> momentumDensity(double{0});
   WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
     const double &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
@@ -766,8 +1135,8 @@ reduce(GhostLayerField<double, uint_t{19u}> const *pdf_field,
 } // namespace MomentumDensity
 
 namespace PressureTensor {
-inline Matrix3<double>
-get(GhostLayerField<double, uint_t{19u}> const *pdf_field, Cell const &cell) {
+inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                Cell const &cell) {
   const double &xyz0 = pdf_field->get(cell, uint_t{0u});
   const double f_0 = pdf_field->getF(&xyz0, uint_t{0u});
   const double f_1 = pdf_field->getF(&xyz0, uint_t{1u});
@@ -816,6 +1185,63 @@ get(GhostLayerField<double, uint_t{19u}> const *pdf_field, Cell const &cell) {
 
   return pressureTensor;
 }
+
+inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                CellInterval const &ci) {
+  std::vector<double> out;
+  out.reserve(ci.numCells() * uint_t(9u));
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        const double &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        const double f_0 = pdf_field->getF(&xyz0, uint_t{0u});
+        const double f_1 = pdf_field->getF(&xyz0, uint_t{1u});
+        const double f_2 = pdf_field->getF(&xyz0, uint_t{2u});
+        const double f_3 = pdf_field->getF(&xyz0, uint_t{3u});
+        const double f_4 = pdf_field->getF(&xyz0, uint_t{4u});
+        const double f_5 = pdf_field->getF(&xyz0, uint_t{5u});
+        const double f_6 = pdf_field->getF(&xyz0, uint_t{6u});
+        const double f_7 = pdf_field->getF(&xyz0, uint_t{7u});
+        const double f_8 = pdf_field->getF(&xyz0, uint_t{8u});
+        const double f_9 = pdf_field->getF(&xyz0, uint_t{9u});
+        const double f_10 = pdf_field->getF(&xyz0, uint_t{10u});
+        const double f_11 = pdf_field->getF(&xyz0, uint_t{11u});
+        const double f_12 = pdf_field->getF(&xyz0, uint_t{12u});
+        const double f_13 = pdf_field->getF(&xyz0, uint_t{13u});
+        const double f_14 = pdf_field->getF(&xyz0, uint_t{14u});
+        const double f_15 = pdf_field->getF(&xyz0, uint_t{15u});
+        const double f_16 = pdf_field->getF(&xyz0, uint_t{16u});
+        const double f_17 = pdf_field->getF(&xyz0, uint_t{17u});
+        const double f_18 = pdf_field->getF(&xyz0, uint_t{18u});
+        const double p_0 =
+            f_10 + f_13 + f_14 + f_17 + f_18 + f_3 + f_4 + f_7 + f_8 + f_9;
+        const double p_1 = -f_10 - f_7 + f_8 + f_9;
+        const double p_2 = -f_13 + f_14 + f_17 - f_18;
+        const double p_3 = -f_10 - f_7 + f_8 + f_9;
+        const double p_4 =
+            f_1 + f_10 + f_11 + f_12 + f_15 + f_16 + f_2 + f_7 + f_8 + f_9;
+        const double p_5 = f_11 - f_12 - f_15 + f_16;
+        const double p_6 = -f_13 + f_14 + f_17 - f_18;
+        const double p_7 = f_11 - f_12 - f_15 + f_16;
+        const double p_8 =
+            f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6;
+
+        out.emplace_back(p_0);
+        out.emplace_back(p_1);
+        out.emplace_back(p_2);
+
+        out.emplace_back(p_3);
+        out.emplace_back(p_4);
+        out.emplace_back(p_5);
+
+        out.emplace_back(p_6);
+        out.emplace_back(p_7);
+        out.emplace_back(p_8);
+      }
+    }
+  }
+  return out;
+}
 } // namespace PressureTensor
 
 } // namespace accessor
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cu
index a9fe53dc6f..62dda87100 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cu
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cu
@@ -48,13 +48,8 @@
 
 #if defined(__NVCC__)
 #define RESTRICT __restrict__
-#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
 #pragma nv_diagnostic push
 #pragma nv_diag_suppress 177 // unused variable
-#else
-#pragma push
-#pragma diag_suppress 177 // unused variable
-#endif
 #elif defined(__clang__)
 #if defined(__CUDA__)
 #if defined(__CUDA_ARCH__)
@@ -62,20 +57,17 @@
 #define RESTRICT __restrict__
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
 #else
 // clang compiling CUDA code in host mode
 #define RESTRICT __restrict__
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
 #endif
 #endif
 #elif defined(__GNUC__) or defined(__GNUG__)
 #define RESTRICT __restrict__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
 #elif defined(_MSC_VER)
 #define RESTRICT __restrict
 #else
@@ -102,38 +94,13 @@ namespace lbm {
 namespace accessor {
 
 namespace Population {
-__global__ void kernel_get_interval(
-    gpu::FieldAccessor<double> pdf,
-    double *RESTRICT const pop) {
-  pdf.set(blockIdx, threadIdx);
-  if (pdf.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
-    pop[offset + 0u] = pdf.get(0u);
-    pop[offset + 1u] = pdf.get(1u);
-    pop[offset + 2u] = pdf.get(2u);
-    pop[offset + 3u] = pdf.get(3u);
-    pop[offset + 4u] = pdf.get(4u);
-    pop[offset + 5u] = pdf.get(5u);
-    pop[offset + 6u] = pdf.get(6u);
-    pop[offset + 7u] = pdf.get(7u);
-    pop[offset + 8u] = pdf.get(8u);
-    pop[offset + 9u] = pdf.get(9u);
-    pop[offset + 10u] = pdf.get(10u);
-    pop[offset + 11u] = pdf.get(11u);
-    pop[offset + 12u] = pdf.get(12u);
-    pop[offset + 13u] = pdf.get(13u);
-    pop[offset + 14u] = pdf.get(14u);
-    pop[offset + 15u] = pdf.get(15u);
-    pop[offset + 16u] = pdf.get(16u);
-    pop[offset + 17u] = pdf.get(17u);
-    pop[offset + 18u] = pdf.get(18u);
-  }
-}
-
+// LCOV_EXCL_START
 __global__ void kernel_get(
     gpu::FieldAccessor<double> pdf,
-    double *RESTRICT const pop) {
+    double *RESTRICT pop) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
   pdf.set(blockIdx, threadIdx);
+  pop += offset;
   if (pdf.isValidPosition()) {
     pop[0u] = pdf.get(0u);
     pop[1u] = pdf.get(1u);
@@ -157,37 +124,38 @@ __global__ void kernel_get(
   }
 }
 
-__global__ void kernel_set_interval(
+__global__ void kernel_set(
     gpu::FieldAccessor<double> pdf,
-    double const *RESTRICT const pop) {
+    double const *RESTRICT pop) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
   pdf.set(blockIdx, threadIdx);
+  pop += offset;
   if (pdf.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
-    pdf.get(0u) = pop[offset + 0u];
-    pdf.get(1u) = pop[offset + 1u];
-    pdf.get(2u) = pop[offset + 2u];
-    pdf.get(3u) = pop[offset + 3u];
-    pdf.get(4u) = pop[offset + 4u];
-    pdf.get(5u) = pop[offset + 5u];
-    pdf.get(6u) = pop[offset + 6u];
-    pdf.get(7u) = pop[offset + 7u];
-    pdf.get(8u) = pop[offset + 8u];
-    pdf.get(9u) = pop[offset + 9u];
-    pdf.get(10u) = pop[offset + 10u];
-    pdf.get(11u) = pop[offset + 11u];
-    pdf.get(12u) = pop[offset + 12u];
-    pdf.get(13u) = pop[offset + 13u];
-    pdf.get(14u) = pop[offset + 14u];
-    pdf.get(15u) = pop[offset + 15u];
-    pdf.get(16u) = pop[offset + 16u];
-    pdf.get(17u) = pop[offset + 17u];
-    pdf.get(18u) = pop[offset + 18u];
+    pdf.get(0u) = pop[0u];
+    pdf.get(1u) = pop[1u];
+    pdf.get(2u) = pop[2u];
+    pdf.get(3u) = pop[3u];
+    pdf.get(4u) = pop[4u];
+    pdf.get(5u) = pop[5u];
+    pdf.get(6u) = pop[6u];
+    pdf.get(7u) = pop[7u];
+    pdf.get(8u) = pop[8u];
+    pdf.get(9u) = pop[9u];
+    pdf.get(10u) = pop[10u];
+    pdf.get(11u) = pop[11u];
+    pdf.get(12u) = pop[12u];
+    pdf.get(13u) = pop[13u];
+    pdf.get(14u) = pop[14u];
+    pdf.get(15u) = pop[15u];
+    pdf.get(16u) = pop[16u];
+    pdf.get(17u) = pop[17u];
+    pdf.get(18u) = pop[18u];
   }
 }
 
-__global__ void kernel_set(
+__global__ void kernel_broadcast(
     gpu::FieldAccessor<double> pdf,
-    double const *RESTRICT const pop) {
+    double const *RESTRICT pop) {
   pdf.set(blockIdx, threadIdx);
   if (pdf.isValidPosition()) {
     pdf.get(0u) = pop[0u];
@@ -212,11 +180,59 @@ __global__ void kernel_set(
   }
 }
 
+__global__ void kernel_set_vel(
+    gpu::FieldAccessor<double> pdf,
+    gpu::FieldAccessor<double> velocity,
+    gpu::FieldAccessor<double> force,
+    double const *RESTRICT pop) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
+  pdf.set(blockIdx, threadIdx);
+  velocity.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  pop += offset;
+  if (pdf.isValidPosition()) {
+    const double f_0 = pdf.get(0u) = pop[0u];
+    const double f_1 = pdf.get(1u) = pop[1u];
+    const double f_2 = pdf.get(2u) = pop[2u];
+    const double f_3 = pdf.get(3u) = pop[3u];
+    const double f_4 = pdf.get(4u) = pop[4u];
+    const double f_5 = pdf.get(5u) = pop[5u];
+    const double f_6 = pdf.get(6u) = pop[6u];
+    const double f_7 = pdf.get(7u) = pop[7u];
+    const double f_8 = pdf.get(8u) = pop[8u];
+    const double f_9 = pdf.get(9u) = pop[9u];
+    const double f_10 = pdf.get(10u) = pop[10u];
+    const double f_11 = pdf.get(11u) = pop[11u];
+    const double f_12 = pdf.get(12u) = pop[12u];
+    const double f_13 = pdf.get(13u) = pop[13u];
+    const double f_14 = pdf.get(14u) = pop[14u];
+    const double f_15 = pdf.get(15u) = pop[15u];
+    const double f_16 = pdf.get(16u) = pop[16u];
+    const double f_17 = pdf.get(17u) = pop[17u];
+    const double f_18 = pdf.get(18u) = pop[18u];
+    const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+    const double vel1Term = f_1 + f_11 + f_15 + f_7;
+    const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+    const double vel2Term = f_12 + f_13 + f_5;
+    const double momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+    const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
+    const double md_0 = force.get(0) * 0.50000000000000000 + momdensity_0;
+    const double md_1 = force.get(1) * 0.50000000000000000 + momdensity_1;
+    const double md_2 = force.get(2) * 0.50000000000000000 + momdensity_2;
+    const double rho_inv = double{1} / rho;
+    velocity.get(0u) = md_0 * rho_inv;
+    velocity.get(1u) = md_1 * rho_inv;
+    velocity.get(2u) = md_2 * rho_inv;
+  }
+}
+// LCOV_EXCL_STOP
+
 std::array<double, 19u> get(
     gpu::GPUField<double> const *pdf_field,
     Cell const &cell) {
   CellInterval ci(cell, cell);
-  thrust::device_vector<double> dev_data(19u, double{0});
+  thrust::device_vector<double> dev_data(19u);
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
   auto kernel = gpu::make_kernel(kernel_get);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
@@ -231,7 +247,7 @@ void set(
     gpu::GPUField<double> *pdf_field,
     std::array<double, 19u> const &pop,
     Cell const &cell) {
-  thrust::device_vector<double> dev_data(pop.data(), pop.data() + 19u);
+  thrust::device_vector<double> dev_data(pop.begin(), pop.end());
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
   CellInterval ci(cell, cell);
   auto kernel = gpu::make_kernel(kernel_set);
@@ -240,13 +256,30 @@ void set(
   kernel();
 }
 
+void set(
+    gpu::GPUField<double> *pdf_field,
+    gpu::GPUField<double> *velocity_field,
+    gpu::GPUField<double> const *force_field,
+    std::array<double, 19u> const &pop,
+    Cell const &cell) {
+  thrust::device_vector<double> dev_data(pop.begin(), pop.end());
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  CellInterval ci(cell, cell);
+  auto kernel = gpu::make_kernel(kernel_set_vel);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*velocity_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
+  kernel.addParam(const_cast<const double *>(dev_data_ptr));
+  kernel();
+}
+
 void initialize(
     gpu::GPUField<double> *pdf_field,
     std::array<double, 19u> const &pop) {
   CellInterval ci = pdf_field->xyzSizeWithGhostLayer();
-  thrust::device_vector<double> dev_data(pop.data(), pop.data() + 19u);
+  thrust::device_vector<double> dev_data(pop.begin(), pop.end());
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_set);
+  auto kernel = gpu::make_kernel(kernel_broadcast);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
   kernel.addParam(const_cast<const double *>(dev_data_ptr));
   kernel();
@@ -257,7 +290,7 @@ std::vector<double> get(
     CellInterval const &ci) {
   thrust::device_vector<double> dev_data(ci.numCells() * 19u);
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_get_interval);
+  auto kernel = gpu::make_kernel(kernel_get);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
   kernel.addParam(dev_data_ptr);
   kernel();
@@ -272,82 +305,92 @@ void set(
     CellInterval const &ci) {
   thrust::device_vector<double> dev_data(values.begin(), values.end());
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_set_interval);
+  auto kernel = gpu::make_kernel(kernel_set);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
   kernel.addParam(const_cast<const double *>(dev_data_ptr));
   kernel();
 }
-} // namespace Population
 
-namespace Vector {
-__global__ void kernel_get_interval(
-    gpu::FieldAccessor<double> vec,
-    double *const out) {
-  vec.set(blockIdx, threadIdx);
-  if (vec.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
-    out[offset + 0u] = vec.get(0u);
-    out[offset + 1u] = vec.get(1u);
-    out[offset + 2u] = vec.get(2u);
-  }
+void set(
+    gpu::GPUField<double> *pdf_field,
+    gpu::GPUField<double> *velocity_field,
+    gpu::GPUField<double> const *force_field,
+    std::vector<double> const &values,
+    CellInterval const &ci) {
+  thrust::device_vector<double> dev_data(values.begin(), values.end());
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set_vel);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*velocity_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
+  kernel.addParam(const_cast<const double *>(dev_data_ptr));
+  kernel();
 }
+} // namespace Population
 
+namespace Vector {
+// LCOV_EXCL_START
 __global__ void kernel_get(
     gpu::FieldAccessor<double> vec,
-    double *const out) {
+    double *u_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
   vec.set(blockIdx, threadIdx);
+  u_out += offset;
   if (vec.isValidPosition()) {
-    out[0u] = vec.get(0u);
-    out[1u] = vec.get(1u);
-    out[2u] = vec.get(2u);
+    u_out[0u] = vec.get(0u);
+    u_out[1u] = vec.get(1u);
+    u_out[2u] = vec.get(2u);
   }
 }
 
-__global__ void kernel_set_interval(
+__global__ void kernel_set(
     gpu::FieldAccessor<double> vec,
-    double const *RESTRICT const u) {
+    double const *RESTRICT u_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
   vec.set(blockIdx, threadIdx);
+  u_in += offset;
   if (vec.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
-    vec.get(0u) = u[offset + 0u];
-    vec.get(1u) = u[offset + 1u];
-    vec.get(2u) = u[offset + 2u];
+    vec.get(0u) = u_in[0u];
+    vec.get(1u) = u_in[1u];
+    vec.get(2u) = u_in[2u];
   }
 }
 
-__global__ void kernel_set(
+__global__ void kernel_broadcast(
     gpu::FieldAccessor<double> vec,
-    const double *RESTRICT const u) {
+    double const *RESTRICT u_in) {
   vec.set(blockIdx, threadIdx);
   if (vec.isValidPosition()) {
-    vec.get(0u) = u[0u];
-    vec.get(1u) = u[1u];
-    vec.get(2u) = u[2u];
+    vec.get(0u) = u_in[0u];
+    vec.get(1u) = u_in[1u];
+    vec.get(2u) = u_in[2u];
   }
 }
 
-__global__ void kernel_add_interval(
+__global__ void kernel_add(
     gpu::FieldAccessor<double> vec,
-    double const *RESTRICT const u) {
+    double const *RESTRICT u_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
   vec.set(blockIdx, threadIdx);
+  u_in += offset;
   if (vec.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
-    vec.get(0u) += u[offset + 0u];
-    vec.get(1u) += u[offset + 1u];
-    vec.get(2u) += u[offset + 2u];
+    vec.get(0u) += u_in[0u];
+    vec.get(1u) += u_in[1u];
+    vec.get(2u) += u_in[2u];
   }
 }
 
-__global__ void kernel_add(
+__global__ void kernel_broadcast_add(
     gpu::FieldAccessor<double> vec,
-    double const *RESTRICT const u) {
+    double const *RESTRICT u_in) {
   vec.set(blockIdx, threadIdx);
   if (vec.isValidPosition()) {
-    vec.get(0u) += u[0u];
-    vec.get(1u) += u[1u];
-    vec.get(2u) += u[2u];
+    vec.get(0u) += u_in[0u];
+    vec.get(1u) += u_in[1u];
+    vec.get(2u) += u_in[2u];
   }
 }
+// LCOV_EXCL_STOP
 
 Vector3<double> get(
     gpu::GPUField<double> const *vec_field,
@@ -396,7 +439,7 @@ void initialize(
   CellInterval ci = vec_field->xyzSizeWithGhostLayer();
   thrust::device_vector<double> dev_data(vec.data(), vec.data() + 3u);
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_set);
+  auto kernel = gpu::make_kernel(kernel_broadcast);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*vec_field, ci));
   kernel.addParam(const_cast<const double *>(dev_data_ptr));
   kernel();
@@ -408,7 +451,7 @@ void add_to_all(
   CellInterval ci = vec_field->xyzSizeWithGhostLayer();
   thrust::device_vector<double> dev_data(vec.data(), vec.data() + 3u);
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_add);
+  auto kernel = gpu::make_kernel(kernel_broadcast_add);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*vec_field, ci));
   kernel.addParam(const_cast<const double *>(dev_data_ptr));
   kernel();
@@ -419,7 +462,7 @@ std::vector<double> get(
     CellInterval const &ci) {
   thrust::device_vector<double> dev_data(ci.numCells() * 3u);
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_get_interval);
+  auto kernel = gpu::make_kernel(kernel_get);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*vec_field, ci));
   kernel.addParam(dev_data_ptr);
   kernel();
@@ -434,7 +477,7 @@ void set(
     CellInterval const &ci) {
   thrust::device_vector<double> dev_data(values.begin(), values.end());
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_set_interval);
+  auto kernel = gpu::make_kernel(kernel_set);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*vec_field, ci));
   kernel.addParam(const_cast<const double *>(dev_data_ptr));
   kernel();
@@ -442,6 +485,7 @@ void set(
 } // namespace Vector
 
 namespace Interpolation {
+// LCOV_EXCL_START
 /** @brief Calculate interpolation weights. */
 static __forceinline__ __device__ void calculate_weights(
     double const *RESTRICT const pos,
@@ -535,6 +579,7 @@ __global__ void kernel_set(
     }
   }
 }
+// LCOV_EXCL_STOP
 
 static dim3 calculate_dim_grid(uint const threads_x,
                                uint const blocks_per_grid_y,
@@ -589,6 +634,7 @@ void set(
 } // namespace Interpolation
 
 namespace Equilibrium {
+// LCOV_EXCL_START
 __device__ void kernel_set_device(
     gpu::FieldAccessor<double> pdf,
     double const *RESTRICT const u,
@@ -614,15 +660,18 @@ __device__ void kernel_set_device(
   pdf.get(17u) = rho * -0.083333333333333329 * u[0] + rho * -0.083333333333333329 * u[2] + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u[0] * u[0]) + rho * 0.083333333333333329 * (u[2] * u[2]) + rho * 0.25 * u[0] * u[2];
   pdf.get(18u) = rho * -0.083333333333333329 * u[2] + rho * -0.25 * u[0] * u[2] + rho * 0.027777777777777776 + rho * 0.083333333333333329 * u[0] + rho * 0.083333333333333329 * (u[0] * u[0]) + rho * 0.083333333333333329 * (u[2] * u[2]);
 }
+// LCOV_EXCL_STOP
 } // namespace Equilibrium
 
 namespace Density {
+// LCOV_EXCL_START
 __global__ void kernel_get(
     gpu::FieldAccessor<double> pdf,
-    double *RESTRICT const out) {
+    double *RESTRICT rho_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
   pdf.set(blockIdx, threadIdx);
+  rho_out += offset;
   if (pdf.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, uint(1u));
     double const f_0 = pdf.get(0u);
     double const f_1 = pdf.get(1u);
     double const f_2 = pdf.get(2u);
@@ -646,16 +695,17 @@ __global__ void kernel_get(
     const double vel1Term = f_1 + f_11 + f_15 + f_7;
     const double vel2Term = f_12 + f_13 + f_5;
     const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
-    out[offset] = rho;
+    rho_out[0u] = rho;
   }
 }
 
 __global__ void kernel_set(
     gpu::FieldAccessor<double> pdf,
-    double const *RESTRICT const rho_in) {
+    double const *RESTRICT rho_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
   pdf.set(blockIdx, threadIdx);
+  rho_in += offset;
   if (pdf.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, uint(1u));
     double const f_0 = pdf.get(0u);
     double const f_1 = pdf.get(1u);
     double const f_2 = pdf.get(2u);
@@ -684,12 +734,13 @@ __global__ void kernel_set(
     const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
 
     // calculate current velocity (before density change)
-    double const conversion = double(1) / rho;
-    double const u_old[3] = {momdensity_0 * conversion, momdensity_1 * conversion, momdensity_2 * conversion};
+    double const rho_inv = double{1} / rho;
+    double const u_old[3] = {momdensity_0 * rho_inv, momdensity_1 * rho_inv, momdensity_2 * rho_inv};
 
-    Equilibrium::kernel_set_device(pdf, u_old, rho_in[offset]);
+    Equilibrium::kernel_set_device(pdf, u_old, rho_in[0u]);
   }
 }
+// LCOV_EXCL_STOP
 
 double get(
     gpu::GPUField<double> const *pdf_field,
@@ -727,7 +778,7 @@ std::vector<double> get(
   kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
   kernel.addParam(dev_data_ptr);
   kernel();
-  std::vector<double> out(ci.numCells());
+  std::vector<double> out(dev_data.size());
   thrust::copy(dev_data.begin(), dev_data.end(), out.begin());
   return out;
 }
@@ -746,16 +797,63 @@ void set(
 } // namespace Density
 
 namespace Velocity {
+// LCOV_EXCL_START
+__global__ void kernel_get(
+    gpu::FieldAccessor<double> pdf,
+    gpu::FieldAccessor<double> force,
+    double *RESTRICT u_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
+  pdf.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  u_out += offset;
+  if (pdf.isValidPosition()) {
+    double const f_0 = pdf.get(0u);
+    double const f_1 = pdf.get(1u);
+    double const f_2 = pdf.get(2u);
+    double const f_3 = pdf.get(3u);
+    double const f_4 = pdf.get(4u);
+    double const f_5 = pdf.get(5u);
+    double const f_6 = pdf.get(6u);
+    double const f_7 = pdf.get(7u);
+    double const f_8 = pdf.get(8u);
+    double const f_9 = pdf.get(9u);
+    double const f_10 = pdf.get(10u);
+    double const f_11 = pdf.get(11u);
+    double const f_12 = pdf.get(12u);
+    double const f_13 = pdf.get(13u);
+    double const f_14 = pdf.get(14u);
+    double const f_15 = pdf.get(15u);
+    double const f_16 = pdf.get(16u);
+    double const f_17 = pdf.get(17u);
+    double const f_18 = pdf.get(18u);
+    const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+    const double vel1Term = f_1 + f_11 + f_15 + f_7;
+    const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+    const double vel2Term = f_12 + f_13 + f_5;
+    const double momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+    const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
+    const double md_0 = force.get(0) * 0.50000000000000000 + momdensity_0;
+    const double md_1 = force.get(1) * 0.50000000000000000 + momdensity_1;
+    const double md_2 = force.get(2) * 0.50000000000000000 + momdensity_2;
+    auto const rho_inv = double{1} / rho;
+    u_out[0u] = md_0 * rho_inv;
+    u_out[1u] = md_1 * rho_inv;
+    u_out[2u] = md_2 * rho_inv;
+  }
+}
+
 __global__ void kernel_set(
     gpu::FieldAccessor<double> pdf,
+    gpu::FieldAccessor<double> velocity,
     gpu::FieldAccessor<double> force,
-    double const *RESTRICT const u_in) {
+    double const *RESTRICT u_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
   pdf.set(blockIdx, threadIdx);
+  velocity.set(blockIdx, threadIdx);
   force.set(blockIdx, threadIdx);
+  u_in += offset;
   if (pdf.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, uint(3u));
-    uint const bufsize = 3u;
-    double const *RESTRICT const u = u_in + bufsize * offset;
     double const f_0 = pdf.get(0u);
     double const f_1 = pdf.get(1u);
     double const f_2 = pdf.get(2u);
@@ -775,6 +873,7 @@ __global__ void kernel_set(
     double const f_16 = pdf.get(16u);
     double const f_17 = pdf.get(17u);
     double const f_18 = pdf.get(18u);
+    double const *RESTRICT const u = u_in;
     const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
     const double vel1Term = f_1 + f_11 + f_15 + f_7;
     const double vel2Term = f_12 + f_13 + f_5;
@@ -782,15 +881,54 @@ __global__ void kernel_set(
     const double u_0 = -force.get(0) * 0.50000000000000000 / rho + u[0];
     const double u_1 = -force.get(1) * 0.50000000000000000 / rho + u[1];
     const double u_2 = -force.get(2) * 0.50000000000000000 / rho + u[2];
+    velocity.get(0u) = u_in[0u];
+    velocity.get(1u) = u_in[1u];
+    velocity.get(2u) = u_in[2u];
+
     double u_new[3] = {u_0, u_1, u_2};
 
     Equilibrium::kernel_set_device(pdf, u_new, rho);
   }
 }
+// LCOV_EXCL_STOP
+
+Vector3<double> get(
+    gpu::GPUField<double> const *pdf_field,
+    gpu::GPUField<double> const *force_field,
+    Cell const &cell) {
+  CellInterval ci(cell, cell);
+  thrust::device_vector<double> dev_data(3u);
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel();
+  Vector3<double> vec;
+  thrust::copy(dev_data.begin(), dev_data.end(), vec.data());
+  return vec;
+}
+
+std::vector<double> get(
+    gpu::GPUField<double> const *pdf_field,
+    gpu::GPUField<double> const *force_field,
+    CellInterval const &ci) {
+  thrust::device_vector<double> dev_data(3u);
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel();
+  std::vector<double> out(dev_data.size());
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data());
+  return out;
+}
 
 void set(
     gpu::GPUField<double> *pdf_field,
-    gpu::GPUField<double> *force_field,
+    gpu::GPUField<double> *velocity_field,
+    gpu::GPUField<double> const *force_field,
     Vector3<double> const &u,
     Cell const &cell) {
   CellInterval ci(cell, cell);
@@ -798,22 +936,127 @@ void set(
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
   auto kernel = gpu::make_kernel(kernel_set);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*velocity_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
+  kernel.addParam(const_cast<const double *>(dev_data_ptr));
+  kernel();
+}
+
+void set(
+    gpu::GPUField<double> *pdf_field,
+    gpu::GPUField<double> *velocity_field,
+    gpu::GPUField<double> const *force_field,
+    std::vector<double> const &values,
+    CellInterval const &ci) {
+  thrust::device_vector<double> dev_data(values.begin(), values.end());
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*velocity_field, ci));
   kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
   kernel.addParam(const_cast<const double *>(dev_data_ptr));
   kernel();
 }
 } // namespace Velocity
 
+namespace Force {
+// LCOV_EXCL_START
+__global__ void kernel_set(
+    gpu::FieldAccessor<double> pdf,
+    gpu::FieldAccessor<double> velocity,
+    gpu::FieldAccessor<double> force,
+    double const *RESTRICT f_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
+  pdf.set(blockIdx, threadIdx);
+  velocity.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  f_in += offset;
+  if (pdf.isValidPosition()) {
+    double const f_0 = pdf.get(0u);
+    double const f_1 = pdf.get(1u);
+    double const f_2 = pdf.get(2u);
+    double const f_3 = pdf.get(3u);
+    double const f_4 = pdf.get(4u);
+    double const f_5 = pdf.get(5u);
+    double const f_6 = pdf.get(6u);
+    double const f_7 = pdf.get(7u);
+    double const f_8 = pdf.get(8u);
+    double const f_9 = pdf.get(9u);
+    double const f_10 = pdf.get(10u);
+    double const f_11 = pdf.get(11u);
+    double const f_12 = pdf.get(12u);
+    double const f_13 = pdf.get(13u);
+    double const f_14 = pdf.get(14u);
+    double const f_15 = pdf.get(15u);
+    double const f_16 = pdf.get(16u);
+    double const f_17 = pdf.get(17u);
+    double const f_18 = pdf.get(18u);
+    const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+    const double vel1Term = f_1 + f_11 + f_15 + f_7;
+    const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+    const double vel2Term = f_12 + f_13 + f_5;
+    const double momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+    const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
+    const double md_0 = f_in[0u] * 0.50000000000000000 + momdensity_0;
+    const double md_1 = f_in[1u] * 0.50000000000000000 + momdensity_1;
+    const double md_2 = f_in[2u] * 0.50000000000000000 + momdensity_2;
+    auto const rho_inv = double{1} / rho;
+
+    force.get(0u) = f_in[0u];
+    force.get(1u) = f_in[1u];
+    force.get(2u) = f_in[2u];
+
+    velocity.get(0u) = md_0 * rho_inv;
+    velocity.get(1u) = md_1 * rho_inv;
+    velocity.get(2u) = md_2 * rho_inv;
+  }
+}
+// LCOV_EXCL_STOP
+
+void set(gpu::GPUField<double> const *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> *force_field,
+         Vector3<double> const &u,
+         Cell const &cell) {
+  CellInterval ci(cell, cell);
+  thrust::device_vector<double> dev_data(u.data(), u.data() + 3u);
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*velocity_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
+  kernel.addParam(const_cast<const double *>(dev_data_ptr));
+  kernel();
+}
+
+void set(gpu::GPUField<double> const *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> *force_field,
+         std::vector<double> const &values,
+         CellInterval const &ci) {
+  thrust::device_vector<double> dev_data(values.begin(), values.end());
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*velocity_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
+  kernel.addParam(const_cast<const double *>(dev_data_ptr));
+  kernel();
+}
+} // namespace Force
+
 namespace MomentumDensity {
+// LCOV_EXCL_START
 __global__ void kernel_sum(
     gpu::FieldAccessor<double> pdf,
     gpu::FieldAccessor<double> force,
-    double *RESTRICT const out) {
+    double *RESTRICT out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
   pdf.set(blockIdx, threadIdx);
   force.set(blockIdx, threadIdx);
+  out += offset;
   if (pdf.isValidPosition()) {
-    uint const bufsize = 3u;
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, bufsize);
     double const f_0 = pdf.get(0u);
     double const f_1 = pdf.get(1u);
     double const f_2 = pdf.get(2u);
@@ -843,11 +1086,12 @@ __global__ void kernel_sum(
     const double md_0 = force.get(0) * 0.50000000000000000 + momdensity_0;
     const double md_1 = force.get(1) * 0.50000000000000000 + momdensity_1;
     const double md_2 = force.get(2) * 0.50000000000000000 + momdensity_2;
-    out[bufsize * offset + 0u] += md_0;
-    out[bufsize * offset + 1u] += md_1;
-    out[bufsize * offset + 2u] += md_2;
+    out[0u] += md_0;
+    out[1u] += md_1;
+    out[2u] += md_2;
   }
 }
+// LCOV_EXCL_STOP
 
 Vector3<double> reduce(
     gpu::GPUField<double> const *pdf_field,
@@ -870,13 +1114,14 @@ Vector3<double> reduce(
 } // namespace MomentumDensity
 
 namespace PressureTensor {
+// LCOV_EXCL_START
 __global__ void kernel_get(
     gpu::FieldAccessor<double> pdf,
-    double *RESTRICT const out) {
+    double *RESTRICT p_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 9u);
   pdf.set(blockIdx, threadIdx);
+  p_out += offset;
   if (pdf.isValidPosition()) {
-    uint const bufsize = 9u;
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, bufsize);
     double const f_0 = pdf.get(0u);
     double const f_1 = pdf.get(1u);
     double const f_2 = pdf.get(2u);
@@ -905,19 +1150,18 @@ __global__ void kernel_get(
     const double p_6 = -f_13 + f_14 + f_17 - f_18;
     const double p_7 = f_11 - f_12 - f_15 + f_16;
     const double p_8 = f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6;
-    out[bufsize * offset + 0u] = p_0;
-    out[bufsize * offset + 1u] = p_1;
-    out[bufsize * offset + 2u] = p_2;
-
-    out[bufsize * offset + 3u] = p_3;
-    out[bufsize * offset + 4u] = p_4;
-    out[bufsize * offset + 5u] = p_5;
-
-    out[bufsize * offset + 6u] = p_6;
-    out[bufsize * offset + 7u] = p_7;
-    out[bufsize * offset + 8u] = p_8;
+    p_out[0u] = p_0;
+    p_out[1u] = p_1;
+    p_out[2u] = p_2;
+    p_out[3u] = p_3;
+    p_out[4u] = p_4;
+    p_out[5u] = p_5;
+    p_out[6u] = p_6;
+    p_out[7u] = p_7;
+    p_out[8u] = p_8;
   }
 }
+// LCOV_EXCL_STOP
 
 Matrix3<double> get(
     gpu::GPUField<double> const *pdf_field,
@@ -930,7 +1174,21 @@ Matrix3<double> get(
   kernel.addParam(dev_data_ptr);
   kernel();
   Matrix3<double> out;
-  thrust::copy(dev_data.begin(), dev_data.begin() + 9u, out.data());
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data());
+  return out;
+}
+
+std::vector<double> get(
+    gpu::GPUField<double> const *pdf_field,
+    CellInterval const &ci) {
+  thrust::device_vector<double> dev_data(9u * ci.numCells());
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel();
+  std::vector<double> out(dev_data.size());
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data());
   return out;
 }
 } // namespace PressureTensor
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh
index 6b3723ab30..6ac754f263 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh
@@ -43,18 +43,6 @@
 #include <tuple>
 #include <vector>
 
-#ifdef WALBERLA_CXX_COMPILER_IS_GNU
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#endif
-
-#ifdef WALBERLA_CXX_COMPILER_IS_CLANG
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
-#endif
-
 namespace walberla {
 namespace lbm {
 namespace accessor {
@@ -66,6 +54,11 @@ std::array<double, 19u> get(gpu::GPUField<double> const *pdf_field,
 /** @brief Set populations on a single cell. */
 void set(gpu::GPUField<double> *pdf_field, std::array<double, 19u> const &pop,
          Cell const &cell);
+/** @brief Set populations and recalculate velocities on a single cell. */
+void set(gpu::GPUField<double> *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> const *force_field,
+         std::array<double, 19u> const &pop, Cell const &cell);
 /** @brief Initialize all cells with the same value. */
 void initialize(gpu::GPUField<double> *pdf_field,
                 std::array<double, 19u> const &pop);
@@ -75,6 +68,11 @@ std::vector<double> get(gpu::GPUField<double> const *pdf_field,
 /** @brief Set populations on a cell interval. */
 void set(gpu::GPUField<double> *pdf_field, std::vector<double> const &values,
          CellInterval const &ci);
+/** @brief Set populations and recalculate velocities on a cell interval. */
+void set(gpu::GPUField<double> *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> const *force_field,
+         std::vector<double> const &values, CellInterval const &ci);
 } // namespace Population
 
 namespace Vector {
@@ -116,10 +114,32 @@ void set(gpu::GPUField<double> *pdf_field, std::vector<double> const &values,
 } // namespace Density
 
 namespace Velocity {
-void set(gpu::GPUField<double> *pdf_field, gpu::GPUField<double> *force_field,
-         Vector3<double> const &u, Cell const &cell);
+Vector3<double> get(gpu::GPUField<double> const *pdf_field,
+                    gpu::GPUField<double> const *force_field, Cell const &cell);
+std::vector<double> get(gpu::GPUField<double> const *pdf_field,
+                        gpu::GPUField<double> const *force_field,
+                        CellInterval const &ci);
+void set(gpu::GPUField<double> *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> const *force_field, Vector3<double> const &u,
+         Cell const &cell);
+void set(gpu::GPUField<double> *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> const *force_field,
+         std::vector<double> const &values, CellInterval const &ci);
 } // namespace Velocity
 
+namespace Force {
+void set(gpu::GPUField<double> const *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> *force_field, Vector3<double> const &u,
+         Cell const &cell);
+void set(gpu::GPUField<double> const *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> *force_field, std::vector<double> const &values,
+         CellInterval const &ci);
+} // namespace Force
+
 namespace DensityAndVelocity {
 std::tuple<double, Vector3<double>>
 get(gpu::GPUField<double> const *pdf_field,
@@ -141,16 +161,10 @@ Vector3<double> reduce(gpu::GPUField<double> const *pdf_field,
 
 namespace PressureTensor {
 Matrix3<double> get(gpu::GPUField<double> const *pdf_field, Cell const &cell);
+std::vector<double> get(gpu::GPUField<double> const *pdf_field,
+                        CellInterval const &ci);
 } // namespace PressureTensor
 
 } // namespace accessor
 } // namespace lbm
 } // namespace walberla
-
-#ifdef WALBERLA_CXX_COMPILER_IS_GNU
-#pragma GCC diagnostic pop
-#endif
-
-#ifdef WALBERLA_CXX_COMPILER_IS_CLANG
-#pragma clang diagnostic pop
-#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h
index 689fe936be..a7711c4f78 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h
@@ -41,19 +41,18 @@
 
 #include <array>
 #include <cassert>
+#include <iterator>
 #include <tuple>
 #include <vector>
 
 #ifdef WALBERLA_CXX_COMPILER_IS_GNU
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 #ifdef WALBERLA_CXX_COMPILER_IS_CLANG
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
 #endif
 
 namespace walberla {
@@ -61,8 +60,8 @@ namespace lbm {
 namespace accessor {
 
 namespace Population {
-inline std::array<float, 19u>
-get(GhostLayerField<float, uint_t{19u}> const *pdf_field, Cell const &cell) {
+inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                Cell const &cell) {
   float const &xyz0 = pdf_field->get(cell, uint_t{0u});
   std::array<float, 19u> pop;
   pop[0u] = pdf_field->getF(&xyz0, uint_t{0u});
@@ -111,6 +110,54 @@ inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
   pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
 }
 
+inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
+                GhostLayerField<float, uint_t{3u}> *velocity_field,
+                GhostLayerField<float, uint_t{3u}> const *force_field,
+                std::array<float, 19u> const &pop, Cell const &cell) {
+  auto &xyz0 = pdf_field->get(cell, uint_t{0u});
+  const float f_0 = pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];
+  const float f_1 = pdf_field->getF(&xyz0, uint_t{1u}) = pop[1u];
+  const float f_2 = pdf_field->getF(&xyz0, uint_t{2u}) = pop[2u];
+  const float f_3 = pdf_field->getF(&xyz0, uint_t{3u}) = pop[3u];
+  const float f_4 = pdf_field->getF(&xyz0, uint_t{4u}) = pop[4u];
+  const float f_5 = pdf_field->getF(&xyz0, uint_t{5u}) = pop[5u];
+  const float f_6 = pdf_field->getF(&xyz0, uint_t{6u}) = pop[6u];
+  const float f_7 = pdf_field->getF(&xyz0, uint_t{7u}) = pop[7u];
+  const float f_8 = pdf_field->getF(&xyz0, uint_t{8u}) = pop[8u];
+  const float f_9 = pdf_field->getF(&xyz0, uint_t{9u}) = pop[9u];
+  const float f_10 = pdf_field->getF(&xyz0, uint_t{10u}) = pop[10u];
+  const float f_11 = pdf_field->getF(&xyz0, uint_t{11u}) = pop[11u];
+  const float f_12 = pdf_field->getF(&xyz0, uint_t{12u}) = pop[12u];
+  const float f_13 = pdf_field->getF(&xyz0, uint_t{13u}) = pop[13u];
+  const float f_14 = pdf_field->getF(&xyz0, uint_t{14u}) = pop[14u];
+  const float f_15 = pdf_field->getF(&xyz0, uint_t{15u}) = pop[15u];
+  const float f_16 = pdf_field->getF(&xyz0, uint_t{16u}) = pop[16u];
+  const float f_17 = pdf_field->getF(&xyz0, uint_t{17u}) = pop[17u];
+  const float f_18 = pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
+  const auto x = cell.x();
+  const auto y = cell.y();
+  const auto z = cell.z();
+  const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+  const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+  const float vel1Term = f_1 + f_11 + f_15 + f_7;
+  const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+  const float vel2Term = f_12 + f_13 + f_5;
+  const float momdensity_2 =
+      f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+  const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                    vel1Term + vel2Term;
+  const float md_0 =
+      force_field->get(x, y, z, 0) * 0.50000000000000000f + momdensity_0;
+  const float md_1 =
+      force_field->get(x, y, z, 1) * 0.50000000000000000f + momdensity_1;
+  const float md_2 =
+      force_field->get(x, y, z, 2) * 0.50000000000000000f + momdensity_2;
+  const auto rho_inv = float{1} / rho;
+  velocity_field->get(cell, uint_t{0u}) = md_0 * rho_inv;
+  velocity_field->get(cell, uint_t{1u}) = md_1 * rho_inv;
+  velocity_field->get(cell, uint_t{2u}) = md_2 * rho_inv;
+}
+
 inline void initialize(GhostLayerField<float, uint_t{19u}> *pdf_field,
                        std::array<float, 19u> const &pop) {
   WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdf_field, {
@@ -137,9 +184,8 @@ inline void initialize(GhostLayerField<float, uint_t{19u}> *pdf_field,
   });
 }
 
-inline std::vector<float>
-get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
-    CellInterval const &ci) {
+inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                CellInterval const &ci) {
   std::vector<float> out;
   out.reserve(ci.numCells() * uint_t(19u));
   for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
@@ -174,31 +220,86 @@ get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
 inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
                 std::vector<float> const &values, CellInterval const &ci) {
   assert(uint_c(values.size()) == ci.numCells() * uint_t(19u));
-  auto values_ptr = values.data();
+  auto pop = values.data();
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        float &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];
+        pdf_field->getF(&xyz0, uint_t{1u}) = pop[1u];
+        pdf_field->getF(&xyz0, uint_t{2u}) = pop[2u];
+        pdf_field->getF(&xyz0, uint_t{3u}) = pop[3u];
+        pdf_field->getF(&xyz0, uint_t{4u}) = pop[4u];
+        pdf_field->getF(&xyz0, uint_t{5u}) = pop[5u];
+        pdf_field->getF(&xyz0, uint_t{6u}) = pop[6u];
+        pdf_field->getF(&xyz0, uint_t{7u}) = pop[7u];
+        pdf_field->getF(&xyz0, uint_t{8u}) = pop[8u];
+        pdf_field->getF(&xyz0, uint_t{9u}) = pop[9u];
+        pdf_field->getF(&xyz0, uint_t{10u}) = pop[10u];
+        pdf_field->getF(&xyz0, uint_t{11u}) = pop[11u];
+        pdf_field->getF(&xyz0, uint_t{12u}) = pop[12u];
+        pdf_field->getF(&xyz0, uint_t{13u}) = pop[13u];
+        pdf_field->getF(&xyz0, uint_t{14u}) = pop[14u];
+        pdf_field->getF(&xyz0, uint_t{15u}) = pop[15u];
+        pdf_field->getF(&xyz0, uint_t{16u}) = pop[16u];
+        pdf_field->getF(&xyz0, uint_t{17u}) = pop[17u];
+        pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
+        std::advance(pop, 19);
+      }
+    }
+  }
+}
+
+inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
+                GhostLayerField<float, uint_t{3u}> *velocity_field,
+                GhostLayerField<float, uint_t{3u}> const *force_field,
+                std::vector<float> const &values, CellInterval const &ci) {
+  assert(uint_c(values.size()) == ci.numCells() * uint_t(19u));
+  auto pop = values.data();
   for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
     for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
       for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
         float &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
-        pdf_field->getF(&xyz0, uint_t{0u}) = values_ptr[0u];
-        pdf_field->getF(&xyz0, uint_t{1u}) = values_ptr[1u];
-        pdf_field->getF(&xyz0, uint_t{2u}) = values_ptr[2u];
-        pdf_field->getF(&xyz0, uint_t{3u}) = values_ptr[3u];
-        pdf_field->getF(&xyz0, uint_t{4u}) = values_ptr[4u];
-        pdf_field->getF(&xyz0, uint_t{5u}) = values_ptr[5u];
-        pdf_field->getF(&xyz0, uint_t{6u}) = values_ptr[6u];
-        pdf_field->getF(&xyz0, uint_t{7u}) = values_ptr[7u];
-        pdf_field->getF(&xyz0, uint_t{8u}) = values_ptr[8u];
-        pdf_field->getF(&xyz0, uint_t{9u}) = values_ptr[9u];
-        pdf_field->getF(&xyz0, uint_t{10u}) = values_ptr[10u];
-        pdf_field->getF(&xyz0, uint_t{11u}) = values_ptr[11u];
-        pdf_field->getF(&xyz0, uint_t{12u}) = values_ptr[12u];
-        pdf_field->getF(&xyz0, uint_t{13u}) = values_ptr[13u];
-        pdf_field->getF(&xyz0, uint_t{14u}) = values_ptr[14u];
-        pdf_field->getF(&xyz0, uint_t{15u}) = values_ptr[15u];
-        pdf_field->getF(&xyz0, uint_t{16u}) = values_ptr[16u];
-        pdf_field->getF(&xyz0, uint_t{17u}) = values_ptr[17u];
-        pdf_field->getF(&xyz0, uint_t{18u}) = values_ptr[18u];
-        values_ptr += 19u;
+        const float f_0 = pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];
+        const float f_1 = pdf_field->getF(&xyz0, uint_t{1u}) = pop[1u];
+        const float f_2 = pdf_field->getF(&xyz0, uint_t{2u}) = pop[2u];
+        const float f_3 = pdf_field->getF(&xyz0, uint_t{3u}) = pop[3u];
+        const float f_4 = pdf_field->getF(&xyz0, uint_t{4u}) = pop[4u];
+        const float f_5 = pdf_field->getF(&xyz0, uint_t{5u}) = pop[5u];
+        const float f_6 = pdf_field->getF(&xyz0, uint_t{6u}) = pop[6u];
+        const float f_7 = pdf_field->getF(&xyz0, uint_t{7u}) = pop[7u];
+        const float f_8 = pdf_field->getF(&xyz0, uint_t{8u}) = pop[8u];
+        const float f_9 = pdf_field->getF(&xyz0, uint_t{9u}) = pop[9u];
+        const float f_10 = pdf_field->getF(&xyz0, uint_t{10u}) = pop[10u];
+        const float f_11 = pdf_field->getF(&xyz0, uint_t{11u}) = pop[11u];
+        const float f_12 = pdf_field->getF(&xyz0, uint_t{12u}) = pop[12u];
+        const float f_13 = pdf_field->getF(&xyz0, uint_t{13u}) = pop[13u];
+        const float f_14 = pdf_field->getF(&xyz0, uint_t{14u}) = pop[14u];
+        const float f_15 = pdf_field->getF(&xyz0, uint_t{15u}) = pop[15u];
+        const float f_16 = pdf_field->getF(&xyz0, uint_t{16u}) = pop[16u];
+        const float f_17 = pdf_field->getF(&xyz0, uint_t{17u}) = pop[17u];
+        const float f_18 = pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
+        const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+        const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+        const float vel1Term = f_1 + f_11 + f_15 + f_7;
+        const float momdensity_1 =
+            -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+        const float vel2Term = f_12 + f_13 + f_5;
+        const float momdensity_2 =
+            f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+        const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                          vel1Term + vel2Term;
+        const float md_0 =
+            force_field->get(x, y, z, 0) * 0.50000000000000000f + momdensity_0;
+        const float md_1 =
+            force_field->get(x, y, z, 1) * 0.50000000000000000f + momdensity_1;
+        const float md_2 =
+            force_field->get(x, y, z, 2) * 0.50000000000000000f + momdensity_2;
+        const auto rho_inv = float{1} / rho;
+        velocity_field->get(x, y, z, uint_t{0u}) = md_0 * rho_inv;
+        velocity_field->get(x, y, z, uint_t{1u}) = md_1 * rho_inv;
+        velocity_field->get(x, y, z, uint_t{2u}) = md_2 * rho_inv;
+        std::advance(pop, 19);
       }
     }
   }
@@ -206,8 +307,8 @@ inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
 } // namespace Population
 
 namespace Vector {
-inline Vector3<float> get(GhostLayerField<float, uint_t{3u}> const *vec_field,
-                          Cell const &cell) {
+inline auto get(GhostLayerField<float, uint_t{3u}> const *vec_field,
+                Cell const &cell) {
   const float &xyz0 = vec_field->get(cell, uint_t{0u});
   Vector3<float> vec;
   vec[0] = vec_field->getF(&xyz0, uint_t{0u});
@@ -252,9 +353,8 @@ inline void add_to_all(GhostLayerField<float, uint_t{3u}> *vec_field,
   });
 }
 
-inline std::vector<float>
-get(GhostLayerField<float, uint_t{3u}> const *vec_field,
-    CellInterval const &ci) {
+inline auto get(GhostLayerField<float, uint_t{3u}> const *vec_field,
+                CellInterval const &ci) {
   std::vector<float> out;
   out.reserve(ci.numCells() * uint_t(3u));
   for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
@@ -281,7 +381,7 @@ inline void set(GhostLayerField<float, uint_t{3u}> *vec_field,
         vec_field->getF(&xyz0, uint_t{0u}) = values_ptr[0u];
         vec_field->getF(&xyz0, uint_t{1u}) = values_ptr[1u];
         vec_field->getF(&xyz0, uint_t{2u}) = values_ptr[2u];
-        values_ptr += 3u;
+        std::advance(values_ptr, 3);
       }
     }
   }
@@ -290,8 +390,8 @@ inline void set(GhostLayerField<float, uint_t{3u}> *vec_field,
 
 namespace EquilibriumDistribution {
 inline float get(stencil::Direction const direction,
-                 Vector3<float> const &u = Vector3<float>(float(0.0)),
-                 float rho = float(1.0)) {
+                 Vector3<float> const &u = Vector3<float>(float{0}),
+                 float rho = float{1}) {
 
   using namespace stencil;
   switch (direction) {
@@ -571,7 +671,7 @@ inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
                     vel1Term + vel2Term;
 
   // calculate current velocity (before density change)
-  const float conversion = float(1) / rho;
+  const float conversion = float{1} / rho;
   Vector3<float> velocity;
   velocity[0u] = momdensity_0 * conversion;
   velocity[1u] = momdensity_1 * conversion;
@@ -659,7 +759,7 @@ inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
                           vel1Term + vel2Term;
 
         // calculate current velocity (before density change)
-        const float conversion = float(1) / rho;
+        const float conversion = float{1} / rho;
         Vector3<float> velocity;
         velocity[0u] = momdensity_0 * conversion;
         velocity[1u] = momdensity_1 * conversion;
@@ -674,7 +774,108 @@ inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
 } // namespace Density
 
 namespace Velocity {
+inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                GhostLayerField<float, uint_t{3u}> const *force_field,
+                Cell const &cell) {
+  const float &xyz0 = pdf_field->get(cell, uint_t{0u});
+  const float f_0 = pdf_field->getF(&xyz0, uint_t{0u});
+  const float f_1 = pdf_field->getF(&xyz0, uint_t{1u});
+  const float f_2 = pdf_field->getF(&xyz0, uint_t{2u});
+  const float f_3 = pdf_field->getF(&xyz0, uint_t{3u});
+  const float f_4 = pdf_field->getF(&xyz0, uint_t{4u});
+  const float f_5 = pdf_field->getF(&xyz0, uint_t{5u});
+  const float f_6 = pdf_field->getF(&xyz0, uint_t{6u});
+  const float f_7 = pdf_field->getF(&xyz0, uint_t{7u});
+  const float f_8 = pdf_field->getF(&xyz0, uint_t{8u});
+  const float f_9 = pdf_field->getF(&xyz0, uint_t{9u});
+  const float f_10 = pdf_field->getF(&xyz0, uint_t{10u});
+  const float f_11 = pdf_field->getF(&xyz0, uint_t{11u});
+  const float f_12 = pdf_field->getF(&xyz0, uint_t{12u});
+  const float f_13 = pdf_field->getF(&xyz0, uint_t{13u});
+  const float f_14 = pdf_field->getF(&xyz0, uint_t{14u});
+  const float f_15 = pdf_field->getF(&xyz0, uint_t{15u});
+  const float f_16 = pdf_field->getF(&xyz0, uint_t{16u});
+  const float f_17 = pdf_field->getF(&xyz0, uint_t{17u});
+  const float f_18 = pdf_field->getF(&xyz0, uint_t{18u});
+  const auto x = cell.x();
+  const auto y = cell.y();
+  const auto z = cell.z();
+  const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+  const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+  const float vel1Term = f_1 + f_11 + f_15 + f_7;
+  const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+  const float vel2Term = f_12 + f_13 + f_5;
+  const float momdensity_2 =
+      f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+  const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                    vel1Term + vel2Term;
+  const float md_0 =
+      force_field->get(x, y, z, 0) * 0.50000000000000000f + momdensity_0;
+  const float md_1 =
+      force_field->get(x, y, z, 1) * 0.50000000000000000f + momdensity_1;
+  const float md_2 =
+      force_field->get(x, y, z, 2) * 0.50000000000000000f + momdensity_2;
+  const float rho_inv = float{1} / rho;
+
+  return Vector3<float>(md_0 * rho_inv, md_1 * rho_inv, md_2 * rho_inv);
+}
+
+inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                GhostLayerField<float, uint_t{3u}> const *force_field,
+                CellInterval const &ci) {
+  std::vector<float> out;
+  out.reserve(ci.numCells() * uint_t(3u));
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        const float &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        const float f_0 = pdf_field->getF(&xyz0, uint_t{0u});
+        const float f_1 = pdf_field->getF(&xyz0, uint_t{1u});
+        const float f_2 = pdf_field->getF(&xyz0, uint_t{2u});
+        const float f_3 = pdf_field->getF(&xyz0, uint_t{3u});
+        const float f_4 = pdf_field->getF(&xyz0, uint_t{4u});
+        const float f_5 = pdf_field->getF(&xyz0, uint_t{5u});
+        const float f_6 = pdf_field->getF(&xyz0, uint_t{6u});
+        const float f_7 = pdf_field->getF(&xyz0, uint_t{7u});
+        const float f_8 = pdf_field->getF(&xyz0, uint_t{8u});
+        const float f_9 = pdf_field->getF(&xyz0, uint_t{9u});
+        const float f_10 = pdf_field->getF(&xyz0, uint_t{10u});
+        const float f_11 = pdf_field->getF(&xyz0, uint_t{11u});
+        const float f_12 = pdf_field->getF(&xyz0, uint_t{12u});
+        const float f_13 = pdf_field->getF(&xyz0, uint_t{13u});
+        const float f_14 = pdf_field->getF(&xyz0, uint_t{14u});
+        const float f_15 = pdf_field->getF(&xyz0, uint_t{15u});
+        const float f_16 = pdf_field->getF(&xyz0, uint_t{16u});
+        const float f_17 = pdf_field->getF(&xyz0, uint_t{17u});
+        const float f_18 = pdf_field->getF(&xyz0, uint_t{18u});
+        const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+        const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+        const float vel1Term = f_1 + f_11 + f_15 + f_7;
+        const float momdensity_1 =
+            -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+        const float vel2Term = f_12 + f_13 + f_5;
+        const float momdensity_2 =
+            f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+        const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                          vel1Term + vel2Term;
+        const float md_0 =
+            force_field->get(x, y, z, 0) * 0.50000000000000000f + momdensity_0;
+        const float md_1 =
+            force_field->get(x, y, z, 1) * 0.50000000000000000f + momdensity_1;
+        const float md_2 =
+            force_field->get(x, y, z, 2) * 0.50000000000000000f + momdensity_2;
+        const float rho_inv = float{1} / rho;
+        out.emplace_back(md_0 * rho_inv);
+        out.emplace_back(md_1 * rho_inv);
+        out.emplace_back(md_2 * rho_inv);
+      }
+    }
+  }
+  return out;
+}
+
 inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
+                GhostLayerField<float, uint_t{3u}> *velocity_field,
                 GhostLayerField<float, uint_t{3u}> const *force_field,
                 Vector3<float> const &u, Cell const &cell) {
   const float &xyz0 = pdf_field->get(cell, uint_t{0u});
@@ -712,15 +913,183 @@ inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
       -force_field->get(x, y, z, 1) * 0.50000000000000000f / rho + u[1];
   const float u_2 =
       -force_field->get(x, y, z, 2) * 0.50000000000000000f / rho + u[2];
+  velocity_field->get(x, y, z, uint_t{0u}) = u[0u];
+  velocity_field->get(x, y, z, uint_t{1u}) = u[1u];
+  velocity_field->get(x, y, z, uint_t{2u}) = u[2u];
 
   Equilibrium::set(pdf_field, Vector3<float>(u_0, u_1, u_2), rho, cell);
 }
+
+inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
+                GhostLayerField<float, uint_t{3u}> *velocity_field,
+                GhostLayerField<float, uint_t{3u}> const *force_field,
+                std::vector<float> const &values, CellInterval const &ci) {
+  assert(uint_c(values.size()) == ci.numCells() * uint_t(3u));
+  auto u = values.data();
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        float &pdf_xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        float &vel_xyz0 = velocity_field->get(x, y, z, uint_t{0u});
+        const float f_0 = pdf_field->getF(&pdf_xyz0, uint_t{0u});
+        const float f_1 = pdf_field->getF(&pdf_xyz0, uint_t{1u});
+        const float f_2 = pdf_field->getF(&pdf_xyz0, uint_t{2u});
+        const float f_3 = pdf_field->getF(&pdf_xyz0, uint_t{3u});
+        const float f_4 = pdf_field->getF(&pdf_xyz0, uint_t{4u});
+        const float f_5 = pdf_field->getF(&pdf_xyz0, uint_t{5u});
+        const float f_6 = pdf_field->getF(&pdf_xyz0, uint_t{6u});
+        const float f_7 = pdf_field->getF(&pdf_xyz0, uint_t{7u});
+        const float f_8 = pdf_field->getF(&pdf_xyz0, uint_t{8u});
+        const float f_9 = pdf_field->getF(&pdf_xyz0, uint_t{9u});
+        const float f_10 = pdf_field->getF(&pdf_xyz0, uint_t{10u});
+        const float f_11 = pdf_field->getF(&pdf_xyz0, uint_t{11u});
+        const float f_12 = pdf_field->getF(&pdf_xyz0, uint_t{12u});
+        const float f_13 = pdf_field->getF(&pdf_xyz0, uint_t{13u});
+        const float f_14 = pdf_field->getF(&pdf_xyz0, uint_t{14u});
+        const float f_15 = pdf_field->getF(&pdf_xyz0, uint_t{15u});
+        const float f_16 = pdf_field->getF(&pdf_xyz0, uint_t{16u});
+        const float f_17 = pdf_field->getF(&pdf_xyz0, uint_t{17u});
+        const float f_18 = pdf_field->getF(&pdf_xyz0, uint_t{18u});
+        const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+        const float vel1Term = f_1 + f_11 + f_15 + f_7;
+        const float vel2Term = f_12 + f_13 + f_5;
+        const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                          vel1Term + vel2Term;
+
+        const float u_0 =
+            -force_field->get(x, y, z, 0) * 0.50000000000000000f / rho + u[0];
+        const float u_1 =
+            -force_field->get(x, y, z, 1) * 0.50000000000000000f / rho + u[1];
+        const float u_2 =
+            -force_field->get(x, y, z, 2) * 0.50000000000000000f / rho + u[2];
+        velocity_field->getF(&vel_xyz0, uint_t{0u}) = u[0u];
+        velocity_field->getF(&vel_xyz0, uint_t{1u}) = u[1u];
+        velocity_field->getF(&vel_xyz0, uint_t{2u}) = u[2u];
+
+        std::advance(u, 3);
+
+        Equilibrium::set(pdf_field, Vector3<float>(u_0, u_1, u_2), rho,
+                         Cell{x, y, z});
+      }
+    }
+  }
+}
 } // namespace Velocity
 
+namespace Force {
+inline void set(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                GhostLayerField<float, uint_t{3u}> *velocity_field,
+                GhostLayerField<float, uint_t{3u}> *force_field,
+                Vector3<float> const &force, Cell const &cell) {
+  float const &pdf_xyz0 = pdf_field->get(cell, uint_t{0u});
+  float &vel_xyz0 = velocity_field->get(cell, uint_t{0u});
+  float &laf_xyz0 = force_field->get(cell, uint_t{0u});
+  const float f_0 = pdf_field->getF(&pdf_xyz0, uint_t{0u});
+  const float f_1 = pdf_field->getF(&pdf_xyz0, uint_t{1u});
+  const float f_2 = pdf_field->getF(&pdf_xyz0, uint_t{2u});
+  const float f_3 = pdf_field->getF(&pdf_xyz0, uint_t{3u});
+  const float f_4 = pdf_field->getF(&pdf_xyz0, uint_t{4u});
+  const float f_5 = pdf_field->getF(&pdf_xyz0, uint_t{5u});
+  const float f_6 = pdf_field->getF(&pdf_xyz0, uint_t{6u});
+  const float f_7 = pdf_field->getF(&pdf_xyz0, uint_t{7u});
+  const float f_8 = pdf_field->getF(&pdf_xyz0, uint_t{8u});
+  const float f_9 = pdf_field->getF(&pdf_xyz0, uint_t{9u});
+  const float f_10 = pdf_field->getF(&pdf_xyz0, uint_t{10u});
+  const float f_11 = pdf_field->getF(&pdf_xyz0, uint_t{11u});
+  const float f_12 = pdf_field->getF(&pdf_xyz0, uint_t{12u});
+  const float f_13 = pdf_field->getF(&pdf_xyz0, uint_t{13u});
+  const float f_14 = pdf_field->getF(&pdf_xyz0, uint_t{14u});
+  const float f_15 = pdf_field->getF(&pdf_xyz0, uint_t{15u});
+  const float f_16 = pdf_field->getF(&pdf_xyz0, uint_t{16u});
+  const float f_17 = pdf_field->getF(&pdf_xyz0, uint_t{17u});
+  const float f_18 = pdf_field->getF(&pdf_xyz0, uint_t{18u});
+  const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+  const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+  const float vel1Term = f_1 + f_11 + f_15 + f_7;
+  const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+  const float vel2Term = f_12 + f_13 + f_5;
+  const float momdensity_2 =
+      f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+  const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                    vel1Term + vel2Term;
+  const float md_0 = force[0u] * 0.50000000000000000f + momdensity_0;
+  const float md_1 = force[1u] * 0.50000000000000000f + momdensity_1;
+  const float md_2 = force[2u] * 0.50000000000000000f + momdensity_2;
+  auto const rho_inv = float{1} / rho;
+
+  force_field->getF(&laf_xyz0, uint_t{0u}) = force[0u];
+  force_field->getF(&laf_xyz0, uint_t{1u}) = force[1u];
+  force_field->getF(&laf_xyz0, uint_t{2u}) = force[2u];
+
+  velocity_field->getF(&vel_xyz0, uint_t{0u}) = md_0 * rho_inv;
+  velocity_field->getF(&vel_xyz0, uint_t{1u}) = md_1 * rho_inv;
+  velocity_field->getF(&vel_xyz0, uint_t{2u}) = md_2 * rho_inv;
+}
+
+inline void set(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                GhostLayerField<float, uint_t{3u}> *velocity_field,
+                GhostLayerField<float, uint_t{3u}> *force_field,
+                std::vector<float> const &values, CellInterval const &ci) {
+  assert(uint_c(values.size()) == ci.numCells() * uint_t(3u));
+  auto force = values.data();
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        float const &pdf_xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        float &vel_xyz0 = velocity_field->get(x, y, z, uint_t{0u});
+        float &laf_xyz0 = force_field->get(x, y, z, uint_t{0u});
+        const float f_0 = pdf_field->getF(&pdf_xyz0, uint_t{0u});
+        const float f_1 = pdf_field->getF(&pdf_xyz0, uint_t{1u});
+        const float f_2 = pdf_field->getF(&pdf_xyz0, uint_t{2u});
+        const float f_3 = pdf_field->getF(&pdf_xyz0, uint_t{3u});
+        const float f_4 = pdf_field->getF(&pdf_xyz0, uint_t{4u});
+        const float f_5 = pdf_field->getF(&pdf_xyz0, uint_t{5u});
+        const float f_6 = pdf_field->getF(&pdf_xyz0, uint_t{6u});
+        const float f_7 = pdf_field->getF(&pdf_xyz0, uint_t{7u});
+        const float f_8 = pdf_field->getF(&pdf_xyz0, uint_t{8u});
+        const float f_9 = pdf_field->getF(&pdf_xyz0, uint_t{9u});
+        const float f_10 = pdf_field->getF(&pdf_xyz0, uint_t{10u});
+        const float f_11 = pdf_field->getF(&pdf_xyz0, uint_t{11u});
+        const float f_12 = pdf_field->getF(&pdf_xyz0, uint_t{12u});
+        const float f_13 = pdf_field->getF(&pdf_xyz0, uint_t{13u});
+        const float f_14 = pdf_field->getF(&pdf_xyz0, uint_t{14u});
+        const float f_15 = pdf_field->getF(&pdf_xyz0, uint_t{15u});
+        const float f_16 = pdf_field->getF(&pdf_xyz0, uint_t{16u});
+        const float f_17 = pdf_field->getF(&pdf_xyz0, uint_t{17u});
+        const float f_18 = pdf_field->getF(&pdf_xyz0, uint_t{18u});
+        const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+        const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+        const float vel1Term = f_1 + f_11 + f_15 + f_7;
+        const float momdensity_1 =
+            -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+        const float vel2Term = f_12 + f_13 + f_5;
+        const float momdensity_2 =
+            f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+        const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                          vel1Term + vel2Term;
+        const float md_0 = force[0u] * 0.50000000000000000f + momdensity_0;
+        const float md_1 = force[1u] * 0.50000000000000000f + momdensity_1;
+        const float md_2 = force[2u] * 0.50000000000000000f + momdensity_2;
+        auto const rho_inv = float{1} / rho;
+
+        force_field->getF(&laf_xyz0, uint_t{0u}) = force[0u];
+        force_field->getF(&laf_xyz0, uint_t{1u}) = force[1u];
+        force_field->getF(&laf_xyz0, uint_t{2u}) = force[2u];
+
+        velocity_field->getF(&vel_xyz0, uint_t{0u}) = md_0 * rho_inv;
+        velocity_field->getF(&vel_xyz0, uint_t{1u}) = md_1 * rho_inv;
+        velocity_field->getF(&vel_xyz0, uint_t{2u}) = md_2 * rho_inv;
+
+        std::advance(force, 3);
+      }
+    }
+  }
+}
+} // namespace Force
+
 namespace MomentumDensity {
-inline Vector3<float>
-reduce(GhostLayerField<float, uint_t{19u}> const *pdf_field,
-       GhostLayerField<float, uint_t{3u}> const *force_field) {
+inline auto reduce(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                   GhostLayerField<float, uint_t{3u}> const *force_field) {
   Vector3<float> momentumDensity(float{0});
   WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
     const float &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
@@ -768,8 +1137,8 @@ reduce(GhostLayerField<float, uint_t{19u}> const *pdf_field,
 } // namespace MomentumDensity
 
 namespace PressureTensor {
-inline Matrix3<float> get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
-                          Cell const &cell) {
+inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                Cell const &cell) {
   const float &xyz0 = pdf_field->get(cell, uint_t{0u});
   const float f_0 = pdf_field->getF(&xyz0, uint_t{0u});
   const float f_1 = pdf_field->getF(&xyz0, uint_t{1u});
@@ -818,6 +1187,63 @@ inline Matrix3<float> get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
 
   return pressureTensor;
 }
+
+inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                CellInterval const &ci) {
+  std::vector<float> out;
+  out.reserve(ci.numCells() * uint_t(9u));
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        const float &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        const float f_0 = pdf_field->getF(&xyz0, uint_t{0u});
+        const float f_1 = pdf_field->getF(&xyz0, uint_t{1u});
+        const float f_2 = pdf_field->getF(&xyz0, uint_t{2u});
+        const float f_3 = pdf_field->getF(&xyz0, uint_t{3u});
+        const float f_4 = pdf_field->getF(&xyz0, uint_t{4u});
+        const float f_5 = pdf_field->getF(&xyz0, uint_t{5u});
+        const float f_6 = pdf_field->getF(&xyz0, uint_t{6u});
+        const float f_7 = pdf_field->getF(&xyz0, uint_t{7u});
+        const float f_8 = pdf_field->getF(&xyz0, uint_t{8u});
+        const float f_9 = pdf_field->getF(&xyz0, uint_t{9u});
+        const float f_10 = pdf_field->getF(&xyz0, uint_t{10u});
+        const float f_11 = pdf_field->getF(&xyz0, uint_t{11u});
+        const float f_12 = pdf_field->getF(&xyz0, uint_t{12u});
+        const float f_13 = pdf_field->getF(&xyz0, uint_t{13u});
+        const float f_14 = pdf_field->getF(&xyz0, uint_t{14u});
+        const float f_15 = pdf_field->getF(&xyz0, uint_t{15u});
+        const float f_16 = pdf_field->getF(&xyz0, uint_t{16u});
+        const float f_17 = pdf_field->getF(&xyz0, uint_t{17u});
+        const float f_18 = pdf_field->getF(&xyz0, uint_t{18u});
+        const float p_0 =
+            f_10 + f_13 + f_14 + f_17 + f_18 + f_3 + f_4 + f_7 + f_8 + f_9;
+        const float p_1 = -f_10 - f_7 + f_8 + f_9;
+        const float p_2 = -f_13 + f_14 + f_17 - f_18;
+        const float p_3 = -f_10 - f_7 + f_8 + f_9;
+        const float p_4 =
+            f_1 + f_10 + f_11 + f_12 + f_15 + f_16 + f_2 + f_7 + f_8 + f_9;
+        const float p_5 = f_11 - f_12 - f_15 + f_16;
+        const float p_6 = -f_13 + f_14 + f_17 - f_18;
+        const float p_7 = f_11 - f_12 - f_15 + f_16;
+        const float p_8 =
+            f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6;
+
+        out.emplace_back(p_0);
+        out.emplace_back(p_1);
+        out.emplace_back(p_2);
+
+        out.emplace_back(p_3);
+        out.emplace_back(p_4);
+        out.emplace_back(p_5);
+
+        out.emplace_back(p_6);
+        out.emplace_back(p_7);
+        out.emplace_back(p_8);
+      }
+    }
+  }
+  return out;
+}
 } // namespace PressureTensor
 
 } // namespace accessor
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cu
index 58c74fca3a..e0c8dd8102 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cu
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cu
@@ -48,13 +48,8 @@
 
 #if defined(__NVCC__)
 #define RESTRICT __restrict__
-#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
 #pragma nv_diagnostic push
 #pragma nv_diag_suppress 177 // unused variable
-#else
-#pragma push
-#pragma diag_suppress 177 // unused variable
-#endif
 #elif defined(__clang__)
 #if defined(__CUDA__)
 #if defined(__CUDA_ARCH__)
@@ -62,20 +57,17 @@
 #define RESTRICT __restrict__
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
 #else
 // clang compiling CUDA code in host mode
 #define RESTRICT __restrict__
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
 #endif
 #endif
 #elif defined(__GNUC__) or defined(__GNUG__)
 #define RESTRICT __restrict__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
 #elif defined(_MSC_VER)
 #define RESTRICT __restrict
 #else
@@ -102,38 +94,13 @@ namespace lbm {
 namespace accessor {
 
 namespace Population {
-__global__ void kernel_get_interval(
-    gpu::FieldAccessor<float> pdf,
-    float *RESTRICT const pop) {
-  pdf.set(blockIdx, threadIdx);
-  if (pdf.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
-    pop[offset + 0u] = pdf.get(0u);
-    pop[offset + 1u] = pdf.get(1u);
-    pop[offset + 2u] = pdf.get(2u);
-    pop[offset + 3u] = pdf.get(3u);
-    pop[offset + 4u] = pdf.get(4u);
-    pop[offset + 5u] = pdf.get(5u);
-    pop[offset + 6u] = pdf.get(6u);
-    pop[offset + 7u] = pdf.get(7u);
-    pop[offset + 8u] = pdf.get(8u);
-    pop[offset + 9u] = pdf.get(9u);
-    pop[offset + 10u] = pdf.get(10u);
-    pop[offset + 11u] = pdf.get(11u);
-    pop[offset + 12u] = pdf.get(12u);
-    pop[offset + 13u] = pdf.get(13u);
-    pop[offset + 14u] = pdf.get(14u);
-    pop[offset + 15u] = pdf.get(15u);
-    pop[offset + 16u] = pdf.get(16u);
-    pop[offset + 17u] = pdf.get(17u);
-    pop[offset + 18u] = pdf.get(18u);
-  }
-}
-
+// LCOV_EXCL_START
 __global__ void kernel_get(
     gpu::FieldAccessor<float> pdf,
-    float *RESTRICT const pop) {
+    float *RESTRICT pop) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
   pdf.set(blockIdx, threadIdx);
+  pop += offset;
   if (pdf.isValidPosition()) {
     pop[0u] = pdf.get(0u);
     pop[1u] = pdf.get(1u);
@@ -157,37 +124,38 @@ __global__ void kernel_get(
   }
 }
 
-__global__ void kernel_set_interval(
+__global__ void kernel_set(
     gpu::FieldAccessor<float> pdf,
-    float const *RESTRICT const pop) {
+    float const *RESTRICT pop) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
   pdf.set(blockIdx, threadIdx);
+  pop += offset;
   if (pdf.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
-    pdf.get(0u) = pop[offset + 0u];
-    pdf.get(1u) = pop[offset + 1u];
-    pdf.get(2u) = pop[offset + 2u];
-    pdf.get(3u) = pop[offset + 3u];
-    pdf.get(4u) = pop[offset + 4u];
-    pdf.get(5u) = pop[offset + 5u];
-    pdf.get(6u) = pop[offset + 6u];
-    pdf.get(7u) = pop[offset + 7u];
-    pdf.get(8u) = pop[offset + 8u];
-    pdf.get(9u) = pop[offset + 9u];
-    pdf.get(10u) = pop[offset + 10u];
-    pdf.get(11u) = pop[offset + 11u];
-    pdf.get(12u) = pop[offset + 12u];
-    pdf.get(13u) = pop[offset + 13u];
-    pdf.get(14u) = pop[offset + 14u];
-    pdf.get(15u) = pop[offset + 15u];
-    pdf.get(16u) = pop[offset + 16u];
-    pdf.get(17u) = pop[offset + 17u];
-    pdf.get(18u) = pop[offset + 18u];
+    pdf.get(0u) = pop[0u];
+    pdf.get(1u) = pop[1u];
+    pdf.get(2u) = pop[2u];
+    pdf.get(3u) = pop[3u];
+    pdf.get(4u) = pop[4u];
+    pdf.get(5u) = pop[5u];
+    pdf.get(6u) = pop[6u];
+    pdf.get(7u) = pop[7u];
+    pdf.get(8u) = pop[8u];
+    pdf.get(9u) = pop[9u];
+    pdf.get(10u) = pop[10u];
+    pdf.get(11u) = pop[11u];
+    pdf.get(12u) = pop[12u];
+    pdf.get(13u) = pop[13u];
+    pdf.get(14u) = pop[14u];
+    pdf.get(15u) = pop[15u];
+    pdf.get(16u) = pop[16u];
+    pdf.get(17u) = pop[17u];
+    pdf.get(18u) = pop[18u];
   }
 }
 
-__global__ void kernel_set(
+__global__ void kernel_broadcast(
     gpu::FieldAccessor<float> pdf,
-    float const *RESTRICT const pop) {
+    float const *RESTRICT pop) {
   pdf.set(blockIdx, threadIdx);
   if (pdf.isValidPosition()) {
     pdf.get(0u) = pop[0u];
@@ -212,11 +180,59 @@ __global__ void kernel_set(
   }
 }
 
+__global__ void kernel_set_vel(
+    gpu::FieldAccessor<float> pdf,
+    gpu::FieldAccessor<float> velocity,
+    gpu::FieldAccessor<float> force,
+    float const *RESTRICT pop) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
+  pdf.set(blockIdx, threadIdx);
+  velocity.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  pop += offset;
+  if (pdf.isValidPosition()) {
+    const float f_0 = pdf.get(0u) = pop[0u];
+    const float f_1 = pdf.get(1u) = pop[1u];
+    const float f_2 = pdf.get(2u) = pop[2u];
+    const float f_3 = pdf.get(3u) = pop[3u];
+    const float f_4 = pdf.get(4u) = pop[4u];
+    const float f_5 = pdf.get(5u) = pop[5u];
+    const float f_6 = pdf.get(6u) = pop[6u];
+    const float f_7 = pdf.get(7u) = pop[7u];
+    const float f_8 = pdf.get(8u) = pop[8u];
+    const float f_9 = pdf.get(9u) = pop[9u];
+    const float f_10 = pdf.get(10u) = pop[10u];
+    const float f_11 = pdf.get(11u) = pop[11u];
+    const float f_12 = pdf.get(12u) = pop[12u];
+    const float f_13 = pdf.get(13u) = pop[13u];
+    const float f_14 = pdf.get(14u) = pop[14u];
+    const float f_15 = pdf.get(15u) = pop[15u];
+    const float f_16 = pdf.get(16u) = pop[16u];
+    const float f_17 = pdf.get(17u) = pop[17u];
+    const float f_18 = pdf.get(18u) = pop[18u];
+    const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+    const float vel1Term = f_1 + f_11 + f_15 + f_7;
+    const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+    const float vel2Term = f_12 + f_13 + f_5;
+    const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+    const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
+    const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
+    const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
+    const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
+    const float rho_inv = float{1} / rho;
+    velocity.get(0u) = md_0 * rho_inv;
+    velocity.get(1u) = md_1 * rho_inv;
+    velocity.get(2u) = md_2 * rho_inv;
+  }
+}
+// LCOV_EXCL_STOP
+
 std::array<float, 19u> get(
     gpu::GPUField<float> const *pdf_field,
     Cell const &cell) {
   CellInterval ci(cell, cell);
-  thrust::device_vector<float> dev_data(19u, float{0});
+  thrust::device_vector<float> dev_data(19u);
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
   auto kernel = gpu::make_kernel(kernel_get);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
@@ -231,7 +247,7 @@ void set(
     gpu::GPUField<float> *pdf_field,
     std::array<float, 19u> const &pop,
     Cell const &cell) {
-  thrust::device_vector<float> dev_data(pop.data(), pop.data() + 19u);
+  thrust::device_vector<float> dev_data(pop.begin(), pop.end());
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
   CellInterval ci(cell, cell);
   auto kernel = gpu::make_kernel(kernel_set);
@@ -240,13 +256,30 @@ void set(
   kernel();
 }
 
+void set(
+    gpu::GPUField<float> *pdf_field,
+    gpu::GPUField<float> *velocity_field,
+    gpu::GPUField<float> const *force_field,
+    std::array<float, 19u> const &pop,
+    Cell const &cell) {
+  thrust::device_vector<float> dev_data(pop.begin(), pop.end());
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  CellInterval ci(cell, cell);
+  auto kernel = gpu::make_kernel(kernel_set_vel);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel();
+}
+
 void initialize(
     gpu::GPUField<float> *pdf_field,
     std::array<float, 19u> const &pop) {
   CellInterval ci = pdf_field->xyzSizeWithGhostLayer();
-  thrust::device_vector<float> dev_data(pop.data(), pop.data() + 19u);
+  thrust::device_vector<float> dev_data(pop.begin(), pop.end());
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_set);
+  auto kernel = gpu::make_kernel(kernel_broadcast);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
   kernel.addParam(const_cast<const float *>(dev_data_ptr));
   kernel();
@@ -257,7 +290,7 @@ std::vector<float> get(
     CellInterval const &ci) {
   thrust::device_vector<float> dev_data(ci.numCells() * 19u);
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_get_interval);
+  auto kernel = gpu::make_kernel(kernel_get);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
   kernel.addParam(dev_data_ptr);
   kernel();
@@ -272,82 +305,92 @@ void set(
     CellInterval const &ci) {
   thrust::device_vector<float> dev_data(values.begin(), values.end());
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_set_interval);
+  auto kernel = gpu::make_kernel(kernel_set);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
   kernel.addParam(const_cast<const float *>(dev_data_ptr));
   kernel();
 }
-} // namespace Population
 
-namespace Vector {
-__global__ void kernel_get_interval(
-    gpu::FieldAccessor<float> vec,
-    float *const out) {
-  vec.set(blockIdx, threadIdx);
-  if (vec.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
-    out[offset + 0u] = vec.get(0u);
-    out[offset + 1u] = vec.get(1u);
-    out[offset + 2u] = vec.get(2u);
-  }
+void set(
+    gpu::GPUField<float> *pdf_field,
+    gpu::GPUField<float> *velocity_field,
+    gpu::GPUField<float> const *force_field,
+    std::vector<float> const &values,
+    CellInterval const &ci) {
+  thrust::device_vector<float> dev_data(values.begin(), values.end());
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set_vel);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel();
 }
+} // namespace Population
 
+namespace Vector {
+// LCOV_EXCL_START
 __global__ void kernel_get(
     gpu::FieldAccessor<float> vec,
-    float *const out) {
+    float *u_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
   vec.set(blockIdx, threadIdx);
+  u_out += offset;
   if (vec.isValidPosition()) {
-    out[0u] = vec.get(0u);
-    out[1u] = vec.get(1u);
-    out[2u] = vec.get(2u);
+    u_out[0u] = vec.get(0u);
+    u_out[1u] = vec.get(1u);
+    u_out[2u] = vec.get(2u);
   }
 }
 
-__global__ void kernel_set_interval(
+__global__ void kernel_set(
     gpu::FieldAccessor<float> vec,
-    float const *RESTRICT const u) {
+    float const *RESTRICT u_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
   vec.set(blockIdx, threadIdx);
+  u_in += offset;
   if (vec.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
-    vec.get(0u) = u[offset + 0u];
-    vec.get(1u) = u[offset + 1u];
-    vec.get(2u) = u[offset + 2u];
+    vec.get(0u) = u_in[0u];
+    vec.get(1u) = u_in[1u];
+    vec.get(2u) = u_in[2u];
   }
 }
 
-__global__ void kernel_set(
+__global__ void kernel_broadcast(
     gpu::FieldAccessor<float> vec,
-    const float *RESTRICT const u) {
+    float const *RESTRICT u_in) {
   vec.set(blockIdx, threadIdx);
   if (vec.isValidPosition()) {
-    vec.get(0u) = u[0u];
-    vec.get(1u) = u[1u];
-    vec.get(2u) = u[2u];
+    vec.get(0u) = u_in[0u];
+    vec.get(1u) = u_in[1u];
+    vec.get(2u) = u_in[2u];
   }
 }
 
-__global__ void kernel_add_interval(
+__global__ void kernel_add(
     gpu::FieldAccessor<float> vec,
-    float const *RESTRICT const u) {
+    float const *RESTRICT u_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
   vec.set(blockIdx, threadIdx);
+  u_in += offset;
   if (vec.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
-    vec.get(0u) += u[offset + 0u];
-    vec.get(1u) += u[offset + 1u];
-    vec.get(2u) += u[offset + 2u];
+    vec.get(0u) += u_in[0u];
+    vec.get(1u) += u_in[1u];
+    vec.get(2u) += u_in[2u];
   }
 }
 
-__global__ void kernel_add(
+__global__ void kernel_broadcast_add(
     gpu::FieldAccessor<float> vec,
-    float const *RESTRICT const u) {
+    float const *RESTRICT u_in) {
   vec.set(blockIdx, threadIdx);
   if (vec.isValidPosition()) {
-    vec.get(0u) += u[0u];
-    vec.get(1u) += u[1u];
-    vec.get(2u) += u[2u];
+    vec.get(0u) += u_in[0u];
+    vec.get(1u) += u_in[1u];
+    vec.get(2u) += u_in[2u];
   }
 }
+// LCOV_EXCL_STOP
 
 Vector3<float> get(
     gpu::GPUField<float> const *vec_field,
@@ -396,7 +439,7 @@ void initialize(
   CellInterval ci = vec_field->xyzSizeWithGhostLayer();
   thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_set);
+  auto kernel = gpu::make_kernel(kernel_broadcast);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
   kernel.addParam(const_cast<const float *>(dev_data_ptr));
   kernel();
@@ -408,7 +451,7 @@ void add_to_all(
   CellInterval ci = vec_field->xyzSizeWithGhostLayer();
   thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_add);
+  auto kernel = gpu::make_kernel(kernel_broadcast_add);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
   kernel.addParam(const_cast<const float *>(dev_data_ptr));
   kernel();
@@ -419,7 +462,7 @@ std::vector<float> get(
     CellInterval const &ci) {
   thrust::device_vector<float> dev_data(ci.numCells() * 3u);
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_get_interval);
+  auto kernel = gpu::make_kernel(kernel_get);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
   kernel.addParam(dev_data_ptr);
   kernel();
@@ -434,7 +477,7 @@ void set(
     CellInterval const &ci) {
   thrust::device_vector<float> dev_data(values.begin(), values.end());
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_set_interval);
+  auto kernel = gpu::make_kernel(kernel_set);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
   kernel.addParam(const_cast<const float *>(dev_data_ptr));
   kernel();
@@ -442,6 +485,7 @@ void set(
 } // namespace Vector
 
 namespace Interpolation {
+// LCOV_EXCL_START
 /** @brief Calculate interpolation weights. */
 static __forceinline__ __device__ void calculate_weights(
     float const *RESTRICT const pos,
@@ -535,6 +579,7 @@ __global__ void kernel_set(
     }
   }
 }
+// LCOV_EXCL_STOP
 
 static dim3 calculate_dim_grid(uint const threads_x,
                                uint const blocks_per_grid_y,
@@ -589,6 +634,7 @@ void set(
 } // namespace Interpolation
 
 namespace Equilibrium {
+// LCOV_EXCL_START
 __device__ void kernel_set_device(
     gpu::FieldAccessor<float> pdf,
     float const *RESTRICT const u,
@@ -614,15 +660,18 @@ __device__ void kernel_set_device(
   pdf.get(17u) = rho * -0.083333333333333329f * u[0] + rho * -0.083333333333333329f * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[0] * u[2];
   pdf.get(18u) = rho * -0.083333333333333329f * u[2] + rho * -0.25f * u[0] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]);
 }
+// LCOV_EXCL_STOP
 } // namespace Equilibrium
 
 namespace Density {
+// LCOV_EXCL_START
 __global__ void kernel_get(
     gpu::FieldAccessor<float> pdf,
-    float *RESTRICT const out) {
+    float *RESTRICT rho_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
   pdf.set(blockIdx, threadIdx);
+  rho_out += offset;
   if (pdf.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, uint(1u));
     float const f_0 = pdf.get(0u);
     float const f_1 = pdf.get(1u);
     float const f_2 = pdf.get(2u);
@@ -646,16 +695,17 @@ __global__ void kernel_get(
     const float vel1Term = f_1 + f_11 + f_15 + f_7;
     const float vel2Term = f_12 + f_13 + f_5;
     const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
-    out[offset] = rho;
+    rho_out[0u] = rho;
   }
 }
 
 __global__ void kernel_set(
     gpu::FieldAccessor<float> pdf,
-    float const *RESTRICT const rho_in) {
+    float const *RESTRICT rho_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
   pdf.set(blockIdx, threadIdx);
+  rho_in += offset;
   if (pdf.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, uint(1u));
     float const f_0 = pdf.get(0u);
     float const f_1 = pdf.get(1u);
     float const f_2 = pdf.get(2u);
@@ -684,12 +734,13 @@ __global__ void kernel_set(
     const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
 
     // calculate current velocity (before density change)
-    float const conversion = float(1) / rho;
-    float const u_old[3] = {momdensity_0 * conversion, momdensity_1 * conversion, momdensity_2 * conversion};
+    float const rho_inv = float{1} / rho;
+    float const u_old[3] = {momdensity_0 * rho_inv, momdensity_1 * rho_inv, momdensity_2 * rho_inv};
 
-    Equilibrium::kernel_set_device(pdf, u_old, rho_in[offset]);
+    Equilibrium::kernel_set_device(pdf, u_old, rho_in[0u]);
   }
 }
+// LCOV_EXCL_STOP
 
 float get(
     gpu::GPUField<float> const *pdf_field,
@@ -727,7 +778,7 @@ std::vector<float> get(
   kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
   kernel.addParam(dev_data_ptr);
   kernel();
-  std::vector<float> out(ci.numCells());
+  std::vector<float> out(dev_data.size());
   thrust::copy(dev_data.begin(), dev_data.end(), out.begin());
   return out;
 }
@@ -746,16 +797,63 @@ void set(
 } // namespace Density
 
 namespace Velocity {
+// LCOV_EXCL_START
+__global__ void kernel_get(
+    gpu::FieldAccessor<float> pdf,
+    gpu::FieldAccessor<float> force,
+    float *RESTRICT u_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
+  pdf.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  u_out += offset;
+  if (pdf.isValidPosition()) {
+    float const f_0 = pdf.get(0u);
+    float const f_1 = pdf.get(1u);
+    float const f_2 = pdf.get(2u);
+    float const f_3 = pdf.get(3u);
+    float const f_4 = pdf.get(4u);
+    float const f_5 = pdf.get(5u);
+    float const f_6 = pdf.get(6u);
+    float const f_7 = pdf.get(7u);
+    float const f_8 = pdf.get(8u);
+    float const f_9 = pdf.get(9u);
+    float const f_10 = pdf.get(10u);
+    float const f_11 = pdf.get(11u);
+    float const f_12 = pdf.get(12u);
+    float const f_13 = pdf.get(13u);
+    float const f_14 = pdf.get(14u);
+    float const f_15 = pdf.get(15u);
+    float const f_16 = pdf.get(16u);
+    float const f_17 = pdf.get(17u);
+    float const f_18 = pdf.get(18u);
+    const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+    const float vel1Term = f_1 + f_11 + f_15 + f_7;
+    const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+    const float vel2Term = f_12 + f_13 + f_5;
+    const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+    const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
+    const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
+    const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
+    const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
+    auto const rho_inv = float{1} / rho;
+    u_out[0u] = md_0 * rho_inv;
+    u_out[1u] = md_1 * rho_inv;
+    u_out[2u] = md_2 * rho_inv;
+  }
+}
+
 __global__ void kernel_set(
     gpu::FieldAccessor<float> pdf,
+    gpu::FieldAccessor<float> velocity,
     gpu::FieldAccessor<float> force,
-    float const *RESTRICT const u_in) {
+    float const *RESTRICT u_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
   pdf.set(blockIdx, threadIdx);
+  velocity.set(blockIdx, threadIdx);
   force.set(blockIdx, threadIdx);
+  u_in += offset;
   if (pdf.isValidPosition()) {
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, uint(3u));
-    uint const bufsize = 3u;
-    float const *RESTRICT const u = u_in + bufsize * offset;
     float const f_0 = pdf.get(0u);
     float const f_1 = pdf.get(1u);
     float const f_2 = pdf.get(2u);
@@ -775,6 +873,7 @@ __global__ void kernel_set(
     float const f_16 = pdf.get(16u);
     float const f_17 = pdf.get(17u);
     float const f_18 = pdf.get(18u);
+    float const *RESTRICT const u = u_in;
     const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
     const float vel1Term = f_1 + f_11 + f_15 + f_7;
     const float vel2Term = f_12 + f_13 + f_5;
@@ -782,15 +881,54 @@ __global__ void kernel_set(
     const float u_0 = -force.get(0) * 0.50000000000000000f / rho + u[0];
     const float u_1 = -force.get(1) * 0.50000000000000000f / rho + u[1];
     const float u_2 = -force.get(2) * 0.50000000000000000f / rho + u[2];
+    velocity.get(0u) = u_in[0u];
+    velocity.get(1u) = u_in[1u];
+    velocity.get(2u) = u_in[2u];
+
     float u_new[3] = {u_0, u_1, u_2};
 
     Equilibrium::kernel_set_device(pdf, u_new, rho);
   }
 }
+// LCOV_EXCL_STOP
+
+Vector3<float> get(
+    gpu::GPUField<float> const *pdf_field,
+    gpu::GPUField<float> const *force_field,
+    Cell const &cell) {
+  CellInterval ci(cell, cell);
+  thrust::device_vector<float> dev_data(3u);
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel();
+  Vector3<float> vec;
+  thrust::copy(dev_data.begin(), dev_data.end(), vec.data());
+  return vec;
+}
+
+std::vector<float> get(
+    gpu::GPUField<float> const *pdf_field,
+    gpu::GPUField<float> const *force_field,
+    CellInterval const &ci) {
+  thrust::device_vector<float> dev_data(3u);
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel();
+  std::vector<float> out(dev_data.size());
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data());
+  return out;
+}
 
 void set(
     gpu::GPUField<float> *pdf_field,
-    gpu::GPUField<float> *force_field,
+    gpu::GPUField<float> *velocity_field,
+    gpu::GPUField<float> const *force_field,
     Vector3<float> const &u,
     Cell const &cell) {
   CellInterval ci(cell, cell);
@@ -798,22 +936,127 @@ void set(
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
   auto kernel = gpu::make_kernel(kernel_set);
   kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel();
+}
+
+void set(
+    gpu::GPUField<float> *pdf_field,
+    gpu::GPUField<float> *velocity_field,
+    gpu::GPUField<float> const *force_field,
+    std::vector<float> const &values,
+    CellInterval const &ci) {
+  thrust::device_vector<float> dev_data(values.begin(), values.end());
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
   kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
   kernel.addParam(const_cast<const float *>(dev_data_ptr));
   kernel();
 }
 } // namespace Velocity
 
+namespace Force {
+// LCOV_EXCL_START
+__global__ void kernel_set(
+    gpu::FieldAccessor<float> pdf,
+    gpu::FieldAccessor<float> velocity,
+    gpu::FieldAccessor<float> force,
+    float const *RESTRICT f_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
+  pdf.set(blockIdx, threadIdx);
+  velocity.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  f_in += offset;
+  if (pdf.isValidPosition()) {
+    float const f_0 = pdf.get(0u);
+    float const f_1 = pdf.get(1u);
+    float const f_2 = pdf.get(2u);
+    float const f_3 = pdf.get(3u);
+    float const f_4 = pdf.get(4u);
+    float const f_5 = pdf.get(5u);
+    float const f_6 = pdf.get(6u);
+    float const f_7 = pdf.get(7u);
+    float const f_8 = pdf.get(8u);
+    float const f_9 = pdf.get(9u);
+    float const f_10 = pdf.get(10u);
+    float const f_11 = pdf.get(11u);
+    float const f_12 = pdf.get(12u);
+    float const f_13 = pdf.get(13u);
+    float const f_14 = pdf.get(14u);
+    float const f_15 = pdf.get(15u);
+    float const f_16 = pdf.get(16u);
+    float const f_17 = pdf.get(17u);
+    float const f_18 = pdf.get(18u);
+    const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+    const float vel1Term = f_1 + f_11 + f_15 + f_7;
+    const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+    const float vel2Term = f_12 + f_13 + f_5;
+    const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+    const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
+    const float md_0 = f_in[0u] * 0.50000000000000000f + momdensity_0;
+    const float md_1 = f_in[1u] * 0.50000000000000000f + momdensity_1;
+    const float md_2 = f_in[2u] * 0.50000000000000000f + momdensity_2;
+    auto const rho_inv = float{1} / rho;
+
+    force.get(0u) = f_in[0u];
+    force.get(1u) = f_in[1u];
+    force.get(2u) = f_in[2u];
+
+    velocity.get(0u) = md_0 * rho_inv;
+    velocity.get(1u) = md_1 * rho_inv;
+    velocity.get(2u) = md_2 * rho_inv;
+  }
+}
+// LCOV_EXCL_STOP
+
+void set(gpu::GPUField<float> const *pdf_field,
+         gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> *force_field,
+         Vector3<float> const &u,
+         Cell const &cell) {
+  CellInterval ci(cell, cell);
+  thrust::device_vector<float> dev_data(u.data(), u.data() + 3u);
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel();
+}
+
+void set(gpu::GPUField<float> const *pdf_field,
+         gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> *force_field,
+         std::vector<float> const &values,
+         CellInterval const &ci) {
+  thrust::device_vector<float> dev_data(values.begin(), values.end());
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel();
+}
+} // namespace Force
+
 namespace MomentumDensity {
+// LCOV_EXCL_START
 __global__ void kernel_sum(
     gpu::FieldAccessor<float> pdf,
     gpu::FieldAccessor<float> force,
-    float *RESTRICT const out) {
+    float *RESTRICT out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
   pdf.set(blockIdx, threadIdx);
   force.set(blockIdx, threadIdx);
+  out += offset;
   if (pdf.isValidPosition()) {
-    uint const bufsize = 3u;
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, bufsize);
     float const f_0 = pdf.get(0u);
     float const f_1 = pdf.get(1u);
     float const f_2 = pdf.get(2u);
@@ -843,11 +1086,12 @@ __global__ void kernel_sum(
     const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
     const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
     const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
-    out[bufsize * offset + 0u] += md_0;
-    out[bufsize * offset + 1u] += md_1;
-    out[bufsize * offset + 2u] += md_2;
+    out[0u] += md_0;
+    out[1u] += md_1;
+    out[2u] += md_2;
   }
 }
+// LCOV_EXCL_STOP
 
 Vector3<float> reduce(
     gpu::GPUField<float> const *pdf_field,
@@ -870,13 +1114,14 @@ Vector3<float> reduce(
 } // namespace MomentumDensity
 
 namespace PressureTensor {
+// LCOV_EXCL_START
 __global__ void kernel_get(
     gpu::FieldAccessor<float> pdf,
-    float *RESTRICT const out) {
+    float *RESTRICT p_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 9u);
   pdf.set(blockIdx, threadIdx);
+  p_out += offset;
   if (pdf.isValidPosition()) {
-    uint const bufsize = 9u;
-    uint const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, bufsize);
     float const f_0 = pdf.get(0u);
     float const f_1 = pdf.get(1u);
     float const f_2 = pdf.get(2u);
@@ -905,19 +1150,18 @@ __global__ void kernel_get(
     const float p_6 = -f_13 + f_14 + f_17 - f_18;
     const float p_7 = f_11 - f_12 - f_15 + f_16;
     const float p_8 = f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6;
-    out[bufsize * offset + 0u] = p_0;
-    out[bufsize * offset + 1u] = p_1;
-    out[bufsize * offset + 2u] = p_2;
-
-    out[bufsize * offset + 3u] = p_3;
-    out[bufsize * offset + 4u] = p_4;
-    out[bufsize * offset + 5u] = p_5;
-
-    out[bufsize * offset + 6u] = p_6;
-    out[bufsize * offset + 7u] = p_7;
-    out[bufsize * offset + 8u] = p_8;
+    p_out[0u] = p_0;
+    p_out[1u] = p_1;
+    p_out[2u] = p_2;
+    p_out[3u] = p_3;
+    p_out[4u] = p_4;
+    p_out[5u] = p_5;
+    p_out[6u] = p_6;
+    p_out[7u] = p_7;
+    p_out[8u] = p_8;
   }
 }
+// LCOV_EXCL_STOP
 
 Matrix3<float> get(
     gpu::GPUField<float> const *pdf_field,
@@ -930,7 +1174,21 @@ Matrix3<float> get(
   kernel.addParam(dev_data_ptr);
   kernel();
   Matrix3<float> out;
-  thrust::copy(dev_data.begin(), dev_data.begin() + 9u, out.data());
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data());
+  return out;
+}
+
+std::vector<float> get(
+    gpu::GPUField<float> const *pdf_field,
+    CellInterval const &ci) {
+  thrust::device_vector<float> dev_data(9u * ci.numCells());
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel();
+  std::vector<float> out(dev_data.size());
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data());
   return out;
 }
 } // namespace PressureTensor
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh
index dede426c02..9d25a37a85 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh
@@ -43,18 +43,6 @@
 #include <tuple>
 #include <vector>
 
-#ifdef WALBERLA_CXX_COMPILER_IS_GNU
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#endif
-
-#ifdef WALBERLA_CXX_COMPILER_IS_CLANG
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
-#endif
-
 namespace walberla {
 namespace lbm {
 namespace accessor {
@@ -66,6 +54,10 @@ std::array<float, 19u> get(gpu::GPUField<float> const *pdf_field,
 /** @brief Set populations on a single cell. */
 void set(gpu::GPUField<float> *pdf_field, std::array<float, 19u> const &pop,
          Cell const &cell);
+/** @brief Set populations and recalculate velocities on a single cell. */
+void set(gpu::GPUField<float> *pdf_field, gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> const *force_field,
+         std::array<float, 19u> const &pop, Cell const &cell);
 /** @brief Initialize all cells with the same value. */
 void initialize(gpu::GPUField<float> *pdf_field,
                 std::array<float, 19u> const &pop);
@@ -75,6 +67,10 @@ std::vector<float> get(gpu::GPUField<float> const *pdf_field,
 /** @brief Set populations on a cell interval. */
 void set(gpu::GPUField<float> *pdf_field, std::vector<float> const &values,
          CellInterval const &ci);
+/** @brief Set populations and recalculate velocities on a cell interval. */
+void set(gpu::GPUField<float> *pdf_field, gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> const *force_field,
+         std::vector<float> const &values, CellInterval const &ci);
 } // namespace Population
 
 namespace Vector {
@@ -116,10 +112,30 @@ void set(gpu::GPUField<float> *pdf_field, std::vector<float> const &values,
 } // namespace Density
 
 namespace Velocity {
-void set(gpu::GPUField<float> *pdf_field, gpu::GPUField<float> *force_field,
-         Vector3<float> const &u, Cell const &cell);
+Vector3<float> get(gpu::GPUField<float> const *pdf_field,
+                   gpu::GPUField<float> const *force_field, Cell const &cell);
+std::vector<float> get(gpu::GPUField<float> const *pdf_field,
+                       gpu::GPUField<float> const *force_field,
+                       CellInterval const &ci);
+void set(gpu::GPUField<float> *pdf_field, gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> const *force_field, Vector3<float> const &u,
+         Cell const &cell);
+void set(gpu::GPUField<float> *pdf_field, gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> const *force_field,
+         std::vector<float> const &values, CellInterval const &ci);
 } // namespace Velocity
 
+namespace Force {
+void set(gpu::GPUField<float> const *pdf_field,
+         gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> *force_field, Vector3<float> const &u,
+         Cell const &cell);
+void set(gpu::GPUField<float> const *pdf_field,
+         gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> *force_field, std::vector<float> const &values,
+         CellInterval const &ci);
+} // namespace Force
+
 namespace DensityAndVelocity {
 std::tuple<float, Vector3<float>> get(gpu::GPUField<float> const *pdf_field,
                                       gpu::GPUField<float> const *force_field,
@@ -141,16 +157,10 @@ Vector3<float> reduce(gpu::GPUField<float> const *pdf_field,
 
 namespace PressureTensor {
 Matrix3<float> get(gpu::GPUField<float> const *pdf_field, Cell const &cell);
+std::vector<float> get(gpu::GPUField<float> const *pdf_field,
+                       CellInterval const &ci);
 } // namespace PressureTensor
 
 } // namespace accessor
 } // namespace lbm
 } // namespace walberla
-
-#ifdef WALBERLA_CXX_COMPILER_IS_GNU
-#pragma GCC diagnostic pop
-#endif
-
-#ifdef WALBERLA_CXX_COMPILER_IS_CLANG
-#pragma clang diagnostic pop
-#endif
diff --git a/src/walberla_bridge/tests/CMakeLists.txt b/src/walberla_bridge/tests/CMakeLists.txt
index 4cbf9ce15d..83a7d9d2ee 100644
--- a/src/walberla_bridge/tests/CMakeLists.txt
+++ b/src/walberla_bridge/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2020-2023 The ESPResSo project
+# Copyright (C) 2020-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,52 +17,44 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
-
-function(ESPRESSO_WALBERLA_UNIT_TEST)
-  cmake_parse_arguments(TEST "" "NAME;NUM_PROC" "SRC;DEPENDS" ${ARGN})
-  unit_test(NAME ${TEST_NAME} NUM_PROC ${TEST_NUM_PROC} SRC ${TEST_SRC} DEPENDS
-            ${TEST_DEPENDS} espresso::walberla espresso::walberla::cpp_flags
-            espresso::utils)
+include(espresso_unit_test)
+
+function(ESPRESSO_ADD_TEST)
+  cmake_parse_arguments(TEST "" "SRC;NAME;NUM_PROC" "DEPENDS" ${ARGN})
+  espresso_unit_test(
+    SRC ${TEST_SRC} NAME ${TEST_NAME} NUM_PROC ${TEST_NUM_PROC} DEPENDS
+    ${TEST_DEPENDS} espresso::walberla espresso::utils)
+  if(${TEST_SRC} MATCHES ".*\.cu$")
+    target_link_libraries(${TEST_NAME} PRIVATE espresso::walberla::cuda_flags
+                                               espresso::walberla_cuda)
+  else()
+    target_link_libraries(${TEST_NAME} PRIVATE espresso::walberla::cpp_flags)
+  endif()
   set_target_properties(${TEST_NAME} PROPERTIES CXX_CLANG_TIDY "")
   target_include_directories(${TEST_NAME} PRIVATE ${WALBERLA_INCLUDE_DIRS}
                                                   ${walberla_BINARY_DIR}/src)
   target_link_libraries(${TEST_NAME} PRIVATE ${WALBERLA_LIBS})
 endfunction()
 
-espresso_walberla_unit_test(NAME ResourceManager_test SRC
-                            ResourceManager_test.cpp)
-
-espresso_walberla_unit_test(NAME lb_kernels_unit_tests SRC
-                            lb_kernels_unit_tests.cpp)
-
-espresso_walberla_unit_test(NAME ek_kernels_unit_tests SRC
-                            ek_kernels_unit_tests.cpp)
-
-espresso_walberla_unit_test(
-  NAME LatticeWalberla_unit_tests SRC LatticeWalberla_unit_tests.cpp DEPENDS
-  Boost::boost Boost::mpi NUM_PROC 2)
-
-espresso_walberla_unit_test(
-  NAME LBWalberlaImpl_unit_tests SRC LBWalberlaImpl_unit_tests.cpp DEPENDS
-  Boost::boost Boost::mpi NUM_PROC 2)
-
-espresso_walberla_unit_test(
-  NAME LBWalberlaImpl_bspline_tests SRC LBWalberlaImpl_bspline_tests.cpp
-  DEPENDS Boost::mpi NUM_PROC 2)
+espresso_add_test(SRC ResourceManager_test.cpp)
+espresso_add_test(SRC lb_kernels_unit_tests.cpp)
+espresso_add_test(SRC ek_kernels_unit_tests.cpp)
+espresso_add_test(SRC LatticeWalberla_unit_tests.cpp DEPENDS Boost::mpi
+                  NUM_PROC 2)
+espresso_add_test(SRC LBWalberlaImpl_unit_tests.cpp DEPENDS Boost::mpi NUM_PROC
+                  2)
+espresso_add_test(SRC LBWalberlaImpl_bspline_tests.cpp DEPENDS Boost::mpi
+                  NUM_PROC 2)
+espresso_add_test(SRC LBWalberlaImpl_flow_tests.cpp DEPENDS Boost::mpi)
+espresso_add_test(SRC LBWalberlaImpl_lees_edwards_tests.cpp DEPENDS Boost::mpi)
+espresso_add_test(SRC EKinWalberlaImpl_unit_tests.cpp DEPENDS Boost::mpi
+                  NUM_PROC 2)
 
 if(NOT (ESPRESSO_BUILD_WITH_ASAN OR ESPRESSO_BUILD_WITH_UBSAN))
-  espresso_walberla_unit_test(
-    NAME LBWalberlaImpl_statistical_tests SRC
-    LBWalberlaImpl_statistical_tests.cpp DEPENDS Boost::mpi)
+  espresso_add_test(SRC LBWalberlaImpl_statistical_tests.cpp DEPENDS Boost::mpi)
 endif()
 
-espresso_walberla_unit_test(NAME LBWalberlaImpl_flow_tests SRC
-                            LBWalberlaImpl_flow_tests.cpp DEPENDS Boost::mpi)
-
-espresso_walberla_unit_test(NAME LBWalberlaImpl_lees_edwards_test SRC
-                            LBWalberlaImpl_lees_edwards.cpp DEPENDS Boost::mpi)
-
-espresso_walberla_unit_test(
-  NAME EKinWalberlaImpl_unit_tests SRC EKinWalberlaImpl_unit_tests.cpp DEPENDS
-  Boost::boost Boost::mpi NUM_PROC 2)
+if(ESPRESSO_BUILD_WITH_CUDA AND WALBERLA_BUILD_WITH_CUDA)
+  espresso_add_test(SRC LBWalberlaImpl_field_accessors_tests.cu DEPENDS
+                    Boost::mpi)
+endif()
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu b/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu
new file mode 100644
index 0000000000..5829c49021
--- /dev/null
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu
@@ -0,0 +1,372 @@
+/*
+ * Copyright (C) 2024 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#define BOOST_TEST_MODULE "field accessors test"
+#define BOOST_TEST_DYN_LINK
+#define BOOST_TEST_NO_MAIN
+
+#if defined(__NVCC__)
+// Fix for https://i10git.cs.fau.de/walberla/walberla/-/issues/244
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 554 // unreachable conversion operator
+#endif
+
+#include <boost/test/data/monomorphic.hpp>
+#include <boost/test/data/test_case.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include "../src/lattice_boltzmann/LBWalberlaImpl.hpp"
+#include "../src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h"
+#include "../src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh"
+#include "../src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h"
+#include "../src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh"
+
+#include <walberla_bridge/Architecture.hpp>
+#include <walberla_bridge/BlockAndCell.hpp>
+
+#include <utils/Vector.hpp>
+
+#include <boost/mpi/collectives/all_reduce.hpp>
+#include <boost/mpi/communicator.hpp>
+#include <boost/mpl/list.hpp>
+
+#include <mpi.h>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <concepts>
+#include <cstddef>
+#include <initializer_list>
+#include <iterator>
+#include <memory>
+#include <numeric>
+#include <ranges>
+#include <span>
+#include <sstream>
+#include <vector>
+
+static Utils::Vector3i mpi_shape;
+
+template <std::ranges::contiguous_range R>
+  requires std::ranges::sized_range<R>
+boost::test_tools::predicate_result almost_equal(R const &val, R const &ref,
+                                                 typename R::value_type atol) {
+  assert(val.size() == ref.size());
+  assert(not val.empty());
+  auto const print_first_n = [](R const &v) {
+    auto constexpr n = std::size_t{6u};
+    std::stringstream stream;
+    stream << "[";
+    for (auto i = 0ul, end = std::min(v.size(), n); i < end; ++i) {
+      if (i) {
+        stream << ", ";
+      }
+      stream << v[i];
+    }
+    if (v.size() > n) {
+      stream << ", ...";
+    }
+    stream << "]";
+    return stream.str();
+  };
+  boost::test_tools::predicate_result res(true);
+  for (auto i = 0ul; i < val.size(); ++i) {
+    if (auto const diff = std::abs(val[i] - ref[i]); diff > atol) {
+      res = false;
+      res.message() << "val{" << print_first_n(val) << "} and " << "ref{"
+                    << print_first_n(ref) << "} mismatch: " << "val[" << i
+                    << "]{" << val[i] << "} != " << "ref[" << i << "]{"
+                    << ref[i] << "} " << "(difference{" << diff << "} > delta{"
+                    << atol << "})";
+      break;
+    }
+  }
+  return res;
+}
+
+template <typename T>
+boost::test_tools::predicate_result almost_equal(walberla::Vector3<T> const &a,
+                                                 walberla::Vector3<T> const &b,
+                                                 T atol) {
+  return almost_equal(std::span(a.data(), 3ul), std::span(b.data(), 3ul), atol);
+}
+
+template <typename T>
+boost::test_tools::predicate_result almost_equal(walberla::Matrix3<T> const &a,
+                                                 walberla::Matrix3<T> const &b,
+                                                 T atol) {
+  return almost_equal(std::span(a.data(), 9ul), std::span(b.data(), 9ul), atol);
+}
+
+template <typename T>
+  requires std::floating_point<T>
+boost::test_tools::predicate_result almost_equal(T a, T b, T atol) {
+  return almost_equal(std::span(&a, 1ul), std::span(&b, 1ul), atol);
+}
+
+template <typename FT, lbmpy::Arch Architecture>
+class LBWalberlaImplTest : public walberla::LBWalberlaImpl<FT, Architecture> {
+public:
+  using Base = walberla::LBWalberlaImpl<FT, Architecture>;
+  using Base::Base;
+  using Base::m_density;
+  using Base::m_last_applied_force_field_id;
+  using Base::m_pdf_field_id;
+  using Base::m_velocity_field_id;
+};
+
+template <typename FT, lbmpy::Arch Architecture> struct Fixture {
+  std::shared_ptr<::LatticeWalberla> lattice;
+  std::shared_ptr<LBWalberlaImplTest<FT, Architecture>> lbfluid;
+
+  Fixture() {
+    auto const grid_dim = Utils::Vector3i::broadcast(4);
+    auto const viscosity = FT(1.5);
+    auto const density = FT(0.9);
+    lattice = std::make_shared<::LatticeWalberla>(grid_dim, mpi_shape, 1u);
+    lbfluid = std::make_shared<LBWalberlaImplTest<FT, Architecture>>(
+        lattice, viscosity, density);
+  }
+
+  void runTest() {
+    auto const bc = walberla::get_block_and_cell(*lattice, {0, 0, 0}, false);
+    BOOST_CHECK(bc);
+    auto const cell = bc->cell;
+    auto const ci = walberla::CellInterval(cell, cell);
+    auto &block = *(bc->block);
+    check_getters_setters(block, cell);
+    check_getters_setters(block, ci);
+  }
+
+  template <typename CellIt>
+  void check_getters_setters(walberla::IBlock &block, CellIt const &it) {
+    using namespace walberla;
+    using PdfField = LBWalberlaImplTest<FT, Architecture>::PdfField;
+    using VectorField = LBWalberlaImplTest<FT, Architecture>::VectorField;
+
+    auto constexpr is_interval = std::is_same_v<CellIt, CellInterval>;
+    auto constexpr epsilon = std::numeric_limits<FT>::epsilon();
+    auto constexpr exact = FT{0};
+
+    auto const make_ref_vector = [](std::initializer_list<FT> values) {
+      if constexpr (is_interval) {
+        assert(values.size() % 3ul == 0ul);
+        return std::vector<FT>(values);
+      } else {
+        assert(values.size() == 3ul);
+        return Vector3<FT>(std::data(values));
+      }
+    };
+    auto const make_ref_matrix = [](std::initializer_list<FT> values) {
+      if constexpr (is_interval) {
+        assert(values.size() % 9ul == 0ul);
+        return std::vector<FT>(values);
+      } else {
+        assert(values.size() == 9ul);
+        return Matrix3<FT>(std::data(values));
+      }
+    };
+    auto const to_number = [](auto const &value) {
+      if constexpr (std::is_same_v<decltype(value), FT const &>) {
+        return value;
+      } else {
+        assert(value.size() == 1ul);
+        return value[0u];
+      }
+    };
+
+    auto const density = lbfluid->m_density;
+    auto pdf_field = block.template getData<PdfField>(lbfluid->m_pdf_field_id);
+    auto velocity_field =
+        block.template getData<VectorField>(lbfluid->m_velocity_field_id);
+    auto force_field = block.template getData<VectorField>(
+        lbfluid->m_last_applied_force_field_id);
+
+    std::conditional_t<is_interval, std::vector<FT>, std::array<FT, 19u>>
+        cur_pop = lbm::accessor::Population::get(pdf_field, it);
+    std::conditional_t<is_interval, std::vector<FT>, Vector3<FT>> cur_vel;
+    std::conditional_t<is_interval, std::vector<FT>, Vector3<FT>> cur_laf;
+
+    auto const reset_fields = [&, initial_pop = cur_pop]() {
+      auto const null_vector = Vector3<FT>(FT{0});
+      std::array<FT, 19u> pop;
+      for (std::size_t i = 0ul; i < pop.size(); ++i) {
+        pop[i] = initial_pop[i];
+      }
+      lbm::accessor::Population::initialize(pdf_field, pop);
+      lbm::accessor::Vector::initialize(force_field, null_vector);
+      lbm::accessor::Vector::initialize(velocity_field, null_vector);
+    };
+
+    {
+      auto diag = FT{0};
+      auto const zero = FT{0};
+      auto const old_pop = lbm::accessor::Population::get(pdf_field, it);
+      auto const old_pre = lbm::accessor::PressureTensor::get(pdf_field, it);
+      auto const old_laf = lbm::accessor::Vector::get(force_field, it);
+      auto const old_rho = lbm::accessor::Density::get(pdf_field, it);
+      auto ref_pop = old_pop;
+      std::transform(old_pop.begin(), old_pop.end(), ref_pop.begin(),
+                     [](auto const &f) { return FT{2} * f; });
+      lbm::accessor::Population::set(pdf_field, ref_pop, it);
+      auto const new_pop = lbm::accessor::Population::get(pdf_field, it);
+      auto const new_pre = lbm::accessor::PressureTensor::get(pdf_field, it);
+      auto const new_rho = lbm::accessor::Density::get(pdf_field, it);
+      BOOST_CHECK(almost_equal(new_pop, ref_pop, exact));
+      // clang-format off
+      diag = density * (FT{1} / FT{3});
+      auto const old_pre_ref = make_ref_matrix({diag, zero, zero,
+                                                zero, diag, zero,
+                                                zero, zero, diag});
+      BOOST_CHECK(almost_equal(old_pre, old_pre_ref, epsilon));
+      diag = density * (FT{2} / FT{3});
+      auto const new_pre_ref = make_ref_matrix({diag, zero, zero,
+                                                zero, diag, zero,
+                                                zero, zero, diag});
+      BOOST_CHECK(almost_equal(new_pre, new_pre_ref, epsilon));
+      // clang-format on
+      auto const old_laf_ref = make_ref_vector({FT{0}, FT{0}, FT{0}});
+      auto const new_laf_ref = make_ref_vector({FT{2}, FT{3}, FT{4}});
+      lbm::accessor::Vector::set(force_field, new_laf_ref, it);
+      auto const new_laf = lbm::accessor::Vector::get(force_field, it);
+      BOOST_CHECK(almost_equal(old_laf, old_laf_ref, exact));
+      BOOST_CHECK(almost_equal(new_laf, new_laf_ref, exact));
+      lbm::accessor::Vector::set(force_field, old_laf_ref, it);
+      cur_laf = lbm::accessor::Vector::get(force_field, it);
+      BOOST_CHECK(almost_equal(cur_laf, old_laf_ref, exact));
+      lbm::accessor::Vector::add_to_all(force_field,
+                                        Vector3<FT>(new_laf_ref.data()));
+      cur_laf = lbm::accessor::Vector::get(force_field, it);
+      BOOST_CHECK(almost_equal(cur_laf, new_laf_ref, exact));
+      lbm::accessor::Vector::set(force_field, old_laf_ref, it);
+      cur_laf = lbm::accessor::Vector::get(force_field, it);
+      BOOST_CHECK(almost_equal(cur_laf, old_laf_ref, exact));
+      lbm::accessor::Density::set(pdf_field, {FT{7} * density}, it);
+      auto const cur_rho = lbm::accessor::Density::get(pdf_field, it);
+      BOOST_CHECK(
+          almost_equal(to_number(old_rho), density * FT{1}, FT{20} * epsilon));
+      BOOST_CHECK(
+          almost_equal(to_number(new_rho), density * FT{2}, FT{20} * epsilon));
+      BOOST_CHECK(
+          almost_equal(to_number(cur_rho), density * FT{7}, FT{20} * epsilon));
+    }
+    reset_fields();
+    {
+      // update cached velocities and recalculate populations in a single step
+      auto const old_pop = lbm::accessor::Population::get(pdf_field, it);
+      auto const old_vel = lbm::accessor::Vector::get(velocity_field, it);
+      auto const ref_vel = make_ref_vector({FT(0.001), FT(0.002), FT(0.003)});
+      lbm::accessor::Velocity::set(pdf_field, velocity_field, force_field,
+                                   ref_vel, it);
+      auto const new_pop = lbm::accessor::Population::get(pdf_field, it);
+      auto const new_vel = lbm::accessor::Vector::get(velocity_field, it);
+      BOOST_CHECK(almost_equal(new_vel, ref_vel, epsilon));
+      auto const new_mom =
+          lbm::accessor::MomentumDensity::reduce(pdf_field, force_field);
+      auto const ref_mom = Vector3<FT>(
+          ref_vel[0u] * density, ref_vel[1u] * density, ref_vel[2u] * density);
+      BOOST_CHECK(almost_equal(new_mom, ref_mom, FT{20} * epsilon));
+      // update populations and recalculate cached velocities in a single step
+      lbm::accessor::Population::set(pdf_field, velocity_field, force_field,
+                                     old_pop, it);
+      cur_pop = lbm::accessor::Population::get(pdf_field, it);
+      cur_vel = lbm::accessor::Vector::get(velocity_field, it);
+      BOOST_CHECK(almost_equal(cur_pop, old_pop, exact));
+      BOOST_CHECK(almost_equal(cur_vel, old_vel, epsilon));
+      cur_vel = lbm::accessor::Velocity::get(pdf_field, force_field, it);
+      BOOST_CHECK(almost_equal(cur_vel, old_vel, epsilon));
+      lbm::accessor::Population::set(pdf_field, velocity_field, force_field,
+                                     new_pop, it);
+      cur_pop = lbm::accessor::Population::get(pdf_field, it);
+      cur_vel = lbm::accessor::Vector::get(velocity_field, it);
+      BOOST_CHECK(almost_equal(cur_pop, new_pop, exact));
+      BOOST_CHECK(almost_equal(cur_vel, new_vel, epsilon));
+      cur_vel = lbm::accessor::Velocity::get(pdf_field, force_field, it);
+      BOOST_CHECK(almost_equal(cur_vel, new_vel, epsilon));
+      // update forces and recalculate cached velocities in a single step
+      auto const ref_laf = make_ref_vector({ref_vel[0u] * FT{2} * density,
+                                            ref_vel[1u] * FT{2} * density,
+                                            ref_vel[2u] * FT{2} * density});
+      lbm::accessor::Population::set(pdf_field, velocity_field, force_field,
+                                     old_pop, it);
+      lbm::accessor::Force::set(pdf_field, velocity_field, force_field, ref_laf,
+                                it);
+      cur_pop = lbm::accessor::Population::get(pdf_field, it);
+      cur_vel = lbm::accessor::Vector::get(velocity_field, it);
+      cur_laf = lbm::accessor::Vector::get(force_field, it);
+      BOOST_CHECK(almost_equal(cur_pop, old_pop, exact));
+      BOOST_CHECK(almost_equal(cur_vel, new_vel, epsilon));
+      BOOST_CHECK(almost_equal(cur_laf, ref_laf, epsilon));
+      cur_vel = lbm::accessor::Velocity::get(pdf_field, force_field, it);
+      BOOST_CHECK(almost_equal(cur_vel, new_vel, epsilon));
+      // update velocities and recalculate populations in a single step
+      lbm::accessor::Population::set(pdf_field, velocity_field, force_field,
+                                     old_pop, it);
+      lbm::accessor::Velocity::set(pdf_field, velocity_field, force_field,
+                                   ref_vel, it);
+      cur_pop = lbm::accessor::Population::get(pdf_field, it);
+      cur_vel = lbm::accessor::Vector::get(velocity_field, it);
+      cur_laf = lbm::accessor::Vector::get(force_field, it);
+      BOOST_CHECK(almost_equal(cur_pop, old_pop, epsilon));
+      BOOST_CHECK(almost_equal(cur_vel, new_vel, epsilon));
+      BOOST_CHECK(almost_equal(cur_laf, ref_laf, epsilon));
+      cur_vel = lbm::accessor::Velocity::get(pdf_field, force_field, it);
+      BOOST_CHECK(almost_equal(cur_vel, new_vel, epsilon));
+    }
+    reset_fields();
+  }
+};
+
+BOOST_AUTO_TEST_CASE(test_custom_predicate) {
+  std::vector<int> const val = {0, 1, 2, 3, 4, 5, 99, 2};
+  std::vector<int> const ref = {0, 1, 2, 3, 4, 5, 7, 80};
+  auto const is_true = almost_equal(ref, ref, 0);
+  auto const is_false = almost_equal(val, ref, 1);
+  BOOST_REQUIRE(is_true);
+  BOOST_REQUIRE(not is_false);
+  BOOST_CHECK_EQUAL(is_true.message(), "");
+  BOOST_REQUIRE_EQUAL(
+      is_false.message(),
+      "val{[0, 1, 2, 3, 4, 5, ...]} and ref{[0, 1, 2, 3, 4, 5, ...]} "
+      "mismatch: val[6]{99} != ref[6]{7} (difference{92} > delta{1})");
+}
+
+using test_types = boost::mpl::list<float, double>;
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(macroscopic_accessors, FT, test_types) {
+  Fixture<FT, lbmpy::Arch::CPU>().runTest();
+  Fixture<FT, lbmpy::Arch::GPU>().runTest();
+}
+
+int main(int argc, char **argv) {
+  int n_nodes;
+  MPI_Init(&argc, &argv);
+  MPI_Comm_size(MPI_COMM_WORLD, &n_nodes);
+  MPI_Dims_create(n_nodes, 3, mpi_shape.data());
+  walberla::mpi_init();
+
+  auto const res = boost::unit_test::unit_test_main(init_unit_test, argc, argv);
+  MPI_Finalize();
+  return res;
+}
+
+#if defined(__NVCC__)
+#pragma nv_diagnostic pop
+#endif
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards_tests.cpp
similarity index 100%
rename from src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards.cpp
rename to src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards_tests.cpp
diff --git a/src/walberla_bridge/tests/lb_kernels_unit_tests.cpp b/src/walberla_bridge/tests/lb_kernels_unit_tests.cpp
index 3045e9974c..f45cfb542f 100644
--- a/src/walberla_bridge/tests/lb_kernels_unit_tests.cpp
+++ b/src/walberla_bridge/tests/lb_kernels_unit_tests.cpp
@@ -19,10 +19,6 @@
 #define BOOST_TEST_MODULE waLBerla LB kernels
 #define BOOST_TEST_DYN_LINK
 
-#include "config/config.hpp"
-
-#ifdef WALBERLA
-
 #include <boost/test/unit_test.hpp>
 
 #include "../src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precision.h"
@@ -189,5 +185,3 @@ BOOST_AUTO_TEST_CASE(macroscopic_accessor_equilibrium_distribution) {
     }
   }
 }
-
-#endif
diff --git a/testsuite/CMakeLists.txt b/testsuite/CMakeLists.txt
index 027cc3f852..d46e6094db 100644
--- a/testsuite/CMakeLists.txt
+++ b/testsuite/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2015-2022 The ESPResSo project
+# Copyright (C) 2015-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,8 +17,6 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
-
 if(ESPRESSO_BUILD_WITH_COVERAGE_PYTHON)
   set(PYPRESSO_OPTIONS --coverage)
 endif()
diff --git a/testsuite/python/lb.py b/testsuite/python/lb.py
index e5a5ebe9ab..86daa7fe51 100644
--- a/testsuite/python/lb.py
+++ b/testsuite/python/lb.py
@@ -89,13 +89,14 @@ def test_properties(self):
         self.check_properties(lbf)
 
     def check_properties(self, lbf):
+        density = self.params["density"]
         agrid = self.params["agrid"]
-        tau = self.system.time_step
+        tau = self.params["tau"]
         # check LB object
         self.assertAlmostEqual(lbf.tau, tau, delta=self.atol)
         self.assertAlmostEqual(lbf.agrid, agrid, delta=self.atol)
         self.assertAlmostEqual(lbf.kinematic_viscosity, 3., delta=self.atol)
-        self.assertAlmostEqual(lbf.density, 0.85, delta=self.atol)
+        self.assertAlmostEqual(lbf.density, density, delta=self.atol)
         self.assertAlmostEqual(lbf.kT, 1.0, delta=self.atol)
         self.assertEqual(lbf.seed, 42)
         self.assertEqual(
@@ -136,6 +137,57 @@ def check_properties(self, lbf):
             lbf[0, 0, 0].velocity = [1, 2]
         with self.assertRaises(TypeError):
             lbf[0, 1].velocity = [1, 2, 3]
+        # check node setters update cached velocities (with precision loss)
+        lbnode = lbf[0, 0, 0]
+        lbnode.last_applied_force = [0., 0., 0.]
+        lbnode.velocity = [0., 0., 0.]
+        old_pop = np.copy(lbnode.population)
+        old_vel = np.copy(lbnode.velocity)
+        old_rho = np.copy(lbnode.density)
+        lbnode.velocity = [1., 2., 3.]
+        new_pop = np.copy(lbnode.population)
+        new_vel = np.copy(lbnode.velocity)
+        lbnode.population = old_pop
+        np.testing.assert_allclose(
+            np.copy(lbnode.velocity), old_vel, atol=self.atol * 20.)
+        lbnode.population = new_pop
+        np.testing.assert_allclose(
+            np.copy(lbnode.velocity), new_vel, atol=self.atol * 20.)
+        lbnode.population = old_pop
+        lbnode.last_applied_force = [0.4, 0.5, 0.6]
+        np.testing.assert_allclose(
+            np.copy(lbnode.velocity),
+            np.copy([0.4, 0.5, 0.6]) / (agrid / tau * old_rho / 2.),
+            atol=self.atol * 20.)
+        lbnode.last_applied_force = [0., 0., 0.]
+        lbnode.population = old_pop
+        # check slice setters update cached velocities (with precision loss)
+        lbslice = lbf[0:5, 0:5, 0:5]
+        lbslice.last_applied_force = [0., 0., 0.]
+        lbslice.velocity = [0., 0., 0.]
+        old_pop = np.copy(lbslice.population)
+        old_vel = np.copy(lbslice.velocity)
+        old_rho = np.copy(lbslice.density)
+        lbslice.velocity = [1., 2., 3.]
+        new_pop = np.copy(lbslice.population)
+        new_vel = np.copy(lbslice.velocity)
+        lbslice.population = old_pop
+        np.testing.assert_allclose(
+            np.copy(lbslice.velocity), old_vel, atol=self.atol * 100.)
+        lbslice.population = new_pop
+        np.testing.assert_allclose(
+            np.copy(lbslice.velocity), new_vel, atol=self.atol * 100.)
+        lbslice.population = old_pop
+        lbslice.last_applied_force = [0.4, 0.5, 0.6]
+        vel2force = 2. * tau / agrid
+        np.testing.assert_allclose(
+            np.copy(lbslice.velocity),
+            np.tile(np.copy([0.4, 0.5, 0.6]), (5, 5, 5, 1)) * vel2force /
+            np.tile(old_rho.reshape((5, 5, 5, 1)), (3,)),
+            atol=self.atol * 20.)
+        lbslice.last_applied_force = [0., 0., 0.]
+        lbslice.population = old_pop
+        # check node boundary conditions
         node = lbf[0, 0, 0]
         self.assertIsNone(node.boundary)
         self.assertIsNone(node.boundary_force)
@@ -373,13 +425,13 @@ def test_lb_node_set_get(self):
         self.assertAlmostEqual(lbf[0, 0, 0].density, density, delta=1e-4)
 
         self.assertEqual(lbf[3, 2, 1].index, (3, 2, 1))
-        ext_force_density = [0.1, 0.2, 1.2]
-        last_applied_force = [0.2, 0.4, 0.6]
+        ext_force_density = np.array([0.1, 0.2, 1.2])
+        last_applied_force = np.array([0.2, 0.4, 0.6])
         lbf.ext_force_density = ext_force_density
         node = lbf[1, 2, 3]
         node.velocity = v_fluid
-        node.last_applied_force = last_applied_force
         np.testing.assert_allclose(np.copy(node.velocity), v_fluid, atol=1e-4)
+        node.last_applied_force = last_applied_force
         np.testing.assert_allclose(
             np.copy(node.last_applied_force), last_applied_force, atol=1e-4)
         np.testing.assert_allclose(
diff --git a/testsuite/python/lb_thermostat.py b/testsuite/python/lb_thermostat.py
index 4f0ee2bb62..c67709dd98 100644
--- a/testsuite/python/lb_thermostat.py
+++ b/testsuite/python/lb_thermostat.py
@@ -204,6 +204,9 @@ def test_friction(self):
     @utx.skipIfMissingFeatures(["PARTICLE_ANISOTROPY",
                                "THERMOSTAT_PER_PARTICLE"])
     def test_exceptions(self):
+        with self.assertRaisesRegex(RuntimeError, r"set_lb\(\) got an unexpected keyword argument 'act_on_virtual'"):
+            self.system.thermostat.set_lb(
+                LB_fluid=self.lbf, act_on_virtual=False)
         self.system.part.add(pos=[0., 0., 0.], gamma=[1., 2., 3.], id=2)
         with self.assertRaisesRegex(Exception, r"ERROR: anisotropic particle \(id 2\) coupled to LB"):
             self.system.integrator.run(1)
diff --git a/testsuite/python/test_checkpoint.py b/testsuite/python/test_checkpoint.py
index b8d46c49a4..1ab0d748af 100644
--- a/testsuite/python/test_checkpoint.py
+++ b/testsuite/python/test_checkpoint.py
@@ -158,8 +158,6 @@ def test_lb_fluid(self):
             np.testing.assert_allclose(np.copy(node.velocity), slip_velocity1)
         for node in lbf[-1, :, :]:
             np.testing.assert_allclose(np.copy(node.velocity), slip_velocity2)
-        for node in lbf[2, :, :]:
-            np.testing.assert_allclose(np.copy(node.velocity), 0.)
         # remove boundaries
         lbf.clear_boundaries()
         np.testing.assert_equal(