CUDA tests work

JuliaMolSim · Dec 20, 2024 · 28e7381 · 28e7381
1 parent 14cf074
commit 28e7381
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 322 deletions.
diff --git a/ext/MollyCUDAExt.jl b/ext/MollyCUDAExt.jl
@@ -3,6 +3,7 @@ module MollyCUDAExt
 using Molly
 using CUDA
 using Atomix
+using KernelAbstractions
 
 CUDA.Const(nl::Molly.NoNeighborList) = nl
 
@@ -37,8 +38,8 @@ function cuda_threads_blocks_specific(n_inters)
     return n_threads_gpu, n_blocks
 end
 
-function Molly.pairwise_force_gpu!(fs_mat, coords::CuArray{SVector{D, C}}, velocities, atoms,
-                    boundary, pairwise_inters, nbs, step_n, force_units, ::Val{T}) where {D, C, T}
+function Molly.pairwise_force_gpu!(fs_mat::AT, coords::AbstractArray{SVector{D, C}}, velocities, atoms,
+                    boundary, pairwise_inters, nbs, step_n, force_units, ::Val{T}) where {AT <: CuArray, D, C, T}
     if typeof(nbs) == NoNeighborList
         kernel = @cuda launch=false pairwise_force_kernel_nonl!(
                 fs_mat, coords, velocities, atoms, boundary, pairwise_inters, step_n,
@@ -56,6 +57,7 @@ function Molly.pairwise_force_gpu!(fs_mat, coords::CuArray{SVector{D, C}}, veloc
         CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks pairwise_force_kernel_nl!(
                 fs_mat, coords, velocities, atoms, boundary, pairwise_inters, nbs, step_n,
                 Val(D), Val(force_units))
+
     end
     return fs_mat
 end
@@ -71,7 +73,7 @@ function pairwise_force_kernel_nl!(forces, coords_var, velocities_var, atoms_var
 
     @inbounds if inter_i <= length(neighbors)
         i, j, special = neighbors[inter_i]
-        f = sum_pairwise_forces(inters, atoms[i], atoms[j], Val(F), special, coords[i], coords[j],
+        f = Molly.sum_pairwise_forces(inters, atoms[i], atoms[j], Val(F), special, coords[i], coords[j],
                                 boundary, velocities[i], velocities[j], step_n)
         for dim in 1:D
             fval = ustrip(f[dim])
@@ -82,6 +84,7 @@ function pairwise_force_kernel_nl!(forces, coords_var, velocities_var, atoms_var
     return nothing
 end
 
+
 #=
 **The No-neighborlist pairwise force summation kernel**: This kernel calculates all the pairwise forces in the system of
 `n_atoms` atoms, this is done by dividing the complete matrix of `n_atoms`×`n_atoms` interactions into small tiles. Most
@@ -122,7 +125,7 @@ That's why the calculations are done in the following order:
     h | 1 2 3 4 5 6
 ```
 =#
-function pairwise_force_kernel_nonl!(forces::CuArray{T}, coords_var, velocities_var,
+function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, velocities_var,
                         atoms_var, boundary, inters, step_n, ::Val{D}, ::Val{F}) where {T, D, F}
     coords = CUDA.Const(coords_var)
     velocities = CUDA.Const(velocities_var)
@@ -149,7 +152,7 @@ function pairwise_force_kernel_nonl!(forces::CuArray{T}, coords_var, velocities_
                 j = j_0_tile + del_j
                 if i != j
                     atom_j, coord_j, vel_j = atoms[j], coords[j], velocities[j]
-                    f = sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j,
+                    f = Molly.sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j,
                                             boundary, vel_i, vel_j, step_n)
                     for dim in 1:D
                         forces_shmem[dim, tidx] += -ustrip(f[dim])
@@ -174,7 +177,7 @@ function pairwise_force_kernel_nonl!(forces::CuArray{T}, coords_var, velocities_
         @inbounds for _ in 1:tilesteps
             sync_warp()
             atom_j = atoms[j]
-            f = sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j,
+            f = Molly.sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j,
                                     boundary, vel_i, vel_j, step_n)
             for dim in 1:D
                 forces_shmem[dim, tidx] += -ustrip(f[dim])
@@ -190,319 +193,4 @@ function pairwise_force_kernel_nonl!(forces::CuArray{T}, coords_var, velocities_
     return nothing
 end
 
-function Molly.specific_force_gpu!(fs_mat, inter_list::InteractionList1Atoms, coords::CuArray{SVector{D, C}},
-                            velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T}
-    n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_1_atoms_kernel!(fs_mat,
-            coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.inters,
-            Val(D), Val(force_units))
-    return fs_mat
-end
-
-function Molly.specific_force_gpu!(fs_mat, inter_list::InteractionList2Atoms, coords::CuArray{SVector{D, C}},
-                            velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T}
-    n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_2_atoms_kernel!(fs_mat,
-            coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js,
-            inter_list.inters, Val(D), Val(force_units))
-    return fs_mat
-end
-
-function Molly.specific_force_gpu!(fs_mat, inter_list::InteractionList3Atoms, coords::CuArray{SVector{D, C}},
-                            velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T}
-    n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_3_atoms_kernel!(fs_mat,
-            coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js,
-            inter_list.ks, inter_list.inters, Val(D), Val(force_units))
-    return fs_mat
-end
-
-function Molly.specific_force_gpu!(fs_mat, inter_list::InteractionList4Atoms, coords::CuArray{SVector{D, C}},
-                            velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T}
-    n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_4_atoms_kernel!(fs_mat,
-            coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js,
-            inter_list.ks, inter_list.ls, inter_list.inters, Val(D), Val(force_units))
-    return fs_mat
-end
-
-function Molly.specific_force_1_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary,
-                        step_n, is_var, inters_var, ::Val{D}, ::Val{F}) where {D, F}
-    coords = CUDA.Const(coords_var)
-    velocities = CUDA.Const(velocities_var)
-    atoms = CUDA.Const(atoms_var)
-    is = CUDA.Const(is_var)
-    inters = CUDA.Const(inters_var)
-
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-    @inbounds if inter_i <= length(is)
-        i = is[inter_i]
-        fs = force_gpu(inters[inter_i], coords[i], boundary, atoms[i], F, velocities[i], step_n)
-        if unit(fs.f1[1]) != F
-            error("wrong force unit returned, was expecting $F")
-        end
-        for dim in 1:D
-            Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim])
-        end
-    end
-    return nothing
-end
-
-function Molly.specific_force_2_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary,
-                        step_n, is_var, js_var, inters_var, ::Val{D}, ::Val{F}) where {D, F}
-    coords = CUDA.Const(coords_var)
-    velocities = CUDA.Const(velocities_var)
-    atoms = CUDA.Const(atoms_var)
-    is = CUDA.Const(is_var)
-    js = CUDA.Const(js_var)
-    inters = CUDA.Const(inters_var)
-
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-    @inbounds if inter_i <= length(is)
-        i, j = is[inter_i], js[inter_i]
-        fs = force_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], atoms[j], F,
-                       velocities[i], velocities[j], step_n)
-        if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F
-            error("wrong force unit returned, was expecting $F")
-        end
-        for dim in 1:D
-            Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim])
-            Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim])
-        end
-    end
-    return nothing
-end
-
-function Molly.specific_force_3_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary,
-                        step_n, is_var, js_var, ks_var, inters_var, ::Val{D}, ::Val{F}) where {D, F}
-    coords = CUDA.Const(coords_var)
-    velocities = CUDA.Const(velocities_var)
-    atoms = CUDA.Const(atoms_var)
-    is = CUDA.Const(is_var)
-    js = CUDA.Const(js_var)
-    ks = CUDA.Const(ks_var)
-    inters = CUDA.Const(inters_var)
-
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-    @inbounds if inter_i <= length(is)
-        i, j, k = is[inter_i], js[inter_i], ks[inter_i]
-        fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, atoms[i],
-                       atoms[j], atoms[k], F, velocities[i], velocities[j], velocities[k], step_n)
-        if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F
-            error("wrong force unit returned, was expecting $F")
-        end
-        for dim in 1:D
-            Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim])
-            Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim])
-            Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim])
-        end
-    end
-    return nothing
-end
-
-function Molly.specific_force_4_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary,
-                        step_n, is_var, js_var, ks_var, ls_var, inters_var,
-                        ::Val{D}, ::Val{F}) where {D, F}
-    coords = CUDA.Const(coords_var)
-    velocities = CUDA.Const(velocities_var)
-    atoms = CUDA.Const(atoms_var)
-    is = CUDA.Const(is_var)
-    js = CUDA.Const(js_var)
-    ks = CUDA.Const(ks_var)
-    ls = CUDA.Const(ls_var)
-    inters = CUDA.Const(inters_var)
-
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-    @inbounds if inter_i <= length(is)
-        i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i]
-        fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary,
-                       atoms[i], atoms[j], atoms[k], atoms[l], F, velocities[i], velocities[j],
-                       velocities[k], velocities[l], step_n)
-        if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F || unit(fs.f4[1]) != F
-            error("wrong force unit returned, was expecting $F")
-        end
-        for dim in 1:D
-            Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim])
-            Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim])
-            Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim])
-            Atomix.@atomic :monotonic forces[dim, l] += ustrip(fs.f4[dim])
-        end
-    end
-    return nothing
-end
-
-function Molly.pairwise_pe_gpu!(pe_vec_nounits, coords::CuArray{SVector{D, C}}, velocities, atoms,
-                          boundary, pairwise_inters, nbs, step_n, energy_units,
-                          ::Val{T}) where {D, C, T}
-    n_threads_gpu, n_blocks = cuda_threads_blocks_pairwise(length(nbs))
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks pairwise_pe_kernel!(
-                pe_vec_nounits, coords, velocities, atoms, boundary, pairwise_inters, nbs,
-                step_n, Val(energy_units))
-    return pe_vec_nounits
-end
-
-function pairwise_pe_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, inters,
-                             neighbors_var, step_n, ::Val{E}) where E
-    coords = CUDA.Const(coords_var)
-    velocities = CUDA.Const(velocities_var)
-    atoms = CUDA.Const(atoms_var)
-    neighbors = CUDA.Const(neighbors_var)
-
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-    @inbounds if inter_i <= length(neighbors)
-        i, j, special = neighbors[inter_i]
-        coord_i, coord_j, vel_i, vel_j = coords[i], coords[j], velocities[i], velocities[j]
-        dr = vector(coord_i, coord_j, boundary)
-        pe = Molly.potential_energy_gpu(inters[1], dr, atoms[i], atoms[j], E, special, coord_i, coord_j,
-                                  boundary, vel_i, vel_j, step_n)
-        for inter in inters[2:end]
-            pe += Molly.potential_energy_gpu(inter, dr, atoms[i], atoms[j], E, special, coord_i, coord_j,
-                                       boundary, vel_i, vel_j, step_n)
-        end
-        if unit(pe) != E
-            error("wrong energy unit returned, was expecting $E but got $(unit(pe))")
-        end
-        Atomix.@atomic :monotonic energy[1] += ustrip(pe)
-    end
-    return nothing
-end
-
-function Molly.specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList1Atoms, coords::CuArray{SVector{D, C}},
-                          velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T}
-    n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_1_atoms_kernel!(
-            pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is,
-            inter_list.inters, Val(energy_units))
-    return pe_vec_nounits
-end
-
-function Molly.specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList2Atoms, coords::CuArray{SVector{D, C}},
-                          velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T}
-    n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_2_atoms_kernel!(
-            pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is,
-            inter_list.js, inter_list.inters, Val(energy_units))
-    return pe_vec_nounits
-end
-
-function Molly.specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList3Atoms, coords::CuArray{SVector{D, C}},
-                          velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T}
-    n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_3_atoms_kernel!(
-            pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is,
-            inter_list.js, inter_list.ks, inter_list.inters, Val(energy_units))
-    return pe_vec_nounits
-end
-
-function Molly.specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList4Atoms, coords::CuArray{SVector{D, C}},
-                          velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T}
-    n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_4_atoms_kernel!(
-            pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is,
-            inter_list.js, inter_list.ks, inter_list.ls, inter_list.inters, Val(energy_units))
-    return pe_vec_nounits
 end
-
-function specific_pe_1_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary,
-                    step_n, is_var, inters_var, ::Val{E}) where E
-    coords = CUDA.Const(coords_var)
-    velocities = CUDA.Const(velocities_var)
-    atoms = CUDA.Const(atoms_var)
-    is = CUDA.Const(is_var)
-    inters = CUDA.Const(inters_var)
-
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-    @inbounds if inter_i <= length(is)
-        i = is[inter_i]
-        pe = potential_energy_gpu(inters[inter_i], coords[i], boundary, atoms[i], E,
-                                  velocities[i], step_n)
-        if unit(pe) != E
-            error("wrong energy unit returned, was expecting $E but got $(unit(pe))")
-        end
-        Atomix.@atomic :monotonic energy[1] += ustrip(pe)
-    end
-    return nothing
-end
-
-function specific_pe_2_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary,
-                    step_n, is_var, js_var, inters_var, ::Val{E}) where E
-    coords = CUDA.Const(coords_var)
-    velocities = CUDA.Const(velocities_var)
-    atoms = CUDA.Const(atoms_var)
-    is = CUDA.Const(is_var)
-    js = CUDA.Const(js_var)
-    inters = CUDA.Const(inters_var)
-
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-    @inbounds if inter_i <= length(is)
-        i, j = is[inter_i], js[inter_i]
-        pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i],
-                                  atoms[j], E, velocities[i], velocities[j], step_n)
-        if unit(pe) != E
-            error("wrong energy unit returned, was expecting $E but got $(unit(pe))")
-        end
-        Atomix.@atomic :monotonic energy[1] += ustrip(pe)
-    end
-    return nothing
-end
-
-function specific_pe_3_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary,
-                    step_n, is_var, js_var, ks_var, inters_var, ::Val{E}) where E
-    coords = CUDA.Const(coords_var)
-    velocities = CUDA.Const(velocities_var)
-    atoms = CUDA.Const(atoms_var)
-    is = CUDA.Const(is_var)
-    js = CUDA.Const(js_var)
-    ks = CUDA.Const(ks_var)
-    inters = CUDA.Const(inters_var)
-
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-    @inbounds if inter_i <= length(is)
-        i, j, k = is[inter_i], js[inter_i], ks[inter_i]
-        pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary,
-                                  atoms[i], atoms[j], atoms[k], E, velocities[i], velocities[j],
-                                  velocities[k], step_n)
-        if unit(pe) != E
-            error("wrong energy unit returned, was expecting $E but got $(unit(pe))")
-        end
-        Atomix.@atomic :monotonic energy[1] += ustrip(pe)
-    end
-    return nothing
-end
-
-function specific_pe_4_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary,
-                    step_n, is_var, js_var, ks_var, ls_var, inters_var, ::Val{E}) where E
-    coords = CUDA.Const(coords_var)
-    velocities = CUDA.Const(velocities_var)
-    atoms = CUDA.Const(atoms_var)
-    is = CUDA.Const(is_var)
-    js = CUDA.Const(js_var)
-    ks = CUDA.Const(ks_var)
-    ls = CUDA.Const(ls_var)
-    inters = CUDA.Const(inters_var)
-
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-    @inbounds if inter_i <= length(is)
-        i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i]
-        pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l],
-                                  boundary, atoms[i], atoms[j], atoms[k], atoms[l], E,
-                                  velocities[i], velocities[j], velocities[k], velocities[l],
-                                  step_n)
-        if unit(pe) != E
-            error("wrong energy unit returned, was expecting $E but got $(unit(pe))")
-        end
-        Atomix.@atomic :monotonic energy[1] += ustrip(pe)
-    end
-    return nothing
-end
-
-end # module
diff --git a/src/kernels.jl b/src/kernels.jl
@@ -43,7 +43,6 @@ function pairwise_force_gpu!(fs_mat, coords::AbstractArray{SVector{D, C}}, veloc
         kernel! = pairwise_force_kernel_nl!(backend, n_threads_gpu)
         kernel!(fs_mat, coords, velocities, atoms, boundary, pairwise_inters,
                 nbs, step_n, Val(D), Val(force_units); ndrange = length(nbs))
-
     end
     return fs_mat
 end