Adding KernelAbstractions tooling for Molly and tests

JuliaMolSim · Dec 1, 2024 · a173ac2 · a173ac2
1 parent 916db90
commit a173ac2
Show file tree

Hide file tree

Showing 27 changed files with 709 additions and 317 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -29,6 +29,7 @@ jobs:
           - NotGradients
           - Gradients
     steps:
+      - run: export UCX_ERROR_SIGNALS="SIGILL,SIGBUS,SIGFPE"
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v2
         with:

diff --git a/.gitignore b/.gitignore
@@ -2,7 +2,7 @@
 *.jl.*.cov
 *.jl.mem
 docs/build
-/Manifest.toml
+*Manifest.toml
 benchmark/tune.json
 benchmark/results
 .vscode/settings.json
diff --git a/Project.toml b/Project.toml
@@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
 AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1"
 BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864"
 Chemfiles = "46823bd8-5fb3-5f92-9aa0-96921f3dd015"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
@@ -17,7 +16,9 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
 FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
 PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884"
@@ -32,6 +33,7 @@ UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
 UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
 [weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
@@ -40,6 +42,7 @@ PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
 
 [extensions]
 MollyEnzymeExt = "Enzyme"
+MollyCUDAExt = "CUDA"
 MollyGLMakieExt = ["GLMakie", "Colors"]
 MollyKernelDensityExt = "KernelDensity"
 MollyPythonCallExt = "PythonCall"
@@ -61,7 +64,9 @@ Enzyme = "0.13"
 EzXML = "1"
 FLoops = "0.2"
 GLMakie = "0.8, 0.9, 0.10"
+GPUArrays = "10"
 Graphs = "1.8"
+KernelAbstractions = "0.9"
 KernelDensity = "0.5, 0.6"
 LinearAlgebra = "1.9"
 NearestNeighbors = "0.4"

diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
@@ -62,7 +62,8 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at
 const starting_coords_f32 = [Float32.(c) for c in starting_coords]
 const starting_velocities_f32 = [Float32.(c) for c in starting_velocities]
 
-function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
+function test_sim(nl::Bool, parallel::Bool, f32::Bool,
+                  ArrayType::Type{AT}) where AT <: AbstractArray
     n_atoms = 400
     n_steps = 200
     atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol"
@@ -72,34 +73,27 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
     r0 = f32 ? 0.2f0u"nm" : 0.2u"nm"
     bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)]
     specific_inter_lists = (InteractionList2Atoms(
-        gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)),
-        gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)),
-        gpu ? CuArray(bonds) : bonds,
+        ArrayType(Int32.(collect(1:2:n_atoms))),
+        ArrayType(Int32.(collect(2:2:n_atoms))),
+        ArrayType(bonds),
     ),)
 
     neighbor_finder = NoNeighborFinder()
     cutoff = DistanceCutoff(f32 ? 1.0f0u"nm" : 1.0u"nm")
     pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),)
     if nl
         neighbor_finder = DistanceNeighborFinder(
-            eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
+            eligible=ArrayType(trues(n_atoms, n_atoms)),
             n_steps=10,
             dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
         )
         pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),)
     end
 
-    if gpu
-        coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords))
-        velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities))
-        atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
-                              ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
-    else
-        coords = copy(f32 ? starting_coords_f32 : starting_coords)
-        velocities = copy(f32 ? starting_velocities_f32 : starting_velocities)
-        atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
-                      ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]
-    end
+    coords = ArrayType(deepcopy(f32 ? starting_coords_f32 : starting_coords))
+    velocities = ArrayType(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
+    atoms = ArrayType([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
+                            ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
 
     sys = System(
         atoms=atoms,
@@ -117,22 +111,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
 end
 
 runs = [
-    ("CPU"       , [false, false, false, false]),
-    ("CPU f32"   , [false, false, true , false]),
-    ("CPU NL"    , [true , false, false, false]),
-    ("CPU f32 NL", [true , false, true , false]),
+    ("CPU"       , [false, false, false, Array]),
+    ("CPU f32"   , [false, false, true , Array]),
+    ("CPU NL"    , [true , false, false, Array]),
+    ("CPU f32 NL", [true , false, true , Array]),
 ]
 if run_parallel_tests
-    push!(runs, ("CPU parallel"       , [false, true , false, false]))
-    push!(runs, ("CPU parallel f32"   , [false, true , true , false]))
-    push!(runs, ("CPU parallel NL"    , [true , true , false, false]))
-    push!(runs, ("CPU parallel f32 NL", [true , true , true , false]))
+    push!(runs, ("CPU parallel"       , [false, true , false, Array]))
+    push!(runs, ("CPU parallel f32"   , [false, true , true , Array]))
+    push!(runs, ("CPU parallel NL"    , [true , true , false, Array]))
+    push!(runs, ("CPU parallel f32 NL", [true , true , true , Array]))
 end
-if run_gpu_tests
-    push!(runs, ("GPU"       , [false, false, false, true]))
-    push!(runs, ("GPU f32"   , [false, false, true , true]))
-    push!(runs, ("GPU NL"    , [true , false, false, true]))
-    push!(runs, ("GPU f32 NL", [true , false, true , true]))
+if run_cuda_tests
+    push!(runs, ("GPU"       , [false, false, false, CuArray]))
+    push!(runs, ("GPU f32"   , [false, false, true , CuArray]))
+    push!(runs, ("GPU NL"    , [true , false, false, CuArray]))
+    push!(runs, ("GPU f32 NL", [true , false, true , CuArray]))
 end
 
 for (name, args) in runs

diff --git a/benchmark/protein.jl b/benchmark/protein.jl
@@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data")
 const ff_dir = joinpath(data_dir, "force_fields")
 const openmm_dir = joinpath(data_dir, "openmm_6mrr")
 
-function setup_system(gpu::Bool, f32::Bool, units::Bool)
+function setup_system(ArrayType::AbstractArray, f32::Bool, units::Bool)
     T = f32 ? Float32 : Float64
     ff = MolecularForceField(
         T,
@@ -27,7 +27,7 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool)
     sys = System(
         joinpath(data_dir, "6mrr_equil.pdb"),
         ff;
-        velocities=gpu ? CuArray(velocities) : velocities,
+        velocities=ArrayType(velocities),
         units=units,
         gpu=gpu,
         dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff),
@@ -42,15 +42,15 @@ end
 
 runs = [
     # run_name                             gpu    parr   f32    units
-    ("CPU 1 thread"                      , false, false, false, true ),
-    ("CPU 1 thread f32"                  , false, false, true , true ),
-    ("CPU 1 thread f32 nounits"          , false, false, true , false),
-    ("CPU $n_threads threads"            , false, true , false, true ),
-    ("CPU $n_threads threads f32"        , false, true , true , true ),
-    ("CPU $n_threads threads f32 nounits", false, true , true , false),
-    ("GPU"                               , true , false, false, true ),
-    ("GPU f32"                           , true , false, true , true ),
-    ("GPU f32 nounits"                   , true , false, true , false),
+    ("CPU 1 thread"                      , Array, false, false, true ),
+    ("CPU 1 thread f32"                  , Array, false, true , true ),
+    ("CPU 1 thread f32 nounits"          , Array, false, true , false),
+    ("CPU $n_threads threads"            , Array, true , false, true ),
+    ("CPU $n_threads threads f32"        , Array, true , true , true ),
+    ("CPU $n_threads threads f32 nounits", Array, true , true , false),
+    ("GPU"                               , CuArray, false, false, true ),
+    ("GPU f32"                           , CuArray, false, true , true ),
+    ("GPU f32 nounits"                   , CuArray, false, true , false),
 ]
 
 for (run_name, gpu, parallel, f32, units) in runs

diff --git a/src/cuda.jl → ext/MollyCUDAExt.jl b/src/cuda.jl → ext/MollyCUDAExt.jl
@@ -1,3 +1,12 @@
+module MollyCUDAExt
+
+using Molly
+using CUDA
+using ChainRulesCore
+using Atomix
+
+CUDA.Const(nl::Molly.NoNeighborList) = nl
+
 # CUDA.jl kernels
 const WARPSIZE = UInt32(32)
 
@@ -513,3 +522,5 @@ function specific_pe_4_atoms_kernel!(energy, coords_var, velocities_var, atoms_v
     end
     return nothing
 end
+
+end # module
diff --git a/ext/MollyGLMakieExt.jl b/ext/MollyGLMakieExt.jl
@@ -6,8 +6,8 @@ module MollyGLMakieExt
 using Molly
 import AtomsBase
 using GLMakie
-using Colors
 using Unitful
+using Colors
 
 using LinearAlgebra
 

diff --git a/ext/MollyPythonCallExt.jl b/ext/MollyPythonCallExt.jl
@@ -6,7 +6,7 @@ module MollyPythonCallExt
 using Molly
 using PythonCall
 import AtomsCalculators
-using CUDA
+using GPUArrays
 using StaticArrays
 using Unitful
 
@@ -91,7 +91,7 @@ end
 
 uconvert_vec(x...) = uconvert.(x...)
 
-function AtomsCalculators.forces(sys::System{D, G, T},
+function AtomsCalculators.forces(sys::System{D, AT, T},
                                  ase_calc::ASECalculator;
                                  kwargs...) where {D, G, T}
     update_ase_calc!(ase_calc, sys)
@@ -105,7 +105,7 @@ function AtomsCalculators.forces(sys::System{D, G, T},
     else
         fs_unit = uconvert_vec.(sys.force_units, fs * u"eV/Å")
     end
-    return G ? CuArray(fs_unit) : fs_unit
+    return AT <: AbstractGPUArray ? AT(fs_unit) : fs_unit
 end
 
 function AtomsCalculators.potential_energy(sys::System{D, G, T},

diff --git a/src/Molly.jl b/src/Molly.jl
@@ -11,7 +11,8 @@ import BioStructures # Imported to avoid clashing names
 using CellListMap
 import Chemfiles
 using Combinatorics
-using CUDA
+using KernelAbstractions
+using GPUArrays
 using DataStructures
 using Distances
 using Distributions
@@ -34,7 +35,7 @@ include("types.jl")
 include("units.jl")
 include("spatial.jl")
 include("cutoffs.jl")
-include("cuda.jl")
+include("kernels.jl")
 include("force.jl")
 include("interactions/lennard_jones.jl")
 include("interactions/soft_sphere.jl")

diff --git a/src/analysis.jl b/src/analysis.jl
@@ -90,6 +90,7 @@ function hydrodynamic_radius(coords::AbstractArray{SVector{D, T}}, boundary) whe
     n_atoms = length(coords)
     diag_cpu = Diagonal(ones(T, n_atoms))
     diag = isa(coords, CuArray) ? CuArray(diag_cpu) : diag_cpu
+    diag = isa(coords, AbstractGPUArray) ? get_array_type(coords)((diag_cpu)) : diag_cpu
     dists = distances(coords, boundary) .+ diag
     sum_inv_dists = sum(inv.(dists)) - sum(inv(diag))
     inv_R_hyd = sum_inv_dists / (2 * n_atoms^2)

diff --git a/src/coupling.jl b/src/coupling.jl
@@ -58,9 +58,9 @@ struct AndersenThermostat{T, C}
     coupling_const::C
 end
 
-function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat, sim,
+function apply_coupling!(sys::System{D, AT}, thermostat::AndersenThermostat, sim,
                          neighbors=nothing, step_n::Integer=0;
-                         n_threads::Integer=Threads.nthreads()) where D
+                         n_threads::Integer=Threads.nthreads()) where {D, AT}
     for i in eachindex(sys)
         if rand() < (sim.dt / thermostat.coupling_const)
             sys.velocities[i] = random_velocity(mass(sys.atoms[i]), thermostat.temperature, sys.k;
@@ -70,9 +70,9 @@ function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat,
     return false
 end
 
-function apply_coupling!(sys::System{D, true, T}, thermostat::AndersenThermostat, sim,
+function apply_coupling!(sys::System{D, AT, T}, thermostat::AndersenThermostat, sim,
                          neighbors=nothing, step_n::Integer=0;
-                         n_threads::Integer=Threads.nthreads()) where {D, T}
+                         n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T}
     atoms_to_bump = T.(rand(length(sys)) .< (sim.dt / thermostat.coupling_const))
     atoms_to_leave = one(T) .- atoms_to_bump
     atoms_to_bump_dev = move_array(atoms_to_bump, sys)

diff --git a/src/energy.jl b/src/energy.jl
@@ -72,8 +72,8 @@ function potential_energy(sys; n_threads::Integer=Threads.nthreads())
     return potential_energy(sys, find_neighbors(sys; n_threads=n_threads); n_threads=n_threads)
 end
 
-function potential_energy(sys::System{D, false, T}, neighbors, step_n::Integer=0;
-                          n_threads::Integer=Threads.nthreads()) where {D, T}
+function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0;
+                          n_threads::Integer=Threads.nthreads()) where {D, AT, T}
     pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters))
     pairwise_inters_nl   = filter( use_neighbors, values(sys.pairwise_inters))
     sils_1_atoms = filter(il -> il isa InteractionList1Atoms, values(sys.specific_inter_lists))
@@ -247,10 +247,11 @@ function specific_pe(atoms, coords, velocities, boundary, energy_units, sils_1_a
     return pe
 end
 
-function potential_energy(sys::System{D, true, T}, neighbors, step_n::Integer=0;
-                          n_threads::Integer=Threads.nthreads()) where {D, T}
-    pe_vec_nounits = CUDA.zeros(T, 1)
+function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0;
+                          n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T}
+    n_atoms = length(sys)
     val_ft = Val(T)
+    pe_vec_nounits = KernelAbstractions.zeros(get_backend(sys.coords), T, 1)
 
     pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters))
     if length(pairwise_inters_nonl) > 0

diff --git a/src/force.jl b/src/force.jl
@@ -123,8 +123,9 @@ function init_forces_buffer(forces_nounits, n_threads)
     end
 end
 
-function init_forces_buffer(forces_nounits::CuArray{SVector{D, T}}, n_threads) where {D, T}
-    return CUDA.zeros(T, D, length(forces_nounits))
+function init_forces_buffer(forces_nounits::AbstractGPUArray{SVector{D, T}}, n_threads) where {D, T}
+    return KernelAbstractions.zeros(get_backend(forces_nounits),
+                                    T, D, length(forces_nounits))
 end
 
 """
@@ -346,9 +347,9 @@ function specific_forces!(fs_nounits, atoms, coords, velocities, boundary, force
     return fs_nounits
 end
 
-function forces_nounits!(fs_nounits, sys::System{D, true, T}, neighbors,
+function forces_nounits!(fs_nounits, sys::System{D, AT, T}, neighbors,
                          fs_mat=CUDA.zeros(T, D, length(sys)), step_n::Integer=0;
-                         n_threads::Integer=Threads.nthreads()) where {D, T}
+                         n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T}
     fill!(fs_mat, zero(T))
     val_ft = Val(T)