Adding KernelAbstractions tooling for Molly and tests
leios committed Dec 23, 2024
1 parent 32f5c73 commit 84bb0c8
Showing 32 changed files with 1,006 additions and 916 deletions.
1 change: 1 addition & 0 deletions .github/workflows/CI.yml
@@ -29,6 +29,7 @@ jobs:
- NotGradients
- Gradients
steps:
- run: export UCX_ERROR_SIGNALS="SIGILL,SIGBUS,SIGFPE"
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
2 changes: 1 addition & 1 deletion .gitignore
@@ -2,7 +2,7 @@
*.jl.*.cov
*.jl.mem
docs/build
/Manifest.toml
*Manifest.toml
benchmark/tune.json
benchmark/results
.vscode/settings.json
8 changes: 7 additions & 1 deletion Project.toml
@@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1"
BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864"
Chemfiles = "46823bd8-5fb3-5f92-9aa0-96921f3dd015"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
@@ -17,7 +16,9 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884"
@@ -32,14 +33,17 @@ UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b"
PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"

[extensions]
MollyCUDAExt = "CUDA"
MollyEnzymeExt = "Enzyme"
MollyCUDAEnzymeExt = ["CUDA", "Enzyme"]
MollyGLMakieExt = ["GLMakie", "Colors"]
MollyKernelDensityExt = "KernelDensity"
MollyPythonCallExt = "PythonCall"
@@ -61,7 +65,9 @@ Enzyme = "0.13.20"
EzXML = "1"
FLoops = "0.2"
GLMakie = "0.8, 0.9, 0.10"
GPUArrays = "10"
Graphs = "1.8"
KernelAbstractions = "0.9"
KernelDensity = "0.5, 0.6"
LinearAlgebra = "1.9"
NearestNeighbors = "0.4"
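With CUDA moved from `[deps]` to `[weakdeps]`, the CUDA-specific code now sits in the `MollyCUDAExt` package extension and is only loaded when the user also loads CUDA. A minimal sketch of how this looks from the user side (illustrative only, not part of the diff):

```julia
using Molly   # core Molly loads without any CUDA code

Base.get_extension(Molly, :MollyCUDAExt) === nothing   # true: extension not loaded yet

using CUDA    # loading CUDA triggers loading of MollyCUDAExt

Base.get_extension(Molly, :MollyCUDAExt) isa Module    # true: CUDA-specific methods are now available
```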
52 changes: 23 additions & 29 deletions benchmark/benchmarks.jl
@@ -62,7 +62,8 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at
const starting_coords_f32 = [Float32.(c) for c in starting_coords]
const starting_velocities_f32 = [Float32.(c) for c in starting_velocities]

function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
function test_sim(nl::Bool, parallel::Bool, f32::Bool,
ArrayType::Type{AT}) where AT <: AbstractArray
n_atoms = 400
n_steps = 200
atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol"
@@ -72,34 +73,27 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
r0 = f32 ? 0.2f0u"nm" : 0.2u"nm"
bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)]
specific_inter_lists = (InteractionList2Atoms(
gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)),
gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)),
gpu ? CuArray(bonds) : bonds,
ArrayType(Int32.(collect(1:2:n_atoms))),
ArrayType(Int32.(collect(2:2:n_atoms))),
ArrayType(bonds),
),)

neighbor_finder = NoNeighborFinder()
cutoff = DistanceCutoff(f32 ? 1.0f0u"nm" : 1.0u"nm")
pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),)
if nl
neighbor_finder = DistanceNeighborFinder(
eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
eligible=ArrayType(trues(n_atoms, n_atoms)),
n_steps=10,
dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
)
pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),)
end

if gpu
coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords))
velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities))
atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
else
coords = copy(f32 ? starting_coords_f32 : starting_coords)
velocities = copy(f32 ? starting_velocities_f32 : starting_velocities)
atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]
end
coords = ArrayType(deepcopy(f32 ? starting_coords_f32 : starting_coords))
velocities = ArrayType(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
atoms = ArrayType([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])

sys = System(
atoms=atoms,
@@ -117,22 +111,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
end

runs = [
("CPU" , [false, false, false, false]),
("CPU f32" , [false, false, true , false]),
("CPU NL" , [true , false, false, false]),
("CPU f32 NL", [true , false, true , false]),
("CPU" , [false, false, false, Array]),
("CPU f32" , [false, false, true , Array]),
("CPU NL" , [true , false, false, Array]),
("CPU f32 NL", [true , false, true , Array]),
]
if run_parallel_tests
push!(runs, ("CPU parallel" , [false, true , false, false]))
push!(runs, ("CPU parallel f32" , [false, true , true , false]))
push!(runs, ("CPU parallel NL" , [true , true , false, false]))
push!(runs, ("CPU parallel f32 NL", [true , true , true , false]))
push!(runs, ("CPU parallel" , [false, true , false, Array]))
push!(runs, ("CPU parallel f32" , [false, true , true , Array]))
push!(runs, ("CPU parallel NL" , [true , true , false, Array]))
push!(runs, ("CPU parallel f32 NL", [true , true , true , Array]))
end
if run_gpu_tests
push!(runs, ("GPU" , [false, false, false, true]))
push!(runs, ("GPU f32" , [false, false, true , true]))
push!(runs, ("GPU NL" , [true , false, false, true]))
push!(runs, ("GPU f32 NL", [true , false, true , true]))
if run_cuda_tests
push!(runs, ("GPU" , [false, false, false, CuArray]))
push!(runs, ("GPU f32" , [false, false, true , CuArray]))
push!(runs, ("GPU NL" , [true , false, false, CuArray]))
push!(runs, ("GPU f32 NL", [true , false, true , CuArray]))
end

for (name, args) in runs
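Because `test_sim` now takes an array type instead of a `gpu` flag, runs for other back ends could be added in the same way. A hypothetical sketch (the `run_rocm_tests` flag and the AMDGPU runs below are illustrative, not part of this commit):

```julia
using AMDGPU  # assumed available; provides the ROCArray type

if run_rocm_tests
    push!(runs, ("GPU AMD"       , [false, false, false, ROCArray]))
    push!(runs, ("GPU AMD f32 NL", [true , false, true , ROCArray]))
end
```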
22 changes: 11 additions & 11 deletions benchmark/protein.jl
@@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data")
const ff_dir = joinpath(data_dir, "force_fields")
const openmm_dir = joinpath(data_dir, "openmm_6mrr")

function setup_system(gpu::Bool, f32::Bool, units::Bool)
function setup_system(ArrayType::Type{<:AbstractArray}, f32::Bool, units::Bool)
T = f32 ? Float32 : Float64
ff = MolecularForceField(
T,
@@ -27,7 +27,7 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool)
sys = System(
joinpath(data_dir, "6mrr_equil.pdb"),
ff;
velocities=gpu ? CuArray(velocities) : velocities,
velocities=ArrayType(velocities),
units=units,
gpu=gpu,
dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff),
@@ -42,15 +42,15 @@ end

runs = [
# run_name ArrayType parr f32 units
("CPU 1 thread" , false, false, false, true ),
("CPU 1 thread f32" , false, false, true , true ),
("CPU 1 thread f32 nounits" , false, false, true , false),
("CPU $n_threads threads" , false, true , false, true ),
("CPU $n_threads threads f32" , false, true , true , true ),
("CPU $n_threads threads f32 nounits", false, true , true , false),
("GPU" , true , false, false, true ),
("GPU f32" , true , false, true , true ),
("GPU f32 nounits" , true , false, true , false),
("CPU 1 thread" , Array, false, false, true ),
("CPU 1 thread f32" , Array, false, true , true ),
("CPU 1 thread f32 nounits" , Array, false, true , false),
("CPU $n_threads threads" , Array, true , false, true ),
("CPU $n_threads threads f32" , Array, true , true , true ),
("CPU $n_threads threads f32 nounits", Array, true , true , false),
("GPU" , CuArray, false, false, true ),
("GPU f32" , CuArray, false, true , true ),
("GPU f32 nounits" , CuArray, false, true , false),
]

for (run_name, gpu, parallel, f32, units) in runs
15 changes: 13 additions & 2 deletions docs/src/documentation.md
@@ -135,11 +135,21 @@ visualize(sys.loggers.coords, boundary, "sim_lj.mp4")

## GPU acceleration

To run simulations on the GPU you will need to have a CUDA-compatible device.
[CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) is used to run on the device.
To run simulations on the GPU you will need to have a GPU available and then load the appropriate package:

| Hardware Available | Necessary Package | Array Type |
| ------------------ | ----------------- | ---------- |
| Parallel CPU | none | Array |
| NVIDIA GPU | CUDA | CuArray |
| AMD GPU | AMDGPU | ROCArray |
| Intel GPU | oneAPI | oneArray |
| Apple Silicon | Metal | MtlArray |

Note that Metal / Apple Silicon devices only support 32-bit floating point precision, so be sure to use `Float32` where necessary.
Simulation setup is similar to above, but with the coordinates, velocities and atoms moved to the GPU.
This example also shows setting up a simulation to run with `Float32`, which gives better performance on GPUs.
Of course, you will need to determine whether this level of numerical accuracy is appropriate in your case.
Here is an example script for an NVIDIA GPU using CUDA:
```julia
using Molly
using CUDA
@@ -168,6 +178,7 @@ sys = System(
simulate!(deepcopy(sys), simulator, 20) # Compile function
simulate!(sys, simulator, 1_000)
```
To use another GPU package, just swap out `CUDA` for your desired package and `CuArray` for your desired array type.
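For example, a sketch of the start of the same script for an AMD GPU (assuming AMDGPU.jl is installed; the values mirror the CUDA example above) might look like:
```julia
using Molly
using AMDGPU  # instead of CUDA

n_atoms = 100
atom_mass = 10.0f0u"g/mol"
boundary = CubicBoundary(2.0f0u"nm")

# The only change from the CUDA script is the array type used to move data
# to the device: ROCArray rather than CuArray
atoms = ROCArray([Atom(mass=atom_mass, σ=0.3f0u"nm", ϵ=0.2f0u"kJ * mol^-1") for i in 1:n_atoms])
coords = ROCArray(place_atoms(n_atoms, boundary; min_dist=0.3f0u"nm"))
velocities = ROCArray([random_velocity(atom_mass, 100.0f0u"K") for i in 1:n_atoms])
```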
The device to run on can be changed with `device!`, e.g. `device!(1)`.
The GPU code path is currently designed to be compatible with differentiable simulation and runs slower than related software, but this is an active area of development.
Nonetheless, GPU performance is significantly better than CPU performance and is good enough for many applications.
13 changes: 13 additions & 0 deletions ext/MollyCUDAEnzymeExt.jl
@@ -0,0 +1,13 @@
module MollyCUDAEnzymeExt

using Molly
using CUDA
using Enzyme

# Look up the CUDA extension module that defines the kernel launch helpers
ext = Base.get_extension(Molly, :MollyCUDAExt)

# The thread/block count helpers only set launch configuration, so mark them
# as inactive for Enzyme autodiff
EnzymeRules.inactive(::typeof(ext.cuda_threads_blocks_pairwise), args...) = nothing
EnzymeRules.inactive(::typeof(ext.cuda_threads_blocks_specific), args...) = nothing

end