Adding KernelAbstractions tooling for Molly and tests
leios committed Dec 23, 2024
1 parent 32f5c73 commit 84bb0c8
Showing 32 changed files with 1,006 additions and 916 deletions.
1 change: 1 addition & 0 deletions .github/workflows/CI.yml
@@ -29,6 +29,7 @@ jobs:
- NotGradients
- Gradients
steps:
- run: export UCX_ERROR_SIGNALS="SIGILL,SIGBUS,SIGFPE"
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
2 changes: 1 addition & 1 deletion .gitignore
@@ -2,7 +2,7 @@
*.jl.*.cov
*.jl.mem
docs/build
/Manifest.toml
*Manifest.toml
benchmark/tune.json
benchmark/results
.vscode/settings.json
8 changes: 7 additions & 1 deletion Project.toml
@@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1"
BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864"
Chemfiles = "46823bd8-5fb3-5f92-9aa0-96921f3dd015"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
@@ -17,7 +16,9 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884"
@@ -32,14 +33,17 @@ UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b"
PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"

[extensions]
MollyCUDAExt = "CUDA"
MollyEnzymeExt = "Enzyme"
MollyCUDAEnzymeExt = ["CUDA", "Enzyme"]
MollyGLMakieExt = ["GLMakie", "Colors"]
MollyKernelDensityExt = "KernelDensity"
MollyPythonCallExt = "PythonCall"
@@ -61,7 +65,9 @@ Enzyme = "0.13.20"
EzXML = "1"
FLoops = "0.2"
GLMakie = "0.8, 0.9, 0.10"
GPUArrays = "10"
Graphs = "1.8"
KernelAbstractions = "0.9"
KernelDensity = "0.5, 0.6"
LinearAlgebra = "1.9"
NearestNeighbors = "0.4"
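With CUDA moved from `[deps]` to `[weakdeps]`, the CUDA-specific code now sits in the `MollyCUDAExt` package extension and is only loaded when the user also loads CUDA. A minimal sketch of how this looks from the user side (illustrative only, not part of the diff):

```julia
using Molly   # core Molly loads without any CUDA code

Base.get_extension(Molly, :MollyCUDAExt) === nothing   # true: extension not loaded yet

using CUDA    # loading CUDA triggers loading of MollyCUDAExt

Base.get_extension(Molly, :MollyCUDAExt) isa Module    # true: CUDA-specific methods are now available
```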
52 changes: 23 additions & 29 deletions benchmark/benchmarks.jl
@@ -62,7 +62,8 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at
const starting_coords_f32 = [Float32.(c) for c in starting_coords]
const starting_velocities_f32 = [Float32.(c) for c in starting_velocities]

function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
function test_sim(nl::Bool, parallel::Bool, f32::Bool,
ArrayType::Type{AT}) where AT <: AbstractArray
n_atoms = 400
n_steps = 200
atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol"
@@ -72,34 +73,27 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
r0 = f32 ? 0.2f0u"nm" : 0.2u"nm"
bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)]
specific_inter_lists = (InteractionList2Atoms(
gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)),
gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)),
gpu ? CuArray(bonds) : bonds,
ArrayType(Int32.(collect(1:2:n_atoms))),
ArrayType(Int32.(collect(2:2:n_atoms))),
ArrayType(bonds),
),)

neighbor_finder = NoNeighborFinder()
cutoff = DistanceCutoff(f32 ? 1.0f0u"nm" : 1.0u"nm")
pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),)
if nl
neighbor_finder = DistanceNeighborFinder(
eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
eligible=ArrayType(trues(n_atoms, n_atoms)),
n_steps=10,
dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
)
pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),)
end

if gpu
coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords))
velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities))
atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
else
coords = copy(f32 ? starting_coords_f32 : starting_coords)
velocities = copy(f32 ? starting_velocities_f32 : starting_velocities)
atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]
end
coords = ArrayType(deepcopy(f32 ? starting_coords_f32 : starting_coords))
velocities = ArrayType(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
atoms = ArrayType([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])

sys = System(
atoms=atoms,
@@ -117,22 +111,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
end

runs = [
("CPU" , [false, false, false, false]),
("CPU f32" , [false, false, true , false]),
("CPU NL" , [true , false, false, false]),
("CPU f32 NL", [true , false, true , false]),
("CPU" , [false, false, false, Array]),
("CPU f32" , [false, false, true , Array]),
("CPU NL" , [true , false, false, Array]),
("CPU f32 NL", [true , false, true , Array]),
]
if run_parallel_tests
push!(runs, ("CPU parallel" , [false, true , false, false]))
push!(runs, ("CPU parallel f32" , [false, true , true , false]))
push!(runs, ("CPU parallel NL" , [true , true , false, false]))
push!(runs, ("CPU parallel f32 NL", [true , true , true , false]))
push!(runs, ("CPU parallel" , [false, true , false, Array]))
push!(runs, ("CPU parallel f32" , [false, true , true , Array]))
push!(runs, ("CPU parallel NL" , [true , true , false, Array]))
push!(runs, ("CPU parallel f32 NL", [true , true , true , Array]))
end
if run_gpu_tests
push!(runs, ("GPU" , [false, false, false, true]))
push!(runs, ("GPU f32" , [false, false, true , true]))
push!(runs, ("GPU NL" , [true , false, false, true]))
push!(runs, ("GPU f32 NL", [true , false, true , true]))
if run_cuda_tests
push!(runs, ("GPU" , [false, false, false, CuArray]))
push!(runs, ("GPU f32" , [false, false, true , CuArray]))
push!(runs, ("GPU NL" , [true , false, false, CuArray]))
push!(runs, ("GPU f32 NL", [true , false, true , CuArray]))
end

for (name, args) in runs
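Because `test_sim` now takes an array type instead of a `gpu` flag, runs for other back ends could be added in the same way. A hypothetical sketch (the `run_rocm_tests` flag and the AMDGPU runs below are illustrative, not part of this commit):

```julia
using AMDGPU  # assumed available; provides the ROCArray type

if run_rocm_tests
    push!(runs, ("GPU AMD"       , [false, false, false, ROCArray]))
    push!(runs, ("GPU AMD f32 NL", [true , false, true , ROCArray]))
end
```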
22 changes: 11 additions & 11 deletions benchmark/protein.jl
@@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data")
const ff_dir = joinpath(data_dir, "force_fields")
const openmm_dir = joinpath(data_dir, "openmm_6mrr")

function setup_system(gpu::Bool, f32::Bool, units::Bool)
function setup_system(ArrayType::Type{<:AbstractArray}, f32::Bool, units::Bool)
T = f32 ? Float32 : Float64
ff = MolecularForceField(
T,
@@ -27,7 +27,7 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool)
sys = System(
joinpath(data_dir, "6mrr_equil.pdb"),
ff;
velocities=gpu ? CuArray(velocities) : velocities,
velocities=ArrayType(velocities),
units=units,
gpu=gpu,
dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff),
@@ -42,15 +42,15 @@ end

runs = [
# run_name ArrayType parr f32 units
("CPU 1 thread" , false, false, false, true ),
("CPU 1 thread f32" , false, false, true , true ),
("CPU 1 thread f32 nounits" , false, false, true , false),
("CPU $n_threads threads" , false, true , false, true ),
("CPU $n_threads threads f32" , false, true , true , true ),
("CPU $n_threads threads f32 nounits", false, true , true , false),
("GPU" , true , false, false, true ),
("GPU f32" , true , false, true , true ),
("GPU f32 nounits" , true , false, true , false),
("CPU 1 thread" , Array, false, false, true ),
("CPU 1 thread f32" , Array, false, true , true ),
("CPU 1 thread f32 nounits" , Array, false, true , false),
("CPU $n_threads threads" , Array, true , false, true ),
("CPU $n_threads threads f32" , Array, true , true , true ),
("CPU $n_threads threads f32 nounits", Array, true , true , false),
("GPU" , CuArray, false, false, true ),
("GPU f32" , CuArray, false, true , true ),
("GPU f32 nounits" , CuArray, false, true , false),
]

for (run_name, gpu, parallel, f32, units) in runs
15 changes: 13 additions & 2 deletions docs/src/documentation.md
@@ -135,11 +135,21 @@ visualize(sys.loggers.coords, boundary, "sim_lj.mp4")

## GPU acceleration

To run simulations on the GPU you will need to have a CUDA-compatible device.
[CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) is used to run on the device.
To run simulations on the GPU you will need to have a GPU available and then load the appropriate package:

| Hardware Available | Necessary Package | Array Type |
| ------------------ | ----------------- | ---------- |
| Parallel CPU | none | Array |
| NVIDIA GPU | CUDA | CuArray |
| AMD GPU | AMDGPU | ROCArray |
| Intel GPU | oneAPI | oneArray |
| Apple Silicon | Metal | MtlArray |

Note that Metal / Apple Silicon devices only support 32-bit floating point precision, so be sure to use `Float32` where necessary.
Simulation setup is similar to above, but with the coordinates, velocities and atoms moved to the GPU.
This example also shows setting up a simulation to run with `Float32`, which gives better performance on GPUs.
Of course, you will need to determine whether this level of numerical accuracy is appropriate in your case.
Here is an example script for an NVIDIA GPU using CUDA:
```julia
using Molly
using CUDA
@@ -168,6 +178,7 @@ sys = System(
simulate!(deepcopy(sys), simulator, 20) # Compile function
simulate!(sys, simulator, 1_000)
```
To use another GPU package, just swap out `CUDA` for your desired package and `CuArray` for your desired array type.
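For example, a sketch of the start of the same script for an AMD GPU (assuming AMDGPU.jl is installed; the values mirror the CUDA example above) might look like:
```julia
using Molly
using AMDGPU  # instead of CUDA

n_atoms = 100
atom_mass = 10.0f0u"g/mol"
boundary = CubicBoundary(2.0f0u"nm")

# The only change from the CUDA script is the array type used to move data
# to the device: ROCArray rather than CuArray
atoms = ROCArray([Atom(mass=atom_mass, σ=0.3f0u"nm", ϵ=0.2f0u"kJ * mol^-1") for i in 1:n_atoms])
coords = ROCArray(place_atoms(n_atoms, boundary; min_dist=0.3f0u"nm"))
velocities = ROCArray([random_velocity(atom_mass, 100.0f0u"K") for i in 1:n_atoms])
```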
The device to run on can be changed with `device!`, e.g. `device!(1)`.
The GPU code path is currently designed to be compatible with differentiable simulation and runs slower than related software, but this is an active area of development.
Nonetheless, GPU performance is significantly better than CPU performance and is good enough for many applications.
13 changes: 13 additions & 0 deletions ext/MollyCUDAEnzymeExt.jl
@@ -0,0 +1,13 @@
module MollyCUDAEnzymeExt

using Molly
using CUDA
using Enzyme

# Look up the CUDA extension module that defines the kernel launch helpers
ext = Base.get_extension(Molly, :MollyCUDAExt)

# The thread/block count helpers only set launch configuration, so mark them
# as inactive for Enzyme autodiff
EnzymeRules.inactive(::typeof(ext.cuda_threads_blocks_pairwise), args...) = nothing
EnzymeRules.inactive(::typeof(ext.cuda_threads_blocks_specific), args...) = nothing

end