From eebb7e5d916d8f9153c5e0fd7d184c54a58c7086 Mon Sep 17 00:00:00 2001 From: Omar Elrefaei Date: Mon, 15 Apr 2024 18:51:47 -0400 Subject: [PATCH 1/2] initial setup to allow GPU simulation with constraints (this gpu path currently severely underperforms cpu) --- src/constraints/shake.jl | 179 +++++++++++++++++++++++++++------------ 1 file changed, 125 insertions(+), 54 deletions(-) diff --git a/src/constraints/shake.jl b/src/constraints/shake.jl index cbb8dd2d..6570c208 100644 --- a/src/constraints/shake.jl +++ b/src/constraints/shake.jl @@ -43,48 +43,61 @@ function apply_position_constraints!(sys, ca::SHAKE_RATTLE, coord_storage; converged = false while !converged + # syncing here is an incremental improvement for gpu and inc reg for cpu for cluster in ca.clusters # Cannot parallelize this - for constraint in cluster.constraints - k1, k2 = constraint.i, constraint.j + if coord_storage isa CuArray + # @show "GPU VERSION of apply_position_constraints_kernel!" + # blocking=true saves a fair # of allocations, but not time + CUDA.@sync blocking = true @cuda apply_position_constraints_kernel!(sys.coords, sys.atoms, sys.boundary, cluster, coord_storage, ca.dist_tolerance) + # @cuda apply_position_constraints_kernel!(sys.coords, sys.atoms, sys.boundary, cluster, coord_storage, ca.dist_tolerance) + else + # @show "CPU VERSION of apply_position_constraints_kernel!" + apply_position_constraints_kernel!(sys.coords, sys.atoms, sys.boundary, cluster, coord_storage, ca.dist_tolerance) - # Vector between the atoms after unconstrained update (s) - s12 = vector(sys.coords[k1], sys.coords[k2], sys.boundary) - - # Vector between the atoms before unconstrained update (r) - r12 = vector(coord_storage[k1], coord_storage[k2], sys.boundary) + end + end - if abs(norm(s12) - constraint.dist) > ca.dist_tolerance - m1_inv = inv(mass(sys.atoms[k1])) - m2_inv = inv(mass(sys.atoms[k2])) - a = (m1_inv + m2_inv)^2 * sum(abs2, r12) - b = -2 * (m1_inv + m2_inv) * dot(r12, s12) - c = sum(abs2, s12) - (constraint.dist)^2 - D = b^2 - 4*a*c + converged = check_position_constraints(sys, ca) + end + return sys +end - if ustrip(D) < 0.0 - @warn "SHAKE determinant negative, setting to 0.0" - D = zero(D) - end +function apply_position_constraints_kernel!(coords, atoms, boundary, cluster, coord_storage, dist_tolerance) + @inbounds for constraint in cluster.constraints + k1, k2 = constraint.i, constraint.j + + # Vector between the atoms after unconstrained update (s) + s12 = vector(coords[k1], coords[k2], boundary) + + # Vector between the atoms before unconstrained update (r) + r12 = vector(coord_storage[k1], coord_storage[k2], boundary) + if abs(norm(s12) - constraint.dist) > dist_tolerance + m1_inv = inv(mass(atoms[k1])) + m2_inv = inv(mass(atoms[k2])) + a = (m1_inv + m2_inv)^2 * sum(abs2, r12) + b = -2 * (m1_inv + m2_inv) * dot(r12, s12) + c = sum(abs2, s12) - (constraint.dist)^2 + D = b^2 - 4 * a * c + + if ustrip(D) < 0.0 + # @warn "SHAKE determinant negative, setting to 0.0" + D = zero(D) + end - # Quadratic solution for g - α1 = (-b + sqrt(D)) / (2*a) - α2 = (-b - sqrt(D)) / (2*a) + # Quadratic solution for g + α1 = (-b + sqrt(D)) / (2 * a) + α2 = (-b - sqrt(D)) / (2 * a) - g = abs(α1) <= abs(α2) ? α1 : α2 + g = abs(α1) <= abs(α2) ? α1 : α2 - # Update positions - δri1 = r12 .* (g*m1_inv) - δri2 = r12 .* (-g*m2_inv) + # Update positions + δri1 = r12 .* (g * m1_inv) + δri2 = r12 .* (-g * m2_inv) - sys.coords[k1] += δri1 - sys.coords[k2] += δri2 - end - end + coords[k1] += δri1 + coords[k2] += δri2 end - - converged = check_position_constraints(sys, ca) end - return sys end function apply_velocity_constraints!(sys, ca::SHAKE_RATTLE; n_threads::Integer=Threads.nthreads()) @@ -93,34 +106,44 @@ function apply_velocity_constraints!(sys, ca::SHAKE_RATTLE; n_threads::Integer=T while !converged for cluster in ca.clusters # Cannot parallelize this - for constraint in cluster.constraints - k1, k2 = constraint.i, constraint.j + if sys.coords isa CuArray + CUDA.@sync blocking = true @cuda apply_velocity_constraints_kernel!(sys.coords, sys.atoms, sys.velocities, sys.boundary, cluster, ca.vel_tolerance) + # @cuda apply_velocity_constraints_kernel!(sys.coords, sys.atoms, sys.velocities, sys.boundary, cluster, ca.vel_tolerance) + else + apply_velocity_constraints_kernel!(sys.coords, sys.atoms, sys.velocities, sys.boundary, cluster, ca.vel_tolerance) + end + end - inv_m1 = inv(mass(sys.atoms[k1])) - inv_m2 = inv(mass(sys.atoms[k2])) + converged = check_velocity_constraints(sys, ca) + end + return sys +end - # Vector between the atoms after SHAKE constraint - r_k1k2 = vector(sys.coords[k1], sys.coords[k2], sys.boundary) +function apply_velocity_constraints_kernel!(coords, atoms, velocities, boundary, cluster, vel_tolerance) + @inbounds for constraint in cluster.constraints + k1, k2 = constraint.i, constraint.j - # Difference between unconstrainted velocities - v_k1k2 = sys.velocities[k2] .- sys.velocities[k1] + inv_m1 = inv(mass(atoms[k1])) + inv_m2 = inv(mass(atoms[k2])) - err = abs(dot(r_k1k2, v_k1k2)) - if err > ca.vel_tolerance - # Re-arrange constraint equation to solve for Lagrange multiplier - # This has a factor of dt which cancels out in the velocity update - λₖ = -dot(r_k1k2, v_k1k2) / (dot(r_k1k2, r_k1k2) * (inv_m1 + inv_m2)) + # Vector between the atoms after SHAKE constraint + r_k1k2 = vector(coords[k1], coords[k2], boundary) - # Correct velocities - sys.velocities[k1] -= inv_m1 .* λₖ .* r_k1k2 - sys.velocities[k2] += inv_m2 .* λₖ .* r_k1k2 - end - end - end + # Difference between unconstrainted velocities + v_k1k2 = velocities[k2] .- velocities[k1] - converged = check_velocity_constraints(sys, ca) + err = abs(dot(r_k1k2, v_k1k2)) + if err > vel_tolerance + # Re-arrange constraint equation to solve for Lagrange multiplier + # This has a factor of dt which cancels out in the velocity update + λₖ = -dot(r_k1k2, v_k1k2) / (dot(r_k1k2, r_k1k2) * (inv_m1 + inv_m2)) + + # Correct velocities + velocities[k1] -= inv_m1 .* λₖ .* r_k1k2 + velocities[k2] += inv_m2 .* λₖ .* r_k1k2 + end end - return sys + end """ @@ -128,7 +151,7 @@ end Checks if the position constraints are satisfied by the current coordinates of `sys`. """ -function check_position_constraints(sys, ca::SHAKE_RATTLE) +function check_position_constraints(sys::System{D,false}, ca::SHAKE_RATTLE) where {D} max_err = typemin(float_type(sys)) * unit(eltype(eltype(sys.coords))) for cluster in ca.clusters for constraint in cluster.constraints @@ -142,6 +165,29 @@ function check_position_constraints(sys, ca::SHAKE_RATTLE) return max_err < ca.dist_tolerance end +# GPU version of check_position_constraints +function check_position_constraints(sys::System{D,true}, ca::SHAKE_RATTLE) where {D} + # println("GPU VERSION of check_position_constraints") + max_err = typemin(float_type(sys)) * unit(eltype(eltype(sys.coords))) + deltas = CUDA.fill(max_err, length(ca.clusters)) + + CUDA.@sync blocking = true for (i, cluster) in enumerate(ca.clusters) + @cuda check_position_kernel(sys.coords, cluster, sys.boundary, deltas, i) + end + max_err = CUDA.maximum(deltas) + return max_err < ca.dist_tolerance +end + +function check_position_kernel(coords, cluster, boundary, deltas, i) + @inbounds for constraint in cluster.constraints + dr = vector(coords[constraint.i], coords[constraint.j], boundary) + err = abs(norm(dr) - constraint.dist) + + if deltas[i] < err + deltas[i] = err + end + end +end """ check_velocity_constraints(sys, constraints) @@ -161,3 +207,28 @@ function check_velocity_constraints(sys::System, ca::SHAKE_RATTLE) end return max_err < ca.vel_tolerance end + +# GPU version of check_velocity_constraints +function check_velocity_constraints(sys::System{D,true}, ca::SHAKE_RATTLE) where {D} + max_err = typemin(float_type(sys)) * unit(eltype(eltype(sys.velocities))) * unit(eltype(eltype(sys.coords))) + deltas = CUDA.fill(max_err, length(ca.clusters)) + + CUDA.@sync blocking = true for (i, cluster) in enumerate(ca.clusters) + @cuda check_velocity_kernel(sys.coords, sys.velocities, cluster, sys.boundary, deltas, i) + end + max_err = CUDA.maximum(deltas) + return max_err < ca.vel_tolerance +end + +function check_velocity_kernel(coords, velocities, cluster, boundary, deltas, i) + @inbounds for constraint in cluster.constraints + dr = vector(coords[constraint.i], coords[constraint.j], boundary) + v_diff = velocities[constraint.j] .- velocities[constraint.i] + err = abs(dot(dr, v_diff)) + if deltas[i] < err + deltas[i] = err + end + end +end + + From 759c1920214d2576dbc3d1a91e486dbf9b1aa4ce Mon Sep 17 00:00:00 2001 From: Omar Elrefaei Date: Wed, 17 Apr 2024 16:39:10 -0400 Subject: [PATCH 2/2] add proper implementation for constraints on GPU the core algorithms remains the same the comments about "# Cannot parallelize this" was deemed to be about parallelizing the computation of constraints inside a cluster not different clusters --- src/constraints/constraints.jl | 2 +- src/constraints/shake.jl | 101 +++++++++++++++++++++------------ src/types.jl | 3 - 3 files changed, 65 insertions(+), 41 deletions(-) diff --git a/src/constraints/constraints.jl b/src/constraints/constraints.jl index 95014e43..a872adb2 100644 --- a/src/constraints/constraints.jl +++ b/src/constraints/constraints.jl @@ -46,7 +46,7 @@ Base.length(cc::ConstraintCluster) = length(cc.constraints) Disables neighbor list interactions between atoms in a constraint. """ function disable_constrained_interactions!(neighbor_finder, constraint_clusters) - for cluster in constraint_clusters + CUDA.@allowscalar for cluster in constraint_clusters for constraint in cluster.constraints neighbor_finder.eligible[constraint.i, constraint.j] = false neighbor_finder.eligible[constraint.j, constraint.i] = false diff --git a/src/constraints/shake.jl b/src/constraints/shake.jl index 6570c208..47b58705 100644 --- a/src/constraints/shake.jl +++ b/src/constraints/shake.jl @@ -37,33 +37,42 @@ function SHAKE_RATTLE(constraints, n_atoms::Integer, dist_tolerance, vel_toleran clusters, dist_tolerance, vel_tolerance) end -function apply_position_constraints!(sys, ca::SHAKE_RATTLE, coord_storage; - n_threads::Integer=Threads.nthreads()) +function apply_position_constraints!(sys::System{D,false}, ca::SHAKE_RATTLE, coord_storage; + n_threads::Integer=Threads.nthreads()) where D # SHAKE updates converged = false - while !converged - # syncing here is an incremental improvement for gpu and inc reg for cpu - for cluster in ca.clusters # Cannot parallelize this - if coord_storage isa CuArray - # @show "GPU VERSION of apply_position_constraints_kernel!" - # blocking=true saves a fair # of allocations, but not time - CUDA.@sync blocking = true @cuda apply_position_constraints_kernel!(sys.coords, sys.atoms, sys.boundary, cluster, coord_storage, ca.dist_tolerance) - # @cuda apply_position_constraints_kernel!(sys.coords, sys.atoms, sys.boundary, cluster, coord_storage, ca.dist_tolerance) - else - # @show "CPU VERSION of apply_position_constraints_kernel!" - apply_position_constraints_kernel!(sys.coords, sys.atoms, sys.boundary, cluster, coord_storage, ca.dist_tolerance) - - end + for cluster in ca.clusters + apply_position_constraints_kernel!(sys.coords, sys.atoms, sys.boundary, cluster, coord_storage, ca.dist_tolerance) end + converged = check_position_constraints(sys, ca) + end + return sys +end +function apply_position_constraints!(sys::System{D,true}, ca::SHAKE_RATTLE, coord_storage; + n_threads::Integer=Threads.nthreads()) where {D} + # SHAKE updates + converged = false + clusters = cu(ca.clusters) + n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(ca.clusters)) + while !converged + CUDA.@sync @cuda threads = n_threads_gpu blocks = n_blocks apply_position_constraints_gpu!( + sys.coords, sys.atoms, sys.boundary, clusters, coord_storage, ca.dist_tolerance) converged = check_position_constraints(sys, ca) end return sys end +function apply_position_constraints_gpu!(coords, atoms, boundary, clusters, coord_storage, dist_tolerance) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + (i > length(clusters)) && return nothing + cluster = clusters[i] + apply_position_constraints_kernel!(coords, atoms, boundary, cluster, coord_storage, dist_tolerance) +end + function apply_position_constraints_kernel!(coords, atoms, boundary, cluster, coord_storage, dist_tolerance) - @inbounds for constraint in cluster.constraints + @inbounds for constraint in cluster.constraints # Cannot parallelize this k1, k2 = constraint.i, constraint.j # Vector between the atoms after unconstrained update (s) @@ -105,11 +114,13 @@ function apply_velocity_constraints!(sys, ca::SHAKE_RATTLE; n_threads::Integer=T converged = false while !converged - for cluster in ca.clusters # Cannot parallelize this - if sys.coords isa CuArray - CUDA.@sync blocking = true @cuda apply_velocity_constraints_kernel!(sys.coords, sys.atoms, sys.velocities, sys.boundary, cluster, ca.vel_tolerance) - # @cuda apply_velocity_constraints_kernel!(sys.coords, sys.atoms, sys.velocities, sys.boundary, cluster, ca.vel_tolerance) - else + if is_on_gpu(sys) + clusters = cu(ca.clusters) + n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(ca.clusters)) + CUDA.@sync @cuda threads = n_threads_gpu blocks = n_blocks apply_velocity_constraints_gpu!( + sys.coords, sys.atoms, sys.velocities, sys.boundary, clusters, ca.vel_tolerance) + else + for cluster in ca.clusters apply_velocity_constraints_kernel!(sys.coords, sys.atoms, sys.velocities, sys.boundary, cluster, ca.vel_tolerance) end end @@ -119,8 +130,15 @@ function apply_velocity_constraints!(sys, ca::SHAKE_RATTLE; n_threads::Integer=T return sys end +function apply_velocity_constraints_gpu!(coords, atoms, velocities, boundary, clusters, vel_tolerance) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + (i > length(clusters)) && return nothing + cluster = clusters[i] + apply_velocity_constraints_kernel!(coords, atoms, velocities, boundary, cluster, vel_tolerance) +end + function apply_velocity_constraints_kernel!(coords, atoms, velocities, boundary, cluster, vel_tolerance) - @inbounds for constraint in cluster.constraints + @inbounds for constraint in cluster.constraints # Cannot parallelize this k1, k2 = constraint.i, constraint.j inv_m1 = inv(mass(atoms[k1])) @@ -165,21 +183,25 @@ function check_position_constraints(sys::System{D,false}, ca::SHAKE_RATTLE) wher return max_err < ca.dist_tolerance end -# GPU version of check_position_constraints function check_position_constraints(sys::System{D,true}, ca::SHAKE_RATTLE) where {D} - # println("GPU VERSION of check_position_constraints") max_err = typemin(float_type(sys)) * unit(eltype(eltype(sys.coords))) deltas = CUDA.fill(max_err, length(ca.clusters)) + clusters = cu(ca.clusters) - CUDA.@sync blocking = true for (i, cluster) in enumerate(ca.clusters) - @cuda check_position_kernel(sys.coords, cluster, sys.boundary, deltas, i) - end - max_err = CUDA.maximum(deltas) + n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(ca.clusters)) + CUDA.@sync @cuda threads=n_threads_gpu blocks= + n_blocks check_position_kernel!(sys.coords, clusters, sys.boundary, deltas) + + max_err = maximum(deltas) return max_err < ca.dist_tolerance end -function check_position_kernel(coords, cluster, boundary, deltas, i) - @inbounds for constraint in cluster.constraints +function check_position_kernel!(coords, clusters, boundary, deltas) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + (i > length(clusters)) && return nothing + cluster = clusters[i] + + for constraint in cluster.constraints dr = vector(coords[constraint.i], coords[constraint.j], boundary) err = abs(norm(dr) - constraint.dist) @@ -208,20 +230,25 @@ function check_velocity_constraints(sys::System, ca::SHAKE_RATTLE) return max_err < ca.vel_tolerance end -# GPU version of check_velocity_constraints function check_velocity_constraints(sys::System{D,true}, ca::SHAKE_RATTLE) where {D} max_err = typemin(float_type(sys)) * unit(eltype(eltype(sys.velocities))) * unit(eltype(eltype(sys.coords))) deltas = CUDA.fill(max_err, length(ca.clusters)) + clusters = cu(ca.clusters) - CUDA.@sync blocking = true for (i, cluster) in enumerate(ca.clusters) - @cuda check_velocity_kernel(sys.coords, sys.velocities, cluster, sys.boundary, deltas, i) - end - max_err = CUDA.maximum(deltas) + n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(ca.clusters)) + CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks check_velocity_kernel!( + sys.coords, sys.velocities, clusters, sys.boundary, deltas) + + max_err = maximum(deltas) return max_err < ca.vel_tolerance end -function check_velocity_kernel(coords, velocities, cluster, boundary, deltas, i) - @inbounds for constraint in cluster.constraints +function check_velocity_kernel!(coords, velocities, clusters, boundary, deltas) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + (i > length(clusters)) && return nothing + cluster = clusters[i] + + for constraint in cluster.constraints dr = vector(coords[constraint.i], coords[constraint.j], boundary) v_diff = velocities[constraint.j] .- velocities[constraint.i] err = abs(dot(dr, v_diff)) diff --git a/src/types.jl b/src/types.jl index 77fd8517..fe3acae7 100644 --- a/src/types.jl +++ b/src/types.jl @@ -559,9 +559,6 @@ function System(; if isa(vels, CuArray) && !isa(atoms, CuArray) throw(ArgumentError("the velocities are on the GPU but the atoms are not")) end - if isa(atoms, CuArray) && length(constraints) > 0 - @warn "Constraints are not currently compatible with simulation on the GPU" - end atom_masses = mass.(atoms) M = typeof(atom_masses)