diff --git a/README.md b/README.md
index 3a02847..23d12b1 100644
--- a/README.md
+++ b/README.md
@@ -29,9 +29,8 @@ using SIMD
 function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T}
     @assert length(ys) == length(xs)
     @assert length(xs) % N == 0
-    lane = VecRange{N}(0)
-    @inbounds for i in 1:N:length(xs)
-        xs[lane + i] += ys[lane + i]
+    @inbounds for lane in LoopVecRange{N}(xs)
+        xs[lane] += ys[lane]
     end
 end
 ```
diff --git a/src/SIMD.jl b/src/SIMD.jl
index 12f9f97..ad02888 100644
--- a/src/SIMD.jl
+++ b/src/SIMD.jl
@@ -4,7 +4,7 @@ using Base: @propagate_inbounds
 
 export Vec, vload, vloada, vloadnt, vloadx, vstore, vstorea, vstorent, vstorec,
        vgather, vgathera, vscatter, vscattera, shufflevector, vifelse, valloc,
-       VecRange
+       VecRange, LoopVecRange
 
 const VE = Base.VecElement
 const LVec{N, T} = NTuple{N, VE{T}}
diff --git a/src/arrayops.jl b/src/arrayops.jl
index f696543..e43f84d 100644
--- a/src/arrayops.jl
+++ b/src/arrayops.jl
@@ -228,6 +228,75 @@ Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::VecRange) =
 Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::Vec) =
     all(first(inds) <= idx) && all(idx <= last(inds))
 
+export LoopVecRange
+
+"""
+    LoopVecRange{N}(start::Int, stop::Int; unsafe=false)
+
+Analogous to `UnitRange` but for iterating over a vector with SIMD vectors of width `N`.
+Each iteration yields a `VecRange{abs(N)}`; a negative `N` walks from `start` down to `stop`.
+
+Throws an `ArgumentError` if `N == 0`, or if the number of indices from `start` to `stop` is
+not a multiple of `N` (an empty range such as `1:0` is always accepted). Pass `unsafe=true` to
+skip that check and mask the partial vector yourself; as a `@boundscheck`, `@inbounds` may elide it.
+
+# Examples
+```julia
+function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T}
+    for lane in LoopVecRange{N}(xs)
+        xs[lane] += ys[lane]
+    end
+end
+```
+"""
+struct LoopVecRange{N} <: AbstractRange{Int}
+    start::Int
+    stop::Int
+
+    Base.@propagate_inbounds function LoopVecRange{N}(start::Int, stop::Int; unsafe=false) where N
+        N == 0 && throw(ArgumentError("Step cannot be zero"))
+
+        # The canonical empty range (e.g. the indices `1:0` of an empty vector) contains no
+        # vectors at all, so there is no width requirement to enforce for it.
+        if !unsafe && stop != start - sign(N)
+            @boundscheck (abs(stop - start) + 1) % N != 0 && throw(ArgumentError("Length of range, has to be a multiple of the width"))
+        end
+
+        return new{N}(start, stop)
+    end
+end
+
+Base.@propagate_inbounds LoopVecRange{N}(r::Base.OneTo; unsafe=false) where N = LoopVecRange{N}(1, r.stop, unsafe=unsafe)
+Base.@propagate_inbounds LoopVecRange{N}(r::UnitRange; unsafe=false) where N = LoopVecRange{N}(r.start, r.stop, unsafe=unsafe)
+Base.@propagate_inbounds LoopVecRange{N}(x::AbstractVector; unsafe=false) where N = LoopVecRange{N}(eachindex(x), unsafe=unsafe)
+
+Base.isempty(r::LoopVecRange{N}) where N = (r.start != r.stop) & ((N > zero(N)) != (r.stop > r.start))
+
+Base.step(r::LoopVecRange{N}) where N = N
+Base.has_offset_axes(::LoopVecRange) = false
+
+# `first`/`last` carry the lowest index of the first/last vector visited; for N < 0 the walk is downwards.
+Base.first(r::LoopVecRange{N}) where N = VecRange{abs(N)}(r.start + (N < 0 ? N + 1 : 0))
+Base.last(r::LoopVecRange{N}) where N = VecRange{abs(N)}(r.stop + (N > 0 ? -N + 1 : 0))
+
+Base.iterate(r::LoopVecRange) = isempty(r) ? nothing : (first(r), first(r))
+
+function Base.iterate(r::LoopVecRange{N}, i::VecRange) where N
+    @inline
+    if (N > zero(N) && i.i >= last(r).i) || (N < zero(N) && i.i <= last(r).i) # greater than or equal prevents infinite loop if length of range is not a multiple of width
+        return nothing
+    end
+    next = i + step(r)
+    (next, next)
+end
+
+# Number of vectors produced by iteration: magnitudes make reverse ranges (N < 0) count
+# correctly, and rounding up counts the trailing partial vector of an `unsafe` range.
+Base.length(r::LoopVecRange{N}) where N = isempty(r) ? 0 : cld(abs(r.stop - r.start) + 1, abs(N))
+Base.eltype(::Type{LoopVecRange{N}}) where N = VecRange{abs(N)} # same type `first`/`last` return
+
+Base.show(io::IO, r::LoopVecRange) = print(io, repr(first(r)), ':', repr(last(r)))
+
 @inline _checkarity(::AbstractArray{<:Any,N}, ::Vararg{Any,N}) where {N} =
     nothing
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 9981e06..bbaec35 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -870,6 +870,219 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
 
     end
 
+    @testset "LoopVecRange Real-world examples" begin
+
+        function vadd!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
+                       ::Type{Vec{N,T}}) where {N,T}
+            @assert length(ys) == length(xs)
+            @assert length(xs) % N == 0
+            @inbounds for lane in LoopVecRange{N}(xs)
+                xs[lane] += ys[lane]
+            end
+        end
+
+        let xs = valloc(Float64, L4, 4*L4) do i i end,
+            ys = valloc(Float64, L4, 4*L4) do i 1 end
+            vadd!(xs, ys, V4F64)
+            @test xs == Float64[i+1 for i in 1:(4*L4)]
+            # @code_native vadd!(xs, ys, V4F64)
+
+            ir = llvm_ir(vadd!, (xs, ys, V4F64))
+            @test occursin(r"( load <4 x double>.*){2}"s, ir)
+            @test occursin(" store <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+        end
+
+        function vsum(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
+            @assert length(xs) % N == 0
+            sv = Vec{N,T}(0)
+            @inbounds for lane in LoopVecRange{N}(xs)
+                sv += xs[lane]
+            end
+            sum(sv)
+        end
+
+        let xs = valloc(Float64, L4, 4*L4) do i i end
+            s = vsum(xs, V4F64)
+            @test s === (x->(x^2+x)/2)(Float64(4*L4))
+            # @code_native vsum(xs, V4F64)
+
+            ir = llvm_ir(vsum, (xs, V4F64))
+            @test occursin(" load <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+            # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        end
+
+
+        function vadd_masked!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
+                              ::Type{Vec{N,T}}) where {N, T}
+            @assert length(ys) == length(xs)
+            limit = length(xs) - (N-1)
+            vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
+            @inbounds for lane in LoopVecRange{N}(xs, unsafe=true)
+                if lane.i <= limit
+                    xs[lane] += ys[lane]
+                else
+                    mask = Vec{N,Int}(lane.i) <= vlimit
+                    xs[lane, mask] = xs[lane, mask] + ys[lane, mask]
+                end
+            end
+        end
+
+        let xs = valloc(Float64, 4, 13) do i i end,
+            ys = valloc(Float64, 4, 13) do i 1 end
+            vadd_masked!(xs, ys, V4F64)
+            @test xs == Float64[i+1 for i in 1:13]
+            # @code_native vadd!(xs, ys, V4F64)
+
+            ir = llvm_ir(vadd_masked!, (xs, ys, V4F64))
+            @test occursin(r"(masked.load.v4f64.*){2}"s, ir)
+            @test occursin("masked.store.v4f64", ir)
+            @test occursin(" store <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+        end
+
+        function vsum_masked(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
+            vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
+            sv = Vec{N,T}(0)
+            @inbounds for lane in LoopVecRange{N}(xs, unsafe=true)
+                mask = Vec{N,Int}(lane.i) <= vlimit
+                sv += xs[lane, mask]
+            end
+            sum(sv)
+        end
+
+        let xs = valloc(Float64, 4, 13) do i i end
+            s = vsum_masked(xs, V4F64)
+            # @code_llvm vsum(xs, V4F64)
+            # @code_native vsum(xs, V4F64)
+            @test s === sum(xs)
+
+            ir = llvm_ir(vsum_masked, (xs, V4F64))
+            @test occursin("masked.load.v4f64", ir)
+            @test occursin(" fadd <4 x double>", ir)
+            # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        end
+
+    end
+
+    @testset "LoopVecRange" begin
+        @test_throws ArgumentError LoopVecRange{0}(1, 8)
+        @test_throws ArgumentError LoopVecRange{2}(1, 3)
+        @test_throws ArgumentError LoopVecRange{4}(3, 3)
+
+        @test_throws ArgumentError LoopVecRange{0}(8, 1)
+        @test_throws ArgumentError LoopVecRange{2}(3, 1)
+
+        # `length` and `eltype` must agree with what iteration actually produces
+        @test length(LoopVecRange{4}(1, 8)) == 2
+        @test length(LoopVecRange{-4}(8, 1)) == 2
+        @test length(LoopVecRange{4}(1, 13, unsafe=true)) == 4
+        @test eltype(LoopVecRange{-4}(8, 1)) == VecRange{4}
+
+        # the indices of an empty vector form a valid, empty range
+        @test isempty(LoopVecRange{4}(Float64[]))
+        @test length(LoopVecRange{4}(Float64[])) == 0
+    end
+
+    @testset "Reverse LoopVecRange Real-world examples" begin
+
+        function vadd!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
+                       ::Type{Vec{N,T}}) where {N,T}
+            @assert length(ys) == length(xs)
+            @assert length(xs) % N == 0
+            @inbounds for lane in LoopVecRange{-N}(length(xs), 1)
+                xs[lane] += ys[lane]
+            end
+        end
+
+        let xs = valloc(Float64, L4, 4*L4) do i i end,
+            ys = valloc(Float64, L4, 4*L4) do i 1 end
+            vadd!(xs, ys, V4F64)
+            @test xs == Float64[i+1 for i in 1:(4*L4)]
+            # @code_native vadd!(xs, ys, V4F64)
+
+            ir = llvm_ir(vadd!, (xs, ys, V4F64))
+            @test occursin(r"( load <4 x double>.*){2}"s, ir)
+            @test occursin(" store <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+        end
+
+        function vsum(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
+            @assert length(xs) % N == 0
+            sv = Vec{N,T}(0)
+            @inbounds for lane in LoopVecRange{-N}(length(xs), 1)
+                sv += xs[lane]
+            end
+            sum(sv)
+        end
+
+        let xs = valloc(Float64, L4, 4*L4) do i i end
+            s = vsum(xs, V4F64)
+            @test s === (x->(x^2+x)/2)(Float64(4*L4))
+            # @code_native vsum(xs, V4F64)
+
+            ir = llvm_ir(vsum, (xs, V4F64))
+            @test occursin(" load <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+            # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        end
+
+
+        function vadd_masked!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
+                              ::Type{Vec{N,T}}) where {N, T}
+            @assert length(ys) == length(xs)
+            limit = length(xs) - (N-1)
+            vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
+            # round the start up to a multiple of N so that the last vector begins at index 1
+            @inbounds for lane in LoopVecRange{-N}(N * cld(length(xs), N), 1, unsafe=true)
+                if lane.i <= limit
+                    xs[lane] += ys[lane]
+                else
+                    mask = Vec{N,Int}(lane.i) <= vlimit
+                    xs[lane, mask] = xs[lane, mask] + ys[lane, mask]
+                end
+            end
+        end
+
+        let xs = valloc(Float64, 4, 13) do i i end,
+            ys = valloc(Float64, 4, 13) do i 1 end
+            vadd_masked!(xs, ys, V4F64)
+            @test xs == Float64[i+1 for i in 1:13]
+            # @code_native vadd!(xs, ys, V4F64)
+
+            ir = llvm_ir(vadd_masked!, (xs, ys, V4F64))
+            @test occursin(r"(masked.load.v4f64.*){2}"s, ir)
+            @test occursin("masked.store.v4f64", ir)
+            @test occursin(" store <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+        end
+
+        function vsum_masked(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
+            vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
+            sv = Vec{N,T}(0)
+            # round the start up to a multiple of N so that the last vector begins at index 1
+            @inbounds for lane in LoopVecRange{-N}(N * cld(length(xs), N), 1, unsafe=true)
+                mask = Vec{N,Int}(lane.i) <= vlimit
+                sv += xs[lane, mask]
+            end
+            sum(sv)
+        end
+
+        let xs = valloc(Float64, 4, 13) do i i end
+            s = vsum_masked(xs, V4F64)
+            # @code_llvm vsum(xs, V4F64)
+            # @code_native vsum(xs, V4F64)
+            @test s === sum(xs)
+
+            ir = llvm_ir(vsum_masked, (xs, V4F64))
+            @test occursin("masked.load.v4f64", ir)
+            @test occursin(" fadd <4 x double>", ir)
+            # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        end
+
+    end
+
     @testset "Vector shuffles" begin
 
         for T in (Int8,UInt8,Int16,UInt16,Int32,UInt32,Int64,UInt64,Float32,Float64)