-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy patharrayops.jl
328 lines (284 loc) · 12.8 KB
/
arrayops.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
using Base: Slice, ScalarIndex
"""
    ContiguousSubArray{T,N,P,I,L}

Like `Base.FastContiguousSubArray` but without requirement for linear
indexing (i.e., type parameter `L` can be `false`).

# Examples
```
julia> A = view(ones(5, 5), :, [1,3]);

julia> A isa Base.FastContiguousSubArray
false

julia> A isa SIMD.ContiguousSubArray
true
```
"""
ContiguousSubArray{T,N,P,
                   I<:Union{Tuple{Union{Slice, AbstractUnitRange}, Vararg{Any}},
                            Tuple{Vararg{ScalarIndex}}},
                   L} = SubArray{T,N,P,I,L}
"""
    ContiguousArray{T,N}

Array types with contiguous first dimension: dense arrays, contiguous
`SubArray`s (see `ContiguousSubArray`), and `Base.ReinterpretArray`s
wrapping either.
"""
ContiguousArray{T,N} = Union{DenseArray{T,N}, ContiguousSubArray{T,N},
                             Base.ReinterpretArray{T,N,T2,A} where {A <: Union{DenseArray{T2,N},
                                                                               ContiguousSubArray{T2,N}}} where {T2}}
"""
    FastContiguousArray{T,N}

The type of arrays on which `pointer(A, i)` works: dense arrays,
fast-contiguous `SubArray`s, and `Base.ReinterpretArray`s wrapping either.
"""
FastContiguousArray{T,N} = Union{DenseArray{T,N}, Base.FastContiguousSubArray{T,N},
                                 Base.ReinterpretArray{T,N,T2,A} where {A <: Union{DenseArray{T2,N},
                                                                                   Base.FastContiguousSubArray{T2,N}}} where {T2}}
# https://github.com/eschnett/SIMD.jl/pull/40#discussion_r254131184
# https://github.com/JuliaArrays/MappedArrays.jl/pull/24#issuecomment-460568978
# vload
"""
    vload(Vec{N,T}, ptr::Ptr{T}[, mask[, Val(Aligned)[, Val(Nontemporal)]]])

Load a `Vec{N,T}` from `ptr`. When `mask` is a `Vec{N,Bool}`, only the
active lanes are read; `Aligned`/`Nontemporal` are compile-time flags
forwarded to the load intrinsics.
"""
@propagate_inbounds function vload(::Type{Vec{N, T}}, ptr::Ptr{T}, mask::Union{Nothing, Vec{N, Bool}}=nothing,
                                   ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal}
    # `mask` is either `nothing` (full-width load) or a lane mask.
    raw = if mask isa Vec{N, Bool}
        Intrinsics.maskedload(ptr, mask.data, Val(Aligned), Val(Nontemporal))
    else
        Intrinsics.load(Intrinsics.LVec{N, T}, ptr, Val(Aligned), Val(Nontemporal))
    end
    return Vec(raw)
end
"""
    vload(Vec{N,T}, a, i[, mask[, Val(Aligned)[, Val(Nontemporal)]]])

Load a `Vec{N,T}` from the contiguous array `a` starting at (1-based)
index `i`. The whole window `i:i+N-1` is bounds-checked unless
`@inbounds` propagates from the caller.
"""
@propagate_inbounds function vload(::Type{Vec{N, T}}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing,
                                   ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal}
    @boundscheck checkbounds(a, i:(i+N-1))
    # Keep `a` rooted while the raw pointer is in use.
    return GC.@preserve a vload(Vec{N, T}, pointer(a, i), mask, Val(Aligned), Val(Nontemporal))
end
# Convenience wrappers: `vloada` performs an aligned load, `vloadnt` an
# aligned non-temporal load; each comes in a pointer flavor and an
# array-plus-index flavor.
@propagate_inbounds vloada(::Type{Vec{N, T}}, ptr::Ptr{T}, mask=nothing) where {N, T} = vload(Vec{N, T}, ptr, mask, Val(true))
@propagate_inbounds vloadnt(::Type{Vec{N, T}}, ptr::Ptr{T}, mask=nothing) where {N, T} = vload(Vec{N, T}, ptr, mask, Val(true), Val(true))
@propagate_inbounds vloada(::Type{Vec{N, T}}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing) where {N, T} = vload(Vec{N, T}, a, i, mask, Val(true))
@propagate_inbounds vloadnt(::Type{Vec{N, T}}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing) where {N, T} = vload(Vec{N, T}, a, i, mask, Val(true), Val(true))
# vstore
"""
    vstore(x::Vec{N,T}, ptr::Ptr{T}[, mask[, Val(Aligned)[, Val(Nontemporal)]]])

Store `x` to `ptr`. When `mask` is a `Vec{N,Bool}`, only the active
lanes are written; `Aligned`/`Nontemporal` are compile-time flags
forwarded to the store intrinsics.
"""
@propagate_inbounds function vstore(x::Vec{N, T}, ptr::Ptr{T}, mask::Union{Nothing, Vec{N, Bool}}=nothing,
                                    ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal}
    # `mask` is either `nothing` (full-width store) or a lane mask.
    if mask isa Vec{N, Bool}
        Intrinsics.maskedstore(x.data, ptr, mask.data, Val(Aligned), Val(Nontemporal))
    else
        Intrinsics.store(x.data, ptr, Val(Aligned), Val(Nontemporal))
    end
end
"""
    vstore(x::Vec{N,T}, a, i[, mask[, Val(Aligned)[, Val(Nontemporal)]]])

Store `x` into the contiguous array `a` starting at (1-based) index `i`
and return `a`. The whole window `i:i+N-1` is bounds-checked unless
`@inbounds` propagates from the caller.
"""
@propagate_inbounds function vstore(x::Vec{N, T}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing,
                                    ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal}
    @boundscheck checkbounds(a, i:(i+N-1))
    # Keep `a` rooted while the raw pointer is in use.
    GC.@preserve a vstore(x, pointer(a, i), mask, Val(Aligned), Val(Nontemporal))
    return a
end
# Convenience wrappers: `vstorea` performs an aligned store, `vstorent`
# an aligned non-temporal store; each comes in a pointer flavor and an
# array-plus-index flavor.
@propagate_inbounds vstorea(x::Vec, ptr::Ptr, mask=nothing) = vstore(x, ptr, mask, Val(true))
@propagate_inbounds vstorent(x::Vec, ptr::Ptr, mask=nothing) = vstore(x, ptr, mask, Val(true), Val(true))
@propagate_inbounds vstorea(x::Vec, a, i, mask=nothing) = vstore(x, a, i, mask, Val(true))
@propagate_inbounds vstorent(x::Vec, a, i, mask=nothing) = vstore(x, a, i, mask, Val(true), Val(true))
# Masked expand-load (wraps `Intrinsics.maskedexpandload`): presumably
# reads consecutive elements starting at `ptr` and expands them into the
# lanes where `mask` is true — NOTE(review): confirm exact lane
# semantics against the intrinsic's documentation.
@inline vloadx(ptr::Ptr, mask::Vec{<:Any, Bool}) =
    Vec(Intrinsics.maskedexpandload(ptr, mask.data))
"""
    vloadx(a, i, mask::Vec{N,Bool})

Masked expand-load from the contiguous array `a` at (1-based) index `i`.
The full window `i:i+N-1` is bounds-checked unless `@inbounds`
propagates from the caller.
"""
@propagate_inbounds function vloadx(a::FastContiguousArray{T,1},
                                    i::Integer, mask::Vec{N, Bool}) where {N, T}
    @boundscheck checkbounds(a, i:i + N - 1)
    # Keep `a` rooted while the raw pointer is in use.
    return GC.@preserve a vloadx(pointer(a, i), mask)
end
# Masked compress-store (wraps `Intrinsics.maskedcompressstore`):
# presumably writes the active lanes of `x` to consecutive memory at
# `ptr` — NOTE(review): confirm exact lane semantics against the
# intrinsic's documentation.
@inline vstorec(x::Vec{N, T}, ptr::Ptr{T}, mask::Vec{N, Bool}) where {N, T} =
    Intrinsics.maskedcompressstore(x.data, ptr, mask.data)
"""
    vstorec(x::Vec{N,T}, a, i, mask::Vec{N,Bool})

Masked compress-store of `x` into the contiguous array `a` at (1-based)
index `i`; returns `a`. The full window `i:i+N-1` is bounds-checked
unless `@inbounds` propagates from the caller.
"""
@propagate_inbounds function vstorec(x::Vec{N, T}, a::FastContiguousArray{T,1},
                                     i::Integer, mask::Vec{N, Bool}) where {N, T}
    @boundscheck checkbounds(a, i:i + N - 1)
    # Keep `a` rooted while the raw pointer is in use.
    GC.@preserve a vstorec(x, pointer(a, i), mask)
    return a
end
"""
    valloc(::Type{T}, N::Int, sz::Int) -> AbstractVector{T}

Allocate a vector of `sz` elements of type `T` whose first element is
aligned to `N * sizeof(T)` bytes, so SIMD vectors of width `N` can be
loaded and stored aligned. The underlying buffer is padded so that
accesses past the last element, up to the next full vector width, stay
inside allocated memory.

Throws `ArgumentError` if `N <= 0` or `sz < 0`.
"""
function valloc(::Type{T}, N::Int, sz::Int) where T
    # Input validation: these were `@assert`s, but asserts may be
    # disabled at higher optimization levels; raise a real error instead.
    N > 0 || throw(ArgumentError("vector width N must be positive, got $N"))
    sz >= 0 || throw(ArgumentError("array size sz must be non-negative, got $sz"))
    # We use padding to align the address of the first element, and
    # also to ensure that we can access past the last element up to
    # the next full vector width.
    padding = N-1 + mod(-sz, N)
    mem = Vector{T}(undef, sz + padding)
    addr = Int(pointer(mem))
    off = mod(-addr, N * sizeof(T))
    @assert mod(off, sizeof(T)) == 0   # internal invariant: offset is a whole number of elements
    off = fld(off, sizeof(T))
    @assert 0 <= off <= padding        # internal invariant: shifted window fits in the buffer
    res = view(mem, off+1 : off+sz)
    addr2 = Int(pointer(res))
    @assert mod(addr2, N * sizeof(T)) == 0
    return res
end

"""
    valloc(f, ::Type{T}, N::Int, sz::Int) -> AbstractVector{T}

Like `valloc(T, N, sz)`, but additionally initialize element `i` to
`f(i)` for `i in 1:sz`.
"""
function valloc(f, ::Type{T}, N::Int, sz::Int) where T
    mem = valloc(T, N, sz)
    @inbounds for i in 1:sz
        mem[i] = f(i)
    end
    return mem
end
# Turn the 1-based per-lane indices `idx` into a vector of raw element
# addresses into `a` (caller is responsible for rooting `a`).
@inline function _get_vec_pointers(a, idx::Vec{N, Int}) where {N}
    return pointer(a) + (idx - 1) * sizeof(eltype(a))
end
# Have to be careful with optional arguments and @boundscheck,
# see https://github.com/JuliaLang/julia/issues/30411,
# therefore use @propagate_inbounds
# Gather: load one element per lane from the addresses in `ptrs`;
# lanes whose `mask` bit is false are not read.
# NOTE(review): the `Aligned` flag is accepted but not forwarded to
# `Intrinsics.maskedgather` — confirm whether that is intentional.
@inline vgather(ptrs::Vec{N,Ptr{T}},
                mask::Vec{N,Bool}=one(Vec{N,Bool}),
                ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned} =
    return Vec(Intrinsics.maskedgather(ptrs.data, mask.data))
# A `nothing` mask means "all lanes": substitute an all-true mask.
@inline vgather(ptrs::Vec{<:Any,<:Ptr{<:ScalarTypes}}, ::Nothing, aligned::Val = Val(false)) =
    vgather(ptrs, one(Vec{length(ptrs),Bool}), aligned)
# Gather from the contiguous array `a` at the 1-based per-lane indices
# `idx`; inactive `mask` lanes are not read.
@propagate_inbounds function vgather(a::FastContiguousArray{T,1}, idx::Vec{N, Int},
                                     mask::Vec{N,Bool}=one(Vec{N,Bool}),
                                     ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned}
    # Per-lane bounds check: gathered indices are arbitrary, so a single
    # range check is not applicable. The inner `@inbounds` only skips the
    # check on the lane extraction `idx[i]`, not on `a`.
    @boundscheck for i in 1:N
        checkbounds(a, @inbounds idx[i])
    end
    # Keep `a` rooted while the raw pointers are in use.
    GC.@preserve a begin
        ptrs = _get_vec_pointers(a, idx)
        return vgather(ptrs, mask, Val(Aligned))
    end
end
# Aligned gather; the two-argument form enables all lanes.
@propagate_inbounds vgathera(a, idx, mask) = vgather(a, idx, mask, Val(true))
@propagate_inbounds vgathera(a, idx::Vec{N}) where {N} = vgather(a, idx, one(Vec{N,Bool}), Val(true))
# `a[idx::Vec{N,Int}]` performs a gather of `N` elements.
@propagate_inbounds Base.getindex(a::FastContiguousArray{T,1}, idx::Vec{N,Int}) where {N,T} =
    vgather(a, idx)
# Scatter: store one lane of `x` per address in `ptrs`; lanes whose
# `mask` bit is false are not written.
# NOTE(review): the `Aligned` flag is accepted but not forwarded to
# `Intrinsics.maskedscatter` — confirm whether that is intentional.
@propagate_inbounds vscatter(x::Vec{N,T}, ptrs::Vec{N,Ptr{T}},
                             mask::Vec{N,Bool}, ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned} =
    Intrinsics.maskedscatter(x.data, ptrs.data, mask.data)
# A `nothing` mask means "all lanes": substitute an all-true mask.
@inline vscatter(x::Vec{N,T}, ptrs::Vec{N,Ptr{T}},
                 ::Nothing, aligned::Val=Val(false)) where {N, T<:ScalarTypes} =
    vscatter(x, ptrs, one(Vec{N, Bool}), aligned)
# Scatter `x` into the contiguous array `a` at the 1-based per-lane
# indices `idx`; inactive `mask` lanes are not written.
@propagate_inbounds function vscatter(x::Vec{N,T}, a::FastContiguousArray{T,1}, idx::Vec{N, Int},
                                      mask::Vec{N,Bool}=one(Vec{N, Bool}),
                                      ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned}
    # Per-lane bounds check: scattered indices need not form a range.
    @boundscheck for i in 1:N
        checkbounds(a, @inbounds idx[i])
    end
    # Keep `a` rooted while the raw pointers are in use.
    GC.@preserve a begin
        ptrs = _get_vec_pointers(a, idx)
        vscatter(x, ptrs, mask, Val(Aligned))
    end
    return
end
# Aligned scatter; the three-argument form enables all lanes.
@propagate_inbounds vscattera(x, a, idx, mask) = vscatter(x, a, idx, mask, Val(true))
@propagate_inbounds vscattera(x, a, idx::Vec{N}) where {N} = vscatter(x, a, idx, one(Vec{N,Bool}), Val(true))
# `a[idx::Vec{N,Int}] = v` performs a scatter of the lanes of `v`.
@propagate_inbounds Base.setindex!(a::FastContiguousArray{T,1}, v::Vec{N,T}, idx::Vec{N,Int}) where {N, T} =
    vscatter(v, a, idx)
export VecRange
"""
    VecRange{N}(i::Int)

Analogous to `UnitRange` but for loading SIMD vector of width `N` at
index `i`.

# Examples
```jldoctest
julia> xs = ones(4);

julia> xs[VecRange{4}(1)] # calls `vload(Vec{4,Float64}, xs, 1)`
<4 x Float64>[1.0, 1.0, 1.0, 1.0]
```
"""
struct VecRange{N}
    i::Int  # first index of the N-wide window
end
# A `VecRange{N}` behaves like an integer range of static length `N`.
@inline Base.length(idx::VecRange{N}) where {N} = N
@inline Base.first(idx::VecRange) = idx.i
@inline Base.last(idx::VecRange) = idx.i + length(idx) - 1
# Shifting by an integer moves the window; the width `N` is preserved.
@inline Base.:+(idx::VecRange{N}, j::Integer) where N = VecRange{N}(idx.i + j)
@inline Base.:+(j::Integer, idx::VecRange{N}) where N = VecRange{N}(idx.i + j)
@inline Base.:-(idx::VecRange{N}, j::Integer) where N = VecRange{N}(idx.i - j)
# Bounds-checking support: a `VecRange` is in-bounds iff its whole
# window lies inside `inds`; a `Vec` index iff every lane does.
Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::VecRange) =
    (first(inds) <= first(idx)) && (last(idx) <= last(inds))
Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::Vec) =
    all(first(inds) <= idx) && all(idx <= last(inds))
# Arity check for SIMD indexing: an `N`-dimensional array must receive
# exactly `N` (non-mask) indices, except that a single index is accepted
# for linearly indexable arrays.
@inline _checkarity(::AbstractArray{<:Any,N}, ::Vararg{Any,N}) where {N} =
    nothing
@inline _checkarity(::T, ::Any) where {T <: AbstractArray} =
    if IndexStyle(T) isa IndexLinear
        nothing
    else
        throw(ArgumentError("""
        Array type $T does not support indexing with a single index.
        Exactly $(ndims(T)) (non-mask) indices have to be specified.
        """))
    end
# Fallback: index count does not match the array's dimensionality.
_checkarity(::AbstractArray{<:Any,N}, ::Vararg{Any,M}) where {N,M} =
    throw(ArgumentError("""
    $M indices are given to $N-dimensional array.
    Exactly $N (non-mask) indices have to be specified when using SIMD.
    """))
# Combined with `_preprocessindices`, helper function `_extractmask`
# extracts `mask` in the tail position. As slicing tuple is not
# type-stable, we use reverse-of-tail-of-reverse hack to extract
# `mask` at the end of `args`.
# `_extractmask` is called on the *reversed* trailing arguments, so an
# optional mask at the end of `args` appears first here; it returns the
# remaining indices re-reversed, plus the mask (or `nothing`).
@inline _extractmask(mask::Vec{N,Bool}, R::Vararg{Integer}) where N =
    (reverse(R), mask)
@inline _extractmask(R::Vararg{Integer}) = (reverse(R), nothing)
@inline _extractmask(mask::Vec{N,Bool}) where {N} = ((), mask)
@inline _extractmask() = ((), nothing)
# Anything else in the trailing argument positions is a usage error.
@noinline _extractmask(rargs...) =
    throw(ArgumentError("""
    Using SIMD indexing `array[idx, i2, ..., iN, mask]` for `N`-dimensional
    array requires `i2` to `iN` to be all integers and `mask` to be optionally
    a SIMD vector `Vec` of `Bool`s. Given `(i2, ..., iN, mask)` is
    $(summary(reverse(rargs)))
    """))
# Replace masked-out lanes of `idx` with a known-good index `fst` so the
# subsequent bounds check only constrains the active lanes.
_maskedidx(idx, ::Nothing, ::Any) = idx
_maskedidx(idx::Vec, mask::Vec, fst) = vifelse(mask, idx, fst)
# A `VecRange` is expanded to the explicit per-lane index vector first.
_maskedidx(idx::VecRange, mask::Vec, fst) =
    _maskedidx(Vec(ntuple(i -> i - 1 + idx.i, length(mask))), mask, fst)
# Split the trailing `args` into integer indices `I` plus an optional
# trailing `mask`, verify the index arity, and bounds-check the access
# (masked-out lanes are exempted by substituting the first valid index).
Base.@propagate_inbounds function _preprocessindices(arr, idx, args)
    I, mask = _extractmask(reverse(args)...)
    _checkarity(arr, idx, I...)
    @boundscheck checkbounds(arr,
                             _maskedidx(idx, mask, first(axes(arr, 1))),
                             I...)
    return I, mask
end
"""
    _pointer(arr, i, I)

Pointer to the element `arr[i, I...]`.
"""
Base.@propagate_inbounds _pointer(arr::Union{Array, Base.ReinterpretArray}, i, I) =
    pointer(arr, LinearIndices(arr)[i, I...])
Base.@propagate_inbounds _pointer(arr::Base.FastContiguousSubArray, i, I) =
    pointer(arr, (i, I...))
Base.@propagate_inbounds _pointer(arr::Base.FastContiguousSubArray, i, I::Tuple{}) =
    pointer(arr, i)
# must be separate methods to resolve ambiguity
Base.@propagate_inbounds _pointer(arr::Base.ReinterpretArray, i, I::Tuple{}) =
    pointer(arr, i)
# General `SubArray`: fix the trailing indices with a view, then take the
# pointer at offset `i` within it.
Base.@propagate_inbounds _pointer(arr::SubArray, i, I) =
    pointer(Base.unsafe_view(arr, 1, I...), i)
# `arr[VecRange{N}(i), i2, ..., iN[, mask]]`: contiguous SIMD load of
# width `N` starting at `arr[i, i2, ..., iN]`.
Base.@propagate_inbounds function Base.getindex(
        arr::ContiguousArray{T}, idx::VecRange{N},
        args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T}
    I, mask = _preprocessindices(arr, idx, args)
    return vload(Vec{N,T}, _pointer(arr, idx.i, I), mask)
end
# `arr[VecRange{N}(i), i2, ..., iN[, mask]] = v`: contiguous SIMD store
# of width `N` starting at `arr[i, i2, ..., iN]`.
Base.@propagate_inbounds function Base.setindex!(
        arr::ContiguousArray{T}, v::Vec{N,T}, idx::VecRange{N},
        args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T}
    I, mask = _preprocessindices(arr, idx, args)
    vstore(v, _pointer(arr, idx.i, I), mask)
    return arr
end
# `arr[idx::Vec, i2, ..., iN[, mask]]`: gather via per-lane pointers.
# `_pointer(arr, 1, I) - sizeof(T)` is the address of a virtual element 0
# of the first dimension; adding `sizeof(T) * idx` yields each lane's
# 1-based element address.
Base.@propagate_inbounds function Base.getindex(
        arr::ContiguousArray{T}, idx::Vec{N,<:Integer},
        args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T}
    I, mask = _preprocessindices(arr, idx, args)
    ptrs = _pointer(arr, 1, I) - sizeof(T) + sizeof(T) * idx
    return vgather(ptrs, mask)
end
# `arr[idx::Vec, i2, ..., iN[, mask]] = v`: scatter via per-lane
# pointers, computed the same way as in the gather `getindex` above.
Base.@propagate_inbounds function Base.setindex!(
        arr::ContiguousArray{T}, v::Vec{N,T}, idx::Vec{N,<:Integer},
        args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T}
    I, mask = _preprocessindices(arr, idx, args)
    ptrs = _pointer(arr, 1, I) - sizeof(T) + sizeof(T) * idx
    vscatter(v, ptrs, mask)
    return arr
end