Merge pull request #40 from jipolanco/gpu-indexing
Avoid scalar indexing on GPU
jipolanco authored Jan 28, 2022
2 parents aaa806b + 30474ed commit 938cbb3
Showing 7 changed files with 178 additions and 81 deletions.
1 change: 1 addition & 0 deletions Project.toml
@@ -9,6 +9,7 @@ JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
1 change: 1 addition & 0 deletions src/PencilArrays.jl
@@ -38,6 +38,7 @@ include("size.jl")

include("array_interface.jl")
include("broadcast.jl")
include("random.jl")
include("reductions.jl")
include("gather.jl")

148 changes: 108 additions & 40 deletions src/Transpositions/Transpositions.jl
@@ -174,16 +174,16 @@ function assert_compatible(p::Pencil, q::Pencil)
nothing
end

# Reinterpret UInt8 array as a different type of array.
# Reinterpret UInt8 vector as a different type of array.
# The input array should have enough space for the reinterpreted array with the
# given dimensions.
# This is a workaround to the performance issues when using `reinterpret`.
# See for instance:
# - https://discourse.julialang.org/t/big-overhead-with-the-new-lazy-reshape-reinterpret/7635
# - https://github.com/JuliaLang/julia/issues/28980
function unsafe_as_array(::Type{T}, x::AbstractVector{UInt8}, dims) where T
function unsafe_as_array(::Type{T}, x::AbstractVector{UInt8}, dims) where {T}
p = typeof_ptr(x){T}(pointer(x))
A = unsafe_wrap(typeof_array(x), p, dims, own=false)
unsafe_wrap(typeof_array(x), p, dims, own=false)
end
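
# A rough standalone sketch of the pointer-wrapping trick used by `unsafe_as_array`
# (illustrative only, with plain CPU arrays and made-up sizes; not part of the diff):
# converting the pointer and calling `unsafe_wrap` avoids the lazy
# `ReinterpretArray`/`ReshapedArray` wrappers entirely.
buf = Vector{UInt8}(undef, 4 * sizeof(Float64))  # raw byte buffer
p = Ptr{Float64}(pointer(buf))                   # reinterpret the pointer, not the array
A = unsafe_wrap(Array, p, (2, 2); own = false)   # 2×2 Float64 view of the same memory
fill!(A, 0.0)                                    # writes go straight into `buf`
# `buf` must remain alive (not garbage-collected) while `A` is in use.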

# Only local transposition.
@@ -371,13 +371,15 @@ function transpose_send!(
@assert length_recv_n == length_self
recv_offsets[n] = length_recv
@timeit_debug timer "copy_range!" copy_range!(
recv_buf, length_recv, Ai, local_send_range, exdims, timer)
recv_buf, length_recv, Ai, local_send_range,
)
transpose_send_self!(method, n, requests, buf_info)
index_local_req = n
else
# Copy data into contiguous buffer, then send the buffer.
@timeit_debug timer "copy_range!" copy_range!(
send_buf, isend, Ai, local_send_range, exdims, timer)
send_buf, isend, Ai, local_send_range,
)
transpose_send_other!(
method, buf_info, (length_send_n, length_recv_n), n,
requests, (rank, comm), eltype(Ai),
@@ -498,7 +500,8 @@ function transpose_recv!(

# Copy data to `Ao`, permuting dimensions if required.
@timeit_debug timer "copy_permuted!" copy_permuted!(
Ao, o_range_iperm, recv_buf, off, perm, exdims)
Ao, o_range_iperm, recv_buf, off, perm,
)
end

Ao
@@ -520,54 +523,119 @@ function get_remote_indices(R::Int, coords_local::Dims{M}, Nproc::Int) where M
CartesianIndices(t)
end

# TODO compile for GPU
# maybe using permudedims
function copy_range!(dest::AbstractVector{T}, dest_offset::Int, src::PencilArray{T,N},
src_range::ArrayRegion{P}, extra_dims::Dims{E}, timer,
) where {T,N,P,E}
@assert P + E == N

# Specialisation for CPU arrays.
function copy_range!(
dest::Vector, dest_offset::Integer,
src::PencilArray, src_range_memorder::NTuple,
)
exdims = extra_dims(src)
n = dest_offset
src_p = parent(src) # array with non-permuted indices
for K in CartesianIndices(extra_dims)
for I in CartesianIndices(src_range)
src_p = parent(src) # array with non-permuted indices (memory order)
for K in CartesianIndices(exdims)
for I in CartesianIndices(src_range_memorder)
@inbounds dest[n += 1] = src_p[I, K]
end
end
dest
end

# Generic case avoiding scalar indexing, should work for GPU arrays.
function copy_range!(
dest::AbstractVector, dest_offset::Integer,
src::PencilArray, src_range_memorder::NTuple,
)
exdims = extra_dims(src)
n = dest_offset
src_p = parent(src) # array with non-permuted indices (memory order)
Ks = CartesianIndices(exdims)
Is = CartesianIndices(src_range_memorder)
len = length(Is) * length(Ks)
src_view = @view src_p[Is, Ks]
dst_view = @view dest[(n + 1):(n + len)]
# TODO this allocates on GPUArrays... can it be improved?
copyto!(dst_view, src_view)
dest
end
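
# The key point of the generic method above is that the packing happens in a single
# bulk `copyto!` between two views, so no per-element (scalar) indexing is issued to
# the device. A minimal CPU stand-in of the same pattern (illustrative values; not
# part of the diff):
src  = rand(Float64, 8, 8)                  # stand-in for `parent(src)` in memory order
dest = zeros(Float64, 64)                   # stand-in for the contiguous send buffer
Is   = CartesianIndices((2:5, 3:6))         # local sub-block to pack
len  = length(Is)
copyto!(view(dest, 1:len), view(src, Is))   # one bulk copy instead of a scalar loop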

function copy_permuted!(dest::PencilArray{T,N}, o_range_iperm::ArrayRegion{P},
src::AbstractVector{T}, src_offset::Int,
perm::AbstractPermutation, extra_dims::Dims{E}) where {T,N,P,E}
function copy_permuted!(
dst::PencilArray, o_range_iperm::NTuple,
src::AbstractVector, src_offset::Integer,
perm::AbstractPermutation,
)
N = ndims(dst)
P = length(o_range_iperm)
exdims = extra_dims(dst)
E = length(exdims)
@assert P + E == N

src_view = let src_dims = (map(length, o_range_iperm)..., extra_dims...)
Ndata = prod(src_dims)
n = src_offset
v = Strided.sview(src, (n + 1):(n + Ndata))
Strided.sreshape(v, src_dims)
end
src_dims = (map(length, o_range_iperm)..., exdims...)
src_view = _viewreshape(src, src_dims, src_offset)

dest_view = let dest_p = parent(dest) # array with non-permuted indices
indices = perm * o_range_iperm
v = StridedView(
view(dest_p, indices..., map(Base.OneTo, extra_dims)...)
)
if isidentity(perm)
v
else
pperm = append(perm, Val(E))
# This is the equivalent of a PermutedDimsArray in Strided.jl.
# Note that this is a lazy object (a StridedView)!
permutedims(v, Tuple(inv(pperm))) :: StridedView
end
dst_inds = perm * o_range_iperm # destination indices in memory order
_permutedims!(dst, src_view, dst_inds, perm)

dst
end

# Case of CPU arrays.
# Note that Strided uses scalar indexing at some point, and for that reason it
# doesn't work with GPU arrays.
function _viewreshape(src::Vector, src_dims, n)
N = prod(src_dims)
v = Strided.sview(src, (n + 1):(n + N))
Strided.sreshape(v, src_dims)
end

# Generic case, used in particular for GPU arrays.
function _viewreshape(src::AbstractVector, src_dims, n)
@boundscheck begin
N = prod(src_dims)
checkbounds(src, (n + 1):(n + N))
end
# On GPUs, we use unsafe_wrap to make sure that the returned array is an
# AbstractGPUArray, for which `permutedims!` is implemented in GPUArrays.jl.
unsafe_wrap(typeof_array(src), pointer(src, n + 1), src_dims)
end
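
# The generic `_viewreshape` above wraps the buffer's memory directly so the result
# keeps the same array type as `src` (e.g. a GPU array stays a GPU array). A CPU
# stand-in of the offset wrap (illustrative; not part of the diff):
buf = collect(1.0:10.0)                            # flat receive buffer
n, dims = 4, (2, 3)                                # offset already consumed, and target shape
A = unsafe_wrap(Array, pointer(buf, n + 1), dims)  # 2×3 view of buf[5:10], sharing memory with `buf`
A[1, 1] == buf[5]                                  # true: no data was copied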

function _permutedims!(dst::PencilArray, src, dst_inds, perm)
exdims = extra_dims(dst)
v = view(parent(dst), dst_inds..., map(Base.OneTo, exdims)...)
_permutedims!(typeof_array(pencil(dst)), v, src, perm)
end

copyto!(dest_view, src_view)
# Specialisation for CPU arrays.
# Note that v_in is the raw array (in memory order) wrapped by a PencilArray.
function _permutedims!(::Type{Array}, v_in::SubArray, src, perm)
v = StridedView(v_in)
vperm = if isidentity(perm)
v
else
E = ndims(v) - length(perm) # number of "extra dims"
pperm = append(perm, Val(E))
# This is the equivalent of a PermutedDimsArray in Strided.jl.
# Note that this is a lazy object (a StridedView)!
permutedims(v, Tuple(inv(pperm))) :: StridedView
end
copyto!(vperm, src)
end

dest
# General case, used in particular for GPU arrays.
function _permutedims!(::Type{<:AbstractArray}, v::SubArray, src, perm)
if isidentity(perm)
copyto!(v, src)
else
E = ndims(v) - length(perm) # number of "extra dims"
iperm = inv(append(perm, Val(E)))
# On GPUs, if `src` is an AbstractGPUArray, then there is a `permutedims!`
# implementation for GPUs (in GPUArrays.jl) if the destination is also
# an AbstractGPUArray.
# Note that AbstractGPUArray <: DenseArray, and `v` is generally not
# dense, so we need an intermediate array for the destination.
tmp = similar(src, iperm * size(src)) # TODO avoid allocation!
permutedims!(tmp, src, Tuple(iperm))
copyto!(v, tmp)
end
v
end
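
# On the GPU path, the permutation itself is thus performed by `permutedims!` into a
# dense temporary, followed by a bulk `copyto!` into the (possibly non-contiguous)
# destination view. A plain-Julia sketch of that sequence (made-up sizes and
# permutation; not part of the diff):
src = reshape(collect(1.0:24.0), 2, 3, 4)             # packed buffer, reshaped to memory order
p   = (2, 3, 1)                                       # permutation to apply to the dimensions
tmp = similar(src, ntuple(d -> size(src, p[d]), 3))   # dense temporary with permuted size
permutedims!(tmp, src, p)                             # single bulk permutation, no scalar indexing
# copyto!(v, tmp)  # finally copied into the strided destination view `v`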

end # module Transpositions
6 changes: 5 additions & 1 deletion src/gather.jl
@@ -77,9 +77,13 @@ function gather(x::PencilArray{T,N}, root::Integer=0) where {T, N}
# Unpack data.
dest = Array{T,N}(undef, size_global(x))

# For GPU arrays, this transfers data to the CPU (allocating a new Array).
# If `data` is already an Array{T}, this is non-allocating.
data_cpu = oftype(dest, data)

# Copy local data.
colons_extra_dims = ntuple(n -> Colon(), Val(length(extra_dims)))
dest[pen.axes_local..., colons_extra_dims...] .= data
dest[pen.axes_local..., colons_extra_dims...] .= data_cpu

# Copy remote data.
for m = 2:Nproc
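For reference, `oftype(dest, data)` is just `convert(typeof(dest), data)`, so it only allocates when a device-to-host copy is actually needed. A small CPU illustration (not part of the diff):

dest = Array{Float64}(undef, 4)    # CPU destination buffer
data = [1.0, 2.0, 3.0, 4.0]        # already an Array, so no copy is needed
data_cpu = oftype(dest, data)      # for a GPU array this would allocate and copy to the host
data_cpu === data                  # true: the conversion was a no-op
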
11 changes: 11 additions & 0 deletions src/random.jl
@@ -0,0 +1,11 @@
using Random

function Random.rand!(rng::AbstractRNG, u::PencilArray, ::Type{X}) where {X}
rand!(rng, parent(u), X)
u
end

function Random.randn!(rng::AbstractRNG, u::PencilArray, args...)
randn!(rng, parent(u), args...)
u
end
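These two methods simply forward to the in-place `rand!`/`randn!` of the wrapped array, so the array backend supplies the appropriate (CPU or GPU) random-number generation. A minimal usage sketch, assuming an MPI session and a `Pencil` constructed over `MPI.COMM_WORLD` (assumed constructor form; not part of the diff):

using MPI, PencilArrays, Random
MPI.Init()
pen = Pencil((16, 32, 12), MPI.COMM_WORLD)   # assumed global dimensions and communicator
u = PencilArray{Float64}(undef, pen)
rand!(u)    # fills the local data in place, without scalar indexing on GPU backends
randn!(u)   # same, with normally distributed values
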
79 changes: 43 additions & 36 deletions test/array_types.jl
@@ -5,15 +5,10 @@
using MPI
using PencilArrays
using PencilArrays: typeof_array
using OffsetArrays
using LinearAlgebra: transpose!
using Random
using Test

# TODO
# Some operations still need to be adapted for GPU arrays to remove @allowscalar
using GPUArrays: @allowscalar

include("include/jlarray.jl")
using .JLArrays

@@ -31,8 +26,9 @@ Base.getindex(u::TestArray, args...) = getindex(u.data, args...)
Base.setindex!(u::TestArray, args...) = setindex!(u.data, args...)
Base.resize!(u::TestArray, args...) = (resize!(u.data, args...); u)
Base.pointer(u::TestArray) = pointer(u.data)
Base.unsafe_wrap(::Type{TestArray}, args...; kws...) =
TestArray(unsafe_wrap(Array, args...; kws...))
Base.pointer(u::TestArray, n::Integer) = pointer(u.data, n) # needed to avoid ambiguity
Base.unsafe_wrap(::Type{TestArray}, p::Ptr, dims::Union{Integer, Dims}; kws...) =
TestArray(unsafe_wrap(Array, p, dims; kws...))
MPI.Buffer(u::TestArray) = MPI.Buffer(u.data) # for `gather`

# A bit of type piracy to help tests pass.
@@ -43,7 +39,7 @@ MPI.Init()
comm = MPI.COMM_WORLD
MPI.Comm_rank(comm) == 0 || redirect_stdout(devnull)

@testset "Array type: $A" for A ∈ (Array, TestArray, JLArray)
@testset "Array type: $A" for A ∈ (JLArray, Array, TestArray)
pen = @inferred Pencil(A, (8, 10), comm)
@test @inferred(typeof_array(pen)) === A
@test (@inferred (p -> p.send_buf)(pen)) isa A
@@ -53,44 +49,55 @@ MPI.Comm_rank(comm) == 0 || redirect_stdout(devnull)
# fails.
ArrayOther = A === Array ? TestArray : Array
let dims = size_local(pen, MemoryOrder())
data = ArrayOther{Int}(undef, dims)
data = ArrayOther{Float32}(undef, dims)
@test_throws ArgumentError PencilArray(pen, data)
end

u = @inferred PencilArray{Int}(undef, pen)
@test typeof(parent(u)) <: A{Int}
u = @inferred PencilArray{Float32}(undef, pen)
@test typeof(parent(u)) <: A{Float32}

@test @inferred(typeof_array(pen)) === A
@test @inferred(typeof_array(u)) === A

# This is in particular to test that, for GPU arrays, scalar indexing is not
# performed and the correct GPU functions are called.
@testset "Initialisation" begin
@test_nowarn fill!(u, 4)
@test_nowarn rand!(u)
@test_nowarn randn!(u)
end

px = @inferred Pencil(A, (20, 16), (1, ), comm)
py = @inferred Pencil(px; decomp_dims = (2, ), permute = Permutation(2, 1))
@test permutation(py) == Permutation(2, 1)
@test @inferred(typeof_array(px)) === A
@test @inferred(typeof_array(py)) === A

@testset "Transpositions" begin
ux = @allowscalar rand!(PencilArray{Float64}(undef, px))
uy = @inferred similar(ux, py)
@test pencil(uy) === py
tr = @inferred Transpositions.Transposition(uy, ux)
@allowscalar transpose!(tr)
@testset "Permutation: $perm" for perm ∈ (NoPermutation(), Permutation(2, 1))
py = @inferred Pencil(px; decomp_dims = (2, ), permute = perm)
@test permutation(py) == perm
@test @inferred(typeof_array(px)) === A
@test @inferred(typeof_array(py)) === A

@testset "Transpositions" begin
ux = @test_nowarn rand!(PencilArray{Float64}(undef, px))
uy = @inferred similar(ux, py)
@test pencil(uy) === py
tr = @inferred Transpositions.Transposition(uy, ux)
transpose!(tr)

# Verify transposition
gx = @allowscalar @inferred Nothing gather(ux)
gy = @allowscalar @inferred Nothing gather(uy)
if !(nothing === gx === gy)
@test typeof(gx) === typeof(gy) <: Array
@test gx == gy
# Verify transposition
gx = @inferred Nothing gather(ux)
gy = @inferred Nothing gather(uy)
if !(nothing === gx === gy)
@test typeof(gx) === typeof(gy) <: Array
@test gx == gy
end
end
end

@testset "Multiarrays" begin
M = @inferred ManyPencilArray{Float32}(undef, px, py)
ux = @inferred first(M)
uy = @inferred last(M)
uxp = parent(parent(parent(ux)))
@test uxp === parent(parent(parent(uy)))
@test typeof(uxp) <: A{Float32}
end
@testset "Multiarrays" begin
M = @inferred ManyPencilArray{Float32}(undef, px, py)
ux = @inferred first(M)
uy = @inferred last(M)
uxp = parent(parent(parent(ux)))
@test uxp === parent(parent(parent(uy)))
@test typeof(uxp) <: A{Float32}
end
end # permutation
end
13 changes: 9 additions & 4 deletions test/include/jlarray.jl
@@ -9,6 +9,7 @@
# in CUDA.jl):
# - resize!(::DenseJLVector, n)
# - unsafe_wrap(::Type{JLArray}, ...)
# - rand!(::AbstractRNG, ::JLArray, ...)

# ============================================================================ #

@@ -339,10 +340,9 @@ Base.copyto!(dest::DenseJLArray{T}, source::DenseJLArray{T}) where {T} =
Base.resize!(u::DenseJLVector, n) = (resize!(u.data, n); u)

# Added for PencilArrays tests
function Base.unsafe_wrap(::Type{JLArray}, args...; kws...)
data = unsafe_wrap(Array, args...; kws...)
T, N = eltype(data), ndims(data)
JLArray{T, N}(data, size(data))
function Base.unsafe_wrap(::Type{JLArray}, p::Ptr, dims::Union{Integer, Dims}; kws...)
data = unsafe_wrap(Array, p, dims; kws...)
JLArray(data)
end

## random number generation
@@ -361,6 +361,11 @@ function GPUArrays.default_rng(::Type{<:JLArray})
GLOBAL_RNG[]
end

# Added for PencilArrays tests
function Random.rand!(rng::AbstractRNG, u::JLArray, ::Type{X}) where {X}
rand!(rng, u.data, X)
u
end

## GPUArrays interfaces
