Merge pull request #40 from jipolanco/gpu-indexing
Avoid scalar indexing on GPU
jipolanco authored Jan 28, 2022
2 parents aaa806b + 30474ed commit 938cbb3
Showing 7 changed files with 178 additions and 81 deletions.
1 change: 1 addition & 0 deletions Project.toml
@@ -9,6 +9,7 @@ JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
1 change: 1 addition & 0 deletions src/PencilArrays.jl
@@ -38,6 +38,7 @@ include("size.jl")

include("array_interface.jl")
include("broadcast.jl")
include("random.jl")
include("reductions.jl")
include("gather.jl")

148 changes: 108 additions & 40 deletions src/Transpositions/Transpositions.jl
@@ -174,16 +174,16 @@ function assert_compatible(p::Pencil, q::Pencil)
nothing
end

# Reinterpret UInt8 array as a different type of array.
# Reinterpret UInt8 vector as a different type of array.
# The input array should have enough space for the reinterpreted array with the
# given dimensions.
# This is a workaround to the performance issues when using `reinterpret`.
# See for instance:
# - https://discourse.julialang.org/t/big-overhead-with-the-new-lazy-reshape-reinterpret/7635
# - https://github.com/JuliaLang/julia/issues/28980
function unsafe_as_array(::Type{T}, x::AbstractVector{UInt8}, dims) where T
function unsafe_as_array(::Type{T}, x::AbstractVector{UInt8}, dims) where {T}
p = typeof_ptr(x){T}(pointer(x))
A = unsafe_wrap(typeof_array(x), p, dims, own=false)
unsafe_wrap(typeof_array(x), p, dims, own=false)
end
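
# A rough standalone sketch of the pointer-wrapping trick used by `unsafe_as_array`
# (illustrative only, with plain CPU arrays and made-up sizes; not part of the diff):
# converting the pointer and calling `unsafe_wrap` avoids the lazy
# `ReinterpretArray`/`ReshapedArray` wrappers entirely.
buf = Vector{UInt8}(undef, 4 * sizeof(Float64))  # raw byte buffer
p = Ptr{Float64}(pointer(buf))                   # reinterpret the pointer, not the array
A = unsafe_wrap(Array, p, (2, 2); own = false)   # 2×2 Float64 view of the same memory
fill!(A, 0.0)                                    # writes go straight into `buf`
# `buf` must remain alive (not garbage-collected) while `A` is in use.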

# Only local transposition.
@@ -371,13 +371,15 @@ function transpose_send!(
@assert length_recv_n == length_self
recv_offsets[n] = length_recv
@timeit_debug timer "copy_range!" copy_range!(
recv_buf, length_recv, Ai, local_send_range, exdims, timer)
recv_buf, length_recv, Ai, local_send_range,
)
transpose_send_self!(method, n, requests, buf_info)
index_local_req = n
else
# Copy data into contiguous buffer, then send the buffer.
@timeit_debug timer "copy_range!" copy_range!(
send_buf, isend, Ai, local_send_range, exdims, timer)
send_buf, isend, Ai, local_send_range,
)
transpose_send_other!(
method, buf_info, (length_send_n, length_recv_n), n,
requests, (rank, comm), eltype(Ai),
@@ -498,7 +500,8 @@ function transpose_recv!(

# Copy data to `Ao`, permuting dimensions if required.
@timeit_debug timer "copy_permuted!" copy_permuted!(
Ao, o_range_iperm, recv_buf, off, perm, exdims)
Ao, o_range_iperm, recv_buf, off, perm,
)
end

Ao
@@ -520,54 +523,119 @@ function get_remote_indices(R::Int, coords_local::Dims{M}, Nproc::Int) where M
CartesianIndices(t)
end

# TODO compile for GPU
# maybe using permudedims
function copy_range!(dest::AbstractVector{T}, dest_offset::Int, src::PencilArray{T,N},
src_range::ArrayRegion{P}, extra_dims::Dims{E}, timer,
) where {T,N,P,E}
@assert P + E == N

# Specialisation for CPU arrays.
function copy_range!(
dest::Vector, dest_offset::Integer,
src::PencilArray, src_range_memorder::NTuple,
)
exdims = extra_dims(src)
n = dest_offset
src_p = parent(src) # array with non-permuted indices
for K in CartesianIndices(extra_dims)
for I in CartesianIndices(src_range)
src_p = parent(src) # array with non-permuted indices (memory order)
for K in CartesianIndices(exdims)
for I in CartesianIndices(src_range_memorder)
@inbounds dest[n += 1] = src_p[I, K]
end
end
dest
end

# Generic case avoiding scalar indexing, should work for GPU arrays.
function copy_range!(
dest::AbstractVector, dest_offset::Integer,
src::PencilArray, src_range_memorder::NTuple,
)
exdims = extra_dims(src)
n = dest_offset
src_p = parent(src) # array with non-permuted indices (memory order)
Ks = CartesianIndices(exdims)
Is = CartesianIndices(src_range_memorder)
len = length(Is) * length(Ks)
src_view = @view src_p[Is, Ks]
dst_view = @view dest[(n + 1):(n + len)]
# TODO this allocates on GPUArrays... can it be improved?
copyto!(dst_view, src_view)
dest
end
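
# The key point of the generic method above is that the packing happens in a single
# bulk `copyto!` between two views, so no per-element (scalar) indexing is issued to
# the device. A minimal CPU stand-in of the same pattern (illustrative values; not
# part of the diff):
src  = rand(Float64, 8, 8)                  # stand-in for `parent(src)` in memory order
dest = zeros(Float64, 64)                   # stand-in for the contiguous send buffer
Is   = CartesianIndices((2:5, 3:6))         # local sub-block to pack
len  = length(Is)
copyto!(view(dest, 1:len), view(src, Is))   # one bulk copy instead of a scalar loop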

function copy_permuted!(dest::PencilArray{T,N}, o_range_iperm::ArrayRegion{P},
src::AbstractVector{T}, src_offset::Int,
perm::AbstractPermutation, extra_dims::Dims{E}) where {T,N,P,E}
function copy_permuted!(
dst::PencilArray, o_range_iperm::NTuple,
src::AbstractVector, src_offset::Integer,
perm::AbstractPermutation,
)
N = ndims(dst)
P = length(o_range_iperm)
exdims = extra_dims(dst)
E = length(exdims)
@assert P + E == N

src_view = let src_dims = (map(length, o_range_iperm)..., extra_dims...)
Ndata = prod(src_dims)
n = src_offset
v = Strided.sview(src, (n + 1):(n + Ndata))
Strided.sreshape(v, src_dims)
end
src_dims = (map(length, o_range_iperm)..., exdims...)
src_view = _viewreshape(src, src_dims, src_offset)

dest_view = let dest_p = parent(dest) # array with non-permuted indices
indices = perm * o_range_iperm
v = StridedView(
view(dest_p, indices..., map(Base.OneTo, extra_dims)...)
)
if isidentity(perm)
v
else
pperm = append(perm, Val(E))
# This is the equivalent of a PermutedDimsArray in Strided.jl.
# Note that this is a lazy object (a StridedView)!
permutedims(v, Tuple(inv(pperm))) :: StridedView
end
dst_inds = perm * o_range_iperm # destination indices in memory order
_permutedims!(dst, src_view, dst_inds, perm)

dst
end

# Case of CPU arrays.
# Note that Strided uses scalar indexing at some point, and for that reason it
# doesn't work with GPU arrays.
function _viewreshape(src::Vector, src_dims, n)
N = prod(src_dims)
v = Strided.sview(src, (n + 1):(n + N))
Strided.sreshape(v, src_dims)
end

# Generic case, used in particular for GPU arrays.
function _viewreshape(src::AbstractVector, src_dims, n)
@boundscheck begin
N = prod(src_dims)
checkbounds(src, (n + 1):(n + N))
end
# On GPUs, we use unsafe_wrap to make sure that the returned array is an
# AbstractGPUArray, for which `permutedims!` is implemented in GPUArrays.jl.
unsafe_wrap(typeof_array(src), pointer(src, n + 1), src_dims)
end
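
# The generic `_viewreshape` above wraps the buffer's memory directly so the result
# keeps the same array type as `src` (e.g. a GPU array stays a GPU array). A CPU
# stand-in of the offset wrap (illustrative; not part of the diff):
buf = collect(1.0:10.0)                            # flat receive buffer
n, dims = 4, (2, 3)                                # offset already consumed, and target shape
A = unsafe_wrap(Array, pointer(buf, n + 1), dims)  # 2×3 view of buf[5:10], sharing memory with `buf`
A[1, 1] == buf[5]                                  # true: no data was copied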

function _permutedims!(dst::PencilArray, src, dst_inds, perm)
exdims = extra_dims(dst)
v = view(parent(dst), dst_inds..., map(Base.OneTo, exdims)...)
_permutedims!(typeof_array(pencil(dst)), v, src, perm)
end

copyto!(dest_view, src_view)
# Specialisation for CPU arrays.
# Note that v_in is the raw array (in memory order) wrapped by a PencilArray.
function _permutedims!(::Type{Array}, v_in::SubArray, src, perm)
v = StridedView(v_in)
vperm = if isidentity(perm)
v
else
E = ndims(v) - length(perm) # number of "extra dims"
pperm = append(perm, Val(E))
# This is the equivalent of a PermutedDimsArray in Strided.jl.
# Note that this is a lazy object (a StridedView)!
permutedims(v, Tuple(inv(pperm))) :: StridedView
end
copyto!(vperm, src)
end

dest
# General case, used in particular for GPU arrays.
function _permutedims!(::Type{<:AbstractArray}, v::SubArray, src, perm)
if isidentity(perm)
copyto!(v, src)
else
E = ndims(v) - length(perm) # number of "extra dims"
iperm = inv(append(perm, Val(E)))
# On GPUs, if `src` is an AbstractGPUArray, then there is a `permutedims!`
# implementation for GPUs (in GPUArrays.jl) if the destination is also
# an AbstractGPUArray.
# Note that AbstractGPUArray <: DenseArray, and `v` is generally not
# dense, so we need an intermediate array for the destination.
tmp = similar(src, iperm * size(src)) # TODO avoid allocation!
permutedims!(tmp, src, Tuple(iperm))
copyto!(v, tmp)
end
v
end
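
# On the GPU path, the permutation itself is thus performed by `permutedims!` into a
# dense temporary, followed by a bulk `copyto!` into the (possibly non-contiguous)
# destination view. A plain-Julia sketch of that sequence (made-up sizes and
# permutation; not part of the diff):
src = reshape(collect(1.0:24.0), 2, 3, 4)             # packed buffer, reshaped to memory order
p   = (2, 3, 1)                                       # permutation to apply to the dimensions
tmp = similar(src, ntuple(d -> size(src, p[d]), 3))   # dense temporary with permuted size
permutedims!(tmp, src, p)                             # single bulk permutation, no scalar indexing
# copyto!(v, tmp)  # finally copied into the strided destination view `v`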

end # module Transpositions
6 changes: 5 additions & 1 deletion src/gather.jl
@@ -77,9 +77,13 @@ function gather(x::PencilArray{T,N}, root::Integer=0) where {T, N}
# Unpack data.
dest = Array{T,N}(undef, size_global(x))

# For GPU arrays, this transfers data to the CPU (allocating a new Array).
# If `data` is already an Array{T}, this is non-allocating.
data_cpu = oftype(dest, data)

# Copy local data.
colons_extra_dims = ntuple(n -> Colon(), Val(length(extra_dims)))
dest[pen.axes_local..., colons_extra_dims...] .= data
dest[pen.axes_local..., colons_extra_dims...] .= data_cpu

# Copy remote data.
for m = 2:Nproc
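For reference, `oftype(dest, data)` is just `convert(typeof(dest), data)`, so it only allocates when a device-to-host copy is actually needed. A small CPU illustration (not part of the diff):

dest = Array{Float64}(undef, 4)    # CPU destination buffer
data = [1.0, 2.0, 3.0, 4.0]        # already an Array, so no copy is needed
data_cpu = oftype(dest, data)      # for a GPU array this would allocate and copy to the host
data_cpu === data                  # true: the conversion was a no-op
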
11 changes: 11 additions & 0 deletions src/random.jl
@@ -0,0 +1,11 @@
using Random

function Random.rand!(rng::AbstractRNG, u::PencilArray, ::Type{X}) where {X}
rand!(rng, parent(u), X)
u
end

function Random.randn!(rng::AbstractRNG, u::PencilArray, args...)
randn!(rng, parent(u), args...)
u
end
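These two methods simply forward to the in-place `rand!`/`randn!` of the wrapped array, so the array backend supplies the appropriate (CPU or GPU) random-number generation. A minimal usage sketch, assuming an MPI session and a `Pencil` constructed over `MPI.COMM_WORLD` (assumed constructor form; not part of the diff):

using MPI, PencilArrays, Random
MPI.Init()
pen = Pencil((16, 32, 12), MPI.COMM_WORLD)   # assumed global dimensions and communicator
u = PencilArray{Float64}(undef, pen)
rand!(u)    # fills the local data in place, without scalar indexing on GPU backends
randn!(u)   # same, with normally distributed values
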
79 changes: 43 additions & 36 deletions test/array_types.jl
@@ -5,15 +5,10 @@
using MPI
using PencilArrays
using PencilArrays: typeof_array
using OffsetArrays
using LinearAlgebra: transpose!
using Random
using Test

# TODO
# Some operations still need to be adapted for GPU arrays to remove @allowscalar
using GPUArrays: @allowscalar

include("include/jlarray.jl")
using .JLArrays

@@ -31,8 +26,9 @@ Base.getindex(u::TestArray, args...) = getindex(u.data, args...)
Base.setindex!(u::TestArray, args...) = setindex!(u.data, args...)
Base.resize!(u::TestArray, args...) = (resize!(u.data, args...); u)
Base.pointer(u::TestArray) = pointer(u.data)
Base.unsafe_wrap(::Type{TestArray}, args...; kws...) =
TestArray(unsafe_wrap(Array, args...; kws...))
Base.pointer(u::TestArray, n::Integer) = pointer(u.data, n) # needed to avoid ambiguity
Base.unsafe_wrap(::Type{TestArray}, p::Ptr, dims::Union{Integer, Dims}; kws...) =
TestArray(unsafe_wrap(Array, p, dims; kws...))
MPI.Buffer(u::TestArray) = MPI.Buffer(u.data) # for `gather`

# A bit of type piracy to help tests pass.
@@ -43,7 +39,7 @@ MPI.Init()
comm = MPI.COMM_WORLD
MPI.Comm_rank(comm) == 0 || redirect_stdout(devnull)

@testset "Array type: $A" for A ∈ (Array, TestArray, JLArray)
@testset "Array type: $A" for A ∈ (JLArray, Array, TestArray)
pen = @inferred Pencil(A, (8, 10), comm)
@test @inferred(typeof_array(pen)) === A
@test (@inferred (p -> p.send_buf)(pen)) isa A
@@ -53,44 +49,55 @@ MPI.Comm_rank(comm) == 0 || redirect_stdout(devnull)
# fails.
ArrayOther = A === Array ? TestArray : Array
let dims = size_local(pen, MemoryOrder())
data = ArrayOther{Int}(undef, dims)
data = ArrayOther{Float32}(undef, dims)
@test_throws ArgumentError PencilArray(pen, data)
end

u = @inferred PencilArray{Int}(undef, pen)
@test typeof(parent(u)) <: A{Int}
u = @inferred PencilArray{Float32}(undef, pen)
@test typeof(parent(u)) <: A{Float32}

@test @inferred(typeof_array(pen)) === A
@test @inferred(typeof_array(u)) === A

# This is in particular to test that, for GPU arrays, scalar indexing is not
# performed and the correct GPU functions are called.
@testset "Initialisation" begin
@test_nowarn fill!(u, 4)
@test_nowarn rand!(u)
@test_nowarn randn!(u)
end

px = @inferred Pencil(A, (20, 16), (1, ), comm)
py = @inferred Pencil(px; decomp_dims = (2, ), permute = Permutation(2, 1))
@test permutation(py) == Permutation(2, 1)
@test @inferred(typeof_array(px)) === A
@test @inferred(typeof_array(py)) === A

@testset "Transpositions" begin
ux = @allowscalar rand!(PencilArray{Float64}(undef, px))
uy = @inferred similar(ux, py)
@test pencil(uy) === py
tr = @inferred Transpositions.Transposition(uy, ux)
@allowscalar transpose!(tr)
@testset "Permutation: $perm" for perm ∈ (NoPermutation(), Permutation(2, 1))
py = @inferred Pencil(px; decomp_dims = (2, ), permute = perm)
@test permutation(py) == perm
@test @inferred(typeof_array(px)) === A
@test @inferred(typeof_array(py)) === A

@testset "Transpositions" begin
ux = @test_nowarn rand!(PencilArray{Float64}(undef, px))
uy = @inferred similar(ux, py)
@test pencil(uy) === py
tr = @inferred Transpositions.Transposition(uy, ux)
transpose!(tr)

# Verify transposition
gx = @allowscalar @inferred Nothing gather(ux)
gy = @allowscalar @inferred Nothing gather(uy)
if !(nothing === gx === gy)
@test typeof(gx) === typeof(gy) <: Array
@test gx == gy
# Verify transposition
gx = @inferred Nothing gather(ux)
gy = @inferred Nothing gather(uy)
if !(nothing === gx === gy)
@test typeof(gx) === typeof(gy) <: Array
@test gx == gy
end
end
end

@testset "Multiarrays" begin
M = @inferred ManyPencilArray{Float32}(undef, px, py)
ux = @inferred first(M)
uy = @inferred last(M)
uxp = parent(parent(parent(ux)))
@test uxp === parent(parent(parent(uy)))
@test typeof(uxp) <: A{Float32}
end
@testset "Multiarrays" begin
M = @inferred ManyPencilArray{Float32}(undef, px, py)
ux = @inferred first(M)
uy = @inferred last(M)
uxp = parent(parent(parent(ux)))
@test uxp === parent(parent(parent(uy)))
@test typeof(uxp) <: A{Float32}
end
end # permutation
end
13 changes: 9 additions & 4 deletions test/include/jlarray.jl
@@ -9,6 +9,7 @@
# in CUDA.jl):
# - resize!(::DenseJLVector, n)
# - unsafe_wrap(::Type{JLArray}, ...)
# - rand!(::AbstractRNG, ::JLArray, ...)

# ============================================================================ #

@@ -339,10 +340,9 @@ Base.copyto!(dest::DenseJLArray{T}, source::DenseJLArray{T}) where {T} =
Base.resize!(u::DenseJLVector, n) = (resize!(u.data, n); u)

# Added for PencilArrays tests
function Base.unsafe_wrap(::Type{JLArray}, args...; kws...)
data = unsafe_wrap(Array, args...; kws...)
T, N = eltype(data), ndims(data)
JLArray{T, N}(data, size(data))
function Base.unsafe_wrap(::Type{JLArray}, p::Ptr, dims::Union{Integer, Dims}; kws...)
data = unsafe_wrap(Array, p, dims; kws...)
JLArray(data)
end

## random number generation
@@ -361,6 +361,11 @@ function GPUArrays.default_rng(::Type{<:JLArray})
GLOBAL_RNG[]
end

# Added for PencilArrays tests
function Random.rand!(rng::AbstractRNG, u::JLArray, ::Type{X}) where {X}
rand!(rng, u.data, X)
u
end

## GPUArrays interfaces
