
Commit d004e29

Merge pull request #51 from JuliaGPU/tb/release
Prepare for initial release
2 parents: 039bc01 + 08fddad

File tree: 7 files changed (+63 -36 lines)


Project.toml (+2 -2)
@@ -1,6 +1,6 @@
 name = "NCCL"
 uuid = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
-version = "0.2.0"
+version = "0.1.0"
 
 [deps]
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
@@ -9,4 +9,4 @@ NCCL_jll = "4d6d38e4-5b87-5e63-912a-873ff2d649b7"
 
 [compat]
 CEnum = "0.2, 0.3, 0.4, 0.5"
-julia = "1.6"
+julia = "1.8"

docs/src/index.md (+4 -3)
@@ -1,6 +1,6 @@
 # NCCL.jl
 
-A Julia wrapper for the [NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl).
+A Julia wrapper for the [NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl).
 
 # API
 
@@ -18,8 +18,9 @@ version()
 ## Communicators
 
 ```@docs
+Communicator
 Communicators
-CUDA.CuDevice(comm::Communicator)
+device(comm::Communicator)
 size(comm::Communicator)
 rank(comm::Communicator)
 abort(comm::Communicator)
@@ -50,4 +51,4 @@ avg
 ```@docs
 Send
 Recv!
-```
+```

src/base.jl (-3)
@@ -17,9 +17,6 @@ function version()
     ncclGetVersion(ver_r)
     ver = ver_r[]
 
-    # nccl.h defines this as:
-    #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
-
     if ver < 2900
         major, ver = divrem(ver, 1000)
         minor, patch = divrem(ver, 100)
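The removed comment documented how nccl.h packs the version number, which the surrounding `divrem` code unpacks. As a rough illustration only (the helper `decode_nccl_version` below is hypothetical, not part of NCCL.jl):

```julia
# Hypothetical helper sketching the decoding: per the removed nccl.h macro,
# versions before 2.9 are packed as major*1000 + minor*100 + patch,
# and 2.9+ as major*10000 + minor*100 + patch.
function decode_nccl_version(ver::Integer)
    if ver < 2900
        major, rest = divrem(ver, 1000)
    else
        major, rest = divrem(ver, 10000)
    end
    minor, patch = divrem(rest, 100)
    return VersionNumber(major, minor, patch)
end

decode_nccl_version(2804)   # v"2.8.4"
decode_nccl_version(21805)  # v"2.18.5"
```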

src/collective.jl (+19 -8)
@@ -9,7 +9,8 @@ or [`NCCL.avg`](@ref)), writing the result to `recvbuf` to all ranks.
 # External links
 - [`ncclAllReduce`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclallreduce)
 """
-function Allreduce!(sendbuf, recvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm))
+function Allreduce!(sendbuf, recvbuf, op, comm::Communicator;
+                    stream::CuStream=default_device_stream(comm))
     count = length(recvbuf)
     @assert length(sendbuf) == count
     data_type = ncclDataType_t(eltype(recvbuf))
@@ -27,8 +28,10 @@ end
 Reduce the array `sendrecvbuf` using `op` (one of `+`, `*`, `min`, `max`,
 or `[`NCCL.avg`](@ref)`), writing the result inplace to all ranks.
 """
-Allreduce!(sendrecvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm) ) =
+function Allreduce!(sendrecvbuf, op, comm::Communicator;
+                    stream::CuStream=default_device_stream(comm))
     Allreduce!(sendrecvbuf, sendrecvbuf, op, comm; stream)
+end
 
 """
     NCCL.Broadcast!(
@@ -41,14 +44,17 @@ Copies array the `sendbuf` on rank `root` to `recvbuf` on all ranks.
 # External links
 - [`ncclBroadcast`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclbroadcast)
 """
-function Broadcast!(sendbuf, recvbuf, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm))
+function Broadcast!(sendbuf, recvbuf, comm::Communicator; root::Integer=0,
+                    stream::CuStream=default_device_stream(comm))
     data_type = ncclDataType_t(eltype(recvbuf))
     count = length(recvbuf)
     ncclBroadcast(sendbuf, recvbuf, count, data_type, root, comm, stream)
     return recvbuf
 end
-Broadcast!(sendrecvbuf, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm)) =
+function Broadcast!(sendrecvbuf, comm::Communicator; root::Integer=0,
+                    stream::CuStream=default_device_stream(comm))
     Broadcast!(sendrecvbuf, sendrecvbuf, comm; root, stream)
+end
 
 
 """
@@ -63,15 +69,18 @@ or `[`NCCL.avg`](@ref)`), writing the result to `recvbuf` on rank `root`.
 # External links
 - [`ncclReduce`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclreduce)
 """
-function Reduce!(sendbuf, recvbuf, op, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm))
+function Reduce!(sendbuf, recvbuf, op, comm::Communicator; root::Integer=0,
+                 stream::CuStream=default_device_stream(comm))
     data_type = ncclDataType_t(eltype(recvbuf))
     count = length(recvbuf)
     _op = ncclRedOp_t(op)
     ncclReduce(sendbuf, recvbuf, count, data_type, _op, root, comm, stream)
     return recvbuf
 end
-Reduce!(sendrecvbuf, op, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm)) =
+function Reduce!(sendrecvbuf, op, comm::Communicator; root::Integer=0,
+                 stream::CuStream=default_device_stream(comm))
     Reduce!(sendrecvbuf, sendrecvbuf, op, comm; root, stream)
+end
 
 """
     NCCL.Allgather!(
@@ -84,7 +93,8 @@ Concatenate `sendbuf` from each rank into `recvbuf` on all ranks.
 # External links
 - [`ncclAllGather`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclallgather)
 """
-function Allgather!(sendbuf, recvbuf, comm::Communicator; stream::CuStream=default_device_stream(comm))
+function Allgather!(sendbuf, recvbuf, comm::Communicator;
+                    stream::CuStream=default_device_stream(comm))
     data_type = ncclDataType_t(eltype(recvbuf))
     sendcount = length(sendbuf)
     @assert length(recvbuf) == sendcount * size(comm)
@@ -105,7 +115,8 @@ scattered over the devices such that `recvbuf` on each rank will contain the
 # External links
 - [`ncclReduceScatter`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclreducescatter)
 """
-function ReduceScatter!(sendbuf, recvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm) )
+function ReduceScatter!(sendbuf, recvbuf, op, comm::Communicator;
+                        stream::CuStream=default_device_stream(comm))
     recvcount = length(recvbuf)
     @assert length(sendbuf) == recvcount * size(comm)
     data_type = ncclDataType_t(eltype(recvbuf))
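These hunks only move the `stream` keyword onto its own line; call sites are unchanged. A rough usage sketch of the collectives touched above, assuming an already-initialized communicator `comm` (not shown in this diff) and CUDA.jl arrays; buffer sizes are illustrative:

```julia
using CUDA, NCCL

# buffers live on the device owned by `comm`
sendbuf = CUDA.ones(Float32, 1024)
recvbuf = CUDA.zeros(Float32, 1024)

NCCL.Allreduce!(sendbuf, recvbuf, +, comm)        # sum across ranks, result on every rank
NCCL.Broadcast!(sendbuf, recvbuf, comm; root=0)   # copy rank 0's buffer to every rank
NCCL.Reduce!(sendbuf, recvbuf, +, comm; root=0)   # sum across ranks, result only on rank 0

# Allgather! needs recvbuf sized by NCCL.size(comm), per the @assert above
gatherbuf = CUDA.zeros(Float32, 1024 * NCCL.size(comm))
NCCL.Allgather!(sendbuf, gatherbuf, comm)

# all calls are asynchronous on the communicator's default device stream
CUDA.synchronize()
```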

src/communicator.jl (+26 -11)
@@ -22,25 +22,42 @@ function destroy(comm::Communicator)
 end
 Base.unsafe_convert(::Type{LibNCCL.ncclComm_t}, comm::Communicator) = comm.handle
 
-# creates a new communicator (multi thread/process version)
-function Communicator(nranks::Integer, comm_id::UniqueID, rank::Integer)
+"""
+    NCCL.Communicator(nranks, rank; [unique_id]) :: Communicator
+
+Create a single communicator for use in a multi-threaded or multi-process
+environment. `nranks` is the number of ranks in the communicator, and `rank`
+is the 0-based index of the current rank. `unique_id` is an optional unique
+identifier for the communicator.
+
+# Examples
+```
+comm = Communicator(length(CUDA.devices()), id, myid())
+# this blocks until all other ranks have connected
+```
+
+# External links
+- [`ncclCommInitRank`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclCommInitRank)
+"""
+function Communicator(nranks::Integer, rank::Integer;
+                      unique_id::UniqueID=UniqueID())
+    0 <= rank < nranks || throw(ArgumentError("rank must be in [0, nranks)"))
     handle_ref = Ref{ncclComm_t}(C_NULL)
-    ncclCommInitRank(handle_ref, nranks, comm_id, rank)
+    ncclCommInitRank(handle_ref, nranks, unique_id, rank)
     c = Communicator(handle_ref[])
     return finalizer(destroy, c)
 end
 
-# creates a clique of communicators (single process version)
 """
     NCCL.Communicators(devices) :: Vector{Communicator}
 
-Construct and initialize a clique of NCCL Communicators.
+Construct and initialize a clique of NCCL Communicators over the devices
+on a single host.
 
 `devices` can either be a collection of identifiers, or `CuDevice`s.
 
 # Examples
 ```
-# initialize a clique over all devices on the host
 comms = NCCL.Communicators(CUDA.devices())
 ```
 
@@ -64,20 +81,19 @@ function Communicators(devices)
 end
 
 """
-    CuDevice(comm::Communicator) :: CuDevice
+    NCCL.device(comm::Communicator) :: CuDevice
 
 The device of the communicator
 
 # External Links
 - [`ncclCommCuDevice`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommcudevice)
 """
-function CUDA.CuDevice(comm::Communicator)
+function device(comm::Communicator)
     dev_ref = Ref{Cint}(C_NULL)
     ncclCommCuDevice(comm, dev_ref)
     return CuDevice(dev_ref[])
 end
 
-
 """
     NCCL.size(comm::Communicator) :: Int
@@ -120,15 +136,14 @@ function abort(comm::Communicator)
     return
 end
 
-
 """
     NCCL.default_device_stream(comm::Communicator) :: CuStream
 
 Get the default stream for device `devid`, or the device corresponding to
 communicator `comm`.
 """
 function default_device_stream(comm::Communicator)
-    dev = CuDevice(comm)
+    dev = device(comm)
     device!(dev) do
         stream()
     end
src/pointtopoint.jl (+6 -4)
@@ -2,7 +2,7 @@
     NCCL.Send(
         sendbuf, comm::Communicator;
         dest::Integer,
-        stream::CuStream = default_device_stream(comm))
+        stream::CuStream = default_device_stream(comm))
     )
 
 Send data from `sendbuf` to rank `dest`. A matching [`Recv!`](@ref) must also be
@@ -11,7 +11,8 @@ called.
 # External links
 - [`ncclSend`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclsend)
 """
-function Send(sendbuf, comm::Communicator; dest::Integer, stream::CuStream=default_device_stream(comm))
+function Send(sendbuf, comm::Communicator; dest::Integer,
+              stream::CuStream=default_device_stream(comm))
     count = length(sendbuf)
     datatype = ncclDataType_t(eltype(sendbuf))
     ncclSend(sendbuf, count, datatype, dest, comm, stream)
@@ -22,7 +23,7 @@ end
     NCCL.Recv!(
         recvbuf, comm::Communicator;
         source::Integer,
-        stream::CuStream = default_device_stream(comm))
+        stream::CuStream = default_device_stream(comm))
     )
 
 Write the data from a matching [`Send`](@ref) on rank `source` into `recvbuf`.
@@ -31,7 +32,8 @@ Write the data from a matching [`Send`](@ref) on rank `source` into `recvbuf`.
 - [`ncclRecv`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclrecv)
 
 """
-function Recv!(recvbuf, comm::Communicator; source::Integer, stream::CuStream=default_device_stream(comm))
+function Recv!(recvbuf, comm::Communicator; source::Integer,
+               stream::CuStream=default_device_stream(comm))
     count = length(recvbuf)
     datatype = ncclDataType_t(eltype(recvbuf))
     ncclRecv(recvbuf, count, datatype, source, comm, stream)
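As with the collectives, only the keyword layout changes here. A rough point-to-point sketch, assuming a two-rank communicator `comm` already exists on each process (communicator setup is not part of this diff):

```julia
using CUDA, NCCL

buf = CUDA.rand(Float32, 1024)

if NCCL.rank(comm) == 0
    NCCL.Send(buf, comm; dest=1)        # must be matched by a Recv! on rank 1
elseif NCCL.rank(comm) == 1
    NCCL.Recv!(buf, comm; source=0)     # overwrites buf with rank 0's data
end

CUDA.synchronize()                      # wait for the enqueued communication
```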

test/runtests.jl (+6 -5)
@@ -9,16 +9,17 @@ using NCCL
 @testset "NCCL" begin
 
 @testset "Communicator" begin
+    # clique of communicators
     comms = NCCL.Communicators(CUDA.devices())
     for (i,dev) in enumerate(CUDA.devices())
        @test NCCL.rank(comms[i]) == i-1
-       @test CuDevice(comms[i]) == dev
+       @test NCCL.device(comms[i]) == dev
        @test NCCL.size(comms[i]) == length(CUDA.devices())
     end
-    id = NCCL.UniqueID()
-    #=num_devs = length(CUDA.devices())
-    comm = Communicator(num_devs, id, 0)
-    @test device(comm) == 0=#
+
+    # single communicator (with nranks=1 or this would block)
+    comm = Communicator(1, 0)
+    @test NCCL.device(comm) == CuDevice(0)
 end
 
 @testset "Allreduce!" begin
