
Commit d004e29

Merge pull request #51 from JuliaGPU/tb/release
Prepare for initial release
2 parents: 039bc01 + 08fddad

File tree: 7 files changed (+63 -36 lines)


Project.toml (+2 -2)
@@ -1,6 +1,6 @@
 name = "NCCL"
 uuid = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
-version = "0.2.0"
+version = "0.1.0"
 
 [deps]
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
@@ -9,4 +9,4 @@ NCCL_jll = "4d6d38e4-5b87-5e63-912a-873ff2d649b7"
 
 [compat]
 CEnum = "0.2, 0.3, 0.4, 0.5"
-julia = "1.6"
+julia = "1.8"

docs/src/index.md (+4 -3)
@@ -1,6 +1,6 @@
 # NCCL.jl
 
-A Julia wrapper for the [NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl).
+A Julia wrapper for the [NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl).
 
 # API
 
@@ -18,8 +18,9 @@ version()
 ## Communicators
 
 ```@docs
+Communicator
 Communicators
-CUDA.CuDevice(comm::Communicator)
+device(comm::Communicator)
 size(comm::Communicator)
 rank(comm::Communicator)
 abort(comm::Communicator)
@@ -50,4 +51,4 @@ avg
 ```@docs
 Send
 Recv!
-```
+```

src/base.jl (-3)
@@ -17,9 +17,6 @@ function version()
     ncclGetVersion(ver_r)
     ver = ver_r[]
 
-    # nccl.h defines this as:
-    #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
-
     if ver < 2900
         major, ver = divrem(ver, 1000)
         minor, patch = divrem(ver, 100)
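The removed comment documented how nccl.h packs the version number, which the surrounding `divrem` code unpacks. As a rough illustration only (the helper `decode_nccl_version` below is hypothetical, not part of NCCL.jl):

```julia
# Hypothetical helper sketching the decoding: per the removed nccl.h macro,
# versions before 2.9 are packed as major*1000 + minor*100 + patch,
# and 2.9+ as major*10000 + minor*100 + patch.
function decode_nccl_version(ver::Integer)
    if ver < 2900
        major, rest = divrem(ver, 1000)
    else
        major, rest = divrem(ver, 10000)
    end
    minor, patch = divrem(rest, 100)
    return VersionNumber(major, minor, patch)
end

decode_nccl_version(2804)   # v"2.8.4"
decode_nccl_version(21805)  # v"2.18.5"
```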

src/collective.jl (+19 -8)
@@ -9,7 +9,8 @@ or [`NCCL.avg`](@ref)), writing the result to `recvbuf` to all ranks.
 # External links
 - [`ncclAllReduce`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclallreduce)
 """
-function Allreduce!(sendbuf, recvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm))
+function Allreduce!(sendbuf, recvbuf, op, comm::Communicator;
+                    stream::CuStream=default_device_stream(comm))
     count = length(recvbuf)
     @assert length(sendbuf) == count
     data_type = ncclDataType_t(eltype(recvbuf))
@@ -27,8 +28,10 @@ end
 Reduce the array `sendrecvbuf` using `op` (one of `+`, `*`, `min`, `max`,
 or `[`NCCL.avg`](@ref)`), writing the result inplace to all ranks.
 """
-Allreduce!(sendrecvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm) ) =
+function Allreduce!(sendrecvbuf, op, comm::Communicator;
+                    stream::CuStream=default_device_stream(comm))
     Allreduce!(sendrecvbuf, sendrecvbuf, op, comm; stream)
+end
 
 """
     NCCL.Broadcast!(
@@ -41,14 +44,17 @@ Copies array the `sendbuf` on rank `root` to `recvbuf` on all ranks.
 # External links
 - [`ncclBroadcast`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclbroadcast)
 """
-function Broadcast!(sendbuf, recvbuf, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm))
+function Broadcast!(sendbuf, recvbuf, comm::Communicator; root::Integer=0,
+                    stream::CuStream=default_device_stream(comm))
     data_type = ncclDataType_t(eltype(recvbuf))
     count = length(recvbuf)
     ncclBroadcast(sendbuf, recvbuf, count, data_type, root, comm, stream)
     return recvbuf
 end
-Broadcast!(sendrecvbuf, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm)) =
+function Broadcast!(sendrecvbuf, comm::Communicator; root::Integer=0,
+                    stream::CuStream=default_device_stream(comm))
     Broadcast!(sendrecvbuf, sendrecvbuf, comm; root, stream)
+end
 
 
 """
@@ -63,15 +69,18 @@ or `[`NCCL.avg`](@ref)`), writing the result to `recvbuf` on rank `root`.
 # External links
 - [`ncclReduce`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclreduce)
 """
-function Reduce!(sendbuf, recvbuf, op, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm))
+function Reduce!(sendbuf, recvbuf, op, comm::Communicator; root::Integer=0,
+                 stream::CuStream=default_device_stream(comm))
     data_type = ncclDataType_t(eltype(recvbuf))
     count = length(recvbuf)
     _op = ncclRedOp_t(op)
     ncclReduce(sendbuf, recvbuf, count, data_type, _op, root, comm, stream)
     return recvbuf
 end
-Reduce!(sendrecvbuf, op, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm)) =
+function Reduce!(sendrecvbuf, op, comm::Communicator; root::Integer=0,
+                 stream::CuStream=default_device_stream(comm))
     Reduce!(sendrecvbuf, sendrecvbuf, op, comm; root, stream)
+end
 
 """
     NCCL.Allgather!(
@@ -84,7 +93,8 @@ Concatenate `sendbuf` from each rank into `recvbuf` on all ranks.
 # External links
 - [`ncclAllGather`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclallgather)
 """
-function Allgather!(sendbuf, recvbuf, comm::Communicator; stream::CuStream=default_device_stream(comm))
+function Allgather!(sendbuf, recvbuf, comm::Communicator;
+                    stream::CuStream=default_device_stream(comm))
     data_type = ncclDataType_t(eltype(recvbuf))
     sendcount = length(sendbuf)
     @assert length(recvbuf) == sendcount * size(comm)
@@ -105,7 +115,8 @@ scattered over the devices such that `recvbuf` on each rank will contain the
 # External links
 - [`ncclReduceScatter`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclreducescatter)
 """
-function ReduceScatter!(sendbuf, recvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm) )
+function ReduceScatter!(sendbuf, recvbuf, op, comm::Communicator;
+                        stream::CuStream=default_device_stream(comm))
     recvcount = length(recvbuf)
     @assert length(sendbuf) == recvcount * size(comm)
     data_type = ncclDataType_t(eltype(recvbuf))
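These hunks only move the `stream` keyword onto its own line; call sites are unchanged. A rough usage sketch of the collectives touched above, assuming an already-initialized communicator `comm` (not shown in this diff) and CUDA.jl arrays; buffer sizes are illustrative:

```julia
using CUDA, NCCL

# buffers live on the device owned by `comm`
sendbuf = CUDA.ones(Float32, 1024)
recvbuf = CUDA.zeros(Float32, 1024)

NCCL.Allreduce!(sendbuf, recvbuf, +, comm)        # sum across ranks, result on every rank
NCCL.Broadcast!(sendbuf, recvbuf, comm; root=0)   # copy rank 0's buffer to every rank
NCCL.Reduce!(sendbuf, recvbuf, +, comm; root=0)   # sum across ranks, result only on rank 0

# Allgather! needs recvbuf sized by NCCL.size(comm), per the @assert above
gatherbuf = CUDA.zeros(Float32, 1024 * NCCL.size(comm))
NCCL.Allgather!(sendbuf, gatherbuf, comm)

# all calls are asynchronous on the communicator's default device stream
CUDA.synchronize()
```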

src/communicator.jl (+26 -11)
@@ -22,25 +22,42 @@ function destroy(comm::Communicator)
 end
 Base.unsafe_convert(::Type{LibNCCL.ncclComm_t}, comm::Communicator) = comm.handle
 
-# creates a new communicator (multi thread/process version)
-function Communicator(nranks::Integer, comm_id::UniqueID, rank::Integer)
+"""
+    NCCL.Communicator(nranks, rank; [unique_id]) :: Communicator
+
+Create a single communicator for use in a multi-threaded or multi-process
+environment. `nranks` is the number of ranks in the communicator, and `rank`
+is the 0-based index of the current rank. `unique_id` is an optional unique
+identifier for the communicator.
+
+# Examples
+```
+comm = Communicator(length(CUDA.devices()), id, myid())
+# this blocks until all other ranks have connected
+```
+
+# External links
+- [`ncclCommInitRank`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclCommInitRank)
+"""
+function Communicator(nranks::Integer, rank::Integer;
+                      unique_id::UniqueID=UniqueID())
+    0 <= rank < nranks || throw(ArgumentError("rank must be in [0, nranks)"))
     handle_ref = Ref{ncclComm_t}(C_NULL)
-    ncclCommInitRank(handle_ref, nranks, comm_id, rank)
+    ncclCommInitRank(handle_ref, nranks, unique_id, rank)
     c = Communicator(handle_ref[])
     return finalizer(destroy, c)
 end
 
-# creates a clique of communicators (single process version)
 """
     NCCL.Communicators(devices) :: Vector{Communicator}
 
-Construct and initialize a clique of NCCL Communicators.
+Construct and initialize a clique of NCCL Communicators over the devices
+on a single host.
 
 `devices` can either be a collection of identifiers, or `CuDevice`s.
 
 # Examples
 ```
-# initialize a clique over all devices on the host
 comms = NCCL.Communicators(CUDA.devices())
 ```
 
@@ -64,20 +81,19 @@ function Communicators(devices)
 end
 
 """
-    CuDevice(comm::Communicator) :: CuDevice
+    NCCL.device(comm::Communicator) :: CuDevice
 
 The device of the communicator
 
 # External Links
 - [`ncclCommCuDevice`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommcudevice)
 """
-function CUDA.CuDevice(comm::Communicator)
+function device(comm::Communicator)
     dev_ref = Ref{Cint}(C_NULL)
     ncclCommCuDevice(comm, dev_ref)
     return CuDevice(dev_ref[])
 end
 
-
 """
     NCCL.size(comm::Communicator) :: Int
@@ -120,15 +136,14 @@ function abort(comm::Communicator)
     return
 end
 
-
 """
     NCCL.default_device_stream(comm::Communicator) :: CuStream
 
 Get the default stream for device `devid`, or the device corresponding to
 communicator `comm`.
 """
 function default_device_stream(comm::Communicator)
-    dev = CuDevice(comm)
+    dev = device(comm)
     device!(dev) do
         stream()
     end
src/pointtopoint.jl (+6 -4)
@@ -2,7 +2,7 @@
     NCCL.Send(
         sendbuf, comm::Communicator;
         dest::Integer,
-        stream::CuStream = default_device_stream(comm))
+        stream::CuStream = default_device_stream(comm))
     )
 
 Send data from `sendbuf` to rank `dest`. A matching [`Recv!`](@ref) must also be
@@ -11,7 +11,8 @@ called.
 # External links
 - [`ncclSend`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclsend)
 """
-function Send(sendbuf, comm::Communicator; dest::Integer, stream::CuStream=default_device_stream(comm))
+function Send(sendbuf, comm::Communicator; dest::Integer,
+              stream::CuStream=default_device_stream(comm))
     count = length(sendbuf)
     datatype = ncclDataType_t(eltype(sendbuf))
     ncclSend(sendbuf, count, datatype, dest, comm, stream)
@@ -22,7 +23,7 @@ end
     NCCL.Recv!(
         recvbuf, comm::Communicator;
         source::Integer,
-        stream::CuStream = default_device_stream(comm))
+        stream::CuStream = default_device_stream(comm))
     )
 
 Write the data from a matching [`Send`](@ref) on rank `source` into `recvbuf`.
@@ -31,7 +32,8 @@ Write the data from a matching [`Send`](@ref) on rank `source` into `recvbuf`.
 - [`ncclRecv`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclrecv)
 
 """
-function Recv!(recvbuf, comm::Communicator; source::Integer, stream::CuStream=default_device_stream(comm))
+function Recv!(recvbuf, comm::Communicator; source::Integer,
+               stream::CuStream=default_device_stream(comm))
     count = length(recvbuf)
     datatype = ncclDataType_t(eltype(recvbuf))
     ncclRecv(recvbuf, count, datatype, source, comm, stream)
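As with the collectives, only the keyword layout changes here. A rough point-to-point sketch, assuming a two-rank communicator `comm` already exists on each process (communicator setup is not part of this diff):

```julia
using CUDA, NCCL

buf = CUDA.rand(Float32, 1024)

if NCCL.rank(comm) == 0
    NCCL.Send(buf, comm; dest=1)        # must be matched by a Recv! on rank 1
elseif NCCL.rank(comm) == 1
    NCCL.Recv!(buf, comm; source=0)     # overwrites buf with rank 0's data
end

CUDA.synchronize()                      # wait for the enqueued communication
```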

test/runtests.jl (+6 -5)
@@ -9,16 +9,17 @@ using NCCL
 @testset "NCCL" begin
 
 @testset "Communicator" begin
+    # clique of communicators
     comms = NCCL.Communicators(CUDA.devices())
     for (i,dev) in enumerate(CUDA.devices())
        @test NCCL.rank(comms[i]) == i-1
-       @test CuDevice(comms[i]) == dev
+       @test NCCL.device(comms[i]) == dev
        @test NCCL.size(comms[i]) == length(CUDA.devices())
     end
-    id = NCCL.UniqueID()
-    #=num_devs = length(CUDA.devices())
-    comm = Communicator(num_devs, id, 0)
-    @test device(comm) == 0=#
+
+    # single communicator (with nranks=1 or this would block)
+    comm = Communicator(1, 0)
+    @test NCCL.device(comm) == CuDevice(0)
 end
 
 @testset "Allreduce!" begin
