From 5b7b6e8b5cab96f706bef85096671a04de306e73 Mon Sep 17 00:00:00 2001 From: Sreevatsa Anantharamu Date: Wed, 19 Feb 2025 17:08:16 +0000 Subject: [PATCH 1/3] First version of recvbuff exchange. --- apps/nccl/src/broadcast.hpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/apps/nccl/src/broadcast.hpp b/apps/nccl/src/broadcast.hpp index 6d52c963..7964a2d9 100644 --- a/apps/nccl/src/broadcast.hpp +++ b/apps/nccl/src/broadcast.hpp @@ -20,16 +20,28 @@ __global__ void __launch_bounds__(1024, 1) const size_t nThread = blockDim.x * gridDim.x; const size_t nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; + const size_t peerRootIdx = (root == rank) ? nPeer : ((root < rank) ? root : (root - 1)); __shared__ mscclpp::DeviceHandle memChans[NRANKS_PER_NODE - 1]; if (threadIdx.x < nPeer) { - memChans[threadIdx.x] = memoryChannels[chanOffset + threadIdx.x]; memChans[threadIdx.x].relaxedSignal(); memChans[threadIdx.x].wait(); } + +// if (threadIdx.x < nPeer) { +// // My Id in peer. +// const size_t myIdInPeer = (threadIdx.x < rank) ? (rank - 1) : rank; +// const size_t offset = myIdInPeer * sizeof(void *); +// memChans[threadIdx.x] = memoryChannels[chanOffset + threadIdx.x]; +// // Write recvbuff (64 bytes). +// void **dst = reinterpret_cast(memChans[threadIdx.x].dst_); // Peer's scratchbuff. +// *(dst + offset) = recvbuff; +// memChans[threadIdx.x].signal(); +// //memChans[threadIdx.x].relaxedSignal(); +// memChans[threadIdx.x].wait(); +// } __syncthreads(); - const size_t peerRootIdx = (root == rank) ? nPeer : ((root < rank) ? root : (root - 1)); const size_t bytesPerGPU = nelemsPerGPU * sizeof(int); const size_t bytes = bytesPerGPU; From 67f1e000797a5ab22f1699870389e620882db9be Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 19 Feb 2025 17:14:54 +0000 Subject: [PATCH 2/3] fixed memchans assignment. --- apps/nccl/src/broadcast.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/nccl/src/broadcast.hpp b/apps/nccl/src/broadcast.hpp index 7964a2d9..c152fb3f 100644 --- a/apps/nccl/src/broadcast.hpp +++ b/apps/nccl/src/broadcast.hpp @@ -24,6 +24,7 @@ __global__ void __launch_bounds__(1024, 1) __shared__ mscclpp::DeviceHandle memChans[NRANKS_PER_NODE - 1]; if (threadIdx.x < nPeer) { + memChans[threadIdx.x] = memoryChannels[chanOffset + threadIdx.x]; memChans[threadIdx.x].relaxedSignal(); memChans[threadIdx.x].wait(); } From c5f3b60adeac4cc767a76e1c9aed2559490b3efc Mon Sep 17 00:00:00 2001 From: Sreevatsa Anantharamu Date: Wed, 19 Feb 2025 18:47:16 +0000 Subject: [PATCH 3/3] Tested the exchange of recvbuff with scratch buff --- apps/nccl/src/broadcast.hpp | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/apps/nccl/src/broadcast.hpp b/apps/nccl/src/broadcast.hpp index c152fb3f..a41b4b3a 100644 --- a/apps/nccl/src/broadcast.hpp +++ b/apps/nccl/src/broadcast.hpp @@ -23,26 +23,31 @@ __global__ void __launch_bounds__(1024, 1) const size_t peerRootIdx = (root == rank) ? nPeer : ((root < rank) ? root : (root - 1)); __shared__ mscclpp::DeviceHandle memChans[NRANKS_PER_NODE - 1]; - if (threadIdx.x < nPeer) { - memChans[threadIdx.x] = memoryChannels[chanOffset + threadIdx.x]; - memChans[threadIdx.x].relaxedSignal(); - memChans[threadIdx.x].wait(); - } - // if (threadIdx.x < nPeer) { -// // My Id in peer. -// const size_t myIdInPeer = (threadIdx.x < rank) ? (rank - 1) : rank; -// const size_t offset = myIdInPeer * sizeof(void *); // memChans[threadIdx.x] = memoryChannels[chanOffset + threadIdx.x]; -// // Write recvbuff (64 bytes). -// void **dst = reinterpret_cast(memChans[threadIdx.x].dst_); // Peer's scratchbuff. -// *(dst + offset) = recvbuff; -// memChans[threadIdx.x].signal(); -// //memChans[threadIdx.x].relaxedSignal(); +// memChans[threadIdx.x].relaxedSignal(); // memChans[threadIdx.x].wait(); // } + + if (threadIdx.x < nPeer) { + // My Id in peer. + const size_t myIdInPeer = (threadIdx.x < rank) ? (rank - 1) : rank; + const size_t offset = myIdInPeer * sizeof(void *); + memChans[threadIdx.x] = memoryChannels[chanOffset + threadIdx.x]; + // Write recvbuff (64 bytes). + void **dst = reinterpret_cast(memChans[threadIdx.x].dst_); // Peer's scratchbuff. + *(dst + offset) = recvbuff; + memChans[threadIdx.x].signal(); + //memChans[threadIdx.x].relaxedSignal(); + memChans[threadIdx.x].wait(); + } __syncthreads(); + if (threadIdx.x < nPeer) { + void **scratch_ = reinterpret_cast(scratchbuff); // My scratchbuff. + printf("rank = %ld, recvbuff = %p, recvbuff_[0] = %p, recvbuff_[1] = %p\n", rank, recvbuff, + *scratch_, *(scratch_ + sizeof(void *))); + } const size_t bytesPerGPU = nelemsPerGPU * sizeof(int); const size_t bytes = bytesPerGPU;