/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"

template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int nthreads = args->coll.nThreads-WARP_SIZE;
  const int bid = args->coll.bid;
  const int nChannels = args->coll.nChannels;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
  struct ncclRing* ring = &channel->ring;
  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
  const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
  const int nranks = comm->nRanks;
  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
  const ssize_t size = args->coll.count;

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->sendbuff;
  T * __restrict__ thisOutput = (T*)args->recvbuff;

  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm);

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
    ssize_t chunkOffset = gridOffset + bid*realChunkSize;

    /////////////// begin AllGather steps ///////////////
    ssize_t offset;
    int nelem = min(realChunkSize, size-chunkOffset);
    int rankDest;

    // step 0: push data to next GPU
    rankDest = ring->devUserRanks[0];
    offset = chunkOffset + rankDest * size;

    if (thisInput + chunkOffset == thisOutput + offset) { // In place
      prims.directSend(thisInput+chunkOffset, offset, nelem);
    } else {
      prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
    }

    // k-2 steps: copy to next GPU
    for (int j=1; j<nranks-1; ++j) {
      rankDest = ring->devUserRanks[nranks-j];
      offset = chunkOffset + rankDest * size;

      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
    }

    // Make final copy from buffer to dest.
    rankDest = ring->devUserRanks[1];
    offset = chunkOffset + rankDest * size;

    // Final wait/copy.
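    // The chunk received here has already traveled the full nranks-1 hops,
    // so it is only stored into the output and not forwarded any further.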
    prims.directRecv(thisOutput+offset, offset, nelem);
  }
}

template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }

template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }

template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int nthreads = args->coll.nThreads;
  const int bid = args->coll.bid;
  const int nChannels = args->coll.nChannels;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
  struct ncclRing* ring = &channel->ring;
  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
  const int nranks = comm->nRanks;
  const ssize_t loopSize = nChannels*chunkSize;
  const ssize_t size = args->coll.count;

  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->sendbuff;
  T * __restrict__ thisOutput = (T*)args->recvbuff;

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    if (size-gridOffset < loopSize) {
      chunkSize = args->coll.lastChunkSize;
    }
    ssize_t chunkOffset = gridOffset + bid*chunkSize;

    /////////////// begin AllGather steps ///////////////
    ssize_t offset;
    int nelem = min(chunkSize, size-chunkOffset);
    int rankDest;

    // step 0: push data to next GPU
    rankDest = ring->devUserRanks[0];
    offset = chunkOffset + rankDest * size;

    if (thisInput + chunkOffset == thisOutput + offset) { // In place
      LLprims.send(thisInput+chunkOffset, nelem);
    } else {
      LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
    }

    // k-2 steps: copy to next GPU
    for (int j=1; j<nranks-1; ++j) {
      rankDest = ring->devUserRanks[nranks-j];
      offset = chunkOffset + rankDest * size;

      LLprims.recvCopySend(thisOutput+offset, nelem);
    }

    // step k-1: final store
    rankDest = ring->devUserRanks[1];
    offset = chunkOffset + rankDest * size;

    LLprims.recv(thisOutput+offset, nelem);
  }
}

template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }

template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }

#include "prims_ll128.h"

template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int nthreads = args->coll.nThreads;
  const int bid = args->coll.bid;
  const int nChannels = args->coll.nChannels;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
  struct ncclRing* ring = &channel->ring;
  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
  // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
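  // minChunkSize scales the per-thread shared-memory element count
  // (NCCL_LL128_SHMEM_ELEMS_PER_THREAD uint64_t per thread) across all threads and
  // converts it from line elements to payload elements of T (the
  // DATAELEMS/LINEELEMS ratio), then applies the /2 discussed above.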
  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
  const int nranks = comm->nRanks;
  const ssize_t loopSize = nChannels*chunkSize;
  const ssize_t size = args->coll.count;

  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->sendbuff;
  T * __restrict__ thisOutput = (T*)args->recvbuff;

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
    ssize_t chunkOffset = gridOffset + bid*chunkSize;

    /////////////// begin AllGather steps ///////////////
    ssize_t offset;
    int nelem = min(chunkSize, size-chunkOffset);
    int rankDest;

    // step 0: push data to next GPU
    rankDest = ring->devUserRanks[0];
    offset = chunkOffset + rankDest * size;

    if (thisInput + chunkOffset == thisOutput + offset) { // In place
      LLprims.send(thisInput+chunkOffset, nelem);
    } else {
      LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
    }

    // k-2 steps: copy to next GPU
    for (int j=1; j<nranks-1; ++j) {
      rankDest = ring->devUserRanks[nranks-j];
      offset = chunkOffset + rankDest * size;

      LLprims.recvCopySend(thisOutput+offset, nelem);
    }

    // step k-1: final store
    rankDest = ring->devUserRanks[1];
    offset = chunkOffset + rankDest * size;

    LLprims.recv(thisOutput+offset, nelem);
  }
}

template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }

template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { }
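
/*
 * Note: the Tree and CollNet variants above are empty stubs; AllGather is only
 * implemented over the Ring algorithm, presumably because the kernel-generation
 * macros expect one entry per (algorithm, protocol) pair.
 *
 * Illustrative host-side sketch (not part of this file) of how these kernels
 * are ultimately reached through the public API; `sendbuff`, `recvbuff`,
 * `count`, `comm` and `stream` are assumed to have been set up by the caller:
 *
 *   // Every rank contributes `count` elements; `recvbuff` must hold
 *   // nranks*count elements and ends up ordered by rank, matching the
 *   // `offset = chunkOffset + rankDest * size` layout used above.
 *   ncclResult_t res = ncclAllGather(sendbuff, recvbuff, count, ncclFloat, comm, stream);
 */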