/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "devcomm.h"
#include "collectives.h"
#include "primitives.h"

namespace {
  template<typename T, typename RedOp, typename Proto>
  __device__ __forceinline__ void runRing(ncclWorkElem *args) {
    const int tid = threadIdx.x;
    const int nthreads = args->nThreads;
    const int bid = args->coll.bid;
    const int nChannels = args->coll.nChannels;
    ncclRing *ring = &ncclShmem.channel.ring;
    int const *ringRanks = ring->devUserRanks;
    const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
    // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
    const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
    const int nranks = ncclShmem.comm.nRanks;
    const ssize_t loopSize = nChannels*chunkSize;
    const ssize_t size = args->coll.count;

    // Ring ReduceScatter: each rank has exactly one send and one receive peer.
    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto>
      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg);

    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
      ssize_t realChunkSize;
      if (Proto::Id == NCCL_PROTO_SIMPLE) {
        realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
        realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
      }
      else if (Proto::Id == NCCL_PROTO_LL)
        realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
      else if (Proto::Id == NCCL_PROTO_LL128)
        realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
      realChunkSize = int(realChunkSize);

      ssize_t chunkOffset = gridOffset + bid*int(realChunkSize);

      /////////////// begin ReduceScatter steps ///////////////
      ssize_t offset;
      int nelem = min(realChunkSize, size-chunkOffset);
      int rankDest;

      // step 0: push data to next GPU
      rankDest = ringRanks[nranks-1];
      offset = chunkOffset + rankDest * size;
      prims.send(offset, nelem);

      // k-2 steps: reduce and copy to next GPU
      for (int j=2; j<nranks; ++j) {
        rankDest = ringRanks[nranks-j];
        offset = chunkOffset + rankDest * size;
        prims.recvReduceSend(offset, nelem);
      }

      // step k-1: reduce this buffer and data, which will produce the final result
      rankDest = ringRanks[0];
      offset = chunkOffset + rankDest * size;
      prims.recvReduceCopy(offset, chunkOffset, nelem, /*postOp=*/true);
    }
  }
}

template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(ncclWorkElem *args) {
    using Proto = ProtoSimple<REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS>;
    runRing<T, RedOp, Proto>(args);
  }
};

template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
  __device__ __forceinline__ void run(ncclWorkElem *args) {
    runRing<T, RedOp, ProtoLL>(args);
  }
};

template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
  __device__ __forceinline__ void run(ncclWorkElem *args) {
    runRing<T, RedOp, ProtoLL128>(args);
  }
};
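
/* Illustrative host-side usage sketch (kept as a comment so it does not affect
 * this device code): the ring ReduceScatter kernels above are reached through
 * the public ncclReduceScatter() API. The buffer names, the element count, and
 * the pre-initialized `comm`/`stream` handles below are assumptions for
 * illustration only, not part of this file.
 *
 *   size_t recvcount = 1 << 20;  // elements each rank receives
 *   // sendbuff holds nranks*recvcount elements; rank r receives the reduced
 *   // chunk r of all ranks' send buffers into its recvbuff.
 *   ncclReduceScatter(sendbuff, recvbuff, recvcount, ncclFloat, ncclSum, comm, stream);
 */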