Add new API for creating a reduction operation which multiplies the input by a rank-specific scalar before doing an inter-rank summation (see: ncclRedOpCreatePreMulSum). Improve CollNet (SHARP) performance of ncclAllReduce when captured in a CUDA Graph via user buffer registration. Add environment variable NCCL_NET_PLUGIN="<suffix>" to allow user to choose among multiple NCCL net plugins by substituting into "libnccl-net-<suffix>.so". Fix memory leak of NVB connections. Fix topology detection of IB Virtual Functions (SR-IOV).
90 lines
3.6 KiB
C++
90 lines
3.6 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#include "devcomm.h"
|
|
#include "collectives.h"
|
|
#include "primitives.h"
|
|
|
|
namespace {
  // Ring ReduceScatter for a single work element, shared by all protocols
  // (the protocol is selected through the Proto template parameter).
  //
  // For every chunk owned by this channel the routine:
  //   1. sends the slice addressed by ringRanks[nranks-1] to the next peer,
  //   2. for ringRanks[nranks-2] .. ringRanks[1], receives, reduces and
  //      forwards the matching slice,
  //   3. receives and reduces the slice addressed by ringRanks[0] and copies
  //      the result to output offset chunkOffset, with postOp enabled.
  //
  // Template parameters:
  //   T      element type; all sizes and offsets below are counted in T.
  //   RedOp  reduction operator type forwarded to Primitives.
  //   Proto  protocol policy providing Id, calcBytePerStep() and
  //          calcBytePerGrain().
  // Parameter:
  //   args   work element supplying nThreads, coll.bid, coll.nChannels,
  //          coll.count, coll.lastChunkSize, coll.redOpArg and the
  //          send/receive buffers.
  template<typename T, typename RedOp, typename Proto>
  __device__ __forceinline__ void runRing(ncclWorkElem *args) {
    // Ring topology of the current channel and communicator size.
    ncclRing *ring = &ncclShmem.channel.ring;
    int const *ringRanks = ring->devUserRanks;
    const int nranks = ncclShmem.comm.nRanks;

    // Per-launch parameters taken from the work element.
    const int tid = threadIdx.x;
    const int nthreads = args->nThreads;
    const int bid = args->coll.bid;
    const int nChannels = args->coll.nChannels;
    const ssize_t count = args->coll.count;

    // Upper bound on the number of elements one channel moves per iteration.
    const ssize_t maxChunkElems = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
    // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
    const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
    // Elements consumed by all channels together in one iteration.
    const ssize_t stride = nChannels*maxChunkElems;

    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto>
      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg);

    for (ssize_t base = 0; base < count; base += stride) {
      // Elements still to be processed starting at this iteration.
      const ssize_t remaining = count - base;

      // Pick the chunk size for this iteration; the rule depends on the protocol.
      ssize_t chunkElems;
      if (Proto::Id == NCCL_PROTO_SIMPLE) {
        chunkElems = min(maxChunkElems, divUp(remaining, nChannels));
        chunkElems = roundUp(chunkElems, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
      } else if (Proto::Id == NCCL_PROTO_LL) {
        chunkElems = remaining < stride ? args->coll.lastChunkSize : maxChunkElems;
      } else if (Proto::Id == NCCL_PROTO_LL128) {
        chunkElems = min(divUp(remaining, nChannels*minChunkSizeLL128)*minChunkSizeLL128, maxChunkElems);
      }
      // Narrow to int range, as the element counts handed to prims are ints.
      chunkElems = int(chunkElems);

      // Start of this channel's chunk within one rank's slice.
      ssize_t chunkOffset = base + bid*int(chunkElems);

      /////////////// begin ReduceScatter steps ///////////////
      // The last chunk may be shorter than chunkElems.
      int nelem = min(chunkElems, count-chunkOffset);
      ssize_t offset;

      // step 0: push data to next GPU
      offset = chunkOffset + ringRanks[nranks-1] * count;
      prims.send(offset, nelem);

      // k-2 steps: reduce and copy to next GPU
      for (int idx = nranks-2; idx >= 1; --idx) {
        offset = chunkOffset + ringRanks[idx] * count;
        prims.recvReduceSend(offset, nelem);
      }

      // step k-1: reduce this buffer and data, which will produce the final result
      offset = chunkOffset + ringRanks[0] * count;
      prims.recvReduceCopy(offset, chunkOffset, nelem, /*postOp=*/true);
    }
  }
}
|
|
|
|
// ReduceScatter, ring algorithm, Simple protocol: runs runRing with a
// ProtoSimple policy parameterized by the ReduceScatter chunk/slice step
// constants.
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
  // Executes the work element described by args.
  __device__ __forceinline__ void run(ncclWorkElem *args) {
    runRing<T, RedOp, ProtoSimple<REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS>>(args);
  }
};
|
|
|
|
// ReduceScatter, ring algorithm, LL protocol: runs runRing with the ProtoLL
// policy.
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
  // Executes the work element described by args.
  __device__ __forceinline__ void run(ncclWorkElem *args) {
    using Proto = ProtoLL;
    runRing<T, RedOp, Proto>(args);
  }
};
|
|
|
|
// ReduceScatter, ring algorithm, LL128 protocol: runs runRing with the
// ProtoLL128 policy.
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
  // Executes the work element described by args.
  __device__ __forceinline__ void run(ncclWorkElem *args) {
    using Proto = ProtoLL128;
    runRing<T, RedOp, Proto>(args);
  }
};
|