Add new NVLS algorithm for allreduce using NVLink SHARP (intra-node only).
Add new config options: cgaClusterSize, minCTAs, maxCTAs, netName (usage sketch below).
Enable LL128 when we use PXN to close rings.
Update NVTX3 includes.
Fix crash when one CollNet (SHARP) rail fails to initialize.
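The new communicator options listed above are plain fields on ncclConfig_t, set before communicator creation. A minimal, hedged sketch (the chosen values and the helper name are illustrative only; the usual rank/ncclUniqueId bootstrap is assumed to have happened already):

#include <nccl.h>

// Sketch: create a communicator with the options named in the changelog entry.
ncclResult_t createTunedComm(ncclComm_t* comm, int nRanks, int myRank, ncclUniqueId id) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.cgaClusterSize = 2;   // CGA cluster size hint for kernels (sm90+)
  config.minCTAs = 4;          // lower bound on CTAs/channels NCCL may pick
  config.maxCTAs = 16;         // upper bound on CTAs/channels
  config.netName = "IB";       // request a specific network plugin by name
  return ncclCommInitRankConfig(comm, nRanks, id, myRank, &config);
}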
Sylvain Jeaugey 2023-02-27 02:48:21 -08:00
parent f3d5166783
commit 5d3ab08b69
72 changed files with 4541 additions and 2391 deletions

View File

@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
- NCCL_MINOR := 16
+ NCCL_MINOR := 17
- NCCL_PATCH := 5
+ NCCL_PATCH := 1
NCCL_SUFFIX :=
PKG_REVISION := 1

View File

@@ -12,7 +12,8 @@ INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
  misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
  misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
+ misc/ipcsocket.cc \
- transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
+ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \
  collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
  graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
@@ -62,7 +63,7 @@ ALWAYS_REBUILD:
-include $(DEPFILES)
$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
- $(INCDIR)/nccl.h : nccl.h.in
+ $(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk
# NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z))
@$(eval NCCL_VERSION := $(shell printf "%d%02d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
mkdir -p $(INCDIR)

View File

@@ -386,6 +386,24 @@ ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank,
  return ncclSuccess;
}
// IntraNode in-place Broadcast
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size) {
if (nranks == 1) return ncclSuccess;
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - ENTER", rank, nranks, root, size);
if (rank == root) {
for (int i=0; i<nranks; i++) {
if (i != root) NCCLCHECK(bootstrapSend(commState, ranks[i], /*tag=*/ranks[i], bcastData, size));
}
}
else {
NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/rank, bcastData, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size);
return ncclSuccess;
}
ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
  // New unex
  struct unexConn* unex;
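For reference, a hedged host-side sketch of how the new in-place broadcast above is meant to be used: every rank in the intra-node group passes the same ranks[] array and buffer size, and only the root's buffer contents matter on entry. The wrapper name and payload are hypothetical.

// Illustration only. commState comes from the bootstrap network; localRanks lists
// the intra-node peers. Tag matching follows the code above.
struct blobExample { char data[128]; };   // hypothetical payload

static ncclResult_t shareBlob(void* commState, int* localRanks, int myRank,
                              int nLocalRanks, int root, struct blobExample* blob) {
  // Root fills *blob before the call; afterwards every rank holds the root's copy.
  NCCLCHECK(bootstrapIntraNodeBroadcast(commState, localRanks, myRank, nLocalRanks,
                                        root, blob, sizeof(*blob)));
  return ncclSuccess;
}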

View File

@@ -13,14 +13,15 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
  if (channel->id != -1) return ncclSuccess;
  int nRanks = comm->nRanks;
+ int nPeers = nRanks + 1 /* Collnet */ + comm->localRanks /* NVLS */;
  channel->id = channelId;
  channel->workFifoSent = 0;
  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream));
  // The extra on nRanks+1 is for collnet root (i.e. network)
- channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nRanks+1);
- NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nRanks+1, comm->deviceStream.cudaStream));
+ channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nPeers);
+ NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, comm->deviceStream.cudaStream));
  ncclCommPushCudaFree(comm, channel->devPeers);
  channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
@@ -29,7 +30,7 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));
- for (int r=0; r < nRanks+1; ++r) {
+ for (int r=0; r < nPeers; ++r) {
    for (int b=0; b < NCCL_MAX_CONNS; b++) {
      channel->peers[r].send[b].comm = comm;
      channel->peers[r].recv[b].comm = comm;

View File

@@ -97,3 +97,45 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL
    runRing<T, RedOp, ProtoLL128>(args);
  }
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*chunkSize;
const int nThreadsGather = 128;
const int nThreadsBcast = 384 + WARP_SIZE;
const int tidEndGather = nThreadsGather;
const int tidEndBcast = tidEndGather + nThreadsBcast;
using Proto = ProtoSimple<1, 1>;
if (tid < tidEndGather) {
// Gather
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndBcast) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Bcast through MC
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.send(offset, nelem);
}
}
}
};

View File

@@ -306,9 +306,9 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
      ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
      int nelem = min(direct->nHeads*chunkSize, size-offset);
      if (args->regUsed) {
-       prims.directScatter(offset, nelem, chunkSize, direct->headRank, direct->shift);
+       prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
      } else {
-       prims.scatter(offset, nelem, chunkSize, direct->headRank, direct->shift);
+       prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
      }
    }
  } else if (tid >= tidStartReduce && direct->out != -1) {
@@ -344,7 +344,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
      ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
      int nelem = min(direct->nHeads*chunkSize, size-offset);
-     prims.directGather(offset, nelem, chunkSize, direct->headRank, direct->shift);
+     prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
    }
  } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
    int group = (1*Proto::MaxGroupWidth) | (0<<16);
@@ -371,6 +371,65 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
  }
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
#if NCCL_NVLS_ENABLED
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
const int nranks = ncclShmem.comm.nRanks;
const int reduceWarps = nranks <= 6 ? 6 : 4;
const int copyWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps)/2;
const int nThreadsScatter = copyWarps*WARP_SIZE;
const int nThreadsGather = (copyWarps-1)*WARP_SIZE;
const int nThreadsReduce = (reduceWarps+1)*WARP_SIZE;
const int tidEndScatter = nThreadsScatter;
const int tidEndGather = tidEndScatter + nThreadsGather;
const int tidEndReduce = tidEndGather + nThreadsReduce;
using Proto = ProtoSimple<1, 1, COLL_UNROLL, /*NVLS=*/true>;
if (tid < tidEndScatter) {
// Scatter
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndGather) {
// Gather
int group = (2*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Reduce, broadcast through NVLS
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
#endif // NCCL_NVLS_ENABLED
}
};
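To make the thread partitioning above concrete, here is a small host-side check that only mirrors the kernel's arithmetic, assuming NCCL_MAX_NTHREADS == 640 and WARP_SIZE == 32 (the values NCCL normally uses; treat them as assumptions here):

#include <cstdio>

int main() {
  const int NCCL_MAX_NTHREADS = 640, WARP_SIZE = 32;     // assumed constants
  int cases[2] = {4, 8};
  for (int c = 0; c < 2; c++) {
    int nranks = cases[c];
    int reduceWarps = nranks <= 6 ? 6 : 4;
    int copyWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps)/2;
    int nThreadsScatter = copyWarps*WARP_SIZE;            // feeds nvls->up via scatter
    int nThreadsGather  = (copyWarps-1)*WARP_SIZE;        // drains nvls->up via gather
    int nThreadsReduce  = (reduceWarps+1)*WARP_SIZE;      // recvSend loop on nvls->down
    // nranks=4: scatter=224 gather=192 reduce=224 -> 640
    // nranks=8: scatter=256 gather=224 reduce=160 -> 640
    printf("nranks=%d: scatter=%d gather=%d reduce=%d total=%d\n", nranks,
           nThreadsScatter, nThreadsGather, nThreadsReduce,
           nThreadsScatter + nThreadsGather + nThreadsReduce);
  }
  return 0;
}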
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(ncclWorkElem *args) {

View File

@@ -11,31 +11,23 @@
#include "devcomm.h"
#include "op128.h"
- #if __CUDA_ARCH__ >= 800
- #define COLL_UNROLL 8
- #else
- #define COLL_UNROLL 4
- #endif
+ #define COLL_UNROLL (ncclCollUnroll())
#define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree
typedef void(*ncclKern_t)();
extern __device__ ncclKern_t ncclFuncs[];
struct ncclShmemGroup {
-   ncclConnInfo *recvConns[NCCL_MAX_DIRECT_ARITY];
-   ncclConnInfo *sendConns[NCCL_MAX_DIRECT_ARITY];
-   void* srcs[NCCL_MAX_DIRECT_ARITY+1];
-   void* dsts[NCCL_MAX_DIRECT_ARITY+1];
-   int totalSendSize[NCCL_MAX_SLICE_PER_CHUNK];
+   ncclConnInfo *recvConns[NCCL_MAX_NVLS_ARITY];
+   ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
+   void* srcs[NCCL_MAX_NVLS_ARITY+1];
+   void* dsts[NCCL_MAX_NVLS_ARITY+1];
+   int nvlsRecv;
};
struct ncclShmemData {
-   union {
-     uint64_t ll128warp[NCCL_LL128_MAX_NTHREADS/WARP_SIZE][NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE];
-     struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
-   };
-   uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
+   struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
+   uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1];
  int channelId;
  int aborted;
  alignas(16) struct ncclDevComm comm;
@@ -45,6 +37,15 @@ struct ncclShmemData {
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
extern __shared__ ncclShmemData ncclShmem;
#if __CUDA_ARCH__ >= 700
extern __shared__ ulong2 ncclShmemPerWarp[/*ncclShmemDynamicSize()/sizeof(ulong2)*/];
#else
extern __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
#endif
__device__ inline void* ncclScratchForWarp(int warp) {
return (char*)ncclShmemPerWarp + warp*ncclShmemScratchWarpSize();
}
__device__ inline bool barrierReduceAny(int bit) {
  uint32_t popc;
@@ -235,7 +236,8 @@ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
  IMPL_COLL4(func, TREE, devredop, type, ncclType) \
  IMPL_COLL4(func, RING, devredop, type, ncclType) \
  IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \
- IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType)
+ IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType) \
+ IMPL_COLL4(func, NVLS, devredop, type, ncclType)
#if NCCL_TYPE == 0
#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int8_t, ncclInt8)
@@ -291,4 +293,6 @@ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
#define IMPL_COLL_P(func)
#endif
+ #define NCCL_NVLS_ENABLED (__CUDA_ARCH__ >= 900 && NCCL_NVLS_SUPPORTS(NCCL_TYPE, NCCL_OP))
#endif

File diff suppressed because it is too large.

View File

@@ -9,6 +9,9 @@
#include "common.h"
__shared__ ncclShmemData ncclShmem;
#if __CUDA_ARCH__ < 700
__shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
#endif
#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
@@ -19,7 +22,8 @@ __shared__ ncclShmemData ncclShmem;
  NCCL_FUNC5(func, TREE, devredop, type, nullify), \
  NCCL_FUNC5(func, RING, devredop, type, nullify), \
  NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
- NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify)
+ NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \
+ NCCL_FUNC5(func, NVLS, devredop, type, nullify)
#if defined(__CUDA_BF16_TYPES_EXIST__)
// Must be consistent with ncclDataType_t

View File

@@ -6,7 +6,7 @@
#include "devcomm.h"
#include "collectives.h"
- #include "reduce_kernel.h"
+ #include "common_kernel.h"
#include "common.h"
namespace {
@@ -35,8 +35,10 @@ namespace {
    i1 = i1 < eltN ? i1 : eltN;
    src += i0;
    dst += i0;
-   ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 1>
-     (tid, tn, &(we->redOpArg), true, 1, &src, 1, &dst, i1-i0);
+   void *vsrc = (void*)src;
+   void *vdst = (void*)dst;
+   ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/1>
+     (tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0);
  }
}
}

View File

@@ -65,4 +65,290 @@ inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1
  v1 = tmp8[1];
}
template<typename T>
__device__ __forceinline__ uint32_t cvta_to_shared(T* ptr) {
return (uint32_t)__cvta_generic_to_shared(ptr);
}
template<typename T>
__device__ __forceinline__ uintptr_t cvta_to_global(T* ptr) {
return (uintptr_t)__cvta_generic_to_global(ptr);
}
template<typename T>
__device__ __forceinline__ T* cvta_from_shared(uint32_t shptr) {
T* ans;
asm("cvta.shared.u64 %0, %1;" : "=l"(ans) : "l"(uint64_t(shptr)));
return ans;
}
template<typename T>
__device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) {
T* ans;
asm("cvta.global.u64 %0, %1;" : "=l"(ans) : "l"(gptr));
return ans;
}
////////////////////////////////////////////////////////////////////////////////
// BytePack<Size>: struct of bytes.
template<int Size>
union BytePack;
template<>
union BytePack<1> {
uint8_t u8, native;
};
template<>
union BytePack<2> {
BytePack<1> half[2];
uint8_t u8[2];
uint16_t u16, native;
};
template<>
union BytePack<4> {
BytePack<2> half[2];
uint8_t u8[4];
uint16_t u16[2];
uint32_t u32, native;
};
template<>
union BytePack<8> {
BytePack<4> half[2];
uint8_t u8[8];
uint16_t u16[4];
uint32_t u32[2];
uint64_t u64, native;
};
template<>
union alignas(16) BytePack<16> {
BytePack<8> half[2];
uint8_t u8[16];
uint16_t u16[8];
uint32_t u32[4];
uint64_t u64[2];
ulong2 ul2, native;
};
template<typename T>
__device__ __forceinline__ BytePack<sizeof(T)> toPack(T value) {
union { BytePack<sizeof(T)> p; T v; };
v = value;
return p;
}
template<typename T>
__device__ __forceinline__ T fromPack(BytePack<sizeof(T)> pack) {
union { BytePack<sizeof(T)> p; T v; };
p = pack;
return v;
}
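A tiny device-side sketch of how toPack/fromPack are meant to be used together with BytePack; the function name is made up and only illustrates the round trip:

// Demonstration only: reinterpret a value as raw bytes and back; the bit pattern
// is preserved, and the bytes can be handled as integers in between.
__device__ void bytePackRoundTrip(float* dst, const float* src) {
  BytePack<sizeof(float)> p = toPack<float>(*src);  // value -> raw bytes
  p.u32 ^= 0;                                       // bytes viewed as a 32-bit integer
  *dst = fromPack<float>(p);                        // raw bytes -> value
}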
////////////////////////////////////////////////////////////////////////////////
// Load/store of BytePack<?> using integral addresses.
template<int Size> __device__ BytePack<Size> ld_global(uintptr_t addr);
template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
template<int Size> __device__ BytePack<Size> ld_shared(uint32_t addr);
template<int Size> __device__ BytePack<Size> ld_volatile_shared(uint32_t addr);
template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
// Used to define implementations for above prototypes.
#define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
asm("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
} \
template<> \
__device__ __forceinline__ BytePack<bytes> ld_volatile_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
asm("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
} \
template<> \
__device__ __forceinline__ void st_##space<bytes>(addr_cxx_ty addr, BytePack<bytes> value) { \
data_cxx_ty tmp = value.native; \
asm volatile("st." #space "." #data_ptx_ty " [%0], %1;" :: #addr_reg_ty(addr), #data_reg_ty(tmp) : "memory"); \
}
// Single-byte types use 4-byte registers since there is no 1-byte register
// character for asm blocks. See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
DEFINE_ld_st(1, uint32_t, b8, r, global, uintptr_t, l)
DEFINE_ld_st(1, uint32_t, b8, r, shared, uint32_t, r)
DEFINE_ld_st(2, uint16_t, b16, h, global, uintptr_t, l)
DEFINE_ld_st(2, uint16_t, b16, h, shared, uint32_t, r)
DEFINE_ld_st(4, uint32_t, b32, r, global, uintptr_t, l)
DEFINE_ld_st(4, uint32_t, b32, r, shared, uint32_t, r)
DEFINE_ld_st(8, uint64_t, b64, l, global, uintptr_t, l)
DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r)
#undef DEFINE_ld_st
#define DEFINE_ld_st_16(space, addr_cxx_ty, addr_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
asm("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \
return ans; \
} \
template<> \
__device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
asm("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \
return ans; \
} \
template<> \
__device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
}
DEFINE_ld_st_16(global, uintptr_t, l)
DEFINE_ld_st_16(shared, uint32_t, r)
#undef DEFINE_ld_st_16
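A hedged sketch of the integral-address load/store helpers in action, combining cvta_to_global with the BytePack accessors defined above (the copy16 helper is illustrative, not part of the patch):

// Copy one 16-byte element global-to-global through the helpers above. The
// addresses are converted once so the ld/st PTX can use the .global state space.
__device__ void copy16(void* dst, const void* src) {
  uintptr_t s = cvta_to_global(src);
  uintptr_t d = cvta_to_global(dst);
  BytePack<16> v = ld_volatile_global<16>(s);  // volatile load: never serve stale L1 data
  st_global<16>(d, v);
}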
////////////////////////////////////////////////////////////////////////////////
// Atomic load/store using c++ pointers.
__device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) {
uint64_t ans;
asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
return ans;
}
__device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
uint64_t ans;
#if __CUDA_ARCH__ >= 700
asm("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
#else
asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
#endif
return ans;
}
__device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
uint64_t ans;
#if __CUDA_ARCH__ >= 700
asm("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
#else
asm("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr)));
#endif
return ans;
}
__device__ __forceinline__ void st_volatile_global(uint64_t *ptr, uint64_t val) {
asm volatile("st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
}
__device__ __forceinline__ void st_relaxed_sys_global(uint64_t *ptr, uint64_t val) {
#if __CUDA_ARCH__ >= 700
asm volatile("st.relaxed.sys.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
#else
asm volatile("st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
#endif
}
__device__ __forceinline__ void st_release_sys_global(uint64_t *ptr, uint64_t val) {
#if __CUDA_ARCH__ >= 700
asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
#else
asm volatile("membar.sys; st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
#endif
}
__device__ __forceinline__ void fence_acq_rel_sys() {
#if __CUDA_ARCH__ >= 700
asm volatile("fence.acq_rel.sys;" ::: "memory");
#else
asm volatile("membar.sys;" ::: "memory");
#endif
}
__device__ __forceinline__ void fence_acq_rel_gpu() {
#if __CUDA_ARCH__ >= 700
asm volatile("fence.acq_rel.gpu;" ::: "memory");
#else
asm volatile("membar.gl;" ::: "memory");
#endif
}
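These acquire/release helpers are what the primitives further down build their step-counter handshake on (loadStepValue / postPeer). A minimal illustrative producer/consumer pair, with made-up names:

// Producer: make prior data stores visible, then publish the new step.
__device__ void producerPost(uint64_t* stepPtr, uint64_t step) {
  st_release_sys_global(stepPtr, step);
}
// Consumer: spin until the producer has published at least wantStep.
__device__ void consumerWait(uint64_t* stepPtr, uint64_t wantStep) {
  while (ld_acquire_sys_global(stepPtr) < wantStep) { /* spin */ }
}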
////////////////////////////////////////////////////////////////////////////////
// Multimem stores of BytePack<?>.
template<int Size>
__device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val);
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
template<>
__device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) {
asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<8>(uintptr_t addr, BytePack<8> val) {
asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.u64) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<16>(uintptr_t addr, BytePack<16> val) {
asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};"
:: "l"(addr), "r"(val.u32[0]), "r"(val.u32[1]), "r"(val.u32[2]), "r"(val.u32[3])
: "memory");
}
#else
template<int Size>
__device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val) {
// nop
}
#endif
// Warp-uniform memory copy from shared address (not generic) to global memory.
// The number of bytes copied is `min(MaxBytes, nBytesAhead)`, a negative value
// is interpeted as zero. EltSize is the guaranteed alignment of the addresses and sizes.
template<int EltSize, int MaxBytes, bool Multimem, typename IntBytes>
__device__ __forceinline__ void copyGlobalShared_WarpUnrolled(
int lane, uintptr_t dstAddr, uint32_t srcAddr, IntBytes nBytesAhead
) {
static_assert(std::is_signed<IntBytes>::value, "`IntBytes` must be a signed integral type.");
int nBytes = min(nBytesAhead, (IntBytes)MaxBytes);
int nFrontBytes = min(nBytes, (16 - int(dstAddr%16))%16);
int nMiddleBytes = (nBytes-nFrontBytes) & -16;
int nBackBytes = (nBytes-nFrontBytes) % 16;
{ int backLane = WARP_SIZE-1 - lane;
bool hasFront = lane*EltSize < nFrontBytes;
bool hasBack = backLane*EltSize < nBackBytes;
int offset = hasFront ? lane*EltSize : (nBytes - (backLane+1)*EltSize);
if (hasFront | hasBack) {
BytePack<EltSize> tmp = ld_shared<EltSize>(srcAddr+offset);
// Can't use multimem_st since it doesn't support EltSize==2
st_global<EltSize>(dstAddr+offset, tmp);
}
}
srcAddr += nFrontBytes;
int srcMisalign = EltSize < 4 ? (srcAddr%4) : 0;
srcAddr += -srcMisalign + lane*16;
dstAddr += nFrontBytes + lane*16;
nMiddleBytes -= lane*16;
#pragma unroll
for (int u=0; u < divUp(MaxBytes, WARP_SIZE*16); u++) {
if (nMiddleBytes <= 0) break;
union {
BytePack<4> b4[4];
BytePack<16> b16;
};
b4[0] = ld_shared<4>(srcAddr + 0*4);
b4[1] = ld_shared<4>(srcAddr + 1*4);
b4[2] = ld_shared<4>(srcAddr + 2*4);
b4[3] = ld_shared<4>(srcAddr + 3*4);
if (srcMisalign != 0) {
BytePack<4> b4_4 = ld_shared<4>(srcAddr + 4*4);
b4[0].u32 = __funnelshift_r(b4[0].u32, b4[1].u32, srcMisalign*8);
b4[1].u32 = __funnelshift_r(b4[1].u32, b4[2].u32, srcMisalign*8);
b4[2].u32 = __funnelshift_r(b4[2].u32, b4[3].u32, srcMisalign*8);
b4[3].u32 = __funnelshift_r(b4[3].u32, b4_4.u32, srcMisalign*8);
}
if (Multimem) multimem_st_global<16>(dstAddr, b16);
else st_global<16>(dstAddr, b16);
srcAddr += WARP_SIZE*16;
dstAddr += WARP_SIZE*16;
nMiddleBytes -= WARP_SIZE*16;
}
}
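To clarify the front/middle/back split in copyGlobalShared_WarpUnrolled, a small host-side computation that mirrors the same arithmetic on one example (illustrative only, not NCCL code):

#include <stdio.h>

int main(void) {
  long nBytes = 100;             // example: 100 bytes ahead, MaxBytes large enough
  long dstAddr = 0x1008;         // dst is 8 bytes past a 16-byte boundary
  long frontCap = (16 - dstAddr%16)%16;
  int nFrontBytes  = (int)(nBytes < frontCap ? nBytes : frontCap);  // misaligned head
  int nMiddleBytes = (int)((nBytes - nFrontBytes) & -16);           // 16B-aligned body
  int nBackBytes   = (int)((nBytes - nFrontBytes) % 16);            // tail remainder
  // Prints: front=8 middle=80 back=12, and 8+80+12 == 100
  printf("front=%d middle=%d back=%d\n", nFrontBytes, nMiddleBytes, nBackBytes);
  return 0;
}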
#endif

View File

@@ -9,6 +9,7 @@
#include <type_traits>
#include "reduce_kernel.h" // for reduction funcs
+ #include "common_kernel.h"
#include "common.h"
#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
@@ -20,12 +21,13 @@
 * to how that protocol operates with a consistent interface so that our
 * algorithm code can operate protocol parametrically.
 */
- template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL>
+ template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL, bool NVLS_1 = false>
struct ProtoSimple {
  static constexpr int Id = NCCL_PROTO_SIMPLE;
  static constexpr int SlicePerChunk = SlicePerChunk_1;
  static constexpr int StepPerSlice = StepPerSlice_1;
  static constexpr int Unroll = Unroll_1;
+ static constexpr bool NVLS = NVLS_1;
  // Data bytes (no flags etc) in one step of the fifo queue.
  __device__ static int calcBytePerStep() {

View File

@@ -255,18 +255,18 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
    }
    if (SRC) {
      data = dl.loadFinish();
-     if (SrcBuf == Input) data = MULTI<RedOp, T>().preOp(redOp, data);
+     if (SrcBuf == Input) data = applyPreOp(redOp, data);
    }
    if (RECV) {
-     data = !SRC ? peerData : MULTI<RedOp,T>()(redOp, peerData, data);
+     data = !SRC ? peerData : applyReduce(redOp, peerData, data);
      #pragma unroll MaxRecv
      for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
        peerData = readLLFinish(offset, line, i);
-       data = MULTI<RedOp,T>()(redOp, peerData, data);
+       data = applyReduce(redOp, peerData, data);
      }
    }
-   if (postOp) data = MULTI<RedOp, T>().postOp(redOp, data);
+   if (postOp) data = applyPostOp(redOp, data);
    // Send : inter-node, then intra-node, then local
    if (SEND) {

View File

@@ -82,7 +82,14 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
    if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1;
  }
  inline __device__ void postSend() {
-   if (sendConnTailPtr) { __threadfence(); *sendConnTailPtr = sendConnTail += 1; }
+   if (sendConnTailPtr) {
+     #if __CUDA_ARCH__ >= 900
+     __threadfence_system();
+     #else
+     __threadfence();
+     #endif
+     *sendConnTailPtr = sendConnTail += 1;
+   }
  }
template<int WordPerThread> template<int WordPerThread>
@@ -109,7 +116,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
    // buffer into shmem.
    int misalignment = reinterpret_cast<uintptr_t>(src) % 16;
    uint64_t *src8 = reinterpret_cast<uint64_t*>(reinterpret_cast<uintptr_t>(src) & -uintptr_t(16));
-   uint64_t *shm8 = shmemCvtPtr(ncclShmem.ll128warp[warpInBlock]);
+   uint64_t *shm8 = shmemCvtPtr((uint64_t*)ncclScratchForWarp(warpInBlock));
    #pragma unroll
    for(int g=0; g < WordPerThread/2; g++)
      if((g*WARP_SIZE + wid)*16 < misalignment + eltN*sizeof(T))
@@ -153,7 +160,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
    }
    // Write to dst if 16-byte aligned, shmem otherwise.
    int misalignment = reinterpret_cast<uintptr_t>(dst)%16;
-   uint64_t *shm8 = shmemCvtPtr(ncclShmem.ll128warp[warpInBlock]);
+   uint64_t *shm8 = shmemCvtPtr((uint64_t*)ncclScratchForWarp(warpInBlock));
    #pragma unroll
    for(int g=0; g < WordPerThread/2; g++) {
      int ix = g*WARP_SIZE - 4*(g/2) + wid - (g%2)*(wid/8);
@@ -167,7 +174,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
    __syncwarp();
    // Write rest from shmem to dst. No need to coalesce stores to 16-bytes,
    // the hardware keeps up fine.
-   T *shm = (T*)ncclShmem.ll128warp[warpInBlock];
+   T *shm = (T*)ncclScratchForWarp(warpInBlock);
    int skip = misalignment == 0 ? eltN & -EltPer16B : 0;
    for(int i=skip+wid; i < eltN; i += WARP_SIZE)
      dst[i] = shm[i];
@@ -196,6 +203,10 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
      }
      needReload &= (0 == checkAbort(spins, 0, 0));
    } while (__any_sync(WARP_MASK, needReload));
+   #pragma unroll
+   for (int u=0; u<ELEMS_PER_THREAD; u+=2)
+     load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]);
  }
/************* Finish register load **************/ /************* Finish register load **************/
@@ -206,9 +217,9 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
    if (SrcBuf == Input) {
      #pragma unroll
      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-       v[u] = MULTI<RedOp, T>().preOp(redOp, v[u]);
+       v[u] = applyPreOp(redOp, v[u]);
        if (!flagThread)
-         v[u+1] = MULTI<RedOp, T>().preOp(redOp, v[u+1]);
+         v[u+1] = applyPreOp(redOp, v[u+1]);
      }
    }
  }
@@ -218,8 +229,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
    { // Consume data from first recv
      #pragma unroll
      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-       v[u] = SRC ? MULTI<RedOp, T>()(redOp, vr[u], v[u]) : vr[u];
-       v[u+1] = SRC ? MULTI<RedOp, T>()(redOp, vr[u+1], v[u+1]) : vr[u+1];
+       v[u] = SRC ? applyReduce(redOp, vr[u], v[u]) : vr[u];
+       v[u+1] = SRC ? applyReduce(redOp, vr[u+1], v[u+1]) : vr[u+1];
      }
    }
@@ -238,20 +249,24 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
        needReload &= (0 == checkAbort(spins, i, 0));
      } while (__any_sync(WARP_MASK, needReload));
+     #pragma unroll
+     for (int u=0; u<ELEMS_PER_THREAD; u+=2)
+       load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]);
      #pragma unroll
      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-       v[u] = MULTI<RedOp, T>()(redOp, vr[u], v[u]);
-       v[u+1] = MULTI<RedOp, T>()(redOp, vr[u+1], v[u+1]);
+       v[u] = applyReduce(redOp, vr[u], v[u]);
+       v[u+1] = applyReduce(redOp, vr[u+1], v[u+1]);
      }
    }
  }
  /********************** End Recv ************************/
- if (postOp && !FuncTraits<RedOp>::IsPostOpIdentity) {
+ if (postOp) {
    #pragma unroll
    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-     v[u] = MULTI<RedOp, T>().postOp(redOp, v[u]);
-     v[u+1] = MULTI<RedOp, T>().postOp(redOp, v[u+1]);
+     v[u] = applyPostOp(redOp, v[u]);
+     v[u+1] = applyPostOp(redOp, v[u+1]);
    }
  }
@@ -282,14 +297,6 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
  __device__ __forceinline__ void GenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) {
    constexpr int SRC = SrcBuf != -1 ? 1 : 0;
    constexpr int DST = DstBuf != -1 ? 1 : 0;
-   static_assert(-1<=SrcBuf && SrcBuf < 2, "Uhoh");
-   static_assert(-1<=DstBuf && DstBuf < 2, "Uhoh");
-   static_assert(DstBuf!=Input, "Mistake?");
-   #if 0
-   assert((SrcBuf==-1) == (srcIx==-1));
-   assert((DstBuf==-1) == (dstIx==-1));
-   #endif
    T const *srcPtr = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx;
    T *dstPtr = DstBuf == -1 ? nullptr : userBufs[DstBuf] + dstIx;
    int wireOffset = WireWordPerSlice*warp + 2*wid;

View File

@@ -5,9 +5,9 @@
 ************************************************************************/
template<typename T, typename RedOp, typename Fan, int Direct,
-         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p>
+         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, bool NVLS>
class Primitives<
-   T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll>, P2p
+   T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, NVLS>, P2p
  > {
  static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
  static constexpr int Input=0, Output=1;
@@ -22,8 +22,10 @@ class Primitives<
    SizesFifoEnabled = 0x100,
    DirectWrite = 0x200,
    DirectRead = 0x400,
-   ThreadsSynced = 0x800;
+   ThreadsSynced = 0x800,
+   NvlsMinPolling = 0x1000,
+   NvlsRecv = 0x2000;
- const int tid;
+ const int tid, tidInBlock;
  int nthreads;
  int nworkers;
  const int stepSize;
@@ -41,22 +43,54 @@ class Primitives<
  int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled)
  T *directBuff; // !(flags & SizesFifoEnabled)
  };
- uint64_t volatile *connStepPtr;
+ uint64_t *connStepPtr;
  uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
  // Don't use barrier 0 as it's used by the final sync
- inline __device__ void barrier() {
-   if (nthreads == WARP_SIZE)
-     __syncwarp();
-   else
-     asm volatile("bar.sync %0, %1;" :: "r"(15-group), "r"(nthreads));
-   flags |= ThreadsSynced;
- }
- inline __device__ void subBarrier() {
-   if (nworkers == nthreads)
-     barrier();
-   else
-     asm volatile("bar.sync %0, %1;" :: "r"(8-group), "r"(nworkers));
- }
+ __device__ void barrier() {
+   flags |= ThreadsSynced;
+   if (nthreads == WARP_SIZE) __syncwarp();
+   else {
+     int bar = 15-group;
+     asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nthreads) : "memory");
+   }
+ }
+ __device__ void subBarrier() {
+   if (nworkers == WARP_SIZE) __syncwarp();
+   else {
+     int bar = (nworkers==nthreads ? 15 : 8) - group;
+     asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nworkers) : "memory");
+   }
+ }
__device__ bool barrierAny(int vote) {
flags |= ThreadsSynced;
if (nthreads == WARP_SIZE) {
return __any_sync(~0u, vote);
} else {
int ans, bar = 15-group;
asm volatile(
"{ .reg .pred p;"
" setp.ne.s32 p, %1, 0;"
" bar.red.or.pred p, %2, %3, p; "
" selp.s32 %0, 1, 0, p; }"
: "=r"(ans) : "r"(vote), "r"(bar), "r"(nthreads) : "memory");
return ans != 0;
}
}
__device__ bool subBarrierAny(int vote) {
if (nworkers == WARP_SIZE) {
return __any_sync(~0u, vote);
} else {
int ans, bar = (nworkers==nthreads ? 15 : 8) - group;
asm volatile(
"{ .reg .pred p;"
" setp.ne.s32 p, %1, 0;"
" bar.red.or.pred p, %2, %3, p; "
" selp.s32 %0, 1, 0, p; }"
: "=r"(ans) : "r"(vote), "r"(bar), "r"(nworkers) : "memory");
return ans != 0;
}
}
  inline __device__ bool checkAbort(int &spins) {
@@ -71,6 +105,19 @@ class Primitives<
    return flags & Aborted;
  }
inline __device__ uint64_t loadStepValue(uint64_t* ptr) {
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
if (NVLS && (flags & NvlsMinPolling)) {
uint64_t ans;
asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
return ans;
}
#endif
// volatile is faster than acquire but not as correct. Make sure ReduceOrCopyMulti
// loads data using volatile so it doesn't see stale data in L1.
return ld_volatile_global(ptr);
}
  template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
  __device__ __forceinline__ void waitPeer(intptr_t dstIx, intptr_t remoteIx, int offset, int nelts) {
    const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
@@ -80,7 +127,7 @@ class Primitives<
        ((flags & (Send*RoleWaitSend)) && !noSendWait)) {
      int spins = 0;
      while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
-       connStepCache = *connStepPtr;
+       connStepCache = loadStepValue(connStepPtr);
        if (checkAbort(spins)) break;
        //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
      }
@@ -119,10 +166,11 @@ class Primitives<
  }
  template<int Recv, int Send>
- inline __device__ void postPeer() {
+ inline __device__ void postPeer(bool dataStored) {
    if (flags & (Recv*RolePostRecv | Send*RolePostSend)) {
      step += StepPerSlice;
-     *connStepPtr = step;
+     if (Send && (flags & RolePostSend) && dataStored) fence_acq_rel_sys();
+     st_relaxed_sys_global(connStepPtr, step);
    }
  }
@@ -166,7 +214,7 @@ class Primitives<
    // post();
    // } // Since we no longer unroll, new branch added here
    #if __CUDA_ARCH__ < 700
-   // Yeah, so all that above don't matter a lick on older hardware.
+   // Above doesn't matter on older hardware.
    #pragma unroll SlicePerChunk
    #else
    #pragma unroll 1
@@ -181,37 +229,39 @@ class Primitives<
      subBarrier();
      /* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
       * to 0 to avoid unnecessary workload. */
-     size_t workSize = ncclShmem.aborted ? 0 : sliceSize;
-     if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
+     int workSize = ncclShmem.aborted ? 0 : sliceSize;
+     if (NVLS && ncclShmem.groups[group].nvlsRecv) {
+       void* src = ncclShmem.groups[group].srcs[0];
+       void* dst = ncclShmem.groups[group].dsts[0];
+       copyMultimemMultimem<RedOp>(tid, nworkers, ncclShmem.redOpArgs[0], postOp, src, dst, workSize,
+           cvta_to_shared(ncclScratchForWarp(tidInBlock/WARP_SIZE)));
+     } else if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
        // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
        if (Send) {
-         // (1-Send) is only there to avoid compilation errors in case MaxSend=0 (and Send=0).
-         ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, (1-Send)+MaxSend, 0>
-           (tid, nworkers, nullptr, false,
-            1, (T const**)ncclShmem.groups[group].srcs,
-            fan.nsend(), (T**)ncclShmem.groups[group].dsts+1,
+         ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, MaxSend, /*PreOpSrcs*/0>
+           (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false,
+            1, ncclShmem.groups[group].srcs,
+            fan.nsend(), ncclShmem.groups[group].dsts+1,
            workSize);
        }
      } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) {
        // For broadcast in CollNet to do empty send
-       ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
-         (tid, nworkers, ncclShmem.redOpArgs, postOp,
-          Recv, (T const**)ncclShmem.groups[group].srcs,
-          Dst, (T**)ncclShmem.groups[group].dsts,
+       ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs*/0>
+         (tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp,
+          Recv, ncclShmem.groups[group].srcs,
+          Dst, ncclShmem.groups[group].dsts,
          workSize);
      } else {
-       constexpr int PreOpN = SrcBuf != Input ? 0 :
+       constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
          DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
-       ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpN>
-         (tid, nworkers, ncclShmem.redOpArgs, postOp,
-          Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs,
-          Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts,
+       ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
+         (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
+          Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
+          Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
          workSize);
      }
      barrier(); // This barrier has a counterpart in following loop
-     if (Send && (flags & RolePostSend) && index == 0) __threadfence_system();
-     __syncwarp();
-     postPeer<Recv, Send>();
+     postPeer<Recv, Send>(0 < sliceSize);
      offset += sliceSize;
      slice += 1;
    } while (slice < SlicePerChunk && offset < nelem);
@@ -229,9 +279,7 @@ class Primitives<
        waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, 0);
      }
      barrier(); // Has couterpart in preceding worker-only loop.
-     if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system();
-     __syncwarp();
-     postPeer<Recv, Send>();
+     postPeer<Recv, Send>(0 < sliceSize);
      offset += sliceSize;
      slice += 1;
    }
@@ -242,7 +290,7 @@ class Primitives<
  // shift: peer offset to avoid all ranks sending to or receiving from same peer
  template <int DirectRecv1, int DirectSend1, int Recv, int Send>
  __device__ __forceinline__ void
- ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp) {
+ ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp) {
    constexpr int DirectRecv = 1 && Direct && DirectRecv1;
    constexpr int DirectSend = 1 && Direct && DirectSend1;
    int offset = 0; // slice offset
@@ -252,12 +300,12 @@ class Primitives<
    #pragma unroll
    for (int slice=0; slice<SlicePerChunk; ++slice) {
      int realSize = max(0, min(dataSize, peerElem-offset));
+     bool fenceNeeded = false;
      if (tid < nworkers) {
        if (Send) {
          // Scatter pre-scales data of input buffer only in non-Direct case
-         constexpr int PreOpN = DirectSend ? 0 : 1;
+         constexpr int PreOpSrcs = DirectSend ? 0 : 1;
          if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset;
-         if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] = 0; // Skip the threadfence
          // realSize is not accurate here; but intra-node does not rely on sizes FIFO
          waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
          subBarrier();
@@ -265,23 +313,23 @@ class Primitives<
          // Loop over peers
          for (int j=0; j<fan.nsend(); j++) {
            int i = (j+shift)%fan.nsend();
-           int peerOffset = i*peerElem;
+           int pOffset = i*peerOffset;
            // Skip the data I am responsible of reducing myself
-           if (skip >= 0 && i >= skip) peerOffset += peerElem;
-           const T* src0 = (T*)ncclShmem.groups[group].srcs[0] + peerOffset;
-           int realPeerSize = min(realSize, totalElem-peerOffset);
+           if (skip >= 0 && i >= skip) pOffset += peerElem;
+           void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
+           int realPeerSize = min(realSize, totalElem-pOffset);
            if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
-             ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem.redOpArgs, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize);
+             ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
              // Mark for threadfence at the end
-             if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] += realPeerSize;
+             fenceNeeded |= true;
            }
          }
        } else if (Recv) {
          if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
-         int peerOffset = index*peerElem;
-         if (skip >= 0 && index >= skip) peerOffset += peerElem;
+         int pOffset = index*peerOffset;
+         if (skip >= 0 && index >= skip) pOffset += peerElem;
          // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
-         waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+peerOffset, offset, realSize);
+         waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+pOffset, offset, realSize);
          subBarrier();
          if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
            // Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv
@@ -290,21 +338,17 @@ class Primitives<
          #pragma unroll
          for (int j=0; j<fan.nrecv(); j++) {
            int i = (j+shift)%fan.nrecv();
-           peerOffset = i*peerElem;
-           if (skip >= 0 && i >= skip) peerOffset += peerElem;
-           T* dst0 = (T*)ncclShmem.groups[group].dsts[0] + peerOffset;
-           int realPeerSize = min(realSize, totalElem-peerOffset);
-           if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>(tid, nworkers, ncclShmem.redOpArgs, postOp, 1, (const T**)ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
+           pOffset = i*peerOffset;
+           if (skip >= 0 && i >= skip) pOffset += peerElem;
+           void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
+           int realPeerSize = min(realSize, totalElem-pOffset);
+           if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
          }
        }
      }
    }
-   barrier();
-   // If we indeed send something, threadfence
-   if (Send && (flags & RolePostSend) && ncclShmem.groups[group].totalSendSize[slice] > 0 && index == 0)
-     __threadfence_system();
-   __syncwarp();
-   postPeer<Recv, Send>();
+   fenceNeeded = barrierAny(fenceNeeded);
+   postPeer<Recv, Send>(fenceNeeded);
    offset += realSize;
  }
}
@@ -320,25 +364,33 @@ class Primitives<
    }
    if (flags & RoleWaitRecv) {
      ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
if ((index == 0) && (flags & RoleWaitRecv)) {
if (conn->flags & NCCL_NVLS_MIN_POLL) {
flags |= NvlsMinPolling;
ncclShmem.groups[group].nvlsRecv = 1;
} else {
ncclShmem.groups[group].nvlsRecv = 0;
}
}
      connStepPtr = conn->tail;
-     connStepCache = *connStepPtr;
+     connStepCache = loadStepValue(connStepPtr);
      flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
      if (Direct) {
        // User buffers have been registered
-       if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
+       if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
          if (connIndex == 1 && P2p == 0) {
            flags |= DirectRead;  // scatter-reduce use direct pull
          } else {
            flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
                     (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
          }
-       } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
+       } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
          if (connIndex == 1 && P2p == 0) {
            flags |= DirectRead;  // scatter-reduce use direct pull
          } else {
            // direct read not allowed in non-register case
            // otherwise, in one-to-multi send, we could mix empty send and intermediate send
-           flags |= (conn->direct & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
+           flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
          }
        }
      }
@@ -359,8 +411,9 @@ class Primitives<
    }
    if (flags & RoleWaitSend) {
      ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
+     flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
      connStepPtr = conn->head;
-     connStepCache = *connStepPtr;
+     connStepCache = loadStepValue(connStepPtr);
      flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
      if (flags & OffsFifoEnabled)
        connOffsFifoPtr = conn->offsFifo;
@@ -371,20 +424,20 @@ class Primitives<
        connSizesFifoPtr = conn->sizesFifo;
      } else if (Direct) {
        // User buffers have been registered
-       if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
+       if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
          if (connIndex == 1 && P2p == 0) {
            flags |= DirectRead;  // scatter-reduce use direct pull
          } else {
            flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
                     (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
          }
-       } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
+       } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
          if (connIndex == 1 && P2p == 0) {
            flags |= DirectRead;  // scatter-reduce use direct pull
          } else {
            // direct read not allowed in non-register case
            // otherwise, in one-to-multi send, we could mix empty send and intermediate send
-           flags |= (conn->direct & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
+           flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
          }
        }
      }
@ -397,7 +450,7 @@ class Primitives<
int tid, int nthreads, int const *recvPeers, int const *sendPeers, int tid, int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
): ):
tid(tid), tid(tid), tidInBlock(threadIdx.x),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) { stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
// For send operations, we need an extra warp to overlap the threadfence and the copy // For send operations, we need an extra warp to overlap the threadfence and the copy
@ -412,7 +465,7 @@ class Primitives<
this->fan = Fan(nrecv, nsend); this->fan = Fan(nrecv, nsend);
constexpr int ThreadPerSync = 8; constexpr int ThreadPerSync = 8;
static_assert(MaxSend < ThreadPerSync && MaxRecv < ThreadPerSync, "Not enough threads to cover all peers"); static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
int g = tid / ThreadPerSync; int g = tid / ThreadPerSync;
int ng = nthreads / ThreadPerSync; int ng = nthreads / ThreadPerSync;
@ -566,6 +619,9 @@ class Primitives<
genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp); genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp);
} }
__device__ __forceinline__ void recvSend(int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, -1>(-1, -1, -1, eltN, postOp);
}
__device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, -1, eltN, postOp); genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, -1, eltN, postOp);
} }
@ -596,20 +652,20 @@ class Primitives<
} }
__device__ __forceinline__ void __device__ __forceinline__ void
scatter(intptr_t inpIx, int totalElem, int peerElem, int skip, int shift) { scatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false); ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
} }
__device__ __forceinline__ void __device__ __forceinline__ void
directScatter(intptr_t inpIx, int totalElem, int peerElem, int skip, int shift) { directScatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false); ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
} }
__device__ __forceinline__ void __device__ __forceinline__ void
gather(intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp=false) { gather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp=false) {
ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, skip, shift, postOp); ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, postOp);
} }
__device__ __forceinline__ void __device__ __forceinline__ void
directGather(intptr_t outIx, int totalElem, int peerElem, int skip, int shift) { directGather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, skip, shift, /*postOp=*/false); ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
} }
}; };

File diff suppressed because it is too large

View File

@ -87,3 +87,45 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROT
runRing<T, RedOp, ProtoLL128>(args); runRing<T, RedOp, ProtoLL128>(args);
} }
}; };
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*chunkSize;
const int nThreadsScatter = 128 + WARP_SIZE;
const int nThreadsReduce = 384;
const int tidEndScatter = nThreadsScatter;
const int tidEndReduce = tidEndScatter + nThreadsReduce;
using Proto = ProtoSimple<1, 1>;
if (tid < tidEndScatter) {
// Scatter
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndReduce) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Reduce through MC
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recv(offset, nelem);
}
}
}
};
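The NVLS reduce-scatter path above splits the thread block into a scatter group (128 + WARP_SIZE threads pushing each head's slice of the send buffer to the per-peer unicast connections in nvls->up) and a reduce group (384 threads pulling its own chunk back through the multicast connection nvls->down, where the switch performs the reduction). The standalone host-side sketch below, with made-up sizes, only illustrates which input region each head is handed per channel iteration given that the scatter call passes peerOffset == size; it is not NCCL code.

#include <algorithm>
#include <cstdio>

int main() {
  const long long size = 1 << 20;   // elements per rank (hypothetical)
  const int nHeads = 4;             // NVLS heads (hypothetical)
  const int nChannels = 2, bid = 0; // look at channel 0 only
  const long long chunkSize = 131072;
  const long long loopSize = (long long)nChannels * chunkSize;
  for (long long gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    long long offset = gridOffset + bid * chunkSize;
    long long nelem = std::min(chunkSize, size - offset);
    for (int head = 0; head < nHeads; head++) {
      // peerOffset == size, so head j is handed rank j's slice of the send buffer
      printf("head %d <- input [%lld, %lld)\n", head, head * size + offset, head * size + offset + nelem);
    }
  }
  return 0;
}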

View File

@ -13,12 +13,13 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
template<typename Proto> template<typename Proto>
__device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { __device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32); void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
size_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32); ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
if (args->peer == ncclShmem.comm.rank) { if (args->peer == ncclShmem.comm.rank) {
struct ncclWorkElemP2p* recvArgs = args-1; struct ncclWorkElemP2p* recvArgs = args-1;
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32); void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
if (buff != recvBuff) { if (buff != recvBuff) {
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&buff, 1, (T**)&recvBuff, count); ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>
(tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
} }
} else { } else {
int chunkSize = args->chunkSize/sizeof(T); int chunkSize = args->chunkSize/sizeof(T);

View File

@ -74,6 +74,8 @@ void ncclDebugInit() {
mask = NCCL_ALLOC; mask = NCCL_ALLOC;
} else if (strcasecmp(subsys, "CALL") == 0) { } else if (strcasecmp(subsys, "CALL") == 0) {
mask = NCCL_CALL; mask = NCCL_CALL;
} else if (strcasecmp(subsys, "NVLS") == 0) {
mask = NCCL_NVLS;
} else if (strcasecmp(subsys, "ALL") == 0) { } else if (strcasecmp(subsys, "ALL") == 0) {
mask = NCCL_ALL; mask = NCCL_ALL;
} }

View File

@ -32,7 +32,8 @@ struct ncclKernelMatch {
NCCL_FUNC5(func, TREE, devredop, type, specialized), \ NCCL_FUNC5(func, TREE, devredop, type, specialized), \
NCCL_FUNC5(func, RING, devredop, type, specialized), \ NCCL_FUNC5(func, RING, devredop, type, specialized), \
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \ NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, specialized) NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, specialized), \
NCCL_FUNC5(func, NVLS, devredop, type, specialized)
#ifdef __CUDA_BF16_TYPES_EXIST__ #ifdef __CUDA_BF16_TYPES_EXIST__
#define HAVE_BFLOAT16 1 #define HAVE_BFLOAT16 1
@ -90,34 +91,48 @@ static const ncclKernelMatch ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNum
static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */); static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */);
// Determine the maximum kernel stack size of all CUDA kernels NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
size_t ncclKernMaxLocalSize() {
ncclResult_t res = ncclSuccess; // Returns maximum kernel stack size of all CUDA kernels
int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]); ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
cudaFuncAttributes attr = {0}; constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
size_t max = 0; ncclResult_t result = ncclSuccess;
for (int i = 0; i < numNcclKerns; i++) {
CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i].kernelFn), res, error); if (maxStackSize) *maxStackSize = 0;
if (attr.localSizeBytes > max) max = attr.localSizeBytes; int carveout = ncclParamL1SharedMemoryCarveout();
// Keep track if we already visited a function pointer.
void* lru[2] = {nullptr, nullptr};
for (int i=0; i < KernelCount; i++) {
void* fn = ncclKerns[i].kernelFn;
if (fn == lru[0] || fn == lru[1]) goto next_kernel;
lru[1] = lru[0];
lru[0] = fn;
if (maxStackSize) {
cudaFuncAttributes attr = {0};
CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0);
if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
ignore0:;
}
if (carveout) {
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
cudaFuncAttributePreferredSharedMemoryCarveout, carveout),
result, ignore1);
ignore1:;
}
if (ncclShmemDynamicSize(cudaArch) != 0) {
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)),
result, next_kernel);
}
next_kernel:;
} }
return result;
error:
return (res != ncclSuccess) ? 0 : max;
} }
// Set shared memory carveout for the nccl kernels
ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut) {
ncclResult_t res = ncclSuccess;
int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
for (int i = 0; i < numNcclKerns; i++) {
CUDACHECKGOTO(cudaFuncSetAttribute(ncclKerns[i].kernelFn, cudaFuncAttributePreferredSharedMemoryCarveout, carveOut), res, error);
}
error:
return res;
}
/*****************************************************************************/ /*****************************************************************************/
/* Launch system : synchronization and CUDA kernel launch */ /* Launch system : synchronization and CUDA kernel launch */
/*****************************************************************************/ /*****************************************************************************/
@ -248,10 +263,9 @@ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelP
static ncclResult_t addCollToPlan( static ncclResult_t addCollToPlan(
struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex, struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex,
struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp, struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp,
int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[] int nCollChannels, int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[]
) { ) {
struct ncclKernelPlan::Channel *chans = plan->channels; struct ncclKernelPlan::Channel *chans = plan->channels;
int nCollChannels = comm->nChannels;
// Choose the `nBid` least loaded channels to do the work. This ensures // Choose the `nBid` least loaded channels to do the work. This ensures
// all bids go to different channels in case they need to synchronize. // all bids go to different channels in case they need to synchronize.
@ -268,9 +282,7 @@ static ncclResult_t addCollToPlan(
} }
} }
// Sort in the rest of the channels. If a channel has less work than the max // Sort in the rest of the channels. If a channel has less work than the max
// member of least[], replace that member and compute the new max. The optimal // member of least[], replace that member and compute the new max.
// algorithm uses a max-heap, but for our small sizes I suspect the better
// asymptotic complexity would be swamped by the increased instruction complexity.
for (int c=nBid; c < nCollChannels; c++) { for (int c=nBid; c < nCollChannels; c++) {
if (chans[c].collBytes < maxBytesInLeast) { if (chans[c].collBytes < maxBytesInLeast) {
least[maxIndexInLeast] = c; least[maxIndexInLeast] = c;
@ -541,8 +553,9 @@ static ncclResult_t scheduleCollTasksToPlan(
info.sliceSteps = head->sliceSteps; info.sliceSteps = head->sliceSteps;
NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks)); NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks));
if (nAggOps > 1) { if (nAggOps > 1) {
int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]); info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]);
info.nChannels = std::max(1, std::min(info.nChannels, comm->nChannels)); info.nChannels = std::max(1, std::min(info.nChannels, maxChannels));
info.algorithm = aggInfo.algorithm; info.algorithm = aggInfo.algorithm;
info.protocol = aggInfo.protocol; info.protocol = aggInfo.protocol;
info.nThreads = aggInfo.nThreads; info.nThreads = aggInfo.nThreads;
@ -565,8 +578,9 @@ static ncclResult_t scheduleCollTasksToPlan(
NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, &regBufUsed, regBufSend, regBufRecv)); NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, &regBufUsed, regBufSend, regBufRecv));
} }
int maxChannels = info.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp, NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp,
info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv)); maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv));
tasks->nTasksColl -= 1; tasks->nTasksColl -= 1;
tasks->collBytesTotal -= info.nBytes; tasks->collBytesTotal -= info.nBytes;
ncclIntruQueueDequeue(&tasks->collQueue); ncclIntruQueueDequeue(&tasks->collQueue);
@ -856,7 +870,7 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) {
struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_; struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_;
ncclResult_t result = hostStreamPlanTask(plan->comm, plan); ncclResult_t result = hostStreamPlanTask(plan->comm, plan);
if (result != ncclSuccess) { if (result != ncclSuccess) {
WARN("hostStreamPlanCallback() failed : %s\n", ncclGetErrorString(result)); WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result));
} }
} }
@ -964,7 +978,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
} }
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->deviceStream), result, failure); NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->deviceStream), result, failure);
if (persistent || comm->persistentRefs != 0) { if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) {
// We have to launch host tasks to push proxy args. We are careful to only // We have to launch host tasks to push proxy args. We are careful to only
// do this if necessary since host tasks impose a high performance cost in CUDA. // do this if necessary since host tasks impose a high performance cost in CUDA.
bool acquired = false; bool acquired = false;
@ -1005,12 +1019,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
return ncclSuccess; return ncclSuccess;
} }
#if CUDART_VERSION >= 11080
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
#define NCCL_CGA_CLUSTER_SIZE_SM90 4
NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", -2);
#endif
#if CUDART_VERSION >= 12000 #if CUDART_VERSION >= 12000
// NCCL uses the "Remote" Mem Sync domain by default // NCCL uses the "Remote" Mem Sync domain by default
NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
@ -1022,6 +1030,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
cudaStream_t launchStream = tasks->streams->stream; cudaStream_t launchStream = tasks->streams->stream;
dim3 grid = {(unsigned)plan->channelCount, 1, 1}; dim3 grid = {(unsigned)plan->channelCount, 1, 1};
dim3 block = {(unsigned)plan->threadPerBlock, 1, 1}; dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
size_t smem = ncclShmemDynamicSize(comm->cudaArch);
void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead}; void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead};
#if CUDART_VERSION >= 11080 #if CUDART_VERSION >= 11080
@ -1029,19 +1038,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
if (driverVersion >= 11080) { if (driverVersion >= 11080) {
int compCap = comm->compCap; int compCap = comm->compCap;
unsigned int clusterSize = (compCap == 90) ? NCCL_CGA_CLUSTER_SIZE_SM90 : 0; unsigned int clusterSize = (compCap == 90) ? comm->cgaClusterSize : 0;
if (ncclParamCGAClusterSize() != -2) {
clusterSize = ncclParamCGAClusterSize();
if (clusterSize > NCCL_MAX_CGA_CLUSTER_SIZE) {
static bool warned = false;
if (warned == false) {
WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.",
clusterSize, NCCL_MAX_CGA_CLUSTER_SIZE);
warned = true;
}
clusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
}
}
cudaLaunchConfig_t launchConfig = {0}; cudaLaunchConfig_t launchConfig = {0};
cudaLaunchAttribute launchAttrs[3]; cudaLaunchAttribute launchAttrs[3];
@ -1073,6 +1070,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
#endif #endif
launchConfig.gridDim = grid; launchConfig.gridDim = grid;
launchConfig.blockDim = block; launchConfig.blockDim = block;
launchConfig.dynamicSmemBytes = smem;
launchConfig.attrs = launchAttrs; launchConfig.attrs = launchAttrs;
launchConfig.numAttrs = attrs; launchConfig.numAttrs = attrs;
launchConfig.stream = launchStream; launchConfig.stream = launchStream;
@ -1082,12 +1080,12 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
} }
#endif #endif
// Standard kernel launch // Standard kernel launch
CUDACHECK(cudaLaunchKernel(fn, grid, block, args, 0, launchStream)); CUDACHECK(cudaLaunchKernel(fn, grid, block, args, smem, launchStream));
return ncclSuccess; return ncclSuccess;
} }
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) {
if (comm->persistentRefs == 0) { // implies !plan->persistent if (!(plan->persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking)) {
// If this isn't being captured and there aren't any CUDA graphs alive // If this isn't being captured and there aren't any CUDA graphs alive
// then we don't need to do our proxyOp pushing on the host stream. // then we don't need to do our proxyOp pushing on the host stream.
NCCLCHECK(hostStreamPlanTask(comm, plan)); NCCLCHECK(hostStreamPlanTask(comm, plan));
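For illustration, a standalone sketch (not NCCL code) of the CUDA 11.8+ extended launch path that the code above configures: a thread-block cluster (CGA) dimension attribute plus dynamic shared memory passed through cudaLaunchConfig_t. The kernel, grid sizes and cluster value are made up; cluster launch only succeeds on sm_90 hardware, and error handling is minimal.

#include <cuda_runtime.h>
#include <cstdio>

__global__ void dummyKernel(int* out) { if (threadIdx.x == 0) out[blockIdx.x] = blockIdx.x; }

int main() {
  int* out = nullptr;
  cudaMalloc(&out, 8 * sizeof(int));
  void* args[] = { &out };

  cudaLaunchConfig_t config = {};
  config.gridDim = dim3(8, 1, 1);
  config.blockDim = dim3(128, 1, 1);
  config.dynamicSmemBytes = 0;     // NCCL passes ncclShmemDynamicSize(cudaArch) here
  config.stream = 0;

  cudaLaunchAttribute attrs[1] = {};
  attrs[0].id = cudaLaunchAttributeClusterDimension;  // requires sm_90
  attrs[0].val.clusterDim.x = 4;                      // e.g. a CGA cluster of 4 blocks
  attrs[0].val.clusterDim.y = 1;
  attrs[0].val.clusterDim.z = 1;
  config.attrs = attrs;
  config.numAttrs = 1;

  cudaError_t err = cudaLaunchKernelExC(&config, (const void*)dummyKernel, args);
  printf("launch: %s\n", cudaGetErrorString(err));
  cudaDeviceSynchronize();
  cudaFree(out);
  return 0;
}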
@ -1161,6 +1159,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
int nAlgos = NCCL_NUM_ALGORITHMS; int nAlgos = NCCL_NUM_ALGORITHMS;
for (int a=0; a<nAlgos; a++) { for (int a=0; a<nAlgos; a++) {
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue; if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue;
if (a == NCCL_ALGO_NVLS && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float time; float time;
NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time)); NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time));
@ -1193,6 +1193,9 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
} }
ncSwitch /= 2; ncSwitch /= 2;
} }
} else if (info->algorithm == NCCL_ALGO_NVLS) {
// NVLS should not need more than 16 channels to get peak BW.
nc = comm->nvlsChannels;
} else { } else {
// Ring/Tree channel tuning // Ring/Tree channel tuning
while (info->nBytes < nc*nt*threadThreshold) { while (info->nBytes < nc*nt*threadThreshold) {
@ -1207,6 +1210,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE; if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE; if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE; if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_NVLS) nt = NCCL_MAX_NTHREADS;
} }
nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt; nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
info->nChannels = nc; info->nChannels = nc;
@ -1225,6 +1229,7 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
info->pattern = ncclPatternRing; break; info->pattern = ncclPatternRing; break;
case ncclFuncAllReduce: case ncclFuncAllReduce:
info->pattern = info->pattern =
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain : info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain :
info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown :
@ -1244,6 +1249,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
case ncclPatternPipelineFrom: case ncclPatternPipelineFrom:
case ncclPatternPipelineTo: case ncclPatternPipelineTo:
case ncclPatternCollnetChain: case ncclPatternCollnetChain:
case ncclPatternNvls:
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break; info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
case ncclPatternCollnetDirect: case ncclPatternCollnetDirect:
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break; info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break;
@ -1319,6 +1325,14 @@ comp_next:
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*8 && chunkSize > 65536) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*8 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2;
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_NVLS) {
if (chunkSize > 131072) chunkSize = 131072;
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
if ((info->nBytes < (8 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
if ((info->nBytes < (2 * (concurrentOps*chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->protocol == NCCL_PROTO_LL) { } else if (info->protocol == NCCL_PROTO_LL) {
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine); const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
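Condensed into a standalone helper (a sketch with hypothetical channel and head counts, not NCCL code), the NVLS branch above shrinks the chunk size when the message is small relative to the number of concurrent per-head operations:

#include <cstdint>
#include <cstdio>

static int64_t nvlsChunkSize(int64_t nBytes, int nChannels, int nHeads) {
  int64_t chunkSize = 131072;                              // start at 128K, as above
  uint64_t concurrentOps = (uint64_t)nChannels * nHeads;   // nChannels * nvls.nHeads
  if ((uint64_t)nBytes < 32 * (concurrentOps * chunkSize) && chunkSize > 65536) chunkSize = 65536;
  if ((uint64_t)nBytes <  8 * (concurrentOps * chunkSize) && chunkSize > 32768) chunkSize = 32768;
  if ((uint64_t)nBytes <  2 * (concurrentOps * chunkSize) && chunkSize > 16384) chunkSize = 16384;
  return chunkSize;
}

int main() {
  for (long long bytes : {1LL<<20, 16LL<<20, 256LL<<20})
    printf("%lld bytes -> chunk %lld bytes\n", bytes, (long long)nvlsChunkSize(bytes, 16, 4));
  return 0;
}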
@ -1618,6 +1632,11 @@ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) {
WARN("ncclRedOpDestroy : operator is garbage."); WARN("ncclRedOpDestroy : operator is garbage.");
return ncclInvalidArgument; return ncclInvalidArgument;
} }
if (comm == NULL) {
WARN("ncclRedOpDestroy : invalid communicator passed.");
return ncclInvalidArgument;
}
int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps); int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps);
if (comm->userRedOpCapacity <= ix || comm->userRedOps[ix].freeNext != -1) { if (comm->userRedOpCapacity <= ix || comm->userRedOps[ix].freeNext != -1) {
WARN("ncclRedOpDestroy : operator unknown to this communicator."); WARN("ncclRedOpDestroy : operator unknown to this communicator.");

View File

@ -313,8 +313,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS. // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
// We permit combining max, then min, to only use the first channels, then duplicate them. // We permit combining max, then min, to only use the first channels, then duplicate them.
nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels); nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->maxCTAs);
nChannels = comm->nChannels = copyChannels(comm, nChannels, ncclMinNchannels(), ringPrev, ringNext); nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->minCTAs), ringPrev, ringNext);
// Create rings array and check all is fine // Create rings array and check all is fine
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext)); NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));

View File

@ -461,7 +461,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank
type = node->type; type = node->type;
} }
if (type != GPU) { if (type != GPU) {
WARN("Could not find intermediate GPU between GPU rank %d and NIC %d\n", rank, netDev); WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
return ncclInternalError; return ncclInternalError;
} }
*intermediateRank = node->gpu.rank; *intermediateRank = node->gpu.rank;
@ -707,6 +707,7 @@ static int nextPow2(int v) {
} }
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
/* here we already honor comm->max/minCTAs for p2pnChannels. */
comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels()); comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()); comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());
int minChannels = comm->p2pnChannels; int minChannels = comm->p2pnChannels;
@ -734,7 +735,6 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb; for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
comm->p2pChannels[c] = mirror; comm->p2pChannels[c] = mirror;
} }
INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
return ncclSuccess; return ncclSuccess;
} }

View File

@ -765,7 +765,6 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
// SPLIT_TREE works better on older archs.
int ccMin; int ccMin;
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL)); NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));

View File

@ -815,6 +815,6 @@ ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int*
return ncclSuccess; return ncclSuccess;
} }
} }
WARN("Could not find local GPU with rank %d\n", rank); WARN("Could not find local GPU with rank %d", rank);
return ncclInternalError; return ncclInternalError;
} }

View File

@ -53,7 +53,7 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
// Latencies in us, Bandwidths in GB/s // Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 }, { 4.4, 4.4, 0 }}; static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 }, { 4.4, 4.4, 0 }, { 0, 0, 40.0 }};
// NVLink, PCI, Network // NVLink, PCI, Network
#define NCCL_HW_NVLINK 0 #define NCCL_HW_NVLINK 0
@ -63,13 +63,16 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4,
static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
{ /* NVLINK */ { /* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { .52, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 }, { /* Tree (LL/LL128/Simple)*/ { .52, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 },
/* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 } }, /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 },
/* NVLS */ { 0, 0, 0 } },
/* PCI */ /* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
/* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 } }, /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 },
/* NVLS */ { 0, 0, 0 } },
/* NET */ /* NET */
{ /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 }, { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 },
/* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 10.7 } } /* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 10.7 },
/* NVLS */ { 0, 0, 0 } }
}; };
/* Array indexes used below */ /* Array indexes used below */
@ -78,7 +81,7 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
#define HOPPER_COMPCAP_IDX 2 #define HOPPER_COMPCAP_IDX 2
// LL128 max BW per channel // LL128 max BW per channel
static const double ll128MaxBwPerCh = 20.0; static const double ll128MaxBwPerCh[3] = { 20.0, 20.0, 36.7 };
static const double llMaxBws[3][3] = { static const double llMaxBws[3][3] = {
/* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
/* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
@ -88,7 +91,7 @@ static const double llMaxBws[3][3] = {
static const double perChMaxTreeBws[3][3] = { static const double perChMaxTreeBws[3][3] = {
/* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
/* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8}, /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
/* Hopper (N1/N2/N4) */ {24.0, 23.6, 17.8}, /* Hopper (N1/N2/N4) */ {38.7, 41.4, 33.0},
}; };
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) { ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
@ -98,7 +101,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS); getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
@ -108,7 +112,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int nRanks = comm->nRanks; int nRanks = comm->nRanks;
if (nRanks <= 1) return ncclSuccess; if (nRanks <= 1) return ncclSuccess;
int compCapIndex = (minCompCap == 80 && maxCompCap == 80) ? AMPERE_COMPCAP_IDX : ((minCompCap == 90 && maxCompCap == 90) ? HOPPER_COMPCAP_IDX : VOLTA_COMPCAP_IDX); int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
int cpuArch, cpuVendor, cpuModel; int cpuArch, cpuVendor, cpuModel;
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
int index2 = nNodes <= 2 ? nNodes-1 : 2; int index2 = nNodes <= 2 ? nNodes-1 : 2;
@ -120,7 +124,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph }; struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph, ringGraph/* we only need the NVSwitch speed for NVLS*/ };
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI; for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET; for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
@ -134,20 +138,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
nNodes; nNodes;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue; if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (a == NCCL_ALGO_NVLS && p != NCCL_PROTO_SIMPLE) continue;
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0; int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter; float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
float busBw = graphs[a]->nChannels * bw; float busBw = graphs[a]->nChannels * bw;
// Various model refinements // Various model refinements
if (compCapIndex == AMPERE_COMPCAP_IDX) busBw = std::min(busBw, 235.0f); if (compCapIndex == AMPERE_COMPCAP_IDX) busBw = std::min(busBw, 235.0f);
if (compCapIndex == HOPPER_COMPCAP_IDX) busBw = std::min(busBw, 370.0f);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); } if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh*graphs[a]->nChannels); if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh*graphs[a]->nChannels); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) { if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
@ -159,7 +168,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75; if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75;
// Convert bus BW to algorithm BW // Convert bus BW to algorithm BW
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps; float ratio;
if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
else if (a == NCCL_ALGO_NVLS) ratio = .75;
else ratio = .5;
comm->bandwidths[coll][a][p] = busBw * ratio; comm->bandwidths[coll][a][p] = busBw * ratio;
comm->latencies[coll][a][p] = baseLat[a][p]; comm->latencies[coll][a][p] = baseLat[a][p];
@ -195,7 +207,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Protocols/Algorithms enable/disable, and user overrides. // Protocols/Algorithms enable/disable, and user overrides.
// All are enabled except ll128 which is enabled by default only in certain cases. // All are enabled except ll128 which is enabled by default only in certain cases.
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1 }; int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1 };
const char *protoStr = getenv("NCCL_PROTO"); const char *protoStr = getenv("NCCL_PROTO");
if (protoStr) { if (protoStr) {
@ -207,6 +219,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr); INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable)); NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
} }
// Disable NVLink SHARP if not supported
if (comm->nvlsSupport == 0 /* || comm->localRanks <= 2*/) algoEnable[NCCL_ALGO_NVLS] = 0;
// Disable CollNet if it is not supported // Disable CollNet if it is not supported
if (comm->collNetSupport == 0) { if (comm->collNetSupport == 0) {
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
@ -228,7 +244,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (pEnable == 2 && p == NCCL_PROTO_LL128) { if (pEnable == 2 && p == NCCL_PROTO_LL128) {
// Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption. // Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption.
pEnable = 1; pEnable = 1;
pEnable &= (graphs[a]->typeInter <= PATH_PXB); pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN));
pEnable &= (graphs[a]->typeIntra <= PATH_NVL); pEnable &= (graphs[a]->typeIntra <= PATH_NVL);
pEnable &= (minCompCap == maxCompCap); pEnable &= (minCompCap == maxCompCap);
switch (minCompCap) { switch (minCompCap) {
@ -239,8 +255,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
} }
} }
if (pEnable == 0) comm->bandwidths[c][a][p] = 0; if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
// Only disable algo for Allreduce since others only have one // Never disable ring for non-allreduce operations. That allows to run real apps with NCCL_ALGO=TREE.
if (c == ncclFuncAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; if (a == NCCL_ALGO_RING && c != ncclFuncAllReduce) continue;
if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
} }
if (comm->rank == 0) { if (comm->rank == 0) {
@ -284,9 +301,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
char* str = getenv("NCCL_THREAD_THRESHOLDS"); char* str = getenv("NCCL_THREAD_THRESHOLDS");
if (str) { if (str) {
INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str); INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }}; ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }};
sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2); sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { for (int a=0; a<2; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p]; if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
} }
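For reference, the environment variable parsed here carries at most six numbers: the LL, LL128 and Simple thresholds for Tree followed by the same three for Ring, e.g. NCCL_THREAD_THRESHOLDS="64 64 512 64 64 512" (illustrative values). Negative entries leave the built-in defaults untouched, which is why the array is now sized for the two parsed algorithms instead of all of NCCL_NUM_ALGORITHMS.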
@ -323,7 +340,9 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels; if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1 if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
&& info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring && info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) {
lat *= info->comm->minCompCap < 90 ? 1.9 : 1.5; // Plateau effect of ring
}
// Tree pipelining saves latency in aggregation cases // Tree pipelining saves latency in aggregation cases
int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS); int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
*time = lat * latCount + (info->nBytes) / (1000 * bw); *time = lat * latCount + (info->nBytes) / (1000 * bw);
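As a worked example of the time model on the last line (latencies are in microseconds and bandwidths in GB/s, so nBytes/(1000*bw) is also microseconds): with numPipeOps = 1, a hypothetical lat of 10 us and bw of 100 GB/s, a 67,108,864-byte operation is estimated at 10 + 67108864/(1000*100) ≈ 681 us. The change above only softens the multi-node ring/Simple plateau penalty from 1.9x to 1.5x when every GPU is Hopper (minCompCap >= 90).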

View File

@ -315,7 +315,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
ret = ncclSystemError; ret = ncclSystemError;
} }
job->state = ncclGroupJobJoined; job->state = ncclGroupJobJoined;
if (job->result != ncclSuccess) { if (job->result != ncclSuccess && ret == ncclSuccess) {
ret = job->result; ret = job->result;
errorJobAbortFlag = true; errorJobAbortFlag = true;
} }
@ -326,7 +326,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
if (*groupAbortFlag == true || errorJobAbortFlag == true) { if (*groupAbortFlag == true || errorJobAbortFlag == true) {
*job->abortFlag = 1; *job->abortFlag = 1;
ret = ncclInternalError;
} }
job = job->next; job = job->next;

View File

@ -25,6 +25,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
ncclResult_t bootstrapClose(void* commState); ncclResult_t bootstrapClose(void* commState);
ncclResult_t bootstrapAbort(void* commState); ncclResult_t bootstrapAbort(void* commState);
#endif #endif

View File

@ -53,7 +53,8 @@ struct ncclDevRedOpFull {
DECL4(func, RING, devredop, type, undef) \ DECL4(func, RING, devredop, type, undef) \
DECL4(func, TREE, devredop, type, undef) \ DECL4(func, TREE, devredop, type, undef) \
DECL4(func, COLLNET_DIRECT, devredop, type, undef) \ DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
DECL4(func, COLLNET_CHAIN, devredop, type, undef) DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
DECL4(func, NVLS, devredop, type, undef)
#if defined(__CUDA_BF16_TYPES_EXIST__) #if defined(__CUDA_BF16_TYPES_EXIST__)
#define DECL2(func, devredop, undefForFloat) \ #define DECL2(func, devredop, undefForFloat) \
@ -121,4 +122,13 @@ extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
#define REDUCE_CHUNKSTEPS 1 #define REDUCE_CHUNKSTEPS 1
#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
// We can't use the enum identifiers like ncclSum, ncclFloat, etc since this
// macro will be used in preprocessor conditionals where enums have no meaning.
#define NCCL_NVLS_SUPPORTS(/*ncclDataType_t*/ type, /*ncclDevRedOp_t*/ red) \
(((type==2 || type==3) && (red==0 || red==2 || red==3)) || \
((type==4 || type==5) && (red==0 || red==2 || red==3)) || \
((type==6 || type==9) && (red==0 || red==2 || red==3)) || \
(type==7 && red==0) || \
(type==8 && red==0))
#endif #endif
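Decoded (an illustration, not NCCL code), and assuming the usual nccl.h numbering — datatypes 2/3 = int32/uint32, 4/5 = int64/uint64, 6 = half, 7 = float, 8 = double, 9 = bfloat16; device reduction ops 0 = sum, 2 = max, 3 = min — the macro says NVLS offload covers sum/max/min for integer, half and bfloat16 data, but only sum for float and double. A standalone restatement with a few compile-time checks:

// Hypothetical restatement of NCCL_NVLS_SUPPORTS, for illustration only.
#define NVLS_SUPPORTS_DEMO(type, red) \
  ((((type)==2 || (type)==3 || (type)==4 || (type)==5 || (type)==6 || (type)==9) && \
    ((red)==0 || (red)==2 || (red)==3)) || \
   (((type)==7 || (type)==8) && (red)==0))

static_assert(NVLS_SUPPORTS_DEMO(7, 0), "float + sum is offloadable");
static_assert(!NVLS_SUPPORTS_DEMO(7, 2), "float + max is not");
static_assert(!NVLS_SUPPORTS_DEMO(1, 0), "uint8 is never offloadable");
int main() { return 0; }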

View File

@ -104,6 +104,7 @@ struct ncclChannel {
struct ncclTree tree; struct ncclTree tree;
struct ncclTree collnetChain; struct ncclTree collnetChain;
struct ncclDirect collnetDirect; struct ncclDirect collnetDirect;
struct ncclNvls nvls;
int id; // index of this channel int id; // index of this channel
uint32_t workFifoSent; // last used work index+1 uint32_t workFifoSent; // last used work index+1
uint64_t p2pOpCount; uint64_t p2pOpCount;
@ -177,8 +178,10 @@ struct ncclComm {
int nRanks; // number of GPUs in communicator int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index int cudaDev; // my cuda device index
int compCap; // compute capability of the GPU int compCap; // compute capability of the GPU
int minCompCap; // min compute capability in the communicator
int64_t busId; // my PCI bus ID in int format int64_t busId; // my PCI bus ID in int format
cpu_set_t cpuAffinity; // CPU affinity of the GPU cpu_set_t cpuAffinity; // CPU affinity of the GPU
int cudaArch; // matches __CUDA_ARCH__ of device
int node; int node;
int nNodes; int nNodes;
@ -201,6 +204,7 @@ struct ncclComm {
// Channels for collectives // Channels for collectives
int nChannels; int nChannels;
int nvlsChannels;
// Channels (per peer) for p2p // Channels (per peer) for p2p
int p2pnChannels; int p2pnChannels;
int p2pnChannelsPerPeer; int p2pnChannelsPerPeer;
@ -257,6 +261,10 @@ struct ncclComm {
int collNetSupport; int collNetSupport;
int intraHighestTransportType; int intraHighestTransportType;
// NVLink SHARP (NVLS) support
int nvlsSupport;
void* nvlsResources;
size_t channelSize; // User requested work size (bytes) for channel partitions size_t channelSize; // User requested work size (bytes) for channel partitions
// Internal streams // Internal streams
@ -288,6 +296,11 @@ struct ncclComm {
// communicator mode // communicator mode
int blocking; int blocking;
// CGA cluster size
int cgaClusterSize;
int minCTAs, maxCTAs;
// network interface name
char *netName;
// initState is to more conveniently reclaim resources when errors happen. // initState is to more conveniently reclaim resources when errors happen.
ncclResult_t initState; ncclResult_t initState;
// flag to indicate if ncclCommFinalize() is called // flag to indicate if ncclCommFinalize() is called
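The new comm fields above (cgaClusterSize, minCTAs, maxCTAs, netName) back the per-communicator configuration options. A hedged usage sketch, assuming the public ncclConfig_t carries fields of the same names in the 2.17 nccl.h (verify against the installed header), with made-up values:

#include <nccl.h>

ncclResult_t initWithConfig(ncclComm_t* comm, int nRanks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.blocking = 1;
  config.cgaClusterSize = 4;   // CGA cluster size used at kernel launch (sm_90)
  config.minCTAs = 4;          // lower bound on channels, see comm->minCTAs above
  config.maxCTAs = 16;         // upper bound on channels, see comm->maxCTAs above
  config.netName = "IB";       // select a network by name (hypothetical value)
  return ncclCommInitRankConfig(comm, nRanks, id, rank, &config);
}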

View File

@ -73,10 +73,32 @@ DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020); DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020); DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000); DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000); DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
// cuMem API support
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
#if CUDA_VERSION >= 11070 #if CUDA_VERSION >= 11070
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
#endif #endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010);
#endif
#endif #endif
/* CUDA Driver functions loaded with dlsym() */ /* CUDA Driver functions loaded with dlsym() */
@ -88,6 +110,7 @@ DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030);
ncclResult_t ncclCudaLibraryInit(void); ncclResult_t ncclCudaLibraryInit(void);
extern int ncclCudaDriverVersionCache; extern int ncclCudaDriverVersionCache;
extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()
inline ncclResult_t ncclCudaDriverVersion(int* driver) { inline ncclResult_t ncclCudaDriverVersion(int* driver) {
int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED); int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED);
@ -98,5 +121,4 @@ inline ncclResult_t ncclCudaDriverVersion(int* driver) {
*driver = version; *driver = version;
return ncclSuccess; return ncclSuccess;
} }
#endif #endif
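A heavily hedged sketch (not NCCL's transport/nvls.cc) of the single-node flow the cuMulticast* entry points above enable, following the CUDA 12.1 driver API as documented: create a multicast object, have each GPU join and bind its local backing memory, then map the multicast handle so that loads/stores through the mapped pointer are fanned out and reduced by the NVSwitch. Initialization, inter-rank handle exchange (cuMemExportToShareableHandle / cuMemImportFromShareableHandle), error paths and cleanup are omitted; verify every signature against cuda.h before relying on this.

#include <cuda.h>
#include <cstddef>

CUresult nvlsSketch(CUdevice dev, int nDevices, size_t size,
                    CUmemGenericAllocationHandle localMem, CUdeviceptr* mcPtr) {
  CUmulticastObjectProp prop = {};
  prop.numDevices = nDevices;
  prop.size = size;
  prop.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

  size_t gran = 0;
  CUresult res = cuMulticastGetGranularity(&gran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED);
  if (res != CUDA_SUCCESS) return res;

  // One rank creates the multicast object; the others would import the shared handle.
  CUmemGenericAllocationHandle mcHandle;
  res = cuMulticastCreate(&mcHandle, &prop);
  if (res != CUDA_SUCCESS) return res;

  // Every participating GPU joins the group and binds its local physical memory.
  res = cuMulticastAddDevice(mcHandle, dev);
  if (res != CUDA_SUCCESS) return res;
  res = cuMulticastBindMem(mcHandle, /*mcOffset=*/0, localMem, /*memOffset=*/0, size, /*flags=*/0);
  if (res != CUDA_SUCCESS) return res;

  // Map the multicast handle into the VA space and grant access.
  res = cuMemAddressReserve(mcPtr, size, gran, 0, 0);
  if (res != CUDA_SUCCESS) return res;
  res = cuMemMap(*mcPtr, size, 0, mcHandle, 0);
  if (res != CUDA_SUCCESS) return res;

  CUmemAccessDesc access = {};
  access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  access.location.id = dev;
  access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  return cuMemSetAccess(*mcPtr, size, &access, 1);
}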

View File

@ -15,11 +15,12 @@
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
#define NCCL_NUM_ALGORITHMS 4 // Tree/Ring/CollNet* #define NCCL_NUM_ALGORITHMS 5 // Tree/Ring/CollNet*
#define NCCL_ALGO_TREE 0 #define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1 #define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET_DIRECT 2 #define NCCL_ALGO_COLLNET_DIRECT 2
#define NCCL_ALGO_COLLNET_CHAIN 3 #define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
@ -78,6 +79,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
#define NCCL_DIRECT_NIC 0x04 #define NCCL_DIRECT_NIC 0x04
#define NCCL_IPC_WRITE 0x08 #define NCCL_IPC_WRITE 0x08
#define NCCL_IPC_READ 0x10 #define NCCL_IPC_READ 0x10
#define NCCL_NVLS_MIN_POLL 0x20
struct ncclConnInfo { struct ncclConnInfo {
// Regular comm mechanism // Regular comm mechanism
@ -85,7 +87,7 @@ struct ncclConnInfo {
uint64_t *tail; // Local for recv, remote for send uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv uint64_t *head; // Local for send, remote for recv
int direct; // Direct communication int flags; // Direct communication / other flags
int shared; // Buffers are shared int shared; // Buffers are shared
void **ptrExchange; // Pointer exchange for direct communication void **ptrExchange; // Pointer exchange for direct communication
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
@ -138,13 +140,22 @@ struct ncclTree {
struct ncclDirect { struct ncclDirect {
int depth; int depth;
int out; int out;
int nHeads; int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
int headRank; int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
int shift; int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
int up[NCCL_MAX_DIRECT_ARITY]; int up[NCCL_MAX_DIRECT_ARITY];
int down[NCCL_MAX_DIRECT_ARITY]; int down[NCCL_MAX_DIRECT_ARITY];
}; };
#define NCCL_MAX_NVLS_ARITY 8
struct ncclNvls {
int out;
int nHeads; // Number of N<->1<->net operations done in parallel; size of up/down
int headRank; // My index in 0..nHeads-1 if I am a head rank; -1 if I'm not a head rank (no local NIC)
int up[NCCL_MAX_NVLS_ARITY];
int down;
};
#define NCCL_MAX_CONNS 2 #define NCCL_MAX_CONNS 2
struct ncclChannelPeer { struct ncclChannelPeer {
struct ncclConnector send[NCCL_MAX_CONNS]; struct ncclConnector send[NCCL_MAX_CONNS];
@ -264,6 +275,7 @@ struct alignas(16) ncclDevChannel {
struct ncclTree tree; struct ncclTree tree;
struct ncclTree collnetChain; struct ncclTree collnetChain;
struct ncclDirect collnetDirect; struct ncclDirect collnetDirect;
struct ncclNvls nvls;
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
}; };
@ -288,4 +300,65 @@ struct alignas(16) ncclDevCommAndChannels {
struct ncclDevChannel channels[MAXCHANNELS]; struct ncclDevChannel channels[MAXCHANNELS];
}; };
#ifdef __CUDA_ARCH__
#define NCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define NCCL_CUDA_ARCH 0
#endif
template<typename T>
__host__ __device__ constexpr T min_constexpr(T a) { return a; }
template<typename T, typename ...Ts>
__host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) {
return min_constexpr<T>((a < b ? a : b), c...);
}
template<typename T>
__host__ __device__ constexpr T max_constexpr(T a) { return a; }
template<typename T, typename ...Ts>
__host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) {
return max_constexpr<T>((a > b ? a : b), c...);
}
// Calculate the unroll factor given:
// * bytePerPack: number of bytes accessed per instruction
// * insns: max permissible unroll value
// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
__host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) {
return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack);
}
// Note that all unroll value logic should depend on the cudaArch argument rather
// than __CUDA_ARCH__, since this logic must also run on the host, where the arch
// value is only known at runtime. By defaulting to NCCL_CUDA_ARCH, device-side
// code can omit the arch argument for brevity.
__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
// Our collective unroll should move to the same bytes&insns model as NVLS.
return cudaArch >= 800 ? 8 : 4;
}
__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
__host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; }
__host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) {
return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch));
}
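To make the arithmetic concrete, the following host-side sketch re-derives the NVLS unroll values from the constants above (64 in-flight bytes, at most 16 instructions). calcUnroll is an illustrative stand-in for ncclCalcUnroll, not part of the diff.

#include <algorithm>
#include <cstdio>

// Stand-in for ncclCalcUnroll(): unroll = min(insns, ceil(bytes / bytePerPack)).
constexpr int calcUnroll(int bytePerPack, int insns, int bytes) {
  return std::min(insns, (bytes + bytePerPack - 1) / bytePerPack);
}

static_assert(calcUnroll(16, 16, 64) == 4,  "16B packs: 4 loads keep 64B in flight");
static_assert(calcUnroll( 8, 16, 64) == 8,  "8B packs: 8 loads");
static_assert(calcUnroll( 4, 16, 64) == 16, "4B packs hit the 16-instruction cap");

int main() {
  std::printf("NVLS unroll for 16B packs: %d\n", calcUnroll(16, 16, 64));
  return 0;
}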
// The amount of dynamic shmem per warp
__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
return (max_constexpr<int>(
/*LL */0,
/*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t),
/*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16,
// NVLS needs an extra 16B to read unaligned data.
/*NVLS */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16
) + 15) & -16; // pad to 16 bytes
}
// The amount of dynamic shmem per block
__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
}
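As a rough worked example with assumed constants that are not shown in this diff (WARP_SIZE = 32, NCCL_LL128_SHMEM_ELEMS_PER_THREAD = 8, NCCL_MAX_NTHREADS = 640), the per-warp scratch on sm_90 is dominated by the SIMPLE term:

#include <algorithm>
#include <cstdio>

constexpr int kWarp    = 32;                    // assumed WARP_SIZE
constexpr int kLL128   = 8 * kWarp * 8;         // LL128 term: 2048 B
constexpr int kSimple  = (8 * kWarp + 1) * 16;  // SIMPLE term: 4112 B (unroll 8 on sm_80+)
constexpr int kNvls    = kWarp * 64 + 16;       // NVLS term: 2064 B (+16 B for unaligned reads)
constexpr int kPerWarp  = (std::max({0, kLL128, kSimple, kNvls}) + 15) & -16;  // 4112 B
constexpr int kPerBlock = kPerWarp * (640 / kWarp);  // 20 warps -> 82240 B (~80 KiB)
static_assert(kPerWarp == 4112 && kPerBlock == 82240, "SIMPLE term dominates on sm_90");

int main() { std::printf("per-warp %d B, per-block %d B\n", kPerWarp, kPerBlock); }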
#endif #endif

View File

@ -15,8 +15,7 @@
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) #define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
size_t ncclKernMaxLocalSize(); ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);

View File

@ -24,6 +24,7 @@ typedef enum : uint8_t {
ncclPatternTreeUpDown, ncclPatternTreeUpDown,
ncclPatternCollnetChain, ncclPatternCollnetChain,
ncclPatternCollnetDirect, ncclPatternCollnetDirect,
ncclPatternNvls,
ncclPatternSend, ncclPatternSend,
ncclPatternRecv ncclPatternRecv
} ncclPattern_t; } ncclPattern_t;

37
src/include/ipcsocket.h Normal file
View File

@ -0,0 +1,37 @@
/*
* Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
*
* See COPYRIGHT for license information
*/
#ifndef NCCL_IPCSOCKET_H
#define NCCL_IPCSOCKET_H
#include "nccl.h"
#include <stdio.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <memory.h>
#include <sys/un.h>
#include <inttypes.h>
#define NCCL_IPC_SOCKNAME_LEN 64
struct ncclIpcSocket {
int fd;
char socketName[NCCL_IPC_SOCKNAME_LEN];
volatile uint32_t* abortFlag;
};
ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
#endif /* NCCL_IPCSOCKET_H */

View File

@ -20,7 +20,7 @@
#define NCCL_NET_MAX_REQUESTS 8 #define NCCL_NET_MAX_REQUESTS 8
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

View File

@ -7,12 +7,12 @@
#ifndef NCCL_NVTX_H_ #ifndef NCCL_NVTX_H_
#define NCCL_NVTX_H_ #define NCCL_NVTX_H_
#include "nvtx3.hpp" #include "nvtx3/nvtx3.hpp"
#if __cpp_constexpr >= 201304L && !defined(NVTX3_RELAXED_CONSTEXPR) #if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14)
#define NVTX3_RELAXED_CONSTEXPR constexpr #define NVTX3_CONSTEXPR_IF_CPP14 constexpr
#else #else
#define NVTX3_RELAXED_CONSTEXPR #define NVTX3_CONSTEXPR_IF_CPP14
#endif #endif
// Define all NCCL-provided static schema IDs here (avoid duplicates). // Define all NCCL-provided static schema IDs here (avoid duplicates).
@ -37,7 +37,7 @@ struct nccl_domain{static constexpr char const* name{"NCCL"};};
class payload_schema { class payload_schema {
public: public:
NVTX3_RELAXED_CONSTEXPR explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
{ {
schema_attr.name = schemaName; schema_attr.name = schemaName;
schema_attr.entries = entries; schema_attr.entries = entries;
@ -74,11 +74,11 @@ class payload_schema {
#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ #define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
static const payload_schema schema{S, std::extent<decltype(S)>::value, \ static const payload_schema schema{S, std::extent<decltype(S)>::value, \
NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \
static ::nvtx3::v1::registered_string<nccl_domain> const nvtx3_func_name__{__func__}; \ static ::nvtx3::v1::registered_string_in<nccl_domain> const nvtx3_func_name__{__func__}; \
nvtxPayloadData_t nvtx3_bpl__[] = { \ nvtxPayloadData_t nvtx3_bpl__[] = { \
{NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
::nvtx3::v1::event_attributes nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \
::nvtx3::v1::domain_thread_range<nccl_domain> const nvtx3_range__{nvtx3_func_attr__}; ::nvtx3::v1::scoped_range_in<nccl_domain> const nvtx3_range__{nvtx3_func_attr__};
extern void initNvtxRegisteredEnums(); extern void initNvtxRegisteredEnums();

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,12 +1,12 @@
/* /*
* Copyright 2021 NVIDIA Corporation. All rights reserved. * Copyright 2021-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/ */
#include "nvtx3/nvToolsExt.h" #include "nvToolsExt.h"
#ifndef NVTOOLSEXT_PAYLOAD_H #ifndef NVTOOLSEXT_PAYLOAD_H
#define NVTOOLSEXT_PAYLOAD_H #define NVTOOLSEXT_PAYLOAD_H

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

File diff suppressed because it is too large

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
* *
* Licensed under the Apache License v2.0 with LLVM Exceptions. * Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information. * See https://llvm.org/LICENSE.txt for license information.

View File

@ -35,10 +35,11 @@ NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)() NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
{ {
intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1;
nvtxExtModuleSegment_t segment = { nvtxExtModuleSegment_t segment = {
0, // unused (only one segment) 0, // unused (only one segment)
NVTX3EXT_CBID_PAYLOAD_FN_NUM, NVTX3EXT_CBID_PAYLOAD_FN_NUM,
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1 fnSlots
}; };
nvtxExtModuleInfo_t module = { nvtxExtModuleInfo_t module = {

View File

@ -10,6 +10,7 @@
#include "devcomm.h" #include "devcomm.h"
#include "info.h" #include "info.h"
#include "socket.h" #include "socket.h"
#include "ipcsocket.h"
#include <pthread.h> #include <pthread.h>
#include "shm.h" #include "shm.h"
@ -161,6 +162,31 @@ struct ncclProxyProgressState {
int nextOps; int nextOps;
}; };
// Expected proxy response fifo
struct ncclExpectedProxyResponse {
void* opId;
int respSize;
bool done;
void* respBuff;
struct ncclExpectedProxyResponse* next;
};
struct ncclProxyAsyncOp {
int type;
struct ncclProxyConnection* connection;
int reqSize, respSize;
char *reqBuff, *respBuff;
void* opId;
ncclProxyAsyncOp* next;
};
struct ncclProxyLocalPeer {
struct ncclSocket sock;
int localRank;
ncclProxyAsyncOp* asyncOps;
int asyncOpCounter;
};
struct ncclProxyState { struct ncclProxyState {
// Service thread // Service thread
pthread_t thread; pthread_t thread;
@ -176,6 +202,9 @@ struct ncclProxyState {
// Progress thread // Progress thread
struct ncclProxyProgressState progressState; struct ncclProxyProgressState progressState;
// Queue of expected responses from the proxy
struct ncclExpectedProxyResponse* expectedResponses;
}; };
enum proxyConnectState { enum proxyConnectState {
@ -220,10 +249,19 @@ enum ncclProxyMsgType {
ncclProxyMsgStart = 5, ncclProxyMsgStart = 5,
ncclProxyMsgClose = 6, ncclProxyMsgClose = 6,
ncclProxyMsgAbort = 7, ncclProxyMsgAbort = 7,
ncclProxyMsgStop = 8 ncclProxyMsgStop = 8,
ncclProxyMsgConvertFd = 9 // cuMem API support
}; };
ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed
ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received
ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm); ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
#endif #endif
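A minimal sketch of the intended client-side pattern for these new entry points, mirroring the existing SharedInit usage (int request, no response payload) and assuming ncclPollProxyResponse() returns ncclInProgress while the response is still pending; exampleSharedInitAsync is illustrative, not an NCCL function.

ncclResult_t exampleSharedInitAsync(struct ncclProxyConnector* proxyConn, int p2pnChannels) {
  void* opId = &p2pnChannels;  // any locally unique identifier works
  NCCLCHECK(ncclProxyCallAsync(proxyConn, ncclProxyMsgSharedInit,
                               &p2pnChannels, sizeof(int), /*respSize=*/0, opId));
  ncclResult_t ret;
  do {  // spin (or interleave other work) until the service thread responds
    ret = ncclPollProxyResponse(proxyConn, /*respBuff=*/NULL, opId);
  } while (ret == ncclInProgress);
  return ret;  // ncclProxyCallBlocking() wraps essentially this loop
}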

View File

@ -92,6 +92,6 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed); ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
ncclResult_t ncclSocketClose(struct ncclSocket* sock); ncclResult_t ncclSocketClose(struct ncclSocket* sock);
#endif #endif

View File

@ -62,7 +62,7 @@ struct ncclTransportComm {
}; };
struct ncclTransport { struct ncclTransport {
const char name[4]; const char name[8];
ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
struct ncclTransportComm send; struct ncclTransportComm send;
struct ncclTransportComm recv; struct ncclTransportComm recv;
@ -71,6 +71,9 @@ struct ncclTransport {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm);
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
enum { collNetRecv=0, collNetSend=1 }; enum { collNetRecv=0, collNetSend=1 };
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);

View File

@ -35,13 +35,13 @@
#endif #endif
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" }; const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain" }; const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS" };
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
NCCL_PARAM(CommBlocking, "COMM_BLOCKING", 0); NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT);
static uint64_t hashUniqueId(ncclUniqueId const &id) { static uint64_t hashUniqueId(ncclUniqueId const &id) {
char const *bytes = (char const*)&id; char const *bytes = (char const*)&id;
@ -67,12 +67,8 @@ ncclResult_t initGdrCopy() {
return ncclSuccess; return ncclSuccess;
} }
NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
static bool initialized = false; static bool initialized = false;
static size_t maxLocalSizeBytes = 0;
static ncclResult_t ncclInit() { static ncclResult_t ncclInit() {
if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess; if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess;
@ -80,9 +76,6 @@ static ncclResult_t ncclInit() {
if (!initialized) { if (!initialized) {
initEnv(); initEnv();
initGdrCopy(); initGdrCopy();
maxLocalSizeBytes = ncclKernMaxLocalSize();
int carveout = ncclParamL1SharedMemoryCarveout();
if (carveout) ncclKernSetSharedMemoryCarveout(carveout);
// Always initialize bootstrap network // Always initialize bootstrap network
NCCLCHECK(bootstrapNetInit()); NCCLCHECK(bootstrapNetInit());
NCCLCHECK(ncclNetPluginInit()); NCCLCHECK(ncclNetPluginInit());
@ -210,6 +203,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream)); NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
} }
if (comm->nvlsSupport) NCCLCHECK(ncclNvlsFree(comm));
struct ncclDestructor* dtor = comm->destructorHead; struct ncclDestructor* dtor = comm->destructorHead;
while (dtor != nullptr) { while (dtor != nullptr) {
NCCLCHECK(dtor->fn(dtor)); NCCLCHECK(dtor->fn(dtor));
@ -220,6 +215,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
ncclMemoryStackDestruct(&comm->memPermanent); ncclMemoryStackDestruct(&comm->memPermanent);
ncclCudaHostFree((void *)comm->abortFlag); ncclCudaHostFree((void *)comm->abortFlag);
free(comm->netName);
commPoison(comm); // poison comm before free to avoid comm reuse. commPoison(comm); // poison comm before free to avoid comm reuse.
free(comm); free(comm);
@ -243,8 +239,8 @@ static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
int flag = 0; int flag = 0;
CUdevice dev; CUdevice dev;
int cudaDriverVersion; int cudaDriverVersion;
CUCHECK(cuDriverGetVersion(&cudaDriverVersion)); CUDACHECK(cudaDriverGetVersion(&cudaDriverVersion));
if (cudaDriverVersion < 11070) return ncclInternalError; if (CUPFN(cuDeviceGet) == NULL || cudaDriverVersion < 11070) return ncclInternalError;
CUCHECK(cuDeviceGet(&dev, comm->cudaDev)); CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
// Query device to see if DMA-BUF support is available // Query device to see if DMA-BUF support is available
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev)); (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev));
@ -265,7 +261,7 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
if (ret != ncclSuccess) { if (ret != ncclSuccess) {
/* if ret is not ncclInProgress, we just keep it. */ /* if ret is not ncclInProgress, we just keep it. */
WARN("Attempt to use communicator before the previous operation returned ncclSuccess\n"); WARN("Attempt to use communicator before the previous operation returned ncclSuccess");
if (ret == ncclInProgress) ret = ncclInvalidArgument; if (ret == ncclInProgress) ret = ncclInvalidArgument;
goto exit; goto exit;
} }
@ -395,6 +391,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
tmpCommAndChans.channels[c].tree = comm->channels[c].tree; tmpCommAndChans.channels[c].tree = comm->channels[c].tree;
tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain; tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain;
tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect; tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect;
tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls;
tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c]; tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c];
if (comm->channels[c].ring.userRanks != nullptr) { if (comm->channels[c].ring.userRanks != nullptr) {
@ -521,8 +518,8 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collN
struct ncclChannel* channel = comm->channels + c; struct ncclChannel* channel = comm->channels + c;
for (int h = 0; h < nHeads; h++) { for (int h = 0; h < nHeads; h++) {
const int head = heads[h]; const int head = heads[h];
collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv); collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
if (!collNetSetupFail) collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend); if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
} }
// Verify CollNet setup across ranks after trying the first channel // Verify CollNet setup across ranks after trying the first channel
if (c == 0) { if (c == 0) {
@ -922,6 +919,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Check if we can setup CollNet // Check if we can setup CollNet
if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph); if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph);
NCCLCHECKGOTO(ncclNvlsSetup(comm), ret, fail);
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
// Compute time models for algorithm and protocol combinations // Compute time models for algorithm and protocol combinations
@ -929,7 +928,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int myCompCap = comm->peerInfo[rank].cudaCompCap; int myCompCap = comm->peerInfo[rank].cudaCompCap;
int minCompCap = myCompCap, maxCompCap = myCompCap; int minCompCap = myCompCap, maxCompCap = myCompCap;
for (int i = 0; i < nranks; i++) { for (int i = 0; i < nranks; i++) {
minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap); comm->minCompCap = minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap); maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
} }
NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail); NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail);
@ -938,6 +937,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Compute nChannels per peer for p2p // Compute nChannels per peer for p2p
NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail); NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
INFO(NCCL_INIT, "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
do { // Setup p2p structures in comm->tasks do { // Setup p2p structures in comm->tasks
struct ncclTasks* tasks = &comm->tasks; struct ncclTasks* tasks = &comm->tasks;
int nRanks = comm->nRanks; int nRanks = comm->nRanks;
@ -1004,12 +1005,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
} }
} }
} }
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail); NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
} }
// Connect to local net proxy // Connect to local net proxy
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail); NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); NCCLCHECKGOTO(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
// Then to remote ones when using PXN // Then to remote ones when using PXN
if (ncclPxnDisable(comm) == 0) { if (ncclPxnDisable(comm) == 0) {
@ -1017,7 +1019,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail); NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
for (int r=0; r<nranks; r++) { for (int r=0; r<nranks; r++) {
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail); NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); NCCLCHECKGOTO(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
} }
} }
@ -1065,6 +1067,11 @@ fail:
} }
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0); NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0);
NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", NCCL_CONFIG_UNDEF_INT);
// Match config max/minCTAs
NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT);
NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT);
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
struct ncclCommInitRankAsyncJob { struct ncclCommInitRankAsyncJob {
struct ncclAsyncJob base; struct ncclAsyncJob base;
@ -1087,9 +1094,16 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
ncclUniqueId commId = job->commId; // C++ struct assignment ncclUniqueId commId = job->commId; // C++ struct assignment
int myrank = job->myrank; int myrank = job->myrank;
int cudaDev = job->cudaDev; int cudaDev = job->cudaDev;
int archMajor, archMinor;
size_t maxLocalSizeBytes = 0;
ncclResult_t res = ncclSuccess; ncclResult_t res = ncclSuccess;
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
CUDACHECK(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
CUDACHECK(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev));
comm->cudaArch = 100*archMajor + 10*archMinor;
NCCLCHECK(ncclInitKernelsForDevice(comm->cudaArch, &maxLocalSizeBytes));
// Set the maximum kernel stack size of all kernels to avoid // Set the maximum kernel stack size of all kernels to avoid
// a CUDA memory reconfig on load (c.f. NVSHMEM issue) // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) {
@ -1114,18 +1128,143 @@ fail:
goto exit; goto exit;
} }
static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { #define NCCL_CONFIG_DEFAULT(config, field, undef, defvalue, fieldStr, format) \
ncclResult_t ret = ncclSuccess; if (config->field == undef) { \
config->field = defvalue; \
/* first set configuration */ } else { \
if (config) { INFO(NCCL_ENV, "Comm config " fieldStr " set to " format, config->field); \
comm->blocking = config->blocking;
} else {
/* default setting of communicator */
comm->blocking = 1;
} }
static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
ncclResult_t ret = ncclSuccess;
/* config must not be NULL in this function */
int blockingEnv;
int cgaClusterSizeEnv;
int minCTAsEnv;
int maxCTAsEnv;
const char *envNetName, *tmpNetName;
ncclConfig_t defaultConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t *internalConfigPtr;
size_t realSize;
internalConfigPtr = &internalConfig;
if (config) {
memcpy((void*)&realSize, (void*)config, sizeof(size_t));
realSize = realSize > sizeof(ncclConfig_t) ? sizeof(ncclConfig_t) : realSize;
memcpy((void*)internalConfigPtr, (void*)config, realSize);
if (internalConfigPtr->magic != 0xcafebeef) {
WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER");
ret = ncclInvalidArgument;
goto fail;
}
/* check version. */
if (internalConfigPtr->version < NCCL_VERSION(2, 14, 0)) {
internalConfigPtr->blocking = defaultConfig.blocking;
}
if (internalConfigPtr->version < NCCL_VERSION(2, 17, 0)) {
internalConfigPtr->cgaClusterSize = defaultConfig.cgaClusterSize;
internalConfigPtr->minCTAs = defaultConfig.minCTAs;
internalConfigPtr->maxCTAs = defaultConfig.maxCTAs;
internalConfigPtr->netName = defaultConfig.netName;
}
}
/* check input config attributes; NCCL_CONFIG_UNDEF_INT means user-undefined, in which case we use NCCL's default value. */
if (internalConfigPtr->blocking != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) {
WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking);
ret = ncclInvalidArgument;
goto fail;
}
if (internalConfigPtr->cgaClusterSize != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->cgaClusterSize < 0) {
WARN("Invalid config cgaClusterSize attribute value %d", internalConfigPtr->cgaClusterSize);
ret = ncclInvalidArgument;
goto fail;
}
if ((internalConfigPtr->minCTAs != NCCL_CONFIG_UNDEF_INT &&
internalConfigPtr->minCTAs <= 0) ||
(internalConfigPtr->maxCTAs != NCCL_CONFIG_UNDEF_INT &&
internalConfigPtr->maxCTAs <= 0) ||
(internalConfigPtr->minCTAs > internalConfigPtr->maxCTAs)) {
WARN("Invalid config min/max channels attribute value %d/%d", internalConfigPtr->minCTAs, internalConfigPtr->maxCTAs);
ret = ncclInvalidArgument;
goto fail;
}
/* default config values can be tuned per platform. */
NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, minCTAs, NCCL_CONFIG_UNDEF_INT, 1, "Min CTAs", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s");
tmpNetName = internalConfigPtr->netName;
/* assign config to communicator */
comm->blocking = internalConfigPtr->blocking;
comm->cgaClusterSize = internalConfigPtr->cgaClusterSize;
comm->minCTAs = internalConfigPtr->minCTAs;
comm->maxCTAs = internalConfigPtr->maxCTAs;
/* override configuration from env variable. */
blockingEnv = ncclParamCommBlocking();
if (blockingEnv == 0 || blockingEnv == 1)
comm->blocking = blockingEnv;
cgaClusterSizeEnv = ncclParamCGAClusterSize();
if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) {
comm->cgaClusterSize = cgaClusterSizeEnv;
} else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) {
WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE);
comm->cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
}
minCTAsEnv = ncclParamMinCTAs();
if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
comm->minCTAs = minCTAsEnv;
}
maxCTAsEnv = ncclParamMaxCTAs();
if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
comm->maxCTAs = maxCTAsEnv;
}
/* cap channels if needed */
if (comm->minCTAs > MAXCHANNELS) {
WARN("minCTAs %d is larger than #channels upper limit %d", comm->minCTAs, MAXCHANNELS);
comm->minCTAs = MAXCHANNELS;
}
if (comm->maxCTAs > MAXCHANNELS) {
WARN("maxCTAs %d is larger than #channels upper limit %d", comm->maxCTAs, MAXCHANNELS);
comm->maxCTAs = MAXCHANNELS;
}
if (comm->minCTAs > comm->maxCTAs) {
WARN("minCTAs %d is larger than maxCTAs %d", comm->minCTAs, comm->maxCTAs);
ret = ncclInvalidArgument;
goto fail;
}
envNetName = getenv("NCCL_NET");
if (envNetName)
tmpNetName = envNetName;
if (tmpNetName != NULL) {
int netNameLen = strlen(tmpNetName) + 1;
comm->netName = (char*)malloc(netNameLen);
memcpy(comm->netName, tmpNetName, netNameLen);
} else {
comm->netName = NULL;
}
exit:
return ret; return ret;
fail:
goto exit;
} }
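The net effect of the block above is a per-attribute precedence rule: a defined environment variable (NCCL_CGA_CLUSTER_SIZE, NCCL_MIN_CTAS, NCCL_MAX_CTAS, NCCL_NET) overrides the ncclConfig_t field, which in turn overrides the built-in default, with range clamping applied on top. A standalone sketch of that rule; resolveInt is illustrative, not an NCCL function.

#include <climits>
#define EXAMPLE_UNDEF INT_MIN  // mirrors NCCL_CONFIG_UNDEF_INT

// env value (if set) > config field (if set) > built-in default.
static int resolveInt(int envVal, int cfgVal, int defVal) {
  if (envVal != EXAMPLE_UNDEF) return envVal;
  if (cfgVal != EXAMPLE_UNDEF) return cfgVal;
  return defVal;
}
// e.g. resolveInt(/*NCCL_MIN_CTAS unset*/EXAMPLE_UNDEF, /*config.minCTAs*/4, /*default*/1) == 4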
static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) { static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) {
@ -1151,6 +1290,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
CUDACHECKGOTO(cudaFree(NULL), res, fail); CUDACHECKGOTO(cudaFree(NULL), res, fail);
NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, fail); NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, fail);
NCCLCHECKGOTO(PtrCheck(config, "CommInitRank", "config"), res, fail);
if (nranks < 1 || myrank < 0 || myrank >= nranks) { if (nranks < 1 || myrank < 0 || myrank >= nranks) {
WARN("Invalid rank requested : %d/%d", myrank, nranks); WARN("Invalid rank requested : %d/%d", myrank, nranks);
res = ncclInvalidArgument; res = ncclInvalidArgument;
@ -1201,12 +1341,13 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
(void)ncclCudaLibraryInit(); (void)ncclCudaLibraryInit();
int cudaDev; int cudaDev;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
CUDACHECK(cudaGetDevice(&cudaDev)); CUDACHECK(cudaGetDevice(&cudaDev));
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload) NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, NULL)); NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config));
return ncclSuccess; return ncclSuccess;
} }
@ -1215,6 +1356,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
ncclResult_t ret = ncclSuccess; ncclResult_t ret = ncclSuccess;
int totalnDev; int totalnDev;
int *gpuFlags = NULL; int *gpuFlags = NULL;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = { constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"} {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"}
@ -1258,7 +1400,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
NCCLCHECKGOTO(ncclGroupStart(), ret, fail); NCCLCHECKGOTO(ncclGroupStart(), ret, fail);
for (int i=0; i<ndev; i++) { for (int i=0; i<ndev; i++) {
// Ignore return codes .. we need to call ncclGroupEnd to clean up anyway // Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, NULL); ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, &config);
} }
NCCLCHECKGOTO(ncclGroupEnd(), ret, fail); NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
@ -1283,39 +1425,16 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI
int cudaDev; int cudaDev;
ncclResult_t ret = ncclSuccess; ncclResult_t ret = ncclSuccess;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t *internalConfigPtr; ncclConfig_t *internalConfigPtr = NULL;
size_t realSize;
int blockingEnv;
NCCLCHECK(ncclGroupStartInternal()); NCCLCHECK(ncclGroupStartInternal());
internalConfigPtr = &internalConfig;
if (config) {
memcpy((void*)&realSize, (void*)config, sizeof(size_t));
realSize = realSize > sizeof(ncclConfig_t) ? sizeof(ncclConfig_t) : realSize;
memcpy((void*)internalConfigPtr, (void*)config, realSize);
if (internalConfigPtr->magic != 0xcafebeef) {
WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER");
ret = ncclInvalidArgument;
goto exit;
}
}
/* check input config attributes */
if (internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) {
WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking);
ret = ncclInvalidArgument;
goto exit;
}
/* overwrite configuration from env variable. */
blockingEnv = ncclParamCommBlocking();
if (blockingEnv != 0 && blockingEnv != 1) {
WARN("Invalid NCCL_COMM_BLOCKING value %d", blockingEnv);
}
if (blockingEnv == 1) internalConfigPtr->blocking = blockingEnv;
(void)ncclCudaLibraryInit(); (void)ncclCudaLibraryInit();
CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, exit); CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, fail);
if (config == NULL)
internalConfigPtr = &internalConfig;
else
internalConfigPtr = config;
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail); NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail);
exit: exit:

View File

@ -23,11 +23,33 @@ DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020);
/* proxy.cc */ /* proxy.cc */
DECLARE_CUDA_PFN(cuCtxCreate, 3020); DECLARE_CUDA_PFN(cuCtxCreate, 3020);
DECLARE_CUDA_PFN(cuCtxDestroy, 4000); DECLARE_CUDA_PFN(cuCtxDestroy, 4000);
DECLARE_CUDA_PFN(cuCtxGetCurrent, 4000);
DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000); DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000);
DECLARE_CUDA_PFN(cuCtxGetDevice, 2000);
/* cuMem API support */
DECLARE_CUDA_PFN(cuMemAddressReserve, 10020);
DECLARE_CUDA_PFN(cuMemAddressFree, 10020);
DECLARE_CUDA_PFN(cuMemCreate, 10020);
DECLARE_CUDA_PFN(cuMemGetAllocationGranularity, 10020);
DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN(cuMemMap, 10020);
DECLARE_CUDA_PFN(cuMemRelease, 10020);
DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN(cuMemUnmap, 10020);
#if CUDA_VERSION >= 11070 #if CUDA_VERSION >= 11070
/* transport/collNet.cc/net.cc*/ /* transport/collNet.cc/net.cc*/
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
#endif #endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
DECLARE_CUDA_PFN(cuMulticastAddDevice, 12010);
DECLARE_CUDA_PFN(cuMulticastBindMem, 12010);
DECLARE_CUDA_PFN(cuMulticastBindAddr, 12010);
DECLARE_CUDA_PFN(cuMulticastCreate, 12010);
DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010);
DECLARE_CUDA_PFN(cuMulticastUnbind, 12010);
#endif
#endif #endif
/* CUDA Driver functions loaded with dlsym() */ /* CUDA Driver functions loaded with dlsym() */
@ -39,6 +61,7 @@ DECLARE_CUDA_PFN(cuGetProcAddress, 11030);
static void *cudaLib; static void *cudaLib;
int ncclCudaDriverVersionCache = -1; int ncclCudaDriverVersionCache = -1;
bool ncclCudaLaunchBlocking = false;
#if CUDART_VERSION >= 11030 #if CUDART_VERSION >= 11030
/* /*
@ -62,9 +85,33 @@ static ncclResult_t cudaPfnFuncLoader(void) {
LOAD_SYM(cuMemGetAddressRange, 3020, 1); LOAD_SYM(cuMemGetAddressRange, 3020, 1);
LOAD_SYM(cuCtxCreate, 3020, 1); LOAD_SYM(cuCtxCreate, 3020, 1);
LOAD_SYM(cuCtxDestroy, 4000, 1); LOAD_SYM(cuCtxDestroy, 4000, 1);
LOAD_SYM(cuCtxGetCurrent, 4000, 1);
LOAD_SYM(cuCtxSetCurrent, 4000, 1); LOAD_SYM(cuCtxSetCurrent, 4000, 1);
LOAD_SYM(cuCtxGetDevice, 2000, 1);
/* cuMem API support */
#if CUDA_VERSION >= 11030
LOAD_SYM(cuMemAddressReserve, 10020, 1);
LOAD_SYM(cuMemAddressFree, 10020, 1);
LOAD_SYM(cuMemCreate, 10020, 1);
LOAD_SYM(cuMemGetAllocationGranularity, 10020, 1);
LOAD_SYM(cuMemExportToShareableHandle, 10020, 1);
LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1);
LOAD_SYM(cuMemMap, 10020, 1);
LOAD_SYM(cuMemRelease, 10020, 1);
LOAD_SYM(cuMemSetAccess, 10020, 1);
LOAD_SYM(cuMemUnmap, 10020, 1);
#endif
#if CUDA_VERSION >= 11070 #if CUDA_VERSION >= 11070
LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
LOAD_SYM(cuMulticastAddDevice, 12010, 1);
LOAD_SYM(cuMulticastBindMem, 12010, 1);
LOAD_SYM(cuMulticastBindAddr, 12010, 1);
LOAD_SYM(cuMulticastCreate, 12010, 1);
LOAD_SYM(cuMulticastGetGranularity, 12010, 1);
LOAD_SYM(cuMulticastUnbind, 12010, 1);
#endif #endif
return ncclSuccess; return ncclSuccess;
} }
@ -74,6 +121,11 @@ static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
static ncclResult_t initResult; static ncclResult_t initResult;
static void initOnceFunc() { static void initOnceFunc() {
do {
char* val = getenv("CUDA_LAUNCH_BLOCKING");
ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
} while (0);
CUresult res; CUresult res;
/* /*
* Load CUDA driver library * Load CUDA driver library
@ -85,9 +137,10 @@ static void initOnceFunc() {
else else
snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so"); snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so");
(void) dlerror(); // Clear any previous errors
cudaLib = dlopen(path, RTLD_LAZY); cudaLib = dlopen(path, RTLD_LAZY);
if (cudaLib == NULL) { if (cudaLib == NULL) {
WARN("Failed to find CUDA library (NCCL_CUDA_PATH='%s') : %s", ncclCudaPath ? ncclCudaPath : "", dlerror()); WARN("Failed to find CUDA library %s (NCCL_CUDA_PATH='%s') : %s", path, ncclCudaPath ? ncclCudaPath : "", dlerror());
goto error; goto error;
} }

200
src/misc/ipcsocket.cc Normal file
View File

@ -0,0 +1,200 @@
/*
* Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
*
* See COPYRIGHT for license information
*/
#include "ipcsocket.h"
#include "utils.h"
#include <stdlib.h>
#include <string.h>
#include <errno.h>
// Enable Linux abstract socket naming
#define USE_ABSTRACT_SOCKET
#define NCCL_IPC_SOCKNAME_STR "/tmp/nccl-socket-%d-%lx"
/*
* Create a Unix Domain Socket
*/
ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) {
int fd = -1;
struct sockaddr_un cliaddr;
char temp[NCCL_IPC_SOCKNAME_LEN] = "";
if (handle == NULL) {
return ncclInternalError;
}
handle->fd = -1;
handle->socketName[0] = '\0';
if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
WARN("UDS: Socket creation error : %d", errno);
return ncclSystemError;
}
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
// Create unique name for the socket.
int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
if (len > (sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot bind provided name to socket. Name too large");
return ncclInternalError;
}
#ifndef USE_ABSTRACT_SOCKET
unlink(temp);
#endif
TRACE(NCCL_INIT, "UDS: Creating socket %s", temp);
strncpy(cliaddr.sun_path, temp, len);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) {
WARN("UDS: Binding to socket %s failed : %d", temp, errno);
close(fd);
return ncclSystemError;
}
handle->fd = fd;
strcpy(handle->socketName, temp);
handle->abortFlag = abortFlag;
// Mark socket as non-blocking
if (handle->abortFlag) {
int flags;
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
}
return ncclSuccess;
}
ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) {
if (handle == NULL) {
return ncclInternalError;
}
if (handle->fd <= 0) {
return ncclSuccess;
}
#ifndef USE_ABSTRACT_SOCKET
if (handle->socketName[0] != '\0') {
unlink(handle->socketName);
}
#endif
close(handle->fd);
return ncclSuccess;
}
ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
struct iovec iov[1];
// Union to guarantee alignment requirements for control array
union {
struct cmsghdr cm;
char control[CMSG_SPACE(sizeof(int))];
} control_un;
struct cmsghdr *cmptr;
char dummy_buffer[1];
int ret;
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
iov[0].iov_base = (void *)dummy_buffer;
iov[0].iov_len = sizeof(dummy_buffer);
msg.msg_iov = iov;
msg.msg_iovlen = 1;
while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Receiving data over socket failed : %d", errno);
return ncclSystemError;
}
if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
}
if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
WARN("UDS: Receiving data over socket failed");
return ncclSystemError;
}
memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
} else {
WARN("UDS: Receiving data over socket %s failed", handle->socketName);
return ncclSystemError;
}
TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
return ncclSuccess;
}
ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) {
struct msghdr msg;
struct iovec iov[1];
char temp[NCCL_IPC_SOCKNAME_LEN];
union {
struct cmsghdr cm;
char control[CMSG_SPACE(sizeof(int))];
} control_un;
struct cmsghdr *cmptr;
struct sockaddr_un cliaddr;
// Construct client address to send this shareable handle to
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
if (len > (sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot connect to provided name for socket. Name too large");
return ncclInternalError;
}
(void) strncpy(cliaddr.sun_path, temp, len);
TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
cmptr = CMSG_FIRSTHDR(&msg);
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
cmptr->cmsg_level = SOL_SOCKET;
cmptr->cmsg_type = SCM_RIGHTS;
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
msg.msg_name = (void *)&cliaddr;
msg.msg_namelen = sizeof(struct sockaddr_un);
iov[0].iov_base = (void *)"";
iov[0].iov_len = 1;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_flags = 0;
ssize_t sendResult;
while ((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Sending data over socket %s failed : %d", temp, errno);
return ncclSystemError;
}
if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
}
return ncclSuccess;
}
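A minimal sketch of how the two ends of this fd-passing scheme fit together, assuming both processes agree on the (rank, hash) pair that names the receiver's socket; the wrapper function names are illustrative and error handling is elided.

// Receiver side (e.g. the process that will import the fd).
ncclResult_t recvFdExample(int myRank, uint64_t commHash, volatile uint32_t* abortFlag, int* fd) {
  struct ncclIpcSocket sock;
  NCCLCHECK(ncclIpcSocketInit(&sock, myRank, commHash, abortFlag));
  NCCLCHECK(ncclIpcSocketRecvFd(&sock, fd));  // fd arrives duplicated into this process
  NCCLCHECK(ncclIpcSocketClose(&sock));
  return ncclSuccess;
}

// Sender side: open its own endpoint, then address the receiver by (peerRank, hash).
ncclResult_t sendFdExample(int myRank, int peerRank, uint64_t commHash, volatile uint32_t* abortFlag, int fd) {
  struct ncclIpcSocket sock;
  NCCLCHECK(ncclIpcSocketInit(&sock, myRank, commHash, abortFlag));
  NCCLCHECK(ncclIpcSocketSendFd(&sock, fd, peerRank, commHash));
  NCCLCHECK(ncclIpcSocketClose(&sock));
  return ncclSuccess;
}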

View File

@ -43,7 +43,7 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr
static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
int closed; int closed;
NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0, &closed)); NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed));
if (closed) { if (closed) {
char line[SOCKET_NAME_MAXLEN+1]; char line[SOCKET_NAME_MAXLEN+1];
WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
@ -785,16 +785,33 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
} }
// Receive or detect connection closed // Receive or detect connection closed
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) { ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) {
int offset = 0; int offset = 0;
if (sock == NULL) { if (sock == NULL) {
WARN("ncclSocketTryRecv: pass NULL socket"); WARN("ncclSocketTryRecv: pass NULL socket");
return ncclInvalidArgument; return ncclInvalidArgument;
} }
*closed = 0; *closed = 0;
while (offset < size) { // Block until connection closes or nbytes received
if (blocking) {
while (offset < size) {
NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
if (*closed) return ncclSuccess;
}
} else {
NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed)); NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
if (*closed) return ncclSuccess; if (*closed) return ncclSuccess;
// If any bytes were received, block waiting for the rest
if (offset > 0) {
while (offset < size) {
NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
if (*closed) return ncclSuccess;
}
// No bytes were received, return ncclInProgress
} else {
return ncclInProgress;
}
} }
return ncclSuccess; return ncclSuccess;
} }
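With the new blocking flag, a caller that must not stall (for example a service loop multiplexing many peers) can poll as below, relying on ncclInProgress meaning no bytes were available yet; pollPeerExample is illustrative, not an NCCL function.

ncclResult_t pollPeerExample(struct ncclSocket* sock, void* buf, int size, int* gotMessage) {
  int closed = 0;
  *gotMessage = 0;
  ncclResult_t ret = ncclSocketTryRecv(sock, buf, size, &closed, /*blocking=*/false);
  if (ret == ncclInProgress) return ncclSuccess;  // nothing arrived yet; try again later
  if (ret != ncclSuccess) return ret;             // real error
  if (closed) return ncclSuccess;                 // peer closed the connection
  *gotMessage = 1;  // once any byte arrives, TryRecv blocks until all `size` bytes are read
  return ncclSuccess;
}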

View File

@ -25,8 +25,10 @@
extern "C" { extern "C" {
#endif #endif
#include <limits.h>
/* Opaque handle to communicator */ /* Opaque handle to communicator */
typedef struct ncclComm* ncclComm_t; typedef struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL
#define NCCL_UNIQUE_ID_BYTES 128 #define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
@ -42,15 +44,22 @@ typedef enum { ncclSuccess = 0,
ncclInProgress = 7, ncclInProgress = 7,
ncclNumResults = 8 } ncclResult_t; ncclNumResults = 8 } ncclResult_t;
#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
/* Communicator configuration. Users can assign value to attributes to specify the /* Communicator configuration. Users can assign value to attributes to specify the
* behavior of a communicator. */ * behavior of a communicator. */
typedef struct ncclConfig_v21400 { typedef struct ncclConfig_v21700 {
/* attributes that users should never touch. */ /* attributes that users should never touch. */
size_t size; size_t size;
unsigned int magic; unsigned int magic;
unsigned int version; unsigned int version;
/* attributes that users are able to customize. */ /* attributes that users are able to customize. */
int blocking; int blocking;
int cgaClusterSize;
int minCTAs;
int maxCTAs;
const char *netName;
} ncclConfig_t; } ncclConfig_t;
/* Config initializer must be assigned to initialize config structure when it is created. /* Config initializer must be assigned to initialize config structure when it is created.
@ -59,7 +68,11 @@ typedef struct ncclConfig_v21400 {
sizeof(ncclConfig_t), /* size */ \ sizeof(ncclConfig_t), /* size */ \
0xcafebeef, /* magic */ \ 0xcafebeef, /* magic */ \
NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
1 /* blocking */ \ NCCL_CONFIG_UNDEF_INT, /* blocking */ \
NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
NCCL_CONFIG_UNDEF_PTR /* netName */ \
} }
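As a usage sketch (not part of this header), the new knobs are set on a config initialized with NCCL_CONFIG_INITIALIZER and passed to ncclCommInitRankConfig; fields left at NCCL_CONFIG_UNDEF_INT / NCCL_CONFIG_UNDEF_PTR keep their internal defaults. The specific values below are placeholders:

// Sketch: create a communicator with the new 2.17 config options.
ncclResult_t initWithConfig(ncclComm_t* comm, int nRanks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.blocking       = 1;         // keep blocking init/collective behavior
  config.cgaClusterSize = 4;         // CGA cluster size hint (placeholder value)
  config.minCTAs        = 8;         // minimum CTAs per collective kernel (placeholder)
  config.maxCTAs        = 32;        // maximum CTAs per collective kernel (placeholder)
  config.netName        = "Socket";  // select a network by name (placeholder)
  return ncclCommInitRankConfig(comm, nRanks, id, rank, &config);
}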
/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.


@ -176,14 +176,8 @@ ncclResult_t ncclNetPluginInit() {
} }
void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL); void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
if (netPluginLib == nullptr) { if (netPluginLib == nullptr) {
// dlopen does not guarantee to set errno, but dlerror only gives us a INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load (%s) returned %d : %s", ncclNetPluginName, errno, dlerror());
// string, so checking errno doesn't hurt to try to provide a better INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found, using internal implementation");
// error message
if (errno == ENOENT) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName);
} else {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
}
return ncclSuccess; return ncclSuccess;
} }
@ -264,9 +258,10 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
ncclResult_t ncclNetInit(struct ncclComm* comm) { ncclResult_t ncclNetInit(struct ncclComm* comm) {
// Initialize main communication network // Initialize main communication network
char* netName = getenv("NCCL_NET"); char* netName;
bool ok = false; bool ok = false;
netName = comm->netName;
for (int i=0; i<3; i++) { for (int i=0; i<3; i++) {
if (ncclNets[i] == nullptr) continue; if (ncclNets[i] == nullptr) continue;
enum ncclNetState state; enum ncclNetState state;
@ -324,9 +319,26 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
ncclResult_t ret; ncclResult_t ret;
ncclDebugNoWarn = NCCL_NET; ncclDebugNoWarn = NCCL_NET;
NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1); NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1);
NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2);
NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3); bool connected;
CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4); connected = false;
while (!connected) {
// If we're aborting now, skip to cleanup
if (*comm->abortFlag) {
goto cleanup2;
}
if (sComm == NULL)
NCCLCHECKGOTO(ncclNetConnect(comm, dev, &handle, &sComm), ret, cleanup2);
if (rComm == NULL)
NCCLCHECKGOTO(ncclNetAccept(comm, lComm, &rComm), ret, cleanup2);
connected = (rComm != NULL) && (sComm != NULL);
}
CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle)); NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle));
NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
@ -335,11 +347,11 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
} }
ncclDebugNoWarn = 0; ncclDebugNoWarn = 0;
CUDACHECK(cudaFree(gpuPtr)); CUDACHECK(cudaFree(gpuPtr));
cleanup4:
NCCLCHECK(ncclNetCloseRecv(comm, rComm));
cleanup3:
NCCLCHECK(ncclNetCloseSend(comm, sComm));
cleanup2: cleanup2:
if (rComm != NULL)
NCCLCHECK(ncclNetCloseRecv(comm, rComm));
if (sComm != NULL)
NCCLCHECK(ncclNetCloseSend(comm, sComm));
NCCLCHECK(ncclNetCloseListen(comm, lComm)); NCCLCHECK(ncclNetCloseListen(comm, lComm));
cleanup1: cleanup1:
break; break;


@ -14,6 +14,7 @@
#include "timer.h" #include "timer.h"
#include <sys/syscall.h> #include <sys/syscall.h>
#include <assert.h>
enum { proxyRecv=0, proxySend=1 }; enum { proxyRecv=0, proxySend=1 };
@ -37,6 +38,155 @@ struct ncclProxyPool {
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
}; };
static void expectedProxyResponseFree(struct ncclProxyState* state) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
struct ncclExpectedProxyResponse* prev = NULL;
while (elem) {
prev = elem;
elem = elem->next;
free(prev->respBuff);
free(prev);
}
}
static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
while (elem) {
if (elem->opId == opId) {
if (respSize != elem->respSize) {
WARN("Mismatched response size for opId=%p", opId);
return ncclInternalError;
}
if (elem->done) {
WARN("Storing response for already completed opId=%p", opId);
return ncclInternalError;
}
memcpy(elem->respBuff, respBuff, respSize);
elem->done = true;
return ncclSuccess;
}
elem = elem->next;
}
WARN("Proxy response for opId=%p doesn't match any expected response", opId);
return ncclInternalError;
}
static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, void* opId, int respSize, void* respData, int respDataSize) {
struct ncclExpectedProxyResponse* ex;
NCCLCHECK(ncclCalloc(&ex, 1));
ex->opId = opId;
// Pre-alloc response buffer
ex->respBuff = malloc(respSize);
ex->respSize = respSize;
ex->done = false;
if (respData) {
memcpy(ex->respBuff, respData, respDataSize);
ex->done = true;
}
// Enqueue
struct ncclExpectedProxyResponse* list = state->expectedResponses;
if (list == NULL) {
state->expectedResponses = ex;
return ncclSuccess;
}
while (list->next) list = list->next;
list->next = ex;
return ncclSuccess;
}
static ncclResult_t expectedProxyResponseDequeue(struct ncclProxyState* state, void* opId, void* respBuff, int* found) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
struct ncclExpectedProxyResponse* prev = NULL;
*found = 0;
while (elem) {
if ((elem->opId == opId) && elem->done) {
if (prev == NULL) {
state->expectedResponses = elem->next;
} else {
prev->next = elem->next;
}
memcpy(respBuff, elem->respBuff, elem->respSize);
free(elem->respBuff);
free(elem);
*found = 1;
return ncclSuccess;
}
prev = elem;
elem = elem->next;
}
return ncclSuccess;
}
static ncclResult_t expectedProxyResponseRemove(struct ncclProxyState* state, void* opId) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
struct ncclExpectedProxyResponse* prev = NULL;
while (elem) {
if (elem->opId == opId) {
if (prev == NULL) {
state->expectedResponses = elem->next;
} else {
prev->next = elem->next;
}
free(elem->respBuff);
free(elem);
return ncclSuccess;
}
prev = elem;
elem = elem->next;
}
WARN("Couldn't find opId=%p", opId);
return ncclInternalError;
}
static ncclResult_t asyncProxyOpEnqueue(struct ncclProxyLocalPeer* peer, ncclProxyAsyncOp* op) {
ncclProxyAsyncOp* list = peer->asyncOps;
if (list == NULL) {
peer->asyncOps = op;
return ncclSuccess;
}
while (list->next) list = list->next;
list->next = op;
return ncclSuccess;
}
static ncclResult_t asyncProxyOpDequeue(struct ncclProxyLocalPeer* peer, ncclProxyAsyncOp* op) {
struct ncclProxyAsyncOp* elem = peer->asyncOps;
struct ncclProxyAsyncOp* prev = NULL;
while (elem) {
if (elem->opId == op->opId) {
if (prev == NULL) {
peer->asyncOps = elem->next;
} else {
prev->next = elem->next;
}
if (elem->reqBuff) {
free(elem->reqBuff);
}
if (elem->respBuff) {
free(elem->respBuff);
}
free(elem);
return ncclSuccess;
}
prev = elem;
elem = elem->next;
}
if (op) {
WARN("Attempting to dequeue nonexistent async opId=%p", op->opId);
} else {
WARN("Attempting to dequeue null operation");
}
return ncclInternalError;
}
static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) { static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) {
struct ncclProxyArgs* elem; struct ncclProxyArgs* elem;
if (state->pool == NULL) { if (state->pool == NULL) {
@ -86,7 +236,7 @@ ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState*
pool = pool->next; pool = pool->next;
p++; p++;
} }
WARN("Could not find pool of op %p\n", op); WARN("Could not find pool of op %p", op);
return ncclInternalError; return ncclInternalError;
} }
@ -140,7 +290,7 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
nextOp->state |= OP_SEEN; nextOp->state |= OP_SEEN;
printf("\n"); printf("\n");
if (nextOp->next) { if (nextOp->next) {
WARN("Inactive op has next set!\n"); WARN("Inactive op has next set!");
} }
nextOp = nextOp->nextPeer; nextOp = nextOp->nextPeer;
} }
@ -337,7 +487,7 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector*
} }
} }
if (lastOp == -1) { if (lastOp == -1) {
WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)\n", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount); WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount);
return ncclInternalError; return ncclInternalError;
} }
// Cut chain at lastOp // Cut chain at lastOp
@ -770,19 +920,6 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclComm* comm) {
return ncclSuccess; return ncclSuccess;
} }
struct ncclProxyAsyncOp {
int type;
struct ncclProxyConnection* connection;
int reqSize, respSize;
char *reqBuff, *respBuff;
};
struct ncclProxyLocalPeer {
struct ncclSocket sock;
int localRank;
struct ncclProxyAsyncOp asyncOps;
};
#define NCCL_PROXY_CONN_POOL_SIZE_POW2 7 #define NCCL_PROXY_CONN_POOL_SIZE_POW2 7
#define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2)) #define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2))
#define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1) #define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1)
@ -790,7 +927,6 @@ struct ncclProxyConnectionPool {
struct ncclProxyConnection** pools; struct ncclProxyConnection** pools;
int banks; int banks;
int offset; int offset;
struct ncclProxyAsyncOp* ops;
}; };
static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) { static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) {
@ -888,26 +1024,137 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
return ncclSuccess; return ncclSuccess;
} }
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" }; const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd" };
ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) { ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
struct ncclSocket* sock; struct ncclSocket* sock;
ncclResult_t ret = ncclSuccess; ncclResult_t ret = ncclSuccess;
void* respData = NULL;
int respDataSize = 0;
struct ncclComm* comm = proxyConn->comm;
struct ncclIpcSocket ipcSock = { 0 };
if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError; if (*comm->abortFlag != 0) {
sock = proxyConn->comm->proxyState.peerSocks + proxyConn->localRank; WARN("ncclProxyCallAsync() - Saw abortFlag while waiting for proxyThread response");
return ncclInternalError;
}
if (comm->proxyState.peerSocks == NULL) return ncclInternalError;
sock = comm->proxyState.peerSocks + proxyConn->localRank;
if (sock == NULL) return ncclInternalError; if (sock == NULL) return ncclInternalError;
if (type == ncclProxyMsgConvertFd) {
// cuMem API support
// Create a UDS socket to receive the converted fd
NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->localRank, (uint64_t)proxyConn->connection, comm->abortFlag));
}
NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error);
if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error); if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error);
if (respSize) NCCLCHECKGOTO(ncclSocketRecv(sock, respBuff, respSize), ret, error);
if (type == ncclProxyMsgConvertFd) {
// cuMem API support
int recvFd = -1;
if (reqSize != sizeof(int) || respSize != sizeof(int)) return ncclInternalError;
// Receive converted fd over UDS
NCCLCHECK(ncclIpcSocketRecvFd(&ipcSock, &recvFd));
TRACE(NCCL_NET, "UDS: ConvertFd rank %d returned %p %d", proxyConn->localRank, &recvFd, recvFd);
assert(recvFd != -1);
respData = &recvFd;
respDataSize = sizeof(recvFd);
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
} else {
// Send opId to proxy
NCCLCHECKGOTO(ncclSocketSend(sock, &opId, sizeof(opId)), ret, error);
}
// Add proxyOp to expected response queue
NCCLCHECK(expectedProxyResponseEnqueue(&comm->proxyState, opId, respSize, respData, respDataSize));
return ncclSuccess; return ncclSuccess;
error: error:
WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]); NCCLCHECK(ncclIpcSocketClose(&ipcSock));
WARN("Proxy Call to rank %d failed (%s)", comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
return ret; return ret;
} }
ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) {
struct ncclComm* comm = proxyConn->comm;
// Receive the connection pointer from the Proxy
if (*comm->abortFlag) {
WARN("Comm %p is in abort state", comm);
return ncclInternalError;
}
if (comm->proxyState.peerSocks == NULL) return ncclInternalError;
// Check response queue
int found = 0;
NCCLCHECK(expectedProxyResponseDequeue(&comm->proxyState, opId, respBuff, &found));
if (found == 0) {
// Attempt to read in a new response header from the proxy thread
struct ncclSocket* sock = comm->proxyState.peerSocks + proxyConn->localRank;
void* recvOpId;
int offset = 0;
if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)) {
WARN("Socket recv failed while polling for opId=%p", opId);
return ncclInternalError;
}
if (offset == 0) {
return ncclInProgress;
// If we've returned a partial response, block to receive the rest of it
} else if (offset < sizeof(recvOpId)) {
while (offset < sizeof(recvOpId))
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset));
}
INFO(NCCL_PROXY, "ncclPollProxyResponse Recieved new opId=%p", recvOpId);
// Now do a blocking recv of the response size
int respSize = 0;
NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(respSize)));
// If there's a respSize to recv
if (respSize > 0) {
NCCLCHECK(ncclSocketRecv(sock, respBuff, respSize));
}
if (recvOpId == opId) {
INFO(NCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId);
NCCLCHECK(expectedProxyResponseRemove(&comm->proxyState, recvOpId));
return ncclSuccess;
} else {
INFO(NCCL_PROXY, "Queing opId=%p", recvOpId);
// Store the result and mark response as completed
NCCLCHECK(expectedProxyResponseStore(&comm->proxyState, recvOpId, respBuff, respSize));
return ncclInProgress;
}
} else {
INFO(NCCL_PROXY, "ncclPollProxyResponse Dequeued cached opId=%p", opId);
}
return ncclSuccess;
}
ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
// Alloc some memory to act as a handle
void* opId = malloc(1);
NCCLCHECK(ncclProxyCallAsync(proxyConn, type, reqBuff, reqSize, respSize, opId));
ncclResult_t res = ncclInProgress;
while (res == ncclInProgress) {
res = ncclPollProxyResponse(proxyConn, respBuff, opId);
}
free(opId);
return res;
}
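Transports that cannot block (such as the connect paths further below) drive the same pair incrementally: issue ncclProxyCallAsync once with a stable, unique opId, then keep calling ncclPollProxyResponse until it stops returning ncclInProgress. A minimal sketch, with connectStep and the started flag as illustrative placeholders:

// Sketch: one non-blocking step of a proxy connect. 'opId' must stay unique and
// stable for the lifetime of the operation (the net transport uses the
// ncclConnector address for this).
static ncclResult_t connectStep(struct ncclProxyConnector* proxyConn,
                                void* reqBuff, int reqSize,
                                void* respBuff, int respSize,
                                void* opId, int* started) {
  if (!*started) {
    NCCLCHECK(ncclProxyCallAsync(proxyConn, ncclProxyMsgConnect, reqBuff, reqSize, respSize, opId));
    *started = 1;
  }
  // Returns ncclInProgress until the proxy thread has sent back a response for opId.
  return ncclPollProxyResponse(proxyConn, respBuff, opId);
}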
static ncclResult_t proxyProgressInit(struct ncclComm* comm) { static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
struct ncclProxyProgressState* state = &comm->proxyState.progressState; struct ncclProxyProgressState* state = &comm->proxyState.progressState;
if (state->opsPool == NULL) { if (state->opsPool == NULL) {
@ -998,16 +1245,55 @@ static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct
if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError; if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError;
int nChannels; int nChannels;
NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int))); NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int)));
// Store opId for completion response
void* opId;
NCCLCHECK(ncclSocketRecv(sock, &opId, sizeof(opId)));
INFO(NCCL_PROXY, "proxyConnSharedInit received opId=%p", opId);
if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels)); if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels));
__atomic_store_n(&connection->state, connSharedInitialized, __ATOMIC_RELEASE); __atomic_store_n(&connection->state, connSharedInitialized, __ATOMIC_RELEASE);
// Send the opId for referencing async operation
INFO(NCCL_PROXY, "proxyConnSharedInit::ncclSocketSend(opId=%p)", opId);
NCCLCHECK(ncclSocketSend(connection->sock, &opId, sizeof(opId)));
// Send the response size
INFO(NCCL_PROXY, "proxyConnSharedInit::ncclSocketSend(op.respSize=%d)", respSize);
NCCLCHECK(ncclSocketSend(connection->sock, &respSize, sizeof(respSize)));
return ncclSuccess; return ncclSuccess;
} }
static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount) { // cuMem API support
static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, struct ncclComm* comm) {
struct ncclSocket* sock = &peer->sock;
uint64_t connection;
NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(uint64_t)));
int reqSize, respSize;
NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int)));
NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int)));
if (reqSize != sizeof(int) || respSize != sizeof(int)) return ncclInternalError;
int fd;
struct ncclIpcSocket ipcSock = { 0 };
NCCLCHECK(ncclSocketRecv(sock, &fd, sizeof(int)));
INFO(NCCL_NET, "UDS: proxyConvertFd received fd %d peer %d connection %lx", fd, peer->localRank, connection);
// Send back the converted fd using UDS
NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->localRank, connection^1, comm->abortFlag));
NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->localRank, connection));
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
return ncclSuccess;
}
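On the caller side, the fd conversion rides on the regular proxy call path with type ncclProxyMsgConvertFd: the request payload is the local fd, and the converted fd comes back through the UDS socket created in ncclProxyCallAsync rather than through the TCP response. A minimal sketch, with convertFdExample as an illustrative wrapper:

// Sketch: ask the proxy thread to convert a local fd (cuMem API support).
static ncclResult_t convertFdExample(struct ncclProxyConnector* proxyConn, int localFd, int* convertedFd) {
  return ncclProxyCallBlocking(proxyConn, ncclProxyMsgConvertFd,
                               &localFd, sizeof(int), convertedFd, sizeof(int));
}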
static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount, struct ncclProxyLocalPeer* peer) {
int done = 1; int done = 1;
if (op->type == ncclProxyMsgSetup) { if (op->type == ncclProxyMsgSetup) {
INFO(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId);
NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
} else if (op->type == ncclProxyMsgConnect) { } else if (op->type == ncclProxyMsgConnect) {
INFO(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
} else return ncclInternalError; } else return ncclInternalError;
if (done) { if (done) {
@ -1015,31 +1301,38 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclC
__atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE); __atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE);
else if (op->type == ncclProxyMsgConnect) else if (op->type == ncclProxyMsgConnect)
__atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE); __atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE);
/* if setup or connect is done, we should not return any error at this point since /* if setup or connect is done, we should not return any error at this point since
* ncclSocketSend might already send the respBuff to the requester. If we still choose * ncclSocketSend might already send the respBuff to the requester. If we still choose
* to abort and close the connection, it can cause segfault if the requester is using * to abort and close the connection, it can cause segfault if the requester is using
* the respBuff. */ * the respBuff. */
if (op->respSize) ncclSocketSend(op->connection->sock, op->respBuff, op->respSize);
if (op->reqBuff) { // Send the opId for referencing async operation
free(op->reqBuff); NCCLCHECK(ncclSocketSend(op->connection->sock, &op->opId, sizeof(op->opId)));
op->reqBuff = NULL;
// Send the response size
NCCLCHECK(ncclSocketSend(op->connection->sock, &op->respSize, sizeof(op->respSize)));
if (op->respSize) {
// Send the response
NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize));
} }
if (op->respBuff) {
free(op->respBuff); asyncProxyOpDequeue(peer, op);
op->respBuff = NULL;
}
op->type = 0;
(*asyncOpCount)--; (*asyncOpCount)--;
return ncclSuccess;
} else if (*comm->abortFlag != 0) { } else if (*comm->abortFlag != 0) {
return ncclInternalError; return ncclInternalError;
} }
return ncclSuccess; return ncclInProgress;
} }
static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) { static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) {
struct ncclSocket* sock = &peer->sock; struct ncclSocket* sock = &peer->sock;
struct ncclProxyAsyncOp* asyncOp = &peer->asyncOps; struct ncclProxyAsyncOp* asyncOp;
NCCLCHECK(ncclCalloc(&asyncOp, 1));
asyncOp->type = type; asyncOp->type = type;
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*))); NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
@ -1049,9 +1342,16 @@ static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* p
NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize)); NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize)); NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
} }
// Store opId for completion response
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)));
if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize)); if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize));
asyncProxyOpEnqueue(peer, asyncOp);
(*asyncOpCount)++; (*asyncOpCount)++;
NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount)); NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount, peer));
return ncclSuccess; return ncclSuccess;
} }
@ -1081,7 +1381,7 @@ void* ncclProxyService(void* _args) {
pollfds[s].events = POLLHUP|POLLIN; pollfds[s].events = POLLHUP|POLLIN;
} }
if (ncclSocketGetFd(comm->proxyState.listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) { if (ncclSocketGetFd(comm->proxyState.listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) {
WARN("[Proxy Service] Get listenSock fd fails\n"); WARN("[Proxy Service] Get listenSock fd fails");
return NULL; return NULL;
}; };
pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN; pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
@ -1113,14 +1413,14 @@ void* ncclProxyService(void* _args) {
} }
if (maxnpeers < s+1) maxnpeers = s+1; if (maxnpeers < s+1) maxnpeers = s+1;
if (ncclSocketInit(&peers[s].sock) != ncclSuccess) { if (ncclSocketInit(&peers[s].sock) != ncclSuccess) {
WARN("[Service thread] Initialize peers[%d].sock fails\n", s); WARN("[Service thread] Initialize peers[%d].sock fails", s);
return NULL; return NULL;
} }
if (ncclSocketAccept(&peers[s].sock, comm->proxyState.listenSock) != ncclSuccess) { if (ncclSocketAccept(&peers[s].sock, comm->proxyState.listenSock) != ncclSuccess) {
WARN("[Service thread] Accept failed %s", strerror(errno)); WARN("[Service thread] Accept failed %s", strerror(errno));
} else { } else {
if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) { if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) {
WARN("[Service thread] Get peers[%d].sock fd fails\n", s); WARN("[Service thread] Get peers[%d].sock fd fails", s);
return NULL; return NULL;
} }
npeers++; npeers++;
@ -1130,25 +1430,37 @@ void* ncclProxyService(void* _args) {
for (int s=0; s<maxnpeers; s++) { for (int s=0; s<maxnpeers; s++) {
struct ncclProxyLocalPeer* peer = peers+s; struct ncclProxyLocalPeer* peer = peers+s;
struct ncclSocket* sock = &peer->sock; struct ncclSocket* sock = &peer->sock;
struct ncclProxyAsyncOp* op = &peer->asyncOps;
int closeConn = 0; int closeConn = 0;
int type = 0; int type = 0;
ncclResult_t res = ncclSuccess; ncclResult_t res = ncclSuccess;
if (pollfds[s].fd == -1) continue; if (pollfds[s].fd == -1) continue;
if (op->type != 0) {
res = proxyProgressAsync(op, comm, &asyncOpCount); // Progress all ops for this ncclProxyLocalPeer
ncclProxyAsyncOp* op = peer->asyncOps;
while (op != nullptr) {
type = op->type; type = op->type;
if (res != ncclSuccess) closeConn = 1; res = proxyProgressAsync(op, comm, &asyncOpCount, peer);
} else if (pollfds[s].revents & POLLIN) { if (res == ncclSuccess || res == ncclInProgress) {
op = op->next;
} else {
// Res is a bad result
closeConn = 1;
WARN("[Service thread] Error encountered progressing operation=%s, res=%d, closing connection", ncclProxyMsgTypeStr[type], res);
break;
}
}
// Check for additional ops coming in
if (pollfds[s].revents & POLLIN) {
int closed; int closed;
if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) { res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/);
WARN("[Service thread] Could not receive type from localRank %d", peer->localRank); if (res != ncclSuccess && res != ncclInProgress) {
WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->localRank, res, closed);
closeConn = 1; closeConn = 1;
} else if (closed) { } else if (closed) {
INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank); INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank);
closeConn = 1; closeConn = 1;
} else { } else if (res == ncclSuccess) { // We received something from the sock
if (type == ncclProxyMsgStop) { if (type == ncclProxyMsgStop) {
stop = 1; stop = 1;
closeConn = 1; closeConn = 1;
@ -1159,30 +1471,32 @@ void* ncclProxyService(void* _args) {
} else if (type == ncclProxyMsgSharedInit) { } else if (type == ncclProxyMsgSharedInit) {
res = proxyConnSharedInit(peers+s, &connectionPool, comm); res = proxyConnSharedInit(peers+s, &connectionPool, comm);
} else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) { } else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) {
INFO(NCCL_PROXY, "proxyConnSetupConnect for peer->localRank %d,", peer->localRank);
res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount); res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount);
} else if (type == ncclProxyMsgConvertFd) {
res = proxyConvertFd(peers+s, comm); // cuMem API support
} else { } else {
WARN("[Service thread] Unknown command %d from localRank %d\n", type, peer->localRank); WARN("[Service thread] Unknown command %d from localRank %d", type, peer->localRank);
closeConn = 1; closeConn = 1;
} }
INFO(NCCL_PROXY, "Received and initiated operation=%s res=%d", ncclProxyMsgTypeStr[type], res);
} }
} else if (pollfds[s].revents & POLLHUP) { } else if (pollfds[s].revents & POLLHUP) {
closeConn = 1; closeConn = 1;
} }
if (res != ncclSuccess) { if (res != ncclSuccess && res != ncclInProgress) {
WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], comm->localRankToRank[peer->localRank], res); WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], comm->localRankToRank[peer->localRank], res);
closeConn = 1; closeConn = 1;
} }
if (closeConn) { if (closeConn) {
ncclSocketClose(sock); ncclSocketClose(sock);
if (op->reqBuff) {
free(op->reqBuff); if (op != nullptr) {
op->reqBuff = NULL; asyncProxyOpDequeue(peer, op);
asyncOpCount--;
} }
if (op->respBuff) {
free(op->respBuff);
op->respBuff = NULL;
}
op->type = 0;
pollfds[s].fd = -1; pollfds[s].fd = -1;
npeers--; npeers--;
} }
@ -1250,6 +1564,7 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
free(state->peerSocks); free(state->peerSocks);
free(state->proxyOps); free(state->proxyOps);
free(state->sharedDevMems); free(state->sharedDevMems);
expectedProxyResponseFree(state);
} }
return ncclSuccess; return ncclSuccess;
} }


@ -69,9 +69,12 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph // Stream used during transport setup; need for P2P pre-connect + CUDA Graph
ncclResult_t ret = ncclSuccess; ncclResult_t ret = ncclSuccess;
int highestType = TRANSPORT_P2P; // track highest transport type int highestType = TRANSPORT_P2P; // track highest transport type
struct ncclConnect data[2*MAXCHANNELS]; struct ncclConnect** data = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Store intermediate send/recvData structs for connect
struct ncclConnect** recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given recv connection within a channel
struct ncclConnect** sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given send connection within a channel
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail); NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
// First time initialization
for (int i=1; i<comm->nRanks; i++) { for (int i=1; i<comm->nRanks; i++) {
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0); int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
@ -79,22 +82,28 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
uint64_t recvMask = comm->connectRecv[recvPeer]; uint64_t recvMask = comm->connectRecv[recvPeer];
uint64_t sendMask = comm->connectSend[sendPeer]; uint64_t sendMask = comm->connectSend[sendPeer];
struct ncclConnect* recvData = data; // Data[i] contains all ncclConnect information for all send and receive connections with a given send and recv peer
// This data is packed in the array based on the number of sendChannels and recvChannels connected with these peers
// The first N entries contain recvData, connection information for recv connections
// The next M entries contain sendData, connection information for send connections
// It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
data[i] = (ncclConnect*) malloc(sizeof(ncclConnect) * 2*MAXCHANNELS);
recvData[i] = data[i];
int sendChannels = 0, recvChannels = 0; int sendChannels = 0, recvChannels = 0;
int type; int type;
TIME_START(0); TIME_START(0);
for (int c=0; c<MAXCHANNELS; c++) { for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1UL<<c)) { if (recvMask & (1UL<<c)) {
NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type), ret, fail); NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData[i]+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
if (type > highestType) highestType = type; if (type > highestType) highestType = type;
} }
} }
TIME_STOP(0); TIME_STOP(0);
TIME_START(1); TIME_START(1);
struct ncclConnect* sendData = recvData+recvChannels; sendData[i] = recvData[i]+recvChannels;
for (int c=0; c<MAXCHANNELS; c++) { for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1UL<<c)) { if (sendMask & (1UL<<c)) {
NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type), ret, fail); NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData[i]+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
if (type > highestType) highestType = type; if (type > highestType) highestType = type;
} }
} }
@ -103,42 +112,82 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
TIME_START(2); TIME_START(2);
if (sendPeer == recvPeer) { if (sendPeer == recvPeer) {
if (recvChannels+sendChannels) { if (recvChannels+sendChannels) {
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail); NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail); NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
sendData = data; sendData[i] = data[i];
recvData = data+sendChannels; recvData[i] = data[i]+sendChannels;
} }
} else { } else {
if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail); if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail);
if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail); if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail);
if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail); if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail);
if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail); if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail);
} }
TIME_STOP(2); TIME_STOP(2);
TIME_START(3);
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn), ret, fail);
conn->connected = 1;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
}
}
TIME_STOP(3);
TIME_START(4);
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn), ret, fail);
conn->connected = 1;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
}
}
TIME_STOP(4);
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
} }
// Loop until all channels with all ranks have been connected
bool allChannelsConnected;
allChannelsConnected = false;
while (!allChannelsConnected) {
allChannelsConnected = true;
for (int i=1; i<comm->nRanks; i++) {
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
uint64_t recvMask = comm->connectRecv[recvPeer];
uint64_t sendMask = comm->connectSend[sendPeer];
int sendDataOffset = 0;
int recvDataOffset = 0;
for (int c=0; c<MAXCHANNELS; c++) {
TIME_START(3);
if (sendMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
conn->connected = 1;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
} else if (ret == ncclInProgress) {
allChannelsConnected = false;
}
}
}
TIME_STOP(3);
// Start with recv channels
TIME_START(4);
if (recvMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
conn->connected = 1;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
} else if (ret == ncclInProgress) {
allChannelsConnected = false;
}
}
}
TIME_STOP(4);
}
}
}
// Clear all connect masks and free each connectInfo array
for (int i=1; i<comm->nRanks; i++) {
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
free(data[i]);
}
free(data);
free(sendData);
free(recvData);
if (highestTransportType != NULL) *highestTransportType = highestType; if (highestTransportType != NULL) *highestTransportType = highestType;
TIME_PRINT("P2P Setup/Connect"); TIME_PRINT("P2P Setup/Connect");
exit: exit:
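The packed per-peer layout described in the comments above can be summarized by a small helper; layoutPeerData is illustrative only and not part of this change:

// Sketch: within data[i], the first recvChannels entries hold recv connect info
// and the next sendChannels entries hold send connect info (the two halves are
// swapped when sendPeer == recvPeer, since the exchanged buffer then starts
// with the peer's send entries).
static void layoutPeerData(struct ncclConnect* base, int recvChannels,
                           struct ncclConnect** recvPart, struct ncclConnect** sendPart) {
  *recvPart = base;                 // recv entries at the start of the block
  *sendPart = base + recvChannels;  // send entries immediately after
}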


@ -152,13 +152,13 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
int proxyRank; int proxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0; send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
// Determine whether we need to flush the GDR buffer on recv or not // Determine whether we need to flush the GDR buffer on recv or not
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank)); NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : ""); req.useGdr ? "/GDRDMA" : "");
@ -171,12 +171,12 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
int proxyRank; int proxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
recv->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0; recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank)); NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : ""); req.useGdr ? "/GDRDMA" : "");
@ -221,7 +221,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
// We're on the same process as the proxy. We can pass a pointer to a struct. // We're on the same process as the proxy. We can pass a pointer to a struct.
struct collNetConnectArgs args = { rank, nranks, connectInfos }; struct collNetConnectArgs args = { rank, nranks, connectInfos };
struct connectMap* map; struct connectMap* map;
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
// If collnet connect failed, propagate error to fallback on regular p2p // If collnet connect failed, propagate error to fallback on regular p2p
if (map == NULL) return ncclSystemError; if (map == NULL) return ncclSystemError;
@ -247,7 +247,7 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
// We're on the same process as the proxy. We can pass a pointer to a struct. // We're on the same process as the proxy. We can pass a pointer to a struct.
struct collNetConnectArgs args = { rank, nranks, connectInfos }; struct collNetConnectArgs args = { rank, nranks, connectInfos };
struct connectMap* map; struct connectMap* map;
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
// If collnet connect failed, propagate error to fallback on regular p2p // If collnet connect failed, propagate error to fallback on regular p2p
if (map == NULL) return ncclSystemError; if (map == NULL) return ncclSystemError;
@ -410,7 +410,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
} }
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
@ -426,7 +426,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm)); NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
// Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller. // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; } if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
if (resources->collNetComm == NULL) { if (resources->collNetComm == NULL) {
*((struct connectMap**)respBuff) = NULL; *((struct connectMap**)respBuff) = NULL;
return ncclSuccess; return ncclSuccess;
@ -484,7 +484,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
} }
static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
struct recvResources* resources = (struct recvResources*)(connection->transportResources); struct recvResources* resources = (struct recvResources*)(connection->transportResources);
@ -494,7 +494,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm)); NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
// Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller. // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; } if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
if (resources->collNetComm == NULL) { if (resources->collNetComm == NULL) {
*((struct connectMap**)respBuff) = NULL; *((struct connectMap**)respBuff) = NULL;
return ncclSuccess; return ncclSuccess;
@ -553,7 +553,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
info->mhandles[p] = resources->mhandles[p]; info->mhandles[p] = resources->mhandles[p];
if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; } if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
*((struct connectMap**)respBuff) = &resources->map; *((struct connectMap**)respBuff) = &resources->map;
return ncclSuccess; return ncclSuccess;
} }


@ -172,13 +172,13 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
int proxyRank; int proxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0; send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
req.rank = myInfo->rank; req.rank = myInfo->rank;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank)); NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
req.remoteRank = peerInfo->rank; req.remoteRank = peerInfo->rank;
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
if (proxyRank == myInfo->rank) { if (proxyRank == myInfo->rank) {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev, INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
@ -218,8 +218,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
req.rank = myInfo->rank; req.rank = myInfo->rank;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank)); NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
req.remoteRank = peerInfo->rank; req.remoteRank = peerInfo->rank;
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev, INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
return ncclSuccess; return ncclSuccess;
@ -264,11 +263,28 @@ static ncclResult_t netDumpMap(struct connectMap* map) {
} }
static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers struct connectMap* map = (connectMap*) send->transportResources;
struct connectMap* map;
NCCLCHECK(ncclCalloc(&map, 1)); void* opId;
send->transportResources = map;
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), map, sizeof(struct connectMap))); // map isn't allocated thus this op hasn't been submitted yet
if (!map) {
// Setup device pointers
NCCLCHECK(ncclCalloc(&map, 1));
send->transportResources = map;
opId = send;
INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId);
NCCLCHECK(ncclProxyCallAsync(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId));
} else {
opId = send;
}
ncclResult_t ret;
NCCLCHECK(ret = ncclPollProxyResponse(&send->proxyConn, map, opId));
if (ret == ncclInProgress) {
return ret;
}
INFO(NCCL_PROXY, "sendConnect ncclPollProxyResponse opId=%p", opId);
if (map->sameProcess) { if (map->sameProcess) {
if (map->cudaDev != comm->cudaDev) { if (map->cudaDev != comm->cudaDev) {
@ -315,10 +331,26 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
/* Connect to this peer */ /* Connect to this peer */
static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
struct connectMap* map; struct connectMap* map = (connectMap*) recv->transportResources;
NCCLCHECK(ncclCalloc(&map, 1)); void* opId;
recv->transportResources = map; if (!map) {
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), map, sizeof(struct connectMap))); NCCLCHECK(ncclCalloc(&map, 1));
recv->transportResources = map;
// Use recv connector as unique identifier
opId = recv;
INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p",
opId, &recv->proxyConn, connectInfo);
NCCLCHECK(ncclProxyCallAsync(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId));
} else {
opId = recv;
}
ncclResult_t ret;
NCCLCHECK(ret = ncclPollProxyResponse(&recv->proxyConn, map, opId));
if (ret == ncclInProgress) {
return ret;
}
INFO(NCCL_PROXY, "recvConnect ncclPollProxyResponse opId=%p", opId);
//NCCLCHECK(netDumpMap(map)); //NCCLCHECK(netDumpMap(map));
struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
@ -490,12 +522,14 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm)); NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm));
*done = 1; *done = 1;
return ncclSuccess; return ncclSuccess;
} }
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct sendResources* resources = (struct sendResources*)(connection->transportResources); struct sendResources* resources = (struct sendResources*)(connection->transportResources);
if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError; if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
ncclResult_t ret = ncclSuccess;
if (resources->shared) { if (resources->shared) {
// Shared buffers // Shared buffers
@ -515,21 +549,22 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
} }
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank; struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId)); if (comms->sendComm[resources->channelId] == NULL) ret = ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId);
resources->netSendComm = comms->sendComm[resources->channelId]; resources->netSendComm = comms->sendComm[resources->channelId];
if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
} else { } else {
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm)); ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
} }
} else { } else {
// Connect to remote peer // Connect to remote peer
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm)); ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
connection->proxyAppendPtr = &connection->proxyAppend; connection->proxyAppendPtr = &connection->proxyAppend;
} }
NCCLCHECK(ret);
if (resources->netSendComm == NULL) { if (resources->netSendComm == NULL) {
*done = 0; *done = 0;
return ncclSuccess; return ncclInProgress;
} }
*done = 1; *done = 1;
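On the proxy side, sendProxyConnect now treats an unfinished ncclNetConnect as progress rather than completion: it leaves *done at 0 and returns ncclInProgress so it can be called again, and only reports *done = 1 once netSendComm is established. A sketch of how such an op would be re-driven, assuming the proxy service loop retries operations that report ncclInProgress (the arguments are the same ones the op received):

// Hedged sketch (assumption about the proxy loop, which is not shown here):
int done = 0;
ncclResult_t ret;
do {
  ret = sendProxyConnect(connection, comm, reqBuff, reqSize, respBuff, respSize, &done);
} while (ret == ncclInProgress && done == 0);   // retry until the net connect lands
NCCLCHECK(ret);

recvProxyConnect below adopts the same contract around ncclNetAccept.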
@ -630,6 +665,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
if (reqSize != sizeof(int)) return ncclInternalError; if (reqSize != sizeof(int)) return ncclInternalError;
struct recvResources* resources = (struct recvResources*)(connection->transportResources); struct recvResources* resources = (struct recvResources*)(connection->transportResources);
resources->proxyRank = *(int*)reqBuff; resources->proxyRank = *(int*)reqBuff;
ncclResult_t ret = ncclSuccess;
// Finish connection establishment from remote peer // Finish connection establishment from remote peer
if (resources->shared) { if (resources->shared) {
@ -650,23 +686,25 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
} }
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank; struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId)); if (comms->recvComm[resources->channelId] == NULL) ret = ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId);
resources->netRecvComm = comms->recvComm[resources->channelId]; resources->netRecvComm = comms->recvComm[resources->channelId];
if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
} else { } else {
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm)); ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
} }
} else { } else {
// Connect to remote peer // Connect to remote peer
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm)); ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
connection->proxyAppendPtr = &connection->proxyAppend; connection->proxyAppendPtr = &connection->proxyAppend;
} }
NCCLCHECK(ret);
if (resources->netRecvComm == NULL) { if (resources->netRecvComm == NULL) {
*done = 0; *done = 0;
return ncclSuccess; return ncclInProgress;
} }
*done = 1; *done = 1;
NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm)); NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm));
// Create structures // Create structures

@ -363,7 +363,9 @@ enum ncclIbCommState {
ncclIbCommStateAccept = 3, ncclIbCommStateAccept = 3,
ncclIbCommStateSend = 4, ncclIbCommStateSend = 4,
ncclIbCommStateRecv = 5, ncclIbCommStateRecv = 5,
ncclIbCommStateConnected = 6, ncclIbCommStateConnecting = 6,
ncclIbCommStateConnected = 7,
ncclIbCommStatePendingReady = 8,
}; };
struct ncclIbCommStage { struct ncclIbCommStage {
@ -599,8 +601,10 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
int ready; int ready;
*sendComm = NULL; *sendComm = NULL;
if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
if (stage->state == ncclIbCommStateSend) goto ib_send; if (stage->state == ncclIbCommStateSend) goto ib_send;
if (stage->state == ncclIbCommStateConnecting) goto ib_connect;
if (stage->state == ncclIbCommStateConnected) goto ib_send_ready;
if (stage->state != ncclIbCommStateStart) { if (stage->state != ncclIbCommStateStart) {
WARN("Error: trying to connect already connected sendComm"); WARN("Error: trying to connect already connected sendComm");
return ncclInternalError; return ncclInternalError;
@ -664,11 +668,37 @@ ib_connect_check:
ib_send: ib_send:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset)); NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset));
if (stage->offset != sizeof(qpInfo)) if (stage->offset != sizeof(qpInfo)) return ncclSuccess;
return ncclSuccess;
stage->state = ncclIbCommStateConnecting;
stage->offset = 0;
// Clear the staging buffer for re-use
memset(stage->buffer, 0, sizeof(qpInfo));
ib_connect:
struct ncclIbQpInfo remQpInfo;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, stage->buffer, sizeof(ncclIbQpInfo), &stage->offset));
if (stage->offset != sizeof(remQpInfo)) return ncclSuccess;
memcpy(&remQpInfo, stage->buffer, sizeof(ncclIbQpInfo));
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
NCCLCHECK(ncclIbRtsQp(qp));
}
comm->ready = 1;
stage->state = ncclIbCommStateConnected;
stage->offset = 0;
ib_send_ready:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, &comm->ready, sizeof(int), &stage->offset));
if (stage->offset != sizeof(int)) return ncclSuccess;
free(stage->buffer); free(stage->buffer);
stage->state = ncclIbCommStateConnected; stage->state = ncclIbCommStateStart;
*sendComm = comm; *sendComm = comm;
return ncclSuccess; return ncclSuccess;
} }
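ncclIbConnect is now a fully non-blocking state machine: Start, then Connect, then Send (local QP info), then Connecting (receive the remote QP info and move the QPs to RTR/RTS), then Connected (send the ready flag), and back to Start once *sendComm is published. A short sketch of how a caller might drive it, assuming it is simply re-invoked until a sendComm is returned:

// Hedged sketch: resuming the connect state machine until it completes.
// 'dev' and 'handle' are the arguments already passed on the first attempt.
void* sendComm = NULL;
do {
  NCCLCHECK(ncclIbConnect(dev, handle, &sendComm));  // resumes from stage->state
} while (sendComm == NULL);                          // NULL means "come back later"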
@ -685,8 +715,9 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
if (stage->state == ncclIbCommStateAccept) goto ib_accept_check; if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
if (stage->state == ncclIbCommStateRecv) goto ib_recv; if (stage->state == ncclIbCommStateRecv) goto ib_recv;
if (stage->state == ncclIbCommStateSend) goto ib_send; if (stage->state == ncclIbCommStateSend) goto ib_send;
if (stage->state == ncclIbCommStatePendingReady) goto ib_recv_ready;
if (stage->state != ncclIbCommStateStart) { if (stage->state != ncclIbCommStateStart) {
WARN("Listencomm in unknown state %d\n", stage->state); WARN("Listencomm in unknown state %d", stage->state);
return ncclInternalError; return ncclInternalError;
} }
@ -704,10 +735,10 @@ ib_accept_check:
stage->state = ncclIbCommStateRecv; stage->state = ncclIbCommStateRecv;
stage->offset = 0; stage->offset = 0;
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo))); NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo)));
ib_recv: ib_recv:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset)); NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset));
if (stage->offset != sizeof(remQpInfo)) if (stage->offset != sizeof(remQpInfo)) return ncclSuccess;
return ncclSuccess;
/* copy back the received info */ /* copy back the received info */
memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo)); memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo));
@ -780,10 +811,18 @@ ib_recv:
if (stage->buffer) free(stage->buffer); if (stage->buffer) free(stage->buffer);
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo))); NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo)));
memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo)); memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo));
ib_send: ib_send:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset)); NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset));
if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess; if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess;
stage->offset = 0;
stage->state = ncclIbCommStatePendingReady;
ib_recv_ready:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, &rComm->ready, sizeof(int), &stage->offset));
if (stage->offset != sizeof(int)) return ncclSuccess;
free(stage->buffer); free(stage->buffer);
*recvComm = rComm; *recvComm = rComm;
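The accept side mirrors this: after sending its own QP info it parks in ncclIbCommStatePendingReady and waits for the peer's ready flag, so *recvComm also stays NULL until the whole handshake has finished. Driving it looks the same, under the same assumption:

// Hedged sketch: re-drive accept until the sender's ready flag arrives.
void* recvComm = NULL;
do {
  NCCLCHECK(ncclIbAccept(listenComm, &recvComm));
} while (recvComm == NULL);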
@ -815,36 +854,6 @@ ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
return ncclSuccess; return ncclSuccess;
} }
ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
struct ncclIbQpInfo remQpInfo;
// Do not block on this receive, return if not ready.
int bytes = 0;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
if (bytes == 0) return ncclSuccess; // Try again later
NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
NCCLCHECK(ncclIbRtsQp(qp));
}
comm->ready = 1;
// Block until this is done. It *should* not block indefinitely.
NCCLCHECK(ncclSocketSend(&comm->sock, &comm->ready, sizeof(int)));
return ncclSuccess;
}
ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {
// Do not block on this receive, return if not ready.
int bytes = 0;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
if (bytes == 0) return ncclSuccess; // Try again later
NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
return ncclSuccess;
}
ncclResult_t ncclIbTest(void* request, int* done, int* size); ncclResult_t ncclIbTest(void* request, int* done, int* size);
/* DMA-BUF support */ /* DMA-BUF support */
@ -1020,7 +1029,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm)); if (comm->ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->ready == 0"); return ncclInternalError; }
if (comm->ready == 0) { *request = NULL; return ncclSuccess; } if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
struct ibv_mr* mr = (struct ibv_mr*)mhandle; struct ibv_mr* mr = (struct ibv_mr*)mhandle;
@ -1153,7 +1162,7 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int
ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm)); if (comm->ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->ready == 0"); return ncclInternalError; }
if (comm->ready == 0) { *request = NULL; return ncclSuccess; } if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
src/transport/nvls.cc Normal file (373 lines added)
/*************************************************************************
* Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
// Implementation of the NVLink SHARP (NVLS) transport
#include "comm.h"
#include "graph.h"
#include "utils.h"
#include "proxy.h"
#if CUDART_VERSION >= 12010
// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange
#define USE_POSIX_FD 1
#if USE_POSIX_FD
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
#else
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE
#endif
ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
// This transport cannot be used for p2p
*ret = 0;
return ncclSuccess;
}
ncclResult_t nvlsSendFree(struct ncclConnector* send) {
return ncclSuccess;
}
ncclResult_t nvlsRecvFree(struct ncclConnector* recv) {
return ncclSuccess;
}
struct ncclTransport nvlsTransport = {
"NVLS",
nvlsCanConnect,
{ NULL, NULL, nvlsSendFree, NULL, NULL, NULL, NULL, NULL },
{ NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL }
};
#define NVLS_HANDLE_SIZE 64
struct nvlsResources {
CUmulticastObjectProp properties;
CUmemAccessDesc accessDesc;
int dev;
size_t size;
size_t granularity;
CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
char* mcBuff; // Multicast NVLS buffer address
CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
char* ucBuff; // Unicast NVLS buffer address
};
ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct nvlsResources* resources, int dev, int nranks, size_t size) {
CUmulticastObjectProp* prop = &resources->properties;
memset(prop, 0, sizeof(*prop));
prop->size = size;
prop->numDevices = nranks;
prop->handleTypes = NVLS_CU_MEM_HANDLE_TYPE;
prop->flags = 0;
// Could be changed to CU_MULTICAST_GRANULARITY_MINIMUM when 3418538 resolved
CUCHECK(cuMulticastGetGranularity(&resources->granularity, prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
ALIGN_SIZE(size, resources->granularity);
prop->size = resources->size = size;
memset(&resources->accessDesc, 0, sizeof(resources->accessDesc));
resources->accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
resources->accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
resources->accessDesc.location.id = dev;
resources->dev = dev;
return ncclSuccess;
}
ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct nvlsResources* resources, int rank, unsigned int nranks, char* shareableHandle) {
size_t size = resources->size;
// Create a Multicast group
CUmulticastObjectProp* prop = &resources->properties;
INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zi on rank %d", nranks, size, rank);
CUCHECK(cuMulticastCreate(&resources->mcHandle, prop));
if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) {
// Get a handle to pass to other ranks
CUCHECK(cuMemExportToShareableHandle(shareableHandle, resources->mcHandle, NVLS_CU_MEM_HANDLE_TYPE, 0));
}
else {
memcpy(shareableHandle, &resources->mcHandle, sizeof(resources->mcHandle));
}
INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zi on rank %d", resources->mcHandle, nranks, size, rank);
return ncclSuccess;
}
ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct nvlsResources* resources) {
INFO(NCCL_NVLS, "NVLS group %llx adding dev %d", resources->mcHandle, resources->dev);
CUCHECK(cuMulticastAddDevice(resources->mcHandle, resources->dev));
return ncclSuccess;
}
ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct nvlsResources* resources) {
int dev = resources->dev;
size_t size = resources->size;
INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev);
// Unbind physical memory from group for the given device
CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size));
return ncclSuccess;
}
ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resources, int rank, char* shareableHandle) {
CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;
INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
// Import and map the remote memory descriptor to the local GPU
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
// cuMem UDS support
int fd = *(int *)shareableHandle;
TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle from rank %d fd %d", comm->localRank, rank, fd);
struct ncclProxyConnector proxyConn;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, rank, &proxyConn));
TRACE(NCCL_NVLS, "NVLS rank %d request conversion of fd %d from rank %d", comm->localRank, fd, rank);
NCCLCHECK(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgConvertFd, shareableHandle, sizeof(int), &fd, sizeof(int)));
TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)(uintptr_t)fd, type));
} else {
if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) {
CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)shareableHandle, type));
} else {
memcpy(&resources->mcHandle, shareableHandle, sizeof(resources->mcHandle));
}
}
return ncclSuccess;
}
ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct nvlsResources* resources) {
size_t size = resources->size;
size_t granularity;
CUdeviceptr ptr = 0;
CUmemAllocationProp prop;
memset(&prop, 0, sizeof(prop));
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = resources->dev;
prop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE;
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
// Map a VA for UC memory
CUCHECK(cuMemAddressReserve(&ptr, size, granularity, 0U, 0));
// Alloc local physical mem for this NVLS group
CUCHECK(cuMemCreate(&resources->ucHandle, size, &prop, 0));
CUCHECK(cuMemMap(ptr, size, 0, resources->ucHandle, 0));
CUCHECK(cuMemSetAccess(ptr, size, &resources->accessDesc, 1));
CUDACHECK(cudaMemset((void*)ptr, 0, size));
resources->ucBuff = (char*)ptr;
INFO(NCCL_NVLS, "NVLS Mapped UC at %p size %zi", resources->ucBuff, size);
// Bind physical memory to the Multicast group
// NB: It will block until all ranks have been added to the Group
INFO(NCCL_NVLS, "NVLS Bind mem %p UC handle 0x%llx MC handle 0x%llx size %zi", (void*)ptr, resources->ucHandle, resources->mcHandle, size);
CUCHECK(cuMulticastBindMem(resources->mcHandle, 0/*mcOffset*/, resources->ucHandle, 0/*memOffset*/, size, 0/*flags*/));
return ncclSuccess;
}
ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct nvlsResources* resources) {
size_t size = resources->size;
CUdeviceptr ptr = 0;
// Create a VA for the NVLS
CUCHECK(cuMemAddressReserve(&ptr, size, resources->granularity, 0U, 0));
// Map the VA locally
CUCHECK(cuMemMap(ptr, size, 0, resources->mcHandle, 0));
resources->mcBuff = (char*)ptr;
INFO(NCCL_NVLS, "NVLS Mapped MC buffer at %p size %zi", resources->mcBuff, size);
// Having completed the BindMem we can now call SetAccess
// NB: It will block until all ranks have bound to the Group
CUCHECK(cuMemSetAccess((CUdeviceptr)resources->mcBuff, size, &resources->accessDesc, 1));
return ncclSuccess;
}
ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct nvlsResources* resources) {
size_t size;
CUdeviceptr ptr;
INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)",
resources->ucHandle, resources->ucBuff, resources->mcHandle, resources->mcBuff);
// Release the UC memory and mapping
ptr = (CUdeviceptr)resources->ucBuff;
size = resources->size;
CUCHECK(cuMemUnmap(ptr, size));
CUCHECK(cuMemAddressFree(ptr, size));
CUCHECK(cuMemRelease(resources->ucHandle));
// Release the MC memory and mapping
ptr = (CUdeviceptr)resources->mcBuff;
size = resources->size;
CUCHECK(cuMemUnmap(ptr, size));
CUCHECK(cuMemAddressFree(ptr, size));
CUCHECK(cuMemRelease(resources->mcHandle));
return ncclSuccess;
}
#include "bootstrap.h"
#include "channel.h"
#define NVLS_MEM_ALIGN_SIZE (1 << 21)
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16);
NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 1);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
if (!ncclParamNvlsEnable() || comm->localRanks <= 1 || comm->nNodes>1) return ncclSuccess;
CUdevice dev;
int driverVersion;
if (CUPFN(cuDeviceGet) == NULL) return ncclSuccess;
CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
CUDACHECK(cudaDriverGetVersion(&driverVersion));
comm->nvlsSupport = 0;
// NVLS Multicast support requires CUDA12.1 UMD + KMD
if (CUPFN(cuMulticastCreate) != NULL && driverVersion >= 12010) {
CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
}
INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev);
if (comm->nvlsSupport == 0) return ncclSuccess;
int nChannels = comm->nvlsChannels = std::max(comm->minCTAs, std::min(comm->maxCTAs, (int)ncclParamNvlsChannels()));
int rank = comm->localRank, nranks = comm->localRanks;
for (int c=0; c<nChannels; c++) {
NCCLCHECK(initChannel(comm, c));
}
ncclResult_t res = ncclSuccess;
struct nvlsResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
comm->nvlsResources = resources;
size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
size_t memSize = NVLS_MEM_ALIGN_SIZE;
size_t nvlsPerRankSize = nChannels*2*(buffSize+memSize);
size_t nvlsTotalSize = nvlsPerRankSize*nranks;
INFO(NCCL_INIT|NCCL_NVLS, "NVLS comm %p rank %d nranks %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi",
comm, rank, nranks, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize);
char* nvlsShareableHandle = NULL;
NCCLCHECKGOTO(ncclCalloc(&nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, nranks, nvlsTotalSize), res, cleanup);
if (rank == 0) {
NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, rank, nranks, nvlsShareableHandle), res, cleanup);
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
} else {
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, 0, nvlsShareableHandle), res, cleanup);
}
NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup);
// Local intra-node barrier to ensure everyone has bound their memory to the group
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup);
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.nHeads = nranks;
for (int i=0; i<NCCL_MAX_NVLS_ARITY; i++) channel->nvls.up[i] = -1;
channel->nvls.down = comm->nRanks+1+comm->localRank;
channel->nvls.out = -1; // Network not yet implemented.
channel->nvls.headRank = comm->localRank; // Network not yet implemented.
}
for (int r=0; r<nranks; r++) {
int nvlsPeer = comm->nRanks+1+r;
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.up[r] = nvlsPeer;
char* mem = NULL;
struct ncclChannelPeer* peer = channel->peers+nvlsPeer;
// Reduce UC -> MC
mem = resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize);
peer->send[0].transportComm = &nvlsTransport.send;
peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[0].conn.head = (uint64_t*)(mem+buffSize);
peer->send[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
mem = resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize);
peer->recv[1].transportComm = &nvlsTransport.recv;
peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[1].conn.head = (uint64_t*)(mem+buffSize);
peer->recv[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
peer->recv[1].conn.flags |= NCCL_NVLS_MIN_POLL;
// Broadcast MC -> UC
mem = resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
peer->recv[0].transportComm = &nvlsTransport.recv;
peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[0].conn.head = (uint64_t*)(mem+buffSize);
peer->recv[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
mem = resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
peer->send[1].transportComm = &nvlsTransport.send;
peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[1].conn.head = (uint64_t*)(mem+buffSize);
peer->send[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
peer->send[1].conn.flags |= NCCL_NVLS_MIN_POLL;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
/*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p",
nvlsPeer, c,
resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize),
resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize),
resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize),
resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize));*/
}
}
free(nvlsShareableHandle);
return res;
cleanup:
comm->nvlsSupport = 0;
free(nvlsShareableHandle);
return res;
}
ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
struct nvlsResources* resources = (struct nvlsResources*)comm->nvlsResources;
if (resources == NULL) return ncclSuccess;
NCCLCHECK(nvlsGroupUnbind(comm, resources));
NCCLCHECK(nvlsGroupUnmapMem(comm, resources));
free(resources);
comm->nvlsResources = NULL;
return ncclSuccess;
}
#else
/*
* Pre CUDA 12.1 stubs
*/
ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
return ncclSuccess;
}
ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
return ncclSuccess;
}
#endif /* CUDA_VERSION >= 12010 */
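ncclNvlsSetup carves the unicast (UC) and multicast (MC) mappings of the same physical memory into per-rank, per-channel slots: each slot is buffSize+memSize bytes, slot index r*2*nChannels+c backs the reduce path (send[0]/recv[1]) and ((r*2+1)*nChannels+c) backs the broadcast path (recv[0]/send[1]), with head at offset buffSize and tail at buffSize+memSize/2 inside the trailing memSize region. A small sketch of that address arithmetic, reusing the names from the code above:

// Hedged sketch of the NVLS slot layout computed in ncclNvlsSetup().
size_t slotSize  = buffSize + memSize;                              // one FIFO plus its head/tail area
size_t reduceOff = (size_t)(r*2*nChannels + c) * slotSize;          // peer->send[0] (UC) / peer->recv[1] (MC)
size_t bcastOff  = (size_t)((r*2 + 1)*nChannels + c) * slotSize;    // peer->recv[0] (UC) / peer->send[1] (MC)
char* reduceUc = resources->ucBuff + reduceOff;                     // unicast mapping of the reduce slot
char* reduceMc = resources->mcBuff + reduceOff;                     // multicast mapping of the same slot
uint64_t* reduceHead = (uint64_t*)(reduceUc + buffSize);
uint64_t* reduceTail = (uint64_t*)(reduceUc + buffSize + memSize/2);
// nvlsPerRankSize = nChannels * 2 * slotSize, and the group allocates that for every
// local rank, which is where nvlsTotalSize = nvlsPerRankSize * nranks comes from.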
@ -239,11 +239,11 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) { if (intermediateRank == -1) {
info->rank = myInfo->rank; info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) { if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; if (ncclParamP2pDirectDisable() == 0) send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
} else { } else {
send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s", INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : ""); channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "");
} }
@ -256,11 +256,11 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
if (useMemcpy) { if (useMemcpy) {
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo))); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
info->shmSize = resources->proxyInfo.shmSize; info->shmSize = resources->proxyInfo.shmSize;
memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName)); memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
} else { } else {
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc)); NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
} }
@ -290,16 +290,16 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) { if (intermediateRank == -1) {
info->rank = myInfo->rank; info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) { if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; if (ncclParamP2pDirectDisable() == 0) recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else { } else {
recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; recv->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
} }
} else { } else {
info->rank = intermediateRank; info->rank = intermediateRank;
} }
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc)); NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc));
return ncclSuccess; return ncclSuccess;
@ -330,7 +330,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo; send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
send->conn.head = &resources->proxyInfo.devShm->sendMem.head; send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
// Send SIMPLE buff to proxy, and replace it by local buffer // Send SIMPLE buff to proxy, and replace it by local buffer
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0)); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff; send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff;
} else { } else {
send->conn.tail = &remDevMem->tail; send->conn.tail = &remDevMem->tail;
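The P2P setup and connect paths (and the SHM ones below) keep a synchronous flavor but switch to ncclProxyCallBlocking, which presumably amounts to issuing the async call and polling the response in place; its actual body is not part of this excerpt. A sketch of what such a wrapper could look like under that assumption:

// Hedged sketch: what ncclProxyCallBlocking is assumed to do (hypothetical helper).
static ncclResult_t proxyCallBlockingSketch(struct ncclProxyConnector* proxyConn, int type,
                                            void* reqBuff, int reqSize, void* respBuff, int respSize) {
  void* opId = proxyConn;                          // hypothetical unique op identifier
  NCCLCHECK(ncclProxyCallAsync(proxyConn, type, reqBuff, reqSize, respSize, opId));
  ncclResult_t ret;
  do {
    ret = ncclPollProxyResponse(proxyConn, respBuff, opId);
  } while (ret == ncclInProgress);                 // spin until the proxy thread answers
  return ret;
}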
@ -157,7 +157,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
if (useMemcpySend) { if (useMemcpySend) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem }; struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
send->conn.tail = &proxyInfo.ceRecvMem->tail; send->conn.tail = &proxyInfo.ceRecvMem->tail;
send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo; send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
@ -187,7 +187,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
if (useMemcpyRecv) { if (useMemcpyRecv) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn)); NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem }; struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
recv->conn.tail = &proxyInfo.ceRecvMem->tail; recv->conn.tail = &proxyInfo.ceRecvMem->tail;
} }