2.17.1-1
Add a new NVLS algorithm for allreduce using NVLink SHARP (intra-node only). Add new config options: cgaClusterSize, minCTAs, maxCTAs, netName. Enable LL128 when PXN is used to close rings. Update the NVTX3 includes. Fix a crash when one CollNet (SHARP) rail fails to initialize.
parent f3d5166783
commit 5d3ab08b69
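The new communicator options named above (cgaClusterSize, minCTAs, maxCTAs, netName) are fields of ncclConfig_t. A minimal sketch of setting them at communicator creation, assuming the NCCL >= 2.17 public API; the values are illustrative:

#include <nccl.h>

// Sketch: tune a communicator through the new config fields.
ncclResult_t initTunedComm(ncclComm_t* comm, int nranks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.cgaClusterSize = 4;   // CUDA cooperative group array (CGA) cluster size
  config.minCTAs = 16;         // lower bound on CTAs (channels) per collective
  config.maxCTAs = 32;         // upper bound on CTAs per collective
  config.netName = "Socket";   // select a network module by name
  return ncclCommInitRankConfig(comm, nranks, id, rank, &config);
}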
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
-NCCL_MINOR := 16
-NCCL_PATCH := 5
+NCCL_MINOR := 17
+NCCL_PATCH := 1
 NCCL_SUFFIX :=
 PKG_REVISION := 1
@@ -12,7 +12,8 @@ INCEXPORTS := nccl.h nccl_net.h
 LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
   misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
   misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
-  transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
+  misc/ipcsocket.cc \
+  transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \
   collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
   graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc

@@ -62,7 +63,7 @@ ALWAYS_REBUILD:
 -include $(DEPFILES)
 $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)

-$(INCDIR)/nccl.h : nccl.h.in
+$(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk
 # NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z))
 @$(eval NCCL_VERSION := $(shell printf "%d%02d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
 mkdir -p $(INCDIR)
@@ -386,6 +386,24 @@ ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank,
   return ncclSuccess;
 }

+// IntraNode in-place Broadcast
+ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size) {
+  if (nranks == 1) return ncclSuccess;
+  TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - ENTER", rank, nranks, root, size);
+
+  if (rank == root) {
+    for (int i=0; i<nranks; i++) {
+      if (i != root) NCCLCHECK(bootstrapSend(commState, ranks[i], /*tag=*/ranks[i], bcastData, size));
+    }
+  }
+  else {
+    NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/rank, bcastData, size));
+  }
+
+  TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size);
+  return ncclSuccess;
+}
+
 ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
   // New unex
   struct unexConn* unex;
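bootstrapIntraNodeBroadcast is a plain linear fan-out over the bootstrap socket layer: the root posts one send per node-local peer and everyone else posts a single receive, so a node with n ranks costs n-1 messages. A sketch of a call site; the real caller sits in the suppressed part of this diff, and the payload type and helper name here are hypothetical:

// Hypothetical helper: rank 0 of the node shares a small setup blob
// (for example an NVLS handle) with its node-local peers, in place.
struct setupBlob { char data[64]; };
static ncclResult_t shareBlob(void* commState, int* localRanks, int localRank,
                              int nLocalRanks, struct setupBlob* blob) {
  NCCLCHECK(bootstrapIntraNodeBroadcast(commState, localRanks, localRank,
                                        nLocalRanks, /*root=*/0, blob, sizeof(*blob)));
  return ncclSuccess;
}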
@@ -13,14 +13,15 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
   if (channel->id != -1) return ncclSuccess;

   int nRanks = comm->nRanks;
+  int nPeers = nRanks + 1 /* Collnet */ + comm->localRanks /* NVLS */;
   channel->id = channelId;
   channel->workFifoSent = 0;

   NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream));

   // The extra on nRanks+1 is for collnet root (i.e. network)
-  channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nRanks+1);
-  NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nRanks+1, comm->deviceStream.cudaStream));
+  channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nPeers);
+  NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, comm->deviceStream.cudaStream));
   ncclCommPushCudaFree(comm, channel->devPeers);

   channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
@@ -29,7 +30,7 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {

   NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));

-  for (int r=0; r < nRanks+1; ++r) {
+  for (int r=0; r < nPeers; ++r) {
     for (int b=0; b < NCCL_MAX_CONNS; b++) {
       channel->peers[r].send[b].comm = comm;
       channel->peers[r].recv[b].comm = comm;
@@ -97,3 +97,45 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL
     runRing<T, RedOp, ProtoLL128>(args);
   }
 };
+
+template<typename T, typename RedOp>
+struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
+    const int tid = threadIdx.x;
+    const int bid = args->bid;
+    const int nChannels = args->nChannels;
+    struct ncclNvls* nvls = &ncclShmem.channel.nvls;
+    const ssize_t chunkSize = int(args->lastChunkSize);
+    const ssize_t size = args->count;
+    const ssize_t loopSize = nChannels*chunkSize;
+
+    const int nThreadsGather = 128;
+    const int nThreadsBcast = 384 + WARP_SIZE;
+    const int tidEndGather = nThreadsGather;
+    const int tidEndBcast = tidEndGather + nThreadsBcast;
+
+    using Proto = ProtoSimple<1, 1>;
+
+    if (tid < tidEndGather) {
+      // Gather
+      int group = (0*Proto::MaxGroupWidth) | (0<<16);
+      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
+        prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0);
+      }
+    } else if (tid < tidEndBcast) {
+      int group = (3*Proto::MaxGroupWidth) | (1<<16);
+      // Bcast through MC
+      Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
+        prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, args->redOpArg, group, args);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        prims.send(offset, nelem);
+      }
+    }
+  }
+};
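In the NVLS all-gather kernel added above, each thread block splits into two fixed-size groups. A summary of the arithmetic, with WARP_SIZE = 32:

// Thread-role split per channel (derived from the constants above):
//   threads [0, 128)    gather: read every peer's contribution from the
//                       NVLS buffers (nvls->up) into the local recvbuff
//   threads [128, 544)  bcast: 384 + 32 = 416 threads push the local
//                       sendbuff into the multicast buffer via nvls->down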
@@ -306,9 +306,9 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
         ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
         int nelem = min(direct->nHeads*chunkSize, size-offset);
         if (args->regUsed) {
-          prims.directScatter(offset, nelem, chunkSize, direct->headRank, direct->shift);
+          prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
         } else {
-          prims.scatter(offset, nelem, chunkSize, direct->headRank, direct->shift);
+          prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
         }
       }
     } else if (tid >= tidStartReduce && direct->out != -1) {
@@ -344,7 +344,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
         int nelem = min(direct->nHeads*chunkSize, size-offset);
-        prims.directGather(offset, nelem, chunkSize, direct->headRank, direct->shift);
+        prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
       }
     } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
       int group = (1*Proto::MaxGroupWidth) | (0<<16);
@@ -371,6 +371,65 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
   }
 };

+template<typename T, typename RedOp>
+struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
+#if NCCL_NVLS_ENABLED
+    const int tid = threadIdx.x;
+    const int bid = args->bid;
+    const int nChannels = args->nChannels;
+    struct ncclNvls* nvls = &ncclShmem.channel.nvls;
+    const ssize_t chunkSize = int(args->lastChunkSize);
+    const ssize_t size = args->count;
+    const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
+    const int nranks = ncclShmem.comm.nRanks;
+    const int reduceWarps = nranks <= 6 ? 6 : 4;
+    const int copyWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps)/2;
+
+    const int nThreadsScatter = copyWarps*WARP_SIZE;
+    const int nThreadsGather = (copyWarps-1)*WARP_SIZE;
+    const int nThreadsReduce = (reduceWarps+1)*WARP_SIZE;
+    const int tidEndScatter = nThreadsScatter;
+    const int tidEndGather = tidEndScatter + nThreadsGather;
+    const int tidEndReduce = tidEndGather + nThreadsReduce;
+
+    using Proto = ProtoSimple<1, 1, COLL_UNROLL, /*NVLS=*/true>;
+
+    if (tid < tidEndScatter) {
+      // Scatter
+      int group = (0*Proto::MaxGroupWidth) | (0<<16);
+      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
+        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
+        int nelem = min(nvls->nHeads*chunkSize, size-offset);
+        prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
+      }
+    } else if (tid < tidEndGather) {
+      // Gather
+      int group = (2*Proto::MaxGroupWidth) | (0<<16);
+      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
+        prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
+        int nelem = min(nvls->nHeads*chunkSize, size-offset);
+        prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
+      }
+    } else if (tid < tidEndReduce) {
+      int group = (3*Proto::MaxGroupWidth) | (1<<16);
+      // Reduce, broadcast through NVLS
+      Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
+        prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        prims.recvSend(nelem);
+      }
+    }
+#endif // NCCL_NVLS_ENABLED
+  }
+};
+
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE> {
   __device__ __forceinline__ void run(ncclWorkElem *args) {
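A worked example of the warp budget in the NVLS all-reduce above, assuming NCCL_MAX_NTHREADS = 640 and WARP_SIZE = 32 in this tree (20 warps per block), for nranks <= 6:

// reduceWarps = 6, copyWarps = (20 - 6)/2 = 7
//   scatter : copyWarps*32       = 224 threads  (sendbuff -> nvls->up staging)
//   gather  : (copyWarps-1)*32   = 192 threads  (staging -> recvbuff)
//   reduce  : (reduceWarps+1)*32 = 224 threads  (prims.recvSend over nvls->down:
//                                  multimem load-reduce plus multicast store)
// 224 + 192 + 224 = 640, so the whole block is accounted for.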
@@ -11,31 +11,23 @@
 #include "devcomm.h"
 #include "op128.h"

-#if __CUDA_ARCH__ >= 800
-#define COLL_UNROLL 8
-#else
-#define COLL_UNROLL 4
-#endif
+#define COLL_UNROLL (ncclCollUnroll())

 #define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1)  // Using balanced tree instead of split tree

 typedef void(*ncclKern_t)();
 extern __device__ ncclKern_t ncclFuncs[];

 struct ncclShmemGroup {
-  ncclConnInfo *recvConns[NCCL_MAX_DIRECT_ARITY];
-  ncclConnInfo *sendConns[NCCL_MAX_DIRECT_ARITY];
-  void* srcs[NCCL_MAX_DIRECT_ARITY+1];
-  void* dsts[NCCL_MAX_DIRECT_ARITY+1];
-  int totalSendSize[NCCL_MAX_SLICE_PER_CHUNK];
+  ncclConnInfo *recvConns[NCCL_MAX_NVLS_ARITY];
+  ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
+  void* srcs[NCCL_MAX_NVLS_ARITY+1];
+  void* dsts[NCCL_MAX_NVLS_ARITY+1];
+  int nvlsRecv;
 };

 struct ncclShmemData {
-  union {
-    uint64_t ll128warp[NCCL_LL128_MAX_NTHREADS/WARP_SIZE][NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE];
   struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
-  };
-  uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
+  uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1];
   int channelId;
   int aborted;
   alignas(16) struct ncclDevComm comm;
@@ -45,6 +37,15 @@ struct ncclShmemData {
 static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");

 extern __shared__ ncclShmemData ncclShmem;
+#if __CUDA_ARCH__ >= 700
+  extern __shared__ ulong2 ncclShmemPerWarp[/*ncclShmemDynamicSize()/sizeof(ulong2)*/];
+#else
+  extern __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
+#endif
+
+__device__ inline void* ncclScratchForWarp(int warp) {
+  return (char*)ncclShmemPerWarp + warp*ncclShmemScratchWarpSize();
+}

 __device__ inline bool barrierReduceAny(int bit) {
   uint32_t popc;
@@ -235,7 +236,8 @@ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
   IMPL_COLL4(func, TREE, devredop, type, ncclType) \
   IMPL_COLL4(func, RING, devredop, type, ncclType) \
   IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \
-  IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType)
+  IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType) \
+  IMPL_COLL4(func, NVLS, devredop, type, ncclType)

 #if NCCL_TYPE == 0
 #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int8_t, ncclInt8)
@@ -291,4 +293,6 @@ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
 #define IMPL_COLL_P(func)
 #endif

+#define NCCL_NVLS_ENABLED (__CUDA_ARCH__ >= 900 && NCCL_NVLS_SUPPORTS(NCCL_TYPE, NCCL_OP))
+
 #endif
(File diff suppressed because it is too large.)
@@ -9,6 +9,9 @@
 #include "common.h"

 __shared__ ncclShmemData ncclShmem;
+#if __CUDA_ARCH__ < 700
+  __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
+#endif

 #define NCCL_FUNC5(func, algo, devredop, type, nullify) \
   MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
@@ -19,7 +22,8 @@ __shared__ ncclShmemData ncclShmem;
   NCCL_FUNC5(func, TREE, devredop, type, nullify), \
   NCCL_FUNC5(func, RING, devredop, type, nullify), \
   NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
-  NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify)
+  NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \
+  NCCL_FUNC5(func, NVLS, devredop, type, nullify)

 #if defined(__CUDA_BF16_TYPES_EXIST__)
 // Must be consistent with ncclDataType_t
@@ -6,7 +6,7 @@

 #include "devcomm.h"
 #include "collectives.h"
-#include "reduce_kernel.h"
+#include "common_kernel.h"
 #include "common.h"

 namespace {
@@ -35,8 +35,10 @@ namespace {
       i1 = i1 < eltN ? i1 : eltN;
       src += i0;
       dst += i0;
-      ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 1>
-        (tid, tn, &(we->redOpArg), true, 1, &src, 1, &dst, i1-i0);
+      void *vsrc = (void*)src;
+      void *vdst = (void*)dst;
+      ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/1>
+        (tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0);
     }
   }
 }
@@ -65,4 +65,290 @@ inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1
   v1 = tmp8[1];
 }

+
+template<typename T>
+__device__ __forceinline__ uint32_t cvta_to_shared(T* ptr) {
+  return (uint32_t)__cvta_generic_to_shared(ptr);
+}
+template<typename T>
+__device__ __forceinline__ uintptr_t cvta_to_global(T* ptr) {
+  return (uintptr_t)__cvta_generic_to_global(ptr);
+}
+
+template<typename T>
+__device__ __forceinline__ T* cvta_from_shared(uint32_t shptr) {
+  T* ans;
+  asm("cvta.shared.u64 %0, %1;" : "=l"(ans) : "l"(uint64_t(shptr)));
+  return ans;
+}
+template<typename T>
+__device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) {
+  T* ans;
+  asm("cvta.global.u64 %0, %1;" : "=l"(ans) : "l"(gptr));
+  return ans;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// BytePack<Size>: struct of bytes.
+
+template<int Size>
+union BytePack;
+template<>
+union BytePack<1> {
+  uint8_t u8, native;
+};
+template<>
+union BytePack<2> {
+  BytePack<1> half[2];
+  uint8_t u8[2];
+  uint16_t u16, native;
+};
+template<>
+union BytePack<4> {
+  BytePack<2> half[2];
+  uint8_t u8[4];
+  uint16_t u16[2];
+  uint32_t u32, native;
+};
+template<>
+union BytePack<8> {
+  BytePack<4> half[2];
+  uint8_t u8[8];
+  uint16_t u16[4];
+  uint32_t u32[2];
+  uint64_t u64, native;
+};
+template<>
+union alignas(16) BytePack<16> {
+  BytePack<8> half[2];
+  uint8_t u8[16];
+  uint16_t u16[8];
+  uint32_t u32[4];
+  uint64_t u64[2];
+  ulong2 ul2, native;
+};
+
+template<typename T>
+__device__ __forceinline__ BytePack<sizeof(T)> toPack(T value) {
+  union { BytePack<sizeof(T)> p; T v; };
+  v = value;
+  return p;
+}
+template<typename T>
+__device__ __forceinline__ T fromPack(BytePack<sizeof(T)> pack) {
+  union { BytePack<sizeof(T)> p; T v; };
+  p = pack;
+  return v;
+}
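toPack/fromPack round-trip a value through the matching BytePack union, giving type-punning without pointer casts. A small usage sketch, illustrative rather than part of the diff:

__device__ void packExample(double x) {
  BytePack<8> p = toPack<double>(x);   // reinterpret the 8 bytes of x
  uint64_t bits = p.u64;               // inspect the raw representation
  double y = fromPack<double>(p);      // y has the same bits as x
  (void)bits; (void)y;
}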
+
+////////////////////////////////////////////////////////////////////////////////
+// Load/store of BytePack<?> using integral addresses.
+
+template<int Size> __device__ BytePack<Size> ld_global(uintptr_t addr);
+template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
+template<int Size> __device__ BytePack<Size> ld_shared(uint32_t addr);
+template<int Size> __device__ BytePack<Size> ld_volatile_shared(uint32_t addr);
+template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
+template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
+
+// Used to define implementations for above prototypes.
+#define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
+  template<> \
+  __device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
+    data_cxx_ty tmp; \
+    asm("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \
+    BytePack<bytes> ans; \
+    ans.native = tmp; \
+    return ans; \
+  } \
+  template<> \
+  __device__ __forceinline__ BytePack<bytes> ld_volatile_##space<bytes>(addr_cxx_ty addr) { \
+    data_cxx_ty tmp; \
+    asm("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \
+    BytePack<bytes> ans; \
+    ans.native = tmp; \
+    return ans; \
+  } \
+  template<> \
+  __device__ __forceinline__ void st_##space<bytes>(addr_cxx_ty addr, BytePack<bytes> value) { \
+    data_cxx_ty tmp = value.native; \
+    asm volatile("st." #space "." #data_ptx_ty " [%0], %1;" :: #addr_reg_ty(addr), #data_reg_ty(tmp) : "memory"); \
+  }
+// Single-byte types use 4-byte registers since there is no 1-byte register
+// character for asm blocks. See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
+DEFINE_ld_st(1, uint32_t, b8, r, global, uintptr_t, l)
+DEFINE_ld_st(1, uint32_t, b8, r, shared, uint32_t, r)
+DEFINE_ld_st(2, uint16_t, b16, h, global, uintptr_t, l)
+DEFINE_ld_st(2, uint16_t, b16, h, shared, uint32_t, r)
+DEFINE_ld_st(4, uint32_t, b32, r, global, uintptr_t, l)
+DEFINE_ld_st(4, uint32_t, b32, r, shared, uint32_t, r)
+DEFINE_ld_st(8, uint64_t, b64, l, global, uintptr_t, l)
+DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r)
+#undef DEFINE_ld_st
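Each DEFINE_ld_st invocation stamps out three specializations per size/address-space pair. As a reading aid, the ld_global<4> instance generated by DEFINE_ld_st(4, uint32_t, b32, r, global, uintptr_t, l) expands roughly to:

template<>
__device__ __forceinline__ BytePack<4> ld_global<4>(uintptr_t addr) {
  uint32_t tmp;
  asm("ld.global.b32 %0, [%1];" : "=r"(tmp) : "l"(addr));
  BytePack<4> ans;
  ans.native = tmp;
  return ans;
}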
+
+#define DEFINE_ld_st_16(space, addr_cxx_ty, addr_reg_ty) \
+  template<> \
+  __device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
+    BytePack<16> ans; \
+    asm("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \
+    return ans; \
+  } \
+  template<> \
+  __device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \
+    BytePack<16> ans; \
+    asm("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \
+    return ans; \
+  } \
+  template<> \
+  __device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
+    asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
+  }
+DEFINE_ld_st_16(global, uintptr_t, l)
+DEFINE_ld_st_16(shared, uint32_t, r)
+#undef DEFINE_ld_st_16
+
+////////////////////////////////////////////////////////////////////////////////
+// Atomic load/store using c++ pointers.
+
+__device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) {
+  uint64_t ans;
+  asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+  return ans;
+}
+__device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
+  uint64_t ans;
+  #if __CUDA_ARCH__ >= 700
+    asm("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+  #else
+    asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+  #endif
+  return ans;
+}
+__device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
+  uint64_t ans;
+  #if __CUDA_ARCH__ >= 700
+    asm("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+  #else
+    asm("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+  #endif
+  return ans;
+}
+
+__device__ __forceinline__ void st_volatile_global(uint64_t *ptr, uint64_t val) {
+  asm volatile("st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
+}
+__device__ __forceinline__ void st_relaxed_sys_global(uint64_t *ptr, uint64_t val) {
+  #if __CUDA_ARCH__ >= 700
+    asm volatile("st.relaxed.sys.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
+  #else
+    asm volatile("st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
+  #endif
+}
+__device__ __forceinline__ void st_release_sys_global(uint64_t *ptr, uint64_t val) {
+  #if __CUDA_ARCH__ >= 700
+    asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
+  #else
+    asm volatile("membar.sys; st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory");
+  #endif
+}
+
+__device__ __forceinline__ void fence_acq_rel_sys() {
+  #if __CUDA_ARCH__ >= 700
+    asm volatile("fence.acq_rel.sys;" ::: "memory");
+  #else
+    asm volatile("membar.sys;" ::: "memory");
+  #endif
+}
+__device__ __forceinline__ void fence_acq_rel_gpu() {
+  #if __CUDA_ARCH__ >= 700
+    asm volatile("fence.acq_rel.gpu;" ::: "memory");
+  #else
+    asm volatile("membar.gl;" ::: "memory");
+  #endif
+}
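The acquire/release pair above implements the usual message-passing pattern on sm_70 and newer, falling back to membar on older parts. A sketch of how a producer/consumer pair would use it; illustrative only:

__device__ void produce(uint64_t* payload, uint64_t* flag) {
  *payload = 42;                      // plain payload store
  st_release_sys_global(flag, 1);     // release: payload visible before the flag
}
__device__ void consume(uint64_t* payload, uint64_t* flag) {
  while (ld_acquire_sys_global(flag) != 1) { /* spin */ }
  uint64_t v = *payload;              // acquire guarantees we observe 42
  (void)v;
}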
+
+////////////////////////////////////////////////////////////////////////////////
+// Multimem stores of BytePack<?>.
+
+template<int Size>
+__device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val);
+
+#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
+  template<>
+  __device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) {
+    asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory");
+  }
+  template<>
+  __device__ __forceinline__ void multimem_st_global<8>(uintptr_t addr, BytePack<8> val) {
+    asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.u64) : "memory");
+  }
+  template<>
+  __device__ __forceinline__ void multimem_st_global<16>(uintptr_t addr, BytePack<16> val) {
+    asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};"
+      :: "l"(addr), "r"(val.u32[0]), "r"(val.u32[1]), "r"(val.u32[2]), "r"(val.u32[3])
+      : "memory");
+  }
+#else
+  template<int Size>
+  __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val) {
+    // nop
+  }
+#endif
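multimem.st is the NVLink SHARP store: one instruction on a multicast (multimem) address is replicated by the switch fabric to every GPU mapped into the group, which is what lets the NVLS kernels broadcast without per-peer loops. A sketch, assuming mcAddr is a multicast address obtained through the NVLS setup in this commit:

__device__ void broadcastWord(uintptr_t mcAddr, uint32_t word) {
  BytePack<4> p;
  p.u32 = word;
  multimem_st_global<4>(mcAddr, p);   // fans out to all group members
}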
+
+// Warp-uniform memory copy from shared address (not generic) to global memory.
+// The number of bytes copied is `min(MaxBytes, nBytesAhead)`, a negative value
+// is interpreted as zero. EltSize is the guaranteed alignment of the addresses and sizes.
+template<int EltSize, int MaxBytes, bool Multimem, typename IntBytes>
+__device__ __forceinline__ void copyGlobalShared_WarpUnrolled(
+    int lane, uintptr_t dstAddr, uint32_t srcAddr, IntBytes nBytesAhead
+  ) {
+  static_assert(std::is_signed<IntBytes>::value, "`IntBytes` must be a signed integral type.");
+  int nBytes = min(nBytesAhead, (IntBytes)MaxBytes);
+  int nFrontBytes = min(nBytes, (16 - int(dstAddr%16))%16);
+  int nMiddleBytes = (nBytes-nFrontBytes) & -16;
+  int nBackBytes = (nBytes-nFrontBytes) % 16;
+
+  { int backLane = WARP_SIZE-1 - lane;
+    bool hasFront = lane*EltSize < nFrontBytes;
+    bool hasBack = backLane*EltSize < nBackBytes;
+    int offset = hasFront ? lane*EltSize : (nBytes - (backLane+1)*EltSize);
+    if (hasFront | hasBack) {
+      BytePack<EltSize> tmp = ld_shared<EltSize>(srcAddr+offset);
+      // Can't use multimem_st since it doesn't support EltSize==2
+      st_global<EltSize>(dstAddr+offset, tmp);
+    }
+  }
+
+  srcAddr += nFrontBytes;
+  int srcMisalign = EltSize < 4 ? (srcAddr%4) : 0;
+  srcAddr += -srcMisalign + lane*16;
+  dstAddr += nFrontBytes + lane*16;
+  nMiddleBytes -= lane*16;
+  #pragma unroll
+  for (int u=0; u < divUp(MaxBytes, WARP_SIZE*16); u++) {
+    if (nMiddleBytes <= 0) break;
+    union {
+      BytePack<4> b4[4];
+      BytePack<16> b16;
+    };
+    b4[0] = ld_shared<4>(srcAddr + 0*4);
+    b4[1] = ld_shared<4>(srcAddr + 1*4);
+    b4[2] = ld_shared<4>(srcAddr + 2*4);
+    b4[3] = ld_shared<4>(srcAddr + 3*4);
+    if (srcMisalign != 0) {
+      BytePack<4> b4_4 = ld_shared<4>(srcAddr + 4*4);
+      b4[0].u32 = __funnelshift_r(b4[0].u32, b4[1].u32, srcMisalign*8);
+      b4[1].u32 = __funnelshift_r(b4[1].u32, b4[2].u32, srcMisalign*8);
+      b4[2].u32 = __funnelshift_r(b4[2].u32, b4[3].u32, srcMisalign*8);
+      b4[3].u32 = __funnelshift_r(b4[3].u32, b4_4.u32, srcMisalign*8);
+    }
+    if (Multimem) multimem_st_global<16>(dstAddr, b16);
+    else st_global<16>(dstAddr, b16);
+
+    srcAddr += WARP_SIZE*16;
+    dstAddr += WARP_SIZE*16;
+    nMiddleBytes -= WARP_SIZE*16;
+  }
+}
+
 #endif
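copyGlobalShared_WarpUnrolled splits every copy into an unaligned front, a 16-byte-aligned middle done with vector (optionally multimem) stores, and a short back. A worked size split, taking dstAddr % 16 == 4 and nBytes == 100:

// nFrontBytes  = min(100, (16 - 4) % 16) = 12   (scalar stores up to alignment)
// nMiddleBytes = (100 - 12) & -16        = 80   (16B vector stores)
// nBackBytes   = (100 - 12) % 16         = 8    (scalar stores for the tail)
// 12 + 80 + 8 = 100 bytes total.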
@@ -9,6 +9,7 @@

 #include <type_traits>
 #include "reduce_kernel.h" // for reduction funcs
+#include "common_kernel.h"
 #include "common.h"

 #define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
@@ -20,12 +21,13 @@
  * to how that protocol operates with a consistent interface so that our
  * algorithm code can operate protocol parametrically.
  */
-template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL>
+template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL, bool NVLS_1 = false>
 struct ProtoSimple {
   static constexpr int Id = NCCL_PROTO_SIMPLE;
   static constexpr int SlicePerChunk = SlicePerChunk_1;
   static constexpr int StepPerSlice = StepPerSlice_1;
   static constexpr int Unroll = Unroll_1;
+  static constexpr bool NVLS = NVLS_1;

   // Data bytes (no flags etc) in one step of the fifo queue.
   __device__ static int calcBytePerStep() {
@@ -255,18 +255,18 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
     }
     if (SRC) {
       data = dl.loadFinish();
-      if (SrcBuf == Input) data = MULTI<RedOp, T>().preOp(redOp, data);
+      if (SrcBuf == Input) data = applyPreOp(redOp, data);
     }
     if (RECV) {
-      data = !SRC ? peerData : MULTI<RedOp,T>()(redOp, peerData, data);
+      data = !SRC ? peerData : applyReduce(redOp, peerData, data);
       #pragma unroll MaxRecv
       for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
         peerData = readLLFinish(offset, line, i);
-        data = MULTI<RedOp,T>()(redOp, peerData, data);
+        data = applyReduce(redOp, peerData, data);
       }
     }

-    if (postOp) data = MULTI<RedOp, T>().postOp(redOp, data);
+    if (postOp) data = applyPostOp(redOp, data);

     // Send : inter-node, then intra-node, then local
     if (SEND) {
@@ -82,7 +82,14 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
     if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1;
   }
   inline __device__ void postSend() {
-    if (sendConnTailPtr) { __threadfence(); *sendConnTailPtr = sendConnTail += 1; }
+    if (sendConnTailPtr) {
+#if __CUDA_ARCH__ >= 900
+      __threadfence_system();
+#else
+      __threadfence();
+#endif
+      *sendConnTailPtr = sendConnTail += 1;
+    }
   }

 template<int WordPerThread>
@@ -109,7 +116,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
     // buffer into shmem.
     int misalignment = reinterpret_cast<uintptr_t>(src) % 16;
     uint64_t *src8 = reinterpret_cast<uint64_t*>(reinterpret_cast<uintptr_t>(src) & -uintptr_t(16));
-    uint64_t *shm8 = shmemCvtPtr(ncclShmem.ll128warp[warpInBlock]);
+    uint64_t *shm8 = shmemCvtPtr((uint64_t*)ncclScratchForWarp(warpInBlock));
    #pragma unroll
    for(int g=0; g < WordPerThread/2; g++)
      if((g*WARP_SIZE + wid)*16 < misalignment + eltN*sizeof(T))
@@ -153,7 +160,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
     }
     // Write to dst if 16-byte aligned, shmem otherwise.
     int misalignment = reinterpret_cast<uintptr_t>(dst)%16;
-    uint64_t *shm8 = shmemCvtPtr(ncclShmem.ll128warp[warpInBlock]);
+    uint64_t *shm8 = shmemCvtPtr((uint64_t*)ncclScratchForWarp(warpInBlock));
    #pragma unroll
    for(int g=0; g < WordPerThread/2; g++) {
      int ix = g*WARP_SIZE - 4*(g/2) + wid - (g%2)*(wid/8);
@@ -167,7 +174,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
     __syncwarp();
     // Write rest from shmem to dst. No need to coalesce stores to 16-bytes,
     // the hardware keeps up fine.
-    T *shm = (T*)ncclShmem.ll128warp[warpInBlock];
+    T *shm = (T*)ncclScratchForWarp(warpInBlock);
     int skip = misalignment == 0 ? eltN & -EltPer16B : 0;
     for(int i=skip+wid; i < eltN; i += WARP_SIZE)
       dst[i] = shm[i];
@@ -196,6 +203,10 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
       }
       needReload &= (0 == checkAbort(spins, 0, 0));
     } while (__any_sync(WARP_MASK, needReload));
+
+    #pragma unroll
+    for (int u=0; u<ELEMS_PER_THREAD; u+=2)
+      load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]);
   }

   /************* Finish register load **************/
@@ -206,9 +217,9 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
     if (SrcBuf == Input) {
       #pragma unroll
       for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-        v[u] = MULTI<RedOp, T>().preOp(redOp, v[u]);
+        v[u] = applyPreOp(redOp, v[u]);
         if (!flagThread)
-          v[u+1] = MULTI<RedOp, T>().preOp(redOp, v[u+1]);
+          v[u+1] = applyPreOp(redOp, v[u+1]);
       }
     }
   }
@@ -218,8 +229,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
     { // Consume data from first recv
       #pragma unroll
       for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-        v[u] = SRC ? MULTI<RedOp, T>()(redOp, vr[u], v[u]) : vr[u];
-        v[u+1] = SRC ? MULTI<RedOp, T>()(redOp, vr[u+1], v[u+1]) : vr[u+1];
+        v[u] = SRC ? applyReduce(redOp, vr[u], v[u]) : vr[u];
+        v[u+1] = SRC ? applyReduce(redOp, vr[u+1], v[u+1]) : vr[u+1];
       }
     }
@@ -238,20 +249,24 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
         needReload &= (0 == checkAbort(spins, i, 0));
       } while (__any_sync(WARP_MASK, needReload));
+
+      #pragma unroll
+      for (int u=0; u<ELEMS_PER_THREAD; u+=2)
+        load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]);

       #pragma unroll
       for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-        v[u] = MULTI<RedOp, T>()(redOp, vr[u], v[u]);
-        v[u+1] = MULTI<RedOp, T>()(redOp, vr[u+1], v[u+1]);
+        v[u] = applyReduce(redOp, vr[u], v[u]);
+        v[u+1] = applyReduce(redOp, vr[u+1], v[u+1]);
       }
     }
   }
   /********************** End Recv ************************/

-  if (postOp && !FuncTraits<RedOp>::IsPostOpIdentity) {
+  if (postOp) {
     #pragma unroll
     for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-      v[u] = MULTI<RedOp, T>().postOp(redOp, v[u]);
-      v[u+1] = MULTI<RedOp, T>().postOp(redOp, v[u+1]);
+      v[u] = applyPostOp(redOp, v[u]);
+      v[u+1] = applyPostOp(redOp, v[u+1]);
     }
   }
@@ -282,14 +297,6 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
   __device__ __forceinline__ void GenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) {
     constexpr int SRC = SrcBuf != -1 ? 1 : 0;
     constexpr int DST = DstBuf != -1 ? 1 : 0;
-    static_assert(-1<=SrcBuf && SrcBuf < 2, "Uhoh");
-    static_assert(-1<=DstBuf && DstBuf < 2, "Uhoh");
-    static_assert(DstBuf!=Input, "Mistake?");
-    #if 0
-    assert((SrcBuf==-1) == (srcIx==-1));
-    assert((DstBuf==-1) == (dstIx==-1));
-    #endif
-
     T const *srcPtr = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx;
     T *dstPtr = DstBuf == -1 ? nullptr : userBufs[DstBuf] + dstIx;
     int wireOffset = WireWordPerSlice*warp + 2*wid;
@@ -5,9 +5,9 @@
  ************************************************************************/

 template<typename T, typename RedOp, typename Fan, int Direct,
-         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p>
+         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, bool NVLS>
 class Primitives<
-    T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll>, P2p
+    T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, NVLS>, P2p
   > {
   static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
   static constexpr int Input=0, Output=1;
@@ -22,8 +22,10 @@ class Primitives<
          SizesFifoEnabled = 0x100,
          DirectWrite = 0x200,
          DirectRead = 0x400,
-         ThreadsSynced = 0x800;
-  const int tid;
+         ThreadsSynced = 0x800,
+         NvlsMinPolling = 0x1000,
+         NvlsRecv = 0x2000;
+  const int tid, tidInBlock;
   int nthreads;
   int nworkers;
   const int stepSize;
@@ -41,22 +43,54 @@ class Primitives<
     int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled)
     T *directBuff;                  // !(flags & SizesFifoEnabled)
   };
-  uint64_t volatile *connStepPtr;
+  uint64_t *connStepPtr;
   uint64_t connStepCache; // Cache last seen value of (*connStepPtr)

   // Don't use barrier 0 as it's used by the final sync
-  inline __device__ void barrier() {
-    if (nthreads == WARP_SIZE)
-      __syncwarp();
-    else
-      asm volatile("bar.sync %0, %1;" :: "r"(15-group), "r"(nthreads));
+  __device__ void barrier() {
     flags |= ThreadsSynced;
+    if (nthreads == WARP_SIZE) __syncwarp();
+    else {
+      int bar = 15-group;
+      asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nthreads) : "memory");
+    }
+  }
+  __device__ void subBarrier() {
+    if (nworkers == WARP_SIZE) __syncwarp();
+    else {
+      int bar = (nworkers==nthreads ? 15 : 8) - group;
+      asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nworkers) : "memory");
+    }
+  }
+
+  __device__ bool barrierAny(int vote) {
+    flags |= ThreadsSynced;
+    if (nthreads == WARP_SIZE) {
+      return __any_sync(~0u, vote);
+    } else {
+      int ans, bar = 15-group;
+      asm volatile(
+        "{ .reg .pred p;"
+        "  setp.ne.s32 p, %1, 0;"
+        "  bar.red.or.pred p, %2, %3, p; "
+        "  selp.s32 %0, 1, 0, p; }"
+        : "=r"(ans) : "r"(vote), "r"(bar), "r"(nthreads) : "memory");
+      return ans != 0;
+    }
+  }
+  __device__ bool subBarrierAny(int vote) {
+    if (nworkers == WARP_SIZE) {
+      return __any_sync(~0u, vote);
+    } else {
+      int ans, bar = (nworkers==nthreads ? 15 : 8) - group;
+      asm volatile(
+        "{ .reg .pred p;"
+        "  setp.ne.s32 p, %1, 0;"
+        "  bar.red.or.pred p, %2, %3, p; "
+        "  selp.s32 %0, 1, 0, p; }"
+        : "=r"(ans) : "r"(vote), "r"(bar), "r"(nworkers) : "memory");
+      return ans != 0;
+    }
   }
-  inline __device__ void subBarrier() {
-    if (nworkers == nthreads)
-      barrier();
-    else
-      asm volatile("bar.sync %0, %1;" :: "r"(8-group), "r"(nworkers));
-  }

   inline __device__ bool checkAbort(int &spins) {
@@ -71,6 +105,19 @@ class Primitives<
     return flags & Aborted;
   }

+  inline __device__ uint64_t loadStepValue(uint64_t* ptr) {
+#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
+    if (NVLS && (flags & NvlsMinPolling)) {
+      uint64_t ans;
+      asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+      return ans;
+    }
+#endif
+    // volatile is faster than acquire but not as correct. Make sure ReduceOrCopyMulti
+    // loads data using volatile so it doesn't see stale data in L1.
+    return ld_volatile_global(ptr);
+  }
+
   template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
   __device__ __forceinline__ void waitPeer(intptr_t dstIx, intptr_t remoteIx, int offset, int nelts) {
     const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
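loadStepValue is the polling side of NVLS synchronization: with NvlsMinPolling set, connStepPtr refers to a multicast address, so multimem.ld_reduce with the min operation returns the slowest peer's step counter in a single acquire load, replacing one volatile load per peer. A comment-level sketch of the contract, as used by waitPeer below:

// connStepCache = loadStepValue(connStepPtr);
//   NVLS path : min over all peers' step counters (acquire, system scope),
//               so the wait loop only exits once every peer has advanced
//   default   : volatile load of the single peer's counter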
@@ -80,7 +127,7 @@ class Primitives<
         ((flags & (Send*RoleWaitSend)) && !noSendWait)) {
       int spins = 0;
       while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
-        connStepCache = *connStepPtr;
+        connStepCache = loadStepValue(connStepPtr);
         if (checkAbort(spins)) break;
         //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
       }
@@ -119,10 +166,11 @@ class Primitives<
   }

   template<int Recv, int Send>
-  inline __device__ void postPeer() {
+  inline __device__ void postPeer(bool dataStored) {
     if (flags & (Recv*RolePostRecv | Send*RolePostSend)) {
       step += StepPerSlice;
-      *connStepPtr = step;
+      if (Send && (flags & RolePostSend) && dataStored) fence_acq_rel_sys();
+      st_relaxed_sys_global(connStepPtr, step);
     }
   }

@@ -166,7 +214,7 @@ class Primitives<
       // post();
     // } // Since we no longer unroll, new branch added here
     #if __CUDA_ARCH__ < 700
-    // Yeah, so all that above don't matter a lick on older hardware.
+    // Above doesn't matter on older hardware.
     #pragma unroll SlicePerChunk
     #else
     #pragma unroll 1
@@ -181,37 +229,39 @@ class Primitives<
       subBarrier();
       /* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
        * to 0 to avoid unnecessary workload. */
-      size_t workSize = ncclShmem.aborted ? 0 : sliceSize;
-      if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
+      int workSize = ncclShmem.aborted ? 0 : sliceSize;
+      if (NVLS && ncclShmem.groups[group].nvlsRecv) {
+        void* src = ncclShmem.groups[group].srcs[0];
+        void* dst = ncclShmem.groups[group].dsts[0];
+        copyMultimemMultimem<RedOp>(tid, nworkers, ncclShmem.redOpArgs[0], postOp, src, dst, workSize,
+                                    cvta_to_shared(ncclScratchForWarp(tidInBlock/WARP_SIZE)));
+      } else if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
         // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
         if (Send) {
-          // (1-Send) is only there to avoid compilation errors in case MaxSend=0 (and Send=0).
-          ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, (1-Send)+MaxSend, 0>
-            (tid, nworkers, nullptr, false,
-             1, (T const**)ncclShmem.groups[group].srcs,
-             fan.nsend(), (T**)ncclShmem.groups[group].dsts+1,
+          ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, MaxSend, /*PreOpSrcs*/0>
+            (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false,
+             1, ncclShmem.groups[group].srcs,
+             fan.nsend(), ncclShmem.groups[group].dsts+1,
             workSize);
        }
       } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) {
         // For broadcast in CollNet to do empty send
-        ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
-          (tid, nworkers, ncclShmem.redOpArgs, postOp,
-           Recv, (T const**)ncclShmem.groups[group].srcs,
-           Dst, (T**)ncclShmem.groups[group].dsts,
+        ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs*/0>
+          (tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp,
+           Recv, ncclShmem.groups[group].srcs,
+           Dst, ncclShmem.groups[group].dsts,
            workSize);
       } else {
-        constexpr int PreOpN = SrcBuf != Input ? 0 :
+        constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
           DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
-        ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpN>
-          (tid, nworkers, ncclShmem.redOpArgs, postOp,
-           Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs,
-           Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts,
+        ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
+          (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
+           Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
+           Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
            workSize);
       }
       barrier(); // This barrier has a counterpart in following loop
-      if (Send && (flags & RolePostSend) && index == 0) __threadfence_system();
-      __syncwarp();
-      postPeer<Recv, Send>();
+      postPeer<Recv, Send>(0 < sliceSize);
       offset += sliceSize;
       slice += 1;
     } while (slice < SlicePerChunk && offset < nelem);
@@ -229,9 +279,7 @@ class Primitives<
       waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, 0);
     }
     barrier(); // Has counterpart in preceding worker-only loop.
-    if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system();
-    __syncwarp();
-    postPeer<Recv, Send>();
+    postPeer<Recv, Send>(0 < sliceSize);
     offset += sliceSize;
     slice += 1;
   }
@ -242,7 +290,7 @@ class Primitives<
   // shift: peer offset to avoid all ranks sending to or receiving from same peer
   template <int DirectRecv1, int DirectSend1, int Recv, int Send>
   __device__ __forceinline__ void
-  ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp) {
+  ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp) {
     constexpr int DirectRecv = 1 && Direct && DirectRecv1;
     constexpr int DirectSend = 1 && Direct && DirectSend1;
     int offset = 0; // slice offset
@ -252,12 +300,12 @@ class Primitives<
     #pragma unroll
     for (int slice=0; slice<SlicePerChunk; ++slice) {
       int realSize = max(0, min(dataSize, peerElem-offset));
+      bool fenceNeeded = false;
       if (tid < nworkers) {
         if (Send) {
           // Scatter pre-scales data of input buffer only in non-Direct case
-          constexpr int PreOpN = DirectSend ? 0 : 1;
+          constexpr int PreOpSrcs = DirectSend ? 0 : 1;
           if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset;
-          if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] = 0; // Skip the threadfence
           // realSize is not accurate here; but intra-node does not rely on sizes FIFO
           waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
           subBarrier();
@ -265,23 +313,23 @@ class Primitives<
           // Loop over peers
           for (int j=0; j<fan.nsend(); j++) {
             int i = (j+shift)%fan.nsend();
-            int peerOffset = i*peerElem;
+            int pOffset = i*peerOffset;
             // Skip the data I am responsible of reducing myself
-            if (skip >= 0 && i >= skip) peerOffset += peerElem;
-            const T* src0 = (T*)ncclShmem.groups[group].srcs[0] + peerOffset;
-            int realPeerSize = min(realSize, totalElem-peerOffset);
+            if (skip >= 0 && i >= skip) pOffset += peerElem;
+            void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
+            int realPeerSize = min(realSize, totalElem-pOffset);
             if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
-              ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem.redOpArgs, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize);
+              ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
               // Mark for threadfence at the end
-              if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] += realPeerSize;
+              fenceNeeded |= true;
             }
           }
         } else if (Recv) {
           if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
-          int peerOffset = index*peerElem;
-          if (skip >= 0 && index >= skip) peerOffset += peerElem;
+          int pOffset = index*peerOffset;
+          if (skip >= 0 && index >= skip) pOffset += peerElem;
           // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
-          waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+peerOffset, offset, realSize);
+          waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+pOffset, offset, realSize);
           subBarrier();
           if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
             // Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv
@ -290,21 +338,17 @@ class Primitives<
             #pragma unroll
             for (int j=0; j<fan.nrecv(); j++) {
               int i = (j+shift)%fan.nrecv();
-              peerOffset = i*peerElem;
-              if (skip >= 0 && i >= skip) peerOffset += peerElem;
-              T* dst0 = (T*)ncclShmem.groups[group].dsts[0] + peerOffset;
-              int realPeerSize = min(realSize, totalElem-peerOffset);
-              if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>(tid, nworkers, ncclShmem.redOpArgs, postOp, 1, (const T**)ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
+              pOffset = i*peerOffset;
+              if (skip >= 0 && i >= skip) pOffset += peerElem;
+              void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
+              int realPeerSize = min(realSize, totalElem-pOffset);
+              if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
             }
           }
         }
       }
-      barrier();
-      // If we indeed send something, threadfence
-      if (Send && (flags & RolePostSend) && ncclShmem.groups[group].totalSendSize[slice] > 0 && index == 0)
-        __threadfence_system();
-      __syncwarp();
-      postPeer<Recv, Send>();
+      fenceNeeded = barrierAny(fenceNeeded);
+      postPeer<Recv, Send>(fenceNeeded);
       offset += realSize;
     }
   }
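ScatterGatherOp now takes the stride between per-peer regions (peerOffset) separately from the per-peer element count (peerElem), instead of assuming the two are equal. A minimal standalone restatement of the resulting offset computation (illustration only, not NCCL code):

static inline int peerRegionOffset(int i, int peerOffset, int peerElem, int skip) {
  int pOffset = i * peerOffset;                     // stride between peer regions
  if (skip >= 0 && i >= skip) pOffset += peerElem;  // hop over the local chunk
  return pOffset;
}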
@ -320,25 +364,33 @@ class Primitives<
     }
     if (flags & RoleWaitRecv) {
       ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
+      if ((index == 0) && (flags & RoleWaitRecv)) {
+        if (conn->flags & NCCL_NVLS_MIN_POLL) {
+          flags |= NvlsMinPolling;
+          ncclShmem.groups[group].nvlsRecv = 1;
+        } else {
+          ncclShmem.groups[group].nvlsRecv = 0;
+        }
+      }
       connStepPtr = conn->tail;
-      connStepCache = *connStepPtr;
+      connStepCache = loadStepValue(connStepPtr);
       flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
       if (Direct) {
         // User buffers have been registered
-        if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
+        if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
           if (connIndex == 1 && P2p == 0) {
             flags |= DirectRead;  // scatter-reduce use direct pull
           } else {
             flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
                      (e->direct & NCCL_DIRECT_READ)  ? DirectRead  : 0;
           }
-        } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
+        } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
           if (connIndex == 1 && P2p == 0) {
             flags |= DirectRead;  // scatter-reduce use direct pull
           } else {
             // direct read not allowed in non-register case
             // otherwise, in one-to-multi send, we could mix empty send and intermediate send
-            flags |= (conn->direct & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
+            flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
           }
         }
       }
@ -359,8 +411,9 @@ class Primitives<
     }
     if (flags & RoleWaitSend) {
       ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
+      flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
       connStepPtr = conn->head;
-      connStepCache = *connStepPtr;
+      connStepCache = loadStepValue(connStepPtr);
       flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
       if (flags & OffsFifoEnabled)
         connOffsFifoPtr = conn->offsFifo;
@ -371,20 +424,20 @@ class Primitives<
         connSizesFifoPtr = conn->sizesFifo;
       } else if (Direct) {
         // User buffers have been registered
-        if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
+        if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
           if (connIndex == 1 && P2p == 0) {
             flags |= DirectRead;  // scatter-reduce use direct pull
           } else {
             flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
                      (e->direct & NCCL_DIRECT_READ)  ? DirectRead  : 0;
           }
-        } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
+        } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
          if (connIndex == 1 && P2p == 0) {
             flags |= DirectRead;  // scatter-reduce use direct pull
           } else {
             // direct read not allowed in non-register case
             // otherwise, in one-to-multi send, we could mix empty send and intermediate send
-            flags |= (conn->direct & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
+            flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
           }
         }
       }
@ -397,7 +450,7 @@ class Primitives<
       int tid, int nthreads, int const *recvPeers, int const *sendPeers,
       void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
     ):
-    tid(tid),
+    tid(tid), tidInBlock(threadIdx.x),
     stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {

     // For send operations, we need an extra warp to overlap the threadfence and the copy
@ -412,7 +465,7 @@ class Primitives<
     this->fan = Fan(nrecv, nsend);

     constexpr int ThreadPerSync = 8;
-    static_assert(MaxSend < ThreadPerSync && MaxRecv < ThreadPerSync, "Not enough threads to cover all peers");
+    static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");

     int g = tid / ThreadPerSync;
     int ng = nthreads / ThreadPerSync;
@ -566,6 +619,9 @@ class Primitives<
     genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp);
   }

+  __device__ __forceinline__ void recvSend(int eltN, bool postOp=false) {
+    genericOp<0, 0, 1, 1, -1, -1>(-1, -1, -1, eltN, postOp);
+  }
   __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
     genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, -1, eltN, postOp);
   }
@ -596,20 +652,20 @@ class Primitives<
   }

   __device__ __forceinline__ void
-  scatter(intptr_t inpIx, int totalElem, int peerElem, int skip, int shift) {
-    ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false);
+  scatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
+    ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
   }
   __device__ __forceinline__ void
-  directScatter(intptr_t inpIx, int totalElem, int peerElem, int skip, int shift) {
-    ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false);
+  directScatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
+    ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
   }

   __device__ __forceinline__ void
-  gather(intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp=false) {
-    ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, skip, shift, postOp);
+  gather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp=false) {
+    ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, postOp);
   }
   __device__ __forceinline__ void
-  directGather(intptr_t outIx, int totalElem, int peerElem, int skip, int shift) {
-    ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, skip, shift, /*postOp=*/false);
+  directGather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
+    ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
   }
 };
File diff suppressed because it is too large.
@ -87,3 +87,45 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROT
     runRing<T, RedOp, ProtoLL128>(args);
   }
 };
+
+template<typename T, typename RedOp>
+struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
+    const int tid = threadIdx.x;
+    const int bid = args->bid;
+    const int nChannels = args->nChannels;
+    struct ncclNvls* nvls = &ncclShmem.channel.nvls;
+    const ssize_t chunkSize = int(args->lastChunkSize);
+    const ssize_t size = args->count;
+    const ssize_t loopSize = nChannels*chunkSize;
+
+    const int nThreadsScatter = 128 + WARP_SIZE;
+    const int nThreadsReduce = 384;
+    const int tidEndScatter = nThreadsScatter;
+    const int tidEndReduce = tidEndScatter + nThreadsReduce;
+
+    using Proto = ProtoSimple<1, 1>;
+
+    if (tid < tidEndScatter) {
+      // Scatter
+      int group = (0*Proto::MaxGroupWidth) | (0<<16);
+      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
+        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, args->redOpArg, group, args);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0);
+      }
+    } else if (tid < tidEndReduce) {
+      int group = (3*Proto::MaxGroupWidth) | (1<<16);
+      // Reduce through MC
+      Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
+        prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        prims.recv(offset, nelem);
+      }
+    }
+  }
+};
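The new NVLS work element splits one thread block into two cooperating primitive groups: one scatters user data to the NVLink SHARP multicast heads (nvls->up), the other receives the switch-reduced result (nvls->down). A runnable sketch of the thread partition, assuming WARP_SIZE is 32 (the constants are copied from the hunk above):

#include <cstdio>

int main() {
  const int WARP_SIZE = 32;                     // assumption: 32-thread warps
  const int nThreadsScatter = 128 + WARP_SIZE;  // 160 threads feed nvls->up
  const int nThreadsReduce = 384;               // 384 threads drain nvls->down
  const int tidEndScatter = nThreadsScatter;
  const int tidEndReduce = tidEndScatter + nThreadsReduce;
  printf("scatter: tid in [0,%d), reduce: tid in [%d,%d)\n",
         tidEndScatter, tidEndScatter, tidEndReduce);
  return 0;
}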
@ -13,12 +13,13 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
   template<typename Proto>
   __device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
     void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
-    size_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
+    ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
     if (args->peer == ncclShmem.comm.rank) {
       struct ncclWorkElemP2p* recvArgs = args-1;
       void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
       if (buff != recvBuff) {
-        ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&buff, 1, (T**)&recvBuff, count);
+        ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>
+          (tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
       }
     } else {
       int chunkSize = args->chunkSize/sizeof(T);
@ -74,6 +74,8 @@ void ncclDebugInit() {
       mask = NCCL_ALLOC;
     } else if (strcasecmp(subsys, "CALL") == 0) {
       mask = NCCL_CALL;
+    } else if (strcasecmp(subsys, "NVLS") == 0) {
+      mask = NCCL_NVLS;
     } else if (strcasecmp(subsys, "ALL") == 0) {
       mask = NCCL_ALL;
     }
src/enqueue.cc (123 changes)
@ -32,7 +32,8 @@ struct ncclKernelMatch {
   NCCL_FUNC5(func, TREE, devredop, type, specialized), \
   NCCL_FUNC5(func, RING, devredop, type, specialized), \
   NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \
-  NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, specialized)
+  NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, specialized), \
+  NCCL_FUNC5(func, NVLS, devredop, type, specialized)

 #ifdef __CUDA_BF16_TYPES_EXIST__
 #define HAVE_BFLOAT16 1
@ -90,34 +91,48 @@ static const ncclKernelMatch ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNum

 static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */);

-// Determine the maximum kernel stack size of all CUDA kernels
-size_t ncclKernMaxLocalSize() {
-  ncclResult_t res = ncclSuccess;
-  int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
-  cudaFuncAttributes attr = {0};
-  size_t max = 0;
-  for (int i = 0; i < numNcclKerns; i++) {
-    CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i].kernelFn), res, error);
-    if (attr.localSizeBytes > max) max = attr.localSizeBytes;
-  }
-
-error:
-  return (res != ncclSuccess) ? 0 : max;
-}
-
-// Set shared memory carveout for the nccl kernels
-ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut) {
-  ncclResult_t res = ncclSuccess;
-  int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
-  for (int i = 0; i < numNcclKerns; i++) {
-    CUDACHECKGOTO(cudaFuncSetAttribute(ncclKerns[i].kernelFn, cudaFuncAttributePreferredSharedMemoryCarveout, carveOut), res, error);
-  }
-
-error:
-  return res;
-}
+NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
+
+// Returns maximum kernel stack size of all CUDA kernels
+ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
+  constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
+  ncclResult_t result = ncclSuccess;
+
+  if (maxStackSize) *maxStackSize = 0;
+  int carveout = ncclParamL1SharedMemoryCarveout();
+
+  // Keep track if we already visited a function pointer.
+  void* lru[2] = {nullptr, nullptr};
+  for (int i=0; i < KernelCount; i++) {
+    void* fn = ncclKerns[i].kernelFn;
+    if (fn == lru[0] || fn == lru[1]) goto next_kernel;
+    lru[1] = lru[0];
+    lru[0] = fn;
+
+    if (maxStackSize) {
+      cudaFuncAttributes attr = {0};
+      CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0);
+      if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
+    ignore0:;
+    }
+
+    if (carveout) {
+      CUDACHECKGOTO(cudaFuncSetAttribute(fn,
+          cudaFuncAttributePreferredSharedMemoryCarveout, carveout),
+        result, ignore1);
+    ignore1:;
+    }
+
+    if (ncclShmemDynamicSize(cudaArch) != 0) {
+      CUDACHECKGOTO(cudaFuncSetAttribute(fn,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)),
+        result, next_kernel);
+    }
+  next_kernel:;
+  }
+  return result;
+}

 /*****************************************************************************/
 /* Launch system : synchronization and CUDA kernel launch */
 /*****************************************************************************/
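ncclInitKernelsForDevice merges the old stack-size query and carveout setter into one pass, skipping duplicate function pointers via a two-entry LRU. The two CUDA runtime calls it leans on can be exercised standalone (toy kernel and carveout value are placeholders, not NCCL's):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void toyKernel() {}

int main() {
  cudaFuncAttributes attr = {};
  if (cudaFuncGetAttributes(&attr, toyKernel) == cudaSuccess)
    printf("localSizeBytes (stack) = %zu\n", attr.localSizeBytes);
  // Request a preferred L1/shared-memory split, as the loop above does per kernel.
  cudaFuncSetAttribute(toyKernel, cudaFuncAttributePreferredSharedMemoryCarveout, 50);
  return 0;
}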
@ -248,10 +263,9 @@ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelP
 static ncclResult_t addCollToPlan(
     struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex,
     struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp,
-    int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[]
+    int nCollChannels, int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[]
   ) {
   struct ncclKernelPlan::Channel *chans = plan->channels;
-  int nCollChannels = comm->nChannels;

   // Choose the `nBid` least loaded channels to do the work. This ensures
   // all bids go to different channels in case they need to synchronize.
@ -268,9 +282,7 @@ static ncclResult_t addCollToPlan(
     }
   }
   // Sort in the rest of the channels. If a channel has less work than the max
-  // member of least[], replace that member and compute the new max. The optimal
-  // algorithm uses a max-heap, but for our small sizes I suspect the better
-  // asymptotic complexity would be swamped by the increased instruction complexity.
+  // member of least[], replace that member and compute the new max.
   for (int c=nBid; c < nCollChannels; c++) {
     if (chans[c].collBytes < maxBytesInLeast) {
       least[maxIndexInLeast] = c;
@ -541,8 +553,9 @@ static ncclResult_t scheduleCollTasksToPlan(
     info.sliceSteps = head->sliceSteps;
     NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks));
     if (nAggOps > 1) {
+      int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
       info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]);
-      info.nChannels = std::max(1, std::min(info.nChannels, comm->nChannels));
+      info.nChannels = std::max(1, std::min(info.nChannels, maxChannels));
       info.algorithm = aggInfo.algorithm;
       info.protocol = aggInfo.protocol;
       info.nThreads = aggInfo.nThreads;
@ -565,8 +578,9 @@ static ncclResult_t scheduleCollTasksToPlan(
       NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, &regBufUsed, regBufSend, regBufRecv));
     }

+    int maxChannels = info.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
     NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp,
-      info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv));
+      maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv));
     tasks->nTasksColl -= 1;
     tasks->collBytesTotal -= info.nBytes;
     ncclIntruQueueDequeue(&tasks->collQueue);
@ -856,7 +870,7 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) {
   struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_;
   ncclResult_t result = hostStreamPlanTask(plan->comm, plan);
   if (result != ncclSuccess) {
-    WARN("hostStreamPlanCallback() failed : %s\n", ncclGetErrorString(result));
+    WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result));
   }
 }
@ -964,7 +978,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
     }
     NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->deviceStream), result, failure);

-    if (persistent || comm->persistentRefs != 0) {
+    if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) {
       // We have to launch host tasks to push proxy args. We are careful to only
       // do this if necessary since host tasks impose a high performance cost in CUDA.
       bool acquired = false;
@ -1005,12 +1019,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
   return ncclSuccess;
 }

-#if CUDART_VERSION >= 11080
-#define NCCL_MAX_CGA_CLUSTER_SIZE 8
-#define NCCL_CGA_CLUSTER_SIZE_SM90 4
-NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", -2);
-#endif
-
 #if CUDART_VERSION >= 12000
 // NCCL uses the "Remote" Mem Sync domain by default
 NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
@ -1022,6 +1030,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
   cudaStream_t launchStream = tasks->streams->stream;
   dim3 grid = {(unsigned)plan->channelCount, 1, 1};
   dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
+  size_t smem = ncclShmemDynamicSize(comm->cudaArch);
   void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead};

 #if CUDART_VERSION >= 11080
@ -1029,19 +1038,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
   NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
   if (driverVersion >= 11080) {
     int compCap = comm->compCap;
-    unsigned int clusterSize = (compCap == 90) ? NCCL_CGA_CLUSTER_SIZE_SM90 : 0;
-    if (ncclParamCGAClusterSize() != -2) {
-      clusterSize = ncclParamCGAClusterSize();
-      if (clusterSize > NCCL_MAX_CGA_CLUSTER_SIZE) {
-        static bool warned = false;
-        if (warned == false) {
-          WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.",
-               clusterSize, NCCL_MAX_CGA_CLUSTER_SIZE);
-          warned = true;
-        }
-        clusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
-      }
-    }
+    unsigned int clusterSize = (compCap == 90) ? comm->cgaClusterSize : 0;

     cudaLaunchConfig_t launchConfig = {0};
     cudaLaunchAttribute launchAttrs[3];
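comm->cgaClusterSize (settable through the new cgaClusterSize config option) replaces the env-only NCCL_CGA_CLUSTER_SIZE path deleted above, and feeds the cluster-dimension launch attribute on sm90. A self-contained sketch of such a launch on CUDA 11.8+ (kernel and sizes are placeholders):

#include <cuda_runtime.h>

__global__ void toyKernel(int x) { (void)x; }

int main() {
  cudaLaunchConfig_t config = {0};
  config.gridDim = dim3(8, 1, 1);    // grid must be divisible by the cluster size
  config.blockDim = dim3(128, 1, 1);
  cudaLaunchAttribute attrs[1];
  attrs[0].id = cudaLaunchAttributeClusterDimension;
  attrs[0].val.clusterDim.x = 4;     // e.g. a CGA cluster of 4 blocks
  attrs[0].val.clusterDim.y = 1;
  attrs[0].val.clusterDim.z = 1;
  config.attrs = attrs;
  config.numAttrs = 1;
  int x = 0;
  void* args[1] = { &x };
  cudaLaunchKernelExC(&config, (void*)toyKernel, args);
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}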
@ -1073,6 +1070,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
 #endif
     launchConfig.gridDim = grid;
     launchConfig.blockDim = block;
+    launchConfig.dynamicSmemBytes = smem;
     launchConfig.attrs = launchAttrs;
     launchConfig.numAttrs = attrs;
     launchConfig.stream = launchStream;
@ -1082,12 +1080,12 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
   }
 #endif
   // Standard kernel launch
-  CUDACHECK(cudaLaunchKernel(fn, grid, block, args, 0, launchStream));
+  CUDACHECK(cudaLaunchKernel(fn, grid, block, args, smem, launchStream));
   return ncclSuccess;
 }

 ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) {
-  if (comm->persistentRefs == 0) { // implies !plan->persistent
+  if (!(plan->persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking)) {
     // If this isn't being captured and there aren't any CUDA graphs alive
     // then we don't need to do our proxyOp pushing on the host stream.
     NCCLCHECK(hostStreamPlanTask(comm, plan));
@ -1161,6 +1159,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
   int nAlgos = NCCL_NUM_ALGORITHMS;
   for (int a=0; a<nAlgos; a++) {
     if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue;
+    if (a == NCCL_ALGO_NVLS && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
+
     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
       float time;
       NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time));
@ -1193,6 +1193,9 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
       }
       ncSwitch /= 2;
     }
+  } else if (info->algorithm == NCCL_ALGO_NVLS) {
+    // NVLS should not need more than 16 channels to get peak BW.
+    nc = comm->nvlsChannels;
   } else {
     // Ring/Tree channel tuning
     while (info->nBytes < nc*nt*threadThreshold) {
@ -1207,6 +1210,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
     if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE;
     if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE;
     if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE;
+    if (info->algorithm == NCCL_ALGO_NVLS) nt = NCCL_MAX_NTHREADS;
   }
   nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
   info->nChannels = nc;
@ -1225,6 +1229,7 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
       info->pattern = ncclPatternRing; break;
     case ncclFuncAllReduce:
       info->pattern =
+        info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
         info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
         info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain :
         info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown :
@ -1244,6 +1249,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
     case ncclPatternPipelineFrom:
     case ncclPatternPipelineTo:
     case ncclPatternCollnetChain:
+    case ncclPatternNvls:
       info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
     case ncclPatternCollnetDirect:
       info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break;
@ -1319,6 +1325,14 @@ comp_next:
     while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*8 && chunkSize > 65536) chunkSize /= 2;
     while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2;
     work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+  } else if (info->algorithm == NCCL_ALGO_NVLS) {
+    if (chunkSize > 131072) chunkSize = 131072;
+    // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
+    uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
+    if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
+    if ((info->nBytes < (8 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
+    if ((info->nBytes < (2 * (concurrentOps*chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
+    work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
   } else if (info->protocol == NCCL_PROTO_LL) {
     const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
     const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
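The NVLS branch above steps the chunk size down as the message shrinks relative to the number of chunks in flight. The same tuning arithmetic as a standalone helper (a hedged restatement: it fixes the starting chunk at the 128K cap, whereas the code above inherits the incoming value):

#include <cstdint>

static uint64_t nvlsChunkSize(uint64_t nBytes, uint64_t nChannels, uint64_t nHeads) {
  uint64_t chunkSize = 131072;                  // cap from the hunk above
  uint64_t concurrentOps = nChannels * nHeads;  // chunks in flight per loop
  if (nBytes < 32 * concurrentOps * chunkSize && chunkSize > 65536) chunkSize = 65536;
  if (nBytes <  8 * concurrentOps * chunkSize && chunkSize > 32768) chunkSize = 32768;
  if (nBytes <  2 * concurrentOps * chunkSize && chunkSize > 16384) chunkSize = 16384;
  return chunkSize;
}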
@ -1618,6 +1632,11 @@ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) {
     WARN("ncclRedOpDestroy : operator is garbage.");
     return ncclInvalidArgument;
   }
+  if (comm == NULL) {
+    WARN("ncclRedOpDestroy : invalid communicator passed.");
+    return ncclInvalidArgument;
+  }
+
   int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps);
   if (comm->userRedOpCapacity <= ix || comm->userRedOps[ix].freeNext != -1) {
     WARN("ncclRedOpDestroy : operator unknown to this communicator.");
@ -313,8 +313,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa

   // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
   // We permit combining max, then min, to only use the first channels, then duplicate them.
-  nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
-  nChannels = comm->nChannels = copyChannels(comm, nChannels, ncclMinNchannels(), ringPrev, ringNext);
+  nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->maxCTAs);
+  nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->minCTAs), ringPrev, ringNext);

   // Create rings array and check all is fine
   NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
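comm->minCTAs and comm->maxCTAs come from the new communicator config options named in the commit message (cgaClusterSize, minCTAs, maxCTAs, netName). A minimal usage sketch, assuming the standard ncclCommInitRankConfig entry point and that id/rank/nRanks are set up as usual:

#include <nccl.h>

ncclResult_t initWithConfig(ncclComm_t* comm, int nRanks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.cgaClusterSize = 4;   // CGA cluster size used for sm90 launches
  config.minCTAs = 4;          // floor on channels, combined with NCCL_MIN_NRINGS
  config.maxCTAs = 16;         // ceiling on channels, combined with NCCL_MAX_NRINGS
  config.netName = "Socket";   // select a network module by name
  return ncclCommInitRankConfig(comm, nRanks, id, rank, &config);
}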
@ -461,7 +461,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank
     type = node->type;
   }
   if (type != GPU) {
-    WARN("Could not find intermediate GPU between GPU rank %d and NIC %d\n", rank, netDev);
+    WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
     return ncclInternalError;
   }
   *intermediateRank = node->gpu.rank;
@ -707,6 +707,7 @@ static int nextPow2(int v) {
 }

 ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
+  /* here we already honor comm->max/minCTAs for p2pnChannels. */
   comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
   comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());
   int minChannels = comm->p2pnChannels;
@ -734,7 +735,6 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
     for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
     comm->p2pChannels[c] = mirror;
   }
-  INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
   return ncclSuccess;
 }
@ -765,7 +765,6 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph

   if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;

-  // SPLIT_TREE works better on older archs.
   int ccMin;
   NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
@ -815,6 +815,6 @@ ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int*
       return ncclSuccess;
     }
   }
-  WARN("Could not find local GPU with rank %d\n", rank);
+  WARN("Could not find local GPU with rank %d", rank);
   return ncclInternalError;
 }
@ -53,7 +53,7 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li

 // Latencies in us, Bandwidths in GB/s
 // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
-static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 }, { 4.4, 4.4, 0 }};
+static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 }, { 4.4, 4.4, 0 }, { 0, 0, 40.0 }};

 // NVLink, PCI, Network
 #define NCCL_HW_NVLINK 0
@ -63,13 +63,16 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4,
 static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
 { /* NVLINK */
   { /* Tree (LL/LL128/Simple)*/ { .52, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 },
-    /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 } },
+    /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 },
+    /* NVLS */ { 0, 0, 0 } },
   /* PCI */
   { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
-    /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 } },
+    /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 },
+    /* NVLS */ { 0, 0, 0 } },
   /* NET */
   { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 },
-    /* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 10.7 } }
+    /* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 10.7 },
+    /* NVLS */ { 0, 0, 0 } }
 };

 /* Array indexes used below */
|
|||||||
#define HOPPER_COMPCAP_IDX 2
|
#define HOPPER_COMPCAP_IDX 2
|
||||||
|
|
||||||
// LL128 max BW per channel
|
// LL128 max BW per channel
|
||||||
static const double ll128MaxBwPerCh = 20.0;
|
static const double ll128MaxBwPerCh[3] = { 20.0, 20.0, 36.7 };
|
||||||
static const double llMaxBws[3][3] = {
|
static const double llMaxBws[3][3] = {
|
||||||
/* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
|
/* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
|
||||||
/* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
|
/* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
|
||||||
@ -88,7 +91,7 @@ static const double llMaxBws[3][3] = {
 static const double perChMaxTreeBws[3][3] = {
   /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
   /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
-  /* Hopper (N1/N2/N4) */ {24.0, 23.6, 17.8},
+  /* Hopper (N1/N2/N4) */ {38.7, 41.4, 33.0},
 };

 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
@ -98,7 +101,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
     getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
   comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
-  comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
+  comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] =
+  comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
   comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] =
     getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
   comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
@ -108,7 +112,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   int nRanks = comm->nRanks;
   if (nRanks <= 1) return ncclSuccess;

-  int compCapIndex = (minCompCap == 80 && maxCompCap == 80) ? AMPERE_COMPCAP_IDX : ((minCompCap == 90 && maxCompCap == 90) ? HOPPER_COMPCAP_IDX : VOLTA_COMPCAP_IDX);
+  int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
   int cpuArch, cpuVendor, cpuModel;
   NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
   int index2 = nNodes <= 2 ? nNodes-1 : 2;
@ -120,7 +124,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
   float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount

-  struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph };
+  struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph, ringGraph/* we only need the NVSwitch speed for NVLS*/ };
   int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
   for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
   for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
@ -134,20 +138,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
       nNodes;

   for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-    if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;
+    if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
+    if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
+    if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
+    if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;

     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      if (a == NCCL_ALGO_NVLS && p != NCCL_PROTO_SIMPLE) continue;
       int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
       float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
       float busBw = graphs[a]->nChannels * bw;

       // Various model refinements
       if (compCapIndex == AMPERE_COMPCAP_IDX) busBw = std::min(busBw, 235.0f);
+      if (compCapIndex == HOPPER_COMPCAP_IDX) busBw = std::min(busBw, 370.0f);
       if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
-      if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh*graphs[a]->nChannels);
+      if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
       if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
       if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
-      if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh*graphs[a]->nChannels);
+      if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
       if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
       if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
       if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
|
|||||||
if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75;
|
if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75;
|
||||||
|
|
||||||
// Convert bus BW to algorithm BW
|
// Convert bus BW to algorithm BW
|
||||||
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
|
float ratio;
|
||||||
|
if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
|
||||||
|
else if (a == NCCL_ALGO_NVLS) ratio = .75;
|
||||||
|
else ratio = .5;
|
||||||
comm->bandwidths[coll][a][p] = busBw * ratio;
|
comm->bandwidths[coll][a][p] = busBw * ratio;
|
||||||
|
|
||||||
comm->latencies[coll][a][p] = baseLat[a][p];
|
comm->latencies[coll][a][p] = baseLat[a][p];
|
||||||
@ -195,7 +207,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
|||||||
// Protocols/Algorithms enable/disable, and user overrides.
|
// Protocols/Algorithms enable/disable, and user overrides.
|
||||||
// All are enabled except ll128 which is enabled by default only in certain cases.
|
// All are enabled except ll128 which is enabled by default only in certain cases.
|
||||||
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
|
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
|
||||||
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1 };
|
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1 };
|
||||||
|
|
||||||
const char *protoStr = getenv("NCCL_PROTO");
|
const char *protoStr = getenv("NCCL_PROTO");
|
||||||
if (protoStr) {
|
if (protoStr) {
|
||||||
@ -207,6 +219,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
|||||||
INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
|
INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
|
||||||
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
|
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Disable NVLink SHARP if not supported
|
||||||
|
if (comm->nvlsSupport == 0 /* || comm->localRanks <= 2*/) algoEnable[NCCL_ALGO_NVLS] = 0;
|
||||||
|
|
||||||
// Disable CollNet if it is not supported
|
// Disable CollNet if it is not supported
|
||||||
if (comm->collNetSupport == 0) {
|
if (comm->collNetSupport == 0) {
|
||||||
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
|
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
|
||||||
@ -228,7 +244,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
|||||||
if (pEnable == 2 && p == NCCL_PROTO_LL128) {
|
if (pEnable == 2 && p == NCCL_PROTO_LL128) {
|
||||||
// Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption.
|
// Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption.
|
||||||
pEnable = 1;
|
pEnable = 1;
|
||||||
pEnable &= (graphs[a]->typeInter <= PATH_PXB);
|
pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN));
|
||||||
pEnable &= (graphs[a]->typeIntra <= PATH_NVL);
|
pEnable &= (graphs[a]->typeIntra <= PATH_NVL);
|
||||||
pEnable &= (minCompCap == maxCompCap);
|
pEnable &= (minCompCap == maxCompCap);
|
||||||
switch (minCompCap) {
|
switch (minCompCap) {
|
||||||
@ -239,8 +255,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
|
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
|
||||||
// Only disable algo for Allreduce since others only have one
|
// Never disable ring for non-allreduce operations. That allows to run real apps with NCCL_ALGO=TREE.
|
||||||
if (c == ncclFuncAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
|
if (a == NCCL_ALGO_RING && c != ncclFuncAllReduce) continue;
|
||||||
|
if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (comm->rank == 0) {
|
if (comm->rank == 0) {
|
||||||
@ -284,9 +301,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
|||||||
char* str = getenv("NCCL_THREAD_THRESHOLDS");
|
char* str = getenv("NCCL_THREAD_THRESHOLDS");
|
||||||
if (str) {
|
if (str) {
|
||||||
INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
|
INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
|
||||||
ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }};
|
ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }};
|
||||||
sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
|
sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
|
||||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
for (int a=0; a<2; a++) {
|
||||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||||
if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
|
if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
|
||||||
}
|
}
|
||||||
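After this change the NCCL_THREAD_THRESHOLDS override touches only the first two algorithm rows, which correspond to the Tree and Ring slots defined later in this commit (NCCL_ALGO_TREE == 0, NCCL_ALGO_RING == 1). A sketch of how six values would map, assuming the LL/LL128/Simple protocol ordering from ncclProtoStr; the values themselves are illustrative:

    // Hypothetical setting: NCCL_THREAD_THRESHOLDS="65536 65536 65536 32768 32768 32768"
    // parses into (columns follow NCCL_PROTO_LL, NCCL_PROTO_LL128, NCCL_PROTO_SIMPLE):
    ssize_t t[2][NCCL_NUM_PROTOCOLS] = {
      { 65536, 65536, 65536 },  // row 0: Tree thresholds
      { 32768, 32768, 32768 },  // row 1: Ring thresholds
    };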
@@ -323,7 +340,9 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
   if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
   if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
   if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
-      && info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring
+      && info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) {
+    lat *= info->comm->minCompCap < 90 ? 1.9 : 1.5; // Plateau effect of ring
+  }
   // Tree pipelining saves latency in aggregation cases
   int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
   *time = lat * latCount + (info->nBytes) / (1000 * bw);
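Read as a standalone helper, the bus-to-algorithm bandwidth conversion introduced in the tuning hunk above amounts to the following sketch (algoBwFromBusBw is a hypothetical name; the constants come straight from the diff):

    // Sketch: convert a modeled bus bandwidth into an algorithm bandwidth.
    // Ring scales with nRanks/nsteps; NVLS gets a fixed 0.75; others 0.5.
    float algoBwFromBusBw(int algo, float busBw, int nRanks, int nsteps) {
      float ratio;
      if (algo == NCCL_ALGO_RING) ratio = (1.0f * nRanks) / nsteps;
      else if (algo == NCCL_ALGO_NVLS) ratio = 0.75f;
      else ratio = 0.5f;
      return busBw * ratio;
    }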
@@ -315,7 +315,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
       ret = ncclSystemError;
     }
     job->state = ncclGroupJobJoined;
-    if (job->result != ncclSuccess) {
+    if (job->result != ncclSuccess && ret == ncclSuccess) {
       ret = job->result;
       errorJobAbortFlag = true;
     }
@@ -326,7 +326,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {

     if (*groupAbortFlag == true || errorJobAbortFlag == true) {
       *job->abortFlag = 1;
-      ret = ncclInternalError;
     }

     job = job->next;
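The extra ret == ncclSuccess guard makes error propagation first-error-wins; a minimal sketch of the pattern, with hypothetical list names:

    // First-error-wins: once ret records a failure, later failing jobs can no
    // longer overwrite it, so the caller sees the error that started the cascade.
    ncclResult_t ret = ncclSuccess;
    for (struct ncclAsyncJob* j = head; j != nullptr; j = j->next) {
      if (j->result != ncclSuccess && ret == ncclSuccess) ret = j->result;
    }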
@@ -25,6 +25,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s
 ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
 ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
 ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
+ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
 ncclResult_t bootstrapClose(void* commState);
 ncclResult_t bootstrapAbort(void* commState);
 #endif
@@ -53,7 +53,8 @@ struct ncclDevRedOpFull {
   DECL4(func, RING, devredop, type, undef) \
   DECL4(func, TREE, devredop, type, undef) \
   DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
-  DECL4(func, COLLNET_CHAIN, devredop, type, undef)
+  DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
+  DECL4(func, NVLS, devredop, type, undef)

 #if defined(__CUDA_BF16_TYPES_EXIST__)
 #define DECL2(func, devredop, undefForFloat) \
@@ -121,4 +122,13 @@ extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
 #define REDUCE_CHUNKSTEPS 1
 #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
+
+// We can't use the enum identifiers like ncclSum, ncclFloat, etc since this
+// macro will be used in preprocessor conditionals where enums have no meaning.
+#define NCCL_NVLS_SUPPORTS(/*ncclDataType_t*/ type, /*ncclDevRedOp_t*/ red) \
+  (((type==2 || type==3) && (red==0 || red==2 || red==3)) || \
+   ((type==4 || type==5) && (red==0 || red==2 || red==3)) || \
+   ((type==6 || type==9) && (red==0 || red==2 || red==3)) || \
+   (type==7 && red==0) || \
+   (type==8 && red==0))

 #endif
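The numeric codes mirror the public ncclDataType_t ordering (2/3 = int32/uint32, 4/5 = int64/uint64, 6 = half, 9 = bfloat16, 7 = float, 8 = double), and 0/2/3 appear to be sum/max/min in the device reduction-op numbering, so the macro reads: integer and half/bf16 types support sum/min/max over NVLS, while float and double support sum only. An illustrative preprocessor use (not taken from this commit):

    // Gate an NVLS specialization on support at preprocessing time.
    // 6 == ncclFloat16 in ncclDataType_t; 0 is assumed to be sum.
    #if NCCL_NVLS_SUPPORTS(6, 0)
      // instantiate the NVLS half-precision sum path here
    #endif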
@@ -104,6 +104,7 @@ struct ncclChannel {
   struct ncclTree tree;
   struct ncclTree collnetChain;
   struct ncclDirect collnetDirect;
+  struct ncclNvls nvls;
   int id; // index of this channel
   uint32_t workFifoSent; // last used work index+1
   uint64_t p2pOpCount;
@@ -177,8 +178,10 @@ struct ncclComm {
   int nRanks; // number of GPUs in communicator
   int cudaDev; // my cuda device index
   int compCap; // compute capability of the GPU
+  int minCompCap; // min compute capability in the communicator
   int64_t busId; // my PCI bus ID in int format
   cpu_set_t cpuAffinity; // CPU affinity of the GPU
+  int cudaArch; // matches __CUDA_ARCH__ of device

   int node;
   int nNodes;
@@ -201,6 +204,7 @@ struct ncclComm {

   // Channels for collectives
   int nChannels;
+  int nvlsChannels;
   // Channels (per peer) for p2p
   int p2pnChannels;
   int p2pnChannelsPerPeer;
@@ -257,6 +261,10 @@ struct ncclComm {
   int collNetSupport;
   int intraHighestTransportType;

+  // NVLink SHARP (NVLS) support
+  int nvlsSupport;
+  void* nvlsResources;
+
   size_t channelSize; // User requested work size (bytes) for channel partitions

   // Internal streams
@@ -288,6 +296,11 @@ struct ncclComm {

   // communicator mode
   int blocking;
+  // CGA cluster size
+  int cgaClusterSize;
+  int minCTAs, maxCTAs;
+  // network interface name
+  char *netName;
   // initState is to more conveniently reclaim resources when errors happen.
   ncclResult_t initState;
   // flag to indicate if ncclCommFinalize() is called
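These fields back the new user-facing knobs from this release's changelog (cgaClusterSize, minCTAs, maxCTAs, netName). A sketch of setting them through the public config interface, assuming the ncclConfig_t/ncclCommInitRankConfig API of this release and the usual uniqueId/nRanks/myRank init flow; the values are illustrative, not recommendations:

    #include <nccl.h>

    ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
    config.blocking       = 0;         // non-blocking communicator init
    config.cgaClusterSize = 4;         // CGA cluster size for kernel launches
    config.minCTAs        = 4;         // lower bound on CTAs/channels NCCL may use
    config.maxCTAs        = 16;        // upper bound on CTAs/channels
    config.netName        = "Socket";  // pick a network module by name
    ncclComm_t comm;
    ncclResult_t rc = ncclCommInitRankConfig(&comm, nRanks, uniqueId, myRank, &config);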
@@ -73,10 +73,32 @@ DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
 DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
 DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020);
 DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
+DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
 DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
+DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
+// cuMem API support
+DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
 #if CUDA_VERSION >= 11070
 DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
 #endif
+#if CUDA_VERSION >= 12010
+/* NVSwitch Multicast support */
+DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010);
+DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010);
+DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010);
+DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010);
+DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010);
+DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010);
+#endif
 #endif

 /* CUDA Driver functions loaded with dlsym() */
@@ -88,6 +110,7 @@ DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030);
 ncclResult_t ncclCudaLibraryInit(void);

 extern int ncclCudaDriverVersionCache;
+extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()

 inline ncclResult_t ncclCudaDriverVersion(int* driver) {
   int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED);
@@ -98,5 +121,4 @@ inline ncclResult_t ncclCudaDriverVersion(int* driver) {
   *driver = version;
   return ncclSuccess;
 }

 #endif
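The cuMulticast* entry points above are the CUDA 12.1 multicast-object driver API that NVLS builds on. A rough sketch of the canonical flow, with error handling and the inter-process handle exchange omitted; this is an assumption about usage, not NCCL's exact code:

    // Rough NVLS-style multicast setup for nDevs GPUs on one node (sketch).
    CUmulticastObjectProp prop = {};
    prop.numDevices = nDevs;
    prop.size = size;  // bytes to multicast
    prop.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
    size_t gran;
    cuMulticastGetGranularity(&gran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED);
    prop.size = ((size + gran - 1) / gran) * gran;  // round up to granularity

    CUmemGenericAllocationHandle mcHandle;
    cuMulticastCreate(&mcHandle, &prop);  // one rank creates the group...
    // ...exports it as an fd and hands it to its peers (e.g. over the IPC
    // socket added in this commit); each peer then:
    cuMulticastAddDevice(mcHandle, dev);                          // join the group
    cuMulticastBindMem(mcHandle, 0, memHandle, 0, prop.size, 0);  // bind local VRAM
    // Finally the multicast handle is mapped like regular cuMem memory:
    // cuMemAddressReserve + cuMemMap + cuMemSetAccess.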
@@ -15,11 +15,12 @@
 typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
 extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];

-#define NCCL_NUM_ALGORITHMS 4 // Tree/Ring/CollNet*
+#define NCCL_NUM_ALGORITHMS 5 // Tree/Ring/CollNet*
 #define NCCL_ALGO_TREE 0
 #define NCCL_ALGO_RING 1
 #define NCCL_ALGO_COLLNET_DIRECT 2
 #define NCCL_ALGO_COLLNET_CHAIN 3
+#define NCCL_ALGO_NVLS 4
 extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];

 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
@@ -78,6 +79,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
 #define NCCL_DIRECT_NIC 0x04
 #define NCCL_IPC_WRITE 0x08
 #define NCCL_IPC_READ 0x10
+#define NCCL_NVLS_MIN_POLL 0x20

 struct ncclConnInfo {
   // Regular comm mechanism
@@ -85,7 +87,7 @@ struct ncclConnInfo {
   uint64_t *tail; // Local for recv, remote for send
   uint64_t *head; // Local for send, remote for recv

-  int direct;     // Direct communication
+  int flags;      // Direct communication / other flags
   int shared;     // Buffers are shared
   void **ptrExchange; // Pointer exchange for direct communication
   uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
@@ -138,13 +140,22 @@ struct ncclTree {
 struct ncclDirect {
   int depth;
   int out;
-  int nHeads;
-  int headRank;
-  int shift;
+  int nHeads;   // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
+  int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
+  int shift;    // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
   int up[NCCL_MAX_DIRECT_ARITY];
   int down[NCCL_MAX_DIRECT_ARITY];
 };

+#define NCCL_MAX_NVLS_ARITY 8
+struct ncclNvls {
+  int out;
+  int nHeads;   // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
+  int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
+  int up[NCCL_MAX_NVLS_ARITY];
+  int down;
+};
+
 #define NCCL_MAX_CONNS 2
 struct ncclChannelPeer {
   struct ncclConnector send[NCCL_MAX_CONNS];
@@ -264,6 +275,7 @@ struct alignas(16) ncclDevChannel {
   struct ncclTree tree;
   struct ncclTree collnetChain;
   struct ncclDirect collnetDirect;
+  struct ncclNvls nvls;
   uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
 };

@@ -288,4 +300,65 @@ struct alignas(16) ncclDevCommAndChannels {
   struct ncclDevChannel channels[MAXCHANNELS];
 };

+#ifdef __CUDA_ARCH__
+  #define NCCL_CUDA_ARCH __CUDA_ARCH__
+#else
+  #define NCCL_CUDA_ARCH 0
+#endif
+
+template<typename T>
+__host__ __device__ constexpr T min_constexpr(T a) { return a; }
+template<typename T, typename ...Ts>
+__host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) {
+  return min_constexpr<T>((a < b ? a : b), c...);
+}
+
+template<typename T>
+__host__ __device__ constexpr T max_constexpr(T a) { return a; }
+template<typename T, typename ...Ts>
+__host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) {
+  return max_constexpr<T>((a > b ? a : b), c...);
+}
+
+// Calculate the unroll factor given:
+// * bytePerPack: number of bytes accessed per instruction
+// * insns: max permissible unroll value
+// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
+__host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) {
+  return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack);
+}
+
+// Note that all unroll value logic should depend on a given cudaArch argument
+// and not __CUDA_ARCH__ since these need to be host-side executable where the
+// arch value is strictly runtime only. By defaulting to NCCL_CUDA_ARCH, device
+// side code can elide passing the arch for brevity.
+
+__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
+  // Our collective unroll should move to the same bytes&insns model as NVLS.
+  return cudaArch >= 800 ? 8 : 4;
+}
+
+__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
+__host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; }
+
+__host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) {
+  return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch));
+}
+
+// The amount of dynamic shmem per warp
+__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
+  return (max_constexpr<int>(
+      /*LL    */0,
+      /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t),
+      /*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16,
+      // NVLS needs an extra 16B to read unaligned data.
+      /*NVLS  */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16
+    ) + 15) & -16; // pad to 16 bytes
+}
+
+// The amount of dynamic shmem per block
+__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
+  return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
+}
+
 #endif
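As a sanity check of the arithmetic in ncclShmemScratchWarpSize, here is the sm_90 case worked out, under the assumption that WARP_SIZE is 32 and NCCL_LL128_SHMEM_ELEMS_PER_THREAD is 8 (their values elsewhere in the codebase):

    // LL     -> 0
    // LL128  -> 8 * 32 * sizeof(uint64_t)           = 2048 bytes
    // SIMPLE -> (ncclCollUnroll(900)*32 + 1) * 16   = (8*32 + 1)*16 = 4112 bytes
    // NVLS   -> 32 * ncclNvlsUnrollBytes(900) + 16  = 32*64 + 16    = 2064 bytes
    // The max is 4112, already 16-byte aligned, so each warp gets 4112 bytes:
    static_assert(ncclShmemScratchWarpSize(900) == 4112, "per-warp scratch on sm_90");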
@@ -15,8 +15,7 @@
 #define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
 #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */

-size_t ncclKernMaxLocalSize();
-ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
+ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
 ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
 ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
@@ -24,6 +24,7 @@ typedef enum : uint8_t {
   ncclPatternTreeUpDown,
   ncclPatternCollnetChain,
   ncclPatternCollnetDirect,
+  ncclPatternNvls,
   ncclPatternSend,
   ncclPatternRecv
 } ncclPattern_t;

src/include/ipcsocket.h | 37 (new file)
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See COPYRIGHT for license information
+ */
+
+#ifndef NCCL_IPCSOCKET_H
+#define NCCL_IPCSOCKET_H
+
+#include "nccl.h"
+#include <stdio.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <memory.h>
+#include <sys/un.h>
+#include <inttypes.h>
+
+#define NCCL_IPC_SOCKNAME_LEN 64
+
+struct ncclIpcSocket {
+  int fd;
+  char socketName[NCCL_IPC_SOCKNAME_LEN];
+  volatile uint32_t* abortFlag;
+};
+
+ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
+ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
+
+ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
+ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
+
+#endif /* NCCL_IPCSOCKET_H */
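A sketch of the intended call sequence for the new helper, passing a file descriptor between two local ranks. Judging by the headers it includes, the implementation presumably rides on Unix-domain sockets and SCM_RIGHTS; the rank and hash values here are illustrative:

    // Sender: rank 0 hands an fd to rank 1 under a shared 64-bit hash.
    struct ncclIpcSocket sock;
    NCCLCHECK(ncclIpcSocketInit(&sock, /*rank=*/0, /*hash=*/0x1234, abortFlag));
    NCCLCHECK(ncclIpcSocketSendFd(&sock, fd, /*rank=*/1, /*hash=*/0x1234));
    NCCLCHECK(ncclIpcSocketClose(&sock));

    // Receiver: rank 1 picks the descriptor up from its own socket.
    struct ncclIpcSocket rsock;
    int recvFd = -1;
    NCCLCHECK(ncclIpcSocketInit(&rsock, /*rank=*/1, /*hash=*/0x1234, abortFlag));
    NCCLCHECK(ncclIpcSocketRecvFd(&rsock, &recvFd));
    NCCLCHECK(ncclIpcSocketClose(&rsock));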
@@ -20,7 +20,7 @@
 #define NCCL_NET_MAX_REQUESTS 8

 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;

 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

@@ -7,12 +7,12 @@
 #ifndef NCCL_NVTX_H_
 #define NCCL_NVTX_H_

-#include "nvtx3.hpp"
+#include "nvtx3/nvtx3.hpp"

-#if __cpp_constexpr >= 201304L && !defined(NVTX3_RELAXED_CONSTEXPR)
-#define NVTX3_RELAXED_CONSTEXPR constexpr
+#if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14)
+#define NVTX3_CONSTEXPR_IF_CPP14 constexpr
 #else
-#define NVTX3_RELAXED_CONSTEXPR
+#define NVTX3_CONSTEXPR_IF_CPP14
 #endif

 // Define all NCCL-provided static schema IDs here (avoid duplicates).
@@ -37,7 +37,7 @@ struct nccl_domain{static constexpr char const* name{"NCCL"};};

 class payload_schema {
  public:
-  NVTX3_RELAXED_CONSTEXPR explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
+  explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
   {
     schema_attr.name = schemaName;
     schema_attr.entries = entries;
@@ -74,11 +74,11 @@ class payload_schema {
 #define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
   static const payload_schema schema{S, std::extent<decltype(S)>::value, \
     NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \
-  static ::nvtx3::v1::registered_string<nccl_domain> const nvtx3_func_name__{__func__}; \
+  static ::nvtx3::v1::registered_string_in<nccl_domain> const nvtx3_func_name__{__func__}; \
   nvtxPayloadData_t nvtx3_bpl__[] = { \
     {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
-  ::nvtx3::v1::event_attributes nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \
-  ::nvtx3::v1::domain_thread_range<nccl_domain> const nvtx3_range__{nvtx3_func_attr__};
+  ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \
+  ::nvtx3::v1::scoped_range_in<nccl_domain> const nvtx3_range__{nvtx3_func_attr__};

 extern void initNvtxRegisteredEnums();

@@ -1,5 +1,5 @@
 /*
- * Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
+ * Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
 *
 * Licensed under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
The remaining bundled NVTX3 headers receive the same one-line copyright-year update (2009-2020 to 2009-2022); those hunks are identical to the one above.
@@ -1,12 +1,12 @@
 /*
- * Copyright 2021 NVIDIA Corporation. All rights reserved.
+ * Copyright 2021-2022 NVIDIA Corporation. All rights reserved.
 *
 * Licensed under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

-#include "nvtx3/nvToolsExt.h"
+#include "nvToolsExt.h"

 #ifndef NVTOOLSEXT_PAYLOAD_H
 #define NVTOOLSEXT_PAYLOAD_H
(One file diff suppressed because it is too large.)
@@ -35,10 +35,11 @@ NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM

 NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
 {
+  intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1;
   nvtxExtModuleSegment_t segment = {
     0, // unused (only one segment)
     NVTX3EXT_CBID_PAYLOAD_FN_NUM,
-    NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1
+    fnSlots
   };

   nvtxExtModuleInfo_t module = {
@@ -10,6 +10,7 @@
 #include "devcomm.h"
 #include "info.h"
 #include "socket.h"
+#include "ipcsocket.h"
 #include <pthread.h>
 #include "shm.h"

@@ -161,6 +162,31 @@ struct ncclProxyProgressState {
   int nextOps;
 };

+// Expected proxy response fifo
+struct ncclExpectedProxyResponse {
+  void* opId;
+  int respSize;
+  bool done;
+  void* respBuff;
+  struct ncclExpectedProxyResponse* next;
+};
+
+struct ncclProxyAsyncOp {
+  int type;
+  struct ncclProxyConnection* connection;
+  int reqSize, respSize;
+  char *reqBuff, *respBuff;
+  void* opId;
+  ncclProxyAsyncOp* next;
+};
+
+struct ncclProxyLocalPeer {
+  struct ncclSocket sock;
+  int localRank;
+  ncclProxyAsyncOp* asyncOps;
+  int asyncOpCounter;
+};
+
 struct ncclProxyState {
   // Service thread
   pthread_t thread;
@@ -176,6 +202,9 @@ struct ncclProxyState {

   // Progress thread
   struct ncclProxyProgressState progressState;
+
+  // Queue of expected responses from the proxy
+  struct ncclExpectedProxyResponse* expectedResponses;
 };

 enum proxyConnectState {
@@ -220,10 +249,19 @@ enum ncclProxyMsgType {
   ncclProxyMsgStart = 5,
   ncclProxyMsgClose = 6,
   ncclProxyMsgAbort = 7,
-  ncclProxyMsgStop = 8
+  ncclProxyMsgStop = 8,
+  ncclProxyMsgConvertFd = 9 // cuMem API support
 };

-ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
+// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
+// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
+// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed
+ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
+
+// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received
+ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
+ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);

 ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
 ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
 #endif
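Following the comments added above, a client-side sketch of the async pattern. The ncclProxyMsgSetup message type and the ncclInProgress polling convention are assumptions here, not shown in this hunk:

    // Issue the request with a locally unique opId, then poll with the same opId.
    void* opId = (void*)&localState;  // any pointer unique to this operation
    NCCLCHECK(ncclProxyCallAsync(&proxyConn, ncclProxyMsgSetup,
                                 reqBuff, reqSize, respSize, opId));
    ncclResult_t res;
    do {
      res = ncclPollProxyResponse(&proxyConn, respBuff, opId);
    } while (res == ncclInProgress);  // assumed "not yet done" return value
    NCCLCHECK(res);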
@@ -92,6 +92,6 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
 ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
 ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
-ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed);
+ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
 ncclResult_t ncclSocketClose(struct ncclSocket* sock);
 #endif
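With the new parameter, callers can poll a socket without blocking; a sketch (that blocking=false returns immediately with no data and no error when nothing is pending is an assumption based on the name):

    int closed = 0;
    NCCLCHECK(ncclSocketTryRecv(sock, buf, size, &closed, /*blocking=*/false));
    if (closed) {
      // Peer closed its end; tear down this connection.
    }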
@@ -62,7 +62,7 @@ struct ncclTransportComm {
 };

 struct ncclTransport {
-  const char name[4];
+  const char name[8];
   ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
   struct ncclTransportComm send;
   struct ncclTransportComm recv;
@@ -71,6 +71,9 @@ struct ncclTransport {
 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);

+ncclResult_t ncclNvlsSetup(struct ncclComm* comm);
+ncclResult_t ncclNvlsFree(struct ncclComm* comm);
+
 enum { collNetRecv=0, collNetSend=1 };
 int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
 ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
235
src/init.cc
235
src/init.cc
@ -35,13 +35,13 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
|
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
|
||||||
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain" };
|
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS" };
|
||||||
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
|
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
|
||||||
|
|
||||||
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
|
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
|
||||||
|
|
||||||
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
|
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
|
||||||
NCCL_PARAM(CommBlocking, "COMM_BLOCKING", 0);
|
NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT);
|
||||||
|
|
||||||
static uint64_t hashUniqueId(ncclUniqueId const &id) {
|
static uint64_t hashUniqueId(ncclUniqueId const &id) {
|
||||||
char const *bytes = (char const*)&id;
|
char const *bytes = (char const*)&id;
|
||||||
@ -67,12 +67,8 @@ ncclResult_t initGdrCopy() {
|
|||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
|
|
||||||
|
|
||||||
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
|
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
static bool initialized = false;
|
static bool initialized = false;
|
||||||
static size_t maxLocalSizeBytes = 0;
|
|
||||||
|
|
||||||
static ncclResult_t ncclInit() {
|
static ncclResult_t ncclInit() {
|
||||||
if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess;
|
if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess;
|
||||||
@ -80,9 +76,6 @@ static ncclResult_t ncclInit() {
|
|||||||
if (!initialized) {
|
if (!initialized) {
|
||||||
initEnv();
|
initEnv();
|
||||||
initGdrCopy();
|
initGdrCopy();
|
||||||
maxLocalSizeBytes = ncclKernMaxLocalSize();
|
|
||||||
int carveout = ncclParamL1SharedMemoryCarveout();
|
|
||||||
if (carveout) ncclKernSetSharedMemoryCarveout(carveout);
|
|
||||||
// Always initialize bootstrap network
|
// Always initialize bootstrap network
|
||||||
NCCLCHECK(bootstrapNetInit());
|
NCCLCHECK(bootstrapNetInit());
|
||||||
NCCLCHECK(ncclNetPluginInit());
|
NCCLCHECK(ncclNetPluginInit());
|
||||||
@ -210,6 +203,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
|||||||
NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
|
NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (comm->nvlsSupport) NCCLCHECK(ncclNvlsFree(comm));
|
||||||
|
|
||||||
struct ncclDestructor* dtor = comm->destructorHead;
|
struct ncclDestructor* dtor = comm->destructorHead;
|
||||||
while (dtor != nullptr) {
|
while (dtor != nullptr) {
|
||||||
NCCLCHECK(dtor->fn(dtor));
|
NCCLCHECK(dtor->fn(dtor));
|
||||||
@ -220,6 +215,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
|||||||
ncclMemoryStackDestruct(&comm->memPermanent);
|
ncclMemoryStackDestruct(&comm->memPermanent);
|
||||||
|
|
||||||
ncclCudaHostFree((void *)comm->abortFlag);
|
ncclCudaHostFree((void *)comm->abortFlag);
|
||||||
|
free(comm->netName);
|
||||||
|
|
||||||
commPoison(comm); // poison comm before free to avoid comm reuse.
|
commPoison(comm); // poison comm before free to avoid comm reuse.
|
||||||
free(comm);
|
free(comm);
|
||||||
@ -243,8 +239,8 @@ static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
|
|||||||
int flag = 0;
|
int flag = 0;
|
||||||
CUdevice dev;
|
CUdevice dev;
|
||||||
int cudaDriverVersion;
|
int cudaDriverVersion;
|
||||||
CUCHECK(cuDriverGetVersion(&cudaDriverVersion));
|
CUDACHECK(cudaDriverGetVersion(&cudaDriverVersion));
|
||||||
if (cudaDriverVersion < 11070) return ncclInternalError;
|
if (CUPFN(cuDeviceGet) == NULL || cudaDriverVersion < 11070) return ncclInternalError;
|
||||||
CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
|
CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
|
||||||
// Query device to see if DMA-BUF support is available
|
// Query device to see if DMA-BUF support is available
|
||||||
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev));
|
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev));
|
||||||
@ -265,7 +261,7 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
|
|||||||
NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
|
NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
|
||||||
if (ret != ncclSuccess) {
|
if (ret != ncclSuccess) {
|
||||||
/* if ret is not ncclInProgress, we just keep it. */
|
/* if ret is not ncclInProgress, we just keep it. */
|
||||||
WARN("Attempt to use communicator before the previous operation returned ncclSuccess\n");
|
WARN("Attempt to use communicator before the previous operation returned ncclSuccess");
|
||||||
if (ret == ncclInProgress) ret = ncclInvalidArgument;
|
if (ret == ncclInProgress) ret = ncclInvalidArgument;
|
||||||
goto exit;
|
goto exit;
|
||||||
}
|
}
|
||||||
@ -395,6 +391,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
|||||||
tmpCommAndChans.channels[c].tree = comm->channels[c].tree;
|
tmpCommAndChans.channels[c].tree = comm->channels[c].tree;
|
||||||
tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain;
|
tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain;
|
||||||
tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect;
|
tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect;
|
||||||
|
tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls;
|
||||||
tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c];
|
tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c];
|
||||||
|
|
||||||
if (comm->channels[c].ring.userRanks != nullptr) {
|
if (comm->channels[c].ring.userRanks != nullptr) {
|
||||||
@ -521,8 +518,8 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collN
|
|||||||
struct ncclChannel* channel = comm->channels + c;
|
struct ncclChannel* channel = comm->channels + c;
|
||||||
for (int h = 0; h < nHeads; h++) {
|
for (int h = 0; h < nHeads; h++) {
|
||||||
const int head = heads[h];
|
const int head = heads[h];
|
||||||
collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
|
collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
|
||||||
if (!collNetSetupFail) collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
|
if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
|
||||||
}
|
}
|
||||||
// Verify CollNet setup across ranks after trying the first channel
|
// Verify CollNet setup across ranks after trying the first channel
|
||||||
if (c == 0) {
|
if (c == 0) {
|
||||||
@ -922,6 +919,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
|||||||
// Check if we can setup CollNet
|
// Check if we can setup CollNet
|
||||||
if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph);
|
if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph);
|
||||||
|
|
||||||
|
NCCLCHECKGOTO(ncclNvlsSetup(comm), ret, fail);
|
||||||
|
|
||||||
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
|
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
|
||||||
|
|
||||||
// Compute time models for algorithm and protocol combinations
|
// Compute time models for algorithm and protocol combinations
|
||||||
@ -929,7 +928,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
|||||||
int myCompCap = comm->peerInfo[rank].cudaCompCap;
|
int myCompCap = comm->peerInfo[rank].cudaCompCap;
|
||||||
int minCompCap = myCompCap, maxCompCap = myCompCap;
|
int minCompCap = myCompCap, maxCompCap = myCompCap;
|
||||||
for (int i = 0; i < nranks; i++) {
|
for (int i = 0; i < nranks; i++) {
|
||||||
minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
|
comm->minCompCap = minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
|
||||||
maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
|
maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
|
||||||
}
|
}
|
||||||
NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail);
|
NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail);
|
||||||
@ -938,6 +937,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
|||||||
// Compute nChannels per peer for p2p
|
// Compute nChannels per peer for p2p
|
||||||
NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
|
NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
|
||||||
|
|
||||||
|
INFO(NCCL_INIT, "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
|
||||||
|
|
||||||
do { // Setup p2p structures in comm->tasks
|
do { // Setup p2p structures in comm->tasks
|
||||||
struct ncclTasks* tasks = &comm->tasks;
|
struct ncclTasks* tasks = &comm->tasks;
|
||||||
int nRanks = comm->nRanks;
|
int nRanks = comm->nRanks;
|
||||||
@@ -1004,12 +1005,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
       }
     }
   }
 
    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
  }
 
  // Connect to local net proxy
  NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
-  NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+  NCCLCHECKGOTO(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
 
  // Then to remote ones when using PXN
  if (ncclPxnDisable(comm) == 0) {
@@ -1017,7 +1019,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
    NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
    for (int r=0; r<nranks; r++) {
      NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
-      NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+      NCCLCHECKGOTO(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
    }
  }
 
@@ -1065,6 +1067,11 @@ fail:
 }
 
 NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0);
+NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", NCCL_CONFIG_UNDEF_INT);
+// Match config max/minCTAs
+NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT);
+NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT);
+#define NCCL_MAX_CGA_CLUSTER_SIZE 8
 
 struct ncclCommInitRankAsyncJob {
   struct ncclAsyncJob base;
@@ -1087,9 +1094,16 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   ncclUniqueId commId = job->commId; // C++ struct assignment
   int myrank = job->myrank;
   int cudaDev = job->cudaDev;
+  int archMajor, archMinor;
+  size_t maxLocalSizeBytes = 0;
   ncclResult_t res = ncclSuccess;
 
   CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
+  CUDACHECK(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
+  CUDACHECK(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev));
+  comm->cudaArch = 100*archMajor + 10*archMinor;
+
+  NCCLCHECK(ncclInitKernelsForDevice(comm->cudaArch, &maxLocalSizeBytes));
   // Set the maximum kernel stack size of all kernels to avoid
   // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
   if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) {
@@ -1114,18 +1128,143 @@ fail:
   goto exit;
 }
 
-static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
-  ncclResult_t ret = ncclSuccess;
-  /* first set configuration */
-  if (config) {
-    comm->blocking = config->blocking;
-  } else {
-    /* default setting of communicator */
-    comm->blocking = 1;
+#define NCCL_CONFIG_DEFAULT(config, field, undef, defvalue, fieldStr, format) \
+  if (config->field == undef) { \
+    config->field = defvalue; \
+  } else { \
+    INFO(NCCL_ENV, "Comm config " fieldStr " set to " format, config->field); \
   }
 
+static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
+  ncclResult_t ret = ncclSuccess;
+  /* config must not be NULL in this function */
+  int blockingEnv;
+  int cgaClusterSizeEnv;
+  int minCTAsEnv;
+  int maxCTAsEnv;
+  const char *envNetName, *tmpNetName;
+  ncclConfig_t defaultConfig = NCCL_CONFIG_INITIALIZER;
+  ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
+  ncclConfig_t *internalConfigPtr;
+  size_t realSize;
+
+  internalConfigPtr = &internalConfig;
+  if (config) {
+    memcpy((void*)&realSize, (void*)config, sizeof(size_t));
+    realSize = realSize > sizeof(ncclConfig_t) ? sizeof(ncclConfig_t) : realSize;
+    memcpy((void*)internalConfigPtr, (void*)config, realSize);
+    if (internalConfigPtr->magic != 0xcafebeef) {
+      WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER");
+      ret = ncclInvalidArgument;
+      goto fail;
+    }
+
+    /* check version. */
+    if (internalConfigPtr->version < NCCL_VERSION(2, 14, 0)) {
+      internalConfigPtr->blocking = defaultConfig.blocking;
+    }
+
+    if (internalConfigPtr->version < NCCL_VERSION(2, 17, 0)) {
+      internalConfigPtr->cgaClusterSize = defaultConfig.cgaClusterSize;
+      internalConfigPtr->minCTAs = defaultConfig.minCTAs;
+      internalConfigPtr->maxCTAs = defaultConfig.maxCTAs;
+      internalConfigPtr->netName = defaultConfig.netName;
+    }
+  }
+
+  /* check input config attributes, -1 means user-undefined and we should use default value from NCCL. */
+  if (internalConfigPtr->blocking != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) {
+    WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking);
+    ret = ncclInvalidArgument;
+    goto fail;
+  }
+
+  if (internalConfigPtr->cgaClusterSize != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->cgaClusterSize < 0) {
+    WARN("Invalid config cgaClusterSize attribute value %d", internalConfigPtr->cgaClusterSize);
+    ret = ncclInvalidArgument;
+    goto fail;
+  }
+
+  if ((internalConfigPtr->minCTAs != NCCL_CONFIG_UNDEF_INT &&
+    internalConfigPtr->minCTAs <= 0) ||
+    (internalConfigPtr->maxCTAs != NCCL_CONFIG_UNDEF_INT &&
+    internalConfigPtr->maxCTAs <= 0) ||
+    (internalConfigPtr->minCTAs > internalConfigPtr->maxCTAs)) {
+    WARN("Invalid config min/max channels attribute value %d/%d", internalConfigPtr->minCTAs, internalConfigPtr->maxCTAs);
+    ret = ncclInvalidArgument;
+    goto fail;
+  }
+
+  /* default config value can be tuned on different platform. */
+  NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d");
+  NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d");
+  NCCL_CONFIG_DEFAULT(internalConfigPtr, minCTAs, NCCL_CONFIG_UNDEF_INT, 1, "Min CTAs", "%d");
+  NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d");
+  NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s");
+
+  tmpNetName = internalConfigPtr->netName;
+
+  /* assign config to communicator */
+  comm->blocking = internalConfigPtr->blocking;
+  comm->cgaClusterSize = internalConfigPtr->cgaClusterSize;
+  comm->minCTAs = internalConfigPtr->minCTAs;
+  comm->maxCTAs = internalConfigPtr->maxCTAs;
+
+  /* override configuration from env variable. */
+  blockingEnv = ncclParamCommBlocking();
+  if (blockingEnv == 0 || blockingEnv == 1)
+    comm->blocking = blockingEnv;
+
+  cgaClusterSizeEnv = ncclParamCGAClusterSize();
+  if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) {
+    comm->cgaClusterSize = cgaClusterSizeEnv;
+  } else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) {
+    WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE);
+    comm->cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
+  }
+
+  minCTAsEnv = ncclParamMinCTAs();
+  if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
+    comm->minCTAs = minCTAsEnv;
+  }
+
+  maxCTAsEnv = ncclParamMaxCTAs();
+  if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
+    comm->maxCTAs = maxCTAsEnv;
+  }
+
+  /* cap channels if needed */
+  if (comm->minCTAs > MAXCHANNELS) {
+    WARN("minCTAs %d is larger than #channels upper limit %d", comm->minCTAs, MAXCHANNELS);
+    comm->minCTAs = MAXCHANNELS;
+  }
+
+  if (comm->maxCTAs > MAXCHANNELS) {
+    WARN("maxCTAs %d is larger than #channels upper limit %d", comm->maxCTAs, MAXCHANNELS);
+    comm->maxCTAs = MAXCHANNELS;
+  }
+
+  if (comm->minCTAs > comm->maxCTAs) {
+    WARN("minCTAs %d is larger than maxCTAs %d", comm->minCTAs, comm->maxCTAs);
+    ret = ncclInvalidArgument;
+    goto fail;
+  }
+
+  envNetName = getenv("NCCL_NET");
+  if (envNetName)
+    tmpNetName = envNetName;
+  if (tmpNetName != NULL) {
+    int netNameLen = strlen(tmpNetName) + 1;
+    comm->netName = (char*)malloc(netNameLen);
+    memcpy(comm->netName, tmpNetName, netNameLen);
+  } else {
+    comm->netName = NULL;
+  }
+
+exit:
   return ret;
+fail:
+  goto exit;
 }
 
 static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) {
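
Note (editor's illustration, not part of the commit): for each attribute, a user-supplied config value replaces the built-in default, an environment variable set at run time overrides both, and the result is capped at MAXCHANNELS. A minimal C sketch of that precedence for the maxCTAs field, using a hypothetical helper name:

    // Sketch only: effective precedence after parseCommConfig() runs.
    static int effectiveMaxCTAs(const ncclConfig_t* userConfig) {
      int v = MAXCHANNELS;                              // NCCL default
      if (userConfig && userConfig->maxCTAs != NCCL_CONFIG_UNDEF_INT)
        v = userConfig->maxCTAs;                        // user-provided config
      int env = ncclParamMaxCTAs();                     // NCCL_MAX_CTAS env var
      if (env != NCCL_CONFIG_UNDEF_INT) v = env;        // env var wins last
      if (v > MAXCHANNELS) v = MAXCHANNELS;             // capped to channel limit
      return v;
    }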
@@ -1151,6 +1290,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
  CUDACHECKGOTO(cudaFree(NULL), res, fail);
 
  NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, fail);
+  NCCLCHECKGOTO(PtrCheck(config, "CommInitRank", "config"), res, fail);
  if (nranks < 1 || myrank < 0 || myrank >= nranks) {
    WARN("Invalid rank requested : %d/%d", myrank, nranks);
    res = ncclInvalidArgument;
@@ -1201,12 +1341,13 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
  (void)ncclCudaLibraryInit();
 
  int cudaDev;
+  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  CUDACHECK(cudaGetDevice(&cudaDev));
 
  NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
  NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
 
-  NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, NULL));
+  NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config));
  return ncclSuccess;
 }
 
@@ -1215,6 +1356,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
  ncclResult_t ret = ncclSuccess;
  int totalnDev;
  int *gpuFlags = NULL;
+  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 
  constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = {
    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"}
@@ -1258,7 +1400,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
  NCCLCHECKGOTO(ncclGroupStart(), ret, fail);
  for (int i=0; i<ndev; i++) {
    // Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
-    ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, NULL);
+    ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, &config);
  }
  NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
 
@@ -1283,39 +1425,16 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI
  int cudaDev;
  ncclResult_t ret = ncclSuccess;
  ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
-  ncclConfig_t *internalConfigPtr;
-  size_t realSize;
-  int blockingEnv;
+  ncclConfig_t *internalConfigPtr = NULL;
 
  NCCLCHECK(ncclGroupStartInternal());
-  internalConfigPtr = &internalConfig;
-  if (config) {
-    memcpy((void*)&realSize, (void*)config, sizeof(size_t));
-    realSize = realSize > sizeof(ncclConfig_t) ? sizeof(ncclConfig_t) : realSize;
-    memcpy((void*)internalConfigPtr, (void*)config, realSize);
-    if (internalConfigPtr->magic != 0xcafebeef) {
-      WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER");
-      ret = ncclInvalidArgument;
-      goto exit;
-    }
-  }
-
-  /* check input config attributes */
-  if (internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) {
-    WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking);
-    ret = ncclInvalidArgument;
-    goto exit;
-  }
-
-  /* overwrite configuration from env variable. */
-  blockingEnv = ncclParamCommBlocking();
-  if (blockingEnv != 0 && blockingEnv != 1) {
-    WARN("Invalid NCCL_COMM_BLOCKING value %d", blockingEnv);
-  }
-  if (blockingEnv == 1) internalConfigPtr->blocking = blockingEnv;
-
  (void)ncclCudaLibraryInit();
-  CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, exit);
+  CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, fail);
+
+  if (config == NULL)
+    internalConfigPtr = &internalConfig;
+  else
+    internalConfigPtr = config;
  NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail);
 
 exit:
@@ -23,11 +23,33 @@ DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020);
 /* proxy.cc */
 DECLARE_CUDA_PFN(cuCtxCreate, 3020);
 DECLARE_CUDA_PFN(cuCtxDestroy, 4000);
+DECLARE_CUDA_PFN(cuCtxGetCurrent, 4000);
 DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000);
+DECLARE_CUDA_PFN(cuCtxGetDevice, 2000);
+/* cuMem API support */
+DECLARE_CUDA_PFN(cuMemAddressReserve, 10020);
+DECLARE_CUDA_PFN(cuMemAddressFree, 10020);
+DECLARE_CUDA_PFN(cuMemCreate, 10020);
+DECLARE_CUDA_PFN(cuMemGetAllocationGranularity, 10020);
+DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020);
+DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020);
+DECLARE_CUDA_PFN(cuMemMap, 10020);
+DECLARE_CUDA_PFN(cuMemRelease, 10020);
+DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
+DECLARE_CUDA_PFN(cuMemUnmap, 10020);
 #if CUDA_VERSION >= 11070
 /* transport/collNet.cc/net.cc*/
 DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
 #endif
+#if CUDA_VERSION >= 12010
+/* NVSwitch Multicast support */
+DECLARE_CUDA_PFN(cuMulticastAddDevice, 12010);
+DECLARE_CUDA_PFN(cuMulticastBindMem, 12010);
+DECLARE_CUDA_PFN(cuMulticastBindAddr, 12010);
+DECLARE_CUDA_PFN(cuMulticastCreate, 12010);
+DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010);
+DECLARE_CUDA_PFN(cuMulticastUnbind, 12010);
+#endif
 #endif
 
 /* CUDA Driver functions loaded with dlsym() */
@@ -39,6 +61,7 @@ DECLARE_CUDA_PFN(cuGetProcAddress, 11030);
 
 static void *cudaLib;
 int ncclCudaDriverVersionCache = -1;
+bool ncclCudaLaunchBlocking = false;
 
 #if CUDART_VERSION >= 11030
 /*
@@ -62,9 +85,33 @@ static ncclResult_t cudaPfnFuncLoader(void) {
  LOAD_SYM(cuMemGetAddressRange, 3020, 1);
  LOAD_SYM(cuCtxCreate, 3020, 1);
  LOAD_SYM(cuCtxDestroy, 4000, 1);
+  LOAD_SYM(cuCtxGetCurrent, 4000, 1);
  LOAD_SYM(cuCtxSetCurrent, 4000, 1);
+  LOAD_SYM(cuCtxGetDevice, 2000, 1);
+/* cuMem API support */
+#if CUDA_VERSION >= 11030
+  LOAD_SYM(cuMemAddressReserve, 10020, 1);
+  LOAD_SYM(cuMemAddressFree, 10020, 1);
+  LOAD_SYM(cuMemCreate, 10020, 1);
+  LOAD_SYM(cuMemGetAllocationGranularity, 10020, 1);
+  LOAD_SYM(cuMemExportToShareableHandle, 10020, 1);
+  LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1);
+  LOAD_SYM(cuMemMap, 10020, 1);
+  LOAD_SYM(cuMemRelease, 10020, 1);
+  LOAD_SYM(cuMemSetAccess, 10020, 1);
+  LOAD_SYM(cuMemUnmap, 10020, 1);
+#endif
 #if CUDA_VERSION >= 11070
  LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
+#endif
+#if CUDA_VERSION >= 12010
+/* NVSwitch Multicast support */
+  LOAD_SYM(cuMulticastAddDevice, 12010, 1);
+  LOAD_SYM(cuMulticastBindMem, 12010, 1);
+  LOAD_SYM(cuMulticastBindAddr, 12010, 1);
+  LOAD_SYM(cuMulticastCreate, 12010, 1);
+  LOAD_SYM(cuMulticastGetGranularity, 12010, 1);
+  LOAD_SYM(cuMulticastUnbind, 12010, 1);
 #endif
  return ncclSuccess;
 }
@@ -74,6 +121,11 @@ static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
 static ncclResult_t initResult;
 
 static void initOnceFunc() {
+  do {
+    char* val = getenv("CUDA_LAUNCH_BLOCKING");
+    ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
+  } while (0);
+
  CUresult res;
  /*
   * Load CUDA driver library
@@ -85,9 +137,10 @@ static void initOnceFunc() {
  else
    snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so");
 
+  (void) dlerror(); // Clear any previous errors
  cudaLib = dlopen(path, RTLD_LAZY);
  if (cudaLib == NULL) {
-    WARN("Failed to find CUDA library (NCCL_CUDA_PATH='%s') : %s", ncclCudaPath ? ncclCudaPath : "", dlerror());
+    WARN("Failed to find CUDA library %s (NCCL_CUDA_PATH='%s') : %s", path, ncclCudaPath ? ncclCudaPath : "", dlerror());
    goto error;
  }
 
src/misc/ipcsocket.cc (new file, 200 lines)
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See COPYRIGHT for license information
+ */
+
+#include "ipcsocket.h"
+#include "utils.h"
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+// Enable Linux abstract socket naming
+#define USE_ABSTRACT_SOCKET
+
+#define NCCL_IPC_SOCKNAME_STR "/tmp/nccl-socket-%d-%lx"
+
+/*
+ * Create a Unix Domain Socket
+ */
+ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) {
+  int fd = -1;
+  struct sockaddr_un cliaddr;
+  char temp[NCCL_IPC_SOCKNAME_LEN] = "";
+
+  if (handle == NULL) {
+    return ncclInternalError;
+  }
+
+  handle->fd = -1;
+  handle->socketName[0] = '\0';
+  if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
+    WARN("UDS: Socket creation error : %d", errno);
+    return ncclSystemError;
+  }
+
+  bzero(&cliaddr, sizeof(cliaddr));
+  cliaddr.sun_family = AF_UNIX;
+
+  // Create unique name for the socket.
+  int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
+  if (len > (sizeof(cliaddr.sun_path) - 1)) {
+    WARN("UDS: Cannot bind provided name to socket. Name too large");
+    return ncclInternalError;
+  }
+#ifndef USE_ABSTRACT_SOCKET
+  unlink(temp);
+#endif
+
+  TRACE(NCCL_INIT, "UDS: Creating socket %s", temp);
+
+  strncpy(cliaddr.sun_path, temp, len);
+#ifdef USE_ABSTRACT_SOCKET
+  cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
+#endif
+  if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) {
+    WARN("UDS: Binding to socket %s failed : %d", temp, errno);
+    close(fd);
+    return ncclSystemError;
+  }
+
+  handle->fd = fd;
+  strcpy(handle->socketName, temp);
+
+  handle->abortFlag = abortFlag;
+  // Mark socket as non-blocking
+  if (handle->abortFlag) {
+    int flags;
+    EQCHECK(flags = fcntl(fd, F_GETFL), -1);
+    SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+  }
+
+  return ncclSuccess;
+}
+
+ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) {
+  if (handle == NULL) {
+    return ncclInternalError;
+  }
+  if (handle->fd <= 0) {
+    return ncclSuccess;
+  }
+#ifndef USE_ABSTRACT_SOCKET
+  if (handle->socketName[0] != '\0') {
+    unlink(handle->socketName);
+  }
+#endif
+  close(handle->fd);
+
+  return ncclSuccess;
+}
+
+ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
+  struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
+  struct iovec iov[1];
+
+  // Union to guarantee alignment requirements for control array
+  union {
+    struct cmsghdr cm;
+    char control[CMSG_SPACE(sizeof(int))];
+  } control_un;
+
+  struct cmsghdr *cmptr;
+  char dummy_buffer[1];
+  int ret;
+
+  msg.msg_control = control_un.control;
+  msg.msg_controllen = sizeof(control_un.control);
+
+  iov[0].iov_base = (void *)dummy_buffer;
+  iov[0].iov_len = sizeof(dummy_buffer);
+
+  msg.msg_iov = iov;
+  msg.msg_iovlen = 1;
+
+  while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
+    if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
+      WARN("UDS: Receiving data over socket failed : %d", errno);
+      return ncclSystemError;
+    }
+    if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
+  }
+
+  if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
+    if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
+      WARN("UDS: Receiving data over socket failed");
+      return ncclSystemError;
+    }
+
+    memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
+  } else {
+    WARN("UDS: Receiving data over socket %s failed", handle->socketName);
+    return ncclSystemError;
+  }
+
+  TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
+
+  return ncclSuccess;
+}
+
+ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) {
+  struct msghdr msg;
+  struct iovec iov[1];
+  char temp[NCCL_IPC_SOCKNAME_LEN];
+
+  union {
+    struct cmsghdr cm;
+    char control[CMSG_SPACE(sizeof(int))];
+  } control_un;
+
+  struct cmsghdr *cmptr;
+  struct sockaddr_un cliaddr;
+
+  // Construct client address to send this shareable handle to
+  bzero(&cliaddr, sizeof(cliaddr));
+  cliaddr.sun_family = AF_UNIX;
+
+  int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
+  if (len > (sizeof(cliaddr.sun_path) - 1)) {
+    WARN("UDS: Cannot connect to provided name for socket. Name too large");
+    return ncclInternalError;
+  }
+  (void) strncpy(cliaddr.sun_path, temp, len);
+
+  TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
+
+#ifdef USE_ABSTRACT_SOCKET
+  cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
+#endif
+
+  msg.msg_control = control_un.control;
+  msg.msg_controllen = sizeof(control_un.control);
+
+  cmptr = CMSG_FIRSTHDR(&msg);
+  cmptr->cmsg_len = CMSG_LEN(sizeof(int));
+  cmptr->cmsg_level = SOL_SOCKET;
+  cmptr->cmsg_type = SCM_RIGHTS;
+
+  memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
+
+  msg.msg_name = (void *)&cliaddr;
+  msg.msg_namelen = sizeof(struct sockaddr_un);
+
+  iov[0].iov_base = (void *)"";
+  iov[0].iov_len = 1;
+  msg.msg_iov = iov;
+  msg.msg_iovlen = 1;
+  msg.msg_flags = 0;
+
+  ssize_t sendResult;
+  while ((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
+    if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
+      WARN("UDS: Sending data over socket %s failed : %d", temp, errno);
+      return ncclSystemError;
+    }
+    if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
+  }
+
+  return ncclSuccess;
+}
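
Note (editor's illustration, not part of the commit): the new file passes file descriptors between local processes over a Unix domain socket using SCM_RIGHTS ancillary data; the kernel duplicates the descriptor into the receiving process. A hedged sketch of how the two endpoints pair up, based only on the functions above (the rank/hash values are made up; ncclIpcSocket and NCCL_IPC_SOCKNAME_LEN come from ipcsocket.h, which is not shown in this diff):

    // Receiver: binds the UDS name derived from (rank=0, hash), waits for a fd.
    void exampleReceiver(volatile uint32_t* abortFlag) {
      struct ncclIpcSocket sock;
      int fd = -1;
      ncclIpcSocketInit(&sock, /*rank=*/0, /*hash=*/0x1234, abortFlag);
      ncclIpcSocketRecvFd(&sock, &fd);   // loops on EAGAIN, honoring abortFlag
      ncclIpcSocketClose(&sock);
      // fd is now a valid descriptor in this process
    }

    // Sender: binds its own name, then targets the receiver's (rank, hash) name.
    void exampleSender(int fdToShare, volatile uint32_t* abortFlag) {
      struct ncclIpcSocket sock;
      ncclIpcSocketInit(&sock, /*rank=*/1, /*hash=*/0x1234, abortFlag);
      ncclIpcSocketSendFd(&sock, fdToShare, /*rank=*/0, /*hash=*/0x1234);
      ncclIpcSocketClose(&sock);
    }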
@@ -43,7 +43,7 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr
 
 static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
  int closed;
-  NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
+  NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed));
  if (closed) {
    char line[SOCKET_NAME_MAXLEN+1];
    WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
@@ -785,17 +785,34 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
 }
 
 // Receive or detect connection closed
-ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) {
+ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) {
  int offset = 0;
  if (sock == NULL) {
    WARN("ncclSocketTryRecv: pass NULL socket");
    return ncclInvalidArgument;
  }
  *closed = 0;
+  // Block until connection closes or nbytes received
+  if (blocking) {
    while (offset < size) {
      NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
      if (*closed) return ncclSuccess;
    }
+  } else {
+    NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
+    if (*closed) return ncclSuccess;
+
+    // If any bytes were received, block waiting for the rest
+    if (offset > 0) {
+      while (offset < size) {
+        NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
+        if (*closed) return ncclSuccess;
+      }
+    // No bytes were received, return ncclInProgress
+    } else {
+      return ncclInProgress;
+    }
+  }
  return ncclSuccess;
 }
 
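
Note (editor's illustration, not part of the commit): the new blocking parameter makes ncclSocketTryRecv usable from a polling loop. With blocking=false it returns ncclInProgress when nothing has arrived yet, but once a partial message has started it blocks to finish it so the byte stream stays consistent. A minimal caller-side sketch (assumes sock is an established connection and header is a fixed-size struct):

    int closed = 0;
    ncclResult_t res = ncclSocketTryRecv(sock, &header, sizeof(header), &closed, /*blocking=*/false);
    if (res == ncclInProgress) {
      // nothing received yet; do other work and poll again later
    } else if (res == ncclSuccess && closed) {
      // peer closed the connection
    } else if (res == ncclSuccess) {
      // full header received (a partial read was completed internally)
    }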
@@ -25,8 +25,10 @@
 extern "C" {
 #endif
 
+#include <limits.h>
 /* Opaque handle to communicator */
 typedef struct ncclComm* ncclComm_t;
+#define NCCL_COMM_NULL NULL
 
 #define NCCL_UNIQUE_ID_BYTES 128
 typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
@@ -42,15 +44,22 @@ typedef enum { ncclSuccess = 0,
 ncclInProgress = 7,
 ncclNumResults = 8 } ncclResult_t;
 
+#define NCCL_CONFIG_UNDEF_INT INT_MIN
+#define NCCL_CONFIG_UNDEF_PTR NULL
+
 /* Communicator configuration. Users can assign value to attributes to specify the
  * behavior of a communicator. */
-typedef struct ncclConfig_v21400 {
+typedef struct ncclConfig_v21700 {
   /* attributes that users should never touch. */
   size_t size;
   unsigned int magic;
   unsigned int version;
   /* attributes that users are able to customize. */
   int blocking;
+  int cgaClusterSize;
+  int minCTAs;
+  int maxCTAs;
+  const char *netName;
 } ncclConfig_t;
 
 /* Config initializer must be assigned to initialize config structure when it is created.
@@ -59,7 +68,11 @@ typedef struct ncclConfig_v21400 {
   sizeof(ncclConfig_t), /* size */ \
   0xcafebeef, /* magic */ \
   NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
-  1 /* blocking */ \
+  NCCL_CONFIG_UNDEF_INT, /* blocking */ \
+  NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
+  NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
+  NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
+  NCCL_CONFIG_UNDEF_PTR /* netName */ \
 }
 
 /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
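
Note (editor's illustration, not part of the commit): with the extended initializer every attribute starts out "undefined", so only fields the user assigns are taken into account and everything else falls back to NCCL's defaults. A minimal usage sketch of the new options with ncclCommInitRankConfig (the values are illustrative, not recommendations):

    ncclComm_t comm;
    ncclConfig_t config = NCCL_CONFIG_INITIALIZER;  // all fields start undefined
    config.blocking = 0;          // request a non-blocking communicator
    config.cgaClusterSize = 4;    // CGA cluster size hint (illustrative value)
    config.minCTAs = 4;           // lower bound on CTAs (illustrative value)
    config.maxCTAs = 16;          // upper bound on CTAs (illustrative value)
    config.netName = "Socket";    // pick a network module by name (illustrative)
    ncclResult_t res = ncclCommInitRankConfig(&comm, nRanks, id, myRank, &config);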
src/net.cc
@@ -176,14 +176,8 @@ ncclResult_t ncclNetPluginInit() {
  }
  void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
  if (netPluginLib == nullptr) {
-    // dlopen does not guarantee to set errno, but dlerror only gives us a
-    // string, so checking errno doesn't hurt to try to provide a better
-    // error message
-    if (errno == ENOENT) {
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName);
-    } else {
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
-    }
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load (%s) returned %d : %s", ncclNetPluginName, errno, dlerror());
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found, using internal implementation");
    return ncclSuccess;
  }
 
@@ -264,9 +258,10 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
 
 ncclResult_t ncclNetInit(struct ncclComm* comm) {
  // Initialize main communication network
-  char* netName = getenv("NCCL_NET");
+  char* netName;
  bool ok = false;
 
+  netName = comm->netName;
  for (int i=0; i<3; i++) {
    if (ncclNets[i] == nullptr) continue;
    enum ncclNetState state;
@@ -324,9 +319,26 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
  ncclResult_t ret;
  ncclDebugNoWarn = NCCL_NET;
  NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1);
-  NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2);
-  NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3);
-  CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
+  bool connected;
+  connected = false;
+  while (!connected) {
+
+    // If we're aborting now, skip to cleanup
+    if (*comm->abortFlag) {
+      goto cleanup2;
+    }
+
+    if (sComm == NULL)
+      NCCLCHECKGOTO(ncclNetConnect(comm, dev, &handle, &sComm), ret, cleanup2);
+
+    if (rComm == NULL)
+      NCCLCHECKGOTO(ncclNetAccept(comm, lComm, &rComm), ret, cleanup2);
+
+    connected = (rComm != NULL) && (sComm != NULL);
+  }
+
+  CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
  if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
    NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle));
    NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
@@ -335,11 +347,11 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
  }
  ncclDebugNoWarn = 0;
  CUDACHECK(cudaFree(gpuPtr));
-cleanup4:
-  NCCLCHECK(ncclNetCloseRecv(comm, rComm));
-cleanup3:
-  NCCLCHECK(ncclNetCloseSend(comm, sComm));
 cleanup2:
+  if (rComm != NULL)
+    NCCLCHECK(ncclNetCloseRecv(comm, rComm));
+  if (sComm != NULL)
+    NCCLCHECK(ncclNetCloseSend(comm, sComm));
  NCCLCHECK(ncclNetCloseListen(comm, lComm));
 cleanup1:
  break;
src/proxy.cc
@@ -14,6 +14,7 @@
 #include "timer.h"
 
 #include <sys/syscall.h>
+#include <assert.h>
 
 enum { proxyRecv=0, proxySend=1 };
 
@@ -37,6 +38,155 @@ struct ncclProxyPool {
  struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
 };
 
+static void expectedProxyResponseFree(struct ncclProxyState* state) {
+  struct ncclExpectedProxyResponse* elem = state->expectedResponses;
+  struct ncclExpectedProxyResponse* prev = NULL;
+
+  while (elem) {
+    prev = elem;
+    elem = elem->next;
+    free(prev->respBuff);
+    free(prev);
+  }
+}
+
+static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize) {
+  struct ncclExpectedProxyResponse* elem = state->expectedResponses;
+  while (elem) {
+    if (elem->opId == opId) {
+      if (respSize != elem->respSize) {
+        WARN("Mismatched response size for opId=%p", opId);
+        return ncclInternalError;
+      }
+
+      if (elem->done) {
+        WARN("Storing response for already completed opId=%p", opId);
+        return ncclInternalError;
+      }
+
+      memcpy(elem->respBuff, respBuff, respSize);
+      elem->done = true;
+      return ncclSuccess;
+    }
+    elem = elem->next;
+  }
+
+  WARN("Proxy response for opId=%p doesn't match any expected response", opId);
+  return ncclInternalError;
+}
+
+static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, void* opId, int respSize, void* respData, int respDataSize) {
+  struct ncclExpectedProxyResponse* ex;
+  NCCLCHECK(ncclCalloc(&ex, 1));
+  ex->opId = opId;
+
+  // Pre-alloc response buffer
+  ex->respBuff = malloc(respSize);
+  ex->respSize = respSize;
+  ex->done = false;
+  if (respData) {
+    memcpy(ex->respBuff, respData, respDataSize);
+    ex->done = true;
+  }
+
+  // Enqueue
+  struct ncclExpectedProxyResponse* list = state->expectedResponses;
+  if (list == NULL) {
+    state->expectedResponses = ex;
+    return ncclSuccess;
+  }
+  while (list->next) list = list->next;
+  list->next = ex;
+  return ncclSuccess;
+}
+
+static ncclResult_t expectedProxyResponseDequeue(struct ncclProxyState* state, void* opId, void* respBuff, int* found) {
+  struct ncclExpectedProxyResponse* elem = state->expectedResponses;
+  struct ncclExpectedProxyResponse* prev = NULL;
+  *found = 0;
+  while (elem) {
+    if ((elem->opId == opId) && elem->done) {
+      if (prev == NULL) {
+        state->expectedResponses = elem->next;
+      } else {
+        prev->next = elem->next;
+      }
+      memcpy(respBuff, elem->respBuff, elem->respSize);
+      free(elem->respBuff);
+      free(elem);
+      *found = 1;
+      return ncclSuccess;
+    }
+    prev = elem;
+    elem = elem->next;
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t expectedProxyResponseRemove(struct ncclProxyState* state, void* opId) {
+  struct ncclExpectedProxyResponse* elem = state->expectedResponses;
+  struct ncclExpectedProxyResponse* prev = NULL;
+  while (elem) {
+    if (elem->opId == opId) {
+      if (prev == NULL) {
+        state->expectedResponses = elem->next;
+      } else {
+        prev->next = elem->next;
+      }
+      free(elem->respBuff);
+      free(elem);
+      return ncclSuccess;
+    }
+    prev = elem;
+    elem = elem->next;
+  }
+  WARN("Couldn't find opId=%p", opId);
+  return ncclInternalError;
+}
+
+static ncclResult_t asyncProxyOpEnqueue(struct ncclProxyLocalPeer* peer, ncclProxyAsyncOp* op) {
+  ncclProxyAsyncOp* list = peer->asyncOps;
+  if (list == NULL) {
+    peer->asyncOps = op;
+    return ncclSuccess;
+  }
+  while (list->next) list = list->next;
+  list->next = op;
+  return ncclSuccess;
+}
+
+static ncclResult_t asyncProxyOpDequeue(struct ncclProxyLocalPeer* peer, ncclProxyAsyncOp* op) {
+  struct ncclProxyAsyncOp* elem = peer->asyncOps;
+  struct ncclProxyAsyncOp* prev = NULL;
+  while (elem) {
+    if (elem->opId == op->opId) {
+      if (prev == NULL) {
+        peer->asyncOps = elem->next;
+      } else {
+        prev->next = elem->next;
+      }
+
+      if (elem->reqBuff) {
+        free(elem->reqBuff);
+      }
+      if (elem->respBuff) {
+        free(elem->respBuff);
+      }
+      free(elem);
+
+      return ncclSuccess;
+    }
+    prev = elem;
+    elem = elem->next;
+  }
+  if (op) {
+    WARN("Attempting to dequeue nonexistent async opId=%p", op->opId);
+  } else {
+    WARN("Attempting to dequeue null operation");
+  }
+  return ncclInternalError;
+}
+
 static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) {
  struct ncclProxyArgs* elem;
  if (state->pool == NULL) {
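
Note (editor's summary, not part of the commit): the helpers above behave like a keyed mailbox, with the opId pointer as the key. A condensed view of one request's lifecycle, in terms of the functions just added:

    // Lifecycle of one async proxy request:
    //   ncclProxyCallAsync(opId)     -> expectedProxyResponseEnqueue(state, opId, ...)
    //   proxy thread finishes the op -> response travels back over the peer socket
    //   ncclPollProxyResponse(opId)  -> expectedProxyResponseDequeue(state, opId, ...)
    //   if the socket delivers some other rank's opId first:
    //                                -> expectedProxyResponseStore(state, thatOpId, ...)
    //                                   (held in the list until its owner polls for it)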
@@ -86,7 +236,7 @@ ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState*
    pool = pool->next;
    p++;
  }
-  WARN("Could not find pool of op %p\n", op);
+  WARN("Could not find pool of op %p", op);
  return ncclInternalError;
 }
 
@@ -140,7 +290,7 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
      nextOp->state |= OP_SEEN;
      printf("\n");
      if (nextOp->next) {
-        WARN("Inactive op has next set!\n");
+        WARN("Inactive op has next set!");
      }
      nextOp = nextOp->nextPeer;
    }
@@ -337,7 +487,7 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector*
    }
  }
  if (lastOp == -1) {
-    WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)\n", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount);
+    WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount);
    return ncclInternalError;
  }
  // Cut chain at lastOp
@@ -770,19 +920,6 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclComm* comm) {
  return ncclSuccess;
 }
 
-struct ncclProxyAsyncOp {
-  int type;
-  struct ncclProxyConnection* connection;
-  int reqSize, respSize;
-  char *reqBuff, *respBuff;
-};
-
-struct ncclProxyLocalPeer {
-  struct ncclSocket sock;
-  int localRank;
-  struct ncclProxyAsyncOp asyncOps;
-};
-
 #define NCCL_PROXY_CONN_POOL_SIZE_POW2 7
 #define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2))
 #define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1)
@@ -790,7 +927,6 @@ struct ncclProxyConnectionPool {
  struct ncclProxyConnection** pools;
  int banks;
  int offset;
-  struct ncclProxyAsyncOp* ops;
 };
 
 static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) {
@@ -888,26 +1024,137 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
  return ncclSuccess;
 }
 
-const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" };
-ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
+const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd" };
+ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
  struct ncclSocket* sock;
  ncclResult_t ret = ncclSuccess;
+  void* respData = NULL;
+  int respDataSize = 0;
+  struct ncclComm* comm = proxyConn->comm;
+  struct ncclIpcSocket ipcSock = { 0 };
 
-  if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError;
-  sock = proxyConn->comm->proxyState.peerSocks + proxyConn->localRank;
+  if (*comm->abortFlag != 0) {
+    WARN("ncclProxyCallAsync() - Saw abortFlag while waiting for proxyThread response");
+    return ncclInternalError;
+  }
+  if (comm->proxyState.peerSocks == NULL) return ncclInternalError;
+
+  sock = comm->proxyState.peerSocks + proxyConn->localRank;
  if (sock == NULL) return ncclInternalError;
+
+  if (type == ncclProxyMsgConvertFd) {
+    // cuMem API support
+    // Create a UDS socket to receive the converted fd
+    NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->localRank, (uint64_t)proxyConn->connection, comm->abortFlag));
+  }
+
  NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
  NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
  NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
  NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error);
  if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error);
-  if (respSize) NCCLCHECKGOTO(ncclSocketRecv(sock, respBuff, respSize), ret, error);
+
+  if (type == ncclProxyMsgConvertFd) {
+    // cuMem API support
+    int recvFd = -1;
+    if (reqSize != sizeof(int) || respSize != sizeof(int)) return ncclInternalError;
+    // Receive converted fd over UDS
+    NCCLCHECK(ncclIpcSocketRecvFd(&ipcSock, &recvFd));
+    TRACE(NCCL_NET, "UDS: ConvertFd rank %d returned %p %d", proxyConn->localRank, &recvFd, recvFd);
+    assert(recvFd != -1);
+    respData = &recvFd;
+    respDataSize = sizeof(recvFd);
+    NCCLCHECK(ncclIpcSocketClose(&ipcSock));
+  } else {
+    // Send opId to proxy
+    NCCLCHECKGOTO(ncclSocketSend(sock, &opId, sizeof(opId)), ret, error);
+  }
+  // Add proxyOp to expected response queue
+  NCCLCHECK(expectedProxyResponseEnqueue(&comm->proxyState, opId, respSize, respData, respDataSize));
+
  return ncclSuccess;
 error:
-  WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
+  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
+  WARN("Proxy Call to rank %d failed (%s)", comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
  return ret;
 }
+
+ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) {
+  struct ncclComm* comm = proxyConn->comm;
+
+  // Receive the connection pointer from the Proxy
+  if (*comm->abortFlag) {
+    WARN("Comm %p is in abort state", comm);
+    return ncclInternalError;
+  }
+  if (comm->proxyState.peerSocks == NULL) return ncclInternalError;
+
+  // Check response queue
+  int found = 0;
+  NCCLCHECK(expectedProxyResponseDequeue(&comm->proxyState, opId, respBuff, &found));
+  if (found == 0) {
+    // Attempt to read in a new response header from the proxy thread
+    struct ncclSocket* sock = comm->proxyState.peerSocks + proxyConn->localRank;
+
+    void* recvOpId;
+    int offset = 0;
+    if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)) {
+      WARN("Socket recv failed while polling for opId=%p", opId);
+      return ncclInternalError;
+    }
+
+    if (offset == 0) {
+      return ncclInProgress;
+    // If we've returned a partial response, block to receive the rest of it
+    } else if (offset < sizeof(recvOpId)) {
+      while (offset < sizeof(recvOpId))
+        NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset));
+    }
+
+    INFO(NCCL_PROXY, "ncclPollProxyResponse Recieved new opId=%p", recvOpId);
+
+    // Now do a blocking recv of the response size
+    int respSize = 0;
+    NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(respSize)));
+
+    // If there's a respSize to recv
+    if (respSize > 0) {
+      NCCLCHECK(ncclSocketRecv(sock, respBuff, respSize));
+    }
+
+    if (recvOpId == opId) {
+      INFO(NCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId);
+      NCCLCHECK(expectedProxyResponseRemove(&comm->proxyState, recvOpId));
+      return ncclSuccess;
+    } else {
+      INFO(NCCL_PROXY, "Queing opId=%p", recvOpId);
+      // Store the result and mark response as completed
+      NCCLCHECK(expectedProxyResponseStore(&comm->proxyState, recvOpId, respBuff, respSize));
+      return ncclInProgress;
+    }
+  } else {
+    INFO(NCCL_PROXY, "ncclPollProxyResponse Dequeued cached opId=%p", opId);
+  }
+
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
+  // Alloc some memory to act as a handle
+  void* opId = malloc(1);
+
+  NCCLCHECK(ncclProxyCallAsync(proxyConn, type, reqBuff, reqSize, respSize, opId));
+  ncclResult_t res = ncclInProgress;
+
+  while (res == ncclInProgress) {
+    res = ncclPollProxyResponse(proxyConn, respBuff, opId);
+  }
+
+  free(opId);
+
+  return res;
+}
 
 static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
  if (state->opsPool == NULL) {
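
Note (editor's illustration, not part of the commit): ncclProxyCallBlocking above shows the canonical calling pattern; the same pair of calls can be split so the caller overlaps other work while the proxy thread services the request. A hedged caller-side sketch using only the API added in this commit:

    // Issue an async proxy request and poll for completion.
    // Any unique address works as the opId key; it must stay live until done.
    void* opId = malloc(1);
    NCCLCHECK(ncclProxyCallAsync(proxyConn, ncclProxyMsgSetup, req, reqSize, respSize, opId));

    ncclResult_t res;
    do {
      // ... do other useful work here ...
      res = ncclPollProxyResponse(proxyConn, respBuff, opId);
    } while (res == ncclInProgress);
    free(opId);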
@@ -998,16 +1245,55 @@ static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct
   if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError;
   int nChannels;
   NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int)));
+
+  // Store opId for completion response
+  void* opId;
+  NCCLCHECK(ncclSocketRecv(sock, &opId, sizeof(opId)));
+  INFO(NCCL_PROXY, "proxyConnSharedInit received opId=%p", opId);
+
   if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels));
   __atomic_store_n(&connection->state, connSharedInitialized, __ATOMIC_RELEASE);
+
+  // Send the opId for referencing async operation
+  INFO(NCCL_PROXY, "proxyConnSharedInit::ncclSocketSend(opId=%p)", opId);
+  NCCLCHECK(ncclSocketSend(connection->sock, &opId, sizeof(opId)));
+
+  // Send the response size
+  INFO(NCCL_PROXY, "proxyConnSharedInit::ncclSocketSend(op.respSize=%d)", respSize);
+  NCCLCHECK(ncclSocketSend(connection->sock, &respSize, sizeof(respSize)));
+
   return ncclSuccess;
 }

-static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount) {
+// cuMem API support
+static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, struct ncclComm* comm) {
+  struct ncclSocket* sock = &peer->sock;
+  uint64_t connection;
+  NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(uint64_t)));
+  int reqSize, respSize;
+  NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int)));
+  NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int)));
+  if (reqSize != sizeof(int) || respSize != sizeof(int)) return ncclInternalError;
+
+  int fd;
+  struct ncclIpcSocket ipcSock = { 0 };
+  NCCLCHECK(ncclSocketRecv(sock, &fd, sizeof(int)));
+
+  INFO(NCCL_NET, "UDS: proxyConvertFd received fd %d peer %d connection %lx", fd, peer->localRank, connection);
+  // Send back the converted fd using UDS
+  NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->localRank, connection^1, comm->abortFlag));
+  NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->localRank, connection));
+  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
+  return ncclSuccess;
+}
+
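proxyConvertFd relies on file-descriptor passing over a Unix domain socket; the ncclIpcSocket wrappers (added in misc/ipcsocket.cc) package the standard SCM_RIGHTS mechanism, by which the kernel duplicates the descriptor into the receiving process. A minimal standalone sketch of that underlying POSIX mechanism, for reference only (this is not the ncclIpcSocket API):

    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <string.h>

    // Duplicates `fd` into whichever process receives on `sock` (an AF_UNIX socket).
    static int sendFdOverUds(int sock, int fd) {
      char dummy = 0;
      struct iovec iov = { &dummy, 1 };              // must carry at least 1 data byte
      char ctrl[CMSG_SPACE(sizeof(int))];
      struct msghdr msg;
      memset(&msg, 0, sizeof(msg));
      msg.msg_iov = &iov; msg.msg_iovlen = 1;
      msg.msg_control = ctrl; msg.msg_controllen = sizeof(ctrl);
      struct cmsghdr* cm = CMSG_FIRSTHDR(&msg);
      cm->cmsg_level = SOL_SOCKET;
      cm->cmsg_type = SCM_RIGHTS;                    // kernel dup()s the fd for the peer
      cm->cmsg_len = CMSG_LEN(sizeof(int));
      memcpy(CMSG_DATA(cm), &fd, sizeof(int));
      return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
    }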
+static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount, struct ncclProxyLocalPeer* peer) {
   int done = 1;
   if (op->type == ncclProxyMsgSetup) {
+    INFO(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId);
     NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
   } else if (op->type == ncclProxyMsgConnect) {
+    INFO(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
     NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
   } else return ncclInternalError;
   if (done) {
@@ -1019,27 +1305,34 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclC
     * ncclSocketSend might already send the respBuff to the requester. If we still choose
     * to abort and close the connection, it can cause segfault if the requester is using
     * the respBuff. */
-    if (op->respSize) ncclSocketSend(op->connection->sock, op->respBuff, op->respSize);
-    if (op->reqBuff) {
-      free(op->reqBuff);
-      op->reqBuff = NULL;
+    // Send the opId for referencing async operation
+    NCCLCHECK(ncclSocketSend(op->connection->sock, &op->opId, sizeof(op->opId)));
+
+    // Send the response size
+    NCCLCHECK(ncclSocketSend(op->connection->sock, &op->respSize, sizeof(op->respSize)));
+
+    if (op->respSize) {
+      // Send the response
+      NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize));
     }
-    if (op->respBuff) {
-      free(op->respBuff);
-      op->respBuff = NULL;
-    }
-    op->type = 0;
+
+    asyncProxyOpDequeue(peer, op);
+
     (*asyncOpCount)--;
+    return ncclSuccess;
+
   } else if (*comm->abortFlag != 0) {
     return ncclInternalError;
   }

-  return ncclSuccess;
+  return ncclInProgress;
 }
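On completion, the handler above writes the opId, then the response size, then the optional payload; ncclPollProxyResponse reads them back in the same order and matches the opId against what it expects, queuing mismatches as shown earlier. A sketch of the framing from the reader's side, assuming the same socket helpers (error handling elided):

    void* recvOpId;                                       // 1) which async op completed
    int respSize;                                         // 2) size of the payload that follows
    NCCLCHECK(ncclSocketRecv(sock, &recvOpId, sizeof(recvOpId)));
    NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(respSize)));
    if (respSize > 0) NCCLCHECK(ncclSocketRecv(sock, respBuff, respSize));  // 3) payload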
 static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) {
   struct ncclSocket* sock = &peer->sock;
-  struct ncclProxyAsyncOp* asyncOp = &peer->asyncOps;
+  struct ncclProxyAsyncOp* asyncOp;
+  NCCLCHECK(ncclCalloc(&asyncOp, 1));
+
   asyncOp->type = type;
   NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));

@@ -1049,9 +1342,16 @@ static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* p
     NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
     NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
   }
+
+  // Store opId for completion response
+  NCCLCHECK(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)));
+
   if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize));
+
+  asyncProxyOpEnqueue(peer, asyncOp);
+
   (*asyncOpCount)++;
-  NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount));
+  NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount, peer));
   return ncclSuccess;
 }
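Ops are now heap-allocated and kept on a per-peer queue; asyncProxyOpEnqueue/asyncProxyOpDequeue are defined earlier in the file, outside this excerpt. Assuming the queue is a singly linked list threaded through op->next, as the `op = op->next` walk in the service loop below suggests, the pattern is roughly:

    // Minimal sketch of the per-peer op queue; illustrative, not the real helpers.
    static void enqueueOp(struct ncclProxyAsyncOp** head, struct ncclProxyAsyncOp* op) {
      op->next = *head;   // push front; progress order does not matter
      *head = op;
    }
    static void dequeueOp(struct ncclProxyAsyncOp** head, struct ncclProxyAsyncOp* op) {
      for (struct ncclProxyAsyncOp** p = head; *p != NULL; p = &(*p)->next) {
        if (*p == op) {
          *p = op->next;
          free(op->reqBuff);   // buffers were calloc'd at submit time (assumption:
          free(op->respBuff);  // the real dequeue also owns and frees them)
          free(op);
          return;
        }
      }
    }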
@@ -1081,7 +1381,7 @@ void* ncclProxyService(void* _args) {
     pollfds[s].events = POLLHUP|POLLIN;
   }
   if (ncclSocketGetFd(comm->proxyState.listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) {
-    WARN("[Proxy Service] Get listenSock fd fails\n");
+    WARN("[Proxy Service] Get listenSock fd fails");
     return NULL;
   };
   pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;

@@ -1113,14 +1413,14 @@ void* ncclProxyService(void* _args) {
       }
       if (maxnpeers < s+1) maxnpeers = s+1;
       if (ncclSocketInit(&peers[s].sock) != ncclSuccess) {
-        WARN("[Service thread] Initialize peers[%d].sock fails\n", s);
+        WARN("[Service thread] Initialize peers[%d].sock fails", s);
         return NULL;
       }
       if (ncclSocketAccept(&peers[s].sock, comm->proxyState.listenSock) != ncclSuccess) {
         WARN("[Service thread] Accept failed %s", strerror(errno));
       } else {
         if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) {
-          WARN("[Service thread] Get peers[%d].sock fd fails\n", s);
+          WARN("[Service thread] Get peers[%d].sock fd fails", s);
           return NULL;
         }
         npeers++;
@@ -1130,25 +1430,37 @@ void* ncclProxyService(void* _args) {
     for (int s=0; s<maxnpeers; s++) {
       struct ncclProxyLocalPeer* peer = peers+s;
       struct ncclSocket* sock = &peer->sock;
-      struct ncclProxyAsyncOp* op = &peer->asyncOps;
       int closeConn = 0;
       int type = 0;
       ncclResult_t res = ncclSuccess;

       if (pollfds[s].fd == -1) continue;
-      if (op->type != 0) {
-        res = proxyProgressAsync(op, comm, &asyncOpCount);
-        type = op->type;
-        if (res != ncclSuccess) closeConn = 1;
-      } else if (pollfds[s].revents & POLLIN) {
+
+      // Progress all ops for this ncclProxyLocalPeer
+      ncclProxyAsyncOp* op = peer->asyncOps;
+      while (op != nullptr) {
+        type = op->type;
+        res = proxyProgressAsync(op, comm, &asyncOpCount, peer);
+        if (res == ncclSuccess || res == ncclInProgress) {
+          op = op->next;
+        } else {
+          // Res is a bad result
+          closeConn = 1;
+          WARN("[Service thread] Error encountered progressing operation=%s, res=%d, closing connection", ncclProxyMsgTypeStr[type], res);
+          break;
+        }
+      }
+
+      // Check for additional ops coming in
+      if (pollfds[s].revents & POLLIN) {
         int closed;
-        if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) {
-          WARN("[Service thread] Could not receive type from localRank %d", peer->localRank);
+        res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/);
+        if (res != ncclSuccess && res != ncclInProgress) {
+          WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->localRank, res, closed);
           closeConn = 1;
         } else if (closed) {
           INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank);
           closeConn = 1;
-        } else {
+        } else if (res == ncclSuccess) { // We received something from the sock
           if (type == ncclProxyMsgStop) {
             stop = 1;
             closeConn = 1;
@@ -1159,30 +1471,32 @@ void* ncclProxyService(void* _args) {
           } else if (type == ncclProxyMsgSharedInit) {
             res = proxyConnSharedInit(peers+s, &connectionPool, comm);
           } else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) {
+            INFO(NCCL_PROXY, "proxyConnSetupConnect for peer->localRank %d,", peer->localRank);
             res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount);
+          } else if (type == ncclProxyMsgConvertFd) {
+            res = proxyConvertFd(peers+s, comm); // cuMem API support
           } else {
-            WARN("[Service thread] Unknown command %d from localRank %d\n", type, peer->localRank);
+            WARN("[Service thread] Unknown command %d from localRank %d", type, peer->localRank);
             closeConn = 1;
           }
+
+          INFO(NCCL_PROXY, "Received and initiated operation=%s res=%d", ncclProxyMsgTypeStr[type], res);
         }
       } else if (pollfds[s].revents & POLLHUP) {
         closeConn = 1;
       }
-      if (res != ncclSuccess) {
+      if (res != ncclSuccess && res != ncclInProgress) {
         WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], comm->localRankToRank[peer->localRank], res);
         closeConn = 1;
       }

       if (closeConn) {
         ncclSocketClose(sock);
-        if (op->reqBuff) {
-          free(op->reqBuff);
-          op->reqBuff = NULL;
-        }
-        if (op->respBuff) {
-          free(op->respBuff);
-          op->respBuff = NULL;
-        }
-        op->type = 0;
+
+        if (op != nullptr) {
+          asyncProxyOpDequeue(peer, op);
+          asyncOpCount--;
+        }
+
         pollfds[s].fd = -1;
         npeers--;
       }
@@ -1250,6 +1564,7 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
     free(state->peerSocks);
     free(state->proxyOps);
     free(state->sharedDevMems);
+    expectedProxyResponseFree(state);
   }
   return ncclSuccess;
 }
@@ -69,9 +69,12 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
   // Stream used during transport setup; need for P2P pre-connect + CUDA Graph
   ncclResult_t ret = ncclSuccess;
   int highestType = TRANSPORT_P2P;  // track highest transport type
-  struct ncclConnect data[2*MAXCHANNELS];
+  struct ncclConnect** data = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Store intermediate send/recvData structs for connect
+  struct ncclConnect** recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given recv connection within a channel
+  struct ncclConnect** sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given send connection within a channel

   NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
+  // First time initialization
   for (int i=1; i<comm->nRanks; i++) {
     int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
     int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
@@ -79,22 +82,28 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     uint64_t recvMask = comm->connectRecv[recvPeer];
     uint64_t sendMask = comm->connectSend[sendPeer];

-    struct ncclConnect* recvData = data;
+    // Data[i] contains all ncclConnect information for all send and receive connections with a given send and recv peer
+    // This data is packed in the array based on the number of sendChannels and recvChannels connected with these peers
+    // The first N entries contain recvData, connection information for recv connections
+    // The next M entries contain sendData, connection information for send connections
+    // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
+    data[i] = (ncclConnect*) malloc(sizeof(ncclConnect) * 2*MAXCHANNELS);
+    recvData[i] = data[i];
     int sendChannels = 0, recvChannels = 0;
     int type;
     TIME_START(0);
     for (int c=0; c<MAXCHANNELS; c++) {
       if (recvMask & (1UL<<c)) {
-        NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
+        NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData[i]+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
         if (type > highestType) highestType = type;
       }
     }
     TIME_STOP(0);
     TIME_START(1);
-    struct ncclConnect* sendData = recvData+recvChannels;
+    sendData[i] = recvData[i]+recvChannels;
     for (int c=0; c<MAXCHANNELS; c++) {
       if (sendMask & (1UL<<c)) {
-        NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
+        NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData[i]+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
         if (type > highestType) highestType = type;
       }
     }
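The comment block above defines the packing contract for data[i]; a concrete instance may help. With recvChannels = 2 and sendChannels = 3 for peer i, the per-peer array looks like this (illustrative layout, derived directly from the pointer arithmetic above):

    //   data[i] -> [ recv c0 | recv c1 | send c0 | send c1 | send c2 ]
    //                ^                    ^
    //                recvData[i]          sendData[i] = recvData[i] + recvChannels
    //
    // The peer packs its array the same way, so in the sendPeer == recvPeer
    // exchange below the roles flip: our sends line up with its recvs.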
@@ -103,41 +112,81 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     TIME_START(2);
     if (sendPeer == recvPeer) {
       if (recvChannels+sendChannels) {
-        NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
-        NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
-        sendData = data;
-        recvData = data+sendChannels;
+        NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
+        NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
+        sendData[i] = data[i];
+        recvData[i] = data[i]+sendChannels;
       }
     } else {
-      if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
-      if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
-      if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
-      if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
+      if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail);
+      if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail);
+      if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail);
+      if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail);
     }
     TIME_STOP(2);
+  }

-  TIME_START(3);
+  // Loop until all channels with all ranks have been connected
+  bool allChannelsConnected;
+  allChannelsConnected = false;
+  while (!allChannelsConnected) {
+    allChannelsConnected = true;
+    for (int i=1; i<comm->nRanks; i++) {
+      int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
+      int sendPeer = (comm->rank + i) % comm->nRanks;
+      uint64_t recvMask = comm->connectRecv[recvPeer];
+      uint64_t sendMask = comm->connectSend[sendPeer];
+
+      int sendDataOffset = 0;
+      int recvDataOffset = 0;
       for (int c=0; c<MAXCHANNELS; c++) {
+        TIME_START(3);
         if (sendMask & (1UL<<c)) {
           struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
-          NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn), ret, fail);
-          conn->connected = 1;
-          CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
+          // This connector hasn't completed connection yet
+          if (conn->connected == 0) {
+            NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
+            if (ret == ncclSuccess) {
+              conn->connected = 1;
+              CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
+            } else if (ret == ncclInProgress) {
+              allChannelsConnected = false;
+            }
+          }
         }
         TIME_STOP(3);
+
+        // Start with recv channels
         TIME_START(4);
-      }
-      for (int c=0; c<MAXCHANNELS; c++) {
         if (recvMask & (1UL<<c)) {
           struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
-          NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn), ret, fail);
-          conn->connected = 1;
-          CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
+          // This connector hasn't completed connection yet
+          if (conn->connected == 0) {
+            NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
+            if (ret == ncclSuccess) {
+              conn->connected = 1;
+              CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
+            } else if (ret == ncclInProgress) {
+              allChannelsConnected = false;
+            }
+          }
         }
         TIME_STOP(4);
       }
-      comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
     }
   }

+  // Clear all connect masks and free each connectInfo array
+  for (int i=1; i<comm->nRanks; i++) {
+    int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
+    int sendPeer = (comm->rank + i) % comm->nRanks;
+    comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
+    free(data[i]);
+  }
+
+  free(data);
+  free(sendData);
+  free(recvData);
+
   if (highestTransportType != NULL) *highestTransportType = highestType;
   TIME_PRINT("P2P Setup/Connect");
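The rewrite turns per-channel connect() into a resumable operation: a transport may now return ncclInProgress, and the outer while sweeps every peer and channel again until all connectors report success. The idiom in miniature (the container and helper names here are hypothetical, for illustration only):

    bool done = false;
    while (!done) {
      done = true;
      for (struct ncclConnector* conn : pendingConnectors) {   // hypothetical container
        if (conn->connected) continue;
        ncclResult_t r = tryConnect(conn);                     // hypothetical helper
        if (r == ncclSuccess) conn->connected = 1;
        else if (r == ncclInProgress) done = false;            // revisit on next sweep
        else return r;                                         // real error
      }
    }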
@@ -152,13 +152,13 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   int proxyRank;
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
-  send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+  send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
   // Determine whether we need to flush the GDR buffer on recv or not
   if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));

   NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank));
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
-  NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
+  NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));

   INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
       req.useGdr ? "/GDRDMA" : "");

@@ -171,12 +171,12 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   int proxyRank;
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
-  recv->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+  recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;

   NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank));
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
   struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
-  NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
+  NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));

   INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
       req.useGdr ? "/GDRDMA" : "");
@@ -221,7 +221,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
   // We're on the same process as the proxy. We can pass a pointer to a struct.
   struct collNetConnectArgs args = { rank, nranks, connectInfos };
   struct connectMap* map;
-  NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
+  NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));

   // If collnet connect failed, propagate error to fallback on regular p2p
   if (map == NULL) return ncclSystemError;

@@ -247,7 +247,7 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
   // We're on the same process as the proxy. We can pass a pointer to a struct.
   struct collNetConnectArgs args = { rank, nranks, connectInfos };
   struct connectMap* map;
-  NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
+  NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));

   // If collnet connect failed, propagate error to fallback on regular p2p
   if (map == NULL) return ncclSystemError;
@@ -410,7 +410,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
 }

 static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
-  if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
+  if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
   struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
   struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);

@@ -426,7 +426,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
   NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));

   // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
-  if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; }
+  if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
   if (resources->collNetComm == NULL) {
     *((struct connectMap**)respBuff) = NULL;
     return ncclSuccess;

@@ -484,7 +484,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
 }

 static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
-  if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
+  if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
   struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;

   struct recvResources* resources = (struct recvResources*)(connection->transportResources);

@@ -494,7 +494,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));

   // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
-  if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; }
+  if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
   if (resources->collNetComm == NULL) {
     *((struct connectMap**)respBuff) = NULL;
     return ncclSuccess;

@@ -553,7 +553,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
     info->mhandles[p] = resources->mhandles[p];

-  if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; }
+  if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
   *((struct connectMap**)respBuff) = &resources->map;
   return ncclSuccess;
 }
@@ -172,13 +172,13 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   int proxyRank;
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
-  send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+  send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;

   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
   req.rank = myInfo->rank;
   NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
   req.remoteRank = peerInfo->rank;
-  NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
+  NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));

   if (proxyRank == myInfo->rank) {
     INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,

@@ -218,8 +218,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   req.rank = myInfo->rank;
   NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
   req.remoteRank = peerInfo->rank;
-  NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
+  NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));

   INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
       req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
   return ncclSuccess;
@@ -264,11 +263,28 @@ static ncclResult_t netDumpMap(struct connectMap* map) {
 }

 static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
+  struct connectMap* map = (connectMap*) send->transportResources;
+  void* opId;
+
+  // map isn't allocated thus this op hasn't been submitted yet
+  if (!map) {
     // Setup device pointers
-  struct connectMap* map;
     NCCLCHECK(ncclCalloc(&map, 1));
     send->transportResources = map;
-  NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), map, sizeof(struct connectMap)));
+    opId = send;
+    INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId);
+    NCCLCHECK(ncclProxyCallAsync(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId));
+  } else {
+    opId = send;
+  }
+
+  ncclResult_t ret;
+  NCCLCHECK(ret = ncclPollProxyResponse(&send->proxyConn, map, opId));
+  if (ret == ncclInProgress) {
+    return ret;
+  }
+  INFO(NCCL_PROXY, "sendConnect ncclPollProxyResponse opId=%p", opId);

   if (map->sameProcess) {
     if (map->cudaDev != comm->cudaDev) {
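sendConnect (and recvConnect below) are now re-entrant: the first call allocates the map and submits the async proxy request; subsequent calls skip straight to polling. transportResources doubles as the already-submitted marker, and the connector pointer itself serves as the opId. The shape of the pattern, with hypothetical helper names for illustration:

    if (conn->transportResources == NULL) {
      allocResources(conn);          // hypothetical: also marks "submitted"
      submitAsyncConnect(conn);      // hypothetical: ncclProxyCallAsync under the hood
    }
    ncclResult_t r = pollConnect(conn);  // hypothetical: ncclPollProxyResponse
    if (r == ncclInProgress) return r;   // caller re-invokes on the next sweep
    // ...fall through: the response landed, finish local setup...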
@@ -315,10 +331,26 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne

 /* Connect to this peer */
 static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
-  struct connectMap* map;
+  struct connectMap* map = (connectMap*) recv->transportResources;
+  void* opId;
+  if (!map) {
     NCCLCHECK(ncclCalloc(&map, 1));
     recv->transportResources = map;
-  NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), map, sizeof(struct connectMap)));
+    // Use recv connector as unique identifier
+    opId = recv;
+    INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p",
+         opId, &recv->proxyConn, connectInfo);
+    NCCLCHECK(ncclProxyCallAsync(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId));
+  } else {
+    opId = recv;
+  }
+
+  ncclResult_t ret;
+  NCCLCHECK(ret = ncclPollProxyResponse(&recv->proxyConn, map, opId));
+  if (ret == ncclInProgress) {
+    return ret;
+  }
+  INFO(NCCL_PROXY, "recvConnect ncclPollProxyResponse opId=%p", opId);
   //NCCLCHECK(netDumpMap(map));

   struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
@@ -490,12 +522,14 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
   NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm));
   *done = 1;
+
   return ncclSuccess;
 }

 static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   struct sendResources* resources = (struct sendResources*)(connection->transportResources);
   if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
+  ncclResult_t ret = ncclSuccess;

   if (resources->shared) {
     // Shared buffers

@@ -515,21 +549,22 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
       NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
     }
     struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
-    if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId));
+    if (comms->sendComm[resources->channelId] == NULL) ret = ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId);
     resources->netSendComm = comms->sendComm[resources->channelId];
     if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
   } else {
-    NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
+    ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
   }
   } else {
     // Connect to remote peer
-    NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
+    ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
     connection->proxyAppendPtr = &connection->proxyAppend;
   }

+  NCCLCHECK(ret);
   if (resources->netSendComm == NULL) {
     *done = 0;
-    return ncclSuccess;
+    return ncclInProgress;
   }
   *done = 1;
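The proxy handlers now distinguish "not ready yet" from success: a NULL comm from a non-blocking ncclNetConnect/ncclNetAccept yields ncclInProgress, and the service loop keeps re-driving the op on subsequent sweeps. The resulting contract, summarized informally (inferred from this code and the service loop earlier in this commit):

    // Handler return contract (informal summary):
    //   ncclSuccess, *done = 1   -> finished; the response is sent back to the caller
    //   ncclInProgress           -> not ready; proxyProgressAsync will retry the op
    //   anything else            -> hard error; the service thread closes the connection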
@@ -630,6 +665,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   if (reqSize != sizeof(int)) return ncclInternalError;
   struct recvResources* resources = (struct recvResources*)(connection->transportResources);
   resources->proxyRank = *(int*)reqBuff;
+  ncclResult_t ret = ncclSuccess;

   // Finish connection establishment from remote peer
   if (resources->shared) {

@@ -650,23 +686,25 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
       NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
     }
     struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
-    if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId));
+    if (comms->recvComm[resources->channelId] == NULL) ret = ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId);
     resources->netRecvComm = comms->recvComm[resources->channelId];
     if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
   } else {
-    NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
+    ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
   }
   } else {
     // Connect to remote peer
-    NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
+    ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
     connection->proxyAppendPtr = &connection->proxyAppend;
   }

+  NCCLCHECK(ret);
   if (resources->netRecvComm == NULL) {
     *done = 0;
-    return ncclSuccess;
+    return ncclInProgress;
   }
   *done = 1;

   NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm));

   // Create structures
@@ -363,7 +363,9 @@ enum ncclIbCommState {
   ncclIbCommStateAccept = 3,
   ncclIbCommStateSend = 4,
   ncclIbCommStateRecv = 5,
-  ncclIbCommStateConnected = 6,
+  ncclIbCommStateConnecting = 6,
+  ncclIbCommStateConnected = 7,
+  ncclIbCommStatePendingReady = 8,
 };

 struct ncclIbCommStage {

@@ -601,6 +603,8 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {

   if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
   if (stage->state == ncclIbCommStateSend) goto ib_send;
+  if (stage->state == ncclIbCommStateConnecting) goto ib_connect;
+  if (stage->state == ncclIbCommStateConnected) goto ib_send_ready;
   if (stage->state != ncclIbCommStateStart) {
     WARN("Error: trying to connect already connected sendComm");
     return ncclInternalError;
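With the two new states the sender-side connect becomes a fully non-blocking state machine: each call to ncclIbConnect resumes at stage->state and returns ncclSuccess without completing whenever a socket operation would block. Sketched as a progression (informal, derived from the goto table above and the hunks below):

    //   Start -> Connect (TCP connect) -> Send (send local QP info)
    //         -> Connecting (recv remote QP info; move QPs to RTR/RTS)
    //         -> Connected (send ready flag) -> done, *sendComm returned
    //
    // The receiver mirrors this with PendingReady: after sending its QP info it
    // polls for the sender's ready flag before returning recvComm.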
@@ -664,11 +668,37 @@ ib_connect_check:

 ib_send:
   NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset));
-  if (stage->offset != sizeof(qpInfo))
-    return ncclSuccess;
+  if (stage->offset != sizeof(qpInfo)) return ncclSuccess;
+
+  stage->state = ncclIbCommStateConnecting;
+  stage->offset = 0;
+  // Clear the staging buffer for re-use
+  memset(stage->buffer, 0, sizeof(qpInfo));
+
+ib_connect:
+  struct ncclIbQpInfo remQpInfo;
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, stage->buffer, sizeof(ncclIbQpInfo), &stage->offset));
+  if (stage->offset != sizeof(remQpInfo)) return ncclSuccess;
+
+  memcpy(&remQpInfo, stage->buffer, sizeof(ncclIbQpInfo));
+
+  for (int q=0; q<comm->nqps; q++) {
+    struct ibv_qp* qp = comm->qps[q];
+    NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
+    NCCLCHECK(ncclIbRtsQp(qp));
+  }
+
+  comm->ready = 1;
+  stage->state = ncclIbCommStateConnected;
+  stage->offset = 0;
+
+ib_send_ready:
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, &comm->ready, sizeof(int), &stage->offset));
+  if (stage->offset != sizeof(int)) return ncclSuccess;

   free(stage->buffer);
-  stage->state = ncclIbCommStateConnected;
+  stage->state = ncclIbCommStateStart;

   *sendComm = comm;
   return ncclSuccess;
 }
@@ -685,8 +715,9 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
   if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
   if (stage->state == ncclIbCommStateRecv) goto ib_recv;
   if (stage->state == ncclIbCommStateSend) goto ib_send;
+  if (stage->state == ncclIbCommStatePendingReady) goto ib_recv_ready;
   if (stage->state != ncclIbCommStateStart) {
-    WARN("Listencomm in unknown state %d\n", stage->state);
+    WARN("Listencomm in unknown state %d", stage->state);
     return ncclInternalError;
   }
@@ -704,10 +735,10 @@ ib_accept_check:
   stage->state = ncclIbCommStateRecv;
   stage->offset = 0;
   NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo)));

 ib_recv:
   NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset));
-  if (stage->offset != sizeof(remQpInfo))
-    return ncclSuccess;
+  if (stage->offset != sizeof(remQpInfo)) return ncclSuccess;

   /* copy back the received info */
   memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo));
@@ -780,10 +811,18 @@ ib_recv:
   if (stage->buffer) free(stage->buffer);
   NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo)));
   memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo));

 ib_send:
   NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset));
   if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess;
+
+  stage->offset = 0;
+  stage->state = ncclIbCommStatePendingReady;
+
+ib_recv_ready:
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, &rComm->ready, sizeof(int), &stage->offset));
+  if (stage->offset != sizeof(int)) return ncclSuccess;

   free(stage->buffer);
   *recvComm = rComm;
@@ -815,36 +854,6 @@ ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
   return ncclSuccess;
 }

-ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
-  struct ncclIbQpInfo remQpInfo;
-
-  // Do not block on this receive, return if not ready.
-  int bytes = 0;
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
-  if (bytes == 0) return ncclSuccess; // Try again later
-  NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
-
-  for (int q=0; q<comm->nqps; q++) {
-    struct ibv_qp* qp = comm->qps[q];
-    NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
-    NCCLCHECK(ncclIbRtsQp(qp));
-  }
-  comm->ready = 1;
-  // Block until this is done. It *should* not block indefinitely.
-  NCCLCHECK(ncclSocketSend(&comm->sock, &comm->ready, sizeof(int)));
-
-  return ncclSuccess;
-}
-
-ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {
-  // Do not block on this receive, return if not ready.
-  int bytes = 0;
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
-  if (bytes == 0) return ncclSuccess; // Try again later
-  NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
-  return ncclSuccess;
-}
-
 ncclResult_t ncclIbTest(void* request, int* done, int* size);

 /* DMA-BUF support */
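With the handshake folded into the ncclIbConnect/ncclIbAccept state machines above, the old on-demand ncclSendCheck/ncclRecvCheck path is no longer needed: by the time ncclIbIsend/ncclIbIrecv run, comm->ready must already be 1, which is why the functions below now treat ready == 0 as an internal error instead of retrying the handshake.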
@ -1020,7 +1029,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
 
 ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
-  if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm));
+  if (comm->ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->ready == 0"); return ncclInternalError; }
   if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
 
   struct ibv_mr* mr = (struct ibv_mr*)mhandle;
@ -1153,7 +1162,7 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int
 
 ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
-  if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm));
+  if (comm->ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->ready == 0"); return ncclInternalError; }
   if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
   if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
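With ncclSendCheck/ncclRecvCheck removed, the QP transitions to RTR/RTS now complete entirely inside the non-blocking connect/accept state machine (note the new ncclIbCommStatePendingReady state above), so the data path can assume a ready connection. A minimal sketch of how a caller drives that state machine under the NCCL net API convention, where connect/accept keep returning a NULL comm while the handshake is pending; illustrative only, error handling trimmed:

    // Poll connect/accept until the non-blocking handshake (including the QP
    // RTR/RTS transitions formerly done lazily in ncclSendCheck) completes.
    void* sComm = NULL;
    while (sComm == NULL) NCCLCHECK(ncclIbConnect(dev, handle, &sComm));  // NULL while pending

    void* rComm = NULL;
    while (rComm == NULL) NCCLCHECK(ncclIbAccept(listenComm, &rComm));    // same convention

    // Only now may ncclIbIsend()/ncclIbIrecv() be called; they warn and
    // return ncclInternalError if comm->ready is still 0.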
src/transport/nvls.cc (Normal file, 373 lines)
@ -0,0 +1,373 @@
/*************************************************************************
 * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

// Implementation of the NVLink SHARP (NVLS) transport

#include "comm.h"
#include "graph.h"
#include "utils.h"
#include "proxy.h"

#if CUDART_VERSION >= 12010

// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange
#define USE_POSIX_FD 1

#if USE_POSIX_FD
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
#else
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE
#endif

ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
  // This transport cannot be used for p2p
  *ret = 0;
  return ncclSuccess;
}

ncclResult_t nvlsSendFree(struct ncclConnector* send) {
  return ncclSuccess;
}

ncclResult_t nvlsRecvFree(struct ncclConnector* recv) {
  return ncclSuccess;
}

struct ncclTransport nvlsTransport = {
  "NVLS",
  nvlsCanConnect,
  { NULL, NULL, nvlsSendFree, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL }
};

#define NVLS_HANDLE_SIZE 64

struct nvlsResources {
  CUmulticastObjectProp properties;
  CUmemAccessDesc accessDesc;
  int dev;
  size_t size;
  size_t granularity;
  CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
  char* mcBuff; // Multicast NVLS buffer address
  CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
  char* ucBuff; // Unicast NVLS buffer address
};

ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct nvlsResources* resources, int dev, int nranks, size_t size) {
  CUmulticastObjectProp* prop = &resources->properties;
  memset(prop, 0, sizeof(*prop));
  prop->size = size;
  prop->numDevices = nranks;
  prop->handleTypes = NVLS_CU_MEM_HANDLE_TYPE;
  prop->flags = 0;

  // Could be changed to CU_MULTICAST_GRANULARITY_MINIMUM when 3418538 resolved
  CUCHECK(cuMulticastGetGranularity(&resources->granularity, prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));

  ALIGN_SIZE(size, resources->granularity);
  prop->size = resources->size = size;

  memset(&resources->accessDesc, 0, sizeof(resources->accessDesc));
  resources->accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  resources->accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  resources->accessDesc.location.id = dev;
  resources->dev = dev;

  return ncclSuccess;
}
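ALIGN_SIZE rounds the requested size up to the next multiple of the granularity reported by the driver, so the multicast object is always granularity-aligned. A sketch of the arithmetic, assuming ALIGN_SIZE is the usual round-up-to-multiple macro:

    // Round-up sketch (assuming ALIGN_SIZE is the usual round-to-multiple macro):
    static inline size_t alignUp(size_t size, size_t gran) {
      return ((size + gran - 1) / gran) * gran;
    }
    // e.g. with a 2 MiB granularity, alignUp(100 MiB + 1 byte, 2 MiB) == 102 MiB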
ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct nvlsResources* resources, int rank, unsigned int nranks, char* shareableHandle) {
  size_t size = resources->size;

  // Create a Multicast group
  CUmulticastObjectProp* prop = &resources->properties;

  INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zi on rank %d", nranks, size, rank);
  CUCHECK(cuMulticastCreate(&resources->mcHandle, prop));

  if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) {
    // Get a handle to pass to other ranks
    CUCHECK(cuMemExportToShareableHandle(shareableHandle, resources->mcHandle, NVLS_CU_MEM_HANDLE_TYPE, 0));
  } else {
    memcpy(shareableHandle, &resources->mcHandle, sizeof(resources->mcHandle));
  }

  INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zi on rank %d", resources->mcHandle, nranks, size, rank);

  return ncclSuccess;
}

ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct nvlsResources* resources) {
  INFO(NCCL_NVLS, "NVLS group %llx adding dev %d", resources->mcHandle, resources->dev);
  CUCHECK(cuMulticastAddDevice(resources->mcHandle, resources->dev));
  return ncclSuccess;
}

ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct nvlsResources* resources) {
  int dev = resources->dev;
  size_t size = resources->size;
  INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev);

  // Unbind physical memory from group for the given device
  CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size));

  return ncclSuccess;
}

ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resources, int rank, char* shareableHandle) {
  CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;

  INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);

  // Import and map the remote memory descriptor to the local GPU
  if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
    // cuMem UDS support
    int fd = *(int *)shareableHandle;
    TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle from rank %d fd %d", comm->localRank, rank, fd);
    struct ncclProxyConnector proxyConn;
    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, rank, &proxyConn));
    TRACE(NCCL_NVLS, "NVLS rank %d request conversion of fd %d from rank %d", comm->localRank, fd, rank);
    NCCLCHECK(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgConvertFd, shareableHandle, sizeof(int), &fd, sizeof(int)));
    TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
    CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)(uintptr_t)fd, type));
  } else {
    if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) {
      CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)shareableHandle, type));
    } else {
      memcpy(&resources->mcHandle, shareableHandle, sizeof(resources->mcHandle));
    }
  }
  return ncclSuccess;
}
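A POSIX file descriptor only has meaning inside the process that owns it, which is why a non-root rank must ask the exporting process (through the proxy's ncclProxyMsgConvertFd call) for a duplicate rather than using the raw integer. Conceptually the conversion rests on Unix-domain-socket ancillary data; a minimal sketch of the underlying mechanism, not NCCL's misc/ipcsocket.cc implementation:

    #include <string.h>
    #include <sys/socket.h>

    // Minimal SCM_RIGHTS sender: the kernel installs a *duplicate* of fd in
    // the receiving process, which is what makes cross-process fd
    // "conversion" possible at all. Conceptual sketch only.
    static int sendFd(int sock, int fd) {
      char dummy = 0;
      char ctrl[CMSG_SPACE(sizeof(int))];
      struct iovec iov = { &dummy, 1 };
      struct msghdr msg;
      memset(&msg, 0, sizeof(msg));
      memset(ctrl, 0, sizeof(ctrl));
      msg.msg_iov = &iov;
      msg.msg_iovlen = 1;
      msg.msg_control = ctrl;
      msg.msg_controllen = sizeof(ctrl);
      struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
      cmsg->cmsg_level = SOL_SOCKET;
      cmsg->cmsg_type = SCM_RIGHTS;
      cmsg->cmsg_len = CMSG_LEN(sizeof(int));
      memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
      return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
    }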
ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct nvlsResources* resources) {
  size_t size = resources->size;
  size_t granularity;
  CUdeviceptr ptr = 0;
  CUmemAllocationProp prop;

  memset(&prop, 0, sizeof(prop));
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = resources->dev;
  prop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE;
  CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));

  // Map a VA for UC memory
  CUCHECK(cuMemAddressReserve(&ptr, size, granularity, 0U, 0));

  // Alloc local physical mem for this NVLS group
  CUCHECK(cuMemCreate(&resources->ucHandle, size, &prop, 0));
  CUCHECK(cuMemMap(ptr, size, 0, resources->ucHandle, 0));
  CUCHECK(cuMemSetAccess(ptr, size, &resources->accessDesc, 1));
  CUDACHECK(cudaMemset((void*)ptr, 0, size));
  resources->ucBuff = (char*)ptr;
  INFO(NCCL_NVLS, "NVLS Mapped UC at %p size %zi", resources->ucBuff, size);

  // Bind physical memory to the Multicast group
  // NB: It will block until all ranks have been added to the Group
  INFO(NCCL_NVLS, "NVLS Bind mem %p UC handle 0x%llx MC handle 0x%llx size %zi", (void*)ptr, resources->ucHandle, resources->mcHandle, size);
  CUCHECK(cuMulticastBindMem(resources->mcHandle, 0/*mcOffset*/, resources->ucHandle, 0/*memOffset*/, size, 0/*flags*/));

  return ncclSuccess;
}

ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct nvlsResources* resources) {
  size_t size = resources->size;
  CUdeviceptr ptr = 0;

  // Create a VA for the NVLS
  CUCHECK(cuMemAddressReserve(&ptr, size, resources->granularity, 0U, 0));
  // Map the VA locally
  CUCHECK(cuMemMap(ptr, size, 0, resources->mcHandle, 0));
  resources->mcBuff = (char*)ptr;
  INFO(NCCL_NVLS, "NVLS Mapped MC buffer at %p size %zi", resources->mcBuff, size);

  // Having completed the BindMem we can now call SetAccess
  // NB: It will block until all ranks have bound to the Group
  CUCHECK(cuMemSetAccess((CUdeviceptr)resources->mcBuff, size, &resources->accessDesc, 1));

  return ncclSuccess;
}
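BindMem and MapMem together give every rank two views of the same physical pages: the unicast view addresses only the local replica, while stores through the multicast view fan out over NVLink to every bound replica. A condensed, single-function sketch of the per-rank call ordering (inter-rank handle exchange, barriers, and error checking elided; a sketch of the CUDA 12.1 multicast API flow, not NCCL code):

    #include <cuda.h>
    #include <string.h>

    static void nvlsLifecycleSketch(CUmulticastObjectProp* prop, int dev, size_t size, size_t gran) {
      CUmemGenericAllocationHandle mc, uc;
      CUdeviceptr ucPtr, mcPtr;

      cuMulticastCreate(&mc, prop);               // rank 0 creates; other ranks import
      cuMulticastAddDevice(mc, dev);              // every rank joins with its GPU

      CUmemAllocationProp aprop;
      memset(&aprop, 0, sizeof(aprop));
      aprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
      aprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
      aprop.location.id = dev;
      cuMemCreate(&uc, size, &aprop, 0);          // local physical backing
      cuMulticastBindMem(mc, 0, uc, 0, size, 0);  // contribute it to the group

      CUmemAccessDesc access;
      memset(&access, 0, sizeof(access));
      access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
      access.location.id = dev;
      access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

      cuMemAddressReserve(&ucPtr, size, gran, 0, 0);  // unicast view: local replica only
      cuMemMap(ucPtr, size, 0, uc, 0);
      cuMemSetAccess(ucPtr, size, &access, 1);

      cuMemAddressReserve(&mcPtr, size, gran, 0, 0);  // multicast view: stores fan out
      cuMemMap(mcPtr, size, 0, mc, 0);                // to every bound replica
      cuMemSetAccess(mcPtr, size, &access, 1);
    }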
ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct nvlsResources* resources) {
  size_t size;
  CUdeviceptr ptr;
  INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)",
       resources->ucHandle, resources->ucBuff, resources->mcHandle, resources->mcBuff);

  // Release the UC memory and mapping
  ptr = (CUdeviceptr)resources->ucBuff;
  size = resources->size;
  CUCHECK(cuMemUnmap(ptr, size));
  CUCHECK(cuMemAddressFree(ptr, size));
  CUCHECK(cuMemRelease(resources->ucHandle));

  // Release the MC memory and mapping
  ptr = (CUdeviceptr)resources->mcBuff;
  size = resources->size;
  CUCHECK(cuMemUnmap(ptr, size));
  CUCHECK(cuMemAddressFree(ptr, size));
  CUCHECK(cuMemRelease(resources->mcHandle));

  return ncclSuccess;
}

#include "bootstrap.h"
#include "channel.h"

#define NVLS_MEM_ALIGN_SIZE (1 << 21)

NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16);

NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 1);

ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
  if (!ncclParamNvlsEnable() || comm->localRanks <= 1 || comm->nNodes > 1) return ncclSuccess;
  CUdevice dev;
  int driverVersion;
  if (CUPFN(cuDeviceGet) == NULL) return ncclSuccess;
  CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
  CUDACHECK(cudaDriverGetVersion(&driverVersion));
  comm->nvlsSupport = 0;
  // NVLS Multicast support requires CUDA 12.1 UMD + KMD
  if (CUPFN(cuMulticastCreate) != NULL && driverVersion >= 12010) {
    CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
  }
  INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev);
  if (comm->nvlsSupport == 0) return ncclSuccess;

  int nChannels = comm->nvlsChannels = std::max(comm->minCTAs, std::min(comm->maxCTAs, (int)ncclParamNvlsChannels()));
  int rank = comm->localRank, nranks = comm->localRanks;

  for (int c=0; c<nChannels; c++) {
    NCCLCHECK(initChannel(comm, c));
  }
  ncclResult_t res = ncclSuccess;
  struct nvlsResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  comm->nvlsResources = resources;

  size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
  size_t memSize = NVLS_MEM_ALIGN_SIZE;
  size_t nvlsPerRankSize = nChannels*2*(buffSize+memSize);
  size_t nvlsTotalSize = nvlsPerRankSize*nranks;

  INFO(NCCL_INIT|NCCL_NVLS, "NVLS comm %p rank %d nranks %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi",
       comm, rank, nranks, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize);
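To make the sizing concrete, a worked example under assumed values (memSize is the 2 MiB NVLS_MEM_ALIGN_SIZE defined above; the 4 MiB SIMPLE buffer, 16 channels, and 8 local ranks are illustrative assumptions, not guaranteed defaults):

    // buffSize = 4 MiB, memSize = 2 MiB, nChannels = 16, nranks = 8
    // nvlsPerRankSize = 16 * 2 * (4 MiB + 2 MiB) = 192 MiB
    // nvlsTotalSize   = 192 MiB * 8              = 1536 MiB (1.5 GiB)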
  char* nvlsShareableHandle = NULL;
  NCCLCHECKGOTO(ncclCalloc(&nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
  NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, nranks, nvlsTotalSize), res, cleanup);
  if (rank == 0) {
    NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, rank, nranks, nvlsShareableHandle), res, cleanup);
    NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
  } else {
    NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
    NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, 0, nvlsShareableHandle), res, cleanup);
  }

  NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
  NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup);
  // Local intra-node barrier to ensure everyone has bound their memory to the group
  NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
  NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup);

  for (int c=0; c<nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    channel->nvls.nHeads = nranks;
    for (int i=0; i<NCCL_MAX_NVLS_ARITY; i++) channel->nvls.up[i] = -1;
    channel->nvls.down = comm->nRanks+1+comm->localRank;
    channel->nvls.out = -1; // Network not yet implemented.
    channel->nvls.headRank = comm->localRank; // Network not yet implemented.
  }

  for (int r=0; r<nranks; r++) {
    int nvlsPeer = comm->nRanks+1+r;
    for (int c=0; c<nChannels; c++) {
      struct ncclChannel* channel = comm->channels+c;
      channel->nvls.up[r] = nvlsPeer;

      char* mem = NULL;
      struct ncclChannelPeer* peer = channel->peers+nvlsPeer;

      // Reduce UC -> MC
      mem = resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize);
      peer->send[0].transportComm = &nvlsTransport.send;
      peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
      peer->send[0].conn.head = (uint64_t*)(mem+buffSize);
      peer->send[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
      mem = resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize);
      peer->recv[1].transportComm = &nvlsTransport.recv;
      peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
      peer->recv[1].conn.head = (uint64_t*)(mem+buffSize);
      peer->recv[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
      peer->recv[1].conn.flags |= NCCL_NVLS_MIN_POLL;

      // Broadcast MC -> UC
      mem = resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
      peer->recv[0].transportComm = &nvlsTransport.recv;
      peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
      peer->recv[0].conn.head = (uint64_t*)(mem+buffSize);
      peer->recv[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
      mem = resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
      peer->send[1].transportComm = &nvlsTransport.send;
      peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
      peer->send[1].conn.head = (uint64_t*)(mem+buffSize);
      peer->send[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
      peer->send[1].conn.flags |= NCCL_NVLS_MIN_POLL;

      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);

      /*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p",
          nvlsPeer, c,
          resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize),
          resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize),
          resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize),
          resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize));*/
    }
  }

  free(nvlsShareableHandle);
  return res;

cleanup:
  comm->nvlsSupport = 0;
  free(nvlsShareableHandle);
  return res;
}

ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
  struct nvlsResources* resources = (struct nvlsResources*)comm->nvlsResources;
  if (resources == NULL) return ncclSuccess;
  NCCLCHECK(nvlsGroupUnbind(comm, resources));
  NCCLCHECK(nvlsGroupUnmapMem(comm, resources));
  free(resources);
  comm->nvlsResources = NULL;
  return ncclSuccess;
}

#else

/*
 * Pre CUDA 12.1 stubs
 */

ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
  return ncclSuccess;
}

ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
  return ncclSuccess;
}

#endif /* CUDA_VERSION >= 12010 */
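The per-peer wiring above carves both mappings into 2*nChannels slots per rank of buffSize+memSize bytes each: slot (r*2+0)*nChannels+c feeds the reduce path (UC send, MC recv) and slot (r*2+1)*nChannels+c the broadcast path, with the head/tail counters living in the memSize tail of each slot. A small helper restating that indexing (illustrative, not part of nvls.cc):

    // dir 0 = reduce (UC send / MC recv), dir 1 = broadcast (MC send / UC recv).
    static inline size_t nvlsSlotOffset(int r, int c, int dir, int nChannels,
                                        size_t buffSize, size_t memSize) {
      return ((size_t)(r*2 + dir)*nChannels + c) * (buffSize + memSize);
    }
    // Within a slot: bytes [0, buffSize) are the data buffer, the head counter
    // sits at offset buffSize, the tail counter at buffSize + memSize/2.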
@ -239,11 +239,11 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
     if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
-      if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+      if (ncclParamP2pDirectDisable() == 0) send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
       INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
           channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
     } else {
-      send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
+      send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
       INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s",
           channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "");
     }
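The rename from conn.direct to conn.flags folds the direct-pointer capability bits into the same bitmask that already carries the IPC bits, so a single word describes how a peer may access the buffers. A sketch of the pattern with illustrative bit values (NCCL's actual NCCL_DIRECT_* and NCCL_IPC_* constants live in its headers and may differ):

    // Illustrative bit values only, not NCCL's real constants.
    enum { EX_DIRECT_WRITE = 1<<0, EX_DIRECT_READ = 1<<1,
           EX_IPC_WRITE = 1<<2,    EX_IPC_READ = 1<<3 };

    static void setP2pFlags(int* flags, int sameProcess, int useRead) {
      if (sameProcess) *flags |= useRead ? EX_DIRECT_READ : EX_DIRECT_WRITE;  // direct pointer
      else             *flags |= useRead ? EX_IPC_READ    : EX_IPC_WRITE;     // IPC mapping
    }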
@ -256,11 +256,11 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
   if (useMemcpy) {
-    NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
+    NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
     info->shmSize = resources->proxyInfo.shmSize;
     memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
   } else {
-    NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
+    NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
     NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
   }
 
@ -290,16 +290,16 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
     if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
-      if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+      if (ncclParamP2pDirectDisable() == 0) recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
     } else {
-      recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
+      recv->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
     }
   } else {
     info->rank = intermediateRank;
   }
 
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
-  NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
+  NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
 
   NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc));
   return ncclSuccess;
@ -330,7 +330,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
     send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
     send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
     // Send SIMPLE buff to proxy, and replace it by local buffer
-    NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
+    NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
     send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff;
   } else {
     send->conn.tail = &remDevMem->tail;
@ -157,7 +157,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
   if (useMemcpySend) {
     NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn));
     struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
-    NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
+    NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
     send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
     send->conn.tail = &proxyInfo.ceRecvMem->tail;
     send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
@ -187,7 +187,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
   if (useMemcpyRecv) {
     NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
     struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
-    NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
+    NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
     recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
     recv->conn.tail = &proxyInfo.ceRecvMem->tail;
   }
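Across the p2p and shm transports, ncclProxyCall becomes ncclProxyCallBlocking, making explicit that these setup/connect exchanges wait for the proxy's response before proceeding (the NVLS fd conversion above uses the same entry point). A hypothetical sketch of the post-then-wait pattern the name implies; postRequest/pollResponse are stand-ins, not NCCL APIs:

    // 0 == success, mirroring ncclSuccess. Stand-in prototypes:
    int postRequest(void* conn, int type, void* req, int reqSize);
    int pollResponse(void* conn, void* resp, int respSize, int* done);

    int proxyCallBlockingSketch(void* conn, int type,
                                void* req, int reqSize, void* resp, int respSize) {
      int res, done = 0;
      if ((res = postRequest(conn, type, req, reqSize)) != 0) return res;  // enqueue to proxy thread
      while (!done) {                                                      // spin until the proxy replies
        if ((res = pollResponse(conn, resp, respSize, &done)) != 0) return res;
      }
      return 0;
    }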