diff --git a/makefiles/version.mk b/makefiles/version.mk index 88656d9..496796a 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 12 -NCCL_PATCH := 12 +NCCL_MINOR := 13 +NCCL_PATCH := 4 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index 82e21a0..d658c35 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,7 +10,8 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \ - misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc \ + misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \ + misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \ collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \ graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 4f7f48c..0ba89a5 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -105,6 +105,7 @@ static void *bootstrapRoot(void* args) { do { struct ncclSocket sock; sock.abortFlag = NULL; + /* bootstrap root thread always uses blocking ncclSocketAccept. */ NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out); NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out); close(sock.fd); @@ -139,6 +140,7 @@ static void *bootstrapRoot(void* args) { int next = (r+1) % nranks; struct ncclSocket sock; sock.abortFlag = NULL; + sock.asyncFlag = 0; memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress)); NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out); NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out); @@ -316,6 +318,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s struct bootstrapState* state = (struct bootstrapState*)commState; struct ncclSocket sock; sock.abortFlag = state->abortFlag; + sock.asyncFlag = 0; memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress)); NCCLCHECK(ncclSocketConnect(&sock)); NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int))); diff --git a/src/channel.cc b/src/channel.cc index 87cec65..4d28a68 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -8,75 +8,54 @@ #include "param.h" #include "gdrwrap.h" -// GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory -NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); - -ncclResult_t initChannel(struct ncclComm* comm, int channelid) { - struct ncclChannel* channel = comm->channels+channelid; +ncclResult_t initChannel(struct ncclComm* comm, int channelId) { + struct ncclChannel* channel = &comm->channels[channelId]; if (channel->id != -1) return ncclSuccess; - channel->id = channelid; - // Ring index to user rank table. 
- NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks)); - NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks)); + int nRanks = comm->nRanks; + channel->id = channelId; + channel->workFifoSent = 0; - // Communication structures with peers. - NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network) - NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1)); - for (size_t i=0; inRanks+1; ++i) { - for (int b=0; bpeers[i].send[b].comm = comm; - channel->peers[i].recv[b].comm = comm; + NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream)); + + // The extra on nRanks+1 is for collnet root (i.e. network) + channel->peers = ncclMemoryStackAlloc(&comm->memPermanent, nRanks+1); + NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nRanks+1, comm->deviceStream.stream)); + ncclCommPushCudaFree(comm, channel->devPeers); + + channel->ring.userRanks = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, comm->deviceStream.stream)); + ncclCommPushCudaFree(comm, channel->devRingUserRanks); + + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNull(), &comm->deviceStream)); + + for (int r=0; r < nRanks+1; ++r) { + for (int b=0; b < NCCL_MAX_CONNS; b++) { + channel->peers[r].send[b].comm = comm; + channel->peers[r].recv[b].comm = comm; } } - // Per-channel operation list. - NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS)); - if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { - // GDRCOPY support - // We allocate a workFifo in GDR mapped CUDA memory - // But we still allocate the Host workFifo so that we - // can copy the work elements to CUDA memory on kernel launch - NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc)); - } else { - // The device workFifo is the Host one - channel->workFifoDev = channel->workFifo; - } - return ncclSuccess; } ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { if (channel->id == -1) return ncclSuccess; - // Operation list - NCCLCHECK(ncclCudaHostFree(channel->workFifo)); - if (channel->gdrMemDesc) { - // GDRCOPY support - NCCLCHECK(ncclGdrCudaFree(channel->gdrMemDesc)); - } - - // Free Ring index to rank tables - free(channel->ring.userRanks); - CUDACHECK(cudaFree(channel->ring.devUserRanks)); // Free transport proxy resources // Note: free all send resources first due to CollNet arrangement for (int r=0; rpeers+r; + struct ncclChannelPeer* peer = channel->peers+r; for (int b=0; bsend[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b)); } } for (int r=0; rpeers+r; + struct ncclChannelPeer* peer = channel->peers+r; for (int b=0; brecv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b)); } } - // Free the peer structures. 
- CUDACHECK(cudaFree(channel->devPeers)); - free(channel->peers); - return ncclSuccess; } diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index c86384c..4e82dd6 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -12,11 +12,11 @@ namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; - const int *ringRanks = ring->devUserRanks; + const int *ringRanks = ring->userRanks; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1)); // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index 41ef255..23f6d0a 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -12,7 +12,7 @@ namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; @@ -97,7 +97,7 @@ namespace { template __device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclTree *tree = &ncclShmem.channel.tree; @@ -169,7 +169,7 @@ namespace { template __device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclTree *tree = &ncclShmem.channel.tree; @@ -290,7 +290,7 @@ struct RunWorkElementheader.nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast; + const int nThreadsReduce = args->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast; const int tidStartBcast = nThreadsGather; const int tidStartScatter = tidStartBcast + nThreadsBcast; const int tidStartReduce = tidStartScatter + nThreadsScatter; diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index ba4ef56..ebe4381 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -12,7 +12,7 @@ namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; @@ -20,8 +20,8 @@ namespace { const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); const ssize_t loopSize = nChannels*chunkSize; const ssize_t size = args->count; - const int rank = ring->devUserRanks[0]; - const int nextRank = ring->devUserRanks[1]; + const int rank = ring->userRanks[0]; + const int nextRank = 
ring->userRanks[1]; const int root = args->root; T *inputBuf = (T*)args->sendbuff; diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index 40a2303..ab333b4 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -19,90 +19,6 @@ #define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree -__device__ inline bool barrierReduceAny(int bit) { - uint32_t popc; - asm ("{" - ".reg .pred barr_pred;" - "setp.eq.u32 barr_pred, %1, 1;" - "bar.red.popc.u32 %0, 2, barr_pred;" - "}" : "=r"(popc) : "r"(bit)); - return popc != 0; -} - -// Copy src to dst and fill extra size with zeroes -template -__device__ void copyToShmem(Tdst *dst, Tsrc const *src, int tid, int nthreads) { - static_assert(sizeof(Tdst)%(2*sizeof(uint64_t)) == 0 && sizeof(Tsrc)%(2*sizeof(uint64_t)) == 0, - "copyToShmem needs sizes which are multiple of 16B"); - static_assert(sizeof(Tdst) >= sizeof(Tsrc), "Tdst size is too small"); - static_assert(sizeof(Tdst) <= WARP_SIZE*2*sizeof(uint64_t), "copyToShmem limited to 512B to make sure it can always be done in one cycle"); - uint64_t *d = reinterpret_cast(dst); - uint64_t const *s = reinterpret_cast(src); - uint64_t *shmemPtr = shmemCvtPtr(d); - int offset = 2*tid; - uint64_t v0, v1; - if (offset >= sizeof(Tsrc)/sizeof(uint64_t)) { - v0 = v1 = 0ULL; - } else { - v0 = s[offset] ; v1 = s[offset+1]; - } - if (offset < sizeof(Tdst)/sizeof(uint64_t)) storeShmem128(shmemPtr+offset, v0, v1); -} - -template -__device__ int copyToShmem(T *dst, T const *src, int turn=0) { - static_assert(sizeof(uint64_t) <= alignof(T), "Uhoh"); - uint64_t *d = reinterpret_cast(dst); - uint64_t const *s = reinterpret_cast(src); - int t = threadIdx.x - turn; - if (t < 0) t += blockDim.x; - int n = sizeof(T)/sizeof(uint64_t); - - int delta = (n + WARP_SIZE-1) & -WARP_SIZE; // round up to warp lane 0 - if (delta < blockDim.x) { - turn += delta; - if (turn >= blockDim.x) turn -= blockDim.x; - } - else - turn = 0; - - n -= t; - d += t; - s += t; - #pragma unroll - for (int i=0; i < divUp(sizeof(T), WARP_SIZE*sizeof(uint64_t)); i++) { - if (n > 0) { - *d = *s; - d += blockDim.x; - s += blockDim.x; - n -= blockDim.x; - } - } - return turn; -} - -template -struct RunWorkElement { - __device__ void run(ncclWorkElem*) { - // Put NOT IMPLEMENTED behavior here. - } -}; - -template -struct RunWork { - // This __forceinline__ is necessary. The compiler was inserting a function call - // here from the LL ncclKernel. - __device__ __forceinline__ void run(ncclWork *w) { - int wid = threadIdx.x / WARP_SIZE; - int inc = w->header.type == ncclWorkTypeRegColl ? 
sizeof(ncclWorkElemReg) / sizeof(ncclWorkElem) : 1; - #pragma unroll 1 - for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e += inc) { - if (wid < w->header.nWarps) - RunWorkElement().run(&w->elems[e]); - } - } -}; - typedef void(*ncclKern_t)(); extern __device__ ncclKern_t ncclFuncs[]; @@ -120,15 +36,62 @@ struct ncclShmemData { struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; }; uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1]; - struct ncclDevComm comm; - struct ncclChannel channel; - uint64_t pad; - struct ncclWork work; + int channelId; + alignas(16) struct ncclDevComm comm; + alignas(16) struct ncclDevChannel channel; + alignas(16) struct ncclWork work; }; static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned"); +extern __shared__ ncclShmemData ncclShmem; + +__device__ inline bool barrierReduceAny(int bit) { + uint32_t popc; + asm ("{" + ".reg .pred barr_pred;" + "setp.eq.u32 barr_pred, %1, 1;" + "bar.red.popc.u32 %0, 2, barr_pred;" + "}" : "=r"(popc) : "r"(bit)); + return popc != 0; +} + +// Copy 16-byte aligned data. You must call with at least `(bytes+15)/16` threads. +inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int bytes) { + int offset = 16*tid; + if (offset < bytes) { + uint64_t a=0, b=0; + asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset)); + asm volatile("st.v2.u64 [%0],{%1,%2};" :: "l"((char*)dst + offset), "l"(a), "l"(b)); + } +} + +template +struct RunWorkElement { + __device__ void run(ncclWorkElem*) { + // Put NOT IMPLEMENTED behavior here. + } +}; + +template +struct RunWork { + // This __forceinline__ is necessary. The compiler was inserting a function call + // here from the LL ncclKernel. + __device__ __forceinline__ void run(ncclWork *w) { + int wid = threadIdx.x / WARP_SIZE; + ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0]; + int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem); + #pragma unroll 1 + while ((char*)we + stride <= (char*)(w+1) && we->isUsed) { + if (wid < we->nWarps) { + RunWorkElement().run(we); + } + we = (ncclWorkElem*)((char*)we + stride); + } + } +}; + static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { - if (we->header.type != ncclWorkTypeUnused && we->redOpArgIsPtr) { + if (we->isUsed && we->redOpArgIsPtr) { /* redOpArg is a pointer to the scalar value, so we'll dereference it * here so that redOpArg holds the bits of the scalar going forward. * The tricky thing is we don't know its type T since that's encoded in @@ -148,48 +111,69 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { } } -extern __shared__ ncclShmemData ncclShmem; - template -__device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) { +__device__ void ncclKernel( + struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead + ) { int tid = threadIdx.x; - int nthreads = blockDim.x; - int bid = blockIdx.x; - int turn = copyToShmem(&ncclShmem.comm, comm); - // get address of channel without incurring indirect load from ncclDevCom::channels - ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid]; - turn = copyToShmem(&ncclShmem.channel, channel, turn); + // To map blockId to channelId, we need the n'th set bit of channelMask which + // is the inverse of counting the number of set bits among the the first n. 
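// [Editorial sketch, not part of the patch] The comment above describes the blockId ->
// channelId mapping: channelId is the position of the n'th set bit of channelMask
// (n == blockIdx.x), i.e. the unique bit position c that is set and satisfies
// __builtin_popcountll(channelMask & ((1ull<<c)-1)) == blockIdx.x. A minimal host-side
// equivalent (the kernel code below computes the same thing cooperatively across one warp):
#include <cstdint>
static inline int nthSetBit(uint64_t mask, int n) {
  for (int bit = 0; bit < 64; bit++) {
    if (mask & (1ull << bit)) {
      if (n == 0) return bit;   // this is the n'th set bit
      n--;
    }
  }
  return -1;                    // fewer than n+1 bits set
}
// Example: nthSetBit(0b101100, 2) == 5, since bits 2, 3 and 5 are set.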
+ if (tid < WARP_SIZE) { + int x = tid; + if (channelMask & (1ull<channels[channelId]; + bytes = sizeof(ncclDevChannel); + static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn."); + break; + case 2: + dst = &ncclShmem.work; + src = workHead + blockIdx.x; + bytes = sizeof(ncclWork); + static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn."); + break; + default: + bytes = 0; + break; + } + copyToShmem16(tid%WARP_SIZE, dst, src, bytes); } __syncthreads(); // publish ncclShmem - ncclWork *workFifoHost = ncclShmem.channel.workFifo; - ncclWork *workFifoDev = ncclShmem.channel.workFifoDev; - int workFifoIx = ncclShmem.channel.index; - - if (bid == 0 && first.header.type != ncclWorkTypeUnused) - goto SkipLoadWork; - while (true) { - copyToShmem(&ncclShmem.work, &workFifoDev[workFifoIx], tid, nthreads); - { // Check whether the last operation was aborted and make sure all threads exit - int aborted = tid == 0 ? *comm->abortFlag : 0; - if (barrierReduceAny(aborted)) // publish ncclShmem.work - break; - if (tid == 0) - workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused; + // Notify host that all fifo reads are complete. + if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) { + *ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks; } - SkipLoadWork: - workFifoIx = (workFifoIx + 1)%NCCL_MAX_OPS; - if (tid == 0) - channel->index = workFifoIx; // write back to real channel, not shmem shadow - __syncwarp(); if (ncclShmem.work.header.type == ncclWorkTypeColl) { if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]); @@ -198,21 +182,34 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) { } __syncthreads(); - if (ncclShmem.work.header.funcIndex == FnIndex) + if (ncclShmem.work.header.funcIndex == FnIndex) { RunWork().run(&ncclShmem.work); - else + } else { ncclFuncs[ncclShmem.work.header.funcIndex](); + } - if (ncclShmem.work.header.isLast) break; + int workIxNext = ncclShmem.work.header.workNext; __syncthreads(); + if (ncclShmem.work.header.isLast) break; + + copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork)); + + { // Check whether the last operation was aborted and make sure all threads exit + int aborted = tid == 0 ? 
*comm->abortFlag : 0; + if (barrierReduceAny(aborted)) // publish ncclShmem.work + break; + } } } // Only generate kernels for SUM #if NCCL_OP == 0 #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \ -__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem first) { \ - ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex>(comm, first); \ +__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)( \ + struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead \ + ) { \ + ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex> \ + (comm, channelMask, workHead); \ } #else #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fInded) diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu index b7dc3e9..f594e34 100644 --- a/src/collectives/device/onerank_reduce.cu +++ b/src/collectives/device/onerank_reduce.cu @@ -16,7 +16,7 @@ namespace { int tid = threadIdx.x; int tn = blockDim.x; #pragma unroll 1 - for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) { + for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) { ncclWorkElem *we = &w->elems[e]; intptr_t eltN = we->count; int bid = we->bid; diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h index afed3df..e8cc8e3 100644 --- a/src/collectives/device/prims_ll.h +++ b/src/collectives/device/prims_ll.h @@ -326,11 +326,11 @@ class Primitives: // If we are going to support oneshot collNet + LL, then we would need to add connector index here int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { - loadRecvConn(&channel->devPeers[recvPeers[nrecv]].recv->conn, nrecv); + loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { - loadSendConn(&channel->devPeers[sendPeers[nsend]].send->conn, nsend); + loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend); nsend++; } this->fan = Fan(nrecv, nsend); diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h index 8090385..93b6b4f 100644 --- a/src/collectives/device/prims_ll128.h +++ b/src/collectives/device/prims_ll128.h @@ -364,11 +364,11 @@ public: auto *channel = &ncclShmem.channel; int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { - loadRecvConn(&channel->devPeers[recvPeers[nrecv]].recv->conn, nrecv); + loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { - loadSendConn(&channel->devPeers[sendPeers[nsend]].send->conn, nsend); + loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend); nsend++; } this->fan = Fan(nrecv, nsend); diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h index fd61dc4..a727849 100644 --- a/src/collectives/device/prims_simple.h +++ b/src/collectives/device/prims_simple.h @@ -303,9 +303,9 @@ class Primitives< } } - __device__ __forceinline__ void loadRecvConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) { + __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) { if (flags & (RoleWaitRecv|RolePostRecv)) { - auto *conn = &peer->recv[connIndex].conn; + auto *conn = &peer->recv[connIndex]; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); if (flags & RolePostRecv) { @@ -343,9 +343,9 @@ class Primitives< } } - 
__device__ __forceinline__ void loadSendConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) { + __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) { if (flags & (RoleWaitSend|RolePostSend)) { - auto *conn = &peer->send[connIndex].conn; + auto *conn = &peer->send[connIndex]; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); if (flags & RolePostSend) { @@ -428,8 +428,8 @@ class Primitives< if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; - loadRecvConn(&ncclShmem.channel.devPeers[peer], connIndex, e); - loadSendConn(&ncclShmem.channel.devPeers[peer], connIndex, e); + loadRecvConn(&ncclShmem.channel.peers[peer], connIndex, e); + loadSendConn(&ncclShmem.channel.peers[peer], connIndex, e); setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e); } diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index 8dc867b..0927037 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -12,7 +12,7 @@ namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; @@ -22,7 +22,7 @@ namespace { const ssize_t loopSize = nChannels*chunkSize; const ssize_t size = args->count; const int rank = ncclShmem.comm.rank; - const int prevRank = ring->devUserRanks[nranks-1]; + const int prevRank = ring->userRanks[nranks-1]; const int root = args->root; Primitives, 0, Proto, 0> diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index 3f38b1a..754889a 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -12,11 +12,11 @@ namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; - int const *ringRanks = ring->devUserRanks; + int const *ringRanks = ring->userRanks; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1)); // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. 
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h index be0dbc5..feae653 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/collectives/device/sendrecv.h @@ -11,21 +11,23 @@ template struct RunWork { __device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { + void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); + size_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); if (args->peer == ncclShmem.comm.rank) { struct ncclWorkElemP2p* recvArgs = args-1; - if (args->buff != recvArgs->buff) { - ReduceOrCopyMulti(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count); + void* recvBuff = reinterpret_cast(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32); + if (buff != recvBuff) { + ReduceOrCopyMulti(tid, nthreads, nullptr, false, 1, (const T**)&buff, 1, (T**)&recvBuff, count); } } else { using Proto = ProtoSimple<1, 1>; - ssize_t const count = args->count; int const chunkSize = args->chunkSize/sizeof(T); int const peer = args->peer; Primitives, 1, Proto, 1> prims - (tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group); - ssize_t offset = 0; + (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group); + size_t offset = 0; do { - int nelem = min(chunkSize, count-offset); + int nelem = min(size_t(chunkSize), count-offset); prims.directSend(offset, offset, nelem); offset += nelem; } while(offset < count); @@ -35,14 +37,15 @@ struct RunWork { __device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { if (args->peer != ncclShmem.comm.rank) { using Proto = ProtoSimple<1, 1>; - ssize_t const count = args->count; + void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); + ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); int const chunkSize = args->chunkSize/sizeof(T); int const peer = args->peer; Primitives, 1, Proto, 1> prims - (tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group); - ssize_t offset = 0; + (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group); + size_t offset = 0; do { - int nelem = min(chunkSize, count-offset); + int nelem = min(size_t(chunkSize), count-offset); prims.directRecv(offset, nelem); offset += nelem; } while(offset < count); @@ -61,11 +64,11 @@ struct RunWork { #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE) int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS; args += group; - if (args->header.type == ncclWorkTypeUnused) return; - tid -= args->warpStart * WARP_SIZE; int nthreads = args->nWarps * WARP_SIZE; group |= 1<<16; // Used to select connIndex 1 + + if (args->p2pType == ncclWorkP2pTypeUnused) return; if (tid >= nthreads || args->peer == -1) return; if ((group%2) == 0) { runRecv(tid, nthreads, group, args); diff --git a/src/debug.cc b/src/debug.cc index 9060abb..1c184d0 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -8,30 +8,24 @@ #include "nccl_net.h" #include #include +#include int ncclDebugLevel = -1; +static int pid = -1; +static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; +char ncclLastError[1024] = ""; // Global string for the last error in human readable form uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT 
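// [Editorial sketch, not part of the patch] Further down in this file the patch changes
// ncclDebugLevel to a publication idiom: ncclDebugInit() fills in every other global
// first and only then stores the level with __ATOMIC_RELEASE, while ncclDebugLog()
// gates on an __ATOMIC_ACQUIRE load before touching those globals. A minimal standalone
// version of that lazy-init pattern (names here are hypothetical, not NCCL's):
#include <pthread.h>
static int gLevel = -1;                  // -1 means "not initialized yet"
static int gDependentState;              // state that must be visible before gLevel
static pthread_mutex_t gLock = PTHREAD_MUTEX_INITIALIZER;

static void lazyInit(void) {
  pthread_mutex_lock(&gLock);
  if (gLevel == -1) {                    // re-check under the lock
    gDependentState = 42;                // initialize dependent state first...
    __atomic_store_n(&gLevel, 3, __ATOMIC_RELEASE);   // ...then publish the level
  }
  pthread_mutex_unlock(&gLock);
}

static int getLevel(void) {
  if (__atomic_load_n(&gLevel, __ATOMIC_ACQUIRE) == -1) lazyInit();
  return gLevel;                         // dependent state is safely visible here too
}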
FILE *ncclDebugFile = stdout; pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; +std::chrono::steady_clock::time_point ncclEpoch; + +static __thread int tid = -1; void ncclDebugInit() { pthread_mutex_lock(&ncclDebugLock); if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } const char* nccl_debug = getenv("NCCL_DEBUG"); - if (nccl_debug == NULL) { - ncclDebugLevel = NCCL_LOG_NONE; - } else if (strcasecmp(nccl_debug, "VERSION") == 0) { - ncclDebugLevel = NCCL_LOG_VERSION; - } else if (strcasecmp(nccl_debug, "WARN") == 0) { - ncclDebugLevel = NCCL_LOG_WARN; - } else if (strcasecmp(nccl_debug, "INFO") == 0) { - ncclDebugLevel = NCCL_LOG_INFO; - } else if (strcasecmp(nccl_debug, "ABORT") == 0) { - ncclDebugLevel = NCCL_LOG_ABORT; - } else if (strcasecmp(nccl_debug, "TRACE") == 0) { - ncclDebugLevel = NCCL_LOG_TRACE; - } /* Parse the NCCL_DEBUG_SUBSYS env var * This can be a comma separated list such as INIT,COLL @@ -64,6 +58,8 @@ void ncclDebugInit() { mask = NCCL_ENV; } else if (strcasecmp(subsys, "ALLOC") == 0) { mask = NCCL_ALLOC; + } else if (strcasecmp(subsys, "CALL") == 0) { + mask = NCCL_CALL; } else if (strcasecmp(subsys, "ALL") == 0) { mask = NCCL_ALL; } @@ -75,6 +71,10 @@ void ncclDebugInit() { free(ncclDebugSubsys); } + // Cache pid and hostname + getHostName(hostname, 1024, '.'); + pid = getpid(); + /* Parse and expand the NCCL_DEBUG_FILE path and * then create the debug file. But don't bother unless the * NCCL_DEBUG level is > VERSION @@ -94,12 +94,10 @@ void ncclDebugInit() { *dfn++ = '%'; break; case 'h': // %h = hostname - char hostname[1024]; - getHostName(hostname, 1024, '.'); dfn += snprintf(dfn, PATH_MAX, "%s", hostname); break; case 'p': // %p = pid - dfn += snprintf(dfn, PATH_MAX, "%d", getpid()); + dfn += snprintf(dfn, PATH_MAX, "%d", pid); break; default: // Echo everything we don't understand *dfn++ = '%'; @@ -110,15 +108,30 @@ void ncclDebugInit() { *dfn = '\0'; if (debugFn[0] != '\0') { FILE *file = fopen(debugFn, "w"); - if (file != NULL) { + if (file != nullptr) { + setbuf(file, nullptr); // disable buffering ncclDebugFile = file; } } } -#ifdef ENABLE_TRACE - ncclEpoch = std::chrono::high_resolution_clock::now(); -#endif + int tempNcclDebugLevel = -1; + if (nccl_debug == NULL) { + tempNcclDebugLevel = NCCL_LOG_NONE; + } else if (strcasecmp(nccl_debug, "VERSION") == 0) { + tempNcclDebugLevel = NCCL_LOG_VERSION; + } else if (strcasecmp(nccl_debug, "WARN") == 0) { + tempNcclDebugLevel = NCCL_LOG_WARN; + } else if (strcasecmp(nccl_debug, "INFO") == 0) { + tempNcclDebugLevel = NCCL_LOG_INFO; + } else if (strcasecmp(nccl_debug, "ABORT") == 0) { + tempNcclDebugLevel = NCCL_LOG_ABORT; + } else if (strcasecmp(nccl_debug, "TRACE") == 0) { + tempNcclDebugLevel = NCCL_LOG_TRACE; + } + + ncclEpoch = std::chrono::steady_clock::now(); + __atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE); pthread_mutex_unlock(&ncclDebugLock); } @@ -127,45 +140,53 @@ void ncclDebugInit() { * they can share the debugging mechanisms and output files */ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) 
{ - if (ncclDebugLevel == -1) ncclDebugInit(); + if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit(); if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; } + + // Save the last error (WARN) as a human readable string + if (level == NCCL_LOG_WARN) { + pthread_mutex_lock(&ncclDebugLock); + va_list vargs; + va_start(vargs, fmt); + (void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs); + va_end(vargs); + pthread_mutex_unlock(&ncclDebugLock); + } if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return; - // Gather the rank information. This can take > 1us so we want to make sure - // we only do it when needed. - char hostname[1024]; - getHostName(hostname, 1024, '.'); + if (tid == -1) { + tid = syscall(SYS_gettid); + } + int cudaDev; - cudaGetDevice(&cudaDev); - int pid = getpid(); - int tid = syscall(SYS_gettid); + if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) { + cudaGetDevice(&cudaDev); + } char buffer[1024]; size_t len = 0; - pthread_mutex_lock(&ncclDebugLock); - if (level == NCCL_LOG_WARN) - len = snprintf(buffer, sizeof(buffer), - "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line); - else if (level == NCCL_LOG_INFO) - len = snprintf(buffer, sizeof(buffer), - "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev); -#ifdef ENABLE_TRACE - else if (level == NCCL_LOG_TRACE) { - auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; + if (level == NCCL_LOG_WARN) { + len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ", + hostname, pid, tid, cudaDev, filefunc, line); + } else if (level == NCCL_LOG_INFO) { + len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev); + } else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) { + len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid); + } else if (level == NCCL_LOG_TRACE) { + auto delta = std::chrono::steady_clock::now() - ncclEpoch; double timestamp = std::chrono::duration_cast>(delta).count()*1000; - len = snprintf(buffer, sizeof(buffer), - "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, pid, tid, cudaDev, timestamp, filefunc, line); + len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", + hostname, pid, tid, cudaDev, timestamp, filefunc, line); } -#endif + if (len) { va_list vargs; va_start(vargs, fmt); - (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); + len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); va_end(vargs); - fprintf(ncclDebugFile,"%s\n", buffer); - fflush(ncclDebugFile); + buffer[len++] = '\n'; + fwrite(buffer, 1, len, ncclDebugFile); } - pthread_mutex_unlock(&ncclDebugLock); } NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); diff --git a/src/enqueue.cc b/src/enqueue.cc index 349cb2b..d3fbbe5 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -10,14 +10,18 @@ #include "gdrwrap.h" #include "bootstrap.h" #include "channel.h" +#include "cudawrap.h" #include // std::memcpy +#include // PRIx64 + +static void* const ncclKernelGeneric = (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t); // Only generate inline kernels for LL #define NCCL_FUNC5(func, algo, devredop, dtype) \ - (void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \ - (void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \ - (void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype) + /*LL */(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \ + /*LL128 */nullptr 
/*(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype)*/, \ + /*SIMPLE*/nullptr /*(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype)*/ #define NCCL_FUNC4(func, devredop, type) \ (void*)NCCL_FUNC5(func, TREE, devredop, type), \ @@ -111,6 +115,8 @@ static void* const ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps* NCCL_FUNCS2A(AllReduce) }; +static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */); + // Determine the maximum kernel stack size of all CUDA kernels size_t ncclKernMaxLocalSize() { ncclResult_t res = ncclSuccess; @@ -118,8 +124,10 @@ size_t ncclKernMaxLocalSize() { cudaFuncAttributes attr = {0}; size_t max = 0; for (int i = 0; i < numNcclKerns; i++) { - CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i]), res, error); - if (attr.localSizeBytes > max) max = attr.localSizeBytes; + if (ncclKerns[i] != nullptr) { + CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i]), res, error); + if (attr.localSizeBytes > max) max = attr.localSizeBytes; + } } error: @@ -143,267 +151,906 @@ error: /* Launch system : synchronization and CUDA kernel launch */ /*****************************************************************************/ -ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) { -#if CUDART_VERSION >= 9000 - if (cgMode & 0x01) { - CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices, - // These flags are to reduce the latency of using this API - cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync)); - return ncclSuccess; +static void appendWorkElemColl( + struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, + int funcIndex, struct ncclWorkElem const *elem, int bid + ) { + struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; + struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); + if (q && funcIndex == q->work.header.funcIndex + && elem->nWarps == q->work.elems[0].nWarps + && chan->nWorkElem < NCCL_MAX_WORK_ELEMENTS) { + int e = chan->nWorkElem++; + q->work.elems[e] = *elem; // C++ struct assignment + q->work.elems[e].bid = bid; + q->work.elems[e].isUsed = 1; + return; } -#endif - int savedDev; - CUDACHECK(cudaGetDevice(&savedDev)); - for (int i = 0; i < numDevices; i++) { - struct cudaLaunchParams* params = paramsList+i; - CUDACHECK(cudaSetDevice(cudaDevs[i])); - CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); + q = ncclMemoryStackAlloc(&comm->memScoped); + q->work.header.type = ncclWorkTypeColl; + q->work.header.funcIndex = funcIndex; + q->work.elems[0] = *elem; // C++ struct assignment + q->work.elems[0].bid = bid; + q->work.elems[0].isUsed = 1; + chan->nWorkElem = 1; + chan->nWork += 1; + ncclIntruQueueEnqueue(&chan->workQueue, q); +} + +static void appendWorkElemColl( + struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, + int funcIndex, struct ncclWorkElemReg const *elem, int bid + ) { + struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; + struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); + if (q && funcIndex == q->work.header.funcIndex + && elem->elem.nWarps == q->work.regElems[0].elem.nWarps + && chan->nWorkElem < NCCL_MAX_WORK_ELEMENTS_REG) { + int e = chan->nWorkElem++; + q->work.regElems[e] = *elem; // C++ struct assignment + q->work.regElems[e].elem.bid = bid; + 
q->work.regElems[e].elem.isUsed = 1; + return; + } + q = ncclMemoryStackAlloc(&comm->memScoped); + q->work.header.type = ncclWorkTypeRegColl; + q->work.header.funcIndex = funcIndex; + q->work.regElems[0] = *elem; // C++ struct assignment + q->work.regElems[0].elem.bid = bid; + q->work.regElems[0].elem.isUsed = 1; + chan->nWorkElem = 1; + chan->nWork += 1; + ncclIntruQueueEnqueue(&chan->workQueue, q); +} + +static void finishWorkP2p(struct ncclWork* work) { + int nElem = 0; + for (int e=0; e < NCCL_MAX_WORK_ELEMENTS_P2P; e++) { + if (work->p2pElems[e].p2pType != ncclWorkP2pTypeUnused) + nElem = e+1; + } + int nGroup = 1; + while (nGroup < nElem) nGroup *= 2; + int nWarp = 1; + while (nWarp*nGroup <= (NCCL_MAX_NTHREADS/WARP_SIZE)/2) nWarp *= 2; + for (int i=0; i < nGroup; i++) { + work->p2pElems[i].ngroups = nGroup; + work->p2pElems[i].warpStart = i*(NCCL_MAX_NTHREADS/WARP_SIZE)/nGroup; + int extraWarp = nWarp >= 2 ? i%2 : 0; + work->p2pElems[i].nWarps = nWarp + extraWarp; + } +} + +static void finishWork(struct ncclWork* work) { + if (work->header.type == ncclWorkTypeP2p) { + finishWorkP2p(work); + } +} + +static void appendWorkElemP2p( + struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, + struct ncclWorkElemP2p const *elem + ) { + constexpr int funcIndex = FUNC_INDEX_P2P; + struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; + struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); + if (q && funcIndex == q->work.header.funcIndex) { + if (chan->p2pTailElem[elem->p2pType-1] < NCCL_MAX_WORK_ELEMENTS_P2P) { + for (int e = -2 + chan->p2pTailElem[elem->p2pType-1]; e >= 0; e -= 2) { + // Can't have multiple elements of the same ncclWork communicate with the + // same peer otherwise they would attempt to use that connection concurrently. + if (q->work.p2pElems[e].peer == elem->peer) + goto NewWork; + } + int e = chan->p2pTailElem[elem->p2pType-1]; + q->work.p2pElems[e] = *elem; // C++ struct assignment + chan->p2pTailElem[elem->p2pType-1] += 2; + return; + } + NewWork: + finishWorkP2p(&q->work); + } + q = ncclMemoryStackAlloc(&comm->memScoped); + q->work.header.type = ncclWorkTypeP2p; + q->work.header.funcIndex = FUNC_INDEX_P2P; + chan->p2pTailElem[ncclWorkP2pTypeRecv-1] = 0; + chan->p2pTailElem[ncclWorkP2pTypeSend-1] = 1; + q->work.p2pElems[chan->p2pTailElem[elem->p2pType-1]] = *elem; // C++ struct assignment + chan->p2pTailElem[elem->p2pType-1] += 2; + chan->nWork += 1; + ncclIntruQueueEnqueue(&chan->workQueue, q); +} + +static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) { + bool needed = true; + NCCLCHECK(ncclProxySaveOp(comm, op, &needed)); + if (needed) { + struct ncclProxyOp* q = ncclMemoryPoolAlloc(&comm->memPool_ncclProxyOp, &comm->memPermanent); + *q = *op; // C++ struct assignment + ncclIntruQueueEnqueue(&plan->channels[op->channelId].proxyOpQueue, q); } - CUDACHECK(cudaSetDevice(savedDev)); return ncclSuccess; } -static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** work, struct ncclWorkElem* base) { - if (channel->workCount == NCCL_MAX_OPS) { - WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS); - return ncclInvalidUsage; +// Put coll workelem & proxyOp in plan assuming nWorkBudget permits, so please +// ensure *nWorkBudget >= nBids upon entry. 
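// [Editorial sketch, not part of the patch] addCollToPlan() below charges the budget by
// the *delta* in the channel's ncclWork count: it adds chans[c].nWork before appending
// and subtracts the (possibly incremented) value afterwards, so merging into an existing
// ncclWork costs 0 and opening a new one costs 1. A toy version of that accounting:
static inline void chargeBudgetByDelta(int* budget, int* nWork, bool opensNewWork) {
  *budget += *nWork;                 // remember the old count
  if (opensNewWork) *nWork += 1;     // append may or may not grow the per-channel list
  *budget -= *nWork;                 // net effect: budget -= (new - old), i.e. 0 or 1
}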
+static ncclResult_t addCollToPlan( + struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex, + struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp, + int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[] + ) { + struct ncclKernelPlan::Channel *chans = plan->channels; + int nCollChannels = comm->nChannels; + + // Choose the `nBid` least loaded channels to do the work. This ensures + // all bids go to different channels in case they need to synchronize. + int least[/*nBid*/MAXCHANNELS]; + least[0] = 0; + int maxIndexInLeast = 0; + size_t maxBytesInLeast = chans[0].collBytes; + // Initialize least[] such that the first nBid channels are accounted for. + for (int b=1; b < nBid; b++) { + least[b] = b; + if (maxBytesInLeast < chans[b].collBytes) { + maxIndexInLeast = b; + maxBytesInLeast = chans[b].collBytes; + } } - int opIndex = channel->workFifoTail%NCCL_MAX_OPS; - struct ncclWork* w = channel->workFifo+opIndex; - volatile uint8_t* typePtr = (volatile uint8_t*)&w->header.type; - while (typePtr[0] != ncclWorkTypeUnused) sched_yield(); - memset(w, 0, sizeof(struct ncclWork)); - // Initialize with work elem if provided - if (base) memcpy(w->elems, base, sizeof(struct ncclWorkElem)); - channel->workFifoTail++; - channel->workCount++; - if (work) *work = w; - return ncclSuccess; -} - -// Finalize channel work FIFO states before launch -// Called during dynamic enqueue -static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph) { - ncclComm_t comm = eqInfo->comm; - // Do not use comm->myParams in this function unless in non-graph mode - // In graph mode, enqueue is async to capture, myParams can have been changed - struct cudaLaunchParams* params = comm->myParams; - - // Only launch blocks where we have work to do. - // This is not supported when we are in cudaGraph mode. - // Because in cudaGraph mode the launch param needs to be determined - // at capture time instead of launch time. - if (!usingCudaGraph) { - int nChannels = std::max(comm->nChannels, comm->p2pnChannels); - for (int c=0; cchannels[c].workCount) params->gridDim.x = c+1; - } - eqInfo->maxChannels = params->gridDim.x; - } - - // Set isLast = 1 for the last operation and add a no-op on empty channels (p2p case). - for (int c=0; cmaxChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - if (channel->workCount == 0) { - struct ncclWork* w; - NCCLCHECK(getNextOp(channel, &w, NULL)); - w->header.funcIndex = FUNC_INDEX_P2P; - w->header.type = ncclWorkTypeP2p; - w->header.nWarps = 0; - } - channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].header.isLast = 1; - - if (c == 0) { - // As we inline the first coll directly, we can free it immediately. 
- // Except P2P or aggregation or registration cases - struct ncclWork* work = channel->workFifo+((channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS); - if (work->header.type == ncclWorkTypeColl && eqInfo->elemList->count() == 1) - work->header.type = ncclWorkTypeUnused; - } - - if (channel->gdrMemDesc) { - // GDRCOPY support - uint64_t first = (channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS; - uint64_t nelems = channel->workCount; - TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld nelems %zi", - channel->workFifo, channel->workFifoGdr, first, nelems); - - for (int i = 0; i < nelems; i++) { - int elem = (first+i) % NCCL_MAX_OPS; - // Copy Host workFifo to CUDA workFifo via the GDRCOPY mapping - NCCLCHECK(ncclGdrCudaCopy(channel->gdrMemDesc, channel->workFifoGdr+elem, channel->workFifo+elem, 1)); + // Sort in the rest of the channels. If a channel has less work than the max + // member of least[], replace that member and compute the new max. The optimal + // algorithm uses a max-heap, but for our small sizes I suspect the better + // asymptotic complexity would be swamped by the increased instruction complexity. + for (int c=nBid; c < nCollChannels; c++) { + if (chans[c].collBytes < maxBytesInLeast) { + least[maxIndexInLeast] = c; + maxBytesInLeast = chans[least[0]].collBytes; + maxIndexInLeast = 0; + for (int b=1; b < nBid; b++) { + if (maxBytesInLeast < chans[least[b]].collBytes) { + maxIndexInLeast = b; + maxBytesInLeast = chans[least[b]].collBytes; + } } } } - return ncclSuccess; -} + uint64_t opCount = uint64_t(plan->collOpCount++)<<1 | 0; + bytes /= nBid; + for (int bid=0; bid < nBid; bid++) { + int c = least[bid]; + chans[c].collBytes += bytes; -ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) { - volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - int val = *ptr; - bool done = false; - while (done == false) { - if (val >= comm->intraRanks) { - WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS); - return ncclInvalidUsage; + // Add work elem + *nWorkBudget += chans[c].nWork; + if (!regBufUsed) { + appendWorkElemColl(comm, plan, c, funcIndex, workElem, bid); + } else { + // Buffer registration in play which could only for CollNet at the moment. + struct ncclChannel* channel = &comm->channels[c]; + struct ncclWorkElemReg workElemReg; + workElemReg.elem = *workElem; // C++ struct assignment + workElemReg.elem.regUsed = 1; + for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { + int peer = channel->collTree.down[i]; + if (peer == -1) break; + int j = comm->rankToLocalRank[peer]; // Get intra-node slot + workElemReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer + workElemReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer + } + for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { + int peer = channel->collTree.up[i]; + if (peer == -1) break; + int j = comm->rankToLocalRank[peer]; + // Output buffer of root peer + workElemReg.upOutputs[i] = regBufRecv[j]; + } + appendWorkElemColl(comm, plan, c, funcIndex, &workElemReg, bid); } - if (val+1 == comm->intraRanks) { - // Reset the barrier. 
- comm->intraBarrier[comm->intraPhase^1] = 0; - *isLast = 1; - return ncclSuccess; - } - done = __sync_bool_compare_and_swap(ptr, val, val+1); - val++; - } - *isLast = 0; - return ncclSuccess; -} + *nWorkBudget -= chans[c].nWork; // subtract delta of chans[c].nWork -ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) { - volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - int val = *ptr; - if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) { - WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS); - return ncclInternalError; + // Add proxy task. Empty collectives do not make it to the proxy thread + // since they don't imply synchronization for the user like p2p. + if (proxyOp->nsteps != 0) { + struct ncclProxyOp tmp = *proxyOp; // C++ struct assignment + tmp.channelId = c; + tmp.opCount = opCount; + NCCLCHECK(addProxyOpIfNeeded(comm, plan, &tmp)); + } } return ncclSuccess; } -ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { - volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - while (*ptr < comm->intraRanks) pthread_yield(); - comm->intraPhase ^= 1; +// Put p2p op in plan assuming there is space in nWorkBudget, so you must +// ensure *nWorkBudget >= 1 upon entry. +static ncclResult_t addP2pToPlan( + struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, + bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes + ) { + struct ncclInfo info = { + isSendNotRecv ? ncclFuncSend : ncclFuncRecv, + isSendNotRecv ? "Send" : "Recv", + nullptr, addr, bytes, ncclInt8, ncclSum, peer, comm, (cudaStream_t)0, + /*Args*/1, 1 + }; + + int channelId; + NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, info.coll, &channelId)); + info.channelId = channelId; + + struct ncclProxyOp proxyOp = {}; + NCCLCHECK(ncclProxyComputeP2p(&info, &proxyOp)); + + struct ncclWorkElemP2p elem = {0}; + elem.peer = peer; + elem.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE; + elem.p2pType = isSendNotRecv ? ncclWorkP2pTypeSend : ncclWorkP2pTypeRecv; + elem.buffLo32 = uint32_t(reinterpret_cast(addr)); + elem.buffHi32 = reinterpret_cast(addr)>>32; + elem.countLo32 = uint32_t(bytes); + elem.countHi32 = bytes>>32; + elem.chunkSize = info.chunkSize; // computed by ncclProxyComputeP2p + + *nWorkBudget += plan->channels[channelId].nWork; + appendWorkElemP2p(comm, plan, channelId, &elem); + *nWorkBudget -= plan->channels[channelId].nWork; + + // Calculate the opCount after appendWorkElemP2p since it will always return + // with channel->nWork equal to one plus the work index this p2p settled in. 
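// [Editorial sketch, not part of the patch] Note the opCount tagging visible in this
// file: collectives set opCount = (plan->collOpCount << 1 | 0) in addCollToPlan(), while
// the p2p path below uses (channel nWork << 1 | 1), so the low bit marks p2p vs.
// collective and the two sequence spaces cannot collide. Hypothetical helpers showing
// the encoding:
#include <cstdint>
static inline uint64_t makeOpCount(uint64_t seq, bool isP2p) { return (seq << 1) | (isP2p ? 1u : 0u); }
static inline bool     opCountIsP2p(uint64_t opCount)        { return (opCount & 1) != 0; }
static inline uint64_t opCountSeq(uint64_t opCount)          { return opCount >> 1; }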
+ proxyOp.opCount = uint64_t(plan->channels[channelId].nWork)<<1 | 1; + NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); return ncclSuccess; } -// Check dependency wrt outside streams or previous launches -// Launch kernel in GROUP mode -ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) { - struct cudaLaunchParams* params = comm->myParams; - if (params->gridDim.x == 0) return ncclSuccess; +static void finishPlan(struct ncclKernelPlan* plan) { + int channelUbound = 0; + int channelCount = 0; + uint64_t channelMask = 0; + bool hasProxyOps = false; + for (int c=0; c < MAXCHANNELS; c++) { + struct ncclWorkList* tail = ncclIntruQueueTail(&plan->channels[c].workQueue); + if (tail != nullptr) { + channelUbound = c+1; + channelCount += 1; + channelMask |= 1ull<work.header.isLast = 1; + finishWork(&tail->work); + } + hasProxyOps |= !ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue); + } + plan->channelUbound = channelUbound; + plan->channelCount = channelCount; + plan->channelMask = channelMask; + plan->hasProxyOps = hasProxyOps; + if (plan->kernelFn == nullptr) + plan->kernelFn = ncclKernelGeneric; + plan->threadPerBlock = std::max(plan->threadPerBlock, 3*WARP_SIZE); +} - // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL - if (comm->launchMode == ncclComm::GROUP && - (comm->groupCudaStream || - comm->userStream == cudaStreamDefault || - comm->userStream == cudaStreamLegacy || - comm->userStream == cudaStreamPerThread)) { - // Enqueue event in user stream - CUDACHECK(cudaEventRecord(comm->intDoneEvent, comm->userStream)); - // Create dependency between user stream and internal NCCL stream - CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->intDoneEvent, 0)); - params->stream = comm->groupStream; +static ncclResult_t registerIntraNodeBuffers( + struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclInfo* info, + bool* outRegBufUsed, + void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], + void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS] + ) { + *outRegBufUsed = false; + ncclResult_t result = ncclSuccess; + +#if CUDART_VERSION >= 11030 + int localRank = comm->localRank; + + if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess; + + struct HandlePair { + cudaIpcMemHandle_t ipc[2]; // {send, recv} + size_t offset[2]; // {send, recv} + }; + struct HandlePair handles[NCCL_MAX_LOCAL_RANKS]; + + CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback); + CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback); + + void *baseSend, *baseRecv; + size_t size; + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff)); + handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend; + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff)); + handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv; + + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair))); + + // Open handles locally + for (int i=0; i < comm->localRanks; i++) { + if (i == localRank) { // Skip self + outRegBufSend[i] = nullptr; + outRegBufRecv[i] = nullptr; + } else { + for (int sr=0; sr < 2; sr++) { + // Get base address of mapping + void* base; + CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess)); + // Get real buffer address by adding offset in the mapping + (sr==0 
? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr]; + // Enqueue reminder to close memory handle + struct ncclPointerList* q = ncclMemoryPoolAlloc(&comm->memPool_ncclPointerList, &comm->memPermanent); + q->ptr = base; + ncclIntruQueueEnqueue(&plan->ipcMemQueue, q); + } + } + } + *outRegBufUsed = true; + +fallback: +#endif + return result; +} + +NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 0); + +static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport); +static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps); + +static ncclResult_t scheduleCollTasksToPlan( + struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget + ) { + struct ncclTasks* tasks = &comm->tasks; + + size_t bytePerChannel[/*collNetSupport*/2]; + if (comm->channelSize > 0) { + // Set by user + bytePerChannel[/*collNetSupport=*/0] = comm->channelSize; + bytePerChannel[/*collNetSupport=*/1] = comm->channelSize; } else { - if (comm->userStream != params->stream && !comm->usingCudaGraph) { - // Stream changed from last call, create dependency against last NCCL kernel launch - CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); - } - params->stream = comm->userStream; + // Latency increases as scale increases + // We would thus want to increase the chunk size to compensate for the lost efficiency + bytePerChannel[/*collNetSupport=*/0] = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks); + bytePerChannel[/*collNetSupport=*/1] = 256<<10; // Hand-tuned } - if (comm->launchMode == ncclComm::GROUP) { - int isLast = 0; - NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); - if (isLast) { - // I'm the last. Launch all operations. - NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode)); - NCCLCHECK(ncclCpuBarrierLast(comm)); + for (int collNetSupport=0; collNetSupport < 2; collNetSupport++) { + while (tasks->collBytesTotal < bytePerChannel[collNetSupport]*comm->nChannels && + bytePerChannel[collNetSupport] > NCCL_MIN_CHANNEL_SIZE) { + // Reduce per-channel size so we utilize all channels. + bytePerChannel[collNetSupport] /= 2; + } + } + + while (tasks->nTasksColl != 0) { + struct ncclTaskColl* head = ncclIntruQueueHead(&tasks->collQueue); + struct ncclInfo aggInfo = {}; + aggInfo.comm = comm; + aggInfo.coll = head->func; + aggInfo.datatype = head->datatype; + aggInfo.opFull = head->op; + aggInfo.op = (ncclRedOp_t)(int)head->op.op; + aggInfo.count = head->count; + int nAggChannels = 0; + int nAggOps = 1; + struct ncclTaskColl* aggEnd = head->next; + int collNetSupport = 0; + NCCLCHECK(getCollNetSupport(&aggInfo, &collNetSupport)); + + // Find a range of ops that can be aggregated together. 
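// [Editorial sketch, not part of the patch] The loop below walks the task queue and keeps
// aggregating while the function, datatype and reduction op all match, sizing each op's
// channel count from its byte count. A standalone version of that per-op channel sizing
// (DIVUP by bytePerChannel, clamped to [1, nChannels]); the example values are made up:
#include <algorithm>
#include <cstddef>
static inline int channelsForOp(size_t bytes, size_t bytePerChannel, int nChannels) {
  int nc = (int)((bytes + bytePerChannel - 1) / bytePerChannel);  // DIVUP
  return std::max(1, std::min(nc, nChannels));
}
// e.g. with bytePerChannel = 512 KiB and nChannels = 16, a 3 MiB collective gets
// channelsForOp(3<<20, 512<<10, 16) == 6 channels.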
+ while (aggEnd != nullptr && + aggEnd->func == aggInfo.coll && + aggEnd->datatype == aggInfo.datatype && + aggEnd->op.op == aggInfo.opFull.op) { + aggInfo.count += aggEnd->count; + int nc = DIVUP(aggEnd->count*ncclTypeSize(aggInfo.datatype), bytePerChannel[collNetSupport]); + nc = std::max(1, std::min(nc, comm->nChannels)); + nAggChannels += nc; + nAggOps++; + aggEnd = aggEnd->next; + } + + if (nAggOps > 1) { + NCCLCHECK(ncclInfoSetDerived(&aggInfo, comm->nRanks)); + aggInfo.nChannels = std::min(comm->nChannels, nAggChannels); + int opPerChannel = DIVUP(nAggChannels, aggInfo.nChannels); + NCCLCHECK(getAlgoInfo(&aggInfo, collNetSupport, opPerChannel)); + } + + while (head != aggEnd) { + struct ncclInfo info = {}; + info.comm = comm; + info.coll = head->func; + info.sendbuff = head->sendbuff; + info.recvbuff = head->recvbuff; + info.count = head->count; + info.root = head->root; + info.datatype = head->datatype; + info.opFull = head->op; // C++ struct assignment + info.op = (ncclRedOp_t)(int)head->op.op; + info.chunkSteps = head->chunkSteps; + info.sliceSteps = head->sliceSteps; + NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks)); + if (nAggOps > 1) { + info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]); + info.nChannels = std::max(1, std::min(info.nChannels, comm->nChannels)); + info.algorithm = aggInfo.algorithm; + info.protocol = aggInfo.protocol; + info.nThreads = aggInfo.nThreads; + } + + int workFuncIndex; + struct ncclWorkElem workElem = {}; + struct ncclProxyOp proxyOp = {}; + NCCLCHECK(computeColl(&info, &workFuncIndex, &workElem, &proxyOp)); + + if (*nWorkBudget < info.nChannels) return ncclSuccess; // Ensure room for addCollToPlan() + + bool regBufUsed = false; + void* regBufSend[NCCL_MAX_LOCAL_RANKS]; + void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; + if (plan->persistent && ncclParamGraphRegister() && + info.algorithm == NCCL_ALGO_COLLNET && // limited to CollNet for now + comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other + comm->intraRanks < comm->localRanks) { // only with inter-process & intra-node peers + NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, ®BufUsed, regBufSend, regBufRecv)); + } + + NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp, + info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv)); + tasks->nTasksColl -= 1; + tasks->collBytesTotal -= info.nBytes; + ncclIntruQueueDequeue(&tasks->collQueue); + head = ncclIntruQueueHead(&tasks->collQueue); + + plan->threadPerBlock = std::max(plan->threadPerBlock, info.nThreads); + if (ncclKerns[workFuncIndex] != nullptr) + plan->kernelFn = ncclKerns[workFuncIndex]; } } return ncclSuccess; } -// Launch kernel in PARALLEL mode -ncclResult_t ncclLaunchKernel(ncclComm_t comm) { - struct cudaLaunchParams *params = comm->myParams; - if (params->gridDim.x == 0) return ncclSuccess; - - // We can't print the CG mode before the first barrier happened. - if (comm->rank == 0 && *comm->intraCGMode & 0x10) { - *comm->intraCGMode ^= 0x10; - INFO(NCCL_INIT,"Launch mode %s%s%s", - comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel", - *comm->intraCGMode ? "/CGMD" : "", - (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? 
"/Stream" : ""); +static size_t calcP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) { + size_t size = std::max(minSize, divUp(totalSize, minChannels)); + int nChannels = minChannels; + while (size > maxSize && nChannels <= maxChannels/2) { + nChannels *= 2; + size = divUp(totalSize, nChannels); } + return alignUp(size, minSize); +} - if (comm->launchMode == ncclComm::GROUP) { - NCCLCHECK(ncclCpuBarrierOut(comm)); +static ncclResult_t scheduleP2pTasksToPlan( + struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget + ) { + struct ncclTasks* tasks = &comm->tasks; + int nRanks = comm->nRanks; + struct ncclTasks::Peer* peers = tasks->peers; + int const *sendOrder = tasks->p2pSendOrder; + int const *recvOrder = tasks->p2pRecvOrder; + + plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS); + + // Compute how much to split operations + // Natural step size matching buffer steps. + ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; + if (comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR; + // Try to use all channels + int nChannelsMax = comm->p2pnChannelsPerPeer; + int nChannelsMin = nChannelsMax; + // Try to use all channels, but one channel per operation. + while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2; + // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth. + while (nChannelsMax*nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2; + + while (tasks->nTasksP2p != 0) { + for (int i=0; i < nRanks; i++) { + int sendPeer = sendOrder[i]; + int recvPeer = recvOrder[i]; + struct ncclTaskP2p* send = ncclIntruQueueHead(&peers[sendPeer].sendQueue); + struct ncclTaskP2p* recv = ncclIntruQueueHead(&peers[recvPeer].recvQueue); + if (sendPeer == comm->rank) { + if (recvPeer != comm->rank) { + WARN("Sendrecv plan not aligned for self"); + return ncclInternalError; + } + if (send && recv == nullptr) { + WARN("Trying to send to self without a matching recv"); + return ncclInvalidUsage; + } + if (send == nullptr && recv) { + WARN("Trying to recv to self without a matching send"); + return ncclInvalidUsage; + } + } + if (send != nullptr || recv != nullptr) { + char* recvPtr = recv ? (char*)recv->buff : nullptr; + char* sendPtr = send ? (char*)send->buff : nullptr; + ssize_t recvBytes = recv ? recv->bytes : 0; + ssize_t sendBytes = send ? send->bytes : 0; + ssize_t minSize = stepSize/8; + ssize_t maxSize = comm->nNodes > 1 ? stepSize : stepSize*32; + ssize_t recvChunkBytesMax = calcP2pChunkSize(recvBytes, nChannelsMin, nChannelsMax, minSize, maxSize); + ssize_t sendChunkBytesMax = calcP2pChunkSize(sendBytes, nChannelsMin, nChannelsMax, minSize, maxSize); + // Zero size send/recv are syncs, encode here with -1. + recvBytes = recv && recvBytes == 0 ? -1 : recvBytes; + sendBytes = send && sendBytes == 0 ? -1 : sendBytes; + // Advance to current chunk. Syncs will always have chunk=0 so no effect on the -1. 
+ if (recv) recvPtr += recv->chunk*recvChunkBytesMax; + if (recv) recvBytes -= recv->chunk*recvChunkBytesMax; + if (send) sendPtr += send->chunk*sendChunkBytesMax; + if (send) sendBytes -= send->chunk*sendChunkBytesMax; + + do { + ssize_t recvChunkBytes = std::min(recvBytes, recvChunkBytesMax); // -1 preserved + ssize_t sendChunkBytes = std::min(sendBytes, sendChunkBytesMax); + if (recvChunkBytes != 0) { + if (recvChunkBytes == -1) recvChunkBytes = 0; + if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget + NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes)); + recvPtr += recvChunkBytes; + recvBytes -= recvChunkBytes; + recv->chunk += 1; + if (recvBytes <= 0) { + recvBytes = 0; // in case still -1 + ncclIntruQueueDequeue(&peers[recvPeer].recvQueue); + tasks->nTasksP2p -= 1; + } + } + if (sendChunkBytes != 0) { + if (sendChunkBytes == -1) sendChunkBytes = 0; + if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget + NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes)); + sendPtr += sendChunkBytes; + sendBytes -= sendChunkBytes; + send->chunk += 1; + if (sendBytes <= 0) { + sendBytes = 0; // in case still -1 + ncclIntruQueueDequeue(&peers[sendPeer].sendQueue); + tasks->nTasksP2p -= 1; + } + } + } while (sendBytes != 0 || recvBytes != 0); + } + } + } + return ncclSuccess; +} + +// Comparison of monotonic rolling counters. +static inline bool rollingLess32(uint32_t a, uint32_t b) { + constexpr uint32_t PositiveMax = uint32_t(-1)>>1; + return a-b > PositiveMax; +} +static inline uint32_t rollingMin32(uint32_t a, uint32_t b) { + constexpr uint32_t PositiveMax = uint32_t(-1)>>1; + return (b-a <= PositiveMax) ? a : b; +} + +// Spin until its safe to increase comm->workFifoSent to desiredSent. +static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredSent) { + if (__builtin_expect(rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent), false)) { + uint64_t t0 = clockNano(); + while (1) { + // We have to poll for notifications from device. + uint32_t* doneLive = comm->workFifoDone; + uint32_t ackd[MAXCHANNELS]; + for (int c=0; c < MAXCHANNELS; c++) { + ackd[c] = __atomic_load_n(&doneLive[c], __ATOMIC_RELAXED); + } + // Compiler-only fence to prevent fusion of loops to encourage dense loads. + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + uint32_t ackdAll = comm->workFifoSent; + for (int c=0; c < MAXCHANNELS; c++) { + // ackdAll is min over all non-quiesced channels + if (ackd[c] != comm->channels[c].workFifoSent) + ackdAll = rollingMin32(ackdAll, ackd[c]); + } + + // Compiler only fence to prevent fusion of loops to encourage dense stores. + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + for (int c=0; c < MAXCHANNELS; c++) { + // Advance counter on quiesced channels so they don't lag behind + // too far where they could get lost in 32-bit wraparound. + if (ackd[c] == comm->channels[c].workFifoSent) { + comm->channels[c].workFifoSent = ackdAll; + __atomic_store_n(&doneLive[c], ackdAll, __ATOMIC_RELAXED); + } + } + comm->workFifoAckdMin = ackdAll; + + // See if that was enough. + if (!rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent)) break; + // Nope. Maintain vigorous spin for first 5us, then start yielding. 
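+      // The brief busy-wait keeps the common case (device drains the fifo quickly)
+      // low latency; yielding afterwards avoids monopolizing a CPU core while waiting.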
+ if (clockNano()-t0 >= 5*1000) sched_yield(); + } + } +} + +static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) { + bool persistent = plan->persistent; + int channelUbound = plan->channelUbound; + int nWork = 0; + for (int c=0; c < channelUbound; c++) nWork += plan->channels[c].nWork; + + struct ncclWork* workHeap; + if (!persistent) { + workHeap = comm->workFifoHeap; } else { - CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); + workHeap = ncclMemoryStackAlloc(&comm->memScoped, nWork); + } + uint32_t ixMask = persistent ? ~uint32_t(0) : comm->workFifoDepth-1; + uint32_t ixSent; + if (persistent) { + ixSent = 0; + } else { + ixSent = comm->workFifoSent; + // First work for a channel has to be at workHeap+blockIdx.x which means + // we cannot tolerate fifo wraparound. So round up to the wrap boundary + // if not doing so would incur crossing it. + if (((ixSent + plan->channelCount-1) & ixMask) < (ixSent & ixMask)) { + ixSent = (ixSent + ixMask) & ~ixMask; + // Need to update workFifoSent so waitWorkFifoAvailable() knows we've + // skipped those elements. Consider if all the channels report quiesced, + // this way the skipped slots will be considered consumed as well. + comm->workFifoSent = ixSent; + } + waitWorkFifoAvailable(comm, ixSent + nWork); + } + uint32_t ixHead = ixSent; + ixSent += plan->channelCount; + int channelsWithWork = 0; // number of channels below `c` with work structs. + for (int c=0; c < channelUbound; c++) { + struct ncclWorkList* q = ncclIntruQueueHead(&plan->channels[c].workQueue); + // Offset of first work equals number of channels below with work. + uint32_t ix = ixHead + channelsWithWork; + channelsWithWork += q != nullptr ? 1 : 0; + while (q != nullptr) { + if (q->next != nullptr) { + q->work.header.workNext = int32_t(ixSent & ixMask) - int32_t(ixHead & ixMask); + } else { + q->work.header.inFifo = !persistent ? 1 : 0; + // Tell channel to ack us back ix+1 indicating that all slots up to and + // including ix have been consumed. + q->work.header.doneAcks = ix+1; + comm->channels[c].workFifoSent = ix+1; + } + workHeap[ix & ixMask] = q->work; // C++ struct assignment + q = q->next; + if (q != nullptr) ix = ixSent++; + } } + if (!persistent) { + comm->workFifoSent = ixSent; + if (comm->workFifoHeapGdrHandle != nullptr) wc_store_fence(); + plan->workHead = &comm->devWorkFifoHeap[ixHead & ixMask]; + } else { + NCCLCHECK(ncclCudaMalloc(&plan->workHead, nWork)); + NCCLCHECK(ncclCudaMemcpy(plan->workHead, workHeap, nWork)); + } return ncclSuccess; } -// Launch network proxy -static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) { - // Start the network proxies as soon as the kernel has been launched. We can't - // perform any CUDA call between the two or having a cudaFree between the CUDA - // launch and the ncclProxyStart call could cause a deadlock. - // Also, starting the proxies after the CUDA launch seems to be better for - // performance (latency). - ncclComm_t comm = eqInfo->comm; - if (eqInfo->maxChannels == 0) return ncclSuccess; - - for (int r=0; rmaxChannels; r++) { - struct ncclChannel* channel = comm->channels+r; - channel->workCount = 0; - channel->totalSize = 0; +static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* plan) { + uint64_t collOpCount = comm->collOpCount; + // Advance comm's collOpCount by number of colls in this plan. 
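+  // collOpCount is the communicator-wide running total; the plan's proxy ops carry
+  // zero-based offsets and are rebased onto this total in the loop below.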
+ comm->collOpCount = collOpCount + plan->collOpCount; + for (int c=0; c < plan->channelUbound; c++) { + struct ncclProxyOp* q = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue); + uint64_t p2pOpCount = comm->channels[c].p2pOpCount; + uint64_t nextP2pOpCount = p2pOpCount; + while (q != nullptr) { + struct ncclProxyOp* qNext = q->enqNext; + // Ignoring the bottom tag bit, opCount's are zero-based within plan so + // translate them to the tip of the comm's history. + if (q->opCount & 1) { // p2p + // p2pOpCount is monotonic increasing within a plan's channel so just + // remember last value to compute max. + nextP2pOpCount = p2pOpCount + (q->opCount>>1); + nextP2pOpCount += 1; // +1 to ensure next plan doesn't collide + q->opCount = (p2pOpCount<<1) + q->opCount; + } else { // coll + q->opCount = (collOpCount<<1) + q->opCount; + } + NCCLCHECK(ncclProxySaveOp(comm, q, nullptr)); // May overwrite enqNext. + if (!plan->persistent) { + // Non-persistent kernels have their memory reclaimed after upload. + ncclMemoryPoolFree(&plan->memPool_ncclProxyOp, q); + } + q = qNext; + } + // Advance channel's p2pOpCount by number of p2p's in this plan channel. + comm->channels[c].p2pOpCount = nextP2pOpCount; } - comm->lastChannel = 0; + return ncclSuccess; +} + +static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) { + NCCLCHECK(uploadProxyOps(comm, plan)); NCCLCHECK(ncclProxyStart(comm)); - return ncclSuccess; -} - -// Record done event for current launch -ncclResult_t ncclRecordEvents(ncclComm_t comm) { - struct cudaLaunchParams *params = comm->myParams; - - // Enqueue event after NCCL kernel (only in non-graph mode) - if (!comm->usingCudaGraph) CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream)); - // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL - if (comm->launchMode == ncclComm::GROUP && - (comm->groupCudaStream || - comm->userStream == cudaStreamDefault || - comm->userStream == cudaStreamLegacy || - comm->userStream == cudaStreamPerThread)) { - CUDACHECK(cudaEventRecord(comm->intDoneEvent, params->stream)); - // Create dependency between NCCL internal stream and user stream - CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->intDoneEvent, 0)); + if (!plan->persistent) { + // Notify main thread of our reclaiming. This will reclaim plan concurrently. 
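+    // callbackQueue is a multi-producer single-consumer queue; the main thread drains
+    // it via ncclCommPollCallbacks() and runs reclaimPlan() on our behalf.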
+ ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer); } return ncclSuccess; } -// Reset parameter space for launch -ncclResult_t ncclLaunchReset(ncclComm_t comm) { - comm->userStreamSet = false; +static void CUDART_CB hostStreamPlanCallback(void *plan_) { + struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_; + ncclResult_t result = hostStreamPlanTask(plan->comm, plan); + if (result != ncclSuccess) { + WARN("hostStreamPlanCallback() failed : %s\n", ncclGetErrorString(result)); + } +} - // We are finishing capture of the current launch - // But we need to keep the current enqueue info for CUDA graph - // Thus we need to creating a new enqueue info for the next run - if (comm->usingCudaGraph) { - NCCLCHECK(ncclCreateQueueInfo(&comm->enqueueInfo, comm)); - } else { - // If not in CUDA graph mode, we reuse the same info space - NCCLCHECK(ncclResetQueueInfo(comm->enqueueInfo)); +static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) { + struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` + if (plan->persistent) { + comm->persistentRefs -= 1; + if (!ncclMainExited) NCCLCHECK(ncclCudaFree(plan->workHead)); + while (!ncclIntruQueueEmpty(&plan->ipcMemQueue)) { + struct ncclPointerList* q = ncclIntruQueueDequeue(&plan->ipcMemQueue); + if (!ncclMainExited) CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr)); + ncclMemoryPoolFree(&comm->memPool_ncclPointerList, q); + } + } + ncclMemoryPoolTakeAll(&comm->memPool_ncclProxyOp, &plan->memPool_ncclProxyOp); + ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); + return ncclSuccess; +} + +static void persistentDestructor(void* plans_) { + struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plans_; + struct ncclComm* comm = plan->comm; + while (plan != nullptr) { + struct ncclKernelPlan* next = plan->next; + ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer); + plan = next; + } +} + +ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { + ncclResult_t result = ncclSuccess; + struct ncclTasks* tasks = &comm->tasks; + bool persistent = ncclCudaGraphValid(tasks->capturingGraph); + int nPlans = 0; + + // Poll for callbacks sent to us from other threads. Typically these free + // resources from to our memory pools. + NCCLCHECK(ncclCommPollCallbacks(comm)); + + // We already have one frame present which holds all of our tasks (which we + // are about to schedule). Now push an additional frame for allocating + // work structs (see appendWorkElem() variants all use scoped allocation). + ncclMemoryStackPush(&comm->memScoped); + + if (tasks->nTasksColl + tasks->nTasksP2p != 0) { + do { + struct ncclKernelPlan* plan = ncclMemoryPoolAlloc(&comm->memPool_ncclKernelPlan, &comm->memPermanent); + ncclIntruQueueEnqueue(&comm->planQueue, plan); + nPlans += 1; + plan->comm = comm; + plan->reclaimer.fn = reclaimPlan; + plan->persistent = persistent; + + // Non-persistent kernels fill up at most half of our fifo per kernel. + int nWorkBudget = plan->persistent ? INT_MAX : comm->workFifoDepth/2; + int nWorkBudgetOld = nWorkBudget; + + // Drain coll tasks first. This is essential since we partition tasks based + // on the work budget and p2p work isn't collective. If we were to drain p2p + // first, the place where we cut the kernel could vary by rank which would + // cause the "shortest channel first" channel picker to have divergent results. 
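+      // Both schedule*TasksToPlan() calls return early (successfully) once the next
+      // operation no longer fits in nWorkBudget; the enclosing do/while then starts a
+      // fresh plan for the remaining tasks.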
+ if (tasks->nTasksColl != 0) { + NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &nWorkBudget), result, failure); + } + // And only drain p2p tasks once colls are depleted. + if (tasks->nTasksColl == 0 && tasks->nTasksP2p != 0) { + NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &nWorkBudget), result, failure); + } + if (nWorkBudget == nWorkBudgetOld) { + // We weren't able to fit any tasks into our budget which means now we're + // stuck in an infinite loop. We defer this check until here, instead of + // doing it in comm init, to permit testing with insanely shallow queues + // for cases where that's expected to still work (e.g. few channels). + WARN("'NCCL_WORK_FIFO_DEPTH=%d' is too small. Minimum value is %d", comm->workFifoDepth, 2*MAXCHANNELS); + result = ncclInvalidUsage; + goto failure; + } + finishPlan(plan); + } while (tasks->nTasksColl + tasks->nTasksP2p != 0); + + struct ncclKernelPlan* planHead = ncclIntruQueueHead(&comm->planQueue); + comm->unlaunchedPlansHead = planHead; + + NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->deviceStream), result, failure); + + // Create dependency for nccl device work on user streams. + for (struct ncclCudaStreamList* l=tasks->streams; l != nullptr; l = l->next) { + NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, l->stream), result, failure); + } + + if (persistent || comm->persistentRefs != 0) { + bool acquired = false; + for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) { + if (plan->hasProxyOps) { + if (!acquired) { + acquired = true; + NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->hostStream), result, failure); + } + NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->hostStream, hostStreamPlanCallback, plan), result, failure); + } + } + if (acquired) { + NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->hostStream), result, failure); + } + } + + if (persistent) { + comm->persistentRefs += nPlans; + NCCLCHECKGOTO(ncclCudaGraphAddDestructor(tasks->capturingGraph, persistentDestructor, (void*)planHead), result, failure); + } } - // After capturing an op in graph mode or launching the op in non-graph mode - // we can reset myParams for use in next op - struct cudaLaunchParams *params = comm->myParams; - params->gridDim.x = params->blockDim.x = 0; - params->func = NULL; - - // Reset launch mode to GROUP if changed - if (comm->launchMode == ncclComm::GROUP_GRAPH) comm->launchMode = ncclComm::GROUP; - comm->usingCudaGraph = 0; + if (false) { + failure: + ncclMemoryStackPop(&comm->memScoped); // deallocate ncclWork's + } + return result; +} +ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { + // This code is called after we've checked in to the intra-process barrier + // but before launching the kernel. We are not allowed to call CUDA unless the + // kernel launch is captured. 
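+  // For non-persistent plans uploadWork() only touches host-visible memory (the work
+  // fifo); only the persistent path, which implies graph capture, allocates and copies
+  // device memory.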
+ NCCLCHECK(uploadWork(comm, plan)); return ncclSuccess; } +ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { + struct ncclTasks* tasks = &comm->tasks; + dim3 grid = {(unsigned)plan->channelCount, 1, 1}; + dim3 block = {(unsigned)plan->threadPerBlock, 1, 1}; + void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead}; + NCCLCHECK(ncclStrongStreamLaunchKernel( + tasks->capturingGraph, &comm->deviceStream, plan->kernelFn, grid, block, args, 0 + )); + return ncclSuccess; +} + +ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { + if (comm->persistentRefs == 0) { // implies !plan->persistent + // If this isn't being captured and there aren't any CUDA graphs alive + // then we don't need to do our proxyOp pushing on the host stream. + NCCLCHECK(hostStreamPlanTask(comm, plan)); + } + return ncclSuccess; +} + +ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { + ncclResult_t result = ncclSuccess; + struct ncclTasks* tasks = &comm->tasks; + tasks->collBytesTotal = 0; // Just in case subtraction during scheduleCollTasksToPlan() doesn't get to 0 + + // Deallocate ncclWork's. This frame exists so long as ncclLaunchPrepare + // succeeded, and if it ncclLaunchPrepare didn't succeed we wouldn't be here. + ncclMemoryStackPop(&comm->memScoped); + + if (!ncclIntruQueueEmpty(&comm->planQueue)) { + // Reset queue to empty without destroying plans since those will be sent + // back to us for reclaiming via callbackQueue. + ncclIntruQueueConstruct(&comm->planQueue); + // Close strong stream "transaction" encompassing cuda launches + NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->deviceStream), result, resume1); + resume1: + // Create dependency for user streams on nccl device work. + struct ncclCudaStreamList* sl = tasks->streams; + tasks->streams = nullptr; // reset streams to empty + while (sl != nullptr) { + NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream), result, resume2); + resume2: + sl = sl->next; + } + } + return result; +} + /*****************************************************************************/ /* Enqueueing system : computation of kernel and proxy operations parameters */ /*****************************************************************************/ @@ -412,7 +1059,7 @@ static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNet if (info->comm->collNetSupport > 0) { // Translate ncclAvg and PreMulSum ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op; - NCCLCHECK(collNetReduceSupport(info->datatype, netOp, collNetTypeSupport)); + NCCLCHECK(collNetReduceSupport(info->comm, info->datatype, netOp, collNetTypeSupport)); } else { *collNetTypeSupport = 0; } @@ -480,6 +1127,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE; if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE; } + nt = nt/WARP_SIZE < 3 ? 
3*WARP_SIZE : nt; info->nChannels = nc; info->nThreads = nt; return ncclSuccess; @@ -524,7 +1172,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { return ncclSuccess; } -static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) { +static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) { int collNetTypeSupport = 0; // Check whether algo and proto have been preset (as in aggregation case) // If so, skip the calculation @@ -537,23 +1185,22 @@ comp_next: NCCLCHECK(getPatternInfo(info)); NCCLCHECK(getLoopInfo(info)); - work->header.type = ncclWorkTypeColl; work->sendbuff = info->sendbuff; work->recvbuff = info->recvbuff; work->root = info->root; work->count = info->count; work->nChannels = info->nChannels; - work->header.nWarps = info->nThreads / WARP_SIZE; + work->nWarps = info->nThreads / WARP_SIZE; work->redOpArg = info->opFull.scalarArg; work->redOpArgIsPtr = info->opFull.scalarArgIsPtr; if (info->comm->nRanks == 1) { // one-rank reduce index - work->header.funcIndex = 1 + int(info->datatype); + *workFuncIndex = 1 + int(info->datatype); return ncclSuccess; } - work->header.funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); + *workFuncIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS; int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; @@ -623,636 +1270,6 @@ comp_next: return ncclSuccess; } -static ncclResult_t checkSetStream(struct ncclInfo* info) { - if (info->comm->userStreamSet == false) { - info->comm->userStream = info->stream; - info->comm->userStreamSet = true; - } else if (info->stream != info->comm->userStream) { - WARN("Error : mixing different streams within a group call is not supported."); - return ncclInvalidUsage; - } - return ncclSuccess; -} - -// Handle structure for user buffer registration (IPC) exchange -struct ncclBuffRegHandle { - cudaIpcMemHandle_t sendBuffIpc; - cudaIpcMemHandle_t recvBuffIpc; - ssize_t sendBuffOffset; - ssize_t recvBuffOffset; -}; - -// Register input and output buffers -// Exchange with ranks on the same host -static ncclResult_t ncclRegBuffAndExchange(struct ncclInfo* info, struct ncclBuffRegInfo* regInfo) { - ncclComm_t comm = info->comm; - if (comm->localRanks == 1) return ncclSuccess; - if (comm->pfnCuMemGetAddressRange == NULL) return ncclSuccess; // CUDA toolkit or driver version too old - - ncclResult_t ret = ncclSuccess; - struct ncclBuffRegHandle regHandles[NCCL_MAX_LOCAL_RANKS]; - // Get IPC handles - // Note: the handle only corresponds to the base address of the allocation - CUDACHECKGOTO(cudaIpcGetMemHandle(®Handles[comm->localRank].sendBuffIpc, (void*)info->sendbuff), ret, reg_fallback); - CUDACHECKGOTO(cudaIpcGetMemHandle(®Handles[comm->localRank].recvBuffIpc, (void*)info->recvbuff), ret, reg_fallback); - // Get offset of user buffer within allocation - void* baseAddr; - size_t size; - // Get base address - CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->sendbuff)); - regHandles[comm->localRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr; - CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->recvbuff)); - regHandles[comm->localRank].recvBuffOffset = 
(char*)info->recvbuff - (char*)baseAddr; - TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->localRank].recvBuffOffset); - - // Exchange handles within node - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle))); - // Open handles at local process - for (int i=0; ilocalRanks; i++) { - // Skip myself - if (i == comm->localRank) { - regInfo->sendbuffsBase[i] = regInfo->recvbuffsBase[i] = NULL; - continue; - } - // Get base address of mapping - CUDACHECK(cudaIpcOpenMemHandle(regInfo->sendbuffsBase+i, regHandles[i].sendBuffIpc, cudaIpcMemLazyEnablePeerAccess)); - CUDACHECK(cudaIpcOpenMemHandle(regInfo->recvbuffsBase+i, regHandles[i].recvBuffIpc, cudaIpcMemLazyEnablePeerAccess)); - // Get real buffer address by adding offset in the mapping - regInfo->sendbuffs[i] = (char*)regInfo->sendbuffsBase[i] + regHandles[i].sendBuffOffset; - regInfo->recvbuffs[i] = (char*)regInfo->recvbuffsBase[i] + regHandles[i].recvBuffOffset; - } - // Marks the operation as being buffer registered - regInfo->nBuffs = comm->localRanks; - TRACE(NCCL_COLL, "Rank %d exchanged %d buffers", comm->rank, regInfo->nBuffs); - return ncclSuccess; - -reg_fallback: - // If we cannot register specific buffer types, we just bypass this stage, and continue without failing - (void)ret; - WARN("Unable to register user buffers"); - return ncclSuccess; -} - -// Compute enqueue element, save it in list -// Compute CUDA launch parameters -// Capture time code in view of CUDA graph -static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) { - ncclComm_t comm = info->comm; - if (comm->nRanks == 1 && - // User-defined reduction ops may need alter the data even for unitary reductions - info->op < ncclNumOps) { - if (info->sendbuff != info->recvbuff) - CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream)); - return ncclSuccess; - } - - // Compute cuda kernel arg and proxy arg templates - struct ncclQueueElem* eqElem; - NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem)); - struct ncclWork* work = &eqElem->work; - NCCLCHECK(computeColl(info, work->elems, &eqElem->proxyOp)); - - // Determine grid size - struct cudaLaunchParams* params = comm->myParams; - params->gridDim.x += info->nChannels; - params->gridDim.x = std::min(params->gridDim.x, comm->nChannels); - params->blockDim.x = std::max(params->blockDim.x, info->nThreads); - comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here - - // Inline the first kernel - if (params->func == NULL) { - params->func = ncclKerns[work->header.funcIndex]; - if (work->header.type == ncclWorkTypeColl) { - // Copy the first operation to the inline argument. Type may be set later to - // ncclWorkTypeUnused if we have more than one coll element. 
- memcpy(&comm->args, work->elems, sizeof(struct ncclWorkElem)); - comm->args.bid = 0; // Only inline for channel 0 - comm->args.header.isLast = 1; // I am so far the last element - } - } - - // Register and exchange input and output buffers - if (comm->usingCudaGraph && // only in CUDA graph mode - comm->graphRegister == 1 && // when registration is enabled - info->algorithm == NCCL_ALGO_COLLNET && // limited to CollNet for now - comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other - comm->intraRanks == 1) { // only in multi-process mode - NCCLCHECK(ncclRegBuffAndExchange(info, &eqElem->buffRegInfo)); - comm->enqueueInfo->nRegBuffs += eqElem->buffRegInfo.nBuffs; - work->header.type = ncclWorkTypeRegColl; - // Disable inline argument because we need kernel to copy the entire ncclWork from workFifo - // because the registered addresses are in ncclWorkElemReg - comm->args.header.type = ncclWorkTypeUnused; - } - - return ncclSuccess; -} - -// Find the channel with the least enqueued work (counted in bytes) -static inline int findShortestChannel(ncclComm_t comm) { - size_t minSize = SIZE_MAX; - int minC = 0; - for (int c=0; cnChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - if (channel->totalSize < minSize) { - minSize = channel->totalSize; - minC = c; - } - } - return minC; -} - -// Get next channel based on shortest-queue mode or round-robin mode -static inline int getNextChannel(ncclComm_t comm, int aggMode) { - int nextChannel = 0; - if (aggMode && comm->asyncAllocMode == ncclComm::SHORTEST_QUEUE) { - nextChannel = findShortestChannel(comm); - } else { - nextChannel = comm->lastChannel % comm->nChannels; - comm->lastChannel++; - } - return nextChannel; -} - -// Setup aggregated kernels -// Op info has been previously saved in comm->asyncOps -ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { - if (comm->asyncOpCount == 0) { - return ncclSuccess; - } else if (comm->asyncOpCount == 1) { - // No aggregation - struct ncclInfo* info = comm->asyncOps; - info->nChannels = 0; - NCCLCHECK(ncclSetupCollKernel(info)); - } else { - // Aggregation - // Determine a per-channel chunk size used to divide an operation into multiple channels - size_t channelSize; - if (comm->channelSize > 0) { - // Set by user - channelSize = comm->channelSize; - } else if (comm->collNetSupport && comm->asyncOps[0].coll == ncclFuncAllReduce) { - // CollNet specific size (tuned based on experiments) - channelSize = 256 * 1024; - } else { - // Latency increases as scale increases - // We would thus want to increase the chunk size to compensate for the lost efficiency - channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks); - } - // Reduce the per-channel size if we cannot fully utilize the channels - while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2; - // Check whether the ops have same reduce and data types (and hence can be packed in same ncclWork) - int channelUsed = 0; - int homogeneous = 1; - int allCollNetSupport = comm->collNetSupport; - for (int c = 0; c < comm->asyncOpCount; c++) { - struct ncclInfo* info = comm->asyncOps+c; - info->nChannels = std::min(std::max(1, (int)DIVUP(info->nBytes, channelSize)), comm->nChannels); // assign number of channels - channelUsed += info->nChannels; - // We can use fast path if all collectives are the same - homogeneous &= info->coll == comm->asyncOps[0].coll && - info->opFull.op == comm->asyncOps[0].opFull.op && - info->datatype == 
comm->asyncOps[0].datatype; - if (allCollNetSupport > 0) NCCLCHECK(getCollNetSupport(info, &allCollNetSupport)); - } - // Compute algo, proto, nthreads for the entire kernel - // Prepare a synthetic op info to calculate the collective algo - struct ncclInfo total; - total.comm = comm; - total.coll = comm->asyncOps[0].coll; - total.nBytes = comm->asyncTotalSize; - total.nChannels = std::min(channelUsed, comm->nChannels); - int perChannelOps = DIVUP(channelUsed, total.nChannels); - if (homogeneous) NCCLCHECK(getAlgoInfo(&total, allCollNetSupport, perChannelOps)); - // Set for each op - for (int c = 0; c < comm->asyncOpCount; c++) { - struct ncclInfo* info = comm->asyncOps+c; - if (homogeneous) { - // Set fields to skip the individual computeColl in ncclSetupCollKernel - info->algorithm = total.algorithm; - info->protocol = total.protocol; - info->nThreads = total.nThreads; - } - NCCLCHECK(ncclSetupCollKernel(info)); - } - comm->args.header.type = ncclWorkTypeUnused; // disable inline argument - } - // Reset counters - comm->asyncOpCount = 0; - comm->asyncTotalSize = 0; - return ncclSuccess; -} - -// Store aggregated operations info -static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) { - ncclComm_t comm = info->comm; - if (comm->asyncOpCount >= NCCL_MAX_OPS) { - WARN("Too many async operations in progress, max is %d", NCCL_MAX_OPS); - return ncclInvalidUsage; - } - memcpy(comm->asyncOps+comm->asyncOpCount, info, sizeof(struct ncclInfo)); - comm->asyncOpCount++; - comm->asyncTotalSize += info->nBytes; - return ncclSuccess; -} - -// Save p2p operations in comm->p2pSends and p2pRecvs. Operations will be posted to channels -// during ncclGroupEnd() -static ncclResult_t ncclSaveP2p(struct ncclInfo* info) { - struct ncclComm* comm = info->comm; - int peer = info->root; - ssize_t nBytes = info->count*ncclTypeSize(info->datatype); - int channelBaseId; - NCCLCHECK(ncclChannelComputeBase(comm, peer, info->coll, &channelBaseId)); - if (info->coll == ncclFuncSend) { - if (peer != comm->rank) { - // Mark channels that need pre-connect - for (int c=0; cp2pnChannelsPerPeer; c++) { - int channelId; - NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId)); - if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector - comm->connectSend[peer] |= (1<connect = 1; - } - } - } - NCCLCHECK(ncclSaveP2pInfo(comm->p2pSends[info->root], info->recvbuff, nBytes)); - comm->p2pSendCount++; - } else { - if (peer != comm->rank) { - // Mark channels that need pre-connect - for (int c=0; cp2pnChannelsPerPeer; c++) { - int channelId; - NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId)); - if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector - comm->connectRecv[peer] |= (1<connect = 1; - } - } - } - NCCLCHECK(ncclSaveP2pInfo(comm->p2pRecvs[info->root], info->recvbuff, nBytes)); - comm->p2pRecvCount++; - } - return ncclSuccess; -} - -static int getSegment(enum ncclWorkElemType type, enum ncclWorkElemSubType subType, int peer, struct ncclWork* work) { - if (work->header.type && (work->header.type != type)) return -1; - - if (type == ncclWorkTypeP2p) { // P2P - int start = subType == ncclWorkSubTypeRecv ? 
0 : 1; - for (int s=start; sp2pElems[s].peer == -1) return s; - // Do not aggregate multiple sends to the same peer (or receives from the same peer) - if (work->p2pElems[s].peer == peer) return -1; - } - } else if (type == ncclWorkTypeRegColl) { // CollNet - for (int s=0; sregElems[s].elem.header.type == ncclWorkTypeUnused) return s; - } - } else if (type == ncclWorkTypeColl) { // Ring or Tree - for (int s=0; selems[s].header.type == ncclWorkTypeUnused) return s; - } - } - return -1; -} - -// Compute kernel arguments for P2P ops -static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElemP2p* elem /* output */) { - elem->header.type = ncclWorkTypeP2p; - elem->header.funcIndex = FUNC_INDEX_P2P; - elem->header.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE; - elem->buff = info->recvbuff; - elem->subType = info->coll == ncclFuncSend ? ncclWorkSubTypeSend : ncclWorkSubTypeRecv; - elem->count = info->count; - elem->chunkSize = info->chunkSize; - elem->peer = info->root; - return ncclSuccess; -} - -// Equeue work elements into segment of ncclWork -// Supporting both collectives (aggregated or not) and P2P -static ncclResult_t enqueueSegOp(enum ncclWorkElemType type, struct ncclWork* elem /* input */, struct ncclWork* work, int s, - struct ncclBuffRegInfo* regInfo, struct ncclChannel* channel, struct ncclComm* comm) { - - if (type == ncclWorkTypeP2p) { - memcpy(work->p2pElems+s, elem, sizeof(struct ncclWorkElemP2p)); - int nelems = 0; - for (int i=0; ip2pElems[i].header.type) nelems = i+1; - } - - int ngroups = 1; - while (ngroups < nelems) ngroups *= 2; - int nWarps = 1; - while (nWarps*ngroups <= elem->header.nWarps/2) nWarps *= 2; - - for (int i=0; ip2pElems[i].ngroups = ngroups; - work->p2pElems[i].warpStart = - i*(NCCL_MAX_NTHREADS/WARP_SIZE)/ngroups; - int extraWarp = nWarps >= 2 ? 
i%2 : 0; - work->p2pElems[i].nWarps = nWarps + extraWarp; - } - return ncclSuccess; - } - - memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem)); - - if (regInfo->nBuffs == 0) return ncclSuccess; - - // Copy registered buffer addresses into ncclWork - struct ncclWorkElemReg* regElem = (struct ncclWorkElemReg*)(work->elems+s); - // For CollNet - for (int i=0; icollTree.down[i]; - if (peer == -1) break; - // Get intra-node slot - int j = comm->rankToLocalRank[peer]; - if (j < 0) { - WARN("Invalid intra-node rank %d for peer %d", j, peer); - return ncclInternalError; - } - // Input buffer of leaf peer - regElem->dnInputs[i] = regInfo->sendbuffs[j]; - // Output buffer of leaf peer - regElem->dnOutputs[i] = regInfo->recvbuffs[j]; - } - for (int i=0; icollTree.up[i]; - if (peer == -1) break; - int j = comm->rankToLocalRank[peer]; - if (j < 0) { - WARN("Invalid intra-node rank %d for peer %d", j, peer); - return ncclInternalError; - } - // Output buffer of root peer - regElem->upOutputs[i] = regInfo->recvbuffs[j]; - } - work->elems[s].regUsed = 1; - return ncclSuccess; -} - -// Enqueue P2P op -ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem) { - struct ncclWorkElemP2p* workElem = eqElem->work.p2pElems; - struct ncclProxyOp* proxyOp = &eqElem->proxyOp; - - // Try to reuse last p2p operation if not full yet - struct ncclChannel* channel = comm->channels+proxyOp->channelId; - int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS; - struct ncclWork* w = channel->workFifo+opIndex; - int segment = -1; - if (channel->workCount) { - // Try to pack more segments into a single operation - segment = getSegment(ncclWorkTypeP2p, workElem->subType, workElem->peer, w); - } - if (segment == -1) { - NCCLCHECK(getNextOp(channel, &w, NULL)); - segment = workElem->subType == ncclWorkSubTypeRecv ? 0 : 1; - // Initialize work as P2P, set peer=-1 to designate the p2p elem is not used. - w->header.type = ncclWorkTypeP2p; - for (int i=0; ip2pElems[i].peer = -1; - } - //printf("%s to %d -> Channel %d OpCount %ld Segment %d\n", workElem->subType == ncclWorkSubTypeRecv ? "Recv" : "Send", proxyOp->root, channel->id, channel->workFifoTail-1, segment); - - // store work element into FIFO - NCCLCHECK(ncclProxySaveP2p(comm, proxyOp)); - NCCLCHECK(enqueueSegOp(ncclWorkTypeP2p, &eqElem->work, w, segment, &eqElem->buffRegInfo, channel, comm)); - return ncclSuccess; -} - -// Setup P2P op -ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) { - ncclComm* comm = info->comm; - // Compute cuda kernel arg and proxy arg templates - struct ncclQueueElem* eqElem; - NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem)); - // The proxy code will set and tune the send/recv chunk size, make sure to run it first. 
- NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyOp)); - NCCLCHECK(computeP2pWorkElem(info, eqElem->work.p2pElems)); - // Compute grid size - int channelId = info->channelId; - struct cudaLaunchParams* params = comm->myParams; - params->gridDim.x = std::max(params->gridDim.x, channelId+1); - params->blockDim.x = std::max(params->blockDim.x, eqElem->work.header.nWarps*WARP_SIZE); - comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here - - // Record the first kernel to launch - // Just for CUDA kernel to know this is a P2P operation - // The CUDA kernel does not use the inlined first work element as fastpath argument - if (params->func == NULL) { - params->func = ncclKerns[eqElem->work.header.funcIndex]; - comm->args.header.type = ncclWorkTypeUnused; - } - return ncclSuccess; -} - -// Dynamic enqueue function for collective kernels -// Supports both aggregated and non-aggregated modes -ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem, int aggMode) { - struct ncclWork* work = &eqElem->work; - struct ncclWorkElem* elem = work->elems; - struct ncclProxyOp* proxyOp = &eqElem->proxyOp; - - int nChannels = elem->nChannels; - size_t channelSize = elem->count*ncclTypeSize(proxyOp->dtype)/elem->nChannels; - enum ncclWorkElemType workElemType = proxyOp->redOp == ncclNumOps ? ncclWorkTypeColl : ncclWorkTypeRegColl; // redOp is only set when using CollNet - - for (int bid=0; bidchannels+channelId; - - // Proxy - proxyOp->channelId = channelId; - proxyOp->opCount = comm->collOpCount; - if (proxyOp->nsteps) NCCLCHECK(ncclProxySaveColl(comm, proxyOp, comm->nRanks)); - - elem->bid = bid % nChannels; - struct ncclWork* w = NULL; - int segment = -1; - if (aggMode && channel->workCount) { - // Try to pack more segments into a single operation - int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS; - w = channel->workFifo+opIndex; - // All elems in work must have same (funcIndex,nThreads), - // see "src/collectives/device/common.h" - if (w->header.funcIndex == work->header.funcIndex && - w->header.nWarps == work->header.nWarps) { - segment = getSegment(workElemType, ncclWorkSubTypeUnused, 0, w); - } - } - if (segment == -1) { - NCCLCHECK(getNextOp(channel, &w, NULL)); - segment = 0; - } - - // store work element into FIFO - NCCLCHECK(enqueueSegOp(workElemType, work, w, segment, &eqElem->buffRegInfo, channel, comm)); - channel->totalSize += channelSize; - } - comm->collOpCount++; - return ncclSuccess; -} - -// Host setup node for CUDA Graph -// Performs the enqueue job -template -void CUDART_CB ncclEnqueueHostSetup(void* arg) { - NVTX3_FUNC_RANGE_IN(nccl_domain); - ncclResult_t ret; - // All work for current launch has been captured in Queue Info - struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)arg; - ncclComm_t comm = eqInfo->comm; - int aggMode = eqInfo->elemList->count() > 1 ? 
1 : 0; - - // Iterate through the element list - struct ncclQueueElem* eqElem = eqInfo->elemList->begin(); - while (eqElem != NULL) { - if (eqElem->work.header.funcIndex == FUNC_INDEX_P2P) { - NCCLCHECKGOTO(ncclEnqueueP2pKernel(comm, eqElem), ret, cb_end); - } else { - NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, eqElem, aggMode), ret, cb_end); - } - eqElem = eqInfo->elemList->getNext(); - } - - NCCLCHECKGOTO(setupLaunch(eqInfo, USING_CUDA_GRAPH), ret, cb_end); - NCCLCHECKGOTO(ncclLaunchProxy(eqInfo), ret, cb_end); - -cb_end: - if (ret != ncclSuccess) { - WARN("Failure in host setup : %s", ncclGetErrorString(ret)); - } - eqInfo->ret = ret; -} - -template void CUDART_CB ncclEnqueueHostSetup<0>(void*); -template void CUDART_CB ncclEnqueueHostSetup<1>(void*); - -// CUDA Graph helper thread -// for de-registering user buffers -void* graphHelperFunc(void *args) { - struct ncclGraphHelperResources* res = (struct ncclGraphHelperResources*)args; - if (res == NULL) { - WARN("CUDA Graph helper resource is null"); - return NULL; - } - int dev = res->comm->cudaDev; - CUDACHECKIGNORE(cudaSetDevice(dev)); - INFO(NCCL_COLL, "CUDA Graph helper thread created for device %d", dev); - - volatile enum helperThreadState* state = &res->threadState; - volatile int* ipcTail = &res->ipcTail; - while (1) { - // Last IPC entry enqueue so far - int ipcTailMark = *ipcTail; - int ipcCount = 0; - // Close IPC till the last entry - while (res->ipcHead != ipcTailMark) { - if (res->ipcBases[res->ipcHead] != NULL) - CUDACHECKIGNORE(cudaIpcCloseMemHandle(res->ipcBases[res->ipcHead])); - res->ipcBases[res->ipcHead] = NULL; - res->ipcHead = (res->ipcHead+1)%NCCL_IPC_POOL_SIZE; - ipcCount++; - } - TRACE(NCCL_COLL, "CUDA Graph helper thread closed %d IPC handles", ipcCount); - pthread_mutex_lock(&res->threadLock); - // Check for exit signal - while (res->ipcHead == *ipcTail && *state != ThreadStop) { - pthread_cond_wait(&res->threadCond, &res->threadLock); - } - pthread_mutex_unlock(&res->threadLock); - if (*state == ThreadStop) { - INFO(NCCL_COLL, "CUDA Graph helper thread for device %d returning", dev); - return NULL; - } - } -} - -// Check if we are in CUDA Graph capture mode -ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) { - comm->usingCudaGraph = 0; - // Feature requires CUDA 11.3/R465 or above -#if CUDART_VERSION >= 11030 - cudaStreamCaptureStatus captureStatus; - unsigned long long cudaGraphId; - ncclResult_t ret = ncclSuccess; - if (comm->driverVersion < 11030) { - // Runtime driver version older than compiler version - // Enhanced compat fallback - goto enh_compat_end; - } - // Get CUDA Graph handle - CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end); - if (captureStatus == cudaStreamCaptureStatusActive) { - if (cudaGraphId != comm->lastCudaGraphId) { - INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId); - // We are in a new graph, hence need to forget the last setup node so that - // the first setup node in the new graph will not have a dependency - comm->lastCudaGraphId = cudaGraphId; - comm->lastSetupNode = NULL; - } - if (comm->launchMode == ncclComm::GROUP) comm->launchMode = ncclComm::GROUP_GRAPH; - comm->usingCudaGraph = 1; - - // Create helper thread that closes IPC handles during graph destruction - // Only create this thread when buffer registration is enabled - if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) { - 
pthread_mutex_init(&comm->graphHelperResources->threadLock, NULL); - // Init signaling method between Graph destroy function and helper thread - pthread_cond_init(&comm->graphHelperResources->threadCond, NULL); - // Set state - comm->graphHelperResources->threadState = ThreadStart; - // Create thread - pthread_create(&comm->graphHelperThread, NULL, graphHelperFunc, comm->graphHelperResources); - // Name thread - ncclSetThreadName(comm->graphHelperThread, "NCCL GrHelper%2d", comm->cudaDev); - } - } - return ncclSuccess; - -enh_compat_end: // Enhanced compat fallback - (void)ret; - CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus)); - if (captureStatus != cudaStreamCaptureStatusNone) { - WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support"); - return ncclInvalidUsage; - } - // If we are not in capture mode, we can ignore the driver being lower -#endif - return ncclSuccess; -} - -// Create host setup node in CUDA Graph -ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) { -#if CUDART_VERSION >= 11030 - struct ncclQueueInfo* eqInfo = comm->enqueueInfo; - // Create a CUDA object to wrap around the argument space - // which CUDA graph would manage lifetime of - cudaUserObject_t object; - CUDACHECK(cudaUserObjectCreate(&object, eqInfo, ncclDestroyQueueInfo, 1/*initialRefcount*/, cudaUserObjectNoDestructorSync)); - // Hand over ownership to CUDA Graph - CUDACHECK(cudaGraphRetainUserObject(graph, object, 1, cudaGraphUserObjectMove)); - - cudaHostFn_t fn = ncclEnqueueHostSetup<1>; - // Add a CPU node to the graph - cudaGraphNode_t setupNode; - // Function + parameter space for that function (i.e. enqueue info) - cudaHostNodeParams setupNodeParams = {fn, eqInfo}; - int numDependencies = comm->lastSetupNode == NULL ? 0 : 1; - CUDACHECK(cudaGraphAddHostNode(&setupNode, graph, &comm->lastSetupNode, numDependencies, &setupNodeParams)); - // Create dependency from last setup node in the same graph - CUDACHECK(cudaStreamUpdateCaptureDependencies(comm->userStream, &setupNode, 1, cudaStreamAddCaptureDependencies)); - comm->lastSetupNode = setupNode; - return ncclSuccess; -#else - WARN("NCCL does not support this CUDA version for CUDA graph feature"); - return ncclInternalError; -#endif -} - static ncclResult_t hostToDevRedOp( ncclDevRedOpFull *opFull, ncclRedOp_t op, ncclDataType_t datatype, ncclComm *comm ) { @@ -1321,70 +1338,131 @@ static ncclResult_t hostToDevRedOp( return ncclSuccess; } -ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { - ncclResult_t ret = ncclSuccess; - bool isAsync = ncclAsyncMode(); - int savedDev = -1; - // Check arguments - NCCLCHECK(PtrCheck(info->comm, info->opName, "comm")); - if (isAsync && info->comm->checkPointers) { - CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end); - CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end); - } - NCCLCHECKGOTO(ArgsCheck(info), ret, end); +// Converts `info` to a task and adds it to `comm->tasks`. The exception is with +// single rank communicators, collectives are issued as `ncclMemcpyAsync`s and +// thus don't need a task. 
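+// Note that nothing is launched from here: the task merely accumulates in comm->tasks
+// and is scheduled into kernel plans later by ncclLaunchPrepare().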
+static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* info) { + ncclTasks *tasks = &comm->tasks; + if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { + int peer = info->root; + ssize_t nBytes = info->count*ncclTypeSize(info->datatype); + bool isSendNotRecv = info->coll == ncclFuncSend; - // Copy reduction op state from op handle into info struct here since the - // op handle may be destroyed before ncclGroupEnd(). - NCCLCHECKGOTO(hostToDevRedOp(&info->opFull, info->op, info->datatype, info->comm), ret, end); + // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. + ncclGroupCommJoin(info->comm); + struct ncclTaskP2p* p2p = ncclMemoryStackAlloc(&comm->memScoped); + p2p->buff = (void*)info->recvbuff; + p2p->bytes = nBytes; + p2p->chunk = 0; + ncclIntruQueueEnqueue( + isSendNotRecv ? &tasks->peers[peer].sendQueue : &tasks->peers[peer].recvQueue, + p2p); + tasks->nTasksP2p += 1; - // Launch asynchronously if needed - if (isAsync) { - // Always register comm even in case of error to make sure ncclGroupEnd - // cleans it up. - NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end); - NCCLCHECKGOTO(checkSetStream(info), ret, end); - - INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", - info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, - info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); - - if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { //p2p stored separately - NCCLCHECKGOTO(ncclSaveP2p(info), ret, end); - } else { - NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end); + // Mark channels that need pre-connect + if (comm->rank != peer) { + int channelBaseId; + NCCLCHECK(ncclChannelComputeBase(comm, peer, info->coll, &channelBaseId)); + if (!(isSendNotRecv ? tasks->peers[peer].sendSeen : tasks->peers[peer].recvSeen)) { + (isSendNotRecv ? tasks->peers[peer].sendSeen : tasks->peers[peer].recvSeen) = true; + for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { + int channelId; + NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId)); + if (isSendNotRecv) { + if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector + comm->connectSend[peer] |= (1<channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector + comm->connectRecv[peer] |= (1<op, info->datatype, comm)); - INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", + // User-defined reduction ops may need alter the data even for unitary reductions + if (comm->nRanks == 1 && opFull.op < ncclDevPreMulSum) { + if (info->sendbuff != info->recvbuff) { + size_t bytes = info->count*ncclTypeSize(info->datatype); + CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, bytes, cudaMemcpyDeviceToDevice, info->stream)); + } + return ncclSuccess; + } else { + // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. 
+ ncclGroupCommJoin(info->comm); + struct ncclTaskColl* t = ncclMemoryStackAlloc(&comm->memScoped); + t->func = info->coll; + t->sendbuff = info->sendbuff; + t->recvbuff = info->recvbuff; + t->count = info->count; + t->root = info->root; + t->datatype = info->datatype; + t->op = opFull; // C++ struct assignment + t->chunkSteps = info->chunkSteps; + t->sliceSteps = info->sliceSteps; + ncclIntruQueueEnqueue(&tasks->collQueue, t); + tasks->collBytesTotal += t->count*ncclTypeSize(t->datatype); + tasks->nTasksColl += 1; + } + } + + if (info->stream != tasks->streamRecent || tasks->streams == nullptr) { + tasks->streamRecent = info->stream; + struct ncclCudaStreamList* l = tasks->streams; + while (true) { + if (l == nullptr) { // Got to the end, this must be a new stream. + struct ncclCudaGraph graph; + NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream)) + if (tasks->streams != nullptr && !ncclCudaGraphSame(tasks->capturingGraph, graph)) { + WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph."); + return ncclInvalidUsage; + } + tasks->capturingGraph = graph; // C++ struct assignment + // Add stream to list + l = ncclMemoryStackAlloc(&comm->memScoped); + l->stream = info->stream; + l->next = tasks->streams; + tasks->streams = l; + break; + } + if (l->stream == info->stream) + break; // Already seen stream. + } + } + return ncclSuccess; +} + +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { + NCCLCHECK(ncclGroupStartInternal()); + ncclResult_t ret = ncclSuccess; + int devOld = -1; + NCCLCHECKGOTO(PtrCheck(info->comm, info->opName, "comm"), ret, end0); + if (info->comm->checkPointers) { + CUDACHECKGOTO(cudaGetDevice(&devOld), ret, end0); + CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end0); + } + NCCLCHECKGOTO(ArgsCheck(info), ret, end1); + + INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); + TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zi,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast(info->sendbuff), reinterpret_cast(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream); - // Check whether we are in cuda graph mode - cudaGraph_t graph; - ncclComm_t comm = info->comm; - NCCLCHECKGOTO(ncclGetCudaGraph(comm, &graph), ret, end); + NCCLCHECKGOTO(taskAppend(info->comm, info), ret, end1); - // Common part between graph mode and non-graph mode - NCCLCHECKGOTO(ncclSetupCollKernel(info), ret, end); - - // Host setup - if (comm->usingCudaGraph) { - NCCLCHECKGOTO(ncclCudaGraphHostSetup(comm, graph), ret, end); - } else { - ncclEnqueueHostSetup<0>(comm->enqueueInfo); - NCCLCHECKGOTO(comm->enqueueInfo->ret, ret, end); - } - - // Common part between graph mode and non-graph mode - NCCLCHECKGOTO(ncclLaunchBarrier(comm), ret, end); - NCCLCHECKGOTO(ncclLaunchKernel(comm), ret, end); - NCCLCHECKGOTO(ncclRecordEvents(comm), ret, end); - NCCLCHECKGOTO(ncclLaunchReset(comm), ret, end); - } -end: - if (isAsync && savedDev != -1) CUDACHECK(cudaSetDevice(savedDev)); - if (isAsync) ncclAsyncErrCheck(ret); +end1: + if (devOld != -1) CUDACHECKGOTO(cudaSetDevice(devOld), ret, end0); +end0: + ncclGroupErrCheck(ret); + NCCLCHECK(ncclGroupEndInternal()); return ret; } @@ -1419,6 +1497,7 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, 
ncclDataTyp } *op = ncclRedOp_t(int(ncclNumOps) + ix); *op = ncclUserRedOpMangle(comm, *op); + TRACE_CALL("ncclRedOpCreatePreMulSum(%d,%p,%d,%d,%p)", *op, scalar, datatype, residence, comm); return ncclSuccess; } @@ -1440,5 +1519,6 @@ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) { // push to free list comm->userRedOps[ix].freeNext = comm->userRedOpFreeHead; comm->userRedOpFreeHead = ix; + TRACE_CALL("ncclRedOpDestroy(%d,%p)", op, comm); return ncclSuccess; } diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 222be70..ab8f8c3 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -428,10 +428,10 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0); // Net v4 plugins don't have non-blocking connect/accept. We can't therefore use // remote proxies without risking deadlocks -int ncclPxnDisable() { +int ncclPxnDisable(struct ncclComm* comm) { static int pxnDisable = -1; if (pxnDisable == -1) { - if (ncclNetVersion() == 4) { + if (comm && ncclNetVersion(comm) == 4) { INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); pxnDisable = 1; } else { @@ -470,7 +470,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, return ncclSuccess; } -ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) { +ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) { // Precompute paths between GPUs/NICs. // Remove everything in case we're re-computing @@ -498,16 +498,16 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer } } - if (peerInfos == NULL) continue; + if (comm == NULL) continue; // Remove GPUs we can't talk to because of containers. - struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank; + struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank; for (int p=0; pnodes[GPU].count; p++) { if (p == g) continue; - struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank; + struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank; int shm; - NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo)); + NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo)); int p2p; - NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo)); + NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo)); if (shm == 0 && p2p == 0) { // Mark this peer as inaccessible. We'll trim it later. system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; @@ -523,7 +523,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer for (int g=0; gnodes[GPU].count; g++) { // Check whether we can access the NIC through another NVLink-connected GPU (PXN) struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) { + if (ncclPxnDisable(comm) != 1 && gpu->paths[NET][n].type > PATH_PXB) { int pxnGpu = -1; for (int p=0; pnodes[GPU].count; p++) { @@ -670,7 +670,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { // We want to spread channels used when there aren't many and progressively // fill the whole space of nChannels. To do so we mirror the bits in the // nChannels space. 
- for (int c=0; cp2pnChannelsPerPeer; c++) { + for (int c=0; cp2pnChannels; c++) { int mirror = 0; for (int b=1, mb=(comm->p2pnChannels>>1); bp2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb; comm->p2pChannels[c] = mirror; diff --git a/src/graph/search.cc b/src/graph/search.cc index d70b6a7..0f79258 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -958,10 +958,14 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev)); *proxyRank = rank; - int pxnLevel = ncclPxnDisable() == 1 ? 0 : ncclParamP2pPxnLevel(); + int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel(); // See whether we can use the remote rank preferred device. if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) { - int netDev = comm->peerInfo[peerRank].netDev; + // Find local NIC number close to local cudaDev + int cudaDev = comm->peerInfo[peerRank].cudaDev; + int localRank; + if (ncclTopoDevToRank(comm->topo, cudaDev, &localRank) != ncclSuccess) return ncclSuccess; + int netDev = comm->peerInfo[localRank].netDev; int n; // Check that device exists on our node if (ncclParamCrossNic() == 0) { diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 53e12e5..2730bf9 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -626,11 +626,11 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. int netDevCount = 0; - if (collNetSupport()) { - NCCLCHECK(collNetDevices(&netDevCount)); + if (collNetSupport(comm)) { + NCCLCHECK(collNetDevices(comm, &netDevCount)); for (int n=0; ndmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); + NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1)); } } if (netDevCount == 0) { - NCCLCHECK(ncclNetDevices(&netDevCount)); + NCCLCHECK(ncclNetDevices(comm, &netDevCount)); } for (int n=0; ndmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? 
"Enabled" : "Disabled", n, props.name); + NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); } // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) diff --git a/src/graph/topo.h b/src/graph/topo.h index 71c1fca..b24a72b 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -181,6 +181,17 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, return ncclInternalError; } +static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, int* rank) { + *rank = -1; + for (int i=0; inodes[GPU].count; i++) { + if (system->nodes[GPU].nodes[i].gpu.dev == dev) { + *rank = system->nodes[GPU].nodes[i].gpu.rank; + return ncclSuccess; + } + } + return ncclInternalError; +} + // Returns NVLink speed in GB/s static float ncclTopoNVLinkSpeed(int cudaCompCap) { return diff --git a/src/group.cc b/src/group.cc index 5f65a58..d9bc684 100644 --- a/src/group.cc +++ b/src/group.cc @@ -10,399 +10,259 @@ #include "transport.h" #include "channel.h" -#define MAX_ASYNC_OPS 128 -thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS]; -thread_local int ncclGroupIndex = 0; -thread_local int ncclGroupMode = 0; -thread_local ncclResult_t ncclGroupError = ncclSuccess; +__thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting +__thread ncclResult_t ncclGroupError = ncclSuccess; +__thread struct ncclComm* ncclGroupCommHead = nullptr; +__thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr; +__thread struct ncclIntruQueue ncclAsyncJobs; -bool ncclAsyncMode() { - return ncclGroupMode > 0; -} - -ncclResult_t ncclAsyncErrCheck(ncclResult_t ret) { - if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret; - return ret; -} - -struct ncclInitArgs { - ncclInitFunc_t func; - int cudaDev; - ncclComm_t* newcomm; - int ndev; - ncclUniqueId commId; - int myrank; -}; -struct ncclCollArgs { - ncclComm_t comm; -}; - -enum ncclAsyncFuncType { - ASYNC_FUNC_INVALID = 0, - ASYNC_FUNC_INIT = 1, - ASYNC_FUNC_COLL = 2, -}; -struct ncclAsyncArgs { - ncclResult_t ret; - enum ncclAsyncFuncType funcType; - union { - ncclCollArgs coll; - ncclInitArgs init; - }; -}; - -thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS]; - -void* ncclAsyncThreadMain(void* args_) { - struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_; - NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev)); - return args; -} - -ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev) { - if (ncclGroupIndex >= MAX_ASYNC_OPS) { - WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS); - return ncclAsyncErrCheck(ncclInvalidUsage); +ncclResult_t ncclAsyncLaunch( + struct ncclAsyncJob* job, + ncclResult_t(*func)(struct ncclAsyncJob*), + void(*undo)(struct ncclAsyncJob*), + void(*destructor)(void*) + ) { + if (0 == ncclGroupDepth) { + ncclResult_t res = func(job); + if (res != ncclSuccess && undo) undo(job); + if (destructor) destructor(job); + return res; + } else { + job->func = func; + job->undo = undo; + job->destructor = destructor; + ncclIntruQueueEnqueue(&ncclAsyncJobs, job); + return ncclSuccess; } - int index = ncclGroupIndex++; - struct ncclAsyncArgs* args = ncclGroupArgs+index; - args->funcType = ASYNC_FUNC_INIT; - args->init.func = func; - args->init.cudaDev = cudaDev; - args->init.newcomm = newcomm; - args->init.ndev = ndev; - memcpy(&args->init.commId, &commId, 
sizeof(commId)); - args->init.myrank = myrank; - return ncclSuccess; } -ncclResult_t ncclAsyncColl(ncclComm_t comm) { - struct ncclAsyncArgs* args = ncclGroupArgs; - for (int i=0; icoll.comm == comm) return ncclSuccess; - args++; +void* ncclAsyncJobMain(void* arg) { + struct ncclAsyncJob* job = (struct ncclAsyncJob*)arg; + job->result = job->func(job); + if (job->result != ncclSuccess) { + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, job->result); } - if (ncclGroupIndex >= MAX_ASYNC_OPS) { - WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS); - return ncclAsyncErrCheck(ncclInvalidUsage); - } - ncclGroupIndex++; - args->funcType = ASYNC_FUNC_COLL; - args->coll.comm = comm; - return ncclSuccess; + return arg; } NCCL_API(ncclResult_t, ncclGroupStart); ncclResult_t ncclGroupStart() { NVTX3_FUNC_RANGE_IN(nccl_domain); - if (ncclGroupMode == 0) { - memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS); - } - ncclGroupMode++; + NCCLCHECK(ncclGroupStartInternal()); + TRACE_CALL("ncclGroupStart()"); return ncclSuccess; } -static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff) { - struct ncclInfo info = { ncclFuncSend, "Send", - NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */ - 1, 1 }; - int channelId; - NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncSend, &channelId)); - info.channelId = channelId; - NCCLCHECK(ncclSetupP2pKernel(&info)); - return ncclSuccess; -} -static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff) { - struct ncclInfo info = { ncclFuncRecv, "Recv", - NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */ - 1, 1 }; - int channelId; - NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncRecv, &channelId)); - info.channelId = channelId; - NCCLCHECK(ncclSetupP2pKernel(&info)); - return ncclSuccess; -} - -void* ncclAsyncThreadPreconnect(void* args_) { - struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_; - struct ncclComm* comm = args->coll.comm; - CUDACHECKTHREAD(cudaSetDevice(comm->cudaDev)); - if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, 1)); - return args; -} - -static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) { - size_t size = std::max(minSize, DIVUP(totalSize, minChannels)); - int nChannels = minChannels; - while (size > maxSize && nChannels <= maxChannels/2) { - nChannels *= 2; - size = DIVUP(totalSize, nChannels); - } - ALIGN_SIZE(size, minSize); - return size; -} - NCCL_API(ncclResult_t, ncclGroupEnd); ncclResult_t ncclGroupEnd() { NVTX3_FUNC_RANGE_IN(nccl_domain); - if (ncclGroupMode == 0) { + NCCLCHECK(ncclGroupEndInternal()); + TRACE_CALL("ncclGroupEnd()"); + return ncclSuccess; +} + +struct ncclPreconnectJob { + struct ncclAsyncJob base; + struct ncclComm* comm; +}; +ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) { + struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; + struct ncclComm* comm = job->comm; + CUDACHECK(cudaSetDevice(comm->cudaDev)); + if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1)); + return ncclSuccess; +} + +static ncclResult_t doLaunches(struct ncclComm* head) { + ncclResult_t result = 
ncclSuccess; + struct ncclComm* cliqueComm0 = head->intraComm0; + struct ncclComm* cliqueHead = head; + struct ncclComm* cliqueNextHead; + bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup; + // This outer loop iterates over cliques of comms which are siblings of the + // same global entity. We calculate a clique as all comms which have the same + // `intraComm0` value. + do { + struct ncclComm* comm = cliqueHead; + bool capturingYes = false, capturingNo = false; + do { + (ncclCudaGraphValid(comm->tasks.capturingGraph) ? capturingYes : capturingNo) = true; + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); + NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure); + if (useBarrier) ncclCommIntraBarrierIn(comm, 1); + comm = comm->groupNext; + } while (comm != nullptr && comm->intraComm0 == cliqueComm0); + cliqueNextHead = comm; + + if (capturingYes && capturingNo) { + // We have entered barriers but are aborting without leaving them. Thus + // these comms are permanently trashed. We need a good mechanism for + // tracking and reporting that. + WARN("Either none or all communicators in a ncclGroup() can be CUDA graph captured."); + result = ncclInvalidUsage; + goto failure; + } + + while (true) { // Iterate rounds of launches for clique. + bool moreRounds; + comm = cliqueHead; + do { // Iterate clique members. + struct ncclComm* next = comm->groupNext; + if (useBarrier) { + // Barrier reduction result tells us if this was the final round. + moreRounds = 0 != ncclCommIntraBarrierOut(comm); + } else { + moreRounds = comm->unlaunchedPlansHead != nullptr; + } + if (moreRounds) { + // Pop next unlaunched kernel + struct ncclKernelPlan* plan = comm->unlaunchedPlansHead; + if (plan != nullptr) { + comm->unlaunchedPlansHead = plan->next; + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); + NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure); + NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure); + } + // Barrier reduction input indicates if we require further rounds. + if (useBarrier) ncclCommIntraBarrierIn(comm, comm->unlaunchedPlansHead != nullptr ? 1 : 0); + if (plan != nullptr) { + NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure); + } + } else { // Final round. 
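// Editor's note (sketch, not NCCL code): the loop above launches kernels in
// rounds; in group launch mode each comm feeds "do I still have unlaunched
// plans?" into the intra-process barrier, and the summed result decides
// whether the whole clique runs another round. A minimal standalone model of
// that "barrier that also sums a flag" protocol, using a plain mutex/condvar
// barrier instead of NCCL's lock-free one (plan counts are made up):
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

struct SumBarrier {
  int n;
  std::mutex m;
  std::condition_variable cv;
  int arrived = 0, phase = 0, acc = 0, result = 0;
  explicit SumBarrier(int n) : n(n) {}
  // Arrive with contribution x; once all n participants arrive, everyone
  // gets back the sum of contributions (sense-reversing barrier).
  int arriveAndSum(int x) {
    std::unique_lock<std::mutex> lk(m);
    acc += x;
    int myPhase = phase;
    if (++arrived == n) {
      result = acc; acc = 0; arrived = 0; phase ^= 1;
      cv.notify_all();
    } else {
      cv.wait(lk, [&]{ return phase != myPhase; });
    }
    return result;
  }
};

int main() {
  const int n = 4;
  SumBarrier bar(n);
  std::vector<std::thread> ts;
  for (int r = 0; r < n; r++) ts.emplace_back([&, r] {
    int plans = r;                        // pretend comm r has r unlaunched plans
    bool more = true;
    while (more) {
      if (plans) plans--;                 // "launch" one plan this round
      more = bar.arriveAndSum(plans ? 1 : 0) != 0;  // anyone still busy?
    }
  });
  for (auto& t : ts) t.join();
  std::printf("all comms finished their launch rounds together\n");
  return 0;
}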
+ CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); + NCCLCHECKGOTO(ncclLaunchFinish(comm), result, failure); + } + comm = next; + } while (comm != cliqueNextHead); + if (!moreRounds) break; + } + cliqueHead = cliqueNextHead; + } while (cliqueHead != nullptr); +failure: + return result; +} + +ncclResult_t ncclGroupEndInternal() { + if (ncclGroupDepth == 0) { WARN("ncclGroupEnd: not in a group call."); return ncclInvalidUsage; } - ncclGroupMode--; - if (ncclGroupMode > 0) return ncclSuccess; + ncclGroupDepth--; + if (ncclGroupDepth > 0) return ncclSuccess; + int savedDev; CUDACHECK(cudaGetDevice(&savedDev)); - int activeThreads = 0; - int doneArray[MAX_ASYNC_OPS]; - for (int i=0; ifuncType == ASYNC_FUNC_INIT) { - pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args); - activeThreads++; - doneArray[i] = 0; - } - } - /* For init, since we use threads, we just wait for threads to complete */ - while (activeThreads) { - for (int i=0; ifuncType == ASYNC_FUNC_INIT && doneArray[i] == 0) { - int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL); - if (err == EBUSY) continue; - if (err != 0) ret = ncclSystemError; - if (args->ret != ncclSuccess) ret = args->ret; - doneArray[i] = 1; - activeThreads--; - } - } + if (ncclGroupCommPreconnectHead != nullptr) { + struct ncclComm* comm = ncclGroupCommPreconnectHead; + do { + struct ncclPreconnectJob* job; + NCCLCHECK(ncclCalloc(&job, 1)); + job->base.func = ncclPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->comm = comm; + ncclIntruQueueEnqueue(&ncclAsyncJobs, &job->base); + + struct ncclComm* next = comm->preconnectNext; + comm->preconnectNext = reinterpret_cast(0x1); + comm = next; + } while (comm != nullptr); } - for (int i=0; ifuncType == ASYNC_FUNC_COLL && args->coll.comm->connect) { - pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args); - } - } + if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { + struct ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs); + do { + pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job); + job = job->next; + } while (job != nullptr); - for (int i=0; ifuncType == ASYNC_FUNC_COLL && args->coll.comm->connect) { - int err = pthread_join(ncclGroupThreads[i], NULL); + job = ncclIntruQueueHead(&ncclAsyncJobs); + do { + int err = pthread_join(job->thread, nullptr); if (err != 0) { WARN("Error waiting for pthread_join : %s", strerror(errno)); - return ncclSystemError; + ret = ncclSystemError; } - NCCLCHECKGOTO(args->ret, ret, end); - args->coll.comm->connect = 0; - } + if (ret == ncclSuccess && job->result != ncclSuccess) ret = job->result; + job = job->next; + } while (job != nullptr); + + jobsDone = true; + if (ret != ncclSuccess) goto failure; } - for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - struct ncclComm* comm = args->coll.comm; - int node = comm->node; - int nNodes = comm->nNodes; - int localRank = comm->localRank; + if (ncclGroupCommHead != nullptr) { + NCCLCHECKGOTO(doLaunches(ncclGroupCommHead), ret, failure); + do { + struct ncclComm* comm = ncclGroupCommHead; + struct ncclComm* next = comm->groupNext; + ncclGroupCommLeave(comm); + ncclGroupCommHead = next; + } while (ncclGroupCommHead != nullptr); + } - // Compute how much to split operations - // Natural step size matching buffer steps. - ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; - // Try to use all channels - int nChannelsMax = comm->p2pnChannelsPerPeer; - int nChannelsMin = nChannelsMax; - // Try to use all channels, but one channel per operation. 
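// Editor's note (sketch, not part of the patch): the removed scheduling path
// around here sizes p2p chunks with getP2pChunkSize(): start from one chunk
// per "min" channel, double the channel count until chunks fit under the max
// slice, then keep chunks step-aligned. A standalone version with DIVUP /
// ALIGN_SIZE spelled out; the step size, channel counts and slice factor
// below are made up for the demo:
#include <algorithm>
#include <cstddef>
#include <cstdio>

static size_t divUp(size_t x, size_t y) { return (x + y - 1) / y; }
static size_t alignUpTo(size_t x, size_t a) { return divUp(x, a) * a; }

static size_t p2pChunkSize(size_t totalSize, int minChannels, int maxChannels,
                           size_t minSize, size_t maxSize) {
  size_t size = std::max(minSize, divUp(totalSize, minChannels));
  int nChannels = minChannels;
  while (size > maxSize && nChannels <= maxChannels/2) {
    nChannels *= 2;
    size = divUp(totalSize, nChannels);
  }
  return alignUpTo(size, minSize);       // keep chunks a multiple of the step
}

int main() {
  size_t stepSize = 1 << 17;             // hypothetical buffer step
  printf("chunk = %zu bytes\n",
         p2pChunkSize(16u << 20, /*min*/2, /*max*/8, stepSize, 4*stepSize));
  return 0;
}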
- while (nChannelsMin*comm->nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2; - // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth. - while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2; - - while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) { - // schedule delta 0, +1, -1, +2, -2, ... - // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even. - for (int d=0; d<=nNodes/4; d++) { - int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes }; - int index = 0; - int delta = deltas[index]; -sched_delta: - uint32_t recvNode = (node+nNodes-delta)%nNodes; - uint32_t sendNode = (node+delta)%nNodes; - int steps = comm->maxLocalRanks; - for (int s=0; snodeRanks[recvNode].localRanks ? comm->nodeRanks[recvNode].localRankToRank[recvIndex] : -1; - int sendIndex = (localRank+s)%steps; - int sendPeer = sendIndexnodeRanks[sendNode].localRanks ? comm->nodeRanks[sendNode].localRankToRank[sendIndex] : -1; - struct ncclP2Pinfo* recv = recvPeer != -1 && comm->p2pRecvs[recvPeer] ? comm->p2pRecvs[recvPeer]->getNext() : NULL; - struct ncclP2Pinfo* send = sendPeer != -1 && comm->p2pSends[sendPeer] ? comm->p2pSends[sendPeer]->getNext() : NULL; - if (recv != NULL || send != NULL) { - ssize_t totRecvBytes = -1, totSendBytes = -1; - if (recv != NULL) totRecvBytes = recv->nbytes; - if (send != NULL) totSendBytes = send->nbytes; - if (recv) comm->p2pRecvCount--; - if (send) comm->p2pSendCount--; - if (recvPeer == comm->rank) { // Check self send/recv - if (sendPeer != comm->rank) { WARN("Sendrecv schedule not aligned for self"); ret = ncclInternalError; goto group_cleanup; } - if (send && recv == NULL) { WARN("Trying to send to self without a matching recv"); ret = ncclInvalidUsage; goto group_cleanup; } - if (send == NULL && recv) { WARN("Trying to recv to self without a matching send"); ret = ncclInvalidUsage; goto group_cleanup; } - } - void* recvBuff = recv ? recv->buff : NULL; - void* sendBuff = send ? send->buff : NULL; - // After we recycle p2pSend/Recv, we're no longer allowed to dereference send or recv, only use them as boolean NULL/not NULL. - if (recv && comm->p2pRecvs[recvPeer]->peakNext() == NULL) comm->p2pRecvs[recvPeer]->recycle(); - if (send && comm->p2pSends[sendPeer]->peakNext() == NULL) comm->p2pSends[sendPeer]->recycle(); - - ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize); - ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize); - - ssize_t sendOffset = 0; - ssize_t recvOffset = 0; - int sendRemaining = 1, recvRemaining = 1; - int chunk = 0; - do { - // Shuffle channels with s intra-node, and delta inter-node. Inter-node, make sure - // to use multiple channels to guarantee progress on all ranks from the same node. - ssize_t recvbytes = totRecvBytes-recvOffset; - ssize_t sendbytes = totSendBytes-sendOffset; - if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; } - if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; } - // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested - // (total size == 0), otherwise set size to -1. 
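// Editor's note (sketch, not part of the patch): the comment above describes
// visiting node deltas in the order 0, +1, -1, +2, -2, ... while never
// scheduling 0 twice, nor both +n/2 and -n/2 when n is even. A standalone
// program that reproduces that visit order with the same de-duplication
// checks (it only prints the deltas; no scheduling is done):
#include <cstdio>

int main() {
  const int nNodes = 8;                  // try an odd value (e.g. 7) as well
  for (int d = 0; d <= nNodes/4; d++) {
    int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
    int index = 0;
    while (index < 4) {
      printf("delta %d\n", deltas[index]);
      index++;
      if (index == 1 && deltas[1] == deltas[0]) index++;  // skip duplicate of 0
      if (index == 2 && deltas[2] == deltas[0]) index++;
      if (index == 3 && deltas[3] == deltas[2]) index++;  // skip -n/2 == +n/2
      if (index == 3 && deltas[3] == deltas[1]) index++;
    }
  }
  return 0;                              // for nNodes=8 prints 0 4 1 7 3 5 2 6
}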
- if (sendbytes < 0 || (sendbytes == 0 && totSendBytes != 0)) send = NULL; - if (recvbytes < 0 || (recvbytes == 0 && totRecvBytes != 0)) recv = NULL; - if (recv) { - NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, chunk, recvbytes, ((char*)recvBuff)+recvOffset), ret, group_cleanup); - } - if (send) { - NCCLCHECKGOTO(scheduleSend(comm, sendPeer, chunk, sendbytes, ((char*)sendBuff)+sendOffset), ret, group_cleanup); - } - recvOffset += recvChunkSize; - sendOffset += sendChunkSize; - chunk++; - } while (sendRemaining || recvRemaining); + if (false) { + failure: + struct ncclComm* comm = ncclGroupCommHead; + while (comm != nullptr) { + struct ncclComm* next = comm->groupNext; + ncclGroupCommLeave(comm); // overwrites comm->groupNext + // We don't know if preconnect succeeded or happened at all, so clear + // the flags that let `taskAppend()` skip over checking if preconnect + // is needed. + comm->preconnectNext = reinterpret_cast(0x1); + for (int i=0; i < comm->nRanks; i++) { + comm->tasks.peers[i].sendSeen = false; + comm->tasks.peers[i].recvSeen = false; + comm->connectSend[i] = 0; + comm->connectRecv[i] = 0; + } + comm->unlaunchedPlansHead = nullptr; + // Reclaim abandoned kernel plan memory. Note ncclWork structs were already + // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. + while (!ncclIntruQueueEmpty(&comm->planQueue)) { + struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue); + // Persistent plans will be reclaimed via the callbackQueue when the + // graph drops its UserObject reference. + if (!plan->persistent) { + for (int c=0; c < MAXCHANNELS; c++) { + while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) { + struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue); + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); } } - index++; - if (index == 1 && deltas[1] == deltas[0]) index++; - if (index == 2 && deltas[2] == deltas[0]) index++; - if (index == 3 && deltas[3] == deltas[2]) index++; - if (index == 3 && deltas[3] == deltas[1]) index++; - if (index < 4) { - delta = deltas[index]; - goto sched_delta; - } + ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); } } + // Reset comm->tasks to empty. + comm->tasks.nTasksColl = 0; + comm->tasks.nTasksP2p = 0; + comm->tasks.streams = nullptr; + ncclIntruQueueConstruct(&comm->tasks.collQueue); + comm->tasks.collBytesTotal = 0; + for (int i=0; i < comm->nRanks; i++) { + ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue); + ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue); + } + comm = next; } } - /* Collectives are done in three steps : - * 0. Save kernels previously enqueued. Compute channel, algo, proto, etc. - * 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative] - * 2. Barrier Wait. No CUDA call is permitted - * 3. Enqueue Events. CUDA event wait/enqueue. - * This is needed because step 2 cannot call any CUDA primitive, otherwise if - * cudaFree happens between 1 and 3, it could block that CUDA call and - * prevent some ranks from launching their network threads, which would - * prevent the NCCL call from completing, blocking the cudaFree call. 
- */ - - // Check whether we are in cuda graph mode - NCCLCHECK(ncclCalloc(&graphs, ncclGroupIndex)); - for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - ncclComm_t comm = args->coll.comm; - NCCLCHECKGOTO(ncclGetCudaGraph(comm, graphs+i), ret, group_cleanup); - if (usingCudaGraphAll == -1) { - usingCudaGraphAll = comm->usingCudaGraph; - } else if (usingCudaGraphAll != comm->usingCudaGraph) { - WARN("Illegal to have some communicators in graph mode while others not"); - ret = ncclInvalidUsage; - goto group_cleanup; - } - } - } - for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - ncclComm_t comm = args->coll.comm; - NCCLCHECKGOTO(ncclSetupAsyncKernels(comm), ret, group_cleanup); - } - } - for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - if (args->coll.comm->userStream == cudaStreamDefault || - args->coll.comm->userStream == cudaStreamPerThread || - args->coll.comm->userStream == cudaStreamLegacy) - CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end); - if (usingCudaGraphAll == 1) { - NCCLCHECKGOTO(ncclCudaGraphHostSetup(args->coll.comm, graphs[i]), ret, end); - } else { - ncclEnqueueHostSetup<0>(args->coll.comm->enqueueInfo); - } - NCCLCHECKGOTO(ncclLaunchBarrier(args->coll.comm), ret, end); - } - } - for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end); - NCCLCHECKGOTO(ncclLaunchKernel(args->coll.comm), ret, end); - } - } - for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - if (args->coll.comm->userStream == cudaStreamDefault || - args->coll.comm->userStream == cudaStreamPerThread || - args->coll.comm->userStream == cudaStreamLegacy) - CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end); - NCCLCHECKGOTO(ncclRecordEvents(args->coll.comm), ret, end); - NCCLCHECKGOTO(ncclLaunchReset(args->coll.comm), ret, end); - } + while (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(&ncclAsyncJobs); + if (ret != ncclSuccess && jobsDone && job->undo) job->undo(job); + if (job->destructor) job->destructor((void*)job); } - goto end; -group_cleanup: - if (ret != ncclSuccess) { - // At least one call in the group failed. Since we want to make that group - // an atomic operation, we need to cancel all operations. 
- for (int i=0; ifuncType == ASYNC_FUNC_INIT) { - if (args->init.newcomm) ncclCommDestroy(*args->init.newcomm); - *args->init.newcomm = NULL; - } else { - struct ncclComm* comm = args->coll.comm; - // Reset aggregation counters - comm->asyncOpCount = 0; - comm->asyncTotalSize = 0; - // Dequeue p2p lists - if (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) { - for (int peer=0; peernRanks; peer++) { - if (comm->p2pSends[peer]) comm->p2pSends[peer]->recycle(); - if (comm->p2pRecvs[peer]) comm->p2pRecvs[peer]->recycle(); - } - comm->p2pSendCount = comm->p2pRecvCount = 0; - } - ncclLaunchReset(comm); - } - } - } -end: ncclGroupError = ncclSuccess; - ncclGroupIndex = 0; + ncclGroupCommHead = nullptr; + ncclGroupCommPreconnectHead = nullptr; CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too - if (graphs) free(graphs); return ret; } diff --git a/src/include/alloc.h b/src/include/alloc.h index 14bccf9..29ec87a 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -10,27 +10,39 @@ #include "nccl.h" #include "checks.h" #include "align.h" +#include "utils.h" #include #include #include #include +uint64_t clockNano(); // from utils.h with which we have a circular dependency + template -static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped)); +ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + uint64_t time = 0; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + time = clockNano(); + CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish); + time = clockNano() - time; memset(*ptr, 0, nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); - return ncclSuccess; + INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p seconds: cudaHostAlloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; } #define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) -static inline ncclResult_t ncclCudaHostFree(void* ptr) { +inline ncclResult_t ncclCudaHostFree(void* ptr) { CUDACHECK(cudaFreeHost(ptr)); return ncclSuccess; } template -static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { +ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { void* p = malloc(nelem*sizeof(T)); if (p == NULL) { WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); @@ -44,7 +56,7 @@ static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, #define ncclCalloc(...) 
ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) template -static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { +ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { if (nelem < oldNelem) return ncclInternalError; if (nelem == oldNelem) return ncclSuccess; @@ -63,29 +75,105 @@ static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { } template -static ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - // Need async stream for P2P pre-connect + CUDA Graph +ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + uint64_t time = clockNano(); + CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + time = clockNano() - time; +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9); + return result; +} +#define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + uint64_t time0=0, time1=0, time2=0; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + // Need a side stream so as not to interfere with graph capture. cudaStream_t stream; + time0 = clockNano(); CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T))); - CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream)); - CUDACHECK(cudaStreamSynchronize(stream)); - CUDACHECK(cudaStreamDestroy(stream)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); - return ncclSuccess; + time1 = clockNano(); + CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + time2 = clockNano(); + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); + CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); + CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaStreamCreateWithFlags=%g cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time1-time0)/1.e9, double(time2-time1)/1.e9); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; } #define ncclCudaCalloc(...) 
ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) template -static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { - CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault)); - return ncclSuccess; +ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + uint64_t time = 0; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + time = clockNano(); + CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + time = clockNano() - time; + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; +} +#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + // Need a side stream so as not to interfere with graph capture. + cudaStream_t stream; + CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish); + CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); + CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; +} + +template +ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; +} + +template +ncclResult_t ncclCudaFree(T* ptr) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + CUDACHECKGOTO(cudaFree(ptr), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; } // Allocate memory to be potentially ibv_reg_mr'd. 
This needs to be // allocated on separate pages as those pages will be marked DONTFORK // and if they are shared, that could cause a crash in a child process -static ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { +inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { size_t page_size = sysconf(_SC_PAGESIZE); void* p; int size_aligned = ROUNDUP(size, page_size); diff --git a/src/include/channel.h b/src/include/channel.h index dc1536a..0ebb5a2 100644 --- a/src/include/channel.h +++ b/src/include/channel.h @@ -31,7 +31,8 @@ static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int } static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { - *channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; + //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; + *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels; return ncclSuccess; } diff --git a/src/include/checks.h b/src/include/checks.h index 9624608..715aeb7 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -9,7 +9,7 @@ #include "debug.h" -// Check CUDA calls +// Check CUDA RT calls #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ @@ -142,9 +142,9 @@ if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \ } while (!(cond)); -#define NCCLCHECKTHREAD(a) do { \ - if ((args->ret = (a)) != ncclSuccess) { \ - INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ +#define NCCLCHECKTHREAD(a, args) do { \ + if (((args)->ret = (a)) != ncclSuccess) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ return args; \ } \ } while(0) diff --git a/src/include/coll_net.h b/src/include/coll_net.h index c2d831e..f4b5408 100644 --- a/src/include/coll_net.h +++ b/src/include/coll_net.h @@ -10,25 +10,26 @@ #include "nccl.h" #include "nccl_net.h" -extern ncclCollNet_t* ncclCollNet; typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; // Translation to external API -static const char* collNetName() { return ncclCollNet->name; } -static ncclResult_t collNetDevices(int* ndev) { NCCLCHECK(ncclCollNet->devices(ndev)); return ncclSuccess; } -static ncclResult_t collNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclCollNet->getProperties(dev, props)); return ncclSuccess; } -static ncclResult_t collNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } -static ncclResult_t collNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } -static ncclResult_t collNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } -static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclCollNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } -static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; } -static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* 
recvMhandle, void** request) { - NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } -static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } -static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; } -static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; } -static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; } +static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } +static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } +static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } +static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } +static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } +static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } +static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } +/* DMA-BUF support */ +static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } +static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; } +static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } +static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } +static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } +static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } +static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { 
NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } -static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; } +static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } #endif diff --git a/src/include/collectives.h b/src/include/collectives.h index d65c6ae..7f0d0b6 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -36,7 +36,7 @@ struct ncclDevRedOpFull { /* Declare all collective operations */ #define DECL5(func, algo, proto, devredop, type) \ extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ - extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \ + extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ #define CONCAT(a,b) a##b #define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f) diff --git a/src/include/comm.h b/src/include/comm.h index 4b55dc6..ee752fc 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -10,6 +10,8 @@ #include "transport.h" #include "p2p.h" #include "collectives.h" +#include "proxy.h" +#include "strongstream.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -58,8 +60,6 @@ struct ncclRecvMem { }; }; -typedef cudaError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*); - enum helperThreadState {ThreadStart, ThreadStop}; #define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS) @@ -85,15 +85,87 @@ struct ncclNodeRanks { int* localRankToRank; }; -struct ncclComm { - struct ncclChannel channels[MAXCHANNELS]; +struct ncclDestructor { + struct ncclDestructor* next; + void* obj; + ncclResult_t(*fn)(struct ncclDestructor* me); +}; +struct ncclCommCallback { + struct ncclCommCallback* next; + ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb); +}; + +struct ncclChannel { + struct ncclChannelPeer* peers; + struct ncclDevChannelPeer* devPeers; + struct ncclRing ring; + int* devRingUserRanks; + struct ncclTree tree; + struct ncclDirect collTree; + int id; // index of this channel + uint32_t workFifoSent; // last used work index+1 + uint64_t p2pOpCount; +}; + +struct ncclWorkList { + struct ncclWorkList* next; + struct ncclWork work; +}; + +struct ncclPointerList { + struct ncclPointerList* next; + void *ptr; +}; + +struct ncclKernelPlan { + // A kernel plan is also a callback that reclaims itself. Hence this must + // be the first member. 
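// Editor's note (sketch, not NCCL code): the "must be the first member"
// comment above relies on a common intrusive-callback idiom: because the
// callback header sits at offset 0, a generic callback pointer dequeued from
// the comm's callback queue can be cast straight back to the enclosing plan,
// and the callback reclaims the plan itself. Minimal illustration with
// hypothetical types:
#include <cstdio>
#include <cstdlib>

struct Callback {
  Callback* next;
  void (*fn)(Callback* cb);
};

struct Plan {
  Callback reclaimer;    // must stay the first member for the cast below
  int id;
};

static void reclaimPlan(Callback* cb) {
  Plan* plan = reinterpret_cast<Plan*>(cb);  // valid: reclaimer is at offset 0
  printf("reclaiming plan %d\n", plan->id);
  free(plan);
}

int main() {
  Plan* p = static_cast<Plan*>(calloc(1, sizeof(Plan)));
  p->id = 42;
  p->reclaimer.fn = reclaimPlan;
  p->reclaimer.fn(&p->reclaimer);      // what a callback-poll loop would do
  return 0;
}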
+ struct ncclCommCallback reclaimer; + struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup + + struct ncclComm* comm; + struct ncclKernelPlan* next; + + bool persistent; // aka captured in a graph + void *kernelFn; + int channelUbound; // only channels c < channelUbound are present + int channelCount; // number of channels present + uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask) + bool hasProxyOps; // does any channel have a non-empty proxyOpQueue + int threadPerBlock; + // workHeap fields are null until uploadWorkFifo() or preparePersistentKernel() + struct ncclWork* workHead; + + int collOpCount; // zero based for this plan + + struct ncclIntruQueue ipcMemQueue; + + struct Channel { + int nWork; + union { + int nWorkElem; // used for coll and reg coll + int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1 + }; + size_t collBytes; + struct ncclIntruQueue workQueue; + struct ncclIntruQueue proxyOpQueue; + } channels[MAXCHANNELS]; +}; + +struct ncclComm { + struct ncclMemoryStack memPermanent, memScoped; + // List of destructors to run when comm is destructed + struct ncclDestructor* destructorHead; + + struct ncclChannel channels[MAXCHANNELS]; struct ncclPeerInfo* peerInfo; struct ncclTopoSystem* topo; + ncclNet_t* ncclNet; + ncclCollNet_t* ncclCollNet; void* bootstrap; // Bitmasks for ncclTransportP2pSetup - int connect; uint32_t* connectSend; uint32_t* connectRecv; @@ -114,12 +186,8 @@ struct ncclComm { // localRanks and localRanktoRank for all nodes struct ncclNodeRanks* nodeRanks; - enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode; - cudaStream_t userStream; - bool userStreamSet; - cudaEvent_t doneEvent; - cudaEvent_t intDoneEvent; bool checkPointers; + bool dmaBufSupport; // Counter for tracking CUDA launches (P2P and collectives included) uint64_t opCount; @@ -142,36 +210,37 @@ struct ncclComm { float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - // An internal CUDA stream for NCCL kernel CGMD launches - int groupCudaStream; - cudaStream_t groupStream; - // Whether there has been a fatal error in this communicator. ncclResult_t fatalError; // Flag to ask NCCL kernels to abort volatile uint32_t *abortFlag; - // Device side of the communicator - struct ncclDevComm *devComm; - // Host copy of the devComm (to free CUDA allocs) - struct ncclDevComm hostDevComm; + // Device side of the communicator (for cudaFree's) + struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm + + // Operation pool. + int workFifoDepth; // size of workFifoHeap[], power of 2 + struct ncclWork* workFifoHeap; + struct ncclWork* devWorkFifoHeap; + void* workFifoHeapGdrHandle; + + // Work completion notificaion + uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory + uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot. + uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels. 
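// Editor's note (sketch, not NCCL code): the workFifoSent / workFifoAckdMin /
// workFifoDone fields above describe counters that only grow (mod 2^32).
// The names and numbers below are reused purely for illustration of why
// unsigned subtraction keeps the accounting correct across wraparound; the
// real bookkeeping lives in enqueue.cc and may differ in detail:
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t depth = 1u << 10;               // fifo depth, power of two
  uint32_t sent = 0xFFFFFFF0u;                   // next unused slot (about to wrap)
  uint32_t done[2] = {0xFFFFFFE8u, 0xFFFFFFEBu}; // per-channel ack counters

  // Wrap-safe "minimum": the channel lagging furthest behind `sent` decides
  // how many slots are still in flight; unsigned subtraction handles wrap.
  uint32_t inFlight = 0;
  for (uint32_t d : done) inFlight = std::max(inFlight, sent - d);
  uint32_t ackdMin = sent - inFlight;

  printf("ackdMin=%u inFlight=%u free=%u\n", ackdMin, inFlight, depth - inFlight);
  return 0;
}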
// Intra-process sync + struct ncclComm* intraComm0; // leader of intra-process comms (self possible) + struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head + int intraRefs; // reference count from intra-process comms (zero if not leader else intraRanks) int intraRank; int intraRanks; - int* intraBarrier; - int intraPhase; - - // Storage for deferred intra-process launch - struct cudaLaunchParams * intraParams; - struct cudaLaunchParams *myParams; - pthread_t* intraThreads; - int* intraCudaDevs; - int* intraCGMode; // Whether we can use CUDA9 CGMD or not - int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not - struct ncclWorkElem args; - void* argsptrs[2]; + uint32_t intraBarrierPhase; + char intraPad1[64 - sizeof(uint64_t)]; + uint64_t intraBarrierCounter; // only used if this is intraComm0 + char intraPad2[64 - sizeof(uint64_t)]; + uint64_t intraBarrierGate; // only used if this is intraComm0 struct ncclProxyState proxyState; @@ -179,39 +248,98 @@ struct ncclComm { int collNetSupport; int intraHighestTransportType; - // Store info of async operations - struct ncclInfo* asyncOps; - int asyncOpCount; - size_t asyncTotalSize; - ssize_t channelSize; - int lastChannel; - enum { ROUND_ROBIN, SHORTEST_QUEUE } asyncAllocMode; + size_t channelSize; // User requested work size (bytes) for channel partitions - //list of async p2p operation queued in a group semantics - ncclP2Plist** p2pSends; - ncclP2Plist** p2pRecvs; - int p2pSendCount; - int p2pRecvCount; + // Internal streams + struct ncclStrongStream deviceStream, hostStream; - // Store info for cudaGraph - int usingCudaGraph; // Only use it during capture time, not launch time - struct ncclQueueInfo* enqueueInfo; - int nQueueInfoCreated; - int nQueueInfoDestroyed; - cudaGraphNode_t lastSetupNode; - unsigned long long lastCudaGraphId; - int driverVersion; - pfn_cuMemGetAddressRange_t pfnCuMemGetAddressRange; - pthread_t graphHelperThread; - struct ncclGraphHelperResources* graphHelperResources; - int disableGraphHelper; - int graphRegister; + // pools backed by comm->memPermanent + struct ncclMemoryPool memPool_ncclProxyOp; + struct ncclMemoryPool memPool_ncclKernelPlan; + struct ncclMemoryPool memPool_ncclPointerList; + // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when + // this comm is not yet in a group. + struct ncclComm* groupNext; + // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. + struct ncclComm* preconnectNext; + int persistentRefs; // number of persistent plan-lists capturing this comm + struct ncclTasks tasks; // user-created reduction ops int userRedOpCapacity, userRedOpFreeHead; ncclUserRedOp *userRedOps; + + // Queue of things for the main thread to do + struct ncclIntruQueueMpsc callbackQueue; + + // List of kernel plans built form tasks. + struct ncclIntruQueue planQueue; + // First of the unlaunched kernels in `planQueue` + struct ncclKernelPlan* unlaunchedPlansHead; }; +// Set to true during an `atexit()` handler. We use this to intentionally leak +// unfreed CUDA resources when cleaning up after return of `main()` to avoid +// CUDA calls after CUDA runtime teardown. 
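// Editor's note (sketch, not NCCL code): the comment above prefers leaking a
// few CUDA allocations over calling into the CUDA runtime after it may have
// been torn down at process exit. A minimal model of that pattern, built
// against the CUDA runtime (compile with nvcc); the helper name is made up:
#include <cstdlib>
#include <cuda_runtime.h>

static bool mainExited = false;
static void onExit() { mainExited = true; }

static void releaseDeviceBuffer(void* devPtr) {
  if (mainExited) return;            // intentionally leak: runtime may be gone
  cudaFree(devPtr);
}

int main() {
  atexit(onExit);
  void* buf = nullptr;
  cudaMalloc(&buf, 1 << 20);
  releaseDeviceBuffer(buf);          // frees here; skipped if reached after exit
  return 0;
}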
+extern bool ncclMainExited; + +enum ncclLaunchMode { + ncclLaunchModeInvalid=0, + ncclLaunchModeParallel, + ncclLaunchModeGroup +}; +extern enum ncclLaunchMode ncclParamLaunchMode; + +void ncclCommPushFree(struct ncclComm* comm, void* buf); +void ncclCommPushCudaFree(struct ncclComm* comm, void* buf); +void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf); +void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle); + +inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm) { + struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/false); + while (cb != nullptr) { + struct ncclCommCallback* next = cb->next; + NCCLCHECK(cb->fn(comm, cb)); // may reclaim memory of cb + cb = next; + } + return ncclSuccess; +} + +inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) { + int phase = comm->intraBarrierPhase; + if (comm->intraRanks == 1) { + // Release everyone (just me). + comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1); + } else { + struct ncclComm* comm0 = comm->intraComm0; + uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE); + if (uint32_t(count) == uint32_t(comm->intraRanks)) { + // Reset. + __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED); + // Release everyone. + __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE); + } + } +} + +// returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x) +inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) { + struct ncclComm* comm0 = comm->intraComm0; + comm->intraBarrierPhase ^= 1; + uint32_t phase = comm->intraBarrierPhase; + uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); + if ((gate & 1) != phase) { + uint64_t t0 = clockNano(); + do { + // Spin vigorously for first 5us. + if (clockNano()-t0 >= 5*1000) sched_yield(); + gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); + } while ((gate & 1) != phase); + } + if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE); + return gate>>32; +} + // Scrambles the bits of non-builtin values of ncclRedOp_t according to the // communicator memory address. Used to catch bugs so that integer handles // associated with this communicator won't collide with handles of other diff --git a/src/include/core.h b/src/include/core.h index 823a016..ac6ea77 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -55,6 +55,7 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) { #include "debug.h" #include "checks.h" +#include "cudawrap.h" #include "alloc.h" #include "utils.h" #include "param.h" diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h new file mode 100644 index 0000000..eaa5949 --- /dev/null +++ b/src/include/cudawrap.h @@ -0,0 +1,88 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CUDAWRAP_H_ +#define NCCL_CUDAWRAP_H_ + +#include + +#if CUDART_VERSION >= 11030 +#include +#else +typedef CUresult (CUDAAPI *PFN_cuInit)(unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion)(int *driverVersion); +typedef CUresult (CUDAAPI *PFN_cuGetProcAddress)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); +#endif + +#define CUPFN(symbol) pfn_##symbol + +// Check CUDA PFN driver calls +#define CUCHECK(cmd) do { \ + CUresult err = pfn_##cmd; \ + if( err != CUDA_SUCCESS ) { \ + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + WARN("Cuda failure '%s'", errStr); \ + return ncclUnhandledCudaError; \ + } \ +} while(false) + +#define CUCHECKGOTO(cmd, res, label) do { \ + CUresult err = pfn_##cmd; \ + if( err != CUDA_SUCCESS ) { \ + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + WARN("Cuda failure '%s'", errStr); \ + res = ncclUnhandledCudaError; \ + goto label; \ + } \ +} while(false) + +// Report failure but clear error and continue +#define CUCHECKIGNORE(cmd) do { \ + CUresult err = pfn_##cmd; \ + if( err != CUDA_SUCCESS ) { \ + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr); \ + } \ +} while(false) + +#define CUCHECKTHREAD(cmd, args) do { \ + CUresult err = pfn_##cmd; \ + if (err != CUDA_SUCCESS) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \ + args->ret = ncclUnhandledCudaError; \ + return args; \ + } \ +} while(0) + +#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol + +#if CUDART_VERSION >= 11030 +/* CUDA Driver functions loaded with cuGetProcAddress for versioning */ +DECLARE_CUDA_PFN_EXTERN(cuDeviceGet); +DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorString); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorName); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange); +DECLARE_CUDA_PFN_EXTERN(cuCtxCreate_v3020); +DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy); +DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent); +#if CUDA_VERSION >= 11070 +DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support +#endif +#endif + +/* CUDA Driver functions loaded with dlsym() */ +DECLARE_CUDA_PFN_EXTERN(cuInit); +DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion); +DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress); + + +ncclResult_t cudaLibraryInit(void); + +#endif diff --git a/src/include/debug.h b/src/include/debug.h index 7af38fd..cd6e53b 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -10,8 +10,8 @@ #include "nccl_net.h" #include #include +#include -#include #include #include #include @@ -21,7 +21,7 @@ extern int ncclDebugLevel; extern uint64_t ncclDebugMask; -extern pthread_mutex_t ncclDebugOutputLock; +extern pthread_mutex_t ncclDebugLock; extern FILE *ncclDebugFile; extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); @@ -29,13 +29,15 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file // Let code temporarily downgrade WARN into INFO extern thread_local int ncclDebugNoWarn; +extern char ncclLastError[]; #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) +#define TRACE_CALL(...) 
ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) #ifdef ENABLE_TRACE #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) -extern std::chrono::high_resolution_clock::time_point ncclEpoch; +extern std::chrono::steady_clock::time_point ncclEpoch; #else #define TRACE(...) #endif diff --git a/src/include/devcomm.h b/src/include/devcomm.h index 8ff9d4b..f8b630e 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -121,7 +121,6 @@ struct ncclRing { // since we need to know how the user expects data to be ordered across // devices. Ordered from current device. int* userRanks; - int* devUserRanks; int index; // This rank's index in the ring }; @@ -146,7 +145,7 @@ struct ncclDirect { }; #define NCCL_MAX_CONNS 2 -struct ncclPeer { +struct ncclChannelPeer { struct ncclConnector send[NCCL_MAX_CONNS]; struct ncclConnector recv[NCCL_MAX_CONNS]; }; @@ -158,30 +157,38 @@ struct ncclDevComm; /* Make sure to adjust padding at the end of ncclWorkElem. */ #define NCCL_WORK_SIZE 512 -enum ncclWorkElemType : uint8_t { +enum ncclWorkType : uint8_t { ncclWorkTypeUnused=0, ncclWorkTypeColl=1, ncclWorkTypeP2p=2, ncclWorkTypeRegColl=3 }; -enum ncclWorkElemSubType : uint8_t { - ncclWorkSubTypeUnused =0, - ncclWorkSubTypeSend, - ncclWorkSubTypeRecv +enum ncclWorkP2PType : uint8_t { + ncclWorkP2pTypeUnused=0, + ncclWorkP2pTypeSend, + ncclWorkP2pTypeRecv }; -struct ncclWorkElemHeader { +struct ncclWorkHeader { + union { + int32_t workNext; // when isLast=0: Offset from kernel argument workHead + uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back. + }; uint16_t funcIndex; - enum ncclWorkElemType type; - unsigned nWarps:5; - unsigned isLast:1; + uint8_t isLast:1; // last work for this kernel + uint8_t inFifo:1; // is this work in the fifo + enum ncclWorkType type; }; struct ncclWorkElem { - struct ncclWorkElemHeader header; - uint8_t regUsed; + union { + uint8_t flagBits; + struct { + uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1; + }; + }; + uint8_t nWarps; uint8_t direct; - uint8_t redOpArgIsPtr; const void * sendbuff; void * recvbuff; @@ -192,22 +199,29 @@ struct ncclWorkElem { uint8_t bid; uint8_t nChannels; uint64_t redOpArg; - uint64_t pad; }; -static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElem) == 0, "ncclWorkElem size must be a multiple of ncclWork size"); + +#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem)) +static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9"); struct ncclWorkElemP2p { - struct ncclWorkElemHeader header; int32_t peer; - void* buff; - size_t count; - int chunkSize; - uint8_t ngroups; - uint8_t warpStart; + enum ncclWorkP2PType p2pType; uint8_t nWarps; - enum ncclWorkElemSubType subType; + uint8_t warpStart; + uint8_t ngroups; + // Important not to use any fields with greater than 4-byte alignment since + // we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if + // there were 8-byte fields. 
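// Editor's note (sketch, not part of the patch): the alignment comment above
// is why the 64-bit buffer pointer and count are carried as two 32-bit
// halves. With only 4-byte members the element stays 28 bytes instead of
// being padded to 32. A standalone stand-in layout (field set simplified)
// showing the size property and the hi/lo round trip:
#include <cstdint>
#include <cstdio>

struct Elem {                        // stand-in for the ncclWorkElemP2p idea
  int32_t  peer;
  uint8_t  type, nWarps, warpStart, ngroups;
  uint32_t buffHi32, buffLo32;       // buff  = (uint64_t)buffHi32<<32 | buffLo32
  uint32_t countHi32, countLo32;     // count = (uint64_t)countHi32<<32 | countLo32
  int32_t  chunkSize;
};
static_assert(sizeof(Elem) == 28, "no 8-byte members, so no padding up to 32");

int main() {
  void* buff = reinterpret_cast<void*>(0x7f00deadbeefull);
  Elem e{};
  e.buffHi32 = uint32_t(reinterpret_cast<uintptr_t>(buff) >> 32);
  e.buffLo32 = uint32_t(reinterpret_cast<uintptr_t>(buff));
  void* back = reinterpret_cast<void*>((uint64_t(e.buffHi32) << 32) | e.buffLo32);
  printf("%s\n", back == buff ? "round-trip ok" : "mismatch");
  return 0;
}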
+ //void* buff; + uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32; + //size_t count; + uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32; + int chunkSize; }; -static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemP2p) == 0, "ncclWorkElemP2p size must be a multiple of ncclWork size"); + +static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16"); +#define NCCL_MAX_WORK_ELEMENTS_P2P 16 struct ncclWorkElemReg { struct ncclWorkElem elem; @@ -215,72 +229,59 @@ struct ncclWorkElemReg { void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; }; -static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemReg) == 0, "ncclWork size must be a multiple of ncclWorkElemReg size"); -static_assert(sizeof(struct ncclWorkElemReg) % sizeof(struct ncclWorkElem) == 0, "ncclWorkElemReg size must be a multiple of ncclWorkElem size"); -#define NCCL_MAX_WORK_ELEMENTS (NCCL_WORK_SIZE/sizeof(struct ncclWorkElem)) -#define NCCL_MAX_WORK_ELEMENTS_P2P (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemP2p)) -#define NCCL_MAX_WORK_ELEMENTS_REG (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemReg)) +#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg)) +static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2"); + // Number of named barriers supported by CUDA #define NCCL_MAX_GROUPS 16 struct ncclWork { + struct ncclWorkHeader header; union { - char pad[NCCL_WORK_SIZE]; - struct ncclWorkElemHeader header; + char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)]; struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P]; struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG]; }; }; +static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE"); +static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0"); -static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "ncclWork size needs to be well aligned"); - -struct ncclChannel { - union { - struct { - struct ncclRing ring; - struct ncclTree tree; - struct ncclDirect collTree; - - int id; - - // Communication structures - struct ncclPeer* peers; - struct ncclPeer* devPeers; - - // Operation list for aggregation - struct ncclWork* workFifo; - int workCount; - size_t totalSize; - uint64_t workFifoTail; // Only used by CPU - uint16_t index; // Only used by GPU - - // GDRCOPY support - struct ncclWork* workFifoGdr; - struct ncclWork* workFifoDev; - void* gdrMemDesc; - }; - int data[0x80]; - }; +struct ncclDevChannelPeer { + // Stripped version of ncclChannelPeer where we only keep the ncclConnInfo + // instead of the full ncclConnector. 
+ struct ncclConnInfo send[NCCL_MAX_CONNS]; + struct ncclConnInfo recv[NCCL_MAX_CONNS]; +}; + +struct alignas(16) ncclDevChannel { + struct ncclDevChannelPeer *peers; + struct ncclRing ring; + struct ncclTree tree; + struct ncclDirect collTree; + uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed }; -static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); struct ncclDevComm { int rank; int nRanks; int buffSizes[NCCL_NUM_PROTOCOLS]; + // Operation list for aggregation + int workFifoDepth; + struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory + // Flag to ask NCCL kernels to abort - volatile uint32_t *abortFlag; + volatile uint32_t* abortFlag; // Channels, device side - struct ncclChannel* channels; + struct ncclDevChannel* channels/*[MAXCHANNELS]*/; }; -struct ncclDevCommAndChannels { - ncclDevComm comm; - ncclChannel channels[MAXCHANNELS]; +struct alignas(16) ncclDevCommAndChannels { + struct ncclDevComm comm; + struct ncclDevChannel channels[MAXCHANNELS]; }; #endif diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 282342b..74b7ccd 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -10,6 +10,7 @@ #include "comm.h" #include "group.h" #include "collectives.h" +#include "utils.h" #define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ @@ -17,117 +18,10 @@ size_t ncclKernMaxLocalSize(); ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut); ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); -ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast); -ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm); -ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm); -ncclResult_t ncclLaunchBarrier(struct ncclComm* comm); -ncclResult_t ncclLaunchKernel(ncclComm_t comm); -ncclResult_t ncclRecordEvents(struct ncclComm* comm); -ncclResult_t ncclLaunchReset(ncclComm_t comm); -ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info); -ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm); -template -void CUDART_CB ncclEnqueueHostSetup(void* arg); -ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph); -ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph); +ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); +ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); +ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); +ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); +ncclResult_t ncclLaunchFinish(struct ncclComm* comm); -struct ncclBuffRegInfo { - void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS]; - void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS]; - void* sendbuffs[NCCL_MAX_LOCAL_RANKS]; - void* recvbuffs[NCCL_MAX_LOCAL_RANKS]; - int nBuffs; -}; - -// Enqueue information (for kernel and proxy) for each operation -struct ncclQueueElem { - struct ncclWork work; - struct ncclProxyOp proxyOp; - struct ncclBuffRegInfo buffRegInfo; -}; - -typedef ncclRecyclableList ncclQueueElemList; - -// Structure passed to CUDA graph -struct ncclQueueInfo { - ncclComm_t comm; - int maxChannels; // Dynamic version of gridDim - ncclResult_t ret; // Return value of host setup call - int nRegBuffs; - ncclQueueElemList* elemList; -}; - -static ncclResult_t ncclCreateQueueInfo(struct ncclQueueInfo** eqInfo, ncclComm_t comm) { - 
NCCLCHECK(ncclCalloc(eqInfo, 1)); - (*eqInfo)->comm = comm; - (*eqInfo)->elemList = new ncclQueueElemList(); - (*eqInfo)->comm->nQueueInfoCreated++; - return ncclSuccess; -} - -// Reset element queue -static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) { - if (eqInfo == NULL) return ncclInternalError; - eqInfo->maxChannels = 0; - eqInfo->ret = ncclSuccess; - eqInfo->nRegBuffs = 0; - eqInfo->elemList->recycle(); - return ncclSuccess; -} - -// Destroy enqueue info space -// used by both CUDA graph and non CUDA graph -static void ncclDestroyQueueInfo(void* ptr) { - if (ptr == NULL) return; - struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr; - struct ncclComm* comm = eqInfo->comm; - // Close IPC mem handles for registered buffers - struct ncclQueueElem* eqElem = eqInfo->elemList->begin(); -#if 0 - // Ideally, the deregistration should happen here - // but currently the destroy function of CUDA objects does not allow CUDA API calls - while (eqElem != NULL) { - for (int i=0; ibuffRegInfo.nBuffs; i++) { - if (i == eqInfo->comm->localRank) continue; - CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i])); - CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i])); - } - eqElem = eqInfo->elemList->getNext(); - } -#else - // Instead, we push these pointers to a pool owned by ncclComm - // and asks a helper thread to close mem handles - struct ncclGraphHelperResources* res = comm->graphHelperResources; - int ipcTailOld = 0; - if (res == NULL || (!comm->graphHelperThread) || eqInfo->nRegBuffs == 0) goto skip; - - pthread_mutex_lock(&res->threadLock); - ipcTailOld = res->ipcTail; - while (eqElem != NULL) { - for (int i=0; ibuffRegInfo.nBuffs; i++) { - if (eqElem->buffRegInfo.sendbuffsBase[i] != NULL) { - res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.sendbuffsBase[i]; - res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE; - } - if (eqElem->buffRegInfo.recvbuffsBase[i] != NULL) { - res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.recvbuffsBase[i]; - res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE; - } - } - eqElem = eqInfo->elemList->getNext(); - } - if (res->ipcTail != ipcTailOld) { - res->threadState = ThreadStart; - TRACE(NCCL_COLL, "CUDA Graph destroy function signaling helper thread with %d IPC handles", res->ipcTail-ipcTailOld); - pthread_cond_signal(&res->threadCond); - } - pthread_mutex_unlock(&res->threadLock); -#endif - -skip: - delete eqInfo->elemList; - free(eqInfo); - comm->nQueueInfoDestroyed++; - return; -} #endif // End include guard diff --git a/src/include/graph.h b/src/include/graph.h index 898b903..1997f76 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -23,7 +23,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); -ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info); +ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm); void ncclTopoFree(struct ncclTopoSystem* system); ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); @@ -33,7 +33,7 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); 
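(Editorial aside on the enqueue.h hunk above, not part of the patch.) The old ncclQueueInfo/ncclQueueElem machinery is removed; launches are now expressed as per-communicator kernel plans driven through the new entry points. The naming suggests roughly the following per-comm sequence, sketched here with an invented driver function; the real loop lives in group.cc/enqueue.cc and interleaves several communicators:

    // Sketch only: assumes the plans were produced earlier by ncclLaunchPrepare(comm)
    // and that ncclLaunchFinish(comm) runs once after all plans have been launched.
    static ncclResult_t launchPlansSketch(struct ncclComm* comm, struct ncclKernelPlan** plans, int nPlans) {
      for (int i = 0; i < nPlans; i++) {
        NCCLCHECK(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plans[i])); // stage work; no uncaptured CUDA here
        NCCLCHECK(ncclLaunchKernel(comm, plans[i]));                        // the CUDA kernel launch itself
        NCCLCHECK(ncclLaunchKernelAfter_NoCuda(comm, plans[i]));            // host-side bookkeeping; no CUDA calls
      }
      return ncclSuccess;
    }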
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); -int ncclPxnDisable(); +int ncclPxnDisable(struct ncclComm* comm); ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank); diff --git a/src/include/group.h b/src/include/group.h index 239b05f..e6f31b1 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -10,15 +10,82 @@ #include "nccl.h" #include "comm.h" -bool ncclAsyncMode(); -ncclResult_t ncclAsyncErrCheck(ncclResult_t ret); +ncclResult_t ncclGroupErrCheck(ncclResult_t ret); +void ncclGroupCommJoin(struct ncclComm* comm); +void ncclGroupCommPreconnect(struct ncclComm* comm); +void ncclGroupCommLeave(struct ncclComm* comm); typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); -typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +struct ncclAsyncJob { + struct ncclAsyncJob* next; + pthread_t thread; + ncclResult_t result; + ncclResult_t(*func)(struct ncclAsyncJob*); + void(*undo)(struct ncclAsyncJob*); + void(*destructor)(void*); +}; + +ncclResult_t ncclAsyncLaunch( + struct ncclAsyncJob* job, + ncclResult_t(*func)(struct ncclAsyncJob*), + void(*undo)(struct ncclAsyncJob*), + void(*destructor)(void*) +); + +ncclResult_t ncclGroupStartInternal(); +ncclResult_t ncclGroupEndInternal(); + +//////////////////////////////////////////////////////////////////////////////// + +extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting +extern __thread ncclResult_t ncclGroupError; +extern __thread struct ncclComm* ncclGroupCommHead; +extern __thread struct ncclComm* ncclGroupCommPreconnectHead; + +inline ncclResult_t ncclGroupStartInternal() { + ncclGroupDepth++; + return ncclSuccess; +} + +inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { + if (ncclGroupDepth > 0) { + if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret; + } + return ret; +} + +// Add comm to this thread's group +inline void ncclGroupCommJoin(struct ncclComm* comm) { + if (comm->groupNext == reinterpret_cast(0x1)) { + // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves + // the users program order yet insures siblings occur consecutively. This + // is required by doLaunches() in "group.cc". + struct ncclComm** pp = &ncclGroupCommHead; + while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) + pp = &(*pp)->groupNext; + comm->groupNext = *pp; + *pp = comm; + // Comms gets a new memory stack scope upon joining. Each task batched for + // this comm is allocated there. 
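    // (Editor's note, not part of the patch: the matching ncclMemoryStackPop is
    // issued by ncclGroupCommLeave below, so task allocations made while the comm
    // is in the group are reclaimed when it leaves.)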
+ ncclMemoryStackPush(&comm->memScoped); + } +} + +// Add comm to this thread's group needing preconnect +inline void ncclGroupCommPreconnect(struct ncclComm* comm) { + if (comm->preconnectNext == reinterpret_cast(0x1)) { + comm->preconnectNext = ncclGroupCommPreconnectHead; + ncclGroupCommPreconnectHead = comm; + } +} + +// Comm has left group +inline void ncclGroupCommLeave(struct ncclComm* comm) { + comm->groupNext = reinterpret_cast(0x1); + ncclMemoryStackPop(&comm->memScoped); +} -ncclResult_t ncclAsyncColl(ncclComm_t comm); #endif diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h index 63555ba..c747589 100644 --- a/src/include/ibvwrap.h +++ b/src/include/ibvwrap.h @@ -1067,6 +1067,9 @@ ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd); ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access); struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); +/* DMA-BUF support */ +ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); +struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr); ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context); ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel); diff --git a/src/include/info.h b/src/include/info.h index 3461cc7..b511728 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -10,6 +10,9 @@ #include "nccl.h" #include "devcomm.h" #include "collectives.h" +#include "core.h" +#include "utils.h" +#include "strongstream.h" typedef enum : uint8_t { ncclPatternRing, @@ -54,4 +57,62 @@ struct ncclInfo { int channelId; }; +inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) { + info->nBytes = info->count * ncclTypeSize(info->datatype); + if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { + info->count = info->nBytes; + info->datatype = ncclInt8; + } + if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank + return ncclSuccess; +} + +struct ncclTaskColl { + struct ncclTaskColl* next; + ncclFunc_t func; + void const* sendbuff; + void* recvbuff; + size_t count; + int root; + ncclDataType_t datatype; + ncclDevRedOpFull op; + int chunkSteps, sliceSteps; +}; +struct ncclTaskP2p { + ncclTaskP2p *next; + void *buff; + size_t bytes; + // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track + // of where it left off. + int chunk; +}; + +struct ncclCudaStreamList { + struct ncclCudaStreamList *next; + cudaStream_t stream; +}; + +struct ncclTasks { + struct Peer { + bool sendSeen, recvSeen; + struct ncclIntruQueue sendQueue; + struct ncclIntruQueue recvQueue; + }; + struct ncclIntruQueue collQueue; + size_t collBytesTotal; + struct Peer* peers/*[nRanks]*/; + int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/; + int nTasksColl, nTasksP2p; + + // The list of user streams aggregated over all tasks present. + struct ncclCudaStreamList* streams; + // The most recent user stream. Ignored if streams==nullptr + cudaStream_t streamRecent; + // The graph capturing all user streams or invalid if none. 
Thus we restrict the + // user that all streams must be captured in the same graph or not captured + // at all. Technically we could probably relax this, but that would mean + // collecting a different `ncclTasks` per graph and one for non-graph. + struct ncclCudaGraph capturingGraph; +}; + #endif diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index ce61672..255a44e 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -14,12 +14,13 @@ #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 +#define NCCL_PTR_DMABUF 0x4 // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 8 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); @@ -28,15 +29,15 @@ typedef struct { char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int speed; // Port speed in Mbps. int port; // Port number. float latency; // Network latency int maxComms; // Maximum number of comms we can create int maxRecvs; // Maximum number of grouped receives. -}ncclNetProperties_v5_t; +}ncclNetProperties_v6_t; -typedef ncclNetProperties_v5_t ncclNetProperties_t; +typedef ncclNetProperties_v6_t ncclNetProperties_t; typedef struct { // Name of the network (mainly for logs) @@ -46,7 +47,103 @@ typedef struct { // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +typedef ncclNet_v6_t ncclNet_t; + +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6 + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). 
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v6_t; + +typedef ncclCollNet_v6_t ncclCollNet_t; + +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6 + +// v5 struct for backwards compatibility +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. @@ -83,10 +180,7 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v5_t; -typedef ncclNet_v5_t ncclNet_t; - -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v5 - +// v5 struct for backwards compatibility typedef struct { // Name of the collective network (mainly for logs) const char* name; @@ -96,7 +190,7 @@ typedef struct { // If ndev returns 0, all other functions might be set to NULL. ncclResult_t (*devices)(int* ndev); // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create connections. @@ -125,10 +219,7 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v5_t; -typedef ncclCollNet_v5_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v5 - +// v4 struct for backwards compatibility typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. 
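(Editorial aside, not part of the patch.) NCCL resolves an external network plugin by looking up the symbol named by NCCL_PLUGIN_SYMBOL, which this change bumps to ncclNetPlugin_v6; the v6 table adds regMrDmaBuf and the NCCL_PTR_DMABUF pointer type for DMA-BUF registration. A plugin built against this header would export something along these lines (everything except the type and symbol names is invented, and only the first few callbacks are shown):

    #include "nccl_net.h"

    static ncclResult_t exampleInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
    static ncclResult_t exampleDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
    /* getProperties, listen, connect, accept, regMr, regMrDmaBuf, deregMr, isend,
     * irecv, iflush, test, closeSend, closeRecv and closeListen would be filled in
     * the same way; members omitted from the initializer below stay NULL. */

    ncclNet_v6_t ncclNetPlugin_v6 = {
      "example",        /* name */
      exampleInit,
      exampleDevices,
    };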
@@ -140,6 +231,7 @@ typedef struct { int maxComms; // Maximum number of comms we can create } ncclNetProperties_v4_t; +// v4 struct for backwards compatibility typedef struct { // Name of the network (mainly for logs) const char* name; @@ -179,6 +271,7 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v4_t; +// v4 struct for backwards compatibility typedef struct { // Name of the collective network (mainly for logs) const char* name; diff --git a/src/include/net.h b/src/include/net.h index 0cc5067..5a7b5e3 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -9,33 +9,36 @@ #include "nccl.h" #include "nccl_net.h" +#include "comm.h" #include "checks.h" -extern ncclNet_t* ncclNet; typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; -ncclResult_t ncclNetInit(); -int ncclNetVersion(); +ncclResult_t ncclNetPluginInit(); +ncclResult_t ncclNetInit(struct ncclComm* comm); +int ncclNetVersion(struct ncclComm* comm); // Translation to external API -static const char* ncclNetName() { return ncclNet->name; } -static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; } -static ncclResult_t ncclNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclNet->getProperties(dev, props)); return ncclSuccess; } -static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } -static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } -static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } -static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } -static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; } -static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; } -static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; } -static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; } -static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; } -static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; } -static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } -static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } +static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; } +static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; } +static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { 
NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; } +static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } +static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } +static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } +static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; } +/* DMA-BUF support */ +static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } +static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; } +static ncclResult_t ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; } +static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; } +static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; } +static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; } +static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; } +static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; } +static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; } // Test whether the current GPU support GPU Direct RDMA. 
-ncclResult_t ncclGpuGdrSupport(int* gdrSupport); +ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); extern ncclNet_t ncclNetIb; extern ncclNet_t ncclNetSocket; diff --git a/src/include/p2p.h b/src/include/p2p.h index 2519873..69d1ea7 100644 --- a/src/include/p2p.h +++ b/src/include/p2p.h @@ -9,19 +9,4 @@ #ifndef NCCL_P2P_H_ #define NCCL_P2P_H_ -struct ncclP2Pinfo { - void* buff; - ssize_t nbytes; -}; - -typedef ncclRecyclableList ncclP2Plist; - -static ncclResult_t ncclSaveP2pInfo(ncclP2Plist* &p2p, void* buff, ssize_t nBytes) { - if (p2p == NULL) p2p = new ncclP2Plist(); - struct ncclP2Pinfo* next; - NCCLCHECK(p2p->getNewElem(&next)); - next->buff = buff; - next->nbytes = nBytes; - return ncclSuccess; -} #endif diff --git a/src/include/proxy.h b/src/include/proxy.h index c7ca0aa..dcab5e2 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -32,11 +32,16 @@ struct ncclProxyOp { int sliceSteps; int chunkSteps; int chunkSize; - ncclDataType_t dtype; - ncclRedOp_t redOp; - ncclPattern_t pattern; // uint8_t + uint8_t /*ncclDataType_t*/ dtype; + uint8_t /*ncclDevRedOp_t*/ redOp; + uint8_t /*ncclPattern_t*/ pattern; uint8_t protocol; - uint16_t pad; + + union { + uint64_t unused; + // For use by enqueue.cc + struct ncclProxyOp *enqNext; + }; }; static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch"); @@ -68,9 +73,9 @@ struct ncclProxyArgs { int sliceSteps; int chunkSteps; int chunkSize; - ncclDataType_t dtype; - ncclRedOp_t redOp; - ncclPattern_t pattern; + uint8_t /*ncclDataType_t*/ dtype; + uint8_t /*ncclDevRedOp_t*/ redOp; + uint8_t /*ncclPattern_t*/ pattern; uint8_t protocol; int state; char* sharedBuff[NCCL_STEPS]; @@ -158,6 +163,7 @@ struct ncclProxyState { pthread_t thread; struct ncclSocket* listenSock; int stop; + CUcontext cudaCtx; // Used by main thread union ncclSocketAddress* peerAddresses; @@ -187,9 +193,8 @@ enum proxyMode { proxyTo = 2 }; -ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* proxyOp, int nranks); +ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire); ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp); -ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* proxyOp); ncclResult_t ncclProxyStart(struct ncclComm* comm); ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses); ncclResult_t ncclProxyCreate(struct ncclComm* comm); diff --git a/src/include/strongstream.h b/src/include/strongstream.h new file mode 100644 index 0000000..b72f77c --- /dev/null +++ b/src/include/strongstream.h @@ -0,0 +1,142 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_STRONGSTREAM_H_ +#define NCCL_STRONGSTREAM_H_ + +#include "nccl.h" +#include "checks.h" + +#include + +/* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes + * easily. 
+ */ +struct ncclCudaGraph { +#if CUDART_VERSION >= 11030 + cudaGraph_t graph; + uint64_t graphId; +#endif +}; + +inline struct ncclCudaGraph ncclCudaGraphNull() { + struct ncclCudaGraph tmp; + #if CUDART_VERSION >= 11030 + tmp.graph = nullptr; + tmp.graphId = ULLONG_MAX; + #endif + return tmp; +} + +inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) { + #if CUDART_VERSION >= 11030 + return graph.graph != nullptr; + #else + return false; + #endif +} + +inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) { + #if CUDART_VERSION >= 11030 + return a.graphId == b.graphId; + #else + return true; + #endif +} + +ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, cudaStream_t stream); +ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg); + + +/* ncclStrongStream: An abstraction over CUDA streams that do not lose their + * identity while being captured. Regular streams have the deficiency that the + * captured form of a stream in one graph launch has no relation to the + * uncaptured stream or to the captured form in other graph launches. This makes + * streams unfit for the use of serializing access to a persistent resource. + * Strong streams have been introduced to address this need. + * + * Constraints of using strong streams: + * + * - Operations that enqueue work to the strong stream need to be enclosed by + * ncclStrongStream[Acquire/Release] pairs. Acquire/release act like fences, + * the strong stream is not stateful so there is no harm in redundant acquire + * or releases. + * + * - An {Acquire; ...; Release} sequence must not be concurrent with any + * other operations against the strong stream including graph launches which + * reference this stream. + * + * - All strong stream functions take a "graph" parameter which must reference + * the currently capturing graph, or null if none. + */ +struct ncclStrongStream; + +ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss); +ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss); + +// Has this strong stream ever been captured in a graph. +bool ncclStrongStreamEverCaptured(struct ncclStrongStream* ss); + +// Acquire-fence the strong stream. +ncclResult_t ncclStrongStreamAcquire( + struct ncclCudaGraph graph, struct ncclStrongStream* ss +); + +// Acquire-fence the strong stream assuming no graph is capturing. This permits +// the caller to enqueue directly to the `ss->stream` member using native CUDA +// calls. Strong stream must be released via: +// ncclStrongStreamRelease(ncclCudaGraphNull(), graphRefs, ss); +ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss); + +// Release-fence of the strong stream. +ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss); + +// Add a host launch to the stream. +ncclResult_t ncclStrongStreamLaunchHost( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, + cudaHostFn_t fn, void* arg +); +// Add a kernel launch to the stream. +ncclResult_t ncclStrongStreamLaunchKernel( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, + void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes +); +// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired. +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b +); +// `b` must be capturing within `graph`. 
+ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b +); +// `a` must be capturing within `graph`. +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b +); + +// Synchrnoization does not need the strong stream to be acquired. +ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); + +//////////////////////////////////////////////////////////////////////////////// + +struct ncclStrongStream { + cudaStream_t stream; + cudaEvent_t event; + #if CUDART_VERSION >= 11030 + cudaGraphNode_t node; // null if never captured, otherwise never null again + uint64_t graphId:63, eventIsLagging:1; + #endif +}; + +inline bool ncclStrongStreamEverCaptured(struct ncclStrongStream* ss) { + #if CUDART_VERSION >= 11030 + return ss->node != nullptr; + #else + return false; + #endif +} + +#endif diff --git a/src/include/transport.h b/src/include/transport.h index 043a415..e13c9e8 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -20,7 +20,12 @@ #include "proxy.h" -extern struct ncclTransport ncclTransports[]; +extern struct ncclTransport p2pTransport; +extern struct ncclTransport shmTransport; +extern struct ncclTransport netTransport; +extern struct ncclTransport collNetTransport; + +extern struct ncclTransport* ncclTransports[]; // Forward declarations struct ncclRing; @@ -63,7 +68,7 @@ struct ncclTransport { struct ncclTransportComm recv; }; -ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); +ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); enum { collNetRecv=0, collNetSend=1 }; diff --git a/src/include/utils.h b/src/include/utils.h index f08ff37..0604d15 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -8,8 +8,12 @@ #define NCCL_UTILS_H_ #include "nccl.h" +#include "alloc.h" #include "checks.h" #include +#include +#include +#include int ncclCudaCompCap(); @@ -38,81 +42,446 @@ static long log2i(long n) { return l; } -// Recyclable list that avoids frequent malloc/free +inline uint64_t clockNano() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec; +} + +//////////////////////////////////////////////////////////////////////////////// + +template +inline void ncclAtomicRefCountIncrement(Int* refs) { + __atomic_fetch_add(refs, 1, __ATOMIC_RELAXED); +} + +template +inline Int ncclAtomicRefCountDecrement(Int* refs) { + return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL); +} + +//////////////////////////////////////////////////////////////////////////////// +/* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that + * granularity of LIFO is not per object, instead frames containing many objects + * are pushed and popped. Therefor deallocation is extremely cheap since its + * done at the frame granularity. + * + * The initial state of the stack is with one frame, the "nil" frame, which + * cannot be popped. Therefor objects allocated in the nil frame cannot be + * deallocated sooner than stack destruction. 
+ */ +struct ncclMemoryStack; + +void ncclMemoryStackConstruct(struct ncclMemoryStack* me); +void ncclMemoryStackDestruct(struct ncclMemoryStack* me); +void ncclMemoryStackPush(struct ncclMemoryStack* me); +void ncclMemoryStackPop(struct ncclMemoryStack* me); template -struct ncclListElem { - T data; - struct ncclListElem* next; +T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1); + +//////////////////////////////////////////////////////////////////////////////// +/* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for + * a pool instance to ever hold objects whose type have differing + * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by + * a backing `ncclMemoryStack` passed during Alloc(). If memory + * backing any currently held object is deallocated then it is an error to do + * anything other than reconstruct it, after which it is a valid empty pool. + */ +struct ncclMemoryPool; + +// Equivalent to zero-initialization +void ncclMemoryPoolConstruct(struct ncclMemoryPool* me); +template +T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing); +template +void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj); +void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from); + +//////////////////////////////////////////////////////////////////////////////// +/* ncclIntruQueue: A singly-linked list queue where the per-object next pointer + * field is given via the `next` template argument. + * + * Example: + * struct Foo { + * struct Foo *next1, *next2; // can be a member of two lists at once + * }; + * ncclIntruQueue list1; + * ncclIntruQueue list2; + */ +template +struct ncclIntruQueue; + +template +void ncclIntruQueueConstruct(ncclIntruQueue *me); +template +bool ncclIntruQueueEmpty(ncclIntruQueue *me); +template +T* ncclIntruQueueHead(ncclIntruQueue *me); +template +void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x); +template +T* ncclIntruQueueDequeue(ncclIntruQueue *me); +template +T* ncclIntruQueueTryDequeue(ncclIntruQueue *me); +template +void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *memPool); + +//////////////////////////////////////////////////////////////////////////////// +/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex" + * and "cond" fields are part of the public interface. + */ +struct ncclThreadSignal { + pthread_mutex_t mutex; + pthread_cond_t cond; }; -template -class ncclRecyclableList { - private: - struct ncclListElem* head; - struct ncclListElem* tail; - struct ncclListElem* cursor; - int n; +// returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER} +constexpr ncclThreadSignal ncclThreadSignalStaticInitializer(); - public: - ncclRecyclableList() { - tail = cursor = head = NULL; - n = 0; - } +void ncclThreadSignalConstruct(struct ncclThreadSignal* me); +void ncclThreadSignalDestruct(struct ncclThreadSignal* me); - int count() const { return n; } +// A convenience instance per-thread. 
+extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance; - // Get a new element from the list and return pointer - ncclResult_t getNewElem(T** dataOut) { - if (tail != NULL) { - *dataOut = &tail->data; - memset(*dataOut, 0, sizeof(T)); - } else { - NCCLCHECK(ncclCalloc(&tail, 1)); - *dataOut = &tail->data; - cursor = head = tail; - } - if (tail->next == NULL) { - NCCLCHECK(ncclCalloc(&tail->next, 1)); - } - tail = tail->next; - n += 1; - return ncclSuccess; - } +//////////////////////////////////////////////////////////////////////////////// - T* begin() { - if (head == NULL || head == tail) return NULL; - cursor = head->next; - return &head->data; - } +template +struct ncclIntruQueueMpsc; - // Get next element from the list during an iteration - T* getNext() { - // tail always points to the next element to be enqueued - // hence does not contain valid data - if (cursor == NULL || cursor == tail) return NULL; - T* rv = &cursor->data; - cursor = cursor->next; - return rv; - } +template +void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me); +template +bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me); +// Enqueue element. Returns true if queue is not abandoned. Even if queue is +// abandoned the element enqueued, so the caller needs to make arrangements for +// the queue to be tended. +template +bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc* me, T* x); +// Dequeue all elements at a glance. If there aren't any and `waitSome` is +// true then this call will wait until it can return a non empty list. +template +T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc* me, bool waitSome); +// Dequeue all elements and set queue to abandoned state. +template +T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc* me); - T* peakNext() { - if (cursor == NULL || cursor == tail) return NULL; - return &cursor->data; - } +//////////////////////////////////////////////////////////////////////////////// - // Recycle the list without freeing the space - void recycle() { - tail = cursor = head; - n = 0; - } +struct ncclMemoryStack { + struct Hunk { + struct Hunk* above; // reverse stack pointer + size_t size; // size of this allocation (including this header struct) + }; + struct Unhunk { // proxy header for objects allocated out-of-hunk + struct Unhunk* next; + void* obj; + }; + struct Frame { + struct Hunk* hunk; // top of non-empty hunks + uintptr_t bumper, end; // points into top hunk + struct Unhunk* unhunks; + struct Frame* below; + }; - ~ncclRecyclableList() { - while (head != NULL) { - struct ncclListElem* temp = head; - head = head->next; - free(temp); - } - } + static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align); + static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align); + + struct Hunk stub; + struct Frame topFrame; }; +inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) { + me->stub.above = nullptr; + me->stub.size = 0; + me->topFrame.hunk = &me->stub; + me->topFrame.bumper = 0; + me->topFrame.end = 0; + me->topFrame.unhunks = nullptr; + me->topFrame.below = nullptr; +} + +inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) { + uintptr_t o = (me->topFrame.bumper + align-1) & -uintptr_t(align); + void* obj; + if (__builtin_expect(o + size <= me->topFrame.end, true)) { + me->topFrame.bumper = o + size; + obj = reinterpret_cast(o); + } else { + obj = allocateSpilled(me, size, align); + } + return obj; +} + +template +inline T* 
ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) { + void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T)); + memset(obj, 0, n*sizeof(T)); + return (T*)obj; +} + +inline void ncclMemoryStackPush(struct ncclMemoryStack* me) { + using Frame = ncclMemoryStack::Frame; + Frame tmp = me->topFrame; + Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame)); + *snapshot = tmp; // C++ struct assignment + me->topFrame.unhunks = nullptr; + me->topFrame.below = snapshot; +} + +inline void ncclMemoryStackPop(struct ncclMemoryStack* me) { + ncclMemoryStack::Unhunk* un = me->topFrame.unhunks; + while (un != nullptr) { + free(un->obj); + un = un->next; + } + me->topFrame = *me->topFrame.below; // C++ struct assignment +} + + +//////////////////////////////////////////////////////////////////////////////// + +struct ncclMemoryPool { + struct Cell { + Cell *next; + }; + template + union CellSized { + Cell cell; + alignas(Align) char space[Size]; + }; + struct Cell* head; + struct Cell* tail; // meaningful only when head != nullptr +}; + +inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { + me->head = nullptr; +} + +template +inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) { + using Cell = ncclMemoryPool::Cell; + using CellSized = ncclMemoryPool::CellSized; + Cell* cell; + if (__builtin_expect(me->head != nullptr, true)) { + cell = me->head; + me->head = cell->next; + } else { + // Use the internal allocate() since it doesn't memset to 0 yet. + cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized)); + } + memset(cell, 0, sizeof(T)); + return reinterpret_cast(cell); +} + +template +inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) { + using Cell = ncclMemoryPool::Cell; + Cell* cell = reinterpret_cast(obj); + cell->next = me->head; + if (me->head == nullptr) me->tail = cell; + me->head = cell; +} + +inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) { + if (from->head != nullptr) { + from->tail->next = me->head; + if (me->head == nullptr) me->tail = from->tail; + me->head = from->head; + from->head = nullptr; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template +struct ncclIntruQueue { + T *head, *tail; +}; + +template +inline void ncclIntruQueueConstruct(ncclIntruQueue *me) { + me->head = nullptr; + me->tail = nullptr; +} + +template +inline bool ncclIntruQueueEmpty(ncclIntruQueue *me) { + return me->head == nullptr; +} + +template +inline T* ncclIntruQueueHead(ncclIntruQueue *me) { + return me->head; +} + +template +inline T* ncclIntruQueueTail(ncclIntruQueue *me) { + return me->tail; +} + +template +inline void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x) { + x->*next = nullptr; + (me->head ? 
me->tail->*next : me->head) = x; + me->tail = x; +} + +template +inline T* ncclIntruQueueDequeue(ncclIntruQueue *me) { + T *ans = me->head; + me->head = ans->*next; + if (me->head == nullptr) me->tail = nullptr; + return ans; +} + +template +inline T* ncclIntruQueueTryDequeue(ncclIntruQueue *me) { + T *ans = me->head; + if (ans != nullptr) { + me->head = ans->*next; + if (me->head == nullptr) me->tail = nullptr; + } + return ans; +} + +template +void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *pool) { + T *head = me->head; + me->head = nullptr; + me->tail = nullptr; + while (head != nullptr) { + T *tmp = head->*next; + ncclMemoryPoolFree(pool, tmp); + head = tmp; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { + return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; +} + +inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) { + pthread_mutex_init(&me->mutex, nullptr); + pthread_cond_init(&me->cond, nullptr); +} + +inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) { + pthread_mutex_destroy(&me->mutex); + pthread_cond_destroy(&me->cond); +} + +//////////////////////////////////////////////////////////////////////////////// + +template +struct ncclIntruQueueMpsc { + T* head; + uintptr_t tail; + struct ncclThreadSignal* waiting; +}; + +template +void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me) { + me->head = nullptr; + me->tail = 0x0; + me->waiting = nullptr; +} + +template +bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me) { + return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2; +} + +template +bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc* me, T* x) { + __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED); + uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast(x), __ATOMIC_ACQ_REL); + T* prev = reinterpret_cast(utail); + T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next); + __atomic_store_n(prevNext, x, __ATOMIC_RELAXED); + if (utail == 0x1) { // waiting + __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting + // This lock/unlock is essential to ensure we don't race ahead of the consumer + // and signal the cond before they begin waiting on it. + struct ncclThreadSignal* waiting = me->waiting; + pthread_mutex_lock(&waiting->mutex); + pthread_mutex_unlock(&waiting->mutex); + pthread_cond_broadcast(&waiting->cond); + } + return utail != 0x2; // not abandoned +} + +template +T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc* me, bool waitSome) { + T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); + if (head == nullptr) { + if (!waitSome) return nullptr; + uint64_t t0 = clockNano(); + bool sleeping = false; + do { + if (clockNano()-t0 >= 10*1000) { // spin for first 10us + struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance; + pthread_mutex_lock(&waitSignal->mutex); + uintptr_t expected = sleeping ? 
0x1 : 0x0; + uintptr_t desired = 0x1; + me->waiting = waitSignal; // release done by successful compare exchange + if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) { + sleeping = true; + pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex); + } + pthread_mutex_unlock(&waitSignal->mutex); + } + head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); + } while (head == nullptr); + } + + __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); + uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL); + T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); + T *x = head; + while (x != tail) { + T *x1; + int spins = 0; + while (true) { + x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); + if (x1 != nullptr) break; + if (++spins == 1024) { spins = 1024-1; sched_yield(); } + } + x = x1; + } + return head; +} + +template +T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc* me) { + uintptr_t expected = 0x0; + if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) { + return nullptr; + } else { + int spins = 0; + T* head; + while (true) { + head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); + if (head != nullptr) break; + if (++spins == 1024) { spins = 1024-1; sched_yield(); } + } + __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); + uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL); + T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); + T *x = head; + while (x != tail) { + T *x1; + spins = 0; + while (true) { + x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); + if (x1 != nullptr) break; + if (++spins == 1024) { spins = 1024-1; sched_yield(); } + } + x = x1; + } + return head; + } +} #endif diff --git a/src/init.cc b/src/init.cc index c6b6e8f..9269708 100644 --- a/src/init.cc +++ b/src/init.cc @@ -28,10 +28,6 @@ #define STR2(v) #v #define STR(v) STR2(v) -#ifdef ENABLE_TRACE -std::chrono::high_resolution_clock::time_point ncclEpoch; -#endif - #if CUDART_VERSION >= 9020 #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream #else @@ -46,6 +42,17 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); +static uint64_t hashUniqueId(ncclUniqueId const &id) { + char const *bytes = (char const*)&id; + uint64_t h = 0xdeadbeef; + for(int i=0; i < (int)sizeof(ncclUniqueId); i++) { + h ^= h >> 32; + h *= 0x8db3db47fa2994ad; + h += bytes[i]; + } + return h; +} + // GDRCOPY support: Off by default NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); @@ -65,18 +72,28 @@ NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; static bool initialized = false; static size_t maxLocalSizeBytes = 0; + +bool ncclMainExited = false; + +static void atexitHandler() { + ncclMainExited = true; +} + static ncclResult_t ncclInit() { - if (initialized) return ncclSuccess; + if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess; pthread_mutex_lock(&initLock); if (!initialized) { + atexit(atexitHandler); initEnv(); initGdrCopy(); maxLocalSizeBytes = ncclKernMaxLocalSize(); int carveout = ncclParamL1SharedMemoryCarveout(); if (carveout) ncclKernSetSharedMemoryCarveout(carveout); - NCCLCHECK(ncclNetInit()); - INFO(NCCL_INIT, "Using network %s", ncclNetName()); - initialized = true; + // Always initialize bootstrap network + 
NCCLCHECK(bootstrapNetInit()); + NCCLCHECK(ncclNetPluginInit()); + + __atomic_store_n(&initialized, true, __ATOMIC_RELEASE); } pthread_mutex_unlock(&initLock); return ncclSuccess; @@ -93,7 +110,9 @@ NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out); ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { NCCLCHECK(ncclInit()); NCCLCHECK(PtrCheck(out, "GetUniqueId", "out")); - return bootstrapGetUniqueId(out); + ncclResult_t res = bootstrapGetUniqueId(out); + TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out)); + return res; } // Prevent compiler from optimizing out these operations @@ -104,11 +123,96 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { #endif void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) { + // Important that this does not trash intraComm0 & intraRefs. comm->rank = comm->cudaDev = comm->busId = comm->nRanks = -1; } #undef NCCL_NO_OPTIMIZE + +static ncclResult_t ncclDestructorFnFree(struct ncclDestructor* dtor) { + free(dtor->obj); + return ncclSuccess; +} +void ncclCommPushFree(struct ncclComm* comm, void* obj) { + struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); + dtor->fn = ncclDestructorFnFree; + dtor->obj = obj; + dtor->next = comm->destructorHead; + comm->destructorHead = dtor; +} + +static ncclResult_t ncclDestructorFnCudaFree(struct ncclDestructor* dtor) { + CUDACHECK(cudaFree(dtor->obj)); + return ncclSuccess; +} +void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) { + struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); + dtor->fn = ncclDestructorFnCudaFree; + dtor->obj = obj; + dtor->next = comm->destructorHead; + comm->destructorHead = dtor; +} + +static ncclResult_t ncclDestructorFnCudaHostFree(struct ncclDestructor* dtor) { + CUDACHECK(cudaFreeHost(dtor->obj)); + return ncclSuccess; +} +void ncclCommPushCudaHostFree(struct ncclComm* comm, void* obj) { + struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); + dtor->fn = ncclDestructorFnCudaHostFree; + dtor->obj = obj; + dtor->next = comm->destructorHead; + comm->destructorHead = dtor; +} + +static ncclResult_t ncclDestructorFnCudaGdrFree(struct ncclDestructor* dtor) { + NCCLCHECK(ncclGdrCudaFree(dtor->obj)); + return ncclSuccess; +} +void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle) { + struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); + dtor->fn = ncclDestructorFnCudaGdrFree; + dtor->obj = handle; + dtor->next = comm->destructorHead; + comm->destructorHead = dtor; +} + +void commZombieCleanup(struct ncclComm* comm) { + ncclMemoryStackDestruct(&comm->memScoped); + ncclMemoryStackDestruct(&comm->memPermanent); + + struct ncclComm* intraComm0 = comm->intraComm0; + if (0 == ncclAtomicRefCountDecrement(&intraComm0->intraRefs)) { + // Wait for all service threads to be done. 
We could not + // do it earlier because it could have blocked and prevented + // other ranks in the process to call ncclCommDestroy + comm = intraComm0; + while (comm != nullptr) { + if (comm->proxyState.thread) pthread_join(comm->proxyState.thread, nullptr); + struct ncclComm* next = comm->intraNext; + free(comm); + comm = next; + } + } +} + +static void* commZombieMain(void* arg) { + ncclResult_t result = ncclSuccess; + struct ncclComm* comm = (struct ncclComm*)arg; + while (comm->persistentRefs != 0) { + struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/true); + while (cb != nullptr) { + struct ncclCommCallback* next = cb->next; + NCCLCHECKGOTO(cb->fn(comm, cb), result, ignore); // may reclaim memory of cb + ignore: + cb = next; + } + } + commZombieCleanup(comm); + return arg; +} + static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; @@ -120,13 +224,6 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->connectSend); free(comm->connectRecv); - for (int peer=0; peernRanks; peer++) { - delete comm->p2pSends[peer]; - delete comm->p2pRecvs[peer]; - } - free(comm->p2pSends); - free(comm->p2pRecvs); - free(comm->asyncOps); free(comm->peerInfo); ncclTopoFree(comm->topo); @@ -138,51 +235,60 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); - CUDACHECK(cudaFree((ncclDevCommAndChannels*)comm->devComm)); - for (int channel=0; channelchannels+channel, comm->nRanks)); - if (comm->doneEvent != NULL) - CUDACHECK(cudaEventDestroy(comm->doneEvent)); + NCCLCHECK(ncclStrongStreamDestruct(&comm->hostStream)); + NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream)); - if (comm->intDoneEvent != NULL) - CUDACHECK(cudaEventDestroy(comm->intDoneEvent)); - - if (comm->launchMode == ncclComm::GROUP) { - CUDACHECK(cudaStreamDestroy(comm->groupStream)); - } - - // Last rank frees shared resources between threads - int isLast; - NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); - if (isLast) { - // Wait for all service threads to be done. We could not - // do it earlier because it could have blocked and prevented - // other ranks in the process to call ncclCommDestroy - for (int i=0; iintraRanks; i++) { - void* ret; - if (comm->intraThreads[i]) pthread_join(comm->intraThreads[i], &ret); - } - free(comm->intraBarrier); - free(comm->intraParams); - free(comm->intraThreads); - free(comm->intraCudaDevs); - free(comm->intraCGMode); - free(comm->intraCC); - } NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag)); - // Poison comm to try and catch a double free - commPoison(comm); + struct ncclDestructor* dtor = comm->destructorHead; + while (dtor != nullptr) { + NCCLCHECK(dtor->fn(dtor)); + dtor = dtor->next; + } - free(comm); + commPoison(comm); // Important that this does not interfere with anything used below. + + if (comm->persistentRefs == 0) { + commZombieCleanup(comm); + } else { + // Spawn a thread to listen for remaining messages from graph cleanup. 
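// ----------------------------------------------------------------------------
// Illustrative sketch, not part of this diff: the general shape of the "zombie"
// hand-off used below. When captured graphs still hold persistent references,
// the comm is handed to a detached thread that keeps draining queued callbacks
// until the last reference drops, and only then runs the final cleanup, so the
// destroying caller can return immediately. All names here (PendingWork,
// zombieMain, retireAsync) are invented for the example; they are not NCCL
// symbols, and the real code drains an ncclIntruQueueMpsc rather than spinning.
#include <pthread.h>
struct PendingWork {
  int persistentRefs;   // dropped as captured graphs get destroyed
  // ... queue of cleanup callbacks to drain ...
};
static void* zombieMain(void* arg) {
  PendingWork* w = (PendingWork*)arg;
  while (__atomic_load_n(&w->persistentRefs, __ATOMIC_ACQUIRE) != 0) {
    // dequeue and run callbacks here; each callback may drop persistentRefs
  }
  delete w;             // final cleanup runs off the destroying thread
  return nullptr;
}
static void retireAsync(PendingWork* w) {  // w must come from `new`
  pthread_t t;
  pthread_create(&t, nullptr, zombieMain, w);
  pthread_detach(t);    // caller returns without waiting for cleanup
}
// ----------------------------------------------------------------------------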
+ pthread_t zombie; + pthread_create(&zombie, nullptr, commZombieMain, comm); + pthread_detach(zombie); + } return ncclSuccess; } NCCL_PARAM(AggChannelSize, "AGG_CHANNEL_SIZE", -2); NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0); -NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 0); +// GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory +NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); +NCCL_PARAM(WorkFifoDepth, "WORK_FIFO_DEPTH", 64<<10); +enum ncclLaunchMode ncclParamLaunchMode; + +NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 1); + +// Detect DMA-BUF support +static ncclResult_t dmaBufSupported(struct ncclComm* comm) { + if (ncclParamDmaBufEnable() == 0 || comm->ncclNet->regMrDmaBuf == NULL) return ncclInternalError; +#if CUDA_VERSION >= 11070 + int flag = 0; + CUdevice dev; + int cudaDriverVersion; + CUCHECK(cuDriverGetVersion(&cudaDriverVersion)); + if (cudaDriverVersion < 11070) return ncclInternalError; + CUCHECK(cuDeviceGet(&dev, comm->cudaDev)); + // Query device to see if DMA-BUF support is available + (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev)); + if (flag == 0) return ncclInternalError; + INFO(NCCL_INIT, "DMA-BUF is available on GPU device %d", comm->cudaDev); + return ncclSuccess; +#endif + return ncclInternalError; +} static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { if (ndev < 1) { @@ -194,100 +300,114 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { return ncclInvalidArgument; } - // Try to create a CUDA object right away. If there is something wrong with - // the device we're on (failure cause #1) , better know it early. - cudaEvent_t doneEvent; - CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming)); - cudaEvent_t intDoneEvent; - CUDACHECK(cudaEventCreateWithFlags(&intDoneEvent, cudaEventDisableTiming)); - struct ncclComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); + ncclMemoryStackConstruct(&comm->memPermanent); + ncclMemoryStackConstruct(&comm->memScoped); + comm->destructorHead = nullptr; + comm->rank = rank; + comm->nRanks = ndev; + + NCCLCHECK(ncclNetInit(comm)); + INFO(NCCL_INIT, "Using network %s", ncclNetName(comm)); + + // Try to create a CUDA object right away. If there is something wrong with + // the device we're on (failure cause #1) , better know it early. + NCCLCHECK(ncclStrongStreamConstruct(&comm->deviceStream)); + NCCLCHECK(ncclStrongStreamConstruct(&comm->hostStream)); - comm->rank = comm->hostDevComm.rank = rank; - comm->nRanks = comm->hostDevComm.nRanks = ndev; cudaGetDevice(&comm->cudaDev); NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId); - comm->doneEvent = doneEvent; - comm->intDoneEvent = intDoneEvent; comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; -#if CUDART_VERSION >= 9020 - comm->groupCudaStream = ncclParamGroupCudaStream(); -#else - // Don't allow the user to overload the default setting in older CUDA builds - comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM; -#endif + comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? 
true : false; comm->fatalError = ncclSuccess; NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1)); - comm->hostDevComm.abortFlag = comm->abortFlag; *comm->abortFlag = 0; - comm->argsptrs[0] = &comm->devComm; - comm->argsptrs[1] = &comm->args; comm->collNetSupport = 0; - NCCLCHECK(ncclCalloc(&comm->asyncOps, NCCL_MAX_OPS)); - comm->asyncOpCount = 0; - comm->asyncTotalSize = 0; + ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan); + ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp); + ncclMemoryPoolConstruct(&comm->memPool_ncclPointerList); + + comm->groupNext = reinterpret_cast(0x1); + comm->preconnectNext = reinterpret_cast(0x1); comm->channelSize = ncclParamAggChannelSize(); - comm->asyncAllocMode = ncclComm::SHORTEST_QUEUE; - char* str = getenv("NCCL_AGG_ALLOC_MODE"); - if (str) INFO(NCCL_ENV, "NCCL_AGG_ALLOC_MODE set by environment to %s", str); - if (str && strcmp(str, "ROUND_ROBIN") == 0) { - comm->asyncAllocMode = ncclComm::ROUND_ROBIN; - } - - CUDACHECK(cudaDriverGetVersion(&comm->driverVersion)); - - NCCLCHECK(ncclCreateQueueInfo(&comm->enqueueInfo, comm)); - comm->lastSetupNode = NULL; - comm->lastCudaGraphId = -1; - comm->disableGraphHelper = ncclParamDisableGraphHelper(); - comm->graphRegister = ncclParamGraphRegister(); -#if CUDART_VERSION >= 11030 - NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1)); - comm->graphHelperResources->comm = comm; - if (comm->driverVersion >= 11030) - // cudaGetDriverEntryPoint requires R465 or above (enhanced compat need) - CUDACHECK(cudaGetDriverEntryPoint("cuMemGetAddressRange", (void**)&comm->pfnCuMemGetAddressRange, cudaEnableDefault)); -#endif static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels"); static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels"); NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks)); NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks)); - comm->p2pSendCount = comm->p2pRecvCount = 0; - NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks)); - NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks)); - // Mark channels as non initialized. 
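// ----------------------------------------------------------------------------
// Illustrative sketch, not part of this diff: the essence of the dmaBufSupported()
// probe above, reduced to a standalone helper. It asks the CUDA driver whether the
// device can export DMA-BUF handles; NCCL's version additionally goes through its
// CUPFN/CUCHECK wrappers and requires the network plugin to expose regMrDmaBuf.
// Assumes CUDA 11.7+ headers; on older toolkits it simply reports "unsupported".
#include <cuda.h>
static bool deviceSupportsDmaBuf(int ordinal) {
#if CUDA_VERSION >= 11070
  int flag = 0, driverVersion = 0;
  CUdevice dev;
  if (cuInit(0) != CUDA_SUCCESS) return false;
  if (cuDriverGetVersion(&driverVersion) != CUDA_SUCCESS || driverVersion < 11070) return false;
  if (cuDeviceGet(&dev, ordinal) != CUDA_SUCCESS) return false;
  if (cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev) != CUDA_SUCCESS) return false;
  return flag != 0;
#else
  (void)ordinal;
  return false;
#endif
}
// ----------------------------------------------------------------------------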
- for (int c=0; cchannels[c].id = -1; + for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1; + + ncclIntruQueueMpscConstruct(&comm->callbackQueue); *comret = comm; return ncclSuccess; } static ncclResult_t devCommSetup(ncclComm_t comm) { - ncclDevCommAndChannels *devCommAndChans; - NCCLCHECK(ncclCudaCalloc(&devCommAndChans, 1)); + NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream)); + + int nRanks = comm->nRanks; + struct ncclDevCommAndChannels *devCommAndChans, tmpCommAndChans; + NCCLCHECK(ncclCudaCallocAsync(&devCommAndChans, 1, comm->deviceStream.stream)); + ncclCommPushCudaFree(comm, devCommAndChans); comm->devComm = &devCommAndChans->comm; - comm->hostDevComm.channels = devCommAndChans->channels; + tmpCommAndChans.comm.rank = comm->rank; + tmpCommAndChans.comm.nRanks = nRanks; + tmpCommAndChans.comm.abortFlag = comm->abortFlag; + for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) { + tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p]; + } + tmpCommAndChans.comm.channels = &devCommAndChans->channels[0]; - // Duplicate the channels on the device - int nChannels = std::max(comm->nChannels, comm->p2pnChannels); - NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, nChannels)); + comm->workFifoDepth = ncclParamWorkFifoDepth(); + if (0 != (comm->workFifoDepth & (comm->workFifoDepth-1))) { + WARN("NCCL_WORK_FIFO_DEPTH=%d is being ignored because it is not a power of 2.", comm->workFifoDepth); + comm->workFifoDepth = 64<<10; + } + tmpCommAndChans.comm.workFifoDepth = comm->workFifoDepth; - // Copy userRanks and peers - for (int r=0; rnChannels; r++) { - NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); + if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { + // The workFifoHeap lives in GDR mapped CUDA memory. + NCCLCHECK(ncclGdrCudaCalloc(&comm->workFifoHeap, &comm->devWorkFifoHeap, comm->workFifoDepth, &comm->workFifoHeapGdrHandle)); + ncclCommPushCudaGdrFree(comm, comm->workFifoHeapGdrHandle); + } else { + // The workFifoHeap lives in cudaHost memory. 
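// ----------------------------------------------------------------------------
// Illustrative aside, not part of this diff: the NCCL_WORK_FIFO_DEPTH validation
// above relies on the classic bit trick that x & (x-1) clears the lowest set bit,
// so the expression is zero exactly when x has a single set bit, i.e. when x is a
// power of two (for x > 0). The default depth 64<<10 = 65536 passes the test.
#include <cstdint>
static inline bool isPowerOfTwo(uint32_t x) {
  return x != 0 && (x & (x - 1)) == 0;   // 65536 -> true, 3<<10 = 3072 -> false
}
// ----------------------------------------------------------------------------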
+ comm->workFifoHeapGdrHandle = nullptr; + NCCLCHECK(ncclCudaHostCalloc(&comm->workFifoHeap, comm->workFifoDepth)); + ncclCommPushCudaHostFree(comm, comm->workFifoHeap); + comm->devWorkFifoHeap = comm->workFifoHeap; + } + tmpCommAndChans.comm.workFifoHeap = comm->devWorkFifoHeap; + + NCCLCHECK(ncclCudaHostCalloc(&comm->workFifoDone, MAXCHANNELS)); + ncclCommPushCudaHostFree(comm, comm->workFifoDone); + comm->workFifoSent = 0; + comm->workFifoAckdMin = 0; + + for (int c=0; c < MAXCHANNELS; c++) { + tmpCommAndChans.channels[c].peers = comm->channels[c].devPeers; + tmpCommAndChans.channels[c].ring = comm->channels[c].ring; + tmpCommAndChans.channels[c].ring.userRanks = comm->channels[c].devRingUserRanks; + tmpCommAndChans.channels[c].tree = comm->channels[c].tree; + tmpCommAndChans.channels[c].collTree = comm->channels[c].collTree; + tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c]; + + if (comm->channels[c].ring.userRanks != nullptr) { + NCCLCHECK(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->deviceStream.stream)); + } } - // Duplicate the dev comm on the device - NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1)); + NCCLCHECK(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->deviceStream.stream)); + CUDACHECK(cudaStreamSynchronize(comm->deviceStream.stream)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNull(), &comm->deviceStream)); return ncclSuccess; } @@ -319,7 +439,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->busId = comm->busId; - NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport)); + NCCLCHECK(ncclGpuGdrSupport(comm, &info->gdrSupport)); info->comm = comm; info->cudaCompCap = ncclCudaCompCap(); return ncclSuccess; @@ -343,84 +463,6 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, return ncclSuccess; } -void* waitForNonNullPtr(void* p) { - volatile void** ptr = (volatile void**) p; - while (*ptr == NULL) sched_yield(); - return (void*)*ptr; -} - -ncclResult_t initParams(struct ncclComm* comm) { - struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank; - params->args = comm->argsptrs; - params->stream = NULL; - params->sharedMem = 0; - params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1; - params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1; - return ncclSuccess; -} - -// Allocate/Set Intra Process Structures and set CG options -ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) { - comm->intraRank = rank; - comm->intraRanks = ranks; - comm->intraPhase = 0; - - // Alloc shared structures - if (rank == 0) { - assert(comm == comm0); - int* bar; - NCCLCHECK(ncclCalloc(&bar, 2)); - bar[0] = bar[1] = 0; - comm->intraBarrier = bar; - NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks)); - NCCLCHECK(ncclCalloc(&comm->intraThreads, comm->intraRanks)); - NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks)); - int* CGMode; - NCCLCHECK(ncclCalloc(&CGMode, 1)); - *CGMode = 0x11; - comm->intraCGMode = CGMode; - int* CC; - NCCLCHECK(ncclCalloc(&CC, 1)); - *CC = ncclCudaCompCap(); - comm->intraCC = CC; - } else { - comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier); - comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams); - comm->intraThreads = (pthread_t*)waitForNonNullPtr(&comm0->intraThreads); - comm->intraCudaDevs = 
(int*)waitForNonNullPtr(&comm0->intraCudaDevs); - comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode); - comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC); - } - comm->intraCudaDevs[comm->intraRank] = comm->cudaDev; - comm->intraThreads[comm->intraRank] = comm->proxyState.thread; - NCCLCHECK(initParams(comm)); - - int cgMdLaunch = 0; - - // Set CG Mode - comm->launchMode = ncclComm::PARALLEL; - char* str = getenv("NCCL_LAUNCH_MODE"); - if (str) INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", str); - if (str && strcmp(str, "GROUP") == 0) { - comm->launchMode = ncclComm::GROUP; - } - if (comm->launchMode == ncclComm::GROUP) { - CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking)); -#if CUDART_VERSION >= 9000 - if (*comm->intraCC && (ncclCudaCompCap() == *comm->intraCC)) { - // Check whether the GPU supports Cooperative Group Multi Device Launch - (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev); - } -#endif - } - - // Disable cgMdLaunch if any rank does not support it - if (cgMdLaunch == 0) { - *comm->intraCGMode = 0x10; - } - return ncclSuccess; -} - #define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine)) #define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t)) #define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */ @@ -439,7 +481,7 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM; for (int p=0; pbuffSizes[p] = comm->hostDevComm.buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p]; + comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p]; } return ncclSuccess; } @@ -476,11 +518,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Topo detection / System graph creation NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo)); // Compute paths between GPUs and NICs - NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); + NCCLCHECK(ncclTopoComputePaths(comm->topo, comm)); // Remove inaccessible GPUs and unused NICs NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm)); // Recompute paths after trimming - NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); + NCCLCHECK(ncclTopoComputePaths(comm->topo, comm)); // Init search NCCLCHECK(ncclTopoSearchInit(comm->topo)); // Print final topology @@ -532,7 +574,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm } // Determine local CollNet support before all-gather - if (collNetSupport()) { + if (collNetSupport(comm)) { char *collNetEnable = getenv("NCCL_COLLNET_ENABLE"); if (collNetEnable != NULL) { INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); @@ -564,6 +606,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm } *allGather3Data; NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); + NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev)); allGather3Data[rank].tree.pattern = treeGraph.pattern; allGather3Data[rank].tree.nChannels = treeGraph.nChannels; @@ -725,7 +768,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm struct ncclChannel* channel = comm->channels+c; NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore); if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, 
&channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore); free(rings); @@ -735,8 +778,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore); - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore); INFO(NCCL_INIT, "Connected all trees"); @@ -773,12 +816,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int highestTransportType0, highestTransportType1; for (int c=0; cnChannels; c++) { struct ncclChannel* channelRecv = comm->channels+c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup); for (int c=0; cnChannels; c++) { struct ncclChannel* channelSend = comm->channels+c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup); @@ -816,6 +859,52 @@ collnet_cleanup: // Compute nChannels per peer for p2p NCCLCHECK(ncclTopoComputeP2pChannels(comm)); + do { // Setup p2p structures in comm->tasks + struct ncclTasks* tasks = &comm->tasks; + int nRanks = comm->nRanks; + int node = comm->node; + int nNodes = comm->nNodes; + struct ncclNodeRanks *nodeRanks = comm->nodeRanks; + int localRank = comm->localRank; + tasks->peers = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + tasks->p2pSendOrder = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + tasks->p2pRecvOrder = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + int s=0, r=0; + // schedule delta 0, +1, -1, +2, -2, ... + // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even. 
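// ----------------------------------------------------------------------------
// Illustrative sketch, not part of this diff: one straightforward way to realize
// the commented order 0, +1, -1, +2, -2, ... with each node delta emitted exactly
// once (0 only once, and +n/2 == -n/2 only once when n is even). The loop below
// covers the same set of deltas but batches four candidates per iteration and
// interleaves the send/recv rank expansion; deltaOrder() is an invented name.
#include <vector>
static std::vector<int> deltaOrder(int nNodes) {
  std::vector<int> out;
  out.push_back(0);                                  // delta 0, exactly once
  for (int d = 1; d <= nNodes / 2; d++) {
    out.push_back(d);                                // +d
    if (d != nNodes - d) out.push_back(nNodes - d);  // -d, skipped when it equals +d
  }
  return out;                                        // nNodes entries in total
}
// ----------------------------------------------------------------------------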
+ for (int d=0; d <= nNodes/4; d++) { + int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes }; + int index = 0; + int delta = deltas[index]; + sched_delta: + int recvNode = (node+nNodes-delta)%nNodes; + int sendNode = (node+delta)%nNodes; + int steps = comm->maxLocalRanks; + for (int step=0; step < steps; step++) { + int recvIndex = (localRank-step+steps)%steps; + if (recvIndex < nodeRanks[recvNode].localRanks) { + tasks->p2pRecvOrder[r] = nodeRanks[recvNode].localRankToRank[recvIndex]; + r++; + } + int sendIndex = (localRank+step)%steps; + if (sendIndex < nodeRanks[sendNode].localRanks) { + tasks->p2pSendOrder[s] = nodeRanks[sendNode].localRankToRank[sendIndex]; + s++; + } + } + index++; + if (index == 1 && deltas[1] == deltas[0]) index++; + if (index == 2 && deltas[2] == deltas[0]) index++; + if (index == 3 && deltas[3] == deltas[2]) index++; + if (index == 3 && deltas[3] == deltas[1]) index++; + if (index < 4) { + delta = deltas[index]; + goto sched_delta; + } + } + assert(s == nRanks && r == nRanks); + } while (0); + if (ncclParamNvbPreconnect()) { // Connect p2p when using NVB path int nvbNpeers; @@ -847,7 +936,7 @@ collnet_cleanup: NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0)); // Then to remote ones when using PXN - if (ncclPxnDisable() == 0) { + if (ncclPxnDisable(comm) == 0) { int nranks; int* pxnPeers; NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks)); @@ -868,6 +957,10 @@ collnet_cleanup: if (intraProcRanks == 0) intraProcRank0 = i; if (i == rank) intraProcRank = intraProcRanks; intraProcRanks++; + if (intraProcRank0 == rank && rank != i) { + comm->peerInfo[i].comm->intraNext = comm->intraNext; + comm->intraNext = comm->peerInfo[i].comm; + } } } TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", @@ -878,9 +971,33 @@ collnet_cleanup: intraProcRank, intraProcRanks, intraProcRank0); return ncclInternalError; } - NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm)); + struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm; + assert(intraProcRank==0 ? comm==comm0 : true); + comm->intraComm0 = comm0; + comm->intraRefs = intraProcRank==0 ? intraProcRanks : 0; + comm->intraRank = intraProcRank; + comm->intraRanks = intraProcRanks; + comm->intraBarrierPhase = 0; + comm->intraBarrierCounter = 0; + comm->intraBarrierGate = 0; } while(0); + if (comm->intraRank == 0) { // Load ncclParamLaunchMode + char* str = getenv("NCCL_LAUNCH_MODE"); + enum ncclLaunchMode mode, modeOld; + if (str && strcasecmp(str, "GROUP") == 0) { + mode = ncclLaunchModeGroup; + } else { + mode = ncclLaunchModeParallel; + } + // In theory we could be racing with other communicators not associated with + // this one if the user is connecting to multiple ncclUniqueId's concurrently. + modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED); + if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') { + INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? 
"PARALLEL" : "GROUP"); + } + } + /* Local intra-node barrier */ NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); @@ -899,8 +1016,22 @@ affinity_restore: NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0); -ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { - ncclResult_t res; +struct ncclCommInitRankAsyncJob { + struct ncclAsyncJob base; + ncclComm_t* newcomm; + int nranks, myrank; + ncclUniqueId commId; + int cudaDev; +}; + +static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { + struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; + ncclComm_t* newcomm = job->newcomm; + int nranks = job->nranks; + ncclUniqueId commId = job->commId; // C++ struct assignment + int myrank = job->myrank; + int cudaDev = job->cudaDev; + ncclResult_t res = ncclSuccess; CUDACHECK(cudaSetDevice(cudaDev)); // Set the maximum kernel stack size of all kernels to avoid @@ -915,7 +1046,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId); - + TRACE_CALL("ncclCommInitRank(%p,%d,0x%llx,%d,%d)", *newcomm, nranks, (unsigned long long)hashUniqueId(commId), myrank, (*newcomm)->cudaDev); return ncclSuccess; cleanup: if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap); @@ -923,6 +1054,12 @@ cleanup: return res; } +static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) { + struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; + ncclCommDestroy(*job->newcomm); + *job->newcomm = nullptr; +} + static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { ncclResult_t res; char* env = getenv("NCCL_COMM_ID"); @@ -944,20 +1081,26 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni goto end; } - if (ncclAsyncMode()) { - NCCLCHECKGOTO(ncclAsyncInit(ncclCommInitRankSync, newcomm, nranks, commId, myrank, cudaDev), res, end); - } else { - NCCLCHECKGOTO(ncclCommInitRankSync(newcomm, nranks, commId, myrank, cudaDev), res, end); - } + struct ncclCommInitRankAsyncJob *job; + NCCLCHECKGOTO(ncclCalloc(&job, 1), res, end); + job->newcomm = newcomm; + job->nranks = nranks; + job->commId = commId; // C++ struct assignment + job->myrank = myrank; + job->cudaDev = cudaDev; + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, ncclCommInitRankUndo, free), res, end); end: - if (ncclAsyncMode()) return ncclAsyncErrCheck(res); - else return res; + return ncclGroupErrCheck(res); } NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { NVTX3_FUNC_RANGE_IN(nccl_domain); + + // Load the CUDA driver and dlsym hooks (can fail on old drivers) + (void) cudaLibraryInit(); + int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev)); @@ -967,6 +1110,10 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const 
int* devlist) { NVTX3_FUNC_RANGE_IN(nccl_domain); + + // Load the CUDA driver and dlsym hooks (can fail on old drivers) + (void) cudaLibraryInit(); + NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms")); if (ndev < 0) { WARN("Invalid device count requested : %d", ndev); @@ -984,22 +1131,6 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { return ncclSuccess; } -static ncclResult_t ncclGraphHelperDestroy(ncclComm* comm) { - auto res = comm->graphHelperResources; - if (comm->graphHelperThread && res) { - pthread_mutex_lock(&res->threadLock); - res->threadState = ThreadStop; - pthread_cond_signal(&res->threadCond); - pthread_mutex_unlock(&res->threadLock); - pthread_join(comm->graphHelperThread, NULL); - } - if (res) { - free(res); - res = NULL; - } - return ncclSuccess; -} - static ncclResult_t commDestroy(ncclComm_t comm) { // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) { @@ -1017,13 +1148,9 @@ static ncclResult_t commDestroy(ncclComm_t comm) { TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, *comm->abortFlag, comm->fatalError); - CUDACHECK(cudaStreamSynchronize(comm->groupStream)); - - ncclDestroyQueueInfo(comm->enqueueInfo); -#if CUDART_VERSION >= 11030 - NCCLCHECK(ncclGraphHelperDestroy(comm)); -#endif - INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed); + NCCLCHECK(ncclStrongStreamSynchronize(&comm->hostStream)); + NCCLCHECK(ncclStrongStreamSynchronize(&comm->deviceStream)); + NCCLCHECK(ncclCommPollCallbacks(comm)); NCCLCHECK(commFree(comm)); @@ -1075,10 +1202,19 @@ const char* ncclGetErrorString(ncclResult_t code) { case ncclInternalError : return "internal error"; case ncclInvalidArgument : return "invalid argument"; case ncclInvalidUsage : return "invalid usage"; + case ncclRemoteError : return "remote process exited or there was a network error"; default : return "unknown result code"; } } +/* Returns a human-readable message of the last error that occurred. + * comm is currently unused and can be set to NULL + */ +NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm); +const char* ncclGetLastError(ncclComm_t comm) { + return ncclLastError; +} + NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm")); diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc index 5406bf0..994d1fd 100644 --- a/src/misc/argcheck.cc +++ b/src/misc/argcheck.cc @@ -44,12 +44,7 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) { return ncclInvalidArgument; } // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars. 
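// ----------------------------------------------------------------------------
// Illustrative aside, not part of this diff: the byte-count rule that the removed
// lines just below implemented, and that the new ncclInfoSetDerived() call is
// presumably meant to centralize. collectiveBytes() is an invented name; only the
// per-rank-count rule for AllGather/ReduceScatter is shown.
#include <cstddef>
static size_t collectiveBytes(size_t count, size_t typeSize, bool countIsPerRank, int nRanks) {
  size_t bytes = count * typeSize;     // one rank's contribution
  if (countIsPerRank) bytes *= nRanks; // AllGather / ReduceScatter pass a per-rank count
  return bytes;
}
// ----------------------------------------------------------------------------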
- info->nBytes = info->count * ncclTypeSize(info->datatype); - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { - info->count = info->nBytes; - info->datatype = ncclInt8; - } - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank + NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks)); if (info->op < 0 || ncclMaxRedOp < info->op) { WARN("%s : invalid reduction operation %d", info->opName, info->op); diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc new file mode 100644 index 0000000..43c95c2 --- /dev/null +++ b/src/misc/cudawrap.cc @@ -0,0 +1,163 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl.h" +#include "debug.h" +#include "cudawrap.h" + +#include + +#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr + +#if CUDART_VERSION >= 11030 +/* CUDA Driver functions loaded with cuGetProcAddress for versioning */ +DECLARE_CUDA_PFN(cuDeviceGet); +DECLARE_CUDA_PFN(cuDeviceGetAttribute); +DECLARE_CUDA_PFN(cuGetErrorString); +DECLARE_CUDA_PFN(cuGetErrorName); +/* enqueue.cc */ +DECLARE_CUDA_PFN(cuMemGetAddressRange); +/* proxy.cc */ +DECLARE_CUDA_PFN(cuCtxCreate_v3020); +DECLARE_CUDA_PFN(cuCtxDestroy); +DECLARE_CUDA_PFN(cuCtxSetCurrent); +#if CUDA_VERSION >= 11070 +/* transport/collNet.cc/net.cc*/ +DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support +#endif +#endif + +/* CUDA Driver functions loaded with dlsym() */ +DECLARE_CUDA_PFN(cuInit); +DECLARE_CUDA_PFN(cuDriverGetVersion); +DECLARE_CUDA_PFN(cuGetProcAddress); + +static enum { cudaUninitialized, cudaInitializing, cudaInitialized, cudaError } cudaState = cudaUninitialized; + +#define CUDA_DRIVER_MIN_VERSION 11030 + +static void *cudaLib; +static int cudaDriverVersion; + +#if CUDART_VERSION >= 11030 +/* + Load the CUDA symbols + */ +static int cudaPfnFuncLoader(void) { + CUresult res; + +#define LOAD_SYM(symbol, ignore) do { \ + res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), cudaDriverVersion, 0); \ + if (res != 0) { \ + if (!ignore) { \ + WARN("Retrieve %s version %d failed with %d", #symbol, cudaDriverVersion, res); \ + return ncclSystemError; } \ + } } while(0) + + LOAD_SYM(cuGetErrorString, 0); + LOAD_SYM(cuGetErrorName, 0); + LOAD_SYM(cuDeviceGet, 0); + LOAD_SYM(cuDeviceGetAttribute, 0); + LOAD_SYM(cuMemGetAddressRange, 1); + LOAD_SYM(cuCtxCreate_v3020, 1); + LOAD_SYM(cuCtxDestroy, 1); + LOAD_SYM(cuCtxSetCurrent, 1); +#if CUDA_VERSION >= 11070 + LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support +#endif + return ncclSuccess; +} +#endif + +ncclResult_t cudaLibraryInit(void) { + CUresult res; + + if (cudaState == cudaInitialized) + return ncclSuccess; + if (cudaState == cudaError) + return ncclSystemError; + + if (__sync_bool_compare_and_swap(&cudaState, cudaUninitialized, cudaInitializing) == false) { + // Another thread raced in front of us. Wait for it to be done. + while (cudaState == cudaInitializing) sched_yield(); + return (cudaState == cudaInitialized) ? 
ncclSuccess : ncclSystemError; + } + + /* + * Load CUDA driver library + */ + char path[1024]; + char *ncclCudaPath = getenv("NCCL_CUDA_PATH"); + if (ncclCudaPath == NULL) + snprintf(path, 1024, "%s", "libcuda.so"); + else + snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so"); + + cudaLib = dlopen(path, RTLD_LAZY); + if (cudaLib == NULL) { + WARN("Failed to find CUDA library in %s (NCCL_CUDA_PATH=%s)", ncclCudaPath, ncclCudaPath); + goto error; + } + + /* + * Load initial CUDA functions + */ + + pfn_cuInit = (PFN_cuInit) dlsym(cudaLib, "cuInit"); + if (pfn_cuInit == NULL) { + WARN("Failed to load CUDA missing symbol cuInit"); + goto error; + } + + pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion) dlsym(cudaLib, "cuDriverGetVersion"); + if (pfn_cuDriverGetVersion == NULL) { + WARN("Failed to load CUDA missing symbol cuDriverGetVersion"); + goto error; + } + + res = pfn_cuDriverGetVersion(&cudaDriverVersion); + if (res != 0) { + WARN("cuDriverGetVersion failed with %d", res); + goto error; + } + + INFO(NCCL_INIT, "cudaDriverVersion %d", cudaDriverVersion); + + if (cudaDriverVersion < CUDA_DRIVER_MIN_VERSION) { + // WARN("CUDA Driver version found is %d. Minimum requirement is %d", cudaDriverVersion, CUDA_DRIVER_MIN_VERSION); + // Silently ignore version check mismatch for backwards compatibility + goto error; + } + + pfn_cuGetProcAddress = (PFN_cuGetProcAddress) dlsym(cudaLib, "cuGetProcAddress"); + if (pfn_cuGetProcAddress == NULL) { + WARN("Failed to load CUDA missing symbol cuGetProcAddress"); + goto error; + } + + /* + * Required to initialize the CUDA Driver. + * Multiple calls of cuInit() will return immediately + * without making any relevant change + */ + pfn_cuInit(0); + +#if CUDART_VERSION >= 11030 + if (cudaPfnFuncLoader()) { + WARN("CUDA some PFN functions not found in the library"); + goto error; + } +#endif + + cudaState = cudaInitialized; + return ncclSuccess; + +error: + cudaState = cudaError; + return ncclSystemError; +} + + diff --git a/src/misc/gdrwrap.cc b/src/misc/gdrwrap.cc index ed0c697..e81c7ea 100644 --- a/src/misc/gdrwrap.cc +++ b/src/misc/gdrwrap.cc @@ -57,7 +57,7 @@ ncclResult_t wrap_gdr_symbols(void) { if (__sync_bool_compare_and_swap(&gdrState, gdrUninitialized, gdrInitializing) == false) { // Another thread raced in front of us. Wait for it to be done. - while (gdrState == gdrInitializing) pthread_yield(); + while (gdrState == gdrInitializing) sched_yield(); return (gdrState == gdrInitialized) ? 
ncclSuccess : ncclSystemError; } diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index e1aabac..3b8daac 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -30,6 +30,8 @@ struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); +/* DMA-BUF support */ +struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); @@ -49,7 +51,7 @@ ncclResult_t wrap_ibv_symbols(void) { if (__sync_bool_compare_and_swap(&ibvState, ibvUninitialized, ibvInitializing) == false) { // Another thread raced in front of us. Wait for it to be done. - while (ibvState == ibvInitializing) pthread_yield(); + while (ibvState == ibvInitializing) sched_yield(); return (ibvState == ibvInitialized) ? ncclSuccess : ncclSystemError; } @@ -98,6 +100,8 @@ ncclResult_t wrap_ibv_symbols(void) { LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr); // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8 LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8"); + // Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12 + LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12"); LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr); LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq); LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq); @@ -126,6 +130,7 @@ teardown: ibv_internal_dealloc_pd = NULL; ibv_internal_reg_mr = NULL; ibv_internal_reg_mr_iova2 = NULL; + ibv_internal_reg_dmabuf_mr = NULL; ibv_internal_dereg_mr = NULL; ibv_internal_create_cq = NULL; ibv_internal_destroy_cq = NULL; @@ -259,7 +264,7 @@ ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or } ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) { - IBV_PTR_CHECK(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr"); + IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr"); } struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { @@ -275,7 +280,19 @@ ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void return ncclInternalError; } if (ret == NULL) { return ncclSuccess; } // Assume dummy call - IBV_PTR_CHECK(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2"); + IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2"); +} + +/* DMA-BUF support */ +ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { + IBV_PTR_CHECK_ERRNO(ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr"); +} + +struct ibv_mr * 
wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { + if (ibv_internal_reg_dmabuf_mr == NULL) { + return NULL; + } + return ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access); } ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ diff --git a/src/misc/socket.cc b/src/misc/socket.cc index ef2bea6..16049fa 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -332,9 +332,10 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) { #endif } - /* make all new sockets non-blocking */ - EQCHECK(flags = fcntl(fd, F_GETFL), -1); - SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + if (sock->asyncFlag) { + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + } // addr port should be 0 (Any port) SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind"); @@ -373,7 +374,7 @@ static ncclResult_t getFdState(int fd, enum ncclSocketState* state) { SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt"); } - if (ret == EINPROGRESS) + if (ret == EINPROGRESS || ret == ECONNREFUSED) *state = ncclSocketConnecting; else if (ret == 0) *state = ncclSocketConnected; @@ -409,10 +410,12 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { const int one = 1; SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); - + /* support non-blocking socket; by default, the socket is non-blocking */ - EQCHECK(flags = fcntl(fd, F_GETFL), -1); - SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + if (sock->asyncFlag) { + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + } /* const int bufsize = 128*1024; SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt"); @@ -424,31 +427,26 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { int timedout_retries = 0; int refused_retries = 0; retry: - /* async connect; abort when error happens and abortFlag is present. */ + /* blocking/non-blocking connect() is determined by asyncFlag. */ ret = connect(fd, &sock->addr.sa, salen); - if (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || - (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { - if (refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); + if (!sock->asyncFlag && (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || + (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES))) { + if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); usleep(SLEEP_INT); goto retry; - } else if (errno == EINPROGRESS && !sock->asyncFlag) { - enum ncclSocketState state; - do { - if (sock->abortFlag) NEQCHECK(*sock->abortFlag, 0); - NCCLCHECK(getFdState(fd, &state)); - } while (state == ncclSocketConnecting); - EQCHECK(state, ncclSocketError); - ret = 0; } - if (ret == 0 || (errno == EINPROGRESS && sock->asyncFlag)) { + /* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again. + * However, it can return EISCONN instead of success which indicates connection is built up in + * background already. No need to call connect() again. 
*/ + if (ret == 0 || ((errno == EINPROGRESS || errno == ECONNREFUSED) && sock->asyncFlag) || errno == EISCONN) { sock->fd = fd; return ncclSuccess; } WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); - return ncclSystemError; + return ncclRemoteError; } ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) { @@ -501,7 +499,7 @@ static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* if (bytes == -1) { if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { WARN("Net : Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); - return ncclSystemError; + return ncclRemoteError; } else { bytes = 0; } @@ -521,7 +519,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int if (closed) { char line[SOCKET_NAME_MAXLEN+1]; WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); - return ncclSystemError; + return ncclRemoteError; } return ncclSuccess; } diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc new file mode 100644 index 0000000..4933799 --- /dev/null +++ b/src/misc/strongstream.cc @@ -0,0 +1,272 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "strongstream.h" +#include "checks.h" +#include "param.h" + +//////////////////////////////////////////////////////////////////////////////// + +ncclResult_t ncclCudaGetCapturingGraph( + struct ncclCudaGraph* graph, cudaStream_t stream + ) { + #if CUDART_VERSION >= 11030 + thread_local int driver = -1; + if (driver == -1) { + CUDACHECK(cudaDriverGetVersion(&driver)); + } + if (driver < 11030) { + cudaStreamCaptureStatus status; + unsigned long long gid; + graph->graph = nullptr; + CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid)); + if (status != cudaStreamCaptureStatusNone) { + WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support"); + return ncclInvalidUsage; + } + } else { + cudaStreamCaptureStatus status; + unsigned long long gid; + CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr)); + if (status != cudaStreamCaptureStatusActive) { + graph->graph = nullptr; + gid = ULLONG_MAX; + } + graph->graphId = gid; + } + #endif + return ncclSuccess; +} + +ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg) { + #if CUDART_VERSION >= 11030 + cudaUserObject_t object; + CUDACHECK(cudaUserObjectCreate( + &object, arg, fn, /*initialRefcount=*/1, cudaUserObjectNoDestructorSync + )); + // Hand over ownership to CUDA Graph + CUDACHECK(cudaGraphRetainUserObject(graph.graph, object, 1, cudaGraphUserObjectMove)); + return ncclSuccess; + #else + return ncclInvalidUsage; + #endif +} + +//////////////////////////////////////////////////////////////////////////////// + +ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) { + CUDACHECK(cudaStreamCreateWithFlags(&ss->stream, cudaStreamNonBlocking)); + CUDACHECK(cudaEventCreateWithFlags(&ss->event, cudaEventDisableTiming)); + #if CUDART_VERSION >= 11030 + ss->node = nullptr; + ss->graphId = (1ull<<(8*sizeof(long long)-1))-1; + ss->eventIsLagging = 0; + #endif + return ncclSuccess; +} + +ncclResult_t 
ncclStrongStreamDestruct(struct ncclStrongStream* ss) { + #if CUDART_VERSION >= 11030 + CUDACHECK(cudaEventDestroy(ss->event)); + #endif + CUDACHECK(cudaStreamDestroy(ss->stream)); + return ncclSuccess; +} + +NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1) + +ncclResult_t ncclStrongStreamAcquire( + struct ncclCudaGraph graph, struct ncclStrongStream* ss + ) { + #if CUDART_VERSION >= 11030 + bool mixing = ncclParamGraphMixingSupport(); + if (graph.graph == nullptr) { + if (mixing && ncclStrongStreamEverCaptured(ss)) { + CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0)); + ss->eventIsLagging = 0; + } + } else { + if (ss->graphId != graph.graphId) { + if (mixing && ss->eventIsLagging) { + // Can only be here if previous release was for uncaptured work that + // elided updating the event because no capture had yet occurred. + CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0)); + CUDACHECK(cudaEventRecord(ss->event, ss->stream)); + } + ss->graphId = graph.graphId; + ss->eventIsLagging = 0; + if (mixing) { + CUDACHECK(cudaGraphAddEventWaitNode(&ss->node, graph.graph, nullptr, 0, ss->event)); + } else { + CUDACHECK(cudaGraphAddEmptyNode(&ss->node, graph.graph, nullptr, 0)); + } + } + } + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) { + #if CUDART_VERSION >= 11030 + bool mixing = ncclParamGraphMixingSupport(); + if (mixing && ncclStrongStreamEverCaptured(ss)) { + CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0)); + } + ss->eventIsLagging = 1; // Assume the caller is going to add work to stream. + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) { + #if CUDART_VERSION >= 11030 + bool mixing = ncclParamGraphMixingSupport(); + if (mixing && ss->eventIsLagging) { + if (graph.graph == nullptr) { + if (ncclStrongStreamEverCaptured(ss)) { + CUDACHECK(cudaEventRecord(ss->event, ss->stream)); + ss->eventIsLagging = 0; + } + } else { + CUDACHECK(cudaGraphAddEventRecordNode(&ss->node, graph.graph, &ss->node, 1, ss->event)); + ss->eventIsLagging = 0; + } + } + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamLaunchHost( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg + ) { + #if CUDART_VERSION >= 11030 + if (graph.graph == nullptr) { + CUDACHECK(cudaLaunchHostFunc(ss->stream, fn, arg)); + } else { + cudaHostNodeParams p; + p.fn = fn; + p.userData = arg; + CUDACHECK(cudaGraphAddHostNode(&ss->node, graph.graph, &ss->node, 1, &p)); + } + ss->eventIsLagging = 1; + #else + CUDACHECK(cudaLaunchHostFunc(ss->stream, fn, arg)); + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamLaunchKernel( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, + void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes + ) { + #if CUDART_VERSION >= 11030 + if (graph.graph == nullptr) { + CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->stream)); + } else { + cudaGraphNode_t tip = ss->node; + cudaKernelNodeParams p; + p.func = fn; + p.gridDim = grid; + p.blockDim = block; + p.kernelParams = args; + p.sharedMemBytes = sharedMemBytes; + p.extra = nullptr; + CUDACHECK(cudaGraphAddKernelNode(&ss->node, graph.graph, &tip, 1, &p)); + } + ss->eventIsLagging = 1; + #else + CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->stream)); + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, struct 
ncclStrongStream* a, struct ncclStrongStream* b + ) { + #if CUDART_VERSION >= 11030 + if (graph.graph == nullptr) { + if (b->eventIsLagging) { + b->eventIsLagging = 0; + CUDACHECK(cudaEventRecord(b->event, b->stream)); + } + CUDACHECK(cudaStreamWaitEvent(a->stream, b->event, 0)); + a->eventIsLagging = 1; + } else { + cudaGraphNode_t pair[2] = {a->node, b->node}; + CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, pair, 2)); + } + #else + CUDACHECK(cudaEventRecord(b->event, b->stream)); + CUDACHECK(cudaStreamWaitEvent(a->stream, b->event, 0)); + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b + ) { + #if CUDART_VERSION >= 11030 + if (graph.graph == nullptr) { + CUDACHECK(cudaEventRecord(a->event, b)); + CUDACHECK(cudaStreamWaitEvent(a->stream, a->event, 0)); + // We used a->event to record b so it no longer reflects anything about a. + a->eventIsLagging = 1; + } else { + cudaStreamCaptureStatus status; + unsigned long long gid1; + cudaGraphNode_t const* deps; + size_t depN = 0; + CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &status, &gid1, nullptr, &deps, &depN)); + if (status != cudaStreamCaptureStatusActive || graph.graphId != gid1) { + WARN("Stream is not being captured by the expected graph."); + return ncclInvalidUsage; + } + if (depN > 0 && (depN > 1 || deps[0] != a->node)) { + cudaGraphNode_t tie; + if (depN == 1) { + tie = deps[0]; + } else { + CUDACHECK(cudaGraphAddEmptyNode(&tie, graph.graph, deps, depN)); + } + cudaGraphNode_t pair[2] = {a->node, tie}; + CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, pair, 2)); + } + // a->eventIsLagging doesn't change since we are just updating the + // dependencies of a->node. + } + #else + CUDACHECK(cudaEventRecord(a->event, b)); + CUDACHECK(cudaStreamWaitEvent(a->stream, a->event, 0)); + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b + ) { + #if CUDART_VERSION >= 11030 + if (graph.graph == nullptr) { + if (b->eventIsLagging) { + b->eventIsLagging = 0; + CUDACHECK(cudaEventRecord(b->event, b->stream)); + } + CUDACHECK(cudaStreamWaitEvent(a, b->event, 0)); + } else { + CUDACHECK(cudaStreamUpdateCaptureDependencies(a, &b->node, 1, cudaStreamAddCaptureDependencies)); + } + #else + CUDACHECK(cudaEventRecord(b->event, b->stream)); + CUDACHECK(cudaStreamWaitEvent(a, b->event, 0)); + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) { + #if CUDART_VERSION >= 11030 + CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0)); + #endif + CUDACHECK(cudaStreamSynchronize(ss->stream)); + return ncclSuccess; +} diff --git a/src/misc/utils.cc b/src/misc/utils.cc index f3e3ca2..20e8e41 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -9,6 +9,8 @@ #include "nvmlwrap.h" +#include + // Get current Compute Capability int ncclCudaCompCap() { int cudaDev; @@ -190,3 +192,102 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz } return false; } + +__thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer(); + +void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) { + // `me->hunks` points to the top of the stack non-empty hunks. Hunks above + // this (reachable via `->above`) are empty. 
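// ----------------------------------------------------------------------------
// Illustrative aside, not part of this diff: the core bump-pointer step that
// allocateSpilled() below repeats in several places. Round the cursor up to
// `align` (which must be a power of two) and advance it past the object;
// everything else in the function is bookkeeping for growing hunks and for
// out-of-band ("unhunked") allocations. bumpAlloc() is an invented name.
#include <cstddef>
#include <cstdint>
static void* bumpAlloc(uintptr_t& cursor, uintptr_t end, size_t size, size_t align) {
  uintptr_t obj = (cursor + align - 1) & -uintptr_t(align);  // align-up, same trick as below
  if (obj + size > end) return nullptr;                      // object does not fit in this hunk
  cursor = obj + size;
  return reinterpret_cast<void*>(obj);
}
// ----------------------------------------------------------------------------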
+ struct Hunk* top = me->topFrame.hunk; + size_t mallocSize = 0; + + // If we have lots of space left in hunk but that wasn't enough then we'll + // allocate the object unhunked. + if (me->topFrame.end - me->topFrame.bumper >= 8<<10) + goto unhunked; + + // If we have another hunk (which must be empty) waiting above this one and + // the object fits then use that. + if (top && top->above) { + struct Hunk* top1 = top->above; + uintptr_t uobj = (reinterpret_cast(top1) + sizeof(struct Hunk) + align-1) & -uintptr_t(align); + if (uobj + size <= reinterpret_cast(top1) + top1->size) { + me->topFrame.hunk = top1; + me->topFrame.bumper = uobj + size; + me->topFrame.end = reinterpret_cast(top1) + top1->size; + return reinterpret_cast(uobj); + } + } + + { // If the next hunk we're going to allocate wouldn't be big enough but the + // Unhunk proxy fits in the current hunk then go allocate as unhunked. + size_t nextSize = (top ? top->size : 0) + (64<<10); + constexpr size_t maxAlign = 64; + if (nextSize < sizeof(struct Hunk) + maxAlign + size) { + uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk)); + if (uproxy + sizeof(struct Unhunk) <= me->topFrame.end) + goto unhunked; + } + + // At this point we must need another hunk, either to fit the object + // itself or its Unhunk proxy. + mallocSize = nextSize; + INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize); + struct Hunk *top1 = (struct Hunk*)malloc(mallocSize); + if (top1 == nullptr) goto malloc_exhausted; + top1->size = nextSize; + top1->above = nullptr; + if (top) top->above = top1; + top = top1; + me->topFrame.hunk = top; + me->topFrame.end = reinterpret_cast(top) + nextSize; + me->topFrame.bumper = reinterpret_cast(top) + sizeof(struct Hunk); + } + + { // Try to fit object in the new top hunk. + uintptr_t uobj = (me->topFrame.bumper + align-1) & -uintptr_t(align); + if (uobj + size <= me->topFrame.end) { + me->topFrame.bumper = uobj + size; + return reinterpret_cast(uobj); + } + } + +unhunked: + { // We need to allocate the object out-of-band and put an Unhunk proxy in-band + // to keep track of it. + uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk)); + Unhunk* proxy = reinterpret_cast(uproxy); + me->topFrame.bumper = uproxy + sizeof(Unhunk); + proxy->next = me->topFrame.unhunks; + me->topFrame.unhunks = proxy; + mallocSize = size; + proxy->obj = malloc(mallocSize); + INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize); + if (proxy->obj == nullptr) goto malloc_exhausted; + return proxy->obj; + } + +malloc_exhausted: + WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize); + abort(); +} + +void ncclMemoryStackDestruct(struct ncclMemoryStack* me) { + // Free unhunks first because both the frames and unhunk proxies lie within the hunks. 
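// ----------------------------------------------------------------------------
// Illustrative note, not part of this diff: the destruction order matters because
// the Unhunk proxy nodes are themselves bump-allocated inside the hunks. Freeing
// the hunks first would free the very list nodes the unhunk walk still needs,
// i.e. a use-after-free. A minimal shape of the safe order, with invented names:
#include <cstdlib>
struct Proxy { void* obj; Proxy* next; };     // lives inside a hunk, points out of band
static void destroyAll(Proxy* proxies, void** hunks, int nHunks) {
  for (Proxy* p = proxies; p != nullptr; p = p->next) free(p->obj);  // 1) out-of-band objects
  for (int i = 0; i < nHunks; i++) free(hunks[i]);                   // 2) then the backing hunks
}
// ----------------------------------------------------------------------------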
+ struct ncclMemoryStack::Frame* f = &me->topFrame; + while (f != nullptr) { + struct ncclMemoryStack::Unhunk* u = f->unhunks; + while (u != nullptr) { + free(u->obj); + u = u->next; + } + f = f->below; + } + // Free hunks + struct ncclMemoryStack::Hunk* h = me->stub.above; + while (h != nullptr) { + struct ncclMemoryStack::Hunk *h1 = h->above; + free(h); + h = h1; + } +} diff --git a/src/nccl.h.in b/src/nccl.h.in index 93a141c..edd98a3 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -38,7 +38,8 @@ typedef enum { ncclSuccess = 0, ncclInternalError = 3, ncclInvalidArgument = 4, ncclInvalidUsage = 5, - ncclNumResults = 6 } ncclResult_t; + ncclRemoteError = 6, + ncclNumResults = 7 } ncclResult_t; /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. * This integer is coded with the MAJOR, MINOR and PATCH level of the @@ -81,10 +82,16 @@ ncclResult_t pncclCommDestroy(ncclComm_t comm); ncclResult_t ncclCommAbort(ncclComm_t comm); ncclResult_t pncclCommAbort(ncclComm_t comm); -/* Returns a human-readable error message. */ +/* Returns a string for each error code. */ const char* ncclGetErrorString(ncclResult_t result); const char* pncclGetErrorString(ncclResult_t result); +/* Returns a human-readable message of the last error that occurred. + * comm is currently unused and can be set to NULL + */ +const char* ncclGetLastError(ncclComm_t comm); +const char* pncclGetError(ncclComm_t comm); + /* Checks whether the comm has encountered any asynchronous errors */ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); diff --git a/src/net.cc b/src/net.cc index cb65218..53ec80e 100644 --- a/src/net.cc +++ b/src/net.cc @@ -9,15 +9,16 @@ //#include //#include -ncclNet_t *ncclNet; -ncclCollNet_t *ncclCollNet; - -static ncclNet_v5_t ncclNet_v4_as_v5; +static ncclNet_v6_t ncclNet_v4_as_v6; +static ncclNet_v6_t ncclNet_v5_as_v6; static ncclNet_v4_t *ncclNet_v4; -static ncclCollNet_v5_t ncclCollNet_v4_as_v5; +static ncclNet_v5_t *ncclNet_v5; +static ncclCollNet_v6_t ncclCollNet_v4_as_v6; +static ncclCollNet_v6_t ncclCollNet_v5_as_v6; static ncclCollNet_v4_t *ncclCollNet_v4; +static ncclCollNet_v5_t *ncclCollNet_v5; -static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) { +static ncclResult_t ncclNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) { ncclNetProperties_v4_t p4; ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4); if (ans != ncclSuccess) return ans; @@ -33,17 +34,17 @@ static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5 return ncclSuccess; } -static ncclResult_t ncclNet_v4_as_v5_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { +static ncclResult_t ncclNet_v4_as_v6_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclNet_v4->isend(sendComm, data, size, mhandle, request); } -static ncclResult_t ncclNet_v4_as_v5_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { +static ncclResult_t ncclNet_v4_as_v6_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { if (n == 0) return ncclSuccess; if (n != 1) return ncclInvalidArgument; return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request); } -static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** 
request) { +static ncclResult_t ncclNet_v4_as_v6_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { if (n == 0) return ncclSuccess; if (n != 1) return ncclInvalidArgument; return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request); @@ -51,27 +52,51 @@ static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, // We use a wrapper around the v4 init to copy over the struct contents // post-init since they may not be initialized before hand. -static ncclResult_t ncclNet_v4_as_v5_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclNet_v4_as_v6_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v4->init(logfn)); - ncclNet_v4_as_v5.name = ncclNet_v4->name; - ncclNet_v4_as_v5.devices = ncclNet_v4->devices; - ncclNet_v4_as_v5.getProperties = ncclNet_v4_as_v5_getProperties; - ncclNet_v4_as_v5.listen = ncclNet_v4->listen; - ncclNet_v4_as_v5.connect = ncclNet_v4->connect; - ncclNet_v4_as_v5.accept = ncclNet_v4->accept; - ncclNet_v4_as_v5.regMr = ncclNet_v4->regMr; - ncclNet_v4_as_v5.deregMr = ncclNet_v4->deregMr; - ncclNet_v4_as_v5.isend = ncclNet_v4_as_v5_isend; - ncclNet_v4_as_v5.irecv = ncclNet_v4_as_v5_irecv; - ncclNet_v4_as_v5.iflush = ncclNet_v4_as_v5_iflush; - ncclNet_v4_as_v5.test = ncclNet_v4->test; - ncclNet_v4_as_v5.closeSend = ncclNet_v4->closeSend; - ncclNet_v4_as_v5.closeRecv = ncclNet_v4->closeRecv; - ncclNet_v4_as_v5.closeListen = ncclNet_v4->closeListen; + ncclNet_v4_as_v6.name = ncclNet_v4->name; + ncclNet_v4_as_v6.devices = ncclNet_v4->devices; + ncclNet_v4_as_v6.getProperties = ncclNet_v4_as_v6_getProperties; + ncclNet_v4_as_v6.listen = ncclNet_v4->listen; + ncclNet_v4_as_v6.connect = ncclNet_v4->connect; + ncclNet_v4_as_v6.accept = ncclNet_v4->accept; + ncclNet_v4_as_v6.regMr = ncclNet_v4->regMr; + ncclNet_v4_as_v6.regMrDmaBuf = NULL; + ncclNet_v4_as_v6.deregMr = ncclNet_v4->deregMr; + ncclNet_v4_as_v6.isend = ncclNet_v4_as_v6_isend; + ncclNet_v4_as_v6.irecv = ncclNet_v4_as_v6_irecv; + ncclNet_v4_as_v6.iflush = ncclNet_v4_as_v6_iflush; + ncclNet_v4_as_v6.test = ncclNet_v4->test; + ncclNet_v4_as_v6.closeSend = ncclNet_v4->closeSend; + ncclNet_v4_as_v6.closeRecv = ncclNet_v4->closeRecv; + ncclNet_v4_as_v6.closeListen = ncclNet_v4->closeListen; return ncclSuccess; } -static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) { +// We use a wrapper around the v5 init to copy over the struct contents +// post-init since they may not be initialized before hand. 
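+// A v5 net plugin maps onto the v6 struct field-for-field; unlike v4 it needs
+// no shims for getProperties, isend, irecv or iflush. The only v6 entry point
+// it cannot provide is the DMA-BUF registration hook, so regMrDmaBuf is left
+// NULL and such plugins keep registering memory through plain regMr.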
+static ncclResult_t ncclNet_v5_as_v6_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclNet_v5->init(logfn)); + ncclNet_v5_as_v6.name = ncclNet_v5->name; + ncclNet_v5_as_v6.devices = ncclNet_v5->devices; + ncclNet_v5_as_v6.getProperties = ncclNet_v5->getProperties; + ncclNet_v5_as_v6.listen = ncclNet_v5->listen; + ncclNet_v5_as_v6.connect = ncclNet_v5->connect; + ncclNet_v5_as_v6.accept = ncclNet_v5->accept; + ncclNet_v5_as_v6.regMr = ncclNet_v5->regMr; + ncclNet_v5_as_v6.regMrDmaBuf = NULL; + ncclNet_v5_as_v6.deregMr = ncclNet_v5->deregMr; + ncclNet_v5_as_v6.isend = ncclNet_v5->isend; + ncclNet_v5_as_v6.irecv = ncclNet_v5->irecv; + ncclNet_v5_as_v6.iflush = ncclNet_v5->iflush; + ncclNet_v5_as_v6.test = ncclNet_v5->test; + ncclNet_v5_as_v6.closeSend = ncclNet_v5->closeSend; + ncclNet_v5_as_v6.closeRecv = ncclNet_v5->closeRecv; + ncclNet_v5_as_v6.closeListen = ncclNet_v5->closeListen; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) { ncclNetProperties_v4_t p4; ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4); if (ans != ncclSuccess) return ans; @@ -89,25 +114,58 @@ static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetPropertie // We use a wrapper around the v4 init to copy over the struct contents // post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v4_as_v5_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_v4_as_v6_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v4->init(logfn)); - ncclCollNet_v4_as_v5.name = ncclCollNet_v4->name; - ncclCollNet_v4_as_v5.devices = ncclCollNet_v4->devices; - ncclCollNet_v4_as_v5.getProperties = ncclCollNet_v4_as_v5_getProperties; - ncclCollNet_v4_as_v5.listen = ncclCollNet_v4->listen; - ncclCollNet_v4_as_v5.connect = ncclCollNet_v4->connect; - ncclCollNet_v4_as_v5.reduceSupport = ncclCollNet_v4->reduceSupport; - ncclCollNet_v4_as_v5.regMr = ncclCollNet_v4->regMr; - ncclCollNet_v4_as_v5.deregMr = ncclCollNet_v4->deregMr; - ncclCollNet_v4_as_v5.iallreduce = ncclCollNet_v4->iallreduce; - ncclCollNet_v4_as_v5.iflush = ncclCollNet_v4->iflush; - ncclCollNet_v4_as_v5.test = ncclCollNet_v4->test; - ncclCollNet_v4_as_v5.closeColl = ncclCollNet_v4->closeColl; - ncclCollNet_v4_as_v5.closeListen = ncclCollNet_v4->closeListen; + ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name; + ncclCollNet_v4_as_v6.devices = ncclCollNet_v4->devices; + ncclCollNet_v4_as_v6.getProperties = ncclCollNet_v4_as_v6_getProperties; + ncclCollNet_v4_as_v6.listen = ncclCollNet_v4->listen; + ncclCollNet_v4_as_v6.connect = ncclCollNet_v4->connect; + ncclCollNet_v4_as_v6.reduceSupport = ncclCollNet_v4->reduceSupport; + ncclCollNet_v4_as_v6.regMr = ncclCollNet_v4->regMr; + ncclCollNet_v4_as_v6.regMrDmaBuf = NULL; + ncclCollNet_v4_as_v6.deregMr = ncclCollNet_v4->deregMr; + ncclCollNet_v4_as_v6.iallreduce = ncclCollNet_v4->iallreduce; + ncclCollNet_v4_as_v6.iflush = ncclCollNet_v4->iflush; + ncclCollNet_v4_as_v6.test = ncclCollNet_v4->test; + ncclCollNet_v4_as_v6.closeColl = ncclCollNet_v4->closeColl; + ncclCollNet_v4_as_v6.closeListen = ncclCollNet_v4->closeListen; return ncclSuccess; } -static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) { +// We use a wrapper around the v5 init to copy over the struct contents +// post-init since they may not be initialized before hand. 
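+// Same scheme for the collective network plugin: every v5 collnet entry point
+// is forwarded unchanged, with regMrDmaBuf left NULL since v5 has no DMA-BUF
+// registration call.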
+static ncclResult_t ncclCollNet_v5_as_v6_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v5->init(logfn)); + ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name; + ncclCollNet_v5_as_v6.devices = ncclCollNet_v5->devices; + ncclCollNet_v5_as_v6.getProperties = ncclCollNet_v5->getProperties; + ncclCollNet_v5_as_v6.listen = ncclCollNet_v5->listen; + ncclCollNet_v5_as_v6.connect = ncclCollNet_v5->connect; + ncclCollNet_v5_as_v6.reduceSupport = ncclCollNet_v5->reduceSupport; + ncclCollNet_v5_as_v6.regMr = ncclCollNet_v5->regMr; + ncclCollNet_v5_as_v6.regMrDmaBuf = NULL; + ncclCollNet_v5_as_v6.deregMr = ncclCollNet_v5->deregMr; + ncclCollNet_v5_as_v6.iallreduce = ncclCollNet_v5->iallreduce; + ncclCollNet_v5_as_v6.iflush = ncclCollNet_v5->iflush; + ncclCollNet_v5_as_v6.test = ncclCollNet_v5->test; + ncclCollNet_v5_as_v6.closeColl = ncclCollNet_v5->closeColl; + ncclCollNet_v5_as_v6.closeListen = ncclCollNet_v5->closeListen; + return ncclSuccess; +} + +static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; +ncclNet_t* ncclNets[3] = { nullptr, &ncclNetIb, &ncclNetSocket }; +ncclCollNet_t* ncclCollNets[3] = { nullptr, nullptr, nullptr }; +enum ncclNetState { + ncclNetStateInit = 0, + ncclNetStateEnabled = 1, + ncclNetStateDisabled = 2 +}; +enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; +enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; + +ncclResult_t ncclNetPluginInit() { char ncclNetPluginName[128]; const char* envPluginName = getenv("NCCL_NET_PLUGIN"); if (envPluginName && strlen(envPluginName)) { @@ -126,67 +184,104 @@ static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) { } else { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); } - return; + return ncclSuccess; } - *net = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); - if (*net == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v5 symbol."); - ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4"); - if (ncclNet_v4 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v4 symbol."); - if (netPluginLib != nullptr) dlclose(netPluginLib); - return; + ncclNets[0] = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); + if (ncclNets[0] == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol."); + // Try v5 plugin + ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); + if (ncclNet_v5 == nullptr) { + ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4"); + if (ncclNet_v4 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (v4 or v5)."); + if (netPluginLib != nullptr) dlclose(netPluginLib); + return ncclSuccess; + } + ncclNets[0] = &ncclNet_v4_as_v6; + ncclNet_v4_as_v6.init = ncclNet_v4_as_v6_init; + // Set the name right away to allow for NCCL_NET=... to work + ncclNet_v4_as_v6.name = ncclNet_v4->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v4)", ncclNets[0]->name); + } else { + ncclNets[0] = &ncclNet_v5_as_v6; + ncclNet_v5_as_v6.init = ncclNet_v5_as_v6_init; + // Set the name right away to allow for NCCL_NET=... 
to work + ncclNet_v5_as_v6.name = ncclNet_v5->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); } - *net = &ncclNet_v4_as_v5; - ncclNet_v4_as_v5.init = ncclNet_v4_as_v5_init; } // Check for CollNet - *collnet = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); - if (*collnet == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol."); - ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4"); - if (ncclCollNet_v4 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol."); + ncclCollNets[0] = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); + if (ncclCollNets[0] == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol."); + ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); + if (ncclCollNet_v5 == nullptr) { + ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4"); + if (ncclCollNet_v4 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5)."); + } else { + ncclCollNets[0] = &ncclCollNet_v4_as_v6; + ncclCollNet_v4_as_v6.init = ncclCollNet_v4_as_v6_init; + ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v4)", ncclCollNets[0]->name); + } } else { - *collnet = &ncclCollNet_v4_as_v5; - ncclCollNet_v4_as_v5.init = ncclCollNet_v4_as_v5_init; + ncclCollNets[0] = &ncclCollNet_v5_as_v6; + ncclCollNet_v5_as_v6.init = ncclCollNet_v5_as_v6_init; + ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name); } } - return; + return ncclSuccess; } -ncclResult_t ncclNetInit() { - // Always initialize bootstrap network - NCCLCHECK(bootstrapNetInit()); +static ncclResult_t netGetState(int i, enum ncclNetState* state) { + pthread_mutex_lock(&netLock); + if (ncclNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclNets[i]->init(ncclDebugLog) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; + else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; + else ncclNetStates[i] = ncclNetStateEnabled; + } + *state = ncclNetStates[i]; + pthread_mutex_unlock(&netLock); + return ncclSuccess; +} +static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { + if (ncclCollNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; + else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; + else ncclCollNetStates[i] = ncclNetStateEnabled; + } + *state = ncclCollNetStates[i]; + return ncclSuccess; +} + +ncclResult_t ncclNetInit(struct ncclComm* comm) { // Initialize main communication network - ncclNet_t* nets[3] = { nullptr, &ncclNetIb, &ncclNetSocket }; - ncclCollNet_t* collNets[3] = { nullptr, nullptr, nullptr }; - initPlugin(&nets[0], &collNets[0]); char* netName = getenv("NCCL_NET"); bool ok = false; for (int i=0; i<3; i++) { - if (nets[i] == nullptr) continue; - if (netName && strcmp(netName, nets[i]->name) != 0) continue; + if (ncclNets[i] == nullptr) continue; + enum ncclNetState state; + NCCLCHECK(netGetState(i, &state)); + if (state != ncclNetStateEnabled) continue; + if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; 
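+    // Slot 0 holds the external plugin (if any), followed by the internal IB
+    // and socket backends. The first backend that initializes and reports at
+    // least one device is selected, unless NCCL_NET pins the choice by name
+    // (compared case-insensitively above); e.g. NCCL_NET=Socket would force
+    // the socket backend, assuming "Socket" is the name it reports.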
- // net plugin is already initialized - int ndev; - if (nets[i]->init(ncclDebugLog) != ncclSuccess) continue; - if (nets[i]->devices(&ndev) != ncclSuccess) continue; - if (ndev <= 0) continue; - ncclNet = nets[i]; + comm->ncclNet = ncclNets[i]; ok = true; - if (collNets[i]) { - do { - if (collNets[i]->init(ncclDebugLog) != ncclSuccess) break; - if (collNets[i]->devices(&ndev) != ncclSuccess) break; - if (ndev <= 0) break; - ncclCollNet = collNets[i]; - } while(0); + if (ncclCollNets[i]) { + NCCLCHECK(collNetGetState(i, &state)); + if (state == ncclNetStateEnabled) { + comm->ncclCollNet = ncclCollNets[i]; + } } break; } @@ -198,7 +293,7 @@ ncclResult_t ncclNetInit() { return ncclSuccess; } -ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { +ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { constexpr int GPU_BUF_SIZE = 2*1024*1024; #if CUDART_VERSION >= 11030 // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute @@ -213,12 +308,12 @@ ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { } #endif int netDevs; - NCCLCHECK(ncclNetDevices(&netDevs)); + NCCLCHECK(ncclNetDevices(comm, &netDevs)); *gdrSupport = 0; for (int dev=0; devabortFlag, ret, cleanup2); } while (rComm == NULL) { - NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3); + NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3); } CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4); - if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { - NCCLCHECK(ncclNetDeregMr(sComm, mHandle)); - NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); - NCCLCHECK(ncclNetDeregMr(rComm, mHandle)); + if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { + NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle)); + NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); + NCCLCHECK(ncclNetDeregMr(comm, rComm, mHandle)); *gdrSupport = 1; } ncclDebugNoWarn = 0; CUDACHECK(cudaFree(gpuPtr)); cleanup4: - NCCLCHECK(ncclNetCloseRecv(rComm)); + NCCLCHECK(ncclNetCloseRecv(comm, rComm)); cleanup3: - NCCLCHECK(ncclNetCloseSend(sComm)); + NCCLCHECK(ncclNetCloseSend(comm, sComm)); cleanup2: - NCCLCHECK(ncclNetCloseListen(lComm)); + NCCLCHECK(ncclNetCloseListen(comm, lComm)); cleanup1: break; } return ncclSuccess; } -int ncclNetVersion() { - return (ncclNet == &ncclNet_v4_as_v5) ? 4 : 5; +int ncclNetVersion(struct ncclComm* comm) { + return (comm->ncclNet == &ncclNet_v4_as_v6) ? 4 : ((comm->ncclNet == &ncclNet_v5_as_v6) ? 5 : 6); } diff --git a/src/proxy.cc b/src/proxy.cc index d3c6a98..5021bc8 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -13,6 +13,8 @@ #define ENABLE_TIMER 0 #include "timer.h" +#include + enum { proxyRecv=0, proxySend=1 }; static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) { @@ -349,10 +351,10 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* return ncclSuccess; } -static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) { +static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) { if (peer < 0) return ncclSuccess; - struct ncclPeer* peerComm = channel->peers+peer; + struct ncclChannelPeer* peerComm = channel->peers+peer; struct ncclConnector* connector = type == proxyRecv ? 
peerComm->recv+connIndex : peerComm->send+connIndex; if (connector->transportComm == NULL) { WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank, @@ -361,35 +363,62 @@ static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, s } if (connector->transportComm->proxyProgress == NULL) return ncclSuccess; - NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op)); + if (justInquire) *justInquire = true; + else { + NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op)); + } return ncclSuccess; } -ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* op, int nranks) { - struct ncclChannel* channel = comm->channels+op->channelId; - int pattern = op->pattern; - if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) { - struct ncclRing* ring = &channel->ring; - if (NeedProxy(proxyRecv, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, 0)); - if (NeedProxy(proxySend, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, 0)); - } - if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) { - // Tree up - struct ncclTree* tree = &channel->tree; - for (int i=0; idown[i], op, 0)); - NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0)); - } - if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) { - // Tree down - struct ncclTree* tree = &channel->tree; - for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0)); - NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0)); - } - if (pattern == ncclPatternCollTreeUpDown) { - // CollTree up - NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1)); // For CollTree up, we are using push - // CollTree down - NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0)); +// justInquire != nullptr means don't actually do anything, just assertain need of +// ncclProxySaveOp for this op. 
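+// When justInquire is non-NULL, SaveProxy only reports via *justInquire whether
+// a proxy op would be appended and enqueues nothing. This single switch on
+// op->pattern takes over from the removed ncclProxySaveColl and
+// ncclProxySaveP2p, so collectives and point-to-point sends/recvs now share
+// the same proxy bookkeeping.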
+ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) { + struct ncclChannel* channel = &comm->channels[op->channelId]; + if (justInquire) *justInquire = false; + switch (op->pattern) { + case ncclPatternRing: + case ncclPatternRingTwice: + case ncclPatternPipelineFrom: + case ncclPatternPipelineTo: { + struct ncclRing* ring = &channel->ring; + if (NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) { + NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, 0, justInquire)); + } + if (NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) { + NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, 0, justInquire)); + } + } break; + case ncclPatternTreeUp: + case ncclPatternTreeDown: + case ncclPatternTreeUpDown: { + if (op->pattern != ncclPatternTreeDown) { // Tree up + struct ncclTree* tree = &channel->tree; + for (int i=0; idown[i], op, 0, justInquire)); + } + NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0, justInquire)); + } + if (op->pattern != ncclPatternTreeUp) { // Tree down + struct ncclTree* tree = &channel->tree; + for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) { + NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0, justInquire)); + } + NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0, justInquire)); + } + } break; + case ncclPatternCollTreeUpDown: { + // CollTree up + NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1, justInquire)); // For CollTree up, we are using push + // CollTree down + NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0, justInquire)); + } break; + case ncclPatternSend: + case ncclPatternRecv: { + if (op->root == comm->rank) return ncclSuccess; + op->nsteps = DIVUP(op->nbytes, op->chunkSize); + if (op->nsteps == 0) op->nsteps = 1; + NCCLCHECK(SaveProxy(channel, op->pattern == ncclPatternSend ? 
proxySend : proxyRecv, op->root, op, 1, justInquire)); + } break; } return ncclSuccess; } @@ -406,22 +435,23 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) op->protocol = NCCL_PROTO_SIMPLE; op->dtype = info->datatype; - int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR; + int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; + if (info->comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR; info->chunkSize = stepSize; op->root = info->root; op->nbytes = info->count; - struct ncclPeer* peer = channel->peers + op->root; + struct ncclChannelPeer* peer = channel->peers + op->root; if (info->coll == ncclFuncSend) { op->pattern = ncclPatternSend; - if (op->root != info->comm->rank && peer->send[1].transportComm && peer->send[1].transportComm->proxyProgress) { + if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) { // Tune chunk size for the network if (info->count < stepSize) info->chunkSize /= 4; else if (info->count < 8*stepSize) info->chunkSize /= 2; } } else if (info->coll == ncclFuncRecv) { op->pattern = ncclPatternRecv; - if (op->root != info->comm->rank && peer->recv[1].transportComm && peer->recv[1].transportComm->proxyProgress) { + if (op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) { // Tune chunk size for the network if (info->count < stepSize) info->chunkSize /= 4; else if (info->count < 8*stepSize) info->chunkSize /= 2; @@ -437,22 +467,6 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) return ncclSuccess; } -ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* op) { - struct ncclChannel* channel = comm->channels+op->channelId; - op->opCount = channel->workFifoTail-1; - if (op->root == comm->rank) return ncclSuccess; - if (op->pattern == ncclPatternRecv) { - op->nsteps = DIVUP(op->nbytes, op->chunkSize); - if (op->nsteps == 0) op->nsteps = 1; - NCCLCHECK(SaveProxy(channel, proxyRecv, op->root, op, 1)); - } else if (op->pattern == ncclPatternSend) { - op->nsteps = DIVUP(op->nbytes, op->chunkSize); - if (op->nsteps == 0) op->nsteps = 1; - NCCLCHECK(SaveProxy(channel, proxySend, op->root, op, 1)); - } - return ncclSuccess; -} - static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) { struct ncclProxyArgs* freeOp = *opPtr; struct ncclProxyArgs* next = freeOp->next; @@ -594,8 +608,48 @@ void ncclDumpProxyState(int signal) { dumpProxyState(ncclLastProxyState); } +NCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0); +ncclResult_t ncclSetThreadContext(struct ncclComm* comm) { +#if CUDART_VERSION >= 11030 + static int createThreadContext = -1; + + if (createThreadContext == -1) { + createThreadContext = ncclParamCreateThreadContext(); + if (createThreadContext) { + if (CUPFN(cuCtxCreate_v3020) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) { + WARN("Unable to create thread context due to old driver, disabling."); + createThreadContext = 0; + } + } + } + if (createThreadContext) { + if (comm->proxyState.cudaCtx == NULL) { + if (CUPFN(cuCtxCreate_v3020(&comm->proxyState.cudaCtx, + CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, comm->cudaDev)) != CUDA_SUCCESS) { + WARN("Failed to create CUDA context on device %d", comm->cudaDev); + createThreadContext = 0; + return ncclSuccess; + } + } else { + if (CUPFN(cuCtxSetCurrent(comm->proxyState.cudaCtx)) != CUDA_SUCCESS) { + WARN("Failed to 
set CUDA context on device %d", comm->cudaDev); + return ncclUnhandledCudaError; + } + } + } +#endif + return ncclSuccess; +} + void* ncclProxyProgress(void *comm_) { struct ncclComm* comm = (struct ncclComm*)comm_; + if (ncclSetThreadContext(comm) != ncclSuccess) { + WARN("[Proxy Progress] Failed to set CUDA context on device %d", comm->cudaDev); + } else if (cudaSetDevice(comm->cudaDev) != cudaSuccess) { + WARN("[Proxy Progress] Failed to set CUDA device %d", comm->cudaDev); + } + if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + struct ncclProxyProgressState* state = &comm->proxyState.progressState; state->nextOps = -1; signal(SIGUSR1, ncclDumpProxyState); @@ -728,9 +782,9 @@ static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool, static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { if (connection->send) { - NCCLCHECK(ncclTransports[connection->transport].send.proxyFree(connection, comm)); + NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm)); } else { - NCCLCHECK(ncclTransports[connection->transport].recv.proxyFree(connection, comm)); + NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm)); } return ncclSuccess; } @@ -774,7 +828,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int))); NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int))); NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*))); - struct ncclTransportComm* tcomm = send ? &ncclTransports[transport].send : &ncclTransports[transport].recv; + struct ncclTransportComm* tcomm = send ? &ncclTransports[transport]->send : &ncclTransports[transport]->recv; // If we need proxy progress, map progress ops if (tcomm->proxyProgress) { char poolPath[] = "/dev/shm/nccl-XXXXXX"; @@ -881,7 +935,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int))); connection->localRank = peer->localRank; NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*))); - connection->tcomm = connection->send ? &ncclTransports[connection->transport].send : &ncclTransports[connection->transport].recv; + connection->tcomm = connection->send ? 
&ncclTransports[connection->transport]->send : &ncclTransports[connection->transport]->recv; // If we need proxy progress, let's allocate ops and start the thread if (connection->tcomm->proxyProgress) { NCCLCHECK(proxyProgressInit(comm)); @@ -947,7 +1001,10 @@ static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* p void* ncclProxyService(void* _args) { struct ncclComm* comm = (struct ncclComm *) _args; - if (cudaSetDevice(comm->cudaDev) != cudaSuccess) { + if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + if (ncclSetThreadContext(comm) != ncclSuccess) { + WARN("[Proxy Service] Failed to set CUDA context on device %d", comm->cudaDev); + } else if (cudaSetDevice(comm->cudaDev) != cudaSuccess) { WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev); } if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); diff --git a/src/transport.cc b/src/transport.cc index 7ce5f2e..7ebaf27 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -10,16 +10,11 @@ #define ENABLE_TIMER 0 #include "timer.h" -extern struct ncclTransport p2pTransport; -extern struct ncclTransport shmTransport; -extern struct ncclTransport netTransport; -extern struct ncclTransport collNetTransport; - -struct ncclTransport ncclTransports[NTRANSPORTS] = { - p2pTransport, - shmTransport, - netTransport, - collNetTransport +struct ncclTransport* ncclTransports[NTRANSPORTS] = { + &p2pTransport, + &shmTransport, + &netTransport, + &collNetTransport }; template @@ -29,7 +24,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex : comm->channels[channelId].peers[peer].recv + connIndex; for (int t=0; tsend : &transport->recv; int ret = 0; NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo)); @@ -44,9 +39,10 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* return ncclSystemError; } -ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) { +ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) { TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); - uint32_t mask = 1 << channel->id; + struct ncclChannel* channel = &comm->channels[channelId]; + uint32_t mask = 1 << channelId; for (int i=0; i= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue; @@ -71,9 +67,10 @@ void dumpData(struct ncclConnect* data, int ndata) { ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) { // Stream used during transport setup; need for P2P pre-connect + CUDA Graph + int highestType = TRANSPORT_P2P; // track highest transport type + cudaStream_t transportSetupStream; CUDACHECK(cudaStreamCreateWithFlags(&transportSetupStream, cudaStreamNonBlocking)); - int highestType = TRANSPORT_P2P; // track highest transport type struct ncclConnect data[2*MAXCHANNELS]; for (int i=1; inRanks; i++) { @@ -126,7 +123,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex; NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, 
conn)); conn->connected = 1; - CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream)); + CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, transportSetupStream)); } } TIME_STOP(3); @@ -136,7 +133,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex; NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn)); conn->connected = 1; - CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream)); + CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, transportSetupStream)); } } TIME_STOP(4); @@ -168,10 +165,6 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN // check if we can connect to collnet, whose root is the nranks-th rank struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks; peerInfo->rank = nranks; - int support = 1; - if (isMaster) { - NCCLCHECK(collNetTransport.canConnect(&support, comm->topo, collNetGraph, myInfo, peerInfo)); - } // send master receives connect info from peer recv master if (isMaster && type == collNetSend) { @@ -181,14 +174,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN } // select - struct ncclPeer* root = channel->peers+nranks; + struct ncclChannelPeer* root = channel->peers+nranks; // connector index: 0 for recv, 1 for send struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type; struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send); conn->transportComm = transportComm; // setup struct ncclConnect myConnect; - if (isMaster && support) { + if (isMaster) { NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type)); } // prepare connect handles @@ -218,11 +211,11 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect)); } // connect - if (isMaster && support) { + if (isMaster) { NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup); - struct ncclPeer* devRoot = channel->devPeers+nranks; - struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type; - CUDACHECKGOTO(cudaMemcpy(devConn, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice), res, cleanup); + struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks; + struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? 
devRoot->recv+type : devRoot->send+type; + CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup); } // recv side sends connect info to send side if (isMaster && type == collNetRecv) { @@ -231,7 +224,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup); TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer); } - if (support) fail = 0; + fail = 0; cleanup: if (allConnects != NULL) free(allConnects); if (masterConnects != NULL) free(masterConnects); @@ -260,7 +253,7 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) { // Free collNet resources for (int r=0; rnChannels; r++) { struct ncclChannel* channel = comm->channels+r; - struct ncclPeer* peer = channel->peers+comm->nRanks; + struct ncclChannelPeer* peer = channel->peers+comm->nRanks; for (int b=0; bsend + b; if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send)); diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 771a18f..0404aa8 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -128,9 +128,9 @@ struct recvResources { int collNetRank; }; -/* Determine if we can communicate with the peer */ static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { - *ret = 1; + // This transport cannot be used for p2p + *ret = 0; return ncclSuccess; } @@ -154,7 +154,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn)); NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? "/GDRDMA" : ""); return ncclSuccess; } @@ -172,7 +172,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? 
"/GDRDMA" : ""); return ncclSuccess; } @@ -297,7 +297,7 @@ ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle comm->proxyState.progressState.collNet.resources = resources; } if (resources->collNetComms[netDev] == NULL) - NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev)); + NCCLCHECK(collNetListen(comm, netDev, collNetHandle, resources->collNetListenComms+netDev)); return ncclSuccess; } @@ -311,13 +311,13 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i); handlePtrs[i] = &(info->collNetHandle); } - ncclResult_t ret = collNetConnect((void**)handlePtrs, nranks, rank, + ncclResult_t ret = collNetConnect(comm, (void**)handlePtrs, nranks, rank, resources->collNetListenComms[netDev], resources->collNetComms+netDev); free(handlePtrs); if (ret == ncclSuccess) { // Close listen comm - NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev])); + NCCLCHECK(collNetCloseListen(comm, resources->collNetListenComms[netDev])); } else { resources->collNetListenComms[netDev] = NULL; } @@ -331,7 +331,7 @@ static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) { struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources; resources->commRefCount[netDev]--; if (resources->commRefCount[netDev] == 0) { - NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev])); + NCCLCHECK(collNetCloseColl(comm, resources->collNetComms[netDev])); } for (int n=0; ncommRefCount[n]) return ncclSuccess; comm->proxyState.progressState.collNet.resources = NULL; @@ -447,9 +447,22 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); - NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, - &resources->sendMhandles[NCCL_PROTO_SIMPLE])); +#if CUDA_VERSION >= 11070 + /* DMA-BUF support */ + if (resources->useGdr && comm->dmaBufSupport) { + int dmabuf_fd; + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + (void)close(dmabuf_fd); + } else // FALL-THROUGH to nv_peermem GDR path +#endif + { + NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, + &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + } *((struct connectMap**)respBuff) = &resources->map; return ncclSuccess; @@ -503,9 +516,22 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); - NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - resources->useGdr ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, - &resources->mhandles[NCCL_PROTO_SIMPLE])); +#if CUDA_VERSION >= 11070 + /* DMA-BUF support */ + if (resources->useGdr && comm->dmaBufSupport) { + int dmabuf_fd; + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->mhandles[NCCL_PROTO_SIMPLE])); + (void)close(dmabuf_fd); + } else // FALL-THROUGH to nv_peermem GDR path +#endif + { + NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, + &resources->mhandles[NCCL_PROTO_SIMPLE])); + } // Pass info to send side info->reqFifo = resources->reqFifo; @@ -521,7 +547,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct struct sendResources* resources = (struct sendResources*)(connection->transportResources); for (int p=0; psendMhandles[p]) { - NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[p])); + NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p])); } } struct connectMapMem* mems = resources->map.mems; @@ -538,7 +564,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct struct recvResources* resources = (struct recvResources*)(connection->transportResources); for (int p=0; pmhandles[p]) { - NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[p])); + NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; @@ -625,10 +651,10 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg int sharedBuffSlot = sub->transmitted%NCCL_STEPS; if (reqFifo[group][buffSlot].recvBuff != NULL) { int totalSize = (s-group*COLLNET_GROUP_NSUBS+1) * args->sharedSize[sharedBuffSlot]; - int count = totalSize / ncclTypeSize(args->dtype); + int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype); reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot]; char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot]; - NCCLCHECK(collNetIallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot)); + NCCLCHECK(collNetIallreduce(comm, resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] == NULL) continue; TRACE(NCCL_NET, "sendProxy [%d/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]); @@ -644,7 +670,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg int done, size; int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base+sub->done)%NCCL_STEPS; - NCCLCHECK(collNetTest((void*)(sub->requests[buffSlot]), &done, &size)); + NCCLCHECK(collNetTest(comm, (void*)(sub->requests[buffSlot]), &done, &size)); if (done) { TRACE(NCCL_NET, "sendProxy [%d/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size); // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush) @@ 
-735,7 +761,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg int startChannel = group*COLLNET_GROUP_NSUBS; int offset; NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset)); - NCCLCHECK(collNetIflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot)); + NCCLCHECK(collNetIflush(comm, resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot)); } } else { for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; @@ -749,7 +775,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS; int done = 1; - if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(sub->requests[buffSlot], &done, NULL)); + if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(comm, sub->requests[buffSlot], &done, NULL)); if (done) { TRACE(NCCL_NET, "recvProxy [%d/%d/%d] flushed", sub->flushed, group, buffSlot); for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; diff --git a/src/transport/net.cc b/src/transport/net.cc index e96f189..be3afc4 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -181,10 +181,10 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); if (proxyRank == myInfo->rank) { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } else { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev, proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } *((int*)connectInfo) = proxyRank; @@ -217,7 +217,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.remoteRank = peerInfo->rank; NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? 
"/Shared" : ""); return ncclSuccess; } @@ -447,7 +447,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->channelId = req->channelId; resources->connIndex = req->connIndex; ncclNetProperties_t props; - NCCLCHECK(ncclNetGetProperties(req->netDev, &props)); + NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props)); resources->maxRecvs = props.maxRecvs; // We don't return any data @@ -473,11 +473,11 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc resources->channelId = req->channelId; resources->connIndex = req->connIndex; ncclNetProperties_t props; - NCCLCHECK(ncclNetGetProperties(req->netDev, &props)); + NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props)); resources->maxRecvs = props.maxRecvs; if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; - NCCLCHECK(ncclNetListen(req->netDev, respBuff, &resources->netListenComm)); + NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm)); *done = 1; return ncclSuccess; } @@ -504,15 +504,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank; - if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, comms->sendComm+resources->channelId)); + if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId)); resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { - NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm)); + NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm)); } } else { // Connect to remote peer - NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm)); + NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm)); connection->proxyAppendPtr = &connection->proxyAppend; } if (resources->netSendComm == NULL) { @@ -586,7 +586,19 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str for (int p=0; pbuffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); if (resources->buffers[p]) { - NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); +#if CUDA_VERSION >= 11070 + /* DMA-BUF support */ + int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; + if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) { + int dmabuf_fd; + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); + (void)close(dmabuf_fd); + } else // FALL-THROUGH to nv_peermem GDR path +#endif + { + NCCLCHECK(ncclNetRegMr(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); + } } } @@ -620,15 +632,15 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank; - if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(resources->netListenComm, comms->recvComm+resources->channelId)); + if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId)); resources->netRecvComm = comms->recvComm[resources->channelId]; if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; } else { - NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); + NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm)); } } else { // Connect to remote peer - NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); + NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm)); connection->proxyAppendPtr = &connection->proxyAppend; } if (resources->netRecvComm == NULL) { @@ -636,7 +648,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str return ncclSuccess; } *done = 1; - NCCLCHECK(ncclNetCloseListen(resources->netListenComm)); + NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm)); // Create structures struct connectMap* map = &resources->map; @@ -691,7 +703,19 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str for (int p=0; pbuffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); if (resources->buffers[p]) { - NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); +#if CUDA_VERSION >= 11070 + /* DMA-BUF support */ + int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; + if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) { + int dmabuf_fd; + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); + (void)close(dmabuf_fd); + } else // FALL-THROUGH to nv_peermem GDR path +#endif + { + NCCLCHECK(ncclNetRegMr(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); + } } } @@ -709,7 +733,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct } for (int p=0; pbuffers[p]) { - NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[p])); + NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; @@ -725,12 +749,12 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank; comms->sendRefCount[resources->channelId]--; - if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comms->sendComm[resources->channelId])); + if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId])); } else { - NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); + NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm)); } } else { - NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); + NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm)); } free(resources); return ncclSuccess; @@ -744,7 +768,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct } for (int p=0; pbuffers[p]) { - NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[p])); + NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; @@ -756,12 +780,12 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank; comms->recvRefCount[resources->channelId]--; - if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comms->recvComm[resources->channelId])); + if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId])); } else { - NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); + NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm)); } } else { - NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); + NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm)); } free(resources); return ncclSuccess; @@ -849,7 +873,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg } if (ready) { // Data is ready, try to send. 
- NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot)); + NCCLCHECK(ncclNetIsend(comm, resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); sizesFifo[buffSlot] = -1; @@ -867,7 +891,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg if (sub->done < sub->transmitted) { int done; int buffSlot = (sub->base+sub->done)%NCCL_STEPS; - NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL)); + NCCLCHECK(ncclNetTest(comm, sub->requests[buffSlot], &done, NULL)); if (done) { TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); sub->done += args->sliceSteps; @@ -971,7 +995,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg uint64_t step = subGroup->posted; struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); void** requestPtr = subGroup->requests+(step%NCCL_STEPS); - NCCLCHECK(ncclNetIrecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); + NCCLCHECK(ncclNetIrecv(comm, resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); if (*requestPtr) { for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup+i; @@ -993,7 +1017,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg int sizes[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; irequests[step%NCCL_STEPS], &done, sizes)); + NCCLCHECK(ncclNetTest(comm, subGroup->requests[step%NCCL_STEPS], &done, sizes)); if (done) { int useGdr = 0; int totalSize = 0; @@ -1034,7 +1058,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg } } struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); - NCCLCHECK(ncclNetIflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); + NCCLCHECK(ncclNetIflush(comm, resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); } } args->idle = 0; @@ -1049,7 +1073,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg uint64_t step = subGroup->transmitted; int done = 1; void* request = subGroup->requests[step%NCCL_STEPS]; - if (request) NCCLCHECK(ncclNetTest(request, &done, NULL)); + if (request) NCCLCHECK(ncclNetTest(comm, request, &done, NULL)); if (done) { for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index d3d4f9a..d4bb8cf 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -274,6 +274,31 @@ ncclResult_t ncclIbGdrSupport(int ibDev) { return ncclSuccess; } +// Detect whether DMA-BUF support is present in the kernel +// Returns : +// ncclSuccess : DMA-BUF support is available +// ncclSystemError : DMA-BUF is not supported by the kernel +ncclResult_t ncclIbDmaBufSupport(int dev) { + static int dmaBufSupported = -1; + if (dmaBufSupported == -1) { + ncclResult_t res; + struct ibv_pd* pd; + struct ibv_context* ctx; + ctx = ncclIbDevs[dev].context; + NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure); + // Test kernel DMA-BUF support with a dummy call (fd=-1) + (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 
0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/); + // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP if not supported (EBADF otherwise) + dmaBufSupported = (errno != EOPNOTSUPP) ? 1 : 0; + NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure); + } + if (dmaBufSupported == 0) return ncclSystemError; + return ncclSuccess; +failure: + dmaBufSupported = 0; + return ncclSystemError; +} + static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) { memcpy(addr, &ncclIbIfAddr, sizeof(*addr)); return ncclSuccess; @@ -286,10 +311,11 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { props->pciPath = ncclIbDevs[dev].pciPath; props->guid = ncclIbDevs[dev].guid; props->ptrSupport = NCCL_PTR_HOST; - if (ncclIbGdrSupport(dev) != ncclSuccess) { - INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName); - } else { - props->ptrSupport |= NCCL_PTR_CUDA; + if (ncclIbGdrSupport(dev) == ncclSuccess) { + props->ptrSupport |= NCCL_PTR_CUDA; // GDR support via nv_peermem + } + if (ncclIbDmaBufSupport(dev) == ncclSuccess) { + props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF } props->speed = ncclIbDevs[dev].speed; props->latency = 0; // Not set @@ -546,6 +572,7 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large"); memset(handle, 0, sizeof(struct ncclIbHandle)); comm->dev = dev; + comm->sock.asyncFlag = 1; /* nonblocking socket is required by network communication. */ NCCLCHECK(GetSocketAddr(&comm->sock.addr)); NCCLCHECK(ncclSocketListen(&comm->sock)); memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress)); @@ -580,7 +607,7 @@ ib_connect_check: /* expect user to call again */ return ncclSuccess; } else if (conState == ncclSocketError) { - return ncclSystemError; + return ncclRemoteError; } // IB Setup @@ -658,7 +685,6 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { stage->comm = rComm; stage->state = ncclIbCommStateAccept; lComm->sock.asyncFlag = 1; - rComm->sock.asyncFlag = 1; ib_accept: NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock)); @@ -812,7 +838,8 @@ ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) { ncclResult_t ncclIbTest(void* request, int* done, int* size); -ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) { +/* DMA-BUF support */ +ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset"); assert(size > 0); @@ -822,7 +849,7 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache; uintptr_t addr = (uintptr_t)data & -pageSize; - int pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; + size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; ncclResult_t res; pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); for (int slot=0; /*true*/; slot++) { @@ -834,14 +861,20 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan // Deregister / register struct ibv_mr* mr; unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ; - if 
(ncclIbRelaxedOrderingEnabled) { - // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support - NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, (uintptr_t)addr, flags|IBV_ACCESS_RELAXED_ORDERING), res, returning); + if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING; + if (fd != -1) { + /* DMA-BUF support */ + NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, verbs->pd, offset, pages*pageSize, addr, fd, flags), res, returning); + } else { + if (ncclIbRelaxedOrderingEnabled) { + // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support + NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, addr, flags), res, returning); + } + else { + NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning); + } } - else { - NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning); - } - TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey); + TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x fd %d", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey, fd); cache->population += 1; cache->slots[slot].addr = addr; cache->slots[slot].pages = pages; @@ -863,6 +896,10 @@ returning: return res; } +ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) { + return ncclIbRegMrDmaBuf(comm, data, (size_t)size, type, 0ULL, -1, mhandle); +} + ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache; @@ -916,13 +953,16 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { // Write size as immediate data. In the case of multi-send, only write // 0 or 1 as size to indicate whether there was data sent or received. - uint64_t immData = 0; + uint32_t immData = 0; if (nreqs == 1) { immData = reqs[0]->send.size; } else { - uint8_t* multiImmData = (uint8_t*)&immData; + if (nreqs > 32) { + WARN("Cannot store sizes of %d requests in a 32-bits field", nreqs); + return ncclInternalError; + } for (int r=0; r<nreqs; r++) { - multiImmData[r] = reqs[r]->send.size ? 1 : 0; + immData |= (reqs[r]->send.size ? 1 : 0) << r; } } @@ -1197,7 +1237,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { char line[SOCKET_NAME_MAXLEN+1]; WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d", ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err); - return ncclSystemError; + return ncclRemoteError; } struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff); @@ -1212,9 +1252,8 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError; if (req->nreqs > 1) { // In the case of a multi recv, we only set sizes to 0 or 1.
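/* Editor's note -- illustrative sketch only. The imm_data change above packs one
 * "did this request carry data" bit per request into a 32-bit immediate (hence the
 * nreqs > 32 guard), and the receive side in ncclIbTest, just below, unpacks it with
 * the matching shift. The helper names here are hypothetical. */
#include <stdint.h>

// Sender side: bit r is 1 iff request r carried a non-zero size.
static inline uint32_t ibPackMultiSendFlags(const int* sendSizes, int nreqs) {
  uint32_t immData = 0;
  for (int r = 0; r < nreqs; r++) immData |= (sendSizes[r] ? 1u : 0u) << r;
  return immData;
}

// Receiver side: recover the per-request 0/1 indicators from the immediate data.
static inline void ibUnpackMultiRecvFlags(uint32_t immData, int* recvSizes, int nreqs) {
  for (int i = 0; i < nreqs; i++) recvSizes[i] = (immData >> i) & 0x1;
}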
- uint8_t* sizes = (uint8_t*)&wc->imm_data; for (int i=0; inreqs; i++) { - req->recv.sizes[i] |= sizes[i]; + req->recv.sizes[i] = (wc->imm_data >> i) & 0x1; } } else { req->recv.sizes[0] += wc->imm_data; @@ -1275,6 +1314,7 @@ ncclNet_t ncclNetIb = { ncclIbConnect, ncclIbAccept, ncclIbRegMr, + ncclIbRegMrDmaBuf, ncclIbDeregMr, ncclIbIsend, ncclIbIrecv, diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 9e14aa2..a0d80d3 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -311,6 +311,7 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) { struct ncclSocketListenComm* comm; NCCLCHECK(ncclSocketNewListenComm(&comm)); NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr)); + comm->sock.asyncFlag = 1; NCCLCHECK(ncclSocketListen(&comm->sock)); memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress)); NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads)); @@ -359,7 +360,7 @@ socket_connect_check: /* expect user to call again */ return ncclSuccess; } else if (conState == ncclSocketError) { - return ncclSystemError; + return ncclRemoteError; } stage->state = ncclSocketCommStateSend; @@ -616,6 +617,7 @@ ncclNet_t ncclNetSocket = { ncclSocketConnect, ncclSocketAccept, ncclSocketRegMr, + NULL, // No DMA-BUF support ncclSocketDeregMr, ncclSocketIsend, ncclSocketIrecv, diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 9859c87..414f05d 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -7,6 +7,7 @@ #include "comm.h" #include "graph.h" #include "utils.h" +#include "shm.h" struct ncclP2pBuff { void* directPtr; @@ -17,6 +18,34 @@ struct p2pConnectInfo { int rank; int read; struct ncclP2pBuff p2pBuff; + // Use by CE memcpy + char shmName[7]; + int shmSize; +}; +static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large"); + +struct p2pShm { + struct ncclSendMem sendMem; + struct ncclRecvMem recvMem; +}; +struct p2pProxyInfo { + // Shared memory between proxy and receiving GPU + struct p2pShm* shm; + struct p2pShm* devShm; + char shmName[7]; + int shmSize; + + // Intermediate step for sender + struct ncclRecvMem* ceRecvMem; + char* ceDevBuff; + + // Receiver buffer + char* recvFifo; + + // Used by progress only + uint64_t step; + cudaStream_t stream; + cudaEvent_t events[NCCL_STEPS]; }; static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large"); @@ -24,12 +53,16 @@ struct p2pSendResources { struct ncclSendMem* devMem; void* sendMemIpc; void* recvMemIpc; + struct p2pProxyInfo proxyInfo; }; struct p2pRecvResources { struct ncclRecvMem* devMem; void* sendMemIpc; void* recvMemIpc; + struct p2pShm* shm; + struct p2pShm* devShm; + int shmSize; }; #include @@ -51,8 +84,14 @@ static int busIdToCudaDev(int64_t busId) { return -1; } +NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0); +static int useMemcpy = 0; +static void initCeOperation(); + /* Determine if two peers can communicate through p2p */ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + initCeOperation(); + // Rule out different nodes / isolated containers if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) { *ret = 0; @@ -63,7 +102,10 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop int intermediateRank; NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank)); 
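/* Editor's note -- illustrative sketch only. The CE ("copy engine") path declared
 * above is enabled lazily, the first time p2pCanConnect runs: initCeOperation reads
 * the parameter once and, if set, swaps the proxy hooks of p2pTransport (shown at
 * the end of this file). A condensed stand-alone version of that pattern with
 * hypothetical names; the real code uses NCCL_PARAM rather than getenv directly. */
#include <stdlib.h>

struct proxyOps { int (*proxyProgress)(void); };
static int ceSendProxyProgress(void) { return 0; }   // stand-in for p2pSendProxyProgress
static struct proxyOps sendProxyOps = { NULL };      // NULL means "no proxy progress needed"

static void initCeOnce(void) {
  static int init = 0;
  if (init) return;                                  // one-shot init, as in initCeOperation
  const char* env = getenv("NCCL_P2P_USE_CUDA_MEMCPY");
  int useMemcpy = env ? atoi(env) : 0;               // default off
  if (useMemcpy) sendProxyOps.proxyProgress = ceSendProxyProgress;
  init = 1;
}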
if (*ret == 0) return ncclSuccess; - if (intermediateRank != -1) return ncclSuccess; + if (intermediateRank != -1) { + if (useMemcpy) *ret = 0; + return ncclSuccess; + } // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) int cudaDev1 = busIdToCudaDev(info1->busId); @@ -170,6 +212,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st send->transportResources = resources; int useRead, intermediateRank; NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); + if (useMemcpy) useRead = 0; static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; @@ -185,14 +228,14 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st if (intermediateRank == -1) { info->rank = myInfo->rank; - if (myInfo->pidHash == peerInfo->pidHash) { + if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) { if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); } else { send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; - INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s", - channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); + INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s", + channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : ""); } } else { info->rank = intermediateRank; @@ -202,9 +245,15 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st } NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn)); - NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); + if (useMemcpy) { + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo))); + info->shmSize = resources->proxyInfo.shmSize; + memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName)); + } else { + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); + NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc)); + } - NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc)); return ncclSuccess; } @@ -230,7 +279,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st if (intermediateRank == -1) { info->rank = myInfo->rank; - if (myInfo->pidHash == peerInfo->pidHash) { + if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) { if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; } else { recv->conn.direct |= info->read ? 
NCCL_IPC_READ : NCCL_IPC_WRITE; @@ -258,30 +307,61 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co for (int p=0; pread && p == NCCL_PROTO_SIMPLE) { /* For P2P Read the SIMPLE buffer is local (ncclSendMem) */ + if (resources->devMem == NULL) return ncclInternalError; // We should not use read + memcpy send->conn.buffs[p] = (char*)(resources->devMem+1); } else { send->conn.buffs[p] = buff; buff += send->comm->buffSizes[p]; } } - send->conn.tail = &remDevMem->tail; - send->conn.head = &resources->devMem->head; - send->conn.ptrExchange = &resources->devMem->ptrExchange; - send->conn.redOpArgExchange = resources->devMem->redOpArgExchange; + + if (useMemcpy) { + send->conn.tail = &resources->proxyInfo.ceRecvMem->tail; + send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo; + send->conn.head = &resources->proxyInfo.devShm->sendMem.head; + // Send SIMPLE buff to proxy, and replace it by local buffer + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0)); + send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff; + } else { + send->conn.tail = &remDevMem->tail; + send->conn.head = &resources->devMem->head; + send->conn.ptrExchange = &resources->devMem->ptrExchange; + send->conn.redOpArgExchange = resources->devMem->redOpArgExchange; + } return ncclSuccess; } /* Connect/Recv from this peer */ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; - struct ncclSendMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; - NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc)); + struct ncclSendMem* remDevMem = NULL; + + if (useMemcpy) { + char shmPath[PATH_MAX]; + sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); + resources->shmSize = info->shmSize; + NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, 0)); + // Remove the file to ensure proper clean-up + NCCLCHECK(ncclShmUnlink(shmPath)); + + recv->conn.tail = &resources->devShm->recvMem.tail; + recv->conn.head = &resources->devShm->sendMem.head; + } else { + NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc)); + + recv->conn.tail = &resources->devMem->tail; + recv->conn.head = &remDevMem->head; + recv->conn.ptrExchange = &remDevMem->ptrExchange; + recv->conn.redOpArgExchange = remDevMem->redOpArgExchange; + } char* buff = (char*)(resources->devMem+1); for (int p=0; pread && p == NCCL_PROTO_SIMPLE) { + if (remDevMem == NULL) return ncclInternalError; // We should not use read + memcpy /* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */ recv->conn.buffs[p] = (char*)(remDevMem+1); } else { @@ -289,10 +369,6 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn buff += recv->comm->buffSizes[p]; } } - recv->conn.tail = &resources->devMem->tail; - recv->conn.head = &remDevMem->head; - recv->conn.ptrExchange = &remDevMem->ptrExchange; - recv->conn.redOpArgExchange = remDevMem->redOpArgExchange; return ncclSuccess; } @@ -308,11 +384,52 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) { struct p2pRecvResources* resources = 
(struct p2pRecvResources*)recv->transportResources; if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); + if (useMemcpy) { + NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize)); + } free(resources); return ncclSuccess; } -static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { +static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + if (useMemcpy) { + struct p2pProxyInfo* proxyInfo; + NCCLCHECK(ncclCalloc(&proxyInfo, 1)); + connection->transportResources = proxyInfo; + + NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, comm->buffSizes[NCCL_PROTO_SIMPLE])); + + char shmPath[PATH_MAX]; + shmPath[0] = '\0'; + proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem); + NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1)); + TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize); + memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName)); + + NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); + + if (respSize != sizeof(struct p2pProxyInfo)) return ncclInternalError; + memcpy(respBuff, proxyInfo, sizeof(struct p2pProxyInfo)); + } else { + if (reqSize != sizeof(int)) return ncclInternalError; + int size = *((int*)reqBuff); + if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; + struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff; + NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size)); + connection->transportResources = p2pBuff->directPtr; + cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr); + if (res != cudaSuccess) { + WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res)); + cudaFree(p2pBuff->directPtr); + free(p2pBuff); + CUDACHECK(res); + } + } + *done = 1; + return ncclSuccess; +} + +static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(int)) return ncclInternalError; int size = *((int*)reqBuff); if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; @@ -330,15 +447,116 @@ static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct return ncclSuccess; } -static ncclResult_t p2pProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { +static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources; + + if (reqSize != sizeof(void*)) return ncclInternalError; + proxyInfo->recvFifo = *((char**)reqBuff); + + CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); + for (int i=0; ievents+i)); + } + connection->proxyAppendPtr = &connection->proxyAppend; + return ncclSuccess; +} + +static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + if (useMemcpy) { + struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources; + 
NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize)); + NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem)); + CUDACHECK(cudaFree(proxyInfo->ceDevBuff)); + CUDACHECK(cudaStreamDestroy(proxyInfo->stream)); + for (int i=0; ievents[i])); + } + free(proxyInfo); + } else { + // Do not check return code as CUDA may have already shut down + cudaFree(connection->transportResources); + } + return ncclSuccess; +} + +static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { // Do not check return code as CUDA may have already shut down cudaFree(connection->transportResources); return ncclSuccess; } +static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { + if (args->state == ncclProxyOpReady) { + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources); + // Round to next multiple of sliceSteps + sub->base = ROUNDUP(resources->step, args->chunkSteps); + sub->posted = sub->transmitted = sub->done = 0; + } + args->state = ncclProxyOpProgress; + } + args->idle = 1; + if (args->state == ncclProxyOpProgress) { + int p = args->protocol; + int stepSize = comm->buffSizes[p] / NCCL_STEPS; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources); + if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy + resources->step = sub->base + sub->nsteps; + args->done++; + continue; + } + if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) { + int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; + volatile int* sizesFifo = resources->ceRecvMem->sizesFifo; + volatile uint64_t* recvTail = &resources->ceRecvMem->tail; + // Check GPU has sent everything + if ((*recvTail > sub->base+sub->transmitted)) { + int size = sizesFifo[buffSlot]; + CUDACHECK(cudaMemcpyAsync(resources->recvFifo+buffSlot*stepSize, resources->ceDevBuff+buffSlot*stepSize, size, cudaMemcpyDeviceToDevice, resources->stream)); + CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream)); + sub->transmitted += args->sliceSteps; + } + } + if (sub->done < sub->transmitted) { + int buffSlot = (sub->base+sub->done)%NCCL_STEPS; + cudaError_t res = cudaEventQuery(resources->events[buffSlot]); + if (res != cudaErrorNotReady) CUDACHECK(res); + if (res == cudaSuccess) { + sub->done += args->sliceSteps; + // Notify SHM + resources->shm->recvMem.tail = sub->base + sub->done; + } + if (sub->done == sub->nsteps) { + resources->step = sub->base + sub->nsteps; + args->done++; + } + } + } + if (args->done == args->nsubs) { + args->state = ncclProxyOpNone; + } + } + return ncclSuccess; +} + struct ncclTransport p2pTransport = { "P2P", p2pCanConnect, - { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL }, - { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL } + { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL }, + { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL } }; + +static void initCeOperation() { + static int init = 0; + if (!init) { + useMemcpy = ncclParamP2pUseCudaMemcpy(); + if (useMemcpy) { + p2pTransport.send.proxyConnect = p2pSendProxyConnect; + p2pTransport.send.proxyProgress = p2pSendProxyProgress; + } 
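/* Editor's note -- illustrative sketch only. p2pSendProxyProgress above drives an
 * NCCL_STEPS-deep pipeline: 'transmitted' advances once the GPU has published a slot
 * (its tail moved past it) and the proxy has issued the cudaMemcpyAsync plus an event
 * for that slot; 'done' advances when the event completes, at which point the peer is
 * notified. A condensed stand-alone version with hypothetical names (steps advance by
 * one here instead of sliceSteps): */
#include <cuda_runtime.h>
#include <stdint.h>

#define PIPE_STEPS 8   // plays the role of NCCL_STEPS

struct cePipe {
  uint64_t base, transmitted, done, nsteps;
  cudaEvent_t events[PIPE_STEPS];
};

// One progress call; the proxy thread keeps invoking this until it returns 0.
static int ceProgress(struct cePipe* p, volatile uint64_t* gpuTail, cudaStream_t stream,
                      void (*issueCopy)(int slot, cudaStream_t stream),
                      void (*notifyPeer)(uint64_t tail)) {
  if (p->transmitted < p->done + PIPE_STEPS && p->transmitted < p->nsteps
      && *gpuTail > p->base + p->transmitted) {
    int slot = (int)((p->base + p->transmitted) % PIPE_STEPS);
    issueCopy(slot, stream);                      // cudaMemcpyAsync of that slot's data
    cudaEventRecord(p->events[slot], stream);     // mark the copy as in flight
    p->transmitted++;
  }
  if (p->done < p->transmitted) {
    int slot = (int)((p->base + p->done) % PIPE_STEPS);
    if (cudaEventQuery(p->events[slot]) == cudaSuccess) {  // copy for 'slot' finished
      p->done++;
      notifyPeer(p->base + p->done);              // equivalent of bumping recvMem.tail
    }
  }
  return p->done < p->nsteps;
}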
+ init = 1; + } +} diff --git a/src/transport/shm.cc b/src/transport/shm.cc index 974a2ab..4a6120a 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -31,11 +31,21 @@ struct shmRecvResources { struct ncclRecvMem* devHostMem; }; +#define SHM_SEND_SIDE 1 +#define SHM_RECV_SIDE 2 NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0); +NCCL_PARAM(ShmUseCudaMemcpy, "SHM_USE_CUDA_MEMCPY", 0); +NCCL_PARAM(ShmMemcpyMode, "SHM_MEMCPY_MODE", SHM_SEND_SIDE); // 1 is sender-side, 2 is receiver-side, 3 is both +static int useMemcpySend = 0; +static int useMemcpyRecv = 0; +NCCL_PARAM(ShmLocality, "SHM_LOCALITY", SHM_RECV_SIDE); // 1 is sender-size, 2 is receiver-size +static int shmLocality = 0; +static void initCeOperation(); /* Determine two peers can communicate with SHM */ -ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 0; + initCeOperation(); if (ncclParamShmDisable() == 1) return ncclSuccess; @@ -55,7 +65,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop #define MAX_SHM_NAME_LEN 1024 /* Create and return connect structures for this peer to connect to me */ -ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { +static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct shmSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; @@ -65,16 +75,20 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st char shmPath[PATH_MAX]; shmPath[0] = '\0'; - info->shmSize = resources->shmSize = sizeof(struct ncclSendMem); + int shmSize = sizeof(struct ncclSendMem); + if (shmLocality == SHM_SEND_SIDE) { + for (int p=0; pcomm->buffSizes[p]; + } + info->shmSize = resources->shmSize = shmSize; NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize); memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName)); - INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); + INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via SHM/%s/%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct"); return ncclSuccess; } -ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { +static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct shmRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); 
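/* Editor's note -- illustrative sketch only. The three SHM parameters introduced above
 * combine as follows: NCCL_SHM_USE_CUDA_MEMCPY turns the CE path on,
 * NCCL_SHM_MEMCPY_MODE is a bitmask (bit 0 = copy on the sender, bit 1 = copy on the
 * receiver), and NCCL_SHM_LOCALITY picks which side's /dev/shm segment carries the
 * protocol buffers. A hypothetical helper mirroring initCeOperation at the end of
 * this file: */
#define SHM_SEND_SIDE 1
#define SHM_RECV_SIDE 2

static void resolveShmCeConfig(int useCudaMemcpy, int memcpyMode, int locality,
                               int* useMemcpySend, int* useMemcpyRecv, int* shmLocality) {
  *useMemcpySend = useCudaMemcpy && (memcpyMode & 1);   // mode 1 or 3
  *useMemcpyRecv = useCudaMemcpy && (memcpyMode & 2);   // mode 2 or 3
  *shmLocality = (locality == SHM_SEND_SIDE || locality == SHM_RECV_SIDE)
               ? locality : SHM_RECV_SIDE;              // invalid values fall back to receiver side
}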
recv->transportResources = resources; @@ -85,7 +99,9 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st char shmPath[PATH_MAX]; shmPath[0] = '\0'; int shmSize = sizeof(struct ncclRecvMem); - for (int p=0; pcomm->buffSizes[p]; + if (shmLocality == SHM_RECV_SIDE) { + for (int p=0; pcomm->buffSizes[p]; + } info->shmSize = resources->shmSize = shmSize; NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize); @@ -94,8 +110,21 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st return ncclSuccess; } +struct shmProxyInfo { + struct ncclRecvMem* ceRecvMem; + char* devFifo; + char* shmFifo; + struct ncclSendMem* sendMem; + struct ncclRecvMem* recvMem; + + // used by progress only + uint64_t step; + cudaStream_t stream; + cudaEvent_t events[NCCL_STEPS]; +}; + /* Connect to this peer */ -ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { +static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { // Setup device pointers struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; @@ -108,19 +137,29 @@ ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectIn // Remove the file to ensure proper clean-up NCCLCHECK(ncclShmUnlink(shmPath)); - send->transportResources = resources; - int offset = 0; + char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1); for (int p=0; pconn.buffs[p] = (char*)(resources->devRemHostMem+1) + offset; - offset += send->comm->buffSizes[p]; + send->conn.buffs[p] = buff; + buff += send->comm->buffSizes[p]; } send->conn.tail = &resources->devRemHostMem->tail; - send->conn.head = &resources->devHostMem->head; + + if (useMemcpyRecv) { + send->conn.sizesFifo = resources->devRemHostMem->sizesFifo; + } + if (useMemcpySend) { + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn)); + struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem }; + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); + send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; + send->conn.tail = &proxyInfo.ceRecvMem->tail; + send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo; + } return ncclSuccess; } -ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { +static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { // Setup device pointers struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; @@ -131,18 +170,26 @@ ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); NCCLCHECK(ncclShmUnlink(shmPath)); - 
recv->conn.head = &resources->devRemHostMem->head; - int offset = 0; + char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1); for (int p=0; pconn.buffs[p] = (char*)(resources->devHostMem+1) + offset; - offset += recv->comm->buffSizes[p]; + recv->conn.buffs[p] = buff; + buff += recv->comm->buffSizes[p]; } + recv->conn.head = &resources->devRemHostMem->head; recv->conn.tail = &resources->devHostMem->tail; + + if (useMemcpyRecv) { + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn)); + struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem }; + NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); + recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; + recv->conn.tail = &proxyInfo.ceRecvMem->tail; + } return ncclSuccess; } -ncclResult_t shmSendFree(struct ncclConnector* send) { +static ncclResult_t shmSendFree(struct ncclConnector* send) { struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources; NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); @@ -150,7 +197,7 @@ ncclResult_t shmSendFree(struct ncclConnector* send) { return ncclSuccess; } -ncclResult_t shmRecvFree(struct ncclConnector* recv) { +static ncclResult_t shmRecvFree(struct ncclConnector* recv) { struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); @@ -158,9 +205,209 @@ ncclResult_t shmRecvFree(struct ncclConnector* recv) { return ncclSuccess; } +static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct shmProxyInfo* proxyInfo; + NCCLCHECK(ncclCalloc(&proxyInfo, 1)); + if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError; + memcpy(proxyInfo, reqBuff, reqSize); + NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE])); + NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); + CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); + for (int i=0; ievents+i)); + } + connection->proxyAppendPtr = &connection->proxyAppend; + connection->transportResources = proxyInfo; + if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError; + memcpy(respBuff, proxyInfo, respSize); + return ncclSuccess; +} + +static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct shmProxyInfo* proxyInfo; + NCCLCHECK(ncclCalloc(&proxyInfo, 1)); + if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError; + memcpy(proxyInfo, reqBuff, reqSize); + NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE])); + NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); + CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); + for (int i=0; ievents+i)); + } + connection->proxyAppendPtr = &connection->proxyAppend; + connection->transportResources = 
proxyInfo; + if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError; + memcpy(respBuff, proxyInfo, respSize); + return ncclSuccess; +} + +static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources; + CUDACHECK(cudaStreamDestroy(resources->stream)); + CUDACHECK(cudaFree(resources->devFifo)); + NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); + for (int i=0; ievents[i])); + } + free(connection->transportResources); + return ncclSuccess; +} + +static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources; + CUDACHECK(cudaStreamDestroy(resources->stream)); + CUDACHECK(cudaFree(resources->devFifo)); + NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); + for (int i=0; ievents[i])); + } + free(connection->transportResources); + return ncclSuccess; +} + +static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { + if (args->state == ncclProxyOpReady) { + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources); + // Round to next multiple of sliceSteps + sub->base = ROUNDUP(resources->step, args->chunkSteps); + sub->posted = sub->transmitted = sub->done = 0; + } + args->state = ncclProxyOpProgress; + } + args->idle = 1; + if (args->state == ncclProxyOpProgress) { + int p = args->protocol; + int stepSize = comm->buffSizes[p] / NCCL_STEPS; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources); + if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy + resources->step = sub->base + sub->nsteps; + args->done++; + continue; + } + if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) { + int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; + volatile int* sizesFifo = resources->ceRecvMem->sizesFifo; + volatile uint64_t* recvTail = &resources->ceRecvMem->tail; + // Check GPU has sent everything + if ((*recvTail > sub->base+sub->transmitted)) { + int size = sizesFifo[buffSlot]; + CUDACHECK(cudaMemcpyAsync(resources->shmFifo+buffSlot*stepSize, resources->devFifo+buffSlot*stepSize, size, cudaMemcpyDeviceToHost, resources->stream)); + CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream)); + resources->recvMem->sizesFifo[buffSlot] = size; + __sync_synchronize(); // make sure sizesFifo is visible + sub->transmitted += args->sliceSteps; + } + } + if (sub->done < sub->transmitted) { + int buffSlot = (sub->base+sub->done)%NCCL_STEPS; + cudaError_t res = cudaEventQuery(resources->events[buffSlot]); + if (res != cudaErrorNotReady) CUDACHECK(res); + if (res == cudaSuccess) { + sub->done += args->sliceSteps; + // Notify SHM + resources->recvMem->tail = sub->base + sub->done; + } + if (sub->done == sub->nsteps) { + resources->step = sub->base + sub->nsteps; + args->done++; + } + } + } + if (args->done == args->nsubs) { + args->state = ncclProxyOpNone; + } + } + return ncclSuccess; +} + +static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { + if (args->state == ncclProxyOpReady) { + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct shmProxyInfo* 
resources = (struct shmProxyInfo*) (sub->connection->transportResources); + // Round to next multiple of sliceSteps + sub->base = ROUNDUP(resources->step, args->chunkSteps); + sub->posted = sub->transmitted = sub->done = 0; + } + args->state = ncclProxyOpProgress; + } + args->idle = 1; + if (args->state == ncclProxyOpProgress) { + int p = args->protocol; + int stepSize = comm->buffSizes[p] / NCCL_STEPS; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources); + if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy + resources->step = sub->base + sub->nsteps; + args->done++; + continue; + } + if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) { + int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; + volatile int* sizesFifo = resources->recvMem->sizesFifo; + volatile uint64_t* recvTail = &resources->recvMem->tail; + // Check data is ready in SHM + if ((*recvTail > sub->base+sub->transmitted)) { + int size = sizesFifo[buffSlot]; + CUDACHECK(cudaMemcpyAsync(resources->devFifo+buffSlot*stepSize, resources->shmFifo+buffSlot*stepSize, size, cudaMemcpyHostToDevice, resources->stream)); + CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream)); + sub->transmitted += args->sliceSteps; + } + } + if (sub->done < sub->transmitted) { + int buffSlot = (sub->base+sub->done)%NCCL_STEPS; + cudaError_t res = cudaEventQuery(resources->events[buffSlot]); + if (res != cudaErrorNotReady) CUDACHECK(res); + if (res == cudaSuccess) { + sub->done += args->sliceSteps; + // Notify GPU + resources->ceRecvMem->tail = sub->base + sub->done; + } + if (sub->done == sub->nsteps) { + resources->step = sub->base + sub->nsteps; + args->done++; + } + } + } + if (args->done == args->nsubs) { + args->state = ncclProxyOpNone; + } + } + return ncclSuccess; +} + struct ncclTransport shmTransport = { "SHM", shmCanConnect, { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL }, { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL } }; + +static void initCeOperation() { + static int init = 0; + if (!init) { + useMemcpySend = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 1); + useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 2); + if (useMemcpySend) { + shmTransport.send.proxyConnect = shmSendProxyConnect; + shmTransport.send.proxyFree = shmSendProxyFree; + shmTransport.send.proxyProgress = shmSendProxyProgress; + } + if (useMemcpyRecv) { + shmTransport.recv.proxyConnect = shmRecvProxyConnect; + shmTransport.recv.proxyFree = shmRecvProxyFree; + shmTransport.recv.proxyProgress = shmRecvProxyProgress; + } + shmLocality = ncclParamShmLocality(); + if (shmLocality != SHM_SEND_SIDE && shmLocality != SHM_RECV_SIDE) { + WARN("Ignoring SHM locality, must be 1 (sender side) or 2 (receiver side, default)"); + shmLocality = SHM_RECV_SIDE; + } + init = 1; + } +}
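/* Editor's note -- illustrative sketch only. All switches added by this patch are
 * NCCL_PARAMs, so (assuming the usual "NCCL_" prefix applied by NCCL_PARAM) the
 * relevant environment variables are NCCL_P2P_USE_CUDA_MEMCPY,
 * NCCL_SHM_USE_CUDA_MEMCPY, NCCL_SHM_MEMCPY_MODE and NCCL_SHM_LOCALITY, with the
 * defaults shown in the hunks above. A rough sketch of the lookup (the real
 * NCCL_PARAM macro also caches the parsed value): */
#include <stdlib.h>

static long envParamOr(const char* name, long deflt) {
  const char* s = getenv(name);             // e.g. "NCCL_SHM_MEMCPY_MODE"
  return s ? strtol(s, NULL, 0) : deflt;
}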