From c4e2aa6c792b4e94d0343c72ce20e71285238827 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Thu, 18 Aug 2022 02:53:17 -0700
Subject: [PATCH] 2.14.3-1

Add support for improved fault tolerance: non-blocking mode, new init
function with config, and ncclCommFinalize function.
Reintroduce collnet+chain algorithm, alongside collnet+direct.
Add LL protocol for intra-node P2P (on by default) and network
communication (off by default).
Use network instead of shared memory when performance is better.
Fix: wait for CUDA graph destroy before destroying comm with linked
graph resources.
Remove aggressive polling during enqueue.
Fix DMABUF fallback on MOFED 5.4 and earlier.
---
 makefiles/version.mk                 |   4 +-
 src/bootstrap.cc                     |  29 +-
 src/channel.cc                       |   5 +-
 src/collectives/device/all_reduce.h  | 115 ++++--
 src/collectives/device/common.h      |   5 +-
 src/collectives/device/functions.cu  |   3 +-
 src/collectives/device/prims_ll.h    |  21 +-
 src/collectives/device/prims_ll128.h |  14 +-
 src/collectives/device/sendrecv.h    |  29 +-
 src/enqueue.cc                       | 255 ++++++------
 src/graph/connect.cc                 |  35 +-
 src/graph/paths.cc                   |  92 +++--
 src/graph/search.cc                  | 174 ++++-----
 src/graph/topo.cc                    |  69 ++--
 src/graph/topo.h                     |  54 +--
 src/graph/tuning.cc                  |  63 ++-
 src/group.cc                         | 380 +++++++++++++-----
 src/include/alloc.h                  |  24 +-
 src/include/checks.h                 |  10 +-
 src/include/collectives.h            |   7 +-
 src/include/comm.h                   |  39 +-
 src/include/cudawrap.h               |  32 +-
 src/include/devcomm.h                |  12 +-
 src/include/graph.h                  |   6 +-
 src/include/group.h                  |  33 +-
 src/include/info.h                   |   3 +-
 src/include/proxy.h                  |   2 +
 src/init.cc                          | 565 +++++++++++++++++++++------
 src/misc/cudawrap.cc                 |  95 ++---
 src/misc/gdrwrap.cc                  |  29 +-
 src/misc/ibvwrap.cc                  |  36 +-
 src/misc/shmutils.cc                 |  11 +-
 src/misc/socket.cc                   |  69 ++--
 src/nccl.h.in                        |  39 +-
 src/net.cc                           |   8 +-
 src/proxy.cc                         | 108 +++--
 src/transport/coll_net.cc            |  78 ++--
 src/transport/net.cc                 |  56 ++-
 src/transport/net_ib.cc              |  11 +-
 src/transport/net_socket.cc          |   3 +-
 src/transport/p2p.cc                 |  56 ++-
 src/transport/shm.cc                 |  50 ++-
 42 files changed, 1787 insertions(+), 942 deletions(-)

diff --git a/makefiles/version.mk b/makefiles/version.mk
index 496796a..55fa6cc 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
-NCCL_MINOR := 13
-NCCL_PATCH := 4
+NCCL_MINOR := 14
+NCCL_PATCH := 3
 NCCL_SUFFIX :=
 PKG_REVISION := 1
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index 0ba89a5..b7e0576 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -104,8 +104,8 @@ static void *bootstrapRoot(void* args) {
   /* Receive addresses from all ranks */
   do {
     struct ncclSocket sock;
-    sock.abortFlag = NULL; /* bootstrap root thread always uses blocking ncclSocketAccept. */
+    NCCLCHECKGOTO(ncclSocketInit(&sock, NULL, NULL, 0), res, out);
     NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
     NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
     close(sock.fd);
@@ -228,16 +228,17 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
   info.rank = rank;
   info.nranks = nranks;
   struct ncclSocket sock, listenSockRoot;
-  sock.abortFlag = listenSockRoot.abortFlag = comm->abortFlag;
-  sock.asyncFlag = listenSockRoot.asyncFlag = 0;
+  NCCLCHECK(ncclSocketInit(&sock, (union ncclSocketAddress*) id, comm->abortFlag, 0));
+  NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->abortFlag, 0));
+  NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->abortFlag, 0));
+  NCCLCHECK(ncclSocketInit(&state->ringSendSocket, NULL, comm->abortFlag, 0));
+  NCCLCHECK(ncclSocketInit(&state->ringRecvSocket, NULL, comm->abortFlag, 0));

   // Create socket for other ranks to contact me
-  memcpy(&state->listenSock.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
   NCCLCHECK(ncclSocketListen(&state->listenSock));
   memcpy(&info.extAddressListen, &state->listenSock.addr, sizeof(union ncclSocketAddress));

   // Create socket for root to contact me
-  memcpy(&listenSockRoot.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
   NCCLCHECK(ncclSocketListen(&listenSockRoot));
   memcpy(&info.extAddressListenRoot, &listenSockRoot.addr, sizeof(union ncclSocketAddress));
@@ -252,7 +253,6 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
   }

   // send info on my listening socket to root
-  memcpy(&sock.addr, id, sizeof(union ncclSocketAddress));
   NCCLCHECK(ncclSocketConnect(&sock));
   NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info)));
   close(sock.fd);
@@ -276,8 +276,7 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
   NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
   struct ncclSocket* proxySocket;
   NCCLCHECK(ncclCalloc(&proxySocket, 1));
-  proxySocket->abortFlag = NULL; // proxy is aborted through a message
-  memcpy(&proxySocket->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, NULL, 0));
   NCCLCHECK(ncclSocketListen(proxySocket));
   memcpy(state->peerProxyAddresses+rank, &proxySocket->addr, sizeof(union ncclSocketAddress));
   NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
@@ -317,9 +316,8 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
 ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
   struct bootstrapState* state = (struct bootstrapState*)commState;
   struct ncclSocket sock;
-  sock.abortFlag = state->abortFlag;
-  sock.asyncFlag = 0;
-  memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress));
+
+  NCCLCHECK(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->abortFlag, 1));
   NCCLCHECK(ncclSocketConnect(&sock));
   NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
   NCCLCHECK(bootstrapNetSend(&sock, &tag, sizeof(int)));
@@ -408,9 +406,7 @@ ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag,
 // We can't know who we'll receive from, so we need to receive everything at once
 ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
   struct bootstrapState* state = (struct bootstrapState*)commState;
-  struct ncclSocket sock;
-  sock.abortFlag = state->abortFlag;
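
// [Sketch, not part of the patch] The new ncclSocketInit() helper replaces the
// scattered manual field setup removed above. Judging from the call sites in
// this diff, it takes the socket, an optional address, an abort flag, and an
// async flag, and presumably resets the fd so unconnected sockets can be told
// apart. Typical init-then-connect usage would look like:
//
//   struct ncclSocket sock;
//   NCCLCHECK(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->abortFlag, /*asyncFlag=*/1));
//   NCCLCHECK(ncclSocketConnect(&sock));
//   NCCLCHECK(bootstrapNetSend(&sock, data, size));
//   close(sock.fd);
//
// This is also presumably why bootstrapClose() below can guard each close()
// with fd >= 0: sockets that were initialized but never connected stay at -1.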
  // Search unexpected connections first
  NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock));
@@ -421,6 +417,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int s
   }

   // Then look for new connections
+  NCCLCHECK(ncclSocketInit(&sock, NULL, state->listenSock.abortFlag, 0));
   while (1) {
     NCCLCHECK(ncclSocketAccept(&sock, &state->listenSock));
     int newPeer, newTag;
@@ -442,9 +439,9 @@ ncclResult_t bootstrapClose(void* commState) {
     WARN("Unexpected connections are not empty");
     return ncclInternalError;
   }
-  close(state->listenSock.fd);
-  close(state->ringSendSocket.fd);
-  close(state->ringRecvSocket.fd);
+  if (state->listenSock.fd >= 0) close(state->listenSock.fd);
+  if (state->ringSendSocket.fd >= 0) close(state->ringSendSocket.fd);
+  if (state->ringRecvSocket.fd >= 0) close(state->ringRecvSocket.fd);

   free(state->peerCommAddresses);
   free(state);
diff --git a/src/channel.cc b/src/channel.cc
index 4d28a68..9587008 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -40,7 +40,10 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
 }

 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
-  if (channel->id == -1) return ncclSuccess;
+  /* Channel peers are only valid once the async init thread has completed
+   * commAlloc() and the channel has been initialized with initChannel(); if
+   * either has not happened, this channel must never be freed. */
+  if (channel->id == -1 || channel->peers == NULL) return ncclSuccess;

   // Free transport proxy resources
   // Note: free all send resources first due to CollNet arrangement
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index 23f6d0a..3f12e5e 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -274,19 +274,19 @@ struct RunWorkElement
-struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE> {
+struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
   __device__ __forceinline__ void run(ncclWorkElem *args) {
     static constexpr int COLLNET_COPY_THREADS = 96;
     const int tid = threadIdx.x;
     const int bid = args->bid;
     const int nChannels = args->nChannels;
-    struct ncclDirect* tree = &ncclShmem.channel.collTree;
+    struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
     const ssize_t chunkSize = int(args->lastChunkSize);
     const ssize_t size = args->count;
-    const ssize_t loopSize = nChannels*tree->nHeads*chunkSize;
+    const ssize_t loopSize = nChannels*direct->nHeads*chunkSize;

-    const int hasUp = (tree->up[0] >= 0) ? 1 : 0;
-    const int hasDn = (tree->down[0] >= 0) ? 1 : 0;
+    const int hasUp = (direct->up[0] >= 0) ? 1 : 0;
+    const int hasDn = (direct->down[0] >= 0) ? 1 : 0;

     const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 3*COLLNET_COPY_THREADS : 0);
     const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
     const int nThreadsBcast = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ?
0 : 2*COLLNET_COPY_THREADS); @@ -301,24 +301,24 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize; - int nelem = min(tree->nHeads*chunkSize, size-offset); + ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; + int nelem = min(direct->nHeads*chunkSize, size-offset); if (args->regUsed) { - prims.directScatter(offset, nelem, chunkSize, tree->headRank, tree->shift); + prims.directScatter(offset, nelem, chunkSize, direct->headRank, direct->shift); } else { - prims.scatter(offset, nelem, chunkSize, tree->headRank, tree->shift); + prims.scatter(offset, nelem, chunkSize, direct->headRank, direct->shift); } } - } else if (tid >= tidStartReduce && tree->out != -1) { + } else if (tid >= tidStartReduce && direct->out != -1) { int group = (3*Proto::MaxGroupWidth) | (1<<16); if (hasDn) { // Reduce, send to network Primitives, /*Direct=*/1, Proto, 0> - prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; + ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); if (args->regUsed) { prims.directRecvReduceSend(offset, offset, nelem); @@ -329,9 +329,9 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group); + prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; + ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.send(offset, nelem); } @@ -340,29 +340,29 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize; - int nelem = min(tree->nHeads*chunkSize, size-offset); - prims.directGather(offset, nelem, chunkSize, tree->headRank, tree->shift); + ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; + int nelem = min(direct->nHeads*chunkSize, size-offset); + prims.directGather(offset, nelem, chunkSize, direct->headRank, direct->shift); } - } else if (tid >= tidStartBcast && tid < tidStartScatter && tree->out != -1) { + } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) { int group = (1*Proto::MaxGroupWidth) | (0<<16); if (hasDn) { // Recv from network, broadcast Primitives, /*Direct=*/1, Proto, 0> - prims(tid-tidStartBcast, nThreadsBcast, 
&tree->out, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; + ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recvCopyDirectSend(offset, offset, nelem, /*postOp=*/true); } } else { // Recv from network (no post thread needed) Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group); + prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; + ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recv(offset, nelem, /*postOp=*/true); } @@ -371,6 +371,73 @@ struct RunWorkElement +struct RunWorkElement { + __device__ __forceinline__ void run(ncclWorkElem *args) { + const int tid = threadIdx.x; + const int nthreads = args->nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; + ncclTree *tree = &ncclShmem.channel.collnetChain; + ssize_t chunkSize = int(args->lastChunkSize); + const ssize_t loopSize = int(nChannels*chunkSize); + const ssize_t size = args->count; + + int nthreadsSplit = nthreads/2; + if (nthreadsSplit >= 256) nthreadsSplit += 64; + + int group, send, recv, groupTid, groupNthreads; + using Proto = ProtoSimple<1, 1>; + if (tid < nthreadsSplit) { + group = (0*Proto::MaxGroupWidth) | (1<<16); + recv = tree->down[0]; + send = tree->up; + groupTid = tid; + groupNthreads = nthreadsSplit; + } else { + group = (1*Proto::MaxGroupWidth); + recv = tree->up; + send = tree->down[0]; + groupTid = tid - nthreadsSplit; + groupNthreads = nthreads-nthreadsSplit; + } + + Primitives, /*Direct=*/1, Proto, 0> + prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, args->redOpArg, group); + + if (tid < nthreadsSplit) { + if (recv == -1) { + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid*int(chunkSize); + int nelem = min(chunkSize, size-offset); + prims.send(offset, nelem); + } + } else { + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid*int(chunkSize); + int nelem = min(chunkSize, size-offset); + prims.recvReduceSend(offset, nelem); + } + } + } + else { + if (send == -1) { + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid*int(chunkSize); + int nelem = min(chunkSize, size-offset); + prims.directRecv(offset, nelem); + } + } else { + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid*int(chunkSize); + int nelem = min(chunkSize, size-offset); + prims.directRecvCopySend(offset, offset, nelem); + } + } + } + } +}; + template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index ab333b4..310938f 100644 --- a/src/collectives/device/common.h +++ 
b/src/collectives/device/common.h @@ -231,7 +231,8 @@ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \ #define IMPL_COLL3(func, devredop, type, ncclType) \ IMPL_COLL4(func, TREE, devredop, type, ncclType) \ IMPL_COLL4(func, RING, devredop, type, ncclType) \ - IMPL_COLL4(func, COLLNET, devredop, type, ncclType) + IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \ + IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType) #if NCCL_TYPE == 0 #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int8_t, ncclInt8) @@ -281,7 +282,7 @@ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \ // Point-to-point primitives only have one function/kernel. #define IMPL_COLL_P(func) \ IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \ - IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, 0); + IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P); #else #define IMPL_COLL_C(func) #define IMPL_COLL_P(func) diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index c7060f3..7c36064 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -18,7 +18,8 @@ __shared__ ncclShmemData ncclShmem; #define NCCL_FUNC4(func, devredop, type, nullify) \ NCCL_FUNC5(func, TREE, devredop, type, nullify), \ NCCL_FUNC5(func, RING, devredop, type, nullify), \ - NCCL_FUNC5(func, COLLNET, devredop, type, nullify) + NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \ + NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify) #if defined(__CUDA_BF16_TYPES_EXIST__) // Must be consistent with ncclDataType_t diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h index e8cc8e3..60f64ff 100644 --- a/src/collectives/device/prims_ll.h +++ b/src/collectives/device/prims_ll.h @@ -8,7 +8,10 @@ template class Primitives: public PrimitivesWithoutDirect> { - static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; + // In the case of Fan::MaxRecv == 0, we need to force MaxRecv to 1 for this to compile + // This is because of a recv buffer which is allocated to MaxRecv length in send-only cases + static constexpr int MaxRecv = Fan::MaxRecv > 1 ? 
Fan::MaxRecv : 1; + static constexpr int MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; RedOp redOp; const int tid; @@ -41,7 +44,10 @@ class Primitives: inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } inline __device__ void barrier() { - asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); + if (nthreads == WARP_SIZE) + __syncwarp(); + else + asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); } uint32_t abort = 0; @@ -319,18 +325,19 @@ class Primitives: void const *inputBuf, void *outputBuf, uint64_t redOpArg, int group=0 ): redOp(redOpArg), - tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), + tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group&(uint16_t)0xFFFF), stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) { - + int connIndex = group >> 16; auto *channel = &ncclShmem.channel; // If we are going to support oneshot collNet + LL, then we would need to add connector index here int nrecv=0, nsend=0; - while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { - loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv); + // We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1 + while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) { + loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { - loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend); + loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend); nsend++; } this->fan = Fan(nrecv, nsend); diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h index 93b6b4f..3136940 100644 --- a/src/collectives/device/prims_ll128.h +++ b/src/collectives/device/prims_ll128.h @@ -193,7 +193,8 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - } while (__any_sync(WARP_MASK, needReload) && checkAbort(spins, 0, 0) == 0); + needReload &= (0 == checkAbort(spins, 0, 0)); + } while (__any_sync(WARP_MASK, needReload)); } /************* Finish register load **************/ @@ -234,7 +235,8 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - } while (__any_sync(WARP_MASK, needReload) && checkAbort(spins, i, 0) == 0); + needReload &= (0 == checkAbort(spins, i, 0)); + } while (__any_sync(WARP_MASK, needReload)); #pragma unroll for (int u=0; u> 16; auto *channel = &ncclShmem.channel; int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { - loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv); + loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { - loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend); + loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend); nsend++; } this->fan = Fan(nrecv, nsend); diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h index feae653..ec1e20c 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/collectives/device/sendrecv.h @@ -10,7 +10,8 @@ template struct RunWork { - __device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { + template + __device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { void* buff = 
reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); size_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); if (args->peer == ncclShmem.comm.rank) { @@ -20,8 +21,8 @@ struct RunWork { ReduceOrCopyMulti(tid, nthreads, nullptr, false, 1, (const T**)&buff, 1, (T**)&recvBuff, count); } } else { - using Proto = ProtoSimple<1, 1>; - int const chunkSize = args->chunkSize/sizeof(T); + int chunkSize = args->chunkSize/sizeof(T); + if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; int const peer = args->peer; Primitives, 1, Proto, 1> prims (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group); @@ -34,12 +35,13 @@ struct RunWork { } } - __device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { + template + __device__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { if (args->peer != ncclShmem.comm.rank) { - using Proto = ProtoSimple<1, 1>; void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); - int const chunkSize = args->chunkSize/sizeof(T); + int chunkSize = args->chunkSize/sizeof(T); + if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize int const peer = args->peer; Primitives, 1, Proto, 1> prims (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group); @@ -70,10 +72,21 @@ struct RunWork { if (args->p2pType == ncclWorkP2pTypeUnused) return; if (tid >= nthreads || args->peer == -1) return; + + // Select Proto here + // This is to allow the same kernel to run multiple primitives on different warps (thread groups) if ((group%2) == 0) { - runRecv(tid, nthreads, group, args); + if (args->proto == NCCL_PROTO_LL) { + runRecv(tid, nthreads, group, args); + } else { + runRecv>(tid, nthreads, group, args); + } } else { - runSend(tid, nthreads, group, args); + if (args->proto == NCCL_PROTO_LL) { + runSend(tid, nthreads, group, args); + } else { + runSend>(tid, nthreads, group, args); + } } } }; diff --git a/src/enqueue.cc b/src/enqueue.cc index d3fbbe5..7c6d835 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -17,102 +17,75 @@ static void* const ncclKernelGeneric = (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t); +struct ncclKernelMatch { + void* kernelFn; + bool specialized; +}; + // Only generate inline kernels for LL -#define NCCL_FUNC5(func, algo, devredop, dtype) \ - /*LL */(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \ - /*LL128 */nullptr /*(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype)*/, \ - /*SIMPLE*/nullptr /*(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype)*/ +#define NCCL_FUNC5(func, algo, devredop, dtype, specialized) \ + /*LL */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), true && specialized}, \ + /*LL128 */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized}, \ + /*SIMPLE*/{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized} -#define NCCL_FUNC4(func, devredop, type) \ - (void*)NCCL_FUNC5(func, TREE, devredop, type), \ - (void*)NCCL_FUNC5(func, RING, devredop, type), \ - (void*)NCCL_FUNC5(func, COLLNET, devredop, type) +#define NCCL_FUNC4(func, devredop, type, specialized) \ + NCCL_FUNC5(func, TREE, devredop, type, specialized), \ + NCCL_FUNC5(func, RING, devredop, type, specialized), \ + NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \ + 
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, specialized) -#if defined(__CUDA_BF16_TYPES_EXIST__) -// Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(func, devredop) \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, uint8_t), \ - (void*)NCCL_FUNC4(func, devredop, int32_t), \ - (void*)NCCL_FUNC4(func, devredop, uint32_t), \ - (void*)NCCL_FUNC4(func, devredop, int64_t), \ - (void*)NCCL_FUNC4(func, devredop, uint64_t), \ - (void*)NCCL_FUNC4(func, devredop, half), \ - (void*)NCCL_FUNC4(func, devredop, float), \ - (void*)NCCL_FUNC4(func, devredop, double), \ - (void*)NCCL_FUNC4(func, devredop, __nv_bfloat16) -#define NCCL_FUNCS3B(func, devredop) \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t) +#ifdef __CUDA_BF16_TYPES_EXIST__ + #define HAVE_BFLOAT16 1 #else -// Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(func, devredop) \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, uint8_t), \ - (void*)NCCL_FUNC4(func, devredop, int32_t), \ - (void*)NCCL_FUNC4(func, devredop, uint32_t), \ - (void*)NCCL_FUNC4(func, devredop, int64_t), \ - (void*)NCCL_FUNC4(func, devredop, uint64_t), \ - (void*)NCCL_FUNC4(func, devredop, half), \ - (void*)NCCL_FUNC4(func, devredop, float), \ - (void*)NCCL_FUNC4(func, devredop, double) -#define NCCL_FUNCS3B(func, devredop) \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t) + #define HAVE_BFLOAT16 0 #endif +// Must be consistent with ncclDataType_t +#define NCCL_FUNCS3(func, devredop, reduction, specialized) \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int8_t, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint8_t, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int32_t, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint32_t, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int64_t, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint64_t, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, half, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, float, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, double, int8_t), specialized) \ + MACRO_IF(HAVE_BFLOAT16, \ + SINGLE_ARG(, NCCL_FUNC4(func, devredop, MACRO_IF(reduction, __nv_bfloat16, int8_t), specialized)), \ + /*nothing*/ \ + ) + // Must be consistent with ncclDevRedOp_t -- but we only generate kernel for sums. 
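
// [Sketch, not part of the patch] How the new {kernelFn, specialized} pairs
// are consumed: only LL kernels are compiled inline here, so the LL slot is
// marked specialized while the LL128/SIMPLE slots alias the LL symbol (keeping
// cudaFuncGetAttributes usable) but stay unspecialized. The planner then keeps
// the first specialized kernel it encounters and otherwise launches the
// generic entry, as in the scheduleCollTasksToPlan() change later in this
// patch:
//
//   if (!plan->kernelSpecialized) {
//     plan->kernelFn = ncclKerns[workFuncIndex].kernelFn;
//     plan->kernelSpecialized = ncclKerns[workFuncIndex].specialized;
//   }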
-#define NCCL_FUNCS2A(func) \ - NCCL_FUNCS3A(func, Sum), /*Sum*/ \ - NCCL_FUNCS3A(func, Sum), /*Prod*/ \ - NCCL_FUNCS3A(func, Sum), /*Max*/ \ - NCCL_FUNCS3A(func, Sum), /*Min*/ \ - NCCL_FUNCS3A(func, Sum), /*PreMulSum*/ \ - NCCL_FUNCS3A(func, Sum) /*SumPostDiv*/ -#define NCCL_FUNCS2B(func) \ - NCCL_FUNCS3B(func, Sum), /*Sum*/ \ - NCCL_FUNCS3B(func, Sum), /*Prod*/ \ - NCCL_FUNCS3B(func, Sum), /*Max*/ \ - NCCL_FUNCS3B(func, Sum), /*Min*/ \ - NCCL_FUNCS3B(func, Sum), /*PreMulSum*/ \ - NCCL_FUNCS3B(func, Sum) /*SumPostDiv*/ +#define NCCL_FUNCS2(func, reduction) \ + NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/1), /*Sum*/ \ + NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Prod*/ \ + NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Max*/ \ + NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Min*/ \ + NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*PreMulSum*/ \ + NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0) /*SumPostDiv*/ // Must be consistent with the ncclFuncSet enum -static void* const ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { - (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), +static const ncclKernelMatch ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { + {(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), true}, // We don't bake special kernels for the one-rank reductions - /*int8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*uint8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*int32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*uint32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*int64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*uint64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*half*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*float*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*double*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - #if defined(__CUDA_BF16_TYPES_EXIST__) - /*bfloat16*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + {/*int8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*uint8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*int32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*uint32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*int64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*uint64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*half*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*float*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*double*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + #if HAVE_BFLOAT16 + {/*bfloat16*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, #endif - NCCL_FUNCS2B(Broadcast), - NCCL_FUNCS2A(Reduce), - NCCL_FUNCS2B(AllGather), - NCCL_FUNCS2A(ReduceScatter), - NCCL_FUNCS2A(AllReduce) + NCCL_FUNCS2(Broadcast, /*reduction=*/0), + NCCL_FUNCS2(Reduce, /*reduction=*/1), + NCCL_FUNCS2(AllGather, /*reduction=*/0), + NCCL_FUNCS2(ReduceScatter, /*reduction=*/1), + NCCL_FUNCS2(AllReduce, /*reduction=*/1) }; static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, 
struct ncclProxyOp* proxyOp /* output */); @@ -124,10 +97,8 @@ size_t ncclKernMaxLocalSize() { cudaFuncAttributes attr = {0}; size_t max = 0; for (int i = 0; i < numNcclKerns; i++) { - if (ncclKerns[i] != nullptr) { - CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i]), res, error); - if (attr.localSizeBytes > max) max = attr.localSizeBytes; - } + CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i].kernelFn), res, error); + if (attr.localSizeBytes > max) max = attr.localSizeBytes; } error: @@ -139,7 +110,7 @@ ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut) { ncclResult_t res = ncclSuccess; int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]); for (int i = 0; i < numNcclKerns; i++) { - CUDACHECKGOTO(cudaFuncSetAttribute(ncclKerns[i], cudaFuncAttributePreferredSharedMemoryCarveout, carveOut), res, error); + CUDACHECKGOTO(cudaFuncSetAttribute(ncclKerns[i].kernelFn, cudaFuncAttributePreferredSharedMemoryCarveout, carveOut), res, error); } error: @@ -331,14 +302,14 @@ static ncclResult_t addCollToPlan( workElemReg.elem = *workElem; // C++ struct assignment workElemReg.elem.regUsed = 1; for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { - int peer = channel->collTree.down[i]; + int peer = channel->collnetDirect.down[i]; if (peer == -1) break; int j = comm->rankToLocalRank[peer]; // Get intra-node slot workElemReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer workElemReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer } for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { - int peer = channel->collTree.up[i]; + int peer = channel->collnetDirect.up[i]; if (peer == -1) break; int j = comm->rankToLocalRank[peer]; // Output buffer of root peer @@ -360,6 +331,8 @@ static ncclResult_t addCollToPlan( return ncclSuccess; } +NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384); + // Put p2p op in plan assuming there is space in nWorkBudget, so you must // ensure *nWorkBudget >= 1 upon entry. static ncclResult_t addP2pToPlan( @@ -377,10 +350,16 @@ static ncclResult_t addP2pToPlan( NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, info.coll, &channelId)); info.channelId = channelId; + // 1 is connIndex + struct ncclConnInfo* conn = isSendNotRecv ? + &comm->channels[channelId].peers[peer].send[1].conn : &comm->channels[channelId].peers[peer].recv[1].conn; + info.protocol = ((conn->buffs[NCCL_PROTO_LL] != nullptr) && bytes <= ncclParamP2pLLThreshold()) ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE; + struct ncclProxyOp proxyOp = {}; NCCLCHECK(ncclProxyComputeP2p(&info, &proxyOp)); struct ncclWorkElemP2p elem = {0}; + elem.proto = info.protocol; elem.peer = peer; elem.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE; elem.p2pType = isSendNotRecv ? 
ncclWorkP2pTypeSend : ncclWorkP2pTypeRecv; @@ -421,8 +400,6 @@ static void finishPlan(struct ncclKernelPlan* plan) { plan->channelCount = channelCount; plan->channelMask = channelMask; plan->hasProxyOps = hasProxyOps; - if (plan->kernelFn == nullptr) - plan->kernelFn = ncclKernelGeneric; plan->threadPerBlock = std::max(plan->threadPerBlock, 3*WARP_SIZE); } @@ -582,7 +559,7 @@ static ncclResult_t scheduleCollTasksToPlan( void* regBufSend[NCCL_MAX_LOCAL_RANKS]; void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; if (plan->persistent && ncclParamGraphRegister() && - info.algorithm == NCCL_ALGO_COLLNET && // limited to CollNet for now + info.algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other comm->intraRanks < comm->localRanks) { // only with inter-process & intra-node peers NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, ®BufUsed, regBufSend, regBufRecv)); @@ -596,8 +573,10 @@ static ncclResult_t scheduleCollTasksToPlan( head = ncclIntruQueueHead(&tasks->collQueue); plan->threadPerBlock = std::max(plan->threadPerBlock, info.nThreads); - if (ncclKerns[workFuncIndex] != nullptr) - plan->kernelFn = ncclKerns[workFuncIndex]; + if (!plan->kernelSpecialized) { + plan->kernelFn = ncclKerns[workFuncIndex].kernelFn; + plan->kernelSpecialized = ncclKerns[workFuncIndex].specialized; + } } } return ncclSuccess; @@ -623,11 +602,15 @@ static ncclResult_t scheduleP2pTasksToPlan( int const *recvOrder = tasks->p2pRecvOrder; plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS); + if (!plan->kernelSpecialized) { + plan->kernelFn = ncclKerns[FUNC_INDEX_P2P].kernelFn; + plan->kernelSpecialized = ncclKerns[FUNC_INDEX_P2P].specialized; + } // Compute how much to split operations // Natural step size matching buffer steps. ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; - if (comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR; + if (comm->nNodes > 1) stepSize = comm->p2pNetChunkSize; // Try to use all channels int nChannelsMax = comm->p2pnChannelsPerPeer; int nChannelsMin = nChannelsMax; @@ -723,7 +706,6 @@ static inline uint32_t rollingMin32(uint32_t a, uint32_t b) { // Spin until its safe to increase comm->workFifoSent to desiredSent. static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredSent) { if (__builtin_expect(rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent), false)) { - uint64_t t0 = clockNano(); while (1) { // We have to poll for notifications from device. uint32_t* doneLive = comm->workFifoDone; @@ -756,8 +738,7 @@ static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredSent) { // See if that was enough. if (!rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent)) break; - // Nope. Maintain vigorous spin for first 5us, then start yielding. 
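
// [Sketch, not part of the patch] The LL protocol choice for P2P added in
// addP2pToPlan() above is driven by NCCL_PARAM(P2pLLThreshold, ...), i.e. the
// NCCL_P2P_LL_THRESHOLD environment variable (default 16384 bytes): a
// send/recv uses NCCL_PROTO_LL only when the LL buffer exists on connIndex 1
// and the message is at or below the threshold. For example (launcher and
// binary names are placeholders):
//
//   NCCL_P2P_LL_THRESHOLD=32768 ./app   // use LL for p2p messages up to 32 KB
//   NCCL_P2P_LL_THRESHOLD=0 ./app       // effectively disable LL for p2p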
- if (clockNano()-t0 >= 5*1000) sched_yield(); + sched_yield(); } } } @@ -883,10 +864,10 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` if (plan->persistent) { comm->persistentRefs -= 1; - if (!ncclMainExited) NCCLCHECK(ncclCudaFree(plan->workHead)); + NCCLCHECK(ncclCudaFree(plan->workHead)); while (!ncclIntruQueueEmpty(&plan->ipcMemQueue)) { struct ncclPointerList* q = ncclIntruQueueDequeue(&plan->ipcMemQueue); - if (!ncclMainExited) CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr)); + CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr)); ncclMemoryPoolFree(&comm->memPool_ncclPointerList, q); } } @@ -913,7 +894,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { // Poll for callbacks sent to us from other threads. Typically these free // resources from to our memory pools. - NCCLCHECK(ncclCommPollCallbacks(comm)); + NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/false)); // We already have one frame present which holds all of our tasks (which we // are about to schedule). Now push an additional frame for allocating @@ -1080,7 +1061,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i info->protocol = -1; int nAlgos = NCCL_NUM_ALGORITHMS; for (int a=0; anChannels > 0) ? info->nChannels : comm->nChannels; int nt = comm->maxThreads[info->algorithm][info->protocol]; int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol]; - if (info->algorithm == NCCL_ALGO_COLLNET) { + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // CollNet channel tuning int ncSwitch = 16; bool flag = true; while (ncSwitch >= 1 && flag) { - while ((flag = info->nBytes < nc*nt*info->comm->channels[0].collTree.nHeads*threadThreshold) && nc > ncSwitch) { + while ((flag = info->nBytes < nc*nt*info->comm->channels[0].collnetDirect.nHeads*threadThreshold) && nc > ncSwitch) { if (nc == ncSwitch+ncSwitch/2) threadThreshold /= 2; nc--; } @@ -1125,7 +1106,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i nt += WARP_SIZE; // Extra warp for sync // More threads or sync warps needed due to split thread model if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE; - if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE; + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE; + if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE; } nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt; info->nChannels = nc; @@ -1143,7 +1125,11 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) { case ncclFuncAllGather: info->pattern = ncclPatternRing; break; case ncclFuncAllReduce: - info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUpDown : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break; + info->pattern = + info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : + info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain : + info->algorithm == NCCL_ALGO_TREE ? 
ncclPatternTreeUpDown : + ncclPatternRingTwice; break; default: WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm); return ncclInternalError; @@ -1158,9 +1144,10 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { case ncclPatternTreeUpDown: case ncclPatternPipelineFrom: case ncclPatternPipelineTo: + case ncclPatternCollnetChain: info->nstepsPerLoop = info-> nchunksPerLoop = 1; break; - case ncclPatternCollTreeUpDown: - info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collTree.nHeads; break; + case ncclPatternCollnetDirect: + info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break; case ncclPatternRing: info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break; case ncclPatternRingTwice: @@ -1217,15 +1204,22 @@ comp_next: } // Use lastChunkSize as chunkSize work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); - } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) { + } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // Optimize chunkSize / nSteps - while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*64 && chunkSize > 131072) chunkSize /= 2; - while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 65536) chunkSize /= 2; - while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2; + while (info->nBytes / (info->nChannels*info->comm->channels[0].collnetDirect.nHeads*chunkSize) < info->comm->channels[0].collnetDirect.depth*64 && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (info->nChannels*info->comm->channels[0].collnetDirect.nHeads*chunkSize) < info->comm->channels[0].collnetDirect.depth*8 && chunkSize > 65536) chunkSize /= 2; + while (info->nBytes / (info->nChannels*info->comm->channels[0].collnetDirect.nHeads*chunkSize) < info->comm->channels[0].collnetDirect.depth*8 && chunkSize > 32768) chunkSize /= 2; // Use lastChunkSize as chunkSize work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); // Set direct direction for broadcast-gather (read or write) work->direct = (info->nBytes / info->nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ; + } else if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) { + stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; + chunkSize = std::min(256*1024, stepSize*chunkSteps); + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*64 && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*8 && chunkSize > 65536) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; + work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->protocol == NCCL_PROTO_LL) { const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine); const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; @@ -1254,7 +1248,7 @@ comp_next: proxyOp->chunkSize = chunkSize; proxyOp->protocol = info->protocol; proxyOp->dtype = info->datatype; - proxyOp->redOp = info->algorithm != NCCL_ALGO_COLLNET ? 
ncclNumOps : // Only set redOp when using CollNet + proxyOp->redOp = (info->algorithm != NCCL_ALGO_COLLNET_DIRECT && info->algorithm != NCCL_ALGO_COLLNET_CHAIN) ? ncclNumOps : // Only set redOp when using CollNet info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum info->op; proxyOp->pattern = info->pattern; @@ -1444,30 +1438,43 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { NCCLCHECK(ncclGroupStartInternal()); ncclResult_t ret = ncclSuccess; int devOld = -1; - NCCLCHECKGOTO(PtrCheck(info->comm, info->opName, "comm"), ret, end0); + + NCCLCHECKGOTO(PtrCheck(info->comm, info->opName, "comm"), ret, fail); + // Check whether communicator is ready to communicate + NCCLCHECKGOTO(ncclCommEnsureReady(info->comm), ret, fail); + if (info->comm->checkPointers) { - CUDACHECKGOTO(cudaGetDevice(&devOld), ret, end0); - CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end0); + CUDACHECKGOTO(cudaGetDevice(&devOld), ret, fail); + CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, fail); } - NCCLCHECKGOTO(ArgsCheck(info), ret, end1); + NCCLCHECKGOTO(ArgsCheck(info), ret, fail); INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zi,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast(info->sendbuff), reinterpret_cast(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream); - NCCLCHECKGOTO(taskAppend(info->comm, info), ret, end1); + NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail); -end1: - if (devOld != -1) CUDACHECKGOTO(cudaSetDevice(devOld), ret, end0); -end0: +exit: + if (devOld != -1) CUDACHECK(cudaSetDevice(devOld)); ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); + /* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change + * so we have to check state here. */ + if (info->comm && !info->comm->blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) }; return ret; +fail: + if (info->comm && !info->comm->blocking) (void) ncclCommSetAsyncError(info->comm, ret); + goto exit; } NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) { + NCCLCHECK(PtrCheck(comm, "ncclRedOpCreatePreMulSum", "comm")); + /* join init thread before creating PreMulSum op. 
*/ + NCCLCHECK(ncclCommEnsureReady(comm)); + if (comm->userRedOpFreeHead == comm->userRedOpCapacity) { // double capacity and resize int cap = 2*comm->userRedOpCapacity; diff --git a/src/graph/connect.cc b/src/graph/connect.cc index da9a360..01ff282 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -25,13 +25,15 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclChannel* channel = comm->channels+c; channel->ring.prev = channel->ring.next = -1; channel->tree.up = -1; + channel->collnetChain.up = -1; for (int i=0; itree.down[i] = -1; - channel->collTree.out = -1; - channel->collTree.headRank = -1; - channel->collTree.nHeads = 0; - channel->collTree.shift = 0; - for (int i=0; icollTree.up[i] = -1; - for (int i=0; icollTree.down[i] = -1; + for (int i=0; icollnetChain.down[i] = -1; + channel->collnetDirect.out = -1; + channel->collnetDirect.headRank = -1; + channel->collnetDirect.nHeads = 0; + channel->collnetDirect.shift = 0; + for (int i=0; icollnetDirect.up[i] = -1; + for (int i=0; icollnetDirect.down[i] = -1; int* ringIntra = ringGraph->intra+c*localRanks; int* treeIntra = treeGraph->intra+c*localRanks; @@ -53,6 +55,8 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, topoRanks->treeToChild1[c] = treeIntra[child1Index]; channel->tree.up = i == 0 ? -1 : treeIntra[i-1]; channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1]; + channel->collnetChain.up = i == 0 ? comm->nRanks : treeIntra[i-1]; + channel->collnetChain.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1]; } } topoRanks->ringPrev[c] = channel->ring.prev; @@ -185,13 +189,13 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* int nDown = 0; for (int i=0; icollTree.headRank = i; // Mark the index for deciding offset in the CUDA kernel - channel->collTree.out = comm->nRanks; // Set root of collTree to id nranks + channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel + channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks int* collNetIntra = collNetGraph->intra+i*localRanks; sprintf(line+strlen(line), "down "); for (int r=0; rcollTree.down[nDown++] = collNetIntra[r]; // connect to all peers + channel->collnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers sprintf(line+strlen(line), " %d ", collNetIntra[r]); } sprintf(line+strlen(line), "nDown %d ", nDown); @@ -203,15 +207,16 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* sprintf(line+strlen(line), "up "); for (int h=0; hcollTree.up[nUp++] = heads[h]; + channel->collnetDirect.up[nUp++] = heads[h]; sprintf(line+strlen(line), " %d ", heads[h]); } - channel->collTree.nHeads = nHeads; - channel->collTree.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously - channel->collTree.depth = (nUp == 0 && nDown == 0) ? 1 : 2; + channel->collnetDirect.nHeads = nHeads; + channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously + channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 
1 : 2; sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads); - sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift); + sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift); INFO(NCCL_GRAPH, "%s", line); + channel->collnetChain.depth = comm->nRanks/comm->nNodes; } free(heads); return ncclSuccess; @@ -296,7 +301,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa // Setup CollNet if (comm->collNetSupport == 1) { // Add more channels to saturate intra-node bandwidth, except the 1 PPN case - if (collNetGraph->speedIntra > collNetGraph->speedInter && comm->nRanks > comm->nNodes) { + if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) { int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext); } diff --git a/src/graph/paths.cc b/src/graph/paths.cc index ab8f8c3..f8918b1 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -44,7 +44,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT struct ncclTopoLinkList* basePath; NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath)); basePath->count = 0; - basePath->width = LOC_WIDTH; + basePath->bw = LOC_BW; basePath->type = PATH_LOC; while (nodeList.count) { @@ -61,13 +61,13 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT } struct ncclTopoLinkList* remPath; NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath)); - float width = std::min(path->width, link->width); + float bw = std::min(path->bw, link->bw); // allow routing through a GPU only as 1 hop if (node != baseNode && node->type == GPU && (ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue; - if ((remPath->width == 0 || remPath->count > path->count) && remPath->width < width) { + if ((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) { // Find reverse link for (int l=0; lnlinks; l++) { if (remNode->links[l].remNode == node) { @@ -83,7 +83,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT // Copy the rest of the path for (int i=0; icount; i++) remPath->list[i+1] = path->list[i]; remPath->count = path->count + 1; - remPath->width = width; + remPath->bw = bw; // Start with path type = link type. PATH and LINK types are supposed to match. // Don't consider LINK_NET as we only care about the NIC->GPU path. 
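
// [Sketch, not part of the patch] The two CollNet variants wired up above use
// different per-channel structures (names taken from this diff):
//
//   struct ncclTree*   chain  = &channel->collnetChain;   // intra-node chain; up == comm->nRanks marks the network hop
//   struct ncclDirect* direct = &channel->collnetDirect;  // one-hop fan to/from up to NCCL_MAX_DIRECT_ARITY heads
//
// connectCollNet() sets collnetChain.depth = nRanks/nNodes (data is forwarded
// rank to rank along the chain) while collnetDirect.depth is 1 or 2; the
// chunk-size tuning in enqueue.cc relies on these depths.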
@@ -129,9 +129,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id); offset = strlen(line); } - INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].width); + INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].bw); #else - sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, topoPathTypeStr[node->paths[t][n].type]); + sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].bw, topoPathTypeStr[node->paths[t][n].type]); offset = strlen(line); #endif } @@ -185,7 +185,7 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, srcNode->paths[t2][i2].count = l; srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN; - srcNode->paths[t2][i2].width = std::min(srcNode->paths[tx][ix].width, cpuNode->paths[t2][i2].width); + srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw); return ncclSuccess; } @@ -399,6 +399,40 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int return ncclSuccess; } +NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0); + +// Check whether going through the network would be faster than going through P2P/SHM. +ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net) { + if (ncclParamNetDisableIntra() == 1) { + *net = 0; + return ncclSuccess; + } + *net = 1; + // First check the current GPU-to-GPU speed. + int g1, g2; + if (ncclTopoIdToIndex(system, GPU, id1, &g1) != ncclSuccess || + ncclTopoIdToIndex(system, GPU, id2, &g2) != ncclSuccess) { + return ncclSuccess; + } + + struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; + struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2; + float speed = gpu1->paths[GPU][g2].bw; + + // Now check the speed each GPU can access the network through PXB or better + float netSpeed1 = 0, netSpeed2 = 0; + for (int n=0; nnodes[NET].count; n++) { + struct ncclTopoLinkList* path = gpu1->paths[NET]+n; + if (path->type <= PATH_PXB && path->bw > netSpeed1) netSpeed1 = path->bw; + path = gpu2->paths[NET]+n; + if (path->type <= PATH_PXB && path->bw > netSpeed2) netSpeed2 = path->bw; + } + + if (netSpeed1 > speed && netSpeed2 > speed) return ncclSuccess; + *net = 0; + return ncclSuccess; +} + ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank) { // Get GPU and NET int n, g; @@ -476,17 +510,23 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm // Remove everything in case we're re-computing for (int t=0; tnodes[CPU].count; c++) { NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system)); } - // Set direct paths from/to GPUs. + // Set direct paths to GPUs. for (int g=0; gnodes[GPU].count; g++) { - // Compute paths to GPU g NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system)); + } - // Update path when we don't want to / can't use GPU Direct P2P + // Set direct paths to NICs. 
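
// [Sketch, not part of the patch] Worked example for ncclTopoCheckNet() above,
// with assumed bandwidths: if two GPUs only reach each other through PCIe at
// 12 GB/s (gpu1->paths[GPU][g2].bw == 12) but each reaches some NIC at
// PATH_PXB or better with 25 GB/s, then netSpeed1 == netSpeed2 == 25 > 12 and
// *net stays 1, letting NCCL prefer the network over P2P/SHM for that pair.
// Setting NCCL_NET_DISABLE_INTRA=1 (the new param above) forces *net = 0.
// Typical call shape:
//
//   int net;
//   NCCLCHECK(ncclTopoCheckNet(system, info1->busId, info2->busId, &net));
//   if (net) { /* consider the NET transport even intra-node */ }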
+ for (int n=0; nnodes[NET].count; n++) { + NCCLCHECK(ncclTopoSetPaths(system->nodes[NET].nodes+n, system)); + } + + // Update path for GPUs when we don't want to / can't use GPU Direct P2P + for (int g=0; gnodes[GPU].count; g++) { for (int p=0; pnodes[GPU].count; p++) { int p2p; NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL)); @@ -499,31 +539,32 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm } if (comm == NULL) continue; - // Remove GPUs we can't talk to because of containers. + // Remove GPUs we can't (or don't want to) communicate with through P2P or SHM struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank; for (int p=0; pnodes[GPU].count; p++) { if (p == g) continue; struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank; - int shm; - NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo)); int p2p; NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo)); - if (shm == 0 && p2p == 0) { - // Mark this peer as inaccessible. We'll trim it later. - system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; + if (p2p == 0) { + int shm; + NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo)); + if (shm == 0) { + // Mark this peer as inaccessible. We'll trim it later. + system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; + } } } } - // Set direct paths from/to NICs. + // Update paths for NICs (no GPU Direct, PXN, ...) for (int n=0; nnodes[NET].count; n++) { struct ncclTopoNode* netNode = system->nodes[NET].nodes+n; - NCCLCHECK(ncclTopoSetPaths(netNode, system)); for (int g=0; gnodes[GPU].count; g++) { // Check whether we can access the NIC through another NVLink-connected GPU (PXN) struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - if (ncclPxnDisable(comm) != 1 && gpu->paths[NET][n].type > PATH_PXB) { + if (ncclPxnDisable(comm) != 1) { int pxnGpu = -1; for (int p=0; pnodes[GPU].count; p++) { @@ -531,7 +572,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm // PXN = PCI + NVLink. struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p; - if (peerNode->paths[NET][n].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue; + // Only use PXN for NIC n if remote GPU p ... + if (peerNode->paths[NET][n].type > PATH_PXB || // Is connected to the NIC through PCI + peerNode->paths[GPU][g].type > PATH_NVL || // Is connected to us through NVLink + (peerNode->paths[NET][n].bw <= gpu->paths[NET][n].bw && // Has either higher BW to that NIC + gpu->paths[NET][n].type <= PATH_PXB)) // or avoids going through a CPU + continue; pxnGpu = p; @@ -626,8 +672,8 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /* // Local rank path = system->nodes[GPU].nodes[peer].paths[GPU]+g; if (path->type == PATH_NVL) { - float nvlWidth = ncclTopoNVLinkSpeed(system->nodes[GPU].nodes[g].gpu.cudaCompCap); - *nChannels = 2*std::max(1, (int)(path->width / nvlWidth)); + float nvlBw = ncclTopoNVLinkBw(system->nodes[GPU].nodes[g].gpu.cudaCompCap); + *nChannels = 2*std::max(1, (int)(path->bw / nvlBw)); } else { *nChannels = 2; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 0f79258..27a8e43 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -10,39 +10,39 @@ #include "xml.h" #include -// Initialize system->maxWidth. This is the per-channel (i.e. 
per-SM) -// max speed. -static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) { - float maxWidth = 0.0; +// Initialize system->maxBw. This is the per-channel (i.e. per-SM) +// max bw. +static float getMaxBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) { + float maxBw = 0.0; for (int i=0; inodes[type].count; i++) { struct ncclTopoLinkList* path = gpu->paths[type]+i; - float width = path->width; + float bw = path->bw; if (path->count == 0) continue; - maxWidth = std::max(maxWidth, width); + maxBw = std::max(maxBw, bw); } - return maxWidth; + return maxBw; } -static float getTotalWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { - float nvlinkWidth = 0.0, pciWidth = 0.0; +static float getTotalBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { + float nvlinkBw = 0.0, pciBw = 0.0; for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = gpu->links+l; - if (link->type == LINK_NVL) nvlinkWidth += link->width; - if (link->type == LINK_PCI) pciWidth = link->width; + if (link->type == LINK_NVL) nvlinkBw += link->bw; + if (link->type == LINK_PCI) pciBw = link->bw; } - return std::max(pciWidth, nvlinkWidth); + return std::max(pciBw, nvlinkBw); } ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) { - system->maxWidth = 0.0; - system->totalWidth = 0.0; + system->maxBw = 0.0; + system->totalBw = 0.0; int inter = system->nodes[NET].count; if (inter == 0 && system->nodes[GPU].count == 1) { - system->maxWidth = LOC_WIDTH; + system->maxBw = LOC_BW; return ncclSuccess; } for (int g=0; gnodes[GPU].count; g++) { struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU)); - system->totalWidth = std::max(system->totalWidth, getTotalWidth(system, gpu)); + system->maxBw = std::max(system->maxBw, getMaxBw(system, gpu, inter ? NET : GPU)); + system->totalBw = std::max(system->totalBw, getTotalBw(system, gpu)); } return ncclSuccess; } @@ -62,8 +62,8 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* // This is unfortunately needed since manipulating floats often results in rounding errors. #define SUB_ROUND(a, b) (a = roundf((a-b)*1000)/1000) -static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float speed, int* steps) { - float pciSpeed = speed; +static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float bw, int* steps) { + float pciBw = bw; for (int step=0; stepcount; step++) { struct ncclTopoNode* node = path->list[step]->remNode; if (node->type == CPU) { @@ -71,7 +71,7 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod if (path->type == PATH_PHB && start->type == GPU && node->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && node->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { - pciSpeed = INTEL_P2P_OVERHEAD(speed); + pciBw = INTEL_P2P_OVERHEAD(bw); } } } @@ -80,19 +80,19 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod for (int step=0; steplist[step]; struct ncclTopoLink* revLink = NULL; - float fwSpeed = link->type == LINK_PCI ? pciSpeed : speed; - float revSpeed = 0; + float fwBw = link->type == LINK_PCI ? 
pciBw : bw; + float revBw = 0; if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) { if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink)); - revSpeed += fwSpeed/8; + revBw += fwBw/8; } if (link->remNode->type == CPU && link->type == LINK_NVL) { if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink)); - revSpeed += fwSpeed; + revBw += fwBw; } - if (link->width < fwSpeed || (revSpeed && revLink->width < revSpeed)) { *steps = step; return ncclSuccess; } - SUB_ROUND(link->width, fwSpeed); - if (revSpeed) SUB_ROUND(revLink->width, revSpeed); + if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; } + SUB_ROUND(link->bw, fwBw); + if (revBw) SUB_ROUND(revLink->bw, revBw); node = link->remNode; } *steps = maxSteps; @@ -111,16 +111,16 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc // Now check link type *node = NULL; int intra = type1 == GPU && type2 == GPU; - float speed = intra ? graph->speedIntra : graph->speedInter; + float bw = intra ? graph->bwIntra : graph->bwInter; int type = intra ? graph->typeIntra : graph->typeInter; if (mult == 1 && (path->type > type)) return ncclSuccess; - speed *= mult; + bw *= mult; // Check there is enough bandwidth on paths. int step = 0; - NCCLCHECK(followPath(path, node1, path->count, speed, &step)); + NCCLCHECK(followPath(path, node1, path->count, bw, &step)); if (step < path->count) goto rewind; // Enough bandwidth : return destination node. @@ -130,11 +130,11 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc rewind: // Not enough bandwidth : rewind and exit. - NCCLCHECK(followPath(path, node1, step, -speed, &step)); + NCCLCHECK(followPath(path, node1, step, -bw, &step)); return ncclSuccess; } -static int gpuPciWidth(struct ncclTopoNode* gpu) { +static int gpuPciBw(struct ncclTopoNode* gpu) { for (int l=0; lnlinks; l++) { struct ncclTopoLink* gpuLink = gpu->links+l; if (gpuLink->type != LINK_PCI) continue; @@ -142,7 +142,7 @@ static int gpuPciWidth(struct ncclTopoNode* gpu) { for (int l=0; lnlinks; l++) { struct ncclTopoLink* pciLink = pci->links+l; if (pciLink->remNode != gpu) continue; - return std::min(gpuLink->width, pciLink->width); + return std::min(gpuLink->bw, pciLink->bw); } } return -1; @@ -154,29 +154,29 @@ struct ncclGpuScore { int g; // Retain the index int startIndex; // Least important int intraNhops; - int intraWidth; + int intraBw; int interNhops; - int interPciWidth; - int interWidth; // Most important + int interPciBw; + int interBw; // Most important }; static int cmpScore(const void * g1, const void * g2) { struct ncclGpuScore *s1 = (struct ncclGpuScore*)g1; struct ncclGpuScore *s2 = (struct ncclGpuScore*)g2; int d; - if ((d = (s2->interWidth - s1->interWidth))) return d; - if ((d = (s2->interPciWidth - s1->interPciWidth))) return d; + if ((d = (s2->interBw - s1->interBw))) return d; + if ((d = (s2->interPciBw - s1->interPciBw))) return d; if ((d = (s1->interNhops - s2->interNhops))) return d; - if ((d = (s2->intraWidth - s1->intraWidth))) return d; + if ((d = (s2->intraBw - s1->intraBw))) return d; if ((d = (s1->intraNhops - s2->intraNhops))) return d; return s1->startIndex - s2->startIndex; } static int cmpIntraScores(struct ncclGpuScore* scores, int count) { - int intraWidth = scores[0].intraWidth; + int intraBw = scores[0].intraBw; int intraNhops = scores[0].intraNhops; for (int i=1; inodes[GPU].nodes+g); - scores[count].interWidth = 
netPaths[g].width; + scores[count].interPciBw = gpuPciBw(system->nodes[GPU].nodes+g); + scores[count].interBw = netPaths[g].bw; } count++; } @@ -295,8 +295,8 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoG if (graph->nChannels < graph->minChannels) return ncclSuccess; // 2. Try to get better bandwidth - if (graph->nChannels*graph->speedIntra < refGraph->nChannels*refGraph->speedIntra) return ncclSuccess; - if (graph->nChannels*graph->speedIntra > refGraph->nChannels*refGraph->speedIntra) { + if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess; + if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) { *copy = 1; return ncclSuccess; } @@ -396,23 +396,23 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo // Balanced Tree : count half of the bandwidth on first two GPUs int nextBackToNet = -1; - float speedInterSave = graph->speedInter; + float bwInterSave = graph->bwInter; if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) { // Count half of the bandwidth on each of the first two GPUs if (step == 0) nextBackToNet = 1; else if (net->id != graph->inter[graph->nChannels*2+1]) continue; - graph->speedInter /= 2; + graph->bwInter /= 2; } NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); - graph->speedInter = speedInterSave; + graph->bwInter = bwInterSave; if (net) { graph->inter[graph->nChannels*2+1] = net->id; NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time)); - if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->speedInter /= 2; + if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net)); - graph->speedInter = speedInterSave; + graph->bwInter = bwInterSave; } } free(nets); @@ -451,7 +451,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo } ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { - const int speed = graph->speedInter; + const int bw = graph->bwInter; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); int netcount; @@ -461,7 +461,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo struct ncclTopoNode* net = system->nodes[NET].nodes+n; struct ncclTopoNode* gpu; if (graph->collNet && net->net.collSupport == 0) continue; - if (net->net.width < speed) continue; + if (net->net.bw < bw) continue; if (net->net.maxChannels == 0) continue; graph->inter[graph->nChannels*2] = net->id; @@ -470,7 +470,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo for (int i=0; inodes[NET].count; i++) { if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { - system->nodes[NET].nodes[i].net.width -= speed; + system->nodes[NET].nodes[i].net.bw -= bw; } } net->net.maxChannels--; @@ -490,26 +490,26 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } // Then try the most local GPUs - float maxWidth = 0; + float maxBw = 0; int minHops = 0xfffffff; struct ncclTopoLinkList* paths = net->paths[GPU]; for (int g=0; gnodes[GPU].count; g++) { - if (paths[g].width > maxWidth) { - maxWidth = paths[g].width; + if (paths[g].bw > maxBw) { + maxBw = paths[g].bw; 
minHops = paths[g].count; - } else if (paths[g].width == maxWidth && paths[g].count < minHops) { + } else if (paths[g].bw == maxBw && paths[g].count < minHops) { minHops = paths[g].count; } } - if (maxWidth >= speed) { + if (maxBw >= bw) { // In the first loop, avoid using GPUs in both directions between channels (one channel // sending from that GPU and one channel receiving to that GPU), since that usually leads // to lower BW. for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) { for (int g=0; gnodes[GPU].count; g++) { - if (paths[g].width == maxWidth && paths[g].count == minHops) { + if (paths[g].bw == maxBw && paths[g].count == minHops) { gpu = system->nodes[GPU].nodes+g; - int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1; + int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1; if (tryGpuBidir == gpuUsed) { NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); } @@ -523,7 +523,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo for (int i=0; inodes[NET].count; i++) { if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { - system->nodes[NET].nodes[i].net.width += speed; + system->nodes[NET].nodes[i].net.bw += bw; } } } @@ -642,8 +642,8 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern)); NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); - NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->speedIntra)); - NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->speedInter)); + NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra)); + NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter)); if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0; const char* str; NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); @@ -701,8 +701,8 @@ ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTop NCCLCHECK(xmlSetAttrInt(xmlGraph, "pattern", graph->pattern)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "crossnic", graph->crossNic)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels)); - NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->speedIntra)); - NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->speedInter)); + NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->bwIntra)); + NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->bwInter)); NCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter)); const char* str; NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType)); @@ -737,7 +737,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph int ngpus = system->nodes[GPU].count; graph->crossNic = ncclParamCrossNic(); int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0; - graph->speedIntra = graph->speedInter = 0; + graph->bwIntra = graph->bwInter = 0; graph->latencyInter = 0; if (graph->crossNic == 2) graph->crossNic = 0; graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; @@ -767,7 +767,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph struct ncclTopoGraph tmpGraph; memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); - // First try crossnic, then decrease speed and finally increase speedIntra. + // First try crossnic, then decrease bw and finally increase bwIntra. 
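
The search below is easier to follow as a skeleton: walk a decreasing bandwidth ladder until some graph fits, then (in pass 2) try to raise bwIntra again for tree-like patterns. A rough sketch, with a hypothetical ladder and a hypothetical trySearch() standing in for ncclTopoSearchRec and its retries over pattern, crossNic and channel counts:

// Sketch only: ladder values and trySearch() are hypothetical.
float ladder[] = { 48.0f, 24.0f, 18.0f, 12.0f, 6.0f, 3.0f };
int n = sizeof(ladder)/sizeof(ladder[0]), i = 0;
while (i < n-1 && ladder[i] > system->maxBw) i++;  // skip bw the machine cannot reach
for (; i < n; i++) {
  tmpGraph.bwIntra = tmpGraph.bwInter = ladder[i]; // try both at the same bw first
  if (trySearch(system, &tmpGraph) == ncclSuccess && tmpGraph.nChannels > 0)
    break;                                         // first bandwidth that fits wins
}
// Pass 2 (not shown): for trees, step bwIntra back up the ladder while it still fits.
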
int nspeeds = 0; float* speedArray = NULL; if (system->nodes[NET].count == 0) { @@ -779,8 +779,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } int pass = 1; int speedIndex = 0; - while (speedArray[speedIndex] > system->maxWidth && speedIndex < nspeeds-1) speedIndex++; - tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex]; + while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++; + tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT; search: @@ -791,7 +791,7 @@ search: NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time)); #if 0 - printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : ""); + printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : ""); for (int c=0; cnChannels; c++) { printf("%2d : ", c); for (int g=0; gnChannels*graph->speedInter >= system->totalWidth) goto done; + if (graph->nChannels*graph->bwInter >= system->totalBw) goto done; if (pass == 1) { // First pass, we don't have a solution yet ; try other options @@ -846,14 +846,14 @@ search: } tmpGraph.pattern = graph->pattern; - // Decrease speed until we find a solution - if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) { - tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex]; + // Decrease bw until we find a solution + if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) { + tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex]; goto search; } speedIndex = 0; - while (speedArray[speedIndex] > system->maxWidth && speedIndex < nspeeds-1) speedIndex++; - tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex]; + while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++; + tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; } @@ -863,18 +863,18 @@ done: time = -1; memcpy(&tmpGraph, graph, sizeof(tmpGraph)); speedIndex = 0; - while (speedArray[speedIndex] > graph->speedInter && speedIndex < nspeeds-1) speedIndex++; - tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex]; + while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++; + tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; tmpGraph.minChannels = graph->nChannels; pass = 2; } - // 3. See if we can increase speedIntra for trees (2 nodes or collnet) + // 3. 
See if we can increase bwIntra for trees (2 nodes or collnet) if (pass == 2) { if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING && - tmpGraph.speedIntra == graph->speedIntra && tmpGraph.speedIntra < tmpGraph.speedInter*2 && + tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 && speedIndex > 0) { - tmpGraph.speedIntra = speedArray[--speedIndex]; + tmpGraph.bwIntra = speedArray[--speedIndex]; goto search; } time = -1; @@ -885,24 +885,24 @@ done: WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern); for (int i=0; iintra[i] = system->nodes[GPU].nodes[i].gpu.rank; graph->inter[0] = graph->inter[1] = 0; - graph->speedIntra = graph->speedInter = 0.1; + graph->bwIntra = graph->bwInter = 0.1; graph->typeIntra = graph->typeInter = PATH_SYS; graph->nChannels = 1; } - if (graph->speedIntra >= 25.0) { + if (graph->bwIntra >= 25.0) { int dupChannels = std::min(graph->nChannels*2, graph->maxChannels); memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int)); memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int)); - graph->speedIntra /= DIVUP(dupChannels, graph->nChannels); - graph->speedInter /= DIVUP(dupChannels, graph->nChannels); + graph->bwIntra /= DIVUP(dupChannels, graph->nChannels); + graph->bwInter /= DIVUP(dupChannels, graph->nChannels); graph->nChannels = dupChannels; } return ncclSuccess; } ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { - INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels); + INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, bw %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->bwIntra, graph->bwInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels); int ngpus = system->nodes[GPU].count; char line[1024]; diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 2730bf9..9e4c978 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -56,24 +56,24 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode* return ncclSuccess; } -int interCpuWidth = 0; -int cpuPciWidth = 0; +int interCpuBw = 0; +int cpuPciBw = 0; -static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* width) { - *width = LOC_WIDTH; +static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) { + *bw = LOC_BW; if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) { - *width = P9_WIDTH; + *bw = P9_BW; return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_ARM) { - *width = ARM_WIDTH; + *bw = ARM_BW; return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { - *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH; + *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { - *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_WIDTH : ZPI_WIDTH; + *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? 
YONGFENG_ZPI_BW : ZPI_BW; } return ncclSuccess; } @@ -109,7 +109,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo n->nlinks=1; n->links[0].type = LINK_LOC; n->links[0].remNode = n; - n->links[0].width = LOC_WIDTH; + n->links[0].bw = LOC_BW; n->gpu.dev = NCCL_TOPO_UNDEF; n->gpu.rank = NCCL_TOPO_UNDEF; n->gpu.cudaCompCap = NCCL_TOPO_UNDEF; @@ -120,7 +120,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo } else if (type == NET) { n->net.asic = 0ULL; n->net.port = NCCL_TOPO_UNDEF; - n->net.width = 0.0; + n->net.bw = 0.0; n->net.latency = 0.0; } *node = n; @@ -150,8 +150,8 @@ ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int ind return ncclSuccess; } -ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) { - // Aggregate links into higher width for NVLink +ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw) { + // Aggregate links into higher bw for NVLink struct ncclTopoLink* link; for (link = node->links; link->remNode; link++) { if (link->remNode == remNode && link->type == type) break; @@ -159,13 +159,13 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode if (link->remNode == NULL) node->nlinks++; link->type = type; link->remNode = remNode; - link->width += width; + link->bw += bw; // Sort links in BW descending order struct ncclTopoLink linkSave; memcpy(&linkSave, link, sizeof(struct ncclTopoLink)); while (link != node->links) { - if ((link-1)->width >= linkSave.width) break; + if ((link-1)->bw >= linkSave.bw) break; memcpy(link, link-1, sizeof(struct ncclTopoLink)); link--; } @@ -237,9 +237,9 @@ ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) { for (int n=0; nnodes[CPU].count; n++) { for (int p=0; pnodes[CPU].count; p++) { if (n == p) continue; - float width; - NCCLCHECK(ncclTopoGetInterCpuWidth(system->nodes[CPU].nodes+n, &width)); - NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, width)); + float bw; + NCCLCHECK(ncclTopoGetInterCpuBw(system->nodes[CPU].nodes+n, &bw)); + NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, bw)); } } return ncclSuccess; @@ -262,13 +262,13 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN struct ncclTopoLink* link = node->links+l; if (link->type == LINK_LOC) continue; if (link->type != LINK_PCI || link->remNode != prevNode) { - sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->width); + sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw); int nextOffset = strlen(line); if (link->type == LINK_PCI) { NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset)); } else { if (link->remNode->type == NET) { - sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.width); + sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw); } else { sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id); } @@ -280,7 +280,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN } ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) { - 
INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f totalWidth %2.1f ===", s->maxWidth, s->totalWidth); + INFO(NCCL_GRAPH, "=== System : maxBw %2.1f totalBw %2.1f ===", s->maxBw, s->totalBw); char line[1024]; for (int n=0; nnodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0)); INFO(NCCL_GRAPH, "=========================================="); @@ -335,7 +335,7 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s int mbps; NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0)); if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1 - net->net.width = mbps / 8000.0; + net->net.bw = mbps / 8000.0; if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0; NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0)); @@ -343,8 +343,8 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0)); ncclDebugNoWarn = 0; - NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width)); - NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width)); + NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.bw)); + NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.bw)); return ncclSuccess; } @@ -480,8 +480,8 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0)); if (nic == NULL) { NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0)); - NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH)); - NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH)); + NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW)); + NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW)); } NCCLCHECK(ncclTopoAddNic(node, system, nic)); } @@ -524,10 +524,10 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* } } if (remote) { - float nvlSpeed = ncclTopoNVLinkSpeed(gpu->gpu.cudaCompCap); - NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed)); + float nvlBw = ncclTopoNVLinkBw(gpu->gpu.cudaCompCap); + NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlBw)); if (remote->type != GPU) { - NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed)); + NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlBw)); } } } else { @@ -683,18 +683,18 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* i int g; NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); int minType = PATH_SYS; - float maxWidth = 0; + float maxBw = 0; int count = 0; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); for (int n=0; nnodes[NET].count; n++) { struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g; - if (path->width > maxWidth || (path->width == maxWidth && path->type < minType)) { - maxWidth = path->width; + if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) { + maxBw = path->bw; minType = path->type; count = 0; } - if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; + if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; } if (count == 0) { *id = -1; @@ -790,6 +790,11 @@ ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) { return ncclSuccess; } +ncclResult_t 
ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count) { + *count = system->nodes[NVS].count; + return ncclSuccess; +} + ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) { if (system->nodes[GPU].count == 0) return ncclInternalError; int min, max; diff --git a/src/graph/topo.h b/src/graph/topo.h index b24a72b..20a3e9d 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -10,23 +10,23 @@ #include "graph.h" #include "core.h" -#define LOC_WIDTH 5000.0 -#define SM60_NVLINK_WIDTH 18.0 -#define SM70_NVLINK_WIDTH 22.0 -#define SM80_NVLINK_WIDTH 22.0 -#define SM86_NVLINK_WIDTH 12.0 -#define PCI_WIDTH 12.0 // PCI Gen3 x16 -#define QPI_WIDTH 6.0 -#define SKL_QPI_WIDTH 9.0 -#define ZPI_WIDTH 6.0 -#define YONGFENG_ZPI_WIDTH 9.0 -#define P9_WIDTH 32.0 -#define ARM_WIDTH 6.0 -#define NET_WIDTH 12.0 // 100Gbit +#define LOC_BW 5000.0 +#define SM60_NVLINK_BW 18.0 +#define SM70_NVLINK_BW 22.0 +#define SM80_NVLINK_BW 22.0 +#define SM86_NVLINK_BW 12.0 +#define PCI_BW 12.0 // PCI Gen3 x16 +#define QPI_BW 6.0 +#define SKL_QPI_BW 9.0 +#define ZPI_BW 6.0 +#define YONGFENG_ZPI_BW 9.0 +#define P9_BW 32.0 +#define ARM_BW 6.0 +#define NET_BW 12.0 // 100Gbit // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU // to GPU traffic consumes more PCI bandwidth. -#define INTEL_P2P_OVERHEAD(speed) (speed*6/5) +#define INTEL_P2P_OVERHEAD(bw) (bw*6/5) #define NCCL_TOPO_NODE_TYPES 7 #define GPU 0 @@ -78,7 +78,7 @@ extern const char* topoPathTypeStr[]; struct ncclTopoNode; struct ncclTopoLink { int type; - float width; + float bw; struct ncclTopoNode* remNode; }; #define NCCL_TOPO_MAX_LINKS 32 @@ -87,7 +87,7 @@ struct ncclTopoLink { struct ncclTopoLinkList { struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS]; int count; - float width; + float bw; int type; }; @@ -110,7 +110,7 @@ struct ncclTopoNode { struct { uint64_t asic; int port; - float width; + float bw; float latency; int gdrSupport; int collSupport; @@ -141,14 +141,14 @@ struct ncclTopoNodeSet { struct ncclTopoSystem { struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; - float maxWidth; - float totalWidth; + float maxBw; + float totalBw; }; ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id); ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id); ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id); -ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width); +ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw); ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system); ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank); @@ -192,13 +192,13 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in return ncclInternalError; } -// Returns NVLink speed in GB/s -static float ncclTopoNVLinkSpeed(int cudaCompCap) { +// Returns NVLink bw in GB/s +static float ncclTopoNVLinkBw(int cudaCompCap) { return - cudaCompCap == 86 ? SM86_NVLINK_WIDTH : - cudaCompCap >= 80 ? SM80_NVLINK_WIDTH : - cudaCompCap >= 70 ? SM70_NVLINK_WIDTH : - cudaCompCap >= 60 ? SM60_NVLINK_WIDTH : - SM80_NVLINK_WIDTH; + cudaCompCap == 86 ? SM86_NVLINK_BW : + cudaCompCap >= 80 ? SM80_NVLINK_BW : + cudaCompCap >= 70 ? 
SM70_NVLINK_BW : + cudaCompCap >= 60 ? SM60_NVLINK_BW : + SM80_NVLINK_BW; } #endif diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index b07ca38..bc5e969 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -53,7 +53,7 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li // Latencies in us, Bandwidths in GB/s // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } -static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 } }; +static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 }, { 4.4, 4.4, 0 }}; // NVLink, PCI, Network #define NCCL_HW_NVLINK 0 @@ -62,11 +62,14 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, // Tree/Simple is the latency a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network). static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { .52, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 }, /* CollNet (LL/LL128/Simple)*/ { .5, 1.2, 8.0 } }, + { /* Tree (LL/LL128/Simple)*/ { .52, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 }, + /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 } }, /* PCI */ - { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 1.0, 1.9, 8.0 } }, + { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, + /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 28 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } } + { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 }, + /* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 10.7 } } }; // LL128 max BW per channel @@ -75,15 +78,16 @@ static const double llMaxBws[2][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, static const double perChMaxTreeBws[2][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8} }; ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) { - int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS; + int simpleDefaultThreads = (ringGraph->bwIntra*ringGraph->nChannels <= PCI_BW) ? 
256 : NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads); comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); - comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS; - comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] = + comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = + comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS; + comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS); - comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] = + comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS); int nNodes = comm->nNodes; @@ -102,7 +106,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount - struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph }; + struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph }; int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; for (int a=0; atypeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI; for (int a=0; aspeedIntra : graphs[a]->speedInter; - float busBw = graphs[a]->nChannels * speed; + int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0; + float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter; + float busBw = graphs[a]->nChannels * bw; // Various model refinements if (compCap80) busBw = std::min(busBw, 235.0f); @@ -129,7 +134,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 
7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh*graphs[a]->nChannels); - if (a == NCCL_ALGO_COLLNET && p != NCCL_PROTO_SIMPLE) busBw = 0; // Oneshot CollNet only supports Simple + if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used + if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used + if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) { + // Collnet+Direct requires all GPUs to have a local NIC to work at full speed + float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio + factor -= (factor-1)/2; + busBw /= factor; + } + if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75; // Convert bus BW to algorithm BW float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps; @@ -155,9 +168,11 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom } else if (a == NCCL_ALGO_TREE) { comm->latencies[coll][a][p] += 2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat); - } else { + } else if (a == NCCL_ALGO_COLLNET_DIRECT) { comm->latencies[coll][a][p] += 2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat; // Add 0.5 arity serialization latency + } else if (a == NCCL_ALGO_COLLNET_CHAIN) { + comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat; } } } @@ -166,7 +181,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Protocols/Algorithms enable/disable, and user overrides. // All are enabled except ll128 which is enabled by default only in certain cases. int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; - int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1 }; + int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1 }; const char *protoStr = getenv("NCCL_PROTO"); if (protoStr) { @@ -180,12 +195,18 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom } // Disable CollNet if it is not supported if (comm->collNetSupport == 0) { - algoEnable[NCCL_ALGO_COLLNET] = 0; + algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; + algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0; // If user has hard set NCCL_ALGO=COLLNET, ignore it if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) { algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1; if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET"); } + } else { + // Disable CollNet+Direct if not on an NVSwitch system + int nvsCount = 0; + NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount)); + if (nvsCount == 0) algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; } for (int c=0; cthreadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD; } comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks; - comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = 512; + comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = 512; + comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = 512; // Override defaults with user env char* str = getenv("NCCL_THREAD_THRESHOLDS"); if (str) { INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str); - ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2}}; + ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }}; sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2); for (int a=0; athreadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL], 
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128], comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], - comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL], - comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128], - comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE]); + comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE], + comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE]); return ncclSuccess; } diff --git a/src/group.cc b/src/group.cc index d9bc684..590068d 100644 --- a/src/group.cc +++ b/src/group.cc @@ -9,31 +9,52 @@ #include "enqueue.h" #include "transport.h" #include "channel.h" +#include __thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting __thread ncclResult_t ncclGroupError = ncclSuccess; __thread struct ncclComm* ncclGroupCommHead = nullptr; __thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr; __thread struct ncclIntruQueue ncclAsyncJobs; +__thread struct ncclGroupJob *ncclGroupJobMainPtr = NULL; +__thread struct ncclGroupJob ncclGroupJobMain; +__thread int ncclGroupBlocking = -1; /* default mode */ +__thread bool ncclGroupJobAbortFlag = false; + +void* ncclAsyncJobMain(void* arg); +static ncclResult_t groupJobComplete(struct ncclGroupJob *job); ncclResult_t ncclAsyncLaunch( struct ncclAsyncJob* job, ncclResult_t(*func)(struct ncclAsyncJob*), void(*undo)(struct ncclAsyncJob*), - void(*destructor)(void*) + void(*destructor)(void*), ncclComm_t comm ) { - if (0 == ncclGroupDepth) { - ncclResult_t res = func(job); - if (res != ncclSuccess && undo) undo(job); + ncclResult_t ret = ncclSuccess; + + if (ncclGroupDepth == 0) { + ret = func(job); + if (ret != ncclSuccess && undo) undo(job); if (destructor) destructor(job); - return res; } else { job->func = func; job->undo = undo; job->destructor = destructor; + job->abortFlag = comm->abortFlag; + job->state = ncclGroupJobRunning; + job->comm = comm; + /* check if there are blocking and nonblocking comms at the same time in group. */ + if (ncclGroupBlocking == -1) { + /* first met communicator */ + ncclGroupBlocking = comm->blocking; + } else if (ncclGroupBlocking != comm->blocking) { + WARN("Blocking and nonblocking communicators are not allowed in the same group."); + ret = ncclInvalidArgument; + } ncclIntruQueueEnqueue(&ncclAsyncJobs, job); - return ncclSuccess; } + + return ret; } void* ncclAsyncJobMain(void* arg) { @@ -42,23 +63,50 @@ void* ncclAsyncJobMain(void* arg) { if (job->result != ncclSuccess) { INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, job->result); } + __atomic_store_n(&job->state, ncclGroupJobDone, __ATOMIC_RELEASE); return arg; } +ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) { + ncclResult_t ret; + SYSCHECK(pthread_join(job->thread, NULL), "pthread_join"); + if (job->result != ncclSuccess) { + WARN("ncclAsyncJobComplete: job %p failed, job error %d", job, job->result); + } + ret = job->result; + if (job->destructor) job->destructor((void*)job); + return ret; +} + NCCL_API(ncclResult_t, ncclGroupStart); ncclResult_t ncclGroupStart() { + ncclResult_t ret = ncclSuccess; NVTX3_FUNC_RANGE_IN(nccl_domain); + + /* if previous group launch does not complete, don't launch this one. 
*/ + if (ncclGroupJobMainPtr != NULL) { + if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) { + ret = ncclInvalidUsage; + goto exit; + } else { + NCCLCHECKGOTO(groupJobComplete(ncclGroupJobMainPtr), ret, exit); + } + } NCCLCHECK(ncclGroupStartInternal()); TRACE_CALL("ncclGroupStart()"); - return ncclSuccess; + +exit: + return ret; } NCCL_API(ncclResult_t, ncclGroupEnd); ncclResult_t ncclGroupEnd() { + ncclResult_t ret = ncclSuccess; NVTX3_FUNC_RANGE_IN(nccl_domain); - NCCLCHECK(ncclGroupEndInternal()); + NCCLCHECKGOTO(ncclGroupEndInternal(), ret, exit); TRACE_CALL("ncclGroupEnd()"); - return ncclSuccess; +exit: + return ret; } struct ncclPreconnectJob { @@ -143,31 +191,103 @@ failure: return result; } -ncclResult_t ncclGroupEndInternal() { - if (ncclGroupDepth == 0) { - WARN("ncclGroupEnd: not in a group call."); - return ncclInvalidUsage; +static inline void groupResetJobState() { + ncclGroupBlocking = -1; + ncclGroupJobMainPtr = NULL; + memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob)); + return; +} + +static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue* asyncJobsPtr, ncclResult_t* groupErrorPtr, ncclResult_t error) { + struct ncclComm* comm = *groupCommHeadPtr; + + while (comm != nullptr) { + struct ncclComm* next = comm->groupNext; + (void) ncclGroupCommLeave(comm); // overwrites comm->groupNext + // We don't know if preconnect succeeded or happened at all, so clear + // the flags that let `taskAppend()` skip over checking if preconnect + // is needed. + comm->preconnectNext = reinterpret_cast(0x1); + for (int i = 0; i < comm->nRanks; i++) { + comm->tasks.peers[i].sendSeen = false; + comm->tasks.peers[i].recvSeen = false; + comm->connectSend[i] = 0; + comm->connectRecv[i] = 0; + } + comm->unlaunchedPlansHead = nullptr; + // Reclaim abandoned kernel plan memory. Note ncclWork structs were already + // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. + while (!ncclIntruQueueEmpty(&comm->planQueue)) { + struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue); + // Persistent plans will be reclaimed via the callbackQueue when the + // graph drops its UserObject reference. + if (!plan->persistent) { + for (int c = 0; c < MAXCHANNELS; c++) { + while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) { + struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue); + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); + } + } + ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); + } + } + // Reset comm->tasks to empty. 
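
For reference while reading the nonblocking plumbing in this file: seen from the application, ncclGroupEnd() on a group containing nonblocking communicators returns ncclInProgress, and completion is observed by polling ncclCommGetAsyncError. A minimal sketch, assuming comm was created with ncclConfig_t::blocking = 0 and that buffers, count and stream are set up elsewhere:

// Hypothetical caller; error handling elided for brevity.
ncclGroupStart();
ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum, comm, stream);
ncclResult_t ret = ncclGroupEnd();            // returns ncclInProgress for nonblocking comms
if (ret == ncclInProgress) {
  ncclResult_t state = ncclInProgress;
  do { ncclCommGetAsyncError(comm, &state); } while (state == ncclInProgress);
  ret = state;                                // final status of the whole group
}
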
+ comm->tasks.nTasksColl = 0; + comm->tasks.nTasksP2p = 0; + comm->tasks.streams = nullptr; + ncclIntruQueueConstruct(&comm->tasks.collQueue); + comm->tasks.collBytesTotal = 0; + for (int i = 0; i < comm->nRanks; i++) { + ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue); + ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue); + } + + if (!comm->blocking) + (void) ncclCommSetAsyncError(comm, error); + comm = next; } - ncclGroupDepth--; - if (ncclGroupDepth > 0) return ncclSuccess; + /* reset everything */ + while (!ncclIntruQueueEmpty(asyncJobsPtr)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr); + *job->abortFlag = 1; + if (job->comm && !job->comm->blocking) + (void) ncclCommSetAsyncError(job->comm, error); + if (job->undo) job->undo(job); + if (job->destructor) job->destructor((void*)job); + } + + *groupErrorPtr = ncclSuccess; + *groupCommHeadPtr = nullptr; + *groupCommPreconnectHeadPtr = nullptr; + return; +} + +static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { int savedDev; - CUDACHECK(cudaGetDevice(&savedDev)); - - ncclResult_t ret = ncclGroupError; + ncclResult_t ret = ncclSuccess; bool jobsDone = false; - if (ret != ncclSuccess) goto failure; + bool errorJobAbortFlag = false; + struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_; + struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr; + struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr; + struct ncclIntruQueue *asyncJobsMain = gjob->asyncJobsPtr; + volatile bool *groupAbortFlag = gjob->abortFlagPtr; - if (ncclGroupCommPreconnectHead != nullptr) { - struct ncclComm* comm = ncclGroupCommPreconnectHead; + CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); + + if (groupCommPreconnectHeadMain != nullptr) { + struct ncclComm* comm = groupCommPreconnectHeadMain; do { struct ncclPreconnectJob* job; - NCCLCHECK(ncclCalloc(&job, 1)); + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); job->base.func = ncclPreconnectFunc; job->base.undo = nullptr; job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; job->comm = comm; - ncclIntruQueueEnqueue(&ncclAsyncJobs, &job->base); + ncclIntruQueueEnqueue(asyncJobsMain, &job->base); struct ncclComm* next = comm->preconnectNext; comm->preconnectNext = reinterpret_cast(0x1); @@ -175,94 +295,154 @@ ncclResult_t ncclGroupEndInternal() { } while (comm != nullptr); } - if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { - struct ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs); + if (!ncclIntruQueueEmpty(asyncJobsMain)) { + struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain); do { - pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job); + SYSCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), ret, fail); job = job->next; } while (job != nullptr); - job = ncclIntruQueueHead(&ncclAsyncJobs); do { - int err = pthread_join(job->thread, nullptr); - if (err != 0) { - WARN("Error waiting for pthread_join : %s", strerror(errno)); - ret = ncclSystemError; - } - if (ret == ncclSuccess && job->result != ncclSuccess) ret = job->result; - job = job->next; - } while (job != nullptr); - - jobsDone = true; - if (ret != ncclSuccess) goto failure; - } - - if (ncclGroupCommHead != nullptr) { - NCCLCHECKGOTO(doLaunches(ncclGroupCommHead), ret, failure); - do { - struct ncclComm* comm = ncclGroupCommHead; - struct ncclComm* next = comm->groupNext; - ncclGroupCommLeave(comm); - ncclGroupCommHead = next; - } while (ncclGroupCommHead != nullptr); - } - - if 
(false) { - failure: - struct ncclComm* comm = ncclGroupCommHead; - while (comm != nullptr) { - struct ncclComm* next = comm->groupNext; - ncclGroupCommLeave(comm); // overwrites comm->groupNext - // We don't know if preconnect succeeded or happened at all, so clear - // the flags that let `taskAppend()` skip over checking if preconnect - // is needed. - comm->preconnectNext = reinterpret_cast(0x1); - for (int i=0; i < comm->nRanks; i++) { - comm->tasks.peers[i].sendSeen = false; - comm->tasks.peers[i].recvSeen = false; - comm->connectSend[i] = 0; - comm->connectRecv[i] = 0; - } - comm->unlaunchedPlansHead = nullptr; - // Reclaim abandoned kernel plan memory. Note ncclWork structs were already - // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. - while (!ncclIntruQueueEmpty(&comm->planQueue)) { - struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue); - // Persistent plans will be reclaimed via the callbackQueue when the - // graph drops its UserObject reference. - if (!plan->persistent) { - for (int c=0; c < MAXCHANNELS; c++) { - while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) { - struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue); - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); - } + jobsDone = true; + job = ncclIntruQueueHead(asyncJobsMain); + do { + ncclGroupJobState_t state = __atomic_load_n(&job->state, __ATOMIC_ACQUIRE); + if (state == ncclGroupJobRunning) { + jobsDone = false; + } else if (state == ncclGroupJobDone) { + if (pthread_join(job->thread, nullptr) != 0) { + WARN("Error waiting for pthread_join : %s", strerror(errno)); + ret = ncclSystemError; } - ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); + job->state = ncclGroupJobJoined; + if (job->result != ncclSuccess) { + ret = job->result; + errorJobAbortFlag = true; + } + } else { + /* safety check */ + assert(state == ncclGroupJobJoined); } - } - // Reset comm->tasks to empty. 
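
The join loop added above is a small three-state machine per job: Running until the worker thread stores Done, then joined exactly once and marked Joined, with errors fanned out to the remaining jobs through their abort flags. A standalone sketch of the same idea, with simplified, hypothetical types (Job, pollJoin) in place of ncclAsyncJob and groupLaunch:

#include <pthread.h>
#include <stdint.h>

enum { Running = 0, Done = 1, Joined = 2 };  // mirrors ncclGroupJobRunning/Done/Joined

struct Job {
  pthread_t thread;
  int state;                     // written by the worker with release semantics
  int result;
  volatile uint32_t* abortFlag;
  Job* next;
};

static int pollJoin(Job* head, volatile bool* groupAbort) {
  int ret = 0;
  bool errorAbort = false, jobsDone;
  do {
    jobsDone = true;
    for (Job* j = head; j; j = j->next) {
      int s = __atomic_load_n(&j->state, __ATOMIC_ACQUIRE);
      if (s == Running) jobsDone = false;          // not finished, keep polling
      else if (s == Done) {                        // finished: reap exactly once
        pthread_join(j->thread, nullptr);
        j->state = Joined;
        if (j->result != 0) { ret = j->result; errorAbort = true; }
      }
      if (*groupAbort || errorAbort) *j->abortFlag = 1;  // ask live jobs to bail out
    }
  } while (!jobsDone);
  return ret;
}
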
- comm->tasks.nTasksColl = 0; - comm->tasks.nTasksP2p = 0; - comm->tasks.streams = nullptr; - ncclIntruQueueConstruct(&comm->tasks.collQueue); - comm->tasks.collBytesTotal = 0; - for (int i=0; i < comm->nRanks; i++) { - ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue); - ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue); - } - comm = next; - } + + if (*groupAbortFlag == true || errorJobAbortFlag == true) { + *job->abortFlag = 1; + ret = ncclInternalError; + } + + job = job->next; + } while (job != nullptr); + } while (jobsDone == false); + + if (ret != ncclSuccess) goto fail; } - while (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { - struct ncclAsyncJob* job = ncclIntruQueueDequeue(&ncclAsyncJobs); - if (ret != ncclSuccess && jobsDone && job->undo) job->undo(job); + if (groupCommHeadMain != nullptr) { + NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail); + } + + /* this atomic must happen before cleanup and setting state of communicators */ + __atomic_store_n(&gjob->doneFlag, true, __ATOMIC_RELEASE); + + while (!ncclIntruQueueEmpty(asyncJobsMain)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain); + if (job->comm && !job->comm->blocking) + (void) ncclCommSetAsyncError(job->comm, ret); if (job->destructor) job->destructor((void*)job); } - ncclGroupError = ncclSuccess; - ncclGroupCommHead = nullptr; - ncclGroupCommPreconnectHead = nullptr; - CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too + while (groupCommHeadMain != nullptr) { + struct ncclComm* comm = groupCommHeadMain; + struct ncclComm* next = comm->groupNext; + (void) ncclGroupCommLeave(comm); + if (!comm->blocking) { + (void) ncclCommSetAsyncError(comm, ret); + } + groupCommHeadMain = next; + } + + *gjob->groupErrorPtr = ncclSuccess; + *gjob->groupCommHeadPtr = nullptr; + *gjob->groupCommPreconnectHeadPtr = nullptr; + + CUDACHECK(cudaSetDevice(savedDev)); + +exit: + return ret; +fail: + groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, ret); + goto exit; +} + +ncclResult_t ncclGroupEndInternal() { + ncclResult_t ret = ncclSuccess; + + if (ncclGroupDepth == 0) { + WARN("ncclGroupEnd: not in a group call."); + ret = ncclInvalidUsage; + goto exit; + } + + if ((--ncclGroupDepth) > 0) goto exit; + + if ((ret = ncclGroupError) != ncclSuccess) goto fail; + + if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) { + ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead; + ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead; + ncclGroupJobMain.groupErrorPtr = &ncclGroupError; + ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs; + ncclGroupJobMain.abortFlagPtr = &ncclGroupJobAbortFlag; + ncclGroupJobMain.doneFlag = false; + ncclGroupJobMainPtr = &ncclGroupJobMain; + /* make sure ncclGroupBlocking has been set. 
*/ + assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1); + if (ncclGroupBlocking == 0 && (ncclGroupCommPreconnectHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs))) { + /* nonblocking group */ + if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { + ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs); + do { + NCCLCHECKGOTO(ncclCommSetAsyncError(job->comm, ncclInProgress), ret, fail); + job = job->next; + } while (job); + } + + if (ncclGroupCommHead) { + ncclComm_t comm = ncclGroupCommHead; + do { + NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail); + comm = comm->groupNext; + } while (comm); + } + ncclGroupJobMainPtr->base.func = groupLaunch; + SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail); + ret = ncclInProgress; + } else { + /* blocking group */ + NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail); + groupResetJobState(); + } + } + +exit: + return ret; +fail: + groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, ret); + groupResetJobState(); + goto exit; +} + +static ncclResult_t groupJobComplete(struct ncclGroupJob* job) { + ncclResult_t ret = ncclSuccess; + if (job) { + ret = ncclAsyncJobComplete(&job->base); + groupResetJobState(); + } return ret; } + +void ncclGroupJobAbort() { + ncclGroupJobAbortFlag = true; + (void) groupJobComplete(ncclGroupJobMainPtr); + /* reset group abort flag */ + ncclGroupJobAbortFlag = false; +} diff --git a/src/include/alloc.h b/src/include/alloc.h index 29ec87a..7c20003 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -21,17 +21,15 @@ uint64_t clockNano(); // from utils.h with which we have a circular dependency template ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; - uint64_t time = 0; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - time = clockNano(); CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish); - time = clockNano() - time; memset(*ptr, 0, nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p seconds: cudaHostAlloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return result; } #define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -80,12 +78,11 @@ ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, in cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - uint64_t time = clockNano(); CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); - time = clockNano() - time; finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9); + if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return result; } #define ncclCudaMalloc(...) 
ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -93,23 +90,20 @@ finish: template ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; - uint64_t time0=0, time1=0, time2=0; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); // Need a side stream so as not to interfere with graph capture. cudaStream_t stream; - time0 = clockNano(); CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - time1 = clockNano(); CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); - time2 = clockNano(); CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaStreamCreateWithFlags=%g cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time1-time0)/1.e9, double(time2-time1)/1.e9); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return result; } #define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -117,17 +111,15 @@ finish: template ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; - uint64_t time = 0; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - time = clockNano(); CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); - time = clockNano() - time; CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return result; } #define ncclCudaCallocAsync(...) 
ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) diff --git a/src/include/checks.h b/src/include/checks.h index 715aeb7..048fc06 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -106,7 +106,7 @@ // Propagate errors up #define NCCLCHECK(call) do { \ ncclResult_t res = call; \ - if (res != ncclSuccess) { \ + if (res != ncclSuccess && res != ncclInProgress) { \ /* Print the back trace*/ \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ return res; \ @@ -115,7 +115,7 @@ #define NCCLCHECKGOTO(call, res, label) do { \ res = call; \ - if (res != ncclSuccess) { \ + if (res != ncclSuccess && res != ncclInProgress) { \ /* Print the back trace*/ \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ goto label; \ @@ -125,7 +125,7 @@ #define NCCLWAIT(call, cond, abortFlagPtr) do { \ volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ ncclResult_t res = call; \ - if (res != ncclSuccess) { \ + if (res != ncclSuccess && res != ncclInProgress) { \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ return ncclInternalError; \ } \ @@ -135,7 +135,7 @@ #define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \ volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ res = call; \ - if (res != ncclSuccess) { \ + if (res != ncclSuccess && res != ncclInProgress) { \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ goto label; \ } \ @@ -143,7 +143,7 @@ } while (!(cond)); #define NCCLCHECKTHREAD(a, args) do { \ - if (((args)->ret = (a)) != ncclSuccess) { \ + if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ return args; \ } \ diff --git a/src/include/collectives.h b/src/include/collectives.h index 7f0d0b6..f50a379 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -38,8 +38,9 @@ struct ncclDevRedOpFull { extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ +#define SINGLE_ARG(...) 
__VA_ARGS__ #define CONCAT(a,b) a##b -#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f) +#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(SINGLE_ARG(t), SINGLE_ARG(f)) #define MACRO_IF_0(t, f) f #define MACRO_IF_1(t, f) t @@ -51,7 +52,8 @@ struct ncclDevRedOpFull { #define DECL3(func, devredop, type, undef) \ DECL4(func, RING, devredop, type, undef) \ DECL4(func, TREE, devredop, type, undef) \ - DECL4(func, COLLNET, devredop, type, undef) + DECL4(func, COLLNET_DIRECT, devredop, type, undef) \ + DECL4(func, COLLNET_CHAIN, devredop, type, undef) #if defined(__CUDA_BF16_TYPES_EXIST__) #define DECL2(func, devredop, undefForFloat) \ @@ -117,7 +119,6 @@ extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)(); #define BROADCAST_CHUNKSTEPS 1 #define REDUCE_SLICESTEPS 1 #define REDUCE_CHUNKSTEPS 1 -#define SENDRECV_SLICEFACTOR 4 #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above #endif diff --git a/src/include/comm.h b/src/include/comm.h index ee752fc..2adce32 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -102,7 +102,8 @@ struct ncclChannel { struct ncclRing ring; int* devRingUserRanks; struct ncclTree tree; - struct ncclDirect collTree; + struct ncclTree collnetChain; + struct ncclDirect collnetDirect; int id; // index of this channel uint32_t workFifoSent; // last used work index+1 uint64_t p2pOpCount; @@ -128,6 +129,7 @@ struct ncclKernelPlan { struct ncclKernelPlan* next; bool persistent; // aka captured in a graph + bool kernelSpecialized; void *kernelFn; int channelUbound; // only channels c < channelUbound are present int channelCount; // number of channels present @@ -201,8 +203,12 @@ struct ncclComm { int p2pnChannelsPerPeer; int p2pChannels[MAXCHANNELS]; + // Should this comm allocate LL buffers for network P2P connections? + bool allocP2pNetLLBuffers; + // Buffer sizes int buffSizes[NCCL_NUM_PROTOCOLS]; + int p2pNetChunkSize; // Algorithm/Protocols thresholds ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; @@ -210,8 +216,9 @@ struct ncclComm { float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - // Whether there has been a fatal error in this communicator. - ncclResult_t fatalError; + /* This attribute can indicate the states of communicators and return code of + * asynchronous NCCL operations. */ + ncclResult_t asyncResult; // Flag to ask NCCL kernels to abort volatile uint32_t *abortFlag; @@ -276,12 +283,16 @@ struct ncclComm { struct ncclIntruQueue planQueue; // First of the unlaunched kernels in `planQueue` struct ncclKernelPlan* unlaunchedPlansHead; -}; -// Set to true during an `atexit()` handler. We use this to intentionally leak -// unfreed CUDA resources when cleaning up after return of `main()` to avoid -// CUDA calls after CUDA runtime teardown. -extern bool ncclMainExited; + // communicator mode + int blocking; + // initState is to more conveniently reclaim resources when errors happen. 
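+ // (A minimal usage sketch of the nonblocking mode this state supports,
+ // assuming a communicator created with config.blocking = 0:
+ //   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
+ //   config.blocking = 0;
+ //   ncclCommInitRankConfig(&comm, nranks, id, rank, &config);
+ //   ncclResult_t state;
+ //   do { ncclCommGetAsyncError(comm, &state); } while (state == ncclInProgress);
+ // ncclCommGetAsyncError() reads asyncResult above; initState records how far
+ // init got so commFree() only tears down what was actually built.)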
+ ncclResult_t initState; + // flag to indicate if ncclCommFinalize() is called + bool finalizeCalled; + // shared structures for finalization + int finalizeRankCnt; +}; enum ncclLaunchMode { ncclLaunchModeInvalid=0, @@ -295,13 +306,16 @@ void ncclCommPushCudaFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle); -inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm) { - struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/false); +inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) { + ncclResult_t result = ncclSuccess; + struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome); while (cb != nullptr) { struct ncclCommCallback* next = cb->next; - NCCLCHECK(cb->fn(comm, cb)); // may reclaim memory of cb + ncclResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb + if (res1 != ncclSuccess) result = res1; cb = next; } + NCCLCHECK(result); return ncclSuccess; } @@ -358,4 +372,7 @@ static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) { return op1 < int(ncclNumOps) ? op : ncclRedOp_t(op1); } +ncclResult_t ncclCommEnsureReady(ncclComm_t comm); +ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState); + #endif diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h index eaa5949..2bd3b4d 100644 --- a/src/include/cudawrap.h +++ b/src/include/cudawrap.h @@ -12,9 +12,9 @@ #if CUDART_VERSION >= 11030 #include #else -typedef CUresult (CUDAAPI *PFN_cuInit)(unsigned int Flags); -typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion)(int *driverVersion); -typedef CUresult (CUDAAPI *PFN_cuGetProcAddress)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); +typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion); +typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); #endif #define CUPFN(symbol) pfn_##symbol @@ -60,27 +60,27 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress)(const char *symbol, void **pfn, } \ } while(0) -#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol +#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol #if CUDART_VERSION >= 11030 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ -DECLARE_CUDA_PFN_EXTERN(cuDeviceGet); -DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute); -DECLARE_CUDA_PFN_EXTERN(cuGetErrorString); -DECLARE_CUDA_PFN_EXTERN(cuGetErrorName); -DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange); -DECLARE_CUDA_PFN_EXTERN(cuCtxCreate_v3020); -DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy); -DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent); +DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000); +DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020); +DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020); +DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000); #if CUDA_VERSION >= 11070 -DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support +DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support #endif #endif /* CUDA Driver functions loaded with dlsym() */ 
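/* (The three symbols below must come from dlsym() because they are needed
 * before cuGetProcAddress() itself is usable; every other driver entry point
 * above is then resolved at init time through cuGetProcAddress() with an
 * explicit per-symbol version. A sketch of what the versioned macros expand
 * to, using cuDeviceGet as the example:
 *   extern PFN_cuDeviceGet_v2000 pfn_cuDeviceGet;                          // DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000)
 *   res = pfn_cuGetProcAddress("cuDeviceGet", (void**)&pfn_cuDeviceGet, 2000, 0);  // LOAD_SYM in cudawrap.cc
 * so newer drivers can keep exporting older ABIs under versioned names.) */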
-DECLARE_CUDA_PFN_EXTERN(cuInit); -DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion); -DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress); +DECLARE_CUDA_PFN_EXTERN(cuInit, 2000); +DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020); +DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030); ncclResult_t cudaLibraryInit(void); diff --git a/src/include/devcomm.h b/src/include/devcomm.h index f8b630e..53d6838 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -15,10 +15,11 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; -#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet +#define NCCL_NUM_ALGORITHMS 4 // Tree/Ring/CollNet* #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 -#define NCCL_ALGO_COLLNET 2 +#define NCCL_ALGO_COLLNET_DIRECT 2 +#define NCCL_ALGO_COLLNET_CHAIN 3 extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 @@ -205,7 +206,9 @@ struct ncclWorkElem { static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9"); struct ncclWorkElemP2p { - int32_t peer; + int peer : 30; + int proto : 2; + enum ncclWorkP2PType p2pType; uint8_t nWarps; uint8_t warpStart; @@ -259,7 +262,8 @@ struct alignas(16) ncclDevChannel { struct ncclDevChannelPeer *peers; struct ncclRing ring; struct ncclTree tree; - struct ncclDirect collTree; + struct ncclTree collnetChain; + struct ncclDirect collnetDirect; uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed }; diff --git a/src/include/graph.h b/src/include/graph.h index 1997f76..63b05b1 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -33,6 +33,7 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); +ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net); int ncclPxnDisable(struct ncclComm* comm); ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank); @@ -51,6 +52,7 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #define NCCL_TOPO_CPU_TYPE_YONGFENG 1 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); +ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id); #define NCCL_TOPO_MAX_NODES 256 @@ -72,8 +74,8 @@ struct ncclTopoGraph { int maxChannels; // Output int nChannels; - float speedIntra; - float speedInter; + float bwIntra; + float bwInter; float latencyInter; int typeIntra; int typeInter; diff --git a/src/include/group.h b/src/include/group.h index e6f31b1..e4b2ee0 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -13,12 +13,19 @@ ncclResult_t ncclGroupErrCheck(ncclResult_t ret); void 
ncclGroupCommJoin(struct ncclComm* comm);
void ncclGroupCommPreconnect(struct ncclComm* comm);
-void ncclGroupCommLeave(struct ncclComm* comm);
+ncclResult_t ncclGroupCommLeave(struct ncclComm* comm);
+void ncclGroupJobAbort();

typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);

+typedef enum ncclGroupJobState {
+ ncclGroupJobRunning = 0,
+ ncclGroupJobDone = 1,
+ ncclGroupJobJoined = 2,
+} ncclGroupJobState_t;
+
 struct ncclAsyncJob {
 struct ncclAsyncJob* next;
 pthread_t thread;
@@ -26,17 +33,31 @@ struct ncclAsyncJob {
 ncclResult_t(*func)(struct ncclAsyncJob*);
 void(*undo)(struct ncclAsyncJob*);
 void(*destructor)(void*);
+ ncclGroupJobState_t state;
+ volatile uint32_t *abortFlag; /* points to the comm's abortFlag */
+ ncclComm_t comm;
 };

 ncclResult_t ncclAsyncLaunch(
 struct ncclAsyncJob* job,
 ncclResult_t(*func)(struct ncclAsyncJob*),
 void(*undo)(struct ncclAsyncJob*),
- void(*destructor)(void*)
+ void(*destructor)(void*), ncclComm_t comm
 );

+struct ncclGroupJob {
+ struct ncclAsyncJob base;
+ struct ncclComm **groupCommHeadPtr;
+ struct ncclComm **groupCommPreconnectHeadPtr;
+ ncclResult_t *groupErrorPtr;
+ volatile bool *abortFlagPtr;
+ struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
+ bool doneFlag;
+};
+
 ncclResult_t ncclGroupStartInternal();
 ncclResult_t ncclGroupEndInternal();
+ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job);

////////////////////////////////////////////////////////////////////////////////

@@ -44,6 +65,7 @@ extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
 extern __thread ncclResult_t ncclGroupError;
 extern __thread struct ncclComm* ncclGroupCommHead;
 extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
+extern __thread int ncclGroupBlocking;

 inline ncclResult_t ncclGroupStartInternal() {
 ncclGroupDepth++;
@@ -52,7 +74,7 @@ inline ncclResult_t ncclGroupStartInternal() {
 inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
 if (ncclGroupDepth > 0) {
- if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret;
+ if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret;
 }
 return ret;
}
@@ -72,6 +94,8 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
 // this comm is allocated there.
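 // (Context: at the end of this function the comm also records its blocking
 // mode in the thread-local ncclGroupBlocking, which ncclGroupEndInternal()
 // consults to pick a blocking launch or a background groupLaunch thread.
 // A minimal sketch of the nonblocking pattern this enables, assuming every
 // comm in the group was created with config.blocking = 0:
 //   ncclGroupStart();
 //   for (int i = 0; i < n; i++)
 //     ncclAllReduce(sendbuf[i], recvbuf[i], count, ncclFloat, ncclSum, comms[i], streams[i]);
 //   if (ncclGroupEnd() == ncclInProgress) {
 //     ncclResult_t st;
 //     do { ncclCommGetAsyncError(comms[0], &st); } while (st == ncclInProgress);
 //   }
 // Polling just comms[0] is a simplification; each comm can be polled.)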
ncclMemoryStackPush(&comm->memScoped); } + + ncclGroupBlocking = comm->blocking; } // Add comm to this thread's group needing preconnect @@ -83,9 +107,10 @@ inline void ncclGroupCommPreconnect(struct ncclComm* comm) { } // Comm has left group -inline void ncclGroupCommLeave(struct ncclComm* comm) { +inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) { comm->groupNext = reinterpret_cast(0x1); ncclMemoryStackPop(&comm->memScoped); + return ncclSuccess; } #endif diff --git a/src/include/info.h b/src/include/info.h index b511728..a770c32 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -22,7 +22,8 @@ typedef enum : uint8_t { ncclPatternTreeUp, ncclPatternTreeDown, ncclPatternTreeUpDown, - ncclPatternCollTreeUpDown, + ncclPatternCollnetChain, + ncclPatternCollnetDirect, ncclPatternSend, ncclPatternRecv } ncclPattern_t; diff --git a/src/include/proxy.h b/src/include/proxy.h index dcab5e2..fa8f388 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -164,6 +164,7 @@ struct ncclProxyState { struct ncclSocket* listenSock; int stop; CUcontext cudaCtx; + int safeAbortFlag; // Used by main thread union ncclSocketAddress* peerAddresses; @@ -183,6 +184,7 @@ struct ncclProxyConnection { struct ncclProxyArgs *proxyAppend; struct ncclProxyArgs **proxyAppendPtr; void* transportResources; + bool initFlag; }; typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); diff --git a/src/init.cc b/src/init.cc index 9269708..25c8d5d 100644 --- a/src/init.cc +++ b/src/init.cc @@ -35,12 +35,13 @@ #endif const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" }; -const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" }; +const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); +NCCL_PARAM(CommBlocking, "COMM_BLOCKING", 0); static uint64_t hashUniqueId(ncclUniqueId const &id) { char const *bytes = (char const*)&id; @@ -73,17 +74,10 @@ pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; static bool initialized = false; static size_t maxLocalSizeBytes = 0; -bool ncclMainExited = false; - -static void atexitHandler() { - ncclMainExited = true; -} - static ncclResult_t ncclInit() { if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess; pthread_mutex_lock(&initLock); if (!initialized) { - atexit(atexitHandler); initEnv(); initGdrCopy(); maxLocalSizeBytes = ncclKernMaxLocalSize(); @@ -178,46 +172,11 @@ void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle) { comm->destructorHead = dtor; } -void commZombieCleanup(struct ncclComm* comm) { - ncclMemoryStackDestruct(&comm->memScoped); - ncclMemoryStackDestruct(&comm->memPermanent); - - struct ncclComm* intraComm0 = comm->intraComm0; - if (0 == ncclAtomicRefCountDecrement(&intraComm0->intraRefs)) { - // Wait for all service threads to be done. 
We could not
- // do it earlier because it could have blocked and prevented
- // other ranks in the process to call ncclCommDestroy
- comm = intraComm0;
- while (comm != nullptr) {
- if (comm->proxyState.thread) pthread_join(comm->proxyState.thread, nullptr);
- struct ncclComm* next = comm->intraNext;
- free(comm);
- comm = next;
- }
- }
-}
-
-static void* commZombieMain(void* arg) {
- ncclResult_t result = ncclSuccess;
- struct ncclComm* comm = (struct ncclComm*)arg;
- while (comm->persistentRefs != 0) {
- struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/true);
- while (cb != nullptr) {
- struct ncclCommCallback* next = cb->next;
- NCCLCHECKGOTO(cb->fn(comm, cb), result, ignore); // may reclaim memory of cb
- ignore:
- cb = next;
- }
- }
- commZombieCleanup(comm);
- return arg;
-}
-
 static ncclResult_t commFree(ncclComm_t comm) {
 if (comm == NULL) return ncclSuccess;
- // First stop all threads before we free anything.
+ // Stop all threads before we free anything.
 NCCLCHECK(ncclProxyDestroy(comm));
 delete[] comm->userRedOps;
@@ -226,9 +185,12 @@ static ncclResult_t commFree(ncclComm_t comm) {
 free(comm->connectSend);
 free(comm->connectRecv);
 free(comm->peerInfo);
- ncclTopoFree(comm->topo);
- for (int n=0; n<comm->nNodes; n++) free(comm->nodeRanks[n].localRankToRank);
- free(comm->nodeRanks);
+ if (comm->topo)
+ ncclTopoFree(comm->topo);
+ if (comm->nodeRanks) {
+ for (int n=0; n<comm->nNodes; n++) free(comm->nodeRanks[n].localRankToRank);
+ free(comm->nodeRanks);
+ }
 free(comm->rankToNode);
 free(comm->rankToLocalRank);
@@ -238,10 +200,10 @@ static ncclResult_t commFree(ncclComm_t comm) {
 for (int channel=0; channel<MAXCHANNELS; channel++) NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
- NCCLCHECK(ncclStrongStreamDestruct(&comm->hostStream));
- NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
-
- NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
+ if (comm->initState == ncclSuccess) {
+ NCCLCHECK(ncclStrongStreamDestruct(&comm->hostStream));
+ NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
+ }
 struct ncclDestructor* dtor = comm->destructorHead;
 while (dtor != nullptr) {
@@ -249,16 +211,34 @@ static ncclResult_t commFree(ncclComm_t comm) {
 dtor = dtor->next;
 }
+ ncclMemoryStackDestruct(&comm->memScoped);
+ ncclMemoryStackDestruct(&comm->memPermanent);
+
 commPoison(comm); // Important that this does not interfere with anything used below.
- if (comm->persistentRefs == 0) {
- commZombieCleanup(comm);
+ if (comm->initState == ncclSuccess) {
+ struct ncclComm* intraComm0 = comm->intraComm0;
+ if (0 == ncclAtomicRefCountDecrement(&intraComm0->intraRefs)) {
+ // Wait for all service threads to be done. We could not
+ // do it earlier because it could have blocked and prevented
+ // other ranks in the process from calling ncclCommDestroy
+ comm = intraComm0;
+ while (comm != nullptr) {
+ if (comm->proxyState.thread) pthread_join(comm->proxyState.thread, nullptr);
+ struct ncclComm* next = comm->intraNext;
+ free(comm);
+ comm = next;
+ }
+ }
+ } else if (comm->proxyState.thread) {
+ pthread_join(comm->proxyState.thread, nullptr);
+ ncclCudaHostFree((void *)comm->abortFlag);
+ free(comm);
 } else {
- // Spawn a thread to listen for remaining messages from graph cleanup.
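 // (What replaces the zombie thread: commDestroySync() now waits on the
 // calling thread until every captured CUDA graph has released the
 // communicator, roughly:
 //   while (comm->persistentRefs != 0)
 //     NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/true));
 // with ncclCommFinalize()/commReclaim() deciding when that wait runs.)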
- pthread_t zombie;
- pthread_create(&zombie, nullptr, commZombieMain, comm);
- pthread_detach(zombie);
+ ncclCudaHostFree((void *)comm->abortFlag);
+ free(comm);
 }
+
 return ncclSuccess;
}
@@ -290,6 +270,26 @@ static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
 return ncclInternalError;
}

+ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
+ /* comm must be ready, or an error will be reported */
+ ncclResult_t ret = ncclSuccess;
+
+ if (*comm->abortFlag) {
+ ncclGroupJobAbort();
+ } else {
+ NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
+ if (ret != ncclSuccess) {
+ /* ncclInProgress means the comm is still initializing; report that early
+ * use as ncclInvalidArgument. Any other error is kept as is. */
+ WARN("Attempt to use communicator before the previous operation returned ncclSuccess");
+ if (ret == ncclInProgress) ret = ncclInvalidArgument;
+ goto exit;
+ }
+ }
+
+exit:
+ return ret;
+}
+
 static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 if (ndev < 1) {
 WARN("invalid device count (%d) requested", ndev);
@@ -301,7 +301,19 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 }
 struct ncclComm* comm;
- NCCLCHECK(ncclCalloc(&comm, 1));
+ /* Currently we calloc comm in ncclCommInitRankDev for async function support.
+ * This 'if' structure handles the case where commAlloc is called from paths
+ * other than ncclCommInitRankDev. */
+ if (*comret == NULL) {
+ /* user requests a new communicator */
+ NCCLCHECK(ncclCalloc(&comm, 1));
+ NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1));
+ NCCLCHECK(ncclCommSetAsyncError(comm, ncclInProgress));
+ } else {
+ /* We already allocated a communicator in ncclCommInitRankDev. */
+ comm = *comret;
+ }
+
 ncclMemoryStackConstruct(&comm->memPermanent);
 ncclMemoryStackConstruct(&comm->memScoped);
 comm->destructorHead = nullptr;
@@ -322,10 +334,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
 comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
- comm->fatalError = ncclSuccess;
-
- NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1));
- *comm->abortFlag = 0;
 comm->collNetSupport = 0;
@@ -397,7 +405,8 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
 tmpCommAndChans.channels[c].ring = comm->channels[c].ring;
 tmpCommAndChans.channels[c].ring.userRanks = comm->channels[c].devRingUserRanks;
 tmpCommAndChans.channels[c].tree = comm->channels[c].tree;
- tmpCommAndChans.channels[c].collTree = comm->channels[c].collTree;
+ tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain;
+ tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect;
 tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c];

 if (comm->channels[c].ring.userRanks != nullptr) {
@@ -471,6 +480,8 @@ NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
 NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
 NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
+NCCL_PARAM(P2pNetChunkSize, "P2P_NET_CHUNKSIZE", (1 << 17)); /* 128 kB */
+
 static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
 int cpuArch, cpuVendor, cpuModel;
 NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
@@ -483,12 +494,15 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
 for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
 comm->buffSizes[p] = envs[p] != -2 ?
envs[p] : defaults[p]; } + + comm->p2pNetChunkSize = ncclParamP2pNetChunkSize(); return ncclSuccess; } NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1); +NCCL_PARAM(AllocP2pNetLLBuffers, "NCCL_ALLOC_P2P_NET_LL_BUFFERS", 0); static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) { // We use 2 AllGathers @@ -568,6 +582,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph)); NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph)); + // Initialize num P2P LL buffers for this communicator + comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1; + if (comm->rank == ncclParamGraphDumpFileRank()) { struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph }; NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs)); @@ -590,8 +607,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int pattern; int nChannels; int sameChannels; - float speedIntra; - float speedInter; + float bwIntra; + float bwInter; int typeIntra; int typeInter; }; @@ -611,22 +628,22 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm allGather3Data[rank].tree.pattern = treeGraph.pattern; allGather3Data[rank].tree.nChannels = treeGraph.nChannels; allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; - allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra; - allGather3Data[rank].tree.speedInter = treeGraph.speedInter; + allGather3Data[rank].tree.bwIntra = treeGraph.bwIntra; + allGather3Data[rank].tree.bwInter = treeGraph.bwInter; allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra; allGather3Data[rank].tree.typeInter = treeGraph.typeInter; allGather3Data[rank].ring.pattern = ringGraph.pattern; allGather3Data[rank].ring.nChannels = ringGraph.nChannels; allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels; - allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra; - allGather3Data[rank].ring.speedInter = ringGraph.speedInter; + allGather3Data[rank].ring.bwIntra = ringGraph.bwIntra; + allGather3Data[rank].ring.bwInter = ringGraph.bwInter; allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra; allGather3Data[rank].ring.typeInter = ringGraph.typeInter; allGather3Data[rank].collNet.pattern = collNetGraph.pattern; allGather3Data[rank].collNet.nChannels = collNetGraph.nChannels; allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels; - allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra; - allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter; + allGather3Data[rank].collNet.bwIntra = collNetGraph.bwIntra; + allGather3Data[rank].collNet.bwInter = collNetGraph.bwInter; allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra; allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter; allGather3Data[rank].collNetSupport = comm->collNetSupport; @@ -695,20 +712,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Make sure we align all ranks so that the tuning is consistent across ranks treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels); treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); - treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); - treeGraph.speedInter = 
std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter); + treeGraph.bwIntra = std::min(allGather3Data[i].tree.bwIntra, treeGraph.bwIntra); + treeGraph.bwInter = std::min(allGather3Data[i].tree.bwInter, treeGraph.bwInter); treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter); ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels); ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); - ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); - ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); + ringGraph.bwIntra = std::min(allGather3Data[i].ring.bwIntra, ringGraph.bwIntra); + ringGraph.bwInter = std::min(allGather3Data[i].ring.bwInter, ringGraph.bwInter); ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter); collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels); collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels); - collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra); - collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter); + collNetGraph.bwIntra = std::min(allGather3Data[i].collNet.bwIntra, collNetGraph.bwIntra); + collNetGraph.bwInter = std::min(allGather3Data[i].collNet.bwInter, collNetGraph.bwInter); collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport); @@ -812,16 +829,38 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup); TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); - // Connect intra-node CollNet + char line[1024]; + line[0]='\0'; + for (int c=0; cnChannels; c++) { + struct ncclTree* chain = &comm->channels[c].collnetChain; + snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d", + c, chain->down[0], rank, chain->up); + } + line[1023] = '\0'; + INFO(NCCL_INIT, "Collnet Chains %s", line); + // Connect Collnet + chain + for (int c=0; cnChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, collnet_cleanup); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup); + for (int c=0; cnChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, collnet_cleanup); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup); + INFO(NCCL_INIT, "Connected collnet + chain"); + + // Connect intra-node CollNet + Direct int highestTransportType0, highestTransportType1; for (int c=0; cnChannels; c++) { struct ncclChannel* channelRecv = comm->channels+c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, 
NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, collnet_cleanup); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup); for (int c=0; cnChannels; c++) { struct ncclChannel* channelSend = comm->channels+c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, collnet_cleanup); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup); @@ -1024,9 +1063,15 @@ struct ncclCommInitRankAsyncJob { int cudaDev; }; +struct ncclCommFinalizeAsyncJob { + struct ncclAsyncJob base; + ncclComm_t comm; +}; + static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; ncclComm_t* newcomm = job->newcomm; + ncclComm_t comm = *newcomm; int nranks = job->nranks; ncclUniqueId commId = job->commId; // C++ struct assignment int myrank = job->myrank; @@ -1040,58 +1085,85 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes); CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes)); } - *newcomm = NULL; NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); + // update communicator state + comm->initState = ncclSuccess; + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId); TRACE_CALL("ncclCommInitRank(%p,%d,0x%llx,%d,%d)", *newcomm, nranks, (unsigned long long)hashUniqueId(commId), myrank, (*newcomm)->cudaDev); return ncclSuccess; cleanup: - if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap); - *newcomm = NULL; + comm->initState = res; return res; } +static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { + ncclResult_t ret = ncclSuccess; + + /* first set configuration */ + if (config) { + comm->blocking = config->blocking; + } else { + /* default setting of communicator */ + comm->blocking = 1; + } + + return ret; +} + static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) { struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; ncclCommDestroy(*job->newcomm); *job->newcomm = nullptr; } -static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { +static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config) { ncclResult_t res; + ncclComm_t comm = NULL; + struct ncclCommInitRankAsyncJob *job = NULL; char* env = getenv("NCCL_COMM_ID"); if (env && myrank == 0) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env); - NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, end); + NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, fail); } - NCCLCHECKGOTO(ncclInit(), res, end); + 
NCCLCHECKGOTO(ncclInit(), res, fail); if (myrank == 0) showVersion(); // Make sure the CUDA runtime is initialized. - CUDACHECKGOTO(cudaFree(NULL), res, end); + CUDACHECKGOTO(cudaFree(NULL), res, fail); - NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, end); + NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, fail); if (nranks < 1 || myrank < 0 || myrank >= nranks) { WARN("Invalid rank requested : %d/%d", myrank, nranks); res = ncclInvalidArgument; - goto end; + goto fail; } - struct ncclCommInitRankAsyncJob *job; - NCCLCHECKGOTO(ncclCalloc(&job, 1), res, end); + NCCLCHECKGOTO(ncclCalloc(&comm, 1), res, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1), res, fail); + // set up comm state and abortFlag only + *comm->abortFlag = 0; + NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail); + /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ + comm->initState = ncclInternalError; + *newcomm = comm; + + NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->newcomm = newcomm; job->nranks = nranks; job->commId = commId; // C++ struct assignment job->myrank = myrank; job->cudaDev = cudaDev; - NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, ncclCommInitRankUndo, free), res, end); + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail); -end: +exit: return ncclGroupErrCheck(res); +fail: + goto exit; } NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); @@ -1103,63 +1175,296 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev)); + NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, NULL)); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { NVTX3_FUNC_RANGE_IN(nccl_domain); - + ncclResult_t ret = ncclSuccess; + int totalnDev; + int *gpuFlags = NULL; // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void) cudaLibraryInit(); - NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms")); + NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail); if (ndev < 0) { WARN("Invalid device count requested : %d", ndev); - return ncclInvalidArgument; + ret = ncclInvalidArgument; + goto fail; + } + + CUDACHECKGOTO(cudaGetDeviceCount(&totalnDev), ret, fail); + if (devlist) { + NCCLCHECKGOTO(ncclCalloc(&gpuFlags, totalnDev), ret, fail); + for (int i = 0; i < ndev; ++i) { + /* invalid device check. */ + if (devlist[i] < 0 || devlist[i] >= totalnDev) { + ret = ncclUnhandledCudaError; + goto fail; + } + + /* duplicate device check. 
*/ + if (gpuFlags[devlist[i]] != 0) { + ret = ncclInvalidUsage; + goto fail; + } + + gpuFlags[devlist[i]] = 1; + } + free(gpuFlags); } ncclUniqueId uniqueId; - NCCLCHECK(ncclGetUniqueId(&uniqueId)); - NCCLCHECK(ncclGroupStart()); + NCCLCHECKGOTO(ncclGetUniqueId(&uniqueId), ret, fail); + NCCLCHECKGOTO(ncclGroupStart(), ret, fail); for (int i=0; irank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) { - WARN("comm %p has already been destroyed", comm); +ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) { + if (nextState < 0 || nextState >= ncclNumResults || comm == NULL) { + WARN("ncclCommSetAsyncError: error comm %p sets state %d", comm, nextState); return ncclInvalidArgument; } + __atomic_store_n(&comm->asyncResult, nextState, __ATOMIC_RELEASE); + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config); +ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) { + NVTX3_FUNC_RANGE_IN(nccl_domain); + int cudaDev; + ncclResult_t ret = ncclSuccess; + ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; + ncclConfig_t *internalConfigPtr; + size_t realSize; + int blockingEnv; + + NCCLCHECK(ncclGroupStartInternal()); + internalConfigPtr = &internalConfig; + if (config) { + memcpy((void*)&realSize, (void*)config, sizeof(size_t)); + realSize = realSize > sizeof(ncclConfig_t) ? sizeof(ncclConfig_t) : realSize; + memcpy((void*)internalConfigPtr, (void*)config, realSize); + if (internalConfigPtr->magic != 0xcafebeef) { + WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER"); + ret = ncclInvalidArgument; + goto exit; + } + } + + /* check input config attributes */ + if (internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) { + WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking); + ret = ncclInvalidArgument; + goto exit; + } + + /* overwrite configuration from env variable. 
*/ + blockingEnv = ncclParamCommBlocking(); + if (blockingEnv != 0 && blockingEnv != 1) { + WARN("Invalid NCCL_COMM_BLOCKING value %d", blockingEnv); + } + if (blockingEnv == 1) internalConfigPtr->blocking = blockingEnv; + + (void) cudaLibraryInit(); + CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, exit); + NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail); + +exit: + ncclGroupErrCheck(ret); + NCCLCHECK(ncclGroupEndInternal()); + if (newcomm && *newcomm && !(*newcomm)->blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); + return ret; +fail: + if (newcomm && *newcomm && !(*newcomm)->blocking) (void) ncclCommSetAsyncError(*newcomm, ret); + goto exit; +} + +static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { + struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_; + ncclComm_t comm = job->comm; int savedDevice; CUDACHECK(cudaGetDevice(&savedDevice)); int commDevice = comm->cudaDev; + ncclResult_t ret; + CUDACHECKGOTO(cudaGetDevice(&savedDevice), ret, fail); + if (savedDevice != commDevice) { + CUDACHECKGOTO(cudaSetDevice(commDevice), ret, fail); + } + + TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult); + + if (comm->initState == ncclSuccess) { + NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->deviceStream), ret, fail); + } + NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); + // And keep polling until all graphs referencing us die. + while (comm->persistentRefs != 0) { + NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); + } + + if (savedDevice != commDevice) { + CUDACHECKGOTO(cudaSetDevice(savedDevice), ret, fail); + } + +exit: + return ret; +fail: + goto exit; +} + +static ncclResult_t commCleanup(ncclComm_t comm) { + int savedDevice; + int commDevice = comm->cudaDev; + + CUDACHECK(cudaGetDevice(&savedDevice)); if (savedDevice != commDevice) { CUDACHECK(cudaSetDevice(commDevice)); } - TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, *comm->abortFlag, comm->fatalError); - - NCCLCHECK(ncclStrongStreamSynchronize(&comm->hostStream)); - NCCLCHECK(ncclStrongStreamSynchronize(&comm->deviceStream)); - NCCLCHECK(ncclCommPollCallbacks(comm)); - NCCLCHECK(commFree(comm)); - if (savedDevice != commDevice) + if (savedDevice != commDevice) { CUDACHECK(cudaSetDevice(savedDevice)); + } return ncclSuccess; } +static ncclResult_t commFinalize(ncclComm_t comm, bool userCalled) { + ncclResult_t ret = ncclSuccess; + struct ncclCommFinalizeAsyncJob *job = NULL; + + comm->finalizeCalled = true; + /* launch async thread to finalize comm. */ + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->comm = comm; + + if (userCalled) { + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail); + } else { + NCCLCHECKGOTO(commDestroySync(&job->base), ret, fail); + free(job); + } + +exit: + return ncclGroupErrCheck(ret); +fail: + if (job) free(job); + goto exit; +} + +NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm); +ncclResult_t ncclCommFinalize(ncclComm_t comm) { + NVTX3_FUNC_RANGE_IN(nccl_domain); + ncclResult_t ret = ncclSuccess; + + NCCLCHECK(ncclGroupStartInternal()); + if (comm == NULL) goto exit; + + /* wait comm ready before finalize. */ + NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail); + + /* prevent double finalize. 
*/ + if (comm->finalizeCalled) { + ret = ncclInvalidArgument; + goto fail; + } + + /* finalize comm. */ + ret = commFinalize(comm, true); + +exit: + ncclGroupErrCheck(ret); + NCCLCHECK(ncclGroupEndInternal()); + if (comm && !comm->blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) }; + return ret; +fail: + if (comm && !comm->blocking) (void) ncclCommSetAsyncError(comm, ret); + goto exit; +} + +static ncclResult_t commReclaim(ncclComm_t comm) { + ncclResult_t ret = ncclSuccess; + ncclResult_t state; + int curRank; /* Debug info */ + + NCCLCHECKGOTO(ncclCommGetAsyncError(comm, &state), ret, fail); + TRACE(NCCL_INIT, "commReclaim: reclaim comm %p rank %d state %d", comm, comm->rank, state); + if (state == ncclSuccess && *comm->abortFlag == 0 && comm->finalizeCalled == false) { + /* user does not call ncclCommFinalize and this is a normal comm destroy. ncclCommDestroy + * should be nonblocking until last call of ncclCommDestroy. */ + NCCLCHECKGOTO(commFinalize(comm, false), ret, fail); + } + + if (comm->initState != ncclSuccess) { + /* if init errors happen, no finalize thread should have been launched. Main thread can reclaim + * everything since no NCCL kernel was issued. */ + struct ncclCommFinalizeAsyncJob job; + + job.comm = comm; + curRank = comm->rank; + /* comm aborts, commDestroySync should not be blocked. */ + if ((ret = commDestroySync((struct ncclAsyncJob*) &job)) != ncclSuccess) { + WARN("commReclaim: comm %p (rank = %d) in abort, error %d", comm, curRank, ret); + } + + if ((ret = commCleanup(comm)) != ncclSuccess) { + WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", comm, curRank, ret); + } + } else { + int curRankCnt; + int intraRanks = comm->intraRanks; + ncclComm_t intracomm0 = comm->intraComm0; + int *finalizeRankCnt = &intracomm0->finalizeRankCnt; + + assert(intracomm0 != NULL && finalizeRankCnt != NULL); + curRankCnt = __atomic_add_fetch(finalizeRankCnt, 1, __ATOMIC_ACQ_REL); + if (curRankCnt == intraRanks) { + ncclComm_t curIntraComm; + ncclComm_t nextIntraComm = intracomm0; + + while (nextIntraComm) { + curIntraComm = nextIntraComm; + curRank = curIntraComm->rank; + nextIntraComm = nextIntraComm->intraNext; + + if (comm->finalizeCalled == false) { + struct ncclCommFinalizeAsyncJob job; + job.comm = curIntraComm; + /* every comm aborts, commDestroySync should not be blocked. */ + if ((ret = commDestroySync((struct ncclAsyncJob*) &job)) != ncclSuccess) + WARN("commReclaim: comm %p (rank = %d) in abort, error %d", curIntraComm, curRank, ret); + } + + if ((ret = commCleanup(curIntraComm)) != ncclSuccess) { + WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", curIntraComm, curRank, ret); + } + } + } + } + +exit: + return ret; +fail: + goto exit; +} + NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); ncclResult_t ncclCommDestroy(ncclComm_t comm) { NVTX3_FUNC_RANGE_IN(nccl_domain); @@ -1169,9 +1474,18 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; int64_t busId = comm->busId; TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId); + // Try and prevent a double free of the comm struct (user error) + if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) { + WARN("comm %p has already been destroyed", comm); + return ncclInvalidArgument; + } - NCCLCHECK(commDestroy(comm)); + /* init thread must be joined before we destroy the comm. 
*/ + NCCLCHECK(ncclCommEnsureReady(comm)); + + NCCLCHECK(commReclaim(comm)); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Destroy COMPLETE", comm, rank, nranks, cudaDev, busId); + return ncclSuccess; } @@ -1187,9 +1501,13 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { // Ask anything that might still be running on the device to quit *comm->abortFlag = 1; + /* init thread must be joined before we destroy the comm, + * and we should ignore the init error here. */ + ncclCommEnsureReady(comm); - NCCLCHECK(commDestroy(comm)); + (void) commReclaim(comm); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Abort COMPLETE", comm, rank, nranks, cudaDev, busId); + return ncclSuccess; } @@ -1203,6 +1521,7 @@ const char* ncclGetErrorString(ncclResult_t code) { case ncclInvalidArgument : return "invalid argument"; case ncclInvalidUsage : return "invalid usage"; case ncclRemoteError : return "remote process exited or there was a network error"; + case ncclInProgress : return "NCCL operation in progress"; default : return "unknown result code"; } } @@ -1219,15 +1538,21 @@ NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asy ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm")); NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError")); - *asyncError = comm->fatalError; + + *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCLCHECK(PtrCheck(comm, "CommCount", "comm")); NCCLCHECK(PtrCheck(count, "CommCount", "count")); + + /* init thread must be joined before we access the attributes of comm. 
*/ + NCCLCHECK(ncclCommEnsureReady(comm)); + *count = comm->nRanks; return ncclSuccess; } @@ -1235,8 +1560,12 @@ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm")); NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid")); + + NCCLCHECK(ncclCommEnsureReady(comm)); + *devid = comm->cudaDev; return ncclSuccess; } @@ -1244,8 +1573,12 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm")); NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank")); + + NCCLCHECK(ncclCommEnsureReady(comm)); + *rank = comm->rank; return ncclSuccess; } diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index 43c95c2..52663b5 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -10,32 +10,30 @@ #include -#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr +#define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr #if CUDART_VERSION >= 11030 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ -DECLARE_CUDA_PFN(cuDeviceGet); -DECLARE_CUDA_PFN(cuDeviceGetAttribute); -DECLARE_CUDA_PFN(cuGetErrorString); -DECLARE_CUDA_PFN(cuGetErrorName); +DECLARE_CUDA_PFN(cuDeviceGet, 2000); +DECLARE_CUDA_PFN(cuDeviceGetAttribute, 2000); +DECLARE_CUDA_PFN(cuGetErrorString, 6000); +DECLARE_CUDA_PFN(cuGetErrorName, 6000); /* enqueue.cc */ -DECLARE_CUDA_PFN(cuMemGetAddressRange); +DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020); /* proxy.cc */ -DECLARE_CUDA_PFN(cuCtxCreate_v3020); -DECLARE_CUDA_PFN(cuCtxDestroy); -DECLARE_CUDA_PFN(cuCtxSetCurrent); +DECLARE_CUDA_PFN(cuCtxCreate, 3020); +DECLARE_CUDA_PFN(cuCtxDestroy, 4000); +DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000); #if CUDA_VERSION >= 11070 /* transport/collNet.cc/net.cc*/ -DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support +DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support #endif #endif /* CUDA Driver functions loaded with dlsym() */ -DECLARE_CUDA_PFN(cuInit); -DECLARE_CUDA_PFN(cuDriverGetVersion); -DECLARE_CUDA_PFN(cuGetProcAddress); - -static enum { cudaUninitialized, cudaInitializing, cudaInitialized, cudaError } cudaState = cudaUninitialized; +DECLARE_CUDA_PFN(cuInit, 2000); +DECLARE_CUDA_PFN(cuDriverGetVersion, 2020); +DECLARE_CUDA_PFN(cuGetProcAddress, 11030); #define CUDA_DRIVER_MIN_VERSION 11030 @@ -46,46 +44,37 @@ static int cudaDriverVersion; /* Load the CUDA symbols */ -static int cudaPfnFuncLoader(void) { +static ncclResult_t cudaPfnFuncLoader(void) { CUresult res; -#define LOAD_SYM(symbol, ignore) do { \ - res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), cudaDriverVersion, 0); \ +#define LOAD_SYM(symbol, version, ignore) do { \ + res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), version, 0); \ if (res != 0) { \ if (!ignore) { \ - WARN("Retrieve %s version %d failed with %d", #symbol, cudaDriverVersion, res); \ + WARN("Retrieve %s version %d failed with %d", #symbol, version, res); \ return ncclSystemError; } \ } } while(0) - LOAD_SYM(cuGetErrorString, 0); - LOAD_SYM(cuGetErrorName, 0); - LOAD_SYM(cuDeviceGet, 0); - LOAD_SYM(cuDeviceGetAttribute, 0); - 
LOAD_SYM(cuMemGetAddressRange, 1); - LOAD_SYM(cuCtxCreate_v3020, 1); - LOAD_SYM(cuCtxDestroy, 1); - LOAD_SYM(cuCtxSetCurrent, 1); + LOAD_SYM(cuGetErrorString, 6000, 0); + LOAD_SYM(cuGetErrorName, 6000, 0); + LOAD_SYM(cuDeviceGet, 2000, 0); + LOAD_SYM(cuDeviceGetAttribute, 2000, 0); + LOAD_SYM(cuMemGetAddressRange, 3020, 1); + LOAD_SYM(cuCtxCreate, 3020, 1); + LOAD_SYM(cuCtxDestroy, 4000, 1); + LOAD_SYM(cuCtxSetCurrent, 4000, 1); #if CUDA_VERSION >= 11070 - LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support + LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support #endif return ncclSuccess; } #endif -ncclResult_t cudaLibraryInit(void) { +static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static ncclResult_t initResult; + +static void initOnceFunc() { CUresult res; - - if (cudaState == cudaInitialized) - return ncclSuccess; - if (cudaState == cudaError) - return ncclSystemError; - - if (__sync_bool_compare_and_swap(&cudaState, cudaUninitialized, cudaInitializing) == false) { - // Another thread raced in front of us. Wait for it to be done. - while (cudaState == cudaInitializing) sched_yield(); - return (cudaState == cudaInitialized) ? ncclSuccess : ncclSystemError; - } - /* * Load CUDA driver library */ @@ -106,13 +95,13 @@ ncclResult_t cudaLibraryInit(void) { * Load initial CUDA functions */ - pfn_cuInit = (PFN_cuInit) dlsym(cudaLib, "cuInit"); + pfn_cuInit = (PFN_cuInit_v2000) dlsym(cudaLib, "cuInit"); if (pfn_cuInit == NULL) { WARN("Failed to load CUDA missing symbol cuInit"); goto error; } - pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion) dlsym(cudaLib, "cuDriverGetVersion"); + pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion_v2020) dlsym(cudaLib, "cuDriverGetVersion"); if (pfn_cuDriverGetVersion == NULL) { WARN("Failed to load CUDA missing symbol cuDriverGetVersion"); goto error; @@ -132,7 +121,7 @@ ncclResult_t cudaLibraryInit(void) { goto error; } - pfn_cuGetProcAddress = (PFN_cuGetProcAddress) dlsym(cudaLib, "cuGetProcAddress"); + pfn_cuGetProcAddress = (PFN_cuGetProcAddress_v11030) dlsym(cudaLib, "cuGetProcAddress"); if (pfn_cuGetProcAddress == NULL) { WARN("Failed to load CUDA missing symbol cuGetProcAddress"); goto error; @@ -145,19 +134,21 @@ ncclResult_t cudaLibraryInit(void) { */ pfn_cuInit(0); -#if CUDART_VERSION >= 11030 + #if CUDART_VERSION >= 11030 if (cudaPfnFuncLoader()) { WARN("CUDA some PFN functions not found in the library"); goto error; } -#endif - - cudaState = cudaInitialized; - return ncclSuccess; + #endif + initResult = ncclSuccess; + return; error: - cudaState = cudaError; - return ncclSystemError; + initResult = ncclSystemError; + return; } - +ncclResult_t cudaLibraryInit() { + pthread_once(&initOnceControl, initOnceFunc); + return initResult; +} diff --git a/src/misc/gdrwrap.cc b/src/misc/gdrwrap.cc index e81c7ea..4729efe 100644 --- a/src/misc/gdrwrap.cc +++ b/src/misc/gdrwrap.cc @@ -9,8 +9,6 @@ #ifndef GDR_DIRECT #include "core.h" -static enum { gdrUninitialized, gdrInitializing, gdrInitialized, gdrError } gdrState = gdrUninitialized; - /* Function pointers assigned from dlopen() */ static gdr_t (*gdr_internal_open)(void); static int (*gdr_internal_close)(gdr_t g); @@ -49,18 +47,10 @@ pthread_mutex_t gdrLock = PTHREAD_MUTEX_INITIALIZER; *cast = tmp; \ } while (0) -ncclResult_t wrap_gdr_symbols(void) { - if (gdrState == gdrInitialized) - return ncclSuccess; - if (gdrState == gdrError) - return ncclSystemError; - - if (__sync_bool_compare_and_swap(&gdrState, gdrUninitialized, gdrInitializing) == false) { - // Another thread 
raced in front of us. Wait for it to be done. - while (gdrState == gdrInitializing) sched_yield(); - return (gdrState == gdrInitialized) ? ncclSuccess : ncclSystemError; - } +static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static ncclResult_t initResult; +static void initOnceFunc(void) { static void* gdrhandle = NULL; void* tmp; void** cast; @@ -84,8 +74,8 @@ ncclResult_t wrap_gdr_symbols(void) { LOAD_SYM(gdrhandle, "gdr_copy_to_mapping", gdr_internal_copy_to_mapping); LOAD_SYM(gdrhandle, "gdr_copy_from_mapping", gdr_internal_copy_from_mapping); - gdrState = gdrInitialized; - return ncclSuccess; + initResult = ncclSuccess; + return; teardown: gdr_internal_open = NULL; @@ -101,11 +91,16 @@ teardown: gdr_internal_copy_from_mapping = NULL; if (gdrhandle != NULL) dlclose(gdrhandle); - gdrState = gdrError; - return ncclSystemError; + initResult = ncclSystemError; + return; } +ncclResult_t wrap_gdr_symbols(void) { + pthread_once(&initOnceControl, initOnceFunc); + return initResult; +} + gdr_t wrap_gdr_open(void) { if (gdr_internal_open == NULL) { WARN("GDRCOPY lib wrapper not initialized."); diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index 3b8daac..8a736d3 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -11,8 +11,6 @@ #include #include "core.h" -static enum { ibvUninitialized, ibvInitializing, ibvInitialized, ibvError } ibvState = ibvUninitialized; - /*Function Pointers*/ int (*ibv_internal_fork_init)(void); struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); @@ -43,18 +41,10 @@ const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); // IBVERBS Library versioning #define IBVERBS_VERSION "IBVERBS_1.1" -ncclResult_t wrap_ibv_symbols(void) { - if (ibvState == ibvInitialized) - return ncclSuccess; - if (ibvState == ibvError) - return ncclSystemError; - - if (__sync_bool_compare_and_swap(&ibvState, ibvUninitialized, ibvInitializing) == false) { - // Another thread raced in front of us. Wait for it to be done. - while (ibvState == ibvInitializing) sched_yield(); - return (ibvState == ibvInitialized) ? 
ncclSuccess : ncclSystemError; - } +static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static ncclResult_t initResult; +static void initOnceFunc(void) { static void* ibvhandle = NULL; void* tmp; void** cast; @@ -111,8 +101,8 @@ ncclResult_t wrap_ibv_symbols(void) { LOAD_SYM(ibvhandle, "ibv_fork_init", ibv_internal_fork_init); LOAD_SYM(ibvhandle, "ibv_event_type_str", ibv_internal_event_type_str); - ibvState = ibvInitialized; - return ncclSuccess; + initResult = ncclSuccess; + return; teardown: ibv_internal_get_device_list = NULL; @@ -141,8 +131,13 @@ teardown: ibv_internal_event_type_str = NULL; if (ibvhandle != NULL) dlclose(ibvhandle); - ibvState = ibvError; - return ncclSystemError; + initResult = ncclSystemError; + return; +} + +ncclResult_t wrap_ibv_symbols(void) { + pthread_once(&initOnceControl, initOnceFunc); + return initResult; } #define IBV_PTR_CHECK_ERRNO(name_internal, call, retval, error_retval, name) \ @@ -256,7 +251,7 @@ ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int } ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) { - IBV_PTR_CHECK(ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd"); + IBV_PTR_CHECK_ERRNO(ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd"); } ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ @@ -290,6 +285,7 @@ ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { if (ibv_internal_reg_dmabuf_mr == NULL) { + errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set return NULL; } return ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access); @@ -300,7 +296,7 @@ ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or t } ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) { - IBV_PTR_CHECK(ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq"); + IBV_PTR_CHECK_ERRNO(ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq"); } ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) { @@ -312,7 +308,7 @@ ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) { } ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { - IBV_PTR_CHECK(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp"); + IBV_PTR_CHECK_ERRNO(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp"); } ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc index d6bc353..a432ff6 100644 --- a/src/misc/shmutils.cc +++ b/src/misc/shmutils.cc @@ -38,6 +38,7 @@ static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize); return ncclSystemError; } + INFO(NCCL_ALLOC, "Allocated %d bytes of shared memory in %s\n", shmSize, 
shmPath); } else { SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd); } @@ -81,10 +82,12 @@ ncclResult_t ncclShmUnlink(const char* shmPath) { } ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) { - if (devShmPtr) CUDACHECK(cudaHostUnregister(shmPtr)); - if (munmap(shmPtr, shmSize) != 0) { - WARN("munmap of shared memory failed"); - return ncclSystemError; + if (shmPtr) { + if (devShmPtr) CUDACHECK(cudaHostUnregister(shmPtr)); + if (munmap(shmPtr, shmSize) != 0) { + WARN("munmap of shared memory failed"); + return ncclSystemError; + } } return ncclSuccess; } diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 16049fa..7161aee 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -332,10 +332,10 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) { #endif } - if (sock->asyncFlag) { - EQCHECK(flags = fcntl(fd, F_GETFL), -1); - SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); - } + /* The socket is set non-blocking for OS level, but asyncFlag is used to control + * blocking and non-blocking behavior in user level. */ + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); // addr port should be 0 (Any port) SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind"); @@ -411,11 +411,10 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { const int one = 1; SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); - /* support non-blocking socket; by default, the socket is non-blocking */ - if (sock->asyncFlag) { - EQCHECK(flags = fcntl(fd, F_GETFL), -1); - SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); - } + /* The socket is set non-blocking for OS level, but asyncFlag is used to control + * blocking and non-blocking behavior in user level. */ + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); /* const int bufsize = 128*1024; SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt"); @@ -430,17 +429,26 @@ retry: /* blocking/non-blocking connect() is determined by asyncFlag. */ ret = connect(fd, &sock->addr.sa, salen); - if (!sock->asyncFlag && (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || - (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES))) { - if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); - usleep(SLEEP_INT); - goto retry; - } + if (!sock->asyncFlag) { + /* blocking socket, need retry if connect fails. */ + if (errno == EINPROGRESS || errno == EAGAIN || errno == EALREADY || + (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || + (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { + /* check abortFlag as long as we have chance to retry. */ + if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError; + if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); + usleep(SLEEP_INT); + goto retry; + } - /* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again. - * However, it can return EISCONN instead of success which indicates connection is built up in - * background already. No need to call connect() again. 
*/ - if (ret == 0 || ((errno == EINPROGRESS || errno == ECONNREFUSED) && sock->asyncFlag) || errno == EISCONN) { + /* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again. + * However, it can return EISCONN instead of success which indicates connection is built up in + * background already. No need to call connect() again. */ + if (ret == 0 || errno == EISCONN) { + sock->fd = fd; + return ncclSuccess; + } + } else { sock->fd = fd; return ncclSuccess; } @@ -451,17 +459,26 @@ retry: ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) { socklen_t socklen = sizeof(union ncclSocketAddress); + struct pollfd pollfd; int tmpFd = sock->fd = -1; + int pollret; - do { - if (listenSocket->abortFlag) NEQCHECK(*listenSocket->abortFlag, 0); + pollfd.fd = listenSocket->fd; + pollfd.events = POLLIN; +retry: + if ((pollret = poll(&pollfd, 1, listenSocket->asyncFlag ? 0 : 100)) < 0) { + return ncclSystemError; + } else { tmpFd = accept(listenSocket->fd, &sock->addr.sa, &socklen); - } while ((errno == EAGAIN || errno == EWOULDBLOCK) && tmpFd == -1 && !listenSocket->asyncFlag); + } if (!listenSocket->asyncFlag) { + /* blocking socket, if tmpFd is still -1, we need to retry */ + if (tmpFd == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + if (listenSocket->abortFlag && *listenSocket->abortFlag != 0) return ncclInternalError; + goto retry; + } EQCHECK(tmpFd, -1); - } else if (tmpFd == -1 && errno != EAGAIN && errno != EWOULDBLOCK) { - return ncclSystemError; } sock->fd = tmpFd; @@ -491,7 +508,7 @@ static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* char line[SOCKET_NAME_MAXLEN+1]; do { if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); - if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); + if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL); if (op == NCCL_SOCKET_RECV && bytes == 0) { *closed = 1; return ncclSuccess; @@ -507,7 +524,7 @@ static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* (*offset) += bytes; if (sock->abortFlag && *sock->abortFlag != 0) { INFO(NCCL_NET, "Socket progress: abort called"); - return ncclSystemError; + return ncclInternalError; } } while (bytes > 0 && (*offset) < size); return ncclSuccess; diff --git a/src/nccl.h.in b/src/nccl.h.in index edd98a3..ccb8f57 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -39,7 +39,28 @@ typedef enum { ncclSuccess = 0, ncclInvalidArgument = 4, ncclInvalidUsage = 5, ncclRemoteError = 6, - ncclNumResults = 7 } ncclResult_t; + ncclInProgress = 7, + ncclNumResults = 8 } ncclResult_t; + +/* Communicator configuration. Users can assign value to attributes to specify the + * behavior of a communicator. */ +typedef struct ncclConfig_v21400 { + /* attributes that users should never touch. */ + size_t size; + unsigned int magic; + unsigned int version; + /* attributes that users are able to customize. */ + int blocking; +} ncclConfig_t; + +/* Config initializer must be assigned to initialize config structure when it is created. + * Not initialized config will result in NCCL error. 
*/ +#define NCCL_CONFIG_INITIALIZER { \ + sizeof(ncclConfig_t), /* size */ \ + 0xcafebeef, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + 1 /* blocking */ \ +} /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. * This integer is coded with the MAJOR, MINOR and PATCH level of the @@ -54,6 +75,11 @@ ncclResult_t pncclGetVersion(int *version); ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); +/* Create a new communicator (multi thread/process version) with a configuration + * set by users. */ +ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); +ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); + /* Creates a new communicator (multi thread/process version). * rank must be between 0 and nranks-1 and unique within a communicator clique. * Each rank is associated to a CUDA device, which has to be set before calling @@ -72,8 +98,15 @@ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); -/* Frees resources associated with communicator object, but waits for any operations - * that might still be running on the device. */ +/* Finalize a communicator. ncclCommFinalize flushes all issued communications, + * and marks communicator state as ncclInProgress. The state will change to ncclSuccess + * when the communicator is globally quiescent and related resources are freed; then, + * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator + * itself) without blocking. */ +ncclResult_t ncclCommFinalize(ncclComm_t comm); +ncclResult_t pncclCommFinalize(ncclComm_t comm); + +/* Frees local resources associated with communicator object. 
*/ ncclResult_t ncclCommDestroy(ncclComm_t comm); ncclResult_t pncclCommDestroy(ncclComm_t comm); diff --git a/src/net.cc b/src/net.cc index 53ec80e..1480c76 100644 --- a/src/net.cc +++ b/src/net.cc @@ -324,12 +324,8 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { ncclResult_t ret; ncclDebugNoWarn = NCCL_NET; NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1); - while (sComm == NULL) { - NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2); - } - while (rComm == NULL) { - NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3); - } + NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2); + NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3); CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4); if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle)); diff --git a/src/proxy.cc b/src/proxy.cc index 5021bc8..1a2f361 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -406,17 +406,17 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0, justInquire)); } } break; - case ncclPatternCollTreeUpDown: { - // CollTree up - NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1, justInquire)); // For CollTree up, we are using push - // CollTree down - NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0, justInquire)); + case ncclPatternCollnetChain: { + NCCLCHECK(SaveProxy(channel, proxySend, channel->collnetChain.up, op, 1, justInquire)); + NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collnetChain.up, op, 0, justInquire)); + } break; + case ncclPatternCollnetDirect: { + NCCLCHECK(SaveProxy(channel, proxySend, channel->collnetDirect.out, op, 1, justInquire)); + NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collnetDirect.out, op, 0, justInquire)); } break; case ncclPatternSend: case ncclPatternRecv: { if (op->root == comm->rank) return ncclSuccess; - op->nsteps = DIVUP(op->nbytes, op->chunkSize); - if (op->nsteps == 0) op->nsteps = 1; NCCLCHECK(SaveProxy(channel, op->pattern == ncclPatternSend ? 
proxySend : proxyRecv, op->root, op, 1, justInquire)); } break; } @@ -432,16 +432,17 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) op->channelId = channelId; op->sliceSteps = 1; op->chunkSteps = 1; - op->protocol = NCCL_PROTO_SIMPLE; op->dtype = info->datatype; + op->protocol = info->protocol; - int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; - if (info->comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR; + int stepSize = info->comm->buffSizes[op->protocol]/NCCL_STEPS; + + // If nNodes > 1 and we're using Simple, reduce the stepSize to increase shared buffer utilization + if (info->comm->nNodes > 1 && op->protocol == NCCL_PROTO_SIMPLE) stepSize = info->comm->p2pNetChunkSize; info->chunkSize = stepSize; op->root = info->root; - op->nbytes = info->count; - struct ncclChannelPeer* peer = channel->peers + op->root; + struct ncclChannelPeer* peer = channel->peers + op->root; if (info->coll == ncclFuncSend) { op->pattern = ncclPatternSend; if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) { @@ -464,6 +465,17 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) info->chunkSize = ncclParamChunkSize(); } op->chunkSize = info->chunkSize; + + // Compute nSteps for proxies + int chunkEffectiveSize = op->chunkSize; + if (op->protocol == NCCL_PROTO_LL) { + chunkEffectiveSize /= 2; + } + + op->nbytes = stepSize; + op->nsteps = DIVUP(info->count, chunkEffectiveSize); + if (op->nsteps == 0) op->nsteps = 1; + return ncclSuccess; } @@ -616,7 +628,7 @@ ncclResult_t ncclSetThreadContext(struct ncclComm* comm) { if (createThreadContext == -1) { createThreadContext = ncclParamCreateThreadContext(); if (createThreadContext) { - if (CUPFN(cuCtxCreate_v3020) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) { + if (CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) { WARN("Unable to create thread context due to old driver, disabling."); createThreadContext = 0; } @@ -624,7 +636,7 @@ ncclResult_t ncclSetThreadContext(struct ncclComm* comm) { } if (createThreadContext) { if (comm->proxyState.cudaCtx == NULL) { - if (CUPFN(cuCtxCreate_v3020(&comm->proxyState.cudaCtx, + if (CUPFN(cuCtxCreate(&comm->proxyState.cudaCtx, CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, comm->cudaDev)) != CUDA_SUCCESS) { WARN("Failed to create CUDA context on device %d", comm->cudaDev); createThreadContext = 0; @@ -641,6 +653,9 @@ ncclResult_t ncclSetThreadContext(struct ncclComm* comm) { return ncclSuccess; } +// Set to SIGUSR1 or SIGUSR2 to help debug proxy state during hangs +NCCL_PARAM(ProxyDumpSignal, "PROXY_DUMP_SIGNAL", -1); + void* ncclProxyProgress(void *comm_) { struct ncclComm* comm = (struct ncclComm*)comm_; if (ncclSetThreadContext(comm) != ncclSuccess) { @@ -652,7 +667,8 @@ void* ncclProxyProgress(void *comm_) { struct ncclProxyProgressState* state = &comm->proxyState.progressState; state->nextOps = -1; - signal(SIGUSR1, ncclDumpProxyState); + const int sig = ncclParamProxyDumpSignal(); + if (sig != -1) signal(sig, ncclDumpProxyState); ncclLastProxyState = state; char threadName[NCCL_THREAD_NAMELEN]; snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", comm->cudaDev); @@ -664,7 +680,7 @@ void* ncclProxyProgress(void *comm_) { int idle = 1; ncclResult_t ret = progressOps(comm, state, state->active, &idle); if (ret != ncclSuccess) { - comm->fatalError = ret; + (void) ncclCommSetAsyncError(comm, ret); INFO(NCCL_ALL,"%s:%d -> %d 
[Proxy Thread]", __FILE__, __LINE__, ret); return NULL; } @@ -676,7 +692,7 @@ void* ncclProxyProgress(void *comm_) { ret = ncclProxyGetPostedOps(comm, &added); if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); } if (ret != ncclSuccess) { - comm->fatalError = ret; + (void) ncclCommSetAsyncError(comm, ret); INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); } if (added == 0) { @@ -782,9 +798,13 @@ static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool, static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { if (connection->send) { - NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm)); + if (ncclTransports[connection->transport]->send.proxyFree) { + NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm)); + } } else { - NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm)); + if (ncclTransports[connection->transport]->recv.proxyFree) { + NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm)); + } } return ncclSuccess; } @@ -793,7 +813,10 @@ static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* poo for (int b=0; bbanks; b++) { int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE; for (int i=0; ipools[b]+i, comm)); + ncclProxyConnection *connection = pool->pools[b]+i; + if (connection->initFlag == true) { + NCCLCHECK(proxyFree(connection, comm)); + } } free(pool->pools[b]); } @@ -812,8 +835,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks)); NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks)); for (int r=0; rlocalRanks; r++) { - comm->proxyState.peerSocks[r].fd = -1; - comm->proxyState.peerSocks[r].abortFlag = comm->abortFlag; + NCCLCHECK(ncclSocketInit(&comm->proxyState.peerSocks[r], NULL, comm->abortFlag, 0)); } } NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank)); @@ -943,6 +965,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1)); } INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport); + __atomic_store_n(&connection->initFlag, true, __ATOMIC_RELEASE); return ncclSuccess; } @@ -1019,7 +1042,8 @@ void* ncclProxyService(void* _args) { struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS]; memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS); for (int s=0; sabortFlag, 0); + pollfds[s].fd = -1; pollfds[s].events = POLLHUP|POLLIN; } pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd; @@ -1029,8 +1053,9 @@ void* ncclProxyService(void* _args) { int npeers = 0; int stop = 0; int asyncOpCount = 0; - while (stop == 0 || (stop == 1 && npeers > 0)) { - if (int error = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : -1) < 0) { + while ((stop == 0 || (stop == 1 && npeers > 0)) && *comm->abortFlag == 0) { + /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */ + if (int error = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 
0 : 500) < 0) { WARN("[Proxy Service] Poll failed with error %d", error); return NULL; } @@ -1071,10 +1096,7 @@ void* ncclProxyService(void* _args) { INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank); closeConn = 1; } else { - if (type == ncclProxyMsgAbort) { - stop = 2; - closeConn = 1; - } else if (type == ncclProxyMsgStop) { + if (type == ncclProxyMsgStop) { stop = 1; closeConn = 1; } else if (type == ncclProxyMsgClose) { @@ -1104,6 +1126,10 @@ void* ncclProxyService(void* _args) { } } } + /* wait until main thread flush all NCCL operations. */ + while (*comm->abortFlag != 0 && __atomic_load_n(&comm->proxyState.safeAbortFlag, __ATOMIC_ACQUIRE) == 0) + usleep(1000); + // Wait for all operations to complete and stop progress thread before freeing any resource if (ncclProxyProgressDestroy(comm) != ncclSuccess) { WARN("[Proxy Service] proxyDestroy failed"); @@ -1133,15 +1159,23 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) { ncclResult_t ncclProxyDestroy(struct ncclComm* comm) { struct ncclProxyState* state = &comm->proxyState; + + if (state == NULL) return ncclSuccess; if (state->peerAddresses) { - struct ncclSocket sock; - sock.abortFlag = NULL; - sock.asyncFlag = 0; - memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress)); - NCCLCHECK(ncclSocketConnect(&sock)); - int type = (*comm->abortFlag) ? ncclProxyMsgAbort : ncclProxyMsgStop; - NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int))); - close(sock.fd); + if (*comm->abortFlag == 0) { + struct ncclSocket sock; + sock.abortFlag = NULL; + sock.asyncFlag = 0; + memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketConnect(&sock)); + int type = ncclProxyMsgStop; + NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int))); + close(sock.fd); + } else { + /* when abortFlag is set, all socket related communications are no longer reliable. We need to + * set a flag to let proxy thread exit. */ + __atomic_store_n(&state->safeAbortFlag, 1, __ATOMIC_RELEASE); + } free(state->peerAddresses); } if (state->peerSocks) { diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 0404aa8..432511c 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -100,6 +100,7 @@ struct sendResources { int nranks; int netDev; int useGdr; + int useDmaBuf; uint64_t* gdcSync; void* gdrDesc; void* sendMhandles[NCCL_NUM_PROTOCOLS]; @@ -119,6 +120,7 @@ struct recvResources { int nranks; int netDev; int useGdr; + int useDmaBuf; uint64_t* gdcSync; uint64_t* gdcFlush; void* gdrDesc; @@ -154,7 +156,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn)); NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(comm), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? 
"/GDRDMA" : ""); return ncclSuccess; } @@ -172,7 +174,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(comm), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? "/GDRDMA" : ""); return ncclSuccess; } @@ -281,6 +283,10 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->netDev = req->netDev; resources->useGdr = req->useGdr; + ncclNetProperties_t props; + NCCLCHECK(collNetGetProperties(comm, req->netDev, &props)); + /* DMA-BUF support */ + resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); return ncclSuccess; } @@ -386,6 +392,10 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc resources->netDev = req->netDev; resources->useGdr = req->useGdr; + ncclNetProperties_t props; + NCCLCHECK(collNetGetProperties(comm, req->netDev, &props)); + /* DMA-BUF support */ + resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); collNetHandle_t* netHandle = (collNetHandle_t*) respBuff; if (respSize != sizeof(collNetHandle_t)) return ncclInternalError; @@ -449,7 +459,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str #if CUDA_VERSION >= 11070 /* DMA-BUF support */ - if (resources->useGdr && comm->dmaBufSupport) { + if (resources->useGdr && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, @@ -518,7 +528,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str #if CUDA_VERSION >= 11070 /* DMA-BUF support */ - if (resources->useGdr && comm->dmaBufSupport) { + if (resources->useGdr && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, @@ -545,35 +555,41 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { struct sendResources* resources = (struct sendResources*)(connection->transportResources); - for (int p=0; psendMhandles[p]) { - NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p])); + + if (resources) { + for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) { + if (resources->sendMhandles[p]) { + NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p])); + } } + struct connectMapMem* mems = resources->map.mems; + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); + CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); + NCCLCHECK(sharedBuffersDestroy(comm)); + NCCLCHECK(sharedFree(comm, 
resources->netDev)); + free(connection->transportResources); } - struct connectMapMem* mems = resources->map.mems; - NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); - CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); - if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); - NCCLCHECK(sharedBuffersDestroy(comm)); - NCCLCHECK(sharedFree(comm, resources->netDev)); - free(connection->transportResources); return ncclSuccess; } static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { struct recvResources* resources = (struct recvResources*)(connection->transportResources); - for (int p=0; pmhandles[p]) { - NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p])); + + if (resources) { + for (int p=0; pmhandles[p]) { + NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p])); + } } + struct connectMapMem* mems = resources->map.mems; + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); + CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); + NCCLCHECK(sharedBuffersDestroy(comm)); + NCCLCHECK(sharedFree(comm, resources->netDev)); + free(connection->transportResources); } - struct connectMapMem* mems = resources->map.mems; - NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); - CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); - if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); - NCCLCHECK(sharedBuffersDestroy(comm)); - NCCLCHECK(sharedFree(comm, resources->netDev)); - free(connection->transportResources); return ncclSuccess; } @@ -582,10 +598,6 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct (s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1) static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { - if (args->protocol != NCCL_PROTO_SIMPLE) { - WARN("CollNet does not support LL/LL128"); - return ncclInternalError; - } if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; @@ -599,7 +611,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg } args->idle = 1; if (args->state == ncclProxyOpProgress) { - int p = args->protocol; + int p = NCCL_PROTO_SIMPLE; int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS); int perGroupSteps = NCCL_STEPS / nGroups; for (int s=0; snsubs; s++) { @@ -695,10 +707,6 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg } static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { - if (args->protocol != NCCL_PROTO_SIMPLE) { - WARN("CollNet does not support LL/LL128"); - return ncclInternalError; - } if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; @@ -712,7 +720,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg } args->idle = 1; if (args->state == ncclProxyOpProgress) { - int p = args->protocol; + int p = NCCL_PROTO_SIMPLE; int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS); int perGroupSteps = NCCL_STEPS / nGroups; for (int s=0; snsubs; s++) { @@ -746,7 +754,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg TRACE(NCCL_NET, "recvProxy [%d/%d/%d] received, size %d", sub->received, group, buffSlot, 
totalSize); sub->received += args->sliceSteps; sub->requests[buffSlot] = NULL; - if (reqFifo[group][buffSlot].size > 0 && resources->useGdr) { + if (1 && reqFifo[group][buffSlot].size > 0 && resources->useGdr) { // GDRCOPY support if (resources->gdcFlush) { #if defined (__x86_64__) diff --git a/src/transport/net.cc b/src/transport/net.cc index be3afc4..12390c0 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -90,6 +90,7 @@ struct sendResources { int remoteRank; int netDev; int useGdr; + int useDmaBuf; int maxRecvs; uint64_t* gdcSync; void* gdrDesc; @@ -116,6 +117,7 @@ struct recvResources { int proxyRank; int netDev; int useGdr; + int useDmaBuf; int maxRecvs; uint64_t* gdcSync; uint64_t* gdcFlush; @@ -130,19 +132,13 @@ struct recvResources { uint64_t llLastCleaning; }; -NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0); - /* Determine if two peers can communicate with NET */ static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { - // Same host? - if (info1->hostHash == info2->hostHash) { - // User disabled NET for intra-node? - if (ncclParamNetDisableIntra() == 1) { - *ret = 0; - return ncclSuccess; - } - } *ret = 1; + if (info1->hostHash == info2->hostHash) { + // If on the same host, check intra-node net is not disabled. + NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, ret)); + } return ncclSuccess; } @@ -336,12 +332,15 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne static ncclResult_t sendFree(struct ncclConnector* send) { struct connectMap* map = (struct connectMap*)(send->transportResources); - if (map->sameProcess == 0) { - NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); - if (map->mems[NCCL_NET_MAP_DEVMEM].size) { - CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + if (map) { + if (map->sameProcess == 0) { + NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); + if (map->mems[NCCL_NET_MAP_DEVMEM].size) { + CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + } } } + return ncclSuccess; } @@ -368,7 +367,7 @@ static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int local struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv; state->refcount++; if (state->size == 0) { - state->size = nChannels*(NCCL_SHARED_STEPS/NCCL_STEPS)*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR; + state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pNetChunkSize; } if (size) *size = state->size; @@ -394,9 +393,8 @@ static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int local static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) { // Use different pools for different channels and also separate send/recv. 
- int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR); int globalSlot = (channel*NCCL_SHARED_STEPS)+slot; - *offset = slotSize * globalSlot; + *offset = comm->p2pNetChunkSize * globalSlot; return ncclSuccess; } @@ -448,6 +446,8 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->connIndex = req->connIndex; ncclNetProperties_t props; NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props)); + /* DMA-BUF support */ + resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; // We don't return any data @@ -474,6 +474,8 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc resources->connIndex = req->connIndex; ncclNetProperties_t props; NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props)); + /* DMA-BUF support */ + resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; @@ -541,6 +543,12 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str comm, resources->useGdr, resources->localRank, 0, map->sameProcess, comm->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipc)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; + + if (comm->allocP2pNetLLBuffers) { + NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); + resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL]; + } + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } @@ -589,7 +597,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str #if CUDA_VERSION >= 11070 /* DMA-BUF support */ int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; - if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) { + if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); @@ -676,6 +684,11 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); + if (comm->allocP2pNetLLBuffers) { + NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); + resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL]; + } + if (map->mems[NCCL_NET_MAP_DEVMEM].size) { if (resources->shared == 0) { NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size)); @@ -706,7 +719,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str #if CUDA_VERSION >= 11070 /* DMA-BUF support */ int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST; - if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) { + if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); @@ -846,7 +859,8 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) { // We have something to receive, let's check if it's completely ready. int size = sizesFifo[buffSlot]; - char* buff = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; + bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared; + char* buff = shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; int ready = 1; if (p == NCCL_PROTO_LL128) { ready = resources->useGdr; @@ -974,7 +988,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; - if (resources->shared) { + if (p == NCCL_PROTO_SIMPLE && resources->shared) { int sharedBuffSlot = sub->posted%maxDepth; int offset; NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset)); diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index d4bb8cf..a1f8897 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -25,7 +25,6 @@ #include "ibvwrap.h" -#define USE_RDMA_WRITE 1 #define MAXNAMESIZE 64 static char ncclIbIfName[MAX_IF_NAME_SIZE+1]; static union ncclSocketAddress ncclIbIfAddr; @@ -73,7 +72,7 @@ pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0); -NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14); +NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); NCCL_PARAM(IbPkey, "IB_PKEY", 0); NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0); @@ -120,7 +119,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) return ncclSuccess; } -static int ibvWidths[] = { 1, 4, 8, 12 }; +static int ibvWidths[] = { 1, 4, 8, 12, 2 }; static int ibvSpeeds[] = { 2500, 5000, 10000, 10000, 14000, 25000, 50000 }; static int firstBitSet(int val, int max) { int i = 0; @@ -288,8 +287,8 @@ ncclResult_t ncclIbDmaBufSupport(int dev) { NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure); // Test kernel DMA-BUF support with a dummy call (fd=-1) (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/); - // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP if not supported (EBADF otherwise) - dmaBufSupported = (errno != EOPNOTSUPP) ? 1 : 0; + // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise) + dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 
1 : 0; NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure); } if (dmaBufSupported == 0) return ncclSystemError; @@ -684,7 +683,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm))); stage->comm = rComm; stage->state = ncclIbCommStateAccept; - lComm->sock.asyncFlag = 1; + NCCLCHECK(ncclSocketInit(&rComm->sock, NULL, lComm->sock.abortFlag, 1)); ib_accept: NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock)); diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index a0d80d3..678aab8 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -317,7 +317,6 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) { NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads)); handle->nSocks = comm->nSocks; handle->nThreads = comm->nThreads; - comm->sock.asyncFlag = 1; comm->dev = dev; *listenComm = comm; return ncclSuccess; @@ -394,7 +393,7 @@ ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) { for (; inSocks+1; i++) { uint8_t sendSockIdx; ncclCalloc(&sock, 1); - NCCLCHECK(ncclSocketInit(sock, NULL, NULL, 1)); + NCCLCHECK(ncclSocketInit(sock, NULL, lComm->sock.abortFlag, 1)); stage->sock = sock; stage->state = ncclSocketCommStateAccept; stage->iteration = i; diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 414f05d..b0bad4a 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -107,6 +107,14 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop return ncclSuccess; } + // Check if NET would work better + int useNet = 0; + NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet)); + if (useNet) { + *ret = 0; + return ncclSuccess; + } + // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) int cudaDev1 = busIdToCudaDev(info1->busId); int cudaDev2 = busIdToCudaDev(info2->busId); @@ -230,17 +238,17 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st info->rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) { if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; - INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", - channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); + INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", + channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); } else { send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; - INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s", - channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : ""); + INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s", + channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? 
"/CE" : ""); } } else { info->rank = intermediateRank; - INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s", - channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank, + INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s", + channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank, comm->peerInfo[intermediateRank].busId, useReadStr); } @@ -374,20 +382,24 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn ncclResult_t p2pSendFree(struct ncclConnector* send) { struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources; - if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); - if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); - free(resources); + if (resources) { + if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); + if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); + free(resources); + } return ncclSuccess; } ncclResult_t p2pRecvFree(struct ncclConnector* recv) { struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; - if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); - if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); - if (useMemcpy) { - NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize)); + if (resources) { + if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); + if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); + if (useMemcpy) { + NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize)); + } + free(resources); } - free(resources); return ncclSuccess; } @@ -464,14 +476,16 @@ static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { if (useMemcpy) { struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources; - NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize)); - NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem)); - CUDACHECK(cudaFree(proxyInfo->ceDevBuff)); - CUDACHECK(cudaStreamDestroy(proxyInfo->stream)); - for (int i=0; ievents[i])); + if (proxyInfo) { + NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize)); + NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem)); + CUDACHECK(cudaFree(proxyInfo->ceDevBuff)); + CUDACHECK(cudaStreamDestroy(proxyInfo->stream)); + for (int i=0; ievents[i])); + } + free(proxyInfo); } - free(proxyInfo); } else { // Do not check return code as CUDA may have already shut down cudaFree(connection->transportResources); diff --git a/src/transport/shm.cc b/src/transport/shm.cc index 4a6120a..740bd2a 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -49,6 +49,10 @@ static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct if (ncclParamShmDisable() == 1) return ncclSuccess; + int useNet = 0; + NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet)); + if (useNet) return ncclSuccess; + // Same host? 
TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash); if (info1->hostHash != info2->hostHash) return ncclSuccess; @@ -191,17 +195,21 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co static ncclResult_t shmSendFree(struct ncclConnector* send) { struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources; - NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); - NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); - free(resources); + if (resources) { + NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); + NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); + free(resources); + } return ncclSuccess; } static ncclResult_t shmRecvFree(struct ncclConnector* recv) { struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; - NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); - NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); - free(resources); + if (resources) { + NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); + NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); + free(resources); + } return ncclSuccess; } @@ -243,25 +251,31 @@ static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources; - CUDACHECK(cudaStreamDestroy(resources->stream)); - CUDACHECK(cudaFree(resources->devFifo)); - NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); - for (int i=0; ievents[i])); + + if (resources) { + CUDACHECK(cudaStreamDestroy(resources->stream)); + CUDACHECK(cudaFree(resources->devFifo)); + NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); + for (int i=0; ievents[i])); + } + free(connection->transportResources); } - free(connection->transportResources); return ncclSuccess; } static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources; - CUDACHECK(cudaStreamDestroy(resources->stream)); - CUDACHECK(cudaFree(resources->devFifo)); - NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); - for (int i=0; ievents[i])); + + if (resources) { + CUDACHECK(cudaStreamDestroy(resources->stream)); + CUDACHECK(cudaFree(resources->devFifo)); + NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); + for (int i=0; ievents[i])); + } + free(connection->transportResources); } - free(connection->transportResources); return ncclSuccess; }
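---

A few notes and sketches on the patterns this patch introduces; all code below is illustrative, and any name not taken from the diff above is an assumption.

The cudawrap/gdrwrap/ibvwrap hunks replace three copies of a hand-rolled state machine (an Uninitialized/Initializing/Initialized/Error enum advanced with __sync_bool_compare_and_swap plus a sched_yield() spin) with pthread_once. The shape of the new pattern, using a hypothetical libfoo.so in place of the real driver libraries:

#include <dlfcn.h>
#include <pthread.h>
#include <stddef.h>

static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
static int initResult;   /* stands in for ncclResult_t */
static void* libHandle;

static void initOnceFunc(void) {
  /* Runs exactly once even under concurrent callers; everyone else
   * blocks inside pthread_once() until this returns. */
  libHandle = dlopen("libfoo.so", RTLD_NOW);   /* hypothetical library */
  initResult = (libHandle != NULL) ? 0 : 1;    /* 0 ~ ncclSuccess, 1 ~ ncclSystemError */
}

int libraryInit(void) {
  pthread_once(&initOnceControl, initOnceFunc);
  return initResult;   /* pthread_once provides the happens-before needed to read this */
}

Besides being shorter, pthread_once avoids the busy-wait and gives well-defined memory ordering, which the old spin on a plain enum did not guarantee.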
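socket.cc now sets O_NONBLOCK on every socket at the OS level and keeps asyncFlag purely as a user-level policy: "blocking" sockets are emulated with retry loops that can observe abortFlag instead of parking in the kernel. A rough sketch of the connect side, omitting the bounded ECONNREFUSED/ETIMEDOUT retry counters the real code keeps:

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/socket.h>

/* Sketch only: addr/salen come from the caller; abortFlag may be NULL. */
static int connectWithAbort(int fd, const struct sockaddr* addr, socklen_t salen,
                            volatile unsigned int* abortFlag) {
  int flags = fcntl(fd, F_GETFL);
  if (flags == -1 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) return -1;
  for (;;) {
    if (connect(fd, addr, salen) == 0 || errno == EISCONN) return 0;
    if (errno != EINPROGRESS && errno != EAGAIN && errno != EALREADY &&
        errno != ECONNREFUSED && errno != ETIMEDOUT) return -1;
    if (abortFlag && *abortFlag) return -1;   /* abort while we still can retry */
    usleep(100);
  }
}

The accept side applies the same idea through poll(): a 100 ms timeout in blocking mode so abortFlag is rechecked periodically, 0 ms in async mode. Sends now always pass MSG_NOSIGNAL, so a peer closing mid-transfer surfaces as an EPIPE error instead of a process-killing SIGPIPE.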
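ncclCommInitRankConfig plus the new ncclConfig_t is what exposes non-blocking initialization. A minimal sketch, assuming the CUDA device is already selected and the unique id has been broadcast out-of-band; the polling loop uses ncclCommGetAsyncError, which NCCL already provides:

#include <nccl.h>

ncclResult_t initNonBlocking(ncclComm_t* comm, int nranks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;   /* the initializer is mandatory */
  config.blocking = 0;                             /* opt in to non-blocking init */

  ncclResult_t res = ncclCommInitRankConfig(comm, nranks, id, rank, &config);
  if (res != ncclSuccess && res != ncclInProgress) return res;

  /* Initialization continues in the background; poll until it completes. */
  ncclResult_t state = ncclInProgress;
  while (state == ncclInProgress) {
    res = ncclCommGetAsyncError(*comm, &state);
    if (res != ncclSuccess) return res;
  }
  return state;
}

A config that was not assigned NCCL_CONFIG_INITIALIZER is rejected, per the header comment above: the size/magic/version fields are how the library validates it.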
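ncclCommFinalize changes the teardown contract: it flushes issued communication and leaves the communicator in ncclInProgress until it is globally quiescent, after which ncclCommDestroy frees only local resources without blocking. A sketch of the intended sequence (for a blocking communicator the polling loop may simply never observe ncclInProgress):

#include <nccl.h>

ncclResult_t finalizeAndDestroy(ncclComm_t comm) {
  ncclResult_t res = ncclCommFinalize(comm);   /* start flushing issued work */
  if (res != ncclSuccess && res != ncclInProgress) return res;

  ncclResult_t state = ncclInProgress;         /* wait for global quiescence */
  while (state == ncclInProgress) {
    res = ncclCommGetAsyncError(comm, &state);
    if (res != ncclSuccess) return res;
  }
  return ncclCommDestroy(comm);                /* purely local, non-blocking free */
}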
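In ncclProxyComputeP2p, the proxy now derives nsteps from the protocol's effective chunk size instead of hard-coding the Simple protocol: under LL, only half of each line carries payload (the other half is flags), so the effective chunk is halved. With made-up numbers:

#define DIVUP(x, y) (((x) + (y) - 1) / (y))

/* Illustration of the nsteps math for a 1 MiB LL send; the chunk size is
 * an assumption, not NCCL's actual tuning. */
static int p2pSteps(void) {
  int count = 1 << 20;                     /* 1 MiB message, in bytes */
  int chunkSize = 128 * 1024;              /* assumed op->chunkSize */
  int chunkEffectiveSize = chunkSize / 2;  /* NCCL_PROTO_LL halves it */
  int nsteps = DIVUP(count, chunkEffectiveSize);   /* = 16 */
  return nsteps ? nsteps : 1;              /* a proxy op always takes >= 1 step */
}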
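The proxy progress thread used to install a SIGUSR1 handler unconditionally, which could collide with application handlers. It is now gated by a parameter that, following the usual NCCL_PARAM convention, should surface as the NCCL_PROXY_DUMP_SIGNAL environment variable (default -1, i.e. off). The same opt-in shape in miniature:

#include <signal.h>
#include <stdlib.h>

static void dumpProxyState(int sig) { (void)sig; /* stand-in for ncclDumpProxyState */ }

static void maybeInstallDumpSignal(void) {
  /* Only claim a signal when asked, e.g. NCCL_PROXY_DUMP_SIGNAL=10
   * (SIGUSR1 on x86-64 Linux), then `kill -USR1 <pid>` during a hang. */
  const char* env = getenv("NCCL_PROXY_DUMP_SIGNAL");
  int sig = env ? atoi(env) : -1;
  if (sig != -1) signal(sig, dumpProxyState);
}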
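The shared network-p2p buffer pool is re-based on comm->p2pNetChunkSize: the pool holds nChannels * NCCL_SHARED_STEPS fixed-size slots, and sharedBuffersGet reduces to a flat slot index times the chunk size. In miniature, with an assumed slot count:

/* Slot -> offset math from sharedBuffersGet(); the 16 is an assumption
 * standing in for NCCL_SHARED_STEPS, not a value taken from this diff. */
#define SHARED_STEPS 16

static int slotOffset(int channel, int slot, int p2pNetChunkSize) {
  int globalSlot = channel * SHARED_STEPS + slot;
  return p2pNetChunkSize * globalSlot;
}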
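Finally, the errno = EOPNOTSUPP assignment added to wrap_direct_ibv_reg_dmabuf_mr is what makes the DMA-BUF fallback work on MOFED 5.4 and earlier: ncclIbDmaBufSupport probes kernel support by registering with a deliberately bogus fd and classifying the failure by errno alone, so the wrapper must set errno itself when the symbol was never loaded. Roughly:

#include <errno.h>
#include <infiniband/verbs.h>
#include "ibvwrap.h"   /* wrap_direct_ibv_reg_dmabuf_mr() from this patch */

static int dmaBufSupported(struct ibv_pd* pd) {
  errno = 0;   /* reset for clarity; the probe relies on the call setting errno */
  (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/,
                                       0ULL /*iova*/, -1 /*fd*/, 0 /*access*/);
  /* EOPNOTSUPP / EPROTONOSUPPORT => unsupported; anything else (typically
   * EBADF for the fake fd) means the verb exists and the kernel supports it. */
  return (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT);
}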