From 3fec2fa5eea34917f09e2282afc38863b454eae1 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Tue, 11 May 2021 18:16:30 -0700 Subject: [PATCH] 2.9.9-1 Fix crash when setting NCCL_MAX_P2P_NCHANNELS below nchannels. Fix hang during sendrecv dynamic NVB connection establishment on cubemesh topologies. Add environment variable to only use SHARP on communicators beyond a given number of ranks. Add debug subsystem to trace memory allocations. Fix compilation with TRACE=1. (Issue #505) --- makefiles/version.mk | 2 +- src/debug.cc | 4 +- src/enqueue.cc | 15 +++++-- src/graph/paths.cc | 21 ++++++++- src/graph/xml.cc | 2 +- src/graph/xml.h | 2 +- src/include/alloc.h | 4 ++ src/include/comm.h | 1 + src/include/graph.h | 1 + src/include/nccl_net.h | 4 +- src/include/transport.h | 1 + src/init.cc | 94 +++++++++++++++++++++++++++++++---------- src/transport.cc | 37 ++++++++-------- src/transport/net.cc | 6 +-- 14 files changed, 142 insertions(+), 52 deletions(-) diff --git a/makefiles/version.mk b/makefiles/version.mk index 78c601f..87c2017 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 9 -NCCL_PATCH := 8 +NCCL_PATCH := 9 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/debug.cc b/src/debug.cc index 25bf37a..a47ceaf 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -62,6 +62,8 @@ void ncclDebugInit() { mask = NCCL_TUNING; } else if (strcasecmp(subsys, "ENV") == 0) { mask = NCCL_ENV; + } else if (strcasecmp(subsys, "ALLOC") == 0) { + mask = NCCL_ALLOC; } else if (strcasecmp(subsys, "ALL") == 0) { mask = NCCL_ALL; } diff --git a/src/enqueue.cc b/src/enqueue.cc index 09da21c..00920da 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -133,7 +133,8 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph // Because in cudaGraph mode the launch param needs to be determined // at capture time instead of launch time. if (!usingCudaGraph) { - for (int c=0; cp2pnChannels; c++) { + int nChannels = std::max(comm->nChannels, comm->p2pnChannels); + for (int c=0; cchannels[c].workCount) params->gridDim.x = c+1; } eqInfo->maxChannels = params->gridDim.x; @@ -169,8 +170,8 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph // GDRCOPY support uint64_t first = (channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS; uint64_t nelems = channel->workCount; - TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld last %ld nelems %zi", - channel->workFifo, channel->workFifoGdr, first, last, nelems); + TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld nelems %zi", + channel->workFifo, channel->workFifoGdr, first, nelems); for (int i = 0; i < nelems; i++) { int elem = (first+i) % NCCL_MAX_OPS; @@ -799,6 +800,14 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) { #if CUDART_VERSION >= 11030 cudaStreamCaptureStatus captureStatus; unsigned long long cudaGraphId; + if (comm->driverVersion < 11030) { + CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus)); + if (captureStatus != cudaStreamCaptureStatusNone) { + WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support"); + return ncclInvalidUsage; + } + return ncclSuccess; + } CUDACHECK(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL)); if (captureStatus == cudaStreamCaptureStatusActive) { if (cudaGraphId != comm->lastCudaGraphId) { diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 079f5d8..fae5afa 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -29,6 +29,8 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* return ncclInternalError; } +NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0); + static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) { if (baseNode->paths[baseNode->type] == NULL) { NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count)); @@ -63,7 +65,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT // allow routing through a GPU only as 1 hop if (node != baseNode && node->type == GPU && - (link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue; + (ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue; if ((remPath->width == 0 || remPath->count > path->count) && remPath->width < width) { // Find reverse link @@ -529,3 +531,20 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); return ncclSuccess; } + +ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks) { + int ngpus = system->nodes[GPU].count; + NCCLCHECK(ncclCalloc(ranks, ngpus)); + int nvbGpus = 0; + for (int g=0; gnodes[GPU].nodes+g; + if (gpu->gpu.rank != rank) continue; + for (int p=0; ppaths[GPU][p].type == PATH_NVB) { + (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank; + } + } + } + *nranks = nvbGpus; + return ncclSuccess; +} diff --git a/src/graph/xml.cc b/src/graph/xml.cc index f94d9e8..05a77bf 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -469,7 +469,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class")); } - ncclDebugNoWarn = 1; + ncclDebugNoWarn = NCCL_GRAPH; NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index)); if (index == -1) { if (path == NULL) getPciPath(busId, &path); diff --git a/src/graph/xml.h b/src/graph/xml.h index 9a617af..6f1ecfb 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/alloc.h b/src/include/alloc.h index 08c63e9..e898d37 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -16,6 +16,7 @@ template static ncclResult_t ncclCudaHostCalloc(T** ptr, size_t nelem) { CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped)); memset(*ptr, 0, nelem*sizeof(T)); + INFO(NCCL_ALLOC, "Cuda Host Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr); return ncclSuccess; } @@ -33,6 +34,7 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { } memset(p, 0, nelem*sizeof(T)); *ptr = (T*)p; + INFO(NCCL_ALLOC, "Mem Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr); return ncclSuccess; } @@ -45,6 +47,7 @@ static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) { CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream)); CUDACHECK(cudaStreamSynchronize(stream)); CUDACHECK(cudaStreamDestroy(stream)); + INFO(NCCL_ALLOC, "Cuda Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr); return ncclSuccess; } @@ -65,6 +68,7 @@ static ncclResult_t ncclIbMalloc(void** ptr, size_t size) { if (ret != 0) return ncclSystemError; memset(p, 0, size); *ptr = p; + INFO(NCCL_ALLOC, "Ib Alloc Size %ld pointer %p", size, *ptr); return ncclSuccess; } diff --git a/src/include/comm.h b/src/include/comm.h index 640dcd3..ee8ac46 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -159,6 +159,7 @@ struct ncclComm { struct ncclQueueInfo* enqueueInfo; cudaGraphNode_t lastSetupNode; unsigned long long lastCudaGraphId; + int driverVersion; }; #endif diff --git a/src/include/graph.h b/src/include/graph.h index 892c6d2..1429b3a 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -26,6 +26,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer void ncclTopoFree(struct ncclTopoSystem* system); ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); +ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); // Query topology ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net); diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index 8c016dc..389c1ea 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -19,7 +19,7 @@ #define NCCL_NET_MAX_REQUESTS 8 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/src/include/transport.h b/src/include/transport.h index 33ff432..115bdc5 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -60,4 +60,5 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* enum { collNetRecv=0, collNetSend=1 }; int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); +ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm); #endif diff --git a/src/init.cc b/src/init.cc index 32c09db..474218b 100644 --- a/src/init.cc +++ b/src/init.cc @@ -248,7 +248,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { comm->nRanks = comm->hostDevComm.nRanks = ndev; cudaGetDevice(&comm->cudaDev); NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); - TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x", comm, rank, ndev, comm->cudaDev, comm->busId); + TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId); comm->doneEvent = doneEvent; comm->intDoneEvent = intDoneEvent; @@ -277,6 +277,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { comm->lastSetupNode = NULL; comm->lastCudaGraphId = -1; + CUDACHECK(cudaDriverGetVersion(&comm->driverVersion)); + static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels"); static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels"); NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks)); @@ -295,11 +297,12 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { static ncclResult_t devCommSetup(ncclComm_t comm) { // Duplicate the channels on the device - NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels)); - NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels)); + int nChannels = std::max(comm->nChannels, comm->p2pnChannels); + NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, nChannels)); + NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, nChannels)); // Copy userRanks and peers - for (int r=0; rp2pnChannels; r++) { + for (int r=0; rnChannels; r++) { NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); } @@ -459,6 +462,8 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { NCCL_PARAM(CrossNic, "CROSS_NIC", 2); NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); +NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); +NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1); static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) { // We use 2 AllGathers @@ -579,7 +584,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs)); } - // Determine CollNet support + // Determine local CollNet support before all-gather if (tmpNnodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1; if (intraRanks > 8) { if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node"); @@ -687,6 +692,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm for (int i=0; inChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel)); } + // Determine CollNet support after all-gather now that we know nNodes + int collNetNodeThreshold = ncclParamCollNetNodeThreshold(); + if (comm->nNodes < collNetNodeThreshold) { + if (comm->collNetSupport == 1) + INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold); + comm->collNetSupport = 0; + } + int *rings; NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph)); @@ -727,6 +740,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore); + free(rings); INFO(NCCL_INIT, "Connected all rings"); // Connect Trees @@ -759,27 +773,37 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm else if (ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend) != 1) collNetSetupFail = 1; } + // Verify CollNet setup across ranks after trying the first channel + if (c == 0) { + NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup); + } } + // Verify CollNet setup across ranks after trying all channels + NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup); + TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); + + // Connect intra-node CollNet + for (int c=0; cnChannels; c++) { + struct ncclChannel* channelRecv = comm->channels+c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup); + for (int c=0; cnChannels; c++) { + struct ncclChannel* channelSend = comm->channels+c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup); + INFO(NCCL_INIT, "rank %d Connected CollNet", rank); + +collnet_cleanup: free(heads); - // Verify CollNet setup across ranks - NCCLCHECK(ncclTransportCollNetCheck(comm, collNetSetupFail)); - if (comm->collNetSupport) { - TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); - for (int c=0; cnChannels; c++) { - struct ncclChannel* channelRecv = comm->channels+c; - NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0)); - } - NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 0)); - for (int c=0; cnChannels; c++) { - struct ncclChannel* channelSend = comm->channels+c; - NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1)); - } - NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 1)); - INFO(NCCL_INIT, "rank %d Connected CollNet", rank); + if (ret != ncclSuccess) { + NCCLCHECK(ncclTransportCollNetFree(comm)); + comm->collNetSupport = 0; + ret = ncclSuccess; } } TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); - free(rings); // Compute time models for algorithm and protocol combinations NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph)); @@ -787,6 +811,32 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Compute nChannels per peer for p2p NCCLCHECK(ncclTopoComputeP2pChannels(comm)); + if (ncclParamNvbPreconnect()) { + // Connect p2p when using NVB path + int nvbNpeers; + int* nvbPeers; + NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers)); + for (int r=0; rnRanks + (comm->rank-peer)) % comm->nRanks; + for (int c=0; cp2pnChannelsPerPeer; c++) { + int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; + if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector + comm->connectRecv[peer] |= (1<nRanks - (comm->rank-peer)) % comm->nRanks; + for (int c=0; cp2pnChannelsPerPeer; c++) { + int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; + if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector + comm->connectSend[peer] |= (1<nNodes) NCCLCHECK(ncclProxyCreate(comm)); @@ -916,7 +966,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; - TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId); + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId); // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) { diff --git a/src/transport.cc b/src/transport.cc index ad910aa..90838ea 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -237,23 +237,26 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa free(allGatherFailures); if (collNetSetupFail) { if (rank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead"); - // Free collNet resources - for (int r=0; rnChannels; r++) { - struct ncclChannel* channel = comm->channels+r; - struct ncclPeer* peer = channel->peers+nranks; - for (int b=0; bsend + b; - if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources)); - send->transportResources = NULL; // avoid double free - } - for (int b=0; brecv + b; - if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources)); - recv->transportResources = NULL; // avoid double free - } - } - // Set support to 0 - comm->collNetSupport = 0; + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) { + // Free collNet resources + for (int r=0; rnChannels; r++) { + struct ncclChannel* channel = comm->channels+r; + struct ncclPeer* peer = channel->peers+comm->nRanks; + for (int b=0; bsend + b; + if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources)); + send->transportResources = NULL; // avoid double free + } + for (int b=0; brecv + b; + if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources)); + recv->transportResources = NULL; // avoid double free + } } return ncclSuccess; } diff --git a/src/transport/net.cc b/src/transport/net.cc index 391f7cf..2b548ce 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -371,7 +371,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { // Data is ready, try to send. NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { - TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); + TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); sizesFifo[buffSlot] = -1; // Make sure size is reset to zero before we update the head. __sync_synchronize(); @@ -388,7 +388,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { int buffSlot = (sub->base+sub->done)%NCCL_STEPS; NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL)); if (done) { - TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", sub->done, buffSlot, sub->requests[buffSlot]); + TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); sub->done += args->sliceSteps; if (resources->shared == 0) { @@ -447,7 +447,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { } NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { - TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]); + TRACE(NCCL_NET, "recvProxy [%ld/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]); sub->posted += args->sliceSteps; args->idle = 0; continue;