From 3fec2fa5eea34917f09e2282afc38863b454eae1 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 11 May 2021 18:16:30 -0700
Subject: [PATCH] 2.9.9-1

Fix crash when setting NCCL_MAX_P2P_NCHANNELS below nchannels.
Fix hang during sendrecv dynamic NVB connection establishment on
cubemesh topologies.
Add environment variable to only use SHARP on communicators beyond
a given number of ranks.
Add debug subsystem to trace memory allocations.
Fix compilation with TRACE=1. (Issue #505)
---
 makefiles/version.mk    |  2 +-
 src/debug.cc            |  4 +-
 src/enqueue.cc          | 15 +++++--
 src/graph/paths.cc      | 21 ++++++++-
 src/graph/xml.cc        |  2 +-
 src/graph/xml.h         |  2 +-
 src/include/alloc.h     |  4 ++
 src/include/comm.h      |  1 +
 src/include/graph.h     |  1 +
 src/include/nccl_net.h  |  4 +-
 src/include/transport.h |  1 +
 src/init.cc             | 94 +++++++++++++++++++++++++++++++----------
 src/transport.cc        | 37 ++++++++--------
 src/transport/net.cc    |  6 +--
 14 files changed, 142 insertions(+), 52 deletions(-)
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 78c601f..87c2017 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 9
-NCCL_PATCH   := 8
+NCCL_PATCH   := 9
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/src/debug.cc b/src/debug.cc
index 25bf37a..a47ceaf 100644
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -62,6 +62,8 @@ void ncclDebugInit() {
         mask = NCCL_TUNING;
       } else if (strcasecmp(subsys, "ENV") == 0) {
         mask = NCCL_ENV;
+      } else if (strcasecmp(subsys, "ALLOC") == 0) {
+        mask = NCCL_ALLOC;
       } else if (strcasecmp(subsys, "ALL") == 0) {
         mask = NCCL_ALL;
       }
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 09da21c..00920da 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -133,7 +133,8 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph
   // Because in cudaGraph mode the launch param needs to be determined
   // at capture time instead of launch time.
   if (!usingCudaGraph) {
-    for (int c=0; c<comm->p2pnChannels; c++) {
+    int nChannels = std::max(comm->nChannels, comm->p2pnChannels);
+    for (int c=0; c<nChannels; c++) {
       if (comm->channels[c].workCount) params->gridDim.x = c+1;
     }
     eqInfo->maxChannels = params->gridDim.x;
@@ -169,8 +170,8 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph
       // GDRCOPY support
       uint64_t first = (channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS;
       uint64_t nelems = channel->workCount;
-      TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld last %ld nelems %zi",
-            channel->workFifo, channel->workFifoGdr, first, last, nelems);
+      TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld nelems %zi",
+            channel->workFifo, channel->workFifoGdr, first, nelems);
 
       for (int i = 0; i < nelems; i++) {
         int elem = (first+i) % NCCL_MAX_OPS;
@@ -799,6 +800,14 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) {
 #if CUDART_VERSION >= 11030
   cudaStreamCaptureStatus captureStatus;
   unsigned long long cudaGraphId;
+  if (comm->driverVersion < 11030) {
+    CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
+    if (captureStatus != cudaStreamCaptureStatusNone) {
+      WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
+      return ncclInvalidUsage;
+    }
+    return ncclSuccess;
+  }
   CUDACHECK(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL));
   if (captureStatus == cudaStreamCaptureStatusActive) {
     if (cudaGraphId != comm->lastCudaGraphId) {
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 079f5d8..fae5afa 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -29,6 +29,8 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
   return ncclInternalError;
 }
 
+NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0);
+
 static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
   if (baseNode->paths[baseNode->type] == NULL) {
     NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
@@ -63,7 +65,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
 
         // allow routing through a GPU only as 1 hop
         if (node != baseNode && node->type == GPU &&
-            (link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;
+            (ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;
 
         if ((remPath->width == 0 || remPath->count > path->count) && remPath->width < width) {
           // Find reverse link
@@ -529,3 +531,20 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
   INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
   return ncclSuccess;
 }
+
+ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks) {
+  int ngpus = system->nodes[GPU].count;
+  NCCLCHECK(ncclCalloc(ranks, ngpus));
+  int nvbGpus = 0;
+  for (int g=0; g<ngpus; g++) {
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    if (gpu->gpu.rank != rank) continue;
+    for (int p=0; p<ngpus; p++) {
+      if (gpu->paths[GPU][p].type == PATH_NVB) {
+        (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank;
+      }
+    }
+  }
+  *nranks = nvbGpus;
+  return ncclSuccess;
+}
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index f94d9e8..05a77bf 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -469,7 +469,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
     if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
     NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
   }
-  ncclDebugNoWarn = 1;
+  ncclDebugNoWarn = NCCL_GRAPH;
   NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
   if (index == -1) {
     if (path == NULL) getPciPath(busId, &path);
diff --git a/src/graph/xml.h b/src/graph/xml.h
index 9a617af..6f1ecfb 100644
--- a/src/graph/xml.h
+++ b/src/graph/xml.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/alloc.h b/src/include/alloc.h
index 08c63e9..e898d37 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -16,6 +16,7 @@ template <typename T>
 static ncclResult_t ncclCudaHostCalloc(T** ptr, size_t nelem) {
   CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped));
   memset(*ptr, 0, nelem*sizeof(T));
+  INFO(NCCL_ALLOC, "Cuda Host Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr);
   return ncclSuccess;
 }
 
@@ -33,6 +34,7 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
   }
   memset(p, 0, nelem*sizeof(T));
   *ptr = (T*)p;
+  INFO(NCCL_ALLOC, "Mem Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr);
   return ncclSuccess;
 }
 
@@ -45,6 +47,7 @@ static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
   CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
   CUDACHECK(cudaStreamSynchronize(stream));
   CUDACHECK(cudaStreamDestroy(stream));
+  INFO(NCCL_ALLOC, "Cuda Alloc Size %ld pointer %p", nelem*sizeof(T), *ptr);
   return ncclSuccess;
 }
 
@@ -65,6 +68,7 @@ static ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
   if (ret != 0) return ncclSystemError;
   memset(p, 0, size);
   *ptr = p;
+  INFO(NCCL_ALLOC, "Ib Alloc Size %ld pointer %p", size, *ptr);
   return ncclSuccess;
 }
 
diff --git a/src/include/comm.h b/src/include/comm.h
index 640dcd3..ee8ac46 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -159,6 +159,7 @@ struct ncclComm {
   struct ncclQueueInfo* enqueueInfo;
   cudaGraphNode_t lastSetupNode;
   unsigned long long lastCudaGraphId;
+  int driverVersion;
 };
 
 #endif
diff --git a/src/include/graph.h b/src/include/graph.h
index 892c6d2..1429b3a 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -26,6 +26,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
 void ncclTopoFree(struct ncclTopoSystem* system);
 ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
 ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
+ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
 
 // Query topology
 ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index 8c016dc..389c1ea 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -19,7 +19,7 @@
 #define NCCL_NET_MAX_REQUESTS 8
 
 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys;
 
 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
 
diff --git a/src/include/transport.h b/src/include/transport.h
index 33ff432..115bdc5 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -60,4 +60,5 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
 enum { collNetRecv=0, collNetSend=1 };
 int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
 ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
+ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
 #endif
diff --git a/src/init.cc b/src/init.cc
index 32c09db..474218b 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -248,7 +248,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   comm->nRanks = comm->hostDevComm.nRanks = ndev;
   cudaGetDevice(&comm->cudaDev);
   NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
-  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x", comm, rank, ndev, comm->cudaDev, comm->busId);
+  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId);
 
   comm->doneEvent = doneEvent;
   comm->intDoneEvent = intDoneEvent;
@@ -277,6 +277,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   comm->lastSetupNode = NULL;
   comm->lastCudaGraphId = -1;
 
+  CUDACHECK(cudaDriverGetVersion(&comm->driverVersion));
+
   static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
   static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
   NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
@@ -295,11 +297,12 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 
 static ncclResult_t devCommSetup(ncclComm_t comm) {
   // Duplicate the channels on the device
-  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels));
-  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels));
+  int nChannels = std::max(comm->nChannels, comm->p2pnChannels);
+  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, nChannels));
+  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, nChannels));
 
   // Copy userRanks and peers
-  for (int r=0; r<comm->p2pnChannels; r++) {
+  for (int r=0; r<comm->nChannels; r++) {
     NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
   }
 
@@ -459,6 +462,8 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
 
 NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
 NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
+NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
+NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1);
 
 static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
   // We use 2 AllGathers
@@ -579,7 +584,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
   }
 
-  // Determine CollNet support
+  // Determine local CollNet support before all-gather
   if (tmpNnodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
   if (intraRanks > 8) {
     if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node");
@@ -687,6 +692,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
   }
 
+  // Determine CollNet support after all-gather now that we know nNodes
+  int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
+  if (comm->nNodes < collNetNodeThreshold) {
+    if (comm->collNetSupport == 1)
+      INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
+    comm->collNetSupport = 0;
+  }
+
   int *rings;
   NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
   NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph));
@@ -727,6 +740,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
   }
   NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
+  free(rings);
   INFO(NCCL_INIT, "Connected all rings");
 
   // Connect Trees
@@ -759,27 +773,37 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
         else if (ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend) != 1)
           collNetSetupFail = 1;
       }
+      // Verify CollNet setup across ranks after trying the first channel
+      if (c == 0) {
+        NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
+      }
     }
+    // Verify CollNet setup across ranks after trying all channels
+    NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
+    TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
+
+    // Connect intra-node CollNet
+    for (int c=0; c<comm->nChannels; c++) {
+      struct ncclChannel* channelRecv = comm->channels+c;
+      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
+    }
+    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup);
+    for (int c=0; c<comm->nChannels; c++) {
+      struct ncclChannel* channelSend = comm->channels+c;
+      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
+    }
+    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup);
+    INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
+
+collnet_cleanup:
     free(heads);
-    // Verify CollNet setup across ranks
-    NCCLCHECK(ncclTransportCollNetCheck(comm, collNetSetupFail));
-    if (comm->collNetSupport) {
-      TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
-      for (int c=0; c<comm->nChannels; c++) {
-        struct ncclChannel* channelRecv = comm->channels+c;
-        NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0));
-      }
-      NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 0));
-      for (int c=0; c<comm->nChannels; c++) {
-        struct ncclChannel* channelSend = comm->channels+c;
-        NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1));
-      }
-      NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 1));
-      INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
+    if (ret != ncclSuccess) {
+      NCCLCHECK(ncclTransportCollNetFree(comm));
+      comm->collNetSupport = 0;
+      ret = ncclSuccess;
     }
   }
   TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
-  free(rings);
 
   // Compute time models for algorithm and protocol combinations
   NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
@@ -787,6 +811,32 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   // Compute nChannels per peer for p2p
   NCCLCHECK(ncclTopoComputeP2pChannels(comm));
 
+  if (ncclParamNvbPreconnect()) {
+    // Connect p2p when using NVB path
+    int nvbNpeers;
+    int* nvbPeers;
+    NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
+    for (int r=0; r<nvbNpeers; r++) {
+      int peer = nvbPeers[r];
+      int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
+      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
+        if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector
+          comm->connectRecv[peer] |= (1<<channelId);
+        }
+      }
+      delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
+      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
+        if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector
+          comm->connectSend[peer] |= (1<<channelId);
+        }
+      }
+    }
+    NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 0));
+    free(nvbPeers);
+  }
+
   NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
 
   if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
@@ -916,7 +966,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
   if (comm == NULL)
     return ncclSuccess;
 
-  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
+  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
 
   // Try and prevent a double free of the comm struct (user error)
   if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
diff --git a/src/transport.cc b/src/transport.cc
index ad910aa..90838ea 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -237,23 +237,26 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
   free(allGatherFailures);
   if (collNetSetupFail) {
     if (rank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
-    // Free collNet resources
-    for (int r=0; r<comm->nChannels; r++) {
-      struct ncclChannel* channel = comm->channels+r;
-      struct ncclPeer* peer = channel->peers+nranks;
-      for (int b=0; b<NCCL_MAX_CONNS; b++) {
-        struct ncclConnector* send = peer->send + b;
-        if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
-        send->transportResources = NULL; // avoid double free
-      }
-      for (int b=0; b<NCCL_MAX_CONNS; b++) {
-        struct ncclConnector* recv = peer->recv + b;
-        if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
-        recv->transportResources = NULL; // avoid double free
-      }
-    }
-    // Set support to 0
-    comm->collNetSupport = 0;
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
+  // Free collNet resources
+  for (int r=0; r<comm->nChannels; r++) {
+    struct ncclChannel* channel = comm->channels+r;
+    struct ncclPeer* peer = channel->peers+comm->nRanks;
+    for (int b=0; b<NCCL_MAX_CONNS; b++) {
+      struct ncclConnector* send = peer->send + b;
+      if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
+      send->transportResources = NULL; // avoid double free
+    }
+    for (int b=0; b<NCCL_MAX_CONNS; b++) {
+      struct ncclConnector* recv = peer->recv + b;
+      if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
+      recv->transportResources = NULL; // avoid double free
+    }
   }
   return ncclSuccess;
 }
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 391f7cf..2b548ce 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -371,7 +371,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
             // Data is ready, try to send.
             NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot));
             if (sub->requests[buffSlot] != NULL) {
-              TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
+              TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
               sizesFifo[buffSlot] = -1;
               // Make sure size is reset to zero before we update the head.
               __sync_synchronize();
@@ -388,7 +388,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
         int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
         NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
         if (done) {
-          TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", sub->done, buffSlot, sub->requests[buffSlot]);
+          TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
           sub->done += args->sliceSteps;
 
           if (resources->shared == 0) {
@@ -447,7 +447,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
         }
         NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot));
         if (sub->requests[buffSlot] != NULL) {
-          TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]);
+          TRACE(NCCL_NET, "recvProxy [%ld/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]);
           sub->posted += args->sliceSteps;
           args->idle = 0;
           continue;