From c4e2aa6c792b4e94d0343c72ce20e71285238827 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Thu, 18 Aug 2022 02:53:17 -0700
Subject: [PATCH] 2.14.3-1

Add support for improved fault tolerance: non-blocking mode, new init
function with config, and ncclCommFinalize function.
Reintroduce collnet+chain algorithm, alongside collnet+direct.
Add LL protocol for intra-node P2P (on by default) and network
communication (off by default).
Use network instead of shared memory when performance is better.
Fix: wait for CUDA graph destroy before destroying comm with linked
graph resources.
Remove aggressive polling during enqueue.
Fix DMABUF fallback on MOFED 5.4 and earlier.
---
 makefiles/version.mk                 |   4 +-
 src/bootstrap.cc                     |  29 +-
 src/channel.cc                       |   5 +-
 src/collectives/device/all_reduce.h  | 115 ++++--
 src/collectives/device/common.h      |   5 +-
 src/collectives/device/functions.cu  |   3 +-
 src/collectives/device/prims_ll.h    |  21 +-
 src/collectives/device/prims_ll128.h |  14 +-
 src/collectives/device/sendrecv.h    |  29 +-
 src/enqueue.cc                       | 255 ++++++------
 src/graph/connect.cc                 |  35 +-
 src/graph/paths.cc                   |  92 +++--
 src/graph/search.cc                  | 174 ++++-----
 src/graph/topo.cc                    |  69 ++--
 src/graph/topo.h                     |  54 +--
 src/graph/tuning.cc                  |  63 ++-
 src/group.cc                         | 380 +++++++++++++-----
 src/include/alloc.h                  |  24 +-
 src/include/checks.h                 |  10 +-
 src/include/collectives.h            |   7 +-
 src/include/comm.h                   |  39 +-
 src/include/cudawrap.h               |  32 +-
 src/include/devcomm.h                |  12 +-
 src/include/graph.h                  |   6 +-
 src/include/group.h                  |  33 +-
 src/include/info.h                   |   3 +-
 src/include/proxy.h                  |   2 +
 src/init.cc                          | 565 +++++++++++++++++++++------
 src/misc/cudawrap.cc                 |  95 ++---
 src/misc/gdrwrap.cc                  |  29 +-
 src/misc/ibvwrap.cc                  |  36 +-
 src/misc/shmutils.cc                 |  11 +-
 src/misc/socket.cc                   |  69 ++--
 src/nccl.h.in                        |  39 +-
 src/net.cc                           |   8 +-
 src/proxy.cc                         | 108 +++--
 src/transport/coll_net.cc            |  78 ++--
 src/transport/net.cc                 |  56 ++-
 src/transport/net_ib.cc              |  11 +-
 src/transport/net_socket.cc          |   3 +-
 src/transport/p2p.cc                 |  56 ++-
 src/transport/shm.cc                 |  50 ++-
 42 files changed, 1787 insertions(+), 942 deletions(-)

diff --git a/makefiles/version.mk b/makefiles/version.mk
index 496796a..55fa6cc 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
-NCCL_MINOR := 13
-NCCL_PATCH := 4
+NCCL_MINOR := 14
+NCCL_PATCH := 3
 NCCL_SUFFIX :=
 PKG_REVISION := 1
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index 0ba89a5..b7e0576 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -104,8 +104,8 @@ static void *bootstrapRoot(void* args) {
   /* Receive addresses from all ranks */
   do {
     struct ncclSocket sock;
-    sock.abortFlag = NULL; /* bootstrap root thread always uses blocking ncclSocketAccept. */
+    NCCLCHECKGOTO(ncclSocketInit(&sock, NULL, NULL, 0), res, out);
     NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
     NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
     close(sock.fd);
@@ -228,16 +228,17 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
   info.rank = rank;
   info.nranks = nranks;
   struct ncclSocket sock, listenSockRoot;
-  sock.abortFlag = listenSockRoot.abortFlag = comm->abortFlag;
-  sock.asyncFlag = listenSockRoot.asyncFlag = 0;
+  NCCLCHECK(ncclSocketInit(&sock, (union ncclSocketAddress*) id, comm->abortFlag, 0));
+  NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->abortFlag, 0));
+  NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->abortFlag, 0));
+  NCCLCHECK(ncclSocketInit(&state->ringSendSocket, NULL, comm->abortFlag, 0));
+  NCCLCHECK(ncclSocketInit(&state->ringRecvSocket, NULL, comm->abortFlag, 0));

   // Create socket for other ranks to contact me
-  memcpy(&state->listenSock.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
   NCCLCHECK(ncclSocketListen(&state->listenSock));
   memcpy(&info.extAddressListen, &state->listenSock.addr, sizeof(union ncclSocketAddress));

   // Create socket for root to contact me
-  memcpy(&listenSockRoot.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
   NCCLCHECK(ncclSocketListen(&listenSockRoot));
   memcpy(&info.extAddressListenRoot, &listenSockRoot.addr, sizeof(union ncclSocketAddress));
@@ -252,7 +253,6 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
   }

   // send info on my listening socket to root
-  memcpy(&sock.addr, id, sizeof(union ncclSocketAddress));
   NCCLCHECK(ncclSocketConnect(&sock));
   NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info)));
   close(sock.fd);
@@ -276,8 +276,7 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
   NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
   struct ncclSocket* proxySocket;
   NCCLCHECK(ncclCalloc(&proxySocket, 1));
-  proxySocket->abortFlag = NULL; // proxy is aborted through a message
-  memcpy(&proxySocket->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, NULL, 0));
   NCCLCHECK(ncclSocketListen(proxySocket));
   memcpy(state->peerProxyAddresses+rank, &proxySocket->addr, sizeof(union ncclSocketAddress));
   NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
@@ -317,9 +316,8 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
 ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
   struct bootstrapState* state = (struct bootstrapState*)commState;
   struct ncclSocket sock;
-  sock.abortFlag = state->abortFlag;
-  sock.asyncFlag = 0;
-  memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress));
+
+  NCCLCHECK(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->abortFlag, 1));
   NCCLCHECK(ncclSocketConnect(&sock));
   NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
   NCCLCHECK(bootstrapNetSend(&sock, &tag, sizeof(int)));
@@ -408,9 +406,7 @@ ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag,
 // We can't know who we'll receive from, so we need to receive everything at once
 ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
   struct bootstrapState* state = (struct bootstrapState*)commState;
-  struct ncclSocket sock;
-  sock.abortFlag = state->abortFlag;
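
// [Sketch, not part of the patch] The new ncclSocketInit() helper replaces the
// scattered manual field setup removed above. Judging from the call sites in
// this diff, it takes the socket, an optional address, an abort flag, and an
// async flag, and presumably resets the fd so unconnected sockets can be told
// apart. Typical init-then-connect usage would look like:
//
//   struct ncclSocket sock;
//   NCCLCHECK(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->abortFlag, /*asyncFlag=*/1));
//   NCCLCHECK(ncclSocketConnect(&sock));
//   NCCLCHECK(bootstrapNetSend(&sock, data, size));
//   close(sock.fd);
//
// This is also presumably why bootstrapClose() below can guard each close()
// with fd >= 0: sockets that were initialized but never connected stay at -1.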
  // Search unexpected connections first
  NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock));
@@ -421,6 +417,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int s
   }

   // Then look for new connections
+  NCCLCHECK(ncclSocketInit(&sock, NULL, state->listenSock.abortFlag, 0));
   while (1) {
     NCCLCHECK(ncclSocketAccept(&sock, &state->listenSock));
     int newPeer, newTag;
@@ -442,9 +439,9 @@ ncclResult_t bootstrapClose(void* commState) {
     WARN("Unexpected connections are not empty");
     return ncclInternalError;
   }
-  close(state->listenSock.fd);
-  close(state->ringSendSocket.fd);
-  close(state->ringRecvSocket.fd);
+  if (state->listenSock.fd >= 0) close(state->listenSock.fd);
+  if (state->ringSendSocket.fd >= 0) close(state->ringSendSocket.fd);
+  if (state->ringRecvSocket.fd >= 0) close(state->ringRecvSocket.fd);

   free(state->peerCommAddresses);
   free(state);
diff --git a/src/channel.cc b/src/channel.cc
index 4d28a68..9587008 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -40,7 +40,10 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
 }

 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
-  if (channel->id == -1) return ncclSuccess;
+  /* Channel peers are only valid once the async init thread has completed
+   * commAlloc() and the channel has been initialized with initChannel(); if
+   * either has not happened, this channel must never be freed. */
+  if (channel->id == -1 || channel->peers == NULL) return ncclSuccess;

   // Free transport proxy resources
   // Note: free all send resources first due to CollNet arrangement
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index 23f6d0a..3f12e5e 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -274,19 +274,19 @@ struct RunWorkElement
-struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE> {
+struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
   __device__ __forceinline__ void run(ncclWorkElem *args) {
     static constexpr int COLLNET_COPY_THREADS = 96;
     const int tid = threadIdx.x;
     const int bid = args->bid;
     const int nChannels = args->nChannels;
-    struct ncclDirect* tree = &ncclShmem.channel.collTree;
+    struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
     const ssize_t chunkSize = int(args->lastChunkSize);
     const ssize_t size = args->count;
-    const ssize_t loopSize = nChannels*tree->nHeads*chunkSize;
+    const ssize_t loopSize = nChannels*direct->nHeads*chunkSize;

-    const int hasUp = (tree->up[0] >= 0) ? 1 : 0;
-    const int hasDn = (tree->down[0] >= 0) ? 1 : 0;
+    const int hasUp = (direct->up[0] >= 0) ? 1 : 0;
+    const int hasDn = (direct->down[0] >= 0) ? 1 : 0;

     const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 3*COLLNET_COPY_THREADS : 0);
     const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
     const int nThreadsBcast = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ?
0 : 2*COLLNET_COPY_THREADS); @@ -301,24 +301,24 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize; - int nelem = min(tree->nHeads*chunkSize, size-offset); + ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; + int nelem = min(direct->nHeads*chunkSize, size-offset); if (args->regUsed) { - prims.directScatter(offset, nelem, chunkSize, tree->headRank, tree->shift); + prims.directScatter(offset, nelem, chunkSize, direct->headRank, direct->shift); } else { - prims.scatter(offset, nelem, chunkSize, tree->headRank, tree->shift); + prims.scatter(offset, nelem, chunkSize, direct->headRank, direct->shift); } } - } else if (tid >= tidStartReduce && tree->out != -1) { + } else if (tid >= tidStartReduce && direct->out != -1) { int group = (3*Proto::MaxGroupWidth) | (1<<16); if (hasDn) { // Reduce, send to network Primitives, /*Direct=*/1, Proto, 0> - prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; + ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); if (args->regUsed) { prims.directRecvReduceSend(offset, offset, nelem); @@ -329,9 +329,9 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group); + prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; + ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.send(offset, nelem); } @@ -340,29 +340,29 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize; - int nelem = min(tree->nHeads*chunkSize, size-offset); - prims.directGather(offset, nelem, chunkSize, tree->headRank, tree->shift); + ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; + int nelem = min(direct->nHeads*chunkSize, size-offset); + prims.directGather(offset, nelem, chunkSize, direct->headRank, direct->shift); } - } else if (tid >= tidStartBcast && tid < tidStartScatter && tree->out != -1) { + } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) { int group = (1*Proto::MaxGroupWidth) | (0<<16); if (hasDn) { // Recv from network, broadcast Primitives, /*Direct=*/1, Proto, 0> - prims(tid-tidStartBcast, nThreadsBcast, 
&tree->out, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; + ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recvCopyDirectSend(offset, offset, nelem, /*postOp=*/true); } } else { // Recv from network (no post thread needed) Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group); + prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; + ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recv(offset, nelem, /*postOp=*/true); } @@ -371,6 +371,73 @@ struct RunWorkElement +struct RunWorkElement { + __device__ __forceinline__ void run(ncclWorkElem *args) { + const int tid = threadIdx.x; + const int nthreads = args->nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; + ncclTree *tree = &ncclShmem.channel.collnetChain; + ssize_t chunkSize = int(args->lastChunkSize); + const ssize_t loopSize = int(nChannels*chunkSize); + const ssize_t size = args->count; + + int nthreadsSplit = nthreads/2; + if (nthreadsSplit >= 256) nthreadsSplit += 64; + + int group, send, recv, groupTid, groupNthreads; + using Proto = ProtoSimple<1, 1>; + if (tid < nthreadsSplit) { + group = (0*Proto::MaxGroupWidth) | (1<<16); + recv = tree->down[0]; + send = tree->up; + groupTid = tid; + groupNthreads = nthreadsSplit; + } else { + group = (1*Proto::MaxGroupWidth); + recv = tree->up; + send = tree->down[0]; + groupTid = tid - nthreadsSplit; + groupNthreads = nthreads-nthreadsSplit; + } + + Primitives, /*Direct=*/1, Proto, 0> + prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, args->redOpArg, group); + + if (tid < nthreadsSplit) { + if (recv == -1) { + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid*int(chunkSize); + int nelem = min(chunkSize, size-offset); + prims.send(offset, nelem); + } + } else { + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid*int(chunkSize); + int nelem = min(chunkSize, size-offset); + prims.recvReduceSend(offset, nelem); + } + } + } + else { + if (send == -1) { + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid*int(chunkSize); + int nelem = min(chunkSize, size-offset); + prims.directRecv(offset, nelem); + } + } else { + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid*int(chunkSize); + int nelem = min(chunkSize, size-offset); + prims.directRecvCopySend(offset, offset, nelem); + } + } + } + } +}; + template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index ab333b4..310938f 100644 --- a/src/collectives/device/common.h +++ 
b/src/collectives/device/common.h @@ -231,7 +231,8 @@ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \ #define IMPL_COLL3(func, devredop, type, ncclType) \ IMPL_COLL4(func, TREE, devredop, type, ncclType) \ IMPL_COLL4(func, RING, devredop, type, ncclType) \ - IMPL_COLL4(func, COLLNET, devredop, type, ncclType) + IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \ + IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType) #if NCCL_TYPE == 0 #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int8_t, ncclInt8) @@ -281,7 +282,7 @@ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \ // Point-to-point primitives only have one function/kernel. #define IMPL_COLL_P(func) \ IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \ - IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, 0); + IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P); #else #define IMPL_COLL_C(func) #define IMPL_COLL_P(func) diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index c7060f3..7c36064 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -18,7 +18,8 @@ __shared__ ncclShmemData ncclShmem; #define NCCL_FUNC4(func, devredop, type, nullify) \ NCCL_FUNC5(func, TREE, devredop, type, nullify), \ NCCL_FUNC5(func, RING, devredop, type, nullify), \ - NCCL_FUNC5(func, COLLNET, devredop, type, nullify) + NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \ + NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify) #if defined(__CUDA_BF16_TYPES_EXIST__) // Must be consistent with ncclDataType_t diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h index e8cc8e3..60f64ff 100644 --- a/src/collectives/device/prims_ll.h +++ b/src/collectives/device/prims_ll.h @@ -8,7 +8,10 @@ template class Primitives: public PrimitivesWithoutDirect> { - static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; + // In the case of Fan::MaxRecv == 0, we need to force MaxRecv to 1 for this to compile + // This is because of a recv buffer which is allocated to MaxRecv length in send-only cases + static constexpr int MaxRecv = Fan::MaxRecv > 1 ? 
Fan::MaxRecv : 1; + static constexpr int MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; RedOp redOp; const int tid; @@ -41,7 +44,10 @@ class Primitives: inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } inline __device__ void barrier() { - asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); + if (nthreads == WARP_SIZE) + __syncwarp(); + else + asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); } uint32_t abort = 0; @@ -319,18 +325,19 @@ class Primitives: void const *inputBuf, void *outputBuf, uint64_t redOpArg, int group=0 ): redOp(redOpArg), - tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), + tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group&(uint16_t)0xFFFF), stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) { - + int connIndex = group >> 16; auto *channel = &ncclShmem.channel; // If we are going to support oneshot collNet + LL, then we would need to add connector index here int nrecv=0, nsend=0; - while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { - loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv); + // We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1 + while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) { + loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { - loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend); + loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend); nsend++; } this->fan = Fan(nrecv, nsend); diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h index 93b6b4f..3136940 100644 --- a/src/collectives/device/prims_ll128.h +++ b/src/collectives/device/prims_ll128.h @@ -193,7 +193,8 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - } while (__any_sync(WARP_MASK, needReload) && checkAbort(spins, 0, 0) == 0); + needReload &= (0 == checkAbort(spins, 0, 0)); + } while (__any_sync(WARP_MASK, needReload)); } /************* Finish register load **************/ @@ -234,7 +235,8 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - } while (__any_sync(WARP_MASK, needReload) && checkAbort(spins, i, 0) == 0); + needReload &= (0 == checkAbort(spins, i, 0)); + } while (__any_sync(WARP_MASK, needReload)); #pragma unroll for (int u=0; u> 16; auto *channel = &ncclShmem.channel; int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { - loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv); + loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { - loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend); + loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend); nsend++; } this->fan = Fan(nrecv, nsend); diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h index feae653..ec1e20c 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/collectives/device/sendrecv.h @@ -10,7 +10,8 @@ template struct RunWork { - __device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { + template + __device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { void* buff = 
reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); size_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); if (args->peer == ncclShmem.comm.rank) { @@ -20,8 +21,8 @@ struct RunWork { ReduceOrCopyMulti(tid, nthreads, nullptr, false, 1, (const T**)&buff, 1, (T**)&recvBuff, count); } } else { - using Proto = ProtoSimple<1, 1>; - int const chunkSize = args->chunkSize/sizeof(T); + int chunkSize = args->chunkSize/sizeof(T); + if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; int const peer = args->peer; Primitives, 1, Proto, 1> prims (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group); @@ -34,12 +35,13 @@ struct RunWork { } } - __device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { + template + __device__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { if (args->peer != ncclShmem.comm.rank) { - using Proto = ProtoSimple<1, 1>; void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); - int const chunkSize = args->chunkSize/sizeof(T); + int chunkSize = args->chunkSize/sizeof(T); + if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize int const peer = args->peer; Primitives, 1, Proto, 1> prims (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group); @@ -70,10 +72,21 @@ struct RunWork { if (args->p2pType == ncclWorkP2pTypeUnused) return; if (tid >= nthreads || args->peer == -1) return; + + // Select Proto here + // This is to allow the same kernel to run multiple primitives on different warps (thread groups) if ((group%2) == 0) { - runRecv(tid, nthreads, group, args); + if (args->proto == NCCL_PROTO_LL) { + runRecv(tid, nthreads, group, args); + } else { + runRecv>(tid, nthreads, group, args); + } } else { - runSend(tid, nthreads, group, args); + if (args->proto == NCCL_PROTO_LL) { + runSend(tid, nthreads, group, args); + } else { + runSend>(tid, nthreads, group, args); + } } } }; diff --git a/src/enqueue.cc b/src/enqueue.cc index d3fbbe5..7c6d835 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -17,102 +17,75 @@ static void* const ncclKernelGeneric = (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t); +struct ncclKernelMatch { + void* kernelFn; + bool specialized; +}; + // Only generate inline kernels for LL -#define NCCL_FUNC5(func, algo, devredop, dtype) \ - /*LL */(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \ - /*LL128 */nullptr /*(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype)*/, \ - /*SIMPLE*/nullptr /*(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype)*/ +#define NCCL_FUNC5(func, algo, devredop, dtype, specialized) \ + /*LL */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), true && specialized}, \ + /*LL128 */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized}, \ + /*SIMPLE*/{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized} -#define NCCL_FUNC4(func, devredop, type) \ - (void*)NCCL_FUNC5(func, TREE, devredop, type), \ - (void*)NCCL_FUNC5(func, RING, devredop, type), \ - (void*)NCCL_FUNC5(func, COLLNET, devredop, type) +#define NCCL_FUNC4(func, devredop, type, specialized) \ + NCCL_FUNC5(func, TREE, devredop, type, specialized), \ + NCCL_FUNC5(func, RING, devredop, type, specialized), \ + NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \ + 
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, specialized) -#if defined(__CUDA_BF16_TYPES_EXIST__) -// Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(func, devredop) \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, uint8_t), \ - (void*)NCCL_FUNC4(func, devredop, int32_t), \ - (void*)NCCL_FUNC4(func, devredop, uint32_t), \ - (void*)NCCL_FUNC4(func, devredop, int64_t), \ - (void*)NCCL_FUNC4(func, devredop, uint64_t), \ - (void*)NCCL_FUNC4(func, devredop, half), \ - (void*)NCCL_FUNC4(func, devredop, float), \ - (void*)NCCL_FUNC4(func, devredop, double), \ - (void*)NCCL_FUNC4(func, devredop, __nv_bfloat16) -#define NCCL_FUNCS3B(func, devredop) \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t) +#ifdef __CUDA_BF16_TYPES_EXIST__ + #define HAVE_BFLOAT16 1 #else -// Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(func, devredop) \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, uint8_t), \ - (void*)NCCL_FUNC4(func, devredop, int32_t), \ - (void*)NCCL_FUNC4(func, devredop, uint32_t), \ - (void*)NCCL_FUNC4(func, devredop, int64_t), \ - (void*)NCCL_FUNC4(func, devredop, uint64_t), \ - (void*)NCCL_FUNC4(func, devredop, half), \ - (void*)NCCL_FUNC4(func, devredop, float), \ - (void*)NCCL_FUNC4(func, devredop, double) -#define NCCL_FUNCS3B(func, devredop) \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t), \ - (void*)NCCL_FUNC4(func, devredop, int8_t) + #define HAVE_BFLOAT16 0 #endif +// Must be consistent with ncclDataType_t +#define NCCL_FUNCS3(func, devredop, reduction, specialized) \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int8_t, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint8_t, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int32_t, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint32_t, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int64_t, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint64_t, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, half, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, float, int8_t), specialized), \ + NCCL_FUNC4(func, devredop, MACRO_IF(reduction, double, int8_t), specialized) \ + MACRO_IF(HAVE_BFLOAT16, \ + SINGLE_ARG(, NCCL_FUNC4(func, devredop, MACRO_IF(reduction, __nv_bfloat16, int8_t), specialized)), \ + /*nothing*/ \ + ) + // Must be consistent with ncclDevRedOp_t -- but we only generate kernel for sums. 
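
// [Sketch, not part of the patch] How the new {kernelFn, specialized} pairs
// are consumed: only LL kernels are compiled inline here, so the LL slot is
// marked specialized while the LL128/SIMPLE slots alias the LL symbol (keeping
// cudaFuncGetAttributes usable) but stay unspecialized. The planner then keeps
// the first specialized kernel it encounters and otherwise launches the
// generic entry, as in the scheduleCollTasksToPlan() change later in this
// patch:
//
//   if (!plan->kernelSpecialized) {
//     plan->kernelFn = ncclKerns[workFuncIndex].kernelFn;
//     plan->kernelSpecialized = ncclKerns[workFuncIndex].specialized;
//   }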
-#define NCCL_FUNCS2A(func) \ - NCCL_FUNCS3A(func, Sum), /*Sum*/ \ - NCCL_FUNCS3A(func, Sum), /*Prod*/ \ - NCCL_FUNCS3A(func, Sum), /*Max*/ \ - NCCL_FUNCS3A(func, Sum), /*Min*/ \ - NCCL_FUNCS3A(func, Sum), /*PreMulSum*/ \ - NCCL_FUNCS3A(func, Sum) /*SumPostDiv*/ -#define NCCL_FUNCS2B(func) \ - NCCL_FUNCS3B(func, Sum), /*Sum*/ \ - NCCL_FUNCS3B(func, Sum), /*Prod*/ \ - NCCL_FUNCS3B(func, Sum), /*Max*/ \ - NCCL_FUNCS3B(func, Sum), /*Min*/ \ - NCCL_FUNCS3B(func, Sum), /*PreMulSum*/ \ - NCCL_FUNCS3B(func, Sum) /*SumPostDiv*/ +#define NCCL_FUNCS2(func, reduction) \ + NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/1), /*Sum*/ \ + NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Prod*/ \ + NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Max*/ \ + NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Min*/ \ + NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*PreMulSum*/ \ + NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0) /*SumPostDiv*/ // Must be consistent with the ncclFuncSet enum -static void* const ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { - (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), +static const ncclKernelMatch ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { + {(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), true}, // We don't bake special kernels for the one-rank reductions - /*int8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*uint8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*int32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*uint32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*int64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*uint64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*half*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*float*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - /*double*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - #if defined(__CUDA_BF16_TYPES_EXIST__) - /*bfloat16*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), + {/*int8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*uint8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*int32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*uint32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*int64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*uint64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*half*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*float*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + {/*double*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, + #if HAVE_BFLOAT16 + {/*bfloat16*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, #endif - NCCL_FUNCS2B(Broadcast), - NCCL_FUNCS2A(Reduce), - NCCL_FUNCS2B(AllGather), - NCCL_FUNCS2A(ReduceScatter), - NCCL_FUNCS2A(AllReduce) + NCCL_FUNCS2(Broadcast, /*reduction=*/0), + NCCL_FUNCS2(Reduce, /*reduction=*/1), + NCCL_FUNCS2(AllGather, /*reduction=*/0), + NCCL_FUNCS2(ReduceScatter, /*reduction=*/1), + NCCL_FUNCS2(AllReduce, /*reduction=*/1) }; static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, 
struct ncclProxyOp* proxyOp /* output */); @@ -124,10 +97,8 @@ size_t ncclKernMaxLocalSize() { cudaFuncAttributes attr = {0}; size_t max = 0; for (int i = 0; i < numNcclKerns; i++) { - if (ncclKerns[i] != nullptr) { - CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i]), res, error); - if (attr.localSizeBytes > max) max = attr.localSizeBytes; - } + CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i].kernelFn), res, error); + if (attr.localSizeBytes > max) max = attr.localSizeBytes; } error: @@ -139,7 +110,7 @@ ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut) { ncclResult_t res = ncclSuccess; int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]); for (int i = 0; i < numNcclKerns; i++) { - CUDACHECKGOTO(cudaFuncSetAttribute(ncclKerns[i], cudaFuncAttributePreferredSharedMemoryCarveout, carveOut), res, error); + CUDACHECKGOTO(cudaFuncSetAttribute(ncclKerns[i].kernelFn, cudaFuncAttributePreferredSharedMemoryCarveout, carveOut), res, error); } error: @@ -331,14 +302,14 @@ static ncclResult_t addCollToPlan( workElemReg.elem = *workElem; // C++ struct assignment workElemReg.elem.regUsed = 1; for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { - int peer = channel->collTree.down[i]; + int peer = channel->collnetDirect.down[i]; if (peer == -1) break; int j = comm->rankToLocalRank[peer]; // Get intra-node slot workElemReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer workElemReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer } for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { - int peer = channel->collTree.up[i]; + int peer = channel->collnetDirect.up[i]; if (peer == -1) break; int j = comm->rankToLocalRank[peer]; // Output buffer of root peer @@ -360,6 +331,8 @@ static ncclResult_t addCollToPlan( return ncclSuccess; } +NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384); + // Put p2p op in plan assuming there is space in nWorkBudget, so you must // ensure *nWorkBudget >= 1 upon entry. static ncclResult_t addP2pToPlan( @@ -377,10 +350,16 @@ static ncclResult_t addP2pToPlan( NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, info.coll, &channelId)); info.channelId = channelId; + // 1 is connIndex + struct ncclConnInfo* conn = isSendNotRecv ? + &comm->channels[channelId].peers[peer].send[1].conn : &comm->channels[channelId].peers[peer].recv[1].conn; + info.protocol = ((conn->buffs[NCCL_PROTO_LL] != nullptr) && bytes <= ncclParamP2pLLThreshold()) ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE; + struct ncclProxyOp proxyOp = {}; NCCLCHECK(ncclProxyComputeP2p(&info, &proxyOp)); struct ncclWorkElemP2p elem = {0}; + elem.proto = info.protocol; elem.peer = peer; elem.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE; elem.p2pType = isSendNotRecv ? 
ncclWorkP2pTypeSend : ncclWorkP2pTypeRecv; @@ -421,8 +400,6 @@ static void finishPlan(struct ncclKernelPlan* plan) { plan->channelCount = channelCount; plan->channelMask = channelMask; plan->hasProxyOps = hasProxyOps; - if (plan->kernelFn == nullptr) - plan->kernelFn = ncclKernelGeneric; plan->threadPerBlock = std::max(plan->threadPerBlock, 3*WARP_SIZE); } @@ -582,7 +559,7 @@ static ncclResult_t scheduleCollTasksToPlan( void* regBufSend[NCCL_MAX_LOCAL_RANKS]; void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; if (plan->persistent && ncclParamGraphRegister() && - info.algorithm == NCCL_ALGO_COLLNET && // limited to CollNet for now + info.algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other comm->intraRanks < comm->localRanks) { // only with inter-process & intra-node peers NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, ®BufUsed, regBufSend, regBufRecv)); @@ -596,8 +573,10 @@ static ncclResult_t scheduleCollTasksToPlan( head = ncclIntruQueueHead(&tasks->collQueue); plan->threadPerBlock = std::max(plan->threadPerBlock, info.nThreads); - if (ncclKerns[workFuncIndex] != nullptr) - plan->kernelFn = ncclKerns[workFuncIndex]; + if (!plan->kernelSpecialized) { + plan->kernelFn = ncclKerns[workFuncIndex].kernelFn; + plan->kernelSpecialized = ncclKerns[workFuncIndex].specialized; + } } } return ncclSuccess; @@ -623,11 +602,15 @@ static ncclResult_t scheduleP2pTasksToPlan( int const *recvOrder = tasks->p2pRecvOrder; plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS); + if (!plan->kernelSpecialized) { + plan->kernelFn = ncclKerns[FUNC_INDEX_P2P].kernelFn; + plan->kernelSpecialized = ncclKerns[FUNC_INDEX_P2P].specialized; + } // Compute how much to split operations // Natural step size matching buffer steps. ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; - if (comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR; + if (comm->nNodes > 1) stepSize = comm->p2pNetChunkSize; // Try to use all channels int nChannelsMax = comm->p2pnChannelsPerPeer; int nChannelsMin = nChannelsMax; @@ -723,7 +706,6 @@ static inline uint32_t rollingMin32(uint32_t a, uint32_t b) { // Spin until its safe to increase comm->workFifoSent to desiredSent. static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredSent) { if (__builtin_expect(rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent), false)) { - uint64_t t0 = clockNano(); while (1) { // We have to poll for notifications from device. uint32_t* doneLive = comm->workFifoDone; @@ -756,8 +738,7 @@ static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredSent) { // See if that was enough. if (!rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent)) break; - // Nope. Maintain vigorous spin for first 5us, then start yielding. 
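
// [Sketch, not part of the patch] The LL protocol choice for P2P added in
// addP2pToPlan() above is driven by NCCL_PARAM(P2pLLThreshold, ...), i.e. the
// NCCL_P2P_LL_THRESHOLD environment variable (default 16384 bytes): a
// send/recv uses NCCL_PROTO_LL only when the LL buffer exists on connIndex 1
// and the message is at or below the threshold. For example (launcher and
// binary names are placeholders):
//
//   NCCL_P2P_LL_THRESHOLD=32768 ./app   // use LL for p2p messages up to 32 KB
//   NCCL_P2P_LL_THRESHOLD=0 ./app       // effectively disable LL for p2p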
- if (clockNano()-t0 >= 5*1000) sched_yield(); + sched_yield(); } } } @@ -883,10 +864,10 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` if (plan->persistent) { comm->persistentRefs -= 1; - if (!ncclMainExited) NCCLCHECK(ncclCudaFree(plan->workHead)); + NCCLCHECK(ncclCudaFree(plan->workHead)); while (!ncclIntruQueueEmpty(&plan->ipcMemQueue)) { struct ncclPointerList* q = ncclIntruQueueDequeue(&plan->ipcMemQueue); - if (!ncclMainExited) CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr)); + CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr)); ncclMemoryPoolFree(&comm->memPool_ncclPointerList, q); } } @@ -913,7 +894,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { // Poll for callbacks sent to us from other threads. Typically these free // resources from to our memory pools. - NCCLCHECK(ncclCommPollCallbacks(comm)); + NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/false)); // We already have one frame present which holds all of our tasks (which we // are about to schedule). Now push an additional frame for allocating @@ -1080,7 +1061,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i info->protocol = -1; int nAlgos = NCCL_NUM_ALGORITHMS; for (int a=0; anChannels > 0) ? info->nChannels : comm->nChannels; int nt = comm->maxThreads[info->algorithm][info->protocol]; int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol]; - if (info->algorithm == NCCL_ALGO_COLLNET) { + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // CollNet channel tuning int ncSwitch = 16; bool flag = true; while (ncSwitch >= 1 && flag) { - while ((flag = info->nBytes < nc*nt*info->comm->channels[0].collTree.nHeads*threadThreshold) && nc > ncSwitch) { + while ((flag = info->nBytes < nc*nt*info->comm->channels[0].collnetDirect.nHeads*threadThreshold) && nc > ncSwitch) { if (nc == ncSwitch+ncSwitch/2) threadThreshold /= 2; nc--; } @@ -1125,7 +1106,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i nt += WARP_SIZE; // Extra warp for sync // More threads or sync warps needed due to split thread model if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE; - if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE; + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE; + if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE; } nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt; info->nChannels = nc; @@ -1143,7 +1125,11 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) { case ncclFuncAllGather: info->pattern = ncclPatternRing; break; case ncclFuncAllReduce: - info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUpDown : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break; + info->pattern = + info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : + info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain : + info->algorithm == NCCL_ALGO_TREE ? 
ncclPatternTreeUpDown : + ncclPatternRingTwice; break; default: WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm); return ncclInternalError; @@ -1158,9 +1144,10 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { case ncclPatternTreeUpDown: case ncclPatternPipelineFrom: case ncclPatternPipelineTo: + case ncclPatternCollnetChain: info->nstepsPerLoop = info-> nchunksPerLoop = 1; break; - case ncclPatternCollTreeUpDown: - info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collTree.nHeads; break; + case ncclPatternCollnetDirect: + info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break; case ncclPatternRing: info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break; case ncclPatternRingTwice: @@ -1217,15 +1204,22 @@ comp_next: } // Use lastChunkSize as chunkSize work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); - } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) { + } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // Optimize chunkSize / nSteps - while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*64 && chunkSize > 131072) chunkSize /= 2; - while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 65536) chunkSize /= 2; - while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2; + while (info->nBytes / (info->nChannels*info->comm->channels[0].collnetDirect.nHeads*chunkSize) < info->comm->channels[0].collnetDirect.depth*64 && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (info->nChannels*info->comm->channels[0].collnetDirect.nHeads*chunkSize) < info->comm->channels[0].collnetDirect.depth*8 && chunkSize > 65536) chunkSize /= 2; + while (info->nBytes / (info->nChannels*info->comm->channels[0].collnetDirect.nHeads*chunkSize) < info->comm->channels[0].collnetDirect.depth*8 && chunkSize > 32768) chunkSize /= 2; // Use lastChunkSize as chunkSize work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); // Set direct direction for broadcast-gather (read or write) work->direct = (info->nBytes / info->nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ; + } else if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) { + stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; + chunkSize = std::min(256*1024, stepSize*chunkSteps); + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*64 && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*8 && chunkSize > 65536) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; + work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->protocol == NCCL_PROTO_LL) { const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine); const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; @@ -1254,7 +1248,7 @@ comp_next: proxyOp->chunkSize = chunkSize; proxyOp->protocol = info->protocol; proxyOp->dtype = info->datatype; - proxyOp->redOp = info->algorithm != NCCL_ALGO_COLLNET ? 
ncclNumOps : // Only set redOp when using CollNet + proxyOp->redOp = (info->algorithm != NCCL_ALGO_COLLNET_DIRECT && info->algorithm != NCCL_ALGO_COLLNET_CHAIN) ? ncclNumOps : // Only set redOp when using CollNet info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum info->op; proxyOp->pattern = info->pattern; @@ -1444,30 +1438,43 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { NCCLCHECK(ncclGroupStartInternal()); ncclResult_t ret = ncclSuccess; int devOld = -1; - NCCLCHECKGOTO(PtrCheck(info->comm, info->opName, "comm"), ret, end0); + + NCCLCHECKGOTO(PtrCheck(info->comm, info->opName, "comm"), ret, fail); + // Check whether communicator is ready to communicate + NCCLCHECKGOTO(ncclCommEnsureReady(info->comm), ret, fail); + if (info->comm->checkPointers) { - CUDACHECKGOTO(cudaGetDevice(&devOld), ret, end0); - CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end0); + CUDACHECKGOTO(cudaGetDevice(&devOld), ret, fail); + CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, fail); } - NCCLCHECKGOTO(ArgsCheck(info), ret, end1); + NCCLCHECKGOTO(ArgsCheck(info), ret, fail); INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zi,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast(info->sendbuff), reinterpret_cast(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream); - NCCLCHECKGOTO(taskAppend(info->comm, info), ret, end1); + NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail); -end1: - if (devOld != -1) CUDACHECKGOTO(cudaSetDevice(devOld), ret, end0); -end0: +exit: + if (devOld != -1) CUDACHECK(cudaSetDevice(devOld)); ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); + /* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change + * so we have to check state here. */ + if (info->comm && !info->comm->blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) }; return ret; +fail: + if (info->comm && !info->comm->blocking) (void) ncclCommSetAsyncError(info->comm, ret); + goto exit; } NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) { + NCCLCHECK(PtrCheck(comm, "ncclRedOpCreatePreMulSum", "comm")); + /* join init thread before creating PreMulSum op. 
*/ + NCCLCHECK(ncclCommEnsureReady(comm)); + if (comm->userRedOpFreeHead == comm->userRedOpCapacity) { // double capacity and resize int cap = 2*comm->userRedOpCapacity; diff --git a/src/graph/connect.cc b/src/graph/connect.cc index da9a360..01ff282 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -25,13 +25,15 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclChannel* channel = comm->channels+c; channel->ring.prev = channel->ring.next = -1; channel->tree.up = -1; + channel->collnetChain.up = -1; for (int i=0; itree.down[i] = -1; - channel->collTree.out = -1; - channel->collTree.headRank = -1; - channel->collTree.nHeads = 0; - channel->collTree.shift = 0; - for (int i=0; icollTree.up[i] = -1; - for (int i=0; icollTree.down[i] = -1; + for (int i=0; icollnetChain.down[i] = -1; + channel->collnetDirect.out = -1; + channel->collnetDirect.headRank = -1; + channel->collnetDirect.nHeads = 0; + channel->collnetDirect.shift = 0; + for (int i=0; icollnetDirect.up[i] = -1; + for (int i=0; icollnetDirect.down[i] = -1; int* ringIntra = ringGraph->intra+c*localRanks; int* treeIntra = treeGraph->intra+c*localRanks; @@ -53,6 +55,8 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, topoRanks->treeToChild1[c] = treeIntra[child1Index]; channel->tree.up = i == 0 ? -1 : treeIntra[i-1]; channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1]; + channel->collnetChain.up = i == 0 ? comm->nRanks : treeIntra[i-1]; + channel->collnetChain.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1]; } } topoRanks->ringPrev[c] = channel->ring.prev; @@ -185,13 +189,13 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* int nDown = 0; for (int i=0; icollTree.headRank = i; // Mark the index for deciding offset in the CUDA kernel - channel->collTree.out = comm->nRanks; // Set root of collTree to id nranks + channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel + channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks int* collNetIntra = collNetGraph->intra+i*localRanks; sprintf(line+strlen(line), "down "); for (int r=0; rcollTree.down[nDown++] = collNetIntra[r]; // connect to all peers + channel->collnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers sprintf(line+strlen(line), " %d ", collNetIntra[r]); } sprintf(line+strlen(line), "nDown %d ", nDown); @@ -203,15 +207,16 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* sprintf(line+strlen(line), "up "); for (int h=0; hcollTree.up[nUp++] = heads[h]; + channel->collnetDirect.up[nUp++] = heads[h]; sprintf(line+strlen(line), " %d ", heads[h]); } - channel->collTree.nHeads = nHeads; - channel->collTree.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously - channel->collTree.depth = (nUp == 0 && nDown == 0) ? 1 : 2; + channel->collnetDirect.nHeads = nHeads; + channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously + channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 
1 : 2; sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads); - sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift); + sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift); INFO(NCCL_GRAPH, "%s", line); + channel->collnetChain.depth = comm->nRanks/comm->nNodes; } free(heads); return ncclSuccess; @@ -296,7 +301,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa // Setup CollNet if (comm->collNetSupport == 1) { // Add more channels to saturate intra-node bandwidth, except the 1 PPN case - if (collNetGraph->speedIntra > collNetGraph->speedInter && comm->nRanks > comm->nNodes) { + if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) { int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext); } diff --git a/src/graph/paths.cc b/src/graph/paths.cc index ab8f8c3..f8918b1 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -44,7 +44,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT struct ncclTopoLinkList* basePath; NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath)); basePath->count = 0; - basePath->width = LOC_WIDTH; + basePath->bw = LOC_BW; basePath->type = PATH_LOC; while (nodeList.count) { @@ -61,13 +61,13 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT } struct ncclTopoLinkList* remPath; NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath)); - float width = std::min(path->width, link->width); + float bw = std::min(path->bw, link->bw); // allow routing through a GPU only as 1 hop if (node != baseNode && node->type == GPU && (ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue; - if ((remPath->width == 0 || remPath->count > path->count) && remPath->width < width) { + if ((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) { // Find reverse link for (int l=0; lnlinks; l++) { if (remNode->links[l].remNode == node) { @@ -83,7 +83,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT // Copy the rest of the path for (int i=0; icount; i++) remPath->list[i+1] = path->list[i]; remPath->count = path->count + 1; - remPath->width = width; + remPath->bw = bw; // Start with path type = link type. PATH and LINK types are supposed to match. // Don't consider LINK_NET as we only care about the NIC->GPU path. 
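
// [Sketch, not part of the patch] The two CollNet variants wired up above use
// different per-channel structures (names taken from this diff):
//
//   struct ncclTree*   chain  = &channel->collnetChain;   // intra-node chain; up == comm->nRanks marks the network hop
//   struct ncclDirect* direct = &channel->collnetDirect;  // one-hop fan to/from up to NCCL_MAX_DIRECT_ARITY heads
//
// connectCollNet() sets collnetChain.depth = nRanks/nNodes (data is forwarded
// rank to rank along the chain) while collnetDirect.depth is 1 or 2; the
// chunk-size tuning in enqueue.cc relies on these depths.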
@@ -129,9 +129,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id); offset = strlen(line); } - INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].width); + INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].bw); #else - sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, topoPathTypeStr[node->paths[t][n].type]); + sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].bw, topoPathTypeStr[node->paths[t][n].type]); offset = strlen(line); #endif } @@ -185,7 +185,7 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, srcNode->paths[t2][i2].count = l; srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN; - srcNode->paths[t2][i2].width = std::min(srcNode->paths[tx][ix].width, cpuNode->paths[t2][i2].width); + srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw); return ncclSuccess; } @@ -399,6 +399,40 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int return ncclSuccess; } +NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0); + +// Check whether going through the network would be faster than going through P2P/SHM. +ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net) { + if (ncclParamNetDisableIntra() == 1) { + *net = 0; + return ncclSuccess; + } + *net = 1; + // First check the current GPU-to-GPU speed. + int g1, g2; + if (ncclTopoIdToIndex(system, GPU, id1, &g1) != ncclSuccess || + ncclTopoIdToIndex(system, GPU, id2, &g2) != ncclSuccess) { + return ncclSuccess; + } + + struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; + struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2; + float speed = gpu1->paths[GPU][g2].bw; + + // Now check the speed each GPU can access the network through PXB or better + float netSpeed1 = 0, netSpeed2 = 0; + for (int n=0; nnodes[NET].count; n++) { + struct ncclTopoLinkList* path = gpu1->paths[NET]+n; + if (path->type <= PATH_PXB && path->bw > netSpeed1) netSpeed1 = path->bw; + path = gpu2->paths[NET]+n; + if (path->type <= PATH_PXB && path->bw > netSpeed2) netSpeed2 = path->bw; + } + + if (netSpeed1 > speed && netSpeed2 > speed) return ncclSuccess; + *net = 0; + return ncclSuccess; +} + ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank) { // Get GPU and NET int n, g; @@ -476,17 +510,23 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm // Remove everything in case we're re-computing for (int t=0; tnodes[CPU].count; c++) { NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system)); } - // Set direct paths from/to GPUs. + // Set direct paths to GPUs. for (int g=0; gnodes[GPU].count; g++) { - // Compute paths to GPU g NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system)); + } - // Update path when we don't want to / can't use GPU Direct P2P + // Set direct paths to NICs. 
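
// [Sketch, not part of the patch] Worked example for ncclTopoCheckNet() above,
// with assumed bandwidths: if two GPUs only reach each other through PCIe at
// 12 GB/s (gpu1->paths[GPU][g2].bw == 12) but each reaches some NIC at
// PATH_PXB or better with 25 GB/s, then netSpeed1 == netSpeed2 == 25 > 12 and
// *net stays 1, letting NCCL prefer the network over P2P/SHM for that pair.
// Setting NCCL_NET_DISABLE_INTRA=1 (the new param above) forces *net = 0.
// Typical call shape:
//
//   int net;
//   NCCLCHECK(ncclTopoCheckNet(system, info1->busId, info2->busId, &net));
//   if (net) { /* consider the NET transport even intra-node */ }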
+ for (int n=0; nnodes[NET].count; n++) { + NCCLCHECK(ncclTopoSetPaths(system->nodes[NET].nodes+n, system)); + } + + // Update path for GPUs when we don't want to / can't use GPU Direct P2P + for (int g=0; gnodes[GPU].count; g++) { for (int p=0; pnodes[GPU].count; p++) { int p2p; NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL)); @@ -499,31 +539,32 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm } if (comm == NULL) continue; - // Remove GPUs we can't talk to because of containers. + // Remove GPUs we can't (or don't want to) communicate with through P2P or SHM struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank; for (int p=0; pnodes[GPU].count; p++) { if (p == g) continue; struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank; - int shm; - NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo)); int p2p; NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo)); - if (shm == 0 && p2p == 0) { - // Mark this peer as inaccessible. We'll trim it later. - system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; + if (p2p == 0) { + int shm; + NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo)); + if (shm == 0) { + // Mark this peer as inaccessible. We'll trim it later. + system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; + } } } } - // Set direct paths from/to NICs. + // Update paths for NICs (no GPU Direct, PXN, ...) for (int n=0; nnodes[NET].count; n++) { struct ncclTopoNode* netNode = system->nodes[NET].nodes+n; - NCCLCHECK(ncclTopoSetPaths(netNode, system)); for (int g=0; gnodes[GPU].count; g++) { // Check whether we can access the NIC through another NVLink-connected GPU (PXN) struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - if (ncclPxnDisable(comm) != 1 && gpu->paths[NET][n].type > PATH_PXB) { + if (ncclPxnDisable(comm) != 1) { int pxnGpu = -1; for (int p=0; pnodes[GPU].count; p++) { @@ -531,7 +572,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm // PXN = PCI + NVLink. struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p; - if (peerNode->paths[NET][n].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue; + // Only use PXN for NIC n if remote GPU p ... + if (peerNode->paths[NET][n].type > PATH_PXB || // Is connected to the NIC through PCI + peerNode->paths[GPU][g].type > PATH_NVL || // Is connected to us through NVLink + (peerNode->paths[NET][n].bw <= gpu->paths[NET][n].bw && // Has either higher BW to that NIC + gpu->paths[NET][n].type <= PATH_PXB)) // or avoids going through a CPU + continue; pxnGpu = p; @@ -626,8 +672,8 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /* // Local rank path = system->nodes[GPU].nodes[peer].paths[GPU]+g; if (path->type == PATH_NVL) { - float nvlWidth = ncclTopoNVLinkSpeed(system->nodes[GPU].nodes[g].gpu.cudaCompCap); - *nChannels = 2*std::max(1, (int)(path->width / nvlWidth)); + float nvlBw = ncclTopoNVLinkBw(system->nodes[GPU].nodes[g].gpu.cudaCompCap); + *nChannels = 2*std::max(1, (int)(path->bw / nvlBw)); } else { *nChannels = 2; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 0f79258..27a8e43 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -10,39 +10,39 @@ #include "xml.h" #include -// Initialize system->maxWidth. This is the per-channel (i.e. 
per-SM) -// max speed. -static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) { - float maxWidth = 0.0; +// Initialize system->maxBw. This is the per-channel (i.e. per-SM) +// max bw. +static float getMaxBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) { + float maxBw = 0.0; for (int i=0; inodes[type].count; i++) { struct ncclTopoLinkList* path = gpu->paths[type]+i; - float width = path->width; + float bw = path->bw; if (path->count == 0) continue; - maxWidth = std::max(maxWidth, width); + maxBw = std::max(maxBw, bw); } - return maxWidth; + return maxBw; } -static float getTotalWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { - float nvlinkWidth = 0.0, pciWidth = 0.0; +static float getTotalBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { + float nvlinkBw = 0.0, pciBw = 0.0; for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = gpu->links+l; - if (link->type == LINK_NVL) nvlinkWidth += link->width; - if (link->type == LINK_PCI) pciWidth = link->width; + if (link->type == LINK_NVL) nvlinkBw += link->bw; + if (link->type == LINK_PCI) pciBw = link->bw; } - return std::max(pciWidth, nvlinkWidth); + return std::max(pciBw, nvlinkBw); } ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) { - system->maxWidth = 0.0; - system->totalWidth = 0.0; + system->maxBw = 0.0; + system->totalBw = 0.0; int inter = system->nodes[NET].count; if (inter == 0 && system->nodes[GPU].count == 1) { - system->maxWidth = LOC_WIDTH; + system->maxBw = LOC_BW; return ncclSuccess; } for (int g=0; gnodes[GPU].count; g++) { struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU)); - system->totalWidth = std::max(system->totalWidth, getTotalWidth(system, gpu)); + system->maxBw = std::max(system->maxBw, getMaxBw(system, gpu, inter ? NET : GPU)); + system->totalBw = std::max(system->totalBw, getTotalBw(system, gpu)); } return ncclSuccess; } @@ -62,8 +62,8 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* // This is unfortunately needed since manipulating floats often results in rounding errors. #define SUB_ROUND(a, b) (a = roundf((a-b)*1000)/1000) -static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float speed, int* steps) { - float pciSpeed = speed; +static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float bw, int* steps) { + float pciBw = bw; for (int step=0; stepcount; step++) { struct ncclTopoNode* node = path->list[step]->remNode; if (node->type == CPU) { @@ -71,7 +71,7 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod if (path->type == PATH_PHB && start->type == GPU && node->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && node->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { - pciSpeed = INTEL_P2P_OVERHEAD(speed); + pciBw = INTEL_P2P_OVERHEAD(bw); } } } @@ -80,19 +80,19 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod for (int step=0; steplist[step]; struct ncclTopoLink* revLink = NULL; - float fwSpeed = link->type == LINK_PCI ? pciSpeed : speed; - float revSpeed = 0; + float fwBw = link->type == LINK_PCI ? 
pciBw : bw; + float revBw = 0; if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) { if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink)); - revSpeed += fwSpeed/8; + revBw += fwBw/8; } if (link->remNode->type == CPU && link->type == LINK_NVL) { if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink)); - revSpeed += fwSpeed; + revBw += fwBw; } - if (link->width < fwSpeed || (revSpeed && revLink->width < revSpeed)) { *steps = step; return ncclSuccess; } - SUB_ROUND(link->width, fwSpeed); - if (revSpeed) SUB_ROUND(revLink->width, revSpeed); + if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; } + SUB_ROUND(link->bw, fwBw); + if (revBw) SUB_ROUND(revLink->bw, revBw); node = link->remNode; } *steps = maxSteps; @@ -111,16 +111,16 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc // Now check link type *node = NULL; int intra = type1 == GPU && type2 == GPU; - float speed = intra ? graph->speedIntra : graph->speedInter; + float bw = intra ? graph->bwIntra : graph->bwInter; int type = intra ? graph->typeIntra : graph->typeInter; if (mult == 1 && (path->type > type)) return ncclSuccess; - speed *= mult; + bw *= mult; // Check there is enough bandwidth on paths. int step = 0; - NCCLCHECK(followPath(path, node1, path->count, speed, &step)); + NCCLCHECK(followPath(path, node1, path->count, bw, &step)); if (step < path->count) goto rewind; // Enough bandwidth : return destination node. @@ -130,11 +130,11 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc rewind: // Not enough bandwidth : rewind and exit. - NCCLCHECK(followPath(path, node1, step, -speed, &step)); + NCCLCHECK(followPath(path, node1, step, -bw, &step)); return ncclSuccess; } -static int gpuPciWidth(struct ncclTopoNode* gpu) { +static int gpuPciBw(struct ncclTopoNode* gpu) { for (int l=0; lnlinks; l++) { struct ncclTopoLink* gpuLink = gpu->links+l; if (gpuLink->type != LINK_PCI) continue; @@ -142,7 +142,7 @@ static int gpuPciWidth(struct ncclTopoNode* gpu) { for (int l=0; lnlinks; l++) { struct ncclTopoLink* pciLink = pci->links+l; if (pciLink->remNode != gpu) continue; - return std::min(gpuLink->width, pciLink->width); + return std::min(gpuLink->bw, pciLink->bw); } } return -1; @@ -154,29 +154,29 @@ struct ncclGpuScore { int g; // Retain the index int startIndex; // Least important int intraNhops; - int intraWidth; + int intraBw; int interNhops; - int interPciWidth; - int interWidth; // Most important + int interPciBw; + int interBw; // Most important }; static int cmpScore(const void * g1, const void * g2) { struct ncclGpuScore *s1 = (struct ncclGpuScore*)g1; struct ncclGpuScore *s2 = (struct ncclGpuScore*)g2; int d; - if ((d = (s2->interWidth - s1->interWidth))) return d; - if ((d = (s2->interPciWidth - s1->interPciWidth))) return d; + if ((d = (s2->interBw - s1->interBw))) return d; + if ((d = (s2->interPciBw - s1->interPciBw))) return d; if ((d = (s1->interNhops - s2->interNhops))) return d; - if ((d = (s2->intraWidth - s1->intraWidth))) return d; + if ((d = (s2->intraBw - s1->intraBw))) return d; if ((d = (s1->intraNhops - s2->intraNhops))) return d; return s1->startIndex - s2->startIndex; } static int cmpIntraScores(struct ncclGpuScore* scores, int count) { - int intraWidth = scores[0].intraWidth; + int intraBw = scores[0].intraBw; int intraNhops = scores[0].intraNhops; for (int i=1; inodes[GPU].nodes+g); - scores[count].interWidth = 
netPaths[g].width; + scores[count].interPciBw = gpuPciBw(system->nodes[GPU].nodes+g); + scores[count].interBw = netPaths[g].bw; } count++; } @@ -295,8 +295,8 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoG if (graph->nChannels < graph->minChannels) return ncclSuccess; // 2. Try to get better bandwidth - if (graph->nChannels*graph->speedIntra < refGraph->nChannels*refGraph->speedIntra) return ncclSuccess; - if (graph->nChannels*graph->speedIntra > refGraph->nChannels*refGraph->speedIntra) { + if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess; + if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) { *copy = 1; return ncclSuccess; } @@ -396,23 +396,23 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo // Balanced Tree : count half of the bandwidth on first two GPUs int nextBackToNet = -1; - float speedInterSave = graph->speedInter; + float bwInterSave = graph->bwInter; if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) { // Count half of the bandwidth on each of the first two GPUs if (step == 0) nextBackToNet = 1; else if (net->id != graph->inter[graph->nChannels*2+1]) continue; - graph->speedInter /= 2; + graph->bwInter /= 2; } NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); - graph->speedInter = speedInterSave; + graph->bwInter = bwInterSave; if (net) { graph->inter[graph->nChannels*2+1] = net->id; NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time)); - if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->speedInter /= 2; + if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net)); - graph->speedInter = speedInterSave; + graph->bwInter = bwInterSave; } } free(nets); @@ -451,7 +451,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo } ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { - const int speed = graph->speedInter; + const int bw = graph->bwInter; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); int netcount; @@ -461,7 +461,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo struct ncclTopoNode* net = system->nodes[NET].nodes+n; struct ncclTopoNode* gpu; if (graph->collNet && net->net.collSupport == 0) continue; - if (net->net.width < speed) continue; + if (net->net.bw < bw) continue; if (net->net.maxChannels == 0) continue; graph->inter[graph->nChannels*2] = net->id; @@ -470,7 +470,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo for (int i=0; inodes[NET].count; i++) { if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { - system->nodes[NET].nodes[i].net.width -= speed; + system->nodes[NET].nodes[i].net.bw -= bw; } } net->net.maxChannels--; @@ -490,26 +490,26 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } // Then try the most local GPUs - float maxWidth = 0; + float maxBw = 0; int minHops = 0xfffffff; struct ncclTopoLinkList* paths = net->paths[GPU]; for (int g=0; gnodes[GPU].count; g++) { - if (paths[g].width > maxWidth) { - maxWidth = paths[g].width; + if (paths[g].bw > maxBw) { + maxBw = paths[g].bw; 
minHops = paths[g].count; - } else if (paths[g].width == maxWidth && paths[g].count < minHops) { + } else if (paths[g].bw == maxBw && paths[g].count < minHops) { minHops = paths[g].count; } } - if (maxWidth >= speed) { + if (maxBw >= bw) { // In the first loop, avoid using GPUs in both directions between channels (one channel // sending from that GPU and one channel receiving to that GPU), since that usually leads // to lower BW. for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) { for (int g=0; gnodes[GPU].count; g++) { - if (paths[g].width == maxWidth && paths[g].count == minHops) { + if (paths[g].bw == maxBw && paths[g].count == minHops) { gpu = system->nodes[GPU].nodes+g; - int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1; + int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1; if (tryGpuBidir == gpuUsed) { NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); } @@ -523,7 +523,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo for (int i=0; inodes[NET].count; i++) { if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { - system->nodes[NET].nodes[i].net.width += speed; + system->nodes[NET].nodes[i].net.bw += bw; } } } @@ -642,8 +642,8 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern)); NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); - NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->speedIntra)); - NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->speedInter)); + NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra)); + NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter)); if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0; const char* str; NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); @@ -701,8 +701,8 @@ ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTop NCCLCHECK(xmlSetAttrInt(xmlGraph, "pattern", graph->pattern)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "crossnic", graph->crossNic)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels)); - NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->speedIntra)); - NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->speedInter)); + NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->bwIntra)); + NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->bwInter)); NCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter)); const char* str; NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType)); @@ -737,7 +737,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph int ngpus = system->nodes[GPU].count; graph->crossNic = ncclParamCrossNic(); int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0; - graph->speedIntra = graph->speedInter = 0; + graph->bwIntra = graph->bwInter = 0; graph->latencyInter = 0; if (graph->crossNic == 2) graph->crossNic = 0; graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; @@ -767,7 +767,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph struct ncclTopoGraph tmpGraph; memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); - // First try crossnic, then decrease speed and finally increase speedIntra. + // First try crossnic, then decrease bw and finally increase bwIntra. 
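
The search below is easier to follow as a skeleton: walk a decreasing bandwidth ladder until some graph fits, then (in pass 2) try to raise bwIntra again for tree-like patterns. A rough sketch, with a hypothetical ladder and a hypothetical trySearch() standing in for ncclTopoSearchRec and its retries over pattern, crossNic and channel counts:

// Sketch only: ladder values and trySearch() are hypothetical.
float ladder[] = { 48.0f, 24.0f, 18.0f, 12.0f, 6.0f, 3.0f };
int n = sizeof(ladder)/sizeof(ladder[0]), i = 0;
while (i < n-1 && ladder[i] > system->maxBw) i++;  // skip bw the machine cannot reach
for (; i < n; i++) {
  tmpGraph.bwIntra = tmpGraph.bwInter = ladder[i]; // try both at the same bw first
  if (trySearch(system, &tmpGraph) == ncclSuccess && tmpGraph.nChannels > 0)
    break;                                         // first bandwidth that fits wins
}
// Pass 2 (not shown): for trees, step bwIntra back up the ladder while it still fits.
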
int nspeeds = 0; float* speedArray = NULL; if (system->nodes[NET].count == 0) { @@ -779,8 +779,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } int pass = 1; int speedIndex = 0; - while (speedArray[speedIndex] > system->maxWidth && speedIndex < nspeeds-1) speedIndex++; - tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex]; + while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++; + tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT; search: @@ -791,7 +791,7 @@ search: NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time)); #if 0 - printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : ""); + printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : ""); for (int c=0; cnChannels; c++) { printf("%2d : ", c); for (int g=0; gnChannels*graph->speedInter >= system->totalWidth) goto done; + if (graph->nChannels*graph->bwInter >= system->totalBw) goto done; if (pass == 1) { // First pass, we don't have a solution yet ; try other options @@ -846,14 +846,14 @@ search: } tmpGraph.pattern = graph->pattern; - // Decrease speed until we find a solution - if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) { - tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex]; + // Decrease bw until we find a solution + if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) { + tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex]; goto search; } speedIndex = 0; - while (speedArray[speedIndex] > system->maxWidth && speedIndex < nspeeds-1) speedIndex++; - tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex]; + while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++; + tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; } @@ -863,18 +863,18 @@ done: time = -1; memcpy(&tmpGraph, graph, sizeof(tmpGraph)); speedIndex = 0; - while (speedArray[speedIndex] > graph->speedInter && speedIndex < nspeeds-1) speedIndex++; - tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex]; + while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++; + tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; tmpGraph.minChannels = graph->nChannels; pass = 2; } - // 3. See if we can increase speedIntra for trees (2 nodes or collnet) + // 3. 
See if we can increase bwIntra for trees (2 nodes or collnet) if (pass == 2) { if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING && - tmpGraph.speedIntra == graph->speedIntra && tmpGraph.speedIntra < tmpGraph.speedInter*2 && + tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 && speedIndex > 0) { - tmpGraph.speedIntra = speedArray[--speedIndex]; + tmpGraph.bwIntra = speedArray[--speedIndex]; goto search; } time = -1; @@ -885,24 +885,24 @@ done: WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern); for (int i=0; iintra[i] = system->nodes[GPU].nodes[i].gpu.rank; graph->inter[0] = graph->inter[1] = 0; - graph->speedIntra = graph->speedInter = 0.1; + graph->bwIntra = graph->bwInter = 0.1; graph->typeIntra = graph->typeInter = PATH_SYS; graph->nChannels = 1; } - if (graph->speedIntra >= 25.0) { + if (graph->bwIntra >= 25.0) { int dupChannels = std::min(graph->nChannels*2, graph->maxChannels); memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int)); memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int)); - graph->speedIntra /= DIVUP(dupChannels, graph->nChannels); - graph->speedInter /= DIVUP(dupChannels, graph->nChannels); + graph->bwIntra /= DIVUP(dupChannels, graph->nChannels); + graph->bwInter /= DIVUP(dupChannels, graph->nChannels); graph->nChannels = dupChannels; } return ncclSuccess; } ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { - INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels); + INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, bw %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->bwIntra, graph->bwInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels); int ngpus = system->nodes[GPU].count; char line[1024]; diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 2730bf9..9e4c978 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -56,24 +56,24 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode* return ncclSuccess; } -int interCpuWidth = 0; -int cpuPciWidth = 0; +int interCpuBw = 0; +int cpuPciBw = 0; -static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* width) { - *width = LOC_WIDTH; +static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) { + *bw = LOC_BW; if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) { - *width = P9_WIDTH; + *bw = P9_BW; return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_ARM) { - *width = ARM_WIDTH; + *bw = ARM_BW; return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { - *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH; + *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { - *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_WIDTH : ZPI_WIDTH; + *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? 
YONGFENG_ZPI_BW : ZPI_BW; } return ncclSuccess; } @@ -109,7 +109,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo n->nlinks=1; n->links[0].type = LINK_LOC; n->links[0].remNode = n; - n->links[0].width = LOC_WIDTH; + n->links[0].bw = LOC_BW; n->gpu.dev = NCCL_TOPO_UNDEF; n->gpu.rank = NCCL_TOPO_UNDEF; n->gpu.cudaCompCap = NCCL_TOPO_UNDEF; @@ -120,7 +120,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo } else if (type == NET) { n->net.asic = 0ULL; n->net.port = NCCL_TOPO_UNDEF; - n->net.width = 0.0; + n->net.bw = 0.0; n->net.latency = 0.0; } *node = n; @@ -150,8 +150,8 @@ ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int ind return ncclSuccess; } -ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) { - // Aggregate links into higher width for NVLink +ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw) { + // Aggregate links into higher bw for NVLink struct ncclTopoLink* link; for (link = node->links; link->remNode; link++) { if (link->remNode == remNode && link->type == type) break; @@ -159,13 +159,13 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode if (link->remNode == NULL) node->nlinks++; link->type = type; link->remNode = remNode; - link->width += width; + link->bw += bw; // Sort links in BW descending order struct ncclTopoLink linkSave; memcpy(&linkSave, link, sizeof(struct ncclTopoLink)); while (link != node->links) { - if ((link-1)->width >= linkSave.width) break; + if ((link-1)->bw >= linkSave.bw) break; memcpy(link, link-1, sizeof(struct ncclTopoLink)); link--; } @@ -237,9 +237,9 @@ ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) { for (int n=0; nnodes[CPU].count; n++) { for (int p=0; pnodes[CPU].count; p++) { if (n == p) continue; - float width; - NCCLCHECK(ncclTopoGetInterCpuWidth(system->nodes[CPU].nodes+n, &width)); - NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, width)); + float bw; + NCCLCHECK(ncclTopoGetInterCpuBw(system->nodes[CPU].nodes+n, &bw)); + NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, bw)); } } return ncclSuccess; @@ -262,13 +262,13 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN struct ncclTopoLink* link = node->links+l; if (link->type == LINK_LOC) continue; if (link->type != LINK_PCI || link->remNode != prevNode) { - sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->width); + sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw); int nextOffset = strlen(line); if (link->type == LINK_PCI) { NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset)); } else { if (link->remNode->type == NET) { - sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.width); + sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw); } else { sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id); } @@ -280,7 +280,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN } ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) { - 
INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f totalWidth %2.1f ===", s->maxWidth, s->totalWidth); + INFO(NCCL_GRAPH, "=== System : maxBw %2.1f totalBw %2.1f ===", s->maxBw, s->totalBw); char line[1024]; for (int n=0; nnodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0)); INFO(NCCL_GRAPH, "=========================================="); @@ -335,7 +335,7 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s int mbps; NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0)); if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1 - net->net.width = mbps / 8000.0; + net->net.bw = mbps / 8000.0; if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0; NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0)); @@ -343,8 +343,8 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0)); ncclDebugNoWarn = 0; - NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width)); - NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width)); + NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.bw)); + NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.bw)); return ncclSuccess; } @@ -480,8 +480,8 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0)); if (nic == NULL) { NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0)); - NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH)); - NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH)); + NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW)); + NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW)); } NCCLCHECK(ncclTopoAddNic(node, system, nic)); } @@ -524,10 +524,10 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* } } if (remote) { - float nvlSpeed = ncclTopoNVLinkSpeed(gpu->gpu.cudaCompCap); - NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed)); + float nvlBw = ncclTopoNVLinkBw(gpu->gpu.cudaCompCap); + NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlBw)); if (remote->type != GPU) { - NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed)); + NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlBw)); } } } else { @@ -683,18 +683,18 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* i int g; NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); int minType = PATH_SYS; - float maxWidth = 0; + float maxBw = 0; int count = 0; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); for (int n=0; nnodes[NET].count; n++) { struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g; - if (path->width > maxWidth || (path->width == maxWidth && path->type < minType)) { - maxWidth = path->width; + if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) { + maxBw = path->bw; minType = path->type; count = 0; } - if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; + if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; } if (count == 0) { *id = -1; @@ -790,6 +790,11 @@ ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) { return ncclSuccess; } +ncclResult_t 
ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count) { + *count = system->nodes[NVS].count; + return ncclSuccess; +} + ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) { if (system->nodes[GPU].count == 0) return ncclInternalError; int min, max; diff --git a/src/graph/topo.h b/src/graph/topo.h index b24a72b..20a3e9d 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -10,23 +10,23 @@ #include "graph.h" #include "core.h" -#define LOC_WIDTH 5000.0 -#define SM60_NVLINK_WIDTH 18.0 -#define SM70_NVLINK_WIDTH 22.0 -#define SM80_NVLINK_WIDTH 22.0 -#define SM86_NVLINK_WIDTH 12.0 -#define PCI_WIDTH 12.0 // PCI Gen3 x16 -#define QPI_WIDTH 6.0 -#define SKL_QPI_WIDTH 9.0 -#define ZPI_WIDTH 6.0 -#define YONGFENG_ZPI_WIDTH 9.0 -#define P9_WIDTH 32.0 -#define ARM_WIDTH 6.0 -#define NET_WIDTH 12.0 // 100Gbit +#define LOC_BW 5000.0 +#define SM60_NVLINK_BW 18.0 +#define SM70_NVLINK_BW 22.0 +#define SM80_NVLINK_BW 22.0 +#define SM86_NVLINK_BW 12.0 +#define PCI_BW 12.0 // PCI Gen3 x16 +#define QPI_BW 6.0 +#define SKL_QPI_BW 9.0 +#define ZPI_BW 6.0 +#define YONGFENG_ZPI_BW 9.0 +#define P9_BW 32.0 +#define ARM_BW 6.0 +#define NET_BW 12.0 // 100Gbit // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU // to GPU traffic consumes more PCI bandwidth. -#define INTEL_P2P_OVERHEAD(speed) (speed*6/5) +#define INTEL_P2P_OVERHEAD(bw) (bw*6/5) #define NCCL_TOPO_NODE_TYPES 7 #define GPU 0 @@ -78,7 +78,7 @@ extern const char* topoPathTypeStr[]; struct ncclTopoNode; struct ncclTopoLink { int type; - float width; + float bw; struct ncclTopoNode* remNode; }; #define NCCL_TOPO_MAX_LINKS 32 @@ -87,7 +87,7 @@ struct ncclTopoLink { struct ncclTopoLinkList { struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS]; int count; - float width; + float bw; int type; }; @@ -110,7 +110,7 @@ struct ncclTopoNode { struct { uint64_t asic; int port; - float width; + float bw; float latency; int gdrSupport; int collSupport; @@ -141,14 +141,14 @@ struct ncclTopoNodeSet { struct ncclTopoSystem { struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; - float maxWidth; - float totalWidth; + float maxBw; + float totalBw; }; ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id); ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id); ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id); -ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width); +ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw); ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system); ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank); @@ -192,13 +192,13 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in return ncclInternalError; } -// Returns NVLink speed in GB/s -static float ncclTopoNVLinkSpeed(int cudaCompCap) { +// Returns NVLink bw in GB/s +static float ncclTopoNVLinkBw(int cudaCompCap) { return - cudaCompCap == 86 ? SM86_NVLINK_WIDTH : - cudaCompCap >= 80 ? SM80_NVLINK_WIDTH : - cudaCompCap >= 70 ? SM70_NVLINK_WIDTH : - cudaCompCap >= 60 ? SM60_NVLINK_WIDTH : - SM80_NVLINK_WIDTH; + cudaCompCap == 86 ? SM86_NVLINK_BW : + cudaCompCap >= 80 ? SM80_NVLINK_BW : + cudaCompCap >= 70 ? 
SM70_NVLINK_BW : + cudaCompCap >= 60 ? SM60_NVLINK_BW : + SM80_NVLINK_BW; } #endif diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index b07ca38..bc5e969 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -53,7 +53,7 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li // Latencies in us, Bandwidths in GB/s // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } -static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 } }; +static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 }, { 4.4, 4.4, 0 }}; // NVLink, PCI, Network #define NCCL_HW_NVLINK 0 @@ -62,11 +62,14 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, // Tree/Simple is the latency a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network). static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { .52, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 }, /* CollNet (LL/LL128/Simple)*/ { .5, 1.2, 8.0 } }, + { /* Tree (LL/LL128/Simple)*/ { .52, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 }, + /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 } }, /* PCI */ - { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 1.0, 1.9, 8.0 } }, + { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, + /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 28 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } } + { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 }, + /* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 10.7 } } }; // LL128 max BW per channel @@ -75,15 +78,16 @@ static const double llMaxBws[2][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, static const double perChMaxTreeBws[2][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8} }; ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) { - int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS; + int simpleDefaultThreads = (ringGraph->bwIntra*ringGraph->nChannels <= PCI_BW) ? 
256 : NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads); comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); - comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS; - comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] = + comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = + comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS; + comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS); - comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] = + comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS); int nNodes = comm->nNodes; @@ -102,7 +106,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount - struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph }; + struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph }; int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; for (int a=0; atypeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI; for (int a=0; aspeedIntra : graphs[a]->speedInter; - float busBw = graphs[a]->nChannels * speed; + int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0; + float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter; + float busBw = graphs[a]->nChannels * bw; // Various model refinements if (compCap80) busBw = std::min(busBw, 235.0f); @@ -129,7 +134,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 
7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh*graphs[a]->nChannels); - if (a == NCCL_ALGO_COLLNET && p != NCCL_PROTO_SIMPLE) busBw = 0; // Oneshot CollNet only supports Simple + if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used + if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used + if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) { + // Collnet+Direct requires all GPUs to have a local NIC to work at full speed + float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio + factor -= (factor-1)/2; + busBw /= factor; + } + if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75; // Convert bus BW to algorithm BW float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps; @@ -155,9 +168,11 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom } else if (a == NCCL_ALGO_TREE) { comm->latencies[coll][a][p] += 2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat); - } else { + } else if (a == NCCL_ALGO_COLLNET_DIRECT) { comm->latencies[coll][a][p] += 2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat; // Add 0.5 arity serialization latency + } else if (a == NCCL_ALGO_COLLNET_CHAIN) { + comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat; } } } @@ -166,7 +181,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Protocols/Algorithms enable/disable, and user overrides. // All are enabled except ll128 which is enabled by default only in certain cases. int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; - int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1 }; + int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1 }; const char *protoStr = getenv("NCCL_PROTO"); if (protoStr) { @@ -180,12 +195,18 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom } // Disable CollNet if it is not supported if (comm->collNetSupport == 0) { - algoEnable[NCCL_ALGO_COLLNET] = 0; + algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; + algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0; // If user has hard set NCCL_ALGO=COLLNET, ignore it if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) { algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1; if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET"); } + } else { + // Disable CollNet+Direct if not on an NVSwitch system + int nvsCount = 0; + NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount)); + if (nvsCount == 0) algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; } for (int c=0; cthreadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD; } comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks; - comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = 512; + comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = 512; + comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = 512; // Override defaults with user env char* str = getenv("NCCL_THREAD_THRESHOLDS"); if (str) { INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str); - ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2}}; + ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }}; sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2); for (int a=0; athreadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL], 
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128], comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], - comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL], - comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128], - comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE]); + comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE], + comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE]); return ncclSuccess; } diff --git a/src/group.cc b/src/group.cc index d9bc684..590068d 100644 --- a/src/group.cc +++ b/src/group.cc @@ -9,31 +9,52 @@ #include "enqueue.h" #include "transport.h" #include "channel.h" +#include __thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting __thread ncclResult_t ncclGroupError = ncclSuccess; __thread struct ncclComm* ncclGroupCommHead = nullptr; __thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr; __thread struct ncclIntruQueue ncclAsyncJobs; +__thread struct ncclGroupJob *ncclGroupJobMainPtr = NULL; +__thread struct ncclGroupJob ncclGroupJobMain; +__thread int ncclGroupBlocking = -1; /* default mode */ +__thread bool ncclGroupJobAbortFlag = false; + +void* ncclAsyncJobMain(void* arg); +static ncclResult_t groupJobComplete(struct ncclGroupJob *job); ncclResult_t ncclAsyncLaunch( struct ncclAsyncJob* job, ncclResult_t(*func)(struct ncclAsyncJob*), void(*undo)(struct ncclAsyncJob*), - void(*destructor)(void*) + void(*destructor)(void*), ncclComm_t comm ) { - if (0 == ncclGroupDepth) { - ncclResult_t res = func(job); - if (res != ncclSuccess && undo) undo(job); + ncclResult_t ret = ncclSuccess; + + if (ncclGroupDepth == 0) { + ret = func(job); + if (ret != ncclSuccess && undo) undo(job); if (destructor) destructor(job); - return res; } else { job->func = func; job->undo = undo; job->destructor = destructor; + job->abortFlag = comm->abortFlag; + job->state = ncclGroupJobRunning; + job->comm = comm; + /* check if there are blocking and nonblocking comms at the same time in group. */ + if (ncclGroupBlocking == -1) { + /* first met communicator */ + ncclGroupBlocking = comm->blocking; + } else if (ncclGroupBlocking != comm->blocking) { + WARN("Blocking and nonblocking communicators are not allowed in the same group."); + ret = ncclInvalidArgument; + } ncclIntruQueueEnqueue(&ncclAsyncJobs, job); - return ncclSuccess; } + + return ret; } void* ncclAsyncJobMain(void* arg) { @@ -42,23 +63,50 @@ void* ncclAsyncJobMain(void* arg) { if (job->result != ncclSuccess) { INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, job->result); } + __atomic_store_n(&job->state, ncclGroupJobDone, __ATOMIC_RELEASE); return arg; } +ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) { + ncclResult_t ret; + SYSCHECK(pthread_join(job->thread, NULL), "pthread_join"); + if (job->result != ncclSuccess) { + WARN("ncclAsyncJobComplete: job %p failed, job error %d", job, job->result); + } + ret = job->result; + if (job->destructor) job->destructor((void*)job); + return ret; +} + NCCL_API(ncclResult_t, ncclGroupStart); ncclResult_t ncclGroupStart() { + ncclResult_t ret = ncclSuccess; NVTX3_FUNC_RANGE_IN(nccl_domain); + + /* if previous group launch does not complete, don't launch this one. 
*/ + if (ncclGroupJobMainPtr != NULL) { + if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) { + ret = ncclInvalidUsage; + goto exit; + } else { + NCCLCHECKGOTO(groupJobComplete(ncclGroupJobMainPtr), ret, exit); + } + } NCCLCHECK(ncclGroupStartInternal()); TRACE_CALL("ncclGroupStart()"); - return ncclSuccess; + +exit: + return ret; } NCCL_API(ncclResult_t, ncclGroupEnd); ncclResult_t ncclGroupEnd() { + ncclResult_t ret = ncclSuccess; NVTX3_FUNC_RANGE_IN(nccl_domain); - NCCLCHECK(ncclGroupEndInternal()); + NCCLCHECKGOTO(ncclGroupEndInternal(), ret, exit); TRACE_CALL("ncclGroupEnd()"); - return ncclSuccess; +exit: + return ret; } struct ncclPreconnectJob { @@ -143,31 +191,103 @@ failure: return result; } -ncclResult_t ncclGroupEndInternal() { - if (ncclGroupDepth == 0) { - WARN("ncclGroupEnd: not in a group call."); - return ncclInvalidUsage; +static inline void groupResetJobState() { + ncclGroupBlocking = -1; + ncclGroupJobMainPtr = NULL; + memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob)); + return; +} + +static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue* asyncJobsPtr, ncclResult_t* groupErrorPtr, ncclResult_t error) { + struct ncclComm* comm = *groupCommHeadPtr; + + while (comm != nullptr) { + struct ncclComm* next = comm->groupNext; + (void) ncclGroupCommLeave(comm); // overwrites comm->groupNext + // We don't know if preconnect succeeded or happened at all, so clear + // the flags that let `taskAppend()` skip over checking if preconnect + // is needed. + comm->preconnectNext = reinterpret_cast(0x1); + for (int i = 0; i < comm->nRanks; i++) { + comm->tasks.peers[i].sendSeen = false; + comm->tasks.peers[i].recvSeen = false; + comm->connectSend[i] = 0; + comm->connectRecv[i] = 0; + } + comm->unlaunchedPlansHead = nullptr; + // Reclaim abandoned kernel plan memory. Note ncclWork structs were already + // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. + while (!ncclIntruQueueEmpty(&comm->planQueue)) { + struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue); + // Persistent plans will be reclaimed via the callbackQueue when the + // graph drops its UserObject reference. + if (!plan->persistent) { + for (int c = 0; c < MAXCHANNELS; c++) { + while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) { + struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue); + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); + } + } + ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); + } + } + // Reset comm->tasks to empty. 
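
For reference while reading the nonblocking plumbing in this file: seen from the application, ncclGroupEnd() on a group containing nonblocking communicators returns ncclInProgress, and completion is observed by polling ncclCommGetAsyncError. A minimal sketch, assuming comm was created with ncclConfig_t::blocking = 0 and that buffers, count and stream are set up elsewhere:

// Hypothetical caller; error handling elided for brevity.
ncclGroupStart();
ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum, comm, stream);
ncclResult_t ret = ncclGroupEnd();            // returns ncclInProgress for nonblocking comms
if (ret == ncclInProgress) {
  ncclResult_t state = ncclInProgress;
  do { ncclCommGetAsyncError(comm, &state); } while (state == ncclInProgress);
  ret = state;                                // final status of the whole group
}
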
+ comm->tasks.nTasksColl = 0; + comm->tasks.nTasksP2p = 0; + comm->tasks.streams = nullptr; + ncclIntruQueueConstruct(&comm->tasks.collQueue); + comm->tasks.collBytesTotal = 0; + for (int i = 0; i < comm->nRanks; i++) { + ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue); + ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue); + } + + if (!comm->blocking) + (void) ncclCommSetAsyncError(comm, error); + comm = next; } - ncclGroupDepth--; - if (ncclGroupDepth > 0) return ncclSuccess; + /* reset everything */ + while (!ncclIntruQueueEmpty(asyncJobsPtr)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr); + *job->abortFlag = 1; + if (job->comm && !job->comm->blocking) + (void) ncclCommSetAsyncError(job->comm, error); + if (job->undo) job->undo(job); + if (job->destructor) job->destructor((void*)job); + } + + *groupErrorPtr = ncclSuccess; + *groupCommHeadPtr = nullptr; + *groupCommPreconnectHeadPtr = nullptr; + return; +} + +static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { int savedDev; - CUDACHECK(cudaGetDevice(&savedDev)); - - ncclResult_t ret = ncclGroupError; + ncclResult_t ret = ncclSuccess; bool jobsDone = false; - if (ret != ncclSuccess) goto failure; + bool errorJobAbortFlag = false; + struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_; + struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr; + struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr; + struct ncclIntruQueue *asyncJobsMain = gjob->asyncJobsPtr; + volatile bool *groupAbortFlag = gjob->abortFlagPtr; - if (ncclGroupCommPreconnectHead != nullptr) { - struct ncclComm* comm = ncclGroupCommPreconnectHead; + CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); + + if (groupCommPreconnectHeadMain != nullptr) { + struct ncclComm* comm = groupCommPreconnectHeadMain; do { struct ncclPreconnectJob* job; - NCCLCHECK(ncclCalloc(&job, 1)); + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); job->base.func = ncclPreconnectFunc; job->base.undo = nullptr; job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; job->comm = comm; - ncclIntruQueueEnqueue(&ncclAsyncJobs, &job->base); + ncclIntruQueueEnqueue(asyncJobsMain, &job->base); struct ncclComm* next = comm->preconnectNext; comm->preconnectNext = reinterpret_cast(0x1); @@ -175,94 +295,154 @@ ncclResult_t ncclGroupEndInternal() { } while (comm != nullptr); } - if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { - struct ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs); + if (!ncclIntruQueueEmpty(asyncJobsMain)) { + struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain); do { - pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job); + SYSCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), ret, fail); job = job->next; } while (job != nullptr); - job = ncclIntruQueueHead(&ncclAsyncJobs); do { - int err = pthread_join(job->thread, nullptr); - if (err != 0) { - WARN("Error waiting for pthread_join : %s", strerror(errno)); - ret = ncclSystemError; - } - if (ret == ncclSuccess && job->result != ncclSuccess) ret = job->result; - job = job->next; - } while (job != nullptr); - - jobsDone = true; - if (ret != ncclSuccess) goto failure; - } - - if (ncclGroupCommHead != nullptr) { - NCCLCHECKGOTO(doLaunches(ncclGroupCommHead), ret, failure); - do { - struct ncclComm* comm = ncclGroupCommHead; - struct ncclComm* next = comm->groupNext; - ncclGroupCommLeave(comm); - ncclGroupCommHead = next; - } while (ncclGroupCommHead != nullptr); - } - - if 
(false) { - failure: - struct ncclComm* comm = ncclGroupCommHead; - while (comm != nullptr) { - struct ncclComm* next = comm->groupNext; - ncclGroupCommLeave(comm); // overwrites comm->groupNext - // We don't know if preconnect succeeded or happened at all, so clear - // the flags that let `taskAppend()` skip over checking if preconnect - // is needed. - comm->preconnectNext = reinterpret_cast(0x1); - for (int i=0; i < comm->nRanks; i++) { - comm->tasks.peers[i].sendSeen = false; - comm->tasks.peers[i].recvSeen = false; - comm->connectSend[i] = 0; - comm->connectRecv[i] = 0; - } - comm->unlaunchedPlansHead = nullptr; - // Reclaim abandoned kernel plan memory. Note ncclWork structs were already - // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. - while (!ncclIntruQueueEmpty(&comm->planQueue)) { - struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue); - // Persistent plans will be reclaimed via the callbackQueue when the - // graph drops its UserObject reference. - if (!plan->persistent) { - for (int c=0; c < MAXCHANNELS; c++) { - while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) { - struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue); - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); - } + jobsDone = true; + job = ncclIntruQueueHead(asyncJobsMain); + do { + ncclGroupJobState_t state = __atomic_load_n(&job->state, __ATOMIC_ACQUIRE); + if (state == ncclGroupJobRunning) { + jobsDone = false; + } else if (state == ncclGroupJobDone) { + if (pthread_join(job->thread, nullptr) != 0) { + WARN("Error waiting for pthread_join : %s", strerror(errno)); + ret = ncclSystemError; } - ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); + job->state = ncclGroupJobJoined; + if (job->result != ncclSuccess) { + ret = job->result; + errorJobAbortFlag = true; + } + } else { + /* safety check */ + assert(state == ncclGroupJobJoined); } - } - // Reset comm->tasks to empty. 
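
The join loop added above is a small three-state machine per job: Running until the worker thread stores Done, then joined exactly once and marked Joined, with errors fanned out to the remaining jobs through their abort flags. A standalone sketch of the same idea, with simplified, hypothetical types (Job, pollJoin) in place of ncclAsyncJob and groupLaunch:

#include <pthread.h>
#include <stdint.h>

enum { Running = 0, Done = 1, Joined = 2 };  // mirrors ncclGroupJobRunning/Done/Joined

struct Job {
  pthread_t thread;
  int state;                     // written by the worker with release semantics
  int result;
  volatile uint32_t* abortFlag;
  Job* next;
};

static int pollJoin(Job* head, volatile bool* groupAbort) {
  int ret = 0;
  bool errorAbort = false, jobsDone;
  do {
    jobsDone = true;
    for (Job* j = head; j; j = j->next) {
      int s = __atomic_load_n(&j->state, __ATOMIC_ACQUIRE);
      if (s == Running) jobsDone = false;          // not finished, keep polling
      else if (s == Done) {                        // finished: reap exactly once
        pthread_join(j->thread, nullptr);
        j->state = Joined;
        if (j->result != 0) { ret = j->result; errorAbort = true; }
      }
      if (*groupAbort || errorAbort) *j->abortFlag = 1;  // ask live jobs to bail out
    }
  } while (!jobsDone);
  return ret;
}
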
- comm->tasks.nTasksColl = 0; - comm->tasks.nTasksP2p = 0; - comm->tasks.streams = nullptr; - ncclIntruQueueConstruct(&comm->tasks.collQueue); - comm->tasks.collBytesTotal = 0; - for (int i=0; i < comm->nRanks; i++) { - ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue); - ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue); - } - comm = next; - } + + if (*groupAbortFlag == true || errorJobAbortFlag == true) { + *job->abortFlag = 1; + ret = ncclInternalError; + } + + job = job->next; + } while (job != nullptr); + } while (jobsDone == false); + + if (ret != ncclSuccess) goto fail; } - while (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { - struct ncclAsyncJob* job = ncclIntruQueueDequeue(&ncclAsyncJobs); - if (ret != ncclSuccess && jobsDone && job->undo) job->undo(job); + if (groupCommHeadMain != nullptr) { + NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail); + } + + /* this atomic must happen before cleanup and setting state of communicators */ + __atomic_store_n(&gjob->doneFlag, true, __ATOMIC_RELEASE); + + while (!ncclIntruQueueEmpty(asyncJobsMain)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain); + if (job->comm && !job->comm->blocking) + (void) ncclCommSetAsyncError(job->comm, ret); if (job->destructor) job->destructor((void*)job); } - ncclGroupError = ncclSuccess; - ncclGroupCommHead = nullptr; - ncclGroupCommPreconnectHead = nullptr; - CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too + while (groupCommHeadMain != nullptr) { + struct ncclComm* comm = groupCommHeadMain; + struct ncclComm* next = comm->groupNext; + (void) ncclGroupCommLeave(comm); + if (!comm->blocking) { + (void) ncclCommSetAsyncError(comm, ret); + } + groupCommHeadMain = next; + } + + *gjob->groupErrorPtr = ncclSuccess; + *gjob->groupCommHeadPtr = nullptr; + *gjob->groupCommPreconnectHeadPtr = nullptr; + + CUDACHECK(cudaSetDevice(savedDev)); + +exit: + return ret; +fail: + groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, ret); + goto exit; +} + +ncclResult_t ncclGroupEndInternal() { + ncclResult_t ret = ncclSuccess; + + if (ncclGroupDepth == 0) { + WARN("ncclGroupEnd: not in a group call."); + ret = ncclInvalidUsage; + goto exit; + } + + if ((--ncclGroupDepth) > 0) goto exit; + + if ((ret = ncclGroupError) != ncclSuccess) goto fail; + + if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) { + ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead; + ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead; + ncclGroupJobMain.groupErrorPtr = &ncclGroupError; + ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs; + ncclGroupJobMain.abortFlagPtr = &ncclGroupJobAbortFlag; + ncclGroupJobMain.doneFlag = false; + ncclGroupJobMainPtr = &ncclGroupJobMain; + /* make sure ncclGroupBlocking has been set. 
*/ + assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1); + if (ncclGroupBlocking == 0 && (ncclGroupCommPreconnectHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs))) { + /* nonblocking group */ + if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { + ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs); + do { + NCCLCHECKGOTO(ncclCommSetAsyncError(job->comm, ncclInProgress), ret, fail); + job = job->next; + } while (job); + } + + if (ncclGroupCommHead) { + ncclComm_t comm = ncclGroupCommHead; + do { + NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail); + comm = comm->groupNext; + } while (comm); + } + ncclGroupJobMainPtr->base.func = groupLaunch; + SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail); + ret = ncclInProgress; + } else { + /* blocking group */ + NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail); + groupResetJobState(); + } + } + +exit: + return ret; +fail: + groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, ret); + groupResetJobState(); + goto exit; +} + +static ncclResult_t groupJobComplete(struct ncclGroupJob* job) { + ncclResult_t ret = ncclSuccess; + if (job) { + ret = ncclAsyncJobComplete(&job->base); + groupResetJobState(); + } return ret; } + +void ncclGroupJobAbort() { + ncclGroupJobAbortFlag = true; + (void) groupJobComplete(ncclGroupJobMainPtr); + /* reset group abort flag */ + ncclGroupJobAbortFlag = false; +} diff --git a/src/include/alloc.h b/src/include/alloc.h index 29ec87a..7c20003 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -21,17 +21,15 @@ uint64_t clockNano(); // from utils.h with which we have a circular dependency template ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; - uint64_t time = 0; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - time = clockNano(); CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish); - time = clockNano() - time; memset(*ptr, 0, nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p seconds: cudaHostAlloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return result; } #define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -80,12 +78,11 @@ ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, in cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - uint64_t time = clockNano(); CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); - time = clockNano() - time; finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9); + if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return result; } #define ncclCudaMalloc(...) 
ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -93,23 +90,20 @@ finish: template ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; - uint64_t time0=0, time1=0, time2=0; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); // Need a side stream so as not to interfere with graph capture. cudaStream_t stream; - time0 = clockNano(); CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - time1 = clockNano(); CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); - time2 = clockNano(); CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaStreamCreateWithFlags=%g cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time1-time0)/1.e9, double(time2-time1)/1.e9); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return result; } #define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -117,17 +111,15 @@ finish: template ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; - uint64_t time = 0; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - time = clockNano(); CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); - time = clockNano() - time; CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return result; } #define ncclCudaCallocAsync(...) 
ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) diff --git a/src/include/checks.h b/src/include/checks.h index 715aeb7..048fc06 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -106,7 +106,7 @@ // Propagate errors up #define NCCLCHECK(call) do { \ ncclResult_t res = call; \ - if (res != ncclSuccess) { \ + if (res != ncclSuccess && res != ncclInProgress) { \ /* Print the back trace*/ \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ return res; \ @@ -115,7 +115,7 @@ #define NCCLCHECKGOTO(call, res, label) do { \ res = call; \ - if (res != ncclSuccess) { \ + if (res != ncclSuccess && res != ncclInProgress) { \ /* Print the back trace*/ \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ goto label; \ @@ -125,7 +125,7 @@ #define NCCLWAIT(call, cond, abortFlagPtr) do { \ volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ ncclResult_t res = call; \ - if (res != ncclSuccess) { \ + if (res != ncclSuccess && res != ncclInProgress) { \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ return ncclInternalError; \ } \ @@ -135,7 +135,7 @@ #define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \ volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ res = call; \ - if (res != ncclSuccess) { \ + if (res != ncclSuccess && res != ncclInProgress) { \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ goto label; \ } \ @@ -143,7 +143,7 @@ } while (!(cond)); #define NCCLCHECKTHREAD(a, args) do { \ - if (((args)->ret = (a)) != ncclSuccess) { \ + if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ return args; \ } \ diff --git a/src/include/collectives.h b/src/include/collectives.h index 7f0d0b6..f50a379 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -38,8 +38,9 @@ struct ncclDevRedOpFull { extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ +#define SINGLE_ARG(...) 
__VA_ARGS__ #define CONCAT(a,b) a##b -#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f) +#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(SINGLE_ARG(t), SINGLE_ARG(f)) #define MACRO_IF_0(t, f) f #define MACRO_IF_1(t, f) t @@ -51,7 +52,8 @@ struct ncclDevRedOpFull { #define DECL3(func, devredop, type, undef) \ DECL4(func, RING, devredop, type, undef) \ DECL4(func, TREE, devredop, type, undef) \ - DECL4(func, COLLNET, devredop, type, undef) + DECL4(func, COLLNET_DIRECT, devredop, type, undef) \ + DECL4(func, COLLNET_CHAIN, devredop, type, undef) #if defined(__CUDA_BF16_TYPES_EXIST__) #define DECL2(func, devredop, undefForFloat) \ @@ -117,7 +119,6 @@ extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)(); #define BROADCAST_CHUNKSTEPS 1 #define REDUCE_SLICESTEPS 1 #define REDUCE_CHUNKSTEPS 1 -#define SENDRECV_SLICEFACTOR 4 #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above #endif diff --git a/src/include/comm.h b/src/include/comm.h index ee752fc..2adce32 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -102,7 +102,8 @@ struct ncclChannel { struct ncclRing ring; int* devRingUserRanks; struct ncclTree tree; - struct ncclDirect collTree; + struct ncclTree collnetChain; + struct ncclDirect collnetDirect; int id; // index of this channel uint32_t workFifoSent; // last used work index+1 uint64_t p2pOpCount; @@ -128,6 +129,7 @@ struct ncclKernelPlan { struct ncclKernelPlan* next; bool persistent; // aka captured in a graph + bool kernelSpecialized; void *kernelFn; int channelUbound; // only channels c < channelUbound are present int channelCount; // number of channels present @@ -201,8 +203,12 @@ struct ncclComm { int p2pnChannelsPerPeer; int p2pChannels[MAXCHANNELS]; + // Should this comm allocate LL buffers for network P2P connections? + bool allocP2pNetLLBuffers; + // Buffer sizes int buffSizes[NCCL_NUM_PROTOCOLS]; + int p2pNetChunkSize; // Algorithm/Protocols thresholds ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; @@ -210,8 +216,9 @@ struct ncclComm { float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - // Whether there has been a fatal error in this communicator. - ncclResult_t fatalError; + /* This attribute can indicate the states of communicators and return code of + * asynchronous NCCL operations. */ + ncclResult_t asyncResult; // Flag to ask NCCL kernels to abort volatile uint32_t *abortFlag; @@ -276,12 +283,16 @@ struct ncclComm { struct ncclIntruQueue planQueue; // First of the unlaunched kernels in `planQueue` struct ncclKernelPlan* unlaunchedPlansHead; -}; -// Set to true during an `atexit()` handler. We use this to intentionally leak -// unfreed CUDA resources when cleaning up after return of `main()` to avoid -// CUDA calls after CUDA runtime teardown. -extern bool ncclMainExited; + // communicator mode + int blocking; + // initState is to more conveniently reclaim resources when errors happen. 
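+ // (A minimal usage sketch of the nonblocking mode this state supports,
+ // assuming a communicator created with config.blocking = 0:
+ //   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
+ //   config.blocking = 0;
+ //   ncclCommInitRankConfig(&comm, nranks, id, rank, &config);
+ //   ncclResult_t state;
+ //   do { ncclCommGetAsyncError(comm, &state); } while (state == ncclInProgress);
+ // ncclCommGetAsyncError() reads asyncResult above; initState records how far
+ // init got so commFree() only tears down what was actually built.)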
+ ncclResult_t initState; + // flag to indicate if ncclCommFinalize() is called + bool finalizeCalled; + // shared structures for finalization + int finalizeRankCnt; +}; enum ncclLaunchMode { ncclLaunchModeInvalid=0, @@ -295,13 +306,16 @@ void ncclCommPushCudaFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle); -inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm) { - struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/false); +inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) { + ncclResult_t result = ncclSuccess; + struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome); while (cb != nullptr) { struct ncclCommCallback* next = cb->next; - NCCLCHECK(cb->fn(comm, cb)); // may reclaim memory of cb + ncclResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb + if (res1 != ncclSuccess) result = res1; cb = next; } + NCCLCHECK(result); return ncclSuccess; } @@ -358,4 +372,7 @@ static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) { return op1 < int(ncclNumOps) ? op : ncclRedOp_t(op1); } +ncclResult_t ncclCommEnsureReady(ncclComm_t comm); +ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState); + #endif diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h index eaa5949..2bd3b4d 100644 --- a/src/include/cudawrap.h +++ b/src/include/cudawrap.h @@ -12,9 +12,9 @@ #if CUDART_VERSION >= 11030 #include #else -typedef CUresult (CUDAAPI *PFN_cuInit)(unsigned int Flags); -typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion)(int *driverVersion); -typedef CUresult (CUDAAPI *PFN_cuGetProcAddress)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); +typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion); +typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); #endif #define CUPFN(symbol) pfn_##symbol @@ -60,27 +60,27 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress)(const char *symbol, void **pfn, } \ } while(0) -#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol +#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol #if CUDART_VERSION >= 11030 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ -DECLARE_CUDA_PFN_EXTERN(cuDeviceGet); -DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute); -DECLARE_CUDA_PFN_EXTERN(cuGetErrorString); -DECLARE_CUDA_PFN_EXTERN(cuGetErrorName); -DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange); -DECLARE_CUDA_PFN_EXTERN(cuCtxCreate_v3020); -DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy); -DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent); +DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000); +DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020); +DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020); +DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000); #if CUDA_VERSION >= 11070 -DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support +DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support #endif #endif /* CUDA Driver functions loaded with dlsym() */ 
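/* (The three symbols below must come from dlsym() because they are needed
 * before cuGetProcAddress() itself is usable; every other driver entry point
 * above is then resolved at init time through cuGetProcAddress() with an
 * explicit per-symbol version. A sketch of what the versioned macros expand
 * to, using cuDeviceGet as the example:
 *   extern PFN_cuDeviceGet_v2000 pfn_cuDeviceGet;                          // DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000)
 *   res = pfn_cuGetProcAddress("cuDeviceGet", (void**)&pfn_cuDeviceGet, 2000, 0);  // LOAD_SYM in cudawrap.cc
 * so newer drivers can keep exporting older ABIs under versioned names.) */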
-DECLARE_CUDA_PFN_EXTERN(cuInit); -DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion); -DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress); +DECLARE_CUDA_PFN_EXTERN(cuInit, 2000); +DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020); +DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030); ncclResult_t cudaLibraryInit(void); diff --git a/src/include/devcomm.h b/src/include/devcomm.h index f8b630e..53d6838 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -15,10 +15,11 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; -#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet +#define NCCL_NUM_ALGORITHMS 4 // Tree/Ring/CollNet* #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 -#define NCCL_ALGO_COLLNET 2 +#define NCCL_ALGO_COLLNET_DIRECT 2 +#define NCCL_ALGO_COLLNET_CHAIN 3 extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 @@ -205,7 +206,9 @@ struct ncclWorkElem { static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9"); struct ncclWorkElemP2p { - int32_t peer; + int peer : 30; + int proto : 2; + enum ncclWorkP2PType p2pType; uint8_t nWarps; uint8_t warpStart; @@ -259,7 +262,8 @@ struct alignas(16) ncclDevChannel { struct ncclDevChannelPeer *peers; struct ncclRing ring; struct ncclTree tree; - struct ncclDirect collTree; + struct ncclTree collnetChain; + struct ncclDirect collnetDirect; uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed }; diff --git a/src/include/graph.h b/src/include/graph.h index 1997f76..63b05b1 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -33,6 +33,7 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); +ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net); int ncclPxnDisable(struct ncclComm* comm); ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank); @@ -51,6 +52,7 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #define NCCL_TOPO_CPU_TYPE_YONGFENG 1 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); +ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id); #define NCCL_TOPO_MAX_NODES 256 @@ -72,8 +74,8 @@ struct ncclTopoGraph { int maxChannels; // Output int nChannels; - float speedIntra; - float speedInter; + float bwIntra; + float bwInter; float latencyInter; int typeIntra; int typeInter; diff --git a/src/include/group.h b/src/include/group.h index e6f31b1..e4b2ee0 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -13,12 +13,19 @@ ncclResult_t ncclGroupErrCheck(ncclResult_t ret); void 
ncclGroupCommJoin(struct ncclComm* comm);
void ncclGroupCommPreconnect(struct ncclComm* comm);
-void ncclGroupCommLeave(struct ncclComm* comm);
+ncclResult_t ncclGroupCommLeave(struct ncclComm* comm);
+void ncclGroupJobAbort();

typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);

+typedef enum ncclGroupJobState {
+ ncclGroupJobRunning = 0,
+ ncclGroupJobDone = 1,
+ ncclGroupJobJoined = 2,
+} ncclGroupJobState_t;
+
 struct ncclAsyncJob {
 struct ncclAsyncJob* next;
 pthread_t thread;
@@ -26,17 +33,31 @@ struct ncclAsyncJob {
 ncclResult_t(*func)(struct ncclAsyncJob*);
 void(*undo)(struct ncclAsyncJob*);
 void(*destructor)(void*);
+ ncclGroupJobState_t state;
+ volatile uint32_t *abortFlag; /* points to the comm's abortFlag */
+ ncclComm_t comm;
 };

 ncclResult_t ncclAsyncLaunch(
 struct ncclAsyncJob* job,
 ncclResult_t(*func)(struct ncclAsyncJob*),
 void(*undo)(struct ncclAsyncJob*),
- void(*destructor)(void*)
+ void(*destructor)(void*), ncclComm_t comm
 );

+struct ncclGroupJob {
+ struct ncclAsyncJob base;
+ struct ncclComm **groupCommHeadPtr;
+ struct ncclComm **groupCommPreconnectHeadPtr;
+ ncclResult_t *groupErrorPtr;
+ volatile bool *abortFlagPtr;
+ struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
+ bool doneFlag;
+};
+
 ncclResult_t ncclGroupStartInternal();
 ncclResult_t ncclGroupEndInternal();
+ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job);

////////////////////////////////////////////////////////////////////////////////

@@ -44,6 +65,7 @@ extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
 extern __thread ncclResult_t ncclGroupError;
 extern __thread struct ncclComm* ncclGroupCommHead;
 extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
+extern __thread int ncclGroupBlocking;

 inline ncclResult_t ncclGroupStartInternal() {
 ncclGroupDepth++;
@@ -52,7 +74,7 @@ inline ncclResult_t ncclGroupStartInternal() {
 inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
 if (ncclGroupDepth > 0) {
- if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret;
+ if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret;
 }
 return ret;
}
@@ -72,6 +94,8 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
 // this comm is allocated there.
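 // (Context: at the end of this function the comm also records its blocking
 // mode in the thread-local ncclGroupBlocking, which ncclGroupEndInternal()
 // consults to pick a blocking launch or a background groupLaunch thread.
 // A minimal sketch of the nonblocking pattern this enables, assuming every
 // comm in the group was created with config.blocking = 0:
 //   ncclGroupStart();
 //   for (int i = 0; i < n; i++)
 //     ncclAllReduce(sendbuf[i], recvbuf[i], count, ncclFloat, ncclSum, comms[i], streams[i]);
 //   if (ncclGroupEnd() == ncclInProgress) {
 //     ncclResult_t st;
 //     do { ncclCommGetAsyncError(comms[0], &st); } while (st == ncclInProgress);
 //   }
 // Polling just comms[0] is a simplification; each comm can be polled.)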
ncclMemoryStackPush(&comm->memScoped); } + + ncclGroupBlocking = comm->blocking; } // Add comm to this thread's group needing preconnect @@ -83,9 +107,10 @@ inline void ncclGroupCommPreconnect(struct ncclComm* comm) { } // Comm has left group -inline void ncclGroupCommLeave(struct ncclComm* comm) { +inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) { comm->groupNext = reinterpret_cast(0x1); ncclMemoryStackPop(&comm->memScoped); + return ncclSuccess; } #endif diff --git a/src/include/info.h b/src/include/info.h index b511728..a770c32 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -22,7 +22,8 @@ typedef enum : uint8_t { ncclPatternTreeUp, ncclPatternTreeDown, ncclPatternTreeUpDown, - ncclPatternCollTreeUpDown, + ncclPatternCollnetChain, + ncclPatternCollnetDirect, ncclPatternSend, ncclPatternRecv } ncclPattern_t; diff --git a/src/include/proxy.h b/src/include/proxy.h index dcab5e2..fa8f388 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -164,6 +164,7 @@ struct ncclProxyState { struct ncclSocket* listenSock; int stop; CUcontext cudaCtx; + int safeAbortFlag; // Used by main thread union ncclSocketAddress* peerAddresses; @@ -183,6 +184,7 @@ struct ncclProxyConnection { struct ncclProxyArgs *proxyAppend; struct ncclProxyArgs **proxyAppendPtr; void* transportResources; + bool initFlag; }; typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); diff --git a/src/init.cc b/src/init.cc index 9269708..25c8d5d 100644 --- a/src/init.cc +++ b/src/init.cc @@ -35,12 +35,13 @@ #endif const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" }; -const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" }; +const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); +NCCL_PARAM(CommBlocking, "COMM_BLOCKING", 0); static uint64_t hashUniqueId(ncclUniqueId const &id) { char const *bytes = (char const*)&id; @@ -73,17 +74,10 @@ pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; static bool initialized = false; static size_t maxLocalSizeBytes = 0; -bool ncclMainExited = false; - -static void atexitHandler() { - ncclMainExited = true; -} - static ncclResult_t ncclInit() { if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess; pthread_mutex_lock(&initLock); if (!initialized) { - atexit(atexitHandler); initEnv(); initGdrCopy(); maxLocalSizeBytes = ncclKernMaxLocalSize(); @@ -178,46 +172,11 @@ void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle) { comm->destructorHead = dtor; } -void commZombieCleanup(struct ncclComm* comm) { - ncclMemoryStackDestruct(&comm->memScoped); - ncclMemoryStackDestruct(&comm->memPermanent); - - struct ncclComm* intraComm0 = comm->intraComm0; - if (0 == ncclAtomicRefCountDecrement(&intraComm0->intraRefs)) { - // Wait for all service threads to be done. 
We could not
- // do it earlier because it could have blocked and prevented
- // other ranks in the process to call ncclCommDestroy
- comm = intraComm0;
- while (comm != nullptr) {
- if (comm->proxyState.thread) pthread_join(comm->proxyState.thread, nullptr);
- struct ncclComm* next = comm->intraNext;
- free(comm);
- comm = next;
- }
- }
-}
-
-static void* commZombieMain(void* arg) {
- ncclResult_t result = ncclSuccess;
- struct ncclComm* comm = (struct ncclComm*)arg;
- while (comm->persistentRefs != 0) {
- struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/true);
- while (cb != nullptr) {
- struct ncclCommCallback* next = cb->next;
- NCCLCHECKGOTO(cb->fn(comm, cb), result, ignore); // may reclaim memory of cb
- ignore:
- cb = next;
- }
- }
- commZombieCleanup(comm);
- return arg;
-}
-
 static ncclResult_t commFree(ncclComm_t comm) {
 if (comm == NULL) return ncclSuccess;
- // First stop all threads before we free anything.
+ // Stop all threads before we free anything.
 NCCLCHECK(ncclProxyDestroy(comm));
 delete[] comm->userRedOps;
@@ -226,9 +185,12 @@ static ncclResult_t commFree(ncclComm_t comm) {
 free(comm->connectSend);
 free(comm->connectRecv);
 free(comm->peerInfo);
- ncclTopoFree(comm->topo);
- for (int n=0; n<comm->nNodes; n++) free(comm->nodeRanks[n].localRankToRank);
- free(comm->nodeRanks);
+ if (comm->topo)
+ ncclTopoFree(comm->topo);
+ if (comm->nodeRanks) {
+ for (int n=0; n<comm->nNodes; n++) free(comm->nodeRanks[n].localRankToRank);
+ free(comm->nodeRanks);
+ }
 free(comm->rankToNode);
 free(comm->rankToLocalRank);
@@ -238,10 +200,10 @@ static ncclResult_t commFree(ncclComm_t comm) {
 for (int channel=0; channel<MAXCHANNELS; channel++) NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
- NCCLCHECK(ncclStrongStreamDestruct(&comm->hostStream));
- NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
-
- NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
+ if (comm->initState == ncclSuccess) {
+ NCCLCHECK(ncclStrongStreamDestruct(&comm->hostStream));
+ NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
+ }
 struct ncclDestructor* dtor = comm->destructorHead;
 while (dtor != nullptr) {
@@ -249,16 +211,34 @@ static ncclResult_t commFree(ncclComm_t comm) {
 dtor = dtor->next;
 }
+ ncclMemoryStackDestruct(&comm->memScoped);
+ ncclMemoryStackDestruct(&comm->memPermanent);
+
 commPoison(comm); // Important that this does not interfere with anything used below.
- if (comm->persistentRefs == 0) {
- commZombieCleanup(comm);
+ if (comm->initState == ncclSuccess) {
+ struct ncclComm* intraComm0 = comm->intraComm0;
+ if (0 == ncclAtomicRefCountDecrement(&intraComm0->intraRefs)) {
+ // Wait for all service threads to be done. We could not
+ // do it earlier because it could have blocked and prevented
+ // other ranks in the process from calling ncclCommDestroy
+ comm = intraComm0;
+ while (comm != nullptr) {
+ if (comm->proxyState.thread) pthread_join(comm->proxyState.thread, nullptr);
+ struct ncclComm* next = comm->intraNext;
+ free(comm);
+ comm = next;
+ }
+ }
+ } else if (comm->proxyState.thread) {
+ pthread_join(comm->proxyState.thread, nullptr);
+ ncclCudaHostFree((void *)comm->abortFlag);
+ free(comm);
 } else {
- // Spawn a thread to listen for remaining messages from graph cleanup.
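 // (What replaces the zombie thread: commDestroySync() now waits on the
 // calling thread until every captured CUDA graph has released the
 // communicator, roughly:
 //   while (comm->persistentRefs != 0)
 //     NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/true));
 // with ncclCommFinalize()/commReclaim() deciding when that wait runs.)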
- pthread_t zombie;
- pthread_create(&zombie, nullptr, commZombieMain, comm);
- pthread_detach(zombie);
+ ncclCudaHostFree((void *)comm->abortFlag);
+ free(comm);
 }
+
 return ncclSuccess;
}
@@ -290,6 +270,26 @@ static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
 return ncclInternalError;
}

+ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
+ /* comm must be ready, or an error will be reported */
+ ncclResult_t ret = ncclSuccess;
+
+ if (*comm->abortFlag) {
+ ncclGroupJobAbort();
+ } else {
+ NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
+ if (ret != ncclSuccess) {
+ /* ncclInProgress means the comm is still initializing; report that early
+ * use as ncclInvalidArgument. Any other error is kept as is. */
+ WARN("Attempt to use communicator before the previous operation returned ncclSuccess");
+ if (ret == ncclInProgress) ret = ncclInvalidArgument;
+ goto exit;
+ }
+ }
+
+exit:
+ return ret;
+}
+
 static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 if (ndev < 1) {
 WARN("invalid device count (%d) requested", ndev);
@@ -301,7 +301,19 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 }
 struct ncclComm* comm;
- NCCLCHECK(ncclCalloc(&comm, 1));
+ /* Currently we calloc comm in ncclCommInitRankDev for async function support.
+ * This 'if' structure handles the case where commAlloc is called from paths
+ * other than ncclCommInitRankDev. */
+ if (*comret == NULL) {
+ /* user requests a new communicator */
+ NCCLCHECK(ncclCalloc(&comm, 1));
+ NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1));
+ NCCLCHECK(ncclCommSetAsyncError(comm, ncclInProgress));
+ } else {
+ /* We already allocated a communicator in ncclCommInitRankDev. */
+ comm = *comret;
+ }
+
 ncclMemoryStackConstruct(&comm->memPermanent);
 ncclMemoryStackConstruct(&comm->memScoped);
 comm->destructorHead = nullptr;
@@ -322,10 +334,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
 comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
- comm->fatalError = ncclSuccess;
-
- NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1));
- *comm->abortFlag = 0;
 comm->collNetSupport = 0;
@@ -397,7 +405,8 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
 tmpCommAndChans.channels[c].ring = comm->channels[c].ring;
 tmpCommAndChans.channels[c].ring.userRanks = comm->channels[c].devRingUserRanks;
 tmpCommAndChans.channels[c].tree = comm->channels[c].tree;
- tmpCommAndChans.channels[c].collTree = comm->channels[c].collTree;
+ tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain;
+ tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect;
 tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c];

 if (comm->channels[c].ring.userRanks != nullptr) {
@@ -471,6 +480,8 @@ NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
 NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
 NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
+NCCL_PARAM(P2pNetChunkSize, "P2P_NET_CHUNKSIZE", (1 << 17)); /* 128 kB */
+
 static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
 int cpuArch, cpuVendor, cpuModel;
 NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
@@ -483,12 +494,15 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
 for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
 comm->buffSizes[p] = envs[p] != -2 ?
envs[p] : defaults[p]; } + + comm->p2pNetChunkSize = ncclParamP2pNetChunkSize(); return ncclSuccess; } NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1); +NCCL_PARAM(AllocP2pNetLLBuffers, "NCCL_ALLOC_P2P_NET_LL_BUFFERS", 0); static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) { // We use 2 AllGathers @@ -568,6 +582,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph)); NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph)); + // Initialize num P2P LL buffers for this communicator + comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1; + if (comm->rank == ncclParamGraphDumpFileRank()) { struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph }; NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs)); @@ -590,8 +607,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int pattern; int nChannels; int sameChannels; - float speedIntra; - float speedInter; + float bwIntra; + float bwInter; int typeIntra; int typeInter; }; @@ -611,22 +628,22 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm allGather3Data[rank].tree.pattern = treeGraph.pattern; allGather3Data[rank].tree.nChannels = treeGraph.nChannels; allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; - allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra; - allGather3Data[rank].tree.speedInter = treeGraph.speedInter; + allGather3Data[rank].tree.bwIntra = treeGraph.bwIntra; + allGather3Data[rank].tree.bwInter = treeGraph.bwInter; allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra; allGather3Data[rank].tree.typeInter = treeGraph.typeInter; allGather3Data[rank].ring.pattern = ringGraph.pattern; allGather3Data[rank].ring.nChannels = ringGraph.nChannels; allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels; - allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra; - allGather3Data[rank].ring.speedInter = ringGraph.speedInter; + allGather3Data[rank].ring.bwIntra = ringGraph.bwIntra; + allGather3Data[rank].ring.bwInter = ringGraph.bwInter; allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra; allGather3Data[rank].ring.typeInter = ringGraph.typeInter; allGather3Data[rank].collNet.pattern = collNetGraph.pattern; allGather3Data[rank].collNet.nChannels = collNetGraph.nChannels; allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels; - allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra; - allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter; + allGather3Data[rank].collNet.bwIntra = collNetGraph.bwIntra; + allGather3Data[rank].collNet.bwInter = collNetGraph.bwInter; allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra; allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter; allGather3Data[rank].collNetSupport = comm->collNetSupport; @@ -695,20 +712,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Make sure we align all ranks so that the tuning is consistent across ranks treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels); treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); - treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); - treeGraph.speedInter = 
std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter); + treeGraph.bwIntra = std::min(allGather3Data[i].tree.bwIntra, treeGraph.bwIntra); + treeGraph.bwInter = std::min(allGather3Data[i].tree.bwInter, treeGraph.bwInter); treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter); ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels); ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); - ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); - ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); + ringGraph.bwIntra = std::min(allGather3Data[i].ring.bwIntra, ringGraph.bwIntra); + ringGraph.bwInter = std::min(allGather3Data[i].ring.bwInter, ringGraph.bwInter); ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter); collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels); collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels); - collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra); - collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter); + collNetGraph.bwIntra = std::min(allGather3Data[i].collNet.bwIntra, collNetGraph.bwIntra); + collNetGraph.bwInter = std::min(allGather3Data[i].collNet.bwInter, collNetGraph.bwInter); collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport); @@ -812,16 +829,38 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup); TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); - // Connect intra-node CollNet + char line[1024]; + line[0]='\0'; + for (int c=0; cnChannels; c++) { + struct ncclTree* chain = &comm->channels[c].collnetChain; + snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d", + c, chain->down[0], rank, chain->up); + } + line[1023] = '\0'; + INFO(NCCL_INIT, "Collnet Chains %s", line); + // Connect Collnet + chain + for (int c=0; cnChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, collnet_cleanup); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup); + for (int c=0; cnChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, collnet_cleanup); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup); + INFO(NCCL_INIT, "Connected collnet + chain"); + + // Connect intra-node CollNet + Direct int highestTransportType0, highestTransportType1; for (int c=0; cnChannels; c++) { struct ncclChannel* channelRecv = comm->channels+c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, 
NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, collnet_cleanup); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup); for (int c=0; cnChannels; c++) { struct ncclChannel* channelSend = comm->channels+c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, collnet_cleanup); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup); @@ -1024,9 +1063,15 @@ struct ncclCommInitRankAsyncJob { int cudaDev; }; +struct ncclCommFinalizeAsyncJob { + struct ncclAsyncJob base; + ncclComm_t comm; +}; + static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; ncclComm_t* newcomm = job->newcomm; + ncclComm_t comm = *newcomm; int nranks = job->nranks; ncclUniqueId commId = job->commId; // C++ struct assignment int myrank = job->myrank; @@ -1040,58 +1085,85 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes); CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes)); } - *newcomm = NULL; NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); + // update communicator state + comm->initState = ncclSuccess; + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId); TRACE_CALL("ncclCommInitRank(%p,%d,0x%llx,%d,%d)", *newcomm, nranks, (unsigned long long)hashUniqueId(commId), myrank, (*newcomm)->cudaDev); return ncclSuccess; cleanup: - if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap); - *newcomm = NULL; + comm->initState = res; return res; } +static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { + ncclResult_t ret = ncclSuccess; + + /* first set configuration */ + if (config) { + comm->blocking = config->blocking; + } else { + /* default setting of communicator */ + comm->blocking = 1; + } + + return ret; +} + static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) { struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; ncclCommDestroy(*job->newcomm); *job->newcomm = nullptr; } -static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { +static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config) { ncclResult_t res; + ncclComm_t comm = NULL; + struct ncclCommInitRankAsyncJob *job = NULL; char* env = getenv("NCCL_COMM_ID"); if (env && myrank == 0) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env); - NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, end); + NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, fail); } - NCCLCHECKGOTO(ncclInit(), res, end); + 
NCCLCHECKGOTO(ncclInit(), res, fail); if (myrank == 0) showVersion(); // Make sure the CUDA runtime is initialized. - CUDACHECKGOTO(cudaFree(NULL), res, end); + CUDACHECKGOTO(cudaFree(NULL), res, fail); - NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, end); + NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, fail); if (nranks < 1 || myrank < 0 || myrank >= nranks) { WARN("Invalid rank requested : %d/%d", myrank, nranks); res = ncclInvalidArgument; - goto end; + goto fail; } - struct ncclCommInitRankAsyncJob *job; - NCCLCHECKGOTO(ncclCalloc(&job, 1), res, end); + NCCLCHECKGOTO(ncclCalloc(&comm, 1), res, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1), res, fail); + // set up comm state and abortFlag only + *comm->abortFlag = 0; + NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail); + /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ + comm->initState = ncclInternalError; + *newcomm = comm; + + NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->newcomm = newcomm; job->nranks = nranks; job->commId = commId; // C++ struct assignment job->myrank = myrank; job->cudaDev = cudaDev; - NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, ncclCommInitRankUndo, free), res, end); + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail); -end: +exit: return ncclGroupErrCheck(res); +fail: + goto exit; } NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); @@ -1103,63 +1175,296 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev)); + NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, NULL)); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { NVTX3_FUNC_RANGE_IN(nccl_domain); - + ncclResult_t ret = ncclSuccess; + int totalnDev; + int *gpuFlags = NULL; // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void) cudaLibraryInit(); - NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms")); + NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail); if (ndev < 0) { WARN("Invalid device count requested : %d", ndev); - return ncclInvalidArgument; + ret = ncclInvalidArgument; + goto fail; + } + + CUDACHECKGOTO(cudaGetDeviceCount(&totalnDev), ret, fail); + if (devlist) { + NCCLCHECKGOTO(ncclCalloc(&gpuFlags, totalnDev), ret, fail); + for (int i = 0; i < ndev; ++i) { + /* invalid device check. */ + if (devlist[i] < 0 || devlist[i] >= totalnDev) { + ret = ncclUnhandledCudaError; + goto fail; + } + + /* duplicate device check. 
*/ + if (gpuFlags[devlist[i]] != 0) { + ret = ncclInvalidUsage; + goto fail; + } + + gpuFlags[devlist[i]] = 1; + } + free(gpuFlags); } ncclUniqueId uniqueId; - NCCLCHECK(ncclGetUniqueId(&uniqueId)); - NCCLCHECK(ncclGroupStart()); + NCCLCHECKGOTO(ncclGetUniqueId(&uniqueId), ret, fail); + NCCLCHECKGOTO(ncclGroupStart(), ret, fail); for (int i=0; irank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) { - WARN("comm %p has already been destroyed", comm); +ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) { + if (nextState < 0 || nextState >= ncclNumResults || comm == NULL) { + WARN("ncclCommSetAsyncError: error comm %p sets state %d", comm, nextState); return ncclInvalidArgument; } + __atomic_store_n(&comm->asyncResult, nextState, __ATOMIC_RELEASE); + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config); +ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) { + NVTX3_FUNC_RANGE_IN(nccl_domain); + int cudaDev; + ncclResult_t ret = ncclSuccess; + ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; + ncclConfig_t *internalConfigPtr; + size_t realSize; + int blockingEnv; + + NCCLCHECK(ncclGroupStartInternal()); + internalConfigPtr = &internalConfig; + if (config) { + memcpy((void*)&realSize, (void*)config, sizeof(size_t)); + realSize = realSize > sizeof(ncclConfig_t) ? sizeof(ncclConfig_t) : realSize; + memcpy((void*)internalConfigPtr, (void*)config, realSize); + if (internalConfigPtr->magic != 0xcafebeef) { + WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER"); + ret = ncclInvalidArgument; + goto exit; + } + } + + /* check input config attributes */ + if (internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) { + WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking); + ret = ncclInvalidArgument; + goto exit; + } + + /* overwrite configuration from env variable. 
*/ + blockingEnv = ncclParamCommBlocking(); + if (blockingEnv != 0 && blockingEnv != 1) { + WARN("Invalid NCCL_COMM_BLOCKING value %d", blockingEnv); + } + if (blockingEnv == 1) internalConfigPtr->blocking = blockingEnv; + + (void) cudaLibraryInit(); + CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, exit); + NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail); + +exit: + ncclGroupErrCheck(ret); + NCCLCHECK(ncclGroupEndInternal()); + if (newcomm && *newcomm && !(*newcomm)->blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); + return ret; +fail: + if (newcomm && *newcomm && !(*newcomm)->blocking) (void) ncclCommSetAsyncError(*newcomm, ret); + goto exit; +} + +static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { + struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_; + ncclComm_t comm = job->comm; int savedDevice; CUDACHECK(cudaGetDevice(&savedDevice)); int commDevice = comm->cudaDev; + ncclResult_t ret; + CUDACHECKGOTO(cudaGetDevice(&savedDevice), ret, fail); + if (savedDevice != commDevice) { + CUDACHECKGOTO(cudaSetDevice(commDevice), ret, fail); + } + + TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult); + + if (comm->initState == ncclSuccess) { + NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->deviceStream), ret, fail); + } + NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); + // And keep polling until all graphs referencing us die. + while (comm->persistentRefs != 0) { + NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); + } + + if (savedDevice != commDevice) { + CUDACHECKGOTO(cudaSetDevice(savedDevice), ret, fail); + } + +exit: + return ret; +fail: + goto exit; +} + +static ncclResult_t commCleanup(ncclComm_t comm) { + int savedDevice; + int commDevice = comm->cudaDev; + + CUDACHECK(cudaGetDevice(&savedDevice)); if (savedDevice != commDevice) { CUDACHECK(cudaSetDevice(commDevice)); } - TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, *comm->abortFlag, comm->fatalError); - - NCCLCHECK(ncclStrongStreamSynchronize(&comm->hostStream)); - NCCLCHECK(ncclStrongStreamSynchronize(&comm->deviceStream)); - NCCLCHECK(ncclCommPollCallbacks(comm)); - NCCLCHECK(commFree(comm)); - if (savedDevice != commDevice) + if (savedDevice != commDevice) { CUDACHECK(cudaSetDevice(savedDevice)); + } return ncclSuccess; } +static ncclResult_t commFinalize(ncclComm_t comm, bool userCalled) { + ncclResult_t ret = ncclSuccess; + struct ncclCommFinalizeAsyncJob *job = NULL; + + comm->finalizeCalled = true; + /* launch async thread to finalize comm. */ + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->comm = comm; + + if (userCalled) { + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail); + } else { + NCCLCHECKGOTO(commDestroySync(&job->base), ret, fail); + free(job); + } + +exit: + return ncclGroupErrCheck(ret); +fail: + if (job) free(job); + goto exit; +} + +NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm); +ncclResult_t ncclCommFinalize(ncclComm_t comm) { + NVTX3_FUNC_RANGE_IN(nccl_domain); + ncclResult_t ret = ncclSuccess; + + NCCLCHECK(ncclGroupStartInternal()); + if (comm == NULL) goto exit; + + /* wait comm ready before finalize. */ + NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail); + + /* prevent double finalize. 
*/ + if (comm->finalizeCalled) { + ret = ncclInvalidArgument; + goto fail; + } + + /* finalize comm. */ + ret = commFinalize(comm, true); + +exit: + ncclGroupErrCheck(ret); + NCCLCHECK(ncclGroupEndInternal()); + if (comm && !comm->blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) }; + return ret; +fail: + if (comm && !comm->blocking) (void) ncclCommSetAsyncError(comm, ret); + goto exit; +} + +static ncclResult_t commReclaim(ncclComm_t comm) { + ncclResult_t ret = ncclSuccess; + ncclResult_t state; + int curRank; /* Debug info */ + + NCCLCHECKGOTO(ncclCommGetAsyncError(comm, &state), ret, fail); + TRACE(NCCL_INIT, "commReclaim: reclaim comm %p rank %d state %d", comm, comm->rank, state); + if (state == ncclSuccess && *comm->abortFlag == 0 && comm->finalizeCalled == false) { + /* user does not call ncclCommFinalize and this is a normal comm destroy. ncclCommDestroy + * should be nonblocking until last call of ncclCommDestroy. */ + NCCLCHECKGOTO(commFinalize(comm, false), ret, fail); + } + + if (comm->initState != ncclSuccess) { + /* if init errors happen, no finalize thread should have been launched. Main thread can reclaim + * everything since no NCCL kernel was issued. */ + struct ncclCommFinalizeAsyncJob job; + + job.comm = comm; + curRank = comm->rank; + /* comm aborts, commDestroySync should not be blocked. */ + if ((ret = commDestroySync((struct ncclAsyncJob*) &job)) != ncclSuccess) { + WARN("commReclaim: comm %p (rank = %d) in abort, error %d", comm, curRank, ret); + } + + if ((ret = commCleanup(comm)) != ncclSuccess) { + WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", comm, curRank, ret); + } + } else { + int curRankCnt; + int intraRanks = comm->intraRanks; + ncclComm_t intracomm0 = comm->intraComm0; + int *finalizeRankCnt = &intracomm0->finalizeRankCnt; + + assert(intracomm0 != NULL && finalizeRankCnt != NULL); + curRankCnt = __atomic_add_fetch(finalizeRankCnt, 1, __ATOMIC_ACQ_REL); + if (curRankCnt == intraRanks) { + ncclComm_t curIntraComm; + ncclComm_t nextIntraComm = intracomm0; + + while (nextIntraComm) { + curIntraComm = nextIntraComm; + curRank = curIntraComm->rank; + nextIntraComm = nextIntraComm->intraNext; + + if (comm->finalizeCalled == false) { + struct ncclCommFinalizeAsyncJob job; + job.comm = curIntraComm; + /* every comm aborts, commDestroySync should not be blocked. */ + if ((ret = commDestroySync((struct ncclAsyncJob*) &job)) != ncclSuccess) + WARN("commReclaim: comm %p (rank = %d) in abort, error %d", curIntraComm, curRank, ret); + } + + if ((ret = commCleanup(curIntraComm)) != ncclSuccess) { + WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", curIntraComm, curRank, ret); + } + } + } + } + +exit: + return ret; +fail: + goto exit; +} + NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); ncclResult_t ncclCommDestroy(ncclComm_t comm) { NVTX3_FUNC_RANGE_IN(nccl_domain); @@ -1169,9 +1474,18 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; int64_t busId = comm->busId; TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId); + // Try and prevent a double free of the comm struct (user error) + if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) { + WARN("comm %p has already been destroyed", comm); + return ncclInvalidArgument; + } - NCCLCHECK(commDestroy(comm)); + /* init thread must be joined before we destroy the comm. 
*/ + NCCLCHECK(ncclCommEnsureReady(comm)); + + NCCLCHECK(commReclaim(comm)); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Destroy COMPLETE", comm, rank, nranks, cudaDev, busId); + return ncclSuccess; } @@ -1187,9 +1501,13 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { // Ask anything that might still be running on the device to quit *comm->abortFlag = 1; + /* init thread must be joined before we destroy the comm, + * and we should ignore the init error here. */ + ncclCommEnsureReady(comm); - NCCLCHECK(commDestroy(comm)); + (void) commReclaim(comm); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Abort COMPLETE", comm, rank, nranks, cudaDev, busId); + return ncclSuccess; } @@ -1203,6 +1521,7 @@ const char* ncclGetErrorString(ncclResult_t code) { case ncclInvalidArgument : return "invalid argument"; case ncclInvalidUsage : return "invalid usage"; case ncclRemoteError : return "remote process exited or there was a network error"; + case ncclInProgress : return "NCCL operation in progress"; default : return "unknown result code"; } } @@ -1219,15 +1538,21 @@ NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asy ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm")); NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError")); - *asyncError = comm->fatalError; + + *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCLCHECK(PtrCheck(comm, "CommCount", "comm")); NCCLCHECK(PtrCheck(count, "CommCount", "count")); + + /* init thread must be joined before we access the attributes of comm. 
*/ + NCCLCHECK(ncclCommEnsureReady(comm)); + *count = comm->nRanks; return ncclSuccess; } @@ -1235,8 +1560,12 @@ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm")); NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid")); + + NCCLCHECK(ncclCommEnsureReady(comm)); + *devid = comm->cudaDev; return ncclSuccess; } @@ -1244,8 +1573,12 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm")); NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank")); + + NCCLCHECK(ncclCommEnsureReady(comm)); + *rank = comm->rank; return ncclSuccess; } diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index 43c95c2..52663b5 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -10,32 +10,30 @@ #include -#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr +#define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr #if CUDART_VERSION >= 11030 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ -DECLARE_CUDA_PFN(cuDeviceGet); -DECLARE_CUDA_PFN(cuDeviceGetAttribute); -DECLARE_CUDA_PFN(cuGetErrorString); -DECLARE_CUDA_PFN(cuGetErrorName); +DECLARE_CUDA_PFN(cuDeviceGet, 2000); +DECLARE_CUDA_PFN(cuDeviceGetAttribute, 2000); +DECLARE_CUDA_PFN(cuGetErrorString, 6000); +DECLARE_CUDA_PFN(cuGetErrorName, 6000); /* enqueue.cc */ -DECLARE_CUDA_PFN(cuMemGetAddressRange); +DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020); /* proxy.cc */ -DECLARE_CUDA_PFN(cuCtxCreate_v3020); -DECLARE_CUDA_PFN(cuCtxDestroy); -DECLARE_CUDA_PFN(cuCtxSetCurrent); +DECLARE_CUDA_PFN(cuCtxCreate, 3020); +DECLARE_CUDA_PFN(cuCtxDestroy, 4000); +DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000); #if CUDA_VERSION >= 11070 /* transport/collNet.cc/net.cc*/ -DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support +DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support #endif #endif /* CUDA Driver functions loaded with dlsym() */ -DECLARE_CUDA_PFN(cuInit); -DECLARE_CUDA_PFN(cuDriverGetVersion); -DECLARE_CUDA_PFN(cuGetProcAddress); - -static enum { cudaUninitialized, cudaInitializing, cudaInitialized, cudaError } cudaState = cudaUninitialized; +DECLARE_CUDA_PFN(cuInit, 2000); +DECLARE_CUDA_PFN(cuDriverGetVersion, 2020); +DECLARE_CUDA_PFN(cuGetProcAddress, 11030); #define CUDA_DRIVER_MIN_VERSION 11030 @@ -46,46 +44,37 @@ static int cudaDriverVersion; /* Load the CUDA symbols */ -static int cudaPfnFuncLoader(void) { +static ncclResult_t cudaPfnFuncLoader(void) { CUresult res; -#define LOAD_SYM(symbol, ignore) do { \ - res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), cudaDriverVersion, 0); \ +#define LOAD_SYM(symbol, version, ignore) do { \ + res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), version, 0); \ if (res != 0) { \ if (!ignore) { \ - WARN("Retrieve %s version %d failed with %d", #symbol, cudaDriverVersion, res); \ + WARN("Retrieve %s version %d failed with %d", #symbol, version, res); \ return ncclSystemError; } \ } } while(0) - LOAD_SYM(cuGetErrorString, 0); - LOAD_SYM(cuGetErrorName, 0); - LOAD_SYM(cuDeviceGet, 0); - LOAD_SYM(cuDeviceGetAttribute, 0); - 
LOAD_SYM(cuMemGetAddressRange, 1); - LOAD_SYM(cuCtxCreate_v3020, 1); - LOAD_SYM(cuCtxDestroy, 1); - LOAD_SYM(cuCtxSetCurrent, 1); + LOAD_SYM(cuGetErrorString, 6000, 0); + LOAD_SYM(cuGetErrorName, 6000, 0); + LOAD_SYM(cuDeviceGet, 2000, 0); + LOAD_SYM(cuDeviceGetAttribute, 2000, 0); + LOAD_SYM(cuMemGetAddressRange, 3020, 1); + LOAD_SYM(cuCtxCreate, 3020, 1); + LOAD_SYM(cuCtxDestroy, 4000, 1); + LOAD_SYM(cuCtxSetCurrent, 4000, 1); #if CUDA_VERSION >= 11070 - LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support + LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support #endif return ncclSuccess; } #endif -ncclResult_t cudaLibraryInit(void) { +static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static ncclResult_t initResult; + +static void initOnceFunc() { CUresult res; - - if (cudaState == cudaInitialized) - return ncclSuccess; - if (cudaState == cudaError) - return ncclSystemError; - - if (__sync_bool_compare_and_swap(&cudaState, cudaUninitialized, cudaInitializing) == false) { - // Another thread raced in front of us. Wait for it to be done. - while (cudaState == cudaInitializing) sched_yield(); - return (cudaState == cudaInitialized) ? ncclSuccess : ncclSystemError; - } - /* * Load CUDA driver library */ @@ -106,13 +95,13 @@ ncclResult_t cudaLibraryInit(void) { * Load initial CUDA functions */ - pfn_cuInit = (PFN_cuInit) dlsym(cudaLib, "cuInit"); + pfn_cuInit = (PFN_cuInit_v2000) dlsym(cudaLib, "cuInit"); if (pfn_cuInit == NULL) { WARN("Failed to load CUDA missing symbol cuInit"); goto error; } - pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion) dlsym(cudaLib, "cuDriverGetVersion"); + pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion_v2020) dlsym(cudaLib, "cuDriverGetVersion"); if (pfn_cuDriverGetVersion == NULL) { WARN("Failed to load CUDA missing symbol cuDriverGetVersion"); goto error; @@ -132,7 +121,7 @@ ncclResult_t cudaLibraryInit(void) { goto error; } - pfn_cuGetProcAddress = (PFN_cuGetProcAddress) dlsym(cudaLib, "cuGetProcAddress"); + pfn_cuGetProcAddress = (PFN_cuGetProcAddress_v11030) dlsym(cudaLib, "cuGetProcAddress"); if (pfn_cuGetProcAddress == NULL) { WARN("Failed to load CUDA missing symbol cuGetProcAddress"); goto error; @@ -145,19 +134,21 @@ ncclResult_t cudaLibraryInit(void) { */ pfn_cuInit(0); -#if CUDART_VERSION >= 11030 + #if CUDART_VERSION >= 11030 if (cudaPfnFuncLoader()) { WARN("CUDA some PFN functions not found in the library"); goto error; } -#endif - - cudaState = cudaInitialized; - return ncclSuccess; + #endif + initResult = ncclSuccess; + return; error: - cudaState = cudaError; - return ncclSystemError; + initResult = ncclSystemError; + return; } - +ncclResult_t cudaLibraryInit() { + pthread_once(&initOnceControl, initOnceFunc); + return initResult; +} diff --git a/src/misc/gdrwrap.cc b/src/misc/gdrwrap.cc index e81c7ea..4729efe 100644 --- a/src/misc/gdrwrap.cc +++ b/src/misc/gdrwrap.cc @@ -9,8 +9,6 @@ #ifndef GDR_DIRECT #include "core.h" -static enum { gdrUninitialized, gdrInitializing, gdrInitialized, gdrError } gdrState = gdrUninitialized; - /* Function pointers assigned from dlopen() */ static gdr_t (*gdr_internal_open)(void); static int (*gdr_internal_close)(gdr_t g); @@ -49,18 +47,10 @@ pthread_mutex_t gdrLock = PTHREAD_MUTEX_INITIALIZER; *cast = tmp; \ } while (0) -ncclResult_t wrap_gdr_symbols(void) { - if (gdrState == gdrInitialized) - return ncclSuccess; - if (gdrState == gdrError) - return ncclSystemError; - - if (__sync_bool_compare_and_swap(&gdrState, gdrUninitialized, gdrInitializing) == false) { - // Another thread 
raced in front of us. Wait for it to be done. - while (gdrState == gdrInitializing) sched_yield(); - return (gdrState == gdrInitialized) ? ncclSuccess : ncclSystemError; - } +static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static ncclResult_t initResult; +static void initOnceFunc(void) { static void* gdrhandle = NULL; void* tmp; void** cast; @@ -84,8 +74,8 @@ ncclResult_t wrap_gdr_symbols(void) { LOAD_SYM(gdrhandle, "gdr_copy_to_mapping", gdr_internal_copy_to_mapping); LOAD_SYM(gdrhandle, "gdr_copy_from_mapping", gdr_internal_copy_from_mapping); - gdrState = gdrInitialized; - return ncclSuccess; + initResult = ncclSuccess; + return; teardown: gdr_internal_open = NULL; @@ -101,11 +91,16 @@ teardown: gdr_internal_copy_from_mapping = NULL; if (gdrhandle != NULL) dlclose(gdrhandle); - gdrState = gdrError; - return ncclSystemError; + initResult = ncclSystemError; + return; } +ncclResult_t wrap_gdr_symbols(void) { + pthread_once(&initOnceControl, initOnceFunc); + return initResult; +} + gdr_t wrap_gdr_open(void) { if (gdr_internal_open == NULL) { WARN("GDRCOPY lib wrapper not initialized."); diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index 3b8daac..8a736d3 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -11,8 +11,6 @@ #include #include "core.h" -static enum { ibvUninitialized, ibvInitializing, ibvInitialized, ibvError } ibvState = ibvUninitialized; - /*Function Pointers*/ int (*ibv_internal_fork_init)(void); struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); @@ -43,18 +41,10 @@ const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); // IBVERBS Library versioning #define IBVERBS_VERSION "IBVERBS_1.1" -ncclResult_t wrap_ibv_symbols(void) { - if (ibvState == ibvInitialized) - return ncclSuccess; - if (ibvState == ibvError) - return ncclSystemError; - - if (__sync_bool_compare_and_swap(&ibvState, ibvUninitialized, ibvInitializing) == false) { - // Another thread raced in front of us. Wait for it to be done. - while (ibvState == ibvInitializing) sched_yield(); - return (ibvState == ibvInitialized) ? 
ncclSuccess : ncclSystemError; - } +static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static ncclResult_t initResult; +static void initOnceFunc(void) { static void* ibvhandle = NULL; void* tmp; void** cast; @@ -111,8 +101,8 @@ ncclResult_t wrap_ibv_symbols(void) { LOAD_SYM(ibvhandle, "ibv_fork_init", ibv_internal_fork_init); LOAD_SYM(ibvhandle, "ibv_event_type_str", ibv_internal_event_type_str); - ibvState = ibvInitialized; - return ncclSuccess; + initResult = ncclSuccess; + return; teardown: ibv_internal_get_device_list = NULL; @@ -141,8 +131,13 @@ teardown: ibv_internal_event_type_str = NULL; if (ibvhandle != NULL) dlclose(ibvhandle); - ibvState = ibvError; - return ncclSystemError; + initResult = ncclSystemError; + return; +} + +ncclResult_t wrap_ibv_symbols(void) { + pthread_once(&initOnceControl, initOnceFunc); + return initResult; } #define IBV_PTR_CHECK_ERRNO(name_internal, call, retval, error_retval, name) \ @@ -256,7 +251,7 @@ ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int } ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) { - IBV_PTR_CHECK(ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd"); + IBV_PTR_CHECK_ERRNO(ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd"); } ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ @@ -290,6 +285,7 @@ ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { if (ibv_internal_reg_dmabuf_mr == NULL) { + errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set return NULL; } return ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access); @@ -300,7 +296,7 @@ ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or t } ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) { - IBV_PTR_CHECK(ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq"); + IBV_PTR_CHECK_ERRNO(ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq"); } ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) { @@ -312,7 +308,7 @@ ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) { } ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { - IBV_PTR_CHECK(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp"); + IBV_PTR_CHECK_ERRNO(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp"); } ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc index d6bc353..a432ff6 100644 --- a/src/misc/shmutils.cc +++ b/src/misc/shmutils.cc @@ -38,6 +38,7 @@ static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize); return ncclSystemError; } + INFO(NCCL_ALLOC, "Allocated %d bytes of shared memory in %s\n", shmSize, 
shmPath); } else { SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd); } @@ -81,10 +82,12 @@ ncclResult_t ncclShmUnlink(const char* shmPath) { } ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) { - if (devShmPtr) CUDACHECK(cudaHostUnregister(shmPtr)); - if (munmap(shmPtr, shmSize) != 0) { - WARN("munmap of shared memory failed"); - return ncclSystemError; + if (shmPtr) { + if (devShmPtr) CUDACHECK(cudaHostUnregister(shmPtr)); + if (munmap(shmPtr, shmSize) != 0) { + WARN("munmap of shared memory failed"); + return ncclSystemError; + } } return ncclSuccess; } diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 16049fa..7161aee 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -332,10 +332,10 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) { #endif } - if (sock->asyncFlag) { - EQCHECK(flags = fcntl(fd, F_GETFL), -1); - SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); - } + /* The socket is set non-blocking for OS level, but asyncFlag is used to control + * blocking and non-blocking behavior in user level. */ + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); // addr port should be 0 (Any port) SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind"); @@ -411,11 +411,10 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { const int one = 1; SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); - /* support non-blocking socket; by default, the socket is non-blocking */ - if (sock->asyncFlag) { - EQCHECK(flags = fcntl(fd, F_GETFL), -1); - SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); - } + /* The socket is set non-blocking for OS level, but asyncFlag is used to control + * blocking and non-blocking behavior in user level. */ + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); /* const int bufsize = 128*1024; SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt"); @@ -430,17 +429,26 @@ retry: /* blocking/non-blocking connect() is determined by asyncFlag. */ ret = connect(fd, &sock->addr.sa, salen); - if (!sock->asyncFlag && (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || - (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES))) { - if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); - usleep(SLEEP_INT); - goto retry; - } + if (!sock->asyncFlag) { + /* blocking socket, need retry if connect fails. */ + if (errno == EINPROGRESS || errno == EAGAIN || errno == EALREADY || + (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || + (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { + /* check abortFlag as long as we have chance to retry. */ + if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError; + if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); + usleep(SLEEP_INT); + goto retry; + } - /* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again. - * However, it can return EISCONN instead of success which indicates connection is built up in - * background already. No need to call connect() again. 
*/ - if (ret == 0 || ((errno == EINPROGRESS || errno == ECONNREFUSED) && sock->asyncFlag) || errno == EISCONN) { + /* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again. + * However, it can return EISCONN instead of success which indicates connection is built up in + * background already. No need to call connect() again. */ + if (ret == 0 || errno == EISCONN) { + sock->fd = fd; + return ncclSuccess; + } + } else { sock->fd = fd; return ncclSuccess; } @@ -451,17 +459,26 @@ retry: ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) { socklen_t socklen = sizeof(union ncclSocketAddress); + struct pollfd pollfd; int tmpFd = sock->fd = -1; + int pollret; - do { - if (listenSocket->abortFlag) NEQCHECK(*listenSocket->abortFlag, 0); + pollfd.fd = listenSocket->fd; + pollfd.events = POLLIN; +retry: + if ((pollret = poll(&pollfd, 1, listenSocket->asyncFlag ? 0 : 100)) < 0) { + return ncclSystemError; + } else { tmpFd = accept(listenSocket->fd, &sock->addr.sa, &socklen); - } while ((errno == EAGAIN || errno == EWOULDBLOCK) && tmpFd == -1 && !listenSocket->asyncFlag); + } if (!listenSocket->asyncFlag) { + /* blocking socket, if tmpFd is still -1, we need to retry */ + if (tmpFd == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + if (listenSocket->abortFlag && *listenSocket->abortFlag != 0) return ncclInternalError; + goto retry; + } EQCHECK(tmpFd, -1); - } else if (tmpFd == -1 && errno != EAGAIN && errno != EWOULDBLOCK) { - return ncclSystemError; } sock->fd = tmpFd; @@ -491,7 +508,7 @@ static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* char line[SOCKET_NAME_MAXLEN+1]; do { if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); - if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); + if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL); if (op == NCCL_SOCKET_RECV && bytes == 0) { *closed = 1; return ncclSuccess; @@ -507,7 +524,7 @@ static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* (*offset) += bytes; if (sock->abortFlag && *sock->abortFlag != 0) { INFO(NCCL_NET, "Socket progress: abort called"); - return ncclSystemError; + return ncclInternalError; } } while (bytes > 0 && (*offset) < size); return ncclSuccess; diff --git a/src/nccl.h.in b/src/nccl.h.in index edd98a3..ccb8f57 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -39,7 +39,28 @@ typedef enum { ncclSuccess = 0, ncclInvalidArgument = 4, ncclInvalidUsage = 5, ncclRemoteError = 6, - ncclNumResults = 7 } ncclResult_t; + ncclInProgress = 7, + ncclNumResults = 8 } ncclResult_t; + +/* Communicator configuration. Users can assign value to attributes to specify the + * behavior of a communicator. */ +typedef struct ncclConfig_v21400 { + /* attributes that users should never touch. */ + size_t size; + unsigned int magic; + unsigned int version; + /* attributes that users are able to customize. */ + int blocking; +} ncclConfig_t; + +/* Config initializer must be assigned to initialize config structure when it is created. + * Not initialized config will result in NCCL error. 
*/ +#define NCCL_CONFIG_INITIALIZER { \ + sizeof(ncclConfig_t), /* size */ \ + 0xcafebeef, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + 1 /* blocking */ \ +} /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. * This integer is coded with the MAJOR, MINOR and PATCH level of the @@ -54,6 +75,11 @@ ncclResult_t pncclGetVersion(int *version); ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); +/* Create a new communicator (multi thread/process version) with a configuration + * set by users. */ +ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); +ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); + /* Creates a new communicator (multi thread/process version). * rank must be between 0 and nranks-1 and unique within a communicator clique. * Each rank is associated to a CUDA device, which has to be set before calling @@ -72,8 +98,15 @@ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); -/* Frees resources associated with communicator object, but waits for any operations - * that might still be running on the device. */ +/* Finalize a communicator. ncclCommFinalize flushes all issued communications, + * and marks communicator state as ncclInProgress. The state will change to ncclSuccess + * when the communicator is globally quiescent and related resources are freed; then, + * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator + * itself) without blocking. */ +ncclResult_t ncclCommFinalize(ncclComm_t comm); +ncclResult_t pncclCommFinalize(ncclComm_t comm); + +/* Frees local resources associated with communicator object. 
*/ ncclResult_t ncclCommDestroy(ncclComm_t comm); ncclResult_t pncclCommDestroy(ncclComm_t comm); diff --git a/src/net.cc b/src/net.cc index 53ec80e..1480c76 100644 --- a/src/net.cc +++ b/src/net.cc @@ -324,12 +324,8 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { ncclResult_t ret; ncclDebugNoWarn = NCCL_NET; NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1); - while (sComm == NULL) { - NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2); - } - while (rComm == NULL) { - NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3); - } + NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2); + NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3); CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4); if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle)); diff --git a/src/proxy.cc b/src/proxy.cc index 5021bc8..1a2f361 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -406,17 +406,17 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0, justInquire)); } } break; - case ncclPatternCollTreeUpDown: { - // CollTree up - NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1, justInquire)); // For CollTree up, we are using push - // CollTree down - NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0, justInquire)); + case ncclPatternCollnetChain: { + NCCLCHECK(SaveProxy(channel, proxySend, channel->collnetChain.up, op, 1, justInquire)); + NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collnetChain.up, op, 0, justInquire)); + } break; + case ncclPatternCollnetDirect: { + NCCLCHECK(SaveProxy(channel, proxySend, channel->collnetDirect.out, op, 1, justInquire)); + NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collnetDirect.out, op, 0, justInquire)); } break; case ncclPatternSend: case ncclPatternRecv: { if (op->root == comm->rank) return ncclSuccess; - op->nsteps = DIVUP(op->nbytes, op->chunkSize); - if (op->nsteps == 0) op->nsteps = 1; NCCLCHECK(SaveProxy(channel, op->pattern == ncclPatternSend ? 
proxySend : proxyRecv, op->root, op, 1, justInquire)); } break; } @@ -432,16 +432,17 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) op->channelId = channelId; op->sliceSteps = 1; op->chunkSteps = 1; - op->protocol = NCCL_PROTO_SIMPLE; op->dtype = info->datatype; + op->protocol = info->protocol; - int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; - if (info->comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR; + int stepSize = info->comm->buffSizes[op->protocol]/NCCL_STEPS; + + // If nNodes > 1 and we're using Simple, reduce the stepSize to increase shared buffer utilization + if (info->comm->nNodes > 1 && op->protocol == NCCL_PROTO_SIMPLE) stepSize = info->comm->p2pNetChunkSize; info->chunkSize = stepSize; op->root = info->root; - op->nbytes = info->count; - struct ncclChannelPeer* peer = channel->peers + op->root; + struct ncclChannelPeer* peer = channel->peers + op->root; if (info->coll == ncclFuncSend) { op->pattern = ncclPatternSend; if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) { @@ -464,6 +465,17 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) info->chunkSize = ncclParamChunkSize(); } op->chunkSize = info->chunkSize; + + // Compute nSteps for proxies + int chunkEffectiveSize = op->chunkSize; + if (op->protocol == NCCL_PROTO_LL) { + chunkEffectiveSize /= 2; + } + + op->nbytes = stepSize; + op->nsteps = DIVUP(info->count, chunkEffectiveSize); + if (op->nsteps == 0) op->nsteps = 1; + return ncclSuccess; } @@ -616,7 +628,7 @@ ncclResult_t ncclSetThreadContext(struct ncclComm* comm) { if (createThreadContext == -1) { createThreadContext = ncclParamCreateThreadContext(); if (createThreadContext) { - if (CUPFN(cuCtxCreate_v3020) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) { + if (CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) { WARN("Unable to create thread context due to old driver, disabling."); createThreadContext = 0; } @@ -624,7 +636,7 @@ ncclResult_t ncclSetThreadContext(struct ncclComm* comm) { } if (createThreadContext) { if (comm->proxyState.cudaCtx == NULL) { - if (CUPFN(cuCtxCreate_v3020(&comm->proxyState.cudaCtx, + if (CUPFN(cuCtxCreate(&comm->proxyState.cudaCtx, CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, comm->cudaDev)) != CUDA_SUCCESS) { WARN("Failed to create CUDA context on device %d", comm->cudaDev); createThreadContext = 0; @@ -641,6 +653,9 @@ ncclResult_t ncclSetThreadContext(struct ncclComm* comm) { return ncclSuccess; } +// Set to SIGUSR1 or SIGUSR2 to help debug proxy state during hangs +NCCL_PARAM(ProxyDumpSignal, "PROXY_DUMP_SIGNAL", -1); + void* ncclProxyProgress(void *comm_) { struct ncclComm* comm = (struct ncclComm*)comm_; if (ncclSetThreadContext(comm) != ncclSuccess) { @@ -652,7 +667,8 @@ void* ncclProxyProgress(void *comm_) { struct ncclProxyProgressState* state = &comm->proxyState.progressState; state->nextOps = -1; - signal(SIGUSR1, ncclDumpProxyState); + const int sig = ncclParamProxyDumpSignal(); + if (sig != -1) signal(sig, ncclDumpProxyState); ncclLastProxyState = state; char threadName[NCCL_THREAD_NAMELEN]; snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", comm->cudaDev); @@ -664,7 +680,7 @@ void* ncclProxyProgress(void *comm_) { int idle = 1; ncclResult_t ret = progressOps(comm, state, state->active, &idle); if (ret != ncclSuccess) { - comm->fatalError = ret; + (void) ncclCommSetAsyncError(comm, ret); INFO(NCCL_ALL,"%s:%d -> %d 
[Proxy Thread]", __FILE__, __LINE__, ret); return NULL; } @@ -676,7 +692,7 @@ void* ncclProxyProgress(void *comm_) { ret = ncclProxyGetPostedOps(comm, &added); if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); } if (ret != ncclSuccess) { - comm->fatalError = ret; + (void) ncclCommSetAsyncError(comm, ret); INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); } if (added == 0) { @@ -782,9 +798,13 @@ static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool, static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { if (connection->send) { - NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm)); + if (ncclTransports[connection->transport]->send.proxyFree) { + NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm)); + } } else { - NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm)); + if (ncclTransports[connection->transport]->recv.proxyFree) { + NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm)); + } } return ncclSuccess; } @@ -793,7 +813,10 @@ static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* poo for (int b=0; bbanks; b++) { int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE; for (int i=0; ipools[b]+i, comm)); + ncclProxyConnection *connection = pool->pools[b]+i; + if (connection->initFlag == true) { + NCCLCHECK(proxyFree(connection, comm)); + } } free(pool->pools[b]); } @@ -812,8 +835,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks)); NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks)); for (int r=0; rlocalRanks; r++) { - comm->proxyState.peerSocks[r].fd = -1; - comm->proxyState.peerSocks[r].abortFlag = comm->abortFlag; + NCCLCHECK(ncclSocketInit(&comm->proxyState.peerSocks[r], NULL, comm->abortFlag, 0)); } } NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank)); @@ -943,6 +965,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1)); } INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport); + __atomic_store_n(&connection->initFlag, true, __ATOMIC_RELEASE); return ncclSuccess; } @@ -1019,7 +1042,8 @@ void* ncclProxyService(void* _args) { struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS]; memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS); for (int s=0; sabortFlag, 0); + pollfds[s].fd = -1; pollfds[s].events = POLLHUP|POLLIN; } pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd; @@ -1029,8 +1053,9 @@ void* ncclProxyService(void* _args) { int npeers = 0; int stop = 0; int asyncOpCount = 0; - while (stop == 0 || (stop == 1 && npeers > 0)) { - if (int error = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : -1) < 0) { + while ((stop == 0 || (stop == 1 && npeers > 0)) && *comm->abortFlag == 0) { + /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */ + if (int error = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 
0 : 500) < 0) { WARN("[Proxy Service] Poll failed with error %d", error); return NULL; } @@ -1071,10 +1096,7 @@ void* ncclProxyService(void* _args) { INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank); closeConn = 1; } else { - if (type == ncclProxyMsgAbort) { - stop = 2; - closeConn = 1; - } else if (type == ncclProxyMsgStop) { + if (type == ncclProxyMsgStop) { stop = 1; closeConn = 1; } else if (type == ncclProxyMsgClose) { @@ -1104,6 +1126,10 @@ void* ncclProxyService(void* _args) { } } } + /* wait until main thread flush all NCCL operations. */ + while (*comm->abortFlag != 0 && __atomic_load_n(&comm->proxyState.safeAbortFlag, __ATOMIC_ACQUIRE) == 0) + usleep(1000); + // Wait for all operations to complete and stop progress thread before freeing any resource if (ncclProxyProgressDestroy(comm) != ncclSuccess) { WARN("[Proxy Service] proxyDestroy failed"); @@ -1133,15 +1159,23 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) { ncclResult_t ncclProxyDestroy(struct ncclComm* comm) { struct ncclProxyState* state = &comm->proxyState; + + if (state == NULL) return ncclSuccess; if (state->peerAddresses) { - struct ncclSocket sock; - sock.abortFlag = NULL; - sock.asyncFlag = 0; - memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress)); - NCCLCHECK(ncclSocketConnect(&sock)); - int type = (*comm->abortFlag) ? ncclProxyMsgAbort : ncclProxyMsgStop; - NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int))); - close(sock.fd); + if (*comm->abortFlag == 0) { + struct ncclSocket sock; + sock.abortFlag = NULL; + sock.asyncFlag = 0; + memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketConnect(&sock)); + int type = ncclProxyMsgStop; + NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int))); + close(sock.fd); + } else { + /* when abortFlag is set, all socket related communications are no longer reliable. We need to + * set a flag to let proxy thread exit. */ + __atomic_store_n(&state->safeAbortFlag, 1, __ATOMIC_RELEASE); + } free(state->peerAddresses); } if (state->peerSocks) { diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 0404aa8..432511c 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -100,6 +100,7 @@ struct sendResources { int nranks; int netDev; int useGdr; + int useDmaBuf; uint64_t* gdcSync; void* gdrDesc; void* sendMhandles[NCCL_NUM_PROTOCOLS]; @@ -119,6 +120,7 @@ struct recvResources { int nranks; int netDev; int useGdr; + int useDmaBuf; uint64_t* gdcSync; uint64_t* gdcFlush; void* gdrDesc; @@ -154,7 +156,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn)); NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(comm), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? 
"/GDRDMA" : ""); return ncclSuccess; } @@ -172,7 +174,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(comm), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? "/GDRDMA" : ""); return ncclSuccess; } @@ -281,6 +283,10 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->netDev = req->netDev; resources->useGdr = req->useGdr; + ncclNetProperties_t props; + NCCLCHECK(collNetGetProperties(comm, req->netDev, &props)); + /* DMA-BUF support */ + resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); return ncclSuccess; } @@ -386,6 +392,10 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc resources->netDev = req->netDev; resources->useGdr = req->useGdr; + ncclNetProperties_t props; + NCCLCHECK(collNetGetProperties(comm, req->netDev, &props)); + /* DMA-BUF support */ + resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); collNetHandle_t* netHandle = (collNetHandle_t*) respBuff; if (respSize != sizeof(collNetHandle_t)) return ncclInternalError; @@ -449,7 +459,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str #if CUDA_VERSION >= 11070 /* DMA-BUF support */ - if (resources->useGdr && comm->dmaBufSupport) { + if (resources->useGdr && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, @@ -518,7 +528,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str #if CUDA_VERSION >= 11070 /* DMA-BUF support */ - if (resources->useGdr && comm->dmaBufSupport) { + if (resources->useGdr && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, @@ -545,35 +555,41 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { struct sendResources* resources = (struct sendResources*)(connection->transportResources); - for (int p=0; psendMhandles[p]) { - NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p])); + + if (resources) { + for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) { + if (resources->sendMhandles[p]) { + NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p])); + } } + struct connectMapMem* mems = resources->map.mems; + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); + CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); + NCCLCHECK(sharedBuffersDestroy(comm)); + NCCLCHECK(sharedFree(comm, 
resources->netDev)); + free(connection->transportResources); } - struct connectMapMem* mems = resources->map.mems; - NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); - CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); - if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); - NCCLCHECK(sharedBuffersDestroy(comm)); - NCCLCHECK(sharedFree(comm, resources->netDev)); - free(connection->transportResources); return ncclSuccess; } static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { struct recvResources* resources = (struct recvResources*)(connection->transportResources); - for (int p=0; pmhandles[p]) { - NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p])); + + if (resources) { + for (int p=0; pmhandles[p]) { + NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p])); + } } + struct connectMapMem* mems = resources->map.mems; + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); + CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); + NCCLCHECK(sharedBuffersDestroy(comm)); + NCCLCHECK(sharedFree(comm, resources->netDev)); + free(connection->transportResources); } - struct connectMapMem* mems = resources->map.mems; - NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); - CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); - if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); - NCCLCHECK(sharedBuffersDestroy(comm)); - NCCLCHECK(sharedFree(comm, resources->netDev)); - free(connection->transportResources); return ncclSuccess; } @@ -582,10 +598,6 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct (s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1) static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { - if (args->protocol != NCCL_PROTO_SIMPLE) { - WARN("CollNet does not support LL/LL128"); - return ncclInternalError; - } if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; @@ -599,7 +611,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg } args->idle = 1; if (args->state == ncclProxyOpProgress) { - int p = args->protocol; + int p = NCCL_PROTO_SIMPLE; int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS); int perGroupSteps = NCCL_STEPS / nGroups; for (int s=0; snsubs; s++) { @@ -695,10 +707,6 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg } static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { - if (args->protocol != NCCL_PROTO_SIMPLE) { - WARN("CollNet does not support LL/LL128"); - return ncclInternalError; - } if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; @@ -712,7 +720,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg } args->idle = 1; if (args->state == ncclProxyOpProgress) { - int p = args->protocol; + int p = NCCL_PROTO_SIMPLE; int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS); int perGroupSteps = NCCL_STEPS / nGroups; for (int s=0; snsubs; s++) { @@ -746,7 +754,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg TRACE(NCCL_NET, "recvProxy [%d/%d/%d] received, size %d", sub->received, group, buffSlot, 
totalSize); sub->received += args->sliceSteps; sub->requests[buffSlot] = NULL; - if (reqFifo[group][buffSlot].size > 0 && resources->useGdr) { + if (1 && reqFifo[group][buffSlot].size > 0 && resources->useGdr) { // GDRCOPY support if (resources->gdcFlush) { #if defined (__x86_64__) diff --git a/src/transport/net.cc b/src/transport/net.cc index be3afc4..12390c0 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -90,6 +90,7 @@ struct sendResources { int remoteRank; int netDev; int useGdr; + int useDmaBuf; int maxRecvs; uint64_t* gdcSync; void* gdrDesc; @@ -116,6 +117,7 @@ struct recvResources { int proxyRank; int netDev; int useGdr; + int useDmaBuf; int maxRecvs; uint64_t* gdcSync; uint64_t* gdcFlush; @@ -130,19 +132,13 @@ struct recvResources { uint64_t llLastCleaning; }; -NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0); - /* Determine if two peers can communicate with NET */ static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { - // Same host? - if (info1->hostHash == info2->hostHash) { - // User disabled NET for intra-node? - if (ncclParamNetDisableIntra() == 1) { - *ret = 0; - return ncclSuccess; - } - } *ret = 1; + if (info1->hostHash == info2->hostHash) { + // If on the same host, check intra-node net is not disabled. + NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, ret)); + } return ncclSuccess; } @@ -336,12 +332,15 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne static ncclResult_t sendFree(struct ncclConnector* send) { struct connectMap* map = (struct connectMap*)(send->transportResources); - if (map->sameProcess == 0) { - NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); - if (map->mems[NCCL_NET_MAP_DEVMEM].size) { - CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + if (map) { + if (map->sameProcess == 0) { + NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); + if (map->mems[NCCL_NET_MAP_DEVMEM].size) { + CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + } } } + return ncclSuccess; } @@ -368,7 +367,7 @@ static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int local struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv; state->refcount++; if (state->size == 0) { - state->size = nChannels*(NCCL_SHARED_STEPS/NCCL_STEPS)*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR; + state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pNetChunkSize; } if (size) *size = state->size; @@ -394,9 +393,8 @@ static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int local static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) { // Use different pools for different channels and also separate send/recv. 
- int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR); int globalSlot = (channel*NCCL_SHARED_STEPS)+slot; - *offset = slotSize * globalSlot; + *offset = comm->p2pNetChunkSize * globalSlot; return ncclSuccess; } @@ -448,6 +446,8 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->connIndex = req->connIndex; ncclNetProperties_t props; NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props)); + /* DMA-BUF support */ + resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; // We don't return any data @@ -474,6 +474,8 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc resources->connIndex = req->connIndex; ncclNetProperties_t props; NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props)); + /* DMA-BUF support */ + resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; @@ -541,6 +543,12 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str comm, resources->useGdr, resources->localRank, 0, map->sameProcess, comm->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipc)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; + + if (comm->allocP2pNetLLBuffers) { + NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); + resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL]; + } + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } @@ -589,7 +597,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str #if CUDA_VERSION >= 11070 /* DMA-BUF support */ int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; - if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) { + if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); @@ -676,6 +684,11 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); + if (comm->allocP2pNetLLBuffers) { + NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); + resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL]; + } + if (map->mems[NCCL_NET_MAP_DEVMEM].size) { if (resources->shared == 0) { NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size)); @@ -706,7 +719,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str #if CUDA_VERSION >= 11070 /* DMA-BUF support */ int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST; - if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) { + if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); @@ -846,7 +859,8 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) { // We have something to receive, let's check if it's completely ready. int size = sizesFifo[buffSlot]; - char* buff = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; + bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared; + char* buff = shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; int ready = 1; if (p == NCCL_PROTO_LL128) { ready = resources->useGdr; @@ -974,7 +988,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; - if (resources->shared) { + if (p == NCCL_PROTO_SIMPLE && resources->shared) { int sharedBuffSlot = sub->posted%maxDepth; int offset; NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset)); diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index d4bb8cf..a1f8897 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -25,7 +25,6 @@ #include "ibvwrap.h" -#define USE_RDMA_WRITE 1 #define MAXNAMESIZE 64 static char ncclIbIfName[MAX_IF_NAME_SIZE+1]; static union ncclSocketAddress ncclIbIfAddr; @@ -73,7 +72,7 @@ pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0); -NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14); +NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); NCCL_PARAM(IbPkey, "IB_PKEY", 0); NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0); @@ -120,7 +119,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) return ncclSuccess; } -static int ibvWidths[] = { 1, 4, 8, 12 }; +static int ibvWidths[] = { 1, 4, 8, 12, 2 }; static int ibvSpeeds[] = { 2500, 5000, 10000, 10000, 14000, 25000, 50000 }; static int firstBitSet(int val, int max) { int i = 0; @@ -288,8 +287,8 @@ ncclResult_t ncclIbDmaBufSupport(int dev) { NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure); // Test kernel DMA-BUF support with a dummy call (fd=-1) (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/); - // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP if not supported (EBADF otherwise) - dmaBufSupported = (errno != EOPNOTSUPP) ? 1 : 0; + // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise) + dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 
1 : 0; NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure); } if (dmaBufSupported == 0) return ncclSystemError; @@ -684,7 +683,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm))); stage->comm = rComm; stage->state = ncclIbCommStateAccept; - lComm->sock.asyncFlag = 1; + NCCLCHECK(ncclSocketInit(&rComm->sock, NULL, lComm->sock.abortFlag, 1)); ib_accept: NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock)); diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index a0d80d3..678aab8 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -317,7 +317,6 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) { NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads)); handle->nSocks = comm->nSocks; handle->nThreads = comm->nThreads; - comm->sock.asyncFlag = 1; comm->dev = dev; *listenComm = comm; return ncclSuccess; @@ -394,7 +393,7 @@ ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) { for (; inSocks+1; i++) { uint8_t sendSockIdx; ncclCalloc(&sock, 1); - NCCLCHECK(ncclSocketInit(sock, NULL, NULL, 1)); + NCCLCHECK(ncclSocketInit(sock, NULL, lComm->sock.abortFlag, 1)); stage->sock = sock; stage->state = ncclSocketCommStateAccept; stage->iteration = i; diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 414f05d..b0bad4a 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -107,6 +107,14 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop return ncclSuccess; } + // Check if NET would work better + int useNet = 0; + NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet)); + if (useNet) { + *ret = 0; + return ncclSuccess; + } + // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) int cudaDev1 = busIdToCudaDev(info1->busId); int cudaDev2 = busIdToCudaDev(info2->busId); @@ -230,17 +238,17 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st info->rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) { if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; - INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", - channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); + INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", + channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); } else { send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; - INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s", - channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : ""); + INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s", + channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? 
"/CE" : ""); } } else { info->rank = intermediateRank; - INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s", - channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank, + INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s", + channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank, comm->peerInfo[intermediateRank].busId, useReadStr); } @@ -374,20 +382,24 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn ncclResult_t p2pSendFree(struct ncclConnector* send) { struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources; - if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); - if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); - free(resources); + if (resources) { + if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); + if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); + free(resources); + } return ncclSuccess; } ncclResult_t p2pRecvFree(struct ncclConnector* recv) { struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; - if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); - if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); - if (useMemcpy) { - NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize)); + if (resources) { + if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); + if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); + if (useMemcpy) { + NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize)); + } + free(resources); } - free(resources); return ncclSuccess; } @@ -464,14 +476,16 @@ static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { if (useMemcpy) { struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources; - NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize)); - NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem)); - CUDACHECK(cudaFree(proxyInfo->ceDevBuff)); - CUDACHECK(cudaStreamDestroy(proxyInfo->stream)); - for (int i=0; ievents[i])); + if (proxyInfo) { + NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize)); + NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem)); + CUDACHECK(cudaFree(proxyInfo->ceDevBuff)); + CUDACHECK(cudaStreamDestroy(proxyInfo->stream)); + for (int i=0; ievents[i])); + } + free(proxyInfo); } - free(proxyInfo); } else { // Do not check return code as CUDA may have already shut down cudaFree(connection->transportResources); diff --git a/src/transport/shm.cc b/src/transport/shm.cc index 4a6120a..740bd2a 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -49,6 +49,10 @@ static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct if (ncclParamShmDisable() == 1) return ncclSuccess; + int useNet = 0; + NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet)); + if (useNet) return ncclSuccess; + // Same host? 
TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash); if (info1->hostHash != info2->hostHash) return ncclSuccess; @@ -191,17 +195,21 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co static ncclResult_t shmSendFree(struct ncclConnector* send) { struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources; - NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); - NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); - free(resources); + if (resources) { + NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); + NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); + free(resources); + } return ncclSuccess; } static ncclResult_t shmRecvFree(struct ncclConnector* recv) { struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; - NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); - NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); - free(resources); + if (resources) { + NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); + NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); + free(resources); + } return ncclSuccess; } @@ -243,25 +251,31 @@ static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources; - CUDACHECK(cudaStreamDestroy(resources->stream)); - CUDACHECK(cudaFree(resources->devFifo)); - NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); - for (int i=0; ievents[i])); + + if (resources) { + CUDACHECK(cudaStreamDestroy(resources->stream)); + CUDACHECK(cudaFree(resources->devFifo)); + NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); + for (int i=0; ievents[i])); + } + free(connection->transportResources); } - free(connection->transportResources); return ncclSuccess; } static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources; - CUDACHECK(cudaStreamDestroy(resources->stream)); - CUDACHECK(cudaFree(resources->devFifo)); - NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); - for (int i=0; ievents[i])); + + if (resources) { + CUDACHECK(cudaStreamDestroy(resources->stream)); + CUDACHECK(cudaFree(resources->devFifo)); + NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); + for (int i=0; ievents[i])); + } + free(connection->transportResources); } - free(connection->transportResources); return ncclSuccess; }
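---

A few notes and sketches on the patterns this patch introduces; all code below is illustrative, and any name not taken from the diff above is an assumption.

The cudawrap/gdrwrap/ibvwrap hunks replace three copies of a hand-rolled state machine (an Uninitialized/Initializing/Initialized/Error enum advanced with __sync_bool_compare_and_swap plus a sched_yield() spin) with pthread_once. The shape of the new pattern, using a hypothetical libfoo.so in place of the real driver libraries:

#include <dlfcn.h>
#include <pthread.h>
#include <stddef.h>

static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
static int initResult;   /* stands in for ncclResult_t */
static void* libHandle;

static void initOnceFunc(void) {
  /* Runs exactly once even under concurrent callers; everyone else
   * blocks inside pthread_once() until this returns. */
  libHandle = dlopen("libfoo.so", RTLD_NOW);   /* hypothetical library */
  initResult = (libHandle != NULL) ? 0 : 1;    /* 0 ~ ncclSuccess, 1 ~ ncclSystemError */
}

int libraryInit(void) {
  pthread_once(&initOnceControl, initOnceFunc);
  return initResult;   /* pthread_once provides the happens-before needed to read this */
}

Besides being shorter, pthread_once avoids the busy-wait and gives well-defined memory ordering, which the old spin on a plain enum did not guarantee.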
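socket.cc now sets O_NONBLOCK on every socket at the OS level and keeps asyncFlag purely as a user-level policy: "blocking" sockets are emulated with retry loops that can observe abortFlag instead of parking in the kernel. A rough sketch of the connect side, omitting the bounded ECONNREFUSED/ETIMEDOUT retry counters the real code keeps:

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/socket.h>

/* Sketch only: addr/salen come from the caller; abortFlag may be NULL. */
static int connectWithAbort(int fd, const struct sockaddr* addr, socklen_t salen,
                            volatile unsigned int* abortFlag) {
  int flags = fcntl(fd, F_GETFL);
  if (flags == -1 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) return -1;
  for (;;) {
    if (connect(fd, addr, salen) == 0 || errno == EISCONN) return 0;
    if (errno != EINPROGRESS && errno != EAGAIN && errno != EALREADY &&
        errno != ECONNREFUSED && errno != ETIMEDOUT) return -1;
    if (abortFlag && *abortFlag) return -1;   /* abort while we still can retry */
    usleep(100);
  }
}

The accept side applies the same idea through poll(): a 100 ms timeout in blocking mode so abortFlag is rechecked periodically, 0 ms in async mode. Sends now always pass MSG_NOSIGNAL, so a peer closing mid-transfer surfaces as an EPIPE error instead of a process-killing SIGPIPE.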
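ncclCommInitRankConfig plus the new ncclConfig_t is what exposes non-blocking initialization. A minimal sketch, assuming the CUDA device is already selected and the unique id has been broadcast out-of-band; the polling loop uses ncclCommGetAsyncError, which NCCL already provides:

#include <nccl.h>

ncclResult_t initNonBlocking(ncclComm_t* comm, int nranks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;   /* the initializer is mandatory */
  config.blocking = 0;                             /* opt in to non-blocking init */

  ncclResult_t res = ncclCommInitRankConfig(comm, nranks, id, rank, &config);
  if (res != ncclSuccess && res != ncclInProgress) return res;

  /* Initialization continues in the background; poll until it completes. */
  ncclResult_t state = ncclInProgress;
  while (state == ncclInProgress) {
    res = ncclCommGetAsyncError(*comm, &state);
    if (res != ncclSuccess) return res;
  }
  return state;
}

A config that was not assigned NCCL_CONFIG_INITIALIZER is rejected, per the header comment above: the size/magic/version fields are how the library validates it.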
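ncclCommFinalize changes the teardown contract: it flushes issued communication and leaves the communicator in ncclInProgress until it is globally quiescent, after which ncclCommDestroy frees only local resources without blocking. A sketch of the intended sequence (for a blocking communicator the polling loop may simply never observe ncclInProgress):

#include <nccl.h>

ncclResult_t finalizeAndDestroy(ncclComm_t comm) {
  ncclResult_t res = ncclCommFinalize(comm);   /* start flushing issued work */
  if (res != ncclSuccess && res != ncclInProgress) return res;

  ncclResult_t state = ncclInProgress;         /* wait for global quiescence */
  while (state == ncclInProgress) {
    res = ncclCommGetAsyncError(comm, &state);
    if (res != ncclSuccess) return res;
  }
  return ncclCommDestroy(comm);                /* purely local, non-blocking free */
}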
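In ncclProxyComputeP2p, the proxy now derives nsteps from the protocol's effective chunk size instead of hard-coding the Simple protocol: under LL, only half of each line carries payload (the other half is flags), so the effective chunk is halved. With made-up numbers:

#define DIVUP(x, y) (((x) + (y) - 1) / (y))

/* Illustration of the nsteps math for a 1 MiB LL send; the chunk size is
 * an assumption, not NCCL's actual tuning. */
static int p2pSteps(void) {
  int count = 1 << 20;                     /* 1 MiB message, in bytes */
  int chunkSize = 128 * 1024;              /* assumed op->chunkSize */
  int chunkEffectiveSize = chunkSize / 2;  /* NCCL_PROTO_LL halves it */
  int nsteps = DIVUP(count, chunkEffectiveSize);   /* = 16 */
  return nsteps ? nsteps : 1;              /* a proxy op always takes >= 1 step */
}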
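The proxy progress thread used to install a SIGUSR1 handler unconditionally, which could collide with application handlers. It is now gated by a parameter that, following the usual NCCL_PARAM convention, should surface as the NCCL_PROXY_DUMP_SIGNAL environment variable (default -1, i.e. off). The same opt-in shape in miniature:

#include <signal.h>
#include <stdlib.h>

static void dumpProxyState(int sig) { (void)sig; /* stand-in for ncclDumpProxyState */ }

static void maybeInstallDumpSignal(void) {
  /* Only claim a signal when asked, e.g. NCCL_PROXY_DUMP_SIGNAL=10
   * (SIGUSR1 on x86-64 Linux), then `kill -USR1 <pid>` during a hang. */
  const char* env = getenv("NCCL_PROXY_DUMP_SIGNAL");
  int sig = env ? atoi(env) : -1;
  if (sig != -1) signal(sig, dumpProxyState);
}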
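The shared network-p2p buffer pool is re-based on comm->p2pNetChunkSize: the pool holds nChannels * NCCL_SHARED_STEPS fixed-size slots, and sharedBuffersGet reduces to a flat slot index times the chunk size. In miniature, with an assumed slot count:

/* Slot -> offset math from sharedBuffersGet(); the 16 is an assumption
 * standing in for NCCL_SHARED_STEPS, not a value taken from this diff. */
#define SHARED_STEPS 16

static int slotOffset(int channel, int slot, int p2pNetChunkSize) {
  int globalSlot = channel * SHARED_STEPS + slot;
  return p2pNetChunkSize * globalSlot;
}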
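Finally, the errno = EOPNOTSUPP assignment added to wrap_direct_ibv_reg_dmabuf_mr is what makes the DMA-BUF fallback work on MOFED 5.4 and earlier: ncclIbDmaBufSupport probes kernel support by registering with a deliberately bogus fd and classifying the failure by errno alone, so the wrapper must set errno itself when the symbol was never loaded. Roughly:

#include <errno.h>
#include <infiniband/verbs.h>
#include "ibvwrap.h"   /* wrap_direct_ibv_reg_dmabuf_mr() from this patch */

static int dmaBufSupported(struct ibv_pd* pd) {
  errno = 0;   /* reset for clarity; the probe relies on the call setting errno */
  (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/,
                                       0ULL /*iova*/, -1 /*fd*/, 0 /*access*/);
  /* EOPNOTSUPP / EPROTONOSUPPORT => unsupported; anything else (typically
   * EBADF for the fake fd) means the verb exists and the kernel supports it. */
  return (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT);
}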