diff --git a/makefiles/version.mk b/makefiles/version.mk index 88656d9..496796a 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 12 -NCCL_PATCH := 12 +NCCL_MINOR := 13 +NCCL_PATCH := 4 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index 82e21a0..d658c35 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,7 +10,8 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \ - misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc \ + misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \ + misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \ collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \ graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 4f7f48c..0ba89a5 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -105,6 +105,7 @@ static void *bootstrapRoot(void* args) { do { struct ncclSocket sock; sock.abortFlag = NULL; + /* bootstrap root thread always uses blocking ncclSocketAccept. */ NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out); NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out); close(sock.fd); @@ -139,6 +140,7 @@ static void *bootstrapRoot(void* args) { int next = (r+1) % nranks; struct ncclSocket sock; sock.abortFlag = NULL; + sock.asyncFlag = 0; memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress)); NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out); NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out); @@ -316,6 +318,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s struct bootstrapState* state = (struct bootstrapState*)commState; struct ncclSocket sock; sock.abortFlag = state->abortFlag; + sock.asyncFlag = 0; memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress)); NCCLCHECK(ncclSocketConnect(&sock)); NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int))); diff --git a/src/channel.cc b/src/channel.cc index 87cec65..4d28a68 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -8,75 +8,54 @@ #include "param.h" #include "gdrwrap.h" -// GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory -NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); - -ncclResult_t initChannel(struct ncclComm* comm, int channelid) { - struct ncclChannel* channel = comm->channels+channelid; +ncclResult_t initChannel(struct ncclComm* comm, int channelId) { + struct ncclChannel* channel = &comm->channels[channelId]; if (channel->id != -1) return ncclSuccess; - channel->id = channelid; - // Ring index to user rank table. 
- NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks)); - NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks)); + int nRanks = comm->nRanks; + channel->id = channelId; + channel->workFifoSent = 0; - // Communication structures with peers. - NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network) - NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1)); - for (size_t i=0; inRanks+1; ++i) { - for (int b=0; bpeers[i].send[b].comm = comm; - channel->peers[i].recv[b].comm = comm; + NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream)); + + // The extra on nRanks+1 is for collnet root (i.e. network) + channel->peers = ncclMemoryStackAlloc(&comm->memPermanent, nRanks+1); + NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nRanks+1, comm->deviceStream.stream)); + ncclCommPushCudaFree(comm, channel->devPeers); + + channel->ring.userRanks = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, comm->deviceStream.stream)); + ncclCommPushCudaFree(comm, channel->devRingUserRanks); + + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNull(), &comm->deviceStream)); + + for (int r=0; r < nRanks+1; ++r) { + for (int b=0; b < NCCL_MAX_CONNS; b++) { + channel->peers[r].send[b].comm = comm; + channel->peers[r].recv[b].comm = comm; } } - // Per-channel operation list. - NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS)); - if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { - // GDRCOPY support - // We allocate a workFifo in GDR mapped CUDA memory - // But we still allocate the Host workFifo so that we - // can copy the work elements to CUDA memory on kernel launch - NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc)); - } else { - // The device workFifo is the Host one - channel->workFifoDev = channel->workFifo; - } - return ncclSuccess; } ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { if (channel->id == -1) return ncclSuccess; - // Operation list - NCCLCHECK(ncclCudaHostFree(channel->workFifo)); - if (channel->gdrMemDesc) { - // GDRCOPY support - NCCLCHECK(ncclGdrCudaFree(channel->gdrMemDesc)); - } - - // Free Ring index to rank tables - free(channel->ring.userRanks); - CUDACHECK(cudaFree(channel->ring.devUserRanks)); // Free transport proxy resources // Note: free all send resources first due to CollNet arrangement for (int r=0; rpeers+r; + struct ncclChannelPeer* peer = channel->peers+r; for (int b=0; bsend[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b)); } } for (int r=0; rpeers+r; + struct ncclChannelPeer* peer = channel->peers+r; for (int b=0; brecv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b)); } } - // Free the peer structures. 
- CUDACHECK(cudaFree(channel->devPeers)); - free(channel->peers); - return ncclSuccess; } diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index c86384c..4e82dd6 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -12,11 +12,11 @@ namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; - const int *ringRanks = ring->devUserRanks; + const int *ringRanks = ring->userRanks; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1)); // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index 41ef255..23f6d0a 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -12,7 +12,7 @@ namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; @@ -97,7 +97,7 @@ namespace { template __device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclTree *tree = &ncclShmem.channel.tree; @@ -169,7 +169,7 @@ namespace { template __device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclTree *tree = &ncclShmem.channel.tree; @@ -290,7 +290,7 @@ struct RunWorkElementheader.nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast; + const int nThreadsReduce = args->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast; const int tidStartBcast = nThreadsGather; const int tidStartScatter = tidStartBcast + nThreadsBcast; const int tidStartReduce = tidStartScatter + nThreadsScatter; diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index ba4ef56..ebe4381 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -12,7 +12,7 @@ namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; @@ -20,8 +20,8 @@ namespace { const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); const ssize_t loopSize = nChannels*chunkSize; const ssize_t size = args->count; - const int rank = ring->devUserRanks[0]; - const int nextRank = ring->devUserRanks[1]; + const int rank = ring->userRanks[0]; + const int nextRank = 
ring->userRanks[1]; const int root = args->root; T *inputBuf = (T*)args->sendbuff; diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index 40a2303..ab333b4 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -19,90 +19,6 @@ #define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree -__device__ inline bool barrierReduceAny(int bit) { - uint32_t popc; - asm ("{" - ".reg .pred barr_pred;" - "setp.eq.u32 barr_pred, %1, 1;" - "bar.red.popc.u32 %0, 2, barr_pred;" - "}" : "=r"(popc) : "r"(bit)); - return popc != 0; -} - -// Copy src to dst and fill extra size with zeroes -template -__device__ void copyToShmem(Tdst *dst, Tsrc const *src, int tid, int nthreads) { - static_assert(sizeof(Tdst)%(2*sizeof(uint64_t)) == 0 && sizeof(Tsrc)%(2*sizeof(uint64_t)) == 0, - "copyToShmem needs sizes which are multiple of 16B"); - static_assert(sizeof(Tdst) >= sizeof(Tsrc), "Tdst size is too small"); - static_assert(sizeof(Tdst) <= WARP_SIZE*2*sizeof(uint64_t), "copyToShmem limited to 512B to make sure it can always be done in one cycle"); - uint64_t *d = reinterpret_cast(dst); - uint64_t const *s = reinterpret_cast(src); - uint64_t *shmemPtr = shmemCvtPtr(d); - int offset = 2*tid; - uint64_t v0, v1; - if (offset >= sizeof(Tsrc)/sizeof(uint64_t)) { - v0 = v1 = 0ULL; - } else { - v0 = s[offset] ; v1 = s[offset+1]; - } - if (offset < sizeof(Tdst)/sizeof(uint64_t)) storeShmem128(shmemPtr+offset, v0, v1); -} - -template -__device__ int copyToShmem(T *dst, T const *src, int turn=0) { - static_assert(sizeof(uint64_t) <= alignof(T), "Uhoh"); - uint64_t *d = reinterpret_cast(dst); - uint64_t const *s = reinterpret_cast(src); - int t = threadIdx.x - turn; - if (t < 0) t += blockDim.x; - int n = sizeof(T)/sizeof(uint64_t); - - int delta = (n + WARP_SIZE-1) & -WARP_SIZE; // round up to warp lane 0 - if (delta < blockDim.x) { - turn += delta; - if (turn >= blockDim.x) turn -= blockDim.x; - } - else - turn = 0; - - n -= t; - d += t; - s += t; - #pragma unroll - for (int i=0; i < divUp(sizeof(T), WARP_SIZE*sizeof(uint64_t)); i++) { - if (n > 0) { - *d = *s; - d += blockDim.x; - s += blockDim.x; - n -= blockDim.x; - } - } - return turn; -} - -template -struct RunWorkElement { - __device__ void run(ncclWorkElem*) { - // Put NOT IMPLEMENTED behavior here. - } -}; - -template -struct RunWork { - // This __forceinline__ is necessary. The compiler was inserting a function call - // here from the LL ncclKernel. - __device__ __forceinline__ void run(ncclWork *w) { - int wid = threadIdx.x / WARP_SIZE; - int inc = w->header.type == ncclWorkTypeRegColl ? 
sizeof(ncclWorkElemReg) / sizeof(ncclWorkElem) : 1; - #pragma unroll 1 - for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e += inc) { - if (wid < w->header.nWarps) - RunWorkElement().run(&w->elems[e]); - } - } -}; - typedef void(*ncclKern_t)(); extern __device__ ncclKern_t ncclFuncs[]; @@ -120,15 +36,62 @@ struct ncclShmemData { struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; }; uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1]; - struct ncclDevComm comm; - struct ncclChannel channel; - uint64_t pad; - struct ncclWork work; + int channelId; + alignas(16) struct ncclDevComm comm; + alignas(16) struct ncclDevChannel channel; + alignas(16) struct ncclWork work; }; static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned"); +extern __shared__ ncclShmemData ncclShmem; + +__device__ inline bool barrierReduceAny(int bit) { + uint32_t popc; + asm ("{" + ".reg .pred barr_pred;" + "setp.eq.u32 barr_pred, %1, 1;" + "bar.red.popc.u32 %0, 2, barr_pred;" + "}" : "=r"(popc) : "r"(bit)); + return popc != 0; +} + +// Copy 16-byte aligned data. You must call with at least `(bytes+15)/16` threads. +inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int bytes) { + int offset = 16*tid; + if (offset < bytes) { + uint64_t a=0, b=0; + asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset)); + asm volatile("st.v2.u64 [%0],{%1,%2};" :: "l"((char*)dst + offset), "l"(a), "l"(b)); + } +} + +template +struct RunWorkElement { + __device__ void run(ncclWorkElem*) { + // Put NOT IMPLEMENTED behavior here. + } +}; + +template +struct RunWork { + // This __forceinline__ is necessary. The compiler was inserting a function call + // here from the LL ncclKernel. + __device__ __forceinline__ void run(ncclWork *w) { + int wid = threadIdx.x / WARP_SIZE; + ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0]; + int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem); + #pragma unroll 1 + while ((char*)we + stride <= (char*)(w+1) && we->isUsed) { + if (wid < we->nWarps) { + RunWorkElement().run(we); + } + we = (ncclWorkElem*)((char*)we + stride); + } + } +}; + static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { - if (we->header.type != ncclWorkTypeUnused && we->redOpArgIsPtr) { + if (we->isUsed && we->redOpArgIsPtr) { /* redOpArg is a pointer to the scalar value, so we'll dereference it * here so that redOpArg holds the bits of the scalar going forward. * The tricky thing is we don't know its type T since that's encoded in @@ -148,48 +111,69 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { } } -extern __shared__ ncclShmemData ncclShmem; - template -__device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) { +__device__ void ncclKernel( + struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead + ) { int tid = threadIdx.x; - int nthreads = blockDim.x; - int bid = blockIdx.x; - int turn = copyToShmem(&ncclShmem.comm, comm); - // get address of channel without incurring indirect load from ncclDevCom::channels - ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid]; - turn = copyToShmem(&ncclShmem.channel, channel, turn); + // To map blockId to channelId, we need the n'th set bit of channelMask which + // is the inverse of counting the number of set bits among the the first n. 
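// [Editorial sketch, not part of the patch] The comment above describes the blockId ->
// channelId mapping: channelId is the position of the n'th set bit of channelMask
// (n == blockIdx.x), i.e. the unique bit position c that is set and satisfies
// __builtin_popcountll(channelMask & ((1ull<<c)-1)) == blockIdx.x. A minimal host-side
// equivalent (the kernel code below computes the same thing cooperatively across one warp):
#include <cstdint>
static inline int nthSetBit(uint64_t mask, int n) {
  for (int bit = 0; bit < 64; bit++) {
    if (mask & (1ull << bit)) {
      if (n == 0) return bit;   // this is the n'th set bit
      n--;
    }
  }
  return -1;                    // fewer than n+1 bits set
}
// Example: nthSetBit(0b101100, 2) == 5, since bits 2, 3 and 5 are set.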
+ if (tid < WARP_SIZE) { + int x = tid; + if (channelMask & (1ull<channels[channelId]; + bytes = sizeof(ncclDevChannel); + static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn."); + break; + case 2: + dst = &ncclShmem.work; + src = workHead + blockIdx.x; + bytes = sizeof(ncclWork); + static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn."); + break; + default: + bytes = 0; + break; + } + copyToShmem16(tid%WARP_SIZE, dst, src, bytes); } __syncthreads(); // publish ncclShmem - ncclWork *workFifoHost = ncclShmem.channel.workFifo; - ncclWork *workFifoDev = ncclShmem.channel.workFifoDev; - int workFifoIx = ncclShmem.channel.index; - - if (bid == 0 && first.header.type != ncclWorkTypeUnused) - goto SkipLoadWork; - while (true) { - copyToShmem(&ncclShmem.work, &workFifoDev[workFifoIx], tid, nthreads); - { // Check whether the last operation was aborted and make sure all threads exit - int aborted = tid == 0 ? *comm->abortFlag : 0; - if (barrierReduceAny(aborted)) // publish ncclShmem.work - break; - if (tid == 0) - workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused; + // Notify host that all fifo reads are complete. + if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) { + *ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks; } - SkipLoadWork: - workFifoIx = (workFifoIx + 1)%NCCL_MAX_OPS; - if (tid == 0) - channel->index = workFifoIx; // write back to real channel, not shmem shadow - __syncwarp(); if (ncclShmem.work.header.type == ncclWorkTypeColl) { if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]); @@ -198,21 +182,34 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) { } __syncthreads(); - if (ncclShmem.work.header.funcIndex == FnIndex) + if (ncclShmem.work.header.funcIndex == FnIndex) { RunWork().run(&ncclShmem.work); - else + } else { ncclFuncs[ncclShmem.work.header.funcIndex](); + } - if (ncclShmem.work.header.isLast) break; + int workIxNext = ncclShmem.work.header.workNext; __syncthreads(); + if (ncclShmem.work.header.isLast) break; + + copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork)); + + { // Check whether the last operation was aborted and make sure all threads exit + int aborted = tid == 0 ? 
*comm->abortFlag : 0; + if (barrierReduceAny(aborted)) // publish ncclShmem.work + break; + } } } // Only generate kernels for SUM #if NCCL_OP == 0 #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \ -__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem first) { \ - ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex>(comm, first); \ +__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)( \ + struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead \ + ) { \ + ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex> \ + (comm, channelMask, workHead); \ } #else #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fInded) diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu index b7dc3e9..f594e34 100644 --- a/src/collectives/device/onerank_reduce.cu +++ b/src/collectives/device/onerank_reduce.cu @@ -16,7 +16,7 @@ namespace { int tid = threadIdx.x; int tn = blockDim.x; #pragma unroll 1 - for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) { + for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) { ncclWorkElem *we = &w->elems[e]; intptr_t eltN = we->count; int bid = we->bid; diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h index afed3df..e8cc8e3 100644 --- a/src/collectives/device/prims_ll.h +++ b/src/collectives/device/prims_ll.h @@ -326,11 +326,11 @@ class Primitives: // If we are going to support oneshot collNet + LL, then we would need to add connector index here int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { - loadRecvConn(&channel->devPeers[recvPeers[nrecv]].recv->conn, nrecv); + loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { - loadSendConn(&channel->devPeers[sendPeers[nsend]].send->conn, nsend); + loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend); nsend++; } this->fan = Fan(nrecv, nsend); diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h index 8090385..93b6b4f 100644 --- a/src/collectives/device/prims_ll128.h +++ b/src/collectives/device/prims_ll128.h @@ -364,11 +364,11 @@ public: auto *channel = &ncclShmem.channel; int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { - loadRecvConn(&channel->devPeers[recvPeers[nrecv]].recv->conn, nrecv); + loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { - loadSendConn(&channel->devPeers[sendPeers[nsend]].send->conn, nsend); + loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend); nsend++; } this->fan = Fan(nrecv, nsend); diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h index fd61dc4..a727849 100644 --- a/src/collectives/device/prims_simple.h +++ b/src/collectives/device/prims_simple.h @@ -303,9 +303,9 @@ class Primitives< } } - __device__ __forceinline__ void loadRecvConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) { + __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) { if (flags & (RoleWaitRecv|RolePostRecv)) { - auto *conn = &peer->recv[connIndex].conn; + auto *conn = &peer->recv[connIndex]; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); if (flags & RolePostRecv) { @@ -343,9 +343,9 @@ class Primitives< } } - 
__device__ __forceinline__ void loadSendConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) { + __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) { if (flags & (RoleWaitSend|RolePostSend)) { - auto *conn = &peer->send[connIndex].conn; + auto *conn = &peer->send[connIndex]; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); if (flags & RolePostSend) { @@ -428,8 +428,8 @@ class Primitives< if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; - loadRecvConn(&ncclShmem.channel.devPeers[peer], connIndex, e); - loadSendConn(&ncclShmem.channel.devPeers[peer], connIndex, e); + loadRecvConn(&ncclShmem.channel.peers[peer], connIndex, e); + loadSendConn(&ncclShmem.channel.peers[peer], connIndex, e); setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e); } diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index 8dc867b..0927037 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -12,7 +12,7 @@ namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; @@ -22,7 +22,7 @@ namespace { const ssize_t loopSize = nChannels*chunkSize; const ssize_t size = args->count; const int rank = ncclShmem.comm.rank; - const int prevRank = ring->devUserRanks[nranks-1]; + const int prevRank = ring->userRanks[nranks-1]; const int root = args->root; Primitives, 0, Proto, 0> diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index 3f38b1a..754889a 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -12,11 +12,11 @@ namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->header.nWarps*WARP_SIZE; + const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; - int const *ringRanks = ring->devUserRanks; + int const *ringRanks = ring->userRanks; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1)); // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. 
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h index be0dbc5..feae653 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/collectives/device/sendrecv.h @@ -11,21 +11,23 @@ template struct RunWork { __device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { + void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); + size_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); if (args->peer == ncclShmem.comm.rank) { struct ncclWorkElemP2p* recvArgs = args-1; - if (args->buff != recvArgs->buff) { - ReduceOrCopyMulti(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count); + void* recvBuff = reinterpret_cast(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32); + if (buff != recvBuff) { + ReduceOrCopyMulti(tid, nthreads, nullptr, false, 1, (const T**)&buff, 1, (T**)&recvBuff, count); } } else { using Proto = ProtoSimple<1, 1>; - ssize_t const count = args->count; int const chunkSize = args->chunkSize/sizeof(T); int const peer = args->peer; Primitives, 1, Proto, 1> prims - (tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group); - ssize_t offset = 0; + (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group); + size_t offset = 0; do { - int nelem = min(chunkSize, count-offset); + int nelem = min(size_t(chunkSize), count-offset); prims.directSend(offset, offset, nelem); offset += nelem; } while(offset < count); @@ -35,14 +37,15 @@ struct RunWork { __device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { if (args->peer != ncclShmem.comm.rank) { using Proto = ProtoSimple<1, 1>; - ssize_t const count = args->count; + void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); + ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); int const chunkSize = args->chunkSize/sizeof(T); int const peer = args->peer; Primitives, 1, Proto, 1> prims - (tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group); - ssize_t offset = 0; + (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group); + size_t offset = 0; do { - int nelem = min(chunkSize, count-offset); + int nelem = min(size_t(chunkSize), count-offset); prims.directRecv(offset, nelem); offset += nelem; } while(offset < count); @@ -61,11 +64,11 @@ struct RunWork { #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE) int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS; args += group; - if (args->header.type == ncclWorkTypeUnused) return; - tid -= args->warpStart * WARP_SIZE; int nthreads = args->nWarps * WARP_SIZE; group |= 1<<16; // Used to select connIndex 1 + + if (args->p2pType == ncclWorkP2pTypeUnused) return; if (tid >= nthreads || args->peer == -1) return; if ((group%2) == 0) { runRecv(tid, nthreads, group, args); diff --git a/src/debug.cc b/src/debug.cc index 9060abb..1c184d0 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -8,30 +8,24 @@ #include "nccl_net.h" #include #include +#include int ncclDebugLevel = -1; +static int pid = -1; +static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; +char ncclLastError[1024] = ""; // Global string for the last error in human readable form uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT 
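// [Editorial sketch, not part of the patch] Further down in this file the patch changes
// ncclDebugLevel to a publication idiom: ncclDebugInit() fills in every other global
// first and only then stores the level with __ATOMIC_RELEASE, while ncclDebugLog()
// gates on an __ATOMIC_ACQUIRE load before touching those globals. A minimal standalone
// version of that lazy-init pattern (names here are hypothetical, not NCCL's):
#include <pthread.h>
static int gLevel = -1;                  // -1 means "not initialized yet"
static int gDependentState;              // state that must be visible before gLevel
static pthread_mutex_t gLock = PTHREAD_MUTEX_INITIALIZER;

static void lazyInit(void) {
  pthread_mutex_lock(&gLock);
  if (gLevel == -1) {                    // re-check under the lock
    gDependentState = 42;                // initialize dependent state first...
    __atomic_store_n(&gLevel, 3, __ATOMIC_RELEASE);   // ...then publish the level
  }
  pthread_mutex_unlock(&gLock);
}

static int getLevel(void) {
  if (__atomic_load_n(&gLevel, __ATOMIC_ACQUIRE) == -1) lazyInit();
  return gLevel;                         // dependent state is safely visible here too
}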
FILE *ncclDebugFile = stdout; pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; +std::chrono::steady_clock::time_point ncclEpoch; + +static __thread int tid = -1; void ncclDebugInit() { pthread_mutex_lock(&ncclDebugLock); if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } const char* nccl_debug = getenv("NCCL_DEBUG"); - if (nccl_debug == NULL) { - ncclDebugLevel = NCCL_LOG_NONE; - } else if (strcasecmp(nccl_debug, "VERSION") == 0) { - ncclDebugLevel = NCCL_LOG_VERSION; - } else if (strcasecmp(nccl_debug, "WARN") == 0) { - ncclDebugLevel = NCCL_LOG_WARN; - } else if (strcasecmp(nccl_debug, "INFO") == 0) { - ncclDebugLevel = NCCL_LOG_INFO; - } else if (strcasecmp(nccl_debug, "ABORT") == 0) { - ncclDebugLevel = NCCL_LOG_ABORT; - } else if (strcasecmp(nccl_debug, "TRACE") == 0) { - ncclDebugLevel = NCCL_LOG_TRACE; - } /* Parse the NCCL_DEBUG_SUBSYS env var * This can be a comma separated list such as INIT,COLL @@ -64,6 +58,8 @@ void ncclDebugInit() { mask = NCCL_ENV; } else if (strcasecmp(subsys, "ALLOC") == 0) { mask = NCCL_ALLOC; + } else if (strcasecmp(subsys, "CALL") == 0) { + mask = NCCL_CALL; } else if (strcasecmp(subsys, "ALL") == 0) { mask = NCCL_ALL; } @@ -75,6 +71,10 @@ void ncclDebugInit() { free(ncclDebugSubsys); } + // Cache pid and hostname + getHostName(hostname, 1024, '.'); + pid = getpid(); + /* Parse and expand the NCCL_DEBUG_FILE path and * then create the debug file. But don't bother unless the * NCCL_DEBUG level is > VERSION @@ -94,12 +94,10 @@ void ncclDebugInit() { *dfn++ = '%'; break; case 'h': // %h = hostname - char hostname[1024]; - getHostName(hostname, 1024, '.'); dfn += snprintf(dfn, PATH_MAX, "%s", hostname); break; case 'p': // %p = pid - dfn += snprintf(dfn, PATH_MAX, "%d", getpid()); + dfn += snprintf(dfn, PATH_MAX, "%d", pid); break; default: // Echo everything we don't understand *dfn++ = '%'; @@ -110,15 +108,30 @@ void ncclDebugInit() { *dfn = '\0'; if (debugFn[0] != '\0') { FILE *file = fopen(debugFn, "w"); - if (file != NULL) { + if (file != nullptr) { + setbuf(file, nullptr); // disable buffering ncclDebugFile = file; } } } -#ifdef ENABLE_TRACE - ncclEpoch = std::chrono::high_resolution_clock::now(); -#endif + int tempNcclDebugLevel = -1; + if (nccl_debug == NULL) { + tempNcclDebugLevel = NCCL_LOG_NONE; + } else if (strcasecmp(nccl_debug, "VERSION") == 0) { + tempNcclDebugLevel = NCCL_LOG_VERSION; + } else if (strcasecmp(nccl_debug, "WARN") == 0) { + tempNcclDebugLevel = NCCL_LOG_WARN; + } else if (strcasecmp(nccl_debug, "INFO") == 0) { + tempNcclDebugLevel = NCCL_LOG_INFO; + } else if (strcasecmp(nccl_debug, "ABORT") == 0) { + tempNcclDebugLevel = NCCL_LOG_ABORT; + } else if (strcasecmp(nccl_debug, "TRACE") == 0) { + tempNcclDebugLevel = NCCL_LOG_TRACE; + } + + ncclEpoch = std::chrono::steady_clock::now(); + __atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE); pthread_mutex_unlock(&ncclDebugLock); } @@ -127,45 +140,53 @@ void ncclDebugInit() { * they can share the debugging mechanisms and output files */ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) 
{ - if (ncclDebugLevel == -1) ncclDebugInit(); + if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit(); if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; } + + // Save the last error (WARN) as a human readable string + if (level == NCCL_LOG_WARN) { + pthread_mutex_lock(&ncclDebugLock); + va_list vargs; + va_start(vargs, fmt); + (void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs); + va_end(vargs); + pthread_mutex_unlock(&ncclDebugLock); + } if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return; - // Gather the rank information. This can take > 1us so we want to make sure - // we only do it when needed. - char hostname[1024]; - getHostName(hostname, 1024, '.'); + if (tid == -1) { + tid = syscall(SYS_gettid); + } + int cudaDev; - cudaGetDevice(&cudaDev); - int pid = getpid(); - int tid = syscall(SYS_gettid); + if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) { + cudaGetDevice(&cudaDev); + } char buffer[1024]; size_t len = 0; - pthread_mutex_lock(&ncclDebugLock); - if (level == NCCL_LOG_WARN) - len = snprintf(buffer, sizeof(buffer), - "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line); - else if (level == NCCL_LOG_INFO) - len = snprintf(buffer, sizeof(buffer), - "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev); -#ifdef ENABLE_TRACE - else if (level == NCCL_LOG_TRACE) { - auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; + if (level == NCCL_LOG_WARN) { + len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ", + hostname, pid, tid, cudaDev, filefunc, line); + } else if (level == NCCL_LOG_INFO) { + len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev); + } else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) { + len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid); + } else if (level == NCCL_LOG_TRACE) { + auto delta = std::chrono::steady_clock::now() - ncclEpoch; double timestamp = std::chrono::duration_cast>(delta).count()*1000; - len = snprintf(buffer, sizeof(buffer), - "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, pid, tid, cudaDev, timestamp, filefunc, line); + len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", + hostname, pid, tid, cudaDev, timestamp, filefunc, line); } -#endif + if (len) { va_list vargs; va_start(vargs, fmt); - (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); + len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); va_end(vargs); - fprintf(ncclDebugFile,"%s\n", buffer); - fflush(ncclDebugFile); + buffer[len++] = '\n'; + fwrite(buffer, 1, len, ncclDebugFile); } - pthread_mutex_unlock(&ncclDebugLock); } NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); diff --git a/src/enqueue.cc b/src/enqueue.cc index 349cb2b..d3fbbe5 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -10,14 +10,18 @@ #include "gdrwrap.h" #include "bootstrap.h" #include "channel.h" +#include "cudawrap.h" #include // std::memcpy +#include // PRIx64 + +static void* const ncclKernelGeneric = (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t); // Only generate inline kernels for LL #define NCCL_FUNC5(func, algo, devredop, dtype) \ - (void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \ - (void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \ - (void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype) + /*LL */(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \ + /*LL128 */nullptr 
/*(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype)*/, \ + /*SIMPLE*/nullptr /*(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype)*/ #define NCCL_FUNC4(func, devredop, type) \ (void*)NCCL_FUNC5(func, TREE, devredop, type), \ @@ -111,6 +115,8 @@ static void* const ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps* NCCL_FUNCS2A(AllReduce) }; +static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */); + // Determine the maximum kernel stack size of all CUDA kernels size_t ncclKernMaxLocalSize() { ncclResult_t res = ncclSuccess; @@ -118,8 +124,10 @@ size_t ncclKernMaxLocalSize() { cudaFuncAttributes attr = {0}; size_t max = 0; for (int i = 0; i < numNcclKerns; i++) { - CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i]), res, error); - if (attr.localSizeBytes > max) max = attr.localSizeBytes; + if (ncclKerns[i] != nullptr) { + CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i]), res, error); + if (attr.localSizeBytes > max) max = attr.localSizeBytes; + } } error: @@ -143,267 +151,906 @@ error: /* Launch system : synchronization and CUDA kernel launch */ /*****************************************************************************/ -ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) { -#if CUDART_VERSION >= 9000 - if (cgMode & 0x01) { - CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices, - // These flags are to reduce the latency of using this API - cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync)); - return ncclSuccess; +static void appendWorkElemColl( + struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, + int funcIndex, struct ncclWorkElem const *elem, int bid + ) { + struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; + struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); + if (q && funcIndex == q->work.header.funcIndex + && elem->nWarps == q->work.elems[0].nWarps + && chan->nWorkElem < NCCL_MAX_WORK_ELEMENTS) { + int e = chan->nWorkElem++; + q->work.elems[e] = *elem; // C++ struct assignment + q->work.elems[e].bid = bid; + q->work.elems[e].isUsed = 1; + return; } -#endif - int savedDev; - CUDACHECK(cudaGetDevice(&savedDev)); - for (int i = 0; i < numDevices; i++) { - struct cudaLaunchParams* params = paramsList+i; - CUDACHECK(cudaSetDevice(cudaDevs[i])); - CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); + q = ncclMemoryStackAlloc(&comm->memScoped); + q->work.header.type = ncclWorkTypeColl; + q->work.header.funcIndex = funcIndex; + q->work.elems[0] = *elem; // C++ struct assignment + q->work.elems[0].bid = bid; + q->work.elems[0].isUsed = 1; + chan->nWorkElem = 1; + chan->nWork += 1; + ncclIntruQueueEnqueue(&chan->workQueue, q); +} + +static void appendWorkElemColl( + struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, + int funcIndex, struct ncclWorkElemReg const *elem, int bid + ) { + struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; + struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); + if (q && funcIndex == q->work.header.funcIndex + && elem->elem.nWarps == q->work.regElems[0].elem.nWarps + && chan->nWorkElem < NCCL_MAX_WORK_ELEMENTS_REG) { + int e = chan->nWorkElem++; + q->work.regElems[e] = *elem; // C++ struct assignment + q->work.regElems[e].elem.bid = bid; + 
q->work.regElems[e].elem.isUsed = 1; + return; + } + q = ncclMemoryStackAlloc(&comm->memScoped); + q->work.header.type = ncclWorkTypeRegColl; + q->work.header.funcIndex = funcIndex; + q->work.regElems[0] = *elem; // C++ struct assignment + q->work.regElems[0].elem.bid = bid; + q->work.regElems[0].elem.isUsed = 1; + chan->nWorkElem = 1; + chan->nWork += 1; + ncclIntruQueueEnqueue(&chan->workQueue, q); +} + +static void finishWorkP2p(struct ncclWork* work) { + int nElem = 0; + for (int e=0; e < NCCL_MAX_WORK_ELEMENTS_P2P; e++) { + if (work->p2pElems[e].p2pType != ncclWorkP2pTypeUnused) + nElem = e+1; + } + int nGroup = 1; + while (nGroup < nElem) nGroup *= 2; + int nWarp = 1; + while (nWarp*nGroup <= (NCCL_MAX_NTHREADS/WARP_SIZE)/2) nWarp *= 2; + for (int i=0; i < nGroup; i++) { + work->p2pElems[i].ngroups = nGroup; + work->p2pElems[i].warpStart = i*(NCCL_MAX_NTHREADS/WARP_SIZE)/nGroup; + int extraWarp = nWarp >= 2 ? i%2 : 0; + work->p2pElems[i].nWarps = nWarp + extraWarp; + } +} + +static void finishWork(struct ncclWork* work) { + if (work->header.type == ncclWorkTypeP2p) { + finishWorkP2p(work); + } +} + +static void appendWorkElemP2p( + struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, + struct ncclWorkElemP2p const *elem + ) { + constexpr int funcIndex = FUNC_INDEX_P2P; + struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; + struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); + if (q && funcIndex == q->work.header.funcIndex) { + if (chan->p2pTailElem[elem->p2pType-1] < NCCL_MAX_WORK_ELEMENTS_P2P) { + for (int e = -2 + chan->p2pTailElem[elem->p2pType-1]; e >= 0; e -= 2) { + // Can't have multiple elements of the same ncclWork communicate with the + // same peer otherwise they would attempt to use that connection concurrently. + if (q->work.p2pElems[e].peer == elem->peer) + goto NewWork; + } + int e = chan->p2pTailElem[elem->p2pType-1]; + q->work.p2pElems[e] = *elem; // C++ struct assignment + chan->p2pTailElem[elem->p2pType-1] += 2; + return; + } + NewWork: + finishWorkP2p(&q->work); + } + q = ncclMemoryStackAlloc(&comm->memScoped); + q->work.header.type = ncclWorkTypeP2p; + q->work.header.funcIndex = FUNC_INDEX_P2P; + chan->p2pTailElem[ncclWorkP2pTypeRecv-1] = 0; + chan->p2pTailElem[ncclWorkP2pTypeSend-1] = 1; + q->work.p2pElems[chan->p2pTailElem[elem->p2pType-1]] = *elem; // C++ struct assignment + chan->p2pTailElem[elem->p2pType-1] += 2; + chan->nWork += 1; + ncclIntruQueueEnqueue(&chan->workQueue, q); +} + +static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) { + bool needed = true; + NCCLCHECK(ncclProxySaveOp(comm, op, &needed)); + if (needed) { + struct ncclProxyOp* q = ncclMemoryPoolAlloc(&comm->memPool_ncclProxyOp, &comm->memPermanent); + *q = *op; // C++ struct assignment + ncclIntruQueueEnqueue(&plan->channels[op->channelId].proxyOpQueue, q); } - CUDACHECK(cudaSetDevice(savedDev)); return ncclSuccess; } -static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** work, struct ncclWorkElem* base) { - if (channel->workCount == NCCL_MAX_OPS) { - WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS); - return ncclInvalidUsage; +// Put coll workelem & proxyOp in plan assuming nWorkBudget permits, so please +// ensure *nWorkBudget >= nBids upon entry. 
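// [Editorial sketch, not part of the patch] addCollToPlan() below charges the budget by
// the *delta* in the channel's ncclWork count: it adds chans[c].nWork before appending
// and subtracts the (possibly incremented) value afterwards, so merging into an existing
// ncclWork costs 0 and opening a new one costs 1. A toy version of that accounting:
static inline void chargeBudgetByDelta(int* budget, int* nWork, bool opensNewWork) {
  *budget += *nWork;                 // remember the old count
  if (opensNewWork) *nWork += 1;     // append may or may not grow the per-channel list
  *budget -= *nWork;                 // net effect: budget -= (new - old), i.e. 0 or 1
}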
+static ncclResult_t addCollToPlan( + struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex, + struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp, + int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[] + ) { + struct ncclKernelPlan::Channel *chans = plan->channels; + int nCollChannels = comm->nChannels; + + // Choose the `nBid` least loaded channels to do the work. This ensures + // all bids go to different channels in case they need to synchronize. + int least[/*nBid*/MAXCHANNELS]; + least[0] = 0; + int maxIndexInLeast = 0; + size_t maxBytesInLeast = chans[0].collBytes; + // Initialize least[] such that the first nBid channels are accounted for. + for (int b=1; b < nBid; b++) { + least[b] = b; + if (maxBytesInLeast < chans[b].collBytes) { + maxIndexInLeast = b; + maxBytesInLeast = chans[b].collBytes; + } } - int opIndex = channel->workFifoTail%NCCL_MAX_OPS; - struct ncclWork* w = channel->workFifo+opIndex; - volatile uint8_t* typePtr = (volatile uint8_t*)&w->header.type; - while (typePtr[0] != ncclWorkTypeUnused) sched_yield(); - memset(w, 0, sizeof(struct ncclWork)); - // Initialize with work elem if provided - if (base) memcpy(w->elems, base, sizeof(struct ncclWorkElem)); - channel->workFifoTail++; - channel->workCount++; - if (work) *work = w; - return ncclSuccess; -} - -// Finalize channel work FIFO states before launch -// Called during dynamic enqueue -static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph) { - ncclComm_t comm = eqInfo->comm; - // Do not use comm->myParams in this function unless in non-graph mode - // In graph mode, enqueue is async to capture, myParams can have been changed - struct cudaLaunchParams* params = comm->myParams; - - // Only launch blocks where we have work to do. - // This is not supported when we are in cudaGraph mode. - // Because in cudaGraph mode the launch param needs to be determined - // at capture time instead of launch time. - if (!usingCudaGraph) { - int nChannels = std::max(comm->nChannels, comm->p2pnChannels); - for (int c=0; cchannels[c].workCount) params->gridDim.x = c+1; - } - eqInfo->maxChannels = params->gridDim.x; - } - - // Set isLast = 1 for the last operation and add a no-op on empty channels (p2p case). - for (int c=0; cmaxChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - if (channel->workCount == 0) { - struct ncclWork* w; - NCCLCHECK(getNextOp(channel, &w, NULL)); - w->header.funcIndex = FUNC_INDEX_P2P; - w->header.type = ncclWorkTypeP2p; - w->header.nWarps = 0; - } - channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].header.isLast = 1; - - if (c == 0) { - // As we inline the first coll directly, we can free it immediately. 
- // Except P2P or aggregation or registration cases - struct ncclWork* work = channel->workFifo+((channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS); - if (work->header.type == ncclWorkTypeColl && eqInfo->elemList->count() == 1) - work->header.type = ncclWorkTypeUnused; - } - - if (channel->gdrMemDesc) { - // GDRCOPY support - uint64_t first = (channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS; - uint64_t nelems = channel->workCount; - TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld nelems %zi", - channel->workFifo, channel->workFifoGdr, first, nelems); - - for (int i = 0; i < nelems; i++) { - int elem = (first+i) % NCCL_MAX_OPS; - // Copy Host workFifo to CUDA workFifo via the GDRCOPY mapping - NCCLCHECK(ncclGdrCudaCopy(channel->gdrMemDesc, channel->workFifoGdr+elem, channel->workFifo+elem, 1)); + // Sort in the rest of the channels. If a channel has less work than the max + // member of least[], replace that member and compute the new max. The optimal + // algorithm uses a max-heap, but for our small sizes I suspect the better + // asymptotic complexity would be swamped by the increased instruction complexity. + for (int c=nBid; c < nCollChannels; c++) { + if (chans[c].collBytes < maxBytesInLeast) { + least[maxIndexInLeast] = c; + maxBytesInLeast = chans[least[0]].collBytes; + maxIndexInLeast = 0; + for (int b=1; b < nBid; b++) { + if (maxBytesInLeast < chans[least[b]].collBytes) { + maxIndexInLeast = b; + maxBytesInLeast = chans[least[b]].collBytes; + } } } } - return ncclSuccess; -} + uint64_t opCount = uint64_t(plan->collOpCount++)<<1 | 0; + bytes /= nBid; + for (int bid=0; bid < nBid; bid++) { + int c = least[bid]; + chans[c].collBytes += bytes; -ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) { - volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - int val = *ptr; - bool done = false; - while (done == false) { - if (val >= comm->intraRanks) { - WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS); - return ncclInvalidUsage; + // Add work elem + *nWorkBudget += chans[c].nWork; + if (!regBufUsed) { + appendWorkElemColl(comm, plan, c, funcIndex, workElem, bid); + } else { + // Buffer registration in play which could only for CollNet at the moment. + struct ncclChannel* channel = &comm->channels[c]; + struct ncclWorkElemReg workElemReg; + workElemReg.elem = *workElem; // C++ struct assignment + workElemReg.elem.regUsed = 1; + for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { + int peer = channel->collTree.down[i]; + if (peer == -1) break; + int j = comm->rankToLocalRank[peer]; // Get intra-node slot + workElemReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer + workElemReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer + } + for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { + int peer = channel->collTree.up[i]; + if (peer == -1) break; + int j = comm->rankToLocalRank[peer]; + // Output buffer of root peer + workElemReg.upOutputs[i] = regBufRecv[j]; + } + appendWorkElemColl(comm, plan, c, funcIndex, &workElemReg, bid); } - if (val+1 == comm->intraRanks) { - // Reset the barrier. 
- comm->intraBarrier[comm->intraPhase^1] = 0; - *isLast = 1; - return ncclSuccess; - } - done = __sync_bool_compare_and_swap(ptr, val, val+1); - val++; - } - *isLast = 0; - return ncclSuccess; -} + *nWorkBudget -= chans[c].nWork; // subtract delta of chans[c].nWork -ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) { - volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - int val = *ptr; - if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) { - WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS); - return ncclInternalError; + // Add proxy task. Empty collectives do not make it to the proxy thread + // since they don't imply synchronization for the user like p2p. + if (proxyOp->nsteps != 0) { + struct ncclProxyOp tmp = *proxyOp; // C++ struct assignment + tmp.channelId = c; + tmp.opCount = opCount; + NCCLCHECK(addProxyOpIfNeeded(comm, plan, &tmp)); + } } return ncclSuccess; } -ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { - volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - while (*ptr < comm->intraRanks) pthread_yield(); - comm->intraPhase ^= 1; +// Put p2p op in plan assuming there is space in nWorkBudget, so you must +// ensure *nWorkBudget >= 1 upon entry. +static ncclResult_t addP2pToPlan( + struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, + bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes + ) { + struct ncclInfo info = { + isSendNotRecv ? ncclFuncSend : ncclFuncRecv, + isSendNotRecv ? "Send" : "Recv", + nullptr, addr, bytes, ncclInt8, ncclSum, peer, comm, (cudaStream_t)0, + /*Args*/1, 1 + }; + + int channelId; + NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, info.coll, &channelId)); + info.channelId = channelId; + + struct ncclProxyOp proxyOp = {}; + NCCLCHECK(ncclProxyComputeP2p(&info, &proxyOp)); + + struct ncclWorkElemP2p elem = {0}; + elem.peer = peer; + elem.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE; + elem.p2pType = isSendNotRecv ? ncclWorkP2pTypeSend : ncclWorkP2pTypeRecv; + elem.buffLo32 = uint32_t(reinterpret_cast(addr)); + elem.buffHi32 = reinterpret_cast(addr)>>32; + elem.countLo32 = uint32_t(bytes); + elem.countHi32 = bytes>>32; + elem.chunkSize = info.chunkSize; // computed by ncclProxyComputeP2p + + *nWorkBudget += plan->channels[channelId].nWork; + appendWorkElemP2p(comm, plan, channelId, &elem); + *nWorkBudget -= plan->channels[channelId].nWork; + + // Calculate the opCount after appendWorkElemP2p since it will always return + // with channel->nWork equal to one plus the work index this p2p settled in. 
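// [Editorial sketch, not part of the patch] Note the opCount tagging visible in this
// file: collectives set opCount = (plan->collOpCount << 1 | 0) in addCollToPlan(), while
// the p2p path below uses (channel nWork << 1 | 1), so the low bit marks p2p vs.
// collective and the two sequence spaces cannot collide. Hypothetical helpers showing
// the encoding:
#include <cstdint>
static inline uint64_t makeOpCount(uint64_t seq, bool isP2p) { return (seq << 1) | (isP2p ? 1u : 0u); }
static inline bool     opCountIsP2p(uint64_t opCount)        { return (opCount & 1) != 0; }
static inline uint64_t opCountSeq(uint64_t opCount)          { return opCount >> 1; }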
+ proxyOp.opCount = uint64_t(plan->channels[channelId].nWork)<<1 | 1; + NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); return ncclSuccess; } -// Check dependency wrt outside streams or previous launches -// Launch kernel in GROUP mode -ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) { - struct cudaLaunchParams* params = comm->myParams; - if (params->gridDim.x == 0) return ncclSuccess; +static void finishPlan(struct ncclKernelPlan* plan) { + int channelUbound = 0; + int channelCount = 0; + uint64_t channelMask = 0; + bool hasProxyOps = false; + for (int c=0; c < MAXCHANNELS; c++) { + struct ncclWorkList* tail = ncclIntruQueueTail(&plan->channels[c].workQueue); + if (tail != nullptr) { + channelUbound = c+1; + channelCount += 1; + channelMask |= 1ull<work.header.isLast = 1; + finishWork(&tail->work); + } + hasProxyOps |= !ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue); + } + plan->channelUbound = channelUbound; + plan->channelCount = channelCount; + plan->channelMask = channelMask; + plan->hasProxyOps = hasProxyOps; + if (plan->kernelFn == nullptr) + plan->kernelFn = ncclKernelGeneric; + plan->threadPerBlock = std::max(plan->threadPerBlock, 3*WARP_SIZE); +} - // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL - if (comm->launchMode == ncclComm::GROUP && - (comm->groupCudaStream || - comm->userStream == cudaStreamDefault || - comm->userStream == cudaStreamLegacy || - comm->userStream == cudaStreamPerThread)) { - // Enqueue event in user stream - CUDACHECK(cudaEventRecord(comm->intDoneEvent, comm->userStream)); - // Create dependency between user stream and internal NCCL stream - CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->intDoneEvent, 0)); - params->stream = comm->groupStream; +static ncclResult_t registerIntraNodeBuffers( + struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclInfo* info, + bool* outRegBufUsed, + void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], + void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS] + ) { + *outRegBufUsed = false; + ncclResult_t result = ncclSuccess; + +#if CUDART_VERSION >= 11030 + int localRank = comm->localRank; + + if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess; + + struct HandlePair { + cudaIpcMemHandle_t ipc[2]; // {send, recv} + size_t offset[2]; // {send, recv} + }; + struct HandlePair handles[NCCL_MAX_LOCAL_RANKS]; + + CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback); + CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback); + + void *baseSend, *baseRecv; + size_t size; + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff)); + handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend; + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff)); + handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv; + + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair))); + + // Open handles locally + for (int i=0; i < comm->localRanks; i++) { + if (i == localRank) { // Skip self + outRegBufSend[i] = nullptr; + outRegBufRecv[i] = nullptr; + } else { + for (int sr=0; sr < 2; sr++) { + // Get base address of mapping + void* base; + CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess)); + // Get real buffer address by adding offset in the mapping + (sr==0 
? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr]; + // Enqueue reminder to close memory handle + struct ncclPointerList* q = ncclMemoryPoolAlloc(&comm->memPool_ncclPointerList, &comm->memPermanent); + q->ptr = base; + ncclIntruQueueEnqueue(&plan->ipcMemQueue, q); + } + } + } + *outRegBufUsed = true; + +fallback: +#endif + return result; +} + +NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 0); + +static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport); +static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps); + +static ncclResult_t scheduleCollTasksToPlan( + struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget + ) { + struct ncclTasks* tasks = &comm->tasks; + + size_t bytePerChannel[/*collNetSupport*/2]; + if (comm->channelSize > 0) { + // Set by user + bytePerChannel[/*collNetSupport=*/0] = comm->channelSize; + bytePerChannel[/*collNetSupport=*/1] = comm->channelSize; } else { - if (comm->userStream != params->stream && !comm->usingCudaGraph) { - // Stream changed from last call, create dependency against last NCCL kernel launch - CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); - } - params->stream = comm->userStream; + // Latency increases as scale increases + // We would thus want to increase the chunk size to compensate for the lost efficiency + bytePerChannel[/*collNetSupport=*/0] = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks); + bytePerChannel[/*collNetSupport=*/1] = 256<<10; // Hand-tuned } - if (comm->launchMode == ncclComm::GROUP) { - int isLast = 0; - NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); - if (isLast) { - // I'm the last. Launch all operations. - NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode)); - NCCLCHECK(ncclCpuBarrierLast(comm)); + for (int collNetSupport=0; collNetSupport < 2; collNetSupport++) { + while (tasks->collBytesTotal < bytePerChannel[collNetSupport]*comm->nChannels && + bytePerChannel[collNetSupport] > NCCL_MIN_CHANNEL_SIZE) { + // Reduce per-channel size so we utilize all channels. + bytePerChannel[collNetSupport] /= 2; + } + } + + while (tasks->nTasksColl != 0) { + struct ncclTaskColl* head = ncclIntruQueueHead(&tasks->collQueue); + struct ncclInfo aggInfo = {}; + aggInfo.comm = comm; + aggInfo.coll = head->func; + aggInfo.datatype = head->datatype; + aggInfo.opFull = head->op; + aggInfo.op = (ncclRedOp_t)(int)head->op.op; + aggInfo.count = head->count; + int nAggChannels = 0; + int nAggOps = 1; + struct ncclTaskColl* aggEnd = head->next; + int collNetSupport = 0; + NCCLCHECK(getCollNetSupport(&aggInfo, &collNetSupport)); + + // Find a range of ops that can be aggregated together. 
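// [Editorial sketch, not part of the patch] The loop below walks the task queue and keeps
// aggregating while the function, datatype and reduction op all match, sizing each op's
// channel count from its byte count. A standalone version of that per-op channel sizing
// (DIVUP by bytePerChannel, clamped to [1, nChannels]); the example values are made up:
#include <algorithm>
#include <cstddef>
static inline int channelsForOp(size_t bytes, size_t bytePerChannel, int nChannels) {
  int nc = (int)((bytes + bytePerChannel - 1) / bytePerChannel);  // DIVUP
  return std::max(1, std::min(nc, nChannels));
}
// e.g. with bytePerChannel = 512 KiB and nChannels = 16, a 3 MiB collective gets
// channelsForOp(3<<20, 512<<10, 16) == 6 channels.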
+ while (aggEnd != nullptr && + aggEnd->func == aggInfo.coll && + aggEnd->datatype == aggInfo.datatype && + aggEnd->op.op == aggInfo.opFull.op) { + aggInfo.count += aggEnd->count; + int nc = DIVUP(aggEnd->count*ncclTypeSize(aggInfo.datatype), bytePerChannel[collNetSupport]); + nc = std::max(1, std::min(nc, comm->nChannels)); + nAggChannels += nc; + nAggOps++; + aggEnd = aggEnd->next; + } + + if (nAggOps > 1) { + NCCLCHECK(ncclInfoSetDerived(&aggInfo, comm->nRanks)); + aggInfo.nChannels = std::min(comm->nChannels, nAggChannels); + int opPerChannel = DIVUP(nAggChannels, aggInfo.nChannels); + NCCLCHECK(getAlgoInfo(&aggInfo, collNetSupport, opPerChannel)); + } + + while (head != aggEnd) { + struct ncclInfo info = {}; + info.comm = comm; + info.coll = head->func; + info.sendbuff = head->sendbuff; + info.recvbuff = head->recvbuff; + info.count = head->count; + info.root = head->root; + info.datatype = head->datatype; + info.opFull = head->op; // C++ struct assignment + info.op = (ncclRedOp_t)(int)head->op.op; + info.chunkSteps = head->chunkSteps; + info.sliceSteps = head->sliceSteps; + NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks)); + if (nAggOps > 1) { + info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]); + info.nChannels = std::max(1, std::min(info.nChannels, comm->nChannels)); + info.algorithm = aggInfo.algorithm; + info.protocol = aggInfo.protocol; + info.nThreads = aggInfo.nThreads; + } + + int workFuncIndex; + struct ncclWorkElem workElem = {}; + struct ncclProxyOp proxyOp = {}; + NCCLCHECK(computeColl(&info, &workFuncIndex, &workElem, &proxyOp)); + + if (*nWorkBudget < info.nChannels) return ncclSuccess; // Ensure room for addCollToPlan() + + bool regBufUsed = false; + void* regBufSend[NCCL_MAX_LOCAL_RANKS]; + void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; + if (plan->persistent && ncclParamGraphRegister() && + info.algorithm == NCCL_ALGO_COLLNET && // limited to CollNet for now + comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other + comm->intraRanks < comm->localRanks) { // only with inter-process & intra-node peers + NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, ®BufUsed, regBufSend, regBufRecv)); + } + + NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp, + info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv)); + tasks->nTasksColl -= 1; + tasks->collBytesTotal -= info.nBytes; + ncclIntruQueueDequeue(&tasks->collQueue); + head = ncclIntruQueueHead(&tasks->collQueue); + + plan->threadPerBlock = std::max(plan->threadPerBlock, info.nThreads); + if (ncclKerns[workFuncIndex] != nullptr) + plan->kernelFn = ncclKerns[workFuncIndex]; } } return ncclSuccess; } -// Launch kernel in PARALLEL mode -ncclResult_t ncclLaunchKernel(ncclComm_t comm) { - struct cudaLaunchParams *params = comm->myParams; - if (params->gridDim.x == 0) return ncclSuccess; - - // We can't print the CG mode before the first barrier happened. - if (comm->rank == 0 && *comm->intraCGMode & 0x10) { - *comm->intraCGMode ^= 0x10; - INFO(NCCL_INIT,"Launch mode %s%s%s", - comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel", - *comm->intraCGMode ? "/CGMD" : "", - (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? 
"/Stream" : ""); +static size_t calcP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) { + size_t size = std::max(minSize, divUp(totalSize, minChannels)); + int nChannels = minChannels; + while (size > maxSize && nChannels <= maxChannels/2) { + nChannels *= 2; + size = divUp(totalSize, nChannels); } + return alignUp(size, minSize); +} - if (comm->launchMode == ncclComm::GROUP) { - NCCLCHECK(ncclCpuBarrierOut(comm)); +static ncclResult_t scheduleP2pTasksToPlan( + struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget + ) { + struct ncclTasks* tasks = &comm->tasks; + int nRanks = comm->nRanks; + struct ncclTasks::Peer* peers = tasks->peers; + int const *sendOrder = tasks->p2pSendOrder; + int const *recvOrder = tasks->p2pRecvOrder; + + plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS); + + // Compute how much to split operations + // Natural step size matching buffer steps. + ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; + if (comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR; + // Try to use all channels + int nChannelsMax = comm->p2pnChannelsPerPeer; + int nChannelsMin = nChannelsMax; + // Try to use all channels, but one channel per operation. + while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2; + // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth. + while (nChannelsMax*nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2; + + while (tasks->nTasksP2p != 0) { + for (int i=0; i < nRanks; i++) { + int sendPeer = sendOrder[i]; + int recvPeer = recvOrder[i]; + struct ncclTaskP2p* send = ncclIntruQueueHead(&peers[sendPeer].sendQueue); + struct ncclTaskP2p* recv = ncclIntruQueueHead(&peers[recvPeer].recvQueue); + if (sendPeer == comm->rank) { + if (recvPeer != comm->rank) { + WARN("Sendrecv plan not aligned for self"); + return ncclInternalError; + } + if (send && recv == nullptr) { + WARN("Trying to send to self without a matching recv"); + return ncclInvalidUsage; + } + if (send == nullptr && recv) { + WARN("Trying to recv to self without a matching send"); + return ncclInvalidUsage; + } + } + if (send != nullptr || recv != nullptr) { + char* recvPtr = recv ? (char*)recv->buff : nullptr; + char* sendPtr = send ? (char*)send->buff : nullptr; + ssize_t recvBytes = recv ? recv->bytes : 0; + ssize_t sendBytes = send ? send->bytes : 0; + ssize_t minSize = stepSize/8; + ssize_t maxSize = comm->nNodes > 1 ? stepSize : stepSize*32; + ssize_t recvChunkBytesMax = calcP2pChunkSize(recvBytes, nChannelsMin, nChannelsMax, minSize, maxSize); + ssize_t sendChunkBytesMax = calcP2pChunkSize(sendBytes, nChannelsMin, nChannelsMax, minSize, maxSize); + // Zero size send/recv are syncs, encode here with -1. + recvBytes = recv && recvBytes == 0 ? -1 : recvBytes; + sendBytes = send && sendBytes == 0 ? -1 : sendBytes; + // Advance to current chunk. Syncs will always have chunk=0 so no effect on the -1. 
+ if (recv) recvPtr += recv->chunk*recvChunkBytesMax; + if (recv) recvBytes -= recv->chunk*recvChunkBytesMax; + if (send) sendPtr += send->chunk*sendChunkBytesMax; + if (send) sendBytes -= send->chunk*sendChunkBytesMax; + + do { + ssize_t recvChunkBytes = std::min(recvBytes, recvChunkBytesMax); // -1 preserved + ssize_t sendChunkBytes = std::min(sendBytes, sendChunkBytesMax); + if (recvChunkBytes != 0) { + if (recvChunkBytes == -1) recvChunkBytes = 0; + if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget + NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes)); + recvPtr += recvChunkBytes; + recvBytes -= recvChunkBytes; + recv->chunk += 1; + if (recvBytes <= 0) { + recvBytes = 0; // in case still -1 + ncclIntruQueueDequeue(&peers[recvPeer].recvQueue); + tasks->nTasksP2p -= 1; + } + } + if (sendChunkBytes != 0) { + if (sendChunkBytes == -1) sendChunkBytes = 0; + if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget + NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes)); + sendPtr += sendChunkBytes; + sendBytes -= sendChunkBytes; + send->chunk += 1; + if (sendBytes <= 0) { + sendBytes = 0; // in case still -1 + ncclIntruQueueDequeue(&peers[sendPeer].sendQueue); + tasks->nTasksP2p -= 1; + } + } + } while (sendBytes != 0 || recvBytes != 0); + } + } + } + return ncclSuccess; +} + +// Comparison of monotonic rolling counters. +static inline bool rollingLess32(uint32_t a, uint32_t b) { + constexpr uint32_t PositiveMax = uint32_t(-1)>>1; + return a-b > PositiveMax; +} +static inline uint32_t rollingMin32(uint32_t a, uint32_t b) { + constexpr uint32_t PositiveMax = uint32_t(-1)>>1; + return (b-a <= PositiveMax) ? a : b; +} + +// Spin until its safe to increase comm->workFifoSent to desiredSent. +static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredSent) { + if (__builtin_expect(rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent), false)) { + uint64_t t0 = clockNano(); + while (1) { + // We have to poll for notifications from device. + uint32_t* doneLive = comm->workFifoDone; + uint32_t ackd[MAXCHANNELS]; + for (int c=0; c < MAXCHANNELS; c++) { + ackd[c] = __atomic_load_n(&doneLive[c], __ATOMIC_RELAXED); + } + // Compiler-only fence to prevent fusion of loops to encourage dense loads. + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + uint32_t ackdAll = comm->workFifoSent; + for (int c=0; c < MAXCHANNELS; c++) { + // ackdAll is min over all non-quiesced channels + if (ackd[c] != comm->channels[c].workFifoSent) + ackdAll = rollingMin32(ackdAll, ackd[c]); + } + + // Compiler only fence to prevent fusion of loops to encourage dense stores. + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + for (int c=0; c < MAXCHANNELS; c++) { + // Advance counter on quiesced channels so they don't lag behind + // too far where they could get lost in 32-bit wraparound. + if (ackd[c] == comm->channels[c].workFifoSent) { + comm->channels[c].workFifoSent = ackdAll; + __atomic_store_n(&doneLive[c], ackdAll, __ATOMIC_RELAXED); + } + } + comm->workFifoAckdMin = ackdAll; + + // See if that was enough. + if (!rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent)) break; + // Nope. Maintain vigorous spin for first 5us, then start yielding. 
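+      // The brief busy-wait keeps the common case (device drains the fifo quickly)
+      // low latency; yielding afterwards avoids monopolizing a CPU core while waiting.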
+ if (clockNano()-t0 >= 5*1000) sched_yield(); + } + } +} + +static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) { + bool persistent = plan->persistent; + int channelUbound = plan->channelUbound; + int nWork = 0; + for (int c=0; c < channelUbound; c++) nWork += plan->channels[c].nWork; + + struct ncclWork* workHeap; + if (!persistent) { + workHeap = comm->workFifoHeap; } else { - CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); + workHeap = ncclMemoryStackAlloc(&comm->memScoped, nWork); + } + uint32_t ixMask = persistent ? ~uint32_t(0) : comm->workFifoDepth-1; + uint32_t ixSent; + if (persistent) { + ixSent = 0; + } else { + ixSent = comm->workFifoSent; + // First work for a channel has to be at workHeap+blockIdx.x which means + // we cannot tolerate fifo wraparound. So round up to the wrap boundary + // if not doing so would incur crossing it. + if (((ixSent + plan->channelCount-1) & ixMask) < (ixSent & ixMask)) { + ixSent = (ixSent + ixMask) & ~ixMask; + // Need to update workFifoSent so waitWorkFifoAvailable() knows we've + // skipped those elements. Consider if all the channels report quiesced, + // this way the skipped slots will be considered consumed as well. + comm->workFifoSent = ixSent; + } + waitWorkFifoAvailable(comm, ixSent + nWork); + } + uint32_t ixHead = ixSent; + ixSent += plan->channelCount; + int channelsWithWork = 0; // number of channels below `c` with work structs. + for (int c=0; c < channelUbound; c++) { + struct ncclWorkList* q = ncclIntruQueueHead(&plan->channels[c].workQueue); + // Offset of first work equals number of channels below with work. + uint32_t ix = ixHead + channelsWithWork; + channelsWithWork += q != nullptr ? 1 : 0; + while (q != nullptr) { + if (q->next != nullptr) { + q->work.header.workNext = int32_t(ixSent & ixMask) - int32_t(ixHead & ixMask); + } else { + q->work.header.inFifo = !persistent ? 1 : 0; + // Tell channel to ack us back ix+1 indicating that all slots up to and + // including ix have been consumed. + q->work.header.doneAcks = ix+1; + comm->channels[c].workFifoSent = ix+1; + } + workHeap[ix & ixMask] = q->work; // C++ struct assignment + q = q->next; + if (q != nullptr) ix = ixSent++; + } } + if (!persistent) { + comm->workFifoSent = ixSent; + if (comm->workFifoHeapGdrHandle != nullptr) wc_store_fence(); + plan->workHead = &comm->devWorkFifoHeap[ixHead & ixMask]; + } else { + NCCLCHECK(ncclCudaMalloc(&plan->workHead, nWork)); + NCCLCHECK(ncclCudaMemcpy(plan->workHead, workHeap, nWork)); + } return ncclSuccess; } -// Launch network proxy -static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) { - // Start the network proxies as soon as the kernel has been launched. We can't - // perform any CUDA call between the two or having a cudaFree between the CUDA - // launch and the ncclProxyStart call could cause a deadlock. - // Also, starting the proxies after the CUDA launch seems to be better for - // performance (latency). - ncclComm_t comm = eqInfo->comm; - if (eqInfo->maxChannels == 0) return ncclSuccess; - - for (int r=0; rmaxChannels; r++) { - struct ncclChannel* channel = comm->channels+r; - channel->workCount = 0; - channel->totalSize = 0; +static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* plan) { + uint64_t collOpCount = comm->collOpCount; + // Advance comm's collOpCount by number of colls in this plan. 
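+  // collOpCount is the communicator-wide running total; the plan's proxy ops carry
+  // zero-based offsets and are rebased onto this total in the loop below.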
+ comm->collOpCount = collOpCount + plan->collOpCount; + for (int c=0; c < plan->channelUbound; c++) { + struct ncclProxyOp* q = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue); + uint64_t p2pOpCount = comm->channels[c].p2pOpCount; + uint64_t nextP2pOpCount = p2pOpCount; + while (q != nullptr) { + struct ncclProxyOp* qNext = q->enqNext; + // Ignoring the bottom tag bit, opCount's are zero-based within plan so + // translate them to the tip of the comm's history. + if (q->opCount & 1) { // p2p + // p2pOpCount is monotonic increasing within a plan's channel so just + // remember last value to compute max. + nextP2pOpCount = p2pOpCount + (q->opCount>>1); + nextP2pOpCount += 1; // +1 to ensure next plan doesn't collide + q->opCount = (p2pOpCount<<1) + q->opCount; + } else { // coll + q->opCount = (collOpCount<<1) + q->opCount; + } + NCCLCHECK(ncclProxySaveOp(comm, q, nullptr)); // May overwrite enqNext. + if (!plan->persistent) { + // Non-persistent kernels have their memory reclaimed after upload. + ncclMemoryPoolFree(&plan->memPool_ncclProxyOp, q); + } + q = qNext; + } + // Advance channel's p2pOpCount by number of p2p's in this plan channel. + comm->channels[c].p2pOpCount = nextP2pOpCount; } - comm->lastChannel = 0; + return ncclSuccess; +} + +static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) { + NCCLCHECK(uploadProxyOps(comm, plan)); NCCLCHECK(ncclProxyStart(comm)); - return ncclSuccess; -} - -// Record done event for current launch -ncclResult_t ncclRecordEvents(ncclComm_t comm) { - struct cudaLaunchParams *params = comm->myParams; - - // Enqueue event after NCCL kernel (only in non-graph mode) - if (!comm->usingCudaGraph) CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream)); - // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL - if (comm->launchMode == ncclComm::GROUP && - (comm->groupCudaStream || - comm->userStream == cudaStreamDefault || - comm->userStream == cudaStreamLegacy || - comm->userStream == cudaStreamPerThread)) { - CUDACHECK(cudaEventRecord(comm->intDoneEvent, params->stream)); - // Create dependency between NCCL internal stream and user stream - CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->intDoneEvent, 0)); + if (!plan->persistent) { + // Notify main thread of our reclaiming. This will reclaim plan concurrently. 
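+    // callbackQueue is a multi-producer single-consumer queue; the main thread drains
+    // it via ncclCommPollCallbacks() and runs reclaimPlan() on our behalf.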
+ ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer); } return ncclSuccess; } -// Reset parameter space for launch -ncclResult_t ncclLaunchReset(ncclComm_t comm) { - comm->userStreamSet = false; +static void CUDART_CB hostStreamPlanCallback(void *plan_) { + struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_; + ncclResult_t result = hostStreamPlanTask(plan->comm, plan); + if (result != ncclSuccess) { + WARN("hostStreamPlanCallback() failed : %s\n", ncclGetErrorString(result)); + } +} - // We are finishing capture of the current launch - // But we need to keep the current enqueue info for CUDA graph - // Thus we need to creating a new enqueue info for the next run - if (comm->usingCudaGraph) { - NCCLCHECK(ncclCreateQueueInfo(&comm->enqueueInfo, comm)); - } else { - // If not in CUDA graph mode, we reuse the same info space - NCCLCHECK(ncclResetQueueInfo(comm->enqueueInfo)); +static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) { + struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` + if (plan->persistent) { + comm->persistentRefs -= 1; + if (!ncclMainExited) NCCLCHECK(ncclCudaFree(plan->workHead)); + while (!ncclIntruQueueEmpty(&plan->ipcMemQueue)) { + struct ncclPointerList* q = ncclIntruQueueDequeue(&plan->ipcMemQueue); + if (!ncclMainExited) CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr)); + ncclMemoryPoolFree(&comm->memPool_ncclPointerList, q); + } + } + ncclMemoryPoolTakeAll(&comm->memPool_ncclProxyOp, &plan->memPool_ncclProxyOp); + ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); + return ncclSuccess; +} + +static void persistentDestructor(void* plans_) { + struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plans_; + struct ncclComm* comm = plan->comm; + while (plan != nullptr) { + struct ncclKernelPlan* next = plan->next; + ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer); + plan = next; + } +} + +ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { + ncclResult_t result = ncclSuccess; + struct ncclTasks* tasks = &comm->tasks; + bool persistent = ncclCudaGraphValid(tasks->capturingGraph); + int nPlans = 0; + + // Poll for callbacks sent to us from other threads. Typically these free + // resources from to our memory pools. + NCCLCHECK(ncclCommPollCallbacks(comm)); + + // We already have one frame present which holds all of our tasks (which we + // are about to schedule). Now push an additional frame for allocating + // work structs (see appendWorkElem() variants all use scoped allocation). + ncclMemoryStackPush(&comm->memScoped); + + if (tasks->nTasksColl + tasks->nTasksP2p != 0) { + do { + struct ncclKernelPlan* plan = ncclMemoryPoolAlloc(&comm->memPool_ncclKernelPlan, &comm->memPermanent); + ncclIntruQueueEnqueue(&comm->planQueue, plan); + nPlans += 1; + plan->comm = comm; + plan->reclaimer.fn = reclaimPlan; + plan->persistent = persistent; + + // Non-persistent kernels fill up at most half of our fifo per kernel. + int nWorkBudget = plan->persistent ? INT_MAX : comm->workFifoDepth/2; + int nWorkBudgetOld = nWorkBudget; + + // Drain coll tasks first. This is essential since we partition tasks based + // on the work budget and p2p work isn't collective. If we were to drain p2p + // first, the place where we cut the kernel could vary by rank which would + // cause the "shortest channel first" channel picker to have divergent results. 
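+      // Both schedule*TasksToPlan() calls return early (successfully) once the next
+      // operation no longer fits in nWorkBudget; the enclosing do/while then starts a
+      // fresh plan for the remaining tasks.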
+ if (tasks->nTasksColl != 0) { + NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &nWorkBudget), result, failure); + } + // And only drain p2p tasks once colls are depleted. + if (tasks->nTasksColl == 0 && tasks->nTasksP2p != 0) { + NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &nWorkBudget), result, failure); + } + if (nWorkBudget == nWorkBudgetOld) { + // We weren't able to fit any tasks into our budget which means now we're + // stuck in an infinite loop. We defer this check until here, instead of + // doing it in comm init, to permit testing with insanely shallow queues + // for cases where that's expected to still work (e.g. few channels). + WARN("'NCCL_WORK_FIFO_DEPTH=%d' is too small. Minimum value is %d", comm->workFifoDepth, 2*MAXCHANNELS); + result = ncclInvalidUsage; + goto failure; + } + finishPlan(plan); + } while (tasks->nTasksColl + tasks->nTasksP2p != 0); + + struct ncclKernelPlan* planHead = ncclIntruQueueHead(&comm->planQueue); + comm->unlaunchedPlansHead = planHead; + + NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->deviceStream), result, failure); + + // Create dependency for nccl device work on user streams. + for (struct ncclCudaStreamList* l=tasks->streams; l != nullptr; l = l->next) { + NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, l->stream), result, failure); + } + + if (persistent || comm->persistentRefs != 0) { + bool acquired = false; + for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) { + if (plan->hasProxyOps) { + if (!acquired) { + acquired = true; + NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->hostStream), result, failure); + } + NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->hostStream, hostStreamPlanCallback, plan), result, failure); + } + } + if (acquired) { + NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->hostStream), result, failure); + } + } + + if (persistent) { + comm->persistentRefs += nPlans; + NCCLCHECKGOTO(ncclCudaGraphAddDestructor(tasks->capturingGraph, persistentDestructor, (void*)planHead), result, failure); + } } - // After capturing an op in graph mode or launching the op in non-graph mode - // we can reset myParams for use in next op - struct cudaLaunchParams *params = comm->myParams; - params->gridDim.x = params->blockDim.x = 0; - params->func = NULL; - - // Reset launch mode to GROUP if changed - if (comm->launchMode == ncclComm::GROUP_GRAPH) comm->launchMode = ncclComm::GROUP; - comm->usingCudaGraph = 0; + if (false) { + failure: + ncclMemoryStackPop(&comm->memScoped); // deallocate ncclWork's + } + return result; +} +ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { + // This code is called after we've checked in to the intra-process barrier + // but before launching the kernel. We are not allowed to call CUDA unless the + // kernel launch is captured. 
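+  // For non-persistent plans uploadWork() only touches host-visible memory (the work
+  // fifo); only the persistent path, which implies graph capture, allocates and copies
+  // device memory.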
+ NCCLCHECK(uploadWork(comm, plan)); return ncclSuccess; } +ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { + struct ncclTasks* tasks = &comm->tasks; + dim3 grid = {(unsigned)plan->channelCount, 1, 1}; + dim3 block = {(unsigned)plan->threadPerBlock, 1, 1}; + void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead}; + NCCLCHECK(ncclStrongStreamLaunchKernel( + tasks->capturingGraph, &comm->deviceStream, plan->kernelFn, grid, block, args, 0 + )); + return ncclSuccess; +} + +ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { + if (comm->persistentRefs == 0) { // implies !plan->persistent + // If this isn't being captured and there aren't any CUDA graphs alive + // then we don't need to do our proxyOp pushing on the host stream. + NCCLCHECK(hostStreamPlanTask(comm, plan)); + } + return ncclSuccess; +} + +ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { + ncclResult_t result = ncclSuccess; + struct ncclTasks* tasks = &comm->tasks; + tasks->collBytesTotal = 0; // Just in case subtraction during scheduleCollTasksToPlan() doesn't get to 0 + + // Deallocate ncclWork's. This frame exists so long as ncclLaunchPrepare + // succeeded, and if it ncclLaunchPrepare didn't succeed we wouldn't be here. + ncclMemoryStackPop(&comm->memScoped); + + if (!ncclIntruQueueEmpty(&comm->planQueue)) { + // Reset queue to empty without destroying plans since those will be sent + // back to us for reclaiming via callbackQueue. + ncclIntruQueueConstruct(&comm->planQueue); + // Close strong stream "transaction" encompassing cuda launches + NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->deviceStream), result, resume1); + resume1: + // Create dependency for user streams on nccl device work. + struct ncclCudaStreamList* sl = tasks->streams; + tasks->streams = nullptr; // reset streams to empty + while (sl != nullptr) { + NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream), result, resume2); + resume2: + sl = sl->next; + } + } + return result; +} + /*****************************************************************************/ /* Enqueueing system : computation of kernel and proxy operations parameters */ /*****************************************************************************/ @@ -412,7 +1059,7 @@ static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNet if (info->comm->collNetSupport > 0) { // Translate ncclAvg and PreMulSum ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op; - NCCLCHECK(collNetReduceSupport(info->datatype, netOp, collNetTypeSupport)); + NCCLCHECK(collNetReduceSupport(info->comm, info->datatype, netOp, collNetTypeSupport)); } else { *collNetTypeSupport = 0; } @@ -480,6 +1127,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE; if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE; } + nt = nt/WARP_SIZE < 3 ? 
3*WARP_SIZE : nt; info->nChannels = nc; info->nThreads = nt; return ncclSuccess; @@ -524,7 +1172,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { return ncclSuccess; } -static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) { +static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) { int collNetTypeSupport = 0; // Check whether algo and proto have been preset (as in aggregation case) // If so, skip the calculation @@ -537,23 +1185,22 @@ comp_next: NCCLCHECK(getPatternInfo(info)); NCCLCHECK(getLoopInfo(info)); - work->header.type = ncclWorkTypeColl; work->sendbuff = info->sendbuff; work->recvbuff = info->recvbuff; work->root = info->root; work->count = info->count; work->nChannels = info->nChannels; - work->header.nWarps = info->nThreads / WARP_SIZE; + work->nWarps = info->nThreads / WARP_SIZE; work->redOpArg = info->opFull.scalarArg; work->redOpArgIsPtr = info->opFull.scalarArgIsPtr; if (info->comm->nRanks == 1) { // one-rank reduce index - work->header.funcIndex = 1 + int(info->datatype); + *workFuncIndex = 1 + int(info->datatype); return ncclSuccess; } - work->header.funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); + *workFuncIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS; int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; @@ -623,636 +1270,6 @@ comp_next: return ncclSuccess; } -static ncclResult_t checkSetStream(struct ncclInfo* info) { - if (info->comm->userStreamSet == false) { - info->comm->userStream = info->stream; - info->comm->userStreamSet = true; - } else if (info->stream != info->comm->userStream) { - WARN("Error : mixing different streams within a group call is not supported."); - return ncclInvalidUsage; - } - return ncclSuccess; -} - -// Handle structure for user buffer registration (IPC) exchange -struct ncclBuffRegHandle { - cudaIpcMemHandle_t sendBuffIpc; - cudaIpcMemHandle_t recvBuffIpc; - ssize_t sendBuffOffset; - ssize_t recvBuffOffset; -}; - -// Register input and output buffers -// Exchange with ranks on the same host -static ncclResult_t ncclRegBuffAndExchange(struct ncclInfo* info, struct ncclBuffRegInfo* regInfo) { - ncclComm_t comm = info->comm; - if (comm->localRanks == 1) return ncclSuccess; - if (comm->pfnCuMemGetAddressRange == NULL) return ncclSuccess; // CUDA toolkit or driver version too old - - ncclResult_t ret = ncclSuccess; - struct ncclBuffRegHandle regHandles[NCCL_MAX_LOCAL_RANKS]; - // Get IPC handles - // Note: the handle only corresponds to the base address of the allocation - CUDACHECKGOTO(cudaIpcGetMemHandle(®Handles[comm->localRank].sendBuffIpc, (void*)info->sendbuff), ret, reg_fallback); - CUDACHECKGOTO(cudaIpcGetMemHandle(®Handles[comm->localRank].recvBuffIpc, (void*)info->recvbuff), ret, reg_fallback); - // Get offset of user buffer within allocation - void* baseAddr; - size_t size; - // Get base address - CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->sendbuff)); - regHandles[comm->localRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr; - CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->recvbuff)); - regHandles[comm->localRank].recvBuffOffset = 
(char*)info->recvbuff - (char*)baseAddr; - TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->localRank].recvBuffOffset); - - // Exchange handles within node - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle))); - // Open handles at local process - for (int i=0; ilocalRanks; i++) { - // Skip myself - if (i == comm->localRank) { - regInfo->sendbuffsBase[i] = regInfo->recvbuffsBase[i] = NULL; - continue; - } - // Get base address of mapping - CUDACHECK(cudaIpcOpenMemHandle(regInfo->sendbuffsBase+i, regHandles[i].sendBuffIpc, cudaIpcMemLazyEnablePeerAccess)); - CUDACHECK(cudaIpcOpenMemHandle(regInfo->recvbuffsBase+i, regHandles[i].recvBuffIpc, cudaIpcMemLazyEnablePeerAccess)); - // Get real buffer address by adding offset in the mapping - regInfo->sendbuffs[i] = (char*)regInfo->sendbuffsBase[i] + regHandles[i].sendBuffOffset; - regInfo->recvbuffs[i] = (char*)regInfo->recvbuffsBase[i] + regHandles[i].recvBuffOffset; - } - // Marks the operation as being buffer registered - regInfo->nBuffs = comm->localRanks; - TRACE(NCCL_COLL, "Rank %d exchanged %d buffers", comm->rank, regInfo->nBuffs); - return ncclSuccess; - -reg_fallback: - // If we cannot register specific buffer types, we just bypass this stage, and continue without failing - (void)ret; - WARN("Unable to register user buffers"); - return ncclSuccess; -} - -// Compute enqueue element, save it in list -// Compute CUDA launch parameters -// Capture time code in view of CUDA graph -static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) { - ncclComm_t comm = info->comm; - if (comm->nRanks == 1 && - // User-defined reduction ops may need alter the data even for unitary reductions - info->op < ncclNumOps) { - if (info->sendbuff != info->recvbuff) - CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream)); - return ncclSuccess; - } - - // Compute cuda kernel arg and proxy arg templates - struct ncclQueueElem* eqElem; - NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem)); - struct ncclWork* work = &eqElem->work; - NCCLCHECK(computeColl(info, work->elems, &eqElem->proxyOp)); - - // Determine grid size - struct cudaLaunchParams* params = comm->myParams; - params->gridDim.x += info->nChannels; - params->gridDim.x = std::min(params->gridDim.x, comm->nChannels); - params->blockDim.x = std::max(params->blockDim.x, info->nThreads); - comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here - - // Inline the first kernel - if (params->func == NULL) { - params->func = ncclKerns[work->header.funcIndex]; - if (work->header.type == ncclWorkTypeColl) { - // Copy the first operation to the inline argument. Type may be set later to - // ncclWorkTypeUnused if we have more than one coll element. 
- memcpy(&comm->args, work->elems, sizeof(struct ncclWorkElem)); - comm->args.bid = 0; // Only inline for channel 0 - comm->args.header.isLast = 1; // I am so far the last element - } - } - - // Register and exchange input and output buffers - if (comm->usingCudaGraph && // only in CUDA graph mode - comm->graphRegister == 1 && // when registration is enabled - info->algorithm == NCCL_ALGO_COLLNET && // limited to CollNet for now - comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other - comm->intraRanks == 1) { // only in multi-process mode - NCCLCHECK(ncclRegBuffAndExchange(info, &eqElem->buffRegInfo)); - comm->enqueueInfo->nRegBuffs += eqElem->buffRegInfo.nBuffs; - work->header.type = ncclWorkTypeRegColl; - // Disable inline argument because we need kernel to copy the entire ncclWork from workFifo - // because the registered addresses are in ncclWorkElemReg - comm->args.header.type = ncclWorkTypeUnused; - } - - return ncclSuccess; -} - -// Find the channel with the least enqueued work (counted in bytes) -static inline int findShortestChannel(ncclComm_t comm) { - size_t minSize = SIZE_MAX; - int minC = 0; - for (int c=0; cnChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - if (channel->totalSize < minSize) { - minSize = channel->totalSize; - minC = c; - } - } - return minC; -} - -// Get next channel based on shortest-queue mode or round-robin mode -static inline int getNextChannel(ncclComm_t comm, int aggMode) { - int nextChannel = 0; - if (aggMode && comm->asyncAllocMode == ncclComm::SHORTEST_QUEUE) { - nextChannel = findShortestChannel(comm); - } else { - nextChannel = comm->lastChannel % comm->nChannels; - comm->lastChannel++; - } - return nextChannel; -} - -// Setup aggregated kernels -// Op info has been previously saved in comm->asyncOps -ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { - if (comm->asyncOpCount == 0) { - return ncclSuccess; - } else if (comm->asyncOpCount == 1) { - // No aggregation - struct ncclInfo* info = comm->asyncOps; - info->nChannels = 0; - NCCLCHECK(ncclSetupCollKernel(info)); - } else { - // Aggregation - // Determine a per-channel chunk size used to divide an operation into multiple channels - size_t channelSize; - if (comm->channelSize > 0) { - // Set by user - channelSize = comm->channelSize; - } else if (comm->collNetSupport && comm->asyncOps[0].coll == ncclFuncAllReduce) { - // CollNet specific size (tuned based on experiments) - channelSize = 256 * 1024; - } else { - // Latency increases as scale increases - // We would thus want to increase the chunk size to compensate for the lost efficiency - channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks); - } - // Reduce the per-channel size if we cannot fully utilize the channels - while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2; - // Check whether the ops have same reduce and data types (and hence can be packed in same ncclWork) - int channelUsed = 0; - int homogeneous = 1; - int allCollNetSupport = comm->collNetSupport; - for (int c = 0; c < comm->asyncOpCount; c++) { - struct ncclInfo* info = comm->asyncOps+c; - info->nChannels = std::min(std::max(1, (int)DIVUP(info->nBytes, channelSize)), comm->nChannels); // assign number of channels - channelUsed += info->nChannels; - // We can use fast path if all collectives are the same - homogeneous &= info->coll == comm->asyncOps[0].coll && - info->opFull.op == comm->asyncOps[0].opFull.op && - info->datatype == 
comm->asyncOps[0].datatype; - if (allCollNetSupport > 0) NCCLCHECK(getCollNetSupport(info, &allCollNetSupport)); - } - // Compute algo, proto, nthreads for the entire kernel - // Prepare a synthetic op info to calculate the collective algo - struct ncclInfo total; - total.comm = comm; - total.coll = comm->asyncOps[0].coll; - total.nBytes = comm->asyncTotalSize; - total.nChannels = std::min(channelUsed, comm->nChannels); - int perChannelOps = DIVUP(channelUsed, total.nChannels); - if (homogeneous) NCCLCHECK(getAlgoInfo(&total, allCollNetSupport, perChannelOps)); - // Set for each op - for (int c = 0; c < comm->asyncOpCount; c++) { - struct ncclInfo* info = comm->asyncOps+c; - if (homogeneous) { - // Set fields to skip the individual computeColl in ncclSetupCollKernel - info->algorithm = total.algorithm; - info->protocol = total.protocol; - info->nThreads = total.nThreads; - } - NCCLCHECK(ncclSetupCollKernel(info)); - } - comm->args.header.type = ncclWorkTypeUnused; // disable inline argument - } - // Reset counters - comm->asyncOpCount = 0; - comm->asyncTotalSize = 0; - return ncclSuccess; -} - -// Store aggregated operations info -static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) { - ncclComm_t comm = info->comm; - if (comm->asyncOpCount >= NCCL_MAX_OPS) { - WARN("Too many async operations in progress, max is %d", NCCL_MAX_OPS); - return ncclInvalidUsage; - } - memcpy(comm->asyncOps+comm->asyncOpCount, info, sizeof(struct ncclInfo)); - comm->asyncOpCount++; - comm->asyncTotalSize += info->nBytes; - return ncclSuccess; -} - -// Save p2p operations in comm->p2pSends and p2pRecvs. Operations will be posted to channels -// during ncclGroupEnd() -static ncclResult_t ncclSaveP2p(struct ncclInfo* info) { - struct ncclComm* comm = info->comm; - int peer = info->root; - ssize_t nBytes = info->count*ncclTypeSize(info->datatype); - int channelBaseId; - NCCLCHECK(ncclChannelComputeBase(comm, peer, info->coll, &channelBaseId)); - if (info->coll == ncclFuncSend) { - if (peer != comm->rank) { - // Mark channels that need pre-connect - for (int c=0; cp2pnChannelsPerPeer; c++) { - int channelId; - NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId)); - if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector - comm->connectSend[peer] |= (1<connect = 1; - } - } - } - NCCLCHECK(ncclSaveP2pInfo(comm->p2pSends[info->root], info->recvbuff, nBytes)); - comm->p2pSendCount++; - } else { - if (peer != comm->rank) { - // Mark channels that need pre-connect - for (int c=0; cp2pnChannelsPerPeer; c++) { - int channelId; - NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId)); - if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector - comm->connectRecv[peer] |= (1<connect = 1; - } - } - } - NCCLCHECK(ncclSaveP2pInfo(comm->p2pRecvs[info->root], info->recvbuff, nBytes)); - comm->p2pRecvCount++; - } - return ncclSuccess; -} - -static int getSegment(enum ncclWorkElemType type, enum ncclWorkElemSubType subType, int peer, struct ncclWork* work) { - if (work->header.type && (work->header.type != type)) return -1; - - if (type == ncclWorkTypeP2p) { // P2P - int start = subType == ncclWorkSubTypeRecv ? 
0 : 1; - for (int s=start; sp2pElems[s].peer == -1) return s; - // Do not aggregate multiple sends to the same peer (or receives from the same peer) - if (work->p2pElems[s].peer == peer) return -1; - } - } else if (type == ncclWorkTypeRegColl) { // CollNet - for (int s=0; sregElems[s].elem.header.type == ncclWorkTypeUnused) return s; - } - } else if (type == ncclWorkTypeColl) { // Ring or Tree - for (int s=0; selems[s].header.type == ncclWorkTypeUnused) return s; - } - } - return -1; -} - -// Compute kernel arguments for P2P ops -static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElemP2p* elem /* output */) { - elem->header.type = ncclWorkTypeP2p; - elem->header.funcIndex = FUNC_INDEX_P2P; - elem->header.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE; - elem->buff = info->recvbuff; - elem->subType = info->coll == ncclFuncSend ? ncclWorkSubTypeSend : ncclWorkSubTypeRecv; - elem->count = info->count; - elem->chunkSize = info->chunkSize; - elem->peer = info->root; - return ncclSuccess; -} - -// Equeue work elements into segment of ncclWork -// Supporting both collectives (aggregated or not) and P2P -static ncclResult_t enqueueSegOp(enum ncclWorkElemType type, struct ncclWork* elem /* input */, struct ncclWork* work, int s, - struct ncclBuffRegInfo* regInfo, struct ncclChannel* channel, struct ncclComm* comm) { - - if (type == ncclWorkTypeP2p) { - memcpy(work->p2pElems+s, elem, sizeof(struct ncclWorkElemP2p)); - int nelems = 0; - for (int i=0; ip2pElems[i].header.type) nelems = i+1; - } - - int ngroups = 1; - while (ngroups < nelems) ngroups *= 2; - int nWarps = 1; - while (nWarps*ngroups <= elem->header.nWarps/2) nWarps *= 2; - - for (int i=0; ip2pElems[i].ngroups = ngroups; - work->p2pElems[i].warpStart = - i*(NCCL_MAX_NTHREADS/WARP_SIZE)/ngroups; - int extraWarp = nWarps >= 2 ? 
i%2 : 0; - work->p2pElems[i].nWarps = nWarps + extraWarp; - } - return ncclSuccess; - } - - memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem)); - - if (regInfo->nBuffs == 0) return ncclSuccess; - - // Copy registered buffer addresses into ncclWork - struct ncclWorkElemReg* regElem = (struct ncclWorkElemReg*)(work->elems+s); - // For CollNet - for (int i=0; icollTree.down[i]; - if (peer == -1) break; - // Get intra-node slot - int j = comm->rankToLocalRank[peer]; - if (j < 0) { - WARN("Invalid intra-node rank %d for peer %d", j, peer); - return ncclInternalError; - } - // Input buffer of leaf peer - regElem->dnInputs[i] = regInfo->sendbuffs[j]; - // Output buffer of leaf peer - regElem->dnOutputs[i] = regInfo->recvbuffs[j]; - } - for (int i=0; icollTree.up[i]; - if (peer == -1) break; - int j = comm->rankToLocalRank[peer]; - if (j < 0) { - WARN("Invalid intra-node rank %d for peer %d", j, peer); - return ncclInternalError; - } - // Output buffer of root peer - regElem->upOutputs[i] = regInfo->recvbuffs[j]; - } - work->elems[s].regUsed = 1; - return ncclSuccess; -} - -// Enqueue P2P op -ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem) { - struct ncclWorkElemP2p* workElem = eqElem->work.p2pElems; - struct ncclProxyOp* proxyOp = &eqElem->proxyOp; - - // Try to reuse last p2p operation if not full yet - struct ncclChannel* channel = comm->channels+proxyOp->channelId; - int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS; - struct ncclWork* w = channel->workFifo+opIndex; - int segment = -1; - if (channel->workCount) { - // Try to pack more segments into a single operation - segment = getSegment(ncclWorkTypeP2p, workElem->subType, workElem->peer, w); - } - if (segment == -1) { - NCCLCHECK(getNextOp(channel, &w, NULL)); - segment = workElem->subType == ncclWorkSubTypeRecv ? 0 : 1; - // Initialize work as P2P, set peer=-1 to designate the p2p elem is not used. - w->header.type = ncclWorkTypeP2p; - for (int i=0; ip2pElems[i].peer = -1; - } - //printf("%s to %d -> Channel %d OpCount %ld Segment %d\n", workElem->subType == ncclWorkSubTypeRecv ? "Recv" : "Send", proxyOp->root, channel->id, channel->workFifoTail-1, segment); - - // store work element into FIFO - NCCLCHECK(ncclProxySaveP2p(comm, proxyOp)); - NCCLCHECK(enqueueSegOp(ncclWorkTypeP2p, &eqElem->work, w, segment, &eqElem->buffRegInfo, channel, comm)); - return ncclSuccess; -} - -// Setup P2P op -ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) { - ncclComm* comm = info->comm; - // Compute cuda kernel arg and proxy arg templates - struct ncclQueueElem* eqElem; - NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem)); - // The proxy code will set and tune the send/recv chunk size, make sure to run it first. 
- NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyOp)); - NCCLCHECK(computeP2pWorkElem(info, eqElem->work.p2pElems)); - // Compute grid size - int channelId = info->channelId; - struct cudaLaunchParams* params = comm->myParams; - params->gridDim.x = std::max(params->gridDim.x, channelId+1); - params->blockDim.x = std::max(params->blockDim.x, eqElem->work.header.nWarps*WARP_SIZE); - comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here - - // Record the first kernel to launch - // Just for CUDA kernel to know this is a P2P operation - // The CUDA kernel does not use the inlined first work element as fastpath argument - if (params->func == NULL) { - params->func = ncclKerns[eqElem->work.header.funcIndex]; - comm->args.header.type = ncclWorkTypeUnused; - } - return ncclSuccess; -} - -// Dynamic enqueue function for collective kernels -// Supports both aggregated and non-aggregated modes -ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem, int aggMode) { - struct ncclWork* work = &eqElem->work; - struct ncclWorkElem* elem = work->elems; - struct ncclProxyOp* proxyOp = &eqElem->proxyOp; - - int nChannels = elem->nChannels; - size_t channelSize = elem->count*ncclTypeSize(proxyOp->dtype)/elem->nChannels; - enum ncclWorkElemType workElemType = proxyOp->redOp == ncclNumOps ? ncclWorkTypeColl : ncclWorkTypeRegColl; // redOp is only set when using CollNet - - for (int bid=0; bidchannels+channelId; - - // Proxy - proxyOp->channelId = channelId; - proxyOp->opCount = comm->collOpCount; - if (proxyOp->nsteps) NCCLCHECK(ncclProxySaveColl(comm, proxyOp, comm->nRanks)); - - elem->bid = bid % nChannels; - struct ncclWork* w = NULL; - int segment = -1; - if (aggMode && channel->workCount) { - // Try to pack more segments into a single operation - int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS; - w = channel->workFifo+opIndex; - // All elems in work must have same (funcIndex,nThreads), - // see "src/collectives/device/common.h" - if (w->header.funcIndex == work->header.funcIndex && - w->header.nWarps == work->header.nWarps) { - segment = getSegment(workElemType, ncclWorkSubTypeUnused, 0, w); - } - } - if (segment == -1) { - NCCLCHECK(getNextOp(channel, &w, NULL)); - segment = 0; - } - - // store work element into FIFO - NCCLCHECK(enqueueSegOp(workElemType, work, w, segment, &eqElem->buffRegInfo, channel, comm)); - channel->totalSize += channelSize; - } - comm->collOpCount++; - return ncclSuccess; -} - -// Host setup node for CUDA Graph -// Performs the enqueue job -template -void CUDART_CB ncclEnqueueHostSetup(void* arg) { - NVTX3_FUNC_RANGE_IN(nccl_domain); - ncclResult_t ret; - // All work for current launch has been captured in Queue Info - struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)arg; - ncclComm_t comm = eqInfo->comm; - int aggMode = eqInfo->elemList->count() > 1 ? 
1 : 0; - - // Iterate through the element list - struct ncclQueueElem* eqElem = eqInfo->elemList->begin(); - while (eqElem != NULL) { - if (eqElem->work.header.funcIndex == FUNC_INDEX_P2P) { - NCCLCHECKGOTO(ncclEnqueueP2pKernel(comm, eqElem), ret, cb_end); - } else { - NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, eqElem, aggMode), ret, cb_end); - } - eqElem = eqInfo->elemList->getNext(); - } - - NCCLCHECKGOTO(setupLaunch(eqInfo, USING_CUDA_GRAPH), ret, cb_end); - NCCLCHECKGOTO(ncclLaunchProxy(eqInfo), ret, cb_end); - -cb_end: - if (ret != ncclSuccess) { - WARN("Failure in host setup : %s", ncclGetErrorString(ret)); - } - eqInfo->ret = ret; -} - -template void CUDART_CB ncclEnqueueHostSetup<0>(void*); -template void CUDART_CB ncclEnqueueHostSetup<1>(void*); - -// CUDA Graph helper thread -// for de-registering user buffers -void* graphHelperFunc(void *args) { - struct ncclGraphHelperResources* res = (struct ncclGraphHelperResources*)args; - if (res == NULL) { - WARN("CUDA Graph helper resource is null"); - return NULL; - } - int dev = res->comm->cudaDev; - CUDACHECKIGNORE(cudaSetDevice(dev)); - INFO(NCCL_COLL, "CUDA Graph helper thread created for device %d", dev); - - volatile enum helperThreadState* state = &res->threadState; - volatile int* ipcTail = &res->ipcTail; - while (1) { - // Last IPC entry enqueue so far - int ipcTailMark = *ipcTail; - int ipcCount = 0; - // Close IPC till the last entry - while (res->ipcHead != ipcTailMark) { - if (res->ipcBases[res->ipcHead] != NULL) - CUDACHECKIGNORE(cudaIpcCloseMemHandle(res->ipcBases[res->ipcHead])); - res->ipcBases[res->ipcHead] = NULL; - res->ipcHead = (res->ipcHead+1)%NCCL_IPC_POOL_SIZE; - ipcCount++; - } - TRACE(NCCL_COLL, "CUDA Graph helper thread closed %d IPC handles", ipcCount); - pthread_mutex_lock(&res->threadLock); - // Check for exit signal - while (res->ipcHead == *ipcTail && *state != ThreadStop) { - pthread_cond_wait(&res->threadCond, &res->threadLock); - } - pthread_mutex_unlock(&res->threadLock); - if (*state == ThreadStop) { - INFO(NCCL_COLL, "CUDA Graph helper thread for device %d returning", dev); - return NULL; - } - } -} - -// Check if we are in CUDA Graph capture mode -ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) { - comm->usingCudaGraph = 0; - // Feature requires CUDA 11.3/R465 or above -#if CUDART_VERSION >= 11030 - cudaStreamCaptureStatus captureStatus; - unsigned long long cudaGraphId; - ncclResult_t ret = ncclSuccess; - if (comm->driverVersion < 11030) { - // Runtime driver version older than compiler version - // Enhanced compat fallback - goto enh_compat_end; - } - // Get CUDA Graph handle - CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end); - if (captureStatus == cudaStreamCaptureStatusActive) { - if (cudaGraphId != comm->lastCudaGraphId) { - INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId); - // We are in a new graph, hence need to forget the last setup node so that - // the first setup node in the new graph will not have a dependency - comm->lastCudaGraphId = cudaGraphId; - comm->lastSetupNode = NULL; - } - if (comm->launchMode == ncclComm::GROUP) comm->launchMode = ncclComm::GROUP_GRAPH; - comm->usingCudaGraph = 1; - - // Create helper thread that closes IPC handles during graph destruction - // Only create this thread when buffer registration is enabled - if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) { - 
pthread_mutex_init(&comm->graphHelperResources->threadLock, NULL); - // Init signaling method between Graph destroy function and helper thread - pthread_cond_init(&comm->graphHelperResources->threadCond, NULL); - // Set state - comm->graphHelperResources->threadState = ThreadStart; - // Create thread - pthread_create(&comm->graphHelperThread, NULL, graphHelperFunc, comm->graphHelperResources); - // Name thread - ncclSetThreadName(comm->graphHelperThread, "NCCL GrHelper%2d", comm->cudaDev); - } - } - return ncclSuccess; - -enh_compat_end: // Enhanced compat fallback - (void)ret; - CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus)); - if (captureStatus != cudaStreamCaptureStatusNone) { - WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support"); - return ncclInvalidUsage; - } - // If we are not in capture mode, we can ignore the driver being lower -#endif - return ncclSuccess; -} - -// Create host setup node in CUDA Graph -ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) { -#if CUDART_VERSION >= 11030 - struct ncclQueueInfo* eqInfo = comm->enqueueInfo; - // Create a CUDA object to wrap around the argument space - // which CUDA graph would manage lifetime of - cudaUserObject_t object; - CUDACHECK(cudaUserObjectCreate(&object, eqInfo, ncclDestroyQueueInfo, 1/*initialRefcount*/, cudaUserObjectNoDestructorSync)); - // Hand over ownership to CUDA Graph - CUDACHECK(cudaGraphRetainUserObject(graph, object, 1, cudaGraphUserObjectMove)); - - cudaHostFn_t fn = ncclEnqueueHostSetup<1>; - // Add a CPU node to the graph - cudaGraphNode_t setupNode; - // Function + parameter space for that function (i.e. enqueue info) - cudaHostNodeParams setupNodeParams = {fn, eqInfo}; - int numDependencies = comm->lastSetupNode == NULL ? 0 : 1; - CUDACHECK(cudaGraphAddHostNode(&setupNode, graph, &comm->lastSetupNode, numDependencies, &setupNodeParams)); - // Create dependency from last setup node in the same graph - CUDACHECK(cudaStreamUpdateCaptureDependencies(comm->userStream, &setupNode, 1, cudaStreamAddCaptureDependencies)); - comm->lastSetupNode = setupNode; - return ncclSuccess; -#else - WARN("NCCL does not support this CUDA version for CUDA graph feature"); - return ncclInternalError; -#endif -} - static ncclResult_t hostToDevRedOp( ncclDevRedOpFull *opFull, ncclRedOp_t op, ncclDataType_t datatype, ncclComm *comm ) { @@ -1321,70 +1338,131 @@ static ncclResult_t hostToDevRedOp( return ncclSuccess; } -ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { - ncclResult_t ret = ncclSuccess; - bool isAsync = ncclAsyncMode(); - int savedDev = -1; - // Check arguments - NCCLCHECK(PtrCheck(info->comm, info->opName, "comm")); - if (isAsync && info->comm->checkPointers) { - CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end); - CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end); - } - NCCLCHECKGOTO(ArgsCheck(info), ret, end); +// Converts `info` to a task and adds it to `comm->tasks`. The exception is with +// single rank communicators, collectives are issued as `ncclMemcpyAsync`s and +// thus don't need a task. 
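+// Note that nothing is launched from here: the task merely accumulates in comm->tasks
+// and is scheduled into kernel plans later by ncclLaunchPrepare().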
+static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* info) { + ncclTasks *tasks = &comm->tasks; + if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { + int peer = info->root; + ssize_t nBytes = info->count*ncclTypeSize(info->datatype); + bool isSendNotRecv = info->coll == ncclFuncSend; - // Copy reduction op state from op handle into info struct here since the - // op handle may be destroyed before ncclGroupEnd(). - NCCLCHECKGOTO(hostToDevRedOp(&info->opFull, info->op, info->datatype, info->comm), ret, end); + // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. + ncclGroupCommJoin(info->comm); + struct ncclTaskP2p* p2p = ncclMemoryStackAlloc(&comm->memScoped); + p2p->buff = (void*)info->recvbuff; + p2p->bytes = nBytes; + p2p->chunk = 0; + ncclIntruQueueEnqueue( + isSendNotRecv ? &tasks->peers[peer].sendQueue : &tasks->peers[peer].recvQueue, + p2p); + tasks->nTasksP2p += 1; - // Launch asynchronously if needed - if (isAsync) { - // Always register comm even in case of error to make sure ncclGroupEnd - // cleans it up. - NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end); - NCCLCHECKGOTO(checkSetStream(info), ret, end); - - INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", - info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, - info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); - - if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { //p2p stored separately - NCCLCHECKGOTO(ncclSaveP2p(info), ret, end); - } else { - NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end); + // Mark channels that need pre-connect + if (comm->rank != peer) { + int channelBaseId; + NCCLCHECK(ncclChannelComputeBase(comm, peer, info->coll, &channelBaseId)); + if (!(isSendNotRecv ? tasks->peers[peer].sendSeen : tasks->peers[peer].recvSeen)) { + (isSendNotRecv ? tasks->peers[peer].sendSeen : tasks->peers[peer].recvSeen) = true; + for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { + int channelId; + NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId)); + if (isSendNotRecv) { + if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector + comm->connectSend[peer] |= (1<channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector + comm->connectRecv[peer] |= (1<op, info->datatype, comm)); - INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", + // User-defined reduction ops may need alter the data even for unitary reductions + if (comm->nRanks == 1 && opFull.op < ncclDevPreMulSum) { + if (info->sendbuff != info->recvbuff) { + size_t bytes = info->count*ncclTypeSize(info->datatype); + CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, bytes, cudaMemcpyDeviceToDevice, info->stream)); + } + return ncclSuccess; + } else { + // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. 
+ ncclGroupCommJoin(info->comm); + struct ncclTaskColl* t = ncclMemoryStackAlloc(&comm->memScoped); + t->func = info->coll; + t->sendbuff = info->sendbuff; + t->recvbuff = info->recvbuff; + t->count = info->count; + t->root = info->root; + t->datatype = info->datatype; + t->op = opFull; // C++ struct assignment + t->chunkSteps = info->chunkSteps; + t->sliceSteps = info->sliceSteps; + ncclIntruQueueEnqueue(&tasks->collQueue, t); + tasks->collBytesTotal += t->count*ncclTypeSize(t->datatype); + tasks->nTasksColl += 1; + } + } + + if (info->stream != tasks->streamRecent || tasks->streams == nullptr) { + tasks->streamRecent = info->stream; + struct ncclCudaStreamList* l = tasks->streams; + while (true) { + if (l == nullptr) { // Got to the end, this must be a new stream. + struct ncclCudaGraph graph; + NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream)) + if (tasks->streams != nullptr && !ncclCudaGraphSame(tasks->capturingGraph, graph)) { + WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph."); + return ncclInvalidUsage; + } + tasks->capturingGraph = graph; // C++ struct assignment + // Add stream to list + l = ncclMemoryStackAlloc(&comm->memScoped); + l->stream = info->stream; + l->next = tasks->streams; + tasks->streams = l; + break; + } + if (l->stream == info->stream) + break; // Already seen stream. + } + } + return ncclSuccess; +} + +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { + NCCLCHECK(ncclGroupStartInternal()); + ncclResult_t ret = ncclSuccess; + int devOld = -1; + NCCLCHECKGOTO(PtrCheck(info->comm, info->opName, "comm"), ret, end0); + if (info->comm->checkPointers) { + CUDACHECKGOTO(cudaGetDevice(&devOld), ret, end0); + CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end0); + } + NCCLCHECKGOTO(ArgsCheck(info), ret, end1); + + INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); + TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zi,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast(info->sendbuff), reinterpret_cast(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream); - // Check whether we are in cuda graph mode - cudaGraph_t graph; - ncclComm_t comm = info->comm; - NCCLCHECKGOTO(ncclGetCudaGraph(comm, &graph), ret, end); + NCCLCHECKGOTO(taskAppend(info->comm, info), ret, end1); - // Common part between graph mode and non-graph mode - NCCLCHECKGOTO(ncclSetupCollKernel(info), ret, end); - - // Host setup - if (comm->usingCudaGraph) { - NCCLCHECKGOTO(ncclCudaGraphHostSetup(comm, graph), ret, end); - } else { - ncclEnqueueHostSetup<0>(comm->enqueueInfo); - NCCLCHECKGOTO(comm->enqueueInfo->ret, ret, end); - } - - // Common part between graph mode and non-graph mode - NCCLCHECKGOTO(ncclLaunchBarrier(comm), ret, end); - NCCLCHECKGOTO(ncclLaunchKernel(comm), ret, end); - NCCLCHECKGOTO(ncclRecordEvents(comm), ret, end); - NCCLCHECKGOTO(ncclLaunchReset(comm), ret, end); - } -end: - if (isAsync && savedDev != -1) CUDACHECK(cudaSetDevice(savedDev)); - if (isAsync) ncclAsyncErrCheck(ret); +end1: + if (devOld != -1) CUDACHECKGOTO(cudaSetDevice(devOld), ret, end0); +end0: + ncclGroupErrCheck(ret); + NCCLCHECK(ncclGroupEndInternal()); return ret; } @@ -1419,6 +1497,7 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, 
ncclDataTyp } *op = ncclRedOp_t(int(ncclNumOps) + ix); *op = ncclUserRedOpMangle(comm, *op); + TRACE_CALL("ncclRedOpCreatePreMulSum(%d,%p,%d,%d,%p)", *op, scalar, datatype, residence, comm); return ncclSuccess; } @@ -1440,5 +1519,6 @@ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) { // push to free list comm->userRedOps[ix].freeNext = comm->userRedOpFreeHead; comm->userRedOpFreeHead = ix; + TRACE_CALL("ncclRedOpDestroy(%d,%p)", op, comm); return ncclSuccess; } diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 222be70..ab8f8c3 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -428,10 +428,10 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0); // Net v4 plugins don't have non-blocking connect/accept. We can't therefore use // remote proxies without risking deadlocks -int ncclPxnDisable() { +int ncclPxnDisable(struct ncclComm* comm) { static int pxnDisable = -1; if (pxnDisable == -1) { - if (ncclNetVersion() == 4) { + if (comm && ncclNetVersion(comm) == 4) { INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); pxnDisable = 1; } else { @@ -470,7 +470,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, return ncclSuccess; } -ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) { +ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) { // Precompute paths between GPUs/NICs. // Remove everything in case we're re-computing @@ -498,16 +498,16 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer } } - if (peerInfos == NULL) continue; + if (comm == NULL) continue; // Remove GPUs we can't talk to because of containers. - struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank; + struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank; for (int p=0; pnodes[GPU].count; p++) { if (p == g) continue; - struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank; + struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank; int shm; - NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo)); + NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo)); int p2p; - NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo)); + NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo)); if (shm == 0 && p2p == 0) { // Mark this peer as inaccessible. We'll trim it later. system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; @@ -523,7 +523,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer for (int g=0; gnodes[GPU].count; g++) { // Check whether we can access the NIC through another NVLink-connected GPU (PXN) struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) { + if (ncclPxnDisable(comm) != 1 && gpu->paths[NET][n].type > PATH_PXB) { int pxnGpu = -1; for (int p=0; pnodes[GPU].count; p++) { @@ -670,7 +670,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { // We want to spread channels used when there aren't many and progressively // fill the whole space of nChannels. To do so we mirror the bits in the // nChannels space. 
- for (int c=0; cp2pnChannelsPerPeer; c++) { + for (int c=0; cp2pnChannels; c++) { int mirror = 0; for (int b=1, mb=(comm->p2pnChannels>>1); bp2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb; comm->p2pChannels[c] = mirror; diff --git a/src/graph/search.cc b/src/graph/search.cc index d70b6a7..0f79258 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -958,10 +958,14 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev)); *proxyRank = rank; - int pxnLevel = ncclPxnDisable() == 1 ? 0 : ncclParamP2pPxnLevel(); + int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel(); // See whether we can use the remote rank preferred device. if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) { - int netDev = comm->peerInfo[peerRank].netDev; + // Find local NIC number close to local cudaDev + int cudaDev = comm->peerInfo[peerRank].cudaDev; + int localRank; + if (ncclTopoDevToRank(comm->topo, cudaDev, &localRank) != ncclSuccess) return ncclSuccess; + int netDev = comm->peerInfo[localRank].netDev; int n; // Check that device exists on our node if (ncclParamCrossNic() == 0) { diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 53e12e5..2730bf9 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -626,11 +626,11 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. int netDevCount = 0; - if (collNetSupport()) { - NCCLCHECK(collNetDevices(&netDevCount)); + if (collNetSupport(comm)) { + NCCLCHECK(collNetDevices(comm, &netDevCount)); for (int n=0; ndmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); + NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1)); } } if (netDevCount == 0) { - NCCLCHECK(ncclNetDevices(&netDevCount)); + NCCLCHECK(ncclNetDevices(comm, &netDevCount)); } for (int n=0; ndmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? 
"Enabled" : "Disabled", n, props.name); + NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); } // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) diff --git a/src/graph/topo.h b/src/graph/topo.h index 71c1fca..b24a72b 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -181,6 +181,17 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, return ncclInternalError; } +static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, int* rank) { + *rank = -1; + for (int i=0; inodes[GPU].count; i++) { + if (system->nodes[GPU].nodes[i].gpu.dev == dev) { + *rank = system->nodes[GPU].nodes[i].gpu.rank; + return ncclSuccess; + } + } + return ncclInternalError; +} + // Returns NVLink speed in GB/s static float ncclTopoNVLinkSpeed(int cudaCompCap) { return diff --git a/src/group.cc b/src/group.cc index 5f65a58..d9bc684 100644 --- a/src/group.cc +++ b/src/group.cc @@ -10,399 +10,259 @@ #include "transport.h" #include "channel.h" -#define MAX_ASYNC_OPS 128 -thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS]; -thread_local int ncclGroupIndex = 0; -thread_local int ncclGroupMode = 0; -thread_local ncclResult_t ncclGroupError = ncclSuccess; +__thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting +__thread ncclResult_t ncclGroupError = ncclSuccess; +__thread struct ncclComm* ncclGroupCommHead = nullptr; +__thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr; +__thread struct ncclIntruQueue ncclAsyncJobs; -bool ncclAsyncMode() { - return ncclGroupMode > 0; -} - -ncclResult_t ncclAsyncErrCheck(ncclResult_t ret) { - if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret; - return ret; -} - -struct ncclInitArgs { - ncclInitFunc_t func; - int cudaDev; - ncclComm_t* newcomm; - int ndev; - ncclUniqueId commId; - int myrank; -}; -struct ncclCollArgs { - ncclComm_t comm; -}; - -enum ncclAsyncFuncType { - ASYNC_FUNC_INVALID = 0, - ASYNC_FUNC_INIT = 1, - ASYNC_FUNC_COLL = 2, -}; -struct ncclAsyncArgs { - ncclResult_t ret; - enum ncclAsyncFuncType funcType; - union { - ncclCollArgs coll; - ncclInitArgs init; - }; -}; - -thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS]; - -void* ncclAsyncThreadMain(void* args_) { - struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_; - NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev)); - return args; -} - -ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev) { - if (ncclGroupIndex >= MAX_ASYNC_OPS) { - WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS); - return ncclAsyncErrCheck(ncclInvalidUsage); +ncclResult_t ncclAsyncLaunch( + struct ncclAsyncJob* job, + ncclResult_t(*func)(struct ncclAsyncJob*), + void(*undo)(struct ncclAsyncJob*), + void(*destructor)(void*) + ) { + if (0 == ncclGroupDepth) { + ncclResult_t res = func(job); + if (res != ncclSuccess && undo) undo(job); + if (destructor) destructor(job); + return res; + } else { + job->func = func; + job->undo = undo; + job->destructor = destructor; + ncclIntruQueueEnqueue(&ncclAsyncJobs, job); + return ncclSuccess; } - int index = ncclGroupIndex++; - struct ncclAsyncArgs* args = ncclGroupArgs+index; - args->funcType = ASYNC_FUNC_INIT; - args->init.func = func; - args->init.cudaDev = cudaDev; - args->init.newcomm = newcomm; - args->init.ndev = ndev; - memcpy(&args->init.commId, &commId, 
sizeof(commId)); - args->init.myrank = myrank; - return ncclSuccess; } -ncclResult_t ncclAsyncColl(ncclComm_t comm) { - struct ncclAsyncArgs* args = ncclGroupArgs; - for (int i=0; icoll.comm == comm) return ncclSuccess; - args++; +void* ncclAsyncJobMain(void* arg) { + struct ncclAsyncJob* job = (struct ncclAsyncJob*)arg; + job->result = job->func(job); + if (job->result != ncclSuccess) { + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, job->result); } - if (ncclGroupIndex >= MAX_ASYNC_OPS) { - WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS); - return ncclAsyncErrCheck(ncclInvalidUsage); - } - ncclGroupIndex++; - args->funcType = ASYNC_FUNC_COLL; - args->coll.comm = comm; - return ncclSuccess; + return arg; } NCCL_API(ncclResult_t, ncclGroupStart); ncclResult_t ncclGroupStart() { NVTX3_FUNC_RANGE_IN(nccl_domain); - if (ncclGroupMode == 0) { - memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS); - } - ncclGroupMode++; + NCCLCHECK(ncclGroupStartInternal()); + TRACE_CALL("ncclGroupStart()"); return ncclSuccess; } -static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff) { - struct ncclInfo info = { ncclFuncSend, "Send", - NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */ - 1, 1 }; - int channelId; - NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncSend, &channelId)); - info.channelId = channelId; - NCCLCHECK(ncclSetupP2pKernel(&info)); - return ncclSuccess; -} -static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff) { - struct ncclInfo info = { ncclFuncRecv, "Recv", - NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */ - 1, 1 }; - int channelId; - NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncRecv, &channelId)); - info.channelId = channelId; - NCCLCHECK(ncclSetupP2pKernel(&info)); - return ncclSuccess; -} - -void* ncclAsyncThreadPreconnect(void* args_) { - struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_; - struct ncclComm* comm = args->coll.comm; - CUDACHECKTHREAD(cudaSetDevice(comm->cudaDev)); - if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, 1)); - return args; -} - -static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) { - size_t size = std::max(minSize, DIVUP(totalSize, minChannels)); - int nChannels = minChannels; - while (size > maxSize && nChannels <= maxChannels/2) { - nChannels *= 2; - size = DIVUP(totalSize, nChannels); - } - ALIGN_SIZE(size, minSize); - return size; -} - NCCL_API(ncclResult_t, ncclGroupEnd); ncclResult_t ncclGroupEnd() { NVTX3_FUNC_RANGE_IN(nccl_domain); - if (ncclGroupMode == 0) { + NCCLCHECK(ncclGroupEndInternal()); + TRACE_CALL("ncclGroupEnd()"); + return ncclSuccess; +} + +struct ncclPreconnectJob { + struct ncclAsyncJob base; + struct ncclComm* comm; +}; +ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) { + struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; + struct ncclComm* comm = job->comm; + CUDACHECK(cudaSetDevice(comm->cudaDev)); + if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1)); + return ncclSuccess; +} + +static ncclResult_t doLaunches(struct ncclComm* head) { + ncclResult_t result = 
ncclSuccess; + struct ncclComm* cliqueComm0 = head->intraComm0; + struct ncclComm* cliqueHead = head; + struct ncclComm* cliqueNextHead; + bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup; + // This outer loop iterates over cliques of comms which are siblings of the + // same global entity. We calculate a clique as all comms which have the same + // `intraComm0` value. + do { + struct ncclComm* comm = cliqueHead; + bool capturingYes = false, capturingNo = false; + do { + (ncclCudaGraphValid(comm->tasks.capturingGraph) ? capturingYes : capturingNo) = true; + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); + NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure); + if (useBarrier) ncclCommIntraBarrierIn(comm, 1); + comm = comm->groupNext; + } while (comm != nullptr && comm->intraComm0 == cliqueComm0); + cliqueNextHead = comm; + + if (capturingYes && capturingNo) { + // We have entered barriers but are aborting without leaving them. Thus + // these comms are permanently trashed. We need a good mechanism for + // tracking and reporting that. + WARN("Either none or all communicators in a ncclGroup() can be CUDA graph captured."); + result = ncclInvalidUsage; + goto failure; + } + + while (true) { // Iterate rounds of launches for clique. + bool moreRounds; + comm = cliqueHead; + do { // Iterate clique members. + struct ncclComm* next = comm->groupNext; + if (useBarrier) { + // Barrier reduction result tells us if this was the final round. + moreRounds = 0 != ncclCommIntraBarrierOut(comm); + } else { + moreRounds = comm->unlaunchedPlansHead != nullptr; + } + if (moreRounds) { + // Pop next unlaunched kernel + struct ncclKernelPlan* plan = comm->unlaunchedPlansHead; + if (plan != nullptr) { + comm->unlaunchedPlansHead = plan->next; + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); + NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure); + NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure); + } + // Barrier reduction input indicates if we require further rounds. + if (useBarrier) ncclCommIntraBarrierIn(comm, comm->unlaunchedPlansHead != nullptr ? 1 : 0); + if (plan != nullptr) { + NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure); + } + } else { // Final round. 
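// Editor's note (sketch, not NCCL code): the loop above launches kernels in
// rounds; in group launch mode each comm feeds "do I still have unlaunched
// plans?" into the intra-process barrier, and the summed result decides
// whether the whole clique runs another round. A minimal standalone model of
// that "barrier that also sums a flag" protocol, using a plain mutex/condvar
// barrier instead of NCCL's lock-free one (plan counts are made up):
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

struct SumBarrier {
  int n;
  std::mutex m;
  std::condition_variable cv;
  int arrived = 0, phase = 0, acc = 0, result = 0;
  explicit SumBarrier(int n) : n(n) {}
  // Arrive with contribution x; once all n participants arrive, everyone
  // gets back the sum of contributions (sense-reversing barrier).
  int arriveAndSum(int x) {
    std::unique_lock<std::mutex> lk(m);
    acc += x;
    int myPhase = phase;
    if (++arrived == n) {
      result = acc; acc = 0; arrived = 0; phase ^= 1;
      cv.notify_all();
    } else {
      cv.wait(lk, [&]{ return phase != myPhase; });
    }
    return result;
  }
};

int main() {
  const int n = 4;
  SumBarrier bar(n);
  std::vector<std::thread> ts;
  for (int r = 0; r < n; r++) ts.emplace_back([&, r] {
    int plans = r;                        // pretend comm r has r unlaunched plans
    bool more = true;
    while (more) {
      if (plans) plans--;                 // "launch" one plan this round
      more = bar.arriveAndSum(plans ? 1 : 0) != 0;  // anyone still busy?
    }
  });
  for (auto& t : ts) t.join();
  std::printf("all comms finished their launch rounds together\n");
  return 0;
}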
+ CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); + NCCLCHECKGOTO(ncclLaunchFinish(comm), result, failure); + } + comm = next; + } while (comm != cliqueNextHead); + if (!moreRounds) break; + } + cliqueHead = cliqueNextHead; + } while (cliqueHead != nullptr); +failure: + return result; +} + +ncclResult_t ncclGroupEndInternal() { + if (ncclGroupDepth == 0) { WARN("ncclGroupEnd: not in a group call."); return ncclInvalidUsage; } - ncclGroupMode--; - if (ncclGroupMode > 0) return ncclSuccess; + ncclGroupDepth--; + if (ncclGroupDepth > 0) return ncclSuccess; + int savedDev; CUDACHECK(cudaGetDevice(&savedDev)); - int activeThreads = 0; - int doneArray[MAX_ASYNC_OPS]; - for (int i=0; ifuncType == ASYNC_FUNC_INIT) { - pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args); - activeThreads++; - doneArray[i] = 0; - } - } - /* For init, since we use threads, we just wait for threads to complete */ - while (activeThreads) { - for (int i=0; ifuncType == ASYNC_FUNC_INIT && doneArray[i] == 0) { - int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL); - if (err == EBUSY) continue; - if (err != 0) ret = ncclSystemError; - if (args->ret != ncclSuccess) ret = args->ret; - doneArray[i] = 1; - activeThreads--; - } - } + if (ncclGroupCommPreconnectHead != nullptr) { + struct ncclComm* comm = ncclGroupCommPreconnectHead; + do { + struct ncclPreconnectJob* job; + NCCLCHECK(ncclCalloc(&job, 1)); + job->base.func = ncclPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->comm = comm; + ncclIntruQueueEnqueue(&ncclAsyncJobs, &job->base); + + struct ncclComm* next = comm->preconnectNext; + comm->preconnectNext = reinterpret_cast(0x1); + comm = next; + } while (comm != nullptr); } - for (int i=0; ifuncType == ASYNC_FUNC_COLL && args->coll.comm->connect) { - pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args); - } - } + if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { + struct ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs); + do { + pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job); + job = job->next; + } while (job != nullptr); - for (int i=0; ifuncType == ASYNC_FUNC_COLL && args->coll.comm->connect) { - int err = pthread_join(ncclGroupThreads[i], NULL); + job = ncclIntruQueueHead(&ncclAsyncJobs); + do { + int err = pthread_join(job->thread, nullptr); if (err != 0) { WARN("Error waiting for pthread_join : %s", strerror(errno)); - return ncclSystemError; + ret = ncclSystemError; } - NCCLCHECKGOTO(args->ret, ret, end); - args->coll.comm->connect = 0; - } + if (ret == ncclSuccess && job->result != ncclSuccess) ret = job->result; + job = job->next; + } while (job != nullptr); + + jobsDone = true; + if (ret != ncclSuccess) goto failure; } - for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - struct ncclComm* comm = args->coll.comm; - int node = comm->node; - int nNodes = comm->nNodes; - int localRank = comm->localRank; + if (ncclGroupCommHead != nullptr) { + NCCLCHECKGOTO(doLaunches(ncclGroupCommHead), ret, failure); + do { + struct ncclComm* comm = ncclGroupCommHead; + struct ncclComm* next = comm->groupNext; + ncclGroupCommLeave(comm); + ncclGroupCommHead = next; + } while (ncclGroupCommHead != nullptr); + } - // Compute how much to split operations - // Natural step size matching buffer steps. - ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; - // Try to use all channels - int nChannelsMax = comm->p2pnChannelsPerPeer; - int nChannelsMin = nChannelsMax; - // Try to use all channels, but one channel per operation. 
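// Editor's note (sketch, not part of the patch): the removed scheduling path
// around here sizes p2p chunks with getP2pChunkSize(): start from one chunk
// per "min" channel, double the channel count until chunks fit under the max
// slice, then keep chunks step-aligned. A standalone version with DIVUP /
// ALIGN_SIZE spelled out; the step size, channel counts and slice factor
// below are made up for the demo:
#include <algorithm>
#include <cstddef>
#include <cstdio>

static size_t divUp(size_t x, size_t y) { return (x + y - 1) / y; }
static size_t alignUpTo(size_t x, size_t a) { return divUp(x, a) * a; }

static size_t p2pChunkSize(size_t totalSize, int minChannels, int maxChannels,
                           size_t minSize, size_t maxSize) {
  size_t size = std::max(minSize, divUp(totalSize, minChannels));
  int nChannels = minChannels;
  while (size > maxSize && nChannels <= maxChannels/2) {
    nChannels *= 2;
    size = divUp(totalSize, nChannels);
  }
  return alignUpTo(size, minSize);       // keep chunks a multiple of the step
}

int main() {
  size_t stepSize = 1 << 17;             // hypothetical buffer step
  printf("chunk = %zu bytes\n",
         p2pChunkSize(16u << 20, /*min*/2, /*max*/8, stepSize, 4*stepSize));
  return 0;
}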
- while (nChannelsMin*comm->nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2; - // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth. - while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2; - - while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) { - // schedule delta 0, +1, -1, +2, -2, ... - // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even. - for (int d=0; d<=nNodes/4; d++) { - int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes }; - int index = 0; - int delta = deltas[index]; -sched_delta: - uint32_t recvNode = (node+nNodes-delta)%nNodes; - uint32_t sendNode = (node+delta)%nNodes; - int steps = comm->maxLocalRanks; - for (int s=0; snodeRanks[recvNode].localRanks ? comm->nodeRanks[recvNode].localRankToRank[recvIndex] : -1; - int sendIndex = (localRank+s)%steps; - int sendPeer = sendIndexnodeRanks[sendNode].localRanks ? comm->nodeRanks[sendNode].localRankToRank[sendIndex] : -1; - struct ncclP2Pinfo* recv = recvPeer != -1 && comm->p2pRecvs[recvPeer] ? comm->p2pRecvs[recvPeer]->getNext() : NULL; - struct ncclP2Pinfo* send = sendPeer != -1 && comm->p2pSends[sendPeer] ? comm->p2pSends[sendPeer]->getNext() : NULL; - if (recv != NULL || send != NULL) { - ssize_t totRecvBytes = -1, totSendBytes = -1; - if (recv != NULL) totRecvBytes = recv->nbytes; - if (send != NULL) totSendBytes = send->nbytes; - if (recv) comm->p2pRecvCount--; - if (send) comm->p2pSendCount--; - if (recvPeer == comm->rank) { // Check self send/recv - if (sendPeer != comm->rank) { WARN("Sendrecv schedule not aligned for self"); ret = ncclInternalError; goto group_cleanup; } - if (send && recv == NULL) { WARN("Trying to send to self without a matching recv"); ret = ncclInvalidUsage; goto group_cleanup; } - if (send == NULL && recv) { WARN("Trying to recv to self without a matching send"); ret = ncclInvalidUsage; goto group_cleanup; } - } - void* recvBuff = recv ? recv->buff : NULL; - void* sendBuff = send ? send->buff : NULL; - // After we recycle p2pSend/Recv, we're no longer allowed to dereference send or recv, only use them as boolean NULL/not NULL. - if (recv && comm->p2pRecvs[recvPeer]->peakNext() == NULL) comm->p2pRecvs[recvPeer]->recycle(); - if (send && comm->p2pSends[sendPeer]->peakNext() == NULL) comm->p2pSends[sendPeer]->recycle(); - - ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize); - ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize); - - ssize_t sendOffset = 0; - ssize_t recvOffset = 0; - int sendRemaining = 1, recvRemaining = 1; - int chunk = 0; - do { - // Shuffle channels with s intra-node, and delta inter-node. Inter-node, make sure - // to use multiple channels to guarantee progress on all ranks from the same node. - ssize_t recvbytes = totRecvBytes-recvOffset; - ssize_t sendbytes = totSendBytes-sendOffset; - if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; } - if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; } - // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested - // (total size == 0), otherwise set size to -1. 
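// Editor's note (sketch, not part of the patch): the comment above describes
// visiting node deltas in the order 0, +1, -1, +2, -2, ... while never
// scheduling 0 twice, nor both +n/2 and -n/2 when n is even. A standalone
// program that reproduces that visit order with the same de-duplication
// checks (it only prints the deltas; no scheduling is done):
#include <cstdio>

int main() {
  const int nNodes = 8;                  // try an odd value (e.g. 7) as well
  for (int d = 0; d <= nNodes/4; d++) {
    int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
    int index = 0;
    while (index < 4) {
      printf("delta %d\n", deltas[index]);
      index++;
      if (index == 1 && deltas[1] == deltas[0]) index++;  // skip duplicate of 0
      if (index == 2 && deltas[2] == deltas[0]) index++;
      if (index == 3 && deltas[3] == deltas[2]) index++;  // skip -n/2 == +n/2
      if (index == 3 && deltas[3] == deltas[1]) index++;
    }
  }
  return 0;                              // for nNodes=8 prints 0 4 1 7 3 5 2 6
}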
- if (sendbytes < 0 || (sendbytes == 0 && totSendBytes != 0)) send = NULL; - if (recvbytes < 0 || (recvbytes == 0 && totRecvBytes != 0)) recv = NULL; - if (recv) { - NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, chunk, recvbytes, ((char*)recvBuff)+recvOffset), ret, group_cleanup); - } - if (send) { - NCCLCHECKGOTO(scheduleSend(comm, sendPeer, chunk, sendbytes, ((char*)sendBuff)+sendOffset), ret, group_cleanup); - } - recvOffset += recvChunkSize; - sendOffset += sendChunkSize; - chunk++; - } while (sendRemaining || recvRemaining); + if (false) { + failure: + struct ncclComm* comm = ncclGroupCommHead; + while (comm != nullptr) { + struct ncclComm* next = comm->groupNext; + ncclGroupCommLeave(comm); // overwrites comm->groupNext + // We don't know if preconnect succeeded or happened at all, so clear + // the flags that let `taskAppend()` skip over checking if preconnect + // is needed. + comm->preconnectNext = reinterpret_cast(0x1); + for (int i=0; i < comm->nRanks; i++) { + comm->tasks.peers[i].sendSeen = false; + comm->tasks.peers[i].recvSeen = false; + comm->connectSend[i] = 0; + comm->connectRecv[i] = 0; + } + comm->unlaunchedPlansHead = nullptr; + // Reclaim abandoned kernel plan memory. Note ncclWork structs were already + // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. + while (!ncclIntruQueueEmpty(&comm->planQueue)) { + struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue); + // Persistent plans will be reclaimed via the callbackQueue when the + // graph drops its UserObject reference. + if (!plan->persistent) { + for (int c=0; c < MAXCHANNELS; c++) { + while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) { + struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue); + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); } } - index++; - if (index == 1 && deltas[1] == deltas[0]) index++; - if (index == 2 && deltas[2] == deltas[0]) index++; - if (index == 3 && deltas[3] == deltas[2]) index++; - if (index == 3 && deltas[3] == deltas[1]) index++; - if (index < 4) { - delta = deltas[index]; - goto sched_delta; - } + ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); } } + // Reset comm->tasks to empty. + comm->tasks.nTasksColl = 0; + comm->tasks.nTasksP2p = 0; + comm->tasks.streams = nullptr; + ncclIntruQueueConstruct(&comm->tasks.collQueue); + comm->tasks.collBytesTotal = 0; + for (int i=0; i < comm->nRanks; i++) { + ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue); + ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue); + } + comm = next; } } - /* Collectives are done in three steps : - * 0. Save kernels previously enqueued. Compute channel, algo, proto, etc. - * 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative] - * 2. Barrier Wait. No CUDA call is permitted - * 3. Enqueue Events. CUDA event wait/enqueue. - * This is needed because step 2 cannot call any CUDA primitive, otherwise if - * cudaFree happens between 1 and 3, it could block that CUDA call and - * prevent some ranks from launching their network threads, which would - * prevent the NCCL call from completing, blocking the cudaFree call. 
- */ - - // Check whether we are in cuda graph mode - NCCLCHECK(ncclCalloc(&graphs, ncclGroupIndex)); - for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - ncclComm_t comm = args->coll.comm; - NCCLCHECKGOTO(ncclGetCudaGraph(comm, graphs+i), ret, group_cleanup); - if (usingCudaGraphAll == -1) { - usingCudaGraphAll = comm->usingCudaGraph; - } else if (usingCudaGraphAll != comm->usingCudaGraph) { - WARN("Illegal to have some communicators in graph mode while others not"); - ret = ncclInvalidUsage; - goto group_cleanup; - } - } - } - for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - ncclComm_t comm = args->coll.comm; - NCCLCHECKGOTO(ncclSetupAsyncKernels(comm), ret, group_cleanup); - } - } - for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - if (args->coll.comm->userStream == cudaStreamDefault || - args->coll.comm->userStream == cudaStreamPerThread || - args->coll.comm->userStream == cudaStreamLegacy) - CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end); - if (usingCudaGraphAll == 1) { - NCCLCHECKGOTO(ncclCudaGraphHostSetup(args->coll.comm, graphs[i]), ret, end); - } else { - ncclEnqueueHostSetup<0>(args->coll.comm->enqueueInfo); - } - NCCLCHECKGOTO(ncclLaunchBarrier(args->coll.comm), ret, end); - } - } - for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end); - NCCLCHECKGOTO(ncclLaunchKernel(args->coll.comm), ret, end); - } - } - for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - if (args->coll.comm->userStream == cudaStreamDefault || - args->coll.comm->userStream == cudaStreamPerThread || - args->coll.comm->userStream == cudaStreamLegacy) - CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end); - NCCLCHECKGOTO(ncclRecordEvents(args->coll.comm), ret, end); - NCCLCHECKGOTO(ncclLaunchReset(args->coll.comm), ret, end); - } + while (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(&ncclAsyncJobs); + if (ret != ncclSuccess && jobsDone && job->undo) job->undo(job); + if (job->destructor) job->destructor((void*)job); } - goto end; -group_cleanup: - if (ret != ncclSuccess) { - // At least one call in the group failed. Since we want to make that group - // an atomic operation, we need to cancel all operations. 
- for (int i=0; ifuncType == ASYNC_FUNC_INIT) { - if (args->init.newcomm) ncclCommDestroy(*args->init.newcomm); - *args->init.newcomm = NULL; - } else { - struct ncclComm* comm = args->coll.comm; - // Reset aggregation counters - comm->asyncOpCount = 0; - comm->asyncTotalSize = 0; - // Dequeue p2p lists - if (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) { - for (int peer=0; peernRanks; peer++) { - if (comm->p2pSends[peer]) comm->p2pSends[peer]->recycle(); - if (comm->p2pRecvs[peer]) comm->p2pRecvs[peer]->recycle(); - } - comm->p2pSendCount = comm->p2pRecvCount = 0; - } - ncclLaunchReset(comm); - } - } - } -end: ncclGroupError = ncclSuccess; - ncclGroupIndex = 0; + ncclGroupCommHead = nullptr; + ncclGroupCommPreconnectHead = nullptr; CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too - if (graphs) free(graphs); return ret; } diff --git a/src/include/alloc.h b/src/include/alloc.h index 14bccf9..29ec87a 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -10,27 +10,39 @@ #include "nccl.h" #include "checks.h" #include "align.h" +#include "utils.h" #include #include #include #include +uint64_t clockNano(); // from utils.h with which we have a circular dependency + template -static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped)); +ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + uint64_t time = 0; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + time = clockNano(); + CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish); + time = clockNano() - time; memset(*ptr, 0, nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); - return ncclSuccess; + INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p seconds: cudaHostAlloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; } #define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) -static inline ncclResult_t ncclCudaHostFree(void* ptr) { +inline ncclResult_t ncclCudaHostFree(void* ptr) { CUDACHECK(cudaFreeHost(ptr)); return ncclSuccess; } template -static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { +ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { void* p = malloc(nelem*sizeof(T)); if (p == NULL) { WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); @@ -44,7 +56,7 @@ static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, #define ncclCalloc(...) 
ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) template -static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { +ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { if (nelem < oldNelem) return ncclInternalError; if (nelem == oldNelem) return ncclSuccess; @@ -63,29 +75,105 @@ static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { } template -static ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - // Need async stream for P2P pre-connect + CUDA Graph +ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + uint64_t time = clockNano(); + CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + time = clockNano() - time; +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9); + return result; +} +#define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + uint64_t time0=0, time1=0, time2=0; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + // Need a side stream so as not to interfere with graph capture. cudaStream_t stream; + time0 = clockNano(); CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T))); - CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream)); - CUDACHECK(cudaStreamSynchronize(stream)); - CUDACHECK(cudaStreamDestroy(stream)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); - return ncclSuccess; + time1 = clockNano(); + CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + time2 = clockNano(); + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); + CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); + CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaStreamCreateWithFlags=%g cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time1-time0)/1.e9, double(time2-time1)/1.e9); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; } #define ncclCudaCalloc(...) 
ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) template -static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { - CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault)); - return ncclSuccess; +ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + uint64_t time = 0; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + time = clockNano(); + CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + time = clockNano() - time; + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: cudaMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; +} +#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + // Need a side stream so as not to interfere with graph capture. + cudaStream_t stream; + CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish); + CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); + CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; +} + +template +ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; +} + +template +ncclResult_t ncclCudaFree(T* ptr) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + CUDACHECKGOTO(cudaFree(ptr), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; } // Allocate memory to be potentially ibv_reg_mr'd. 
This needs to be // allocated on separate pages as those pages will be marked DONTFORK // and if they are shared, that could cause a crash in a child process -static ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { +inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { size_t page_size = sysconf(_SC_PAGESIZE); void* p; int size_aligned = ROUNDUP(size, page_size); diff --git a/src/include/channel.h b/src/include/channel.h index dc1536a..0ebb5a2 100644 --- a/src/include/channel.h +++ b/src/include/channel.h @@ -31,7 +31,8 @@ static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int } static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { - *channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; + //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; + *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels; return ncclSuccess; } diff --git a/src/include/checks.h b/src/include/checks.h index 9624608..715aeb7 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -9,7 +9,7 @@ #include "debug.h" -// Check CUDA calls +// Check CUDA RT calls #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ @@ -142,9 +142,9 @@ if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \ } while (!(cond)); -#define NCCLCHECKTHREAD(a) do { \ - if ((args->ret = (a)) != ncclSuccess) { \ - INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ +#define NCCLCHECKTHREAD(a, args) do { \ + if (((args)->ret = (a)) != ncclSuccess) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ return args; \ } \ } while(0) diff --git a/src/include/coll_net.h b/src/include/coll_net.h index c2d831e..f4b5408 100644 --- a/src/include/coll_net.h +++ b/src/include/coll_net.h @@ -10,25 +10,26 @@ #include "nccl.h" #include "nccl_net.h" -extern ncclCollNet_t* ncclCollNet; typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; // Translation to external API -static const char* collNetName() { return ncclCollNet->name; } -static ncclResult_t collNetDevices(int* ndev) { NCCLCHECK(ncclCollNet->devices(ndev)); return ncclSuccess; } -static ncclResult_t collNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclCollNet->getProperties(dev, props)); return ncclSuccess; } -static ncclResult_t collNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } -static ncclResult_t collNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } -static ncclResult_t collNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } -static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclCollNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } -static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; } -static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* 
recvMhandle, void** request) { - NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } -static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } -static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; } -static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; } -static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; } +static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } +static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } +static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } +static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } +static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } +static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } +static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } +/* DMA-BUF support */ +static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } +static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; } +static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } +static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } +static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } +static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } +static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { 
NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } -static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; } +static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } #endif diff --git a/src/include/collectives.h b/src/include/collectives.h index d65c6ae..7f0d0b6 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -36,7 +36,7 @@ struct ncclDevRedOpFull { /* Declare all collective operations */ #define DECL5(func, algo, proto, devredop, type) \ extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ - extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \ + extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ #define CONCAT(a,b) a##b #define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f) diff --git a/src/include/comm.h b/src/include/comm.h index 4b55dc6..ee752fc 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -10,6 +10,8 @@ #include "transport.h" #include "p2p.h" #include "collectives.h" +#include "proxy.h" +#include "strongstream.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -58,8 +60,6 @@ struct ncclRecvMem { }; }; -typedef cudaError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*); - enum helperThreadState {ThreadStart, ThreadStop}; #define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS) @@ -85,15 +85,87 @@ struct ncclNodeRanks { int* localRankToRank; }; -struct ncclComm { - struct ncclChannel channels[MAXCHANNELS]; +struct ncclDestructor { + struct ncclDestructor* next; + void* obj; + ncclResult_t(*fn)(struct ncclDestructor* me); +}; +struct ncclCommCallback { + struct ncclCommCallback* next; + ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb); +}; + +struct ncclChannel { + struct ncclChannelPeer* peers; + struct ncclDevChannelPeer* devPeers; + struct ncclRing ring; + int* devRingUserRanks; + struct ncclTree tree; + struct ncclDirect collTree; + int id; // index of this channel + uint32_t workFifoSent; // last used work index+1 + uint64_t p2pOpCount; +}; + +struct ncclWorkList { + struct ncclWorkList* next; + struct ncclWork work; +}; + +struct ncclPointerList { + struct ncclPointerList* next; + void *ptr; +}; + +struct ncclKernelPlan { + // A kernel plan is also a callback that reclaims itself. Hence this must + // be the first member. 
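// Editor's note (sketch, not NCCL code): the "must be the first member"
// comment above relies on a common intrusive-callback idiom: because the
// callback header sits at offset 0, a generic callback pointer dequeued from
// the comm's callback queue can be cast straight back to the enclosing plan,
// and the callback reclaims the plan itself. Minimal illustration with
// hypothetical types:
#include <cstdio>
#include <cstdlib>

struct Callback {
  Callback* next;
  void (*fn)(Callback* cb);
};

struct Plan {
  Callback reclaimer;    // must stay the first member for the cast below
  int id;
};

static void reclaimPlan(Callback* cb) {
  Plan* plan = reinterpret_cast<Plan*>(cb);  // valid: reclaimer is at offset 0
  printf("reclaiming plan %d\n", plan->id);
  free(plan);
}

int main() {
  Plan* p = static_cast<Plan*>(calloc(1, sizeof(Plan)));
  p->id = 42;
  p->reclaimer.fn = reclaimPlan;
  p->reclaimer.fn(&p->reclaimer);      // what a callback-poll loop would do
  return 0;
}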
+ struct ncclCommCallback reclaimer; + struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup + + struct ncclComm* comm; + struct ncclKernelPlan* next; + + bool persistent; // aka captured in a graph + void *kernelFn; + int channelUbound; // only channels c < channelUbound are present + int channelCount; // number of channels present + uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask) + bool hasProxyOps; // does any channel have a non-empty proxyOpQueue + int threadPerBlock; + // workHeap fields are null until uploadWorkFifo() or preparePersistentKernel() + struct ncclWork* workHead; + + int collOpCount; // zero based for this plan + + struct ncclIntruQueue ipcMemQueue; + + struct Channel { + int nWork; + union { + int nWorkElem; // used for coll and reg coll + int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1 + }; + size_t collBytes; + struct ncclIntruQueue workQueue; + struct ncclIntruQueue proxyOpQueue; + } channels[MAXCHANNELS]; +}; + +struct ncclComm { + struct ncclMemoryStack memPermanent, memScoped; + // List of destructors to run when comm is destructed + struct ncclDestructor* destructorHead; + + struct ncclChannel channels[MAXCHANNELS]; struct ncclPeerInfo* peerInfo; struct ncclTopoSystem* topo; + ncclNet_t* ncclNet; + ncclCollNet_t* ncclCollNet; void* bootstrap; // Bitmasks for ncclTransportP2pSetup - int connect; uint32_t* connectSend; uint32_t* connectRecv; @@ -114,12 +186,8 @@ struct ncclComm { // localRanks and localRanktoRank for all nodes struct ncclNodeRanks* nodeRanks; - enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode; - cudaStream_t userStream; - bool userStreamSet; - cudaEvent_t doneEvent; - cudaEvent_t intDoneEvent; bool checkPointers; + bool dmaBufSupport; // Counter for tracking CUDA launches (P2P and collectives included) uint64_t opCount; @@ -142,36 +210,37 @@ struct ncclComm { float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - // An internal CUDA stream for NCCL kernel CGMD launches - int groupCudaStream; - cudaStream_t groupStream; - // Whether there has been a fatal error in this communicator. ncclResult_t fatalError; // Flag to ask NCCL kernels to abort volatile uint32_t *abortFlag; - // Device side of the communicator - struct ncclDevComm *devComm; - // Host copy of the devComm (to free CUDA allocs) - struct ncclDevComm hostDevComm; + // Device side of the communicator (for cudaFree's) + struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm + + // Operation pool. + int workFifoDepth; // size of workFifoHeap[], power of 2 + struct ncclWork* workFifoHeap; + struct ncclWork* devWorkFifoHeap; + void* workFifoHeapGdrHandle; + + // Work completion notificaion + uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory + uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot. + uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels. 
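// Editor's note (sketch, not NCCL code): the workFifoSent / workFifoAckdMin /
// workFifoDone fields above describe counters that only grow (mod 2^32).
// The names and numbers below are reused purely for illustration of why
// unsigned subtraction keeps the accounting correct across wraparound; the
// real bookkeeping lives in enqueue.cc and may differ in detail:
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t depth = 1u << 10;               // fifo depth, power of two
  uint32_t sent = 0xFFFFFFF0u;                   // next unused slot (about to wrap)
  uint32_t done[2] = {0xFFFFFFE8u, 0xFFFFFFEBu}; // per-channel ack counters

  // Wrap-safe "minimum": the channel lagging furthest behind `sent` decides
  // how many slots are still in flight; unsigned subtraction handles wrap.
  uint32_t inFlight = 0;
  for (uint32_t d : done) inFlight = std::max(inFlight, sent - d);
  uint32_t ackdMin = sent - inFlight;

  printf("ackdMin=%u inFlight=%u free=%u\n", ackdMin, inFlight, depth - inFlight);
  return 0;
}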
// Intra-process sync + struct ncclComm* intraComm0; // leader of intra-process comms (self possible) + struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head + int intraRefs; // reference count from intra-process comms (zero if not leader else intraRanks) int intraRank; int intraRanks; - int* intraBarrier; - int intraPhase; - - // Storage for deferred intra-process launch - struct cudaLaunchParams * intraParams; - struct cudaLaunchParams *myParams; - pthread_t* intraThreads; - int* intraCudaDevs; - int* intraCGMode; // Whether we can use CUDA9 CGMD or not - int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not - struct ncclWorkElem args; - void* argsptrs[2]; + uint32_t intraBarrierPhase; + char intraPad1[64 - sizeof(uint64_t)]; + uint64_t intraBarrierCounter; // only used if this is intraComm0 + char intraPad2[64 - sizeof(uint64_t)]; + uint64_t intraBarrierGate; // only used if this is intraComm0 struct ncclProxyState proxyState; @@ -179,39 +248,98 @@ struct ncclComm { int collNetSupport; int intraHighestTransportType; - // Store info of async operations - struct ncclInfo* asyncOps; - int asyncOpCount; - size_t asyncTotalSize; - ssize_t channelSize; - int lastChannel; - enum { ROUND_ROBIN, SHORTEST_QUEUE } asyncAllocMode; + size_t channelSize; // User requested work size (bytes) for channel partitions - //list of async p2p operation queued in a group semantics - ncclP2Plist** p2pSends; - ncclP2Plist** p2pRecvs; - int p2pSendCount; - int p2pRecvCount; + // Internal streams + struct ncclStrongStream deviceStream, hostStream; - // Store info for cudaGraph - int usingCudaGraph; // Only use it during capture time, not launch time - struct ncclQueueInfo* enqueueInfo; - int nQueueInfoCreated; - int nQueueInfoDestroyed; - cudaGraphNode_t lastSetupNode; - unsigned long long lastCudaGraphId; - int driverVersion; - pfn_cuMemGetAddressRange_t pfnCuMemGetAddressRange; - pthread_t graphHelperThread; - struct ncclGraphHelperResources* graphHelperResources; - int disableGraphHelper; - int graphRegister; + // pools backed by comm->memPermanent + struct ncclMemoryPool memPool_ncclProxyOp; + struct ncclMemoryPool memPool_ncclKernelPlan; + struct ncclMemoryPool memPool_ncclPointerList; + // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when + // this comm is not yet in a group. + struct ncclComm* groupNext; + // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. + struct ncclComm* preconnectNext; + int persistentRefs; // number of persistent plan-lists capturing this comm + struct ncclTasks tasks; // user-created reduction ops int userRedOpCapacity, userRedOpFreeHead; ncclUserRedOp *userRedOps; + + // Queue of things for the main thread to do + struct ncclIntruQueueMpsc callbackQueue; + + // List of kernel plans built form tasks. + struct ncclIntruQueue planQueue; + // First of the unlaunched kernels in `planQueue` + struct ncclKernelPlan* unlaunchedPlansHead; }; +// Set to true during an `atexit()` handler. We use this to intentionally leak +// unfreed CUDA resources when cleaning up after return of `main()` to avoid +// CUDA calls after CUDA runtime teardown. 
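// Editor's note (sketch, not NCCL code): the comment above prefers leaking a
// few CUDA allocations over calling into the CUDA runtime after it may have
// been torn down at process exit. A minimal model of that pattern, built
// against the CUDA runtime (compile with nvcc); the helper name is made up:
#include <cstdlib>
#include <cuda_runtime.h>

static bool mainExited = false;
static void onExit() { mainExited = true; }

static void releaseDeviceBuffer(void* devPtr) {
  if (mainExited) return;            // intentionally leak: runtime may be gone
  cudaFree(devPtr);
}

int main() {
  atexit(onExit);
  void* buf = nullptr;
  cudaMalloc(&buf, 1 << 20);
  releaseDeviceBuffer(buf);          // frees here; skipped if reached after exit
  return 0;
}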
+extern bool ncclMainExited; + +enum ncclLaunchMode { + ncclLaunchModeInvalid=0, + ncclLaunchModeParallel, + ncclLaunchModeGroup +}; +extern enum ncclLaunchMode ncclParamLaunchMode; + +void ncclCommPushFree(struct ncclComm* comm, void* buf); +void ncclCommPushCudaFree(struct ncclComm* comm, void* buf); +void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf); +void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle); + +inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm) { + struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/false); + while (cb != nullptr) { + struct ncclCommCallback* next = cb->next; + NCCLCHECK(cb->fn(comm, cb)); // may reclaim memory of cb + cb = next; + } + return ncclSuccess; +} + +inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) { + int phase = comm->intraBarrierPhase; + if (comm->intraRanks == 1) { + // Release everyone (just me). + comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1); + } else { + struct ncclComm* comm0 = comm->intraComm0; + uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE); + if (uint32_t(count) == uint32_t(comm->intraRanks)) { + // Reset. + __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED); + // Release everyone. + __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE); + } + } +} + +// returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x) +inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) { + struct ncclComm* comm0 = comm->intraComm0; + comm->intraBarrierPhase ^= 1; + uint32_t phase = comm->intraBarrierPhase; + uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); + if ((gate & 1) != phase) { + uint64_t t0 = clockNano(); + do { + // Spin vigorously for first 5us. + if (clockNano()-t0 >= 5*1000) sched_yield(); + gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); + } while ((gate & 1) != phase); + } + if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE); + return gate>>32; +} + // Scrambles the bits of non-builtin values of ncclRedOp_t according to the // communicator memory address. Used to catch bugs so that integer handles // associated with this communicator won't collide with handles of other diff --git a/src/include/core.h b/src/include/core.h index 823a016..ac6ea77 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -55,6 +55,7 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) { #include "debug.h" #include "checks.h" +#include "cudawrap.h" #include "alloc.h" #include "utils.h" #include "param.h" diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h new file mode 100644 index 0000000..eaa5949 --- /dev/null +++ b/src/include/cudawrap.h @@ -0,0 +1,88 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CUDAWRAP_H_ +#define NCCL_CUDAWRAP_H_ + +#include + +#if CUDART_VERSION >= 11030 +#include +#else +typedef CUresult (CUDAAPI *PFN_cuInit)(unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion)(int *driverVersion); +typedef CUresult (CUDAAPI *PFN_cuGetProcAddress)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); +#endif + +#define CUPFN(symbol) pfn_##symbol + +// Check CUDA PFN driver calls +#define CUCHECK(cmd) do { \ + CUresult err = pfn_##cmd; \ + if( err != CUDA_SUCCESS ) { \ + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + WARN("Cuda failure '%s'", errStr); \ + return ncclUnhandledCudaError; \ + } \ +} while(false) + +#define CUCHECKGOTO(cmd, res, label) do { \ + CUresult err = pfn_##cmd; \ + if( err != CUDA_SUCCESS ) { \ + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + WARN("Cuda failure '%s'", errStr); \ + res = ncclUnhandledCudaError; \ + goto label; \ + } \ +} while(false) + +// Report failure but clear error and continue +#define CUCHECKIGNORE(cmd) do { \ + CUresult err = pfn_##cmd; \ + if( err != CUDA_SUCCESS ) { \ + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr); \ + } \ +} while(false) + +#define CUCHECKTHREAD(cmd, args) do { \ + CUresult err = pfn_##cmd; \ + if (err != CUDA_SUCCESS) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \ + args->ret = ncclUnhandledCudaError; \ + return args; \ + } \ +} while(0) + +#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol + +#if CUDART_VERSION >= 11030 +/* CUDA Driver functions loaded with cuGetProcAddress for versioning */ +DECLARE_CUDA_PFN_EXTERN(cuDeviceGet); +DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorString); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorName); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange); +DECLARE_CUDA_PFN_EXTERN(cuCtxCreate_v3020); +DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy); +DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent); +#if CUDA_VERSION >= 11070 +DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support +#endif +#endif + +/* CUDA Driver functions loaded with dlsym() */ +DECLARE_CUDA_PFN_EXTERN(cuInit); +DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion); +DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress); + + +ncclResult_t cudaLibraryInit(void); + +#endif diff --git a/src/include/debug.h b/src/include/debug.h index 7af38fd..cd6e53b 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -10,8 +10,8 @@ #include "nccl_net.h" #include #include +#include -#include #include #include #include @@ -21,7 +21,7 @@ extern int ncclDebugLevel; extern uint64_t ncclDebugMask; -extern pthread_mutex_t ncclDebugOutputLock; +extern pthread_mutex_t ncclDebugLock; extern FILE *ncclDebugFile; extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); @@ -29,13 +29,15 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file // Let code temporarily downgrade WARN into INFO extern thread_local int ncclDebugNoWarn; +extern char ncclLastError[]; #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) +#define TRACE_CALL(...) 
ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) #ifdef ENABLE_TRACE #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) -extern std::chrono::high_resolution_clock::time_point ncclEpoch; +extern std::chrono::steady_clock::time_point ncclEpoch; #else #define TRACE(...) #endif diff --git a/src/include/devcomm.h b/src/include/devcomm.h index 8ff9d4b..f8b630e 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -121,7 +121,6 @@ struct ncclRing { // since we need to know how the user expects data to be ordered across // devices. Ordered from current device. int* userRanks; - int* devUserRanks; int index; // This rank's index in the ring }; @@ -146,7 +145,7 @@ struct ncclDirect { }; #define NCCL_MAX_CONNS 2 -struct ncclPeer { +struct ncclChannelPeer { struct ncclConnector send[NCCL_MAX_CONNS]; struct ncclConnector recv[NCCL_MAX_CONNS]; }; @@ -158,30 +157,38 @@ struct ncclDevComm; /* Make sure to adjust padding at the end of ncclWorkElem. */ #define NCCL_WORK_SIZE 512 -enum ncclWorkElemType : uint8_t { +enum ncclWorkType : uint8_t { ncclWorkTypeUnused=0, ncclWorkTypeColl=1, ncclWorkTypeP2p=2, ncclWorkTypeRegColl=3 }; -enum ncclWorkElemSubType : uint8_t { - ncclWorkSubTypeUnused =0, - ncclWorkSubTypeSend, - ncclWorkSubTypeRecv +enum ncclWorkP2PType : uint8_t { + ncclWorkP2pTypeUnused=0, + ncclWorkP2pTypeSend, + ncclWorkP2pTypeRecv }; -struct ncclWorkElemHeader { +struct ncclWorkHeader { + union { + int32_t workNext; // when isLast=0: Offset from kernel argument workHead + uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back. + }; uint16_t funcIndex; - enum ncclWorkElemType type; - unsigned nWarps:5; - unsigned isLast:1; + uint8_t isLast:1; // last work for this kernel + uint8_t inFifo:1; // is this work in the fifo + enum ncclWorkType type; }; struct ncclWorkElem { - struct ncclWorkElemHeader header; - uint8_t regUsed; + union { + uint8_t flagBits; + struct { + uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1; + }; + }; + uint8_t nWarps; uint8_t direct; - uint8_t redOpArgIsPtr; const void * sendbuff; void * recvbuff; @@ -192,22 +199,29 @@ struct ncclWorkElem { uint8_t bid; uint8_t nChannels; uint64_t redOpArg; - uint64_t pad; }; -static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElem) == 0, "ncclWorkElem size must be a multiple of ncclWork size"); + +#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem)) +static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9"); struct ncclWorkElemP2p { - struct ncclWorkElemHeader header; int32_t peer; - void* buff; - size_t count; - int chunkSize; - uint8_t ngroups; - uint8_t warpStart; + enum ncclWorkP2PType p2pType; uint8_t nWarps; - enum ncclWorkElemSubType subType; + uint8_t warpStart; + uint8_t ngroups; + // Important not to use any fields with greater than 4-byte alignment since + // we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if + // there were 8-byte fields. 
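// Editor's note (sketch, not part of the patch): the alignment comment above
// is why the 64-bit buffer pointer and count are carried as two 32-bit
// halves. With only 4-byte members the element stays 28 bytes instead of
// being padded to 32. A standalone stand-in layout (field set simplified)
// showing the size property and the hi/lo round trip:
#include <cstdint>
#include <cstdio>

struct Elem {                        // stand-in for the ncclWorkElemP2p idea
  int32_t  peer;
  uint8_t  type, nWarps, warpStart, ngroups;
  uint32_t buffHi32, buffLo32;       // buff  = (uint64_t)buffHi32<<32 | buffLo32
  uint32_t countHi32, countLo32;     // count = (uint64_t)countHi32<<32 | countLo32
  int32_t  chunkSize;
};
static_assert(sizeof(Elem) == 28, "no 8-byte members, so no padding up to 32");

int main() {
  void* buff = reinterpret_cast<void*>(0x7f00deadbeefull);
  Elem e{};
  e.buffHi32 = uint32_t(reinterpret_cast<uintptr_t>(buff) >> 32);
  e.buffLo32 = uint32_t(reinterpret_cast<uintptr_t>(buff));
  void* back = reinterpret_cast<void*>((uint64_t(e.buffHi32) << 32) | e.buffLo32);
  printf("%s\n", back == buff ? "round-trip ok" : "mismatch");
  return 0;
}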
+ //void* buff; + uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32; + //size_t count; + uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32; + int chunkSize; }; -static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemP2p) == 0, "ncclWorkElemP2p size must be a multiple of ncclWork size"); + +static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16"); +#define NCCL_MAX_WORK_ELEMENTS_P2P 16 struct ncclWorkElemReg { struct ncclWorkElem elem; @@ -215,72 +229,59 @@ struct ncclWorkElemReg { void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; }; -static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemReg) == 0, "ncclWork size must be a multiple of ncclWorkElemReg size"); -static_assert(sizeof(struct ncclWorkElemReg) % sizeof(struct ncclWorkElem) == 0, "ncclWorkElemReg size must be a multiple of ncclWorkElem size"); -#define NCCL_MAX_WORK_ELEMENTS (NCCL_WORK_SIZE/sizeof(struct ncclWorkElem)) -#define NCCL_MAX_WORK_ELEMENTS_P2P (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemP2p)) -#define NCCL_MAX_WORK_ELEMENTS_REG (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemReg)) +#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg)) +static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2"); + // Number of named barriers supported by CUDA #define NCCL_MAX_GROUPS 16 struct ncclWork { + struct ncclWorkHeader header; union { - char pad[NCCL_WORK_SIZE]; - struct ncclWorkElemHeader header; + char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)]; struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P]; struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG]; }; }; +static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE"); +static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0"); -static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "ncclWork size needs to be well aligned"); - -struct ncclChannel { - union { - struct { - struct ncclRing ring; - struct ncclTree tree; - struct ncclDirect collTree; - - int id; - - // Communication structures - struct ncclPeer* peers; - struct ncclPeer* devPeers; - - // Operation list for aggregation - struct ncclWork* workFifo; - int workCount; - size_t totalSize; - uint64_t workFifoTail; // Only used by CPU - uint16_t index; // Only used by GPU - - // GDRCOPY support - struct ncclWork* workFifoGdr; - struct ncclWork* workFifoDev; - void* gdrMemDesc; - }; - int data[0x80]; - }; +struct ncclDevChannelPeer { + // Stripped version of ncclChannelPeer where we only keep the ncclConnInfo + // instead of the full ncclConnector. 
+ struct ncclConnInfo send[NCCL_MAX_CONNS]; + struct ncclConnInfo recv[NCCL_MAX_CONNS]; +}; + +struct alignas(16) ncclDevChannel { + struct ncclDevChannelPeer *peers; + struct ncclRing ring; + struct ncclTree tree; + struct ncclDirect collTree; + uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed }; -static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); struct ncclDevComm { int rank; int nRanks; int buffSizes[NCCL_NUM_PROTOCOLS]; + // Operation list for aggregation + int workFifoDepth; + struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory + // Flag to ask NCCL kernels to abort - volatile uint32_t *abortFlag; + volatile uint32_t* abortFlag; // Channels, device side - struct ncclChannel* channels; + struct ncclDevChannel* channels/*[MAXCHANNELS]*/; }; -struct ncclDevCommAndChannels { - ncclDevComm comm; - ncclChannel channels[MAXCHANNELS]; +struct alignas(16) ncclDevCommAndChannels { + struct ncclDevComm comm; + struct ncclDevChannel channels[MAXCHANNELS]; }; #endif diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 282342b..74b7ccd 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -10,6 +10,7 @@ #include "comm.h" #include "group.h" #include "collectives.h" +#include "utils.h" #define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ @@ -17,117 +18,10 @@ size_t ncclKernMaxLocalSize(); ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut); ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); -ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast); -ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm); -ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm); -ncclResult_t ncclLaunchBarrier(struct ncclComm* comm); -ncclResult_t ncclLaunchKernel(ncclComm_t comm); -ncclResult_t ncclRecordEvents(struct ncclComm* comm); -ncclResult_t ncclLaunchReset(ncclComm_t comm); -ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info); -ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm); -template -void CUDART_CB ncclEnqueueHostSetup(void* arg); -ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph); -ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph); +ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); +ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); +ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); +ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); +ncclResult_t ncclLaunchFinish(struct ncclComm* comm); -struct ncclBuffRegInfo { - void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS]; - void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS]; - void* sendbuffs[NCCL_MAX_LOCAL_RANKS]; - void* recvbuffs[NCCL_MAX_LOCAL_RANKS]; - int nBuffs; -}; - -// Enqueue information (for kernel and proxy) for each operation -struct ncclQueueElem { - struct ncclWork work; - struct ncclProxyOp proxyOp; - struct ncclBuffRegInfo buffRegInfo; -}; - -typedef ncclRecyclableList ncclQueueElemList; - -// Structure passed to CUDA graph -struct ncclQueueInfo { - ncclComm_t comm; - int maxChannels; // Dynamic version of gridDim - ncclResult_t ret; // Return value of host setup call - int nRegBuffs; - ncclQueueElemList* elemList; -}; - -static ncclResult_t ncclCreateQueueInfo(struct ncclQueueInfo** eqInfo, ncclComm_t comm) { - 
NCCLCHECK(ncclCalloc(eqInfo, 1)); - (*eqInfo)->comm = comm; - (*eqInfo)->elemList = new ncclQueueElemList(); - (*eqInfo)->comm->nQueueInfoCreated++; - return ncclSuccess; -} - -// Reset element queue -static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) { - if (eqInfo == NULL) return ncclInternalError; - eqInfo->maxChannels = 0; - eqInfo->ret = ncclSuccess; - eqInfo->nRegBuffs = 0; - eqInfo->elemList->recycle(); - return ncclSuccess; -} - -// Destroy enqueue info space -// used by both CUDA graph and non CUDA graph -static void ncclDestroyQueueInfo(void* ptr) { - if (ptr == NULL) return; - struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr; - struct ncclComm* comm = eqInfo->comm; - // Close IPC mem handles for registered buffers - struct ncclQueueElem* eqElem = eqInfo->elemList->begin(); -#if 0 - // Ideally, the deregistration should happen here - // but currently the destroy function of CUDA objects does not allow CUDA API calls - while (eqElem != NULL) { - for (int i=0; ibuffRegInfo.nBuffs; i++) { - if (i == eqInfo->comm->localRank) continue; - CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i])); - CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i])); - } - eqElem = eqInfo->elemList->getNext(); - } -#else - // Instead, we push these pointers to a pool owned by ncclComm - // and asks a helper thread to close mem handles - struct ncclGraphHelperResources* res = comm->graphHelperResources; - int ipcTailOld = 0; - if (res == NULL || (!comm->graphHelperThread) || eqInfo->nRegBuffs == 0) goto skip; - - pthread_mutex_lock(&res->threadLock); - ipcTailOld = res->ipcTail; - while (eqElem != NULL) { - for (int i=0; ibuffRegInfo.nBuffs; i++) { - if (eqElem->buffRegInfo.sendbuffsBase[i] != NULL) { - res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.sendbuffsBase[i]; - res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE; - } - if (eqElem->buffRegInfo.recvbuffsBase[i] != NULL) { - res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.recvbuffsBase[i]; - res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE; - } - } - eqElem = eqInfo->elemList->getNext(); - } - if (res->ipcTail != ipcTailOld) { - res->threadState = ThreadStart; - TRACE(NCCL_COLL, "CUDA Graph destroy function signaling helper thread with %d IPC handles", res->ipcTail-ipcTailOld); - pthread_cond_signal(&res->threadCond); - } - pthread_mutex_unlock(&res->threadLock); -#endif - -skip: - delete eqInfo->elemList; - free(eqInfo); - comm->nQueueInfoDestroyed++; - return; -} #endif // End include guard diff --git a/src/include/graph.h b/src/include/graph.h index 898b903..1997f76 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -23,7 +23,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); -ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info); +ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm); void ncclTopoFree(struct ncclTopoSystem* system); ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); @@ -33,7 +33,7 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); 
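(Editorial aside on the enqueue.h hunk above, not part of the patch.) The old ncclQueueInfo/ncclQueueElem machinery is removed; launches are now expressed as per-communicator kernel plans driven through the new entry points. The naming suggests roughly the following per-comm sequence, sketched here with an invented driver function; the real loop lives in group.cc/enqueue.cc and interleaves several communicators:

    // Sketch only: assumes the plans were produced earlier by ncclLaunchPrepare(comm)
    // and that ncclLaunchFinish(comm) runs once after all plans have been launched.
    static ncclResult_t launchPlansSketch(struct ncclComm* comm, struct ncclKernelPlan** plans, int nPlans) {
      for (int i = 0; i < nPlans; i++) {
        NCCLCHECK(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plans[i])); // stage work; no uncaptured CUDA here
        NCCLCHECK(ncclLaunchKernel(comm, plans[i]));                        // the CUDA kernel launch itself
        NCCLCHECK(ncclLaunchKernelAfter_NoCuda(comm, plans[i]));            // host-side bookkeeping; no CUDA calls
      }
      return ncclSuccess;
    }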
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); -int ncclPxnDisable(); +int ncclPxnDisable(struct ncclComm* comm); ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank); diff --git a/src/include/group.h b/src/include/group.h index 239b05f..e6f31b1 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -10,15 +10,82 @@ #include "nccl.h" #include "comm.h" -bool ncclAsyncMode(); -ncclResult_t ncclAsyncErrCheck(ncclResult_t ret); +ncclResult_t ncclGroupErrCheck(ncclResult_t ret); +void ncclGroupCommJoin(struct ncclComm* comm); +void ncclGroupCommPreconnect(struct ncclComm* comm); +void ncclGroupCommLeave(struct ncclComm* comm); typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); -typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +struct ncclAsyncJob { + struct ncclAsyncJob* next; + pthread_t thread; + ncclResult_t result; + ncclResult_t(*func)(struct ncclAsyncJob*); + void(*undo)(struct ncclAsyncJob*); + void(*destructor)(void*); +}; + +ncclResult_t ncclAsyncLaunch( + struct ncclAsyncJob* job, + ncclResult_t(*func)(struct ncclAsyncJob*), + void(*undo)(struct ncclAsyncJob*), + void(*destructor)(void*) +); + +ncclResult_t ncclGroupStartInternal(); +ncclResult_t ncclGroupEndInternal(); + +//////////////////////////////////////////////////////////////////////////////// + +extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting +extern __thread ncclResult_t ncclGroupError; +extern __thread struct ncclComm* ncclGroupCommHead; +extern __thread struct ncclComm* ncclGroupCommPreconnectHead; + +inline ncclResult_t ncclGroupStartInternal() { + ncclGroupDepth++; + return ncclSuccess; +} + +inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { + if (ncclGroupDepth > 0) { + if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret; + } + return ret; +} + +// Add comm to this thread's group +inline void ncclGroupCommJoin(struct ncclComm* comm) { + if (comm->groupNext == reinterpret_cast(0x1)) { + // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves + // the users program order yet insures siblings occur consecutively. This + // is required by doLaunches() in "group.cc". + struct ncclComm** pp = &ncclGroupCommHead; + while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) + pp = &(*pp)->groupNext; + comm->groupNext = *pp; + *pp = comm; + // Comms gets a new memory stack scope upon joining. Each task batched for + // this comm is allocated there. 
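    // (Editor's note, not part of the patch: the matching ncclMemoryStackPop is
    // issued by ncclGroupCommLeave below, so task allocations made while the comm
    // is in the group are reclaimed when it leaves.)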
+ ncclMemoryStackPush(&comm->memScoped); + } +} + +// Add comm to this thread's group needing preconnect +inline void ncclGroupCommPreconnect(struct ncclComm* comm) { + if (comm->preconnectNext == reinterpret_cast(0x1)) { + comm->preconnectNext = ncclGroupCommPreconnectHead; + ncclGroupCommPreconnectHead = comm; + } +} + +// Comm has left group +inline void ncclGroupCommLeave(struct ncclComm* comm) { + comm->groupNext = reinterpret_cast(0x1); + ncclMemoryStackPop(&comm->memScoped); +} -ncclResult_t ncclAsyncColl(ncclComm_t comm); #endif diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h index 63555ba..c747589 100644 --- a/src/include/ibvwrap.h +++ b/src/include/ibvwrap.h @@ -1067,6 +1067,9 @@ ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd); ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access); struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); +/* DMA-BUF support */ +ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); +struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr); ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context); ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel); diff --git a/src/include/info.h b/src/include/info.h index 3461cc7..b511728 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -10,6 +10,9 @@ #include "nccl.h" #include "devcomm.h" #include "collectives.h" +#include "core.h" +#include "utils.h" +#include "strongstream.h" typedef enum : uint8_t { ncclPatternRing, @@ -54,4 +57,62 @@ struct ncclInfo { int channelId; }; +inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) { + info->nBytes = info->count * ncclTypeSize(info->datatype); + if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { + info->count = info->nBytes; + info->datatype = ncclInt8; + } + if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank + return ncclSuccess; +} + +struct ncclTaskColl { + struct ncclTaskColl* next; + ncclFunc_t func; + void const* sendbuff; + void* recvbuff; + size_t count; + int root; + ncclDataType_t datatype; + ncclDevRedOpFull op; + int chunkSteps, sliceSteps; +}; +struct ncclTaskP2p { + ncclTaskP2p *next; + void *buff; + size_t bytes; + // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track + // of where it left off. + int chunk; +}; + +struct ncclCudaStreamList { + struct ncclCudaStreamList *next; + cudaStream_t stream; +}; + +struct ncclTasks { + struct Peer { + bool sendSeen, recvSeen; + struct ncclIntruQueue sendQueue; + struct ncclIntruQueue recvQueue; + }; + struct ncclIntruQueue collQueue; + size_t collBytesTotal; + struct Peer* peers/*[nRanks]*/; + int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/; + int nTasksColl, nTasksP2p; + + // The list of user streams aggregated over all tasks present. + struct ncclCudaStreamList* streams; + // The most recent user stream. Ignored if streams==nullptr + cudaStream_t streamRecent; + // The graph capturing all user streams or invalid if none. 
Thus we restrict the + // user that all streams must be captured in the same graph or not captured + // at all. Technically we could probably relax this, but that would mean + // collecting a different `ncclTasks` per graph and one for non-graph. + struct ncclCudaGraph capturingGraph; +}; + #endif diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index ce61672..255a44e 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -14,12 +14,13 @@ #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 +#define NCCL_PTR_DMABUF 0x4 // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 8 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); @@ -28,15 +29,15 @@ typedef struct { char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int speed; // Port speed in Mbps. int port; // Port number. float latency; // Network latency int maxComms; // Maximum number of comms we can create int maxRecvs; // Maximum number of grouped receives. -}ncclNetProperties_v5_t; +}ncclNetProperties_v6_t; -typedef ncclNetProperties_v5_t ncclNetProperties_t; +typedef ncclNetProperties_v6_t ncclNetProperties_t; typedef struct { // Name of the network (mainly for logs) @@ -46,7 +47,103 @@ typedef struct { // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +typedef ncclNet_v6_t ncclNet_t; + +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6 + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). 
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v6_t; + +typedef ncclCollNet_v6_t ncclCollNet_t; + +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6 + +// v5 struct for backwards compatibility +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. @@ -83,10 +180,7 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v5_t; -typedef ncclNet_v5_t ncclNet_t; - -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v5 - +// v5 struct for backwards compatibility typedef struct { // Name of the collective network (mainly for logs) const char* name; @@ -96,7 +190,7 @@ typedef struct { // If ndev returns 0, all other functions might be set to NULL. ncclResult_t (*devices)(int* ndev); // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create connections. @@ -125,10 +219,7 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v5_t; -typedef ncclCollNet_v5_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v5 - +// v4 struct for backwards compatibility typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. 
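(Editorial aside, not part of the patch.) NCCL resolves an external network plugin by looking up the symbol named by NCCL_PLUGIN_SYMBOL, which this change bumps to ncclNetPlugin_v6; the v6 table adds regMrDmaBuf and the NCCL_PTR_DMABUF pointer type for DMA-BUF registration. A plugin built against this header would export something along these lines (everything except the type and symbol names is invented, and only the first few callbacks are shown):

    #include "nccl_net.h"

    static ncclResult_t exampleInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
    static ncclResult_t exampleDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
    /* getProperties, listen, connect, accept, regMr, regMrDmaBuf, deregMr, isend,
     * irecv, iflush, test, closeSend, closeRecv and closeListen would be filled in
     * the same way; members omitted from the initializer below stay NULL. */

    ncclNet_v6_t ncclNetPlugin_v6 = {
      "example",        /* name */
      exampleInit,
      exampleDevices,
    };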
@@ -140,6 +231,7 @@ typedef struct { int maxComms; // Maximum number of comms we can create } ncclNetProperties_v4_t; +// v4 struct for backwards compatibility typedef struct { // Name of the network (mainly for logs) const char* name; @@ -179,6 +271,7 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v4_t; +// v4 struct for backwards compatibility typedef struct { // Name of the collective network (mainly for logs) const char* name; diff --git a/src/include/net.h b/src/include/net.h index 0cc5067..5a7b5e3 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -9,33 +9,36 @@ #include "nccl.h" #include "nccl_net.h" +#include "comm.h" #include "checks.h" -extern ncclNet_t* ncclNet; typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; -ncclResult_t ncclNetInit(); -int ncclNetVersion(); +ncclResult_t ncclNetPluginInit(); +ncclResult_t ncclNetInit(struct ncclComm* comm); +int ncclNetVersion(struct ncclComm* comm); // Translation to external API -static const char* ncclNetName() { return ncclNet->name; } -static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; } -static ncclResult_t ncclNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclNet->getProperties(dev, props)); return ncclSuccess; } -static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } -static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } -static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } -static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } -static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; } -static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; } -static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; } -static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; } -static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; } -static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; } -static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } -static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } +static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; } +static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; } +static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { 
NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; } +static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } +static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } +static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } +static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; } +/* DMA-BUF support */ +static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } +static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; } +static ncclResult_t ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; } +static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; } +static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; } +static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; } +static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; } +static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; } +static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; } // Test whether the current GPU support GPU Direct RDMA. 
-ncclResult_t ncclGpuGdrSupport(int* gdrSupport); +ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); extern ncclNet_t ncclNetIb; extern ncclNet_t ncclNetSocket; diff --git a/src/include/p2p.h b/src/include/p2p.h index 2519873..69d1ea7 100644 --- a/src/include/p2p.h +++ b/src/include/p2p.h @@ -9,19 +9,4 @@ #ifndef NCCL_P2P_H_ #define NCCL_P2P_H_ -struct ncclP2Pinfo { - void* buff; - ssize_t nbytes; -}; - -typedef ncclRecyclableList ncclP2Plist; - -static ncclResult_t ncclSaveP2pInfo(ncclP2Plist* &p2p, void* buff, ssize_t nBytes) { - if (p2p == NULL) p2p = new ncclP2Plist(); - struct ncclP2Pinfo* next; - NCCLCHECK(p2p->getNewElem(&next)); - next->buff = buff; - next->nbytes = nBytes; - return ncclSuccess; -} #endif diff --git a/src/include/proxy.h b/src/include/proxy.h index c7ca0aa..dcab5e2 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -32,11 +32,16 @@ struct ncclProxyOp { int sliceSteps; int chunkSteps; int chunkSize; - ncclDataType_t dtype; - ncclRedOp_t redOp; - ncclPattern_t pattern; // uint8_t + uint8_t /*ncclDataType_t*/ dtype; + uint8_t /*ncclDevRedOp_t*/ redOp; + uint8_t /*ncclPattern_t*/ pattern; uint8_t protocol; - uint16_t pad; + + union { + uint64_t unused; + // For use by enqueue.cc + struct ncclProxyOp *enqNext; + }; }; static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch"); @@ -68,9 +73,9 @@ struct ncclProxyArgs { int sliceSteps; int chunkSteps; int chunkSize; - ncclDataType_t dtype; - ncclRedOp_t redOp; - ncclPattern_t pattern; + uint8_t /*ncclDataType_t*/ dtype; + uint8_t /*ncclDevRedOp_t*/ redOp; + uint8_t /*ncclPattern_t*/ pattern; uint8_t protocol; int state; char* sharedBuff[NCCL_STEPS]; @@ -158,6 +163,7 @@ struct ncclProxyState { pthread_t thread; struct ncclSocket* listenSock; int stop; + CUcontext cudaCtx; // Used by main thread union ncclSocketAddress* peerAddresses; @@ -187,9 +193,8 @@ enum proxyMode { proxyTo = 2 }; -ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* proxyOp, int nranks); +ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire); ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp); -ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* proxyOp); ncclResult_t ncclProxyStart(struct ncclComm* comm); ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses); ncclResult_t ncclProxyCreate(struct ncclComm* comm); diff --git a/src/include/strongstream.h b/src/include/strongstream.h new file mode 100644 index 0000000..b72f77c --- /dev/null +++ b/src/include/strongstream.h @@ -0,0 +1,142 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_STRONGSTREAM_H_ +#define NCCL_STRONGSTREAM_H_ + +#include "nccl.h" +#include "checks.h" + +#include + +/* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes + * easily. 
+ */ +struct ncclCudaGraph { +#if CUDART_VERSION >= 11030 + cudaGraph_t graph; + uint64_t graphId; +#endif +}; + +inline struct ncclCudaGraph ncclCudaGraphNull() { + struct ncclCudaGraph tmp; + #if CUDART_VERSION >= 11030 + tmp.graph = nullptr; + tmp.graphId = ULLONG_MAX; + #endif + return tmp; +} + +inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) { + #if CUDART_VERSION >= 11030 + return graph.graph != nullptr; + #else + return false; + #endif +} + +inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) { + #if CUDART_VERSION >= 11030 + return a.graphId == b.graphId; + #else + return true; + #endif +} + +ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, cudaStream_t stream); +ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg); + + +/* ncclStrongStream: An abstraction over CUDA streams that do not lose their + * identity while being captured. Regular streams have the deficiency that the + * captured form of a stream in one graph launch has no relation to the + * uncaptured stream or to the captured form in other graph launches. This makes + * streams unfit for the use of serializing access to a persistent resource. + * Strong streams have been introduced to address this need. + * + * Constraints of using strong streams: + * + * - Operations that enqueue work to the strong stream need to be enclosed by + * ncclStrongStream[Acquire/Release] pairs. Acquire/release act like fences, + * the strong stream is not stateful so there is no harm in redundant acquire + * or releases. + * + * - An {Acquire; ...; Release} sequence must not be concurrent with any + * other operations against the strong stream including graph launches which + * reference this stream. + * + * - All strong stream functions take a "graph" parameter which must reference + * the currently capturing graph, or null if none. + */ +struct ncclStrongStream; + +ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss); +ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss); + +// Has this strong stream ever been captured in a graph. +bool ncclStrongStreamEverCaptured(struct ncclStrongStream* ss); + +// Acquire-fence the strong stream. +ncclResult_t ncclStrongStreamAcquire( + struct ncclCudaGraph graph, struct ncclStrongStream* ss +); + +// Acquire-fence the strong stream assuming no graph is capturing. This permits +// the caller to enqueue directly to the `ss->stream` member using native CUDA +// calls. Strong stream must be released via: +// ncclStrongStreamRelease(ncclCudaGraphNull(), graphRefs, ss); +ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss); + +// Release-fence of the strong stream. +ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss); + +// Add a host launch to the stream. +ncclResult_t ncclStrongStreamLaunchHost( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, + cudaHostFn_t fn, void* arg +); +// Add a kernel launch to the stream. +ncclResult_t ncclStrongStreamLaunchKernel( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, + void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes +); +// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired. +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b +); +// `b` must be capturing within `graph`. 
+ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b +); +// `a` must be capturing within `graph`. +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b +); + +// Synchrnoization does not need the strong stream to be acquired. +ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); + +//////////////////////////////////////////////////////////////////////////////// + +struct ncclStrongStream { + cudaStream_t stream; + cudaEvent_t event; + #if CUDART_VERSION >= 11030 + cudaGraphNode_t node; // null if never captured, otherwise never null again + uint64_t graphId:63, eventIsLagging:1; + #endif +}; + +inline bool ncclStrongStreamEverCaptured(struct ncclStrongStream* ss) { + #if CUDART_VERSION >= 11030 + return ss->node != nullptr; + #else + return false; + #endif +} + +#endif diff --git a/src/include/transport.h b/src/include/transport.h index 043a415..e13c9e8 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -20,7 +20,12 @@ #include "proxy.h" -extern struct ncclTransport ncclTransports[]; +extern struct ncclTransport p2pTransport; +extern struct ncclTransport shmTransport; +extern struct ncclTransport netTransport; +extern struct ncclTransport collNetTransport; + +extern struct ncclTransport* ncclTransports[]; // Forward declarations struct ncclRing; @@ -63,7 +68,7 @@ struct ncclTransport { struct ncclTransportComm recv; }; -ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); +ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); enum { collNetRecv=0, collNetSend=1 }; diff --git a/src/include/utils.h b/src/include/utils.h index f08ff37..0604d15 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -8,8 +8,12 @@ #define NCCL_UTILS_H_ #include "nccl.h" +#include "alloc.h" #include "checks.h" #include +#include +#include +#include int ncclCudaCompCap(); @@ -38,81 +42,446 @@ static long log2i(long n) { return l; } -// Recyclable list that avoids frequent malloc/free +inline uint64_t clockNano() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec; +} + +//////////////////////////////////////////////////////////////////////////////// + +template +inline void ncclAtomicRefCountIncrement(Int* refs) { + __atomic_fetch_add(refs, 1, __ATOMIC_RELAXED); +} + +template +inline Int ncclAtomicRefCountDecrement(Int* refs) { + return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL); +} + +//////////////////////////////////////////////////////////////////////////////// +/* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that + * granularity of LIFO is not per object, instead frames containing many objects + * are pushed and popped. Therefor deallocation is extremely cheap since its + * done at the frame granularity. + * + * The initial state of the stack is with one frame, the "nil" frame, which + * cannot be popped. Therefor objects allocated in the nil frame cannot be + * deallocated sooner than stack destruction. 
+ */ +struct ncclMemoryStack; + +void ncclMemoryStackConstruct(struct ncclMemoryStack* me); +void ncclMemoryStackDestruct(struct ncclMemoryStack* me); +void ncclMemoryStackPush(struct ncclMemoryStack* me); +void ncclMemoryStackPop(struct ncclMemoryStack* me); template -struct ncclListElem { - T data; - struct ncclListElem* next; +T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1); + +//////////////////////////////////////////////////////////////////////////////// +/* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for + * a pool instance to ever hold objects whose type have differing + * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by + * a backing `ncclMemoryStack` passed during Alloc(). If memory + * backing any currently held object is deallocated then it is an error to do + * anything other than reconstruct it, after which it is a valid empty pool. + */ +struct ncclMemoryPool; + +// Equivalent to zero-initialization +void ncclMemoryPoolConstruct(struct ncclMemoryPool* me); +template +T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing); +template +void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj); +void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from); + +//////////////////////////////////////////////////////////////////////////////// +/* ncclIntruQueue: A singly-linked list queue where the per-object next pointer + * field is given via the `next` template argument. + * + * Example: + * struct Foo { + * struct Foo *next1, *next2; // can be a member of two lists at once + * }; + * ncclIntruQueue list1; + * ncclIntruQueue list2; + */ +template +struct ncclIntruQueue; + +template +void ncclIntruQueueConstruct(ncclIntruQueue *me); +template +bool ncclIntruQueueEmpty(ncclIntruQueue *me); +template +T* ncclIntruQueueHead(ncclIntruQueue *me); +template +void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x); +template +T* ncclIntruQueueDequeue(ncclIntruQueue *me); +template +T* ncclIntruQueueTryDequeue(ncclIntruQueue *me); +template +void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *memPool); + +//////////////////////////////////////////////////////////////////////////////// +/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex" + * and "cond" fields are part of the public interface. + */ +struct ncclThreadSignal { + pthread_mutex_t mutex; + pthread_cond_t cond; }; -template -class ncclRecyclableList { - private: - struct ncclListElem* head; - struct ncclListElem* tail; - struct ncclListElem* cursor; - int n; +// returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER} +constexpr ncclThreadSignal ncclThreadSignalStaticInitializer(); - public: - ncclRecyclableList() { - tail = cursor = head = NULL; - n = 0; - } +void ncclThreadSignalConstruct(struct ncclThreadSignal* me); +void ncclThreadSignalDestruct(struct ncclThreadSignal* me); - int count() const { return n; } +// A convenience instance per-thread. 
+extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance; - // Get a new element from the list and return pointer - ncclResult_t getNewElem(T** dataOut) { - if (tail != NULL) { - *dataOut = &tail->data; - memset(*dataOut, 0, sizeof(T)); - } else { - NCCLCHECK(ncclCalloc(&tail, 1)); - *dataOut = &tail->data; - cursor = head = tail; - } - if (tail->next == NULL) { - NCCLCHECK(ncclCalloc(&tail->next, 1)); - } - tail = tail->next; - n += 1; - return ncclSuccess; - } +//////////////////////////////////////////////////////////////////////////////// - T* begin() { - if (head == NULL || head == tail) return NULL; - cursor = head->next; - return &head->data; - } +template +struct ncclIntruQueueMpsc; - // Get next element from the list during an iteration - T* getNext() { - // tail always points to the next element to be enqueued - // hence does not contain valid data - if (cursor == NULL || cursor == tail) return NULL; - T* rv = &cursor->data; - cursor = cursor->next; - return rv; - } +template +void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me); +template +bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me); +// Enqueue element. Returns true if queue is not abandoned. Even if queue is +// abandoned the element enqueued, so the caller needs to make arrangements for +// the queue to be tended. +template +bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc* me, T* x); +// Dequeue all elements at a glance. If there aren't any and `waitSome` is +// true then this call will wait until it can return a non empty list. +template +T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc* me, bool waitSome); +// Dequeue all elements and set queue to abandoned state. +template +T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc* me); - T* peakNext() { - if (cursor == NULL || cursor == tail) return NULL; - return &cursor->data; - } +//////////////////////////////////////////////////////////////////////////////// - // Recycle the list without freeing the space - void recycle() { - tail = cursor = head; - n = 0; - } +struct ncclMemoryStack { + struct Hunk { + struct Hunk* above; // reverse stack pointer + size_t size; // size of this allocation (including this header struct) + }; + struct Unhunk { // proxy header for objects allocated out-of-hunk + struct Unhunk* next; + void* obj; + }; + struct Frame { + struct Hunk* hunk; // top of non-empty hunks + uintptr_t bumper, end; // points into top hunk + struct Unhunk* unhunks; + struct Frame* below; + }; - ~ncclRecyclableList() { - while (head != NULL) { - struct ncclListElem* temp = head; - head = head->next; - free(temp); - } - } + static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align); + static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align); + + struct Hunk stub; + struct Frame topFrame; }; +inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) { + me->stub.above = nullptr; + me->stub.size = 0; + me->topFrame.hunk = &me->stub; + me->topFrame.bumper = 0; + me->topFrame.end = 0; + me->topFrame.unhunks = nullptr; + me->topFrame.below = nullptr; +} + +inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) { + uintptr_t o = (me->topFrame.bumper + align-1) & -uintptr_t(align); + void* obj; + if (__builtin_expect(o + size <= me->topFrame.end, true)) { + me->topFrame.bumper = o + size; + obj = reinterpret_cast(o); + } else { + obj = allocateSpilled(me, size, align); + } + return obj; +} + +template +inline T* 
ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) { + void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T)); + memset(obj, 0, n*sizeof(T)); + return (T*)obj; +} + +inline void ncclMemoryStackPush(struct ncclMemoryStack* me) { + using Frame = ncclMemoryStack::Frame; + Frame tmp = me->topFrame; + Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame)); + *snapshot = tmp; // C++ struct assignment + me->topFrame.unhunks = nullptr; + me->topFrame.below = snapshot; +} + +inline void ncclMemoryStackPop(struct ncclMemoryStack* me) { + ncclMemoryStack::Unhunk* un = me->topFrame.unhunks; + while (un != nullptr) { + free(un->obj); + un = un->next; + } + me->topFrame = *me->topFrame.below; // C++ struct assignment +} + + +//////////////////////////////////////////////////////////////////////////////// + +struct ncclMemoryPool { + struct Cell { + Cell *next; + }; + template + union CellSized { + Cell cell; + alignas(Align) char space[Size]; + }; + struct Cell* head; + struct Cell* tail; // meaningful only when head != nullptr +}; + +inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { + me->head = nullptr; +} + +template +inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) { + using Cell = ncclMemoryPool::Cell; + using CellSized = ncclMemoryPool::CellSized; + Cell* cell; + if (__builtin_expect(me->head != nullptr, true)) { + cell = me->head; + me->head = cell->next; + } else { + // Use the internal allocate() since it doesn't memset to 0 yet. + cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized)); + } + memset(cell, 0, sizeof(T)); + return reinterpret_cast(cell); +} + +template +inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) { + using Cell = ncclMemoryPool::Cell; + Cell* cell = reinterpret_cast(obj); + cell->next = me->head; + if (me->head == nullptr) me->tail = cell; + me->head = cell; +} + +inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) { + if (from->head != nullptr) { + from->tail->next = me->head; + if (me->head == nullptr) me->tail = from->tail; + me->head = from->head; + from->head = nullptr; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template +struct ncclIntruQueue { + T *head, *tail; +}; + +template +inline void ncclIntruQueueConstruct(ncclIntruQueue *me) { + me->head = nullptr; + me->tail = nullptr; +} + +template +inline bool ncclIntruQueueEmpty(ncclIntruQueue *me) { + return me->head == nullptr; +} + +template +inline T* ncclIntruQueueHead(ncclIntruQueue *me) { + return me->head; +} + +template +inline T* ncclIntruQueueTail(ncclIntruQueue *me) { + return me->tail; +} + +template +inline void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x) { + x->*next = nullptr; + (me->head ? 
me->tail->*next : me->head) = x; + me->tail = x; +} + +template +inline T* ncclIntruQueueDequeue(ncclIntruQueue *me) { + T *ans = me->head; + me->head = ans->*next; + if (me->head == nullptr) me->tail = nullptr; + return ans; +} + +template +inline T* ncclIntruQueueTryDequeue(ncclIntruQueue *me) { + T *ans = me->head; + if (ans != nullptr) { + me->head = ans->*next; + if (me->head == nullptr) me->tail = nullptr; + } + return ans; +} + +template +void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *pool) { + T *head = me->head; + me->head = nullptr; + me->tail = nullptr; + while (head != nullptr) { + T *tmp = head->*next; + ncclMemoryPoolFree(pool, tmp); + head = tmp; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { + return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; +} + +inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) { + pthread_mutex_init(&me->mutex, nullptr); + pthread_cond_init(&me->cond, nullptr); +} + +inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) { + pthread_mutex_destroy(&me->mutex); + pthread_cond_destroy(&me->cond); +} + +//////////////////////////////////////////////////////////////////////////////// + +template +struct ncclIntruQueueMpsc { + T* head; + uintptr_t tail; + struct ncclThreadSignal* waiting; +}; + +template +void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me) { + me->head = nullptr; + me->tail = 0x0; + me->waiting = nullptr; +} + +template +bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me) { + return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2; +} + +template +bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc* me, T* x) { + __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED); + uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast(x), __ATOMIC_ACQ_REL); + T* prev = reinterpret_cast(utail); + T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next); + __atomic_store_n(prevNext, x, __ATOMIC_RELAXED); + if (utail == 0x1) { // waiting + __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting + // This lock/unlock is essential to ensure we don't race ahead of the consumer + // and signal the cond before they begin waiting on it. + struct ncclThreadSignal* waiting = me->waiting; + pthread_mutex_lock(&waiting->mutex); + pthread_mutex_unlock(&waiting->mutex); + pthread_cond_broadcast(&waiting->cond); + } + return utail != 0x2; // not abandoned +} + +template +T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc* me, bool waitSome) { + T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); + if (head == nullptr) { + if (!waitSome) return nullptr; + uint64_t t0 = clockNano(); + bool sleeping = false; + do { + if (clockNano()-t0 >= 10*1000) { // spin for first 10us + struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance; + pthread_mutex_lock(&waitSignal->mutex); + uintptr_t expected = sleeping ? 
0x1 : 0x0; + uintptr_t desired = 0x1; + me->waiting = waitSignal; // release done by successful compare exchange + if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) { + sleeping = true; + pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex); + } + pthread_mutex_unlock(&waitSignal->mutex); + } + head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); + } while (head == nullptr); + } + + __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); + uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL); + T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); + T *x = head; + while (x != tail) { + T *x1; + int spins = 0; + while (true) { + x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); + if (x1 != nullptr) break; + if (++spins == 1024) { spins = 1024-1; sched_yield(); } + } + x = x1; + } + return head; +} + +template +T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc* me) { + uintptr_t expected = 0x0; + if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) { + return nullptr; + } else { + int spins = 0; + T* head; + while (true) { + head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); + if (head != nullptr) break; + if (++spins == 1024) { spins = 1024-1; sched_yield(); } + } + __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); + uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL); + T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); + T *x = head; + while (x != tail) { + T *x1; + spins = 0; + while (true) { + x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); + if (x1 != nullptr) break; + if (++spins == 1024) { spins = 1024-1; sched_yield(); } + } + x = x1; + } + return head; + } +} #endif diff --git a/src/init.cc b/src/init.cc index c6b6e8f..9269708 100644 --- a/src/init.cc +++ b/src/init.cc @@ -28,10 +28,6 @@ #define STR2(v) #v #define STR(v) STR2(v) -#ifdef ENABLE_TRACE -std::chrono::high_resolution_clock::time_point ncclEpoch; -#endif - #if CUDART_VERSION >= 9020 #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream #else @@ -46,6 +42,17 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); +static uint64_t hashUniqueId(ncclUniqueId const &id) { + char const *bytes = (char const*)&id; + uint64_t h = 0xdeadbeef; + for(int i=0; i < (int)sizeof(ncclUniqueId); i++) { + h ^= h >> 32; + h *= 0x8db3db47fa2994ad; + h += bytes[i]; + } + return h; +} + // GDRCOPY support: Off by default NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); @@ -65,18 +72,28 @@ NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; static bool initialized = false; static size_t maxLocalSizeBytes = 0; + +bool ncclMainExited = false; + +static void atexitHandler() { + ncclMainExited = true; +} + static ncclResult_t ncclInit() { - if (initialized) return ncclSuccess; + if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess; pthread_mutex_lock(&initLock); if (!initialized) { + atexit(atexitHandler); initEnv(); initGdrCopy(); maxLocalSizeBytes = ncclKernMaxLocalSize(); int carveout = ncclParamL1SharedMemoryCarveout(); if (carveout) ncclKernSetSharedMemoryCarveout(carveout); - NCCLCHECK(ncclNetInit()); - INFO(NCCL_INIT, "Using network %s", ncclNetName()); - initialized = true; + // Always initialize bootstrap network + 
NCCLCHECK(bootstrapNetInit()); + NCCLCHECK(ncclNetPluginInit()); + + __atomic_store_n(&initialized, true, __ATOMIC_RELEASE); } pthread_mutex_unlock(&initLock); return ncclSuccess; @@ -93,7 +110,9 @@ NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out); ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { NCCLCHECK(ncclInit()); NCCLCHECK(PtrCheck(out, "GetUniqueId", "out")); - return bootstrapGetUniqueId(out); + ncclResult_t res = bootstrapGetUniqueId(out); + TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out)); + return res; } // Prevent compiler from optimizing out these operations @@ -104,11 +123,96 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { #endif void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) { + // Important that this does not trash intraComm0 & intraRefs. comm->rank = comm->cudaDev = comm->busId = comm->nRanks = -1; } #undef NCCL_NO_OPTIMIZE + +static ncclResult_t ncclDestructorFnFree(struct ncclDestructor* dtor) { + free(dtor->obj); + return ncclSuccess; +} +void ncclCommPushFree(struct ncclComm* comm, void* obj) { + struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); + dtor->fn = ncclDestructorFnFree; + dtor->obj = obj; + dtor->next = comm->destructorHead; + comm->destructorHead = dtor; +} + +static ncclResult_t ncclDestructorFnCudaFree(struct ncclDestructor* dtor) { + CUDACHECK(cudaFree(dtor->obj)); + return ncclSuccess; +} +void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) { + struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); + dtor->fn = ncclDestructorFnCudaFree; + dtor->obj = obj; + dtor->next = comm->destructorHead; + comm->destructorHead = dtor; +} + +static ncclResult_t ncclDestructorFnCudaHostFree(struct ncclDestructor* dtor) { + CUDACHECK(cudaFreeHost(dtor->obj)); + return ncclSuccess; +} +void ncclCommPushCudaHostFree(struct ncclComm* comm, void* obj) { + struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); + dtor->fn = ncclDestructorFnCudaHostFree; + dtor->obj = obj; + dtor->next = comm->destructorHead; + comm->destructorHead = dtor; +} + +static ncclResult_t ncclDestructorFnCudaGdrFree(struct ncclDestructor* dtor) { + NCCLCHECK(ncclGdrCudaFree(dtor->obj)); + return ncclSuccess; +} +void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle) { + struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); + dtor->fn = ncclDestructorFnCudaGdrFree; + dtor->obj = handle; + dtor->next = comm->destructorHead; + comm->destructorHead = dtor; +} + +void commZombieCleanup(struct ncclComm* comm) { + ncclMemoryStackDestruct(&comm->memScoped); + ncclMemoryStackDestruct(&comm->memPermanent); + + struct ncclComm* intraComm0 = comm->intraComm0; + if (0 == ncclAtomicRefCountDecrement(&intraComm0->intraRefs)) { + // Wait for all service threads to be done. 
We could not + // do it earlier because it could have blocked and prevented + // other ranks in the process to call ncclCommDestroy + comm = intraComm0; + while (comm != nullptr) { + if (comm->proxyState.thread) pthread_join(comm->proxyState.thread, nullptr); + struct ncclComm* next = comm->intraNext; + free(comm); + comm = next; + } + } +} + +static void* commZombieMain(void* arg) { + ncclResult_t result = ncclSuccess; + struct ncclComm* comm = (struct ncclComm*)arg; + while (comm->persistentRefs != 0) { + struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/true); + while (cb != nullptr) { + struct ncclCommCallback* next = cb->next; + NCCLCHECKGOTO(cb->fn(comm, cb), result, ignore); // may reclaim memory of cb + ignore: + cb = next; + } + } + commZombieCleanup(comm); + return arg; +} + static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; @@ -120,13 +224,6 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->connectSend); free(comm->connectRecv); - for (int peer=0; peernRanks; peer++) { - delete comm->p2pSends[peer]; - delete comm->p2pRecvs[peer]; - } - free(comm->p2pSends); - free(comm->p2pRecvs); - free(comm->asyncOps); free(comm->peerInfo); ncclTopoFree(comm->topo); @@ -138,51 +235,60 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); - CUDACHECK(cudaFree((ncclDevCommAndChannels*)comm->devComm)); - for (int channel=0; channelchannels+channel, comm->nRanks)); - if (comm->doneEvent != NULL) - CUDACHECK(cudaEventDestroy(comm->doneEvent)); + NCCLCHECK(ncclStrongStreamDestruct(&comm->hostStream)); + NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream)); - if (comm->intDoneEvent != NULL) - CUDACHECK(cudaEventDestroy(comm->intDoneEvent)); - - if (comm->launchMode == ncclComm::GROUP) { - CUDACHECK(cudaStreamDestroy(comm->groupStream)); - } - - // Last rank frees shared resources between threads - int isLast; - NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); - if (isLast) { - // Wait for all service threads to be done. We could not - // do it earlier because it could have blocked and prevented - // other ranks in the process to call ncclCommDestroy - for (int i=0; iintraRanks; i++) { - void* ret; - if (comm->intraThreads[i]) pthread_join(comm->intraThreads[i], &ret); - } - free(comm->intraBarrier); - free(comm->intraParams); - free(comm->intraThreads); - free(comm->intraCudaDevs); - free(comm->intraCGMode); - free(comm->intraCC); - } NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag)); - // Poison comm to try and catch a double free - commPoison(comm); + struct ncclDestructor* dtor = comm->destructorHead; + while (dtor != nullptr) { + NCCLCHECK(dtor->fn(dtor)); + dtor = dtor->next; + } - free(comm); + commPoison(comm); // Important that this does not interfere with anything used below. + + if (comm->persistentRefs == 0) { + commZombieCleanup(comm); + } else { + // Spawn a thread to listen for remaining messages from graph cleanup. 
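// ----------------------------------------------------------------------------
// Illustrative sketch, not part of this diff: the general shape of the "zombie"
// hand-off used below. When captured graphs still hold persistent references,
// the comm is handed to a detached thread that keeps draining queued callbacks
// until the last reference drops, and only then runs the final cleanup, so the
// destroying caller can return immediately. All names here (PendingWork,
// zombieMain, retireAsync) are invented for the example; they are not NCCL
// symbols, and the real code drains an ncclIntruQueueMpsc rather than spinning.
#include <pthread.h>
struct PendingWork {
  int persistentRefs;   // dropped as captured graphs get destroyed
  // ... queue of cleanup callbacks to drain ...
};
static void* zombieMain(void* arg) {
  PendingWork* w = (PendingWork*)arg;
  while (__atomic_load_n(&w->persistentRefs, __ATOMIC_ACQUIRE) != 0) {
    // dequeue and run callbacks here; each callback may drop persistentRefs
  }
  delete w;             // final cleanup runs off the destroying thread
  return nullptr;
}
static void retireAsync(PendingWork* w) {  // w must come from `new`
  pthread_t t;
  pthread_create(&t, nullptr, zombieMain, w);
  pthread_detach(t);    // caller returns without waiting for cleanup
}
// ----------------------------------------------------------------------------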
+ pthread_t zombie; + pthread_create(&zombie, nullptr, commZombieMain, comm); + pthread_detach(zombie); + } return ncclSuccess; } NCCL_PARAM(AggChannelSize, "AGG_CHANNEL_SIZE", -2); NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0); -NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 0); +// GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory +NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); +NCCL_PARAM(WorkFifoDepth, "WORK_FIFO_DEPTH", 64<<10); +enum ncclLaunchMode ncclParamLaunchMode; + +NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 1); + +// Detect DMA-BUF support +static ncclResult_t dmaBufSupported(struct ncclComm* comm) { + if (ncclParamDmaBufEnable() == 0 || comm->ncclNet->regMrDmaBuf == NULL) return ncclInternalError; +#if CUDA_VERSION >= 11070 + int flag = 0; + CUdevice dev; + int cudaDriverVersion; + CUCHECK(cuDriverGetVersion(&cudaDriverVersion)); + if (cudaDriverVersion < 11070) return ncclInternalError; + CUCHECK(cuDeviceGet(&dev, comm->cudaDev)); + // Query device to see if DMA-BUF support is available + (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev)); + if (flag == 0) return ncclInternalError; + INFO(NCCL_INIT, "DMA-BUF is available on GPU device %d", comm->cudaDev); + return ncclSuccess; +#endif + return ncclInternalError; +} static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { if (ndev < 1) { @@ -194,100 +300,114 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { return ncclInvalidArgument; } - // Try to create a CUDA object right away. If there is something wrong with - // the device we're on (failure cause #1) , better know it early. - cudaEvent_t doneEvent; - CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming)); - cudaEvent_t intDoneEvent; - CUDACHECK(cudaEventCreateWithFlags(&intDoneEvent, cudaEventDisableTiming)); - struct ncclComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); + ncclMemoryStackConstruct(&comm->memPermanent); + ncclMemoryStackConstruct(&comm->memScoped); + comm->destructorHead = nullptr; + comm->rank = rank; + comm->nRanks = ndev; + + NCCLCHECK(ncclNetInit(comm)); + INFO(NCCL_INIT, "Using network %s", ncclNetName(comm)); + + // Try to create a CUDA object right away. If there is something wrong with + // the device we're on (failure cause #1) , better know it early. + NCCLCHECK(ncclStrongStreamConstruct(&comm->deviceStream)); + NCCLCHECK(ncclStrongStreamConstruct(&comm->hostStream)); - comm->rank = comm->hostDevComm.rank = rank; - comm->nRanks = comm->hostDevComm.nRanks = ndev; cudaGetDevice(&comm->cudaDev); NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId); - comm->doneEvent = doneEvent; - comm->intDoneEvent = intDoneEvent; comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; -#if CUDART_VERSION >= 9020 - comm->groupCudaStream = ncclParamGroupCudaStream(); -#else - // Don't allow the user to overload the default setting in older CUDA builds - comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM; -#endif + comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? 
true : false; comm->fatalError = ncclSuccess; NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1)); - comm->hostDevComm.abortFlag = comm->abortFlag; *comm->abortFlag = 0; - comm->argsptrs[0] = &comm->devComm; - comm->argsptrs[1] = &comm->args; comm->collNetSupport = 0; - NCCLCHECK(ncclCalloc(&comm->asyncOps, NCCL_MAX_OPS)); - comm->asyncOpCount = 0; - comm->asyncTotalSize = 0; + ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan); + ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp); + ncclMemoryPoolConstruct(&comm->memPool_ncclPointerList); + + comm->groupNext = reinterpret_cast(0x1); + comm->preconnectNext = reinterpret_cast(0x1); comm->channelSize = ncclParamAggChannelSize(); - comm->asyncAllocMode = ncclComm::SHORTEST_QUEUE; - char* str = getenv("NCCL_AGG_ALLOC_MODE"); - if (str) INFO(NCCL_ENV, "NCCL_AGG_ALLOC_MODE set by environment to %s", str); - if (str && strcmp(str, "ROUND_ROBIN") == 0) { - comm->asyncAllocMode = ncclComm::ROUND_ROBIN; - } - - CUDACHECK(cudaDriverGetVersion(&comm->driverVersion)); - - NCCLCHECK(ncclCreateQueueInfo(&comm->enqueueInfo, comm)); - comm->lastSetupNode = NULL; - comm->lastCudaGraphId = -1; - comm->disableGraphHelper = ncclParamDisableGraphHelper(); - comm->graphRegister = ncclParamGraphRegister(); -#if CUDART_VERSION >= 11030 - NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1)); - comm->graphHelperResources->comm = comm; - if (comm->driverVersion >= 11030) - // cudaGetDriverEntryPoint requires R465 or above (enhanced compat need) - CUDACHECK(cudaGetDriverEntryPoint("cuMemGetAddressRange", (void**)&comm->pfnCuMemGetAddressRange, cudaEnableDefault)); -#endif static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels"); static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels"); NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks)); NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks)); - comm->p2pSendCount = comm->p2pRecvCount = 0; - NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks)); - NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks)); - // Mark channels as non initialized. 
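// ----------------------------------------------------------------------------
// Illustrative sketch, not part of this diff: the essence of the dmaBufSupported()
// probe above, reduced to a standalone helper. It asks the CUDA driver whether the
// device can export DMA-BUF handles; NCCL's version additionally goes through its
// CUPFN/CUCHECK wrappers and requires the network plugin to expose regMrDmaBuf.
// Assumes CUDA 11.7+ headers; on older toolkits it simply reports "unsupported".
#include <cuda.h>
static bool deviceSupportsDmaBuf(int ordinal) {
#if CUDA_VERSION >= 11070
  int flag = 0, driverVersion = 0;
  CUdevice dev;
  if (cuInit(0) != CUDA_SUCCESS) return false;
  if (cuDriverGetVersion(&driverVersion) != CUDA_SUCCESS || driverVersion < 11070) return false;
  if (cuDeviceGet(&dev, ordinal) != CUDA_SUCCESS) return false;
  if (cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev) != CUDA_SUCCESS) return false;
  return flag != 0;
#else
  (void)ordinal;
  return false;
#endif
}
// ----------------------------------------------------------------------------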
- for (int c=0; cchannels[c].id = -1; + for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1; + + ncclIntruQueueMpscConstruct(&comm->callbackQueue); *comret = comm; return ncclSuccess; } static ncclResult_t devCommSetup(ncclComm_t comm) { - ncclDevCommAndChannels *devCommAndChans; - NCCLCHECK(ncclCudaCalloc(&devCommAndChans, 1)); + NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream)); + + int nRanks = comm->nRanks; + struct ncclDevCommAndChannels *devCommAndChans, tmpCommAndChans; + NCCLCHECK(ncclCudaCallocAsync(&devCommAndChans, 1, comm->deviceStream.stream)); + ncclCommPushCudaFree(comm, devCommAndChans); comm->devComm = &devCommAndChans->comm; - comm->hostDevComm.channels = devCommAndChans->channels; + tmpCommAndChans.comm.rank = comm->rank; + tmpCommAndChans.comm.nRanks = nRanks; + tmpCommAndChans.comm.abortFlag = comm->abortFlag; + for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) { + tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p]; + } + tmpCommAndChans.comm.channels = &devCommAndChans->channels[0]; - // Duplicate the channels on the device - int nChannels = std::max(comm->nChannels, comm->p2pnChannels); - NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, nChannels)); + comm->workFifoDepth = ncclParamWorkFifoDepth(); + if (0 != (comm->workFifoDepth & (comm->workFifoDepth-1))) { + WARN("NCCL_WORK_FIFO_DEPTH=%d is being ignored because it is not a power of 2.", comm->workFifoDepth); + comm->workFifoDepth = 64<<10; + } + tmpCommAndChans.comm.workFifoDepth = comm->workFifoDepth; - // Copy userRanks and peers - for (int r=0; rnChannels; r++) { - NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); + if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { + // The workFifoHeap lives in GDR mapped CUDA memory. + NCCLCHECK(ncclGdrCudaCalloc(&comm->workFifoHeap, &comm->devWorkFifoHeap, comm->workFifoDepth, &comm->workFifoHeapGdrHandle)); + ncclCommPushCudaGdrFree(comm, comm->workFifoHeapGdrHandle); + } else { + // The workFifoHeap lives in cudaHost memory. 
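// ----------------------------------------------------------------------------
// Illustrative aside, not part of this diff: the NCCL_WORK_FIFO_DEPTH validation
// above relies on the classic bit trick that x & (x-1) clears the lowest set bit,
// so the expression is zero exactly when x has a single set bit, i.e. when x is a
// power of two (for x > 0). The default depth 64<<10 = 65536 passes the test.
#include <cstdint>
static inline bool isPowerOfTwo(uint32_t x) {
  return x != 0 && (x & (x - 1)) == 0;   // 65536 -> true, 3<<10 = 3072 -> false
}
// ----------------------------------------------------------------------------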
+ comm->workFifoHeapGdrHandle = nullptr; + NCCLCHECK(ncclCudaHostCalloc(&comm->workFifoHeap, comm->workFifoDepth)); + ncclCommPushCudaHostFree(comm, comm->workFifoHeap); + comm->devWorkFifoHeap = comm->workFifoHeap; + } + tmpCommAndChans.comm.workFifoHeap = comm->devWorkFifoHeap; + + NCCLCHECK(ncclCudaHostCalloc(&comm->workFifoDone, MAXCHANNELS)); + ncclCommPushCudaHostFree(comm, comm->workFifoDone); + comm->workFifoSent = 0; + comm->workFifoAckdMin = 0; + + for (int c=0; c < MAXCHANNELS; c++) { + tmpCommAndChans.channels[c].peers = comm->channels[c].devPeers; + tmpCommAndChans.channels[c].ring = comm->channels[c].ring; + tmpCommAndChans.channels[c].ring.userRanks = comm->channels[c].devRingUserRanks; + tmpCommAndChans.channels[c].tree = comm->channels[c].tree; + tmpCommAndChans.channels[c].collTree = comm->channels[c].collTree; + tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c]; + + if (comm->channels[c].ring.userRanks != nullptr) { + NCCLCHECK(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->deviceStream.stream)); + } } - // Duplicate the dev comm on the device - NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1)); + NCCLCHECK(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->deviceStream.stream)); + CUDACHECK(cudaStreamSynchronize(comm->deviceStream.stream)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNull(), &comm->deviceStream)); return ncclSuccess; } @@ -319,7 +439,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->busId = comm->busId; - NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport)); + NCCLCHECK(ncclGpuGdrSupport(comm, &info->gdrSupport)); info->comm = comm; info->cudaCompCap = ncclCudaCompCap(); return ncclSuccess; @@ -343,84 +463,6 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, return ncclSuccess; } -void* waitForNonNullPtr(void* p) { - volatile void** ptr = (volatile void**) p; - while (*ptr == NULL) sched_yield(); - return (void*)*ptr; -} - -ncclResult_t initParams(struct ncclComm* comm) { - struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank; - params->args = comm->argsptrs; - params->stream = NULL; - params->sharedMem = 0; - params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1; - params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1; - return ncclSuccess; -} - -// Allocate/Set Intra Process Structures and set CG options -ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) { - comm->intraRank = rank; - comm->intraRanks = ranks; - comm->intraPhase = 0; - - // Alloc shared structures - if (rank == 0) { - assert(comm == comm0); - int* bar; - NCCLCHECK(ncclCalloc(&bar, 2)); - bar[0] = bar[1] = 0; - comm->intraBarrier = bar; - NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks)); - NCCLCHECK(ncclCalloc(&comm->intraThreads, comm->intraRanks)); - NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks)); - int* CGMode; - NCCLCHECK(ncclCalloc(&CGMode, 1)); - *CGMode = 0x11; - comm->intraCGMode = CGMode; - int* CC; - NCCLCHECK(ncclCalloc(&CC, 1)); - *CC = ncclCudaCompCap(); - comm->intraCC = CC; - } else { - comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier); - comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams); - comm->intraThreads = (pthread_t*)waitForNonNullPtr(&comm0->intraThreads); - comm->intraCudaDevs = 
(int*)waitForNonNullPtr(&comm0->intraCudaDevs); - comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode); - comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC); - } - comm->intraCudaDevs[comm->intraRank] = comm->cudaDev; - comm->intraThreads[comm->intraRank] = comm->proxyState.thread; - NCCLCHECK(initParams(comm)); - - int cgMdLaunch = 0; - - // Set CG Mode - comm->launchMode = ncclComm::PARALLEL; - char* str = getenv("NCCL_LAUNCH_MODE"); - if (str) INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", str); - if (str && strcmp(str, "GROUP") == 0) { - comm->launchMode = ncclComm::GROUP; - } - if (comm->launchMode == ncclComm::GROUP) { - CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking)); -#if CUDART_VERSION >= 9000 - if (*comm->intraCC && (ncclCudaCompCap() == *comm->intraCC)) { - // Check whether the GPU supports Cooperative Group Multi Device Launch - (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev); - } -#endif - } - - // Disable cgMdLaunch if any rank does not support it - if (cgMdLaunch == 0) { - *comm->intraCGMode = 0x10; - } - return ncclSuccess; -} - #define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine)) #define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t)) #define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */ @@ -439,7 +481,7 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM; for (int p=0; pbuffSizes[p] = comm->hostDevComm.buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p]; + comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p]; } return ncclSuccess; } @@ -476,11 +518,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Topo detection / System graph creation NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo)); // Compute paths between GPUs and NICs - NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); + NCCLCHECK(ncclTopoComputePaths(comm->topo, comm)); // Remove inaccessible GPUs and unused NICs NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm)); // Recompute paths after trimming - NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); + NCCLCHECK(ncclTopoComputePaths(comm->topo, comm)); // Init search NCCLCHECK(ncclTopoSearchInit(comm->topo)); // Print final topology @@ -532,7 +574,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm } // Determine local CollNet support before all-gather - if (collNetSupport()) { + if (collNetSupport(comm)) { char *collNetEnable = getenv("NCCL_COLLNET_ENABLE"); if (collNetEnable != NULL) { INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); @@ -564,6 +606,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm } *allGather3Data; NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); + NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev)); allGather3Data[rank].tree.pattern = treeGraph.pattern; allGather3Data[rank].tree.nChannels = treeGraph.nChannels; @@ -725,7 +768,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm struct ncclChannel* channel = comm->channels+c; NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore); if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, 
&channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore); free(rings); @@ -735,8 +778,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore); - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore); INFO(NCCL_INIT, "Connected all trees"); @@ -773,12 +816,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int highestTransportType0, highestTransportType1; for (int c=0; cnChannels; c++) { struct ncclChannel* channelRecv = comm->channels+c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup); for (int c=0; cnChannels; c++) { struct ncclChannel* channelSend = comm->channels+c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup); @@ -816,6 +859,52 @@ collnet_cleanup: // Compute nChannels per peer for p2p NCCLCHECK(ncclTopoComputeP2pChannels(comm)); + do { // Setup p2p structures in comm->tasks + struct ncclTasks* tasks = &comm->tasks; + int nRanks = comm->nRanks; + int node = comm->node; + int nNodes = comm->nNodes; + struct ncclNodeRanks *nodeRanks = comm->nodeRanks; + int localRank = comm->localRank; + tasks->peers = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + tasks->p2pSendOrder = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + tasks->p2pRecvOrder = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + int s=0, r=0; + // schedule delta 0, +1, -1, +2, -2, ... + // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even. 
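// ----------------------------------------------------------------------------
// Illustrative sketch, not part of this diff: one straightforward way to realize
// the commented order 0, +1, -1, +2, -2, ... with each node delta emitted exactly
// once (0 only once, and +n/2 == -n/2 only once when n is even). The loop below
// covers the same set of deltas but batches four candidates per iteration and
// interleaves the send/recv rank expansion; deltaOrder() is an invented name.
#include <vector>
static std::vector<int> deltaOrder(int nNodes) {
  std::vector<int> out;
  out.push_back(0);                                  // delta 0, exactly once
  for (int d = 1; d <= nNodes / 2; d++) {
    out.push_back(d);                                // +d
    if (d != nNodes - d) out.push_back(nNodes - d);  // -d, skipped when it equals +d
  }
  return out;                                        // nNodes entries in total
}
// ----------------------------------------------------------------------------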
+ for (int d=0; d <= nNodes/4; d++) { + int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes }; + int index = 0; + int delta = deltas[index]; + sched_delta: + int recvNode = (node+nNodes-delta)%nNodes; + int sendNode = (node+delta)%nNodes; + int steps = comm->maxLocalRanks; + for (int step=0; step < steps; step++) { + int recvIndex = (localRank-step+steps)%steps; + if (recvIndex < nodeRanks[recvNode].localRanks) { + tasks->p2pRecvOrder[r] = nodeRanks[recvNode].localRankToRank[recvIndex]; + r++; + } + int sendIndex = (localRank+step)%steps; + if (sendIndex < nodeRanks[sendNode].localRanks) { + tasks->p2pSendOrder[s] = nodeRanks[sendNode].localRankToRank[sendIndex]; + s++; + } + } + index++; + if (index == 1 && deltas[1] == deltas[0]) index++; + if (index == 2 && deltas[2] == deltas[0]) index++; + if (index == 3 && deltas[3] == deltas[2]) index++; + if (index == 3 && deltas[3] == deltas[1]) index++; + if (index < 4) { + delta = deltas[index]; + goto sched_delta; + } + } + assert(s == nRanks && r == nRanks); + } while (0); + if (ncclParamNvbPreconnect()) { // Connect p2p when using NVB path int nvbNpeers; @@ -847,7 +936,7 @@ collnet_cleanup: NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0)); // Then to remote ones when using PXN - if (ncclPxnDisable() == 0) { + if (ncclPxnDisable(comm) == 0) { int nranks; int* pxnPeers; NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks)); @@ -868,6 +957,10 @@ collnet_cleanup: if (intraProcRanks == 0) intraProcRank0 = i; if (i == rank) intraProcRank = intraProcRanks; intraProcRanks++; + if (intraProcRank0 == rank && rank != i) { + comm->peerInfo[i].comm->intraNext = comm->intraNext; + comm->intraNext = comm->peerInfo[i].comm; + } } } TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", @@ -878,9 +971,33 @@ collnet_cleanup: intraProcRank, intraProcRanks, intraProcRank0); return ncclInternalError; } - NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm)); + struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm; + assert(intraProcRank==0 ? comm==comm0 : true); + comm->intraComm0 = comm0; + comm->intraRefs = intraProcRank==0 ? intraProcRanks : 0; + comm->intraRank = intraProcRank; + comm->intraRanks = intraProcRanks; + comm->intraBarrierPhase = 0; + comm->intraBarrierCounter = 0; + comm->intraBarrierGate = 0; } while(0); + if (comm->intraRank == 0) { // Load ncclParamLaunchMode + char* str = getenv("NCCL_LAUNCH_MODE"); + enum ncclLaunchMode mode, modeOld; + if (str && strcasecmp(str, "GROUP") == 0) { + mode = ncclLaunchModeGroup; + } else { + mode = ncclLaunchModeParallel; + } + // In theory we could be racing with other communicators not associated with + // this one if the user is connecting to multiple ncclUniqueId's concurrently. + modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED); + if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') { + INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? 
"PARALLEL" : "GROUP"); + } + } + /* Local intra-node barrier */ NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); @@ -899,8 +1016,22 @@ affinity_restore: NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0); -ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { - ncclResult_t res; +struct ncclCommInitRankAsyncJob { + struct ncclAsyncJob base; + ncclComm_t* newcomm; + int nranks, myrank; + ncclUniqueId commId; + int cudaDev; +}; + +static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { + struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; + ncclComm_t* newcomm = job->newcomm; + int nranks = job->nranks; + ncclUniqueId commId = job->commId; // C++ struct assignment + int myrank = job->myrank; + int cudaDev = job->cudaDev; + ncclResult_t res = ncclSuccess; CUDACHECK(cudaSetDevice(cudaDev)); // Set the maximum kernel stack size of all kernels to avoid @@ -915,7 +1046,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId); - + TRACE_CALL("ncclCommInitRank(%p,%d,0x%llx,%d,%d)", *newcomm, nranks, (unsigned long long)hashUniqueId(commId), myrank, (*newcomm)->cudaDev); return ncclSuccess; cleanup: if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap); @@ -923,6 +1054,12 @@ cleanup: return res; } +static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) { + struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; + ncclCommDestroy(*job->newcomm); + *job->newcomm = nullptr; +} + static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { ncclResult_t res; char* env = getenv("NCCL_COMM_ID"); @@ -944,20 +1081,26 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni goto end; } - if (ncclAsyncMode()) { - NCCLCHECKGOTO(ncclAsyncInit(ncclCommInitRankSync, newcomm, nranks, commId, myrank, cudaDev), res, end); - } else { - NCCLCHECKGOTO(ncclCommInitRankSync(newcomm, nranks, commId, myrank, cudaDev), res, end); - } + struct ncclCommInitRankAsyncJob *job; + NCCLCHECKGOTO(ncclCalloc(&job, 1), res, end); + job->newcomm = newcomm; + job->nranks = nranks; + job->commId = commId; // C++ struct assignment + job->myrank = myrank; + job->cudaDev = cudaDev; + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, ncclCommInitRankUndo, free), res, end); end: - if (ncclAsyncMode()) return ncclAsyncErrCheck(res); - else return res; + return ncclGroupErrCheck(res); } NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { NVTX3_FUNC_RANGE_IN(nccl_domain); + + // Load the CUDA driver and dlsym hooks (can fail on old drivers) + (void) cudaLibraryInit(); + int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev)); @@ -967,6 +1110,10 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const 
int* devlist) { NVTX3_FUNC_RANGE_IN(nccl_domain); + + // Load the CUDA driver and dlsym hooks (can fail on old drivers) + (void) cudaLibraryInit(); + NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms")); if (ndev < 0) { WARN("Invalid device count requested : %d", ndev); @@ -984,22 +1131,6 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { return ncclSuccess; } -static ncclResult_t ncclGraphHelperDestroy(ncclComm* comm) { - auto res = comm->graphHelperResources; - if (comm->graphHelperThread && res) { - pthread_mutex_lock(&res->threadLock); - res->threadState = ThreadStop; - pthread_cond_signal(&res->threadCond); - pthread_mutex_unlock(&res->threadLock); - pthread_join(comm->graphHelperThread, NULL); - } - if (res) { - free(res); - res = NULL; - } - return ncclSuccess; -} - static ncclResult_t commDestroy(ncclComm_t comm) { // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) { @@ -1017,13 +1148,9 @@ static ncclResult_t commDestroy(ncclComm_t comm) { TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, *comm->abortFlag, comm->fatalError); - CUDACHECK(cudaStreamSynchronize(comm->groupStream)); - - ncclDestroyQueueInfo(comm->enqueueInfo); -#if CUDART_VERSION >= 11030 - NCCLCHECK(ncclGraphHelperDestroy(comm)); -#endif - INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed); + NCCLCHECK(ncclStrongStreamSynchronize(&comm->hostStream)); + NCCLCHECK(ncclStrongStreamSynchronize(&comm->deviceStream)); + NCCLCHECK(ncclCommPollCallbacks(comm)); NCCLCHECK(commFree(comm)); @@ -1075,10 +1202,19 @@ const char* ncclGetErrorString(ncclResult_t code) { case ncclInternalError : return "internal error"; case ncclInvalidArgument : return "invalid argument"; case ncclInvalidUsage : return "invalid usage"; + case ncclRemoteError : return "remote process exited or there was a network error"; default : return "unknown result code"; } } +/* Returns a human-readable message of the last error that occurred. + * comm is currently unused and can be set to NULL + */ +NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm); +const char* ncclGetLastError(ncclComm_t comm) { + return ncclLastError; +} + NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm")); diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc index 5406bf0..994d1fd 100644 --- a/src/misc/argcheck.cc +++ b/src/misc/argcheck.cc @@ -44,12 +44,7 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) { return ncclInvalidArgument; } // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars. 
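// ----------------------------------------------------------------------------
// Illustrative aside, not part of this diff: the byte-count rule that the removed
// lines just below implemented, and that the new ncclInfoSetDerived() call is
// presumably meant to centralize. collectiveBytes() is an invented name; only the
// per-rank-count rule for AllGather/ReduceScatter is shown.
#include <cstddef>
static size_t collectiveBytes(size_t count, size_t typeSize, bool countIsPerRank, int nRanks) {
  size_t bytes = count * typeSize;     // one rank's contribution
  if (countIsPerRank) bytes *= nRanks; // AllGather / ReduceScatter pass a per-rank count
  return bytes;
}
// ----------------------------------------------------------------------------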
- info->nBytes = info->count * ncclTypeSize(info->datatype); - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { - info->count = info->nBytes; - info->datatype = ncclInt8; - } - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank + NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks)); if (info->op < 0 || ncclMaxRedOp < info->op) { WARN("%s : invalid reduction operation %d", info->opName, info->op); diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc new file mode 100644 index 0000000..43c95c2 --- /dev/null +++ b/src/misc/cudawrap.cc @@ -0,0 +1,163 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl.h" +#include "debug.h" +#include "cudawrap.h" + +#include + +#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr + +#if CUDART_VERSION >= 11030 +/* CUDA Driver functions loaded with cuGetProcAddress for versioning */ +DECLARE_CUDA_PFN(cuDeviceGet); +DECLARE_CUDA_PFN(cuDeviceGetAttribute); +DECLARE_CUDA_PFN(cuGetErrorString); +DECLARE_CUDA_PFN(cuGetErrorName); +/* enqueue.cc */ +DECLARE_CUDA_PFN(cuMemGetAddressRange); +/* proxy.cc */ +DECLARE_CUDA_PFN(cuCtxCreate_v3020); +DECLARE_CUDA_PFN(cuCtxDestroy); +DECLARE_CUDA_PFN(cuCtxSetCurrent); +#if CUDA_VERSION >= 11070 +/* transport/collNet.cc/net.cc*/ +DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support +#endif +#endif + +/* CUDA Driver functions loaded with dlsym() */ +DECLARE_CUDA_PFN(cuInit); +DECLARE_CUDA_PFN(cuDriverGetVersion); +DECLARE_CUDA_PFN(cuGetProcAddress); + +static enum { cudaUninitialized, cudaInitializing, cudaInitialized, cudaError } cudaState = cudaUninitialized; + +#define CUDA_DRIVER_MIN_VERSION 11030 + +static void *cudaLib; +static int cudaDriverVersion; + +#if CUDART_VERSION >= 11030 +/* + Load the CUDA symbols + */ +static int cudaPfnFuncLoader(void) { + CUresult res; + +#define LOAD_SYM(symbol, ignore) do { \ + res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), cudaDriverVersion, 0); \ + if (res != 0) { \ + if (!ignore) { \ + WARN("Retrieve %s version %d failed with %d", #symbol, cudaDriverVersion, res); \ + return ncclSystemError; } \ + } } while(0) + + LOAD_SYM(cuGetErrorString, 0); + LOAD_SYM(cuGetErrorName, 0); + LOAD_SYM(cuDeviceGet, 0); + LOAD_SYM(cuDeviceGetAttribute, 0); + LOAD_SYM(cuMemGetAddressRange, 1); + LOAD_SYM(cuCtxCreate_v3020, 1); + LOAD_SYM(cuCtxDestroy, 1); + LOAD_SYM(cuCtxSetCurrent, 1); +#if CUDA_VERSION >= 11070 + LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support +#endif + return ncclSuccess; +} +#endif + +ncclResult_t cudaLibraryInit(void) { + CUresult res; + + if (cudaState == cudaInitialized) + return ncclSuccess; + if (cudaState == cudaError) + return ncclSystemError; + + if (__sync_bool_compare_and_swap(&cudaState, cudaUninitialized, cudaInitializing) == false) { + // Another thread raced in front of us. Wait for it to be done. + while (cudaState == cudaInitializing) sched_yield(); + return (cudaState == cudaInitialized) ? 
ncclSuccess : ncclSystemError; + } + + /* + * Load CUDA driver library + */ + char path[1024]; + char *ncclCudaPath = getenv("NCCL_CUDA_PATH"); + if (ncclCudaPath == NULL) + snprintf(path, 1024, "%s", "libcuda.so"); + else + snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so"); + + cudaLib = dlopen(path, RTLD_LAZY); + if (cudaLib == NULL) { + WARN("Failed to find CUDA library in %s (NCCL_CUDA_PATH=%s)", ncclCudaPath, ncclCudaPath); + goto error; + } + + /* + * Load initial CUDA functions + */ + + pfn_cuInit = (PFN_cuInit) dlsym(cudaLib, "cuInit"); + if (pfn_cuInit == NULL) { + WARN("Failed to load CUDA missing symbol cuInit"); + goto error; + } + + pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion) dlsym(cudaLib, "cuDriverGetVersion"); + if (pfn_cuDriverGetVersion == NULL) { + WARN("Failed to load CUDA missing symbol cuDriverGetVersion"); + goto error; + } + + res = pfn_cuDriverGetVersion(&cudaDriverVersion); + if (res != 0) { + WARN("cuDriverGetVersion failed with %d", res); + goto error; + } + + INFO(NCCL_INIT, "cudaDriverVersion %d", cudaDriverVersion); + + if (cudaDriverVersion < CUDA_DRIVER_MIN_VERSION) { + // WARN("CUDA Driver version found is %d. Minimum requirement is %d", cudaDriverVersion, CUDA_DRIVER_MIN_VERSION); + // Silently ignore version check mismatch for backwards compatibility + goto error; + } + + pfn_cuGetProcAddress = (PFN_cuGetProcAddress) dlsym(cudaLib, "cuGetProcAddress"); + if (pfn_cuGetProcAddress == NULL) { + WARN("Failed to load CUDA missing symbol cuGetProcAddress"); + goto error; + } + + /* + * Required to initialize the CUDA Driver. + * Multiple calls of cuInit() will return immediately + * without making any relevant change + */ + pfn_cuInit(0); + +#if CUDART_VERSION >= 11030 + if (cudaPfnFuncLoader()) { + WARN("CUDA some PFN functions not found in the library"); + goto error; + } +#endif + + cudaState = cudaInitialized; + return ncclSuccess; + +error: + cudaState = cudaError; + return ncclSystemError; +} + + diff --git a/src/misc/gdrwrap.cc b/src/misc/gdrwrap.cc index ed0c697..e81c7ea 100644 --- a/src/misc/gdrwrap.cc +++ b/src/misc/gdrwrap.cc @@ -57,7 +57,7 @@ ncclResult_t wrap_gdr_symbols(void) { if (__sync_bool_compare_and_swap(&gdrState, gdrUninitialized, gdrInitializing) == false) { // Another thread raced in front of us. Wait for it to be done. - while (gdrState == gdrInitializing) pthread_yield(); + while (gdrState == gdrInitializing) sched_yield(); return (gdrState == gdrInitialized) ? 
ncclSuccess : ncclSystemError; } diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index e1aabac..3b8daac 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -30,6 +30,8 @@ struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); +/* DMA-BUF support */ +struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); @@ -49,7 +51,7 @@ ncclResult_t wrap_ibv_symbols(void) { if (__sync_bool_compare_and_swap(&ibvState, ibvUninitialized, ibvInitializing) == false) { // Another thread raced in front of us. Wait for it to be done. - while (ibvState == ibvInitializing) pthread_yield(); + while (ibvState == ibvInitializing) sched_yield(); return (ibvState == ibvInitialized) ? ncclSuccess : ncclSystemError; } @@ -98,6 +100,8 @@ ncclResult_t wrap_ibv_symbols(void) { LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr); // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8 LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8"); + // Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12 + LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12"); LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr); LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq); LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq); @@ -126,6 +130,7 @@ teardown: ibv_internal_dealloc_pd = NULL; ibv_internal_reg_mr = NULL; ibv_internal_reg_mr_iova2 = NULL; + ibv_internal_reg_dmabuf_mr = NULL; ibv_internal_dereg_mr = NULL; ibv_internal_create_cq = NULL; ibv_internal_destroy_cq = NULL; @@ -259,7 +264,7 @@ ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or } ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) { - IBV_PTR_CHECK(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr"); + IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr"); } struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { @@ -275,7 +280,19 @@ ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void return ncclInternalError; } if (ret == NULL) { return ncclSuccess; } // Assume dummy call - IBV_PTR_CHECK(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2"); + IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2"); +} + +/* DMA-BUF support */ +ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { + IBV_PTR_CHECK_ERRNO(ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr"); +} + +struct ibv_mr * 
wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { + if (ibv_internal_reg_dmabuf_mr == NULL) { + return NULL; + } + return ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access); } ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ diff --git a/src/misc/socket.cc b/src/misc/socket.cc index ef2bea6..16049fa 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -332,9 +332,10 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) { #endif } - /* make all new sockets non-blocking */ - EQCHECK(flags = fcntl(fd, F_GETFL), -1); - SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + if (sock->asyncFlag) { + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + } // addr port should be 0 (Any port) SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind"); @@ -373,7 +374,7 @@ static ncclResult_t getFdState(int fd, enum ncclSocketState* state) { SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt"); } - if (ret == EINPROGRESS) + if (ret == EINPROGRESS || ret == ECONNREFUSED) *state = ncclSocketConnecting; else if (ret == 0) *state = ncclSocketConnected; @@ -409,10 +410,12 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { const int one = 1; SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); - + /* support non-blocking socket; by default, the socket is non-blocking */ - EQCHECK(flags = fcntl(fd, F_GETFL), -1); - SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + if (sock->asyncFlag) { + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + } /* const int bufsize = 128*1024; SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt"); @@ -424,31 +427,26 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { int timedout_retries = 0; int refused_retries = 0; retry: - /* async connect; abort when error happens and abortFlag is present. */ + /* blocking/non-blocking connect() is determined by asyncFlag. */ ret = connect(fd, &sock->addr.sa, salen); - if (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || - (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { - if (refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); + if (!sock->asyncFlag && (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || + (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES))) { + if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); usleep(SLEEP_INT); goto retry; - } else if (errno == EINPROGRESS && !sock->asyncFlag) { - enum ncclSocketState state; - do { - if (sock->abortFlag) NEQCHECK(*sock->abortFlag, 0); - NCCLCHECK(getFdState(fd, &state)); - } while (state == ncclSocketConnecting); - EQCHECK(state, ncclSocketError); - ret = 0; } - if (ret == 0 || (errno == EINPROGRESS && sock->asyncFlag)) { + /* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again. + * However, it can return EISCONN instead of success which indicates connection is built up in + * background already. No need to call connect() again. 
*/ + if (ret == 0 || ((errno == EINPROGRESS || errno == ECONNREFUSED) && sock->asyncFlag) || errno == EISCONN) { sock->fd = fd; return ncclSuccess; } WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); - return ncclSystemError; + return ncclRemoteError; } ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) { @@ -501,7 +499,7 @@ static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* if (bytes == -1) { if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { WARN("Net : Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); - return ncclSystemError; + return ncclRemoteError; } else { bytes = 0; } @@ -521,7 +519,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int if (closed) { char line[SOCKET_NAME_MAXLEN+1]; WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); - return ncclSystemError; + return ncclRemoteError; } return ncclSuccess; } diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc new file mode 100644 index 0000000..4933799 --- /dev/null +++ b/src/misc/strongstream.cc @@ -0,0 +1,272 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "strongstream.h" +#include "checks.h" +#include "param.h" + +//////////////////////////////////////////////////////////////////////////////// + +ncclResult_t ncclCudaGetCapturingGraph( + struct ncclCudaGraph* graph, cudaStream_t stream + ) { + #if CUDART_VERSION >= 11030 + thread_local int driver = -1; + if (driver == -1) { + CUDACHECK(cudaDriverGetVersion(&driver)); + } + if (driver < 11030) { + cudaStreamCaptureStatus status; + unsigned long long gid; + graph->graph = nullptr; + CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid)); + if (status != cudaStreamCaptureStatusNone) { + WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support"); + return ncclInvalidUsage; + } + } else { + cudaStreamCaptureStatus status; + unsigned long long gid; + CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr)); + if (status != cudaStreamCaptureStatusActive) { + graph->graph = nullptr; + gid = ULLONG_MAX; + } + graph->graphId = gid; + } + #endif + return ncclSuccess; +} + +ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg) { + #if CUDART_VERSION >= 11030 + cudaUserObject_t object; + CUDACHECK(cudaUserObjectCreate( + &object, arg, fn, /*initialRefcount=*/1, cudaUserObjectNoDestructorSync + )); + // Hand over ownership to CUDA Graph + CUDACHECK(cudaGraphRetainUserObject(graph.graph, object, 1, cudaGraphUserObjectMove)); + return ncclSuccess; + #else + return ncclInvalidUsage; + #endif +} + +//////////////////////////////////////////////////////////////////////////////// + +ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) { + CUDACHECK(cudaStreamCreateWithFlags(&ss->stream, cudaStreamNonBlocking)); + CUDACHECK(cudaEventCreateWithFlags(&ss->event, cudaEventDisableTiming)); + #if CUDART_VERSION >= 11030 + ss->node = nullptr; + ss->graphId = (1ull<<(8*sizeof(long long)-1))-1; + ss->eventIsLagging = 0; + #endif + return ncclSuccess; +} + +ncclResult_t 
ncclStrongStreamDestruct(struct ncclStrongStream* ss) { + #if CUDART_VERSION >= 11030 + CUDACHECK(cudaEventDestroy(ss->event)); + #endif + CUDACHECK(cudaStreamDestroy(ss->stream)); + return ncclSuccess; +} + +NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1) + +ncclResult_t ncclStrongStreamAcquire( + struct ncclCudaGraph graph, struct ncclStrongStream* ss + ) { + #if CUDART_VERSION >= 11030 + bool mixing = ncclParamGraphMixingSupport(); + if (graph.graph == nullptr) { + if (mixing && ncclStrongStreamEverCaptured(ss)) { + CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0)); + ss->eventIsLagging = 0; + } + } else { + if (ss->graphId != graph.graphId) { + if (mixing && ss->eventIsLagging) { + // Can only be here if previous release was for uncaptured work that + // elided updating the event because no capture had yet occurred. + CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0)); + CUDACHECK(cudaEventRecord(ss->event, ss->stream)); + } + ss->graphId = graph.graphId; + ss->eventIsLagging = 0; + if (mixing) { + CUDACHECK(cudaGraphAddEventWaitNode(&ss->node, graph.graph, nullptr, 0, ss->event)); + } else { + CUDACHECK(cudaGraphAddEmptyNode(&ss->node, graph.graph, nullptr, 0)); + } + } + } + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) { + #if CUDART_VERSION >= 11030 + bool mixing = ncclParamGraphMixingSupport(); + if (mixing && ncclStrongStreamEverCaptured(ss)) { + CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0)); + } + ss->eventIsLagging = 1; // Assume the caller is going to add work to stream. + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) { + #if CUDART_VERSION >= 11030 + bool mixing = ncclParamGraphMixingSupport(); + if (mixing && ss->eventIsLagging) { + if (graph.graph == nullptr) { + if (ncclStrongStreamEverCaptured(ss)) { + CUDACHECK(cudaEventRecord(ss->event, ss->stream)); + ss->eventIsLagging = 0; + } + } else { + CUDACHECK(cudaGraphAddEventRecordNode(&ss->node, graph.graph, &ss->node, 1, ss->event)); + ss->eventIsLagging = 0; + } + } + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamLaunchHost( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg + ) { + #if CUDART_VERSION >= 11030 + if (graph.graph == nullptr) { + CUDACHECK(cudaLaunchHostFunc(ss->stream, fn, arg)); + } else { + cudaHostNodeParams p; + p.fn = fn; + p.userData = arg; + CUDACHECK(cudaGraphAddHostNode(&ss->node, graph.graph, &ss->node, 1, &p)); + } + ss->eventIsLagging = 1; + #else + CUDACHECK(cudaLaunchHostFunc(ss->stream, fn, arg)); + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamLaunchKernel( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, + void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes + ) { + #if CUDART_VERSION >= 11030 + if (graph.graph == nullptr) { + CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->stream)); + } else { + cudaGraphNode_t tip = ss->node; + cudaKernelNodeParams p; + p.func = fn; + p.gridDim = grid; + p.blockDim = block; + p.kernelParams = args; + p.sharedMemBytes = sharedMemBytes; + p.extra = nullptr; + CUDACHECK(cudaGraphAddKernelNode(&ss->node, graph.graph, &tip, 1, &p)); + } + ss->eventIsLagging = 1; + #else + CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->stream)); + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, struct 
ncclStrongStream* a, struct ncclStrongStream* b + ) { + #if CUDART_VERSION >= 11030 + if (graph.graph == nullptr) { + if (b->eventIsLagging) { + b->eventIsLagging = 0; + CUDACHECK(cudaEventRecord(b->event, b->stream)); + } + CUDACHECK(cudaStreamWaitEvent(a->stream, b->event, 0)); + a->eventIsLagging = 1; + } else { + cudaGraphNode_t pair[2] = {a->node, b->node}; + CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, pair, 2)); + } + #else + CUDACHECK(cudaEventRecord(b->event, b->stream)); + CUDACHECK(cudaStreamWaitEvent(a->stream, b->event, 0)); + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b + ) { + #if CUDART_VERSION >= 11030 + if (graph.graph == nullptr) { + CUDACHECK(cudaEventRecord(a->event, b)); + CUDACHECK(cudaStreamWaitEvent(a->stream, a->event, 0)); + // We used a->event to record b so it no longer reflects anything about a. + a->eventIsLagging = 1; + } else { + cudaStreamCaptureStatus status; + unsigned long long gid1; + cudaGraphNode_t const* deps; + size_t depN = 0; + CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &status, &gid1, nullptr, &deps, &depN)); + if (status != cudaStreamCaptureStatusActive || graph.graphId != gid1) { + WARN("Stream is not being captured by the expected graph."); + return ncclInvalidUsage; + } + if (depN > 0 && (depN > 1 || deps[0] != a->node)) { + cudaGraphNode_t tie; + if (depN == 1) { + tie = deps[0]; + } else { + CUDACHECK(cudaGraphAddEmptyNode(&tie, graph.graph, deps, depN)); + } + cudaGraphNode_t pair[2] = {a->node, tie}; + CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, pair, 2)); + } + // a->eventIsLagging doesn't change since we are just updating the + // dependencies of a->node. + } + #else + CUDACHECK(cudaEventRecord(a->event, b)); + CUDACHECK(cudaStreamWaitEvent(a->stream, a->event, 0)); + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b + ) { + #if CUDART_VERSION >= 11030 + if (graph.graph == nullptr) { + if (b->eventIsLagging) { + b->eventIsLagging = 0; + CUDACHECK(cudaEventRecord(b->event, b->stream)); + } + CUDACHECK(cudaStreamWaitEvent(a, b->event, 0)); + } else { + CUDACHECK(cudaStreamUpdateCaptureDependencies(a, &b->node, 1, cudaStreamAddCaptureDependencies)); + } + #else + CUDACHECK(cudaEventRecord(b->event, b->stream)); + CUDACHECK(cudaStreamWaitEvent(a, b->event, 0)); + #endif + return ncclSuccess; +} + +ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) { + #if CUDART_VERSION >= 11030 + CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0)); + #endif + CUDACHECK(cudaStreamSynchronize(ss->stream)); + return ncclSuccess; +} diff --git a/src/misc/utils.cc b/src/misc/utils.cc index f3e3ca2..20e8e41 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -9,6 +9,8 @@ #include "nvmlwrap.h" +#include + // Get current Compute Capability int ncclCudaCompCap() { int cudaDev; @@ -190,3 +192,102 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz } return false; } + +__thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer(); + +void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) { + // `me->hunks` points to the top of the stack non-empty hunks. Hunks above + // this (reachable via `->above`) are empty. 
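// ----------------------------------------------------------------------------
// Illustrative aside, not part of this diff: the core bump-pointer step that
// allocateSpilled() below repeats in several places. Round the cursor up to
// `align` (which must be a power of two) and advance it past the object;
// everything else in the function is bookkeeping for growing hunks and for
// out-of-band ("unhunked") allocations. bumpAlloc() is an invented name.
#include <cstddef>
#include <cstdint>
static void* bumpAlloc(uintptr_t& cursor, uintptr_t end, size_t size, size_t align) {
  uintptr_t obj = (cursor + align - 1) & -uintptr_t(align);  // align-up, same trick as below
  if (obj + size > end) return nullptr;                      // object does not fit in this hunk
  cursor = obj + size;
  return reinterpret_cast<void*>(obj);
}
// ----------------------------------------------------------------------------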
+ struct Hunk* top = me->topFrame.hunk; + size_t mallocSize = 0; + + // If we have lots of space left in hunk but that wasn't enough then we'll + // allocate the object unhunked. + if (me->topFrame.end - me->topFrame.bumper >= 8<<10) + goto unhunked; + + // If we have another hunk (which must be empty) waiting above this one and + // the object fits then use that. + if (top && top->above) { + struct Hunk* top1 = top->above; + uintptr_t uobj = (reinterpret_cast(top1) + sizeof(struct Hunk) + align-1) & -uintptr_t(align); + if (uobj + size <= reinterpret_cast(top1) + top1->size) { + me->topFrame.hunk = top1; + me->topFrame.bumper = uobj + size; + me->topFrame.end = reinterpret_cast(top1) + top1->size; + return reinterpret_cast(uobj); + } + } + + { // If the next hunk we're going to allocate wouldn't be big enough but the + // Unhunk proxy fits in the current hunk then go allocate as unhunked. + size_t nextSize = (top ? top->size : 0) + (64<<10); + constexpr size_t maxAlign = 64; + if (nextSize < sizeof(struct Hunk) + maxAlign + size) { + uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk)); + if (uproxy + sizeof(struct Unhunk) <= me->topFrame.end) + goto unhunked; + } + + // At this point we must need another hunk, either to fit the object + // itself or its Unhunk proxy. + mallocSize = nextSize; + INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize); + struct Hunk *top1 = (struct Hunk*)malloc(mallocSize); + if (top1 == nullptr) goto malloc_exhausted; + top1->size = nextSize; + top1->above = nullptr; + if (top) top->above = top1; + top = top1; + me->topFrame.hunk = top; + me->topFrame.end = reinterpret_cast(top) + nextSize; + me->topFrame.bumper = reinterpret_cast(top) + sizeof(struct Hunk); + } + + { // Try to fit object in the new top hunk. + uintptr_t uobj = (me->topFrame.bumper + align-1) & -uintptr_t(align); + if (uobj + size <= me->topFrame.end) { + me->topFrame.bumper = uobj + size; + return reinterpret_cast(uobj); + } + } + +unhunked: + { // We need to allocate the object out-of-band and put an Unhunk proxy in-band + // to keep track of it. + uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk)); + Unhunk* proxy = reinterpret_cast(uproxy); + me->topFrame.bumper = uproxy + sizeof(Unhunk); + proxy->next = me->topFrame.unhunks; + me->topFrame.unhunks = proxy; + mallocSize = size; + proxy->obj = malloc(mallocSize); + INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize); + if (proxy->obj == nullptr) goto malloc_exhausted; + return proxy->obj; + } + +malloc_exhausted: + WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize); + abort(); +} + +void ncclMemoryStackDestruct(struct ncclMemoryStack* me) { + // Free unhunks first because both the frames and unhunk proxies lie within the hunks. 
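// ----------------------------------------------------------------------------
// Illustrative note, not part of this diff: the destruction order matters because
// the Unhunk proxy nodes are themselves bump-allocated inside the hunks. Freeing
// the hunks first would free the very list nodes the unhunk walk still needs,
// i.e. a use-after-free. A minimal shape of the safe order, with invented names:
#include <cstdlib>
struct Proxy { void* obj; Proxy* next; };     // lives inside a hunk, points out of band
static void destroyAll(Proxy* proxies, void** hunks, int nHunks) {
  for (Proxy* p = proxies; p != nullptr; p = p->next) free(p->obj);  // 1) out-of-band objects
  for (int i = 0; i < nHunks; i++) free(hunks[i]);                   // 2) then the backing hunks
}
// ----------------------------------------------------------------------------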
+ struct ncclMemoryStack::Frame* f = &me->topFrame; + while (f != nullptr) { + struct ncclMemoryStack::Unhunk* u = f->unhunks; + while (u != nullptr) { + free(u->obj); + u = u->next; + } + f = f->below; + } + // Free hunks + struct ncclMemoryStack::Hunk* h = me->stub.above; + while (h != nullptr) { + struct ncclMemoryStack::Hunk *h1 = h->above; + free(h); + h = h1; + } +} diff --git a/src/nccl.h.in b/src/nccl.h.in index 93a141c..edd98a3 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -38,7 +38,8 @@ typedef enum { ncclSuccess = 0, ncclInternalError = 3, ncclInvalidArgument = 4, ncclInvalidUsage = 5, - ncclNumResults = 6 } ncclResult_t; + ncclRemoteError = 6, + ncclNumResults = 7 } ncclResult_t; /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. * This integer is coded with the MAJOR, MINOR and PATCH level of the @@ -81,10 +82,16 @@ ncclResult_t pncclCommDestroy(ncclComm_t comm); ncclResult_t ncclCommAbort(ncclComm_t comm); ncclResult_t pncclCommAbort(ncclComm_t comm); -/* Returns a human-readable error message. */ +/* Returns a string for each error code. */ const char* ncclGetErrorString(ncclResult_t result); const char* pncclGetErrorString(ncclResult_t result); +/* Returns a human-readable message of the last error that occurred. + * comm is currently unused and can be set to NULL + */ +const char* ncclGetLastError(ncclComm_t comm); +const char* pncclGetError(ncclComm_t comm); + /* Checks whether the comm has encountered any asynchronous errors */ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); diff --git a/src/net.cc b/src/net.cc index cb65218..53ec80e 100644 --- a/src/net.cc +++ b/src/net.cc @@ -9,15 +9,16 @@ //#include //#include -ncclNet_t *ncclNet; -ncclCollNet_t *ncclCollNet; - -static ncclNet_v5_t ncclNet_v4_as_v5; +static ncclNet_v6_t ncclNet_v4_as_v6; +static ncclNet_v6_t ncclNet_v5_as_v6; static ncclNet_v4_t *ncclNet_v4; -static ncclCollNet_v5_t ncclCollNet_v4_as_v5; +static ncclNet_v5_t *ncclNet_v5; +static ncclCollNet_v6_t ncclCollNet_v4_as_v6; +static ncclCollNet_v6_t ncclCollNet_v5_as_v6; static ncclCollNet_v4_t *ncclCollNet_v4; +static ncclCollNet_v5_t *ncclCollNet_v5; -static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) { +static ncclResult_t ncclNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) { ncclNetProperties_v4_t p4; ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4); if (ans != ncclSuccess) return ans; @@ -33,17 +34,17 @@ static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5 return ncclSuccess; } -static ncclResult_t ncclNet_v4_as_v5_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { +static ncclResult_t ncclNet_v4_as_v6_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclNet_v4->isend(sendComm, data, size, mhandle, request); } -static ncclResult_t ncclNet_v4_as_v5_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { +static ncclResult_t ncclNet_v4_as_v6_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { if (n == 0) return ncclSuccess; if (n != 1) return ncclInvalidArgument; return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request); } -static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** 
request) { +static ncclResult_t ncclNet_v4_as_v6_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { if (n == 0) return ncclSuccess; if (n != 1) return ncclInvalidArgument; return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request); @@ -51,27 +52,51 @@ static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, // We use a wrapper around the v4 init to copy over the struct contents // post-init since they may not be initialized before hand. -static ncclResult_t ncclNet_v4_as_v5_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclNet_v4_as_v6_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v4->init(logfn)); - ncclNet_v4_as_v5.name = ncclNet_v4->name; - ncclNet_v4_as_v5.devices = ncclNet_v4->devices; - ncclNet_v4_as_v5.getProperties = ncclNet_v4_as_v5_getProperties; - ncclNet_v4_as_v5.listen = ncclNet_v4->listen; - ncclNet_v4_as_v5.connect = ncclNet_v4->connect; - ncclNet_v4_as_v5.accept = ncclNet_v4->accept; - ncclNet_v4_as_v5.regMr = ncclNet_v4->regMr; - ncclNet_v4_as_v5.deregMr = ncclNet_v4->deregMr; - ncclNet_v4_as_v5.isend = ncclNet_v4_as_v5_isend; - ncclNet_v4_as_v5.irecv = ncclNet_v4_as_v5_irecv; - ncclNet_v4_as_v5.iflush = ncclNet_v4_as_v5_iflush; - ncclNet_v4_as_v5.test = ncclNet_v4->test; - ncclNet_v4_as_v5.closeSend = ncclNet_v4->closeSend; - ncclNet_v4_as_v5.closeRecv = ncclNet_v4->closeRecv; - ncclNet_v4_as_v5.closeListen = ncclNet_v4->closeListen; + ncclNet_v4_as_v6.name = ncclNet_v4->name; + ncclNet_v4_as_v6.devices = ncclNet_v4->devices; + ncclNet_v4_as_v6.getProperties = ncclNet_v4_as_v6_getProperties; + ncclNet_v4_as_v6.listen = ncclNet_v4->listen; + ncclNet_v4_as_v6.connect = ncclNet_v4->connect; + ncclNet_v4_as_v6.accept = ncclNet_v4->accept; + ncclNet_v4_as_v6.regMr = ncclNet_v4->regMr; + ncclNet_v4_as_v6.regMrDmaBuf = NULL; + ncclNet_v4_as_v6.deregMr = ncclNet_v4->deregMr; + ncclNet_v4_as_v6.isend = ncclNet_v4_as_v6_isend; + ncclNet_v4_as_v6.irecv = ncclNet_v4_as_v6_irecv; + ncclNet_v4_as_v6.iflush = ncclNet_v4_as_v6_iflush; + ncclNet_v4_as_v6.test = ncclNet_v4->test; + ncclNet_v4_as_v6.closeSend = ncclNet_v4->closeSend; + ncclNet_v4_as_v6.closeRecv = ncclNet_v4->closeRecv; + ncclNet_v4_as_v6.closeListen = ncclNet_v4->closeListen; return ncclSuccess; } -static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) { +// We use a wrapper around the v5 init to copy over the struct contents +// post-init since they may not be initialized before hand. 
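+// A v5 net plugin maps onto the v6 struct field-for-field; unlike v4 it needs
+// no shims for getProperties, isend, irecv or iflush. The only v6 entry point
+// it cannot provide is the DMA-BUF registration hook, so regMrDmaBuf is left
+// NULL and such plugins keep registering memory through plain regMr.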
+static ncclResult_t ncclNet_v5_as_v6_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclNet_v5->init(logfn)); + ncclNet_v5_as_v6.name = ncclNet_v5->name; + ncclNet_v5_as_v6.devices = ncclNet_v5->devices; + ncclNet_v5_as_v6.getProperties = ncclNet_v5->getProperties; + ncclNet_v5_as_v6.listen = ncclNet_v5->listen; + ncclNet_v5_as_v6.connect = ncclNet_v5->connect; + ncclNet_v5_as_v6.accept = ncclNet_v5->accept; + ncclNet_v5_as_v6.regMr = ncclNet_v5->regMr; + ncclNet_v5_as_v6.regMrDmaBuf = NULL; + ncclNet_v5_as_v6.deregMr = ncclNet_v5->deregMr; + ncclNet_v5_as_v6.isend = ncclNet_v5->isend; + ncclNet_v5_as_v6.irecv = ncclNet_v5->irecv; + ncclNet_v5_as_v6.iflush = ncclNet_v5->iflush; + ncclNet_v5_as_v6.test = ncclNet_v5->test; + ncclNet_v5_as_v6.closeSend = ncclNet_v5->closeSend; + ncclNet_v5_as_v6.closeRecv = ncclNet_v5->closeRecv; + ncclNet_v5_as_v6.closeListen = ncclNet_v5->closeListen; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) { ncclNetProperties_v4_t p4; ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4); if (ans != ncclSuccess) return ans; @@ -89,25 +114,58 @@ static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetPropertie // We use a wrapper around the v4 init to copy over the struct contents // post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v4_as_v5_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_v4_as_v6_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v4->init(logfn)); - ncclCollNet_v4_as_v5.name = ncclCollNet_v4->name; - ncclCollNet_v4_as_v5.devices = ncclCollNet_v4->devices; - ncclCollNet_v4_as_v5.getProperties = ncclCollNet_v4_as_v5_getProperties; - ncclCollNet_v4_as_v5.listen = ncclCollNet_v4->listen; - ncclCollNet_v4_as_v5.connect = ncclCollNet_v4->connect; - ncclCollNet_v4_as_v5.reduceSupport = ncclCollNet_v4->reduceSupport; - ncclCollNet_v4_as_v5.regMr = ncclCollNet_v4->regMr; - ncclCollNet_v4_as_v5.deregMr = ncclCollNet_v4->deregMr; - ncclCollNet_v4_as_v5.iallreduce = ncclCollNet_v4->iallreduce; - ncclCollNet_v4_as_v5.iflush = ncclCollNet_v4->iflush; - ncclCollNet_v4_as_v5.test = ncclCollNet_v4->test; - ncclCollNet_v4_as_v5.closeColl = ncclCollNet_v4->closeColl; - ncclCollNet_v4_as_v5.closeListen = ncclCollNet_v4->closeListen; + ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name; + ncclCollNet_v4_as_v6.devices = ncclCollNet_v4->devices; + ncclCollNet_v4_as_v6.getProperties = ncclCollNet_v4_as_v6_getProperties; + ncclCollNet_v4_as_v6.listen = ncclCollNet_v4->listen; + ncclCollNet_v4_as_v6.connect = ncclCollNet_v4->connect; + ncclCollNet_v4_as_v6.reduceSupport = ncclCollNet_v4->reduceSupport; + ncclCollNet_v4_as_v6.regMr = ncclCollNet_v4->regMr; + ncclCollNet_v4_as_v6.regMrDmaBuf = NULL; + ncclCollNet_v4_as_v6.deregMr = ncclCollNet_v4->deregMr; + ncclCollNet_v4_as_v6.iallreduce = ncclCollNet_v4->iallreduce; + ncclCollNet_v4_as_v6.iflush = ncclCollNet_v4->iflush; + ncclCollNet_v4_as_v6.test = ncclCollNet_v4->test; + ncclCollNet_v4_as_v6.closeColl = ncclCollNet_v4->closeColl; + ncclCollNet_v4_as_v6.closeListen = ncclCollNet_v4->closeListen; return ncclSuccess; } -static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) { +// We use a wrapper around the v5 init to copy over the struct contents +// post-init since they may not be initialized before hand. 
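+// Same scheme for the collective network plugin: every v5 collnet entry point
+// is forwarded unchanged, with regMrDmaBuf left NULL since v5 has no DMA-BUF
+// registration call.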
+static ncclResult_t ncclCollNet_v5_as_v6_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v5->init(logfn)); + ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name; + ncclCollNet_v5_as_v6.devices = ncclCollNet_v5->devices; + ncclCollNet_v5_as_v6.getProperties = ncclCollNet_v5->getProperties; + ncclCollNet_v5_as_v6.listen = ncclCollNet_v5->listen; + ncclCollNet_v5_as_v6.connect = ncclCollNet_v5->connect; + ncclCollNet_v5_as_v6.reduceSupport = ncclCollNet_v5->reduceSupport; + ncclCollNet_v5_as_v6.regMr = ncclCollNet_v5->regMr; + ncclCollNet_v5_as_v6.regMrDmaBuf = NULL; + ncclCollNet_v5_as_v6.deregMr = ncclCollNet_v5->deregMr; + ncclCollNet_v5_as_v6.iallreduce = ncclCollNet_v5->iallreduce; + ncclCollNet_v5_as_v6.iflush = ncclCollNet_v5->iflush; + ncclCollNet_v5_as_v6.test = ncclCollNet_v5->test; + ncclCollNet_v5_as_v6.closeColl = ncclCollNet_v5->closeColl; + ncclCollNet_v5_as_v6.closeListen = ncclCollNet_v5->closeListen; + return ncclSuccess; +} + +static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; +ncclNet_t* ncclNets[3] = { nullptr, &ncclNetIb, &ncclNetSocket }; +ncclCollNet_t* ncclCollNets[3] = { nullptr, nullptr, nullptr }; +enum ncclNetState { + ncclNetStateInit = 0, + ncclNetStateEnabled = 1, + ncclNetStateDisabled = 2 +}; +enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; +enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; + +ncclResult_t ncclNetPluginInit() { char ncclNetPluginName[128]; const char* envPluginName = getenv("NCCL_NET_PLUGIN"); if (envPluginName && strlen(envPluginName)) { @@ -126,67 +184,104 @@ static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) { } else { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); } - return; + return ncclSuccess; } - *net = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); - if (*net == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v5 symbol."); - ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4"); - if (ncclNet_v4 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v4 symbol."); - if (netPluginLib != nullptr) dlclose(netPluginLib); - return; + ncclNets[0] = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); + if (ncclNets[0] == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol."); + // Try v5 plugin + ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); + if (ncclNet_v5 == nullptr) { + ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4"); + if (ncclNet_v4 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (v4 or v5)."); + if (netPluginLib != nullptr) dlclose(netPluginLib); + return ncclSuccess; + } + ncclNets[0] = &ncclNet_v4_as_v6; + ncclNet_v4_as_v6.init = ncclNet_v4_as_v6_init; + // Set the name right away to allow for NCCL_NET=... to work + ncclNet_v4_as_v6.name = ncclNet_v4->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v4)", ncclNets[0]->name); + } else { + ncclNets[0] = &ncclNet_v5_as_v6; + ncclNet_v5_as_v6.init = ncclNet_v5_as_v6_init; + // Set the name right away to allow for NCCL_NET=... 
to work + ncclNet_v5_as_v6.name = ncclNet_v5->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); } - *net = &ncclNet_v4_as_v5; - ncclNet_v4_as_v5.init = ncclNet_v4_as_v5_init; } // Check for CollNet - *collnet = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); - if (*collnet == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol."); - ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4"); - if (ncclCollNet_v4 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol."); + ncclCollNets[0] = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); + if (ncclCollNets[0] == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol."); + ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); + if (ncclCollNet_v5 == nullptr) { + ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4"); + if (ncclCollNet_v4 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5)."); + } else { + ncclCollNets[0] = &ncclCollNet_v4_as_v6; + ncclCollNet_v4_as_v6.init = ncclCollNet_v4_as_v6_init; + ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v4)", ncclCollNets[0]->name); + } } else { - *collnet = &ncclCollNet_v4_as_v5; - ncclCollNet_v4_as_v5.init = ncclCollNet_v4_as_v5_init; + ncclCollNets[0] = &ncclCollNet_v5_as_v6; + ncclCollNet_v5_as_v6.init = ncclCollNet_v5_as_v6_init; + ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name); } } - return; + return ncclSuccess; } -ncclResult_t ncclNetInit() { - // Always initialize bootstrap network - NCCLCHECK(bootstrapNetInit()); +static ncclResult_t netGetState(int i, enum ncclNetState* state) { + pthread_mutex_lock(&netLock); + if (ncclNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclNets[i]->init(ncclDebugLog) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; + else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; + else ncclNetStates[i] = ncclNetStateEnabled; + } + *state = ncclNetStates[i]; + pthread_mutex_unlock(&netLock); + return ncclSuccess; +} +static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { + if (ncclCollNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; + else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; + else ncclCollNetStates[i] = ncclNetStateEnabled; + } + *state = ncclCollNetStates[i]; + return ncclSuccess; +} + +ncclResult_t ncclNetInit(struct ncclComm* comm) { // Initialize main communication network - ncclNet_t* nets[3] = { nullptr, &ncclNetIb, &ncclNetSocket }; - ncclCollNet_t* collNets[3] = { nullptr, nullptr, nullptr }; - initPlugin(&nets[0], &collNets[0]); char* netName = getenv("NCCL_NET"); bool ok = false; for (int i=0; i<3; i++) { - if (nets[i] == nullptr) continue; - if (netName && strcmp(netName, nets[i]->name) != 0) continue; + if (ncclNets[i] == nullptr) continue; + enum ncclNetState state; + NCCLCHECK(netGetState(i, &state)); + if (state != ncclNetStateEnabled) continue; + if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; 
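+    // Slot 0 holds the external plugin (if any), followed by the internal IB
+    // and socket backends. The first backend that initializes and reports at
+    // least one device is selected, unless NCCL_NET pins the choice by name
+    // (compared case-insensitively above); e.g. NCCL_NET=Socket would force
+    // the socket backend, assuming "Socket" is the name it reports.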
- // net plugin is already initialized - int ndev; - if (nets[i]->init(ncclDebugLog) != ncclSuccess) continue; - if (nets[i]->devices(&ndev) != ncclSuccess) continue; - if (ndev <= 0) continue; - ncclNet = nets[i]; + comm->ncclNet = ncclNets[i]; ok = true; - if (collNets[i]) { - do { - if (collNets[i]->init(ncclDebugLog) != ncclSuccess) break; - if (collNets[i]->devices(&ndev) != ncclSuccess) break; - if (ndev <= 0) break; - ncclCollNet = collNets[i]; - } while(0); + if (ncclCollNets[i]) { + NCCLCHECK(collNetGetState(i, &state)); + if (state == ncclNetStateEnabled) { + comm->ncclCollNet = ncclCollNets[i]; + } } break; } @@ -198,7 +293,7 @@ ncclResult_t ncclNetInit() { return ncclSuccess; } -ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { +ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { constexpr int GPU_BUF_SIZE = 2*1024*1024; #if CUDART_VERSION >= 11030 // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute @@ -213,12 +308,12 @@ ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { } #endif int netDevs; - NCCLCHECK(ncclNetDevices(&netDevs)); + NCCLCHECK(ncclNetDevices(comm, &netDevs)); *gdrSupport = 0; for (int dev=0; devabortFlag, ret, cleanup2); } while (rComm == NULL) { - NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3); + NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3); } CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4); - if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { - NCCLCHECK(ncclNetDeregMr(sComm, mHandle)); - NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); - NCCLCHECK(ncclNetDeregMr(rComm, mHandle)); + if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { + NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle)); + NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); + NCCLCHECK(ncclNetDeregMr(comm, rComm, mHandle)); *gdrSupport = 1; } ncclDebugNoWarn = 0; CUDACHECK(cudaFree(gpuPtr)); cleanup4: - NCCLCHECK(ncclNetCloseRecv(rComm)); + NCCLCHECK(ncclNetCloseRecv(comm, rComm)); cleanup3: - NCCLCHECK(ncclNetCloseSend(sComm)); + NCCLCHECK(ncclNetCloseSend(comm, sComm)); cleanup2: - NCCLCHECK(ncclNetCloseListen(lComm)); + NCCLCHECK(ncclNetCloseListen(comm, lComm)); cleanup1: break; } return ncclSuccess; } -int ncclNetVersion() { - return (ncclNet == &ncclNet_v4_as_v5) ? 4 : 5; +int ncclNetVersion(struct ncclComm* comm) { + return (comm->ncclNet == &ncclNet_v4_as_v6) ? 4 : ((comm->ncclNet == &ncclNet_v5_as_v6) ? 5 : 6); } diff --git a/src/proxy.cc b/src/proxy.cc index d3c6a98..5021bc8 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -13,6 +13,8 @@ #define ENABLE_TIMER 0 #include "timer.h" +#include + enum { proxyRecv=0, proxySend=1 }; static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) { @@ -349,10 +351,10 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* return ncclSuccess; } -static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) { +static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) { if (peer < 0) return ncclSuccess; - struct ncclPeer* peerComm = channel->peers+peer; + struct ncclChannelPeer* peerComm = channel->peers+peer; struct ncclConnector* connector = type == proxyRecv ? 
peerComm->recv+connIndex : peerComm->send+connIndex; if (connector->transportComm == NULL) { WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank, @@ -361,35 +363,62 @@ static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, s } if (connector->transportComm->proxyProgress == NULL) return ncclSuccess; - NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op)); + if (justInquire) *justInquire = true; + else { + NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op)); + } return ncclSuccess; } -ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* op, int nranks) { - struct ncclChannel* channel = comm->channels+op->channelId; - int pattern = op->pattern; - if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) { - struct ncclRing* ring = &channel->ring; - if (NeedProxy(proxyRecv, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, 0)); - if (NeedProxy(proxySend, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, 0)); - } - if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) { - // Tree up - struct ncclTree* tree = &channel->tree; - for (int i=0; idown[i], op, 0)); - NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0)); - } - if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) { - // Tree down - struct ncclTree* tree = &channel->tree; - for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0)); - NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0)); - } - if (pattern == ncclPatternCollTreeUpDown) { - // CollTree up - NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1)); // For CollTree up, we are using push - // CollTree down - NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0)); +// justInquire != nullptr means don't actually do anything, just assertain need of +// ncclProxySaveOp for this op. 
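+// When justInquire is non-NULL, SaveProxy only reports via *justInquire whether
+// a proxy op would be appended and enqueues nothing. This single switch on
+// op->pattern takes over from the removed ncclProxySaveColl and
+// ncclProxySaveP2p, so collectives and point-to-point sends/recvs now share
+// the same proxy bookkeeping.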
+ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) { + struct ncclChannel* channel = &comm->channels[op->channelId]; + if (justInquire) *justInquire = false; + switch (op->pattern) { + case ncclPatternRing: + case ncclPatternRingTwice: + case ncclPatternPipelineFrom: + case ncclPatternPipelineTo: { + struct ncclRing* ring = &channel->ring; + if (NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) { + NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, 0, justInquire)); + } + if (NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) { + NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, 0, justInquire)); + } + } break; + case ncclPatternTreeUp: + case ncclPatternTreeDown: + case ncclPatternTreeUpDown: { + if (op->pattern != ncclPatternTreeDown) { // Tree up + struct ncclTree* tree = &channel->tree; + for (int i=0; idown[i], op, 0, justInquire)); + } + NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0, justInquire)); + } + if (op->pattern != ncclPatternTreeUp) { // Tree down + struct ncclTree* tree = &channel->tree; + for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) { + NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0, justInquire)); + } + NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0, justInquire)); + } + } break; + case ncclPatternCollTreeUpDown: { + // CollTree up + NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1, justInquire)); // For CollTree up, we are using push + // CollTree down + NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0, justInquire)); + } break; + case ncclPatternSend: + case ncclPatternRecv: { + if (op->root == comm->rank) return ncclSuccess; + op->nsteps = DIVUP(op->nbytes, op->chunkSize); + if (op->nsteps == 0) op->nsteps = 1; + NCCLCHECK(SaveProxy(channel, op->pattern == ncclPatternSend ? 
proxySend : proxyRecv, op->root, op, 1, justInquire)); + } break; } return ncclSuccess; } @@ -406,22 +435,23 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) op->protocol = NCCL_PROTO_SIMPLE; op->dtype = info->datatype; - int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR; + int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; + if (info->comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR; info->chunkSize = stepSize; op->root = info->root; op->nbytes = info->count; - struct ncclPeer* peer = channel->peers + op->root; + struct ncclChannelPeer* peer = channel->peers + op->root; if (info->coll == ncclFuncSend) { op->pattern = ncclPatternSend; - if (op->root != info->comm->rank && peer->send[1].transportComm && peer->send[1].transportComm->proxyProgress) { + if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) { // Tune chunk size for the network if (info->count < stepSize) info->chunkSize /= 4; else if (info->count < 8*stepSize) info->chunkSize /= 2; } } else if (info->coll == ncclFuncRecv) { op->pattern = ncclPatternRecv; - if (op->root != info->comm->rank && peer->recv[1].transportComm && peer->recv[1].transportComm->proxyProgress) { + if (op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) { // Tune chunk size for the network if (info->count < stepSize) info->chunkSize /= 4; else if (info->count < 8*stepSize) info->chunkSize /= 2; @@ -437,22 +467,6 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) return ncclSuccess; } -ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* op) { - struct ncclChannel* channel = comm->channels+op->channelId; - op->opCount = channel->workFifoTail-1; - if (op->root == comm->rank) return ncclSuccess; - if (op->pattern == ncclPatternRecv) { - op->nsteps = DIVUP(op->nbytes, op->chunkSize); - if (op->nsteps == 0) op->nsteps = 1; - NCCLCHECK(SaveProxy(channel, proxyRecv, op->root, op, 1)); - } else if (op->pattern == ncclPatternSend) { - op->nsteps = DIVUP(op->nbytes, op->chunkSize); - if (op->nsteps == 0) op->nsteps = 1; - NCCLCHECK(SaveProxy(channel, proxySend, op->root, op, 1)); - } - return ncclSuccess; -} - static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) { struct ncclProxyArgs* freeOp = *opPtr; struct ncclProxyArgs* next = freeOp->next; @@ -594,8 +608,48 @@ void ncclDumpProxyState(int signal) { dumpProxyState(ncclLastProxyState); } +NCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0); +ncclResult_t ncclSetThreadContext(struct ncclComm* comm) { +#if CUDART_VERSION >= 11030 + static int createThreadContext = -1; + + if (createThreadContext == -1) { + createThreadContext = ncclParamCreateThreadContext(); + if (createThreadContext) { + if (CUPFN(cuCtxCreate_v3020) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) { + WARN("Unable to create thread context due to old driver, disabling."); + createThreadContext = 0; + } + } + } + if (createThreadContext) { + if (comm->proxyState.cudaCtx == NULL) { + if (CUPFN(cuCtxCreate_v3020(&comm->proxyState.cudaCtx, + CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, comm->cudaDev)) != CUDA_SUCCESS) { + WARN("Failed to create CUDA context on device %d", comm->cudaDev); + createThreadContext = 0; + return ncclSuccess; + } + } else { + if (CUPFN(cuCtxSetCurrent(comm->proxyState.cudaCtx)) != CUDA_SUCCESS) { + WARN("Failed to 
set CUDA context on device %d", comm->cudaDev); + return ncclUnhandledCudaError; + } + } + } +#endif + return ncclSuccess; +} + void* ncclProxyProgress(void *comm_) { struct ncclComm* comm = (struct ncclComm*)comm_; + if (ncclSetThreadContext(comm) != ncclSuccess) { + WARN("[Proxy Progress] Failed to set CUDA context on device %d", comm->cudaDev); + } else if (cudaSetDevice(comm->cudaDev) != cudaSuccess) { + WARN("[Proxy Progress] Failed to set CUDA device %d", comm->cudaDev); + } + if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + struct ncclProxyProgressState* state = &comm->proxyState.progressState; state->nextOps = -1; signal(SIGUSR1, ncclDumpProxyState); @@ -728,9 +782,9 @@ static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool, static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { if (connection->send) { - NCCLCHECK(ncclTransports[connection->transport].send.proxyFree(connection, comm)); + NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm)); } else { - NCCLCHECK(ncclTransports[connection->transport].recv.proxyFree(connection, comm)); + NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm)); } return ncclSuccess; } @@ -774,7 +828,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int))); NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int))); NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*))); - struct ncclTransportComm* tcomm = send ? &ncclTransports[transport].send : &ncclTransports[transport].recv; + struct ncclTransportComm* tcomm = send ? &ncclTransports[transport]->send : &ncclTransports[transport]->recv; // If we need proxy progress, map progress ops if (tcomm->proxyProgress) { char poolPath[] = "/dev/shm/nccl-XXXXXX"; @@ -881,7 +935,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int))); connection->localRank = peer->localRank; NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*))); - connection->tcomm = connection->send ? &ncclTransports[connection->transport].send : &ncclTransports[connection->transport].recv; + connection->tcomm = connection->send ? 
&ncclTransports[connection->transport]->send : &ncclTransports[connection->transport]->recv; // If we need proxy progress, let's allocate ops and start the thread if (connection->tcomm->proxyProgress) { NCCLCHECK(proxyProgressInit(comm)); @@ -947,7 +1001,10 @@ static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* p void* ncclProxyService(void* _args) { struct ncclComm* comm = (struct ncclComm *) _args; - if (cudaSetDevice(comm->cudaDev) != cudaSuccess) { + if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + if (ncclSetThreadContext(comm) != ncclSuccess) { + WARN("[Proxy Service] Failed to set CUDA context on device %d", comm->cudaDev); + } else if (cudaSetDevice(comm->cudaDev) != cudaSuccess) { WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev); } if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); diff --git a/src/transport.cc b/src/transport.cc index 7ce5f2e..7ebaf27 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -10,16 +10,11 @@ #define ENABLE_TIMER 0 #include "timer.h" -extern struct ncclTransport p2pTransport; -extern struct ncclTransport shmTransport; -extern struct ncclTransport netTransport; -extern struct ncclTransport collNetTransport; - -struct ncclTransport ncclTransports[NTRANSPORTS] = { - p2pTransport, - shmTransport, - netTransport, - collNetTransport +struct ncclTransport* ncclTransports[NTRANSPORTS] = { + &p2pTransport, + &shmTransport, + &netTransport, + &collNetTransport }; template @@ -29,7 +24,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex : comm->channels[channelId].peers[peer].recv + connIndex; for (int t=0; tsend : &transport->recv; int ret = 0; NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo)); @@ -44,9 +39,10 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* return ncclSystemError; } -ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) { +ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) { TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); - uint32_t mask = 1 << channel->id; + struct ncclChannel* channel = &comm->channels[channelId]; + uint32_t mask = 1 << channelId; for (int i=0; i= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue; @@ -71,9 +67,10 @@ void dumpData(struct ncclConnect* data, int ndata) { ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) { // Stream used during transport setup; need for P2P pre-connect + CUDA Graph + int highestType = TRANSPORT_P2P; // track highest transport type + cudaStream_t transportSetupStream; CUDACHECK(cudaStreamCreateWithFlags(&transportSetupStream, cudaStreamNonBlocking)); - int highestType = TRANSPORT_P2P; // track highest transport type struct ncclConnect data[2*MAXCHANNELS]; for (int i=1; inRanks; i++) { @@ -126,7 +123,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex; NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, 
conn)); conn->connected = 1; - CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream)); + CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, transportSetupStream)); } } TIME_STOP(3); @@ -136,7 +133,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex; NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn)); conn->connected = 1; - CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream)); + CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, transportSetupStream)); } } TIME_STOP(4); @@ -168,10 +165,6 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN // check if we can connect to collnet, whose root is the nranks-th rank struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks; peerInfo->rank = nranks; - int support = 1; - if (isMaster) { - NCCLCHECK(collNetTransport.canConnect(&support, comm->topo, collNetGraph, myInfo, peerInfo)); - } // send master receives connect info from peer recv master if (isMaster && type == collNetSend) { @@ -181,14 +174,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN } // select - struct ncclPeer* root = channel->peers+nranks; + struct ncclChannelPeer* root = channel->peers+nranks; // connector index: 0 for recv, 1 for send struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type; struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send); conn->transportComm = transportComm; // setup struct ncclConnect myConnect; - if (isMaster && support) { + if (isMaster) { NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type)); } // prepare connect handles @@ -218,11 +211,11 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect)); } // connect - if (isMaster && support) { + if (isMaster) { NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup); - struct ncclPeer* devRoot = channel->devPeers+nranks; - struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type; - CUDACHECKGOTO(cudaMemcpy(devConn, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice), res, cleanup); + struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks; + struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? 
devRoot->recv+type : devRoot->send+type; + CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup); } // recv side sends connect info to send side if (isMaster && type == collNetRecv) { @@ -231,7 +224,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup); TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer); } - if (support) fail = 0; + fail = 0; cleanup: if (allConnects != NULL) free(allConnects); if (masterConnects != NULL) free(masterConnects); @@ -260,7 +253,7 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) { // Free collNet resources for (int r=0; rnChannels; r++) { struct ncclChannel* channel = comm->channels+r; - struct ncclPeer* peer = channel->peers+comm->nRanks; + struct ncclChannelPeer* peer = channel->peers+comm->nRanks; for (int b=0; bsend + b; if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send)); diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 771a18f..0404aa8 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -128,9 +128,9 @@ struct recvResources { int collNetRank; }; -/* Determine if we can communicate with the peer */ static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { - *ret = 1; + // This transport cannot be used for p2p + *ret = 0; return ncclSuccess; } @@ -154,7 +154,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn)); NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? "/GDRDMA" : ""); return ncclSuccess; } @@ -172,7 +172,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? 
"/GDRDMA" : ""); return ncclSuccess; } @@ -297,7 +297,7 @@ ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle comm->proxyState.progressState.collNet.resources = resources; } if (resources->collNetComms[netDev] == NULL) - NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev)); + NCCLCHECK(collNetListen(comm, netDev, collNetHandle, resources->collNetListenComms+netDev)); return ncclSuccess; } @@ -311,13 +311,13 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i); handlePtrs[i] = &(info->collNetHandle); } - ncclResult_t ret = collNetConnect((void**)handlePtrs, nranks, rank, + ncclResult_t ret = collNetConnect(comm, (void**)handlePtrs, nranks, rank, resources->collNetListenComms[netDev], resources->collNetComms+netDev); free(handlePtrs); if (ret == ncclSuccess) { // Close listen comm - NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev])); + NCCLCHECK(collNetCloseListen(comm, resources->collNetListenComms[netDev])); } else { resources->collNetListenComms[netDev] = NULL; } @@ -331,7 +331,7 @@ static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) { struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources; resources->commRefCount[netDev]--; if (resources->commRefCount[netDev] == 0) { - NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev])); + NCCLCHECK(collNetCloseColl(comm, resources->collNetComms[netDev])); } for (int n=0; ncommRefCount[n]) return ncclSuccess; comm->proxyState.progressState.collNet.resources = NULL; @@ -447,9 +447,22 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); - NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, - &resources->sendMhandles[NCCL_PROTO_SIMPLE])); +#if CUDA_VERSION >= 11070 + /* DMA-BUF support */ + if (resources->useGdr && comm->dmaBufSupport) { + int dmabuf_fd; + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + (void)close(dmabuf_fd); + } else // FALL-THROUGH to nv_peermem GDR path +#endif + { + NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, + &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + } *((struct connectMap**)respBuff) = &resources->map; return ncclSuccess; @@ -503,9 +516,22 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); - NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - resources->useGdr ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, - &resources->mhandles[NCCL_PROTO_SIMPLE])); +#if CUDA_VERSION >= 11070 + /* DMA-BUF support */ + if (resources->useGdr && comm->dmaBufSupport) { + int dmabuf_fd; + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->mhandles[NCCL_PROTO_SIMPLE])); + (void)close(dmabuf_fd); + } else // FALL-THROUGH to nv_peermem GDR path +#endif + { + NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, + &resources->mhandles[NCCL_PROTO_SIMPLE])); + } // Pass info to send side info->reqFifo = resources->reqFifo; @@ -521,7 +547,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct struct sendResources* resources = (struct sendResources*)(connection->transportResources); for (int p=0; psendMhandles[p]) { - NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[p])); + NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p])); } } struct connectMapMem* mems = resources->map.mems; @@ -538,7 +564,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct struct recvResources* resources = (struct recvResources*)(connection->transportResources); for (int p=0; pmhandles[p]) { - NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[p])); + NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; @@ -625,10 +651,10 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg int sharedBuffSlot = sub->transmitted%NCCL_STEPS; if (reqFifo[group][buffSlot].recvBuff != NULL) { int totalSize = (s-group*COLLNET_GROUP_NSUBS+1) * args->sharedSize[sharedBuffSlot]; - int count = totalSize / ncclTypeSize(args->dtype); + int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype); reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot]; char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot]; - NCCLCHECK(collNetIallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot)); + NCCLCHECK(collNetIallreduce(comm, resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] == NULL) continue; TRACE(NCCL_NET, "sendProxy [%d/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]); @@ -644,7 +670,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg int done, size; int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base+sub->done)%NCCL_STEPS; - NCCLCHECK(collNetTest((void*)(sub->requests[buffSlot]), &done, &size)); + NCCLCHECK(collNetTest(comm, (void*)(sub->requests[buffSlot]), &done, &size)); if (done) { TRACE(NCCL_NET, "sendProxy [%d/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size); // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush) @@ 
-735,7 +761,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg int startChannel = group*COLLNET_GROUP_NSUBS; int offset; NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset)); - NCCLCHECK(collNetIflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot)); + NCCLCHECK(collNetIflush(comm, resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot)); } } else { for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; @@ -749,7 +775,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS; int done = 1; - if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(sub->requests[buffSlot], &done, NULL)); + if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(comm, sub->requests[buffSlot], &done, NULL)); if (done) { TRACE(NCCL_NET, "recvProxy [%d/%d/%d] flushed", sub->flushed, group, buffSlot); for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; diff --git a/src/transport/net.cc b/src/transport/net.cc index e96f189..be3afc4 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -181,10 +181,10 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); if (proxyRank == myInfo->rank) { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } else { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev, proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } *((int*)connectInfo) = proxyRank; @@ -217,7 +217,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.remoteRank = peerInfo->rank; NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? 
"/Shared" : ""); return ncclSuccess; } @@ -447,7 +447,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->channelId = req->channelId; resources->connIndex = req->connIndex; ncclNetProperties_t props; - NCCLCHECK(ncclNetGetProperties(req->netDev, &props)); + NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props)); resources->maxRecvs = props.maxRecvs; // We don't return any data @@ -473,11 +473,11 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc resources->channelId = req->channelId; resources->connIndex = req->connIndex; ncclNetProperties_t props; - NCCLCHECK(ncclNetGetProperties(req->netDev, &props)); + NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props)); resources->maxRecvs = props.maxRecvs; if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; - NCCLCHECK(ncclNetListen(req->netDev, respBuff, &resources->netListenComm)); + NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm)); *done = 1; return ncclSuccess; } @@ -504,15 +504,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank; - if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, comms->sendComm+resources->channelId)); + if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId)); resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { - NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm)); + NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm)); } } else { // Connect to remote peer - NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm)); + NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm)); connection->proxyAppendPtr = &connection->proxyAppend; } if (resources->netSendComm == NULL) { @@ -586,7 +586,19 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str for (int p=0; pbuffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); if (resources->buffers[p]) { - NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); +#if CUDA_VERSION >= 11070 + /* DMA-BUF support */ + int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; + if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) { + int dmabuf_fd; + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); + (void)close(dmabuf_fd); + } else // FALL-THROUGH to nv_peermem GDR path +#endif + { + NCCLCHECK(ncclNetRegMr(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); + } } } @@ -620,15 +632,15 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank; - if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(resources->netListenComm, comms->recvComm+resources->channelId)); + if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId)); resources->netRecvComm = comms->recvComm[resources->channelId]; if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; } else { - NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); + NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm)); } } else { // Connect to remote peer - NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); + NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm)); connection->proxyAppendPtr = &connection->proxyAppend; } if (resources->netRecvComm == NULL) { @@ -636,7 +648,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str return ncclSuccess; } *done = 1; - NCCLCHECK(ncclNetCloseListen(resources->netListenComm)); + NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm)); // Create structures struct connectMap* map = &resources->map; @@ -691,7 +703,19 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str for (int p=0; pbuffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); if (resources->buffers[p]) { - NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); +#if CUDA_VERSION >= 11070 + /* DMA-BUF support */ + int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; + if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) { + int dmabuf_fd; + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); + (void)close(dmabuf_fd); + } else // FALL-THROUGH to nv_peermem GDR path +#endif + { + NCCLCHECK(ncclNetRegMr(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); + } } } @@ -709,7 +733,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct } for (int p=0; pbuffers[p]) { - NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[p])); + NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; @@ -725,12 +749,12 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank; comms->sendRefCount[resources->channelId]--; - if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comms->sendComm[resources->channelId])); + if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId])); } else { - NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); + NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm)); } } else { - NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); + NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm)); } free(resources); return ncclSuccess; @@ -744,7 +768,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct } for (int p=0; pbuffers[p]) { - NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[p])); + NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; @@ -756,12 +780,12 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank; comms->recvRefCount[resources->channelId]--; - if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comms->recvComm[resources->channelId])); + if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId])); } else { - NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); + NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm)); } } else { - NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); + NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm)); } free(resources); return ncclSuccess; @@ -849,7 +873,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg } if (ready) { // Data is ready, try to send. 
- NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot)); + NCCLCHECK(ncclNetIsend(comm, resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); sizesFifo[buffSlot] = -1; @@ -867,7 +891,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg if (sub->done < sub->transmitted) { int done; int buffSlot = (sub->base+sub->done)%NCCL_STEPS; - NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL)); + NCCLCHECK(ncclNetTest(comm, sub->requests[buffSlot], &done, NULL)); if (done) { TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); sub->done += args->sliceSteps; @@ -971,7 +995,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg uint64_t step = subGroup->posted; struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); void** requestPtr = subGroup->requests+(step%NCCL_STEPS); - NCCLCHECK(ncclNetIrecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); + NCCLCHECK(ncclNetIrecv(comm, resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); if (*requestPtr) { for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup+i; @@ -993,7 +1017,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg int sizes[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; irequests[step%NCCL_STEPS], &done, sizes)); + NCCLCHECK(ncclNetTest(comm, subGroup->requests[step%NCCL_STEPS], &done, sizes)); if (done) { int useGdr = 0; int totalSize = 0; @@ -1034,7 +1058,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg } } struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); - NCCLCHECK(ncclNetIflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); + NCCLCHECK(ncclNetIflush(comm, resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); } } args->idle = 0; @@ -1049,7 +1073,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg uint64_t step = subGroup->transmitted; int done = 1; void* request = subGroup->requests[step%NCCL_STEPS]; - if (request) NCCLCHECK(ncclNetTest(request, &done, NULL)); + if (request) NCCLCHECK(ncclNetTest(comm, request, &done, NULL)); if (done) { for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index d3d4f9a..d4bb8cf 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -274,6 +274,31 @@ ncclResult_t ncclIbGdrSupport(int ibDev) { return ncclSuccess; } +// Detect whether DMA-BUF support is present in the kernel +// Returns : +// ncclSuccess : DMA-BUF support is available +// ncclSystemError : DMA-BUF is not supported by the kernel +ncclResult_t ncclIbDmaBufSupport(int dev) { + static int dmaBufSupported = -1; + if (dmaBufSupported == -1) { + ncclResult_t res; + struct ibv_pd* pd; + struct ibv_context* ctx; + ctx = ncclIbDevs[dev].context; + NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure); + // Test kernel DMA-BUF support with a dummy call (fd=-1) + (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 
0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/); + // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP if not supported (EBADF otherwise) + dmaBufSupported = (errno != EOPNOTSUPP) ? 1 : 0; + NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure); + } + if (dmaBufSupported == 0) return ncclSystemError; + return ncclSuccess; +failure: + dmaBufSupported = 0; + return ncclSystemError; +} + static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) { memcpy(addr, &ncclIbIfAddr, sizeof(*addr)); return ncclSuccess; @@ -286,10 +311,11 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { props->pciPath = ncclIbDevs[dev].pciPath; props->guid = ncclIbDevs[dev].guid; props->ptrSupport = NCCL_PTR_HOST; - if (ncclIbGdrSupport(dev) != ncclSuccess) { - INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName); - } else { - props->ptrSupport |= NCCL_PTR_CUDA; + if (ncclIbGdrSupport(dev) == ncclSuccess) { + props->ptrSupport |= NCCL_PTR_CUDA; // GDR support via nv_peermem + } + if (ncclIbDmaBufSupport(dev) == ncclSuccess) { + props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF } props->speed = ncclIbDevs[dev].speed; props->latency = 0; // Not set @@ -546,6 +572,7 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large"); memset(handle, 0, sizeof(struct ncclIbHandle)); comm->dev = dev; + comm->sock.asyncFlag = 1; /* nonblocking socket is required by network communication. */ NCCLCHECK(GetSocketAddr(&comm->sock.addr)); NCCLCHECK(ncclSocketListen(&comm->sock)); memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress)); @@ -580,7 +607,7 @@ ib_connect_check: /* expect user to call again */ return ncclSuccess; } else if (conState == ncclSocketError) { - return ncclSystemError; + return ncclRemoteError; } // IB Setup @@ -658,7 +685,6 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { stage->comm = rComm; stage->state = ncclIbCommStateAccept; lComm->sock.asyncFlag = 1; - rComm->sock.asyncFlag = 1; ib_accept: NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock)); @@ -812,7 +838,8 @@ ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) { ncclResult_t ncclIbTest(void* request, int* done, int* size); -ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) { +/* DMA-BUF support */ +ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset"); assert(size > 0); @@ -822,7 +849,7 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache; uintptr_t addr = (uintptr_t)data & -pageSize; - int pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; + size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; ncclResult_t res; pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); for (int slot=0; /*true*/; slot++) { @@ -834,14 +861,20 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan // Deregister / register struct ibv_mr* mr; unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ; - if 
(ncclIbRelaxedOrderingEnabled) { - // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support - NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, (uintptr_t)addr, flags|IBV_ACCESS_RELAXED_ORDERING), res, returning); + if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING; + if (fd != -1) { + /* DMA-BUF support */ + NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, verbs->pd, offset, pages*pageSize, addr, fd, flags), res, returning); + } else { + if (ncclIbRelaxedOrderingEnabled) { + // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support + NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, addr, flags), res, returning); + } + else { + NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning); + } } - else { - NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning); - } - TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey); + TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x fd %d", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey, fd); cache->population += 1; cache->slots[slot].addr = addr; cache->slots[slot].pages = pages; @@ -863,6 +896,10 @@ returning: return res; } +ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) { + return ncclIbRegMrDmaBuf(comm, data, (size_t)size, type, 0ULL, -1, mhandle); +} + ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache; @@ -916,13 +953,16 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { // Write size as immediate data. In the case of multi-send, only write // 0 or 1 as size to indicate whether there was data sent or received. - uint64_t immData = 0; + uint32_t immData = 0; if (nreqs == 1) { immData = reqs[0]->send.size; } else { - uint8_t* multiImmData = (uint8_t*)&immData; + if (nreqs > 32) { + WARN("Cannot store sizes of %d requests in a 32-bits field", nreqs); + return ncclInternalError; + } for (int r=0; r<nreqs; r++) { - multiImmData[r] = reqs[r]->send.size ? 1 : 0; + immData |= (reqs[r]->send.size ? 1 : 0) << r; } } @@ -1197,7 +1237,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { char line[SOCKET_NAME_MAXLEN+1]; WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d", ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err); - return ncclSystemError; + return ncclRemoteError; } struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff); @@ -1212,9 +1252,8 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError; if (req->nreqs > 1) { // In the case of a multi recv, we only set sizes to 0 or 1.
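/* Editor's note -- illustrative sketch only. The imm_data change above packs one
 * "did this request carry data" bit per request into a 32-bit immediate (hence the
 * nreqs > 32 guard), and the receive side in ncclIbTest, just below, unpacks it with
 * the matching shift. The helper names here are hypothetical. */
#include <stdint.h>

// Sender side: bit r is 1 iff request r carried a non-zero size.
static inline uint32_t ibPackMultiSendFlags(const int* sendSizes, int nreqs) {
  uint32_t immData = 0;
  for (int r = 0; r < nreqs; r++) immData |= (sendSizes[r] ? 1u : 0u) << r;
  return immData;
}

// Receiver side: recover the per-request 0/1 indicators from the immediate data.
static inline void ibUnpackMultiRecvFlags(uint32_t immData, int* recvSizes, int nreqs) {
  for (int i = 0; i < nreqs; i++) recvSizes[i] = (immData >> i) & 0x1;
}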
- uint8_t* sizes = (uint8_t*)&wc->imm_data; for (int i=0; inreqs; i++) { - req->recv.sizes[i] |= sizes[i]; + req->recv.sizes[i] = (wc->imm_data >> i) & 0x1; } } else { req->recv.sizes[0] += wc->imm_data; @@ -1275,6 +1314,7 @@ ncclNet_t ncclNetIb = { ncclIbConnect, ncclIbAccept, ncclIbRegMr, + ncclIbRegMrDmaBuf, ncclIbDeregMr, ncclIbIsend, ncclIbIrecv, diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 9e14aa2..a0d80d3 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -311,6 +311,7 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) { struct ncclSocketListenComm* comm; NCCLCHECK(ncclSocketNewListenComm(&comm)); NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr)); + comm->sock.asyncFlag = 1; NCCLCHECK(ncclSocketListen(&comm->sock)); memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress)); NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads)); @@ -359,7 +360,7 @@ socket_connect_check: /* expect user to call again */ return ncclSuccess; } else if (conState == ncclSocketError) { - return ncclSystemError; + return ncclRemoteError; } stage->state = ncclSocketCommStateSend; @@ -616,6 +617,7 @@ ncclNet_t ncclNetSocket = { ncclSocketConnect, ncclSocketAccept, ncclSocketRegMr, + NULL, // No DMA-BUF support ncclSocketDeregMr, ncclSocketIsend, ncclSocketIrecv, diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 9859c87..414f05d 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -7,6 +7,7 @@ #include "comm.h" #include "graph.h" #include "utils.h" +#include "shm.h" struct ncclP2pBuff { void* directPtr; @@ -17,6 +18,34 @@ struct p2pConnectInfo { int rank; int read; struct ncclP2pBuff p2pBuff; + // Use by CE memcpy + char shmName[7]; + int shmSize; +}; +static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large"); + +struct p2pShm { + struct ncclSendMem sendMem; + struct ncclRecvMem recvMem; +}; +struct p2pProxyInfo { + // Shared memory between proxy and receiving GPU + struct p2pShm* shm; + struct p2pShm* devShm; + char shmName[7]; + int shmSize; + + // Intermediate step for sender + struct ncclRecvMem* ceRecvMem; + char* ceDevBuff; + + // Receiver buffer + char* recvFifo; + + // Used by progress only + uint64_t step; + cudaStream_t stream; + cudaEvent_t events[NCCL_STEPS]; }; static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large"); @@ -24,12 +53,16 @@ struct p2pSendResources { struct ncclSendMem* devMem; void* sendMemIpc; void* recvMemIpc; + struct p2pProxyInfo proxyInfo; }; struct p2pRecvResources { struct ncclRecvMem* devMem; void* sendMemIpc; void* recvMemIpc; + struct p2pShm* shm; + struct p2pShm* devShm; + int shmSize; }; #include @@ -51,8 +84,14 @@ static int busIdToCudaDev(int64_t busId) { return -1; } +NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0); +static int useMemcpy = 0; +static void initCeOperation(); + /* Determine if two peers can communicate through p2p */ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + initCeOperation(); + // Rule out different nodes / isolated containers if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) { *ret = 0; @@ -63,7 +102,10 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop int intermediateRank; NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank)); 
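/* Editor's note -- illustrative sketch only. The CE ("copy engine") path declared
 * above is enabled lazily, the first time p2pCanConnect runs: initCeOperation reads
 * the parameter once and, if set, swaps the proxy hooks of p2pTransport (shown at
 * the end of this file). A condensed stand-alone version of that pattern with
 * hypothetical names; the real code uses NCCL_PARAM rather than getenv directly. */
#include <stdlib.h>

struct proxyOps { int (*proxyProgress)(void); };
static int ceSendProxyProgress(void) { return 0; }   // stand-in for p2pSendProxyProgress
static struct proxyOps sendProxyOps = { NULL };      // NULL means "no proxy progress needed"

static void initCeOnce(void) {
  static int init = 0;
  if (init) return;                                  // one-shot init, as in initCeOperation
  const char* env = getenv("NCCL_P2P_USE_CUDA_MEMCPY");
  int useMemcpy = env ? atoi(env) : 0;               // default off
  if (useMemcpy) sendProxyOps.proxyProgress = ceSendProxyProgress;
  init = 1;
}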
if (*ret == 0) return ncclSuccess; - if (intermediateRank != -1) return ncclSuccess; + if (intermediateRank != -1) { + if (useMemcpy) *ret = 0; + return ncclSuccess; + } // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) int cudaDev1 = busIdToCudaDev(info1->busId); @@ -170,6 +212,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st send->transportResources = resources; int useRead, intermediateRank; NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); + if (useMemcpy) useRead = 0; static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; @@ -185,14 +228,14 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st if (intermediateRank == -1) { info->rank = myInfo->rank; - if (myInfo->pidHash == peerInfo->pidHash) { + if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) { if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); } else { send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; - INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s", - channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); + INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s", + channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : ""); } } else { info->rank = intermediateRank; @@ -202,9 +245,15 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st } NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn)); - NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); + if (useMemcpy) { + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo))); + info->shmSize = resources->proxyInfo.shmSize; + memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName)); + } else { + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); + NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc)); + } - NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc)); return ncclSuccess; } @@ -230,7 +279,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st if (intermediateRank == -1) { info->rank = myInfo->rank; - if (myInfo->pidHash == peerInfo->pidHash) { + if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) { if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; } else { recv->conn.direct |= info->read ? 
NCCL_IPC_READ : NCCL_IPC_WRITE; @@ -258,30 +307,61 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co for (int p=0; pread && p == NCCL_PROTO_SIMPLE) { /* For P2P Read the SIMPLE buffer is local (ncclSendMem) */ + if (resources->devMem == NULL) return ncclInternalError; // We should not use read + memcpy send->conn.buffs[p] = (char*)(resources->devMem+1); } else { send->conn.buffs[p] = buff; buff += send->comm->buffSizes[p]; } } - send->conn.tail = &remDevMem->tail; - send->conn.head = &resources->devMem->head; - send->conn.ptrExchange = &resources->devMem->ptrExchange; - send->conn.redOpArgExchange = resources->devMem->redOpArgExchange; + + if (useMemcpy) { + send->conn.tail = &resources->proxyInfo.ceRecvMem->tail; + send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo; + send->conn.head = &resources->proxyInfo.devShm->sendMem.head; + // Send SIMPLE buff to proxy, and replace it by local buffer + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0)); + send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff; + } else { + send->conn.tail = &remDevMem->tail; + send->conn.head = &resources->devMem->head; + send->conn.ptrExchange = &resources->devMem->ptrExchange; + send->conn.redOpArgExchange = resources->devMem->redOpArgExchange; + } return ncclSuccess; } /* Connect/Recv from this peer */ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; - struct ncclSendMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; - NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc)); + struct ncclSendMem* remDevMem = NULL; + + if (useMemcpy) { + char shmPath[PATH_MAX]; + sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); + resources->shmSize = info->shmSize; + NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, 0)); + // Remove the file to ensure proper clean-up + NCCLCHECK(ncclShmUnlink(shmPath)); + + recv->conn.tail = &resources->devShm->recvMem.tail; + recv->conn.head = &resources->devShm->sendMem.head; + } else { + NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc)); + + recv->conn.tail = &resources->devMem->tail; + recv->conn.head = &remDevMem->head; + recv->conn.ptrExchange = &remDevMem->ptrExchange; + recv->conn.redOpArgExchange = remDevMem->redOpArgExchange; + } char* buff = (char*)(resources->devMem+1); for (int p=0; pread && p == NCCL_PROTO_SIMPLE) { + if (remDevMem == NULL) return ncclInternalError; // We should not use read + memcpy /* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */ recv->conn.buffs[p] = (char*)(remDevMem+1); } else { @@ -289,10 +369,6 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn buff += recv->comm->buffSizes[p]; } } - recv->conn.tail = &resources->devMem->tail; - recv->conn.head = &remDevMem->head; - recv->conn.ptrExchange = &remDevMem->ptrExchange; - recv->conn.redOpArgExchange = remDevMem->redOpArgExchange; return ncclSuccess; } @@ -308,11 +384,52 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) { struct p2pRecvResources* resources = 
(struct p2pRecvResources*)recv->transportResources; if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); + if (useMemcpy) { + NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize)); + } free(resources); return ncclSuccess; } -static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { +static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + if (useMemcpy) { + struct p2pProxyInfo* proxyInfo; + NCCLCHECK(ncclCalloc(&proxyInfo, 1)); + connection->transportResources = proxyInfo; + + NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, comm->buffSizes[NCCL_PROTO_SIMPLE])); + + char shmPath[PATH_MAX]; + shmPath[0] = '\0'; + proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem); + NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1)); + TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize); + memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName)); + + NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); + + if (respSize != sizeof(struct p2pProxyInfo)) return ncclInternalError; + memcpy(respBuff, proxyInfo, sizeof(struct p2pProxyInfo)); + } else { + if (reqSize != sizeof(int)) return ncclInternalError; + int size = *((int*)reqBuff); + if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; + struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff; + NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size)); + connection->transportResources = p2pBuff->directPtr; + cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr); + if (res != cudaSuccess) { + WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res)); + cudaFree(p2pBuff->directPtr); + free(p2pBuff); + CUDACHECK(res); + } + } + *done = 1; + return ncclSuccess; +} + +static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(int)) return ncclInternalError; int size = *((int*)reqBuff); if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; @@ -330,15 +447,116 @@ static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct return ncclSuccess; } -static ncclResult_t p2pProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { +static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources; + + if (reqSize != sizeof(void*)) return ncclInternalError; + proxyInfo->recvFifo = *((char**)reqBuff); + + CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); + for (int i=0; ievents+i)); + } + connection->proxyAppendPtr = &connection->proxyAppend; + return ncclSuccess; +} + +static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + if (useMemcpy) { + struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources; + 
NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize)); + NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem)); + CUDACHECK(cudaFree(proxyInfo->ceDevBuff)); + CUDACHECK(cudaStreamDestroy(proxyInfo->stream)); + for (int i=0; ievents[i])); + } + free(proxyInfo); + } else { + // Do not check return code as CUDA may have already shut down + cudaFree(connection->transportResources); + } + return ncclSuccess; +} + +static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { // Do not check return code as CUDA may have already shut down cudaFree(connection->transportResources); return ncclSuccess; } +static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { + if (args->state == ncclProxyOpReady) { + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources); + // Round to next multiple of sliceSteps + sub->base = ROUNDUP(resources->step, args->chunkSteps); + sub->posted = sub->transmitted = sub->done = 0; + } + args->state = ncclProxyOpProgress; + } + args->idle = 1; + if (args->state == ncclProxyOpProgress) { + int p = args->protocol; + int stepSize = comm->buffSizes[p] / NCCL_STEPS; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources); + if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy + resources->step = sub->base + sub->nsteps; + args->done++; + continue; + } + if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) { + int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; + volatile int* sizesFifo = resources->ceRecvMem->sizesFifo; + volatile uint64_t* recvTail = &resources->ceRecvMem->tail; + // Check GPU has sent everything + if ((*recvTail > sub->base+sub->transmitted)) { + int size = sizesFifo[buffSlot]; + CUDACHECK(cudaMemcpyAsync(resources->recvFifo+buffSlot*stepSize, resources->ceDevBuff+buffSlot*stepSize, size, cudaMemcpyDeviceToDevice, resources->stream)); + CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream)); + sub->transmitted += args->sliceSteps; + } + } + if (sub->done < sub->transmitted) { + int buffSlot = (sub->base+sub->done)%NCCL_STEPS; + cudaError_t res = cudaEventQuery(resources->events[buffSlot]); + if (res != cudaErrorNotReady) CUDACHECK(res); + if (res == cudaSuccess) { + sub->done += args->sliceSteps; + // Notify SHM + resources->shm->recvMem.tail = sub->base + sub->done; + } + if (sub->done == sub->nsteps) { + resources->step = sub->base + sub->nsteps; + args->done++; + } + } + } + if (args->done == args->nsubs) { + args->state = ncclProxyOpNone; + } + } + return ncclSuccess; +} + struct ncclTransport p2pTransport = { "P2P", p2pCanConnect, - { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL }, - { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL } + { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL }, + { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL } }; + +static void initCeOperation() { + static int init = 0; + if (!init) { + useMemcpy = ncclParamP2pUseCudaMemcpy(); + if (useMemcpy) { + p2pTransport.send.proxyConnect = p2pSendProxyConnect; + p2pTransport.send.proxyProgress = p2pSendProxyProgress; + } 
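/* Editor's note -- illustrative sketch only. p2pSendProxyProgress above drives an
 * NCCL_STEPS-deep pipeline: 'transmitted' advances once the GPU has published a slot
 * (its tail moved past it) and the proxy has issued the cudaMemcpyAsync plus an event
 * for that slot; 'done' advances when the event completes, at which point the peer is
 * notified. A condensed stand-alone version with hypothetical names (steps advance by
 * one here instead of sliceSteps): */
#include <cuda_runtime.h>
#include <stdint.h>

#define PIPE_STEPS 8   // plays the role of NCCL_STEPS

struct cePipe {
  uint64_t base, transmitted, done, nsteps;
  cudaEvent_t events[PIPE_STEPS];
};

// One progress call; the proxy thread keeps invoking this until it returns 0.
static int ceProgress(struct cePipe* p, volatile uint64_t* gpuTail, cudaStream_t stream,
                      void (*issueCopy)(int slot, cudaStream_t stream),
                      void (*notifyPeer)(uint64_t tail)) {
  if (p->transmitted < p->done + PIPE_STEPS && p->transmitted < p->nsteps
      && *gpuTail > p->base + p->transmitted) {
    int slot = (int)((p->base + p->transmitted) % PIPE_STEPS);
    issueCopy(slot, stream);                      // cudaMemcpyAsync of that slot's data
    cudaEventRecord(p->events[slot], stream);     // mark the copy as in flight
    p->transmitted++;
  }
  if (p->done < p->transmitted) {
    int slot = (int)((p->base + p->done) % PIPE_STEPS);
    if (cudaEventQuery(p->events[slot]) == cudaSuccess) {  // copy for 'slot' finished
      p->done++;
      notifyPeer(p->base + p->done);              // equivalent of bumping recvMem.tail
    }
  }
  return p->done < p->nsteps;
}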
+ init = 1; + } +} diff --git a/src/transport/shm.cc b/src/transport/shm.cc index 974a2ab..4a6120a 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -31,11 +31,21 @@ struct shmRecvResources { struct ncclRecvMem* devHostMem; }; +#define SHM_SEND_SIDE 1 +#define SHM_RECV_SIDE 2 NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0); +NCCL_PARAM(ShmUseCudaMemcpy, "SHM_USE_CUDA_MEMCPY", 0); +NCCL_PARAM(ShmMemcpyMode, "SHM_MEMCPY_MODE", SHM_SEND_SIDE); // 1 is sender-side, 2 is receiver-side, 3 is both +static int useMemcpySend = 0; +static int useMemcpyRecv = 0; +NCCL_PARAM(ShmLocality, "SHM_LOCALITY", SHM_RECV_SIDE); // 1 is sender-size, 2 is receiver-size +static int shmLocality = 0; +static void initCeOperation(); /* Determine two peers can communicate with SHM */ -ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 0; + initCeOperation(); if (ncclParamShmDisable() == 1) return ncclSuccess; @@ -55,7 +65,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop #define MAX_SHM_NAME_LEN 1024 /* Create and return connect structures for this peer to connect to me */ -ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { +static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct shmSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; @@ -65,16 +75,20 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st char shmPath[PATH_MAX]; shmPath[0] = '\0'; - info->shmSize = resources->shmSize = sizeof(struct ncclSendMem); + int shmSize = sizeof(struct ncclSendMem); + if (shmLocality == SHM_SEND_SIDE) { + for (int p=0; pcomm->buffSizes[p]; + } + info->shmSize = resources->shmSize = shmSize; NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize); memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName)); - INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); + INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via SHM/%s/%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct"); return ncclSuccess; } -ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { +static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct shmRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); 
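/* Editor's note -- illustrative sketch only. The three SHM parameters introduced above
 * combine as follows: NCCL_SHM_USE_CUDA_MEMCPY turns the CE path on,
 * NCCL_SHM_MEMCPY_MODE is a bitmask (bit 0 = copy on the sender, bit 1 = copy on the
 * receiver), and NCCL_SHM_LOCALITY picks which side's /dev/shm segment carries the
 * protocol buffers. A hypothetical helper mirroring initCeOperation at the end of
 * this file: */
#define SHM_SEND_SIDE 1
#define SHM_RECV_SIDE 2

static void resolveShmCeConfig(int useCudaMemcpy, int memcpyMode, int locality,
                               int* useMemcpySend, int* useMemcpyRecv, int* shmLocality) {
  *useMemcpySend = useCudaMemcpy && (memcpyMode & 1);   // mode 1 or 3
  *useMemcpyRecv = useCudaMemcpy && (memcpyMode & 2);   // mode 2 or 3
  *shmLocality = (locality == SHM_SEND_SIDE || locality == SHM_RECV_SIDE)
               ? locality : SHM_RECV_SIDE;              // invalid values fall back to receiver side
}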
recv->transportResources = resources; @@ -85,7 +99,9 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st char shmPath[PATH_MAX]; shmPath[0] = '\0'; int shmSize = sizeof(struct ncclRecvMem); - for (int p=0; pcomm->buffSizes[p]; + if (shmLocality == SHM_RECV_SIDE) { + for (int p=0; pcomm->buffSizes[p]; + } info->shmSize = resources->shmSize = shmSize; NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize); @@ -94,8 +110,21 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st return ncclSuccess; } +struct shmProxyInfo { + struct ncclRecvMem* ceRecvMem; + char* devFifo; + char* shmFifo; + struct ncclSendMem* sendMem; + struct ncclRecvMem* recvMem; + + // used by progress only + uint64_t step; + cudaStream_t stream; + cudaEvent_t events[NCCL_STEPS]; +}; + /* Connect to this peer */ -ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { +static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { // Setup device pointers struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; @@ -108,19 +137,29 @@ ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectIn // Remove the file to ensure proper clean-up NCCLCHECK(ncclShmUnlink(shmPath)); - send->transportResources = resources; - int offset = 0; + char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1); for (int p=0; pconn.buffs[p] = (char*)(resources->devRemHostMem+1) + offset; - offset += send->comm->buffSizes[p]; + send->conn.buffs[p] = buff; + buff += send->comm->buffSizes[p]; } send->conn.tail = &resources->devRemHostMem->tail; - send->conn.head = &resources->devHostMem->head; + + if (useMemcpyRecv) { + send->conn.sizesFifo = resources->devRemHostMem->sizesFifo; + } + if (useMemcpySend) { + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn)); + struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem }; + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); + send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; + send->conn.tail = &proxyInfo.ceRecvMem->tail; + send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo; + } return ncclSuccess; } -ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { +static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { // Setup device pointers struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; @@ -131,18 +170,26 @@ ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); NCCLCHECK(ncclShmUnlink(shmPath)); - 
recv->conn.head = &resources->devRemHostMem->head; - int offset = 0; + char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1); for (int p=0; pconn.buffs[p] = (char*)(resources->devHostMem+1) + offset; - offset += recv->comm->buffSizes[p]; + recv->conn.buffs[p] = buff; + buff += recv->comm->buffSizes[p]; } + recv->conn.head = &resources->devRemHostMem->head; recv->conn.tail = &resources->devHostMem->tail; + + if (useMemcpyRecv) { + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn)); + struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem }; + NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); + recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; + recv->conn.tail = &proxyInfo.ceRecvMem->tail; + } return ncclSuccess; } -ncclResult_t shmSendFree(struct ncclConnector* send) { +static ncclResult_t shmSendFree(struct ncclConnector* send) { struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources; NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); @@ -150,7 +197,7 @@ ncclResult_t shmSendFree(struct ncclConnector* send) { return ncclSuccess; } -ncclResult_t shmRecvFree(struct ncclConnector* recv) { +static ncclResult_t shmRecvFree(struct ncclConnector* recv) { struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); @@ -158,9 +205,209 @@ ncclResult_t shmRecvFree(struct ncclConnector* recv) { return ncclSuccess; } +static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct shmProxyInfo* proxyInfo; + NCCLCHECK(ncclCalloc(&proxyInfo, 1)); + if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError; + memcpy(proxyInfo, reqBuff, reqSize); + NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE])); + NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); + CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); + for (int i=0; ievents+i)); + } + connection->proxyAppendPtr = &connection->proxyAppend; + connection->transportResources = proxyInfo; + if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError; + memcpy(respBuff, proxyInfo, respSize); + return ncclSuccess; +} + +static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct shmProxyInfo* proxyInfo; + NCCLCHECK(ncclCalloc(&proxyInfo, 1)); + if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError; + memcpy(proxyInfo, reqBuff, reqSize); + NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE])); + NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); + CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); + for (int i=0; ievents+i)); + } + connection->proxyAppendPtr = &connection->proxyAppend; + connection->transportResources = 
proxyInfo; + if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError; + memcpy(respBuff, proxyInfo, respSize); + return ncclSuccess; +} + +static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources; + CUDACHECK(cudaStreamDestroy(resources->stream)); + CUDACHECK(cudaFree(resources->devFifo)); + NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); + for (int i=0; ievents[i])); + } + free(connection->transportResources); + return ncclSuccess; +} + +static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources; + CUDACHECK(cudaStreamDestroy(resources->stream)); + CUDACHECK(cudaFree(resources->devFifo)); + NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); + for (int i=0; ievents[i])); + } + free(connection->transportResources); + return ncclSuccess; +} + +static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { + if (args->state == ncclProxyOpReady) { + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources); + // Round to next multiple of sliceSteps + sub->base = ROUNDUP(resources->step, args->chunkSteps); + sub->posted = sub->transmitted = sub->done = 0; + } + args->state = ncclProxyOpProgress; + } + args->idle = 1; + if (args->state == ncclProxyOpProgress) { + int p = args->protocol; + int stepSize = comm->buffSizes[p] / NCCL_STEPS; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources); + if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy + resources->step = sub->base + sub->nsteps; + args->done++; + continue; + } + if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) { + int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; + volatile int* sizesFifo = resources->ceRecvMem->sizesFifo; + volatile uint64_t* recvTail = &resources->ceRecvMem->tail; + // Check GPU has sent everything + if ((*recvTail > sub->base+sub->transmitted)) { + int size = sizesFifo[buffSlot]; + CUDACHECK(cudaMemcpyAsync(resources->shmFifo+buffSlot*stepSize, resources->devFifo+buffSlot*stepSize, size, cudaMemcpyDeviceToHost, resources->stream)); + CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream)); + resources->recvMem->sizesFifo[buffSlot] = size; + __sync_synchronize(); // make sure sizesFifo is visible + sub->transmitted += args->sliceSteps; + } + } + if (sub->done < sub->transmitted) { + int buffSlot = (sub->base+sub->done)%NCCL_STEPS; + cudaError_t res = cudaEventQuery(resources->events[buffSlot]); + if (res != cudaErrorNotReady) CUDACHECK(res); + if (res == cudaSuccess) { + sub->done += args->sliceSteps; + // Notify SHM + resources->recvMem->tail = sub->base + sub->done; + } + if (sub->done == sub->nsteps) { + resources->step = sub->base + sub->nsteps; + args->done++; + } + } + } + if (args->done == args->nsubs) { + args->state = ncclProxyOpNone; + } + } + return ncclSuccess; +} + +static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { + if (args->state == ncclProxyOpReady) { + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct shmProxyInfo* 
resources = (struct shmProxyInfo*) (sub->connection->transportResources); + // Round to next multiple of sliceSteps + sub->base = ROUNDUP(resources->step, args->chunkSteps); + sub->posted = sub->transmitted = sub->done = 0; + } + args->state = ncclProxyOpProgress; + } + args->idle = 1; + if (args->state == ncclProxyOpProgress) { + int p = args->protocol; + int stepSize = comm->buffSizes[p] / NCCL_STEPS; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources); + if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy + resources->step = sub->base + sub->nsteps; + args->done++; + continue; + } + if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) { + int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; + volatile int* sizesFifo = resources->recvMem->sizesFifo; + volatile uint64_t* recvTail = &resources->recvMem->tail; + // Check data is ready in SHM + if ((*recvTail > sub->base+sub->transmitted)) { + int size = sizesFifo[buffSlot]; + CUDACHECK(cudaMemcpyAsync(resources->devFifo+buffSlot*stepSize, resources->shmFifo+buffSlot*stepSize, size, cudaMemcpyHostToDevice, resources->stream)); + CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream)); + sub->transmitted += args->sliceSteps; + } + } + if (sub->done < sub->transmitted) { + int buffSlot = (sub->base+sub->done)%NCCL_STEPS; + cudaError_t res = cudaEventQuery(resources->events[buffSlot]); + if (res != cudaErrorNotReady) CUDACHECK(res); + if (res == cudaSuccess) { + sub->done += args->sliceSteps; + // Notify GPU + resources->ceRecvMem->tail = sub->base + sub->done; + } + if (sub->done == sub->nsteps) { + resources->step = sub->base + sub->nsteps; + args->done++; + } + } + } + if (args->done == args->nsubs) { + args->state = ncclProxyOpNone; + } + } + return ncclSuccess; +} + struct ncclTransport shmTransport = { "SHM", shmCanConnect, { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL }, { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL } }; + +static void initCeOperation() { + static int init = 0; + if (!init) { + useMemcpySend = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 1); + useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 2); + if (useMemcpySend) { + shmTransport.send.proxyConnect = shmSendProxyConnect; + shmTransport.send.proxyFree = shmSendProxyFree; + shmTransport.send.proxyProgress = shmSendProxyProgress; + } + if (useMemcpyRecv) { + shmTransport.recv.proxyConnect = shmRecvProxyConnect; + shmTransport.recv.proxyFree = shmRecvProxyFree; + shmTransport.recv.proxyProgress = shmRecvProxyProgress; + } + shmLocality = ncclParamShmLocality(); + if (shmLocality != SHM_SEND_SIDE && shmLocality != SHM_RECV_SIDE) { + WARN("Ignoring SHM locality, must be 1 (sender side) or 2 (receiver side, default)"); + shmLocality = SHM_RECV_SIDE; + } + init = 1; + } +}
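/* Editor's note -- illustrative sketch only. All switches added by this patch are
 * NCCL_PARAMs, so (assuming the usual "NCCL_" prefix applied by NCCL_PARAM) the
 * relevant environment variables are NCCL_P2P_USE_CUDA_MEMCPY,
 * NCCL_SHM_USE_CUDA_MEMCPY, NCCL_SHM_MEMCPY_MODE and NCCL_SHM_LOCALITY, with the
 * defaults shown in the hunks above. A rough sketch of the lookup (the real
 * NCCL_PARAM macro also caches the parsed value): */
#include <stdlib.h>

static long envParamOr(const char* name, long deflt) {
  const char* s = getenv(name);             // e.g. "NCCL_SHM_MEMCPY_MODE"
  return s ? strtol(s, NULL, 0) : deflt;
}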